# semantic_search.py — embedding-based semantic search over Bhagavad Gita verses.
# Standard library
import pickle
from pathlib import Path
from typing import Any, Dict, List, Tuple

# Third-party
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Local
from data_loader import BhagavadGitaDataLoader
class SemanticSearch:
    """Semantic search over Bhagavad Gita verses using sentence embeddings.

    Verse texts (English + Sanskrit concatenated) are embedded once with a
    SentenceTransformer model and pickled to a local cache directory; queries
    are embedded at search time and ranked by cosine similarity. The
    embedding matrix is row-aligned with ``self.verses_df``.
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2", cache_dir: str = "cache"):
        """Load the embedding model and prepare the on-disk caches.

        Args:
            model_name: sentence-transformers model id.
            cache_dir: Directory used for both the pickled embedding cache
                and the data loader's dataset cache.
        """
        self.model_name = model_name
        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(exist_ok=True)
        # '/' appears in namespaced model ids (e.g. "org/model") and is not a
        # valid filename character, so sanitize it for the cache file name.
        self.embeddings_cache_file = self.cache_dir / f"embeddings_{model_name.replace('/', '_')}.pkl"
        self.model = SentenceTransformer(model_name)
        self.data_loader = BhagavadGitaDataLoader(cache_dir)
        self.embeddings = None  # np.ndarray, row-aligned with self.verses_df
        self.verses_df = None   # pd.DataFrame of verses (lazy-loaded)

    def _compute_embeddings(self, force_refresh: bool = False) -> np.ndarray:
        """Return verse embeddings, computing and caching them if needed.

        The dataset is always loaded first so ``self.verses_df`` stays
        row-aligned with the embedding matrix. A cached embedding file whose
        row count no longer matches the dataset (e.g. the dataset changed
        since the cache was written) is discarded and recomputed.

        Args:
            force_refresh: When True, ignore any existing cache file.

        Returns:
            The (num_verses, dim) embedding matrix.
        """
        # Load the dataset even on a cache hit: search results are looked up
        # by row index, so embeddings and verses_df must always align.
        if self.verses_df is None:
            self.verses_df = self.data_loader.load_dataset()

        if not force_refresh and self.embeddings_cache_file.exists():
            print("Loading cached embeddings...")
            # NOTE: trusted local cache only — pickle must never be fed
            # untrusted data.
            with open(self.embeddings_cache_file, 'rb') as f:
                cached = pickle.load(f)
            # Guard against a stale cache written for a different dataset.
            if len(cached) == len(self.verses_df):
                self.embeddings = cached
                return self.embeddings
            print("Cached embeddings do not match dataset size; recomputing...")

        print("Computing embeddings for all verses...")
        # Embed English and Sanskrit together so a query can match either
        # language. Vectorized concat instead of an iterrows() loop.
        texts_to_embed = (
            self.verses_df['english_text'].astype(str)
            + " "
            + self.verses_df['sanskrit_text'].astype(str)
        ).tolist()
        self.embeddings = self.model.encode(texts_to_embed, show_progress_bar=True)
        with open(self.embeddings_cache_file, 'wb') as f:
            pickle.dump(self.embeddings, f)
        print(f"Computed and cached {len(self.embeddings)} embeddings")
        return self.embeddings

    def search_similar_verses(self, query: str, top_k: int = 5, min_similarity: float = 0.3) -> List[Dict[str, Any]]:
        """Return up to ``top_k`` verses most similar to ``query``.

        Args:
            query: Free-text search query.
            top_k: Maximum number of results.
            min_similarity: Results scoring below this cosine similarity are
                dropped, so fewer than ``top_k`` rows may be returned.

        Returns:
            Verse rows as dicts, each augmented with a float
            ``'similarity_score'`` key, sorted by descending similarity.
        """
        if self.embeddings is None:
            self._compute_embeddings()
        if self.verses_df is None:
            self.verses_df = self.data_loader.load_dataset()

        query_embedding = self.model.encode([query])
        similarities = cosine_similarity(query_embedding, self.embeddings)[0]
        top_indices = np.argsort(similarities)[::-1][:top_k]

        results = []
        for idx in top_indices:
            score = float(similarities[idx])
            if score < min_similarity:
                # Indices are sorted by descending score; everything after
                # this point scores lower, so stop early.
                break
            verse_data = self.verses_df.iloc[idx].to_dict()
            verse_data['similarity_score'] = score
            results.append(verse_data)
        return results

    def search_by_themes(self, themes: List[str], top_k: int = 3) -> List[Dict[str, Any]]:
        """Search each theme string and merge results, deduplicated by verse.

        Each theme is searched with a relaxed similarity floor (0.2); when a
        verse matches several themes, only its best score is kept.

        Args:
            themes: Theme/query strings to search independently.
            top_k: Per-theme result count; the merged list is capped at
                ``top_k * 2``.

        Returns:
            Deduplicated verse dicts sorted by descending similarity score.
        """
        all_results = []
        for theme in themes:
            all_results.extend(
                self.search_similar_verses(theme, top_k=top_k, min_similarity=0.2)
            )

        # Keep the best-scoring hit per verse_id.
        unique_results: Dict[Any, Dict[str, Any]] = {}
        for result in all_results:
            verse_id = result['verse_id']
            best = unique_results.get(verse_id)
            if best is None or result['similarity_score'] > best['similarity_score']:
                unique_results[verse_id] = result

        sorted_results = sorted(
            unique_results.values(),
            key=lambda x: x['similarity_score'],
            reverse=True,
        )
        return sorted_results[:top_k * 2]

    def get_contextual_verses(self, problem_description: str, emotion_keywords: List[str] = None, top_k: int = 5) -> List[Dict[str, Any]]:
        """Find verses relevant to a user's problem plus stock Gita themes.

        Args:
            problem_description: Free-text description of the user's problem.
            emotion_keywords: Optional extra emotion terms to search.
            top_k: Per-query result count passed through to
                :meth:`search_by_themes` (final list capped at ``top_k * 2``).

        Returns:
            Deduplicated verse dicts sorted by descending similarity score.
        """
        search_queries = [problem_description]
        if emotion_keywords:
            search_queries.extend(emotion_keywords)
        # Always include core Gita themes so results stay on-topic even for
        # sparse problem descriptions.
        search_queries.extend([
            "duty dharma purpose life",
            "overcoming fear anxiety doubt",
            "wisdom knowledge enlightenment",
            "action without attachment",
            "finding peace inner strength",
        ])
        return self.search_by_themes(search_queries, top_k=top_k)