import pickle
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

from data_loader import BhagavadGitaDataLoader


class SemanticSearch:
    """Embedding-based semantic search over Bhagavad Gita verses.

    Encodes every verse (English + Sanskrit concatenated) with a
    SentenceTransformer model, caches the embedding matrix on disk, and
    answers free-text queries by cosine similarity.
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2", cache_dir: str = "cache"):
        """Initialize the model, data loader, and on-disk embedding cache.

        Args:
            model_name: SentenceTransformer model identifier.
            cache_dir: Directory for the pickled embedding cache (created
                if missing); also passed through to the data loader.
        """
        self.model_name = model_name
        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(exist_ok=True)
        # One cache file per model ('/' sanitized for the filesystem) so
        # switching models never reuses stale vectors.
        self.embeddings_cache_file = (
            self.cache_dir / f"embeddings_{model_name.replace('/', '_')}.pkl"
        )
        self.model = SentenceTransformer(model_name)
        self.data_loader = BhagavadGitaDataLoader(cache_dir)
        # Both are populated lazily on first search.
        self.embeddings: Optional[np.ndarray] = None
        self.verses_df: Optional[pd.DataFrame] = None

    def _compute_embeddings(self, force_refresh: bool = False) -> np.ndarray:
        """Return the verse embedding matrix, loading from cache when possible.

        Args:
            force_refresh: If True, recompute even when a cache file exists.

        Returns:
            Array of shape (num_verses, embedding_dim).
        """
        if not force_refresh and self.embeddings_cache_file.exists():
            print("Loading cached embeddings...")
            # NOTE(security): pickle is acceptable only because this file is
            # produced by this same process; never point cache_dir at
            # untrusted data.
            with open(self.embeddings_cache_file, 'rb') as f:
                self.embeddings = pickle.load(f)
            return self.embeddings

        print("Computing embeddings for all verses...")
        self.verses_df = self.data_loader.load_dataset()
        # Embed English and Sanskrit together so a query can match either.
        texts_to_embed = [
            f"{row['english_text']} {row['sanskrit_text']}"
            for _, row in self.verses_df.iterrows()
        ]
        self.embeddings = self.model.encode(texts_to_embed, show_progress_bar=True)
        with open(self.embeddings_cache_file, 'wb') as f:
            pickle.dump(self.embeddings, f)
        print(f"Computed and cached {len(self.embeddings)} embeddings")
        return self.embeddings

    def search_similar_verses(
        self,
        query: str,
        top_k: int = 5,
        min_similarity: float = 0.3,
    ) -> List[Dict[str, Any]]:
        """Return up to ``top_k`` verses most similar to ``query``.

        Args:
            query: Free-text search string.
            top_k: Maximum number of results.
            min_similarity: Cosine-similarity floor; weaker matches are dropped.

        Returns:
            Verse records (dataset row as dict) with an added
            ``similarity_score`` float, ordered best-first.
        """
        if self.embeddings is None:
            self._compute_embeddings()
        # A cache hit in _compute_embeddings leaves verses_df unset;
        # load it here. (Assumes the cached matrix still matches the
        # dataset row order — TODO confirm / invalidate cache on change.)
        if self.verses_df is None:
            self.verses_df = self.data_loader.load_dataset()

        query_embedding = self.model.encode([query])
        similarities = cosine_similarity(query_embedding, self.embeddings)[0]
        top_indices = np.argsort(similarities)[::-1][:top_k]

        results = []
        for idx in top_indices:
            similarity_score = similarities[idx]
            if similarity_score < min_similarity:
                # Indices are sorted descending, so everything after this
                # point is below the floor as well.
                break
            verse_data = self.verses_df.iloc[idx].to_dict()
            verse_data['similarity_score'] = float(similarity_score)
            results.append(verse_data)
        return results

    def search_by_themes(self, themes: List[str], top_k: int = 3) -> List[Dict[str, Any]]:
        """Search several theme strings and merge results by verse.

        Each theme is searched independently; duplicate verses keep their
        best similarity score.

        Args:
            themes: Theme/query strings to search for.
            top_k: Per-theme result count; up to ``2 * top_k`` merged
                results are returned.

        Returns:
            Deduplicated verse records sorted by similarity, best-first.
        """
        all_results = []
        for theme in themes:
            # Lower floor than single-query search: theme phrases are short
            # and match more loosely.
            all_results.extend(
                self.search_similar_verses(theme, top_k=top_k, min_similarity=0.2)
            )

        # Deduplicate by verse_id, keeping the highest-scoring occurrence.
        unique_results: Dict[Any, Dict[str, Any]] = {}
        for result in all_results:
            verse_id = result['verse_id']
            best = unique_results.get(verse_id)
            if best is None or result['similarity_score'] > best['similarity_score']:
                unique_results[verse_id] = result

        sorted_results = sorted(
            unique_results.values(),
            key=lambda x: x['similarity_score'],
            reverse=True,
        )
        return sorted_results[:top_k * 2]

    def get_contextual_verses(
        self,
        problem_description: str,
        emotion_keywords: Optional[List[str]] = None,
        top_k: int = 5,
    ) -> List[Dict[str, Any]]:
        """Find verses relevant to a user's problem plus stock Gita themes.

        Args:
            problem_description: The user's situation in their own words.
            emotion_keywords: Optional extra emotion terms to search.
            top_k: Per-query result count passed to :meth:`search_by_themes`.

        Returns:
            Merged, deduplicated verse records, best-first.
        """
        search_queries = [problem_description]
        if emotion_keywords:
            search_queries.extend(emotion_keywords)
        # Always include core Gita themes so guidance stays on-topic even
        # when the problem description matches nothing directly.
        search_queries.extend([
            "duty dharma purpose life",
            "overcoming fear anxiety doubt",
            "wisdom knowledge enlightenment",
            "action without attachment",
            "finding peace inner strength"
        ])
        return self.search_by_themes(search_queries, top_k=top_k)