# gita_krishna_bot / semantic_search.py
# Author: Kartheek Akella
# Initial working commit (9e4c237)
import pickle
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

from data_loader import BhagavadGitaDataLoader
class SemanticSearch:
    """Semantic search over Bhagavad Gita verses using sentence embeddings.

    Verse embeddings are computed once per model and cached on disk as a
    pickle; queries are embedded on the fly and ranked by cosine similarity.
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2", cache_dir: str = "cache"):
        """Create a searcher backed by the given sentence-transformer model.

        Args:
            model_name: Hugging Face model id passed to SentenceTransformer.
            cache_dir: Directory for the embeddings pickle and the data
                loader's cache; created if missing.
        """
        self.model_name = model_name
        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(exist_ok=True)
        # One cache file per model name ('/' sanitized) so switching models
        # never reuses stale vectors.
        self.embeddings_cache_file = self.cache_dir / f"embeddings_{model_name.replace('/', '_')}.pkl"
        self.model = SentenceTransformer(model_name)
        self.data_loader = BhagavadGitaDataLoader(cache_dir)
        # Both are populated lazily on first search.
        self.embeddings: Optional[np.ndarray] = None
        self.verses_df: Optional[pd.DataFrame] = None

    def _compute_embeddings(self, force_refresh: bool = False) -> np.ndarray:
        """Load cached verse embeddings, or compute and cache them.

        Args:
            force_refresh: When True, ignore the on-disk cache and re-encode.

        Returns:
            Array of one embedding row per verse, in dataset order.
        """
        if not force_refresh and self.embeddings_cache_file.exists():
            print("Loading cached embeddings...")
            # NOTE(review): pickle is only safe because this cache is written
            # by us below; never point cache_dir at untrusted files.
            with open(self.embeddings_cache_file, 'rb') as f:
                self.embeddings = pickle.load(f)
            return self.embeddings

        print("Computing embeddings for all verses...")
        self.verses_df = self.data_loader.load_dataset()
        # Embed English and Sanskrit text together so queries in either
        # register can match a verse.
        texts_to_embed = [
            f"{row['english_text']} {row['sanskrit_text']}"
            for _, row in self.verses_df.iterrows()
        ]
        self.embeddings = self.model.encode(texts_to_embed, show_progress_bar=True)
        with open(self.embeddings_cache_file, 'wb') as f:
            pickle.dump(self.embeddings, f)
        print(f"Computed and cached {len(self.embeddings)} embeddings")
        return self.embeddings

    @staticmethod
    def _cosine_similarities(query_vec: np.ndarray, matrix: np.ndarray) -> np.ndarray:
        """Cosine similarity of one query vector against each row of `matrix`.

        Equivalent to sklearn's cosine_similarity for this 1-vs-many case,
        including the guard that maps zero-norm rows to similarity 0 instead
        of dividing by zero.
        """
        matrix = np.asarray(matrix, dtype=np.float64)
        query_vec = np.asarray(query_vec, dtype=np.float64).ravel()
        norms = np.linalg.norm(matrix, axis=1) * np.linalg.norm(query_vec)
        norms = np.where(norms == 0.0, 1.0, norms)
        return (matrix @ query_vec) / norms

    def search_similar_verses(self, query: str, top_k: int = 5, min_similarity: float = 0.3) -> List[Dict[str, Any]]:
        """Return up to `top_k` verses most similar to `query`.

        Args:
            query: Free-text search query.
            top_k: Maximum number of verses to return.
            min_similarity: Verses scoring below this cosine similarity are
                dropped, so fewer than `top_k` results may be returned.

        Returns:
            Verse rows as dicts (dataset columns plus 'similarity_score'),
            ordered by descending similarity.
        """
        if self.embeddings is None:
            self._compute_embeddings()
        if self.verses_df is None:
            self.verses_df = self.data_loader.load_dataset()

        query_embedding = np.asarray(self.model.encode([query]))[0]
        similarities = self._cosine_similarities(query_embedding, self.embeddings)
        top_indices = np.argsort(similarities)[::-1][:top_k]

        results: List[Dict[str, Any]] = []
        for idx in top_indices:
            score = float(similarities[idx])
            # Indices are in descending-score order, so the first score below
            # the threshold means no later index can qualify either.
            if score < min_similarity:
                break
            verse_data = self.verses_df.iloc[idx].to_dict()
            verse_data['similarity_score'] = score
            results.append(verse_data)
        return results

    def search_by_themes(self, themes: List[str], top_k: int = 3) -> List[Dict[str, Any]]:
        """Search each theme separately and merge the results.

        Duplicated verses keep their best score across themes. Returns at
        most `top_k * 2` verses, ordered by descending similarity.
        """
        all_results: List[Dict[str, Any]] = []
        for theme in themes:
            # Lower threshold than the single-query default: theme phrases
            # are short and tend to score lower.
            all_results.extend(
                self.search_similar_verses(theme, top_k=top_k, min_similarity=0.2)
            )

        # Deduplicate by verse_id, keeping the highest-scoring occurrence.
        unique_results: Dict[Any, Dict[str, Any]] = {}
        for result in all_results:
            verse_id = result['verse_id']
            best = unique_results.get(verse_id)
            if best is None or result['similarity_score'] > best['similarity_score']:
                unique_results[verse_id] = result

        sorted_results = sorted(
            unique_results.values(),
            key=lambda x: x['similarity_score'],
            reverse=True,
        )
        return sorted_results[:top_k * 2]

    def get_contextual_verses(self, problem_description: str, emotion_keywords: Optional[List[str]] = None, top_k: int = 5) -> List[Dict[str, Any]]:
        """Find verses relevant to a user's problem description.

        Combines the problem text, optional emotion keywords, and a fixed set
        of core Gita themes into one multi-query theme search.

        Args:
            problem_description: Free-text description of the user's situation.
            emotion_keywords: Optional extra keywords (e.g. detected emotions).
            top_k: Passed through to search_by_themes (which may return up to
                2 * top_k verses).
        """
        search_queries = [problem_description]
        if emotion_keywords:
            search_queries.extend(emotion_keywords)
        # Always include the Gita's recurring themes so results stay grounded
        # in the text even for unusual queries.
        search_queries.extend([
            "duty dharma purpose life",
            "overcoming fear anxiety doubt",
            "wisdom knowledge enlightenment",
            "action without attachment",
            "finding peace inner strength",
        ])
        return self.search_by_themes(search_queries, top_k=top_k)