import os
import pickle
from pathlib import Path
from typing import Any, Dict, List, Optional

import pandas as pd

try:
    from datasets import load_dataset
except ImportError:
    # The HuggingFace package is only needed for the download path; keep the
    # module importable so an existing cache can still be served without it.
    load_dataset = None


class BhagavadGitaDataLoader:
    """Load, cache and query the Bhagavad Gita verse dataset.

    The dataset is downloaded from HuggingFace on first use, normalised into a
    pandas DataFrame and pickled under ``cache_dir`` so later runs can read it
    from disk.
    """

    def __init__(self, cache_dir: str = "cache"):
        """Create the cache directory and set up the (not yet loaded) state.

        Args:
            cache_dir: Directory that holds the pickled DataFrame. It is
                created, including missing parent directories, if absent.
        """
        self.cache_dir = Path(cache_dir)
        # parents=True so a nested path such as "var/cache/gita" works too.
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        self.data_cache_file = self.cache_dir / "bhagavad_gita_data.pkl"
        self.dataset: Optional[pd.DataFrame] = None

    def load_dataset(self, force_refresh: bool = False) -> pd.DataFrame:
        """Return the verse DataFrame, from the cache when possible.

        Args:
            force_refresh: When true, ignore any cache file and download again.

        Returns:
            The loaded DataFrame, which is also stored on ``self.dataset``.

        Raises:
            ImportError: If a download is needed but the ``datasets`` package
                is not installed.
        """
        if not force_refresh and self.data_cache_file.exists():
            cached = self._read_cache()
            if cached is not None:
                self.dataset = cached
                return cached

        df = self._download()
        self._write_cache(df)
        self.dataset = df
        return df

    def _read_cache(self) -> Optional[pd.DataFrame]:
        """Unpickle the cache file; return None if it is truncated or corrupt."""
        print("Loading cached dataset...")
        try:
            # NOTE(security): pickle executes arbitrary code while loading.
            # Only point cache_dir at a location this application controls.
            with open(self.data_cache_file, 'rb') as f:
                return pickle.load(f)
        except (pickle.UnpicklingError, EOFError):
            print("Cached dataset is unreadable; downloading a fresh copy...")
            return None

    def _download(self) -> pd.DataFrame:
        """Fetch the dataset from HuggingFace and normalise its columns."""
        # Inside a method the bare name resolves to the module-level import,
        # not to this class's load_dataset method.
        if load_dataset is None:
            raise ImportError(
                "The 'datasets' package is required to download the dataset; "
                "install it with `pip install datasets`."
            )

        print("Downloading dataset from HuggingFace...")
        dataset = load_dataset("JDhruv14/Bhagavad-Gita_Dataset")
        df = pd.DataFrame(dataset['train'])

        df = df.rename(columns={
            'chapter': 'chapter_num',
            'verse': 'verse_num',
            'sanskrit': 'sanskrit_text',
            'hindi': 'hindi_text',
            'english': 'english_text',
        })

        df['verse_id'] = (
            df['chapter_num'].astype(str) + '.' + df['verse_num'].astype(str)
        )
        # fillna('') so a verse missing one language keeps the other one
        # instead of turning the whole combined text into NaN.
        df['combined_text'] = (
            df['english_text'].fillna('') + ' ' + df['sanskrit_text'].fillna('')
        )
        return df

    def _write_cache(self, df: pd.DataFrame) -> None:
        """Pickle ``df`` to the cache file via a temp file and a rename."""
        # Writing to a sibling file first means an interrupted run cannot
        # leave a half-written cache behind under the real name.
        tmp_file = self.data_cache_file.with_suffix(".pkl.tmp")
        with open(tmp_file, 'wb') as f:
            pickle.dump(df, f)
        tmp_file.replace(self.data_cache_file)

    def _ensure_loaded(self) -> pd.DataFrame:
        """Load the dataset on first access and return it."""
        if self.dataset is None:
            self.load_dataset()
        return self.dataset

    def get_verse_by_id(self, verse_id: str) -> Optional[Dict[str, Any]]:
        """Return the verse whose ``verse_id`` equals ``verse_id``.

        Args:
            verse_id: Identifier in ``"<chapter>.<verse>"`` form, e.g. ``"2.47"``.

        Returns:
            The first matching row as a dict, or None when nothing matches.
        """
        df = self._ensure_loaded()
        verse_row = df[df['verse_id'] == verse_id]
        if verse_row.empty:
            return None
        return verse_row.iloc[0].to_dict()

    def get_verses_by_chapter(self, chapter_num: int) -> List[Dict[str, Any]]:
        """Return every verse of one chapter as a list of row dicts.

        Args:
            chapter_num: Value compared for equality with the ``chapter_num``
                column.

        Returns:
            One dict per matching row, in dataset order; empty if none match.
        """
        df = self._ensure_loaded()
        chapter_verses = df[df['chapter_num'] == chapter_num]
        return chapter_verses.to_dict('records')

    def search_verses(self, query_text: str, top_k: int = 5) -> List[Dict[str, Any]]:
        """Rank verses by how many query words occur in their English text.

        Matching is a case-insensitive substring test per whitespace-separated
        query word; a verse's score is the number of query words found.

        Args:
            query_text: Free-text query.
            top_k: Maximum number of verses to return.

        Returns:
            Up to ``top_k`` row dicts, highest score first; ties keep dataset
            order. Empty when the query has no words, ``top_k`` is not
            positive, or nothing matches.
        """
        df = self._ensure_loaded()

        # Split once; the word list is the same for every row.
        terms = (query_text or "").lower().split()
        if not terms or top_k <= 0:
            return []

        scored = []
        for position, text in enumerate(df['english_text'].tolist()):
            # Rows with a missing translation (None/NaN) cannot match.
            if not isinstance(text, str):
                continue
            haystack = text.lower()
            score = sum(1 for term in terms if term in haystack)
            if score:
                scored.append((score, position))

        # list.sort is stable, so equal scores stay in dataset order.
        scored.sort(key=lambda item: item[0], reverse=True)
        # Only the rows actually returned are converted to dicts.
        return [df.iloc[position].to_dict() for _, position in scored[:top_k]]

    def get_all_verses(self) -> pd.DataFrame:
        """Return the full verse DataFrame, loading it first if needed."""
        return self._ensure_loaded()