Spaces:

mohbay
/

searchcsv2

Running

App Files Files Community

mohbay committed on Jul 1

Commit

89f676e

verified ·

1 Parent(s): 8db3552

Update app.py

Browse files

Files changed (1) hide show

app.py +185 -136

app.py CHANGED Viewed

@@ -1,156 +1,118 @@
-# import gradio as gr
-# import pandas as pd
-# from sentence_transformers import SentenceTransformer, util
-# # Load files
-# df = pd.read_excel("IslamWeb_output.xlsx")
-# df2 = pd.read_excel("JordanFatwas_all.xlsx")
-# # Validate
-# for d, name in [(df, "IslamWeb"), (df2, "JordanFatwas")]:
-#     if not {"question", "link"}.issubset(d.columns):
-#         raise ValueError(f"❌ Missing required columns in {name}")
-# # Load model + encode
-# model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
-# embeddings = model.encode(df["question"].fillna('').tolist(), convert_to_tensor=True)
-# embeddings2 = model.encode(df2["question"].fillna('').tolist(), convert_to_tensor=True)
-# # Define function
-# def search_fatwa(query):
-#     query_embedding = model.encode(query, convert_to_tensor=True)
-#     scores = util.pytorch_cos_sim(query_embedding, embeddings)[0]
-#     top_idx = int(scores.argmax())
-#     scores2 = util.pytorch_cos_sim(query_embedding, embeddings2)[0]
-#     top_idx2 = int(scores2.argmax())
-#     return {
-#         "question1": df.iloc[top_idx]["question"],
-#         "link1": df.iloc[top_idx]["link"],
-#         "question2": df2.iloc[top_idx2]["question"],
-#         "link2": df2.iloc[top_idx2]["link"],
-#     }
-# # Interface
-# iface = gr.Interface(
-#     fn=search_fatwa,
-#     inputs="text",
-#     outputs="json",
-#     allow_flagging="never",
-#     title="Fatwa Search (Dual Source)",
-#     description="Get the most relevant fatwas from both datasets"
-# )
-# iface.launch()
-# import torch
-# import pandas as pd
-# from sentence_transformers import SentenceTransformer, util
-# import gradio as gr
-# model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
-# df = pd.read_csv("cleaned1.csv")
-# df2 = pd.read_csv("cleaned2.csv")
-# embeddings = torch.load("embeddings1.pt")
-# embeddings2 = torch.load("embeddings2.pt")
-# # def search_fatwa(data):
-# #     query = data[0] if data else ""
-# #     query_embedding = model.encode(query, convert_to_tensor=True)
-# #     top_idx = int(util.pytorch_cos_sim(query_embedding, embeddings)[0].argmax())
-# #     top_idx2 = int(util.pytorch_cos_sim(query_embedding, embeddings2)[0].argmax())
-# #     return {
-# #         "question1": df.iloc[top_idx]["question"],
-# #         "link1": df.iloc[top_idx]["link"],
-# #         "question2": df2.iloc[top_idx2]["question"],
-# #         "link2": df2.iloc[top_idx2]["link"]
-# #     }
-# def search_fatwa(data):
-#     query = data[0] if isinstance(data, list) else data
-#     if not query:
-#         return {"question1": "", "link1": "", "question2": "", "link2": ""}
-#     query_embedding = model.encode(query, convert_to_tensor=True)
-#     top_idx = int(util.pytorch_cos_sim(query_embedding, embeddings)[0].argmax())
-#     top_idx2 = int(util.pytorch_cos_sim(query_embedding, embeddings2)[0].argmax())
-#     # return {
-#     #     "question1": df.iloc[top_idx]["question"],
-#     #     "link1": df.iloc[top_idx]["link"],
-#     #     "question2": df2.iloc[top_idx2]["question"],
-#     #     "link2": df2.iloc[top_idx2]["link"]
-#     # }
-#     result = f"""Question 1: {df.iloc[top_idx]["question"]}
-#         Link 1: {df.iloc[top_idx]["link"]}
-#         Question 2: {df2.iloc[top_idx2]["question"]}
-#         Link 2: {df2.iloc[top_idx2]["link"]}"""
-#     return result
-# iface = gr.Interface(
-#     fn=search_fatwa,
-#     inputs=[gr.Textbox(label="text", lines=3)],
-#     outputs="text"  # Changed from "json" to "text"
-# )
-# # iface = gr.Interface(fn=search_fatwa, inputs=[gr.Textbox(label="text", lines=3)], outputs="json")
-# # iface = gr.Interface(
-# #   fn=predict,
-# #   inputs=[gr.Textbox(label="text", lines=3)],
-# #   outputs='text',
-# #   title=title,
-# # )
-# iface.launch()
 import torch
 import pandas as pd
 from sentence_transformers import SentenceTransformer, util
 import gradio as gr
-model = SentenceTransformer("distilbert-base-multilingual-cased")
-# model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
 df = pd.read_csv("cleaned1.csv")
 df2 = pd.read_csv("cleaned2.csv")
 df3 = pd.read_csv("cleaned3.csv")
-embeddings = torch.load("embeddings1_1.pt")
-embeddings2 = torch.load("embeddings2_1.pt")
-embeddings3 = torch.load("embeddings3_1.pt")
-# embeddings = torch.load("embeddings1.pt")
-# embeddings2 = torch.load("embeddings2.pt")
-# embeddings3 = torch.load("embeddings3.pt")
-# Pre-extract DataFrame columns to avoid repeated iloc calls
 df_questions = df["question"].values
 df_links = df["link"].values
 df2_questions = df2["question"].values
 df2_links = df2["link"].values
 df3_questions = df3["question"].values
 df3_links = df3["url"].values
-def predict(text):
     if not text or text.strip() == "":
         return "No query provided"
-    query_embedding = model.encode(text, convert_to_tensor=True)
-    # Compute similarity scores
-    sim_scores1 = util.pytorch_cos_sim(query_embedding, embeddings)[0]
-    sim_scores2 = util.pytorch_cos_sim(query_embedding, embeddings2)[0]
-    sim_scores3 = util.pytorch_cos_sim(query_embedding, embeddings3)[0]
-    # Get top 3 values and indices in one call
     top3_scores1, top3_idx1 = sim_scores1.topk(3)
     top3_scores2, top3_idx2 = sim_scores2.topk(3)
     top3_scores3, top3_idx3 = sim_scores3.topk(3)
     # Convert to CPU once
     top3_idx1_cpu = top3_idx1.cpu().numpy()
     top3_idx2_cpu = top3_idx2.cpu().numpy()
@@ -159,7 +121,91 @@ def predict(text):
     top3_scores1_cpu = top3_scores1.cpu().numpy()
     top3_scores2_cpu = top3_scores2.cpu().numpy()
     top3_scores3_cpu = top3_scores3.cpu().numpy()
-    # Prepare results using pre-extracted arrays
     results = {
         "top2": [
@@ -190,12 +236,15 @@ def predict(text):
     return results
-# Match the EXACT structure of your working translation app
-title = "Search CSV"
 iface = gr.Interface(
-    fn=predict,  # Changed from search_fatwa to predict
-    inputs=[gr.Textbox(label="text", lines=3)],
     outputs='json',
     title=title,
 )
-iface.launch()

 import torch
 import pandas as pd
 from sentence_transformers import SentenceTransformer, util
 import gradio as gr
+import numpy as np
+from transformers import MarianMTModel, MarianTokenizer
+import re
+translator_model_name = "Helsinki-NLP/opus-mt-en-ar"  # checkpoint used by translate_to_arabic (English -> Arabic)
+translator_tokenizer = MarianTokenizer.from_pretrained(translator_model_name)
+translator_model = MarianMTModel.from_pretrained(translator_model_name)
+models = [  # order matters: models[i] is paired by position with entry i of each embeddings list below
+    SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2"),  # models[0]
+    SentenceTransformer("distilbert-base-multilingual-cased"),  # models[1]
+    SentenceTransformer("sentence-transformers/paraphrase-multilingual-mpnet-base-v2")  # models[2]
+]
 df = pd.read_csv("cleaned1.csv")
 df2 = pd.read_csv("cleaned2.csv")
 df3 = pd.read_csv("cleaned3.csv")
+# Precomputed question embeddings for cleaned1.csv, one tensor per entry of `models`, paired by position. NOTE(review): the file-to-model pairing noted below is inferred from earlier revisions (un-suffixed files were loaded with MiniLM, *_1 files with distilbert) - confirm.
+embeddings_list = [
+    torch.load("embeddings1.pt"),    # used with models[0]; presumably produced by paraphrase-multilingual-MiniLM-L12-v2 - TODO confirm
+    torch.load("embeddings1_1.pt"),  # used with models[1]; presumably produced by distilbert-base-multilingual-cased - TODO confirm
+    torch.load("embeddings1_2.pt")   # used with models[2]; presumably produced by paraphrase-multilingual-mpnet-base-v2 - TODO confirm
+]
+embeddings2_list = [
+    torch.load("embeddings2.pt"),    # cleaned2.csv embeddings used with models[0]
+    torch.load("embeddings2_1.pt"),  # cleaned2.csv embeddings used with models[1]
+    torch.load("embeddings2_2.pt")   # cleaned2.csv embeddings used with models[2]
+]
+embeddings3_list = [
+    torch.load("embeddings3.pt"),    # cleaned3.csv embeddings used with models[0]
+    torch.load("embeddings3_1.pt"),  # cleaned3.csv embeddings used with models[1]
+    torch.load("embeddings3_2.pt")   # cleaned3.csv embeddings used with models[2]
+]
+# Pre-extract columns as NumPy arrays so results can be looked up by row position (note: cleaned3.csv keeps its links in a "url" column, not "link")
 df_questions = df["question"].values
 df_links = df["link"].values
 df2_questions = df2["question"].values
 df2_links = df2["link"].values
 df3_questions = df3["question"].values
 df3_links = df3["url"].values
# Compiled once at import time instead of on every call.
# Ranges: Arabic (U+0600-06FF), Arabic Supplement (U+0750-077F),
# Arabic Extended-A (U+08A0-08FF), Arabic Presentation Forms-A (U+FB50-FDFF)
# and Arabic Presentation Forms-B (U+FE70-FEFC). The last range deliberately
# stops before U+FEFF, which is the byte-order mark rather than a letter.
_ARABIC_CHAR_RE = re.compile(
    r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFC]'
)


def is_arabic(text):
    """Return True if *text* contains at least one Arabic-script character.

    A single Arabic character anywhere in the string is enough; mixed
    English/Arabic input therefore counts as Arabic. Presentation-form
    glyphs (ligatures and contextual shapes) are recognised in addition to
    the base Arabic blocks.

    Args:
        text: The string to inspect.

    Returns:
        bool: True when an Arabic character is found, False otherwise
        (including for the empty string).
    """
    return _ARABIC_CHAR_RE.search(text) is not None
def translate_to_arabic(text):
    """Return an Arabic rendering of *text*, translating it when necessary.

    Text that already contains Arabic characters is returned untouched.
    Anything else is run through the module-level MarianMT translator
    (input truncated to 512 tokens, beam search with 4 beams). Translation
    is best-effort: if any step raises, the error is printed and the
    original text is returned so the search can still proceed.

    Args:
        text: The user query.

    Returns:
        The translated string, or *text* itself when it is already Arabic
        or when translation fails.
    """
    if not is_arabic(text):
        try:
            encoded = translator_tokenizer(
                text,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512,
            )
            generated = translator_model.generate(
                **encoded,
                max_length=512,
                num_beams=4,
                early_stopping=True,
            )
            # Only the first (and only) sequence of the batch is decoded.
            return translator_tokenizer.decode(generated[0], skip_special_tokens=True)
        except Exception as e:
            print(f"Translation error: {e}")
    # Already Arabic, or translation failed: fall back to the input as-is.
    return text
def fast_ensemble_similarity(query_text, embeddings_list, models, weights=None):
    """Score a query against a corpus using a weighted ensemble of encoders.

    Each model encodes the query and is compared (cosine similarity) with
    the corpus embeddings at the same position in *embeddings_list*. The
    per-model similarity vectors are combined as a weighted average, i.e.
    the weighted sum divided by the sum of the weights, so the result stays
    on the cosine-similarity scale whatever the weights add up to.

    Args:
        query_text: The query string to encode.
        embeddings_list: Corpus embedding tensors, one per model, in the
            same order as *models*.
        models: Encoders exposing ``encode(text, convert_to_tensor=True)``.
        weights: Optional per-model weights; defaults to equal weighting.

    Returns:
        A 1-D tensor with one ensemble score per corpus row.

    Raises:
        ValueError: If no models are given, if *models*, *embeddings_list*
            and *weights* differ in length, or if the weights sum to zero.
    """
    if weights is None:
        weights = [1.0] * len(models)

    if not models:
        raise ValueError("at least one model is required")
    # zip() would silently drop unmatched entries; fail loudly instead.
    if not (len(models) == len(embeddings_list) == len(weights)):
        raise ValueError(
            "models, embeddings_list and weights must have the same length "
            f"(got {len(models)}, {len(embeddings_list)}, {len(weights)})"
        )
    total_weight = float(sum(weights))
    if total_weight == 0:
        raise ValueError("weights must not sum to zero")

    weighted_scores = []
    for model, embeddings, weight in zip(models, embeddings_list, weights):
        # Each model must be compared against embeddings it produced itself.
        query_embedding = model.encode(query_text, convert_to_tensor=True)
        sim_scores = util.pytorch_cos_sim(query_embedding, embeddings)[0]
        weighted_scores.append(sim_scores * weight)

    # Weighted average: normalise by the weight total, not the model count.
    return torch.stack(weighted_scores).sum(dim=0) / total_weight
def _top_hits(scores, questions, links, k=3):
    """Return the *k* best-scoring rows as JSON-friendly dicts, best first."""
    # Never ask topk for more rows than the dataset has.
    k = min(k, scores.shape[0])
    top_scores, top_idx = scores.topk(k)
    # Move to CPU/NumPy once rather than per element.
    top_idx = top_idx.cpu().numpy()
    top_scores = top_scores.cpu().numpy()
    return [
        {
            "question": questions[idx],
            "link": links[idx],
            "score": float(score),
        }
        for idx, score in zip(top_idx, top_scores)
    ]


def predict_with_translation(text):
    """Search the three question datasets for the best matches to *text*.

    The query is translated to Arabic when it contains no Arabic characters,
    scored against every dataset with the model ensemble, and the three
    best hits per dataset are returned.

    Args:
        text: The user query (English or Arabic).

    Returns:
        The string ``"No query provided"`` for empty or whitespace-only
        input. Otherwise a dict with keys ``"top2"``, ``"top3"`` and
        ``"top1"`` (in that order), each a list of up to three
        ``{"question", "link", "score"}`` dicts drawn from the second,
        third and first dataset respectively.
    """
    if not text or not text.strip():
        return "No query provided"

    # translate_to_arabic returns the input unchanged when it is already
    # Arabic or when translation fails, so its result is always the text
    # to search with.
    search_text = translate_to_arabic(text)

    # Model weights - adjust based on your testing
    model_weights = [0.35, 0.4, 0.25]

    # Each dataset has its own per-model embeddings list, aligned with `models`.
    sim_scores1 = fast_ensemble_similarity(search_text, embeddings_list, models, model_weights)
    sim_scores2 = fast_ensemble_similarity(search_text, embeddings2_list, models, model_weights)
    sim_scores3 = fast_ensemble_similarity(search_text, embeddings3_list, models, model_weights)

    # Key order is kept as "top2", "top3", "top1" to match the existing output.
    return {
        "top2": _top_hits(sim_scores2, df2_questions, df2_links),
        "top3": _top_hits(sim_scores3, df3_questions, df3_links),
        "top1": _top_hits(sim_scores1, df_questions, df_links),
    }
+# Alternative version with dynamic model loading (saves memory)
+def predict_dynamic(text):
+    """
+    Alternative approach: encode with multiple models on-the-fly
+    Uses more computation but less memory
+    """
+    if not text or text.strip() == "":
+        return "No query provided"
+    # Load your original embeddings (generated with first model)
+    embeddings1 = torch.load("embeddings1_1.pt")
+    embeddings2 = torch.load("embeddings2_1.pt")
+    embeddings3 = torch.load("embeddings3_1.pt")
+    model_weights = [0.4, 0.35, 0.25]
+    # Calculate ensemble scores for each dataset
+    all_sim_scores1 = []
+    all_sim_scores2 = []
+    all_sim_scores3 = []
+    for i, model in enumerate(models):
+        query_embedding = model.encode(text, convert_to_tensor=True)
+        # For this example, using same embeddings for all models
+        # In practice, you'd want different embeddings for each model
+        sim1 = util.pytorch_cos_sim(query_embedding, embeddings1)[0] * model_weights[i]
+        sim2 = util.pytorch_cos_sim(query_embedding, embeddings2)[0] * model_weights[i]
+        sim3 = util.pytorch_cos_sim(query_embedding, embeddings3)[0] * model_weights[i]
+        all_sim_scores1.append(sim1)
+        all_sim_scores2.append(sim2)
+        all_sim_scores3.append(sim3)
+    # Combine scores
+    final_scores1 = torch.stack(all_sim_scores1).mean(dim=0)
+    final_scores2 = torch.stack(all_sim_scores2).mean(dim=0)
+    final_scores3 = torch.stack(all_sim_scores3).mean(dim=0)
+    # Get top results
+    top3_scores1, top3_idx1 = final_scores1.topk(3)
+    top3_scores2, top3_idx2 = final_scores2.topk(3)
+    top3_scores3, top3_idx3 = final_scores3.topk(3)
+    # Convert and format results (same as before)
+    top3_idx1_cpu = top3_idx1.cpu().numpy()
+    top3_idx2_cpu = top3_idx2.cpu().numpy()
+    top3_idx3_cpu = top3_idx3.cpu().numpy()
+    top3_scores1_cpu = top3_scores1.cpu().numpy()
+    top3_scores2_cpu = top3_scores2.cpu().numpy()
+    top3_scores3_cpu = top3_scores3.cpu().numpy()
     results = {
         "top2": [
     return results
+# Build the Gradio UI: a single multi-line text box in, JSON out
+title = "Enhanced Multi-Model Search with Translation"
 iface = gr.Interface(
+    fn=predict_with_translation,  # translates non-Arabic queries to Arabic before searching
+    inputs=[gr.Textbox(label="Enter your question (English or Arabic)", lines=3)],
     outputs='json',
     title=title,
+    description="Ask questions in English or Arabic. English queries will be translated to Arabic for better matching."
 )
+if __name__ == "__main__":  # start the server only when run as a script, not when imported
+    iface.launch()