mohbay committed on
Commit
89f676e
·
verified ·
1 Parent(s): 8db3552

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +185 -136
app.py CHANGED
@@ -1,156 +1,118 @@
1
- # import gradio as gr
2
- # import pandas as pd
3
- # from sentence_transformers import SentenceTransformer, util
4
-
5
- # # Load files
6
- # df = pd.read_excel("IslamWeb_output.xlsx")
7
- # df2 = pd.read_excel("JordanFatwas_all.xlsx")
8
-
9
- # # Validate
10
- # for d, name in [(df, "IslamWeb"), (df2, "JordanFatwas")]:
11
- # if not {"question", "link"}.issubset(d.columns):
12
- # raise ValueError(f"❌ Missing required columns in {name}")
13
-
14
- # # Load model + encode
15
- # model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
16
- # embeddings = model.encode(df["question"].fillna('').tolist(), convert_to_tensor=True)
17
- # embeddings2 = model.encode(df2["question"].fillna('').tolist(), convert_to_tensor=True)
18
-
19
- # # Define function
20
- # def search_fatwa(query):
21
- # query_embedding = model.encode(query, convert_to_tensor=True)
22
-
23
- # scores = util.pytorch_cos_sim(query_embedding, embeddings)[0]
24
- # top_idx = int(scores.argmax())
25
-
26
- # scores2 = util.pytorch_cos_sim(query_embedding, embeddings2)[0]
27
- # top_idx2 = int(scores2.argmax())
28
-
29
- # return {
30
- # "question1": df.iloc[top_idx]["question"],
31
- # "link1": df.iloc[top_idx]["link"],
32
- # "question2": df2.iloc[top_idx2]["question"],
33
- # "link2": df2.iloc[top_idx2]["link"],
34
- # }
35
-
36
- # # Interface
37
- # iface = gr.Interface(
38
- # fn=search_fatwa,
39
- # inputs="text",
40
- # outputs="json",
41
- # allow_flagging="never",
42
- # title="Fatwa Search (Dual Source)",
43
- # description="Get the most relevant fatwas from both datasets"
44
- # )
45
-
46
- # iface.launch()
47
-
48
-
49
- # import torch
50
- # import pandas as pd
51
- # from sentence_transformers import SentenceTransformer, util
52
- # import gradio as gr
53
-
54
- # model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
55
- # df = pd.read_csv("cleaned1.csv")
56
- # df2 = pd.read_csv("cleaned2.csv")
57
- # embeddings = torch.load("embeddings1.pt")
58
- # embeddings2 = torch.load("embeddings2.pt")
59
-
60
- # # def search_fatwa(data):
61
- # # query = data[0] if data else ""
62
- # # query_embedding = model.encode(query, convert_to_tensor=True)
63
- # # top_idx = int(util.pytorch_cos_sim(query_embedding, embeddings)[0].argmax())
64
- # # top_idx2 = int(util.pytorch_cos_sim(query_embedding, embeddings2)[0].argmax())
65
- # # return {
66
- # # "question1": df.iloc[top_idx]["question"],
67
- # # "link1": df.iloc[top_idx]["link"],
68
- # # "question2": df2.iloc[top_idx2]["question"],
69
- # # "link2": df2.iloc[top_idx2]["link"]
70
- # # }
71
-
72
- # def search_fatwa(data):
73
- # query = data[0] if isinstance(data, list) else data
74
- # if not query:
75
- # return {"question1": "", "link1": "", "question2": "", "link2": ""}
76
- # query_embedding = model.encode(query, convert_to_tensor=True)
77
- # top_idx = int(util.pytorch_cos_sim(query_embedding, embeddings)[0].argmax())
78
- # top_idx2 = int(util.pytorch_cos_sim(query_embedding, embeddings2)[0].argmax())
79
- # # return {
80
- # # "question1": df.iloc[top_idx]["question"],
81
- # # "link1": df.iloc[top_idx]["link"],
82
- # # "question2": df2.iloc[top_idx2]["question"],
83
- # # "link2": df2.iloc[top_idx2]["link"]
84
- # # }
85
- # result = f"""Question 1: {df.iloc[top_idx]["question"]}
86
- # Link 1: {df.iloc[top_idx]["link"]}
87
-
88
- # Question 2: {df2.iloc[top_idx2]["question"]}
89
- # Link 2: {df2.iloc[top_idx2]["link"]}"""
90
- # return result
91
-
92
- # iface = gr.Interface(
93
- # fn=search_fatwa,
94
- # inputs=[gr.Textbox(label="text", lines=3)],
95
- # outputs="text" # Changed from "json" to "text"
96
- # )
97
-
98
- # # iface = gr.Interface(fn=search_fatwa, inputs=[gr.Textbox(label="text", lines=3)], outputs="json")
99
-
100
-
101
-
102
-
103
- # # iface = gr.Interface(
104
- # # fn=predict,
105
- # # inputs=[gr.Textbox(label="text", lines=3)],
106
- # # outputs='text',
107
- # # title=title,
108
- # # )
109
-
110
- # iface.launch()
111
-
112
-
113
  import torch
114
  import pandas as pd
115
  from sentence_transformers import SentenceTransformer, util
116
  import gradio as gr
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
 
118
- model = SentenceTransformer("distilbert-base-multilingual-cased")
119
- # model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
120
  df = pd.read_csv("cleaned1.csv")
121
  df2 = pd.read_csv("cleaned2.csv")
122
  df3 = pd.read_csv("cleaned3.csv")
123
 
124
- embeddings = torch.load("embeddings1_1.pt")
125
- embeddings2 = torch.load("embeddings2_1.pt")
126
- embeddings3 = torch.load("embeddings3_1.pt")
127
-
128
- # embeddings = torch.load("embeddings1.pt")
129
- # embeddings2 = torch.load("embeddings2.pt")
130
- # embeddings3 = torch.load("embeddings3.pt")
131
-
132
- # Pre-extract DataFrame columns to avoid repeated iloc calls
 
 
 
 
 
 
 
 
 
 
 
133
  df_questions = df["question"].values
134
  df_links = df["link"].values
135
  df2_questions = df2["question"].values
136
  df2_links = df2["link"].values
137
  df3_questions = df3["question"].values
138
  df3_links = df3["url"].values
139
- def predict(text):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
140
  if not text or text.strip() == "":
141
  return "No query provided"
142
 
143
- query_embedding = model.encode(text, convert_to_tensor=True)
 
144
 
145
- # Compute similarity scores
146
- sim_scores1 = util.pytorch_cos_sim(query_embedding, embeddings)[0]
147
- sim_scores2 = util.pytorch_cos_sim(query_embedding, embeddings2)[0]
148
- sim_scores3 = util.pytorch_cos_sim(query_embedding, embeddings3)[0]
149
 
150
- # Get top 3 values and indices in one call
 
 
 
 
 
 
 
 
 
151
  top3_scores1, top3_idx1 = sim_scores1.topk(3)
152
  top3_scores2, top3_idx2 = sim_scores2.topk(3)
153
  top3_scores3, top3_idx3 = sim_scores3.topk(3)
 
154
  # Convert to CPU once
155
  top3_idx1_cpu = top3_idx1.cpu().numpy()
156
  top3_idx2_cpu = top3_idx2.cpu().numpy()
@@ -159,7 +121,91 @@ def predict(text):
159
  top3_scores1_cpu = top3_scores1.cpu().numpy()
160
  top3_scores2_cpu = top3_scores2.cpu().numpy()
161
  top3_scores3_cpu = top3_scores3.cpu().numpy()
162
- # Prepare results using pre-extracted arrays
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
163
  results = {
164
 
165
  "top2": [
@@ -190,12 +236,15 @@ def predict(text):
190
 
191
  return results
192
 
193
- # Match the EXACT structure of your working translation app
194
- title = "Search CSV"
195
  iface = gr.Interface(
196
- fn=predict, # Changed from search_fatwa to predict
197
- inputs=[gr.Textbox(label="text", lines=3)],
198
  outputs='json',
199
  title=title,
 
200
  )
201
- iface.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import torch
2
  import pandas as pd
3
  from sentence_transformers import SentenceTransformer, util
4
  import gradio as gr
5
+ import numpy as np
6
+ from transformers import MarianMTModel, MarianTokenizer
7
+ import re
8
+
9
+ translator_model_name = "Helsinki-NLP/opus-mt-en-ar"
10
+ translator_tokenizer = MarianTokenizer.from_pretrained(translator_model_name)
11
+ translator_model = MarianMTModel.from_pretrained(translator_model_name)
12
+
13
+ models = [
14
+ SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2"),
15
+ SentenceTransformer("distilbert-base-multilingual-cased"),
16
+ SentenceTransformer("sentence-transformers/paraphrase-multilingual-mpnet-base-v2")
17
+ ]
18
+
19
 
 
 
20
  df = pd.read_csv("cleaned1.csv")
21
  df2 = pd.read_csv("cleaned2.csv")
22
  df3 = pd.read_csv("cleaned3.csv")
23
 
24
+ # Load embeddings for each model - match embeddings to their corresponding models
25
+ embeddings_list = [
26
+ torch.load("embeddings1.pt"), # Model 1 embeddings (paraphrase-multilingual-MiniLM-L12-v2)
27
+ torch.load("embeddings1_1.pt"), # Model 2 embeddings (distilbert-base-multilingual-cased)
28
+ torch.load("embeddings1_2.pt") # Model 3 embeddings (paraphrase-multilingual-mpnet-base-v2)
29
+ ]
30
+
31
+ embeddings2_list = [
32
+ torch.load("embeddings2.pt"), # Model 1 embeddings
33
+ torch.load("embeddings2_1.pt"), # Model 2 embeddings
34
+ torch.load("embeddings2_2.pt") # Model 3 embeddings
35
+ ]
36
+
37
+ embeddings3_list = [
38
+ torch.load("embeddings3.pt"), # Model 1 embeddings
39
+ torch.load("embeddings3_1.pt"), # Model 2 embeddings
40
+ torch.load("embeddings3_2.pt") # Model 3 embeddings
41
+ ]
42
+
43
+ # Pre-extract DataFrame columns
44
  df_questions = df["question"].values
45
  df_links = df["link"].values
46
  df2_questions = df2["question"].values
47
  df2_links = df2["link"].values
48
  df3_questions = df3["question"].values
49
  df3_links = df3["url"].values
50
+
51
# Compiled once at import time instead of on every call.
# Ranges: Arabic (U+0600-06FF), Arabic Supplement (U+0750-077F),
# Arabic Extended-A (U+08A0-08FF), Arabic Presentation Forms-A (U+FB50-FDFF)
# and Presentation Forms-B (U+FE70-FEFC). U+FEFF is left out on purpose: it is
# the byte-order mark, which can prefix otherwise non-Arabic text.
_ARABIC_CHAR_RE = re.compile(
    r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFC]'
)


def is_arabic(text):
    """Return True if ``text`` contains at least one Arabic-script character.

    A single Arabic character anywhere in the string is enough, so mixed
    Arabic/Latin text counts as Arabic. Presentation-form code points (as
    produced by some legacy encoders and PDF extractors) are recognised too.

    Args:
        text: The string to inspect.

    Returns:
        bool: True when an Arabic character is found, False otherwise
        (including for the empty string).
    """
    return _ARABIC_CHAR_RE.search(text) is not None
55
+
56
def translate_to_arabic(text):
    """Return an Arabic version of ``text`` for use as a search query.

    Text for which ``is_arabic`` is true is returned untouched. Anything else
    is passed through the module-level ``translator_tokenizer`` /
    ``translator_model`` pair with beam search. Translation is best-effort:
    if any step raises, the error is printed and the original ``text`` is
    returned so the search can still run.

    Args:
        text: The user query.

    Returns:
        The translated string, or ``text`` itself when it is already Arabic
        or translation failed.
    """
    if not is_arabic(text):
        try:
            encoded = translator_tokenizer(
                text,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512,
            )
            generated = translator_model.generate(
                **encoded,
                max_length=512,
                num_beams=4,
                early_stopping=True,
            )
            # Only one input sequence was supplied, so decode the first output.
            return translator_tokenizer.decode(generated[0], skip_special_tokens=True)
        except Exception as e:
            print(f"Translation error: {e}")

    # Either the text was already Arabic or translation failed.
    return text
70
+
71
def fast_ensemble_similarity(query_text, embeddings_list, models, weights=None):
    """Score ``query_text`` against a corpus with a weighted model ensemble.

    The i-th model encodes the query and is compared (cosine similarity) with
    the i-th entry of ``embeddings_list``. The per-model score vectors are
    combined as a weighted average, i.e. ``sum(w_i * s_i) / sum(w_i)``, so the
    result is on the same scale as a single cosine similarity whatever the
    weights sum to. With the default equal weights this is the plain mean.

    Args:
        query_text: The query string to encode.
        embeddings_list: One corpus embedding tensor per model, in the same
            order as ``models``.
        models: Objects exposing ``encode(text, convert_to_tensor=True)``.
        weights: Optional per-model weights; defaults to equal weighting.

    Returns:
        A 1-D tensor with one combined score per corpus row.

    Raises:
        ValueError: If no model is given, if ``models``, ``embeddings_list``
            and ``weights`` differ in length, or if the weights sum to zero.
    """
    if weights is None:
        weights = [1.0] * len(models)

    if not models:
        raise ValueError("at least one model is required")
    # zip() would silently drop the surplus entries, hiding a mis-wired ensemble.
    if not (len(models) == len(embeddings_list) == len(weights)):
        raise ValueError(
            "models, embeddings_list and weights must have the same length "
            f"(got {len(models)}, {len(embeddings_list)}, {len(weights)})"
        )

    total_weight = float(sum(weights))
    if total_weight == 0:
        raise ValueError("weights must not sum to zero")

    weighted_scores = []
    for model, embeddings, weight in zip(models, embeddings_list, weights):
        # Each model is only ever compared with its own corpus embeddings.
        query_embedding = model.encode(query_text, convert_to_tensor=True)
        sim_scores = util.pytorch_cos_sim(query_embedding, embeddings)[0]
        weighted_scores.append(sim_scores * weight)

    # Normalise by the weight total (not the model count) to get a true
    # weighted average; ranking is unaffected, only the score scale.
    return torch.stack(weighted_scores).sum(dim=0) / total_weight
90
+
91
+ def predict_with_translation(text):
92
+ """Fast prediction with translation support - correctly matches models with embeddings"""
93
  if not text or text.strip() == "":
94
  return "No query provided"
95
 
96
+ # Translate to Arabic if needed
97
+ arabic_text = translate_to_arabic(text)
98
 
99
+ # Model weights - adjust based on your testing
100
+ model_weights = [0.35, 0.4, 0.25]
 
 
101
 
102
+ # Choose which text to use for search (NOTE: the expression below always evaluates to arabic_text, so the conditional is redundant)
103
+ search_text = arabic_text if arabic_text != text else text
104
+
105
+ # Fast ensemble similarity calculation for each dataset
106
+ # Each model now uses its correct corresponding embeddings
107
+ sim_scores1 = fast_ensemble_similarity(search_text, embeddings_list, models, model_weights)
108
+ sim_scores2 = fast_ensemble_similarity(search_text, embeddings2_list, models, model_weights)
109
+ sim_scores3 = fast_ensemble_similarity(search_text, embeddings3_list, models, model_weights)
110
+
111
+ # Get top 3 results efficiently
112
  top3_scores1, top3_idx1 = sim_scores1.topk(3)
113
  top3_scores2, top3_idx2 = sim_scores2.topk(3)
114
  top3_scores3, top3_idx3 = sim_scores3.topk(3)
115
+
116
  # Convert to CPU once
117
  top3_idx1_cpu = top3_idx1.cpu().numpy()
118
  top3_idx2_cpu = top3_idx2.cpu().numpy()
 
121
  top3_scores1_cpu = top3_scores1.cpu().numpy()
122
  top3_scores2_cpu = top3_scores2.cpu().numpy()
123
  top3_scores3_cpu = top3_scores3.cpu().numpy()
124
+
125
+ # Format results
126
+ results = {
127
+ "top2": [
128
+ {
129
+ "question": df2_questions[idx],
130
+ "link": df2_links[idx],
131
+ "score": float(score)
132
+ }
133
+ for idx, score in zip(top3_idx2_cpu, top3_scores2_cpu)
134
+ ],
135
+ "top3": [
136
+ {
137
+ "question": df3_questions[idx],
138
+ "link": df3_links[idx],
139
+ "score": float(score)
140
+ }
141
+ for idx, score in zip(top3_idx3_cpu, top3_scores3_cpu)
142
+ ],
143
+ "top1": [
144
+ {
145
+ "question": df_questions[idx],
146
+ "link": df_links[idx],
147
+ "score": float(score)
148
+ }
149
+ for idx, score in zip(top3_idx1_cpu, top3_scores1_cpu)
150
+ ],
151
+ }
152
+
153
+ return results
154
+
155
+
156
+ # Alternative version with dynamic model loading (saves memory)
157
+ def predict_dynamic(text):
158
+ """
159
+ Alternative approach: encode with multiple models on-the-fly
160
+ Uses more computation but less memory
161
+ """
162
+ if not text or text.strip() == "":
163
+ return "No query provided"
164
+
165
+ # Load the *_1.pt embeddings (the previous revision paired these with distilbert-base-multilingual-cased, which is models[1], not the first model)
166
+ embeddings1 = torch.load("embeddings1_1.pt")
167
+ embeddings2 = torch.load("embeddings2_1.pt")
168
+ embeddings3 = torch.load("embeddings3_1.pt")
169
+
170
+ model_weights = [0.4, 0.35, 0.25]
171
+
172
+ # Calculate ensemble scores for each dataset
173
+ all_sim_scores1 = []
174
+ all_sim_scores2 = []
175
+ all_sim_scores3 = []
176
+
177
+ for i, model in enumerate(models):
178
+ query_embedding = model.encode(text, convert_to_tensor=True)
179
+
180
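+ # WARNING: the same embeddings are used for all models here, although they were produced by only one of them
181
+ # Cosine scores are only meaningful when query and corpus are encoded by the same model, so use per-model embeddings (as predict_with_translation does)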
182
+ sim1 = util.pytorch_cos_sim(query_embedding, embeddings1)[0] * model_weights[i]
183
+ sim2 = util.pytorch_cos_sim(query_embedding, embeddings2)[0] * model_weights[i]
184
+ sim3 = util.pytorch_cos_sim(query_embedding, embeddings3)[0] * model_weights[i]
185
+
186
+ all_sim_scores1.append(sim1)
187
+ all_sim_scores2.append(sim2)
188
+ all_sim_scores3.append(sim3)
189
+
190
+ # Combine scores
191
+ final_scores1 = torch.stack(all_sim_scores1).mean(dim=0)
192
+ final_scores2 = torch.stack(all_sim_scores2).mean(dim=0)
193
+ final_scores3 = torch.stack(all_sim_scores3).mean(dim=0)
194
+
195
+ # Get top results
196
+ top3_scores1, top3_idx1 = final_scores1.topk(3)
197
+ top3_scores2, top3_idx2 = final_scores2.topk(3)
198
+ top3_scores3, top3_idx3 = final_scores3.topk(3)
199
+
200
+ # Convert and format results (same as before)
201
+ top3_idx1_cpu = top3_idx1.cpu().numpy()
202
+ top3_idx2_cpu = top3_idx2.cpu().numpy()
203
+ top3_idx3_cpu = top3_idx3.cpu().numpy()
204
+
205
+ top3_scores1_cpu = top3_scores1.cpu().numpy()
206
+ top3_scores2_cpu = top3_scores2.cpu().numpy()
207
+ top3_scores3_cpu = top3_scores3.cpu().numpy()
208
+
209
  results = {
210
 
211
  "top2": [
 
236
 
237
  return results
238
 
239
+ # Create Gradio interface
240
+ title = "Enhanced Multi-Model Search with Translation"
241
  iface = gr.Interface(
242
+ fn=predict_with_translation, # Use the new function with translation
243
+ inputs=[gr.Textbox(label="Enter your question (English or Arabic)", lines=3)],
244
  outputs='json',
245
  title=title,
246
+ description="Ask questions in English or Arabic. English queries will be translated to Arabic for better matching."
247
  )
248
+
249
+ if __name__ == "__main__":
250
+ iface.launch()