jeevitha-app committed on
Commit
5902010
·
verified ·
1 Parent(s): c0ba20b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -108
app.py CHANGED
@@ -6,8 +6,10 @@ import torch.nn.functional as F
6
  import faiss
7
  import numpy as np
8
  import matplotlib.pyplot as plt
 
 
9
  import os
10
- from google.colab import files
11
  # Load Models
12
  lang_detect_model = AutoModelForSequenceClassification.from_pretrained("papluca/xlm-roberta-base-language-detection")
13
  lang_detect_tokenizer = AutoTokenizer.from_pretrained("papluca/xlm-roberta-base-language-detection")
@@ -31,14 +33,23 @@ xlm_to_nllb = {
31
  "sa": "san_Deva"
32
  }
33
 
34
- # Get input directly
35
- input_text = input("✍️ Enter your text here for translation:\n").strip()
36
-
37
- if not input_text:
38
- print("🚫 No input text provided. Exiting.")
39
- raise SystemExit
 
 
 
 
 
 
 
 
 
40
 
41
- # Language detection
42
  def detect_language(text):
43
  inputs = lang_detect_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
44
  with torch.no_grad():
@@ -47,24 +58,7 @@ def detect_language(text):
47
  pred = torch.argmax(probs, dim=1).item()
48
  return id2lang[pred]
49
 
50
- if input_text.strip():
51
- detected_lang = detect_language(input_text)
52
- print(f"\n🔍 Detected Language Code: {detected_lang}")
53
- else:
54
- print("🚫 Empty input text. Exiting.")
55
- raise SystemExit
56
-
57
- # Choose target language
58
- print("\n🌐 Available Output Languages:")
59
- for code, lang in nllb_langs.items():
60
- print(f"{code} → {lang}")
61
-
62
- target_code = input("\n🔤 Enter target language code (e.g., eng_Latn): ").strip()
63
- if target_code not in nllb_langs:
64
- print("❌ Invalid code. Defaulting to English (eng_Latn).")
65
- target_code = "eng_Latn"
66
-
67
- # Translation
68
  def translate(text, src_code, tgt_code):
69
  trans_tokenizer.src_lang = src_code
70
  encoded = trans_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
@@ -73,93 +67,15 @@ def translate(text, src_code, tgt_code):
73
  generated = trans_model.generate(**encoded, forced_bos_token_id=target_lang_id)
74
  return trans_tokenizer.decode(generated[0], skip_special_tokens=True)
75
  except:
76
- print("❌ Translation failed.")
77
  return ""
78
 
79
- src_nllb = xlm_to_nllb.get(detected_lang, "eng_Latn")
80
- print(f"\n📜 Text to Translate:\n{input_text}\n")
81
- print(f"🌍 Source Language: {src_nllb} → Target Language: {target_code}")
82
-
83
- translated_text = translate(input_text, src_nllb, target_code)
84
- # Output translated text
85
- if translated_text.strip():
86
- print("\n✅ Translation Complete!\n")
87
- print("🔸 Translated Text:\n")
88
- print(translated_text)
89
-
90
- with open("translated_output.txt", "w", encoding="utf-8") as f:
91
- f.write(translated_text)
92
- files.download("translated_output.txt")
93
- else:
94
- print("❌ No translated text produced.")
95
- raise SystemExit
96
- #Create Corpus and FAISS Index
97
- corpus = [
98
- "धर्म एव हतो हन्ति धर्मो रक्षति रक्षितः",
99
- "Dharma when destroyed, destroys; when protected, protects.",
100
- "The moon affects tides and mood, according to Jyotisha",
101
- "One should eat according to the season – Rituacharya",
102
- "Balance of Tridosha is health – Ayurveda principle",
103
- "Ethics in Mahabharata reflect situational dharma",
104
- "Meditation improves memory and mental clarity",
105
- "Jyotisha links planetary motion with life patterns"
106
- ]
107
-
108
- corpus_embeddings = embed_model.encode(corpus, convert_to_numpy=True)
109
- dimension = corpus_embeddings.shape[1]
110
- index = faiss.IndexFlatL2(dimension)
111
- index.add(corpus_embeddings)
112
-
113
-
114
- # Semantic Search Function
115
  def search_semantic(query, top_k=3):
116
  query_embedding = embed_model.encode([query])
117
  distances, indices = index.search(query_embedding, top_k)
118
  return [(corpus[i], float(distances[0][idx])) for idx, i in enumerate(indices[0])]
119
 
120
- # Perform Semantic Search
121
- print("\n🔎 Searching for similar Sanskrit knowledge...")
122
- results = search_semantic(translated_text)
123
-
124
- print("\n🔍 Top Semantic Matches:")
125
- for i, (text, score) in enumerate(results, 1):
126
- print(f"\n{i}. {text}\n Similarity Score: {score:.4f}")
127
-
128
- # Visualize Semantic Scores
129
- labels = [f"{i+1}. Match {i+1}" for i in range(len(results))]
130
- scores = [score for _, score in results]
131
-
132
- plt.figure(figsize=(10, 6))
133
- bars = plt.barh(labels, scores, color="skyblue")
134
-
135
- plt.xlabel("Similarity Score", fontsize=12)
136
- plt.title("Top Semantic Matches", fontsize=14)
137
- plt.gca().invert_yaxis()
138
-
139
- for bar in bars:
140
- plt.text(bar.get_width() + 0.5, bar.get_y() + 0.25, f"{bar.get_width():.2f}", fontsize=10)
141
-
142
- plt.tight_layout()
143
- plt.savefig("semantic_similarity_plot.png")
144
- plt.show()
145
-
146
- files.download("semantic_similarity_plot.png")
147
-
148
- # BLEU Score Evaluation
149
- from sacrebleu import corpus_bleu
150
-
151
- reference = input("📘 Enter correct human translation (for BLEU evaluation): ").strip()
152
- if reference:
153
- bleu = corpus_bleu([translated_text], [[reference]])
154
- print(f"\n📏 BLEU Score: {bleu.score:.2f}")
155
- else:
156
- print("ℹ️ BLEU evaluation skipped (no reference entered).")
157
-
158
- # ✅ Gradio App Interface
159
- import gradio as gr
160
- import matplotlib.pyplot as plt
161
- from sacrebleu import corpus_bleu
162
-
163
  def full_pipeline(user_input_text, target_lang_code, human_ref=""):
164
  if not user_input_text.strip():
165
  return "⚠️ Empty input", "", [], "", ""
@@ -174,6 +90,7 @@ def full_pipeline(user_input_text, target_lang_code, human_ref=""):
174
  sem_results = search_semantic(translated)
175
  result_list = [f"{i+1}. {txt} (Score: {score:.2f})" for i, (txt, score) in enumerate(sem_results)]
176
 
 
177
  labels = [f"{i+1}" for i in range(len(sem_results))]
178
  scores = [score for _, score in sem_results]
179
  plt.figure(figsize=(6, 4))
@@ -195,7 +112,7 @@ def full_pipeline(user_input_text, target_lang_code, human_ref=""):
195
 
196
  return detected_lang, translated, result_list, plot_path, bleu_score
197
 
198
- # 🚀 Launch Gradio Interface
199
  gr.Interface(
200
  fn=full_pipeline,
201
  inputs=[
@@ -212,4 +129,4 @@ gr.Interface(
212
  ],
213
  title="🌍 Multilingual Translator + Semantic Search",
214
  description="Detects language → Translates → Finds related Sanskrit concepts → BLEU optional."
215
- ).launch(debug=True)
 
6
  import faiss
7
  import numpy as np
8
  import matplotlib.pyplot as plt
9
+ import gradio as gr
10
+ from sacrebleu import corpus_bleu
11
  import os
12
+
13
  # Load Models
14
  lang_detect_model = AutoModelForSequenceClassification.from_pretrained("papluca/xlm-roberta-base-language-detection")
15
  lang_detect_tokenizer = AutoTokenizer.from_pretrained("papluca/xlm-roberta-base-language-detection")
 
33
  "sa": "san_Deva"
34
  }
35
 
36
+ # Static Corpus
37
+ corpus = [
38
+ "धर्म एव हतो हन्ति धर्मो रक्षति रक्षितः",
39
+ "Dharma when destroyed, destroys; when protected, protects.",
40
+ "The moon affects tides and mood, according to Jyotisha",
41
+ "One should eat according to the season – Rituacharya",
42
+ "Balance of Tridosha is health – Ayurveda principle",
43
+ "Ethics in Mahabharata reflect situational dharma",
44
+ "Meditation improves memory and mental clarity",
45
+ "Jyotisha links planetary motion with life patterns"
46
+ ]
47
+ corpus_embeddings = embed_model.encode(corpus, convert_to_numpy=True)
48
+ dimension = corpus_embeddings.shape[1]
49
+ index = faiss.IndexFlatL2(dimension)
50
+ index.add(corpus_embeddings)
51
 
52
+ # Detect Language
53
  def detect_language(text):
54
  inputs = lang_detect_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
55
  with torch.no_grad():
 
58
  pred = torch.argmax(probs, dim=1).item()
59
  return id2lang[pred]
60
 
61
+ # Translate
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
  def translate(text, src_code, tgt_code):
63
  trans_tokenizer.src_lang = src_code
64
  encoded = trans_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
 
67
  generated = trans_model.generate(**encoded, forced_bos_token_id=target_lang_id)
68
  return trans_tokenizer.decode(generated[0], skip_special_tokens=True)
69
  except:
 
70
  return ""
71
 
72
+ # Semantic Search
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  def search_semantic(query, top_k=3):
74
  query_embedding = embed_model.encode([query])
75
  distances, indices = index.search(query_embedding, top_k)
76
  return [(corpus[i], float(distances[0][idx])) for idx, i in enumerate(indices[0])]
77
 
78
+ # Full pipeline for Gradio
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  def full_pipeline(user_input_text, target_lang_code, human_ref=""):
80
  if not user_input_text.strip():
81
  return "⚠️ Empty input", "", [], "", ""
 
90
  sem_results = search_semantic(translated)
91
  result_list = [f"{i+1}. {txt} (Score: {score:.2f})" for i, (txt, score) in enumerate(sem_results)]
92
 
93
+ # Plot
94
  labels = [f"{i+1}" for i in range(len(sem_results))]
95
  scores = [score for _, score in sem_results]
96
  plt.figure(figsize=(6, 4))
 
112
 
113
  return detected_lang, translated, result_list, plot_path, bleu_score
114
 
115
+ # Gradio App
116
  gr.Interface(
117
  fn=full_pipeline,
118
  inputs=[
 
129
  ],
130
  title="🌍 Multilingual Translator + Semantic Search",
131
  description="Detects language → Translates → Finds related Sanskrit concepts → BLEU optional."
132
+ ).launch()