jeevitha-app committed
Commit c0ba20b · verified · 1 Parent(s): bbeaf50

Update app.py

Files changed (1)
  1. app.py +179 -28
app.py CHANGED
@@ -1,16 +1,21 @@
+ # Import Libraries
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForSeq2SeqLM
+ from sentence_transformers import SentenceTransformer
  import torch
  import torch.nn.functional as F
- from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModelForSeq2SeqLM
- from sentence_transformers import SentenceTransformer
- import gradio as gr
-
- # Load models
+ import faiss
+ import numpy as np
+ import matplotlib.pyplot as plt
+ import os
+ from google.colab import files
+ # Load Models
  lang_detect_model = AutoModelForSequenceClassification.from_pretrained("papluca/xlm-roberta-base-language-detection")
  lang_detect_tokenizer = AutoTokenizer.from_pretrained("papluca/xlm-roberta-base-language-detection")
  trans_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")
  trans_tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
+ embed_model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

- # Language maps
+ # Language Code Mappings
  id2lang = lang_detect_model.config.id2label

  nllb_langs = {
@@ -26,7 +31,14 @@ xlm_to_nllb = {
      "sa": "san_Deva"
  }

- # Detection
+ # Get input directly
+ input_text = input("✍️ Enter your text here for translation:\n").strip()
+
+ if not input_text:
+     print("🚫 No input text provided. Exiting.")
+     raise SystemExit
+
+ # Language detection
  def detect_language(text):
      inputs = lang_detect_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
      with torch.no_grad():
@@ -35,30 +47,169 @@ def detect_language(text):
      pred = torch.argmax(probs, dim=1).item()
      return id2lang[pred]

+ if input_text.strip():
+     detected_lang = detect_language(input_text)
+     print(f"\n🔍 Detected Language Code: {detected_lang}")
+ else:
+     print("🚫 Empty input text. Exiting.")
+     raise SystemExit
+
+ # Choose target language
+ print("\n🌐 Available Output Languages:")
+ for code, lang in nllb_langs.items():
+     print(f"{code} → {lang}")
+
+ target_code = input("\n🔤 Enter target language code (e.g., eng_Latn): ").strip()
+ if target_code not in nllb_langs:
+     print("❌ Invalid code. Defaulting to English (eng_Latn).")
+     target_code = "eng_Latn"
+
  # Translation
- def translate_text(input_text, target_code):
-     detected = detect_language(input_text)
-     src_nllb = xlm_to_nllb.get(detected, "eng_Latn")
-     trans_tokenizer.src_lang = src_nllb
-     encoded = trans_tokenizer(input_text, return_tensors="pt", truncation=True, padding=True)
+ def translate(text, src_code, tgt_code):
+     trans_tokenizer.src_lang = src_code
+     encoded = trans_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
      try:
-         lang_id = trans_tokenizer.convert_tokens_to_ids([target_code])[0]
-         generated = trans_model.generate(**encoded, forced_bos_token_id=lang_id)
-         result = trans_tokenizer.decode(generated[0], skip_special_tokens=True)
-         return f"Detected: {detected}\n\nTranslated:\n{result}"
+         target_lang_id = trans_tokenizer.convert_tokens_to_ids([tgt_code])[0]
+         generated = trans_model.generate(**encoded, forced_bos_token_id=target_lang_id)
+         return trans_tokenizer.decode(generated[0], skip_special_tokens=True)
      except:
-         return "Translation failed."
+         print("❌ Translation failed.")
+         return ""
+
+ src_nllb = xlm_to_nllb.get(detected_lang, "eng_Latn")
+ print(f"\n📜 Text to Translate:\n{input_text}\n")
+ print(f"🌍 Source Language: {src_nllb} → Target Language: {target_code}")
+
+ translated_text = translate(input_text, src_nllb, target_code)
+ # Output translated text
+ if translated_text.strip():
+     print("\n✅ Translation Complete!\n")
+     print("🔸 Translated Text:\n")
+     print(translated_text)
+
+     with open("translated_output.txt", "w", encoding="utf-8") as f:
+         f.write(translated_text)
+     files.download("translated_output.txt")
+ else:
+     print("❌ No translated text produced.")
+     raise SystemExit
+ # Create Corpus and FAISS Index
+ corpus = [
+     "धर्म एव हतो हन्ति धर्मो रक्षति रक्षितः",
+     "Dharma when destroyed, destroys; when protected, protects.",
+     "The moon affects tides and mood, according to Jyotisha",
+     "One should eat according to the season – Rituacharya",
+     "Balance of Tridosha is health – Ayurveda principle",
+     "Ethics in Mahabharata reflect situational dharma",
+     "Meditation improves memory and mental clarity",
+     "Jyotisha links planetary motion with life patterns"
+ ]
+
+ corpus_embeddings = embed_model.encode(corpus, convert_to_numpy=True)
+ dimension = corpus_embeddings.shape[1]
+ index = faiss.IndexFlatL2(dimension)
+ index.add(corpus_embeddings)
+
+
+ # Semantic Search Function
+ def search_semantic(query, top_k=3):
+     query_embedding = embed_model.encode([query])
+     distances, indices = index.search(query_embedding, top_k)
+     return [(corpus[i], float(distances[0][idx])) for idx, i in enumerate(indices[0])]
+
+ # Perform Semantic Search
+ print("\n🔎 Searching for similar Sanskrit knowledge...")
+ results = search_semantic(translated_text)
+
+ print("\n🔍 Top Semantic Matches:")
+ for i, (text, score) in enumerate(results, 1):
+     print(f"\n{i}. {text}\n Similarity Score: {score:.4f}")
+
+ # Visualize Semantic Scores
+ labels = [f"{i+1}. Match {i+1}" for i in range(len(results))]
+ scores = [score for _, score in results]

- # Gradio UI
- demo = gr.Interface(
-     fn=translate_text,
+ plt.figure(figsize=(10, 6))
+ bars = plt.barh(labels, scores, color="skyblue")
+
+ plt.xlabel("Similarity Score", fontsize=12)
+ plt.title("Top Semantic Matches", fontsize=14)
+ plt.gca().invert_yaxis()
+
+ for bar in bars:
+     plt.text(bar.get_width() + 0.5, bar.get_y() + 0.25, f"{bar.get_width():.2f}", fontsize=10)
+
+ plt.tight_layout()
+ plt.savefig("semantic_similarity_plot.png")
+ plt.show()
+
+ files.download("semantic_similarity_plot.png")
+
+ # BLEU Score Evaluation
+ from sacrebleu import corpus_bleu
+
+ reference = input("📘 Enter correct human translation (for BLEU evaluation): ").strip()
+ if reference:
+     bleu = corpus_bleu([translated_text], [[reference]])
+     print(f"\n📏 BLEU Score: {bleu.score:.2f}")
+ else:
+     print("ℹ️ BLEU evaluation skipped (no reference entered).")
+
+ # ✅ Gradio App Interface
+ import gradio as gr
+ import matplotlib.pyplot as plt
+ from sacrebleu import corpus_bleu
+
+ def full_pipeline(user_input_text, target_lang_code, human_ref=""):
+     if not user_input_text.strip():
+         return "⚠️ Empty input", "", [], "", ""
+
+     detected_lang = detect_language(user_input_text)
+     src_nllb = xlm_to_nllb.get(detected_lang, "eng_Latn")
+
+     translated = translate(user_input_text, src_nllb, target_lang_code)
+     if not translated:
+         return detected_lang, "❌ Translation failed", [], "", ""
+
+     sem_results = search_semantic(translated)
+     result_list = [f"{i+1}. {txt} (Score: {score:.2f})" for i, (txt, score) in enumerate(sem_results)]
+
+     labels = [f"{i+1}" for i in range(len(sem_results))]
+     scores = [score for _, score in sem_results]
+     plt.figure(figsize=(6, 4))
+     bars = plt.barh(labels, scores, color="lightgreen")
+     plt.xlabel("Similarity Score")
+     plt.title("Top Semantic Matches")
+     plt.gca().invert_yaxis()
+     for bar in bars:
+         plt.text(bar.get_width() + 0.01, bar.get_y() + 0.1, f"{bar.get_width():.2f}", fontsize=8)
+     plt.tight_layout()
+     plot_path = "/tmp/sem_plot.png"
+     plt.savefig(plot_path)
+     plt.close()
+
+     bleu_score = ""
+     if human_ref.strip():
+         bleu = corpus_bleu([translated], [[human_ref]])
+         bleu_score = f"{bleu.score:.2f}"
+
+     return detected_lang, translated, result_list, plot_path, bleu_score
+
+ # 🚀 Launch Gradio Interface
+ gr.Interface(
+     fn=full_pipeline,
      inputs=[
-         gr.Textbox(label="Input Text", lines=6),
-         gr.Dropdown(choices=list(nllb_langs.keys()), label="Target Language")
+         gr.Textbox(label="Input Text", lines=4, placeholder="Enter text to translate..."),
+         gr.Dropdown(label="Target Language", choices=list(nllb_langs.keys()), value="eng_Latn"),
+         gr.Textbox(label="(Optional) Human Reference Translation", lines=2, placeholder="Paste human translation here (for BLEU)...")
      ],
-     outputs="text",
-     title="Multilingual Text Translator 🌍",
-     description="Enter your text and select a target language to translate."
- )
-
- demo.launch()
+     outputs=[
+         gr.Textbox(label="Detected Language"),
+         gr.Textbox(label="Translated Text"),
+         gr.Textbox(label="Top Semantic Matches"),
+         gr.Image(label="Semantic Similarity Plot"),
+         gr.Textbox(label="BLEU Score")
+     ],
+     title="🌍 Multilingual Translator + Semantic Search",
+     description="Detects language → Translates → Finds related Sanskrit concepts → BLEU optional."
+ ).launch(debug=True)
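
Below, for reference, is a small standalone sketch of the two mechanisms the updated app.py relies on: NLLB generation steered by `src_lang` and `forced_bos_token_id`, and FAISS nearest-neighbour search over MiniLM sentence embeddings. The checkpoints and API calls mirror the committed code; the sample sentence, the two-entry corpus, and the `san_Deva` target are illustrative assumptions, not part of the commit. Note that `IndexFlatL2` returns squared L2 distances, so smaller values mean closer matches.

# Standalone sketch (illustrative, not part of the commit): the translation and
# semantic-search steps that the new app.py wires together.
import torch
import faiss
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from sentence_transformers import SentenceTransformer

trans_tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
trans_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")
embed_model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

# NLLB translation: src_lang tags the encoded input with the source language,
# and forced_bos_token_id forces the decoder to start in the target language.
trans_tokenizer.src_lang = "eng_Latn"
encoded = trans_tokenizer("Dharma protects those who protect it.", return_tensors="pt")
target_id = trans_tokenizer.convert_tokens_to_ids("san_Deva")
with torch.no_grad():
    generated = trans_model.generate(**encoded, forced_bos_token_id=target_id)
translated = trans_tokenizer.decode(generated[0], skip_special_tokens=True)
print("Translated:", translated)

# Semantic search: embed a small corpus, index it with IndexFlatL2, and query
# it with the translation. The returned scores are squared L2 distances.
corpus = [
    "Dharma when destroyed, destroys; when protected, protects.",
    "Meditation improves memory and mental clarity",
]
corpus_embeddings = embed_model.encode(corpus, convert_to_numpy=True)
index = faiss.IndexFlatL2(corpus_embeddings.shape[1])
index.add(corpus_embeddings)
distances, indices = index.search(embed_model.encode([translated], convert_to_numpy=True), 2)
for dist, idx in zip(distances[0], indices[0]):
    print(f"{corpus[idx]}  (L2 distance: {dist:.4f})")

Swapping `IndexFlatL2` for `IndexFlatIP` over L2-normalized embeddings would turn the reported numbers into cosine similarities, where larger means closer.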