# Spaces: Running
import functools

import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer
import gradio as gr
import faiss
import numpy as np
# Load models
# Hub checkpoint ids, named once so that each model and its tokenizer are
# always loaded from the same repository.
_LANG_DETECT_CKPT = "papluca/xlm-roberta-base-language-detection"
_TRANSLATION_CKPT = "facebook/nllb-200-distilled-600M"
_EMBEDDING_CKPT = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"

# Sequence classifier and tokenizer used by detect_language().
lang_detect_model = AutoModelForSequenceClassification.from_pretrained(_LANG_DETECT_CKPT)
lang_detect_tokenizer = AutoTokenizer.from_pretrained(_LANG_DETECT_CKPT)

# Seq2seq model and tokenizer used by translate().
trans_model = AutoModelForSeq2SeqLM.from_pretrained(_TRANSLATION_CKPT)
trans_tokenizer = AutoTokenizer.from_pretrained(_TRANSLATION_CKPT)

# Sentence encoder used both for the corpus index and for search queries.
embed_model = SentenceTransformer(_EMBEDDING_CKPT)

# Language mappings
# Class index -> language label, read from the classifier's own config.
id2lang = lang_detect_model.config.id2label
# Detector label (the values of `id2lang`) -> NLLB-200 language code.
# Callers fall back to "eng_Latn" for labels missing from this table, which
# makes the translator treat the text as English.  To avoid that silent
# mis-labelling, the table also covers the other ISO 639-1 codes that the
# language-detection checkpoint is documented to emit (per its model card).
xlm_to_nllb = {
    # Original entries.
    "en": "eng_Latn",
    "fr": "fra_Latn",
    "hi": "hin_Deva",
    "es": "spa_Latn",
    "de": "deu_Latn",
    "ta": "tam_Taml",
    "te": "tel_Telu",
    "ja": "jpn_Jpan",
    "zh": "zho_Hans",
    "ar": "arb_Arab",
    "sa": "san_Deva",
    # Additional detector labels, so they are not treated as English.
    "bg": "bul_Cyrl",
    "el": "ell_Grek",
    "it": "ita_Latn",
    "nl": "nld_Latn",
    "pl": "pol_Latn",
    "pt": "por_Latn",
    "ru": "rus_Cyrl",
    "sw": "swh_Latn",
    "th": "tha_Thai",
    "tr": "tur_Latn",
    "ur": "urd_Arab",
    "vi": "vie_Latn",
}

# NLLB-200 code -> English display name.  The keys of this dict (in this
# order) are what the UI offers as target languages, so it is left unchanged.
nllb_langs = {
    "eng_Latn": "English",
    "fra_Latn": "French",
    "hin_Deva": "Hindi",
    "spa_Latn": "Spanish",
    "deu_Latn": "German",
    "tam_Taml": "Tamil",
    "tel_Telu": "Telugu",
    "jpn_Jpan": "Japanese",
    "zho_Hans": "Chinese",
    "arb_Arab": "Arabic",
    "san_Deva": "Sanskrit",
}
# Sample knowledge corpus
# Small in-memory set of passages searched by search_semantic().  Position in
# this list is the passage's id, so entries must not be reordered after the
# embeddings have been computed from it.
corpus = [
    # Sanskrit verse and an English rendering of it.
    "धर्म एव हतो हन्ति धर्मो रक्षति रक्षितः",
    "Dharma when destroyed, destroys; when protected, protects.",
    # Jyotisha / Ayurveda / ethics statements.
    "The moon affects tides and mood, according to Jyotisha",
    "One should eat according to the season – Rituacharya",
    "Balance of Tridosha is health – Ayurveda principle",
    "Ethics in Mahabharata reflect situational dharma",
    "Meditation improves memory and mental clarity",
    "Jyotisha links planetary motion with life patterns",
]
# Semantic index
# Embed every corpus entry once at start-up and load the vectors into a flat
# L2 FAISS index; row i of the index corresponds to corpus[i].
corpus_embeddings = embed_model.encode(corpus, convert_to_numpy=True)
_, dimension = corpus_embeddings.shape
index = faiss.IndexFlatL2(dimension)
index.add(corpus_embeddings)
# Language Detection
def detect_language(text):
    """Return the language label the classifier assigns to ``text``.

    The text is tokenized (with truncation), passed through
    ``lang_detect_model`` without gradient tracking, and the index of the
    most probable class is looked up in ``id2lang``.
    """
    encoded = lang_detect_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        logits = lang_detect_model(**encoded).logits
    class_probs = F.softmax(logits, dim=1)
    best_class = class_probs.argmax(dim=1).item()
    return id2lang[best_class]
# Translation
def translate(text, src_code, tgt_code):
    """Translate ``text`` from ``src_code`` to ``tgt_code`` and return the result.

    Both codes are language tokens known to ``trans_tokenizer`` (for example
    ``"eng_Latn"``).  The target language is selected by forcing its token as
    the first generated token.
    """
    # The source language is configured on the shared tokenizer object, so it
    # has to be set before the text is encoded.
    trans_tokenizer.src_lang = src_code
    model_inputs = trans_tokenizer(text, return_tensors="pt", truncation=True, padding=True)

    (bos_token_id,) = trans_tokenizer.convert_tokens_to_ids([tgt_code])
    output_ids = trans_model.generate(**model_inputs, forced_bos_token_id=bos_token_id)

    first_sequence = output_ids[0]
    return trans_tokenizer.decode(first_sequence, skip_special_tokens=True)
# Semantic Search
def search_semantic(query, top_k=3):
    """Return the corpus entries closest to ``query`` as a numbered listing.

    Args:
        query: Text to embed and look up in ``index``.
        top_k: Maximum number of matches to return.  Values larger than the
            number of indexed entries are clamped; values <= 0 yield ``""``.

    Returns:
        One line per match, joined with newlines, in the form
        ``"<rank>. <corpus text> (Score: <distance>)"``.  The score is the
        distance reported by the L2 index, so smaller means closer.
    """
    # Asking FAISS for more neighbours than it holds makes it pad the result
    # with label -1, which would silently index corpus[-1]; never ask for
    # more than the index contains.
    k = min(top_k, index.ntotal)
    if k <= 0:
        return ""

    query_embedding = embed_model.encode([query], convert_to_numpy=True)
    distances, indices = index.search(query_embedding, k)

    return "\n".join(
        f"{rank}. {corpus[idx]} (Score: {dist:.2f})"
        for rank, (idx, dist) in enumerate(zip(indices[0], distances[0]), start=1)
        if idx >= 0  # defensive: skip any padding label
    )
# Main function
def pipeline(text, target_lang_code):
    """Detect the language of ``text``, translate it, and search the corpus.

    Returns a 4-tuple ``(input text, detected label, translation, matches)``.
    Whitespace-only input short-circuits to ``("Empty input", "", "", "")``.
    """
    if text.strip() == "":
        return "Empty input", "", "", ""

    detected = detect_language(text)
    # Labels without an NLLB mapping are treated as English source text.
    source_code = xlm_to_nllb.get(detected, "eng_Latn")
    translated = translate(text, source_code, target_lang_code)
    # The search runs on the translation, not on the original input.
    return text, detected, translated, search_semantic(translated)
# Language Dropdown
# Target-language choices are the NLLB codes, in the order nllb_langs lists them.
lang_choices = list(nllb_langs)

# Gradio UI
# Text-input interface.  It is bound to `text_tab` because the tabbed layout
# assembled at the end of this file refers to it by that name; without this
# binding that reference raises NameError.
text_tab = gr.Interface(
    fn=pipeline,
    inputs=[
        gr.Textbox(label="Enter your sentence"),
        gr.Dropdown(choices=lang_choices, value="san_Deva", label="Target Language"),
    ],
    outputs=[
        gr.Textbox(label="Input"),
        gr.Textbox(label="Detected Language"),
        gr.Textbox(label="Translated Output"),
        gr.Textbox(label="Semantic Matches"),
    ],
    title="🌍 Sanskrit Translator + Semantic Search",
)
# Backward-compatible alias for the previous name of this interface.
iface = text_tab

# Intentionally NOT launched here: launch() blocks the script, so calling it
# at this point would prevent the voice tab and the tabbed app defined later
# in the file from ever being created.  The single launch() call lives at the
# end of the file.
import gradio as gr

# Dropdown options
lang_options = list(nllb_langs.keys())


@functools.lru_cache(maxsize=1)
def _load_whisper_model(name="base"):
    """Load a Whisper checkpoint once and reuse it for every later request."""
    # Imported lazily so the text-only path does not require whisper.
    import whisper

    return whisper.load_model(name)


# Voice Input Interface
def voice_pipeline(audio, target_code):
    """Transcribe ``audio``, then detect, translate and search like the text path.

    Args:
        audio: Path of the recorded audio file, or a falsy value when nothing
            was recorded.
        target_code: NLLB code of the language to translate into.

    Returns:
        A 4-tuple ``(transcript, detected label, translation, matches)``.
        Missing audio or an empty transcript returns a short message in the
        first slot and empty strings in the others.
    """
    if not audio:
        return "No audio received", "", "", ""

    text = _load_whisper_model().transcribe(audio)["text"]
    if not text.strip():
        # Same convention as the text pipeline for blank input.
        return "Empty input", "", "", ""

    detected = detect_language(text)
    src_code = xlm_to_nllb.get(detected, "eng_Latn")
    translated = translate(text, src_code, target_code)
    # search_semantic() already returns the numbered, newline-joined listing;
    # re-iterating it as (text, score) pairs would raise ValueError.
    matches_text = search_semantic(translated)
    return text, detected, translated, matches_text
def _microphone_input(label):
    """Build a microphone audio input that passes a file path to the callback.

    The keyword selecting the input source differs between Gradio releases
    (`source="microphone"` in older ones, `sources=["microphone"]` in newer
    ones).  The older spelling is tried first so behaviour is unchanged
    wherever it is accepted.
    """
    try:
        return gr.Audio(source="microphone", type="filepath", label=label)
    except TypeError:
        # This Gradio version rejects the singular keyword.
        return gr.Audio(sources=["microphone"], type="filepath", label=label)


# Voice-input interface: microphone recording + target language in, the four
# values returned by voice_pipeline() out.
voice_tab = gr.Interface(
    fn=voice_pipeline,
    inputs=[
        _microphone_input("🎙️ Speak Something"),
        gr.Dropdown(lang_options, value="san_Deva", label="🌐 Target Language"),
    ],
    outputs=[
        gr.Textbox(label="📝 Detected Text"),
        gr.Textbox(label="🌐 Detected Language"),
        gr.Textbox(label="🗣️ Translated Output"),
        gr.Textbox(label="🧠 Semantic Matches"),
    ],
)

# Tabs
# NOTE(review): `text_tab` must be bound to the text-input gr.Interface before
# this statement runs — confirm it is defined earlier in the file.
gr.TabbedInterface(
    [voice_tab, text_tab],
    ["🎤 Voice Input", "📝 Text Input"],
).launch()