Spaces:
Running
Running
File size: 5,265 Bytes
16e7329 8e3d7cf 3be937a 8e3d7cf 3be937a 8e3d7cf 3be937a 8e3d7cf 3be937a 8e3d7cf 3be937a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 |
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer
import gradio as gr
import faiss
import numpy as np
# Load models
# Everything below is loaded once, at import time, and shared by all requests.
_LANG_DETECT_CKPT = "papluca/xlm-roberta-base-language-detection"
_TRANSLATION_CKPT = "facebook/nllb-200-distilled-600M"
_EMBEDDING_CKPT = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"

# Language identification: sequence classifier + its tokenizer.
lang_detect_model = AutoModelForSequenceClassification.from_pretrained(_LANG_DETECT_CKPT)
lang_detect_tokenizer = AutoTokenizer.from_pretrained(_LANG_DETECT_CKPT)

# Translation: seq2seq model + its tokenizer.
trans_model = AutoModelForSeq2SeqLM.from_pretrained(_TRANSLATION_CKPT)
trans_tokenizer = AutoTokenizer.from_pretrained(_TRANSLATION_CKPT)

# Sentence embeddings used for the semantic search index.
embed_model = SentenceTransformer(_EMBEDDING_CKPT)

# Language mappings
# Class index -> language label, as published in the detector's config.
id2lang = lang_detect_model.config.id2label
# Detector label (two-letter code) -> NLLB / FLORES-200 language code.
# `pipeline` falls back to "eng_Latn" for any label missing from this table,
# so every label the detector can produce should be listed here; otherwise
# non-English input would be translated as if it were English.
xlm_to_nllb = {
    # Languages that are also offered as translation targets in the UI.
    "en": "eng_Latn",
    "fr": "fra_Latn",
    "hi": "hin_Deva",
    "es": "spa_Latn",
    "de": "deu_Latn",
    "ta": "tam_Taml",
    "te": "tel_Telu",
    "ja": "jpn_Jpan",
    "zh": "zho_Hans",
    "ar": "arb_Arab",
    "sa": "san_Deva",
    # Remaining labels listed on the language-detection model card
    # (source-side only; they are not added to the target dropdown).
    "bg": "bul_Cyrl",
    "el": "ell_Grek",
    "it": "ita_Latn",
    "nl": "nld_Latn",
    "pl": "pol_Latn",
    "pt": "por_Latn",
    "ru": "rus_Cyrl",
    "sw": "swh_Latn",
    "th": "tha_Thai",
    "tr": "tur_Latn",
    "ur": "urd_Arab",
    "vi": "vie_Latn",
}

# NLLB code -> human-readable name. The keys (in this order) populate the
# target-language dropdowns.
nllb_langs = {
    "eng_Latn": "English",
    "fra_Latn": "French",
    "hin_Deva": "Hindi",
    "spa_Latn": "Spanish",
    "deu_Latn": "German",
    "tam_Taml": "Tamil",
    "tel_Telu": "Telugu",
    "jpn_Jpan": "Japanese",
    "zho_Hans": "Chinese",
    "arb_Arab": "Arabic",
    "san_Deva": "Sanskrit",
}
# Sample knowledge corpus
# A small, hand-written set of statements that the semantic search ranks
# against. List positions matter: search results are looked up by index.
corpus = [
    # Dharma / ethics
    "धर्म एव हतो हन्ति धर्मो रक्षति रक्षितः",
    "Dharma when destroyed, destroys; when protected, protects.",
    # Jyotisha
    "The moon affects tides and mood, according to Jyotisha",
    # Ayurveda
    "One should eat according to the season – Rituacharya",
    "Balance of Tridosha is health – Ayurveda principle",
    # Epics, practice, astrology
    "Ethics in Mahabharata reflect situational dharma",
    "Meditation improves memory and mental clarity",
    "Jyotisha links planetary motion with life patterns",
]
# Semantic index
# Embed every corpus entry once, then store the vectors in a flat L2 FAISS
# index so queries can be answered by nearest-neighbour lookup.
corpus_embeddings = embed_model.encode(corpus, convert_to_numpy=True)

# The index width is the embedding size (last axis of the embedding matrix).
dimension = corpus_embeddings.shape[-1]

index = faiss.IndexFlatL2(dimension)
index.add(corpus_embeddings)
# Language Detection
def detect_language(text):
    """Return the language label the detection model assigns to ``text``.

    The text is tokenized (with truncation), passed through
    ``lang_detect_model`` without gradient tracking, and the index of the
    highest-probability class is translated to its label via ``id2lang``.
    """
    encoded = lang_detect_tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
    )
    with torch.no_grad():
        logits = lang_detect_model(**encoded).logits
    class_probs = F.softmax(logits, dim=1)
    # A single input yields a single row, so .item() extracts one class index.
    best_class = class_probs.argmax(dim=1).item()
    return id2lang[best_class]
# Translation
def translate(text, src_code, tgt_code):
    """Translate ``text`` from ``src_code`` into ``tgt_code``.

    Both codes are NLLB language codes such as ``"eng_Latn"``. Returns the
    decoded text of the first generated sequence with special tokens removed.
    """
    # The source language is configured on the shared tokenizer before encoding.
    trans_tokenizer.src_lang = src_code
    model_inputs = trans_tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
    )

    # The target language is selected by forcing its language token to be the
    # first token the model generates.
    (bos_id,) = trans_tokenizer.convert_tokens_to_ids([tgt_code])
    output_ids = trans_model.generate(**model_inputs, forced_bos_token_id=bos_id)

    first_sequence = output_ids[0]
    return trans_tokenizer.decode(first_sequence, skip_special_tokens=True)
# Semantic Search
def search_semantic(query, top_k=3):
    """Return the corpus entries nearest to ``query`` as numbered lines.

    Args:
        query: Text to embed and look up in the FAISS index.
        top_k: Maximum number of matches to return. Values larger than the
            number of indexed entries are clamped to that number.

    Returns:
        A newline-joined string with one ``"<rank>. <entry> (Score: <d>)"``
        line per match, or ``""`` when no match can be returned. The "Score"
        is the distance reported by the L2 index, so smaller means closer.
    """
    # FAISS pads its result with label -1 when asked for more neighbours than
    # it holds; indexing ``corpus[-1]`` would then silently repeat the last
    # entry. Clamp the request to what the index actually contains.
    k = min(top_k, index.ntotal)
    if k <= 0:
        return ""

    query_embedding = embed_model.encode([query], convert_to_numpy=True)
    distances, indices = index.search(query_embedding, k)

    # Defensive: drop any padding labels before numbering the results.
    hits = [
        (idx, dist)
        for idx, dist in zip(indices[0], distances[0])
        if idx >= 0
    ]
    return "\n".join(
        f"{rank}. {corpus[idx]} (Score: {dist:.2f})"
        for rank, (idx, dist) in enumerate(hits, start=1)
    )
# Main function
def pipeline(text, target_lang_code):
    """Detect the input language, translate it, and find related corpus entries.

    Args:
        text: Sentence entered by the user. ``None``, empty, or
            whitespace-only input short-circuits.
        target_lang_code: NLLB code of the language to translate into.

    Returns:
        A 4-tuple ``(input_text, detected_language, translation, matches)``.
        For missing or blank input it is ``("Empty input", "", "", "")``.
    """
    # ``not text`` covers None as well as "", which ``text.strip()`` alone
    # would turn into an AttributeError.
    if not text or not text.strip():
        return "Empty input", "", "", ""

    detected = detect_language(text)
    # Labels without an NLLB mapping are treated as English.
    src_code = xlm_to_nllb.get(detected, "eng_Latn")
    translated = translate(text, src_code, target_lang_code)
    # The search runs on the translated text, not the original input.
    matches = search_semantic(translated)
    return text, detected, translated, matches
# Language Dropdown
# The dropdown offers the NLLB codes themselves, in table order.
lang_choices = list(nllb_langs.keys())

# Gradio UI
# Text tab: sentence + target language in, four text boxes out
# (matching the 4-tuple returned by `pipeline`).
iface = gr.Interface(
    fn=pipeline,
    inputs=[
        gr.Textbox(label="Enter your sentence"),
        gr.Dropdown(choices=lang_choices, value="san_Deva", label="Target Language"),
    ],
    outputs=[
        gr.Textbox(label="Input"),
        gr.Textbox(label="Detected Language"),
        gr.Textbox(label="Translated Output"),
        gr.Textbox(label="Semantic Matches"),
    ],
    title="🌍 Sanskrit Translator + Semantic Search",
)

# This interface is not launched on its own: launching here blocks the script
# before the voice tab and the tabbed app at the end of the file are built.
# The tabbed app is the single entry point and refers to this interface as
# `text_tab`, so expose it under that name as well.
text_tab = iface
import gradio as gr
# Dropdown options
# Iterating a dict yields its keys, i.e. the NLLB codes in table order.
lang_options = list(nllb_langs)
# Voice Input Interface
def voice_pipeline(audio, target_code):
    """Transcribe a recording, then detect, translate and search like the text tab.

    Args:
        audio: Path to the recorded audio file, or ``None``/empty when nothing
            was recorded.
        target_code: NLLB code of the language to translate into.

    Returns:
        A 4-tuple ``(transcript, detected_language, translation, matches)``.
        When there is no recording, or the transcript is blank, it is
        ``("Empty input", "", "", "")``.
    """
    if not audio:
        return "Empty input", "", "", ""

    result = _get_whisper_model().transcribe(audio)
    text = result["text"]
    if not text.strip():
        return "Empty input", "", "", ""

    detected = detect_language(text)
    # Labels without an NLLB mapping are treated as English.
    src_code = xlm_to_nllb.get(detected, "eng_Latn")
    translated = translate(text, src_code, target_code)
    # search_semantic already returns the numbered, newline-joined text;
    # iterating it as (text, score) pairs would raise ValueError.
    matches_text = search_semantic(translated)
    return text, detected, translated, matches_text


# Whisper model cache: populated on the first voice request and reused after.
_whisper_model = None


def _get_whisper_model():
    """Return the Whisper "base" model, loading it only on first use."""
    global _whisper_model
    if _whisper_model is None:
        # Imported lazily so the module still imports when whisper is absent.
        import whisper

        _whisper_model = whisper.load_model("base")
    return _whisper_model
# Voice tab: microphone recording + target language in, four text boxes out
# (matching the 4-tuple returned by `voice_pipeline`).
# NOTE(review): `source="microphone"` looks like the Gradio 3.x spelling; newer
# releases appear to expect `sources=[...]` — confirm against the pinned version.
_voice_inputs = [
    gr.Audio(source="microphone", type="filepath", label="🎙️ Speak Something"),
    gr.Dropdown(lang_options, value="san_Deva", label="🌐 Target Language"),
]
_voice_outputs = [
    gr.Textbox(label=caption)
    for caption in (
        "📝 Detected Text",
        "🌐 Detected Language",
        "🗣️ Translated Output",
        "🧠 Semantic Matches",
    )
]
voice_tab = gr.Interface(
    fn=voice_pipeline,
    inputs=_voice_inputs,
    outputs=_voice_outputs,
)
# Tabs
# Combine the voice interface with the text interface (`iface`, built earlier
# in this file) into one tabbed app. Tab titles are matched to interfaces by
# position.
demo = gr.TabbedInterface(
    [voice_tab, iface],
    ["🎤 Voice Input", "📝 Text Input"],
)
demo.launch()
|