File size: 5,923 Bytes
9c8f5e4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135

import gradio as gr
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer
import torch
import torch.nn.functional as F
import faiss
import numpy as np
import fitz  # PyMuPDF for PDF
import docx  # for DOCX
from sacrebleu import corpus_bleu
import matplotlib.pyplot as plt

# Load Models
lang_detect_model = AutoModelForSequenceClassification.from_pretrained("papluca/xlm-roberta-base-language-detection")
lang_detect_tokenizer = AutoTokenizer.from_pretrained("papluca/xlm-roberta-base-language-detection")
trans_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")
trans_tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
embed_model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

# Language Mappings
id2lang = lang_detect_model.config.id2label
nllb_langs = {
    "eng_Latn": "English", "fra_Latn": "French", "hin_Deva": "Hindi",
    "spa_Latn": "Spanish", "deu_Latn": "German", "tam_Taml": "Tamil",
    "tel_Telu": "Telugu", "jpn_Jpan": "Japanese", "zho_Hans": "Chinese",
    "arb_Arab": "Arabic", "san_Deva": "Sanskrit"
}
xlm_to_nllb = {
    "en": "eng_Latn", "fr": "fra_Latn", "hi": "hin_Deva", "es": "spa_Latn", "de": "deu_Latn",
    "ta": "tam_Taml", "te": "tel_Telu", "ja": "jpn_Jpan", "zh": "zho_Hans", "ar": "arb_Arab",
    "sa": "san_Deva"
}

# Semantic Corpus and Index
corpus = [
    "धर्म एव हतो हन्ति धर्मो रक्षति रक्षितः",
    "Dharma when destroyed, destroys; when protected, protects.",
    "The moon affects tides and mood, according to Jyotisha",
    "One should eat according to the season – Rituacharya",
    "Balance of Tridosha is health – Ayurveda principle",
    "Ethics in Mahabharata reflect situational dharma",
    "Meditation improves memory and mental clarity",
    "Jyotisha links planetary motion with life patterns"
]
corpus_embeddings = embed_model.encode(corpus, convert_to_numpy=True)
dimension = corpus_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(corpus_embeddings)

# Utility Functions
def detect_language(text):
    inputs = lang_detect_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = lang_detect_model(**inputs)
        probs = F.softmax(outputs.logits, dim=1)
        pred = torch.argmax(probs, dim=1).item()
    return id2lang[pred]

def translate(text, src_code, tgt_code):
    trans_tokenizer.src_lang = src_code
    encoded = trans_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    try:
        target_lang_id = trans_tokenizer.convert_tokens_to_ids([tgt_code])[0]
        generated = trans_model.generate(**encoded, forced_bos_token_id=target_lang_id)
        return trans_tokenizer.decode(generated[0], skip_special_tokens=True)
    except:
        return ""

def search_semantic(query, top_k=3):
    query_embedding = embed_model.encode([query])
    distances, indices = index.search(query_embedding, top_k)
    return [(corpus[i], float(distances[0][idx])) for idx, i in enumerate(indices[0])]

def extract_text_from_file(file):
    name = file.name.lower()
    if name.endswith(".txt"):
        return file.read().decode("utf-8")
    elif name.endswith(".pdf"):
        with fitz.open(stream=file.read(), filetype="pdf") as doc:
            return "\n".join([page.get_text() for page in doc])
    elif name.endswith(".docx"):
        document = docx.Document(file)
        return "\n".join([para.text for para in document.paragraphs])
    return "❌ Unsupported file format."

def full_pipeline_file(file, target_lang_code, human_ref=""):
    user_input_text = extract_text_from_file(file)
    if not user_input_text.strip():
        return "⚠️ Empty file", "", [], "", ""
    detected_lang = detect_language(user_input_text)
    src_nllb = xlm_to_nllb.get(detected_lang, "eng_Latn")
    translated = translate(user_input_text, src_nllb, target_lang_code)
    if not translated:
        return detected_lang, "❌ Translation failed", [], "", ""
    sem_results = search_semantic(translated)
    result_list = [f"{i+1}. {txt} (Score: {score:.2f})" for i, (txt, score) in enumerate(sem_results)]
    labels = [f"{i+1}" for i in range(len(sem_results))]
    scores = [score for _, score in sem_results]
    plt.figure(figsize=(6, 4))
    bars = plt.barh(labels, scores, color="lightgreen")
    plt.xlabel("Similarity Score")
    plt.title("Top Semantic Matches")
    plt.gca().invert_yaxis()
    for bar in bars:
        plt.text(bar.get_width() + 0.01, bar.get_y() + 0.1, f"{bar.get_width():.2f}", fontsize=8)
    plt.tight_layout()
    plot_path = "/tmp/sem_plot.png"
    plt.savefig(plot_path)
    plt.close()
    bleu_score = ""
    if human_ref.strip():
        bleu = corpus_bleu([translated], [[human_ref]])
        bleu_score = f"{bleu.score:.2f}"
    return detected_lang, translated, result_list, plot_path, bleu_score

# Launch Gradio App
gr.Interface(
    fn=full_pipeline_file,
    inputs=[
        gr.File(label="Upload .txt / .pdf / .docx file", file_types=[".txt", ".pdf", ".docx"]),
        gr.Dropdown(label="Target Language", choices=list(nllb_langs.keys()), value="eng_Latn"),
        gr.Textbox(label="(Optional) Human Reference Translation", lines=2, placeholder="Paste human translation (for BLEU)...")
    ],
    outputs=[
        gr.Textbox(label="Detected Language"),
        gr.Textbox(label="Translated Text"),
        gr.Textbox(label="Top Semantic Matches"),
        gr.Image(label="Semantic Similarity Plot"),
        gr.Textbox(label="BLEU Score")
    ],
    title="📂 File-Based Multilingual Translator + Semantic Search",
    description="Upload a `.txt`, `.pdf`, or `.docx` file in any language. Translates it and provides semantic search."
).launch(debug=True)