File size: 2,573 Bytes
6683f91
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
    

import gradio as gr
from PyPDF2 import PdfReader
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# Load summarizer model (LaMini-Flan-T5)
summarizer_tokenizer = AutoTokenizer.from_pretrained("MBZUAI/LaMini-Flan-T5-248M")
summarizer_model = AutoModelForSeq2SeqLM.from_pretrained("MBZUAI/LaMini-Flan-T5-248M")

# Load translators
translator_hi = pipeline("translation", model="Helsinki-NLP/opus-mt-en-hi")
translator_te = pipeline("translation", model="Helsinki-NLP/opus-mt-en-mul")

# Extract text from PDF
def extract_text_from_pdf(file):
    reader = PdfReader(file)
    text = ""
    for page in reader.pages:
        text += page.extract_text()
    return text

# Summarize based on doc type
def summarize_text(text, doc_type):
    prompt = f"Summarize this {doc_type} document clearly:\n{text}\nSummary:"
    inputs = summarizer_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
    outputs = summarizer_model.generate(**inputs, max_length=300, num_beams=4, early_stopping=True)
    return summarizer_tokenizer.decode(outputs[0], skip_special_tokens=True)

# Translate summary
def translate_summary(summary, lang):
    if lang == "hindi":
        return translator_hi(summary)[0]["translation_text"]
    elif lang == "telugu":
        return translator_te(summary)[0]["translation_text"]
    else:
        return summary  # English or unsupported

# Main processing logic
def process(file, lang, doc_type):
    text = extract_text_from_pdf(file)
    if not text.strip():
        return "Error: PDF has no extractable text."
    
    summary = summarize_text(text, doc_type)
    return translate_summary(summary, lang)

# Gradio UI
with gr.Blocks() as app:
    gr.Markdown("## Multilingual AI Document Summarizer")
    gr.Markdown("Upload a document and get summaries in multiple languages using mT5.")

    file_input = gr.File(label="Upload PDF")

    with gr.Row():
        language_input = gr.Dropdown(
            label="Select Language",
            choices=["english", "hindi", "telugu"],
            value="english"
        )
        type_input = gr.Dropdown(
            label="Select Document Type",
            choices=["legal", "medical", "general"],
            value="general"
        )

    output = gr.Textbox(label="Summary Output", lines=10)

    with gr.Row():
        clear = gr.Button("Clear")
        submit = gr.Button("Submit")

    submit.click(fn=process, inputs=[file_input, language_input, type_input], outputs=output)
    clear.click(lambda: "", inputs=[], outputs=output)

app.launch()