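"""Multi-model text summarizer built with Gradio.

Pick a Hugging Face seq2seq model, enter text, and get a short summary that is
cleaned and truncated to a user-chosen character limit.
"""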
import gradio as gr
import re
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Model choices (display label -> Hugging Face model id), roughly ordered by accuracy
model_choices = {
    "Pegasus (google/pegasus-xsum)": "google/pegasus-xsum",
    "BigBird-Pegasus (google/bigbird-pegasus-large-arxiv)": "google/bigbird-pegasus-large-arxiv",
    "LongT5 Large (google/long-t5-tglobal-large)": "google/long-t5-tglobal-large",
    "BART Large CNN (facebook/bart-large-cnn)": "facebook/bart-large-cnn",
    "ProphetNet (microsoft/prophetnet-large-uncased-cnndm)": "microsoft/prophetnet-large-uncased-cnndm",
    "LED (allenai/led-base-16384)": "allenai/led-base-16384",
    "T5 Large (t5-large)": "t5-large",
    "Flan-T5 Large (google/flan-t5-large)": "google/flan-t5-large",
    "DistilBART CNN (sshleifer/distilbart-cnn-12-6)": "sshleifer/distilbart-cnn-12-6",
    "DistilBART XSum (mrm8488/distilbart-xsum-12-6)": "mrm8488/distilbart-xsum-12-6",
    "T5 Base (t5-base)": "t5-base",
    "Flan-T5 Base (google/flan-t5-base)": "google/flan-t5-base",
    "BART CNN SamSum (philschmid/bart-large-cnn-samsum)": "philschmid/bart-large-cnn-samsum",
    "Pegasus SamSum (knkarthick/pegasus-samsum)": "knkarthick/pegasus-samsum",
    "LongT5 Base (google/long-t5-tglobal-base)": "google/long-t5-tglobal-base",
    "T5 Small (t5-small)": "t5-small",
    "MBART (facebook/mbart-large-cc25)": "facebook/mbart-large-cc25",
    "MarianMT (Helsinki-NLP/opus-mt-en-ro)": "Helsinki-NLP/opus-mt-en-ro",
    # tiiuae/falcon-7b-instruct is a decoder-only (causal) model and cannot be loaded
    # with AutoModelForSeq2SeqLM, so it is left out to avoid a runtime error:
    # "Falcon Instruct (tiiuae/falcon-7b-instruct)": "tiiuae/falcon-7b-instruct",
    "BART ELI5 (yjernite/bart_eli5)": "yjernite/bart_eli5"
}

model_cache = {}
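# Maps model id -> (tokenizer, model) so repeated selections reuse already-loaded weights
# instead of re-downloading and re-instantiating them.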

# Function to clean input text (remove special characters and extra spaces)
def clean_text(input_text):
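    # Example: clean_text("Hello,   world!!") -> "Hello world"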
    # Replace special characters with a space
    cleaned_text = re.sub(r'[^A-Za-z0-9\s]', ' ', input_text)
    # Replace multiple spaces with a single space
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)
    # Strip leading and trailing spaces
    cleaned_text = cleaned_text.strip()
    return cleaned_text

# Load model and tokenizer
def load_model(model_name):
    if model_name not in model_cache:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        model_cache[model_name] = (tokenizer, model)
    return model_cache[model_name]

# Summarize the text using a selected model
def summarize_text(input_text, model_label, char_limit):
    if not input_text.strip():
        return "Please enter some text."

    # Clean the input text by removing special characters and extra spaces
    input_text = clean_text(input_text)

    model_name = model_choices[model_label]
    tokenizer, model = load_model(model_name)

    # Adjust the input format for T5 and FLAN models
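    # (T5-family checkpoints were trained with task prefixes such as "summarize: ")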
    if "t5" in model_name.lower() or "flan" in model_name.lower():
        input_text = "summarize: " + input_text

    inputs = tokenizer(input_text, return_tensors="pt", truncation=True)
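    # truncation=True clips the input at the tokenizer's model_max_length
    # (model-dependent, e.g. 512 tokens for T5, 1024 for BART)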

    summary_ids = model.generate(
        inputs["input_ids"],
        max_length=20,  # Token-level cap, so length is approximate; the character limit is enforced below
        min_length=5,
        do_sample=False
    )

    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary[:char_limit]  # Enforce character limit

# Gradio UI
iface = gr.Interface(
    fn=summarize_text,
    inputs=[
        gr.Textbox(lines=6, label="Enter text to summarize"),
        gr.Dropdown(choices=list(model_choices.keys()), label="Choose summarization model", value="Pegasus (google/pegasus-xsum)"),
        gr.Slider(minimum=30, maximum=200, value=65, step=1, label="Max Character Limit")
    ],
    outputs=gr.Textbox(lines=3, label="Summary (truncated to character limit)"),
    title="Multi-Model Text Summarizer",
    description="Summarize text using different Hugging Face models with a user-defined character limit."
)

iface.launch()
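
# Note: launch() serves the app locally by default; iface.launch(share=True) would also
# create a temporary public Gradio link if one is needed.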