SlouchyBuffalo committed (verified)
Commit 8544542 · 1 Parent(s): 8c29564

Delete app.py

Files changed (1)
app.py +0 -322
app.py DELETED
@@ -1,322 +0,0 @@
- import gradio as gr
- import spaces
- from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
- import torch
- from huggingface_hub import InferenceClient
- import os
- import fitz  # PyMuPDF for PDF processing
- from PIL import Image
- import pytesseract
-
- # Initialize Cerebras client for Llama 4
- cerebras_client = InferenceClient(
-     "meta-llama/Llama-4-Scout-17B-16E-Instruct",
-     provider="cerebras",
-     token=os.getenv("HF_TOKEN"),
- )
-
- # Global variables for models and tokenizers
- en_es_tokenizer = None
- en_es_model = None
- es_en_tokenizer = None
- es_en_model = None
-
- @spaces.GPU(duration=60)
- def translate_en_to_es(text):
-     global en_es_tokenizer, en_es_model
-
-     # Initialize EN->ES model if needed
-     if en_es_tokenizer is None or en_es_model is None:
-         en_es_tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M", src_lang="eng_Latn", tgt_lang="spa_Latn")
-         en_es_model = AutoModelForSeq2SeqLM.from_pretrained(
-             "facebook/nllb-200-distilled-600M",
-             torch_dtype=torch.float16
-         ).cuda()
-
-     # Translate
-     inputs = en_es_tokenizer(text, return_tensors="pt", max_length=512, truncation=True).to("cuda")
-     with torch.no_grad():
-         outputs = en_es_model.generate(
-             **inputs,
-             forced_bos_token_id=en_es_tokenizer.convert_tokens_to_ids("spa_Latn"),
-             max_length=512,
-             num_beams=5,
-             early_stopping=True
-         )
-
-     translation = en_es_tokenizer.decode(outputs[0], skip_special_tokens=True)
-     return translation
-
- @spaces.GPU(duration=60)
- def translate_es_to_en(text):
-     global es_en_tokenizer, es_en_model
-
-     # Initialize ES->EN model if needed
-     if es_en_tokenizer is None or es_en_model is None:
-         es_en_tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M", src_lang="spa_Latn", tgt_lang="eng_Latn")
-         es_en_model = AutoModelForSeq2SeqLM.from_pretrained(
-             "facebook/nllb-200-distilled-600M",
-             torch_dtype=torch.float16
-         ).cuda()
-
-     # Translate
-     inputs = es_en_tokenizer(text, return_tensors="pt", max_length=512, truncation=True).to("cuda")
-     with torch.no_grad():
-         outputs = es_en_model.generate(
-             **inputs,
-             forced_bos_token_id=es_en_tokenizer.convert_tokens_to_ids("eng_Latn"),
-             max_length=512,
-             num_beams=5,
-             early_stopping=True
-         )
-
-     translation = es_en_tokenizer.decode(outputs[0], skip_special_tokens=True)
-     return translation
-
- def extract_text_from_pdf(file_path):
-     """Extract text from PDF file"""
-     try:
-         doc = fitz.open(file_path)
-         text = ""
-         for page in doc:
-             text += page.get_text()
-         doc.close()
-         return text
-     except Exception as e:
-         return f"Error extracting text from PDF: {str(e)}"
-
- def extract_text_from_image(file_path):
-     """Extract text from image using OCR"""
-     try:
-         image = Image.open(file_path)
-         text = pytesseract.image_to_string(image)
-         return text
-     except Exception as e:
-         return f"Error extracting text from image: {str(e)}"
-
- def process_uploaded_file(file):
-     """Process uploaded file and extract text"""
-     if file is None:
-         return "No file uploaded"
-
-     file_path = file.name
-     file_extension = os.path.splitext(file_path)[1].lower()
-
-     if file_extension == '.pdf':
-         return extract_text_from_pdf(file_path)
-     elif file_extension in ['.png', '.jpg', '.jpeg', '.tiff', '.bmp']:
-         return extract_text_from_image(file_path)
-     else:
-         return "Unsupported file format. Please upload PDF or image files."
-
- def refine_with_llama(original_text, translation, direction, region="Mexico", formality="neutral"):
-     if direction == "en_to_es":
-         refine_prompt = f"""You are an expert Spanish translator specializing in {region} Spanish. Refine the following translation and explain your changes:
-
- Original English: {original_text}
- Initial Spanish translation: {translation}
- Region: {region}
- Formality level: {formality}
-
- Requirements:
- 1. Use {region} Spanish vocabulary and expressions
- 2. Adjust for {formality} formality level
- 3. Fix any contextual errors or awkward phrasing
- 4. Preserve idiomatic expressions appropriately for {region} Spanish
-
- Respond in this format:
- TRANSLATION: [your refined translation]
- EXPLANATION: [Brief explanation of changes made and why this version fits {formality} {region} Spanish better]"""
-     else:
-         refine_prompt = f"""You are an expert English translator. Refine the following translation and explain your changes:
-
- Original Spanish: {original_text}
- Initial English translation: {translation}
- Formality level: {formality}
-
- Requirements:
- 1. Use natural English expressions
- 2. Adjust for {formality} formality level
- 3. Fix any contextual errors or awkward phrasing
- 4. Preserve meaning while making it sound natural
-
- Respond in this format:
- TRANSLATION: [your refined translation]
- EXPLANATION: [Brief explanation of changes made and why this version fits {formality} English better]"""
-
-     try:
-         response = cerebras_client.chat_completion(
-             messages=[{"role": "user", "content": refine_prompt}],
-             max_tokens=512,
-             temperature=0.3
-         )
-
-         # Parse response to extract translation and explanation
-         content = response.choices[0].message.content.strip()
-
-         if "TRANSLATION:" in content and "EXPLANATION:" in content:
-             translation_part = content.split("TRANSLATION:")[1].split("EXPLANATION:")[0].strip()
-             explanation_part = content.split("EXPLANATION:")[1].strip()
-             return translation_part, explanation_part
-         else:
-             return content, "Explanation not available in expected format"
-
-     except Exception as e:
-         return f"Refinement error: {str(e)}", ""
-
- def complete_translation(text, direction, region, formality):
-     if not text.strip():
-         return "", "", ""
-
-     try:
-         # Step 1: Initial translation
-         if direction == "English to Spanish":
-             initial_translation = translate_en_to_es(text)
-             refined_translation, explanation = refine_with_llama(text, initial_translation, "en_to_es", region, formality)
-         else:  # Spanish to English
-             initial_translation = translate_es_to_en(text)
-             refined_translation, explanation = refine_with_llama(text, initial_translation, "es_to_en", region, formality)
-
-         return initial_translation, refined_translation, explanation
-     except Exception as e:
-         return f"Error: {str(e)}", "", ""
-
- def translate_from_file(file, direction, region, formality):
-     # Extract text from uploaded file
-     extracted_text = process_uploaded_file(file)
-
-     if "Error" in extracted_text or "No file" in extracted_text:
-         return extracted_text, "", "", ""
-
-     # Translate extracted text
-     initial_translation, refined_translation, explanation = complete_translation(extracted_text, direction, region, formality)
-
-     return extracted_text, initial_translation, refined_translation, explanation
-
- # Create Gradio interface
- with gr.Blocks(title="Document Translation with Regional Spanish") as demo:
-     gr.Markdown("# Document Translation with Regional Spanish")
-     gr.Markdown("Upload PDFs or images for OCR, or type text directly. Powered by NLLB-200 + Llama 4 with regional variants")
-
-     with gr.Tabs():
-         # Text Translation Tab
-         with gr.TabItem("Text Translation"):
-             with gr.Row():
-                 with gr.Column(scale=2):
-                     input_text = gr.Textbox(
-                         label="Text to Translate",
-                         placeholder="Enter text in English or Spanish...",
-                         lines=6
-                     )
-
-                     with gr.Row():
-                         direction = gr.Dropdown(
-                             choices=["English to Spanish", "Spanish to English"],
-                             value="English to Spanish",
-                             label="Translation Direction"
-                         )
-
-                     with gr.Row():
-                         region = gr.Dropdown(
-                             choices=["Mexico", "Spain", "Argentina", "Colombia", "Peru", "General"],
-                             value="Mexico",
-                             label="Spanish Variant"
-                         )
-                         formality = gr.Dropdown(
-                             choices=["informal", "neutral", "formal"],
-                             value="neutral",
-                             label="Formality Level"
-                         )
-
-                     translate_btn = gr.Button("Translate", variant="primary", size="lg")
-
-                 with gr.Column(scale=2):
-                     initial_output = gr.Textbox(
-                         label="Initial Translation (NLLB-200)",
-                         lines=2,
-                         interactive=False
-                     )
-                     refined_output = gr.Textbox(
-                         label="Refined Translation (Llama 4)",
-                         lines=2,
-                         interactive=False
-                     )
-                     explanation_output = gr.Textbox(
-                         label="Explanation of Changes",
-                         lines=4,
-                         interactive=False
-                     )
-
-         # Document Upload Tab
-         with gr.TabItem("Document Translation"):
-             with gr.Row():
-                 with gr.Column(scale=2):
-                     file_input = gr.File(
-                         label="Upload PDF or Image",
-                         file_types=[".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".bmp"]
-                     )
-
-                     with gr.Row():
-                         doc_direction = gr.Dropdown(
-                             choices=["English to Spanish", "Spanish to English"],
-                             value="English to Spanish",
-                             label="Translation Direction"
-                         )
-
-                     with gr.Row():
-                         doc_region = gr.Dropdown(
-                             choices=["Mexico", "Spain", "Argentina", "Colombia", "Peru", "General"],
-                             value="Mexico",
-                             label="Spanish Variant"
-                         )
-                         doc_formality = gr.Dropdown(
-                             choices=["informal", "neutral", "formal"],
-                             value="neutral",
-                             label="Formality Level"
-                         )
-
-                     translate_doc_btn = gr.Button("Extract & Translate", variant="primary", size="lg")
-
-                 with gr.Column(scale=2):
-                     extracted_text = gr.Textbox(
-                         label="Extracted Text",
-                         lines=3,
-                         interactive=False
-                     )
-                     doc_initial = gr.Textbox(
-                         label="Initial Translation (NLLB-200)",
-                         lines=3,
-                         interactive=False
-                     )
-                     doc_refined = gr.Textbox(
-                         label="Refined Translation (Llama 4)",
-                         lines=3,
-                         interactive=False
-                     )
-                     doc_explanation = gr.Textbox(
-                         label="Explanation of Changes",
-                         lines=3,
-                         interactive=False
-                     )
-
-     # Connect functions
-     translate_btn.click(
-         fn=complete_translation,
-         inputs=[input_text, direction, region, formality],
-         outputs=[initial_output, refined_output, explanation_output]
-     )
-
-     input_text.submit(
-         fn=complete_translation,
-         inputs=[input_text, direction, region, formality],
-         outputs=[initial_output, refined_output, explanation_output]
-     )
-
-     translate_doc_btn.click(
-         fn=translate_from_file,
-         inputs=[file_input, doc_direction, doc_region, doc_formality],
-         outputs=[extracted_text, doc_initial, doc_refined, doc_explanation]
-     )
-
- if __name__ == "__main__":
-     demo.launch()