carlosep93 committed
Commit 100f3e3 · 1 Parent(s): 8030df1

gradio app for windows

Files changed (2)
  1. gradio_app.py +39 -0
  2. translate_docx.py +368 -0
gradio_app.py ADDED
@@ -0,0 +1,39 @@
+ import gradio as gr
+ from pathlib import Path
+ import requests
+ import json
+ from translate_docx import translate_document, translate, Aligner
+ from nltk.tokenize.treebank import TreebankWordDetokenizer
+
+
+ ip='10.192.31.127'
+ config_folder = 'fast_align_config'
+ source_lang = 'en'
+ target_lang = 'ca'
+ temp_folder = 'tmp'
+ aligner = Aligner(config_folder, source_lang, target_lang, temp_folder)
+ detokenizer = TreebankWordDetokenizer()
+
+
+ def upload_file(filepath):
+     translated_file_name = translate_document(filepath, aligner, detokenizer, ip)
+     return [gr.UploadButton(visible=False), gr.DownloadButton(label=f"Download {translated_file_name}", value=translated_file_name, visible=True)]
+
+ def download_file():
+     return [gr.UploadButton(visible=True), gr.DownloadButton(visible=False)]
+
+
+ with gr.Blocks() as demo:
+
+     with gr.Tab("Text"):
+         gr.Interface(fn=translate, inputs=["text", "text", "text"], outputs="text")
+     with gr.Tab("Docx documents"):
+         gr.Markdown("First upload a file and then you'll be able to download it (but only once!)")
+         with gr.Row():
+             u = gr.UploadButton("Upload a file", file_count="single")
+             d = gr.DownloadButton("Download the file", visible=False)
+
+     u.upload(upload_file, u, [u, d])
+     d.click(download_file, None, [u, d])
+ if __name__ == "__main__":
+     demo.launch()
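Note on the Text tab: gr.Interface is wired directly to translate_docx.translate, so its three text inputs are the source text, the server IP and the port, and the app assumes an MTUOC-style NMT server answering POST /translate. A minimal sketch of that request/response contract, using the same 'src'/'tgt' fields that translate() sends and reads (the example text and address are illustrative, not part of the commit):

import requests

# Hypothetical standalone check of the translation endpoint the app points at.
payload = {"id": "1", "src": "Hello world"}  # same payload shape translate() builds
response = requests.post("http://10.192.31.127:8000/translate", json=payload)
print(response.json()["tgt"])  # the server is expected to return the translation under "tgt"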
translate_docx.py ADDED
@@ -0,0 +1,368 @@
+ import time
+ import json
+ import requests
+ import tqdm
+ import os
+ import string
+ from collections import defaultdict
+
+ from docx import Document
+ from docx.text.hyperlink import Hyperlink
+ from docx.text.run import Run
+ import nltk
+
+ nltk.download('punkt')
+ nltk.download('punkt_tab')
+
+ from nltk.tokenize import sent_tokenize, word_tokenize
+ from nltk.tokenize.treebank import TreebankWordDetokenizer
+
+ from subprocess import Popen, PIPE
+
+ from itertools import groupby
+ import fileinput
+
+ ip="192.168.20.216"
+ port="8000"
+
+ def translate(text, ip, port):
+
+     myobj = {
+         'id': '1',
+         'src': text,
+     }
+     port = str(int(port))
+     url = 'http://' + ip + ':' + port + '/translate'
+     x = requests.post(url, json = myobj)
+     json_response = json.loads(x.text)
+     return json_response['tgt']
+
+ # Class to align original and translated sentences
+ # based on https://github.com/mtuoc/MTUOC-server/blob/main/GetWordAlignments_fast_align.py
+ class Aligner():
+     def __init__(self, config_folder, source_lang, target_lang, temp_folder):
+         forward_params_path = os.path.join(config_folder, f"{source_lang}-{target_lang}.params")
+         reverse_params_path = os.path.join(config_folder, f"{target_lang}-{source_lang}.params")
+
+         fwd_T, fwd_m = self.__read_err(os.path.join(config_folder, f"{source_lang}-{target_lang}.err"))
+         rev_T, rev_m = self.__read_err(os.path.join(config_folder, f"{target_lang}-{source_lang}.err"))
+
+         self.forward_alignment_file_path = os.path.join(temp_folder, "forward.align")
+         self.reverse_alignment_file_path = os.path.join(temp_folder, "reverse.align")
+
+         self.forward_command = lambda \
+             x: f'fast_align.exe -i {x} -d -T {fwd_T} -m {fwd_m} -f {forward_params_path} > {self.forward_alignment_file_path}'
+         self.reverse_command = lambda \
+             x: f'fast_align.exe -i {x} -d -T {rev_T} -m {rev_m} -f {reverse_params_path} -r > {self.reverse_alignment_file_path}'
+
+         self.symmetric_command = f'atools.exe -i {self.forward_alignment_file_path} -j {self.reverse_alignment_file_path} -c grow-diag-final-and'
+
+     def __simplify_alignment_file(self, file):
+         with fileinput.FileInput(file, inplace=True, backup='.bak') as f:
+             for line in f:
+                 print(line.split('|||')[2].strip())
+
+     def __read_err(self, err):
+         (T, m) = ('', '')
+         for line in open(err):
+             # expected target length = source length * N
+             if 'expected target length' in line:
+                 m = line.split()[-1]
+             # final tension: N
+             elif 'final tension' in line:
+                 T = line.split()[-1]
+         return T, m
+
+     def align(self, file):
+         # generate forward alignment
+         process = Popen(self.forward_command(file), shell=True)
+         process.wait()
+         # generate reverse alignment
+         process = Popen(self.reverse_command(file), shell=True)
+         process.wait()
+
+         # for some reason the output file contains more information than needed, remove it
+         self.__simplify_alignment_file(self.forward_alignment_file_path)
+         self.__simplify_alignment_file(self.reverse_alignment_file_path)
+
+         # generate symmetrical alignment
+         process = Popen(self.symmetric_command, shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE)
+         # communicate() below waits for atools and avoids deadlocking on a full stdout pipe
+
+         # get final alignments and format them
+         alignments_str = process.communicate()[0].decode('utf-8')
+         alignments = []
+         for line in alignments_str.splitlines():
+             alignments.append([(int(i), int(j)) for i, j in [pair.split("-") for pair in line.strip("\n").split(" ")]])
+
+         return alignments
+
+
+ # Function to extract paragraphs with their runs
+ def extract_paragraphs_with_runs(doc):
+     paragraphs_with_runs = []
+     for idx, paragraph in enumerate(doc.paragraphs):
+         runs = []
+         for item in paragraph.iter_inner_content():
+             if isinstance(item, Run):
+                 runs.append({
+                     'text': item.text,
+                     'bold': item.bold,
+                     'italic': item.italic,
+                     'underline': item.underline,
+                     'font_name': item.font.name,
+                     'font_size': item.font.size,
+                     'font_color': item.font.color.rgb,
+                     'paragraph_index': idx
+                 })
+             elif isinstance(item, Hyperlink):
+                 runs.append({
+                     'text': item.runs[0].text,
+                     'bold': item.runs[0].bold,
+                     'italic': item.runs[0].italic,
+                     'underline': item.runs[0].underline,
+                     'font_name': item.runs[0].font.name,
+                     'font_size': item.runs[0].font.size,
+                     'font_color': item.runs[0].font.color.rgb,
+                     'paragraph_index': idx
+                 })
+
+         paragraphs_with_runs.append(runs)
+     return paragraphs_with_runs
+
+
+ def tokenize_paragraph_with_runs2(runs_in_paragraph):
+     text_paragraph = " ".join(run["text"] for run in runs_in_paragraph)
+     sentences = sent_tokenize(text_paragraph)
+     tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]
+
+     tokenized_sentences_with_style = []
+     for tokenized_sentence in tokenized_sentences:
+         tokenized_sentence_with_style = []
+         token_idx = 0
+         for run in runs_in_paragraph:
+             text_in_run = run["text"].strip()
+
+             if text_in_run == tokenized_sentence[token_idx]:
+                 new_run = run.copy()
+                 new_run["text"] = text_in_run
+                 tokenized_sentence_with_style.append(new_run)
+                 token_idx += 1
+                 if token_idx >= len(tokenized_sentence):
+                     break
+             elif len(text_in_run) > len(tokenized_sentence[token_idx]):
+                 if text_in_run.startswith(tokenized_sentence[token_idx]):
+                     for token in word_tokenize(text_in_run):
+                         if token == tokenized_sentence[token_idx]:
+                             new_run = run.copy()
+                             new_run["text"] = token
+                             tokenized_sentence_with_style.append(new_run)
+                             token_idx += 1
+                 else:
+                     raise ValueError("run text does not match the tokenized sentence")
+         tokenized_sentences_with_style.append(tokenized_sentence_with_style)
+     return tokenized_sentences_with_style
+
+
+ def tokenize_with_runs(runs, detokenizer):
+     text_paragraph = detokenizer.detokenize([run["text"] for run in runs])
+     sentences = sent_tokenize(text_paragraph)
+     tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]
+
+     tokens_with_style = []
+     for run in runs:
+         tokens = word_tokenize(run["text"])
+         for token in tokens:
+             tokens_with_style.append(run.copy())
+             tokens_with_style[-1]["text"] = token
+
+     token_index = 0
+     tokenized_sentences_with_style = []
+     for sentence in tokenized_sentences:
+         sentence_with_style = []
+         for word in sentence:
+             if word == tokens_with_style[token_index]["text"]:
+                 sentence_with_style.append(tokens_with_style[token_index])
+                 token_index += 1
+             else:
+                 if word.startswith(tokens_with_style[token_index]["text"]):
+                     # this token might be split into several runs
+                     word_left = word
+
+                     while word_left:
+                         sentence_with_style.append(tokens_with_style[token_index])
+                         word_left = word_left.removeprefix(tokens_with_style[token_index]["text"])
+                         token_index += 1
+                 else:
+                     raise ValueError("token does not match any run text")
+         tokenized_sentences_with_style.append(sentence_with_style)
+     return tokenized_sentences_with_style
+
+
+ def generate_alignments(original_paragraphs_with_runs, translated_paragraphs, aligner, temp_folder, detokenizer):
+     # clean temp folder
+     for f in os.listdir(temp_folder):
+         os.remove(os.path.join(temp_folder, f))
+
+     temp_file_path = os.path.join(temp_folder, "tokenized_sentences.txt")
+
+     # tokenize the original text by sentence and words while keeping the style
+     original_tokenized_sentences_with_style = [tokenize_with_runs(runs, detokenizer) for runs in
+                                                original_paragraphs_with_runs]
+
+     # flatten all the runs so we can align with just one call instead of one per paragraph
+     original_tokenized_sentences_with_style = [item for sublist in original_tokenized_sentences_with_style for item in
+                                                sublist]
+
+     # tokenize the translated text by sentence and word
+     translated_tokenized_sentences = [word_tokenize(sentence) for
+                                       translated_paragraph in translated_paragraphs for sentence in
+                                       sent_tokenize(translated_paragraph)]
+
+     # write the file that fast_align will use
+     with open(temp_file_path, "w") as out_file:
+         for original, translated in zip(original_tokenized_sentences_with_style, translated_tokenized_sentences):
+             out_file.write(f"{' '.join(item['text'] for item in original)} ||| {' '.join(translated)}\n")
+
+     alignments = aligner.align(temp_file_path)
+
+     # using the alignments generated by fast_align, copy the style of each original token to the translated one
+     translated_sentences_with_style = []
+     for sentence_idx, sentence_alignments in enumerate(alignments):
+
+         # reverse the direction of the alignments and build a target -> source dict
+         sentence_alignments = {target: source for source, target in sentence_alignments}
+
+         translated_sentence_with_style = []
+         for token_idx, translated_token in enumerate(translated_tokenized_sentences[sentence_idx]):
+             # fast_align found a source token aligned with this translated token
+             if token_idx in sentence_alignments.keys():
+                 # get the aligned token
+                 original_idx = sentence_alignments[token_idx]
+                 new_entry = original_tokenized_sentences_with_style[sentence_idx][original_idx].copy()
+                 new_entry["text"] = translated_token
+                 translated_sentence_with_style.append(new_entry)
+             else:
+                 # WARNING this is a test
+                 # since fast_align doesn't say which source word this token comes from, copy the style of the previous word
+                 new_entry = translated_sentence_with_style[-1].copy()
+                 new_entry["text"] = translated_token
+                 translated_sentence_with_style.append(new_entry)
+
+         translated_sentences_with_style.append(translated_sentence_with_style)
+
+     return translated_sentences_with_style
+
+
+ # group contiguous tokens that share the same style attributes
+ def group_by_style(values, detokenizer):
+     groups = []
+     for key, group in groupby(values, key=lambda x: (
+             x['bold'], x['italic'], x['underline'], x['font_name'], x['font_size'], x['font_color'],
+             x['paragraph_index'])):
+         text = detokenizer.detokenize([item['text'] for item in group])
+
+         if groups and not text.startswith((",", ";", ":", ".", ")")):
+             text = " " + text
+
+         groups.append({"text": text,
+                        "bold": key[0],
+                        "italic": key[1],
+                        "underline": key[2],
+                        "font_name": key[3],
+                        "font_size": key[4],
+                        "font_color": key[5],
+                        'paragraph_index': key[6]})
+     return groups
+
+
+ def preprocess_runs(runs_in_paragraph):
+     new_runs = []
+
+     for run in runs_in_paragraph:
+
+         # sometimes the parameters are False and sometimes they are None, set them all to False
+         for key, value in run.items():
+             if value is None and not key.startswith("font"):
+                 run[key] = False
+
+         if not new_runs:
+             new_runs.append(run)
+         else:
+             # if the previous run has the same format as the current run, merge the two runs together
+             if (new_runs[-1]["bold"] == run["bold"] and new_runs[-1]["italic"] == run["italic"]
+                     and new_runs[-1]["underline"] == run["underline"]
+                     and new_runs[-1]["font_name"] == run["font_name"]
+                     and new_runs[-1]["font_size"] == run["font_size"] and new_runs[-1]["font_color"] == run["font_color"]
+                     and new_runs[-1]["paragraph_index"] == run["paragraph_index"]):
+                 new_runs[-1]["text"] += run["text"]
+             else:
+                 new_runs.append(run)
+
+         # we want to split runs that contain more than one sentence to avoid problems later when aligning styles
+         sentences = sent_tokenize(new_runs[-1]["text"])
+         if len(sentences) > 1:
+             new_runs[-1]["text"] = sentences[0]
+             for sentence in sentences[1:]:
+                 new_run = new_runs[-1].copy()
+                 new_run["text"] = sentence
+                 new_runs.append(new_run)
+
+     return new_runs
+
+
+
+ def translate_document(input_file,
+                        aligner,
+                        detokenizer,
+                        ip="192.168.20.216",
+                        temp_folder="tmp",
+                        port="8000"):
+     os.makedirs(temp_folder, exist_ok=True)
+     # load original file, extract the paragraphs with their runs (which include style and formatting)
+     doc = Document(input_file)
+     paragraphs_with_runs = extract_paragraphs_with_runs(doc)
+
+     # translate each paragraph
+     translated_paragraphs = []
+     for paragraph in tqdm.tqdm(paragraphs_with_runs, desc="Translating paragraphs..."):
+         paragraph_text = detokenizer.detokenize([run["text"] for run in paragraph])
+         translated_paragraphs.append(translate(paragraph_text, ip, port))
+
+     out_doc = Document()
+
+     processed_original_paragraphs_with_runs = [preprocess_runs(runs) for runs in paragraphs_with_runs]
+
+     translated_sentences_with_style = generate_alignments(processed_original_paragraphs_with_runs,
+                                                            translated_paragraphs, aligner,
+                                                            temp_folder, detokenizer)
+     # flatten the sentences into a list of tokens
+     translated_tokens_with_style = [item for sublist in translated_sentences_with_style for item in sublist]
+     # group the tokens by style/run
+     translated_runs_with_style = group_by_style(translated_tokens_with_style, detokenizer)
+
+     # group the runs by original paragraph
+     translated_paragraphs_with_style = defaultdict(list)
+     for item in translated_runs_with_style:
+         translated_paragraphs_with_style[item['paragraph_index']].append(item)
+
+     for paragraph_index, original_paragraph in enumerate(doc.paragraphs):
+         # in case there are empty paragraphs
+         if not original_paragraph.text:
+             out_doc.add_paragraph(style=original_paragraph.style)
+             continue
+
+         para = out_doc.add_paragraph(style=original_paragraph.style)
+
+         for item in translated_paragraphs_with_style[paragraph_index]:
+             run = para.add_run(item["text"])
+             # Preserve original run formatting
+             run.bold = item['bold']
+             run.italic = item['italic']
+             run.underline = item['underline']
+             run.font.name = item['font_name']
+             run.font.size = item['font_size']
+             run.font.color.rgb = item['font_color']
+
+     out_doc.save("translated.docx")
+     return "translated.docx"
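
For reference, a minimal sketch of driving translate_document directly, without the Gradio UI. The document path, language pair and server address are placeholders, and it assumes the fast_align.exe/atools.exe binaries plus the *.params and *.err files are in place as Aligner.__init__ expects:

from nltk.tokenize.treebank import TreebankWordDetokenizer
from translate_docx import Aligner, translate_document

# Hypothetical paths and values; adjust to the local fast_align configuration and NMT server.
aligner = Aligner("fast_align_config", "en", "ca", "tmp")
detokenizer = TreebankWordDetokenizer()

output_path = translate_document("example.docx", aligner, detokenizer,
                                 ip="192.168.20.216", temp_folder="tmp", port="8000")
print(f"Translated document written to {output_path}")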