Fixed a lot of errors; the script should now crash much less often.

Biggest change: sentences are now translated one by one instead of a whole paragraph at a time. This way there is no need to align sentences afterwards, and we know exactly which sentence is the translation of which. That matters when the translation is bad: the source and target texts end up with a different number of sentences and everything crashes.

I also changed the tokenizer and detokenizer to use spaCy's.
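As a rough illustration of the new per-sentence flow (not the exact code in the diff below; `translate_paragraph` is a hypothetical helper, `translator` stands for the SalamandraTA7bTranslator wrapper, and the pipeline setup mirrors the one loaded in translate_any_doc.py):

```python
import spacy

# blank multilingual pipeline with a rule-based sentencizer, as in translate_any_doc.py
spacy_nlp = spacy.load("xx_ent_wiki_sm")
if "sentencizer" not in spacy_nlp.pipe_names:
    spacy_nlp.add_pipe("sentencizer")


def translate_paragraph(paragraph: str, translator, source_lang: str, target_lang: str) -> str:
    # one request per sentence keeps source and target sentences paired one-to-one
    translated = [translator.translate(sent.text, source_lang, target_lang)
                  for sent in spacy_nlp(paragraph).sents]
    return " ".join(translated)
```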
- requirements.txt +2 -3
- src/salamandraTA7b_translator.py +4 -0
- src/translate_any_doc.py +139 -108
requirements.txt
CHANGED
@@ -1,9 +1,8 @@
-nltk~=3.9.1
-python-docx~=1.1.2
 iso-639~=0.4.5
 protobuf~=6.30.2
 requests~=2.32.3
 tqdm~=4.67.1
 gradio~=5.25.1
 gradio_client~=1.8.0
-setuptools~=80.0.0
+setuptools~=80.0.0
+spacy~=3.8.6
src/salamandraTA7b_translator.py
CHANGED
@@ -1,11 +1,15 @@
 from gradio_client import Client
 from iso639 import languages
 
+
 class SalamandraTA7bTranslator:
     def __init__(self, hf_token):
         self.client = Client("BSC-LT/SalamandraTA-7B-Demo", hf_token=hf_token)
 
     def translate(self, text, source_lang, target_lang):
+        if not text:
+            return ""
+
         # we assume that they are specifying the language by code so we need to convert it to name
         lang1 = languages.get(alpha2=source_lang).name
         lang2 = languages.get(alpha2=target_lang).name
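For context, a hedged usage sketch of this wrapper: only the constructor and `translate` signatures come from the file above; the import path and the token value are assumptions.

```python
from src.salamandraTA7b_translator import SalamandraTA7bTranslator

# "hf_..." is a placeholder for a real Hugging Face access token
translator = SalamandraTA7bTranslator(hf_token="hf_...")

# empty input now short-circuits to "" instead of hitting the remote Space
assert translator.translate("", "en", "ca") == ""

print(translator.translate("Good morning.", "en", "ca"))
```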
src/translate_any_doc.py
CHANGED
@@ -1,4 +1,5 @@
 import shutil
+import string
 import time
 import os
 from itertools import groupby
@@ -8,8 +9,8 @@ import re
 from src.aligner import Aligner
 
 import glob
-from sacremoses import MosesTokenizer, MosesDetokenizer
 import spacy
+from spacy.tokens import Doc
 
 import tqdm
 
@@ -19,6 +20,7 @@ spacy_nlp = spacy.load("xx_ent_wiki_sm")
 if "sentencizer" not in spacy_nlp.pipe_names:
     spacy_nlp.add_pipe("sentencizer")
 
+
 def doc_to_plain_text(input_file: str, source_lang: str, target_lang: str, tikal_folder: str,
                       original_xliff_file_path: str) -> str:
     """
@@ -120,6 +122,7 @@ def get_runs_from_paragraph(paragraph: str, paragraph_index: int) -> list[dict[s
 
     return runs
 
+
 def tokenize_text(text, tokenizer):
     # To avoid the tokenizer destroying the url
     def preserve_urls(text):
@@ -145,7 +148,8 @@ def tokenize_text(text, tokenizer):
 
     return tokens
 
+
+def tokenize_with_runs(runs: list[dict[str, str]]) -> tuple[list[list[dict[str, str]]], list[list[bool]]]:
     """
     Given a list of runs, we need to tokenize them by sentence and token while keeping the style of each token according
     to its original run
@@ -159,57 +163,80 @@ def tokenize_with_runs(runs: list[dict[str, str]], tokenizer, detokenizer) -> li
     """
 
     # it's a bit of a mess but first we get the tokenized sentences
-    tokenized_sentences = [tokenize_text(sentence.text, tokenizer) for sentence in sentences]
+    # join runs and send through spacy to split into clean tokens
+    doc_from_runs = spacy_nlp("".join([run["text"] for run in runs]).strip())
 
+    # extract sentences and tokenize each into words
+    tokenized_sentences = [[token.text.strip() for token in sent if token.text.strip()] for sent in doc_from_runs.sents]
+    tokenized_sentences_spaces = [[token.whitespace_ != '' for token in sent if token.text.strip()] for sent in
+                                  doc_from_runs.sents]
+
+    flat_tokens = [token for sentence in tokenized_sentences for token in sentence]
+    flat_spaces = [token for sentence in tokenized_sentences_spaces for token in sentence]
+
+    flat_tokens_with_style = []
+    flat_spaces_with_style = []
+    token_idx = 0
     for run in runs:
+        run["text"] = run["text"].strip()
+        while run["text"]:
+            if run["text"].startswith(flat_tokens[token_idx]):
+                run["text"] = run["text"][len(flat_tokens[token_idx]):]
+                if flat_spaces[token_idx]:
+                    run["text"] = run["text"].lstrip()
+                item = run.copy()
+                item["text"] = flat_tokens[token_idx]
+                flat_tokens_with_style.append(item)
+                flat_spaces_with_style.append(flat_spaces[token_idx])
+                token_idx += 1
+            elif flat_tokens[token_idx].startswith(run["text"]):
+                subtoken = flat_tokens[token_idx][:len(run["text"])]
+                item = run.copy()
+                item["text"] = subtoken
+                flat_tokens_with_style.append(item)
+                flat_spaces_with_style.append(False)
+                flat_tokens[token_idx] = flat_tokens[token_idx][len(run["text"]):]
+                run["text"] = run["text"][len(subtoken):]
+
+    # reconstruct the sentences
+    token_idx = 0
+    tokenized_sentences_with_style, tokenized_sentences_spaces_with_style = [], []
+    for sentence, sentence_spaces in zip(tokenized_sentences, tokenized_sentences_spaces):
+        sentence_with_style, sentence_spaces_with_style = [], []
+        for token in sentence:
+            if token == flat_tokens_with_style[token_idx]["text"]:
+                sentence_with_style.append(flat_tokens_with_style[token_idx])
+                sentence_spaces_with_style.append(flat_spaces_with_style[token_idx])
+                token_idx += 1
+            elif token.startswith(flat_tokens_with_style[token_idx]["text"]):
+                while token:
+                    token = token[len(flat_tokens_with_style[token_idx]["text"]):]
+                    sentence_with_style.append(flat_tokens_with_style[token_idx])
+                    sentence_spaces_with_style.append(flat_spaces_with_style[token_idx])
+                    token_idx += 1
             else:
-                sentence_with_style.append(tokens_with_style[token_index])
-                word_left = word_left.removeprefix(tokens_with_style[token_index]["text"])
-                token_index += 1
-            else:
-                raise "Something unexpected happened I'm afraid"
+                print(token)
+                print(sentence)
+                print(token_idx)
+                print(flat_tokens_with_style)
+                raise Exception(f"Something unexpected happened")
         tokenized_sentences_with_style.append(sentence_with_style)
+        tokenized_sentences_spaces_with_style.append(sentence_spaces_with_style)
+
+    return tokenized_sentences_with_style, tokenized_sentences_spaces_with_style
 
 
-def generate_alignments(
-                        source_tokenizer, source_detokenizer, target_tokenizer) -> list[list[dict[str, str]]]:
+def generate_alignments(original_tokenized_sentences_with_style: list[list[dict[str, str]]],
+                        translated_sentences: list[str], aligner, temp_folder: str):
     """
-    Given some original
+    Given some original sentences with style and formatting and its translation without formatting, try to match
     the translated text formatting with the original. Since we only want to run fastalign once we have to temporarily
     forget about paragraphs and work only in sentences, so the output is a list of sentences but with information about
    from which paragraph that sentence came from
 
     Parameters:
+        original_tokenized_sentences_with_style: Original text split into sentences with style information
+        translated_sentences: Translated text, split into sentences
         aligner: Object of the aligner class, uses fastalign
         temp_folder: Path to folder where to put all the intermediate files
         source_lang: original language of the document
@@ -223,21 +250,17 @@ def generate_alignments(original_paragraphs_with_runs: list[list[dict[str, str]]
     for f in glob.glob(os.path.join(temp_folder, "*align*")):
         os.remove(f)
 
-    # tokenize the original text by sentence and words while keeping the style
-    original_tokenized_sentences_with_style = [tokenize_with_runs(runs, source_tokenizer, source_detokenizer) for runs in
-                                               original_paragraphs_with_runs]
-
-    # flatten all the runs so we can align with just one call instead of one per paragraph
-    original_tokenized_sentences_with_style = [item for sublist in original_tokenized_sentences_with_style for item in
-                                               sublist]
-
     # tokenize the translated text by sentence and word
-    translated_tokenized_sentences = [
+    translated_tokenized_sentences = []
+    # keep spacing information to detokenize properly later
+    translated_tokenized_sentences_spaces = []
+    for sentence in translated_sentences:
+        tokens = spacy_nlp(sentence)
+        translated_tokenized_sentences_spaces.append([token.whitespace_ != '' for token in tokens])
+        translated_tokenized_sentences.append([token.text for token in tokens])
 
     assert len(translated_tokenized_sentences) == len(
-        original_tokenized_sentences_with_style), "The original and translated texts contain a different number of
+        original_tokenized_sentences_with_style), "The original and translated texts contain a different number of sentences, likely due to a translation error"
 
     original_sentences = []
     translated_sentences = []
@@ -272,27 +295,28 @@ def generate_alignments(original_paragraphs_with_runs: list[list[dict[str, str]]
 
         translated_sentences_with_style.append(translated_sentence_with_style)
 
-    return translated_sentences_with_style
+    return translated_sentences_with_style, translated_tokenized_sentences_spaces
 
 
-def group_by_style(tokens: list[dict[str, str]],
+def group_by_style(tokens: list[dict[str, str]], spaces: list[bool]) -> list[dict[str, str]]:
     """
     To avoid having issues in the future, we group the contiguous tokens that have the same style. Basically, we
     reconstruct the runs.
 
     Parameters:
         tokens: Tokens with style information
-        detokenizer: Detokenizer object to merge tokens back together
 
     Returns:
         list[dict]: A list of translated runs with format and style
     """
     groups = []
+    zipped = zip(tokens, spaces)
+    for key, group in groupby(zipped, key=lambda x: (x[0]["id"], x[0]["paragraph_index"])):
+        group = list(group)
+        tokens = [item[0]['text'] for item in group]
+        spaces = [item[1] for item in group]
 
-            text = " " + text
+        text = Doc(spacy_nlp.vocab, words=tokens, spaces=spaces).text
 
         groups.append({"text": text,
                        "id": key[0],
@@ -300,7 +324,7 @@ def group_by_style(tokens: list[dict[str, str]], detokenizer) -> list[dict[str,
     return groups
 
 
-def runs_to_plain_text(paragraphs_with_style: dict[
+def runs_to_plain_text(paragraphs_with_style: dict[int, list[dict[str, str, str]]], out_file_path: str):
     """
     Generate a plain text file restoring the original tag structure like <g id=1> </g>
 
@@ -310,59 +334,39 @@ def runs_to_plain_text(paragraphs_with_style: dict[str, list[dict[str, str, str]
     """
     with open(out_file_path, "w") as out_file:
 
+        def close_tags(ids):
+            tag = ""
+            for gid in ids:
+                tag_type, tag_id = gid.split("_")
+                tag += f'</{tag_type}>'
+            return tag
 
-        def open_tags(
+        def open_tags(ids):
             tag = ""
-            for gid in
+            for gid in ids:
                 tag_type, tag_id = gid.split("_")
                 tag += f'<{tag_type} id="{tag_id}">'
             return tag
 
         for key, paragraph in paragraphs_with_style.items():
-            output = []
             for run in paragraph:
                 ids = list(run["id"]) if run["id"] else []
 
-                    if a == b:
-                        common_prefix_len += 1
-                    else:
-                        break
-
-                # Close tags not in the new stack
-                to_close = current_stack[common_prefix_len:]
-                if to_close:
-                    output.append(close_tags(to_close))
-
-                # Open new tags
-                to_open = ids[common_prefix_len:]
-                if to_open:
-                    output.append(open_tags(to_open))
-
-                # Add text
-                output.append(run["text"])
-
-                # Update the stack
-                current_stack = ids
+                if ids:
+                    output = open_tags(ids) + run["text"] + close_tags(ids)
+                    out_file.write(output)
+                else:
+                    out_file.write("".join(run["text"]))
 
-            output.append(close_tags(current_stack))
-            out_file.write("
+            out_file.write("\n")
 
 
 def translate_document(input_file: str, source_lang: str, target_lang: str,
                        translator,
                        aligner: Aligner,
                        temp_folder: str = "tmp",
-                       tikal_folder: str = "okapi-apps_gtk2-linux-x86_64_1.47.0") -> str:
+                       tikal_folder: str = "okapi-apps_gtk2-linux-x86_64_1.47.0", with_format: bool = True) -> str:
     input_filename = input_file.split("/")[-1]
     # copy the original file to the temporal folder to avoid common issues with tikal
     temp_input_file = os.path.join(temp_folder, input_filename)
@@ -371,33 +375,53 @@ def translate_document(input_file: str, source_lang: str, target_lang: str,
     original_xliff_file = os.path.join(temp_folder, input_filename + ".xlf")
     plain_text_file = doc_to_plain_text(temp_input_file, source_lang, target_lang, tikal_folder, original_xliff_file)
 
-    source_tokenizer = MosesTokenizer(lang=source_lang)
-    source_detokenizer = MosesDetokenizer(lang=source_lang)
-    target_tokenizer = MosesTokenizer(lang=target_lang)
-    target_detokenizer = MosesDetokenizer(lang=target_lang)
-
     # get paragraphs with runs
     paragraphs_with_runs = [get_runs_from_paragraph(line.strip(), idx) for idx, line in
                             enumerate(open(plain_text_file).readlines())]
 
     # translate using plaintext file
+    original_tokenized_sentences_with_style = []
+    original_spacing = []
+    for run in paragraphs_with_runs:
+        tokens, spaces = tokenize_with_runs(run)
+        original_tokenized_sentences_with_style += tokens
+        original_spacing += spaces
+
+    translated_sentences = []
+    for sentence, spacing in tqdm.tqdm(zip(original_tokenized_sentences_with_style, original_spacing),
+                                       desc="Translating paragraphs...",
+                                       total=len(original_tokenized_sentences_with_style)):
+        text = Doc(spacy_nlp.vocab, words=[token["text"] for token in sentence], spaces=spacing).text
+
+        while True:
+            try:
+                translated_sentences.append(translator.translate(text, source_lang, target_lang))
+                break
+            except:
+                continue
 
     # time to align the translation with the original
     print("Generating alignments...")
     start_time = time.time()
-    translated_sentences_with_style = generate_alignments(
+    translated_sentences_with_style, translated_sentences_spacing = generate_alignments(
+        original_tokenized_sentences_with_style,
+        translated_sentences, aligner,
+        temp_folder)
     print(f"Finished alignments in {time.time() - start_time} seconds")
 
+    # since we tokenized these sentences independently, the spacing information does not contain spaces after punctuation
+    # at the end of the sentence (there's no space at the end of a sentence that ends with ".", unless there's a sentence
+    # right after
+    for sentence, sentence_spaces in zip(translated_sentences_with_style, translated_sentences_spacing):
+        if sentence[-1]["text"] in string.punctuation:
+            sentence_spaces[-1] = True
+
     # flatten the sentences into a list of tokens
     translated_tokens_with_style = [item for sublist in translated_sentences_with_style for item in sublist]
+    tokens_spaces = [item for sublist in translated_sentences_spacing for item in sublist]
 
     # group the tokens by style/run
-    translated_runs_with_style = group_by_style(translated_tokens_with_style,
+    translated_runs_with_style = group_by_style(translated_tokens_with_style, tokens_spaces)
 
     # group the runs by original paragraph
     translated_paragraphs_with_style = {key: [{'id': None, 'paragraph_index': key, 'text': ""}] for key in
@@ -424,6 +448,13 @@ def translate_document(input_file: str, source_lang: str, target_lang: str,
                                   "-noalttrans", "-to", original_xliff_file]
     Popen(tikal_moses_to_xliff_command).wait()
 
+    # any tags that are still <g> have not been paired between original and translated texts by tikal so we remove
+    # them. This may happen if a word in the original language has been split in more that one words that have other
+    # words in between, or an error in fastalign
+    text = open(original_xliff_file).read()
+    result = re.sub(r'<g id="\d+">(.*?)</g>', r'\1', text)
+    open(original_xliff_file, "w").write(result)
+
     # merge into a docx again
     tikal_merge_doc_command = [os.path.join(tikal_folder, "tikal.sh"), "-m", original_xliff_file]
     final_process = Popen(tikal_merge_doc_command, stdout=PIPE, stderr=PIPE)
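A side note on the detokenization trick used throughout this diff: spaCy keeps a per-token trailing-whitespace flag, so a sentence can be rebuilt exactly from its (words, spaces) pair. A minimal, self-contained sketch, assuming the same xx_ent_wiki_sm model is installed:

```python
import spacy
from spacy.tokens import Doc

nlp = spacy.load("xx_ent_wiki_sm")

original = "Hello, world (again)!"
doc = nlp(original)

words = [token.text for token in doc]
spaces = [token.whitespace_ != "" for token in doc]

# Doc accepts per-token space flags, so the surface text round-trips exactly
rebuilt = Doc(nlp.vocab, words=words, spaces=spaces).text
assert rebuilt == original
```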