changed nltk tokenizer to multilingual tokenizers
src/translate_any_doc.py  (+55 -19)  CHANGED
@@ -7,14 +7,17 @@ import re
 
 from src.aligner import Aligner
 
-import nltk
 import glob
-from …
-import …
+from sacremoses import MosesTokenizer, MosesDetokenizer
+import spacy
 
-
-nltk.download('punkt_tab')
+import tqdm
 
+# Load multilingual model to use as sentence tokenizer
+spacy_nlp = spacy.load("xx_ent_wiki_sm")
+# Add the rule-based sentencizer
+if "sentencizer" not in spacy_nlp.pipe_names:
+    spacy_nlp.add_pipe("sentencizer")
 
 def doc_to_plain_text(input_file: str, source_lang: str, target_lang: str, tikal_folder: str,
                       original_xliff_file_path: str) -> str:
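Note: the `sentencizer` is added because `xx_ent_wiki_sm` ships without a dependency parser, so `Doc.sents` only works once a rule-based sentence splitter is in the pipeline. A minimal sketch of the resulting behaviour, assuming the model has been installed with `python -m spacy download xx_ent_wiki_sm`:

```python
import spacy

# Same setup as the new module-level code above.
nlp = spacy.load("xx_ent_wiki_sm")
if "sentencizer" not in nlp.pipe_names:
    nlp.add_pipe("sentencizer")

doc = nlp("Primera frase del párrafo. Segunda frase, con una coma.")
print([sent.text for sent in doc.sents])
# ['Primera frase del párrafo.', 'Segunda frase, con una coma.']
```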
@@ -117,26 +120,53 @@ def get_runs_from_paragraph(paragraph: str, paragraph_index: int) -> list[dict[s
 
     return runs
 
+def tokenize_text(text, tokenizer):
+    # To avoid the tokenizer destroying the url
+    def preserve_urls(text):
+        url_pattern = r'https?://[^\s\)\]\}\>]+|www\.[^\s\)\]\}\>]+'
+        # Find URLs using regex and replace them with a placeholder
+        urls = re.findall(url_pattern, text)
+        for idx, url in enumerate(urls):
+            placeholder = f"URL{idx}"
+            text = text.replace(url, placeholder)
+
+        return text, urls
+
+    # Replace URLs with placeholders
+    text, urls = preserve_urls(text)
+
+    # Tokenize using Sacremoses
+    tokens = tokenizer.tokenize(text)
+
+    # Revert placeholders back to original URLs
+    for idx, url in enumerate(urls):
+        placeholder = f"URL{idx}"
+        tokens = [token.replace(placeholder, url) for token in tokens]
 
+    return tokens
+
-def tokenize_with_runs(runs: list[dict[str, str]], detokenizer) -> list[list[dict[str, str]]]:
+def tokenize_with_runs(runs: list[dict[str, str]], tokenizer, detokenizer) -> list[list[dict[str, str]]]:
     """
     Given a list of runs, we need to tokenize them by sentence and token while keeping the style of each token according
     to its original run
 
     Parameters:
         runs: List of runs, where each item is a chunk of text (possibly various tokens) and some style/formatting information
-
+        source_lang: Language of the document
 
     Returns:
         list[list[dict]]: A list of tokenized sentences where each token contains the style of its original run
     """
+
+    # it's a bit of a mess but first we get the tokenized sentences
     text_paragraph = detokenizer.detokenize([run["text"] for run in runs])
-    sentences = …
-    tokenized_sentences = [ …
+    sentences = spacy_nlp(text_paragraph).sents
+    tokenized_sentences = [tokenize_text(sentence.text, tokenizer) for sentence in sentences]
 
+    # then we assign a run/style to each token
     tokens_with_style = []
     for run in runs:
-        tokens = …
+        tokens = tokenize_text(run["text"], tokenizer)
         if tokens:
             for token in tokens:
                 tokens_with_style.append(run.copy())
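Note: the placeholder trick in `tokenize_text` exists because a Moses-style tokenizer can split the punctuation inside a URL. A small round-trip sketch of the same idea outside the function; the example text and language code are illustrative:

```python
import re
from sacremoses import MosesTokenizer

tokenizer = MosesTokenizer(lang="en")
text = "Docs live at https://example.com/guide?lang=en and nowhere else."

# Swap URLs for placeholders so the tokenizer cannot break them apart.
url_pattern = r'https?://[^\s\)\]\}\>]+|www\.[^\s\)\]\}\>]+'
urls = re.findall(url_pattern, text)
for idx, url in enumerate(urls):
    text = text.replace(url, f"URL{idx}")

tokens = tokenizer.tokenize(text)

# Put the original URLs back, token by token.
for idx, url in enumerate(urls):
    tokens = [token.replace(f"URL{idx}", url) for token in tokens]

print(tokens)
# ['Docs', 'live', 'at', 'https://example.com/guide?lang=en', 'and', 'nowhere', 'else', '.']
```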
@@ -144,6 +174,7 @@ def tokenize_with_runs(runs: list[dict[str, str]], detokenizer) -> list[list[dic
         else:
             tokens_with_style.append(run.copy())
 
+    # and finally we combine both things, where each token of each sentence is assigned a run/style
     token_index = 0
     tokenized_sentences_with_style = []
     for sentence in tokenized_sentences:
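Note: the combination step that the new comment describes walks one flat index through `tokens_with_style` while iterating sentence by sentence. The loop body is outside this diff, so the following is only a simplified sketch of that idea with made-up data, not the file's actual code:

```python
# Hypothetical data: two sentences, five styled tokens in document order.
tokenized_sentences = [["Hello", "world", "."], ["Bye", "."]]
tokens_with_style = [
    {"text": "Hello", "id": "run1"}, {"text": "world", "id": "run1"}, {"text": ".", "id": "run1"},
    {"text": "Bye", "id": "run2"}, {"text": ".", "id": "run2"},
]

token_index = 0
tokenized_sentences_with_style = []
for sentence in tokenized_sentences:
    sentence_with_style = []
    for token in sentence:
        styled = tokens_with_style[token_index].copy()  # keep the run's style info
        styled["text"] = token                          # but use the sentence token text
        sentence_with_style.append(styled)
        token_index += 1
    tokenized_sentences_with_style.append(sentence_with_style)
```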
@@ -169,7 +200,7 @@ def tokenize_with_runs(runs: list[dict[str, str]], detokenizer) -> list[list[dic
 
 def generate_alignments(original_paragraphs_with_runs: list[list[dict[str, str]]],
                         translated_paragraphs: list[str], aligner, temp_folder: str,
-
+                        source_tokenizer, source_detokenizer, target_tokenizer) -> list[list[dict[str, str]]]:
     """
     Given some original paragraphs with style and formatting and its translation without formatting, try to match
     the translated text formatting with the original. Since we only want to run fastalign once we have to temporarily
@@ -181,7 +212,8 @@ def generate_alignments(original_paragraphs_with_runs: list[list[dict[str, str]]
         translated_paragraphs: Translated text, split into paragraphs
         aligner: Object of the aligner class, uses fastalign
         temp_folder: Path to folder where to put all the intermediate files
-
+        source_lang: original language of the document
+        target_lang: target language of the translation
 
     Returns:
         list[list[dict]]: A list of tokenized sentences where each translated token contains the style of the associated
@@ -192,7 +224,7 @@ def generate_alignments(original_paragraphs_with_runs: list[list[dict[str, str]]
         os.remove(f)
 
     # tokenize the original text by sentence and words while keeping the style
-    original_tokenized_sentences_with_style = [tokenize_with_runs(runs, …
+    original_tokenized_sentences_with_style = [tokenize_with_runs(runs, source_tokenizer, source_detokenizer) for runs in
                                                original_paragraphs_with_runs]
 
     # flatten all the runs so we can align with just one call instead of one per paragraph
@@ -200,9 +232,9 @@ def generate_alignments(original_paragraphs_with_runs: list[list[dict[str, str]]
                                               sublist]
 
     # tokenize the translated text by sentence and word
-    translated_tokenized_sentences = [ …
+    translated_tokenized_sentences = [tokenize_text(sentence.text, target_tokenizer) for
                                       translated_paragraph in translated_paragraphs for sentence in
-
+                                      spacy_nlp(translated_paragraph).sents]
 
     assert len(translated_tokenized_sentences) == len(
         original_tokenized_sentences_with_style), "The original and translated texts contain a different number of sentence, likely due to a translation error"
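Note on the assert above: alignment is done with a single fastalign run over the flattened sentence lists, so the two sides have to pair up one-to-one. Sketch only (the repo's Aligner class is not shown in this diff; the helper name and file path are hypothetical): fast_align reads one sentence pair per line in "source tokens ||| target tokens" form.

```python
def write_fastalign_input(original_sentences, translated_sentences, path):
    # original_sentences: list of sentences made of styled token dicts
    # translated_sentences: list of sentences made of plain token strings
    with open(path, "w") as f:
        for orig, trans in zip(original_sentences, translated_sentences):
            src = " ".join(tok["text"] for tok in orig)
            tgt = " ".join(trans)
            f.write(f"{src} ||| {tgt}\n")
```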
@@ -329,7 +361,6 @@ def runs_to_plain_text(paragraphs_with_style: dict[str, list[dict[str, str, str]
 def translate_document(input_file: str, source_lang: str, target_lang: str,
                        translator,
                        aligner: Aligner,
-                       detokenizer,
                        temp_folder: str = "tmp",
                        tikal_folder: str = "okapi-apps_gtk2-linux-x86_64_1.47.0") -> str:
     input_filename = input_file.split("/")[-1]
@@ -340,6 +371,11 @@ def translate_document(input_file: str, source_lang: str, target_lang: str,
     original_xliff_file = os.path.join(temp_folder, input_filename + ".xlf")
     plain_text_file = doc_to_plain_text(temp_input_file, source_lang, target_lang, tikal_folder, original_xliff_file)
 
+    source_tokenizer = MosesTokenizer(lang=source_lang)
+    source_detokenizer = MosesDetokenizer(lang=source_lang)
+    target_tokenizer = MosesTokenizer(lang=target_lang)
+    target_detokenizer = MosesDetokenizer(lang=target_lang)
+
     # get paragraphs with runs
     paragraphs_with_runs = [get_runs_from_paragraph(line.strip(), idx) for idx, line in
                             enumerate(open(plain_text_file).readlines())]
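Note: the Moses tokenizer/detokenizer pairs are built per language because sacremoses applies language-specific rules (e.g. non-breaking prefixes) based on the lang code. A quick round-trip sketch; the language code and sentence are illustrative:

```python
from sacremoses import MosesTokenizer, MosesDetokenizer

mt = MosesTokenizer(lang="es")
md = MosesDetokenizer(lang="es")

tokens = mt.tokenize("Esto es, sin duda, una prueba.")
print(tokens)                 # punctuation is split off into separate tokens
print(md.detokenize(tokens))  # and reattached: 'Esto es, sin duda, una prueba.'
```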
@@ -347,21 +383,21 @@ def translate_document(input_file: str, source_lang: str, target_lang: str,
     # translate using plaintext file
     translated_paragraphs = []
     for paragraph in tqdm.tqdm(paragraphs_with_runs, desc="Translating paragraphs..."):
-        paragraph_text = …
+        paragraph_text = source_detokenizer.detokenize([run["text"] for run in paragraph])
         translated_paragraphs.append(translator.translate(paragraph_text, source_lang, target_lang))
 
     # time to align the translation with the original
     print("Generating alignments...")
     start_time = time.time()
     translated_sentences_with_style = generate_alignments(paragraphs_with_runs, translated_paragraphs, aligner,
-                                                           temp_folder, …
+                                                           temp_folder, source_tokenizer, source_detokenizer, target_tokenizer)
     print(f"Finished alignments in {time.time() - start_time} seconds")
 
     # flatten the sentences into a list of tokens
     translated_tokens_with_style = [item for sublist in translated_sentences_with_style for item in sublist]
 
     # group the tokens by style/run
-    translated_runs_with_style = group_by_style(translated_tokens_with_style, …
+    translated_runs_with_style = group_by_style(translated_tokens_with_style, target_detokenizer)
 
     # group the runs by original paragraph
     translated_paragraphs_with_style = {key: [{'id': None, 'paragraph_index': key, 'text': ""}] for key in
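Net effect on callers: `translate_document` no longer takes a `detokenizer` argument; the Moses tokenizers and detokenizers are created internally from `source_lang` and `target_lang`. A hypothetical call site after this change, where the `translator` and `aligner` objects come from elsewhere in the repo:

```python
# Hypothetical usage; argument values are illustrative.
translated_path = translate_document(
    "input/report.docx",
    source_lang="en",
    target_lang="es",
    translator=translator,  # any object with .translate(text, source_lang, target_lang)
    aligner=aligner,        # src.aligner.Aligner instance
    temp_folder="tmp",
)
```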