mjuvilla committed on
Commit 41ceab1 · 1 Parent(s): 4420a7f

changed nltk tokenizer to multilingual tokenizers

Files changed (1)
  1. src/translate_any_doc.py +55 -19
src/translate_any_doc.py CHANGED
@@ -7,14 +7,17 @@ import re
 
 from src.aligner import Aligner
 
- import nltk
 import glob
- from nltk.tokenize import sent_tokenize, word_tokenize
- import tqdm
+ from sacremoses import MosesTokenizer, MosesDetokenizer
+ import spacy
 
- nltk.download('punkt')
- nltk.download('punkt_tab')
+ import tqdm
 
+ # Load multilingual model to use as sentence tokenizer
+ spacy_nlp = spacy.load("xx_ent_wiki_sm")
+ # Add the rule-based sentencizer
+ if "sentencizer" not in spacy_nlp.pipe_names:
+     spacy_nlp.add_pipe("sentencizer")
 
 def doc_to_plain_text(input_file: str, source_lang: str, target_lang: str, tikal_folder: str,
                       original_xliff_file_path: str) -> str:
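Note on the new sentence splitter: xx_ent_wiki_sm is a multilingual NER-only model without a dependency parser, so the rule-based sentencizer added above is what produces sentence boundaries. A minimal sketch of the behaviour (assumes the model is installed, e.g. via python -m spacy download xx_ent_wiki_sm; the sample text is invented):

import spacy

spacy_nlp = spacy.load("xx_ent_wiki_sm")
if "sentencizer" not in spacy_nlp.pipe_names:
    spacy_nlp.add_pipe("sentencizer")   # punctuation-based sentence boundaries

doc = spacy_nlp("Hola, ¿qué tal? This paragraph mixes languages. C'est tout.")
print([sent.text for sent in doc.sents])
# roughly: ['Hola, ¿qué tal?', 'This paragraph mixes languages.', "C'est tout."]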
@@ -117,26 +120,53 @@ def get_runs_from_paragraph(paragraph: str, paragraph_index: int) -> list[dict[s
 
     return runs
 
- def tokenize_with_runs(runs: list[dict[str, str]], detokenizer) -> list[list[dict[str, str]]]:
+ def tokenize_text(text, tokenizer):
+     # To avoid the tokenizer destroying the url
+     def preserve_urls(text):
+         url_pattern = r'https?://[^\s\)\]\}\>]+|www\.[^\s\)\]\}\>]+'
+         # Find URLs using regex and replace them with a placeholder
+         urls = re.findall(url_pattern, text)
+         for idx, url in enumerate(urls):
+             placeholder = f"URL{idx}"
+             text = text.replace(url, placeholder)
+
+         return text, urls
+
+     # Replace URLs with placeholders
+     text, urls = preserve_urls(text)
+
+     # Tokenize using Sacremoses
+     tokens = tokenizer.tokenize(text)
+
+     # Revert placeholders back to original URLs
+     for idx, url in enumerate(urls):
+         placeholder = f"URL{idx}"
+         tokens = [token.replace(placeholder, url) for token in tokens]
+
+     return tokens
+
+ def tokenize_with_runs(runs: list[dict[str, str]], tokenizer, detokenizer) -> list[list[dict[str, str]]]:
     """
     Given a list of runs, we need to tokenize them by sentence and token while keeping the style of each token according
     to its original run
 
     Parameters:
         runs: List of runs, where each item is a chunk of text (possibly various tokens) and some style/formatting information
-         detokenizer: Detokenizer object to merge tokens back together
+         source_lang: Language of the document
 
     Returns:
         list[list[dict]]: A list of tokenized sentences where each token contains the style of its original run
     """
+
+     # it's a bit of a mess but first we get the tokenized sentences
     text_paragraph = detokenizer.detokenize([run["text"] for run in runs])
-     sentences = sent_tokenize(text_paragraph)
-     tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]
+     sentences = spacy_nlp(text_paragraph).sents
+     tokenized_sentences = [tokenize_text(sentence.text, tokenizer) for sentence in sentences]
 
+     # then we assign a run/style to each token
     tokens_with_style = []
     for run in runs:
-         tokens = word_tokenize(run["text"])
+         tokens = tokenize_text(run["text"], tokenizer)
         if tokens:
             for token in tokens:
                 tokens_with_style.append(run.copy())
@@ -144,6 +174,7 @@ def tokenize_with_runs(runs: list[dict[str, str]], detokenizer) -> list[list[dic
         else:
             tokens_with_style.append(run.copy())
 
+     # and finally we combine both things, where each token of each sentence is assigned a run/style
     token_index = 0
     tokenized_sentences_with_style = []
     for sentence in tokenized_sentences:
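For readers following along: the lines after this hunk are unchanged and therefore not shown; they consume tokens_with_style in order so that every token of every sentence gets its run. A rough sketch of that matching step (not the committed code, just an illustration of the idea):

# Sketch: pair each sentence token with the next unconsumed entry of
# tokens_with_style, which preserves the original run order.
token_index = 0
tokenized_sentences_with_style = []
for sentence in tokenized_sentences:
    sentence_with_style = []
    for token in sentence:
        token_with_style = tokens_with_style[token_index].copy()
        token_with_style["text"] = token
        sentence_with_style.append(token_with_style)
        token_index += 1
    tokenized_sentences_with_style.append(sentence_with_style)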
@@ -169,7 +200,7 @@ def tokenize_with_runs(runs: list[dict[str, str]], detokenizer) -> list[list[dic
 
 def generate_alignments(original_paragraphs_with_runs: list[list[dict[str, str]]],
                         translated_paragraphs: list[str], aligner, temp_folder: str,
-                         detokenizer) -> list[list[dict[str, str]]]:
+                         source_tokenizer, source_detokenizer, target_tokenizer) -> list[list[dict[str, str]]]:
     """
     Given some original paragraphs with style and formatting and its translation without formatting, try to match
     the translated text formatting with the original. Since we only want to run fastalign once we have to temporarily
@@ -181,7 +212,8 @@ def generate_alignments(original_paragraphs_with_runs: list[list[dict[str, str]]
         translated_paragraphs: Translated text, split into paragraphs
         aligner: Object of the aligner class, uses fastalign
         temp_folder: Path to folder where to put all the intermediate files
-         detokenizer: Detokenizer object to merge tokens back together
+         source_lang: original language of the document
+         target_lang: target language of the translation
 
     Returns:
         list[list[dict]]: A list of tokenized sentences where each translated token contains the style of the associated
@@ -192,7 +224,7 @@ def generate_alignments(original_paragraphs_with_runs: list[list[dict[str, str]]
         os.remove(f)
 
     # tokenize the original text by sentence and words while keeping the style
-     original_tokenized_sentences_with_style = [tokenize_with_runs(runs, detokenizer) for runs in
+     original_tokenized_sentences_with_style = [tokenize_with_runs(runs, source_tokenizer, source_detokenizer) for runs in
                                                original_paragraphs_with_runs]
 
     # flatten all the runs so we can align with just one call instead of one per paragraph
@@ -200,9 +232,9 @@ def generate_alignments(original_paragraphs_with_runs: list[list[dict[str, str]]
                                               sublist]
 
     # tokenize the translated text by sentence and word
-     translated_tokenized_sentences = [word_tokenize(sentence) for
+     translated_tokenized_sentences = [tokenize_text(sentence.text, target_tokenizer) for
                                       translated_paragraph in translated_paragraphs for sentence in
-                                       sent_tokenize(translated_paragraph)]
+                                       spacy_nlp(translated_paragraph).sents]
 
     assert len(translated_tokenized_sentences) == len(
         original_tokenized_sentences_with_style), "The original and translated texts contain a different number of sentence, likely due to a translation error"
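A small, made-up illustration of the flattening above and of the invariant the assert relies on (both sides must yield the same total number of sentences):

translated_paragraphs = ["Primera frase. Segunda frase.", "Tercera frase."]   # invented sample
translated_tokenized_sentences = [tokenize_text(sentence.text, target_tokenizer) for
                                  translated_paragraph in translated_paragraphs for sentence in
                                  spacy_nlp(translated_paragraph).sents]
# -> 3 token lists, e.g. [['Primera', 'frase', '.'], ['Segunda', 'frase', '.'], ['Tercera', 'frase', '.']]
# The assert then requires the source document to have produced exactly 3 sentences as well.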
@@ -329,7 +361,6 @@ def runs_to_plain_text(paragraphs_with_style: dict[str, list[dict[str, str, str]
 def translate_document(input_file: str, source_lang: str, target_lang: str,
                        translator,
                        aligner: Aligner,
-                        detokenizer,
                        temp_folder: str = "tmp",
                        tikal_folder: str = "okapi-apps_gtk2-linux-x86_64_1.47.0") -> str:
     input_filename = input_file.split("/")[-1]
@@ -340,6 +371,11 @@ def translate_document(input_file: str, source_lang: str, target_lang: str,
     original_xliff_file = os.path.join(temp_folder, input_filename + ".xlf")
     plain_text_file = doc_to_plain_text(temp_input_file, source_lang, target_lang, tikal_folder, original_xliff_file)
 
+     source_tokenizer = MosesTokenizer(lang=source_lang)
+     source_detokenizer = MosesDetokenizer(lang=source_lang)
+     target_tokenizer = MosesTokenizer(lang=target_lang)
+     target_detokenizer = MosesDetokenizer(lang=target_lang)
+
     # get paragraphs with runs
     paragraphs_with_runs = [get_runs_from_paragraph(line.strip(), idx) for idx, line in
                             enumerate(open(plain_text_file).readlines())]
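For reference, a minimal sketch of how these sacremoses objects pair up (hypothetical language code and text):

from sacremoses import MosesTokenizer, MosesDetokenizer

tokenizer = MosesTokenizer(lang="en")
detokenizer = MosesDetokenizer(lang="en")

tokens = tokenizer.tokenize("Styled runs don't survive translation, alignments do.")
text = detokenizer.detokenize(tokens)
# detokenize(tokenize(text)) should give back approximately the original string,
# which is why the matching language's detokenizer is used to rebuild paragraphs below.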
@@ -347,21 +383,21 @@ def translate_document(input_file: str, source_lang: str, target_lang: str,
     # translate using plaintext file
     translated_paragraphs = []
     for paragraph in tqdm.tqdm(paragraphs_with_runs, desc="Translating paragraphs..."):
-         paragraph_text = detokenizer.detokenize([run["text"] for run in paragraph])
+         paragraph_text = source_detokenizer.detokenize([run["text"] for run in paragraph])
         translated_paragraphs.append(translator.translate(paragraph_text, source_lang, target_lang))
 
     # time to align the translation with the original
     print("Generating alignments...")
     start_time = time.time()
     translated_sentences_with_style = generate_alignments(paragraphs_with_runs, translated_paragraphs, aligner,
-                                                           temp_folder, detokenizer)
+                                                           temp_folder, source_tokenizer, source_detokenizer, target_tokenizer)
     print(f"Finished alignments in {time.time() - start_time} seconds")
 
     # flatten the sentences into a list of tokens
     translated_tokens_with_style = [item for sublist in translated_sentences_with_style for item in sublist]
 
     # group the tokens by style/run
-     translated_runs_with_style = group_by_style(translated_tokens_with_style, detokenizer)
+     translated_runs_with_style = group_by_style(translated_tokens_with_style, target_detokenizer)
 
     # group the runs by original paragraph
     translated_paragraphs_with_style = {key: [{'id': None, 'paragraph_index': key, 'text': ""}] for key in
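One reason the commit threads a target-language detokenizer into group_by_style instead of reusing a single detokenizer: detokenization rules are language-specific. A small self-contained illustration with an invented French sample (exact output may vary by sacremoses version):

from sacremoses import MosesDetokenizer

tokens = ["c'", "est", "bon", "."]   # hypothetical target-language tokens
print(MosesDetokenizer(lang="fr").detokenize(tokens))   # French rules should rejoin the apostrophe: "c'est bon."
print(MosesDetokenizer(lang="en").detokenize(tokens))   # English rules may leave a stray space after "c'"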
 