mjuvilla committed on
Commit 186c0af · 1 Parent(s): 41ceab1

Fixed a lot of errors; the script should now crash much less often.

Biggest change: sentences are now translated one by one instead of a whole paragraph at a time. This way there is no need to align sentences afterwards, and we know exactly which sentence is the translation of which. That matters when the translation is bad: the source and target texts end up with a different number of sentences and everything crashes.
I also changed the tokenizer and detokenizer to use spaCy's.
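To illustrate the new per-sentence flow, here is a minimal sketch, assuming the same multilingual spaCy pipeline the script loads (xx_ent_wiki_sm with a sentencizer); translate_sentence is a stand-in for the actual translator call:

import spacy

spacy_nlp = spacy.load("xx_ent_wiki_sm")
if "sentencizer" not in spacy_nlp.pipe_names:
    spacy_nlp.add_pipe("sentencizer")

def translate_paragraph(paragraph: str, translate_sentence) -> list[str]:
    # split the paragraph into sentences and translate each one on its own,
    # so every source sentence maps to exactly one translated sentence
    sentences = [sent.text.strip() for sent in spacy_nlp(paragraph).sents if sent.text.strip()]
    return [translate_sentence(sentence) for sentence in sentences]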

requirements.txt CHANGED
@@ -1,9 +1,8 @@
- nltk~=3.9.1
- python-docx~=1.1.2
  iso-639~=0.4.5
  protobuf~=6.30.2
  requests~=2.32.3
  tqdm~=4.67.1
  gradio~=5.25.1
  gradio_client~=1.8.0
- setuptools~=80.0.0
+ setuptools~=80.0.0
+ spacy~=3.8.6
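The new spacy dependency also needs the multilingual model the script loads (xx_ent_wiki_sm); assuming it is fetched with spaCy's own downloader, something like:

python -m spacy download xx_ent_wiki_sm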
src/salamandraTA7b_translator.py CHANGED
@@ -1,11 +1,15 @@
  from gradio_client import Client
  from iso639 import languages
  
+ 
  class SalamandraTA7bTranslator:
      def __init__(self, hf_token):
          self.client = Client("BSC-LT/SalamandraTA-7B-Demo", hf_token=hf_token)
  
      def translate(self, text, source_lang, target_lang):
+         if not text:
+             return ""
+ 
          # we assume that they are specifying the language by code so we need to convert it to name
          lang1 = languages.get(alpha2=source_lang).name
          lang2 = languages.get(alpha2=target_lang).name
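A quick usage sketch of the class after this change (the token and texts below are placeholders; an empty string now short-circuits instead of hitting the remote demo):

translator = SalamandraTA7bTranslator(hf_token="hf_...")  # placeholder token
print(translator.translate("Hola mundo", "es", "en"))  # goes through the SalamandraTA-7B demo Space
print(translator.translate("", "es", "en"))  # returns "" thanks to the new empty-text guard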
src/translate_any_doc.py CHANGED
@@ -1,4 +1,5 @@
1
  import shutil
 
2
  import time
3
  import os
4
  from itertools import groupby
@@ -8,8 +9,8 @@ import re
8
  from src.aligner import Aligner
9
 
10
  import glob
11
- from sacremoses import MosesTokenizer, MosesDetokenizer
12
  import spacy
 
13
 
14
  import tqdm
15
 
@@ -19,6 +20,7 @@ spacy_nlp = spacy.load("xx_ent_wiki_sm")
19
  if "sentencizer" not in spacy_nlp.pipe_names:
20
  spacy_nlp.add_pipe("sentencizer")
21
 
 
22
  def doc_to_plain_text(input_file: str, source_lang: str, target_lang: str, tikal_folder: str,
23
  original_xliff_file_path: str) -> str:
24
  """
@@ -120,6 +122,7 @@ def get_runs_from_paragraph(paragraph: str, paragraph_index: int) -> list[dict[s
120
 
121
  return runs
122
 
 
123
  def tokenize_text(text, tokenizer):
124
  # To avoid the tokenizer destroying the url
125
  def preserve_urls(text):
@@ -145,7 +148,8 @@ def tokenize_text(text, tokenizer):
145
 
146
  return tokens
147
 
148
- def tokenize_with_runs(runs: list[dict[str, str]], tokenizer, detokenizer) -> list[list[dict[str, str]]]:
 
149
  """
150
  Given a list of runs, we need to tokenize them by sentence and token while keeping the style of each token according
151
  to its original run
@@ -159,57 +163,80 @@ def tokenize_with_runs(runs: list[dict[str, str]], tokenizer, detokenizer) -> li
159
  """
160
 
161
  # it's a bit of a mess but first we get the tokenized sentences
162
- text_paragraph = detokenizer.detokenize([run["text"] for run in runs])
163
- sentences = spacy_nlp(text_paragraph).sents
164
- tokenized_sentences = [tokenize_text(sentence.text, tokenizer) for sentence in sentences]
165
 
166
- # then we assign a run/style to each token
167
- tokens_with_style = []
168
  for run in runs:
169
- tokens = tokenize_text(run["text"], tokenizer)
170
- if tokens:
171
- for token in tokens:
172
- tokens_with_style.append(run.copy())
173
- tokens_with_style[-1]["text"] = token
174
- else:
175
- tokens_with_style.append(run.copy())
176
-
177
- # and finally we combine both things, where each token of each sentence is assigned a run/style
178
- token_index = 0
179
- tokenized_sentences_with_style = []
180
- for sentence in tokenized_sentences:
181
- sentence_with_style = []
182
- for word in sentence:
183
- if word == tokens_with_style[token_index]["text"]:
184
- sentence_with_style.append(tokens_with_style[token_index])
185
- token_index += 1
186
  else:
187
- if word.startswith(tokens_with_style[token_index]["text"]):
188
- # this token might be split into several runs
189
- word_left = word
190
-
191
- while word_left:
192
- sentence_with_style.append(tokens_with_style[token_index])
193
- word_left = word_left.removeprefix(tokens_with_style[token_index]["text"])
194
- token_index += 1
195
- else:
196
- raise "Something unexpected happened I'm afraid"
197
  tokenized_sentences_with_style.append(sentence_with_style)
198
- return tokenized_sentences_with_style
 
 
199
 
200
 
201
- def generate_alignments(original_paragraphs_with_runs: list[list[dict[str, str]]],
202
- translated_paragraphs: list[str], aligner, temp_folder: str,
203
- source_tokenizer, source_detokenizer, target_tokenizer) -> list[list[dict[str, str]]]:
204
  """
205
- Given some original paragraphs with style and formatting and its translation without formatting, try to match
206
  the translated text formatting with the original. Since we only want to run fastalign once we have to temporarily
207
  forget about paragraphs and work only in sentences, so the output is a list of sentences but with information about
208
  from which paragraph that sentence came from
209
 
210
  Parameters:
211
- original_paragraphs_with_runs: Original text split into paragraphs and runs
212
- translated_paragraphs: Translated text, split into paragraphs
213
  aligner: Object of the aligner class, uses fastalign
214
  temp_folder: Path to folder where to put all the intermediate files
215
  source_lang: original language of the document
@@ -223,21 +250,17 @@ def generate_alignments(original_paragraphs_with_runs: list[list[dict[str, str]]
223
  for f in glob.glob(os.path.join(temp_folder, "*align*")):
224
  os.remove(f)
225
 
226
- # tokenize the original text by sentence and words while keeping the style
227
- original_tokenized_sentences_with_style = [tokenize_with_runs(runs, source_tokenizer, source_detokenizer) for runs in
228
- original_paragraphs_with_runs]
229
-
230
- # flatten all the runs so we can align with just one call instead of one per paragraph
231
- original_tokenized_sentences_with_style = [item for sublist in original_tokenized_sentences_with_style for item in
232
- sublist]
233
-
234
  # tokenize the translated text by sentence and word
235
- translated_tokenized_sentences = [tokenize_text(sentence.text, target_tokenizer) for
236
- translated_paragraph in translated_paragraphs for sentence in
237
- spacy_nlp(translated_paragraph).sents]
238
 
239
  assert len(translated_tokenized_sentences) == len(
240
- original_tokenized_sentences_with_style), "The original and translated texts contain a different number of sentence, likely due to a translation error"
241
 
242
  original_sentences = []
243
  translated_sentences = []
@@ -272,27 +295,28 @@ def generate_alignments(original_paragraphs_with_runs: list[list[dict[str, str]]
272
 
273
  translated_sentences_with_style.append(translated_sentence_with_style)
274
 
275
- return translated_sentences_with_style
276
 
277
 
278
- def group_by_style(tokens: list[dict[str, str]], detokenizer) -> list[dict[str, str]]:
279
  """
280
  To avoid having issues in the future, we group the contiguous tokens that have the same style. Basically, we
281
  reconstruct the runs.
282
 
283
  Parameters:
284
  tokens: Tokens with style information
285
- detokenizer: Detokenizer object to merge tokens back together
286
 
287
  Returns:
288
  list[dict]: A list of translated runs with format and style
289
  """
290
  groups = []
291
- for key, group in groupby(tokens, key=lambda x: (x["id"], x["paragraph_index"])):
292
- text = detokenizer.detokenize([item['text'] for item in group])
 
 
 
293
 
294
- if groups and not text.startswith((",", ";", ":", ".", ")", "!", "?")):
295
- text = " " + text
296
 
297
  groups.append({"text": text,
298
  "id": key[0],
@@ -300,7 +324,7 @@ def group_by_style(tokens: list[dict[str, str]], detokenizer) -> list[dict[str,
300
  return groups
301
 
302
 
303
- def runs_to_plain_text(paragraphs_with_style: dict[str, list[dict[str, str, str]]], out_file_path: str):
304
  """
305
  Generate a plain text file restoring the original tag structure like <g id=1> </g>
306
 
@@ -310,59 +334,39 @@ def runs_to_plain_text(paragraphs_with_style: dict[str, list[dict[str, str, str]
310
  """
311
  with open(out_file_path, "w") as out_file:
312
 
313
- current_stack = []
314
-
315
- def close_tags(to_close):
316
- return ''.join(f'</g>' for _ in to_close)
 
 
317
 
318
- def open_tags(to_open):
319
  tag = ""
320
- for gid in to_open:
321
  tag_type, tag_id = gid.split("_")
322
  tag += f'<{tag_type} id="{tag_id}">'
323
  return tag
324
 
325
  for key, paragraph in paragraphs_with_style.items():
326
- output = []
327
  for run in paragraph:
328
  ids = list(run["id"]) if run["id"] else []
329
 
330
- # Find the point where current and new IDs diverge
331
- common_prefix_len = 0
332
- for a, b in zip(current_stack, ids):
333
- if a == b:
334
- common_prefix_len += 1
335
- else:
336
- break
337
-
338
- # Close tags not in the new stack
339
- to_close = current_stack[common_prefix_len:]
340
- if to_close:
341
- output.append(close_tags(to_close))
342
-
343
- # Open new tags
344
- to_open = ids[common_prefix_len:]
345
- if to_open:
346
- output.append(open_tags(to_open))
347
-
348
- # Add text
349
- output.append(run["text"])
350
-
351
- # Update the stack
352
- current_stack = ids
353
 
354
- # Close any remaining open tags
355
- if current_stack:
356
- output.append(close_tags(current_stack))
357
 
358
- out_file.write("".join(output) + "\n")
359
 
360
 
361
  def translate_document(input_file: str, source_lang: str, target_lang: str,
362
  translator,
363
  aligner: Aligner,
364
  temp_folder: str = "tmp",
365
- tikal_folder: str = "okapi-apps_gtk2-linux-x86_64_1.47.0") -> str:
366
  input_filename = input_file.split("/")[-1]
367
  # copy the original file to the temporal folder to avoid common issues with tikal
368
  temp_input_file = os.path.join(temp_folder, input_filename)
@@ -371,33 +375,53 @@ def translate_document(input_file: str, source_lang: str, target_lang: str,
371
  original_xliff_file = os.path.join(temp_folder, input_filename + ".xlf")
372
  plain_text_file = doc_to_plain_text(temp_input_file, source_lang, target_lang, tikal_folder, original_xliff_file)
373
 
374
- source_tokenizer = MosesTokenizer(lang=source_lang)
375
- source_detokenizer = MosesDetokenizer(lang=source_lang)
376
- target_tokenizer = MosesTokenizer(lang=target_lang)
377
- target_detokenizer = MosesDetokenizer(lang=target_lang)
378
-
379
  # get paragraphs with runs
380
  paragraphs_with_runs = [get_runs_from_paragraph(line.strip(), idx) for idx, line in
381
  enumerate(open(plain_text_file).readlines())]
382
 
383
  # translate using plaintext file
384
- translated_paragraphs = []
385
- for paragraph in tqdm.tqdm(paragraphs_with_runs, desc="Translating paragraphs..."):
386
- paragraph_text = source_detokenizer.detokenize([run["text"] for run in paragraph])
387
- translated_paragraphs.append(translator.translate(paragraph_text, source_lang, target_lang))
388
 
389
  # time to align the translation with the original
390
  print("Generating alignments...")
391
  start_time = time.time()
392
- translated_sentences_with_style = generate_alignments(paragraphs_with_runs, translated_paragraphs, aligner,
393
- temp_folder, source_tokenizer, source_detokenizer, target_tokenizer)
 
 
394
  print(f"Finished alignments in {time.time() - start_time} seconds")
395
 
 
 
 
 
 
 
 
396
  # flatten the sentences into a list of tokens
397
  translated_tokens_with_style = [item for sublist in translated_sentences_with_style for item in sublist]
 
398
 
399
  # group the tokens by style/run
400
- translated_runs_with_style = group_by_style(translated_tokens_with_style, target_detokenizer)
401
 
402
  # group the runs by original paragraph
403
  translated_paragraphs_with_style = {key: [{'id': None, 'paragraph_index': key, 'text': ""}] for key in
@@ -424,6 +448,13 @@ def translate_document(input_file: str, source_lang: str, target_lang: str,
424
  "-noalttrans", "-to", original_xliff_file]
425
  Popen(tikal_moses_to_xliff_command).wait()
426
 
 
 
 
 
 
 
 
427
  # merge into a docx again
428
  tikal_merge_doc_command = [os.path.join(tikal_folder, "tikal.sh"), "-m", original_xliff_file]
429
  final_process = Popen(tikal_merge_doc_command, stdout=PIPE, stderr=PIPE)
 
1
  import shutil
2
+ import string
3
  import time
4
  import os
5
  from itertools import groupby
 
9
  from src.aligner import Aligner
10
 
11
  import glob
 
12
  import spacy
13
+ from spacy.tokens import Doc
14
 
15
  import tqdm
16
 
 
20
  if "sentencizer" not in spacy_nlp.pipe_names:
21
  spacy_nlp.add_pipe("sentencizer")
22
 
23
+
24
  def doc_to_plain_text(input_file: str, source_lang: str, target_lang: str, tikal_folder: str,
25
  original_xliff_file_path: str) -> str:
26
  """
 
122
 
123
  return runs
124
 
125
+
126
  def tokenize_text(text, tokenizer):
127
  # To avoid the tokenizer destroying the url
128
  def preserve_urls(text):
 
148
 
149
  return tokens
150
 
151
+
152
+ def tokenize_with_runs(runs: list[dict[str, str]]) -> tuple[list[list[dict[str, str]]], list[list[bool]]]:
153
  """
154
  Given a list of runs, we need to tokenize them by sentence and token while keeping the style of each token according
155
  to its original run
 
163
  """
164
 
165
  # it's a bit of a mess but first we get the tokenized sentences
166
+ # join runs and send through spacy to split into clean tokens
167
+ doc_from_runs = spacy_nlp("".join([run["text"] for run in runs]).strip())
 
168
 
169
+ # extract sentences and tokenize each into words
170
+ tokenized_sentences = [[token.text.strip() for token in sent if token.text.strip()] for sent in doc_from_runs.sents]
171
+ tokenized_sentences_spaces = [[token.whitespace_ != '' for token in sent if token.text.strip()] for sent in
172
+ doc_from_runs.sents]
173
+
174
+ flat_tokens = [token for sentence in tokenized_sentences for token in sentence]
175
+ flat_spaces = [token for sentence in tokenized_sentences_spaces for token in sentence]
176
+
177
+ flat_tokens_with_style = []
178
+ flat_spaces_with_style = []
179
+ token_idx = 0
180
  for run in runs:
181
+ run["text"] = run["text"].strip()
182
+ while run["text"]:
183
+ if run["text"].startswith(flat_tokens[token_idx]):
184
+ run["text"] = run["text"][len(flat_tokens[token_idx]):]
185
+ if flat_spaces[token_idx]:
186
+ run["text"] = run["text"].lstrip()
187
+ item = run.copy()
188
+ item["text"] = flat_tokens[token_idx]
189
+ flat_tokens_with_style.append(item)
190
+ flat_spaces_with_style.append(flat_spaces[token_idx])
191
+ token_idx += 1
192
+ elif flat_tokens[token_idx].startswith(run["text"]):
193
+ subtoken = flat_tokens[token_idx][:len(run["text"])]
194
+ item = run.copy()
195
+ item["text"] = subtoken
196
+ flat_tokens_with_style.append(item)
197
+ flat_spaces_with_style.append(False)
198
+ flat_tokens[token_idx] = flat_tokens[token_idx][len(run["text"]):]
199
+ run["text"] = run["text"][len(subtoken):]
200
+
201
+ # reconstruct the sentences
202
+ token_idx = 0
203
+ tokenized_sentences_with_style, tokenized_sentences_spaces_with_style = [], []
204
+ for sentence, sentence_spaces in zip(tokenized_sentences, tokenized_sentences_spaces):
205
+ sentence_with_style, sentence_spaces_with_style = [], []
206
+ for token in sentence:
207
+ if token == flat_tokens_with_style[token_idx]["text"]:
208
+ sentence_with_style.append(flat_tokens_with_style[token_idx])
209
+ sentence_spaces_with_style.append(flat_spaces_with_style[token_idx])
210
+ token_idx += 1
211
+ elif token.startswith(flat_tokens_with_style[token_idx]["text"]):
212
+ while token:
213
+ token = token[len(flat_tokens_with_style[token_idx]["text"]):]
214
+ sentence_with_style.append(flat_tokens_with_style[token_idx])
215
+ sentence_spaces_with_style.append(flat_spaces_with_style[token_idx])
216
+ token_idx += 1
217
  else:
218
+ print(token)
219
+ print(sentence)
220
+ print(token_idx)
221
+ print(flat_tokens_with_style)
222
+ raise Exception(f"Something unexpected happened")
 
 
 
 
 
223
  tokenized_sentences_with_style.append(sentence_with_style)
224
+ tokenized_sentences_spaces_with_style.append(sentence_spaces_with_style)
225
+
226
+ return tokenized_sentences_with_style, tokenized_sentences_spaces_with_style
227
 
228
 
229
+ def generate_alignments(original_tokenized_sentences_with_style: list[list[dict[str, str]]],
230
+ translated_sentences: list[str], aligner, temp_folder: str):
 
231
  """
232
+ Given some original sentences with style and formatting and their translations without formatting, try to match
233
  the translated text formatting with the original. Since we only want to run fastalign once we have to temporarily
234
  forget about paragraphs and work only in sentences, so the output is a list of sentences but with information about
235
  from which paragraph that sentence came from
236
 
237
  Parameters:
238
+ original_tokenized_sentences_with_style: Original text split into sentences with style information
239
+ translated_sentences: Translated text, split into sentences
240
  aligner: Object of the aligner class, uses fastalign
241
  temp_folder: Path to folder where to put all the intermediate files
242
  source_lang: original language of the document
 
250
  for f in glob.glob(os.path.join(temp_folder, "*align*")):
251
  os.remove(f)
252
 
 
 
 
 
 
 
 
 
253
  # tokenize the translated text by sentence and word
254
+ translated_tokenized_sentences = []
255
+ # keep spacing information to detokenize properly later
256
+ translated_tokenized_sentences_spaces = []
257
+ for sentence in translated_sentences:
258
+ tokens = spacy_nlp(sentence)
259
+ translated_tokenized_sentences_spaces.append([token.whitespace_ != '' for token in tokens])
260
+ translated_tokenized_sentences.append([token.text for token in tokens])
261
 
262
  assert len(translated_tokenized_sentences) == len(
263
+ original_tokenized_sentences_with_style), "The original and translated texts contain a different number of sentences, likely due to a translation error"
264
 
265
  original_sentences = []
266
  translated_sentences = []
 
295
 
296
  translated_sentences_with_style.append(translated_sentence_with_style)
297
 
298
+ return translated_sentences_with_style, translated_tokenized_sentences_spaces
299
 
300
 
301
+ def group_by_style(tokens: list[dict[str, str]], spaces: list[bool]) -> list[dict[str, str]]:
302
  """
303
  To avoid having issues in the future, we group the contiguous tokens that have the same style. Basically, we
304
  reconstruct the runs.
305
 
306
  Parameters:
307
  tokens: Tokens with style information
 
308
 
309
  Returns:
310
  list[dict]: A list of translated runs with format and style
311
  """
312
  groups = []
313
+ zipped = zip(tokens, spaces)
314
+ for key, group in groupby(zipped, key=lambda x: (x[0]["id"], x[0]["paragraph_index"])):
315
+ group = list(group)
316
+ tokens = [item[0]['text'] for item in group]
317
+ spaces = [item[1] for item in group]
318
 
319
+ text = Doc(spacy_nlp.vocab, words=tokens, spaces=spaces).text
 
320
 
321
  groups.append({"text": text,
322
  "id": key[0],
 
324
  return groups
325
 
326
 
327
+ def runs_to_plain_text(paragraphs_with_style: dict[int, list[dict[str, str, str]]], out_file_path: str):
328
  """
329
  Generate a plain text file restoring the original tag structure like <g id=1> </g>
330
 
 
334
  """
335
  with open(out_file_path, "w") as out_file:
336
 
337
+ def close_tags(ids):
338
+ tag = ""
339
+ for gid in ids:
340
+ tag_type, tag_id = gid.split("_")
341
+ tag += f'</{tag_type}>'
342
+ return tag
343
 
344
+ def open_tags(ids):
345
  tag = ""
346
+ for gid in ids:
347
  tag_type, tag_id = gid.split("_")
348
  tag += f'<{tag_type} id="{tag_id}">'
349
  return tag
350
 
351
  for key, paragraph in paragraphs_with_style.items():
 
352
  for run in paragraph:
353
  ids = list(run["id"]) if run["id"] else []
354
 
355
+ if ids:
356
+ output = open_tags(ids) + run["text"] + close_tags(ids)
357
+ out_file.write(output)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
358
 
359
+ else:
360
+ out_file.write("".join(run["text"]))
 
361
 
362
+ out_file.write("\n")
363
 
364
 
365
  def translate_document(input_file: str, source_lang: str, target_lang: str,
366
  translator,
367
  aligner: Aligner,
368
  temp_folder: str = "tmp",
369
+ tikal_folder: str = "okapi-apps_gtk2-linux-x86_64_1.47.0", with_format: bool = True) -> str:
370
  input_filename = input_file.split("/")[-1]
371
  # copy the original file to the temporal folder to avoid common issues with tikal
372
  temp_input_file = os.path.join(temp_folder, input_filename)
 
375
  original_xliff_file = os.path.join(temp_folder, input_filename + ".xlf")
376
  plain_text_file = doc_to_plain_text(temp_input_file, source_lang, target_lang, tikal_folder, original_xliff_file)
377
 
 
 
 
 
 
378
  # get paragraphs with runs
379
  paragraphs_with_runs = [get_runs_from_paragraph(line.strip(), idx) for idx, line in
380
  enumerate(open(plain_text_file).readlines())]
381
 
382
  # translate using plaintext file
383
+ original_tokenized_sentences_with_style = []
384
+ original_spacing = []
385
+ for run in paragraphs_with_runs:
386
+ tokens, spaces = tokenize_with_runs(run)
387
+ original_tokenized_sentences_with_style += tokens
388
+ original_spacing += spaces
389
+
390
+ translated_sentences = []
391
+ for sentence, spacing in tqdm.tqdm(zip(original_tokenized_sentences_with_style, original_spacing),
392
+ desc="Translating paragraphs...",
393
+ total=len(original_tokenized_sentences_with_style)):
394
+ text = Doc(spacy_nlp.vocab, words=[token["text"] for token in sentence], spaces=spacing).text
395
+
396
+ while True:
397
+ try:
398
+ translated_sentences.append(translator.translate(text, source_lang, target_lang))
399
+ break
400
+ except:
401
+ continue
402
 
403
  # time to align the translation with the original
404
  print("Generating alignments...")
405
  start_time = time.time()
406
+ translated_sentences_with_style, translated_sentences_spacing = generate_alignments(
407
+ original_tokenized_sentences_with_style,
408
+ translated_sentences, aligner,
409
+ temp_folder)
410
  print(f"Finished alignments in {time.time() - start_time} seconds")
411
 
412
+ # since we tokenized these sentences independently, the spacing information does not contain spaces after punctuation
413
+ # at the end of the sentence (there's no space at the end of a sentence that ends with ".", unless there's a sentence
414
+ # right after)
415
+ for sentence, sentence_spaces in zip(translated_sentences_with_style, translated_sentences_spacing):
416
+ if sentence[-1]["text"] in string.punctuation:
417
+ sentence_spaces[-1] = True
418
+
419
  # flatten the sentences into a list of tokens
420
  translated_tokens_with_style = [item for sublist in translated_sentences_with_style for item in sublist]
421
+ tokens_spaces = [item for sublist in translated_sentences_spacing for item in sublist]
422
 
423
  # group the tokens by style/run
424
+ translated_runs_with_style = group_by_style(translated_tokens_with_style, tokens_spaces)
425
 
426
  # group the runs by original paragraph
427
  translated_paragraphs_with_style = {key: [{'id': None, 'paragraph_index': key, 'text': ""}] for key in
 
448
  "-noalttrans", "-to", original_xliff_file]
449
  Popen(tikal_moses_to_xliff_command).wait()
450
 
451
+ # any tags that are still <g> were not paired between the original and translated texts by tikal, so we remove
452
+ # them. This may happen if a word in the original language has been split into more than one word with other
453
+ # words in between, or because of an error in fastalign
454
+ text = open(original_xliff_file).read()
455
+ result = re.sub(r'<g id="\d+">(.*?)</g>', r'\1', text)
456
+ open(original_xliff_file, "w").write(result)
457
+
458
  # merge into a docx again
459
  tikal_merge_doc_command = [os.path.join(tikal_folder, "tikal.sh"), "-m", original_xliff_file]
460
  final_process = Popen(tikal_merge_doc_command, stdout=PIPE, stderr=PIPE)
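For reference, a small self-contained sketch of two tricks the new translate_any_doc.py code relies on: rebuilding surface text from tokens plus their whitespace flags with spacy.tokens.Doc, and stripping <g> tags that tikal could not pair (the example strings are made up):

import re
import spacy
from spacy.tokens import Doc

nlp = spacy.blank("xx")  # any pipeline would do here; we only need its vocab

# detokenize: rebuild text from tokens and "is there a space after this token?" flags
words = ["Hello", ",", "world", "!"]
spaces = [False, True, False, False]
print(Doc(nlp.vocab, words=words, spaces=spaces).text)  # Hello, world!

# cleanup: drop unpaired <g id="..."> wrappers but keep the text they contain
fragment = 'plain <g id="1">styled</g> text'
print(re.sub(r'<g id="\d+">(.*?)</g>', r'\1', fragment))  # plain styled text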