Fixed a lot of errors; the script should now crash much less often.

Biggest change: sentences are now translated one by one instead of a whole paragraph at a time. This way there is no need to align sentences afterwards, and we know exactly which sentence is the translation of which. That matters when the translation is bad: the source and target texts end up with a different number of sentences and everything crashes.

I also changed the tokenizer and detokenizer to use spaCy's.
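As a rough illustration of the new per-sentence flow (not the exact code in the diff below; `translate_paragraph` is a hypothetical helper, `translator` stands for the SalamandraTA7bTranslator wrapper, and the pipeline setup mirrors the one loaded in translate_any_doc.py):

```python
import spacy

# blank multilingual pipeline with a rule-based sentencizer, as in translate_any_doc.py
spacy_nlp = spacy.load("xx_ent_wiki_sm")
if "sentencizer" not in spacy_nlp.pipe_names:
    spacy_nlp.add_pipe("sentencizer")


def translate_paragraph(paragraph: str, translator, source_lang: str, target_lang: str) -> str:
    # one request per sentence keeps source and target sentences paired one-to-one
    translated = [translator.translate(sent.text, source_lang, target_lang)
                  for sent in spacy_nlp(paragraph).sents]
    return " ".join(translated)
```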
- requirements.txt +2 -3
- src/salamandraTA7b_translator.py +4 -0
- src/translate_any_doc.py +139 -108
requirements.txt
CHANGED
@@ -1,9 +1,8 @@
-nltk~=3.9.1
-python-docx~=1.1.2
 iso-639~=0.4.5
 protobuf~=6.30.2
 requests~=2.32.3
 tqdm~=4.67.1
 gradio~=5.25.1
 gradio_client~=1.8.0
-setuptools~=80.0.0
+setuptools~=80.0.0
+spacy~=3.8.6
src/salamandraTA7b_translator.py
CHANGED
@@ -1,11 +1,15 @@
 from gradio_client import Client
 from iso639 import languages
 
+
 class SalamandraTA7bTranslator:
     def __init__(self, hf_token):
         self.client = Client("BSC-LT/SalamandraTA-7B-Demo", hf_token=hf_token)
 
     def translate(self, text, source_lang, target_lang):
+        if not text:
+            return ""
+
         # we assume that they are specifying the language by code so we need to convert it to name
         lang1 = languages.get(alpha2=source_lang).name
         lang2 = languages.get(alpha2=target_lang).name
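For context, a hedged usage sketch of this wrapper: only the constructor and `translate` signatures come from the file above; the import path and the token value are assumptions.

```python
from src.salamandraTA7b_translator import SalamandraTA7bTranslator

# "hf_..." is a placeholder for a real Hugging Face access token
translator = SalamandraTA7bTranslator(hf_token="hf_...")

# empty input now short-circuits to "" instead of hitting the remote Space
assert translator.translate("", "en", "ca") == ""

print(translator.translate("Good morning.", "en", "ca"))
```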
src/translate_any_doc.py
CHANGED
@@ -1,4 +1,5 @@
 import shutil
+import string
 import time
 import os
 from itertools import groupby
@@ -8,8 +9,8 @@ import re
 from src.aligner import Aligner
 
 import glob
-from sacremoses import MosesTokenizer, MosesDetokenizer
 import spacy
+from spacy.tokens import Doc
 
 import tqdm
 
@@ -19,6 +20,7 @@ spacy_nlp = spacy.load("xx_ent_wiki_sm")
 if "sentencizer" not in spacy_nlp.pipe_names:
     spacy_nlp.add_pipe("sentencizer")
 
+
 def doc_to_plain_text(input_file: str, source_lang: str, target_lang: str, tikal_folder: str,
                       original_xliff_file_path: str) -> str:
     """
@@ -120,6 +122,7 @@ def get_runs_from_paragraph(paragraph: str, paragraph_index: int) -> list[dict[s
 
     return runs
 
+
 def tokenize_text(text, tokenizer):
     # To avoid the tokenizer destroying the url
     def preserve_urls(text):
@@ -145,7 +148,8 @@ def tokenize_text(text, tokenizer):
 
     return tokens
 
+
+def tokenize_with_runs(runs: list[dict[str, str]]) -> tuple[list[list[dict[str, str]]], list[list[bool]]]:
     """
     Given a list of runs, we need to tokenize them by sentence and token while keeping the style of each token according
     to its original run
@@ -159,57 +163,80 @@ def tokenize_with_runs(runs: list[dict[str, str]], tokenizer, detokenizer) -> li
     """
 
     # it's a bit of a mess but first we get the tokenized sentences
-    tokenized_sentences = [tokenize_text(sentence.text, tokenizer) for sentence in sentences]
+    # join runs and send through spacy to split into clean tokens
+    doc_from_runs = spacy_nlp("".join([run["text"] for run in runs]).strip())
 
+    # extract sentences and tokenize each into words
+    tokenized_sentences = [[token.text.strip() for token in sent if token.text.strip()] for sent in doc_from_runs.sents]
+    tokenized_sentences_spaces = [[token.whitespace_ != '' for token in sent if token.text.strip()] for sent in
+                                  doc_from_runs.sents]
+
+    flat_tokens = [token for sentence in tokenized_sentences for token in sentence]
+    flat_spaces = [token for sentence in tokenized_sentences_spaces for token in sentence]
+
+    flat_tokens_with_style = []
+    flat_spaces_with_style = []
+    token_idx = 0
     for run in runs:
+        run["text"] = run["text"].strip()
+        while run["text"]:
+            if run["text"].startswith(flat_tokens[token_idx]):
+                run["text"] = run["text"][len(flat_tokens[token_idx]):]
+                if flat_spaces[token_idx]:
+                    run["text"] = run["text"].lstrip()
+                item = run.copy()
+                item["text"] = flat_tokens[token_idx]
+                flat_tokens_with_style.append(item)
+                flat_spaces_with_style.append(flat_spaces[token_idx])
+                token_idx += 1
+            elif flat_tokens[token_idx].startswith(run["text"]):
+                subtoken = flat_tokens[token_idx][:len(run["text"])]
+                item = run.copy()
+                item["text"] = subtoken
+                flat_tokens_with_style.append(item)
+                flat_spaces_with_style.append(False)
+                flat_tokens[token_idx] = flat_tokens[token_idx][len(run["text"]):]
+                run["text"] = run["text"][len(subtoken):]
+
+    # reconstruct the sentences
+    token_idx = 0
+    tokenized_sentences_with_style, tokenized_sentences_spaces_with_style = [], []
+    for sentence, sentence_spaces in zip(tokenized_sentences, tokenized_sentences_spaces):
+        sentence_with_style, sentence_spaces_with_style = [], []
+        for token in sentence:
+            if token == flat_tokens_with_style[token_idx]["text"]:
+                sentence_with_style.append(flat_tokens_with_style[token_idx])
+                sentence_spaces_with_style.append(flat_spaces_with_style[token_idx])
+                token_idx += 1
+            elif token.startswith(flat_tokens_with_style[token_idx]["text"]):
+                while token:
+                    token = token[len(flat_tokens_with_style[token_idx]["text"]):]
+                    sentence_with_style.append(flat_tokens_with_style[token_idx])
+                    sentence_spaces_with_style.append(flat_spaces_with_style[token_idx])
+                    token_idx += 1
             else:
-                sentence_with_style.append(tokens_with_style[token_index])
-                word_left = word_left.removeprefix(tokens_with_style[token_index]["text"])
-                token_index += 1
-            else:
-                raise "Something unexpected happened I'm afraid"
+                print(token)
+                print(sentence)
+                print(token_idx)
+                print(flat_tokens_with_style)
+                raise Exception(f"Something unexpected happened")
         tokenized_sentences_with_style.append(sentence_with_style)
+        tokenized_sentences_spaces_with_style.append(sentence_spaces_with_style)
+
+    return tokenized_sentences_with_style, tokenized_sentences_spaces_with_style
 
 
-def generate_alignments(
-                        source_tokenizer, source_detokenizer, target_tokenizer) -> list[list[dict[str, str]]]:
+def generate_alignments(original_tokenized_sentences_with_style: list[list[dict[str, str]]],
+                        translated_sentences: list[str], aligner, temp_folder: str):
     """
-    Given some original
+    Given some original sentences with style and formatting and its translation without formatting, try to match
     the translated text formatting with the original. Since we only want to run fastalign once we have to temporarily
     forget about paragraphs and work only in sentences, so the output is a list of sentences but with information about
    from which paragraph that sentence came from
 
     Parameters:
+        original_tokenized_sentences_with_style: Original text split into sentences with style information
+        translated_sentences: Translated text, split into sentences
         aligner: Object of the aligner class, uses fastalign
         temp_folder: Path to folder where to put all the intermediate files
         source_lang: original language of the document
@@ -223,21 +250,17 @@ def generate_alignments(original_paragraphs_with_runs: list[list[dict[str, str]]
     for f in glob.glob(os.path.join(temp_folder, "*align*")):
         os.remove(f)
 
-    # tokenize the original text by sentence and words while keeping the style
-    original_tokenized_sentences_with_style = [tokenize_with_runs(runs, source_tokenizer, source_detokenizer) for runs in
-                                               original_paragraphs_with_runs]
-
-    # flatten all the runs so we can align with just one call instead of one per paragraph
-    original_tokenized_sentences_with_style = [item for sublist in original_tokenized_sentences_with_style for item in
-                                               sublist]
-
     # tokenize the translated text by sentence and word
-    translated_tokenized_sentences = [
+    translated_tokenized_sentences = []
+    # keep spacing information to detokenize properly later
+    translated_tokenized_sentences_spaces = []
+    for sentence in translated_sentences:
+        tokens = spacy_nlp(sentence)
+        translated_tokenized_sentences_spaces.append([token.whitespace_ != '' for token in tokens])
+        translated_tokenized_sentences.append([token.text for token in tokens])
 
     assert len(translated_tokenized_sentences) == len(
-        original_tokenized_sentences_with_style), "The original and translated texts contain a different number of
+        original_tokenized_sentences_with_style), "The original and translated texts contain a different number of sentences, likely due to a translation error"
 
     original_sentences = []
     translated_sentences = []
@@ -272,27 +295,28 @@ def generate_alignments(original_paragraphs_with_runs: list[list[dict[str, str]]
 
         translated_sentences_with_style.append(translated_sentence_with_style)
 
-    return translated_sentences_with_style
+    return translated_sentences_with_style, translated_tokenized_sentences_spaces
 
 
-def group_by_style(tokens: list[dict[str, str]],
+def group_by_style(tokens: list[dict[str, str]], spaces: list[bool]) -> list[dict[str, str]]:
     """
     To avoid having issues in the future, we group the contiguous tokens that have the same style. Basically, we
     reconstruct the runs.
 
     Parameters:
         tokens: Tokens with style information
-        detokenizer: Detokenizer object to merge tokens back together
 
     Returns:
         list[dict]: A list of translated runs with format and style
     """
     groups = []
+    zipped = zip(tokens, spaces)
+    for key, group in groupby(zipped, key=lambda x: (x[0]["id"], x[0]["paragraph_index"])):
+        group = list(group)
+        tokens = [item[0]['text'] for item in group]
+        spaces = [item[1] for item in group]
 
-            text = " " + text
+        text = Doc(spacy_nlp.vocab, words=tokens, spaces=spaces).text
 
         groups.append({"text": text,
                        "id": key[0],
@@ -300,7 +324,7 @@ def group_by_style(tokens: list[dict[str, str]], detokenizer) -> list[dict[str,
     return groups
 
 
-def runs_to_plain_text(paragraphs_with_style: dict[
+def runs_to_plain_text(paragraphs_with_style: dict[int, list[dict[str, str, str]]], out_file_path: str):
     """
     Generate a plain text file restoring the original tag structure like <g id=1> </g>
 
@@ -310,59 +334,39 @@ def runs_to_plain_text(paragraphs_with_style: dict[str, list[dict[str, str, str]
     """
     with open(out_file_path, "w") as out_file:
 
+        def close_tags(ids):
+            tag = ""
+            for gid in ids:
+                tag_type, tag_id = gid.split("_")
+                tag += f'</{tag_type}>'
+            return tag
 
-        def open_tags(
+        def open_tags(ids):
             tag = ""
-            for gid in
+            for gid in ids:
                 tag_type, tag_id = gid.split("_")
                 tag += f'<{tag_type} id="{tag_id}">'
             return tag
 
         for key, paragraph in paragraphs_with_style.items():
-            output = []
             for run in paragraph:
                 ids = list(run["id"]) if run["id"] else []
 
-                    if a == b:
-                        common_prefix_len += 1
-                    else:
-                        break
-
-                # Close tags not in the new stack
-                to_close = current_stack[common_prefix_len:]
-                if to_close:
-                    output.append(close_tags(to_close))
-
-                # Open new tags
-                to_open = ids[common_prefix_len:]
-                if to_open:
-                    output.append(open_tags(to_open))
-
-                # Add text
-                output.append(run["text"])
-
-                # Update the stack
-                current_stack = ids
+                if ids:
+                    output = open_tags(ids) + run["text"] + close_tags(ids)
+                    out_file.write(output)
+                else:
+                    out_file.write("".join(run["text"]))
 
-            output.append(close_tags(current_stack))
-            out_file.write("
+            out_file.write("\n")
 
 
 def translate_document(input_file: str, source_lang: str, target_lang: str,
                        translator,
                        aligner: Aligner,
                        temp_folder: str = "tmp",
-                       tikal_folder: str = "okapi-apps_gtk2-linux-x86_64_1.47.0") -> str:
+                       tikal_folder: str = "okapi-apps_gtk2-linux-x86_64_1.47.0", with_format: bool = True) -> str:
     input_filename = input_file.split("/")[-1]
     # copy the original file to the temporal folder to avoid common issues with tikal
     temp_input_file = os.path.join(temp_folder, input_filename)
@@ -371,33 +375,53 @@ def translate_document(input_file: str, source_lang: str, target_lang: str,
     original_xliff_file = os.path.join(temp_folder, input_filename + ".xlf")
     plain_text_file = doc_to_plain_text(temp_input_file, source_lang, target_lang, tikal_folder, original_xliff_file)
 
-    source_tokenizer = MosesTokenizer(lang=source_lang)
-    source_detokenizer = MosesDetokenizer(lang=source_lang)
-    target_tokenizer = MosesTokenizer(lang=target_lang)
-    target_detokenizer = MosesDetokenizer(lang=target_lang)
-
     # get paragraphs with runs
     paragraphs_with_runs = [get_runs_from_paragraph(line.strip(), idx) for idx, line in
                             enumerate(open(plain_text_file).readlines())]
 
     # translate using plaintext file
+    original_tokenized_sentences_with_style = []
+    original_spacing = []
+    for run in paragraphs_with_runs:
+        tokens, spaces = tokenize_with_runs(run)
+        original_tokenized_sentences_with_style += tokens
+        original_spacing += spaces
+
+    translated_sentences = []
+    for sentence, spacing in tqdm.tqdm(zip(original_tokenized_sentences_with_style, original_spacing),
+                                       desc="Translating paragraphs...",
+                                       total=len(original_tokenized_sentences_with_style)):
+        text = Doc(spacy_nlp.vocab, words=[token["text"] for token in sentence], spaces=spacing).text
+
+        while True:
+            try:
+                translated_sentences.append(translator.translate(text, source_lang, target_lang))
+                break
+            except:
+                continue
 
     # time to align the translation with the original
     print("Generating alignments...")
     start_time = time.time()
-    translated_sentences_with_style = generate_alignments(
+    translated_sentences_with_style, translated_sentences_spacing = generate_alignments(
+        original_tokenized_sentences_with_style,
+        translated_sentences, aligner,
+        temp_folder)
     print(f"Finished alignments in {time.time() - start_time} seconds")
 
+    # since we tokenized these sentences independently, the spacing information does not contain spaces after punctuation
+    # at the end of the sentence (there's no space at the end of a sentence that ends with ".", unless there's a sentence
+    # right after
+    for sentence, sentence_spaces in zip(translated_sentences_with_style, translated_sentences_spacing):
+        if sentence[-1]["text"] in string.punctuation:
+            sentence_spaces[-1] = True
+
     # flatten the sentences into a list of tokens
     translated_tokens_with_style = [item for sublist in translated_sentences_with_style for item in sublist]
+    tokens_spaces = [item for sublist in translated_sentences_spacing for item in sublist]
 
     # group the tokens by style/run
-    translated_runs_with_style = group_by_style(translated_tokens_with_style,
+    translated_runs_with_style = group_by_style(translated_tokens_with_style, tokens_spaces)
 
     # group the runs by original paragraph
     translated_paragraphs_with_style = {key: [{'id': None, 'paragraph_index': key, 'text': ""}] for key in
@@ -424,6 +448,13 @@ def translate_document(input_file: str, source_lang: str, target_lang: str,
                                   "-noalttrans", "-to", original_xliff_file]
     Popen(tikal_moses_to_xliff_command).wait()
 
+    # any tags that are still <g> have not been paired between original and translated texts by tikal so we remove
+    # them. This may happen if a word in the original language has been split in more that one words that have other
+    # words in between, or an error in fastalign
+    text = open(original_xliff_file).read()
+    result = re.sub(r'<g id="\d+">(.*?)</g>', r'\1', text)
+    open(original_xliff_file, "w").write(result)
+
     # merge into a docx again
     tikal_merge_doc_command = [os.path.join(tikal_folder, "tikal.sh"), "-m", original_xliff_file]
     final_process = Popen(tikal_merge_doc_command, stdout=PIPE, stderr=PIPE)
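A side note on the detokenization trick used throughout this diff: spaCy keeps a per-token trailing-whitespace flag, so a sentence can be rebuilt exactly from its (words, spaces) pair. A minimal, self-contained sketch, assuming the same xx_ent_wiki_sm model is installed:

```python
import spacy
from spacy.tokens import Doc

nlp = spacy.load("xx_ent_wiki_sm")

original = "Hello, world (again)!"
doc = nlp(original)

words = [token.text for token in doc]
spaces = [token.whitespace_ != "" for token in doc]

# Doc accepts per-token space flags, so the surface text round-trips exactly
rebuilt = Doc(nlp.vocab, words=words, spaces=spaces).text
assert rebuilt == original
```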