mjuvilla committed on
Commit 41ceab1 · 1 Parent(s): 4420a7f

changed nltk tokenizer to multilingual tokenizers

Files changed (1)
  1. src/translate_any_doc.py +55 -19
src/translate_any_doc.py CHANGED
@@ -7,14 +7,17 @@ import re
 
 from src.aligner import Aligner
 
- import nltk
 import glob
- from nltk.tokenize import sent_tokenize, word_tokenize
- import tqdm
+ from sacremoses import MosesTokenizer, MosesDetokenizer
+ import spacy
 
- nltk.download('punkt')
- nltk.download('punkt_tab')
+ import tqdm
 
+ # Load multilingual model to use as sentence tokenizer
+ spacy_nlp = spacy.load("xx_ent_wiki_sm")
+ # Add the rule-based sentencizer
+ if "sentencizer" not in spacy_nlp.pipe_names:
+     spacy_nlp.add_pipe("sentencizer")
 
 def doc_to_plain_text(input_file: str, source_lang: str, target_lang: str, tikal_folder: str,
                       original_xliff_file_path: str) -> str:
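Note on the new sentence splitter: xx_ent_wiki_sm is a multilingual NER-only model without a dependency parser, so the rule-based sentencizer added above is what produces sentence boundaries. A minimal sketch of the behaviour (assumes the model is installed, e.g. via python -m spacy download xx_ent_wiki_sm; the sample text is invented):

import spacy

spacy_nlp = spacy.load("xx_ent_wiki_sm")
if "sentencizer" not in spacy_nlp.pipe_names:
    spacy_nlp.add_pipe("sentencizer")   # punctuation-based sentence boundaries

doc = spacy_nlp("Hola, ¿qué tal? This paragraph mixes languages. C'est tout.")
print([sent.text for sent in doc.sents])
# roughly: ['Hola, ¿qué tal?', 'This paragraph mixes languages.', "C'est tout."]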
@@ -117,26 +120,53 @@ def get_runs_from_paragraph(paragraph: str, paragraph_index: int) -> list[dict[s
 
     return runs
 
- def tokenize_with_runs(runs: list[dict[str, str]], detokenizer) -> list[list[dict[str, str]]]:
+ def tokenize_text(text, tokenizer):
+     # To avoid the tokenizer destroying the url
+     def preserve_urls(text):
+         url_pattern = r'https?://[^\s\)\]\}\>]+|www\.[^\s\)\]\}\>]+'
+         # Find URLs using regex and replace them with a placeholder
+         urls = re.findall(url_pattern, text)
+         for idx, url in enumerate(urls):
+             placeholder = f"URL{idx}"
+             text = text.replace(url, placeholder)
+
+         return text, urls
+
+     # Replace URLs with placeholders
+     text, urls = preserve_urls(text)
+
+     # Tokenize using Sacremoses
+     tokens = tokenizer.tokenize(text)
+
+     # Revert placeholders back to original URLs
+     for idx, url in enumerate(urls):
+         placeholder = f"URL{idx}"
+         tokens = [token.replace(placeholder, url) for token in tokens]
+
+     return tokens
+
+ def tokenize_with_runs(runs: list[dict[str, str]], tokenizer, detokenizer) -> list[list[dict[str, str]]]:
     """
     Given a list of runs, we need to tokenize them by sentence and token while keeping the style of each token according
     to its original run
 
     Parameters:
         runs: List of runs, where each item is a chunk of text (possibly various tokens) and some style/formatting information
-         detokenizer: Detokenizer object to merge tokens back together
+         source_lang: Language of the document
 
     Returns:
         list[list[dict]]: A list of tokenized sentences where each token contains the style of its original run
     """
+
+     # it's a bit of a mess but first we get the tokenized sentences
     text_paragraph = detokenizer.detokenize([run["text"] for run in runs])
-     sentences = sent_tokenize(text_paragraph)
-     tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]
+     sentences = spacy_nlp(text_paragraph).sents
+     tokenized_sentences = [tokenize_text(sentence.text, tokenizer) for sentence in sentences]
 
+     # then we assign a run/style to each token
     tokens_with_style = []
     for run in runs:
-         tokens = word_tokenize(run["text"])
+         tokens = tokenize_text(run["text"], tokenizer)
         if tokens:
             for token in tokens:
                 tokens_with_style.append(run.copy())
@@ -144,6 +174,7 @@ def tokenize_with_runs(runs: list[dict[str, str]], detokenizer) -> list[list[dic
         else:
             tokens_with_style.append(run.copy())
 
+     # and finally we combine both things, where each token of each sentence is assigned a run/style
     token_index = 0
     tokenized_sentences_with_style = []
     for sentence in tokenized_sentences:
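For readers following along: the lines after this hunk are unchanged and therefore not shown; they consume tokens_with_style in order so that every token of every sentence gets its run. A rough sketch of that matching step (not the committed code, just an illustration of the idea):

# Sketch: pair each sentence token with the next unconsumed entry of
# tokens_with_style, which preserves the original run order.
token_index = 0
tokenized_sentences_with_style = []
for sentence in tokenized_sentences:
    sentence_with_style = []
    for token in sentence:
        token_with_style = tokens_with_style[token_index].copy()
        token_with_style["text"] = token
        sentence_with_style.append(token_with_style)
        token_index += 1
    tokenized_sentences_with_style.append(sentence_with_style)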
@@ -169,7 +200,7 @@ def tokenize_with_runs(runs: list[dict[str, str]], detokenizer) -> list[list[dic
 
 def generate_alignments(original_paragraphs_with_runs: list[list[dict[str, str]]],
                         translated_paragraphs: list[str], aligner, temp_folder: str,
-                         detokenizer) -> list[list[dict[str, str]]]:
+                         source_tokenizer, source_detokenizer, target_tokenizer) -> list[list[dict[str, str]]]:
     """
     Given some original paragraphs with style and formatting and its translation without formatting, try to match
     the translated text formatting with the original. Since we only want to run fastalign once we have to temporarily
@@ -181,7 +212,8 @@ def generate_alignments(original_paragraphs_with_runs: list[list[dict[str, str]]
         translated_paragraphs: Translated text, split into paragraphs
         aligner: Object of the aligner class, uses fastalign
         temp_folder: Path to folder where to put all the intermediate files
-         detokenizer: Detokenizer object to merge tokens back together
+         source_lang: original language of the document
+         target_lang: target language of the translation
 
     Returns:
         list[list[dict]]: A list of tokenized sentences where each translated token contains the style of the associated
@@ -192,7 +224,7 @@ def generate_alignments(original_paragraphs_with_runs: list[list[dict[str, str]]
         os.remove(f)
 
     # tokenize the original text by sentence and words while keeping the style
-     original_tokenized_sentences_with_style = [tokenize_with_runs(runs, detokenizer) for runs in
+     original_tokenized_sentences_with_style = [tokenize_with_runs(runs, source_tokenizer, source_detokenizer) for runs in
                                                original_paragraphs_with_runs]
 
     # flatten all the runs so we can align with just one call instead of one per paragraph
@@ -200,9 +232,9 @@ def generate_alignments(original_paragraphs_with_runs: list[list[dict[str, str]]
                                               sublist]
 
     # tokenize the translated text by sentence and word
-     translated_tokenized_sentences = [word_tokenize(sentence) for
+     translated_tokenized_sentences = [tokenize_text(sentence.text, target_tokenizer) for
                                       translated_paragraph in translated_paragraphs for sentence in
-                                       sent_tokenize(translated_paragraph)]
+                                       spacy_nlp(translated_paragraph).sents]
 
     assert len(translated_tokenized_sentences) == len(
         original_tokenized_sentences_with_style), "The original and translated texts contain a different number of sentence, likely due to a translation error"
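A small, made-up illustration of the flattening above and of the invariant the assert relies on (both sides must yield the same total number of sentences):

translated_paragraphs = ["Primera frase. Segunda frase.", "Tercera frase."]   # invented sample
translated_tokenized_sentences = [tokenize_text(sentence.text, target_tokenizer) for
                                  translated_paragraph in translated_paragraphs for sentence in
                                  spacy_nlp(translated_paragraph).sents]
# -> 3 token lists, e.g. [['Primera', 'frase', '.'], ['Segunda', 'frase', '.'], ['Tercera', 'frase', '.']]
# The assert then requires the source document to have produced exactly 3 sentences as well.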
@@ -329,7 +361,6 @@ def runs_to_plain_text(paragraphs_with_style: dict[str, list[dict[str, str, str]
 def translate_document(input_file: str, source_lang: str, target_lang: str,
                        translator,
                        aligner: Aligner,
-                        detokenizer,
                        temp_folder: str = "tmp",
                        tikal_folder: str = "okapi-apps_gtk2-linux-x86_64_1.47.0") -> str:
     input_filename = input_file.split("/")[-1]
@@ -340,6 +371,11 @@ def translate_document(input_file: str, source_lang: str, target_lang: str,
     original_xliff_file = os.path.join(temp_folder, input_filename + ".xlf")
     plain_text_file = doc_to_plain_text(temp_input_file, source_lang, target_lang, tikal_folder, original_xliff_file)
 
+     source_tokenizer = MosesTokenizer(lang=source_lang)
+     source_detokenizer = MosesDetokenizer(lang=source_lang)
+     target_tokenizer = MosesTokenizer(lang=target_lang)
+     target_detokenizer = MosesDetokenizer(lang=target_lang)
+
     # get paragraphs with runs
     paragraphs_with_runs = [get_runs_from_paragraph(line.strip(), idx) for idx, line in
                             enumerate(open(plain_text_file).readlines())]
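For reference, a minimal sketch of how these sacremoses objects pair up (hypothetical language code and text):

from sacremoses import MosesTokenizer, MosesDetokenizer

tokenizer = MosesTokenizer(lang="en")
detokenizer = MosesDetokenizer(lang="en")

tokens = tokenizer.tokenize("Styled runs don't survive translation, alignments do.")
text = detokenizer.detokenize(tokens)
# detokenize(tokenize(text)) should give back approximately the original string,
# which is why the matching language's detokenizer is used to rebuild paragraphs below.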
@@ -347,21 +383,21 @@ def translate_document(input_file: str, source_lang: str, target_lang: str,
     # translate using plaintext file
     translated_paragraphs = []
     for paragraph in tqdm.tqdm(paragraphs_with_runs, desc="Translating paragraphs..."):
-         paragraph_text = detokenizer.detokenize([run["text"] for run in paragraph])
+         paragraph_text = source_detokenizer.detokenize([run["text"] for run in paragraph])
         translated_paragraphs.append(translator.translate(paragraph_text, source_lang, target_lang))
 
     # time to align the translation with the original
     print("Generating alignments...")
     start_time = time.time()
     translated_sentences_with_style = generate_alignments(paragraphs_with_runs, translated_paragraphs, aligner,
-                                                           temp_folder, detokenizer)
+                                                           temp_folder, source_tokenizer, source_detokenizer, target_tokenizer)
     print(f"Finished alignments in {time.time() - start_time} seconds")
 
     # flatten the sentences into a list of tokens
     translated_tokens_with_style = [item for sublist in translated_sentences_with_style for item in sublist]
 
     # group the tokens by style/run
-     translated_runs_with_style = group_by_style(translated_tokens_with_style, detokenizer)
+     translated_runs_with_style = group_by_style(translated_tokens_with_style, target_detokenizer)
 
     # group the runs by original paragraph
     translated_paragraphs_with_style = {key: [{'id': None, 'paragraph_index': key, 'text': ""}] for key in
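One reason the commit threads a target-language detokenizer into group_by_style instead of reusing a single detokenizer: detokenization rules are language-specific. A small self-contained illustration with an invented French sample (exact output may vary by sacremoses version):

from sacremoses import MosesDetokenizer

tokens = ["c'", "est", "bon", "."]   # hypothetical target-language tokens
print(MosesDetokenizer(lang="fr").detokenize(tokens))   # French rules should rejoin the apostrophe: "c'est bon."
print(MosesDetokenizer(lang="en").detokenize(tokens))   # English rules may leave a stray space after "c'"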
 