Moore-Language-Space-ZeroGPU

Sleeping

App Files Files Community

ArissBandoss committed on May 19

Commit

54108c5

verified ·

1 Parent(s): 7b6884e

Update goai_helpers/goai_traduction.py

Browse files

Files changed (1) hide show

goai_helpers/goai_traduction.py +56 -19

goai_helpers/goai_traduction.py CHANGED Viewed

@@ -1,18 +1,59 @@
 import torch
 import spaces
-from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig
-from peft import PeftModel, PeftConfig
-import os
-import unicodedata
 from huggingface_hub import login
 max_length = 512
 auth_token = os.getenv('HF_SPACE_TOKEN')
 login(token=auth_token)
 @spaces.GPU
-def goai_traduction(text, src_lang, tgt_lang):
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     if src_lang == "mos_Latn" and tgt_lang == "fra_Latn":
@@ -27,25 +68,21 @@ def goai_traduction(text, src_lang, tgt_lang):
     tokenizer.src_lang = src_lang
     # Tokenisation
-    inputs = tokenizer(text, return_tensors="pt", truncation=False).to(device)
-    input_length = inputs["input_ids"].shape[1]
     # ID du token de langue cible
     tgt_lang_id = tokenizer.convert_tokens_to_ids(tgt_lang)
-    # ID du token EOS
-    eos_token_id = tokenizer.eos_token_id
-    # Bloquer complètement le token EOS jusqu'à un certain point
     outputs = model.generate(
-         **inputs,
-         forced_bos_token_id=tgt_lang_id,
-         max_new_tokens=1024,
-         early_stopping=False,
-         num_beams=5,
-         no_repeat_ngram_size=0,
-         length_penalty=1.0
     )
     # Décodage

 import torch
 import spaces
+import re
+from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
 from huggingface_hub import login
+import os
 max_length = 512
 auth_token = os.getenv('HF_SPACE_TOKEN')
 login(token=auth_token)
def split_text_intelligently(text, max_chunk_length=100):
    """Split *text* into chunks made of whole sentences.

    Sentences end at ``.``, ``!`` or ``?``; the punctuation mark stays
    attached to the sentence it closes. Consecutive sentences are packed
    into the same chunk while the chunk stays within *max_chunk_length*
    characters. A sentence is never cut, so a single sentence longer than
    the limit yields a chunk that exceeds it.

    Args:
        text: The text to split.
        max_chunk_length: Soft upper bound, in characters, for one chunk.

    Returns:
        A list of non-empty chunks with surrounding whitespace stripped.
        Empty or whitespace-only input yields an empty list.
    """
    # The capturing group makes re.split keep the punctuation, giving an
    # odd-length list: [body, mark, body, mark, ..., trailing_body].
    pieces = re.split(r'([.!?])', text)
    bodies = pieces[0::2]
    # The trailing body has no closing mark; pad so the pairs line up.
    marks = pieces[1::2] + [""]

    chunks = []

    def _flush(chunk):
        # Whitespace-only leftovers (e.g. the space after a final period)
        # must not become chunks: they would be sent to the translator.
        cleaned = chunk.strip()
        if cleaned:
            chunks.append(cleaned)

    current_chunk = ""
    for body, mark in zip(bodies, marks):
        sentence = body + mark
        # Start a new chunk when this sentence would overflow the current
        # one; a chunk is never flushed while it is still empty.
        if current_chunk and len(current_chunk) + len(sentence) > max_chunk_length:
            _flush(current_chunk)
            current_chunk = sentence
        else:
            current_chunk += sentence

    # Emit whatever text remains after the last sentence boundary.
    _flush(current_chunk)
    return chunks
 @spaces.GPU
def goai_traduction(text, src_lang, tgt_lang, max_chunk_length=100):
    """Translate *text* from *src_lang* to *tgt_lang*, chunking long input.

    Text longer than *max_chunk_length* characters is split on sentence
    boundaries by ``split_text_intelligently``; each chunk is translated
    separately by ``translate_chunk`` and the results are joined with a
    single space. Shorter text is translated in one call.

    Args:
        text: The text to translate.
        src_lang: Source language code passed through to
            ``translate_chunk`` (e.g. ``"mos_Latn"``).
        tgt_lang: Target language code passed through to
            ``translate_chunk`` (e.g. ``"fra_Latn"``).
        max_chunk_length: Character length above which the text is split.

    Returns:
        The translated text, or an empty string when *text* is empty,
        ``None`` or whitespace-only.
    """
    # Nothing to translate: do not run the model on blank input.
    if not text or not text.strip():
        return ""

    if len(text) <= max_chunk_length:
        return translate_chunk(text, src_lang, tgt_lang)

    chunks = split_text_intelligently(text, max_chunk_length)
    # Skip empty chunks so the model is never asked to translate nothing.
    return " ".join(
        translate_chunk(chunk, src_lang, tgt_lang) for chunk in chunks if chunk
    )
+def translate_chunk(text, src_lang, tgt_lang):
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     if src_lang == "mos_Latn" and tgt_lang == "fra_Latn":
     tokenizer.src_lang = src_lang
     # Tokenisation
+    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(device)
     # ID du token de langue cible
     tgt_lang_id = tokenizer.convert_tokens_to_ids(tgt_lang)
+    # Paramètres de génération optimisés pour éviter les répétitions
     outputs = model.generate(
+        **inputs,
+        forced_bos_token_id=tgt_lang_id,
+        max_new_tokens=512,
+        num_beams=5,
+        no_repeat_ngram_size=4,
+        repetition_penalty=2.0,
+        length_penalty=1.0,
+        early_stopping=True
     )
     # Décodage