import re

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM


def get_max_word_length(target_languages: list[str]) -> int:
    """Return the most restrictive per-chunk word limit across the target languages."""
    # Per-language word limits for the Helsinki-NLP opus-mt models.
    helsinki_word_limits = {
        "el": 50,
        "et": 50,
        "fi": 50,
        "fr": 40,
        "sv": 140,
        "hu": 50,
        "lt": 50,
        "sk": 140,
        "bg": 50,
        "cs": 140,
        "da": 140,
        "de": 150,
    }
    max_word_length = 700  # Default for non-Helsinki languages
    # Keep the smallest limit among the requested languages, since one
    # chunking pass has to be safe for every target model.
    for lang in target_languages:
        if lang in helsinki_word_limits:
            if helsinki_word_limits[lang] < max_word_length:
                max_word_length = helsinki_word_limits[lang]
    return max_word_length
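As a quick sanity check (hypothetical calls, assuming only the limits table above):

print(get_max_word_length(["fr", "sv"]))  # -> 40; the French limit is the binding one
print(get_max_word_length(["ja"]))        # -> 700; no Helsinki entry, default applies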
def chunk_text(text: str, safe_word_limit: int) -> list[str]:
    """Split text into chunks of at most safe_word_limit words, preferring sentence boundaries."""
    sentences = re.split(r'(?<=[.!?])\s+', text.strip())
    chunks = []
    current_chunk = []
    current_word_count = 0
    for sentence in sentences:
        sentence = sentence.strip()
        if not sentence:
            continue
        word_count = len(sentence.split())
        # If the sentence alone exceeds the safe word limit, flush the
        # current chunk and split the sentence into fixed-size word windows.
        if word_count > safe_word_limit:
            if current_chunk:
                chunks.append(' '.join(current_chunk))
                current_chunk = []
                current_word_count = 0
            words = sentence.split()
            for i in range(0, len(words), safe_word_limit):
                chunks.append(' '.join(words[i:i + safe_word_limit]))
            continue
        # Otherwise, see if it fits in the current chunk
        if current_word_count + word_count <= safe_word_limit:
            current_chunk.append(sentence)
            current_word_count += word_count
        else:
            # Start a new chunk
            chunks.append(' '.join(current_chunk))
            current_chunk = [sentence]
            current_word_count = word_count
    # Flush whatever is left in the buffer.
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    return chunks
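With both helpers in place, a translation loop might look like the sketch below. This is a minimal illustration, not the full pipeline: the Helsinki-NLP/opus-mt-en-fr checkpoint is a real model, but the sample text and the space-joined output are assumptions made here for the example.

# Minimal sketch: chunk under the French limit, translate chunk by chunk,
# then rejoin. Sample text and space-joining are illustrative assumptions.
model_name = "Helsinki-NLP/opus-mt-en-fr"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

text = "Long English text goes here. It may contain many sentences."
safe_limit = get_max_word_length(["fr"])  # 40 words per chunk for French

translated_chunks = []
for chunk in chunk_text(text, safe_limit):
    inputs = tokenizer(chunk, return_tensors="pt", truncation=True)
    output_ids = model.generate(**inputs)
    translated_chunks.append(tokenizer.decode(output_ids[0], skip_special_tokens=True))

translation = " ".join(translated_chunks)

Chunking by word count is deliberately conservative: the models actually limit tokens, not words, so staying well under the token budget avoids silent truncation at the cost of a few extra generate calls.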