""" | |
Smart Multilingual Translation Application | |
Built with Gradio and Hugging Face Transformers | |
Supports automatic language detection and translation to multiple languages | |
""" | |
import gradio as gr | |
import json | |
import os | |
from transformers import pipeline | |
from langdetect import detect, DetectorFactory | |
import warnings | |
warnings.filterwarnings("ignore") | |
# Set seed for consistent language detection results | |
DetectorFactory.seed = 0 | |
# Optional: Try to import fasttext for better language detection | |
try: | |
import fasttext | |
FASTTEXT_AVAILABLE = True | |
except ImportError: | |
FASTTEXT_AVAILABLE = False | |
print("FastText not available. Using langdetect only.") | |

class RobustLanguageDetector:
    """
    Robust language detection using multiple methods
    """
    def __init__(self):
        self.fasttext_model = None
        if FASTTEXT_AVAILABLE:
            self.load_fasttext_model()

    def load_fasttext_model(self):
        """
        Load the FastText language identification model (lid.176.ftz)
        Expects the model file in the working directory; if it is missing,
        detection falls back to langdetect only
        """
        model_path = "lid.176.ftz"
        if not os.path.exists(model_path):
            print("FastText model not found. Using langdetect only.")
            return
        try:
            self.fasttext_model = fasttext.load_model(model_path)
            print("FastText model loaded successfully")
        except Exception as e:
            print(f"Error loading FastText model: {e}")
            self.fasttext_model = None
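    # Note: lid.176.ftz is fastText's compressed language-identification model; it is not
    # bundled here and has to be fetched manually (the official copy is distributed at
    # https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz).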

    def detect_language(self, text):
        """
        Detect language using pattern matching, then FastText, then a langdetect fallback
        Args:
            text (str): Input text to analyze
        Returns:
            tuple: (language_code, language_name, detection_method)
        """
        text = text.strip()
        if not text:
            return "unknown", "Unknown", "empty"

        # Try pattern-based detection for common phrases first
        detected_lang = self.pattern_based_detection(text)
        if detected_lang:
            return detected_lang, detected_lang.upper(), "pattern_matching"

        # Try FastText next, if available
        if self.fasttext_model:
            try:
                ft_pred = self.fasttext_model.predict(text, k=3)  # Get top 3 predictions
                ft_langs = [lang.replace("__label__", "") for lang in ft_pred[0]]
                ft_confs = ft_pred[1]
                # If the top prediction is confident enough, use it
                if ft_confs[0] >= 0.8:
                    return ft_langs[0], ft_langs[0].upper(), "fasttext_high_conf"
                # If several similar Slavic languages are detected, use character context
                slavic_langs = [lang for lang in ft_langs if lang in ['ru', 'mk', 'bg', 'sr', 'uk']]
                if slavic_langs and self.is_cyrillic_russian(text):
                    return 'ru', 'RU', "fasttext_cyrillic_context"
                # Otherwise accept the top prediction at medium confidence
                if ft_confs[0] >= 0.6:
                    return ft_langs[0], ft_langs[0].upper(), "fasttext_medium_conf"
            except Exception as e:
                print(f"FastText detection error: {e}")

        # Fall back to langdetect with post-processing
        try:
            ld_lang = detect(text)
            # Post-process common misdetections
            if ld_lang == 'mk' and self.is_cyrillic_russian(text):
                return 'ru', 'RU', "langdetect_corrected"
            elif ld_lang == 'so' and self.is_likely_english(text):
                return 'en', 'EN', "langdetect_corrected"
            elif ld_lang in ['no', 'da', 'sv'] and self.is_likely_english(text):
                return 'en', 'EN', "langdetect_corrected"
            return ld_lang, ld_lang.upper(), "langdetect"
        except Exception as e:
            return "unknown", f"Detection Error: {str(e)}", "error"

    def pattern_based_detection(self, text):
        """
        Simple pattern-based language detection for common phrases
        """
        text_lower = text.lower()
        # Common English patterns
        english_patterns = [
            'hello', 'how are you', 'thank you', 'please', 'sorry', 'good', 'bad',
            'yes', 'no', 'today', 'tomorrow', 'yesterday', 'morning', 'evening',
            'welcome', 'goodbye', 'nice to meet you', 'see you later'
        ]
        # Common Russian patterns
        russian_patterns = [
            'привет', 'как дела', 'спасибо', 'пожалуйста', 'здравствуйте',
            'до свидания', 'добро пожаловать', 'извините', 'хорошо', 'сегодня'
        ]
        # Common Hebrew patterns
        hebrew_patterns = [
            'שלום', 'היי', 'תודה', 'בבקשה', 'סליחה', 'טוב', 'רע', 'כן', 'לא',
            'בוקר טוב', 'לילה טוב', 'מה שלומך', 'נעים להכיר'
        ]
        # Common Spanish patterns
        spanish_patterns = [
            'hola', 'como estas', 'como estás', 'gracias', 'por favor', 'perdon',
            'perdón', 'bueno', 'malo', 'buenos dias', 'buenas noches'
        ]
        # Common French patterns
        french_patterns = [
            'bonjour', 'comment allez-vous', 'comment ça va', 'merci',
            's\'il vous plaît', 'pardon', 'au revoir', 'bonne nuit'
        ]
        # Check English first (most common in examples)
        for pattern in english_patterns:
            if pattern in text_lower:
                return 'en'
        for pattern in russian_patterns:
            if pattern in text_lower:
                return 'ru'
        for pattern in hebrew_patterns:
            if pattern in text:  # Don't lowercase Hebrew
                return 'he'
        for pattern in spanish_patterns:
            if pattern in text_lower:
                return 'es'
        for pattern in french_patterns:
            if pattern in text_lower:
                return 'fr'
        return None

    def is_likely_english(self, text):
        """
        Check if text is likely English based on common English words
        """
        text_lower = text.lower()
        english_indicators = [
            'the', 'and', 'you', 'are', 'how', 'what', 'where', 'when', 'why',
            'hello', 'today', 'tomorrow', 'good', 'thank', 'please', 'welcome'
        ]
        # Check if text contains common English words
        word_count = 0
        english_word_count = 0
        for word in text_lower.split():
            word_count += 1
            if word in english_indicators:
                english_word_count += 1
        # If more than 30% are English words, likely English
        if word_count > 0:
            return (english_word_count / word_count) > 0.3
        return False

    def is_cyrillic_russian(self, text):
        """
        Check if Cyrillic text is likely Russian based on character patterns
        """
        # Russian-specific Cyrillic characters
        russian_chars = {'ы', 'э', 'ъ', 'ё'}
        # Macedonian-specific characters
        macedonian_chars = {'ј', 'љ', 'њ', 'ќ', 'ѓ', 'ѕ'}
        text_chars = set(text.lower())
        # If contains Russian-specific chars, likely Russian
        if any(char in text_chars for char in russian_chars):
            return True
        # If contains Macedonian-specific chars, likely not Russian
        if any(char in text_chars for char in macedonian_chars):
            return False
        # Default: if mostly Cyrillic and no Macedonian markers, assume Russian
        cyrillic_count = sum(1 for char in text if '\u0400' <= char <= '\u04FF')
        return cyrillic_count > len(text) * 0.7
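    # Illustrative example (hypothetical input): "Ты сегодня дома?" contains the Russian-only
    # letter 'ы', so is_cyrillic_russian() returns True even when the statistical detectors
    # waver between Russian and Macedonian.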


class SmartTranslator:
    """
    Smart multilingual translator with robust language detection
    """
    def __init__(self):
        self.language_detector = RobustLanguageDetector()
        self.translators = {}
        self.supported_languages = {
            'en': 'English',
            'he': 'Hebrew',
            'ar': 'Arabic',
            'es': 'Spanish',
            'fr': 'French',
            'de': 'German',
            'it': 'Italian',
            'pt': 'Portuguese',
            'ru': 'Russian',
            'zh': 'Chinese',
            'ja': 'Japanese',
            'ko': 'Korean',
            'fi': 'Finnish',
            'sv': 'Swedish',
            'no': 'Norwegian',
            'da': 'Danish',
            'nl': 'Dutch'
        }
        self.load_language_models()

    def load_language_models(self):
        """
        Load translation models from the Hugging Face Hub
        Uses Helsinki-NLP OPUS-MT models for high-quality translation
        Models that fail to load are marked unavailable and skipped at translation time
        """
        print("Loading translation models...")
        # List of models to try loading
        model_configs = [
            ('to_en', 'Helsinki-NLP/opus-mt-mul-en'),
            ('to_he', 'Helsinki-NLP/opus-mt-en-he'),
            ('to_es', 'Helsinki-NLP/opus-mt-en-es'),
            ('to_fr', 'Helsinki-NLP/opus-mt-en-fr'),
            ('to_ru', 'Helsinki-NLP/opus-mt-en-ru')
        ]
        # Try to load each model individually
        for key, model_name in model_configs:
            try:
                print(f"Loading {model_name}...")
                self.translators[key] = pipeline("translation", model=model_name)
                print(f"Successfully loaded {key}")
            except Exception as e:
                print(f"Failed to load {key}: {e}")
                self.translators[key] = None
        # Check if at least one model loaded
        loaded_models = [k for k, v in self.translators.items() if v is not None]
        print(f"Successfully loaded {len(loaded_models)} out of {len(model_configs)} models: {loaded_models}")

    def detect_language(self, text):
        """
        Detect the language of input text using robust detection
        Args:
            text (str): Input text to analyze
        Returns:
            tuple: (language_code, language_name, detection_method)
        """
        detected_lang, lang_display, method = self.language_detector.detect_language(text)
        language_name = self.supported_languages.get(detected_lang, lang_display)
        return detected_lang, language_name, method

    def translate_text(self, text, source_lang, target_lang):
        """
        Translate text from source language to target language
        Uses two-step translation (via English) when neither source nor target is English
        Args:
            text (str): Text to translate
            source_lang (str): Source language code
            target_lang (str): Target language code
        Returns:
            str: Translated text or error message
        """
        try:
            if not text.strip():
                return "No text to translate"
            # If source is the same as target, return the original text
            if source_lang == target_lang:
                return text
            # Handle non-English to non-English translation via English
            if source_lang != 'en' and target_lang != 'en':
                # Two-step translation: source -> English -> target
                if self.translators.get('to_en'):
                    try:
                        english_text = self.translators['to_en'](text)[0]['translation_text']
                    except Exception as e:
                        return f"Step 1 translation failed: {str(e)}"
                else:
                    english_text = text  # Fallback to original text
                # Then translate English to the target language
                translator_key = f'to_{target_lang}'
                if self.translators.get(translator_key):
                    try:
                        result = self.translators[translator_key](english_text)[0]['translation_text']
                        return result
                    except Exception as e:
                        return f"Step 2 translation failed: {str(e)}"
                else:
                    return f"Translation to {target_lang} not available"
            # Direct translation from non-English to English
            elif source_lang != 'en':
                if self.translators.get('to_en'):
                    try:
                        return self.translators['to_en'](text)[0]['translation_text']
                    except Exception as e:
                        return f"Translation to English failed: {str(e)}"
                else:
                    return "Translation to English not available"
            # Direct translation from English to the target language
            else:
                translator_key = f'to_{target_lang}'
                if self.translators.get(translator_key):
                    try:
                        return self.translators[translator_key](text)[0]['translation_text']
                    except Exception as e:
                        return f"Translation to {target_lang} failed: {str(e)}"
                else:
                    return f"Translation to {target_lang} not available"
        except Exception as e:
            return f"Translation error: {str(e)}"

    def process_text(self, input_text):
        """
        Main processing function that handles language detection and translation
        Args:
            input_text (str): User input text
        Returns:
            tuple: All outputs for the Gradio interface
        """
        if not input_text.strip():
            return (
                "Please enter some text to translate.",  # detection_output
                "",  # translation1
                "",  # translation2
                "",  # translation3
                "",  # translation4
                "",  # translation5
                "",  # status_output
            )
        try:
            # Detect the language of the input text with robust detection
            detected_lang, language_name, detection_method = self.detect_language(input_text)
            # Define target languages for translation (English, Hebrew, Spanish, Russian, French)
            target_languages = ['en', 'he', 'es', 'ru', 'fr']
            # Generate translations for each target language
            translations = []
            for target_lang in target_languages:
                if detected_lang != target_lang:
                    translation = self.translate_text(input_text, detected_lang, target_lang)
                    lang_name = self.supported_languages.get(target_lang, target_lang.upper())
                    translations.append(f"**{lang_name}:** {translation}")
                else:
                    lang_name = self.supported_languages.get(target_lang, target_lang.upper())
                    translations.append(f"**{lang_name}:** (Original text)")
            # Prepare formatted output with detection method info
            detection_result = f"**Detected Language:** {language_name} ({detected_lang}) - *Method: {detection_method}*"
            # Ensure we have exactly 5 translations
            while len(translations) < 5:
                translations.append("")
            return (
                detection_result,  # detection_output
                translations[0],   # translation1 (English)
                translations[1],   # translation2 (Hebrew)
                translations[2],   # translation3 (Spanish)
                translations[3],   # translation4 (Russian)
                translations[4],   # translation5 (French)
                f"**Translation Complete!** Processed text in {language_name} using {detection_method}"  # status_output
            )
        except Exception as e:
            error_msg = f"Processing error: {str(e)}"
            return (
                error_msg,  # detection_output
                "",  # translation1
                "",  # translation2
                "",  # translation3
                "",  # translation4
                "",  # translation5
                error_msg  # status_output
            )
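    # Note: the 7-tuple returned above must stay aligned with the `outputs` lists wired up in
    # create_interface() below (detection result, five translations, status message).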


# Initialize the translator instance
print("Initializing Smart Translator...")
translator = SmartTranslator()


def create_interface():
    """
    Create the Gradio interface for the Smart Multilingual Translator
    Returns:
        gr.Blocks: Configured Gradio application interface
    """
    with gr.Blocks(title="Smart Multilingual Translator", theme=gr.themes.Soft()) as app:
        gr.Markdown("""
        # Smart Multilingual Translator
        ### Powered by Hugging Face Transformers + Robust Language Detection
        Enter text in any language and get automatic translations to English, Hebrew, Spanish, Russian, and French!

        **Features:**
        - Smart language detection with pattern matching for common phrases
        - Multi-layer detection: Pattern matching → FastText → langdetect with corrections
        - High-quality translation with Helsinki-NLP models
        - Support for 15+ input languages with Slavic language disambiguation
        - Translation to 5 target languages: English, Hebrew, Spanish, Russian, French
        """)
        with gr.Row():
            with gr.Column(scale=2):
                input_text = gr.Textbox(
                    label="Enter text to translate",
                    placeholder="Type or paste text in any language...",
                    lines=4,
                    max_lines=10
                )
                translate_btn = gr.Button("Translate", variant="primary", size="lg")
                clear_btn = gr.Button("Clear", variant="secondary")
            with gr.Column(scale=3):
                with gr.Group():
                    detection_output = gr.Markdown(label="Language Detection")
                    translation1 = gr.Markdown(label="English Translation")
                    translation2 = gr.Markdown(label="Hebrew Translation")
                    translation3 = gr.Markdown(label="Spanish Translation")
                    translation4 = gr.Markdown(label="Russian Translation")
                    translation5 = gr.Markdown(label="French Translation")
                    status_output = gr.Markdown(label="Status")
        # Example inputs for testing different languages
        gr.Markdown("### Try these examples:")
        gr.Examples(
            examples=[
                ["Hello, how are you today?"],
                ["שלום, איך אתה היום?"],
                ["Hola, ¿cómo estás hoy?"],
                ["Bonjour, comment allez-vous?"],
                ["Guten Tag, wie geht es Ihnen?"],
                ["Привет, как дела?"],
                ["こんにちは、元気ですか？"],
                ["مرحبا، كيف حالك اليوم؟"],
                ["Ciao, come stai?"],
                ["Hej, hur mår du?"]
            ],
            inputs=input_text
        )
        # Event handlers for user interactions
        translate_btn.click(
            fn=translator.process_text,
            inputs=[input_text],
            outputs=[detection_output, translation1, translation2, translation3, translation4, translation5, status_output]
        )
        clear_btn.click(
            fn=lambda: ("", "", "", "", "", "", ""),
            outputs=[input_text, detection_output, translation1, translation2, translation3, translation4, translation5]
        )
        # Auto-translate when the user presses Enter
        input_text.submit(
            fn=translator.process_text,
            inputs=[input_text],
            outputs=[detection_output, translation1, translation2, translation3, translation4, translation5, status_output]
        )
        gr.Markdown("""
        ---
        **Technical Details:**
        - Language Detection: 3-layer system (Pattern matching → FastText → langdetect + corrections)
        - Slavic Language Disambiguation: Special handling for Russian/Macedonian/Bulgarian confusion
        - Translation Models: Helsinki-NLP OPUS-MT series
        - Target Languages: English, Hebrew, Spanish, Russian, French
        - Supported Input Languages: 15+ major world languages
        - Two-step translation (via English) when neither the source nor the target is English

        **Note:** Translation quality may vary depending on the source and target languages.
        Models load individually, so some translations may be unavailable if models fail to load.
        """)
    return app


if __name__ == "__main__":
    print("Creating Gradio interface...")
    # Create and launch the Gradio application
    app = create_interface()
    print("Launching application...")
    app.launch(share=True)  # share=True creates a public link
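
# Note (assumed usage): run locally with `python app.py` once the dependencies are installed;
# Gradio serves on http://127.0.0.1:7860 by default, and share=True additionally requests a
# temporary public gradio.live link (on Hugging Face Spaces the hosted Space URL is used instead).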