Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -98,6 +98,10 @@ class RobustLanguageDetector:
|
|
98 |
# Post-process common misdetections
|
99 |
if ld_lang == 'mk' and self.is_cyrillic_russian(text):
|
100 |
return 'ru', 'RU', "langdetect_corrected"
|
|
|
|
|
|
|
|
|
101 |
|
102 |
return ld_lang, ld_lang.upper(), "langdetect"
|
103 |
except Exception as e:
|
@@ -109,27 +113,42 @@ class RobustLanguageDetector:
|
|
109 |
"""
|
110 |
text_lower = text.lower()
|
111 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
112 |
# Common Russian patterns
|
113 |
russian_patterns = [
|
114 |
'привет', 'как дела', 'спасибо', 'пожалуйста', 'здравствуйте',
|
115 |
-
'до свидания', 'добро пожаловать', 'извините', 'хорошо'
|
116 |
]
|
117 |
|
118 |
# Common Hebrew patterns
|
119 |
hebrew_patterns = [
|
120 |
-
'שלום', 'איך', 'תודה', 'בבקשה', 'סליחה', 'טוב', 'רע', 'כן', 'לא'
|
|
|
121 |
]
|
122 |
|
123 |
# Common Spanish patterns
|
124 |
spanish_patterns = [
|
125 |
-
'hola', 'como estas', 'gracias', 'por favor', 'perdon',
|
|
|
126 |
]
|
127 |
|
128 |
# Common French patterns
|
129 |
french_patterns = [
|
130 |
-
'bonjour', 'comment allez-vous', '
|
|
|
131 |
]
|
132 |
|
|
|
|
|
|
|
|
|
|
|
133 |
for pattern in russian_patterns:
|
134 |
if pattern in text_lower:
|
135 |
return 'ru'
|
@@ -148,6 +167,31 @@ class RobustLanguageDetector:
|
|
148 |
|
149 |
return None
|
150 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
151 |
def is_cyrillic_russian(self, text):
|
152 |
"""
|
153 |
Check if Cyrillic text is likely Russian based on character patterns
|
|
|
98 |
# Post-process common misdetections
|
99 |
if ld_lang == 'mk' and self.is_cyrillic_russian(text):
|
100 |
return 'ru', 'RU', "langdetect_corrected"
|
101 |
+
elif ld_lang == 'so' and self.is_likely_english(text):
|
102 |
+
return 'en', 'EN', "langdetect_corrected"
|
103 |
+
elif ld_lang in ['no', 'da', 'sv'] and self.is_likely_english(text):
|
104 |
+
return 'en', 'EN', "langdetect_corrected"
|
105 |
|
106 |
return ld_lang, ld_lang.upper(), "langdetect"
|
107 |
except Exception as e:
|
|
|
113 |
"""
|
114 |
text_lower = text.lower()
|
115 |
|
116 |
+
# Common English patterns
|
117 |
+
english_patterns = [
|
118 |
+
'hello', 'how are you', 'thank you', 'please', 'sorry', 'good', 'bad',
|
119 |
+
'yes', 'no', 'today', 'tomorrow', 'yesterday', 'morning', 'evening',
|
120 |
+
'welcome', 'goodbye', 'nice to meet you', 'see you later'
|
121 |
+
]
|
122 |
+
|
123 |
# Common Russian patterns
|
124 |
russian_patterns = [
|
125 |
'привет', 'как дела', 'спасибо', 'пожалуйста', 'здравствуйте',
|
126 |
+
'до свидания', 'добро пожаловать', 'извините', 'хорошо', 'сегодня'
|
127 |
]
|
128 |
|
129 |
# Common Hebrew patterns
|
130 |
hebrew_patterns = [
|
131 |
+
'שלום', 'איך', 'תודה', 'בבקשה', 'סליחה', 'טוב', 'רע', 'כן', 'לא',
|
132 |
+
'בוקר טוב', 'לילה טוב', 'מה שלומך', 'נעים להכיר'
|
133 |
]
|
134 |
|
135 |
# Common Spanish patterns
|
136 |
spanish_patterns = [
|
137 |
+
'hola', 'como estas', 'como estás', 'gracias', 'por favor', 'perdon',
|
138 |
+
'perdón', 'bueno', 'malo', 'buenos dias', 'buenas noches'
|
139 |
]
|
140 |
|
141 |
# Common French patterns
|
142 |
french_patterns = [
|
143 |
+
'bonjour', 'comment allez-vous', 'comment ça va', 'merci',
|
144 |
+
's\'il vous plaît', 'pardon', 'au revoir', 'bonne nuit'
|
145 |
]
|
146 |
|
147 |
+
# Check English first (most common in examples)
|
148 |
+
for pattern in english_patterns:
|
149 |
+
if pattern in text_lower:
|
150 |
+
return 'en'
|
151 |
+
|
152 |
for pattern in russian_patterns:
|
153 |
if pattern in text_lower:
|
154 |
return 'ru'
|
|
|
167 |
|
168 |
return None
|
169 |
|
170 |
+
def is_likely_english(self, text):
    """
    Check if text is likely English based on common English words.

    The text is lower-cased and split on whitespace. Punctuation at the
    edges of each token is stripped before the lookup so that tokens such
    as "hello," or "you?" still count as English words; tokens that
    consist only of punctuation are ignored.

    Args:
        text: The string to inspect.

    Returns:
        True when more than 30% of the tokens are common English words,
        False otherwise (including for empty or whitespace-only text).
    """
    english_indicators = frozenset((
        'the', 'and', 'you', 'are', 'how', 'what', 'where', 'when', 'why',
        'hello', 'today', 'tomorrow', 'good', 'thank', 'please', 'welcome',
    ))

    # Punctuation that commonly clings to a word after a whitespace split.
    edge_punctuation = ".,!?;:\"'()[]{}"

    stripped = (token.strip(edge_punctuation) for token in text.lower().split())
    words = [word for word in stripped if word]

    if not words:
        return False

    english_word_count = sum(1 for word in words if word in english_indicators)

    # If more than 30% are English words, likely English
    return (english_word_count / len(words)) > 0.3
|
194 |
+
|
195 |
def is_cyrillic_russian(self, text):
|
196 |
"""
|
197 |
Check if Cyrillic text is likely Russian based on character patterns
|