YanaGabelev committed on
Commit
117bc9e
·
verified ·
1 Parent(s): 0d8f631

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +48 -4
app.py CHANGED
@@ -98,6 +98,10 @@ class RobustLanguageDetector:
98
  # Post-process common misdetections
99
  if ld_lang == 'mk' and self.is_cyrillic_russian(text):
100
  return 'ru', 'RU', "langdetect_corrected"
 
 
 
 
101
 
102
  return ld_lang, ld_lang.upper(), "langdetect"
103
  except Exception as e:
@@ -109,27 +113,42 @@ class RobustLanguageDetector:
109
  """
110
  text_lower = text.lower()
111
 
 
 
 
 
 
 
 
112
  # Common Russian patterns
113
  russian_patterns = [
114
  'привет', 'как дела', 'спасибо', 'пожалуйста', 'здравствуйте',
115
- 'до свидания', 'добро пожаловать', 'извините', 'хорошо'
116
  ]
117
 
118
  # Common Hebrew patterns
119
  hebrew_patterns = [
120
- 'שלום', 'איך', 'תודה', 'בבקשה', 'סליחה', 'טוב', 'רע', 'כן', 'לא'
 
121
  ]
122
 
123
  # Common Spanish patterns
124
  spanish_patterns = [
125
- 'hola', 'como estas', 'gracias', 'por favor', 'perdon', 'bueno', 'malo'
 
126
  ]
127
 
128
  # Common French patterns
129
  french_patterns = [
130
- 'bonjour', 'comment allez-vous', 'merci', 's\'il vous plaît', 'pardon'
 
131
  ]
132
 
 
 
 
 
 
133
  for pattern in russian_patterns:
134
  if pattern in text_lower:
135
  return 'ru'
@@ -148,6 +167,31 @@ class RobustLanguageDetector:
148
 
149
  return None
150
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
  def is_cyrillic_russian(self, text):
152
  """
153
  Check if Cyrillic text is likely Russian based on character patterns
 
98
  # Post-process common misdetections
99
  if ld_lang == 'mk' and self.is_cyrillic_russian(text):
100
  return 'ru', 'RU', "langdetect_corrected"
101
+ elif ld_lang == 'so' and self.is_likely_english(text):
102
+ return 'en', 'EN', "langdetect_corrected"
103
+ elif ld_lang in ['no', 'da', 'sv'] and self.is_likely_english(text):
104
+ return 'en', 'EN', "langdetect_corrected"
105
 
106
  return ld_lang, ld_lang.upper(), "langdetect"
107
  except Exception as e:
 
113
  """
114
  text_lower = text.lower()
115
 
116
+ # Common English patterns
117
+ english_patterns = [
118
+ 'hello', 'how are you', 'thank you', 'please', 'sorry', 'good', 'bad',
119
+ 'yes', 'no', 'today', 'tomorrow', 'yesterday', 'morning', 'evening',
120
+ 'welcome', 'goodbye', 'nice to meet you', 'see you later'
121
+ ]
122
+
123
  # Common Russian patterns
124
  russian_patterns = [
125
  'привет', 'как дела', 'спасибо', 'пожалуйста', 'здравствуйте',
126
+ 'до свидания', 'добро пожаловать', 'извините', 'хорошо', 'сегодня'
127
  ]
128
 
129
  # Common Hebrew patterns
130
  hebrew_patterns = [
131
+ 'שלום', 'איך', 'תודה', 'בבקשה', 'סליחה', 'טוב', 'רע', 'כן', 'לא',
132
+ 'בוקר טוב', 'לילה טוב', 'מה שלומך', 'נעים להכיר'
133
  ]
134
 
135
  # Common Spanish patterns
136
  spanish_patterns = [
137
+ 'hola', 'como estas', 'como estás', 'gracias', 'por favor', 'perdon',
138
+ 'perdón', 'bueno', 'malo', 'buenos dias', 'buenas noches'
139
  ]
140
 
141
  # Common French patterns
142
  french_patterns = [
143
+ 'bonjour', 'comment allez-vous', 'comment ça va', 'merci',
144
+ 's\'il vous plaît', 'pardon', 'au revoir', 'bonne nuit'
145
  ]
146
 
147
+ # Check English first (most common in examples)
148
+ for pattern in english_patterns:
149
+ if pattern in text_lower:
150
+ return 'en'
151
+
152
  for pattern in russian_patterns:
153
  if pattern in text_lower:
154
  return 'ru'
 
167
 
168
  return None
169
 
170
def is_likely_english(self, text):
    """
    Check if text is likely English based on common English words.

    The text is lower-cased and split on whitespace. Surrounding ASCII
    punctuation is stripped from each token so that words such as
    "Hello!" or "you?" are still recognised as indicator words. Tokens
    that consist only of punctuation are ignored.

    Args:
        text: The string to inspect.

    Returns:
        True if more than 30% of the words are common English indicator
        words, False otherwise (including empty or whitespace-only text).
    """
    # Function-scope import: the file's top-level import block is not
    # visible from this section, so keep the dependency local.
    import string

    english_indicators = frozenset((
        'the', 'and', 'you', 'are', 'how', 'what', 'where', 'when', 'why',
        'hello', 'today', 'tomorrow', 'good', 'thank', 'please', 'welcome',
    ))

    # Normalise tokens: drop leading/trailing punctuation, skip empties.
    stripped = (token.strip(string.punctuation) for token in text.lower().split())
    words = [word for word in stripped if word]
    if not words:
        return False

    english_word_count = sum(1 for word in words if word in english_indicators)

    # If more than 30% are English words, likely English
    return (english_word_count / len(words)) > 0.3
194
+
195
  def is_cyrillic_russian(self, text):
196
  """
197
  Check if Cyrillic text is likely Russian based on character patterns