# sema-api/app/services/languages.py
# kamau1: updated african language list (1299535)
"""
Language support service - provides information about supported languages
"""
from typing import Dict, List, Optional
from ..core.logging import get_logger
logger = get_logger()
# Registry of supported languages, keyed by FLORES-200 style codes of the form
# "<language>_<Script>". Every entry carries the same four string fields:
# "name" (English name), "native_name", "region" and "script".
#
# Each code may appear only ONCE in this literal: a repeated key silently
# replaces the earlier entry, and a language can carry a single region only.
#
# NOTE(review): a few keys (e.g. "cmn_Hans", "ara_Arab", "orm_Latn") do not
# look like canonical FLORES-200 codes - confirm against the translation
# model's vocabulary before relying on them.
SUPPORTED_LANGUAGES: Dict[str, Dict[str, str]] = {
    # African Languages
    "afr_Latn": {"name": "Afrikaans", "native_name": "Afrikaans", "region": "Africa", "script": "Latin"},
    "aka_Latn": {"name": "Akan", "native_name": "Akan", "region": "Africa", "script": "Latin"},
    "amh_Ethi": {"name": "Amharic", "native_name": "አማርኛ", "region": "Africa", "script": "Ethiopic"},
    "bam_Latn": {"name": "Bambara", "native_name": "Bamanankan", "region": "Africa", "script": "Latin"},
    "bem_Latn": {"name": "Bemba", "native_name": "Ichibemba", "region": "Africa", "script": "Latin"},
    "dik_Latn": {"name": "Dinka", "native_name": "Thuɔŋjäŋ", "region": "Africa", "script": "Latin"},
    "dyu_Latn": {"name": "Dyula", "native_name": "Jula", "region": "Africa", "script": "Latin"},
    "ewe_Latn": {"name": "Ewe", "native_name": "Eʋegbe", "region": "Africa", "script": "Latin"},
    "fon_Latn": {"name": "Fon", "native_name": "Fɔngbe", "region": "Africa", "script": "Latin"},
    "fuv_Latn": {"name": "Nigerian Fulfulde", "native_name": "Fulfulde", "region": "Africa", "script": "Latin"},
    "gaz_Latn": {"name": "West Central Oromo", "native_name": "Oromoo", "region": "Africa", "script": "Latin"},
    "hau_Latn": {"name": "Hausa", "native_name": "Harshen Hausa", "region": "Africa", "script": "Latin"},
    "ibo_Latn": {"name": "Igbo", "native_name": "Asụsụ Igbo", "region": "Africa", "script": "Latin"},
    "kab_Latn": {"name": "Kabyle", "native_name": "Taqbaylit", "region": "Africa", "script": "Latin"},
    "kam_Latn": {"name": "Kamba", "native_name": "Kikamba", "region": "Africa", "script": "Latin"},
    "kbp_Latn": {"name": "Kabiyè", "native_name": "Kabɩyɛ", "region": "Africa", "script": "Latin"},
    "kea_Latn": {"name": "Kabuverdianu", "native_name": "Kabuverdianu", "region": "Africa", "script": "Latin"},
    "kik_Latn": {"name": "Kikuyu", "native_name": "Gĩkũyũ", "region": "Africa", "script": "Latin"},
    "kin_Latn": {"name": "Kinyarwanda", "native_name": "Ikinyarwanda", "region": "Africa", "script": "Latin"},
    "kmb_Latn": {"name": "Kimbundu", "native_name": "Kimbundu", "region": "Africa", "script": "Latin"},
    "knc_Arab": {"name": "Central Kanuri", "native_name": "Kanuri", "region": "Africa", "script": "Arabic"},
    "knc_Latn": {"name": "Central Kanuri", "native_name": "Kanuri", "region": "Africa", "script": "Latin"},
    "kon_Latn": {"name": "Kikongo", "native_name": "Kikongo", "region": "Africa", "script": "Latin"},
    "lin_Latn": {"name": "Lingala", "native_name": "Lingála", "region": "Africa", "script": "Latin"},
    "lua_Latn": {"name": "Luba-Lulua", "native_name": "Tshiluba", "region": "Africa", "script": "Latin"},
    "lug_Latn": {"name": "Luganda", "native_name": "Luganda", "region": "Africa", "script": "Latin"},
    "luo_Latn": {"name": "Luo", "native_name": "Dholuo", "region": "Africa", "script": "Latin"},
    "mos_Latn": {"name": "Mossi", "native_name": "Mooré", "region": "Africa", "script": "Latin"},
    "nso_Latn": {"name": "Northern Sotho", "native_name": "Sesotho sa Leboa", "region": "Africa", "script": "Latin"},
    "nus_Latn": {"name": "Nuer", "native_name": "Thok Nath", "region": "Africa", "script": "Latin"},
    "nya_Latn": {"name": "Nyanja", "native_name": "Chinyanja", "region": "Africa", "script": "Latin"},
    "orm_Latn": {"name": "Oromo", "native_name": "Afaan Oromoo", "region": "Africa", "script": "Latin"},
    "run_Latn": {"name": "Rundi", "native_name": "Ikirundi", "region": "Africa", "script": "Latin"},
    "sag_Latn": {"name": "Sango", "native_name": "Sängö", "region": "Africa", "script": "Latin"},
    "sna_Latn": {"name": "Shona", "native_name": "ChiShona", "region": "Africa", "script": "Latin"},
    "som_Latn": {"name": "Somali", "native_name": "Soomaali", "region": "Africa", "script": "Latin"},
    "sot_Latn": {"name": "Southern Sotho", "native_name": "Sesotho", "region": "Africa", "script": "Latin"},
    "ssw_Latn": {"name": "Swati", "native_name": "SiSwati", "region": "Africa", "script": "Latin"},
    "swh_Latn": {"name": "Swahili", "native_name": "Kiswahili", "region": "Africa", "script": "Latin"},
    "taq_Latn": {"name": "Tamasheq", "native_name": "Tamasheq", "region": "Africa", "script": "Latin"},
    "taq_Tfng": {"name": "Tamasheq", "native_name": "ⵜⴰⵎⴰⵌⴰⵖ", "region": "Africa", "script": "Tifinagh"},
    "tir_Ethi": {"name": "Tigrinya", "native_name": "ትግርኛ", "region": "Africa", "script": "Ethiopic"},
    "tsn_Latn": {"name": "Tswana", "native_name": "Setswana", "region": "Africa", "script": "Latin"},
    "tso_Latn": {"name": "Tsonga", "native_name": "Xitsonga", "region": "Africa", "script": "Latin"},
    "tum_Latn": {"name": "Tumbuka", "native_name": "Chitumbuka", "region": "Africa", "script": "Latin"},
    "twi_Latn": {"name": "Twi", "native_name": "Twi", "region": "Africa", "script": "Latin"},
    "tzm_Tfng": {"name": "Central Atlas Tamazight", "native_name": "ⵜⴰⵎⴰⵣⵉⵖⵜ", "region": "Africa", "script": "Tifinagh"},
    "umb_Latn": {"name": "Umbundu", "native_name": "Umbundu", "region": "Africa", "script": "Latin"},
    "wol_Latn": {"name": "Wolof", "native_name": "Wolof", "region": "Africa", "script": "Latin"},
    "xho_Latn": {"name": "Xhosa", "native_name": "isiXhosa", "region": "Africa", "script": "Latin"},
    "yor_Latn": {"name": "Yoruba", "native_name": "Yorùbá", "region": "Africa", "script": "Latin"},
    "zul_Latn": {"name": "Zulu", "native_name": "isiZulu", "region": "Africa", "script": "Latin"},
    # European Languages
    # English, French, Spanish and Portuguese are registered here only; they
    # must not be repeated under another region (see the header note).
    "eng_Latn": {"name": "English", "native_name": "English", "region": "Europe", "script": "Latin"},
    "fra_Latn": {"name": "French", "native_name": "Français", "region": "Europe", "script": "Latin"},
    "deu_Latn": {"name": "German", "native_name": "Deutsch", "region": "Europe", "script": "Latin"},
    "spa_Latn": {"name": "Spanish", "native_name": "Español", "region": "Europe", "script": "Latin"},
    "ita_Latn": {"name": "Italian", "native_name": "Italiano", "region": "Europe", "script": "Latin"},
    "por_Latn": {"name": "Portuguese", "native_name": "Português", "region": "Europe", "script": "Latin"},
    "rus_Cyrl": {"name": "Russian", "native_name": "Русский", "region": "Europe", "script": "Cyrillic"},
    "nld_Latn": {"name": "Dutch", "native_name": "Nederlands", "region": "Europe", "script": "Latin"},
    "pol_Latn": {"name": "Polish", "native_name": "Polski", "region": "Europe", "script": "Latin"},
    "ces_Latn": {"name": "Czech", "native_name": "Čeština", "region": "Europe", "script": "Latin"},
    "hun_Latn": {"name": "Hungarian", "native_name": "Magyar", "region": "Europe", "script": "Latin"},
    "ron_Latn": {"name": "Romanian", "native_name": "Română", "region": "Europe", "script": "Latin"},
    "bul_Cyrl": {"name": "Bulgarian", "native_name": "Български", "region": "Europe", "script": "Cyrillic"},
    "hrv_Latn": {"name": "Croatian", "native_name": "Hrvatski", "region": "Europe", "script": "Latin"},
    "srp_Cyrl": {"name": "Serbian", "native_name": "Српски", "region": "Europe", "script": "Cyrillic"},
    "slk_Latn": {"name": "Slovak", "native_name": "Slovenčina", "region": "Europe", "script": "Latin"},
    "slv_Latn": {"name": "Slovenian", "native_name": "Slovenščina", "region": "Europe", "script": "Latin"},
    "est_Latn": {"name": "Estonian", "native_name": "Eesti", "region": "Europe", "script": "Latin"},
    "lav_Latn": {"name": "Latvian", "native_name": "Latviešu", "region": "Europe", "script": "Latin"},
    "lit_Latn": {"name": "Lithuanian", "native_name": "Lietuvių", "region": "Europe", "script": "Latin"},
    # Asian Languages
    "cmn_Hans": {"name": "Chinese (Simplified)", "native_name": "中文 (简体)", "region": "Asia", "script": "Han"},
    "cmn_Hant": {"name": "Chinese (Traditional)", "native_name": "中文 (繁體)", "region": "Asia", "script": "Han"},
    "jpn_Jpan": {"name": "Japanese", "native_name": "日本語", "region": "Asia", "script": "Japanese"},
    "kor_Hang": {"name": "Korean", "native_name": "한국어", "region": "Asia", "script": "Hangul"},
    "hin_Deva": {"name": "Hindi", "native_name": "हिन्दी", "region": "Asia", "script": "Devanagari"},
    "ben_Beng": {"name": "Bengali", "native_name": "বাংলা", "region": "Asia", "script": "Bengali"},
    "urd_Arab": {"name": "Urdu", "native_name": "اردو", "region": "Asia", "script": "Arabic"},
    "tam_Taml": {"name": "Tamil", "native_name": "தமிழ்", "region": "Asia", "script": "Tamil"},
    "tel_Telu": {"name": "Telugu", "native_name": "తెలుగు", "region": "Asia", "script": "Telugu"},
    "mar_Deva": {"name": "Marathi", "native_name": "मराठी", "region": "Asia", "script": "Devanagari"},
    "guj_Gujr": {"name": "Gujarati", "native_name": "ગુજરાતી", "region": "Asia", "script": "Gujarati"},
    "kan_Knda": {"name": "Kannada", "native_name": "ಕನ್ನಡ", "region": "Asia", "script": "Kannada"},
    "mal_Mlym": {"name": "Malayalam", "native_name": "മലയാളം", "region": "Asia", "script": "Malayalam"},
    "ori_Orya": {"name": "Odia", "native_name": "ଓଡ଼ିଆ", "region": "Asia", "script": "Odia"},
    "pan_Guru": {"name": "Punjabi", "native_name": "ਪੰਜਾਬੀ", "region": "Asia", "script": "Gurmukhi"},
    "tha_Thai": {"name": "Thai", "native_name": "ไทย", "region": "Asia", "script": "Thai"},
    "vie_Latn": {"name": "Vietnamese", "native_name": "Tiếng Việt", "region": "Asia", "script": "Latin"},
    "ind_Latn": {"name": "Indonesian", "native_name": "Bahasa Indonesia", "region": "Asia", "script": "Latin"},
    "msa_Latn": {"name": "Malay", "native_name": "Bahasa Melayu", "region": "Asia", "script": "Latin"},
    "tgl_Latn": {"name": "Tagalog", "native_name": "Tagalog", "region": "Asia", "script": "Latin"},
    # Mizo is spoken in north-east India, so it belongs with the Asian
    # languages rather than the African ones.
    "lus_Latn": {"name": "Mizo", "native_name": "Mizo ṭawng", "region": "Asia", "script": "Latin"},
    # Middle Eastern Languages
    "ara_Arab": {"name": "Arabic", "native_name": "العربية", "region": "Middle East", "script": "Arabic"},
    "heb_Hebr": {"name": "Hebrew", "native_name": "עברית", "region": "Middle East", "script": "Hebrew"},
    "fas_Arab": {"name": "Persian", "native_name": "فارسی", "region": "Middle East", "script": "Arabic"},
    "tur_Latn": {"name": "Turkish", "native_name": "Türkçe", "region": "Middle East", "script": "Latin"},
}
def get_all_languages() -> Dict[str, Dict[str, str]]:
    """Return the full registry of supported languages.

    Returns:
        The module-level ``SUPPORTED_LANGUAGES`` mapping itself (not a copy),
        keyed by language code.
    """
    registry = SUPPORTED_LANGUAGES
    return registry
def get_languages_by_region(region: str) -> Dict[str, Dict[str, str]]:
    """Return the languages whose region matches ``region``.

    Args:
        region: Region name to match; letter case is ignored.

    Returns:
        A new dict mapping language code to its metadata for every matching
        entry. It is empty when no language is registered for the region.
    """
    wanted = region.lower()
    matches = {}
    for lang_code, details in SUPPORTED_LANGUAGES.items():
        if details["region"].lower() != wanted:
            continue
        matches[lang_code] = details
    return matches
def get_language_info(language_code: str) -> Optional[Dict[str, str]]:
    """Look up the metadata for a single language code.

    Args:
        language_code: Registry key; the lookup is exact and case-sensitive.

    Returns:
        The metadata dict for the code, or ``None`` when the code is unknown.
    """
    try:
        return SUPPORTED_LANGUAGES[language_code]
    except KeyError:
        return None
def is_language_supported(language_code: str) -> bool:
    """Tell whether ``language_code`` is a key of the language registry.

    Args:
        language_code: Code to test; the match is exact and case-sensitive.

    Returns:
        ``True`` if the code is registered, ``False`` otherwise.
    """
    known_codes = SUPPORTED_LANGUAGES.keys()
    return language_code in known_codes
def get_popular_languages() -> Dict[str, Dict[str, str]]:
    """Return the hard-coded shortlist of popular languages.

    The shortlist names a set of global languages followed by a set of
    African languages. Codes that are missing from the registry are skipped.

    Returns:
        A new dict of language code to metadata, in shortlist order.
    """
    global_codes = (
        "eng_Latn", "spa_Latn", "fra_Latn", "deu_Latn", "ita_Latn", "por_Latn",
        "rus_Cyrl", "cmn_Hans", "jpn_Jpan", "kor_Hang", "ara_Arab", "hin_Deva",
    )
    african_codes = (
        "swh_Latn", "hau_Latn", "yor_Latn", "amh_Ethi", "som_Latn", "kik_Latn",
        "afr_Latn", "ibo_Latn", "orm_Latn", "aka_Latn", "bam_Latn", "fon_Latn",
        "lin_Latn", "lug_Latn", "nya_Latn", "sna_Latn", "tir_Ethi", "wol_Latn",
        "xho_Latn", "zul_Latn", "tsn_Latn", "sot_Latn",
    )
    shortlist = {}
    for lang_code in global_codes + african_codes:
        if lang_code in SUPPORTED_LANGUAGES:
            shortlist[lang_code] = SUPPORTED_LANGUAGES[lang_code]
    return shortlist
def get_african_languages() -> Dict[str, Dict[str, str]]:
    """Return the languages registered under the "Africa" region.

    Returns:
        The result of ``get_languages_by_region("Africa")``.
    """
    african = get_languages_by_region("Africa")
    return african
def search_languages(query: str) -> Dict[str, Dict[str, str]]:
    """Search languages by English name, native name or language code.

    Matching is a caseless substring test. Whitespace around ``query`` is
    ignored, so input such as ``" swahili "`` still finds Swahili.

    Args:
        query: Text to look for. An empty (or whitespace-only) query matches
            every language, because the empty string is a substring of
            everything.

    Returns:
        A new dict mapping each matching language code to its metadata, in
        registry order.
    """
    # casefold() is the str method intended for caseless comparison.
    needle = query.strip().casefold()
    return {
        code: info
        for code, info in SUPPORTED_LANGUAGES.items()
        if any(
            needle in haystack.casefold()
            for haystack in (info["name"], info["native_name"], code)
        )
    }
def get_language_statistics() -> Dict[str, object]:
    """Summarise the language registry.

    Returns:
        A dict with these keys:

        * ``"total_languages"``: number of registered language codes.
        * ``"regions"``: number of distinct regions.
        * ``"scripts"``: number of distinct scripts.
        * ``"by_region"``: dict mapping each region to its language count.

        The value type is ``object`` because ``"by_region"`` is a nested
        dict while the other values are ints.
    """
    region_counts: Dict[str, int] = {}
    scripts = set()
    # A single pass collects both the per-region tally and the script set.
    for info in SUPPORTED_LANGUAGES.values():
        region = info["region"]
        region_counts[region] = region_counts.get(region, 0) + 1
        scripts.add(info["script"])
    return {
        "total_languages": len(SUPPORTED_LANGUAGES),
        # Every distinct region is a key of the tally.
        "regions": len(region_counts),
        "scripts": len(scripts),
        "by_region": region_counts,
    }