import json
from functools import partial

import gradio as gr
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import pycountry
from gradio_rangeslider import RangeSlider
from tqdm import tqdm

with open("results.json") as f:
    languages = json.load(f)

languages_with_scores = [lang for lang in languages if lang["t2t_score"] is not None]
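
# Assumed shape of each record in results.json (a sketch inferred from the field
# accesses below, not an authoritative schema):
#   {
#     "language_name": "...", "bcp_47": "...", "language_family": "...",
#     "speakers": 0, "population": {"DE": 0, ...},
#     "commonvoice_hours": 0.0, "commonvoice_locale": "...",
#     "t2t_score": 0.0, "s2t_score": 0.0, "mt_bleu": 0.0, "mt_chrf": 0.0,
#     "cls_acc": 0.0, "mlm_chrf": 0.0, "asr_wer": 0.0, "asr_chrf": 0.0,
#     "scores": [{"model": "org/model", "model_type": "text-to-text", ...}],
#   }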

# Global constants for metric mappings
METRICS = {
    "t2t": [
        {
            "display_name": "Overall Text-to-Text Performance",
            "field_name": "t2t_score",
            "label": "Overall Score",
            "explanation": """
            **Overall Score for Text-to-Text Performance**: A weighted combination of all metrics, providing a holistic view of model performance across different language tasks.
            Higher scores indicate better overall language capabilities.
            """,
        },
        {
            "display_name": "Translation (BLEU)",
            "field_name": "mt_bleu",
            "label": "BLEU Score",
            "explanation": """
            **Translation BLEU**: BiLingual Evaluation Understudy (BLEU) measures how similar AI-generated translations are to human reference translations.
            It calculates n-gram precision and applies a brevity penalty. Scores range from 0 to 1, with higher values indicating better translation quality.
            """,
        },
        {
            "display_name": "Translation (ChrF)",
            "field_name": "mt_chrf",
            "label": "ChrF Score",
            "explanation": """
            **Translation ChrF**: Character n-gram F-score evaluates translations at the character level rather than the word level.
            This metric is particularly valuable for morphologically rich languages and can better capture partial word matches.
            Higher scores (0-1) indicate better translations.
            """,
        },
        {
            "display_name": "Classification (Accuracy)",
            "field_name": "cls_acc",
            "label": "Classification Accuracy",
            "explanation": """
            **Classification Accuracy**: Measures how accurately models can classify text into predefined categories.
            This evaluates a model's understanding of content and context across different languages.
            Reported as a percentage where higher values indicate better classification performance.
            """,
        },
        {
            "display_name": "Masked Language Modeling (ChrF)",
            "field_name": "mlm_chrf",
            "label": "MLM ChrF Score",
            "explanation": """
            **Masked Language Modeling ChrF**: Evaluates how well models can predict masked (hidden) portions of text.
            This tests a model's understanding of language structure and semantics by measuring the character-level similarity
            between predicted and actual text. Higher scores indicate better language understanding.
            """,
        },
    ],
    "s2t": [
        {
            "display_name": "Overall Speech-to-Text Performance",
            "field_name": "s2t_score",
            "label": "Overall Score",
            "explanation": """
            **Overall Score for Speech-to-Text Performance**: A weighted combination of all metrics, providing a holistic view of model performance across different language tasks.
            Higher scores indicate better overall language capabilities.
            """,
        },
        {
            "display_name": "Automatic Speech Recognition (WER)",
            "field_name": "asr_wer",
            "label": "WER",
            "explanation": """
            **Automatic Speech Recognition Word Error Rate**: Measures the accuracy of speech-to-text transcription.
            It calculates the minimum number of word edits (insertions, deletions, substitutions) needed to transform the
            transcription into the reference text, divided by the number of words in the reference.
            Lower scores indicate better performance, with 0 being perfect transcription.
            """,
        },
        {
            "display_name": "Automatic Speech Recognition (ChrF)",
            "field_name": "asr_chrf",
            "label": "ChrF",
            "explanation": """
            **Automatic Speech Recognition ChrF**: Character n-gram F-score evaluates transcriptions at the character level rather than the word level.
            This metric is particularly valuable for morphologically rich languages and can better capture partial word matches.
            Higher scores (0-1) indicate better transcriptions.
            """,
        },
    ],
}
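
# Sketch of the WER formula described above (an illustrative assumption; the
# actual scoring happens in the evaluation pipeline, not in this app):
#
#   def wer(reference: list[str], hypothesis: list[str]) -> float:
#       # Dynamic-programming edit distance over word tokens
#       d = list(range(len(hypothesis) + 1))
#       for i, ref_word in enumerate(reference, 1):
#           prev, d[0] = d[0], i
#           for j, hyp_word in enumerate(hypothesis, 1):
#               prev, d[j] = d[j], min(
#                   d[j] + 1,  # deletion
#                   d[j - 1] + 1,  # insertion
#                   prev + (ref_word != hyp_word),  # substitution
#               )
#       return d[len(hypothesis)] / len(reference)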


def mean(lst):
    return sum(lst) / len(lst)


def create_leaderboard_df(model_type, metric=None):
    metric = metric or METRICS[model_type][0]
    _model_type = {"t2t": "text-to-text", "s2t": "speech-to-text"}[model_type]
    models = {
        score["model"]
        for lang in languages_with_scores
        for score in lang["scores"]
        if score["model_type"] == _model_type
    }
    model_scores = [
        {"model": score["model"], metric["field_name"]: score[metric["field_name"]]}
        for lang in languages_with_scores
        for score in lang["scores"]
        if score["model"] in models
    ]
    df = (
        pd.DataFrame(model_scores)
        .groupby("model")
        .agg({metric["field_name"]: ["mean", "count"]})
        .reset_index()
    )
    # Flatten the multi-level column names, e.g. ("mt_bleu", "mean") -> "mt_bleu_mean"
    df.columns = df.columns.map(
        lambda x: f"{x[0]}_{x[1]}" if isinstance(x, tuple) else x
    )
    df = df.rename(
        columns={
            f"{metric['field_name']}_mean": metric["label"],
            f"{metric['field_name']}_count": "Languages Tested",
            "model_": "Model",
        }
    )
    df[metric["label"]] = df[metric["label"]].round(3)
    df = df.sort_values(metric["label"], ascending=False)
    df["Rank"] = range(1, len(df) + 1)
    df["Rank"] = df["Rank"].apply(
        lambda x: "🥇" if x == 1 else "🥈" if x == 2 else "🥉" if x == 3 else str(x)
    )
    df = df[["Rank", "Model", metric["label"]]]
    return gr.DataFrame(
        value=df,
        label="Model Leaderboard",
        show_search=False,
        datatype=["str", "markdown", "number"],  # Rank is a string (medal emojis)
    )
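
# Usage sketch: create_leaderboard_df("t2t") ranks text-to-text models by their
# mean overall score across all evaluated languages.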


def create_model_comparison_plot(metric):
    top_languages = sorted(
        languages_with_scores, key=lambda x: x["speakers"], reverse=True
    )[:10]

    # Create appropriate title and y-axis label based on metric
    title = f"{metric['display_name']} by Model and Language"
    y_label = metric["label"]

    # Flatten the data for the selected metric
    scores_flat = []
    for lang in top_languages:
        for score in lang["scores"]:
            # Get the value directly using the field name
            if metric["field_name"] not in score:
                continue
            value = score[metric["field_name"]]
            if value is not None:
                scores_flat.append(
                    {
                        "language": lang["language_name"],
                        "model": score["model"],
                        "value": value,
                    }
                )
    df = pd.DataFrame(scores_flat)
    fig = px.bar(df, x="language", y="value", color="model", barmode="group")
    fig.update_layout(
        title=title,
        xaxis_title=None,
        yaxis_title=y_label,
        barmode="group",
        height=500,
        legend=dict(
            orientation="h",  # horizontal orientation
            yanchor="bottom",
            y=-0.3,  # position below plot
            xanchor="center",
            x=0.5,  # center horizontally
        ),
    )
    return fig
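
# e.g. create_model_comparison_plot(METRICS["t2t"][0]) -> grouped bar chart of
# the overall t2t score per model for the ten most-spoken languages.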


def create_language_stats_df(metric):
    # Create a list to store flattened data
    flat_data = []

    for lang in languages:
        # Find the best model by overall text-to-text score
        best_model = (
            max(
                lang["scores"] or [{"t2t_score": None, "model": None}],
                key=lambda x: x.get("t2t_score") or 0,
            )
            if lang["t2t_score"] is not None
            else None
        )
        model = best_model["model"] if best_model else None
        model_name = model.split("/")[-1] if model else "N/A"
        model_link = (
            f"<a href='https://openrouter.ai/{model}' style='text-decoration: none; color: inherit;'>{model_name}</a>"
            if model
            else "N/A"
        )
        # The hidden zero-padded number makes the markdown column sort numerically
        commonvoice_link = (
            f"<!--{lang['commonvoice_hours']:07} (for sorting)--> <a href='https://commonvoice.mozilla.org/{lang['commonvoice_locale']}/speak' style='text-decoration: none; color: inherit;'>🎙️ {round(lang['commonvoice_hours'])}h</a>"
            if lang["commonvoice_hours"]
            else "N/A"
        )
        language_link = f"<a href='/{lang['bcp_47']}' style='text-decoration: none; font-weight: bold;'>{lang['language_name']}</a>"
        row = {
            "Language": language_link,
            "Speakers (M)": round(lang["speakers"] / 1_000_000, 1),
            # "Models Tested": len(lang["scores"]),
            # "Overall": round(lang["overall_score"], 3)
            # if lang["overall_score"] is not None
            # else "N/A",
            "Best Model": model_link,
            "Translation": round(lang["mt_chrf"], 3)
            if lang["mt_chrf"] is not None
            else "N/A",
            "Classification": round(lang["cls_acc"], 3)
            if lang["cls_acc"] is not None
            else "N/A",
            "Masked Language Modeling": round(lang["mlm_chrf"], 3)
            if lang["mlm_chrf"] is not None
            else "N/A",
            "Speech Recognition": round(lang["asr_chrf"], 3)
            if lang["asr_chrf"] is not None
            else "N/A",
            "CommonVoice": commonvoice_link,
        }
        flat_data.append(row)

    df = pd.DataFrame(flat_data)
    return gr.DataFrame(
        value=df,
        label="Language Results",
        show_search="search",
        pinned_columns=1,
        column_widths=[
            "100px",  # Language
            "100px",  # Speakers
            # "100px",
            # "100px",
            "200px",  # Best Model
            "100px",  # MT
            "100px",  # CLS
            "100px",  # MLM
            "100px",  # ASR
            "100px",  # Common Voice
        ],
        datatype=[
            "markdown",  # Language
            "number",  # Speakers
            # "number",  # Models Tested
            # "number",  # Overall
            "markdown",  # Best Model
            "number",  # Translation
            "number",  # Classification
            "number",  # MLM
            "number",  # ASR
            "markdown",  # CommonVoice Hours
        ],
    )


def create_scatter_plot(metric):
    # Create a list to store data for the scatter plot
    scatter_data = []

    for lang in languages_with_scores:
        if lang["speakers"] < 100_000:
            continue
        # Calculate average score for this metric across all models
        scores = [
            score[metric["field_name"]]
            for score in lang["scores"]
            if metric["field_name"] in score and score[metric["field_name"]] is not None
        ]
        if scores:  # Only include if we have valid scores
            avg_score = sum(scores) / len(scores)
            scatter_data.append(
                {
                    "language": lang["language_name"],
                    "speakers": lang["speakers"],
                    "score": avg_score,
                    "family": lang["language_family"],
                }
            )

    fig = go.Figure()
    x_vals = [data["speakers"] / 1_000_000 for data in scatter_data]
    y_vals = [data["score"] for data in scatter_data]
    s_vals = [data["speakers"] / 20_000_000 for data in scatter_data]
    color_palette = [
        "LightSkyBlue",
        "LightGreen",
        "LightCoral",
        "LightPink",
        "LightGoldenRodYellow",
        "LightGray",
        "LightSalmon",
        "LightSeaGreen",
    ]
    color_mapping = {
        family: color
        for family, color in zip(
            sorted(set(data["family"] for data in scatter_data)), color_palette
        )
    }
    c_vals = [color_mapping.get(data["family"], "LightGray") for data in scatter_data]
    labels = [data["language"] for data in scatter_data]
    hover_template = f"<b>%{{text}}</b><br>Speakers: %{{x:.1f}}M<br>{metric['label']}: %{{y:.3f}}<extra></extra>"
    fig.add_trace(
        go.Scatter(
            x=x_vals,
            y=y_vals,
            marker=dict(size=s_vals, color=c_vals),
            mode="markers+text",
            text=labels,
            textposition="top center",
            hovertemplate=hover_template,
        )
    )
    fig.update_layout(
        title=None,
        xaxis_title="Number of Speakers (Millions)",
        yaxis_title=metric["label"],
        height=500,
        showlegend=False,
    )
    fig.update_xaxes(type="log")
    return fig


def format_number(n):
    """Format a number with a K/M suffix."""
    if n >= 1_000_000:
        return f"{n / 1_000_000:.1f}M"
    elif n >= 1_000:
        return f"{n / 1_000:.0f}K"
    return str(n)
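
# e.g. format_number(1_300_000) -> "1.3M"; format_number(950_000) -> "950K"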


def get_population_data():
    import xml.etree.ElementTree as ET

    from language_data.util import data_filename

    filename = data_filename("supplementalData.xml")
    # Parse the file directly rather than fromstring(open(...).read()),
    # which left the file handle open
    root = ET.parse(filename).getroot()
    territories = root.findall("./territoryInfo/territory")
    data = {}
    for territory in territories:
        t_code = territory.attrib["type"]
        t_population = float(territory.attrib["population"])
        data[t_code] = t_population
    return data
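
# e.g. get_population_data()["DE"] -> population of the territory "DE" (Germany)
# as recorded in CLDR's supplementalData.xml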


# Helper functions for visualization
def make_black_bar(value, max_width=10):
    # Clamp so the bar stays within max_width even if value falls outside [0, 1]
    filled = max(0, min(int(value * max_width), max_width))
    return "⬛️" * filled + "⬜️" * (max_width - filled)


def make_colored_bar(score, max_width=10):
    """Create a colored bar using Unicode blocks based on normalized score

    🟦 for high values (>0.35)
    🟨 for medium values (0.25-0.35)
    🟥 for low values (<0.25)
    ⬜ for empty space

    This function handles both normalization and bar creation.
    """
    # Create the bar based on normalized value
    filled = int(score * max_width)
    filled = max(0, min(filled, max_width))
    empty = max_width - filled

    if score > 0.35:
        return "🟦" * filled + "⬜" * empty
    elif score > 0.25:
        return "🟨" * filled + "⬜" * empty
    else:
        return "🟥" * filled + "⬜" * empty


def create_world_map(metric):
    # Collect all country data
    population_data = get_population_data()
    country_data = {}
    for lang in languages:
        # Skip languages without the required data
        if "population" not in lang or lang[metric["field_name"]] is None:
            continue

        for country_code, speakers in lang["population"].items():
            try:
                # Convert alpha_2 (2-letter) to alpha_3 (3-letter) code
                country = pycountry.countries.get(alpha_2=country_code)
                if country is None:
                    continue
                iso3_code = country.alpha_3
                if iso3_code not in country_data:
                    country_data[iso3_code] = {
                        "total_speakers": 0,
                        "population": population_data.get(country_code, 0),
                        "weighted_score_sum": 0,
                        "languages": [],
                    }

                country_data[iso3_code]["total_speakers"] += speakers
                country_data[iso3_code]["weighted_score_sum"] += (
                    speakers * lang[metric["field_name"]]
                )
                country_data[iso3_code]["languages"].append(
                    {
                        "name": lang["language_name"],
                        "speakers": speakers,
                        "score": lang[metric["field_name"]],
                    }
                )
            except (KeyError, AttributeError):
                # Skip invalid or unrecognized country codes
                continue

    # Calculate the speaker-weighted average per country and prepare hover text:
    # weighted_avg = sum(speakers_i * score_i) / sum(speakers_i)
    countries = []
    scores = []
    hover_texts = []

    for country_code, data in country_data.items():
        weighted_avg = (
            data["weighted_score_sum"] / data["total_speakers"]
            if data["total_speakers"] > 0
            else None
        )

        try:
            country_name = pycountry.countries.get(alpha_3=country_code).name
        except AttributeError:
            country_name = country_code

        # Sort languages by number of speakers
        langs = sorted(data["languages"], key=lambda x: x["speakers"], reverse=True)

        # Take top 5 languages and summarize the rest
        main_langs = langs[:5]
        other_langs = langs[5:]

        # Create language rows with bars; guard against missing CLDR population
        # data to avoid division by zero
        population = data["population"] or 1
        lang_rows = []
        for lang in main_langs:
            percentage = (lang["speakers"] / population) * 100
            speaker_bar = make_black_bar(percentage / 100)
            score_bar = make_colored_bar(lang["score"])

            lang_rows.append(
                f"<b>{lang['name']}</b><br>"
                f"{speaker_bar} {format_number(lang['speakers'])} speakers<br>"
                f"{score_bar} {lang['score']:.3f} {metric['label']}<br>"
            )

        # Add summary for other languages if any
        if other_langs:
            other_speakers = sum(lang["speakers"] for lang in other_langs)
            other_percentage = (other_speakers / population) * 100
            other_avg_score = sum(lang["score"] for lang in other_langs) / len(
                other_langs
            )
            speaker_bar = make_black_bar(other_percentage / 100)
            score_bar = make_colored_bar(other_avg_score)

            lang_rows.append(
                f"<b>+{len(other_langs)} other languages</b><br>"
                f"{speaker_bar} {format_number(other_speakers)} speakers<br>"
                f"{score_bar} {other_avg_score:.3f} {metric['label']}<br>"
            )

        hover_text = f"<b>{country_name}</b><br><br>" + "<br>".join(lang_rows)

        countries.append(country_code)
        scores.append(weighted_avg)
        hover_texts.append(hover_text)

    fig = go.Figure(
        data=go.Choropleth(
            locations=countries,
            locationmode="ISO-3",
            z=scores,
            text=hover_texts,
            hoverinfo="text",
            colorscale=[[0, "#ff9999"], [1, "#99ccff"]],
            colorbar=dict(
                title=metric["label"],
                orientation="h",  # horizontal orientation
                y=-0.2,  # position below map
                yanchor="bottom",
                len=0.5,  # length of colorbar
                x=0.5,  # center horizontally
                xanchor="center",
                thickness=20,  # make it a bit thicker when horizontal
            ),
        )
    )

    fig.update_layout(
        title=dict(
            text=f"{metric['display_name']} by Country", x=0.5, xanchor="center"
        ),
        geo=dict(
            showframe=True,
            showcoastlines=True,
            projection_type="equal earth",
            showland=True,
            landcolor="#f8f9fa",
            coastlinecolor="#e0e0e0",
            countrycolor="#e0e0e0",
        ),
        height=600,
        margin=dict(l=0, r=0, t=30, b=0),
        paper_bgcolor="white",
        hoverlabel=dict(
            bgcolor="beige",
            font_size=12,
        ),
    )

    return fig


def create_metric_selector(model_type):
    match model_type:
        case "t2t":
            choices = [m["display_name"] for m in METRICS["t2t"]]
        case "s2t":
            choices = [m["display_name"] for m in METRICS["s2t"]]
    return gr.Dropdown(
        choices=choices, value=choices[0], label="Select Metric", interactive=True
    )


def create_metric_explanation(metric):
    return gr.Markdown(metric["explanation"], container=True)
css=""" | |
.radio-group .wrap { | |
display: grid !important; | |
grid-template-columns: 1fr 1fr; | |
} | |
.nav-holder {display: none;} | |
.share-link { | |
display: inline-flex; | |
align-items: center; | |
background-color: #f0f0f0; | |
border-radius: 8px; | |
padding: 8px 12px; | |
margin: 10px 0; | |
font-family: monospace; | |
transition: all 0.2s ease; | |
cursor: pointer; | |
text-decoration: none; | |
color: #333; | |
} | |
.share-link:hover { | |
background-color: #e0e0e0; | |
} | |
.share-link .icon { | |
margin-left: 8px; | |
} | |
.title-row { | |
display: flex; | |
align-items: center; | |
justify-content: space-between; | |
margin-bottom: 1rem; | |
} | |
.title-row h2 { | |
margin: 0; | |
} | |
""" | |
shortcut_js = """ | |
<script> | |
// Handle URL parameters for direct language access | |
const params = new URLSearchParams(window.location.search); | |
const lang = params.get("lang"); | |
if (lang) { | |
window.location.href = "/" + lang; | |
} | |
// Function to copy link to clipboard | |
const copyLinkToClipboard = (link) => { | |
navigator.clipboard.writeText(link); | |
console.log("Copied link to clipboard: " + link); | |
} | |
const redirect_to_lang = lang_descriptor => { | |
lang_code = lang_descriptor.split("(")[1].split(")")[0]; | |
console.log("redirecting to /" + lang_code); | |
window.location.href = "/" + lang_code; | |
} | |
const empty_search = () => { | |
console.log("empty search"); | |
document.getElementById("search-dropdown").value = ""; | |
} | |
</script> | |
""" | |

# Create the visualization components
with gr.Blocks(
    title="AI Language Proficiency Benchmark", css=css, head=shortcut_js
) as demo:
    language_choices = [
        f"{lang['language_name']} ({lang['bcp_47']})" for lang in languages
    ]
    models = {score["model"] for lang in languages for score in lang["scores"]}
    search = gr.Dropdown(
        choices=language_choices,  # + list(models),
        value="Search for Language or Model",
        allow_custom_value=True,
        interactive=True,
        container=False,
        elem_id="search-dropdown",
    )
    search.focus(
        fn=lambda x: None, inputs=search, outputs=None, js="(x) => {empty_search()}"
    )
    search.change(
        fn=lambda x: None,
        inputs=search,
        outputs=None,
        js="(x) => {redirect_to_lang(x)}",
    )
    gr.Markdown("# AI Language Proficiency Benchmark")
    gr.Markdown("Comparing language proficiency across different models and languages.")

    with gr.Row():
        start_model_type = "Text-to-Text"
        model_type = gr.Radio(
            choices=["Text-to-Text", "Speech-to-Text"],
            value=start_model_type,
            label="Select Model Type",
            interactive=True,
            elem_classes="radio-group",
        )
        start_metric = METRICS["t2t"][0]
        metric = gr.Dropdown(
            choices=[metric["display_name"] for metric in METRICS["t2t"]],
            value=start_metric["display_name"],
            label="Main task and metric to display in figures and map",
            interactive=True,
        )

    with gr.Row():
        with gr.Column():
            with gr.Accordion("Model Filters", open=False):
                model_licenses = gr.CheckboxGroup(
                    choices=["open source", "commercial"],
                    value=["open source", "commercial"],
                    label="Filter by Model License",
                    interactive=True,
                )
                model_sizes = RangeSlider(
                    minimum=0,
                    maximum=1000,
                    value=(0, 1000),
                    label="Filter by Model Size (in Billion Parameters)",
                    interactive=True,
                )
        with gr.Column():
            with gr.Accordion("Language Filters", open=False):
                unit_of_analysis = gr.Radio(
                    choices=["Languages", "Language Families", "Regions"],
                    value="Languages",
                    label="Select Unit of Analysis",
                    interactive=True,
                )
                family_filter = gr.CheckboxGroup(
                    choices=[
                        "Indo-European",
                        "Sino-Tibetan",
                        "Afro-Asiatic",
                        "Dravidian",
                        "Uralic",
                        "Austronesian",
                        "Other",
                    ],
                    value=[
                        "Indo-European",
                        "Sino-Tibetan",
                        "Afro-Asiatic",
                        "Dravidian",
                        "Uralic",
                        "Austronesian",
                        "Other",
                    ],
                    label="Filter by Language Family",
                    interactive=True,
                )
                speakers_filter = RangeSlider(
                    minimum=0,
                    maximum=100_000_000,
                    value=(0, 100_000_000),
                    label="Filter by Number of Speakers",
                    interactive=True,
                )
gr.Markdown("## Model Comparison") | |
leaderboard_df = create_leaderboard_df("t2t", start_metric) | |
model_comparison_plot = gr.Plot( | |
value=create_model_comparison_plot(start_metric), | |
label="Model Comparison", | |
) | |
gr.Markdown("## Language Stats") | |
create_language_stats_df(start_metric) | |
scatter_plot = gr.Plot( | |
value=create_scatter_plot(start_metric), | |
label="Speaker Population vs. Metric", | |
) | |
world_map = gr.Plot( | |
value=create_world_map(start_metric), | |
label="World Map", | |
container=False, | |
elem_classes="fullwidth-plot", | |
) | |

    def update_model_type(model_type_choice):
        model_type = {"Text-to-Text": "t2t", "Speech-to-Text": "s2t"}[
            model_type_choice
        ]
        return create_metric_selector(model_type), create_leaderboard_df(model_type)

    model_type.change(
        fn=update_model_type,
        inputs=model_type,
        outputs=[metric, leaderboard_df],
    )

    def update_component(fn, model_type_choice, metric_choice):
        model_type = {"Text-to-Text": "t2t", "Speech-to-Text": "s2t"}[
            model_type_choice
        ]
        metric = [m for m in METRICS[model_type] if m["display_name"] == metric_choice][
            0
        ]
        return fn(metric)

    metric.change(
        fn=partial(update_component, create_model_comparison_plot),
        inputs=[model_type, metric],
        outputs=model_comparison_plot,
    )
    metric.change(
        fn=partial(update_component, create_scatter_plot),
        inputs=[model_type, metric],
        outputs=scatter_plot,
    )
    metric.change(
        fn=partial(update_component, create_world_map),
        inputs=[model_type, metric],
        outputs=world_map,
    )
with gr.Accordion("Methodology", open=False): | |
gr.Markdown( | |
""" | |
### Benchmark Data | |
We use the [FLORES+](https://huggingface.co/datasets/openlanguagedata/flores_plus) dataset for evaluation, which contains parallel text in over 200 languages, as well as topic labels for each sentence. Where FLORES+ includes multiple scripts for one language, we use only the most common one. | |
Population and speaker data and language code resolution are from Unicode [CLDR](https://github.com/unicode-org/cldr) via the [langcodes](https://github.com/rspeer/langcodes) package. | |
### AI Models | |
We use [OpenRouter](https://openrouter.ai/) to access all relevant AI models via a unified API. | |
### Evaluation Tasks | |
Our benchmark includes three core tasks to assess different aspects of language understanding: | |
1. **Machine Translation**: Models translate text _from_ the evaluated language _to_ a fixed set of target languages. The set of target languages is representative of global speaker populations. Performance is measured using: | |
- [BLEU Score](https://huggingface.co/metrics/bleu): Measures n-gram precision with a brevity penalty | |
- [ChrF Score](https://huggingface.co/metrics/chrf): Character-level F-score that better captures morphological variations | |
2. **Text Classification**: Models classify text into predefined topics after being shown examples. We: | |
- Group sentences by URL into paragraphs with the same topic | |
- Use the 5 most common topics, encoded as numbers rather than English labels | |
- Provide 5 examples of each topic as few-shot examples | |
- Test the model's ability to classify new text | |
- Report accuracy as the primary metric | |
3. **Masked Language Modeling**: Models predict missing portions of text (marked with `<mask>`). We: | |
- Mask approximately 5% of each sentence at a random position | |
- Provide 10 examples of complete sentences paired with masked versions in a few-shot setting | |
- Evaluate predictions using ChrF score against the original text | |
The overall performance score combines metrics from all tasks to provide a holistic assessment of model capabilities across languages. | |
""" | |
) | |
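
# Sketch of the masking step described in the methodology text above. This is an
# illustrative assumption about the evaluation pipeline (which lives outside
# this file), not the actual scoring code:
#
#   import random
#
#   def mask_sentence(sentence: str, fraction: float = 0.05) -> str:
#       words = sentence.split()
#       n = max(1, round(len(words) * fraction))
#       start = random.randint(0, len(words) - n)
#       return " ".join(words[:start] + ["<mask>"] + words[start + n:])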

for lang in tqdm(languages[:20], desc="Generating pages"):
    with demo.route(lang["language_name"], f"/{lang['bcp_47']}"):
        gr.Button("← Back to Main Dashboard", link="/")
        url = f"hf.co/spaces/datenlaborbmz/ai-language-monitor?lang={lang['bcp_47']}"
        gr.Markdown(
            f'''
            <div class="title-row">
                <h2>{lang['language_name']}</h2>
                <div class="share-link" onclick="copyLinkToClipboard('{url}')">{url}<span class="icon">📋</span></div>
            </div>
            ''',
            sanitize_html=False,
        )

        # Language overview section
        with gr.Row():
            with gr.Column(scale=2):
                gr.Markdown(f"""
                ## Language Overview
                - **Native name**: {lang.get('native_name', 'N/A')}
                - **Language family**: {lang.get('language_family', 'N/A')}
                - **BCP-47 code**: `{lang['bcp_47']}`
                - **ISO 639-3 code**: `{lang.get('iso_639_3', 'N/A')}`
                - **Number of speakers**: {format_number(lang['speakers'])}
                - **Script**: {lang.get('script', 'N/A')}
                - **CommonVoice hours**: {round(lang.get('commonvoice_hours', 0) or 0)}
                """)

                # Resource links
                resource_links = []
                if lang.get("commonvoice_locale"):
                    resource_links.append(
                        f"[CommonVoice Dataset](https://commonvoice.mozilla.org/{lang['commonvoice_locale']})"
                    )
                if lang.get("wikipedia_code"):
                    resource_links.append(
                        f"[Wikipedia](https://{lang['wikipedia_code']}.wikipedia.org)"
                    )
                if lang.get("bcp_47"):
                    resource_links.append(
                        f"[FLORES+ Dataset](https://huggingface.co/datasets/openlanguagedata/flores_plus/viewer/all/{lang['bcp_47']})"
                    )
                if resource_links:
                    gr.Markdown("### Resources\n" + "\n".join(resource_links))

            with gr.Column(scale=3):
                # Create a mini-map showing where the language is spoken
                country_data = {}
                if "population" in lang:
                    for country_code, speakers in lang["population"].items():
                        try:
                            country = pycountry.countries.get(alpha_2=country_code)
                            if country:
                                country_data[country.alpha_3] = (
                                    speakers / lang["speakers"]
                                )
                        except (KeyError, AttributeError):
                            continue

                locations = list(country_data.keys())
                values = list(country_data.values())

                if locations:
                    fig = go.Figure(
                        data=go.Choropleth(
                            locations=locations,
                            z=values,
                            locationmode="ISO-3",
                            colorscale="Blues",
                            marker_line_color="white",
                            marker_line_width=0.5,
                            colorbar_title="Speaker %",
                        )
                    )
                    fig.update_layout(
                        title_text=f"Distribution of {lang['language_name']} Speakers",
                        geo=dict(
                            showframe=False,
                            showcoastlines=True,
                            projection_type="natural earth",
                        ),
                        height=300,
                        margin={"r": 0, "t": 30, "l": 0, "b": 0},
                    )
                    gr.Plot(value=fig)
                else:
                    gr.Markdown("*Geographic data not available*")

        # Performance metrics section
        gr.Markdown("## AI Model Performance")
        with gr.Row():
            with gr.Column():
                # Create metrics dashboard for this language
                metrics_data = []
                for metric_key, display_name in [
                    ("t2t_score", "Overall Text Performance"),
                    ("mt_bleu", "Translation (BLEU)"),
                    ("mt_chrf", "Translation (ChrF)"),
                    ("cls_acc", "Classification"),
                    ("mlm_chrf", "Masked Language Modeling"),
                    ("s2t_score", "Overall Speech Performance"),
                    ("asr_wer", "Speech Recognition (WER)"),
                    ("asr_chrf", "Speech Recognition (ChrF)"),
                ]:
                    if metric_key in lang and lang[metric_key] is not None:
                        value = lang[metric_key]
                        # For WER, lower is better, so invert the value for the bar
                        metrics_data.append(
                            {
                                "Metric": display_name,
                                "Value": round(value, 3),
                                "Visual": make_colored_bar(
                                    value if metric_key != "asr_wer" else 1 - value
                                ),
                            }
                        )

                if metrics_data:
                    gr.DataFrame(
                        pd.DataFrame(metrics_data),
                        label=f"Performance Metrics for {lang['language_name']}",
                        show_search=False,
                    )
                else:
                    gr.Markdown("*No performance metrics available*")

        # Model comparison table
        gr.Markdown("## Model Comparison")
        with gr.Row():
            models_data = []
            for score in lang["scores"]:
                if score.get("t2t_score") is not None:
                    model_name = score["model"].split("/")[-1]
                    # Use `or 0` so a present-but-None field doesn't crash round()
                    models_data.append(
                        {
                            "Model": model_name,
                            "Overall": round(score.get("t2t_score") or 0, 3),
                            "Translation": round(score.get("mt_chrf") or 0, 3),
                            "Classification": round(score.get("cls_acc") or 0, 3),
                            "Lang Model": round(score.get("mlm_chrf") or 0, 3),
                            "Speech": round(score["asr_chrf"], 3)
                            if score.get("asr_chrf") is not None
                            else "N/A",
                        }
                    )

            if models_data:
                df = pd.DataFrame(models_data).sort_values("Overall", ascending=False)
                gr.DataFrame(
                    df,
                    label=f"Model Performance on {lang['language_name']}",
                    show_search=False,
                )
            else:
                gr.Markdown("*No model comparison data available*")

        # Performance comparison with similar languages
        if lang.get("language_family"):
            gr.Markdown("## Comparison with Related Languages")

            # Find related languages
            related_langs = [
                l
                for l in languages
                if l.get("language_family") == lang["language_family"]
                and l["t2t_score"] is not None
            ]
            related_langs = sorted(
                related_langs, key=lambda x: x["t2t_score"], reverse=True
            )[:10]

            if len(related_langs) > 1:
                lang_names = [l["language_name"] for l in related_langs]
                t2t_scores = [l["t2t_score"] for l in related_langs]

                fig = px.bar(
                    x=lang_names,
                    y=t2t_scores,
                    labels={"x": "Language", "y": "Text-to-Text Score"},
                    title=f"Performance Across {lang['language_family']} Languages",
                )
                # Highlight the current language
                for i, name in enumerate(lang_names):
                    if name == lang["language_name"]:
                        fig.data[0].marker.color = (
                            ["lightblue"] * i
                            + ["orange"]
                            + ["lightblue"] * (len(lang_names) - i - 1)
                        )
                fig.update_layout(height=400)
                gr.Plot(value=fig)

demo.launch()