davidpomerenke committed

Commit c790fdb · verified · 1 parent: f88768f

Upload from GitHub Actions: Add auto-translated datasets
.DS_Store CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
 
.github/workflows/nightly-evals.yml CHANGED
@@ -8,8 +8,6 @@ on:
 jobs:
   run-evals:
     runs-on: ubuntu-latest
-    # checking if this is working in case eval runs take longer than 6h github actions allowance
-    timeout-minutes: 1440 # 24 hours timeout
     steps:
       - uses: actions/checkout@v3
 
@@ -27,8 +25,6 @@ jobs:
         env:
           OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
           HUGGINGFACE_ACCESS_TOKEN: ${{ secrets.HUGGINGFACE_ACCESS_TOKEN }}
-          N_SENTENCES: 20
-          MAX_LANGUAGES: 150
        run: |
          uv run huggingface-cli login --token ${{ secrets.HUGGINGFACE_ACCESS_TOKEN }}
          uv run evals/download_data.py
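
The two env vars removed here were consumed by the old `evals/main.py` entrypoint (also rewritten in this commit), which read them with defaults; after this change the equivalent value is hardcoded as a module constant in `evals/main.py`. A minimal sketch of the pattern being retired, taken from the deleted code:

```python
import os

# How the old entrypoint picked up the workflow env vars (with defaults);
# the rewritten evals/main.py replaces this with a plain `n_sentences = 10`.
n_sentences = int(os.environ.get("N_SENTENCES", 20))
max_languages = int(os.environ.get("MAX_LANGUAGES", 150))
```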
.gitignore CHANGED
@@ -20,6 +20,3 @@ wheels/
 # folders and files to be ignored
 .specstory/
 .cursorindexingignore
-
-# Project-specific files
-.dockerignore.eval
Dockerfile CHANGED
@@ -14,7 +14,7 @@ ENV HOME=/home/user \
 RUN mkdir -p ${UV_CACHE_DIR} && chown -R user:user ${HOME}
 USER user
 WORKDIR $HOME/app
-COPY --chown=user pyproject.toml uv.lock README.md ./
+COPY --chown=user pyproject.toml uv.lock ./
 RUN uv sync --frozen --no-dev
 COPY --chown=user evals/ evals/
 COPY --chown=user --from=build /frontend/build /home/user/app/frontend/build
README.md CHANGED
@@ -43,147 +43,12 @@ For tag meaning, see https://huggingface.co/spaces/leaderboards/LeaderboardsExpl
 
 _Tracking language proficiency of AI models for every language_
 
-## System Architecture
-
-The AI Language Monitor evaluates language models across 100+ languages using a comprehensive pipeline that combines model discovery, automated evaluation, and real-time visualization.
-
-```mermaid
-flowchart TD
-    %% Model Sources
-    A1["important_models<br/>Static Curated List"] --> D[load_models]
-    A2["get_historical_popular_models<br/>Web Scraping - Top 20"] --> D
-    A3["get_current_popular_models<br/>Web Scraping - Top 10"] --> D
-    A4["blocklist<br/>Exclusions"] --> D
-
-    %% Model Processing
-    D --> |"Combine & Dedupe"| E["Dynamic Model List<br/>~40-50 models"]
-    E --> |get_or_metadata| F["OpenRouter API<br/>Model Metadata"]
-    F --> |get_hf_metadata| G["HuggingFace API<br/>Model Details"]
-    G --> H["Enriched Model DataFrame"]
-    H --> |Save| I[models.json]
-
-    %% Model Validation & Cost Filtering
-    H --> |"Validate Models<br/>Check API Availability"| H1["Valid Models Only<br/>Cost ≤ $20/1M tokens"]
-    H1 --> |"Timeout Protection<br/>120s for Large Models"| H2["Robust Model List"]
-
-    %% Language Data
-    J["languages.py<br/>BCP-47 + Population"] --> K["Top 100 Languages"]
-
-    %% Task Registry with Unified Prompting
-    L["tasks.py<br/>7 Evaluation Tasks"] --> M["Task Functions<br/>Unified English Zero-Shot"]
-    M --> M1["translation_from/to<br/>BLEU + ChrF"]
-    M --> M2["classification<br/>Accuracy"]
-    M --> M3["mmlu<br/>Accuracy"]
-    M --> M4["arc<br/>Accuracy"]
-    M --> M5["truthfulqa<br/>Accuracy"]
-    M --> M6["mgsm<br/>Accuracy"]
-
-    %% On-the-fly Translation with Origin Tagging
-    subgraph OTF [On-the-fly Dataset Translation]
-        direction LR
-        DS_raw["Raw English Dataset<br/>(e.g., MMLU)"] --> Google_Translate["Google Translate API"]
-        Google_Translate --> DS_translated["Translated Dataset<br/>(e.g., German MMLU)<br/>Origin: 'machine'"]
-        DS_native["Native Dataset<br/>(e.g., German MMLU)<br/>Origin: 'human'"]
-    end
-
-    %% Evaluation Pipeline
-    H2 --> |"models ID"| N["main.py / main_gcs.py<br/>evaluate"]
-    K --> |"languages bcp_47"| N
-    L --> |"tasks.items"| N
-    N --> |"Filter by model.tasks"| O["Valid Combinations<br/>Model × Language × Task"]
-    O --> |"10 samples each"| P["Evaluation Execution<br/>Batch Processing"]
-
-    %% Task Execution with Origin Tracking
-    P --> Q1[translate_and_evaluate<br/>Origin: 'human']
-    P --> Q2[classify_and_evaluate<br/>Origin: 'human']
-    P --> Q3[mmlu_and_evaluate<br/>Origin: 'human'/'machine']
-    P --> Q4[arc_and_evaluate<br/>Origin: 'human'/'machine']
-    P --> Q5[truthfulqa_and_evaluate<br/>Origin: 'human'/'machine']
-    P --> Q6[mgsm_and_evaluate<br/>Origin: 'human'/'machine']
-
-    %% API Calls with Error Handling
-    Q1 --> |"complete() API<br/>Rate Limiting"| R["OpenRouter<br/>Model Inference"]
-    Q2 --> |"complete() API<br/>Rate Limiting"| R
-    Q3 --> |"complete() API<br/>Rate Limiting"| R
-    Q4 --> |"complete() API<br/>Rate Limiting"| R
-    Q5 --> |"complete() API<br/>Rate Limiting"| R
-    Q6 --> |"complete() API<br/>Rate Limiting"| R
-
-    %% Results Processing with Origin Aggregation
-    R --> |Scores| S["Result Aggregation<br/>Mean by model+lang+task+origin"]
-    S --> |Save| T[results.json]
-
-    %% Backend & Frontend with Origin-Specific Metrics
-    T --> |Read| U[backend.py]
-    I --> |Read| U
-    U --> |make_model_table| V["Model Rankings<br/>Origin-Specific Metrics"]
-    U --> |make_country_table| W["Country Aggregation"]
-    U --> |"API Endpoint"| X["FastAPI /api/data<br/>arc_accuracy_human<br/>arc_accuracy_machine"]
-    X --> |"JSON Response"| Y["Frontend React App"]
-
-    %% UI Components
-    Y --> Z1["WorldMap.js<br/>Country Visualization"]
-    Y --> Z2["ModelTable.js<br/>Model Rankings"]
-    Y --> Z3["LanguageTable.js<br/>Language Coverage"]
-    Y --> Z4["DatasetTable.js<br/>Task Performance"]
-
-    %% Data Sources with Origin Information
-    subgraph DS ["Data Sources"]
-        DS1["Flores-200<br/>Translation Sentences<br/>Origin: 'human'"]
-        DS2["MMLU/AfriMMLU<br/>Knowledge QA<br/>Origin: 'human'"]
-        DS3["ARC<br/>Science Reasoning<br/>Origin: 'human'"]
-        DS4["TruthfulQA<br/>Truthfulness<br/>Origin: 'human'"]
-        DS5["MGSM<br/>Math Problems<br/>Origin: 'human'"]
-    end
-
-    DS1 --> Q1
-    DS2 --> Q3
-    DS3 --> Q4
-    DS4 --> Q5
-    DS5 --> Q6
-
-    DS_translated --> Q3
-    DS_translated --> Q4
-    DS_translated --> Q5
-
-    DS_native --> Q3
-    DS_native --> Q4
-    DS_native --> Q5
-
-    %% Styling - Neutral colors that work in both dark and light modes
-    classDef modelSource fill:#f8f9fa,stroke:#6c757d,color:#212529
-    classDef evaluation fill:#e9ecef,stroke:#495057,color:#212529
-    classDef api fill:#dee2e6,stroke:#6c757d,color:#212529
-    classDef storage fill:#d1ecf1,stroke:#0c5460,color:#0c5460
-    classDef frontend fill:#f8d7da,stroke:#721c24,color:#721c24
-    classDef translation fill:#d4edda,stroke:#155724,color:#155724
-
-    class A1,A2,A3,A4 modelSource
-    class Q1,Q2,Q3,Q4,Q5,Q6,P evaluation
-    class R,F,G,X api
-    class T,I storage
-    class Y,Z1,Z2,Z3,Z4 frontend
-    class Google_Translate,DS_translated,DS_native translation
-```
-
-**Key Features:**
-- **Model Discovery**: Combines curated models with real-time trending models via web scraping
-- **Multi-Task Evaluation**: 7 tasks across 100+ languages with origin tracking (human vs machine-translated)
-- **Scalable Architecture**: Dual deployment (local/GitHub vs Google Cloud)
-- **Real-time Visualization**: Interactive web interface with country-level insights
-
 ## Evaluate
 
-### Local Development
 ```bash
 uv run --extra dev evals/main.py
 ```
 
-### Google Cloud Deployment
-```bash
-uv run --extra dev evals/main_gcs.py
-```
-
 ## Explore
 
 ```bash
datasets.json CHANGED
@@ -219,7 +219,7 @@
       "parallel": true,
       "translation": "machine",
       "base": "MMLU",
-      "implemented": false,
+      "implemented": true,
       "group": "Multitask Language Understanding"
     },
     {
@@ -256,7 +256,7 @@
       "parallel": true,
       "translation": "machine",
       "base": "MMLU",
-      "implemented": false,
+      "implemented": true,
       "group": "Multitask Language Understanding"
     },
     {
@@ -360,7 +360,7 @@
       "parallel": true,
       "translation": "machine",
       "base": "AI2 ARC",
-      "implemented": false,
+      "implemented": true,
       "group": "ARC Question Answering"
     },
     {
@@ -375,7 +375,7 @@
       "parallel": true,
       "translation": "machine",
       "base": "AI2 ARC",
-      "implemented": false,
+      "implemented": true,
       "group": "ARC Question Answering"
     },
     {
@@ -420,7 +420,7 @@
       "parallel": true,
       "translation": "machine",
       "base": "TruthfulQA",
-      "implemented": false,
+      "implemented": true,
       "group": "Truthfulness"
     },
     {
@@ -435,7 +435,7 @@
       "parallel": true,
       "translation": "machine",
       "base": "TruthfulQA",
-      "implemented": false,
+      "implemented": true,
       "group": "Truthfulness"
     },
     {
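
These flag flips mark the auto-translated MMLU, ARC, and TruthfulQA variants as live. A small sketch of how the flag can be inspected, assuming only that `datasets.json` is the registry read by `evals/backend.py` (the filtering itself is illustrative, not code from this commit):

```python
import pandas as pd

# datasets.json is the dataset registry this repo ships and backend.py reads.
datasets = pd.read_json("datasets.json")

# After this commit the machine-translated MMLU/ARC/TruthfulQA entries
# report implemented=True and therefore appear in this selection.
implemented = datasets[datasets["implemented"]]
print(implemented[["base", "translation", "group"]])
```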
evals/backend.py CHANGED
@@ -4,18 +4,7 @@ import os
 import numpy as np
 import pandas as pd
 import uvicorn
-
-# Robust import so this file works both as a package module and as a script
-try:
-    # When executed as a package module (recommended): `python -m uvicorn evals.backend:app`
-    from .countries import make_country_table
-except Exception:
-    try:
-        # When executed from project root with package path available
-        from evals.countries import make_country_table
-    except Exception:
-        # When executed directly from evals/ directory
-        from countries import make_country_table
+from countries import make_country_table
 from fastapi import FastAPI, Request
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.middleware.gzip import GZipMiddleware
@@ -37,7 +26,7 @@ task_metrics = [
     "classification_accuracy",
     "mmlu_accuracy",
     "arc_accuracy",
-    "truthfulqa_accuracy",
+    # "truthfulqa_accuracy",
     "mgsm_accuracy",
 ]
 
@@ -56,145 +45,66 @@ def compute_normalized_average(df, metrics):
     return normalized_df.mean(axis=1, skipna=False)
 
 
-def make_model_table(scores_df, models):
-    # Create a combined task_metric for origin
-    scores_df["task_metric_origin"] = (
-        scores_df["task"] + "_" + scores_df["metric"] + "_" + scores_df["origin"]
-    )
-
-    # Pivot to get scores for each origin-specific metric
-    scores_pivot = scores_df.pivot_table(
-        index="model",
-        columns="task_metric_origin",
-        values="score",
-        aggfunc="mean",
-    )
-
-    # Create the regular task_metric for the main average calculation
-    scores_df["task_metric"] = scores_df["task"] + "_" + scores_df["metric"]
-    main_pivot = scores_df.pivot_table(
-        index="model", columns="task_metric", values="score", aggfunc="mean"
+def make_model_table(df, models):
+    df = (
+        df.groupby(["model", "task", "metric"])
+        .agg({"score": "mean", "bcp_47": "nunique"})
+        .reset_index()
     )
-
-    # Merge the two pivots
-    df = pd.merge(main_pivot, scores_pivot, on="model", how="outer")
-
+    df["task_metric"] = df["task"] + "_" + df["metric"]
+    df = df.drop(columns=["task", "metric"])
+    df = df.pivot(index="model", columns="task_metric", values="score")
     for metric in task_metrics:
         if metric not in df.columns:
             df[metric] = np.nan
-
     df["average"] = compute_normalized_average(df, task_metrics)
-
-    # Compute origin presence per model+metric
-    origin_presence = (
-        scores_df.groupby(["model", "task_metric", "origin"]).size().unstack(fill_value=0)
-    )
-    # Add boolean flags: show asterisk only if exclusively machine-origin contributed
-    for metric in task_metrics:
-        human_col_name = "human" if "human" in origin_presence.columns else None
-        machine_col_name = "machine" if "machine" in origin_presence.columns else None
-        if human_col_name or machine_col_name:
-            flags = []
-            for model in df.index:
-                try:
-                    counts = origin_presence.loc[(model, metric)]
-                except KeyError:
-                    flags.append(False)
-                    continue
-                human_count = counts.get(human_col_name, 0) if human_col_name else 0
-                machine_count = counts.get(machine_col_name, 0) if machine_col_name else 0
-                flags.append(machine_count > 0 and human_count == 0)
-            df[f"{metric}_is_machine"] = flags
-        else:
-            df[f"{metric}_is_machine"] = False
     df = df.sort_values(by="average", ascending=False).reset_index()
     df = pd.merge(df, models, left_on="model", right_on="id", how="left")
     df["rank"] = df.index + 1
-
-    # Dynamically find all metric columns to include
-    final_cols = df.columns
-    metric_cols = [m for m in final_cols if any(tm in m for tm in task_metrics)]
-
     df = df[
         [
-            "rank", "model", "name", "provider_name", "hf_id", "creation_date",
-            "size", "type", "license", "cost", "average",
-            *sorted(list(set(metric_cols)))
+            "rank",
+            "model",
+            "name",
+            "provider_name",
+            "hf_id",
+            "creation_date",
+            "size",
+            "type",
+            "license",
+            "cost",
+            "average",
+            *task_metrics,
         ]
     ]
     return df
 
 
-def make_language_table(scores_df, languages):
-    # Create a combined task_metric for origin
-    scores_df["task_metric_origin"] = (
-        scores_df["task"] + "_" + scores_df["metric"] + "_" + scores_df["origin"]
-    )
-
-    # Pivot to get scores for each origin-specific metric
-    scores_pivot = scores_df.pivot_table(
-        index="bcp_47",
-        columns="task_metric_origin",
-        values="score",
-        aggfunc="mean",
-    )
-
-    # Create the regular task_metric for the main average calculation
-    scores_df["task_metric"] = scores_df["task"] + "_" + scores_df["metric"]
-    main_pivot = scores_df.pivot_table(
-        index="bcp_47", columns="task_metric", values="score", aggfunc="mean"
+def make_language_table(df, languages):
+    df = (
+        df.groupby(["bcp_47", "task", "metric"])
+        .agg({"score": "mean", "model": "nunique"})
+        .reset_index()
     )
-
-    # Merge the two pivots
-    df = pd.merge(main_pivot, scores_pivot, on="bcp_47", how="outer")
-
+    df["task_metric"] = df["task"] + "_" + df["metric"]
+    df = df.drop(columns=["task", "metric"])
+    df = df.pivot(index="bcp_47", columns="task_metric", values="score").reset_index()
     for metric in task_metrics:
         if metric not in df.columns:
             df[metric] = np.nan
-
     df["average"] = compute_normalized_average(df, task_metrics)
-
-    # Compute origin presence per language+metric; show asterisk only if exclusively machine-origin
-    origin_presence = (
-        scores_df.groupby(["bcp_47", "task_metric", "origin"]).size().unstack(fill_value=0)
-    )
-    for metric in task_metrics:
-        human_col_name = "human" if "human" in origin_presence.columns else None
-        machine_col_name = "machine" if "machine" in origin_presence.columns else None
-        if human_col_name or machine_col_name:
-            flags = []
-            for bcp in df.index:
-                try:
-                    counts = origin_presence.loc[(bcp, metric)]
-                except KeyError:
-                    flags.append(False)
-                    continue
-                human_count = counts.get(human_col_name, 0) if human_col_name else 0
-                machine_count = counts.get(machine_col_name, 0) if machine_col_name else 0
-                flags.append(machine_count > 0 and human_count == 0)
-            df[f"{metric}_is_machine"] = flags
-        else:
-            df[f"{metric}_is_machine"] = False
-
-    # Per-row machine-origin flags for each metric (true if any machine-origin score exists for the language)
-    for metric in task_metrics:
-        machine_col = f"{metric}_machine"
-        if machine_col in df.columns:
-            df[f"{metric}_is_machine"] = df[machine_col].notna()
-        else:
-            df[f"{metric}_is_machine"] = False
     df = pd.merge(languages, df, on="bcp_47", how="outer")
     df = df.sort_values(by="speakers", ascending=False)
-
-    # Dynamically find all metric columns to include
-    final_cols = df.columns
-    metric_cols = [m for m in final_cols if any(tm in m for tm in task_metrics)]
-
     df = df[
         [
-            "bcp_47", "language_name", "autonym", "speakers", "family",
-            "average", "in_benchmark",
-            *sorted(list(set(metric_cols)))
+            "bcp_47",
+            "language_name",
+            "autonym",
+            "speakers",
+            "family",
+            "average",
+            "in_benchmark",
+            *task_metrics,
         ]
     ]
     return df
@@ -215,18 +125,10 @@ async def data(request: Request):
     body = await request.body()
     data = json.loads(body)
     selected_languages = data.get("selectedLanguages", {})
-    df = scores.groupby(["model", "bcp_47", "task", "metric", "origin"]).mean().reset_index()
+    df = scores.groupby(["model", "bcp_47", "task", "metric"]).mean().reset_index()
     # lang_results = pd.merge(languages, lang_results, on="bcp_47", how="outer")
     language_table = make_language_table(df, languages)
     datasets_df = pd.read_json("datasets.json")
-
-    # Identify which metrics have machine translations available
-    machine_translated_metrics = set()
-    for _, row in df.iterrows():
-        if row["origin"] == "machine":
-            metric_name = f"{row['task']}_{row['metric']}"
-            machine_translated_metrics.add(metric_name)
-
     if selected_languages:
         # the filtering is only applied for the model table and the country data
         df = df[df["bcp_47"].isin(lang["bcp_47"] for lang in selected_languages)]
@@ -241,7 +143,6 @@ async def data(request: Request):
         "language_table": serialize(language_table),
         "dataset_table": serialize(datasets_df),
         "countries": serialize(countries),
-        "machine_translated_metrics": list(machine_translated_metrics),
     }
     return JSONResponse(content=all_tables)
 
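The rewritten `make_model_table` drops the origin bookkeeping and reduces to a groupby plus pivot. A self-contained sketch of that aggregation on invented sample scores (column names from the diff; the data is made up for illustration):

```python
import pandas as pd

# Toy stand-in for the scores frame: one row per (model, language, task, metric).
scores = pd.DataFrame({
    "model": ["m1", "m1", "m2", "m2"],
    "bcp_47": ["sw", "yo", "sw", "sw"],
    "task": ["mmlu", "mmlu", "mmlu", "arc"],
    "metric": ["accuracy"] * 4,
    "score": [0.6, 0.4, 0.7, 0.5],
})

# Same shape of aggregation as the new make_model_table: mean score and
# number of distinct languages per model/task/metric, then pivot wide.
df = (
    scores.groupby(["model", "task", "metric"])
    .agg({"score": "mean", "bcp_47": "nunique"})
    .reset_index()
)
df["task_metric"] = df["task"] + "_" + df["metric"]
wide = df.pivot(index="model", columns="task_metric", values="score")
print(wide)  # one row per model; columns arc_accuracy, mmlu_accuracy
```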
evals/countries.py CHANGED
@@ -30,15 +30,10 @@ def make_country_table(language_table):
     )
     for country, languages in countries.items():
         speaker_pop = sum(entry["population"] for entry in languages)
-
-        if speaker_pop < 1000:  # 🎯 Grey out low-population countries
-            score = None  # This will make them appear grey on the map
-        else:
-            score = (
-                sum(entry["score"] * entry["population"] for entry in languages)
-                / speaker_pop
-            )
-
+        score = (
+            sum(entry["score"] * entry["population"] for entry in languages)
+            / speaker_pop
+        )
         countries[country] = {
             "score": score,
             "languages": languages,
evals/datasets_/arc.py CHANGED
@@ -1,10 +1,11 @@
 import random
+from collections import Counter, defaultdict
 
-from langcodes import standardize_tag
+from langcodes import Language, standardize_tag
 from rich import print
-from models import translate_google, get_google_supported_languages
+from models import translate_google, google_supported_languages
 from tqdm import tqdm
-from datasets import load_dataset, Dataset
+from datasets import Dataset, load_dataset
 import asyncio
 from tqdm.asyncio import tqdm_asyncio
 import os
@@ -13,33 +14,27 @@ from datasets_.util import _get_dataset_config_names, _load_dataset
 
 slug_uhura_arc_easy = "masakhane/uhura-arc-easy"
 tags_uhura_arc_easy = {
-    standardize_tag(a.split("_")[0], macro=True): a
-    for a in _get_dataset_config_names(slug_uhura_arc_easy)
+    standardize_tag(a.split("_")[0], macro=True): a for a in _get_dataset_config_names(slug_uhura_arc_easy)
     if not a.endswith("unmatched")
 }
 
 
 random.seed(42)
-id_sets_train = [
-    set(_load_dataset(slug_uhura_arc_easy, tag, split="train")["id"])
-    for tag in tags_uhura_arc_easy.values()
-]
+id_sets_train = [set(_load_dataset(slug_uhura_arc_easy, tag, split="train")["id"]) for tag in tags_uhura_arc_easy.values()]
 common_ids_train = list(sorted(set.intersection(*id_sets_train)))
 random.shuffle(common_ids_train)
-id_sets_test = [
-    set(_load_dataset(slug_uhura_arc_easy, tag, split="test")["id"])
-    for tag in tags_uhura_arc_easy.values()
-]
+id_sets_test = [set(_load_dataset(slug_uhura_arc_easy, tag, split="test")["id"]) for tag in tags_uhura_arc_easy.values()]
 common_ids_test = list(sorted(set.intersection(*id_sets_test)))
 random.shuffle(common_ids_test)
 
 slug_uhura_arc_easy_translated = "fair-forward/arc-easy-autotranslated"
 tags_uhura_arc_easy_translated = {
-    standardize_tag(a.split("_")[0], macro=True): a
-    for a in _get_dataset_config_names(slug_uhura_arc_easy_translated)
+    standardize_tag(a.split("_")[0], macro=True): a for a in _get_dataset_config_names(slug_uhura_arc_easy_translated)
 }
 
 
+
+
 def add_choices(row):
     row["choices"] = row["choices"]["text"]
     return row
@@ -50,36 +45,27 @@ def load_uhura_arc_easy(language_bcp_47, nr):
         ds = _load_dataset(slug_uhura_arc_easy, tags_uhura_arc_easy[language_bcp_47])
         ds = ds.map(add_choices)
         ds = ds.rename_column("answerKey", "answer")
+        train_ids = common_ids_train[nr:nr+3]
+        examples = ds["train"].filter(lambda x: x["id"] in train_ids)
        task = ds["test"].filter(lambda x: x["id"] == common_ids_test[nr])[0]
-        return "masakhane/uhura-arc-easy", task, "human"
+        return "masakhane/uhura-arc-easy", examples, task
     if language_bcp_47 in tags_uhura_arc_easy_translated.keys():
-        ds = _load_dataset(
-            slug_uhura_arc_easy_translated,
-            tags_uhura_arc_easy_translated[language_bcp_47],
-        )
+        ds = _load_dataset(slug_uhura_arc_easy_translated, tags_uhura_arc_easy_translated[language_bcp_47])
         ds = ds.rename_column("answerKey", "answer")
+        train_ids = common_ids_train[nr:nr+3]
+        examples = ds["train"].filter(lambda x: x["id"] in train_ids)
+        # raise Exception(language_bcp_47)
        task = ds["test"].filter(lambda x: x["id"] == common_ids_test[nr])[0]
-        return "fair-forward/arc-easy-autotranslated", task, "machine"
+        return "fair-forward/arc-easy-autotranslated", examples, task
     else:
         return None, None, None
 
-
-def load_uhura_arc_challenge(language_bcp_47, nr):
-    ds_name = "jlahd/uhura_arc_challenge"
-    if language_bcp_47 in _get_dataset_config_names(ds_name):
-        ds = _load_dataset(ds_name, language_bcp_47)
-        task = ds["test"][nr]
-        return ds_name, task
-    else:
-        return None, None, None
-
-
 def translate_arc(languages):
     human_translated = tags_uhura_arc_easy.keys()
     untranslated = [
         lang
         for lang in languages["bcp_47"].values[:100]
-        if lang not in human_translated and lang in get_google_supported_languages()
+        if lang not in human_translated and lang in google_supported_languages
     ]
     n_samples = 10
     train_ids = common_ids_train[:n_samples+3]
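
Both lookup tables above key Hub config names (such as `eng_Latn`) by BCP-47 tag. A short sketch of the `standardize_tag` mapping they rely on; the config names here are illustrative:

```python
from langcodes import standardize_tag

# Example config names as they might appear for masakhane/uhura-arc-easy.
config_names = ["eng_Latn", "swa_Latn", "yor_Latn"]

# macro=True folds individual languages into their macrolanguage
# (e.g. "swa" -> "sw"), which is how the tables above are keyed.
tags = {standardize_tag(a.split("_")[0], macro=True): a for a in config_names}
print(tags)  # {'en': 'eng_Latn', 'sw': 'swa_Latn', 'yo': 'yor_Latn'}
```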
evals/datasets_/mgsm.py CHANGED
@@ -1,12 +1,10 @@
 import asyncio
 import os
-import random
 
 from datasets import Dataset, load_dataset
-from datasets_.util import _get_dataset_config_names, _load_dataset, cache
-from langcodes import Language, standardize_tag
-from models import get_google_supported_languages, translate_google
-from rich import print
+from datasets_.util import _get_dataset_config_names, _load_dataset
+from langcodes import standardize_tag
+from models import google_supported_languages, translate_google
 from tqdm import tqdm
 from tqdm.asyncio import tqdm_asyncio
 
@@ -39,41 +37,31 @@ def parse_number(i):
     return None
 
 
-@cache
-def _get_mgsm_item(dataset_slug, subset_tag, nr, trust_remote_code=False):
-    """Cache individual MGSM items efficiently"""
-    try:
-        ds = _load_dataset(dataset_slug, subset=subset_tag, split="test", trust_remote_code=trust_remote_code)
-        if nr >= len(ds):
-            return None
-
-        row = ds[nr]
-
-        # Post-process based on dataset type
-        if dataset_slug == slug_gsm8kx:
-            row["answer_number"] = row["answer"].split("####")[1].strip()
-
-        return row
-    except Exception:
-        # Dataset doesn't exist or doesn't have test split
-        return None
-
-
 def load_mgsm(language_bcp_47, nr):
     if language_bcp_47 in tags_mgsm.keys():
-        item = _get_mgsm_item(slug_mgsm, tags_mgsm[language_bcp_47], nr)
-        return slug_mgsm, item, "human" if item else (None, None, None)
+        ds = _load_dataset(slug_mgsm, subset=tags_mgsm[language_bcp_47], split="test")
+        return slug_mgsm, ds[nr]
     elif language_bcp_47 in tags_afrimgsm.keys():
-        item = _get_mgsm_item(slug_afrimgsm, tags_afrimgsm[language_bcp_47], nr)
-        return slug_afrimgsm, item, "human" if item else (None, None, None)
-    elif language_bcp_47 in tags_gsm8kx.keys():
-        item = _get_mgsm_item(slug_gsm8kx, tags_gsm8kx[language_bcp_47], nr, trust_remote_code=True)
-        return slug_gsm8kx, item, "machine" if item else (None, None, None)
+        ds = _load_dataset(
+            slug_afrimgsm, subset=tags_afrimgsm[language_bcp_47], split="test"
+        )
+        return slug_afrimgsm, ds[nr]
     elif language_bcp_47 in tags_gsm_autotranslated.keys():
-        item = _get_mgsm_item(slug_gsm_autotranslated, tags_gsm_autotranslated[language_bcp_47], nr)
-        return slug_gsm_autotranslated, item, "machine" if item else (None, None, None)
+        ds = _load_dataset(
+            slug_gsm_autotranslated, subset=tags_gsm_autotranslated[language_bcp_47], split="test"
+        )
+        return slug_gsm_autotranslated, ds[nr]
+    elif language_bcp_47 in tags_gsm8kx.keys():
+        row = _load_dataset(
+            slug_gsm8kx,
+            subset=tags_gsm8kx[language_bcp_47],
+            split="test",
+            trust_remote_code=True,
+        )[nr]
+        row["answer_number"] = row["answer"].split("####")[1].strip()
+        return slug_gsm8kx, row
     else:
-        return None, None, None
+        return None, None
 
 
 def translate_mgsm(languages):
@@ -81,7 +69,7 @@ def translate_mgsm(languages):
     untranslated = [
         lang
         for lang in languages["bcp_47"].values[:100]
-        if lang not in human_translated and lang in get_google_supported_languages()
+        if lang not in human_translated and lang in google_supported_languages
     ]
     en = _load_dataset(slug_mgsm, subset=tags_mgsm["en"], split="test")
     slug = "fair-forward/gsm-autotranslated"
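
The restored `gsm8kx` branch recovers the numeric answer from GSM8K-style solutions, where the final number follows a `####` marker. A tiny sketch of that extraction (the worked answer text is invented):

```python
# GSM8K-style answers put the final number after "####";
# the gsm8kx branch above extracts it the same way.
answer = "She sold 5 clips in April and 3 in May.\n#### 8"
answer_number = answer.split("####")[1].strip()
print(answer_number)  # "8"
```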
evals/datasets_/mmlu.py CHANGED
@@ -4,9 +4,9 @@ import random
 from collections import Counter, defaultdict
 
 from datasets import Dataset, load_dataset
-from datasets_.util import _get_dataset_config_names, _load_dataset, cache
+from datasets_.util import _get_dataset_config_names, _load_dataset
 from langcodes import Language, standardize_tag
-from models import get_google_supported_languages, translate_google
+from models import google_supported_languages, translate_google
 from rich import print
 from tqdm import tqdm
 from tqdm.asyncio import tqdm_asyncio
@@ -111,7 +111,6 @@ def print_datasets_analysis():
     # MMLUX is translated using DeepL
     # Therefore, the priority is: AfriMMLU, Global-MMLU, MMLUX, Okapi-MMLU
 
-
 # print_datasets_analysis()
 
 
@@ -144,51 +143,32 @@ tags_mmlux = set(
     a.rsplit("_", 1)[1].split("-")[0].lower()
     for a in _get_dataset_config_names("Eurolingua/mmlux", trust_remote_code=True)
 )
-tags_mmlu_autotranslated = {
-    standardize_tag(a, macro=True): a
-    for a in _get_dataset_config_names("fair-forward/mmlu-autotranslated")
-}
+tags_mmlu_autotranslated = _get_dataset_config_names("fair-forward/mmlu-autotranslated")
 
 categories = sorted(
     list(set(_load_dataset("masakhane/afrimmlu", "eng")["dev"]["subject"]))
 )
 
 
-@cache
-def _get_processed_mmlu_dataset(dataset_name, subset_tag):
-    """Cache processed datasets to avoid reprocessing"""
-    ds = _load_dataset(dataset_name, subset_tag)
-    if dataset_name == "masakhane/afrimmlu":
-        ds = ds.map(parse_choices)
-    elif dataset_name == "CohereForAI/Global-MMLU":
-        ds = ds.map(add_choices)
-    return ds
-
-
-@cache
-def _get_mmlu_item(dataset_name, subset_tag, category, nr):
-    """Cache individual MMLU items efficiently"""
-    ds = _get_processed_mmlu_dataset(dataset_name, subset_tag)
-    if dataset_name in ["masakhane/afrimmlu", "CohereForAI/Global-MMLU"]:
-        filtered = ds["test"].filter(lambda x: x["subject"] == category)
-        return filtered[nr] if nr < len(filtered) else None
-    else:  # fair-forward/mmlu-autotranslated
-        filtered = ds["test"].filter(lambda x: x["subject"] == category)
-        return filtered[nr] if nr < len(filtered) else None
-
-
-async def load_mmlu(language_bcp_47, nr):
+def load_mmlu(language_bcp_47, nr):
     category = categories[nr % len(categories)]
     if language_bcp_47 in tags_afrimmlu.keys():
-        task = _get_mmlu_item("masakhane/afrimmlu", tags_afrimmlu[language_bcp_47], category, nr)
-        return "masakhane/afrimmlu", task, "human" if task else (None, None, None)
+        ds = _load_dataset("masakhane/afrimmlu", tags_afrimmlu[language_bcp_47])
+        ds = ds.map(parse_choices)
+        examples = ds["dev"].filter(lambda x: x["subject"] == category)
+        task = ds["test"].filter(lambda x: x["subject"] == category)[nr]
+        return "masakhane/afrimmlu", examples, task
     elif language_bcp_47 in tags_global_mmlu.keys():
-        task = _get_mmlu_item("CohereForAI/Global-MMLU", tags_global_mmlu[language_bcp_47], category, nr)
-        return "CohereForAI/Global-MMLU", task, "human" if task else (None, None, None)
-    # TODO: add in Okapi, MMLUX @Jonas
+        ds = _load_dataset("CohereForAI/Global-MMLU", tags_global_mmlu[language_bcp_47])
+        ds = ds.map(add_choices)
+        examples = ds["dev"].filter(lambda x: x["subject"] == category)
+        task = ds["test"].filter(lambda x: x["subject"] == category)[nr]
+        return "CohereForAI/Global-MMLU", examples, task
     elif language_bcp_47 in tags_mmlu_autotranslated:
-        task = _get_mmlu_item("fair-forward/mmlu-autotranslated", language_bcp_47, category, nr)
-        return "fair-forward/mmlu-autotranslated", task, "machine" if task else (None, None, None)
+        ds = _load_dataset("fair-forward/mmlu-autotranslated", language_bcp_47)
+        examples = ds["dev"].filter(lambda x: x["subject"] == category)
+        task = ds["test"].filter(lambda x: x["subject"] == category)[nr]
+        return "fair-forward/mmlu-autotranslated", examples, task
     else:
         return None, None, None
 
@@ -197,10 +177,10 @@ def translate_mmlu(languages):
     human_translated = [*tags_afrimmlu.keys(), *tags_global_mmlu.keys()]
     untranslated = [
         lang
-        for lang in languages["bcp_47"].values[:150]
-        if lang not in human_translated and lang in get_google_supported_languages()
+        for lang in languages["bcp_47"].values[:100]
+        if lang not in human_translated and lang in google_supported_languages
     ]
-    n_samples = 20
+    n_samples = 10
 
     slug = "fair-forward/mmlu-autotranslated"
     for lang in tqdm(untranslated):
@@ -216,10 +196,8 @@ def translate_mmlu(languages):
             if split == "dev":
                 samples.extend(ds.filter(lambda x: x["subject"] == category))
             else:
-                # Use the same 20 samples that the evaluation pipeline uses (indices 0-19)
-                filtered = ds.filter(lambda x: x["subject"] == category)
-                for i in range(min(n_samples, len(filtered))):
-                    task = filtered[i]
+                for i in range(n_samples):
+                    task = ds.filter(lambda x: x["subject"] == category)[i]
                     samples.append(task)
         questions_tr = [
             translate_google(s["question"], "en", lang) for s in samples
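
`load_mmlu` assigns one MMLU subject per sample index by cycling through the sorted category list, so consecutive samples cover different subjects deterministically. A sketch with illustrative subject names:

```python
# Deterministic round-robin over subjects, as in load_mmlu above.
categories = ["elementary_mathematics", "global_facts", "high_school_geography"]

for nr in range(5):
    category = categories[nr % len(categories)]
    print(nr, category)
# 0 elementary_mathematics, 1 global_facts, 2 high_school_geography,
# 3 elementary_mathematics, 4 global_facts
```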
evals/datasets_/truthfulqa.py CHANGED
@@ -9,26 +9,16 @@ from tqdm.asyncio import tqdm_asyncio
 import os
 
 from datasets import Dataset, load_dataset
-from models import translate_google, get_google_supported_languages
+from models import translate_google, google_supported_languages
 
 from datasets_.util import _get_dataset_config_names, _load_dataset
 
 slug_uhura_truthfulqa = "masakhane/uhura-truthfulqa"
-slug_truthfulqa_autotranslated = "fair-forward/truthfulqa-autotranslated"
-
 tags_uhura_truthfulqa = {
     standardize_tag(a.split("_")[0], macro=True): a for a in _get_dataset_config_names(slug_uhura_truthfulqa)
     if a.endswith("multiple_choice")
 }
 
-# Get available auto-translated languages
-try:
-    tags_truthfulqa_autotranslated = {
-        standardize_tag(a, macro=True): a for a in _get_dataset_config_names(slug_truthfulqa_autotranslated)
-    }
-except Exception:
-    tags_truthfulqa_autotranslated = {}
-
 
 def add_choices(row):
     row["choices"] = row["mc1_targets"]["choices"]
@@ -36,36 +26,27 @@ def add_choices(row):
     return row
 
 
-async def load_truthfulqa(language_bcp_47, nr):
+def load_truthfulqa(language_bcp_47, nr):
     if language_bcp_47 in tags_uhura_truthfulqa.keys():
-        ds = _load_dataset(
-            slug_uhura_truthfulqa, tags_uhura_truthfulqa[language_bcp_47]
-        )
+        ds = _load_dataset(slug_uhura_truthfulqa, tags_uhura_truthfulqa[language_bcp_47])
         ds = ds.map(add_choices)
+        examples = ds["train"]
         task = ds["test"][nr]
-        return "masakhane/uhura-truthfulqa", task, "human"
-    elif language_bcp_47 in tags_truthfulqa_autotranslated.keys():
-        # Load from auto-translated dataset (same samples as translation)
-        ds = _load_dataset(slug_truthfulqa_autotranslated, language_bcp_47)
-        test_split = ds["test"] if "test" in ds else ds
-        task = test_split[nr]
-        return slug_truthfulqa_autotranslated, task, "machine"
-    # TODO: add Okapi, TruthfulQA-X @Jonas
+        return "masakhane/uhura-truthfulqa", examples, task
     else:
         return None, None, None
 
+
+
 def translate_truthfulqa(languages):
     human_translated = [*tags_uhura_truthfulqa.keys()]
     untranslated = [
         lang
-        for lang in languages["bcp_47"].values[:150]
-        if lang not in human_translated and lang in get_google_supported_languages()
+        for lang in languages["bcp_47"].values[:100]
+        if lang not in human_translated and lang in google_supported_languages
     ]
-    n_samples = 20
+    n_samples = 10
 
-    # Set fixed seed for consistent sample selection across all languages
-    random.seed(42)
-
     slug = "fair-forward/truthfulqa-autotranslated"
     for lang in tqdm(untranslated):
         # check if already exists on hub
@@ -79,40 +60,32 @@ def translate_truthfulqa(languages):
             if split == "train":
                 samples.extend(ds)
             else:
-                # Use the same 20 samples that the evaluation pipeline uses (indices 0-19)
-                for i in range(min(n_samples, len(ds))):
+                for i in range(n_samples):
                     task = ds[i]
                     samples.append(task)
-
-        # Translate questions
         questions_tr = [
             translate_google(s["question"], "en", lang) for s in samples
         ]
         questions_tr = asyncio.run(tqdm_asyncio.gather(*questions_tr))
-
-        # Translate choices for each sample
-        all_choices_tr = []
-        all_labels = []
-
+        choices_texts_concatenated = []
         for s in samples:
-            # Get choices from mc1_targets
-            choices = s["mc1_targets"]["choices"]
-            labels = s["mc1_targets"]["labels"]
-
-            # Translate choices
-            choices_tr = [
-                translate_google(choice, "en", lang) for choice in choices
-            ]
-            choices_tr = asyncio.run(tqdm_asyncio.gather(*choices_tr))
-
-            all_choices_tr.append(choices_tr)
-            all_labels.append(labels)
+            for choice in eval(s["choices"]):
+                choices_texts_concatenated.append(choice)
+        choices_tr = [
+            translate_google(c, "en", lang) for c in choices_texts_concatenated
+        ]
+        choices_tr = asyncio.run(tqdm_asyncio.gather(*choices_tr))
+        # group into chunks of 4
+        choices_tr = [
+            choices_tr[i : i + 4] for i in range(0, len(choices_tr), 4)
+        ]
 
         ds_lang = Dataset.from_dict(
             {
+                "subject": [s["subject"] for s in samples],
                 "question": questions_tr,
-                "choices": all_choices_tr,
-                "labels": all_labels,
+                "choices": choices_tr,
+                "answer": [s["answer"] for s in samples],
             }
         )
         ds_lang.push_to_hub(
@@ -122,7 +95,7 @@ def translate_truthfulqa(languages):
             token=os.getenv("HUGGINGFACE_ACCESS_TOKEN"),
         )
         ds_lang.to_json(
-            f"data/translations/truthfulqa/{lang}_{split}.json",
+            f"data/translations/mmlu/{lang}_{split}.json",
             lines=False,
            force_ascii=False,
            indent=2,
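
The new translation path flattens every sample's choices into one batch, translates them together, and regroups the results in chunks of four (so it assumes exactly four choices per sample). A sketch of that flatten/rechunk round trip with a stand-in for `translate_google`:

```python
# Stand-in for translate_google: uppercases instead of translating.
def fake_translate(text):
    return text.upper()

samples = [
    {"choices": ["yes", "no", "maybe", "unknown"]},
    {"choices": ["red", "green", "blue", "black"]},
]

# Flatten all choices so they can be translated as one batch...
flat = [c for s in samples for c in s["choices"]]
translated = [fake_translate(c) for c in flat]

# ...then regroup into chunks of 4, one per sample, mirroring the
# `choices_tr[i : i + 4]` slicing above. This assumes 4 choices per sample.
chunks = [translated[i : i + 4] for i in range(0, len(translated), 4)]
assert len(chunks) == len(samples)
```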
evals/datasets_/util.py CHANGED
@@ -12,10 +12,3 @@ def _get_dataset_config_names(dataset, **kwargs):
 @cache
 def _load_dataset(dataset, subset, **kwargs):
     return load_dataset(dataset, subset, **kwargs)
-
-# Cache individual dataset items to avoid reloading entire datasets
-@cache
-def _get_dataset_item(dataset, subset, split, index, **kwargs):
-    """Load a single item from a dataset efficiently"""
-    ds = load_dataset(dataset, subset, split=split, **kwargs)
-    return ds[index] if index < len(ds) else None
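With the item-level helper gone, memoization happens only at whole-dataset granularity via `@cache` on `_load_dataset`. A sketch of the effect, using `functools.lru_cache` as a stand-in for the repo's cache decorator (which appears to be joblib-based elsewhere in the codebase):

```python
from functools import lru_cache

# Stand-in for the repo's @cache; lru_cache behaves similarly in-process.
cache = lru_cache(maxsize=None)

@cache
def _load_dataset(dataset, subset):
    print(f"loading {dataset}/{subset}")  # runs only on the first call
    return object()  # placeholder for datasets.load_dataset(dataset, subset)

a = _load_dataset("masakhane/afrimmlu", "eng")
b = _load_dataset("masakhane/afrimmlu", "eng")
assert a is b  # the second call is served from the cache
```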
evals/main.py CHANGED
@@ -1,172 +1,62 @@
 import asyncio
+
 import pandas as pd
-import time
-from datetime import datetime, timedelta
+from languages import languages
 from models import models
 from tasks import tasks
-from languages import languages
-import os
+from tqdm.asyncio import tqdm_asyncio
 
-async def evaluate():
-    # Configuration - easily adjustable defaults
-    n_sentences = int(os.environ.get("N_SENTENCES", 20))  # Default: 20 sentences per task
-    max_languages = int(os.environ.get("MAX_LANGUAGES", 150))  # Default: 150 top languages
-    single_model = os.environ.get("SINGLE_MODEL")  # Optional: run only one specific model
-    test_mode = os.environ.get("TEST", "").lower() in ("1", "true", "yes")  # Optional: skip results loading/saving
-
-    # Keep original DataFrames for saving metadata - distinction added for single model test runs.
-    original_models_df = pd.DataFrame(models)
-    original_languages_df = pd.DataFrame(languages)
-
-    # Create working copies for single evaluation runs
-    models_df = original_models_df.copy()
-    languages_df = original_languages_df.copy()
-    top_languages = languages.head(max_languages)
-
-    # Filter to single model if specified (only affects evaluation, not saving)
-    if single_model:
-        models_df = models_df[models_df["id"] == single_model]
-        if len(models_df) == 0:
-            print(f"Error: Model '{single_model}' not found. Available models:")
-            for model_id in original_models_df["id"]:
-                print(f"  {model_id}")
-            return pd.DataFrame()
-
-    print(f"Starting evaluation: {len(models_df)} models, {len(top_languages)} languages, {n_sentences} sentences per task")
-    if test_mode:
-        print("TEST MODE: Skipping results loading/saving")
-    start_time = time.time()
-
-    # Load existing results to avoid re-evaluation (skip in test mode)
-    if test_mode:
-        old_results = pd.DataFrame(columns=["model", "bcp_47", "task", "metric", "origin", "score"])
-    else:
-        try:
-            old_results = pd.read_json("results.json")
-            if old_results.empty:
-                old_results = pd.DataFrame(columns=["model", "bcp_47", "task", "metric", "origin", "score"])
-        except FileNotFoundError:
-            old_results = pd.DataFrame(columns=["model", "bcp_47", "task", "metric", "origin", "score"])
-
-    # Get all combinations that need evaluation
-    combis = [
-        (model, lang.bcp_47, task_name)
-        for model in models_df["id"]
-        for lang in top_languages.itertuples()
-        for task_name, task in tasks.items()
-        if task_name in models_df[models_df["id"] == model]["tasks"].iloc[0]
-    ]
-
-    # Filter out already evaluated combinations
-    combis = pd.DataFrame(combis, columns=["model", "bcp_47", "task"])
-    if not old_results.empty:
-        completed = set(old_results[["model", "bcp_47", "task"]].apply(tuple, axis=1))
-        # set + combis is faster than merge (locally it made a difference for me when loading all data/tasks into memory)
-        mask = ~combis.apply(lambda row: (row["model"], row["bcp_47"], row["task"]) in completed, axis=1)
-        combis = combis[mask]
-
-    # Create all evaluation tasks
-    all_tasks = []
-    for i in range(n_sentences):
-        for model, bcp_47, task_name in combis.itertuples(index=False):
-            all_tasks.append((tasks[task_name], model, bcp_47, i))
-
-    print(f"Running {len(all_tasks)} evaluation tasks...")
-
-    # For single model runs, we stop immediately on first API error to inspect.
-    # For full evaluations, we continue despite errors to get maximum coverage.
-    stop_on_error = single_model is not None
-
-    # Process tasks in batches to avoid memory issues (for full evaluation locally that helped a lot)
-    batch_size = 1000
-    all_results = []
-
-    try:
-        for i in range(0, len(all_tasks), batch_size):
-            batch = all_tasks[i:i + batch_size]
-            batch_results = await asyncio.gather(
-                *[task_func(model, bcp_47, sentence_nr) for task_func, model, bcp_47, sentence_nr in batch],
-                return_exceptions=not stop_on_error
-            )
-            all_results.extend(batch_results)
-
-        results = all_results
-
-        # Process results and logging API errors separately to understand what are the main issues.
-        valid_results = []
-        errors = []
-
-        for i, r in enumerate(results):
-            if isinstance(r, Exception):
-                if i < len(all_tasks):
-                    task_info = all_tasks[i]
-                    errors.append(f"{task_info[1]},{task_info[2]},{str(r)}")
-            elif isinstance(r, list):
-                valid_results.extend(r)
-            elif r is not None:
-                valid_results.append(r)
-
-        # log errors and store
-        if errors:
-            with open("errors.log", "w") as f:
-                f.write("model,task,error\n")
-                for error in errors:
-                    f.write(error + "\n")
-
-        # Track model completion (TO BE DELETED - was for local run only)
-        if valid_results:
-            completed_models = set()
-            for result in valid_results:
-                if isinstance(result, dict) and "model" in result:
-                    model = result["model"]
-                    if model not in completed_models:
-                        completed_models.add(model)
-                        print(f"Completed: {model}")
-
-        print(f"Completed: {len(valid_results)} valid results, {len(errors)} errors")
-
-    # this is for local single model runs - for testing and development
-    except Exception as e:
-        print(f"EVALUATION STOPPED - API Error occurred:")
-        print(f"Error type: {type(e).__name__}")
-        print(f"Error message: {str(e)}")
-        return pd.DataFrame()
-
-    # Save results (skipped in test mode as we do not want to overwrite existing results)
-    if valid_results:
-        results_df = pd.DataFrame(valid_results)
-
-        # Aggregate results
-        results_df = (
-            results_df.groupby(["model", "bcp_47", "task", "metric", "origin"])
-            .agg({"score": "mean"})
-            .reset_index()
-        )
-
-        if not test_mode:
-            args = dict(orient="records", indent=2, force_ascii=False)
-
-            # Merge with existing results
-            if not old_results.empty:
-                results_df = pd.concat([old_results, results_df])
-                results_df = results_df.drop_duplicates(subset=["model", "bcp_47", "task", "metric", "origin"])
-
-            results_df = results_df.sort_values(by=["model", "bcp_47", "task", "metric"])
-            results_df.to_json("results.json", **args)
-
-            # Save model and language info (always save complete metadata, not filtered)
-            original_models_df.to_json("models.json", **args)
-            original_languages_df.to_json("languages.json", **args)
-        else:
-            print("TEST MODE: Skipping results saving")
-
-    elapsed = time.time() - start_time
-    print(f"Evaluation completed in {str(timedelta(seconds=int(elapsed)))}")
-
-    return results_df
-
-    return pd.DataFrame()
+# ===== config =====
+
+n_sentences = 10
+
+# ===== run evaluation and aggregate results =====
+
+
+async def evaluate():
+    # FIXME we should not need this for-loop, but it helps
+    for n_languages in range(10, 101, 10):
+        print(f"running evaluations for {n_languages} languages")
+        old_results = pd.read_json("results.json")
+        old_models = pd.read_json("models.json")
+        # get all combinations of model, language and task
+        combis = [
+            (model, lang.bcp_47, task_name)
+            for model in models["id"]
+            for lang in languages.iloc[:n_languages].itertuples()
+            for task_name, task in tasks.items()
+            if task_name in models[models["id"] == model]["tasks"].iloc[0]
+        ]
+        # filter out combinations that have already been evaluated
+        combis = pd.DataFrame(combis, columns=["model", "bcp_47", "task"])
+        combis = combis.merge(old_results, on=["model", "bcp_47", "task"], how="left")
+        combis = combis[combis["metric"].isna()][["model", "bcp_47", "task"]]
+        # run evaluations
+        results = [
+            tasks[task_name](model, bcp_47, i)
+            for i in range(n_sentences)
+            for model, bcp_47, task_name in combis.itertuples(index=False)
+        ]
+        results = await tqdm_asyncio.gather(*results, miniters=1)
+        results = [r for group in results for r in group]
+        args = dict(orient="records", indent=2, force_ascii=False)
+        if results:
+            # aggregate results
+            results = pd.DataFrame(results)
+            results = (
+                results.groupby(["model", "bcp_47", "task", "metric"])
+                .agg({"score": "mean"})
+                .reset_index()
+            )
+            # save results
+            results = pd.concat([old_results, results])
+            results = results.sort_values(by=["model", "bcp_47", "task", "metric"])
+            results.to_json("results.json", **args)
+        # save up-to-date info on models and languages
+        all_models = pd.concat([pd.DataFrame(models), old_models])
+        all_models = all_models.drop_duplicates(subset=["id"]).sort_values(by=["id"])
+        all_models.to_json("models.json", **args)
+        pd.DataFrame(languages).to_json("languages.json", **args)
 
 
 if __name__ == "__main__":
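
The rewritten `evaluate` skips already-scored combinations with a left merge followed by an `isna` filter, i.e. a pandas anti-join. A self-contained sketch with invented rows:

```python
import pandas as pd

# Planned (model, language, task) combinations for this run.
combis = pd.DataFrame({
    "model": ["m1", "m1", "m2"],
    "bcp_47": ["sw", "yo", "sw"],
    "task": ["mmlu", "mmlu", "mmlu"],
})

# Previously saved results; "metric" is non-null for scored rows.
old_results = pd.DataFrame({
    "model": ["m1"], "bcp_47": ["sw"], "task": ["mmlu"],
    "metric": ["accuracy"], "score": [0.6],
})

# Left-merge, then keep rows with no match: the anti-join in evaluate().
merged = combis.merge(old_results, on=["model", "bcp_47", "task"], how="left")
todo = merged[merged["metric"].isna()][["model", "bcp_47", "task"]]
print(todo)  # only (m1, yo) and (m2, sw) still need evaluating
```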
evals/models.py CHANGED
@@ -1,4 +1,3 @@
- import asyncio
  import json
  import re
  from collections import defaultdict
@@ -8,11 +7,7 @@ from os import getenv
  import pandas as pd
  from aiolimiter import AsyncLimiter
  from dotenv import load_dotenv
- # Make ElevenLabs optional to avoid hard dependency when not using speech tasks
- try:
-     from elevenlabs import AsyncElevenLabs
- except Exception:  # ImportError or other env-specific issues
-     AsyncElevenLabs = None
  from google.cloud import translate_v2 as translate
  from huggingface_hub import AsyncInferenceClient, HfApi
  from joblib.memory import Memory
@@ -27,17 +22,14 @@ important_models = [
      "meta-llama/llama-3.1-70b-instruct",  # 0.3$
      "meta-llama/llama-3-70b-instruct",  # 0.4$
      # "meta-llama/llama-2-70b-chat",  # 0.9$; not properly supported by OpenRouter
-     "openai/gpt-5",
-     "openai/gpt-5-nano",  # include if/when available
      "openai/gpt-4.1",  # 8$
      "openai/gpt-4.1-mini",  # 1.6$
      "openai/gpt-4.1-nano",  # 0.4$
      "openai/gpt-4o-mini",  # 0.6$
-     "openai/gpt-4o-2024-11-20",  # 10$
-     "openai/gpt-oss-120b",
-     "anthropic/claude-3.7-sonnet",  # 15$ - added for full coverage
-     "anthropic/claude-sonnet-4",  # 15$ - added for full coverage
-     "anthropic/claude-opus-4.1",  # 15$ - added for full coverage
      "mistralai/mistral-small-3.1-24b-instruct",  # 0.3$
      "mistralai/mistral-saba",  # 0.6$
      "mistralai/mistral-nemo",  # 0.08$
@@ -56,13 +48,10 @@ important_models = [
      "microsoft/phi-4",  # 0.07$
      "microsoft/phi-4-multimodal-instruct",  # 0.1$
      "amazon/nova-micro-v1",  # 0.09$
-     "moonshotai/kimi-k2",  # 0.6$ - added to prevent missing from models.json
-     "x-ai/grok-4"
  ]

  blocklist = [
      "google/gemini-2.5-pro-preview",
-     "google/gemini-2.5-pro",
      "google/gemini-2.5-flash-preview",
      "google/gemini-2.5-flash-lite-preview",
      "google/gemini-2.5-flash-preview-04-17",
@@ -70,7 +59,6 @@ blocklist = [
      "google/gemini-2.5-flash-lite-preview-06-17",
      "google/gemini-2.5-pro-preview-06-05",
      "google/gemini-2.5-pro-preview-05-06",
-     "perplexity/sonar-deep-research"
  ]

  transcription_models = [
@@ -97,82 +85,36 @@ def get_model(permaslug):
          and m["endpoint"]
          and not m["endpoint"]["is_free"]
      ]
      return slugs[0] if len(slugs) >= 1 else None


  @cache
  def get_historical_popular_models(date: date):
-     try:
-         raw = get("https://openrouter.ai/rankings").text
-
-         # Extract model data from rankingData using regex
-         import re
-         import json
-
-         # Find all count and model_permaslug pairs in the data
-         # Format: "count":number,"model_permaslug":"model/name"
-         pattern = r'\\\"count\\\":([\d.]+).*?\\\"model_permaslug\\\":\\\"([^\\\"]+)\\\"'
-         matches = re.findall(pattern, raw)
-
-         if matches:
-             # Aggregate model counts
-             model_counts = {}
-             for count_str, model_slug in matches:
-                 count = float(count_str)
-                 if not model_slug.startswith('openrouter') and model_slug != 'Others':
-                     # Remove variant suffixes for aggregation
-                     base_model = model_slug.split(':')[0]
-                     model_counts[base_model] = model_counts.get(base_model, 0) + count
-
-             # Sort by popularity and return top models
-             sorted_models = sorted(model_counts.items(), key=lambda x: x[1], reverse=True)
-             result = []
-             for model_slug, count in sorted_models[:20]:  # Top 20
-                 result.append({"slug": model_slug, "count": int(count)})
-
-             return result
-         else:
-             return []
-
-     except Exception as e:
-         return []
-
-
- @cache
  def get_current_popular_models(date: date):
-     try:
-         raw = get("https://openrouter.ai/rankings?view=day").text
-
-         # Extract model data from daily rankings
-         import re
-         import json
-
-         # Find all count and model_permaslug pairs in the daily data
-         pattern = r'\\\"count\\\":([\d.]+).*?\\\"model_permaslug\\\":\\\"([^\\\"]+)\\\"'
-         matches = re.findall(pattern, raw)
-
-         if matches:
-             # Aggregate model counts
-             model_counts = {}
-             for count_str, model_slug in matches:
-                 count = float(count_str)
-                 if not model_slug.startswith('openrouter') and model_slug != 'Others':
-                     # Remove variant suffixes for aggregation
-                     base_model = model_slug.split(':')[0]
-                     model_counts[base_model] = model_counts.get(base_model, 0) + count
-
-             # Sort by popularity and return top models
-             sorted_models = sorted(model_counts.items(), key=lambda x: x[1], reverse=True)
-             result = []
-             for model_slug, count in sorted_models[:10]:  # Top 10
-                 result.append({"slug": model_slug, "count": int(count)})
-
-             return result
-         else:
-             return []
-
-     except Exception as e:
-         return []


  def get_translation_models():
@@ -206,52 +148,26 @@ google_rate_limit = AsyncLimiter(max_rate=10, time_period=1)

  @cache
  async def complete(**kwargs) -> str | None:
-     # Add longer timeout for slower, premium, or reasoning-focused models
-     model_id = kwargs.get('model', '')
-     slow_model_keywords = [
-         'claude-3.5', 'claude-3.7', 'claude-4', 'sonnet-4',  # Claude
-         'gpt-4', 'o1', 'o3',  # OpenAI
-         'gemini-2.5', 'gemini-pro',  # Google
-         'llama-4',  # Meta
-         'reasoning', 'thinking'  # General
-     ]
-     timeout = 120 if any(keyword in model_id for keyword in slow_model_keywords) else 60
-
      async with openrouter_rate_limit:
          try:
-             response = await asyncio.wait_for(
-                 client.chat.completions.create(**kwargs),
-                 timeout=timeout
-             )
          except BadRequestError as e:
              if "filtered" in e.message:
                  return None
              raise e
-         except asyncio.TimeoutError:
-             return None
      if not response.choices:
          raise Exception(response)
      return response.choices[0].message.content.strip()

- translate_client = None
-
- def get_google_translate_client():
-     global translate_client
-     if translate_client is None:
-         translate_client = translate.Client()
-     return translate_client
-

- def get_google_supported_languages():
-     client = get_google_translate_client()
-     return [l["language"] for l in client.get_languages()]


  @cache
  async def translate_google(text, source_language, target_language):
-     client = get_google_translate_client()
      async with google_rate_limit:
-         response = client.translate(
              text, source_language=source_language, target_language=target_language
          )
      return response["translatedText"]
@@ -315,14 +231,12 @@ def get_hf_metadata(row):
          return empty
      try:
          info = api.model_info(id)
-         license = ""
-         if info.card_data and hasattr(info.card_data, 'license') and info.card_data.license:
-             license = (
-                 info.card_data.license
-                 .replace("-", " ")
-                 .replace("mit", "MIT")
-                 .title()
-             )
          return {
              "hf_id": info.id,
              "creation_date": info.created_at,
@@ -335,14 +249,8 @@ def get_hf_metadata(row):


  def get_cost(row):
-     """
-     row: a row from the OpenRouter models dataframe
-     """
-     try:
-         cost = float(row["endpoint"]["pricing"]["completion"])
-         return round(cost * 1_000_000, 2)
-     except (TypeError, KeyError):
-         return None


  @cache
@@ -352,17 +260,8 @@ def load_models(date: date):
          + get_current_popular_models(date.today())[:10]
      )
      popular_models = [m["slug"] for m in popular_models]
-     all_model_candidates = set(important_models + popular_models) - set(blocklist)
-
-     # Validate models exist on OpenRouter before including them
-     valid_models = []
-
-     for model_id in all_model_candidates:
-         metadata = get_or_metadata(model_id)
-         if metadata is not None:
-             valid_models.append(model_id)
-
-     models = pd.DataFrame(sorted(valid_models), columns=["id"])
      or_metadata = models["id"].apply(get_or_metadata)
      hf_metadata = or_metadata.apply(get_hf_metadata)
      creation_date_hf = pd.to_datetime(hf_metadata.str["creation_date"]).dt.date
@@ -382,8 +281,7 @@ def load_models(date: date):
          license=hf_metadata.str["license"],
          creation_date=creation_date_hf.combine_first(creation_date_or),
      )
-     # Filter out expensive models to keep costs reasonable
-     models = models[models["cost"] <= 15.0].reset_index(drop=True)
      models["tasks"] = [
          ["translation_from", "translation_to", "classification", "mmlu", "arc", "truthfulqa", "mgsm"]
      ] * len(models)
 
  import json
  import re
  from collections import defaultdict

  import pandas as pd
  from aiolimiter import AsyncLimiter
  from dotenv import load_dotenv
+ from elevenlabs import AsyncElevenLabs
  from google.cloud import translate_v2 as translate
  from huggingface_hub import AsyncInferenceClient, HfApi
  from joblib.memory import Memory

      "meta-llama/llama-3.1-70b-instruct",  # 0.3$
      "meta-llama/llama-3-70b-instruct",  # 0.4$
      # "meta-llama/llama-2-70b-chat",  # 0.9$; not properly supported by OpenRouter
      "openai/gpt-4.1",  # 8$
      "openai/gpt-4.1-mini",  # 1.6$
      "openai/gpt-4.1-nano",  # 0.4$
      "openai/gpt-4o-mini",  # 0.6$
+     # "openai/gpt-4o-2024-11-20",  # 10$
+     "openai/gpt-3.5-turbo-0613",  # 2$
+     # "openai/gpt-3.5-turbo",  # 1.5$
+     # "anthropic/claude-3.5-haiku",  # 4$ -> too expensive for dev
      "mistralai/mistral-small-3.1-24b-instruct",  # 0.3$
      "mistralai/mistral-saba",  # 0.6$
      "mistralai/mistral-nemo",  # 0.08$

      "microsoft/phi-4",  # 0.07$
      "microsoft/phi-4-multimodal-instruct",  # 0.1$
      "amazon/nova-micro-v1",  # 0.09$
  ]

  blocklist = [
      "google/gemini-2.5-pro-preview",
      "google/gemini-2.5-flash-preview",
      "google/gemini-2.5-flash-lite-preview",
      "google/gemini-2.5-flash-preview-04-17",
      "google/gemini-2.5-flash-lite-preview-06-17",
      "google/gemini-2.5-pro-preview-06-05",
      "google/gemini-2.5-pro-preview-05-06",
  ]

  transcription_models = [

          and m["endpoint"]
          and not m["endpoint"]["is_free"]
      ]
+     if len(slugs) == 0:
+         # the problem is that free models typically have very high rate-limiting
+         print(f"no non-free model found for {permaslug}")
      return slugs[0] if len(slugs) >= 1 else None


  @cache
  def get_historical_popular_models(date: date):
+     raw = get("https://openrouter.ai/rankings").text
+     data = re.search(r'{\\"data\\":(.*),\\"isPercentage\\"', raw).group(1)
+     data = json.loads(data.replace("\\", ""))
+     counts = defaultdict(int)
+     for day in data:
+         for model, count in day["ys"].items():
+             if model.startswith("openrouter") or model == "Others":
+                 continue
+             counts[model.split(":")[0]] += count
+     counts = sorted(counts.items(), key=lambda x: x[1], reverse=True)
+     models = [get_model(model) for model, _ in counts]
+     return [m for m in models if m]
+
+
+ @cache
  def get_current_popular_models(date: date):
+     raw = get("https://openrouter.ai/rankings?view=day").text.replace("\\", "")
+     data = re.search(r'"rankingData":(.*),"rankingType":"day"', raw).group(1)
+     data = json.loads(data)
+     data = sorted(data, key=lambda x: x["total_prompt_tokens"], reverse=True)
+     models = [get_model(model["model_permaslug"]) for model in data]
+     return [m for m in models if m]


  def get_translation_models():

  @cache
  async def complete(**kwargs) -> str | None:
      async with openrouter_rate_limit:
          try:
+             response = await client.chat.completions.create(**kwargs)
          except BadRequestError as e:
              if "filtered" in e.message:
                  return None
              raise e
      if not response.choices:
          raise Exception(response)
      return response.choices[0].message.content.strip()

+ translate_client = translate.Client()
+ google_supported_languages = [l["language"] for l in translate_client.get_languages()]


  @cache
  async def translate_google(text, source_language, target_language):
      async with google_rate_limit:
+         response = translate_client.translate(
              text, source_language=source_language, target_language=target_language
          )
      return response["translatedText"]

          return empty
      try:
          info = api.model_info(id)
+         license = (
+             (info.card_data.license or "")
+             .replace("-", " ")
+             .replace("mit", "MIT")
+             .title()
+         )
          return {
              "hf_id": info.id,
              "creation_date": info.created_at,


  def get_cost(row):
+     cost = float(row["endpoint"]["pricing"]["completion"])
+     return round(cost * 1_000_000, 2)


  @cache

          + get_current_popular_models(date.today())[:10]
      )
      popular_models = [m["slug"] for m in popular_models]
+     models = set(important_models + popular_models) - set(blocklist)
+     models = pd.DataFrame(sorted(list(models)), columns=["id"])
      or_metadata = models["id"].apply(get_or_metadata)
      hf_metadata = or_metadata.apply(get_hf_metadata)
      creation_date_hf = pd.to_datetime(hf_metadata.str["creation_date"]).dt.date

          license=hf_metadata.str["license"],
          creation_date=creation_date_hf.combine_first(creation_date_or),
      )
+     # models = models[models["cost"] <= 2.0].reset_index(drop=True)
      models["tasks"] = [
          ["translation_from", "translation_to", "classification", "mmlu", "arc", "truthfulqa", "mgsm"]
      ] * len(models)
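
A note on the throttling pattern in `complete` and `translate_google` above: `AsyncLimiter(max_rate=10, time_period=1)` is a token-bucket limiter, and wrapping each request in `async with` blocks until a slot is free. A minimal runnable sketch of the same idiom (the `fake_request` coroutine is a stand-in for the real OpenRouter call):

```python
import asyncio
from aiolimiter import AsyncLimiter

# at most 10 acquisitions per 1-second window, like google_rate_limit above
limiter = AsyncLimiter(max_rate=10, time_period=1)

async def fake_request(i: int) -> int:
    async with limiter:  # waits here if the bucket is empty
        await asyncio.sleep(0)  # placeholder for the real network call
        return i

async def main():
    results = await asyncio.gather(*(fake_request(i) for i in range(25)))
    print(len(results))  # 25, spread over roughly a couple of seconds

asyncio.run(main())
```
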
evals/tasks.py CHANGED
@@ -1,4 +1,3 @@
- import asyncio
  import random
  from functools import partial
  from textwrap import dedent
@@ -11,8 +10,10 @@ from datasets_.mgsm import load_mgsm, parse_number
  from datasets_.mmlu import load_mmlu
  from datasets_.arc import load_uhura_arc_easy
  from datasets_.truthfulqa import load_truthfulqa
  from languages import languages, script_name
- from models import complete, transcribe

  bleu = evaluate.load("bleu")
  chrf = evaluate.load("chrf")
@@ -26,6 +27,9 @@ target_languages = languages[languages["in_benchmark"]].sample(
      frac=1, weights="speakers", replace=True, random_state=42
  )


  async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
      original_language = languages[languages["bcp_47"] == bcp_47].iloc[0]
@@ -43,20 +47,31 @@ async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
      original_sentence = flores_sentences(original_language)["text"][sentence_nr].strip()
      target_sentence = flores_sentences(target_language)["text"][sentence_nr].strip()
      script = script_name(target_language.flores_path.split("_")[1])
-     translation_prompt = f"Translate the following text to the {target_language.language_name} language; use the {script} script; reply only with the translation:\n\n{original_sentence}"
-     prediction = await complete(
-         model=model,
-         messages=[
-             {
-                 "role": "user",
-                 "content": translation_prompt,
-             }
-         ],
-         temperature=0,
-         max_tokens=1024,
-     )
-
-
      if prediction:
          bleu_score = bleu.compute(
              predictions=[prediction],
@@ -69,9 +84,6 @@ async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
      else:
          bleu_score = {"bleu": 0}
          chrf_score = {"score": 0}
-
-
-
      return [
          {
              "model": model,
@@ -79,7 +91,6 @@ async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
              "task": f"translation_{mode}",
              "metric": metric,
              "score": score,
-             "origin": "human",  # FLORES+ is human-translated
              "sentence_nr": sentence_nr,
          }
          for metric, score in (
@@ -101,36 +112,57 @@ async def classify_and_evaluate(model, bcp_47, nr):
      )
      top_topics = paragraphs.value_counts("topic").head(5).index
      paragraphs = paragraphs[paragraphs["topic"].isin(top_topics)]
-     test_paragraph = paragraphs.sample(n=1, random_state=nr).iloc[0]
-
-     prompt = f"""Classify the following text into one of these topics: {', '.join(top_topics)}.
- Reply with only the topic name.
-
- Text:
- {test_paragraph.text}
- """
-     response = await complete(
-         model=model,
-         messages=[{"role": "user", "content": prompt}],
-         temperature=0,
-         max_tokens=30,
      )
-
-     pred = response.lower().strip() if response else ""
-     true = test_paragraph.topic.lower().strip()
-     others = [t for t in top_topics if t != true]
-     acc = (
-         int(
-             pred.startswith(true)
-             or (true in pred and not any(o in pred for o in others))
-         )
-         if pred
-         else 0
-     )
-
      return [
          {
              "model": model,
@@ -138,7 +170,6 @@ Text:
              "task": "classification",
              "metric": "accuracy",
              "score": acc,
-             "origin": "human",  # FLORES+ is human-translated
              "sentence_nr": nr,
          }
      ]
@@ -203,41 +234,37 @@ def format_multiple_choice(item):
  C: {item["choices"][2]}
  D: {item["choices"][3]}

- Answer with the letter of the correct answer."""


  async def mmlu_and_evaluate(model, language_bcp_47, nr):
-     ds_name, task, origin = await load_mmlu(language_bcp_47, nr)
      if not task:
          return []
-
-     messages = [
-         {
-             "role": "user",
-             "content": f"""Solve the following multiple choice question. Reason step-by-step and then write the final answer as a single letter.
-
- Response format: <reasoning> #### <letter>
-
- ---
-
- {format_multiple_choice(task)}""",
-         },
-     ]
-     response = await complete(
-         model=model,
-         messages=messages,
-         temperature=0,
-         max_tokens=1024,
-     )
-     if response and "####" in response:
-         answer = response.split("####")[-1].strip()
-         acc = int(answer[:1] == task["answer"])
-     else:
-         acc = 0
-         answer = "NO_ANSWER"
-
-
      return [
          {
              "model": model,
@@ -245,41 +272,39 @@ Response format: <reasoning> #### <letter>
              "task": "mmlu",
              "metric": "accuracy",
              "score": acc,
-             "origin": origin,  # Add origin tag to results
              "sentence_nr": nr,
          }
      ]


  async def arc_and_evaluate(model, language_bcp_47, nr):
-     ds_name, task, origin = load_uhura_arc_easy(language_bcp_47, nr)
      if not task:
          return []

-     messages = [
-         {
-             "role": "user",
-             "content": f"""Solve the following multiple choice question. Reason step-by-step and then write the final answer as a single letter.
-
- Response format: <reasoning> #### <letter>
-
- ---
-
- {format_multiple_choice(task)}""",
-         },
-     ]
-     response = await complete(
-         model=model,
-         messages=messages,
-         temperature=0,
-         max_tokens=1024,
-     )
-     if response and "####" in response:
-         answer = response.split("####")[-1].strip()
-         acc = int(answer[:1] == task["answer"])
-     else:
-         acc = 0
-         answer = "NO_ANSWER"
      return [
          {
              "model": model,
@@ -287,7 +312,6 @@ Response format: <reasoning> #### <letter>
              "task": "arc",
              "metric": "accuracy",
              "score": acc,
-             "origin": origin,
              "sentence_nr": nr,
          }
      ]
@@ -308,48 +332,40 @@ def format_multiple_choice_truthfulqa(item):
      text = item["question"] + "\n\n"
      for i, choice in enumerate(item["choices"]):
          text += f"{letters[i]}: {choice}\n"
      return text


  async def truthfulqa_and_evaluate(model, language_bcp_47, nr):
-     ds_name, task, origin = await load_truthfulqa(language_bcp_47, nr)
      if not task:
          return []
-
-     # Find the correct answer
      try:
-         correct_choice_index = task["labels"].index(1)
-         answer = letters[correct_choice_index]
-     except (ValueError, IndexError):
-         # Handle cases where there is no correct answer or labels are malformed
-         return []
-
-     messages = [
-         {
-             "role": "user",
-             "content": f"""Answer the following multiple choice question. Reason step-by-step and then write the final answer as a single letter.
-
- Response format: <reasoning> #### <letter>
-
- ---
-
- {format_multiple_choice_truthfulqa(task)}""",
-         },
-     ]
-     response = await complete(
-         model=model,
-         messages=messages,
-         temperature=0,
-         max_tokens=1024,  # Increased for reasoning
-     )
-     if response and "####" in response:
-         pred_answer = response.split("####")[-1].strip()
-         acc = int(pred_answer[:1].upper() == answer)
-     else:
-         acc = 0
-         pred_answer = "NO_ANSWER"
-
-
      return [
          {
              "model": model,
@@ -357,43 +373,34 @@ Response format: <reasoning> #### <letter>
              "task": "truthfulqa",
              "metric": "accuracy",
              "score": acc,
-             "origin": origin,
              "sentence_nr": nr,
          }
      ]


  async def mgsm_and_evaluate(model, language_bcp_47, nr):
-     ds_slug, question, origin = load_mgsm(language_bcp_47, nr)
      if not question:
          return []
-
-     messages = [
-         {
-             "role": "user",
-             "content": f"""Solve the following math problem. Reason step-by-step and then write the final answer as a number.
-
- Response format: <reasoning> #### <number>
-
- ---
-
- {question["question"]}""",
-         },
-     ]
      response = await complete(
          model=model,
-         messages=messages,
          temperature=0,
          max_tokens=1024,
      )
-     if response and "####" in response:
          number = response.split("####")[1].strip()
          accuracy = int(parse_number(number) == parse_number(question["answer_number"]))
      else:
          accuracy = 0
-         number = "NO_ANSWER"
-
-
      return [
          {
@@ -402,7 +409,6 @@ Response format: <reasoning> #### <number>
              "task": "mgsm",
              "metric": "accuracy",
              "score": accuracy,
-             "origin": origin,
              "sentence_nr": nr,
          }
      ]
@@ -443,8 +449,10 @@ tasks = {
      "translation_from": partial(translate_and_evaluate, mode="from"),
      "translation_to": partial(translate_and_evaluate, mode="to"),
      "classification": classify_and_evaluate,
      "mmlu": mmlu_and_evaluate,
      "arc": arc_and_evaluate,
      "truthfulqa": truthfulqa_and_evaluate,
      "mgsm": mgsm_and_evaluate,
  }
 
 
  import random
  from functools import partial
  from textwrap import dedent

  from datasets_.mmlu import load_mmlu
  from datasets_.arc import load_uhura_arc_easy
  from datasets_.truthfulqa import load_truthfulqa
+ from google.cloud import translate_v2 as translate
+ from langcodes import closest_supported_match
  from languages import languages, script_name
+ from models import complete, transcribe, translate_google

  bleu = evaluate.load("bleu")
  chrf = evaluate.load("chrf")

      frac=1, weights="speakers", replace=True, random_state=42
  )

+ translate_client = translate.Client()
+ supported_languages = [l["language"] for l in translate_client.get_languages()]
+

  async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
      original_language = languages[languages["bcp_47"] == bcp_47].iloc[0]

      original_sentence = flores_sentences(original_language)["text"][sentence_nr].strip()
      target_sentence = flores_sentences(target_language)["text"][sentence_nr].strip()
      script = script_name(target_language.flores_path.split("_")[1])
+     if model == "google/translate-v2":
+         original_language = closest_supported_match(
+             original_language, supported_languages
+         )
+         target_language = closest_supported_match(target_language, supported_languages)
+         if original_language == target_language:
+             prediction = original_sentence
+         elif original_language is None or target_language is None:
+             prediction = None
+         else:
+             prediction = await translate_google(
+                 original_sentence, original_language.bcp_47, target_language.bcp_47
+             )
+     else:
+         prediction = await complete(
+             model=model,
+             messages=[
+                 {
+                     "role": "user",
+                     "content": f"Translate the following text to the {target_language.language_name} language; use the {script} script; reply only with the translation:\n\n{original_sentence}",
+                 }
+             ],
+             temperature=0,
+             max_tokens=1024,
+         )
      if prediction:
          bleu_score = bleu.compute(
              predictions=[prediction],

      else:
          bleu_score = {"bleu": 0}
          chrf_score = {"score": 0}
      return [
          {
              "model": model,

              "task": f"translation_{mode}",
              "metric": metric,
              "score": score,
              "sentence_nr": sentence_nr,
          }
          for metric, score in (

      )
      top_topics = paragraphs.value_counts("topic").head(5).index
      paragraphs = paragraphs[paragraphs["topic"].isin(top_topics)]
+     examples = pd.concat(
+         [
+             paragraphs[paragraphs["topic"] == t].sample(n=1, random_state=42)
+             for t in top_topics
+         ]
+     ).sample(frac=1, random_state=nr)
+     test_paragraphs = paragraphs[~paragraphs["url"].isin(examples["url"])].sample(
+         frac=1, random_state=42
      )
+     test_paragraph = test_paragraphs.iloc[nr]

+     def format_prompt(text):
+         return f"{text}\n\nTopic: {'|'.join(top_topics)}?"

+     messages = []
+     for example in examples.itertuples():
+         messages += [
+             {"role": "user", "content": format_prompt(example.text)},
+             {"role": "assistant", "content": example.topic},
+         ]
+     # some models have poor tokenization for some languages, and the prompt for this task is relatively long, so it sometimes exceeds the context window
+     # this is not just to blame on the context window but mostly on the model's tokenization, so we assign 0 accuracy in this case
+     try:
+         pred = await complete(
+             model=model,
+             messages=[
+                 *messages,
+                 {
+                     "role": "user",
+                     "content": format_prompt(test_paragraph.text),
+                 },
+             ],
+             temperature=0,
+             max_tokens=30,
+         )
+         true = test_paragraph.topic
+         others = [t for t in top_topics if t != true]
+         acc = (
+             int(
+                 pred.startswith(true)
+                 or (true in pred and not any(o in pred for o in others))
+             )
+             if pred
+             else 0
+         )
+     except Exception as e:
+         if "`inputs` tokens + `max_new_tokens` must be <= 4097" in str(e):
+             print(f"Max tokens exceeded for {model} in {bcp_47}")
+             acc = 0
+         else:
+             raise e
      return [
          {
              "model": model,

              "task": "classification",
              "metric": "accuracy",
              "score": acc,
              "sentence_nr": nr,
          }
      ]

  C: {item["choices"][2]}
  D: {item["choices"][3]}

+ A|B|C|D?"""


  async def mmlu_and_evaluate(model, language_bcp_47, nr):
+     ds_name, examples, task = load_mmlu(language_bcp_47, nr)
      if not task:
          return []

+     messages = []
+     for example in examples:
+         messages += [
+             {"role": "user", "content": format_multiple_choice(example)},
+             {"role": "assistant", "content": example["answer"]},
+         ]
+     messages += [{"role": "user", "content": format_multiple_choice(task)}]
+     try:
+         response = await complete(
+             model=model,
+             messages=messages,
+             temperature=0,
+             max_tokens=1,
+         )
+         if response:
+             acc = int(response[:1].strip() == task["answer"])
+         else:
+             acc = 0
+     except Exception as e:
+         if "ResponsibleAIPolicyViolation" in str(e):
+             acc = 0
+         else:
+             raise e
      return [
          {
              "model": model,

              "task": "mmlu",
              "metric": "accuracy",
              "score": acc,
              "sentence_nr": nr,
          }
      ]


  async def arc_and_evaluate(model, language_bcp_47, nr):
+     ds_name, examples, task = load_uhura_arc_easy(language_bcp_47, nr)
      if not task:
          return []

+     messages = []
+     for example in examples:
+         messages += [
+             {"role": "user", "content": format_multiple_choice(example)},
+             {"role": "assistant", "content": example["answer"]},
+         ]
+     messages += [{"role": "user", "content": format_multiple_choice(task)}]
+     try:
+         response = await complete(
+             model=model,
+             messages=messages,
+             temperature=0,
+             max_tokens=1,
+         )
+         if response:
+             acc = int(response[:1].strip() == task["answer"])
+         else:
+             acc = 0
+     except Exception as e:
+         if "ResponsibleAIPolicyViolation" in str(e):
+             acc = 0
+         else:
+             raise e
      return [
          {
              "model": model,

              "task": "arc",
              "metric": "accuracy",
              "score": acc,
              "sentence_nr": nr,
          }
      ]

      text = item["question"] + "\n\n"
      for i, choice in enumerate(item["choices"]):
          text += f"{letters[i]}: {choice}\n"
+     text += "|".join(letters[: len(item["choices"])]) + "?"
      return text


  async def truthfulqa_and_evaluate(model, language_bcp_47, nr):
+     ds_name, examples, task = load_truthfulqa(language_bcp_47, nr)
      if not task:
          return []
+     task = shuffle_choices_and_labels(task)
+     answer = letters[task["labels"].index(1)]
+     messages = []
+     for example in examples:
+         example = shuffle_choices_and_labels(example)
+         messages += [
+             {"role": "user", "content": format_multiple_choice_truthfulqa(example)},
+             {"role": "assistant", "content": letters[example["labels"].index(1)]},
+         ]
+     messages += [{"role": "user", "content": format_multiple_choice_truthfulqa(task)}]
      try:
+         response = await complete(
+             model=model,
+             messages=messages,
+             temperature=0,
+             max_tokens=1,
+         )
+         if response:
+             acc = int(response[:1].strip() == answer)
+         else:
+             acc = 0
+     except Exception as e:
+         if "ResponsibleAIPolicyViolation" in str(e):
+             acc = 0
+         else:
+             raise e
      return [
          {
              "model": model,

              "task": "truthfulqa",
              "metric": "accuracy",
              "score": acc,
              "sentence_nr": nr,
          }
      ]


  async def mgsm_and_evaluate(model, language_bcp_47, nr):
+     system_prompt = """
+     Solve the math problem. Use reasoning, and finally give the answer as a number.
+     Response format: <reasoning> #### <number>
+     """
+     system_prompt = dedent(system_prompt).strip()
+     ds_slug, question = load_mgsm(language_bcp_47, nr)
      if not question:
          return []
      response = await complete(
          model=model,
+         messages=[
+             {"role": "system", "content": system_prompt},
+             {"role": "user", "content": question["question"]},
+         ],
          temperature=0,
          max_tokens=1024,
      )
+     if response and len(response.split("####")) == 2:
          number = response.split("####")[1].strip()
          accuracy = int(parse_number(number) == parse_number(question["answer_number"]))
      else:
          accuracy = 0

      return [
          {

              "task": "mgsm",
              "metric": "accuracy",
              "score": accuracy,
              "sentence_nr": nr,
          }
      ]

      "translation_from": partial(translate_and_evaluate, mode="from"),
      "translation_to": partial(translate_and_evaluate, mode="to"),
      "classification": classify_and_evaluate,
+     # "mlm": mlm_and_evaluate,
      "mmlu": mmlu_and_evaluate,
      "arc": arc_and_evaluate,
      "truthfulqa": truthfulqa_and_evaluate,
      "mgsm": mgsm_and_evaluate,
+     # "asr": transcribe_and_evaluate,
  }
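
Two idioms recur across the tasks above and are worth seeing in isolation: few-shot prompts are built by alternating user/assistant turns before the real question, and the MGSM answer is recovered by splitting on the `####` delimiter the prompt asks for. A minimal sketch with hypothetical example data:

```python
# few-shot messages: demonstrations first, the real question last
examples = [("2+2?", "4"), ("3*3?", "9")]  # hypothetical demonstrations
messages = []
for question, answer in examples:
    messages += [
        {"role": "user", "content": question},
        {"role": "assistant", "content": answer},
    ]
messages += [{"role": "user", "content": "7*6?"}]  # the actual test item

# answer extraction: accept only responses with exactly one '####' delimiter
response = "7 times 6 is 42 #### 42"  # stand-in for a model response
parts = response.split("####")
answer = parts[1].strip() if len(parts) == 2 else None
print(answer)  # -> "42"
```
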
frontend/src/App.js CHANGED
@@ -19,14 +19,9 @@ function App () {
    const [loading, setLoading] = useState(true)
    const [error, setError] = useState(null)
    const [selectedLanguages, setSelectedLanguages] = useState([])
-   const [machineTranslatedMetrics, setMachineTranslatedMetrics] = useState([])
    const [dialogVisible, setDialogVisible] = useState(false)
    const [aboutVisible, setAboutVisible] = useState(false)
    const [contributeVisible, setContributeVisible] = useState(false)
-
-   // Add state for carousel items
-   const [carouselItems, setCarouselItems] = useState([])
-   const [fullScreenCarouselItems, setFullScreenCarouselItems] = useState([])

    useEffect(() => {
      fetch('/api/data', {
@@ -41,7 +36,6 @@ function App () {
      })
      .then(jsonData => {
        setData(jsonData)
-       setMachineTranslatedMetrics(jsonData.machine_translated_metrics || [])
        setLoading(false)
      })
      .catch(err => {
@@ -50,27 +44,8 @@ function App () {
      })
    }, [selectedLanguages])

-   // Create carousel items when data is loaded
-   useEffect(() => {
-     if (data) {
-       // Add a small delay to ensure components are ready
-       const timer = setTimeout(() => {
-         setCarouselItems([
-           <WorldMap key="worldmap-0" data={data.countries} allLanguages={data.language_table} width={750} height={500} />,
-           <LanguagePlot key="langplot-1" data={data} width={750} height={500} />,
-           <SpeakerPlot key="speakerplot-2" data={data} width={750} height={500} />,
-           <HistoryPlot key="histplot-3" data={data} width={750} height={500} />,
-           <CostPlot key="costplot-4" data={data} width={750} height={500} />
-         ]);
-       }, 100);
-
-       return () => clearTimeout(timer);
-     }
-   }, [data])
-
    const [windowWidth, setWindowWidth] = useState(window.innerWidth)
    const [windowHeight, setWindowHeight] = useState(window.innerHeight)
-
    useEffect(() => {
      const handleResize = () => {
        setWindowWidth(window.innerWidth)
@@ -80,44 +55,6 @@ function App () {
      return () => window.removeEventListener('resize', handleResize)
    }, [])

-   // Create full-screen carousel items when data or window size changes
-   useEffect(() => {
-     if (data) {
-       const timer = setTimeout(() => {
-         setFullScreenCarouselItems([
-           <WorldMap
-             key="fs-worldmap-0"
-             data={data.countries}
-             allLanguages={data.language_table}
-             width={windowWidth * 0.7}
-             height={windowHeight * 0.6}
-           />,
-           <LanguagePlot
-             key="fs-langplot-1"
-             data={data}
-             width={windowWidth * 0.7}
-             height={windowHeight * 0.6}
-           />,
-           <SpeakerPlot
-             key="fs-speakerplot-2"
-             data={data}
-             width={windowWidth * 0.7}
-             height={windowHeight * 0.6}
-           />,
-           <HistoryPlot
-             key="fs-histplot-3"
-             data={data}
-             width={windowWidth * 0.7}
-             height={windowHeight * 0.6}
-           />,
-           <CostPlot key="fs-costplot-4" data={data} width={windowWidth * 0.7} height={windowHeight * 0.6} />
-         ]);
-       }, 100);
-
-       return () => clearTimeout(timer);
-     }
-   }, [data, windowWidth, windowHeight])
-
    return (
      <PrimeReactProvider>
        <div
@@ -132,50 +69,35 @@ function App () {
          style={{
            backgroundColor: '#fff3cd',
            color: '#856404',
-           padding: '1rem 1.5rem',
            marginBottom: '1rem',
            border: '1px solid #ffeeba',
            borderRadius: '0.25rem',
-           textAlign: 'center',
-           lineHeight: '1.5',
-           position: 'relative'
          }}
        >
          <strong>Work in Progress:</strong> This dashboard is currently under
-         active development. Evaluation results are not yet final. Note that the visualised results currently stem from sampling 20 instances per combination of model, task, and language. We have evaluated 139 languages across 41 models and 7 tasks, totaling over 300,000 individual evaluations. Only the top 150 languages by speaker count are included in the current evaluation scope. More extensive evaluation runs will be released later this year.
-       </div>
-       <div
-         style={{
-           display: 'flex',
-           justifyContent: 'flex-end',
-           padding: '0 1.5rem',
-           marginBottom: '1rem'
-         }}
-       >
          <a
            href='https://github.com/datenlabor-bmz/ai-language-monitor'
            target='_blank'
            rel='noopener noreferrer'
            style={{
              textDecoration: 'none',
-             color: '#6c757d',
-             fontSize: '1rem',
-             fontWeight: '500',
-             padding: '0.5rem 1rem',
-             borderRadius: '0.375rem',
-             backgroundColor: '#f8f9fa',
-             border: '1px solid #e9ecef',
-             display: 'flex',
-             alignItems: 'center',
-             gap: '0.5rem',
-             transition: 'all 0.2s ease',
-             ':hover': {
-               backgroundColor: '#e9ecef',
-               color: '#495057'
-             }
            }}
          >
-           <i className='pi pi-github' title='View on GitHub' />
            GitHub
          </a>
        </div>
@@ -227,88 +149,39 @@ function App () {
        <div
          style={{
            display: 'flex',
-           gap: '0.75rem',
-           marginBottom: '2rem',
            flexWrap: 'wrap',
            justifyContent: 'center'
          }}
        >
-         <button
            onClick={() => setAboutVisible(true)}
            style={{
-             background: 'linear-gradient(135deg, #667eea 0%, #764ba2 100%)',
-             color: 'white',
-             border: 'none',
-             padding: '0.75rem 1.5rem',
-             borderRadius: '12px',
-             fontSize: '0.95rem',
-             fontWeight: '500',
-             cursor: 'pointer',
-             display: 'flex',
-             alignItems: 'center',
-             gap: '0.5rem',
-             boxShadow: '0 4px 15px rgba(102, 126, 234, 0.25)',
-             transition: 'all 0.3s ease',
-             ':hover': {
-               transform: 'translateY(-2px)',
-               boxShadow: '0 8px 25px rgba(102, 126, 234, 0.35)'
-             }
-           }}
-           onMouseEnter={(e) => {
-             e.target.style.transform = 'translateY(-2px)';
-             e.target.style.boxShadow = '0 8px 25px rgba(102, 126, 234, 0.35)';
            }}
-           onMouseLeave={(e) => {
-             e.target.style.transform = 'translateY(0)';
-             e.target.style.boxShadow = '0 4px 15px rgba(102, 126, 234, 0.25)';
-           }}
-         >
-           <span style={{ fontSize: '1.1rem' }}>📚</span>
-           About this tool
-         </button>

-         <button
            onClick={() => setContributeVisible(true)}
-           title='This feature is on our roadmap and will be available soon.'
            style={{
-             background: 'linear-gradient(135deg, #ff9a9e 0%, #fecfef 50%, #fecfef 100%)',
-             color: '#6b46c1',
-             border: 'none',
-             padding: '0.75rem 1.5rem',
-             borderRadius: '12px',
-             fontSize: '0.95rem',
-             fontWeight: '500',
-             cursor: 'pointer',
-             display: 'flex',
-             alignItems: 'center',
-             gap: '0.5rem',
-             boxShadow: '0 4px 15px rgba(255, 154, 158, 0.25)',
-             transition: 'all 0.3s ease',
-             position: 'relative',
-             overflow: 'hidden'
            }}
-           onMouseEnter={(e) => {
-             e.target.style.transform = 'translateY(-2px)';
-             e.target.style.boxShadow = '0 8px 25px rgba(255, 154, 158, 0.35)';
-           }}
-           onMouseLeave={(e) => {
-             e.target.style.transform = 'translateY(0)';
-             e.target.style.boxShadow = '0 4px 15px rgba(255, 154, 158, 0.25)';
-           }}
-         >
-           <span style={{ fontSize: '1.1rem' }}>🚀</span>
-           Add your model
-           <span style={{
-             fontSize: '0.75rem',
-             backgroundColor: 'rgba(107, 70, 193, 0.15)',
-             padding: '0.2rem 0.5rem',
-             borderRadius: '6px',
-             marginLeft: '0.5rem',
-             fontWeight: '600'
-           }}>
-             soon
-           </span>
-         </button>
        </div>

        {data && (
@@ -347,7 +220,6 @@ function App () {
            data={data.model_table}
            selectedLanguages={selectedLanguages}
            allLanguages={data.language_table || []}
-           machineTranslatedMetrics={machineTranslatedMetrics}
          />
          <LanguageTable
            data={data.language_table}
@@ -376,18 +248,20 @@ function App () {
            color: '#666'
          }}
        />
-       {carouselItems.length > 0 && (
-         <Carousel
-           key={`main-carousel-${carouselItems.length}-${Date.now()}`}
-           value={carouselItems}
-           numScroll={1}
-           numVisible={1}
-           itemTemplate={item => item}
-           circular={false}
-           activeIndex={0}
-           style={{ width: '100%', minHeight: '650px' }}
-         />
-       )}
        </div>
      </>
    )}
@@ -535,16 +409,36 @@
        modal
        header={null}
      >
-       {fullScreenCarouselItems.length > 0 && (
        <div style={{ width: '100%', height: '100%' }}>
          <Carousel
-           key={`fs-carousel-${fullScreenCarouselItems.length}-${Date.now()}`}
-           value={fullScreenCarouselItems}
            numScroll={1}
            numVisible={1}
            itemTemplate={item => item}
-           circular={false}
-           activeIndex={0}
            style={{ width: '100%', height: 'calc(90vh - 120px)' }}
          />
        </div>
@@ -555,4 +449,4 @@
      )
    }

- export default App
 
    const [loading, setLoading] = useState(true)
    const [error, setError] = useState(null)
    const [selectedLanguages, setSelectedLanguages] = useState([])
    const [dialogVisible, setDialogVisible] = useState(false)
    const [aboutVisible, setAboutVisible] = useState(false)
    const [contributeVisible, setContributeVisible] = useState(false)

    useEffect(() => {
      fetch('/api/data', {

      })
      .then(jsonData => {
        setData(jsonData)
        setLoading(false)
      })
      .catch(err => {

      })
    }, [selectedLanguages])

    const [windowWidth, setWindowWidth] = useState(window.innerWidth)
    const [windowHeight, setWindowHeight] = useState(window.innerHeight)
    useEffect(() => {
      const handleResize = () => {
        setWindowWidth(window.innerWidth)

      return () => window.removeEventListener('resize', handleResize)
    }, [])

    return (
      <PrimeReactProvider>
        <div

          style={{
            backgroundColor: '#fff3cd',
            color: '#856404',
+           padding: '0.75rem 1.25rem',
            marginBottom: '1rem',
            border: '1px solid #ffeeba',
            borderRadius: '0.25rem',
+           textAlign: 'center'
          }}
        >
          <strong>Work in Progress:</strong> This dashboard is currently under
+         active development. Evaluation results are not yet final.
          <a
            href='https://github.com/datenlabor-bmz/ai-language-monitor'
            target='_blank'
            rel='noopener noreferrer'
            style={{
              textDecoration: 'none',
+             color: '#856404',
+             float: 'right',
+             fontSize: '1.2rem',
+             fontWeight: 'bold',
+             padding: '0 0.5rem',
+             borderRadius: '3px',
+             backgroundColor: 'rgba(255,255,255,0.3)'
            }}
          >
+           <i
+             className='pi pi-github'
+             title='View on GitHub'
+             style={{ marginRight: '0.3rem' }}
+           />
            GitHub
          </a>
        </div>

        <div
          style={{
            display: 'flex',
+           gap: '1rem',
+           marginBottom: '1.5rem',
            flexWrap: 'wrap',
            justifyContent: 'center'
          }}
        >
+         <Button
+           label='📚 About this tool'
+           className='p-button-text'
            onClick={() => setAboutVisible(true)}
            style={{
+             color: '#666',
+             border: '1px solid #ddd',
+             padding: '0.5rem 1rem',
+             borderRadius: '4px',
+             fontSize: '0.9rem'
            }}
+         />

+         <Button
+           label='🚀 Add your model (soon)'
+           className='p-button-text'
            onClick={() => setContributeVisible(true)}
+           tooltip='This feature is on our roadmap and will be available soon.'
+           tooltipOptions={{ position: 'bottom' }}
            style={{
+             color: '#666',
+             border: '1px solid #ddd',
+             padding: '0.5rem 1rem',
+             borderRadius: '4px',
+             fontSize: '0.9rem'
            }}
+         />
        </div>

        {data && (

            data={data.model_table}
            selectedLanguages={selectedLanguages}
            allLanguages={data.language_table || []}
          />
          <LanguageTable
            data={data.language_table}

            color: '#666'
          }}
        />
+       <Carousel
+         value={[
+           <WorldMap data={data.countries} />,
+           <LanguagePlot data={data} />,
+           <SpeakerPlot data={data} />,
+           <HistoryPlot data={data} />,
+           <CostPlot data={data} />
+         ]}
+         numScroll={1}
+         numVisible={1}
+         itemTemplate={item => item}
+         circular
+         style={{ width: '100%', minHeight: '650px' }}
+       />
        </div>
      </>
    )}

        modal
        header={null}
      >
+     {data && (
        <div style={{ width: '100%', height: '100%' }}>
          <Carousel
+           value={[
+             <WorldMap
+               data={data.countries}
+               width={windowWidth * 0.7}
+               height={windowHeight * 0.6}
+             />,
+             <LanguagePlot
+               data={data}
+               width={windowWidth * 0.7}
+               height={windowHeight * 0.6}
+             />,
+             <SpeakerPlot
+               data={data}
+               width={windowWidth * 0.7}
+               height={windowHeight * 0.6}
+             />,
+             <HistoryPlot
+               data={data}
+               width={windowWidth * 0.7}
+               height={windowHeight * 0.6}
+             />,
+             <CostPlot data={data} />
+           ]}
            numScroll={1}
            numVisible={1}
            itemTemplate={item => item}
+           circular
            style={{ width: '100%', height: 'calc(90vh - 120px)' }}
          />
        </div>

      )
    }

+ export default App
frontend/src/components/HistoryPlot.js CHANGED
@@ -50,12 +50,12 @@ const HistoryPlot = ({ data, width = 750, height = 500 }) => {
      ...models.filter(d => d.newRecord),
      {
        creation_date: new Date(),
-       maxAverage: models[models.length - 1]?.maxAverage || 0
      }
    ],
    {
      x: d => d.creation_date,
-     y: d => d.maxAverage || 0,
      curve: 'step-after',
      strokeOpacity: 0.3
    }
 
      ...models.filter(d => d.newRecord),
      {
        creation_date: new Date(),
+       maxAverage: models[models.length - 1].maxAverage
      }
    ],
    {
      x: d => d.creation_date,
+     y: d => d.maxAverage,
      curve: 'step-after',
      strokeOpacity: 0.3
    }
frontend/src/components/LanguageTable.js CHANGED
@@ -172,7 +172,7 @@ const LanguageTable = ({ data, selectedLanguages, setSelectedLanguages, totalMod
        filterElement={familyRowFilterTemplate}
        style={{ minWidth: '10rem' }}
      />
-     {ScoreColumns()}
    </DataTable>
  )
  }
 
        filterElement={familyRowFilterTemplate}
        style={{ minWidth: '10rem' }}
      />
+     {ScoreColumns}
    </DataTable>
  )
  }
frontend/src/components/ModelTable.js CHANGED
@@ -6,7 +6,7 @@ import { useState, useEffect } from 'react'
  import Medal from './Medal'
  import { Slider } from 'primereact/slider'
  import ScoreColumns from './ScoreColumns'
- const ModelTable = ({ data, selectedLanguages = [], allLanguages = [], machineTranslatedMetrics = [] }) => {
    const [filters, setFilters] = useState({
      type: { value: null, matchMode: FilterMatchMode.IN },
      size: { value: null, matchMode: FilterMatchMode.BETWEEN },
@@ -50,10 +50,10 @@ const ModelTable = ({ data, selectedLanguages = [], allLanguages = [], machineTr
    }

    const SliderWithLabel = ({ value, onChange, min, max }) => {
-     const p = 10;
-     const start = value === null || value[0] === null ? min : Math.log(value[0]) / Math.log(p);
-     const stop = value === null || value[1] === null ? max : Math.log(value[1]) / Math.log(p);
-     const [_value, _setValue] = useState([start, stop]);
      useEffect(() => {
        const timer = setTimeout(() => {
          onChange({
@@ -61,11 +61,11 @@ const ModelTable = ({ data, selectedLanguages = [], allLanguages = [], machineTr
            // set to "no filter" when (almost) the whole range is selected
            _value[0] <= min + 0.1 && _value[1] >= max - 0.1
              ? null
-             : [p ** _value[0], p ** _value[1]],
-         });
-       }, 1000);
-       return () => clearTimeout(timer);
-     }, [_value, onChange, min, max]);
      return (
        <div style={{ minWidth: '20rem' }}>
          <div>{formatSize(p ** _value[0])}</div>
@@ -147,35 +147,21 @@ const ModelTable = ({ data, selectedLanguages = [], allLanguages = [], machineTr
    }

    const costBodyTemplate = rowData => {
-     return (
-       <div style={{ textAlign: 'center' }}>
-         {rowData.cost === null ? 'n/a' : `$${rowData.cost.toFixed(2)}`}
-       </div>
-     )
    }

    const getHeaderText = () => {
-     // Count languages that have any evaluation data (any task scores available)
-     const evaluatedLanguagesCount = allLanguages.filter(lang => {
-       // Check if language has any task scores (not just average)
-       const hasAnyScores = [
-         'translation_from_bleu',
-         'translation_to_bleu',
-         'classification_accuracy',
-         'mmlu_accuracy',
-         'arc_accuracy',
-         'truthfulqa_accuracy',
-         'mgsm_accuracy'
-       ].some(metric => lang[metric] !== null && lang[metric] !== undefined)
-       return hasAnyScores
-     }).length

      if (selectedLanguages.length === 0) {
        return (
          <span>
            <span style={{ fontWeight: 'bold', fontSize: '1.1em' }}>AI Models</span>
            <span style={{ fontSize: '0.85em', marginLeft: '0.5rem' }}>
-             Performance across {evaluatedLanguagesCount} evaluated languages
            </span>
          </span>
        )
@@ -259,7 +245,7 @@ const ModelTable = ({ data, selectedLanguages = [], allLanguages = [], machineTr
      body={costBodyTemplate}
      style={{ minWidth: '5rem' }}
    />
-   {ScoreColumns(machineTranslatedMetrics)}
  </DataTable>
  )
  }
 
  import Medal from './Medal'
  import { Slider } from 'primereact/slider'
  import ScoreColumns from './ScoreColumns'
+ const ModelTable = ({ data, selectedLanguages = [], allLanguages = [] }) => {
    const [filters, setFilters] = useState({
      type: { value: null, matchMode: FilterMatchMode.IN },
      size: { value: null, matchMode: FilterMatchMode.BETWEEN },

    }

    const SliderWithLabel = ({ value, onChange, min, max }) => {
+     const p = 10
+     const start = value === null ? min : Math.log(value[0]) / Math.log(p)
+     const stop = value === null ? max : Math.log(value[1]) / Math.log(p)
+     const [_value, _setValue] = useState([start, stop])
      useEffect(() => {
        const timer = setTimeout(() => {
          onChange({

            // set to "no filter" when (almost) the whole range is selected
            _value[0] <= min + 0.1 && _value[1] >= max - 0.1
              ? null
+             : [p ** _value[0], p ** _value[1]]
+         })
+       }, 1000)
+       return () => clearTimeout(timer)
+     }, [_value, onChange, min, max])
      return (
        <div style={{ minWidth: '20rem' }}>
          <div>{formatSize(p ** _value[0])}</div>

    }

    const costBodyTemplate = rowData => {
+     return <div style={{ textAlign: 'center' }}>${rowData.cost?.toFixed(2)}</div>
    }

    const getHeaderText = () => {
+     // Count languages that have evaluation data (average score available)
+     const evaluatedLanguagesCount = allLanguages.filter(lang =>
+       lang.average !== null && lang.average !== undefined
+     ).length

      if (selectedLanguages.length === 0) {
        return (
          <span>
            <span style={{ fontWeight: 'bold', fontSize: '1.1em' }}>AI Models</span>
            <span style={{ fontSize: '0.85em', marginLeft: '0.5rem' }}>
+             Average performance across {evaluatedLanguagesCount} evaluated languages
            </span>
          </span>
        )

      body={costBodyTemplate}
      style={{ minWidth: '5rem' }}
    />
+   {ScoreColumns}
  </DataTable>
  )
  }
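
The size slider above maps slider position to model size through a log scale: positions are `log10(size)`, and the filter bounds are recovered with `p ** position`. The round-trip is easy to check outside React; a minimal sketch (plain Python standing in for the JS arithmetic):

```python
import math

p = 10  # log base, as in SliderWithLabel

def to_slider(size: float) -> float:
    """Model size -> slider position on a log10 scale."""
    return math.log(size) / math.log(p)

def from_slider(position: float) -> float:
    """Slider position -> model size."""
    return p ** position

print(to_slider(7_000_000_000))      # ~9.85 for a 7B-parameter model
print(from_slider(to_slider(70e9)))  # ~7e10, i.e. the mapping round-trips
```
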
frontend/src/components/ScoreColumns.js CHANGED
@@ -2,28 +2,21 @@ import { Column } from 'primereact/column'
  import ScoreField from './ScoreField'

  const scoreBodyTemplate = (field, options = {}) => {
-   const { minScore = 0, maxScore = 1, machineTranslatedMetrics = [] } = options

    return rowData => {
      const score = rowData[field]
-     // Prefer per-row flag if present (backend sets `<metric>_is_machine`),
-     // otherwise fall back to global list
-     const rowFlagKey = `${field}_is_machine`
-     const hasRowFlag = Object.prototype.hasOwnProperty.call(rowData, rowFlagKey)
-     const isMachineTranslated = hasRowFlag
-       ? !!rowData[rowFlagKey]
-       : machineTranslatedMetrics.includes(field)
-     return ScoreField(score, minScore, maxScore, isMachineTranslated)
    }
  }

- const ScoreColumns = (machineTranslatedMetrics = []) => [
  <Column
    field='average'
    header='Proficiency'
    headerTooltip='Language Proficiency Score (average of the scores for each task, after min-max normalization)'
    sortable
-   body={scoreBodyTemplate('average', { minScore: 0.2, maxScore: 0.5, machineTranslatedMetrics })}
    style={{ minWidth: '5rem', maxWidth: '10rem' }}
  />,
  <Column
@@ -33,8 +26,7 @@ const ScoreColumns = (machineTranslatedMetrics = []) => [
    sortable
    body={scoreBodyTemplate('translation_from_bleu', {
      minScore: 0,
-     maxScore: 0.5,
-     machineTranslatedMetrics
    })}
    style={{ minWidth: '5rem', maxWidth: '10rem' }}
  />,
@@ -45,8 +37,7 @@ const ScoreColumns = (machineTranslatedMetrics = []) => [
    sortable
    body={scoreBodyTemplate('translation_to_bleu', {
      minScore: 0,
-     maxScore: 0.5,
-     machineTranslatedMetrics
    })}
    style={{ minWidth: '5rem', maxWidth: '10rem' }}
  />,
@@ -57,8 +48,7 @@ const ScoreColumns = (machineTranslatedMetrics = []) => [
    sortable
    body={scoreBodyTemplate('classification_accuracy', {
      minScore: 0,
-     maxScore: 0.5,
-     machineTranslatedMetrics
    })}
    style={{ minWidth: '5rem', maxWidth: '10rem' }}
  />,
@@ -79,8 +69,7 @@ const ScoreColumns = (machineTranslatedMetrics = []) => [
    sortable
    body={scoreBodyTemplate('mmlu_accuracy', {
      minScore: 0,
-     maxScore: 1,
-     machineTranslatedMetrics
    })}
    style={{ minWidth: '5rem', maxWidth: '10rem' }}
  />,
@@ -91,8 +80,7 @@ const ScoreColumns = (machineTranslatedMetrics = []) => [
    sortable
    body={scoreBodyTemplate('arc_accuracy', {
      minScore: 0,
-     maxScore: 1,
-     machineTranslatedMetrics
    })}
    style={{ minWidth: '5rem', maxWidth: '10rem' }}
  />,
@@ -103,8 +91,7 @@ const ScoreColumns = (machineTranslatedMetrics = []) => [
    sortable
    body={scoreBodyTemplate('mgsm_accuracy', {
      minScore: 0,
-     maxScore: 1,
-     machineTranslatedMetrics
    })}
    style={{ minWidth: '5rem', maxWidth: '10rem' }}
  />,
 
  import ScoreField from './ScoreField'

  const scoreBodyTemplate = (field, options = {}) => {
+   const { minScore = 0, maxScore = 1 } = options

    return rowData => {
      const score = rowData[field]
+     return ScoreField(score, minScore, maxScore)
    }
  }

+ const ScoreColumns = [
  <Column
    field='average'
    header='Proficiency'
    headerTooltip='Language Proficiency Score (average of the scores for each task, after min-max normalization)'
    sortable
+   body={scoreBodyTemplate('average', { minScore: 0.2, maxScore: 0.5 })}
    style={{ minWidth: '5rem', maxWidth: '10rem' }}
  />,
  <Column

    sortable
    body={scoreBodyTemplate('translation_from_bleu', {
      minScore: 0,
+     maxScore: 0.5
    })}
    style={{ minWidth: '5rem', maxWidth: '10rem' }}
  />,

    sortable
    body={scoreBodyTemplate('translation_to_bleu', {
      minScore: 0,
+     maxScore: 0.5
    })}
    style={{ minWidth: '5rem', maxWidth: '10rem' }}
  />,

    sortable
    body={scoreBodyTemplate('classification_accuracy', {
      minScore: 0,
+     maxScore: 0.5
    })}
    style={{ minWidth: '5rem', maxWidth: '10rem' }}
  />,

    sortable
    body={scoreBodyTemplate('mmlu_accuracy', {
      minScore: 0,
+     maxScore: 1
    })}
    style={{ minWidth: '5rem', maxWidth: '10rem' }}
  />,

    sortable
    body={scoreBodyTemplate('arc_accuracy', {
      minScore: 0,
+     maxScore: 1
    })}
    style={{ minWidth: '5rem', maxWidth: '10rem' }}
  />,

    sortable
    body={scoreBodyTemplate('mgsm_accuracy', {
      minScore: 0,
+     maxScore: 1
    })}
    style={{ minWidth: '5rem', maxWidth: '10rem' }}
  />,
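
The "Proficiency" tooltip above mentions min-max normalization: each task score is rescaled into [0, 1] using per-metric bounds (the `minScore`/`maxScore` values visible in the columns) before averaging, so metrics with different natural ranges contribute comparably. A sketch of that rescaling in Python (the clamping of out-of-range values is an assumption, not confirmed by the source):

```python
def min_max(score: float, lo: float, hi: float) -> float:
    """Rescale score from [lo, hi] to [0, 1], clamping out-of-range values."""
    return max(0.0, min(1.0, (score - lo) / (hi - lo)))

# BLEU-like metrics use bounds (0, 0.5); accuracy-like metrics use (0, 1)
print(min_max(0.35, 0.2, 0.5))  # 0.5 -> mid-range proficiency
print(min_max(0.8, 0.0, 1.0))   # 0.8 -> unchanged under accuracy bounds
```
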
frontend/src/components/ScoreField.js CHANGED
@@ -1,4 +1,4 @@
- const ScoreField = (score, minScore, maxScore, isMachineTranslated = false) => {
    let percentage = 100
    let barColor = "rgba(210, 106, 255, 0.1)" // light violet for missing data
    if (score !== null) {
@@ -50,7 +50,6 @@ const ScoreField = (score, minScore, maxScore, isMachineTranslated = false) => {
      }}
    >
      {score !== null ? (score * 100).toFixed(1)+"%" : '–'}
-     {isMachineTranslated && score !== null && <span style={{color: '#666', fontSize: '0.8em'}}>*</span>}
    </span>
  </div>
  )
 
1
+ const ScoreField = (score, minScore, maxScore) => {
2
  let percentage = 100
3
  let barColor = "rgba(210, 106, 255, 0.1)" // light violet for missing data
4
  if (score !== null) {
 
50
  }}
51
  >
52
  {score !== null ? (score * 100).toFixed(1)+"%" : '–'}
 
53
  </span>
54
  </div>
55
  )
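
With the `isMachineTranslated` flag gone, `ScoreField` is back to a three-argument call, and missing data is still signalled by a `null` score (rendered as '–' over the light-violet bar). A hedged usage sketch; the row shapes are hypothetical:

```js
// Hypothetical rows: one real score and one language with no MMLU result.
const scored = { mmlu_accuracy: 0.62 }
const missing = { mmlu_accuracy: null }

const cellA = ScoreField(scored.mmlu_accuracy, 0, 1)  // renders "62.0%"
const cellB = ScoreField(missing.mmlu_accuracy, 0, 1) // renders "–"
```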
frontend/src/components/SpeakerPlot.js CHANGED
@@ -73,10 +73,10 @@ const SpeakerPlot = ({ data, width = 750, height = 500 }) => {
  textStrokeOpacity: 0,
  textFillOpacity: 0
  }),
- ...(languages.length >= 40 ? [Plot.tip(['The 40 most spoken languages cover 80% of all speakers.'], {
+ Plot.tip(['The 40 most spoken languages cover 80% of all speakers.'], {
  x: 40,
  y: languages[39].cumSpeakers / 1e6
- })] : [])
+ })
  ]
  })
  containerRef.current.append(plot)
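
One behavioral nuance of this simplification: the removed spread guarded against short datasets, while the new unconditional `Plot.tip` reads `languages[39].cumSpeakers` and would throw a TypeError if fewer than 40 languages were plotted. That is presumably safe for this dataset; a defensive variant mirroring the removed guard would look like:

```js
// Only attach the annotation when the 40th language actually exists.
const tipMark = languages.length >= 40
  ? [Plot.tip(['The 40 most spoken languages cover 80% of all speakers.'], {
      x: 40,
      y: languages[39].cumSpeakers / 1e6
    })]
  : []
```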
frontend/src/components/WorldMap.js CHANGED
@@ -26,13 +26,13 @@ const makeTitle = data => d => {
  a =>
  `${smoothProgressBar(a.population / pop)} ${
  a.name
- } – ${a.score === null || a.score === undefined ? "n/a" : a.score.toFixed(2)}`
+ } – ${a.score.toFixed(2)}`
  )
  .join('\n\n') + (languages?.length > 10 ? `\n\n...` : '')
- return `${d.properties.ADMIN} – ${cData?.score === null || cData?.score === undefined ? "n/a" : cData.score.toFixed(2)}\n\n${langstring}`
+ return `${d.properties.ADMIN} – ${cData?.score.toFixed(2)}\n\n${langstring}`
  }

- const WorldMap = ({ data, width = 750, height = 500, allLanguages = [] }) => {
+ const WorldMap = ({ data, width = 750, height = 500 }) => {
  const containerRef = useRef()
  const [mapData, setMapData] = useState()

@@ -48,22 +48,8 @@ const WorldMap = ({ data, width = 750, height = 500, allLanguages = [] }) => {
  acc[country.iso2] = country
  return acc
  }, {})
- // Count languages that have any evaluation data
- const evaluatedLanguagesCount = allLanguages.filter(lang => {
- const hasAnyScores = [
- 'translation_from_bleu',
- 'translation_to_bleu',
- 'classification_accuracy',
- 'mmlu_accuracy',
- 'arc_accuracy',
- 'truthfulqa_accuracy',
- 'mgsm_accuracy'
- ].some(metric => lang[metric] !== null && lang[metric] !== undefined)
- return hasAnyScores
- }).length
-
  const plot = Plot.plot({
- subtitle: `Language Proficiency Score by Country (Coverage: ~${evaluatedLanguagesCount} languages evaluated)`,
+ subtitle: 'Language Proficiency Score by Country',
  width: width,
  height: height,
  projection: 'equal-earth',
@@ -75,12 +61,11 @@ const WorldMap = ({ data, width = 750, height = 500, allLanguages = [] }) => {
  })
  ],
  color: {
- scheme: 'RdYlGn',
- unknown: '#d0d0d0',
+ scheme: 'Greens',
+ unknown: 'gray',
  label: 'Score',
  legend: true,
- domain: [0, 1],
- pivot: 0.5
+ domain: [0, 1]
  },
  style: {
  fontFamily: 'monospace'
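
Worth noting in the simplified tooltip code: optional chaining in `cData?.score.toFixed(2)` only guards `cData` itself, not `score`, so a country entry whose score is `null` would still throw; the removed version spelled out those null checks and fell back to "n/a". An illustrative safe fallback (not part of this commit):

```js
// Guard both the country entry and its score; fall back to "n/a" as before.
const label = cData?.score?.toFixed(2) ?? 'n/a'
```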
languages.json CHANGED
@@ -7,7 +7,7 @@
  "family":"Indo-European",
  "flores_path":"eng_Latn",
  "fleurs_tag":"en_us",
- "commonvoice_hours":2683.0,
+ "commonvoice_hours":2674.0,
  "commonvoice_locale":"en",
  "in_benchmark":true
  },
@@ -32,7 +32,7 @@
  "flores_path":"hin_Deva",
  "fleurs_tag":"hi_in",
  "commonvoice_hours":16.0,
- "commonvoice_locale":"hi",
+ "commonvoice_locale":"hi-IN",
  "in_benchmark":true
  },
  {
@@ -43,7 +43,7 @@
  "family":"Indo-European",
  "flores_path":"spa_Latn",
  "fleurs_tag":"es_419",
- "commonvoice_hours":449.0,
+ "commonvoice_hours":448.0,
  "commonvoice_locale":"es",
  "in_benchmark":true
  },
@@ -79,7 +79,7 @@
  "family":"Indo-European",
  "flores_path":"fra_Latn",
  "fleurs_tag":"fr_fr",
- "commonvoice_hours":1073.0,
+ "commonvoice_hours":1065.0,
  "commonvoice_locale":"fr",
  "in_benchmark":true
  },
@@ -103,7 +103,7 @@
  "family":"Indo-European",
  "flores_path":"por_Latn",
  "fleurs_tag":"pt_br",
- "commonvoice_hours":181.0,
+ "commonvoice_hours":180.0,
  "commonvoice_locale":"pt",
  "in_benchmark":true
  },
@@ -115,7 +115,7 @@
  "family":"Indo-European",
  "flores_path":"pan_Guru",
  "fleurs_tag":"pa_in",
- "commonvoice_hours":2.5,
+ "commonvoice_hours":2.3,
  "commonvoice_locale":"pa-IN",
  "in_benchmark":true
  },
@@ -127,7 +127,7 @@
  "family":"Indo-European",
  "flores_path":"rus_Cyrl",
  "fleurs_tag":"ru_ru",
- "commonvoice_hours":247.0,
+ "commonvoice_hours":245.0,
  "commonvoice_locale":"ru",
  "in_benchmark":true
  },
@@ -139,7 +139,7 @@
  "family":"Atlantic-Congo",
  "flores_path":"swh_Latn",
  "fleurs_tag":"sw_ke",
- "commonvoice_hours":412.0,
+ "commonvoice_hours":411.0,
  "commonvoice_locale":"sw",
  "in_benchmark":true
  },
@@ -151,7 +151,7 @@
  "family":"Austronesian",
  "flores_path":"ind_Latn",
  "fleurs_tag":"id_id",
- "commonvoice_hours":34.0,
+ "commonvoice_hours":33.0,
  "commonvoice_locale":"id",
  "in_benchmark":true
  },
@@ -163,7 +163,7 @@
  "family":"Indo-European",
  "flores_path":"deu_Latn",
  "fleurs_tag":"de_de",
- "commonvoice_hours":1372.0,
+ "commonvoice_hours":1369.0,
  "commonvoice_locale":"de",
  "in_benchmark":true
  },
@@ -379,7 +379,7 @@
  "family":"Indo-European",
  "flores_path":null,
  "fleurs_tag":"ps_af",
- "commonvoice_hours":82.0,
+ "commonvoice_hours":81.0,
  "commonvoice_locale":"ps",
  "in_benchmark":false
  },
@@ -439,7 +439,7 @@
  "family":"Indo-European",
  "flores_path":"pol_Latn",
  "fleurs_tag":"pl_pl",
- "commonvoice_hours":176.0,
+ "commonvoice_hours":175.0,
  "commonvoice_locale":"pl",
  "in_benchmark":true
  },
@@ -619,7 +619,7 @@
  "family":"Indo-European",
  "flores_path":"nld_Latn",
  "fleurs_tag":"nl_nl",
- "commonvoice_hours":123.0,
+ "commonvoice_hours":120.0,
  "commonvoice_locale":"nl",
  "in_benchmark":true
  },
@@ -655,7 +655,7 @@
  "family":"Atlantic-Congo",
  "flores_path":"yor_Latn",
  "fleurs_tag":"yo_ng",
- "commonvoice_hours":6.4,
+ "commonvoice_hours":6.3,
  "commonvoice_locale":"yo",
  "in_benchmark":true
  },
@@ -979,7 +979,7 @@
  "family":"Turkic",
  "flores_path":"kaz_Cyrl",
  "fleurs_tag":"kk_kz",
- "commonvoice_hours":2.3,
+ "commonvoice_hours":2.2,
  "commonvoice_locale":"kk",
  "in_benchmark":true
  },
@@ -1027,7 +1027,7 @@
  "family":"Uralic",
  "flores_path":"hun_Latn",
  "fleurs_tag":"hu_hu",
- "commonvoice_hours":94.0,
+ "commonvoice_hours":93.0,
  "commonvoice_locale":"hu",
  "in_benchmark":true
  },
@@ -1099,7 +1099,7 @@
  "family":"Indo-European",
  "flores_path":"ckb_Arab",
  "fleurs_tag":"ckb_iq",
- "commonvoice_hours":136.0,
+ "commonvoice_hours":135.0,
  "commonvoice_locale":"ckb",
  "in_benchmark":true
  },
@@ -1183,7 +1183,7 @@
  "family":"Indo-European",
  "flores_path":"bel_Cyrl",
  "fleurs_tag":"be_by",
- "commonvoice_hours":1812.0,
+ "commonvoice_hours":1810.0,
  "commonvoice_locale":"be",
  "in_benchmark":true
  },
@@ -1207,7 +1207,7 @@
  "family":"Indo-European",
  "flores_path":"tgk_Cyrl",
  "fleurs_tag":"tg_tj",
- "commonvoice_hours":0.6,
+ "commonvoice_hours":0.4,
  "commonvoice_locale":"tg",
  "in_benchmark":true
  },
@@ -1243,7 +1243,7 @@
  "family":"Indo-European",
  "flores_path":"afr_Latn",
  "fleurs_tag":"af_za",
- "commonvoice_hours":0.6,
+ "commonvoice_hours":0.5,
  "commonvoice_locale":"af",
  "in_benchmark":true
  },
@@ -1291,7 +1291,7 @@
  "family":"Indo-European",
  "flores_path":"cat_Latn",
  "fleurs_tag":"ca_es",
- "commonvoice_hours":2884.0,
+ "commonvoice_hours":2863.0,
  "commonvoice_locale":"ca",
  "in_benchmark":true
  },
@@ -1303,7 +1303,7 @@
  "family":"Afro-Asiatic",
  "flores_path":"heb_Hebr",
  "fleurs_tag":"he_il",
- "commonvoice_hours":2.0,
+ "commonvoice_hours":1.4,
  "commonvoice_locale":"he",
  "in_benchmark":true
  },
@@ -1375,7 +1375,7 @@
  "family":"Turkic",
  "flores_path":"uig_Arab",
  "fleurs_tag":null,
- "commonvoice_hours":437.0,
+ "commonvoice_hours":411.0,
  "commonvoice_locale":"ug",
  "in_benchmark":true
  },
@@ -1519,7 +1519,7 @@
  "family":"Indo-European",
  "flores_path":"kmr_Latn",
  "fleurs_tag":null,
- "commonvoice_hours":71.0,
+ "commonvoice_hours":69.0,
  "commonvoice_locale":"kmr",
  "in_benchmark":true
  },
@@ -1555,7 +1555,7 @@
  "family":"Indo-European",
  "flores_path":"slk_Latn",
  "fleurs_tag":"sk_sk",
- "commonvoice_hours":52.0,
+ "commonvoice_hours":51.0,
  "commonvoice_locale":"sk",
  "in_benchmark":true
  },
@@ -1675,7 +1675,7 @@
  "family":"Tupian",
  "flores_path":"gug_Latn",
  "fleurs_tag":null,
- "commonvoice_hours":4.5,
+ "commonvoice_hours":4.0,
  "commonvoice_locale":"gn",
  "in_benchmark":true
  },
@@ -1747,7 +1747,7 @@
  "family":"Indo-European",
  "flores_path":"nob_Latn",
  "fleurs_tag":"nb_no",
- "commonvoice_hours":1.8,
+ "commonvoice_hours":0.5,
  "commonvoice_locale":"nb-NO",
  "in_benchmark":true
  },
@@ -2155,7 +2155,7 @@
  "family":"Kartvelian",
  "flores_path":"kat_Geor",
  "fleurs_tag":"ka_ge",
- "commonvoice_hours":167.0,
+ "commonvoice_hours":166.0,
  "commonvoice_locale":"ka",
  "in_benchmark":true
  },
@@ -2167,7 +2167,7 @@
  "family":"Indo-European",
  "flores_path":"glg_Latn",
  "fleurs_tag":"gl_es",
- "commonvoice_hours":166.0,
+ "commonvoice_hours":117.0,
  "commonvoice_locale":"gl",
  "in_benchmark":true
  },
@@ -2323,7 +2323,7 @@
  "family":"Dravidian",
  "flores_path":null,
  "fleurs_tag":null,
- "commonvoice_hours":11.0,
+ "commonvoice_hours":1.2,
  "commonvoice_locale":"brh",
  "in_benchmark":false
  },
@@ -2623,7 +2623,7 @@
  "family":"Indo-European",
  "flores_path":null,
  "fleurs_tag":null,
- "commonvoice_hours":11.0,
+ "commonvoice_hours":0.9,
  "commonvoice_locale":"haz",
  "in_benchmark":false
  },
@@ -2695,7 +2695,7 @@
  "family":"Indo-European",
  "flores_path":"oci_Latn",
  "fleurs_tag":"oc_fr",
- "commonvoice_hours":1.9,
+ "commonvoice_hours":1.8,
  "commonvoice_locale":"oc",
  "in_benchmark":true
  },
@@ -3175,8 +3175,8 @@
  "family":"Atlantic-Congo",
  "flores_path":null,
  "fleurs_tag":null,
- "commonvoice_hours":0.0,
- "commonvoice_locale":"seh",
+ "commonvoice_hours":null,
+ "commonvoice_locale":null,
  "in_benchmark":false
  },
  {
@@ -3319,8 +3319,8 @@
  "family":"Indo-European",
  "flores_path":null,
  "fleurs_tag":null,
- "commonvoice_hours":0.0,
- "commonvoice_locale":"mfe",
+ "commonvoice_hours":null,
+ "commonvoice_locale":null,
  "in_benchmark":false
  },
  {
@@ -3331,7 +3331,7 @@
  "family":"Indo-European",
  "flores_path":"gle_Latn",
  "fleurs_tag":"ga_ie",
- "commonvoice_hours":9.3,
+ "commonvoice_hours":8.3,
  "commonvoice_locale":"ga-IE",
  "in_benchmark":true
  },
@@ -3487,7 +3487,7 @@
  "family":"Indo-European",
  "flores_path":"lvs_Latn",
  "fleurs_tag":"lv_lv",
- "commonvoice_hours":263.0,
+ "commonvoice_hours":262.0,
  "commonvoice_locale":"lv",
  "in_benchmark":true
  },
@@ -3535,7 +3535,7 @@
  "family":null,
  "flores_path":"eus_Latn",
  "fleurs_tag":null,
- "commonvoice_hours":453.0,
+ "commonvoice_hours":440.0,
  "commonvoice_locale":"eu",
  "in_benchmark":true
  },
@@ -3559,7 +3559,7 @@
  "family":"Abkhaz-Adyge",
  "flores_path":null,
  "fleurs_tag":null,
- "commonvoice_hours":108.0,
+ "commonvoice_hours":83.0,
  "commonvoice_locale":"kbd",
  "in_benchmark":false
  },
@@ -3679,7 +3679,7 @@
  "family":"Indo-European",
  "flores_path":"ydd_Hebr",
  "fleurs_tag":null,
- "commonvoice_hours":1.8,
+ "commonvoice_hours":0.7,
  "commonvoice_locale":"yi",
  "in_benchmark":true
  },
@@ -3991,8 +3991,8 @@
  "family":"Atlantic-Congo",
  "flores_path":null,
  "fleurs_tag":null,
- "commonvoice_hours":0.0,
- "commonvoice_locale":"gaa",
+ "commonvoice_hours":null,
+ "commonvoice_locale":null,
  "in_benchmark":false
  },
  {
@@ -4099,8 +4099,8 @@
  "family":"Indo-European",
  "flores_path":null,
  "fleurs_tag":null,
- "commonvoice_hours":0.0,
- "commonvoice_locale":"pcd",
+ "commonvoice_hours":null,
+ "commonvoice_locale":null,
  "in_benchmark":false
  },
  {
@@ -4351,7 +4351,7 @@
  "family":"Indo-European",
  "flores_path":null,
  "fleurs_tag":null,
- "commonvoice_hours":30.0,
+ "commonvoice_hours":29.0,
  "commonvoice_locale":"br",
  "in_benchmark":false
  },
@@ -4651,7 +4651,7 @@
  "family":"Abkhaz-Adyge",
  "flores_path":null,
  "fleurs_tag":null,
- "commonvoice_hours":32.0,
+ "commonvoice_hours":30.0,
  "commonvoice_locale":"ady",
  "in_benchmark":false
  },
@@ -5011,7 +5011,7 @@
  "family":"Nakh-Daghestanian",
  "flores_path":"dar_Cyrl",
  "fleurs_tag":null,
- "commonvoice_hours":1.3,
+ "commonvoice_hours":0.0,
  "commonvoice_locale":"dar",
  "in_benchmark":true
  },
@@ -7879,7 +7879,7 @@
  "family":"Artificial Language",
  "flores_path":"epo_Latn",
  "fleurs_tag":null,
- "commonvoice_hours":1437.0,
+ "commonvoice_hours":1436.0,
  "commonvoice_locale":"eo",
  "in_benchmark":true
  },
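
All languages.json records carry the same fields touched in these hunks (`family`, `flores_path`, `fleurs_tag`, `commonvoice_hours`, `commonvoice_locale`, `in_benchmark`), and this update switches absent Common Voice coverage from `0.0` hours with a locale code to `null`/`null` (see the seh, mfe, gaa, and pcd entries above). An illustrative consumer-side check; the helper name is hypothetical:

```js
// A language counts as having Common Voice data only if both fields are set;
// entries like seh/mfe no longer masquerade as empty (0.0-hour) corpora.
const hasCommonVoice = lang =>
  lang.commonvoice_locale !== null && lang.commonvoice_hours !== null
```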
models.json CHANGED
@@ -20,15 +20,15 @@
  ]
  },
  {
- "id":"anthropic\/claude-3.7-sonnet",
- "name":"Claude 3.7 Sonnet",
+ "id":"anthropic\/claude-3.5-sonnet",
+ "name":"Claude 3.5 Sonnet",
  "provider_name":"Anthropic",
  "cost":15.0,
  "hf_id":null,
  "size":null,
  "type":"closed-source",
  "license":null,
- "creation_date":1740355200000,
+ "creation_date":1729555200000,
  "tasks":[
  "translation_from",
  "translation_to",
@@ -40,15 +40,15 @@
  ]
  },
  {
- "id":"anthropic\/claude-sonnet-4",
- "name":"Claude Sonnet 4",
+ "id":"anthropic\/claude-3.7-sonnet",
+ "name":"Claude 3.7 Sonnet",
  "provider_name":"Anthropic",
  "cost":15.0,
  "hf_id":null,
  "size":null,
  "type":"closed-source",
  "license":null,
- "creation_date":1747872000000,
+ "creation_date":1740355200000,
  "tasks":[
  "translation_from",
  "translation_to",
@@ -60,15 +60,15 @@
  ]
  },
  {
- "id":"cohere\/command-r-plus-04-2024",
- "name":"Command R+ (04-2024)",
- "provider_name":"Cohere",
+ "id":"anthropic\/claude-sonnet-4",
+ "name":"Claude Sonnet 4",
+ "provider_name":"Anthropic",
  "cost":15.0,
  "hf_id":null,
  "size":null,
  "type":"closed-source",
  "license":null,
- "creation_date":1712016000000,
+ "creation_date":1747872000000,
  "tasks":[
  "translation_from",
  "translation_to",
@@ -83,7 +83,7 @@
  "id":"deepseek\/deepseek-chat",
  "name":"DeepSeek V3",
  "provider_name":"DeepSeek",
- "cost":0.8,
+ "cost":0.0,
  "hf_id":"deepseek-ai\/DeepSeek-V3",
  "size":684531386000.0,
  "type":"open-source",
@@ -120,15 +120,35 @@
  ]
  },
  {
- "id":"deepseek\/deepseek-chat-v3.1",
- "name":"DeepSeek V3.1",
+ "id":"deepseek\/deepseek-r1",
+ "name":"R1",
  "provider_name":"DeepSeek",
  "cost":0.0,
- "hf_id":"deepseek-ai\/DeepSeek-V3.1",
+ "hf_id":"deepseek-ai\/DeepSeek-R1",
  "size":684531386000.0,
  "type":"open-source",
  "license":"Mit",
- "creation_date":1755734400000,
+ "creation_date":1737331200000,
+ "tasks":[
+ "translation_from",
+ "translation_to",
+ "classification",
+ "mmlu",
+ "arc",
+ "truthfulqa",
+ "mgsm"
+ ]
+ },
+ {
+ "id":"deepseek\/deepseek-r1-0528",
+ "name":"R1 0528",
+ "provider_name":"DeepSeek",
+ "cost":0.0,
+ "hf_id":"deepseek-ai\/DeepSeek-R1-0528",
+ "size":684531386000.0,
+ "type":"open-source",
+ "license":"Mit",
+ "creation_date":1748390400000.0,
  "tasks":[
  "translation_from",
  "translation_to",
@@ -200,15 +220,145 @@
  ]
  },
  {
- "id":"google\/gemma-3-12b-it",
- "name":"Gemma 3 12B",
+ "id":"google\/gemini-2.5-flash-lite-preview-06-17",
+ "name":"Gemini 2.5 Flash Lite Preview 06-17",
  "provider_name":"Google",
- "cost":0.0,
- "hf_id":"google\/gemma-3-12b-it",
- "size":12187325040.0,
- "type":"open-source",
- "license":"Gemma",
- "creation_date":1740787200000,
+ "cost":0.4,
+ "hf_id":null,
+ "size":null,
+ "type":"closed-source",
+ "license":null,
+ "creation_date":1750118400000.0,
+ "tasks":[
+ "translation_from",
+ "translation_to",
+ "classification",
+ "mmlu",
+ "mgsm"
+ ]
+ },
+ {
+ "id":"google\/gemini-2.5-flash-preview",
+ "name":"Gemini 2.5 Flash Preview 04-17",
+ "provider_name":"Google",
+ "cost":0.6,
+ "hf_id":null,
+ "size":null,
+ "type":"closed-source",
+ "license":null,
+ "creation_date":1744848000000.0,
+ "tasks":[
+ "translation_from",
+ "translation_to",
+ "classification",
+ "mmlu",
+ "mgsm"
+ ]
+ },
+ {
+ "id":"google\/gemini-2.5-flash-preview-05-20",
+ "name":"Gemini 2.5 Flash Preview 05-20",
+ "provider_name":"Google",
+ "cost":0.6,
+ "hf_id":null,
+ "size":null,
+ "type":"closed-source",
+ "license":null,
+ "creation_date":1747699200000.0,
+ "tasks":[
+ "translation_from",
+ "translation_to",
+ "classification",
+ "mmlu",
+ "mgsm"
+ ]
+ },
+ {
+ "id":"google\/gemini-2.5-pro",
+ "name":"Gemini 2.5 Pro",
+ "provider_name":"Google",
+ "cost":10.0,
+ "hf_id":null,
+ "size":null,
+ "type":"closed-source",
+ "license":null,
+ "creation_date":1750118400000,
+ "tasks":[
+ "translation_from",
+ "translation_to",
+ "classification",
+ "mmlu",
+ "arc",
+ "truthfulqa",
+ "mgsm"
+ ]
+ },
+ {
+ "id":"google\/gemini-2.5-pro-preview",
+ "name":"Gemini 2.5 Pro Preview 06-05",
+ "provider_name":"Google",
+ "cost":10.0,
+ "hf_id":null,
+ "size":null,
+ "type":"closed-source",
+ "license":null,
+ "creation_date":1749081600000.0,
+ "tasks":[
+ "translation_from",
+ "translation_to",
+ "classification",
+ "mmlu",
+ "mgsm"
+ ]
+ },
+ {
+ "id":"google\/gemini-2.5-pro-preview-05-06",
+ "name":"Gemini 2.5 Pro Preview 05-06",
+ "provider_name":"Google",
+ "cost":10.0,
+ "hf_id":null,
+ "size":null,
+ "type":"closed-source",
+ "license":null,
+ "creation_date":1746576000000.0,
+ "tasks":[
+ "translation_from",
+ "translation_to",
+ "classification",
+ "mmlu",
+ "mgsm"
+ ]
+ },
+ {
+ "id":"google\/gemini-flash-1.5",
+ "name":"Gemini 1.5 Flash ",
+ "provider_name":"Google",
+ "cost":0.3,
+ "hf_id":null,
+ "size":null,
+ "type":"closed-source",
+ "license":null,
+ "creation_date":1715644800000,
+ "tasks":[
+ "translation_from",
+ "translation_to",
+ "classification",
+ "mmlu",
+ "arc",
+ "truthfulqa",
+ "mgsm"
+ ]
+ },
+ {
+ "id":"google\/gemini-flash-1.5-8b",
+ "name":"Gemini 1.5 Flash 8B",
+ "provider_name":"Google",
+ "cost":0.15,
+ "hf_id":null,
+ "size":null,
+ "type":"closed-source",
+ "license":null,
+ "creation_date":1727913600000,
  "tasks":[
  "translation_from",
  "translation_to",
@@ -240,15 +390,30 @@
  ]
  },
  {
- "id":"meta-llama\/llama-3-70b-instruct",
- "name":"Llama 3 70B Instruct",
- "provider_name":"Meta",
- "cost":0.4,
- "hf_id":"meta-llama\/Meta-Llama-3-70B-Instruct",
- "size":70553706496.0,
+ "id":"google\/translate-v2",
+ "name":"Google Translate",
+ "provider_name":"Google",
+ "cost":20.0,
+ "hf_id":null,
+ "size":null,
+ "type":"closed-source",
+ "license":null,
+ "creation_date":null,
+ "tasks":[
+ "translation_from",
+ "translation_to"
+ ]
+ },
+ {
+ "id":"gryphe\/mythomax-l2-13b",
+ "name":"MythoMax 13B",
+ "provider_name":"MythoMax 13B",
+ "cost":0.07,
+ "hf_id":"Gryphe\/MythoMax-L2-13b",
+ "size":null,
  "type":"open-source",
- "license":"Llama3",
- "creation_date":1713312000000,
+ "license":"Other",
+ "creation_date":1691625600000,
  "tasks":[
  "translation_from",
  "translation_to",
@@ -260,12 +425,12 @@
  ]
  },
  {
- "id":"meta-llama\/llama-3-8b-instruct",
- "name":"Llama 3 8B Instruct",
+ "id":"meta-llama\/llama-3-70b-instruct",
+ "name":"Llama 3 70B Instruct",
  "provider_name":"Meta",
- "cost":0.06,
- "hf_id":"meta-llama\/Meta-Llama-3-8B-Instruct",
- "size":8030261248.0,
+ "cost":0.4,
+ "hf_id":"meta-llama\/Meta-Llama-3-70B-Instruct",
+ "size":70553706496.0,
  "type":"open-source",
  "license":"Llama3",
  "creation_date":1713312000000,
@@ -299,6 +464,30 @@
  "mgsm"
  ]
  },
+ {
+ "id":"meta-llama\/llama-3.1-8b-instruct",
+ "name":"Llama 3.1 8B Instruct",
+ "provider_name":"Meta",
+ "cost":0.0,
+ "hf_id":"meta-llama\/Llama-3.1-8B-Instruct",
+ "size":8030261248.0,
+ "type":"open-source",
+ "license":"Llama3.1",
+ "creation_date":1721260800000.0,
+ "tasks":null
+ },
+ {
+ "id":"meta-llama\/llama-3.2-1b-instruct",
+ "name":"Llama 3.2 1B Instruct",
+ "provider_name":"Meta",
+ "cost":0.0,
+ "hf_id":"meta-llama\/Llama-3.2-1B-Instruct",
+ "size":1235814400.0,
+ "type":"open-source",
+ "license":"Llama3.2",
+ "creation_date":1726617600000.0,
+ "tasks":null
+ },
  {
  "id":"meta-llama\/llama-3.3-70b-instruct",
  "name":"Llama 3.3 70B Instruct",
@@ -339,26 +528,6 @@
  "mgsm"
  ]
  },
- {
- "id":"meta-llama\/llama-guard-3-8b",
- "name":"Llama Guard 3 8B",
- "provider_name":"Llama Guard 3 8B",
- "cost":0.06,
- "hf_id":"meta-llama\/Llama-Guard-3-8B",
- "size":8030261248.0,
- "type":"open-source",
- "license":"Llama3.1",
- "creation_date":1721606400000,
- "tasks":[
- "translation_from",
- "translation_to",
- "classification",
- "mmlu",
- "arc",
- "truthfulqa",
- "mgsm"
- ]
- },
  {
  "id":"microsoft\/phi-4",
  "name":"Phi 4",
@@ -399,26 +568,6 @@
  "mgsm"
  ]
  },
- {
- "id":"microsoft\/wizardlm-2-8x22b",
- "name":"WizardLM-2 8x22B",
- "provider_name":"WizardLM-2 8x22B",
- "cost":0.48,
- "hf_id":null,
- "size":null,
- "type":"closed-source",
- "license":null,
- "creation_date":1713225600000,
- "tasks":[
- "translation_from",
- "translation_to",
- "classification",
- "mmlu",
- "arc",
- "truthfulqa",
- "mgsm"
- ]
- },
  {
  "id":"mistralai\/mistral-nemo",
  "name":"Mistral Nemo",
@@ -459,26 +608,6 @@
  "mgsm"
  ]
  },
- {
- "id":"mistralai\/mistral-small-24b-instruct-2501",
- "name":"Mistral Small 3",
- "provider_name":"Mistral",
- "cost":0.0,
- "hf_id":"mistralai\/Mistral-Small-24B-Instruct-2501",
- "size":23572403200.0,
- "type":"open-source",
- "license":"Apache 2.0",
- "creation_date":1738022400000,
- "tasks":[
- "translation_from",
- "translation_to",
- "classification",
- "mmlu",
- "arc",
- "truthfulqa",
- "mgsm"
- ]
- },
  {
  "id":"mistralai\/mistral-small-3.1-24b-instruct",
  "name":"Mistral Small 3.1 24B",
@@ -499,106 +628,6 @@
  "mgsm"
  ]
  },
- {
- "id":"moonshotai\/kimi-k2",
- "name":"Kimi K2",
- "provider_name":"MoonshotAI",
- "cost":0.0,
- "hf_id":"moonshotai\/Kimi-K2-Instruct",
- "size":null,
- "type":"open-source",
- "license":"Other",
- "creation_date":1752192000000,
- "tasks":[
- "translation_from",
- "translation_to",
- "classification",
- "mmlu",
- "arc",
- "truthfulqa",
- "mgsm"
- ]
- },
- {
- "id":"nousresearch\/deephermes-3-llama-3-8b-preview",
- "name":"DeepHermes 3 Llama 3 8B Preview",
- "provider_name":"Nous",
- "cost":0.0,
- "hf_id":"NousResearch\/DeepHermes-3-Llama-3-8B-Preview",
- "size":8030261248.0,
- "type":"open-source",
- "license":"Llama3",
- "creation_date":1739318400000,
- "tasks":[
- "translation_from",
- "translation_to",
- "classification",
- "mmlu",
- "arc",
- "truthfulqa",
- "mgsm"
- ]
- },
- {
- "id":"nousresearch\/hermes-2-pro-llama-3-8b",
- "name":"Hermes 2 Pro - Llama-3 8B",
- "provider_name":"NousResearch",
- "cost":0.04,
- "hf_id":"NousResearch\/Hermes-2-Pro-Llama-3-8B",
- "size":8030523392.0,
- "type":"open-source",
- "license":"Llama3",
- "creation_date":1714435200000,
- "tasks":[
- "translation_from",
- "translation_to",
- "classification",
- "mmlu",
- "arc",
- "truthfulqa",
- "mgsm"
- ]
- },
- {
- "id":"nousresearch\/hermes-3-llama-3.1-405b",
- "name":"Hermes 3 405B Instruct",
- "provider_name":"Nous",
- "cost":0.8,
- "hf_id":"NousResearch\/Hermes-3-Llama-3.1-405B",
- "size":405853388800.0,
- "type":"open-source",
- "license":"Llama3",
- "creation_date":1723507200000,
- "tasks":[
- "translation_from",
- "translation_to",
- "classification",
- "mmlu",
- "arc",
- "truthfulqa",
- "mgsm"
- ]
- },
- {
- "id":"nousresearch\/hermes-3-llama-3.1-70b",
- "name":"Hermes 3 70B Instruct",
- "provider_name":"Nous",
- "cost":0.28,
- "hf_id":"NousResearch\/Hermes-3-Llama-3.1-70B",
- "size":70553706496.0,
- "type":"open-source",
- "license":"Llama3",
- "creation_date":1722211200000,
- "tasks":[
- "translation_from",
- "translation_to",
- "classification",
- "mmlu",
- "arc",
- "truthfulqa",
- "mgsm"
- ]
- },
  {
  "id":"openai\/gpt-3.5-turbo-0613",
  "name":"GPT-3.5 Turbo (older v0613)",
@@ -679,26 +708,6 @@
  "mgsm"
  ]
  },
- {
- "id":"openai\/gpt-4o-2024-11-20",
- "name":"GPT-4o (2024-11-20)",
- "provider_name":"OpenAI",
- "cost":10.0,
- "hf_id":null,
- "size":null,
- "type":"closed-source",
- "license":null,
- "creation_date":1732060800000,
- "tasks":[
- "translation_from",
- "translation_to",
- "classification",
- "mmlu",
- "arc",
- "truthfulqa",
- "mgsm"
- ]
- },
  {
  "id":"openai\/gpt-4o-mini",
  "name":"GPT-4o-mini",
@@ -719,86 +728,6 @@
  "mgsm"
  ]
  },
- {
- "id":"openai\/gpt-5",
- "name":"GPT-5",
- "provider_name":"OpenAI",
- "cost":10.0,
- "hf_id":null,
- "size":null,
- "type":"closed-source",
- "license":null,
- "creation_date":1754524800000,
- "tasks":[
- "translation_from",
- "translation_to",
- "classification",
- "mmlu",
- "arc",
- "truthfulqa",
- "mgsm"
- ]
- },
- {
- "id":"openai\/gpt-5-nano",
- "name":"GPT-5 Nano",
- "provider_name":"OpenAI",
- "cost":0.4,
- "hf_id":null,
- "size":null,
- "type":"closed-source",
- "license":null,
- "creation_date":1754524800000,
- "tasks":[
- "translation_from",
- "translation_to",
- "classification",
- "mmlu",
- "arc",
- "truthfulqa",
- "mgsm"
- ]
- },
- {
- "id":"openai\/gpt-oss-120b",
- "name":"gpt-oss-120b",
- "provider_name":"OpenAI",
- "cost":0.0,
- "hf_id":"openai\/gpt-oss-120b",
- "size":120412337472.0,
- "type":"open-source",
- "license":"Apache 2.0",
- "creation_date":1754265600000,
- "tasks":[
- "translation_from",
- "translation_to",
- "classification",
- "mmlu",
- "arc",
- "truthfulqa",
- "mgsm"
- ]
- },
- {
- "id":"openai\/gpt-oss-20b",
- "name":"gpt-oss-20b",
- "provider_name":"OpenAI",
- "cost":0.0,
- "hf_id":"openai\/gpt-oss-20b",
- "size":21511953984.0,
- "type":"open-source",
- "license":"Apache 2.0",
- "creation_date":1754265600000,
- "tasks":[
- "translation_from",
- "translation_to",
- "classification",
- "mmlu",
- "arc",
- "truthfulqa",
- "mgsm"
- ]
- },
  {
  "id":"qwen\/qwen3-235b-a22b",
  "name":"Qwen3 235B A22B",
@@ -843,7 +772,7 @@
  "id":"qwen\/qwen3-32b",
  "name":"Qwen3 32B",
  "provider_name":"Qwen",
- "cost":0.07,
+ "cost":0.0,
  "hf_id":"Qwen\/Qwen3-32B",
  "size":32762123264.0,
  "type":"open-source",
@@ -858,140 +787,5 @@
  "truthfulqa",
  "mgsm"
  ]
- },
- {
- "id":"sao10k\/l3-lunaris-8b",
- "name":"Llama 3 8B Lunaris",
- "provider_name":"Sao10K",
- "cost":0.05,
- "hf_id":"Sao10K\/L3-8B-Lunaris-v1",
- "size":8030261248.0,
- "type":"open-source",
- "license":"Llama3",
- "creation_date":1719360000000,
- "tasks":[
- "translation_from",
- "translation_to",
- "classification",
- "mmlu",
- "arc",
- "truthfulqa",
- "mgsm"
- ]
- },
- {
- "id":"scb10x\/llama3.1-typhoon2-70b-instruct",
- "name":"Typhoon2 70B Instruct",
- "provider_name":"Typhoon2 70B Instruct",
- "cost":0.88,
- "hf_id":"scb10x\/llama3.1-typhoon2-70b-instruct",
- "size":70553706496.0,
- "type":"open-source",
- "license":"Llama3.1",
- "creation_date":1734220800000,
- "tasks":[
- "translation_from",
- "translation_to",
- "classification",
- "mmlu",
- "arc",
- "truthfulqa",
- "mgsm"
- ]
- },
- {
- "id":"shisa-ai\/shisa-v2-llama3.3-70b",
- "name":"Shisa V2 Llama 3.3 70B ",
- "provider_name":"Shisa AI",
- "cost":0.0,
- "hf_id":"shisa-ai\/shisa-v2-llama3.3-70b",
- "size":70553706496.0,
- "type":"open-source",
- "license":"Llama3.3",
- "creation_date":1744502400000,
- "tasks":[
- "translation_from",
- "translation_to",
- "classification",
- "mmlu",
- "arc",
- "truthfulqa",
- "mgsm"
- ]
- },
- {
- "id":"x-ai\/grok-2-vision-1212",
- "name":"Grok 2 Vision 1212",
- "provider_name":"xAI",
- "cost":10.0,
- "hf_id":null,
- "size":null,
- "type":"closed-source",
- "license":null,
- "creation_date":1734220800000,
- "tasks":[
- "translation_from",
- "translation_to",
- "classification",
- "mmlu",
- "arc",
- "truthfulqa",
- "mgsm"
- ]
- },
- {
- "id":"x-ai\/grok-4",
- "name":"Grok 4",
- "provider_name":"xAI",
- "cost":15.0,
- "hf_id":null,
- "size":null,
- "type":"closed-source",
- "license":null,
- "creation_date":1752019200000,
- "tasks":[
- "translation_from",
- "translation_to",
- "classification",
- "mmlu",
- "arc",
- "truthfulqa",
- "mgsm"
- ]
- },
- {
- "id":"z-ai\/glm-4.5",
- "name":"GLM 4.5",
- "provider_name":"Z.AI",
- "cost":1.32,
- "hf_id":"zai-org\/GLM-4.5",
- "size":358337791296.0,
- "type":"open-source",
- "license":"Mit",
- "creation_date":1752969600000,
- "tasks":[
- "translation_from",
- "translation_to",
- "classification",
- "mmlu",
- "arc",
- "truthfulqa",
- "mgsm"
- ]
- },
- {
- "id":"google\/translate-v2",
- "name":"Google Translate",
- "provider_name":"Google",
- "cost":20.0,
- "hf_id":null,
- "size":null,
- "type":"closed-source",
- "license":null,
- "creation_date":null,
- "tasks":[
- "translation_from",
- "translation_to"
- ]
  }
  ]
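
Across models.json, `creation_date` is a Unix epoch in milliseconds, with some of the new entries serialized as floats (e.g. `1748390400000.0`) and unknown dates as `null` (google\/translate-v2). A quick conversion, using a value from the hunks above:

```js
// Epoch-milliseconds to ISO date; Number() also normalizes float-formatted
// timestamps like 1748390400000.0, and null is passed through unchanged.
const toIso = ms => (ms === null ? null : new Date(Number(ms)).toISOString())

console.log(toIso(1729555200000)) // "2024-10-22T00:00:00.000Z" (Claude 3.5 Sonnet)
console.log(toIso(null))          // null
```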
pyproject.toml CHANGED
@@ -36,9 +36,6 @@ dev = [
  "tqdm>=4.67.1",
  "transformers>=4.51.3",
  ]
- cloud = [
- "google-cloud-storage>=3.2.0",
- ]

  [dependency-groups]
  dev = [
@@ -47,10 +44,3 @@ dev = [
  "scipy>=1.16.0",
  "seaborn>=0.13.2",
  ]
-
- [build-system]
- requires = ["hatchling"]
- build-backend = "hatchling.build"
-
- [tool.hatch.build.targets.wheel]
- packages = ["evals"]
results.json CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:3425d95dc42dec7d32104ca836a9d82d256d6f4ca4cc4971bab4a43339eb3090
- size 15266201
+ oid sha256:8dbe020a1941a0e49c05f81aeee40ba37d3e2f9f3d83303fcfe1b5711676d1d8
+ size 2978273
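
results.json is tracked with Git LFS, so the hunk above diffs the three-line pointer file rather than the data itself: only the `oid` and `size` change when the results are regenerated, here shrinking from about 15.3 MB to about 3.0 MB. An illustrative parser for that pointer format:

```js
// Split each "key value" line of a Git LFS pointer into an object entry.
const parseLfsPointer = text =>
  Object.fromEntries(
    text.trim().split('\n').map(line => {
      const i = line.indexOf(' ')
      return [line.slice(0, i), line.slice(i + 1)]
    })
  )

// parseLfsPointer(pointerText).size === "2978273"
```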
uv.lock CHANGED
The diff for this file is too large to render. See raw diff