"""
Minimal script to test lighteval directly with the yourbench task.
"""

import json
import os
import subprocess
import sys
import tempfile
import time
from pathlib import Path

from dotenv import load_dotenv

load_dotenv()

# Make the repository root importable so the local `tasks` package resolves
sys.path.append(os.getcwd())
from tasks.yourbench_lighteval_task import create_yourbench_task
|
|
def run_lighteval_test():
    """
    Run a minimal test with lighteval.
    """
    dataset_name = "yourbench_a"
    organization = "yourbench"
    model_name = "Qwen/Qwen2.5-72B-Instruct"
    provider = "novita"
    output_dir = f"uploaded_files/test_{provider}/lighteval_results"

    os.makedirs(output_dir, exist_ok=True)

    dataset_path = f"{organization}/{dataset_name}"
    print(f"Dataset to evaluate: {dataset_path}")
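
    # lighteval loads custom task definitions from the module passed via
    # --custom-tasks and expects it to expose a TASKS_TABLE, so we generate a
    # small throwaway module that builds the yourbench task.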
    temp_file_path = tempfile.mktemp(suffix=".py")
    print(f"Creating temporary file: {temp_file_path}")

    with open(temp_file_path, "w") as temp_file:
        temp_file.write(f"""
import os
import sys
import logging

sys.path.append("{os.getcwd()}")

from tasks.yourbench_lighteval_task import create_yourbench_task

# Configure logging
logging.basicConfig(level=logging.INFO)

# Create the yourbench task
yourbench = create_yourbench_task("{dataset_path}", "lighteval")

# Define the TASKS_TABLE variable that lighteval needs
TASKS_TABLE = [yourbench]
""")
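
    # Build the lighteval CLI call: the inference-providers endpoint runs the
    # model through the chosen provider; "custom|yourbench|0|0" is the
    # suite|task|few-shot spec (custom suite, yourbench task, zero-shot).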
    cmd = [
        "lighteval",
        "endpoint",
        "inference-providers",
        f"model={model_name},provider={provider}",
        "custom|yourbench|0|0",
        "--custom-tasks",
        temp_file_path,
        "--max-samples", "5",
        "--output-dir", output_dir,
        "--save-details",
        "--no-push-to-hub",
    ]

    print(f"Running command: {' '.join(cmd)}")
    print(f"Start time: {time.strftime('%H:%M:%S')}")
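
    # Run the evaluation as a subprocess; stdout/stderr are captured so they can
    # be printed below (check=True is not passed, so a non-zero exit code does
    # not raise).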
    try:
        result = subprocess.run(cmd, capture_output=True, text=True)

        print(f"Return code: {result.returncode}")
        print("--- STANDARD OUTPUT ---")
        print(result.stdout)
        print("--- STANDARD ERROR ---")
        print(result.stderr)
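
        # lighteval should have written its scores as JSON files under
        # <output_dir>/results/; search recursively and inspect the newest one.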
        results_dir = Path(output_dir) / "results"
        if results_dir.exists():
            print(f"Results directory created: {results_dir}")

            result_files = list(results_dir.glob("**/*.json"))
            if result_files:
                print(f"Result files found: {result_files}")

                # Keep the most recently modified results file
                result_files.sort(key=lambda x: x.stat().st_mtime, reverse=True)
                latest_result = result_files[0]
                print(f"Most recent results file: {latest_result}")

                with open(latest_result, "r") as f:
                    results = json.load(f)

                print("Results file content:")
                print(json.dumps(results, indent=2))

                print("\n==== RESULTS ANALYSIS ====")
                if "results" in results:
                    for task_name, task_results in results["results"].items():
                        print(f"Task: {task_name}")
                        for metric_name, metric_value in task_results.items():
                            print(f"  {metric_name}: {metric_value}")
                else:
                    print("No results found in the JSON file")
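
                # With --save-details, lighteval also writes per-sample details;
                # list whatever was produced under <output_dir>/details/.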
                details_dir = Path(output_dir) / "details"
                if details_dir.exists():
                    print(f"\nDetails directory found: {details_dir}")
                    model_details_dirs = list(details_dir.glob("**/*"))
                    if model_details_dirs:
                        print(f"Per-model details directories: {model_details_dirs}")
            else:
                print("No result files found.")
        else:
            print("No results directory created.")
    except subprocess.CalledProcessError as e:
        print(f"Error while running the command: {e}")
    except Exception as e:
        print(f"Exception: {e}")
    finally:
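        # Always remove the generated custom-tasks module, even if the run failed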
        try:
            os.unlink(temp_file_path)
            print(f"Temporary file deleted: {temp_file_path}")
        except OSError:
            pass

    print(f"End time: {time.strftime('%H:%M:%S')}")


if __name__ == "__main__":
    run_lighteval_test()