""" |
Script to test Yourbench results and verify datasets on the Hugging Face Hub.
"""
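
# Example invocation (the script file name and the dataset name below are
# placeholders, not values taken from this repository):
#
#   python test_yourbench_results.py --dataset my-generated-dataset --org yourbench --verbose
#
# HF_TOKEN (and optionally HF_ORGANIZATION) must be provided in a local .env file.
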
|
import os
import sys
import json
import argparse
from typing import Dict, Any, Optional, Tuple

try:
    from dotenv import load_dotenv
    from huggingface_hub import HfApi, DatasetInfo
    from loguru import logger
    import pandas as pd
except ImportError:
    print("Installing dependencies...")
    import subprocess
    subprocess.run(
        [sys.executable, "-m", "pip", "install", "python-dotenv", "huggingface_hub", "loguru", "pandas", "pyarrow"],
        check=True,
    )
    from dotenv import load_dotenv
    from huggingface_hub import HfApi, DatasetInfo
    from loguru import logger
    import pandas as pd


load_dotenv()
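
# The .env file loaded above is expected to define at least HF_TOKEN (a Hugging Face
# access token) and, optionally, HF_ORGANIZATION. Illustrative contents:
#
#   HF_TOKEN=hf_xxxxxxxxxxxxxxxx
#   HF_ORGANIZATION=yourbench
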
|
|
logger.remove()
logger.add(sys.stderr, format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level: <8}</level> | <level>{message}</level>")
logger.add("yourbench_tests.log", rotation="10 MB", retention="1 week")


def configure_argument_parser() -> argparse.ArgumentParser:
    """Configure the argument parser."""
    parser = argparse.ArgumentParser(description="Test Yourbench results and verify datasets")
    parser.add_argument("--dataset", type=str, help="Name of the dataset to check (without the organization name)")
    parser.add_argument("--org", type=str, default=os.environ.get("HF_ORGANIZATION", "yourbench"),
                        help="Hugging Face organization (default: HF_ORGANIZATION value from .env, or 'yourbench')")
    parser.add_argument("--verbose", "-v", action="store_true", help="Display detailed information")
    return parser


class YourbenchTester:
    """Tests Yourbench results and datasets."""

    def __init__(self, organization: str, verbose: bool = False):
        """Initialize the Yourbench tester.

        Args:
            organization: Name of the organization on Hugging Face
            verbose: Display detailed information
        """
        self.organization = organization
        self.verbose = verbose
        self.hf_token = os.environ.get("HF_TOKEN")

        if not self.hf_token:
            logger.error("HF_TOKEN environment variable not found in the .env file")
            sys.exit(1)

        self.api = HfApi(token=self.hf_token)
        logger.info(f"Initializing tester for organization: {organization}")
|

    def test_dataset_exists(self, dataset_name: str) -> Optional[DatasetInfo]:
        """Check whether a dataset exists on the Hub.

        Args:
            dataset_name: Name of the dataset to check

        Returns:
            Information about the dataset if it exists, None otherwise
        """
        full_dataset_name = f"{self.organization}/{dataset_name}"
        logger.info(f"Checking that the dataset exists: {full_dataset_name}")

        try:
            dataset_info = self.api.dataset_info(full_dataset_name)
            logger.success(f"Dataset {full_dataset_name} found!")

            if self.verbose:
                logger.info(f"ID: {dataset_info.id}")
                logger.info(f"Last modified: {dataset_info.lastModified}")
                logger.info(f"SHA: {dataset_info.sha}")

            return dataset_info

        except Exception as e:
            logger.error(f"Could not find dataset {full_dataset_name}: {str(e)}")
            return None
|
    def analyze_dataset_content(self, dataset_name: str) -> Tuple[bool, Dict[str, Any]]:
        """Analyze the content of a dataset.

        Args:
            dataset_name: Name of the dataset to analyze

        Returns:
            Tuple of a boolean indicating whether the analysis succeeded and a dictionary of statistics
        """
|
        full_dataset_name = f"{self.organization}/{dataset_name}"
        logger.info(f"Analyzing the content of dataset: {full_dataset_name}")

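        # The statistics keys below are kept in French because they are read by name
        # elsewhere in the script (notably in main()):
        #   "fichiers" = file count, "taille_totale" = total size (currently unused),
        #   "fichiers_json" / "fichiers_parquet" = JSON / Parquet file counts,
        #   "a_questions" = whether questions were found, "nb_questions" = question count,
        #   "structure_parquet" = per-category Parquet schema, "types_documents" = document types.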
|
        stats = {
            "fichiers": 0,
            "taille_totale": 0,
            "fichiers_json": 0,
            "fichiers_parquet": 0,
            "a_questions": False,
            "nb_questions": 0,
            "structure_parquet": {},
            "types_documents": set()
        }

        try:
            files = self.api.list_repo_files(full_dataset_name, repo_type="dataset")
            stats["fichiers"] = len(files)

            if self.verbose:
                logger.info(f"Files found in the dataset: {len(files)}")
                for file in files[:10]:
                    logger.info(f" - {file}")
                if len(files) > 10:
                    logger.info(f" ... and {len(files) - 10} more files")

            question_files = [f for f in files if "question" in f.lower() and f.endswith(".json")]
            stats["fichiers_json"] = len([f for f in files if f.endswith(".json")])

            parquet_files = [f for f in files if f.endswith(".parquet")]
            stats["fichiers_parquet"] = len(parquet_files)

            if parquet_files:
                logger.info(f"Parquet files found: {len(parquet_files)}")

                for parquet_file in parquet_files[:3]:
                    category = parquet_file.split('/')[0] if '/' in parquet_file else "unknown"

                    logger.info(f"Analyzing Parquet file: {parquet_file} (category: {category})")

                    try:
                        temp_file = self.api.hf_hub_download(
                            repo_id=full_dataset_name,
                            filename=parquet_file,
                            repo_type="dataset"
                        )

                        df = pd.read_parquet(temp_file)

                        stats["structure_parquet"][category] = {
                            "colonnes": list(df.columns),
                            "nb_lignes": len(df),
                            "exemple": df.iloc[0].to_dict() if len(df) > 0 else {}
                        }

                        if any(col for col in df.columns if "question" in col.lower()):
                            stats["a_questions"] = True
                            question_col = next(col for col in df.columns if "question" in col.lower())
                            stats["nb_questions"] = len(df)

                            if len(df) > 0 and question_col in df.columns:
                                logger.info(f"Sample question: {df[question_col].iloc[0][:100]}...")

                        if "doc_type" in df.columns and len(df) > 0:
                            doc_types = df["doc_type"].unique()
                            stats["types_documents"].update(doc_types)

                    except Exception as e:
                        logger.warning(f"Error while analyzing file {parquet_file}: {str(e)}")

            stats["types_documents"] = list(stats["types_documents"])

            if question_files:
                stats["a_questions"] = True

                sample_file = question_files[0]
                content = self.api.hf_hub_download(
                    repo_id=full_dataset_name,
                    filename=sample_file,
                    repo_type="dataset"
                )

                with open(content, 'r') as f:
                    data = json.load(f)

                if isinstance(data, list):
                    stats["nb_questions"] = len(data)
                elif isinstance(data, dict) and "questions" in data:
                    stats["nb_questions"] = len(data["questions"])

                logger.success(f"Question files found: {len(question_files)}")
                logger.info(f"Sample file analyzed: {sample_file}")
                logger.info(f"Number of questions found: {stats['nb_questions']}")

            return True, stats

        except Exception as e:
            logger.error(f"Error while analyzing dataset {full_dataset_name}: {str(e)}")
            return False, stats
|
    def check_evaluation_results(self, dataset_name: str) -> bool:
        """Check whether evaluation results exist for this dataset.

        Args:
            dataset_name: Name of the dataset to check

        Returns:
            True if evaluation results exist, False otherwise
        """
|
        logger.info(f"Looking for evaluation results for dataset: {dataset_name}")

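        # Assumption reflected below: evaluation results live in sibling datasets named
        # "{organization}/evaluation-*", and an evaluation mentions the source dataset
        # by name somewhere in its README.md.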
|
        try:
            datasets = self.api.list_datasets(author=self.organization)

            eval_datasets = [ds for ds in datasets if ds.id.startswith(f"{self.organization}/evaluation-")]

            if self.verbose:
                logger.info(f"Evaluation datasets found: {len(eval_datasets)}")
                for ds in eval_datasets[:5]:
                    logger.info(f" - {ds.id}")

            for eval_ds in eval_datasets:
                try:
                    readme_path = self.api.hf_hub_download(
                        repo_id=eval_ds.id,
                        filename="README.md",
                        repo_type="dataset"
                    )

                    with open(readme_path, 'r') as f:
                        readme_content = f.read()

                    if dataset_name in readme_content:
                        logger.success(f"Evaluation results found in: {eval_ds.id}")
                        return True
                except Exception:
                    continue

            logger.warning(f"No evaluation results found for dataset: {dataset_name}")
            return False

        except Exception as e:
            logger.error(f"Error while searching for evaluation results: {str(e)}")
            return False
|
    def check_model_performances(self, dataset_name: str) -> Dict[str, float]:
        """Check model performances on the specified dataset.

        Args:
            dataset_name: Name of the dataset to check

        Returns:
            Dictionary of model performances (model_name -> score)
        """
|
        logger.info(f"Checking model performances on dataset: {dataset_name}")
        performances = {}

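        # Assumed (illustrative) shape of a result file; only the "model_name" and
        # "metrics" keys are read, and the first metric found becomes the score:
        #
        #   {
        #       "model_name": "org/some-model",
        #       "metrics": {"accuracy": 0.87}
        #   }
        #
        # Note that results are collected from every "evaluation-*" dataset of the
        # organization; they are not filtered by dataset_name.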
|
        try:
            datasets = self.api.list_datasets(author=self.organization)
            eval_datasets = [ds for ds in datasets if ds.id.startswith(f"{self.organization}/evaluation-")]

            for eval_ds in eval_datasets:
                try:
                    files = self.api.list_repo_files(eval_ds.id, repo_type="dataset")
                    result_files = [f for f in files if "result" in f.lower() and f.endswith(".json")]

                    for result_file in result_files:
                        file_path = self.api.hf_hub_download(
                            repo_id=eval_ds.id,
                            filename=result_file,
                            repo_type="dataset"
                        )

                        with open(file_path, 'r') as f:
                            results = json.load(f)

                        if "model_name" in results and "metrics" in results:
                            model_name = results["model_name"]
                            metrics = results["metrics"]

                            if metrics and isinstance(metrics, dict):
                                first_metric = list(metrics.keys())[0]
                                performances[model_name] = metrics[first_metric]
                except Exception:
                    continue

            if performances:
                logger.success(f"Performances found for {len(performances)} models")
                for model, score in performances.items():
                    logger.info(f" - {model}: {score}")
            else:
                logger.warning("No model performances found")

            return performances

        except Exception as e:
            logger.error(f"Error while checking model performances: {str(e)}")
            return {}
|
def main():
    """Main entry point."""
    parser = configure_argument_parser()
    args = parser.parse_args()

    if not args.dataset:
        logger.error("Please specify a dataset with --dataset")
        parser.print_help()
        return

    tester = YourbenchTester(args.org, args.verbose)

    dataset_info = tester.test_dataset_exists(args.dataset)

    if not dataset_info:
        logger.error(f"The dataset {args.org}/{args.dataset} does not exist or is not accessible")
        return

    success, stats = tester.analyze_dataset_content(args.dataset)

    if success:
        logger.info("\n=== Dataset statistics ===")
        logger.info(f"Number of files: {stats['fichiers']}")
        logger.info(f"JSON files: {stats['fichiers_json']}")
        logger.info(f"Parquet files: {stats['fichiers_parquet']}")
        logger.info(f"Contains questions: {'Yes' if stats['a_questions'] else 'No'}")

        if stats['a_questions']:
            logger.info(f"Number of questions: {stats['nb_questions']}")

        if 'types_documents' in stats and stats['types_documents']:
            logger.info(f"Document types: {', '.join(stats['types_documents'])}")

        if 'structure_parquet' in stats and stats['structure_parquet']:
            logger.info("\n=== Parquet file structure ===")
            for category, info in stats['structure_parquet'].items():
                logger.info(f"\nCategory: {category}")
                logger.info(f"Number of rows: {info['nb_lignes']}")
                logger.info(f"Columns: {', '.join(info['colonnes'])}")

                if args.verbose and 'exemple' in info and info['exemple']:
                    logger.info("\nSample row:")
                    for key, value in info['exemple'].items():
                        if isinstance(value, str) and len(value) > 100:
                            value = value[:100] + "..."
                        logger.info(f" {key}: {value}")

    has_evaluations = tester.check_evaluation_results(args.dataset)

    if has_evaluations:
        performances = tester.check_model_performances(args.dataset)

        if performances:
            logger.info("\n=== Model ranking ===")
            sorted_models = sorted(performances.items(), key=lambda x: x[1], reverse=True)
            for i, (model, score) in enumerate(sorted_models, 1):
                logger.info(f"{i}. {model}: {score:.4f}")

    logger.success("Test complete!")


if __name__ == "__main__":
    main()