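"""Evaluate generated images with benchmark-specific metrics.

For each requested benchmark, this script reads
images/<api_type>/<benchmark_type>/metadata.jsonl (produced by sample.py),
scores every listed image with the metrics declared by the benchmark, and
appends one JSON line of aggregated results to
evaluation_results/<api_type>.jsonl.

Example invocation (script, backend, and benchmark names below are only
illustrative; use whichever values sample.py and create_benchmark support):

    python evaluate.py my_api benchmark_a benchmark_b
"""
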
import argparse
import json
import warnings
from pathlib import Path
from typing import Dict

import numpy as np
from PIL import Image
from tqdm import tqdm

from benchmark import create_benchmark
from benchmark.metrics import create_metric

warnings.filterwarnings("ignore", category=FutureWarning)


def evaluate_benchmark(
    benchmark_type: str, api_type: str, images_dir: Path = Path("images")
) -> Dict:
    """
    Evaluate a benchmark's images using its specific metrics.

    Args:
        benchmark_type (str): Type of benchmark to evaluate
        api_type (str): Type of API used to generate images
        images_dir (Path): Base directory containing generated images

    Returns:
        Dict containing evaluation results
    """
    benchmark = create_benchmark(benchmark_type)
    benchmark_dir = images_dir / api_type / benchmark_type
    metadata_file = benchmark_dir / "metadata.jsonl"

    if not metadata_file.exists():
        raise FileNotFoundError(
            f"No metadata file found for {api_type}/{benchmark_type}. Please run sample.py first."
        )
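
    # Each metadata line written by sample.py is expected to carry at least
    # "filepath", "prompt", and "inference_time"; those are the only fields
    # read below (an assumption based on how the entries are accessed).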
    metadata = []
    with open(metadata_file, "r") as f:
        for line in f:
            metadata.append(json.loads(line))

    metrics = {
        metric_type: create_metric(metric_type) for metric_type in benchmark.metrics
    }

    results = {
        "api": api_type,
        "benchmark": benchmark_type,
        "metrics": {metric: 0.0 for metric in benchmark.metrics},
        "total_images": len(metadata),
    }
    inference_times = []
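    # Per-metric scores are summed here and divided by the total number of
    # metadata entries after the loop, so images that are missing on disk
    # effectively contribute a score of zero.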
    for entry in tqdm(metadata):
        image_path = benchmark_dir / entry["filepath"]
        if not image_path.exists():
            continue

        for metric_type, metric in metrics.items():
            try:
                if metric_type == "vqa":
                    score = metric.compute_score(image_path, entry["prompt"])
                else:
                    image = Image.open(image_path)
                    score = metric.compute_score(image, entry["prompt"])
                results["metrics"][metric_type] += score[metric_type]
            except Exception as e:
                print(f"Error computing {metric_type} for {image_path}: {e}")

        inference_times.append(entry["inference_time"])

    for metric in results["metrics"]:
        results["metrics"][metric] /= len(metadata)
    results["median_inference_time"] = np.median(inference_times).item()

    return results


def main():
    parser = argparse.ArgumentParser(
        description="Evaluate generated images using benchmark-specific metrics"
    )
    parser.add_argument("api_type", help="Type of API to evaluate")
    parser.add_argument(
        "benchmarks", nargs="+", help="List of benchmark types to evaluate"
    )
    args = parser.parse_args()

    results_dir = Path("evaluation_results")
    results_dir.mkdir(exist_ok=True)
    results_file = results_dir / f"{args.api_type}.jsonl"
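    # One JSONL results file per API; each evaluated benchmark appends a single
    # line, so benchmarks that already have a line are skipped on re-runs.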
    existing_results = set()
    if results_file.exists():
        with open(results_file, "r") as f:
            for line in f:
                result = json.loads(line)
                existing_results.add(result["benchmark"])

    for benchmark_type in args.benchmarks:
        if benchmark_type in existing_results:
            print(f"Skipping {args.api_type}/{benchmark_type} - already evaluated")
            continue
        try:
            print(f"Evaluating {args.api_type}/{benchmark_type}")
            results = evaluate_benchmark(benchmark_type, args.api_type)
            # Append results to file
            with open(results_file, "a") as f:
                f.write(json.dumps(results) + "\n")
        except Exception as e:
            print(f"Error evaluating {args.api_type}/{benchmark_type}: {e}")


if __name__ == "__main__":
    main()