nifleisch committed
Commit 4f41410 · 1 Parent(s): 2c50826
fix: fix several errors
Files changed:
- .env.example +4 -0
- .gitignore +4 -0
- README.md +8 -0
- api/__init__.py +16 -1
- api/fal.py +1 -1
- api/pruna_dev.py +49 -0
- benchmark/hps.py +2 -2
- benchmark/metrics/__init__.py +3 -2
- benchmark/metrics/arniqa.py +3 -2
- benchmark/metrics/clip_iqa.py +1 -1
- benchmark/metrics/vqa.py +4 -9
- environment.yml +27 -0
- evaluate.py +15 -7
- pyproject.toml +0 -22
- uv.lock +0 -0
.env.example
ADDED
@@ -0,0 +1,4 @@
+FIREWORKS_API_TOKEN=your_fireworks_api_token_here
+REPLICATE_API_TOKEN=your_replicate_api_token_here
+FAL_KEY=your_fal_key_here
+TOGETHER_API_KEY=your_together_api_key_here
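These keys are read at runtime through python-dotenv, as the API wrappers below do. A minimal sketch of that loading pattern, mirroring the check used in api/pruna_dev.py (the variable names come from this file):

import os
from dotenv import load_dotenv

load_dotenv()  # reads the .env file from the working directory
replicate_token = os.getenv("REPLICATE_API_TOKEN")
if not replicate_token:
    raise ValueError("REPLICATE_API_TOKEN not found in environment variables")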
.gitignore
CHANGED
@@ -172,3 +172,7 @@ cython_debug/
 
 # PyPI configuration file
 .pypirc
+
+evaluation_results/
+images/
+hf_cache/
README.md
CHANGED
@@ -1,2 +1,10 @@
 # InferBench
 Evaluate the quality and efficiency of image gen api's.
+
+Install dependencies with conda like that:
+
+conda env create -f environment.yml
+
+
+Create .env file with all the files you will need.
+python sample.py replicate draw_bench genai_bench geneval hps parti
api/__init__.py
CHANGED
@@ -1,13 +1,26 @@
-from typing import
+from typing import Type
 
 from api.baseline import BaselineAPI
 from api.fireworks import FireworksAPI
 from api.flux import FluxAPI
 from api.pruna import PrunaAPI
+from api.pruna_dev import PrunaDevAPI
 from api.replicate import ReplicateAPI
 from api.together import TogetherAPI
 from api.fal import FalAPI
 
+__all__ = [
+    'create_api',
+    'FluxAPI',
+    'BaselineAPI',
+    'FireworksAPI',
+    'PrunaAPI',
+    'ReplicateAPI',
+    'TogetherAPI',
+    'FalAPI',
+    'PrunaDevAPI',
+]
+
 def create_api(api_type: str) -> FluxAPI:
     """
     Factory function to create API instances.

@@ -27,6 +40,8 @@ def create_api(api_type: str) -> FluxAPI:
     Raises:
         ValueError: If an invalid API type is provided
     """
+    if api_type == "pruna_dev":
+        return PrunaDevAPI()
     if api_type.startswith("pruna_"):
         speed_mode = api_type[6:]  # Remove "pruna_" prefix
         return PrunaAPI(speed_mode)
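Usage sketch for the updated factory: "pruna_dev" now short-circuits to the new PrunaDevAPI before the generic "pruna_<speed_mode>" branch. The second argument below is a hypothetical placeholder, not a documented speed mode:

from api import create_api

api = create_api("pruna_dev")  # new branch -> PrunaDevAPI()

# Any other "pruna_<speed_mode>" string still falls through to PrunaAPI;
# "some_speed_mode" is a placeholder suffix for illustration only.
other = create_api("pruna_some_speed_mode")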
api/fal.py
CHANGED
@@ -7,7 +7,7 @@ import fal_client
 import requests
 from PIL import Image
 
-from flux import FluxAPI
+from api.flux import FluxAPI
 
 
 class FalAPI(FluxAPI):
api/pruna_dev.py
ADDED
@@ -0,0 +1,49 @@
+import os
+import time
+from pathlib import Path
+from typing import Any
+
+from dotenv import load_dotenv
+import replicate
+
+from api.flux import FluxAPI
+
+
+class PrunaDevAPI(FluxAPI):
+    def __init__(self):
+        load_dotenv()
+        self._api_key = os.getenv("REPLICATE_API_TOKEN")
+        if not self._api_key:
+            raise ValueError("REPLICATE_API_TOKEN not found in environment variables")
+
+    @property
+    def name(self) -> str:
+        return "pruna_dev"
+
+    def generate_image(self, prompt: str, save_path: Path) -> float:
+        start_time = time.time()
+        result = replicate.run(
+            "prunaai/flux.1-dev:938a4eb31a87d65fb7b23fc300fb5b7ab88a36844bb26e54e1d1dec7acf4eefe",
+            input={
+                "seed": 0,
+                "prompt": prompt,
+                "guidance": 3.5,
+                "num_outputs": 1,
+                "aspect_ratio": "1:1",
+                "output_format": "png",
+                "speed_mode": "Juiced 🔥 (default)",
+                "num_inference_steps": 28,
+            },
+        )
+        end_time = time.time()
+
+        if result:
+            self._save_image_from_result(result, save_path)
+        else:
+            raise Exception("No result returned from Replicate API")
+        return end_time - start_time
+
+    def _save_image_from_result(self, result: Any, save_path: Path):
+        save_path.parent.mkdir(parents=True, exist_ok=True)
+        with open(save_path, "wb") as f:
+            f.write(result.read())
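A minimal usage sketch for the new wrapper; the prompt and output path below are placeholders, and REPLICATE_API_TOKEN must be present in the .env file:

from pathlib import Path
from api.pruna_dev import PrunaDevAPI

api = PrunaDevAPI()
seconds = api.generate_image(
    "a watercolor fox in a snowy forest",  # placeholder prompt
    Path("images/pruna_dev/example.png"),  # placeholder save path
)
print(f"{api.name}: generated in {seconds:.2f}s")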
benchmark/hps.py
CHANGED
@@ -15,7 +15,7 @@ class HPSPrompts:
         self._size = 0
         for file in self.hps_prompt_files:
             category = file.replace('.json', '')
-            with open(os.path.join('
+            with open(os.path.join('downloads/hps', file), 'r') as f:
                 prompts = json.load(f)
             for i, prompt in enumerate(prompts):
                 if i == 100:

@@ -26,7 +26,7 @@
 
     def __iter__(self) -> Iterator[Tuple[str, Path]]:
         for filename, prompt in self.prompts.items():
-            yield prompt, filename
+            yield prompt, Path(filename)
 
     @property
    def name(self) -> str:
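With these fixes the iterator yields (prompt, Path) pairs for files read from downloads/hps/<category>.json. A rough consumer sketch, assuming the JSON files are already downloaded and the class constructs without arguments (construction details are outside this hunk):

from benchmark.hps import HPSPrompts

prompts = HPSPrompts()
for prompt, rel_path in prompts:  # rel_path is now a pathlib.Path, not a str
    print(rel_path, prompt[:60])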
benchmark/metrics/__init__.py
CHANGED
@@ -6,7 +6,7 @@ from benchmark.metrics.clip_iqa import CLIPIQAMetric
 from benchmark.metrics.image_reward import ImageRewardMetric
 from benchmark.metrics.sharpness import SharpnessMetric
 from benchmark.metrics.vqa import VQAMetric
-
+from benchmark.metrics.hps import HPSMetric
 
 def create_metric(metric_type: str) -> Type[ARNIQAMetric | CLIPMetric | CLIPIQAMetric | ImageRewardMetric | SharpnessMetric | VQAMetric]:
     """

@@ -20,7 +20,7 @@
         - "image_reward"
         - "sharpness"
         - "vqa"
-
+        - "hps"
     Returns:
         An instance of the requested metric implementation
 

@@ -34,6 +34,7 @@
         "image_reward": ImageRewardMetric,
         "sharpness": SharpnessMetric,
         "vqa": VQAMetric,
+        "hps": HPSMetric,
     }
 
     if metric_type not in metric_map:
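Usage sketch for the extended factory, assuming benchmark/metrics/hps.py defines HPSMetric with the same compute_score(image, prompt) convention as the other metrics (the image path and prompt below are placeholders):

from PIL import Image
from benchmark.metrics import create_metric

metric = create_metric("hps")
image = Image.open("images/example.png")  # placeholder path
scores = metric.compute_score(image, "a red bicycle leaning against a wall")
print(scores)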
benchmark/metrics/arniqa.py
CHANGED
@@ -8,19 +8,20 @@ from torchmetrics.image.arniqa import ARNIQA
 
 class ARNIQAMetric:
     def __init__(self):
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         self.metric = ARNIQA(
             regressor_dataset="koniq10k",
             reduction="mean",
             normalize=True,
             autocast=False
         )
-
+        self.metric.to(self.device)
     @property
     def name(self) -> str:
         return "arniqa"
 
     def compute_score(self, image: Image.Image, prompt: str) -> Dict[str, float]:
         image_tensor = torch.from_numpy(np.array(image)).permute(2, 0, 1).float() / 255.0
-        image_tensor = image_tensor.unsqueeze(0)
+        image_tensor = image_tensor.unsqueeze(0).to(self.device)
         score = self.metric(image_tensor)
         return {"arniqa": score.item()}
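Sketch of the metric after the device fix: the ARNIQA model and the image tensor now end up on the same device. The image path is a placeholder; the prompt argument is accepted but unused by this no-reference metric:

from PIL import Image
from benchmark.metrics.arniqa import ARNIQAMetric

metric = ARNIQAMetric()                   # picks CUDA when available, else CPU
image = Image.open("images/example.png")  # placeholder path; any RGB image works
print(metric.compute_score(image, ""))    # -> {"arniqa": <score>}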
benchmark/metrics/clip_iqa.py
CHANGED
@@ -12,7 +12,7 @@ class CLIPIQAMetric:
         self.metric = CLIPImageQualityAssessment(
             model_name_or_path="clip_iqa",
             data_range=255.0,
-            prompts=
+            prompts=("quality",)
         )
         self.metric.to(self.device)
 
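The fix passes prompts as the tuple the torchmetrics API expects. A standalone sketch of the underlying call with a dummy batch in the declared [0, 255] range (the tensor shape is illustrative):

import torch
from torchmetrics.multimodal import CLIPImageQualityAssessment

metric = CLIPImageQualityAssessment(
    model_name_or_path="clip_iqa",
    data_range=255.0,
    prompts=("quality",),  # a single built-in prompt pair -> one score per image
)
images = torch.randint(0, 255, (2, 3, 224, 224)).float()  # dummy image batch
print(metric(images))  # tensor of per-image quality scores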
benchmark/metrics/vqa.py
CHANGED
@@ -1,9 +1,7 @@
-import os
-import tempfile
+from pathlib import Path
 from typing import Dict
 
 import t2v_metrics
-from PIL import Image
 
 class VQAMetric:
     def __init__(self):

@@ -15,11 +13,8 @@ class VQAMetric:
 
     def compute_score(
         self,
-
+        image_path: Path,
         prompt: str,
     ) -> Dict[str, float]:
-
-
-            score = self.metric(images=[tmp.name], texts=[prompt])
-        os.unlink(tmp.name)
-        return {"vqa_score": score[0][0].item()}
+        score = self.metric(images=[str(image_path)], texts=[prompt])
+        return {"vqa": score[0][0].item()}
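After this change compute_score takes the image path directly instead of routing through a temporary file, and the result key is "vqa", matching the factory name. A hypothetical call (the path and prompt are placeholders; the t2v_metrics scorer is configured in __init__, which this diff does not show):

from pathlib import Path
from benchmark.metrics.vqa import VQAMetric

metric = VQAMetric()
scores = metric.compute_score(Path("images/example.png"), "a blue cube on top of a red sphere")
print(scores["vqa"])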
environment.yml
ADDED
@@ -0,0 +1,27 @@
+name: inferbench
+channels:
+  - conda-forge
+  - defaults
+dependencies:
+  - python=3.12
+  - numpy
+  - opencv
+  - pillow
+  - python-dotenv
+  - requests
+  - tqdm
+  - pip
+  - pip:
+    - datasets>=3.5.0
+    - fal-client>=0.5.9
+    - hpsv2>=1.2.0
+    - huggingface-hub>=0.30.2
+    - image-reward>=1.5
+    - replicate>=1.0.4
+    - t2v-metrics>=1.2
+    - together>=1.5.5
+    - torch>=2.7.0
+    - torchmetrics>=1.7.1
+    - clip
+    - diffusers<=0.31
+    - piq>=0.8.0
evaluate.py
CHANGED
@@ -2,10 +2,16 @@ import argparse
 import json
 from pathlib import Path
 from typing import Dict
+import warnings
 
 from benchmark import create_benchmark
 from benchmark.metrics import create_metric
+import numpy as np
 from PIL import Image
+from tqdm import tqdm
+
+
+warnings.filterwarnings("ignore", category=FutureWarning)
 
 
 def evaluate_benchmark(benchmark_type: str, api_type: str, images_dir: Path = Path("images")) -> Dict:

@@ -39,29 +45,31 @@
         "api": api_type,
         "benchmark": benchmark_type,
         "metrics": {metric: 0.0 for metric in benchmark.metrics},
-        "avg_inference_time": 0.0,
         "total_images": len(metadata)
     }
+    inference_times = []
 
-    for entry in metadata:
+    for entry in tqdm(metadata):
         image_path = benchmark_dir / entry["filepath"]
         if not image_path.exists():
             continue
 
-        image = Image.open(image_path)
-
         for metric_type, metric in metrics.items():
             try:
-
+                if metric_type == "vqa":
+                    score = metric.compute_score(image_path, entry["prompt"])
+                else:
+                    image = Image.open(image_path)
+                    score = metric.compute_score(image, entry["prompt"])
                 results["metrics"][metric_type] += score[metric_type]
             except Exception as e:
                 print(f"Error computing {metric_type} for {image_path}: {str(e)}")
 
-
+        inference_times.append(entry["inference_time"])
 
     for metric in results["metrics"]:
         results["metrics"][metric] /= len(metadata)
-    results["
+    results["median_inference_time"] = np.median(inference_times).item()
 
     return results
 
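A hypothetical call of the updated function: per-image inference times from the metadata are now aggregated as a median rather than a running average, and the VQA metric receives the image path instead of a PIL image. The benchmark and API names below are examples taken from this commit:

from pathlib import Path
from evaluate import evaluate_benchmark

results = evaluate_benchmark("hps", "pruna_dev", images_dir=Path("images"))
print(results["median_inference_time"], results["metrics"])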
pyproject.toml
DELETED
@@ -1,22 +0,0 @@
-[project]
-name = "inferbench"
-version = "0.1.0"
-requires-python = ">=3.12"
-dependencies = [
-    "datasets>=3.5.0",
-    "fal-client>=0.5.9",
-    "hpsv2>=1.2.0",
-    "huggingface-hub>=0.30.2",
-    "image-reward>=1.5",
-    "numpy>=2.2.5",
-    "opencv-python>=4.11.0.86",
-    "pillow>=11.2.1",
-    "python-dotenv>=1.1.0",
-    "replicate>=1.0.4",
-    "requests>=2.32.3",
-    "t2v-metrics>=1.2",
-    "together>=1.5.5",
-    "torch>=2.7.0",
-    "torchmetrics>=1.7.1",
-    "tqdm>=4.67.1",
-]
uv.lock
DELETED
The diff for this file is too large to render.