TeddyYao committed on
Commit 8474f02 · verified · 1 Parent(s): cd16551

Upload 38 files

Files changed (38)
  1. README.md +29 -8
  2. apis/__init__.py +0 -0
  3. apis/__pycache__/__init__.cpython-312.pyc +0 -0
  4. apis/__pycache__/anthropic_api.cpython-312.pyc +0 -0
  5. apis/__pycache__/api_factory.cpython-312.pyc +0 -0
  6. apis/__pycache__/base_api.cpython-312.pyc +0 -0
  7. apis/__pycache__/grok_api.cpython-312.pyc +0 -0
  8. apis/__pycache__/openai_api.cpython-312.pyc +0 -0
  9. apis/anthropic_api.py +30 -0
  10. apis/api_factory.py +71 -0
  11. apis/base_api.py +54 -0
  12. apis/grok_api.py +42 -0
  13. apis/openai_api.py +32 -0
  14. app.py +124 -0
  15. benchmarks/__init__.py +21 -0
  16. benchmarks/__pycache__/__init__.cpython-312.pyc +0 -0
  17. benchmarks/__pycache__/base_benchmark.cpython-312.pyc +0 -0
  18. benchmarks/__pycache__/evaluation_utils.cpython-312.pyc +0 -0
  19. benchmarks/__pycache__/gpqa_benchmark.cpython-312.pyc +0 -0
  20. benchmarks/__pycache__/gsm8k_benchmark.cpython-312.pyc +0 -0
  21. benchmarks/__pycache__/humaneval_benchmark.cpython-312.pyc +0 -0
  22. benchmarks/__pycache__/math_benchmark.cpython-312.pyc +0 -0
  23. benchmarks/__pycache__/mmlu_benchmark.cpython-312.pyc +0 -0
  24. benchmarks/__pycache__/prompt_templates.cpython-312.pyc +0 -0
  25. benchmarks/base_benchmark.py +124 -0
  26. benchmarks/evaluation_utils.py +160 -0
  27. benchmarks/gpqa_benchmark.py +126 -0
  28. benchmarks/gsm8k_benchmark.py +115 -0
  29. benchmarks/humaneval_benchmark.py +134 -0
  30. benchmarks/math_benchmark.py +125 -0
  31. benchmarks/mmlu_benchmark.py +134 -0
  32. benchmarks/prompt_templates.py +86 -0
  33. check_deployment.py +67 -0
  34. official_config.yaml +77 -0
  35. requirements.txt +17 -0
  36. run_evaluation.py +225 -0
  37. run_hf_space.py +39 -0
  38. setup_hf_space.py +244 -0
README.md CHANGED
@@ -1,13 +1,34 @@
  ---
- title: Grok4 Gpqa Eval
- emoji: 🏒
- colorFrom: red
- colorTo: blue
+ title: Grok-4 GPQA Evaluation
+ emoji: 🧠
+ colorFrom: blue
+ colorTo: green
  sdk: gradio
- sdk_version: 5.38.0
- app_file: app.py
+ sdk_version: "4.31.0"
+ app_file: run_hf_space.py
  pinned: false
- license: mit
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # Grok-4 GPQA Evaluation Dashboard
+
+ Real-time evaluation of the Grok-4 model on the GPQA benchmark.
+
+ ## ⚙️ Configuration Required
+
+ Please set these secrets in your Space settings:
+ - **GROK_API_KEY**: Your Grok API key from x.ai
+ - **HF_TOKEN**: Your Hugging Face token (for GPQA dataset access)
+
+ ## 📊 Features
+
+ - Real-time progress tracking
+ - Accuracy metrics and performance stats
+ - Detailed results export
+ - Support for the full GPQA dataset (448 questions)
+
+ ## 🚀 Getting Started
+
+ 1. Set the required secrets in Space settings
+ 2. Make sure your account has access to the gated GPQA dataset
+ 3. The evaluation will start automatically
+ 4. Monitor progress in the dashboard
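Both secrets can also be pushed to the Space from a local machine instead of through the web UI. A minimal sketch, assuming the Space id `your-username/grok4-gpqa-eval` (a placeholder) and that the installed `huggingface_hub` version exposes `add_space_secret`:

import os

from huggingface_hub import HfApi

SPACE_ID = "your-username/grok4-gpqa-eval"   # placeholder - use your own Space id

api = HfApi(token=os.environ["HF_TOKEN"])    # token needs write access to the Space
# The secret names match what app.py's check_environment() looks for.
api.add_space_secret(repo_id=SPACE_ID, key="GROK_API_KEY", value=os.environ["GROK_API_KEY"])
api.add_space_secret(repo_id=SPACE_ID, key="HF_TOKEN", value=os.environ["HF_TOKEN"])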
apis/__init__.py ADDED
File without changes
apis/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (150 Bytes).
apis/__pycache__/anthropic_api.cpython-312.pyc ADDED
Binary file (2.11 kB).
apis/__pycache__/api_factory.cpython-312.pyc ADDED
Binary file (2.47 kB).
apis/__pycache__/base_api.cpython-312.pyc ADDED
Binary file (3.36 kB).
apis/__pycache__/grok_api.cpython-312.pyc ADDED
Binary file (2.48 kB).
apis/__pycache__/openai_api.cpython-312.pyc ADDED
Binary file (2.21 kB).
apis/anthropic_api.py ADDED
@@ -0,0 +1,30 @@
+ import anthropic
+ from .base_api import BaseAPI
+
+ class AnthropicAPI(BaseAPI):
+     """Anthropic API implementation"""
+
+     def __init__(self, api_key: str, model_name: str, **kwargs):
+         super().__init__(api_key, model_name, **kwargs)
+         self.client = anthropic.AsyncAnthropic(api_key=api_key)
+
+     async def generate_response(self, prompt: str, **kwargs) -> str:
+         """Generate response using Anthropic API"""
+         try:
+             response = await self.client.messages.create(
+                 model=self.model_name,
+                 max_tokens=kwargs.get('max_tokens', 2048),
+                 temperature=kwargs.get('temperature', 0.0),
+                 messages=[{"role": "user", "content": prompt}]
+             )
+             return response.content[0].text
+         except Exception as e:
+             raise Exception(f"Anthropic API error: {str(e)}")
+
+     def get_model_info(self) -> dict:
+         """Get model information"""
+         return {
+             "provider": "Anthropic",
+             "model": self.model_name,
+             "api_version": "2023-06-01"
+         }
apis/api_factory.py ADDED
@@ -0,0 +1,71 @@
+ from typing import Dict, Any
+ from .openai_api import OpenAIAPI
+ from .anthropic_api import AnthropicAPI
+ from .grok_api import GrokAPI
+ from .base_api import BaseAPI
+
+ class APIFactory:
+     """Factory class to create API instances based on model name"""
+
+     # Model to provider mapping
+     MODEL_PROVIDERS = {
+         # OpenAI models
+         'gpt-4o': 'openai',
+         'gpt-4-turbo': 'openai',
+         'gpt-3.5-turbo': 'openai',
+
+         # Anthropic models
+         'claude-3-5-sonnet-20241022': 'anthropic',
+         'claude-3-opus-20240229': 'anthropic',
+         'claude-3-haiku-20240307': 'anthropic',
+
+         # Grok models
+         'grok-4-0709': 'grok',
+         'grok-beta': 'grok',
+         'grok-2-latest': 'grok',
+         'grok-vision-beta': 'grok',
+     }
+
+     # Provider to API class mapping
+     PROVIDER_APIS = {
+         'openai': OpenAIAPI,
+         'anthropic': AnthropicAPI,
+         'grok': GrokAPI,
+     }
+
+     @classmethod
+     def create_api(cls, model_name: str, config: Dict[str, Any]) -> BaseAPI:
+         """Create an API instance for the given model"""
+
+         # Determine provider
+         provider = cls.MODEL_PROVIDERS.get(model_name)
+         if not provider:
+             raise ValueError(f"Unknown model: {model_name}")
+
+         # Get provider config
+         provider_config = config['models'].get(provider)
+         if not provider_config:
+             raise ValueError(f"No configuration found for provider: {provider}")
+
+         # Get API key
+         api_key = provider_config.get('api_key')
+         if not api_key:
+             raise ValueError(f"No API key found for provider: {provider}")
+
+         # Get API class
+         api_class = cls.PROVIDER_APIS.get(provider)
+         if not api_class:
+             raise ValueError(f"No API implementation for provider: {provider}")
+
+         # Create API instance with provider-specific kwargs
+         kwargs = {
+             'rate_limit_delay': config['evaluation'].get('rate_limit_delay', 1.0),
+             'max_retries': config['evaluation'].get('max_retries', 3),
+             'timeout': config['evaluation'].get('timeout', 30),
+         }
+
+         # Add provider-specific config
+         if provider == 'grok':
+             kwargs['base_url'] = provider_config.get('base_url', 'https://api.x.ai/v1')
+
+         return api_class(api_key, model_name, **kwargs)
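A usage sketch of the factory. The config dict below only mirrors the shape of official_config.yaml after environment variables have been substituted; the key lookup and prompt string are illustrative:

import asyncio
import os

from apis.api_factory import APIFactory

# Illustrative config shaped like official_config.yaml after env-var substitution.
config = {
    "models": {"grok": {"api_key": os.environ["GROK_API_KEY"], "base_url": "https://api.x.ai/v1"}},
    "evaluation": {"rate_limit_delay": 0.5, "max_retries": 3, "timeout": 60},
}

api = APIFactory.create_api("grok-4-0709", config)
print(api.get_model_info())
print(asyncio.run(api.generate_with_retry("What is 2 + 2?")))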
apis/base_api.py ADDED
@@ -0,0 +1,54 @@
+ from abc import ABC, abstractmethod
+ import time
+ import asyncio
+ from typing import List, Dict, Any, Optional
+
+ class BaseAPI(ABC):
+     """Base class for all API implementations"""
+
+     def __init__(self, api_key: str, model_name: str, **kwargs):
+         self.api_key = api_key
+         self.model_name = model_name
+         self.rate_limit_delay = kwargs.get('rate_limit_delay', 1.0)
+         self.max_retries = kwargs.get('max_retries', 3)
+         self.timeout = kwargs.get('timeout', 30)
+
+     @abstractmethod
+     async def generate_response(self, prompt: str, **kwargs) -> str:
+         """Generate a response from the model"""
+         pass
+
+     async def generate_with_retry(self, prompt: str, **kwargs) -> str:
+         """Generate response with retry logic"""
+         for attempt in range(self.max_retries):
+             try:
+                 response = await self.generate_response(prompt, **kwargs)
+                 return response
+             except Exception as e:
+                 error_str = str(e).lower()
+
+                 # Check if it's a timeout error
+                 if 'timeout' in error_str or 'timed out' in error_str:
+                     # For timeout errors, use longer backoff
+                     max_retries = min(self.max_retries + 2, 5)  # Allow more retries for timeouts
+                     if attempt < max_retries - 1:
+                         backoff = min(60, 5 * (2 ** attempt))  # Max 60 seconds wait
+                         print(f"Timeout error, retrying in {backoff}s... (attempt {attempt + 1}/{max_retries})")
+                         await asyncio.sleep(backoff)
+                         continue
+
+                 # For other errors, use standard backoff
+                 if attempt == self.max_retries - 1:
+                     raise e
+
+                 backoff = min(30, 2 ** attempt)  # Max 30 seconds for other errors
+                 await asyncio.sleep(backoff)
+
+     async def batch_generate(self, prompts: List[str], **kwargs) -> List[str]:
+         """Generate responses for multiple prompts"""
+         responses = []
+         for prompt in prompts:
+             response = await self.generate_with_retry(prompt, **kwargs)
+             responses.append(response)
+             await asyncio.sleep(self.rate_limit_delay)
+         return responses
apis/grok_api.py ADDED
@@ -0,0 +1,42 @@
+ import openai
+ from .base_api import BaseAPI
+
+ class GrokAPI(BaseAPI):
+     """Grok API implementation (uses OpenAI-compatible interface)"""
+
+     def __init__(self, api_key: str, model_name: str, **kwargs):
+         super().__init__(api_key, model_name, **kwargs)
+         self.base_url = kwargs.get('base_url', 'https://api.x.ai/v1')
+         self.client = openai.AsyncOpenAI(
+             api_key=api_key,
+             base_url=self.base_url
+         )
+
+     async def generate_response(self, prompt: str, **kwargs) -> str:
+         """Generate response using Grok API"""
+         try:
+             # Build parameters
+             params = {
+                 "model": self.model_name,
+                 "messages": [{"role": "user", "content": prompt}],
+                 "temperature": kwargs.get('temperature', 0.0),
+                 "timeout": self.timeout
+             }
+
+             # For grok-4-0709, don't set max_tokens to allow full reasoning
+             if self.model_name != 'grok-4-0709':
+                 params['max_tokens'] = kwargs.get('max_tokens', 2048)
+
+             response = await self.client.chat.completions.create(**params)
+             return response.choices[0].message.content
+         except Exception as e:
+             raise Exception(f"Grok API error: {str(e)}")
+
+     def get_model_info(self) -> dict:
+         """Get model information"""
+         return {
+             "provider": "Grok",
+             "model": self.model_name,
+             "api_version": "v1",
+             "base_url": self.base_url
+         }
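A small usage sketch of the shared retry/batch helpers from base_api.py through the Grok client above; the prompt strings and environment lookup are illustrative:

import asyncio
import os

from apis.grok_api import GrokAPI

async def main():
    # GROK_API_KEY is assumed to be set in the environment.
    api = GrokAPI(os.environ["GROK_API_KEY"], "grok-4-0709", rate_limit_delay=0.5, timeout=60)
    # batch_generate() runs the prompts sequentially, retrying each one and
    # sleeping rate_limit_delay seconds between calls.
    answers = await api.batch_generate(["What is 2 + 2?", "Name a noble gas."])
    for answer in answers:
        print(answer)

asyncio.run(main())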
apis/openai_api.py ADDED
@@ -0,0 +1,32 @@
+ import openai
+ from .base_api import BaseAPI
+ import asyncio
+
+ class OpenAIAPI(BaseAPI):
+     """OpenAI API implementation"""
+
+     def __init__(self, api_key: str, model_name: str, **kwargs):
+         super().__init__(api_key, model_name, **kwargs)
+         self.client = openai.AsyncOpenAI(api_key=api_key)
+
+     async def generate_response(self, prompt: str, **kwargs) -> str:
+         """Generate response using OpenAI API"""
+         try:
+             response = await self.client.chat.completions.create(
+                 model=self.model_name,
+                 messages=[{"role": "user", "content": prompt}],
+                 temperature=kwargs.get('temperature', 0.0),
+                 max_tokens=kwargs.get('max_tokens', 2048),
+                 timeout=self.timeout
+             )
+             return response.choices[0].message.content
+         except Exception as e:
+             raise Exception(f"OpenAI API error: {str(e)}")
+
+     def get_model_info(self) -> dict:
+         """Get model information"""
+         return {
+             "provider": "OpenAI",
+             "model": self.model_name,
+             "api_version": "v1"
+         }
app.py ADDED
@@ -0,0 +1,124 @@
+ import gradio as gr
+ import pandas as pd
+ import json
+ import os
+ from datetime import datetime
+ from dotenv import load_dotenv
+ import time
+
+ # Load environment variables
+ load_dotenv()
+
+ RESULTS_DIR = "results"
+ PROGRESS_FILE = os.path.join(RESULTS_DIR, "gpqa_progress.json")
+
+ def load_progress():
+     if not os.path.exists(PROGRESS_FILE):
+         return pd.DataFrame(), "No progress file found. The evaluation might be starting up.", "N/A"
+
+     try:
+         df = pd.read_json(PROGRESS_FILE)
+         if df.empty:
+             return pd.DataFrame(), "Progress file is empty.", "N/A"
+
+         # Calculate metrics
+         total_questions = len(df)
+         correct_answers = df['is_correct'].sum()
+         accuracy = (correct_answers / total_questions) * 100 if total_questions > 0 else 0
+         avg_response_time = df['response_time'].mean()
+
+         summary_text = f"""
+ ## Evaluation Progress
+ - **Questions Processed:** {total_questions} / 448
+ - **Current Accuracy:** {accuracy:.2f}%
+ - **Correct Answers:** {correct_answers}
+ - **Average Response Time:** {avg_response_time:.2f} seconds/question
+ """
+
+         # Get last modified time
+         last_modified_time = datetime.fromtimestamp(os.path.getmtime(PROGRESS_FILE)).strftime('%Y-%m-%d %H:%M:%S')
+
+         return df, summary_text, f"Last updated: {last_modified_time}"
+     except Exception as e:
+         return pd.DataFrame(), f"Error loading progress file: {e}", "N/A"
+
+ def create_ui():
+     df, summary, last_updated = load_progress()
+
+     with gr.Blocks(theme=gr.themes.Soft(), title="GPQA Evaluation Progress") as demo:
+         gr.Markdown("# Real-Time GPQA Evaluation Dashboard")
+         gr.Markdown("This dashboard shows the progress of the GPQA benchmark evaluation for the `grok-4-0709` model.")
+
+         with gr.Row():
+             summary_box = gr.Markdown(summary)
+             last_updated_box = gr.Markdown(last_updated)
+
+         with gr.Row():
+             # Create a simple plot: number of correct vs incorrect answers
+             if not df.empty:
+                 # Build a two-column frame so BarPlot finds the "Answer Status" and "Count" columns
+                 correct_counts = df['is_correct'].value_counts().rename({True: 'Correct', False: 'Incorrect'}).rename_axis("Answer Status").reset_index(name="Count")
+                 plot = gr.BarPlot(correct_counts, x="Answer Status", y="Count", title="Correct vs. Incorrect Answers", interactive=False)
+
+         gr.Markdown("## Raw Results")
+         gr.DataFrame(df, wrap=True)
+
+     return demo
+
+ def check_environment():
+     """Check if all required environment variables are set"""
+     issues = []
+
+     if not os.getenv('GROK_API_KEY'):
+         issues.append("GROK_API_KEY not found in environment")
+
+     if not os.getenv('HF_TOKEN'):
+         issues.append("HF_TOKEN not found (required for GPQA dataset access)")
+
+     return issues
+
+ def start_evaluation_safe():
+     """Safely start the evaluation process with error handling"""
+     issues = check_environment()
+     if issues:
+         print("⚠️ Environment issues detected:")
+         for issue in issues:
+             print(f" - {issue}")
+         print("\nPlease set the required environment variables in .env or Hugging Face Secrets")
+         return None
+
+     import subprocess
+     import sys
+
+     print("Starting background evaluation process...")
+     command = [
+         sys.executable,
+         "run_evaluation.py",
+         "--config", "official_config.yaml",
+         "--models", "grok-4-0709",
+         "--benchmarks", "gpqa"
+     ]
+
+     try:
+         # Use Popen to run in the background
+         process = subprocess.Popen(command)
+         print(f"Evaluation process started with PID: {process.pid}")
+         return process
+     except Exception as e:
+         print(f"Failed to start evaluation: {e}")
+         return None
+
+ if __name__ == "__main__":
+     # Check environment first
+     issues = check_environment()
+
+     if issues:
+         # Create UI with warning message
+         ui = create_ui()
+         print("\n⚠️ Running in demo mode due to missing configuration")
+     else:
+         # Start evaluation process
+         process = start_evaluation_safe()
+         ui = create_ui()
+
+     # Launch the UI
+     ui.launch(server_name="0.0.0.0", server_port=7860)
benchmarks/__init__.py ADDED
@@ -0,0 +1,21 @@
+ from .base_benchmark import BaseBenchmark, BenchmarkResult
+ from .mmlu_benchmark import MMLUBenchmark
+ from .gsm8k_benchmark import GSM8KBenchmark
+ from .humaneval_benchmark import HumanEvalBenchmark
+ from .gpqa_benchmark import GPQABenchmark
+ from .math_benchmark import MATHBenchmark
+
+ BENCHMARK_REGISTRY = {
+     'mmlu': MMLUBenchmark,
+     'gsm8k': GSM8KBenchmark,
+     'humaneval': HumanEvalBenchmark,
+     'gpqa': GPQABenchmark,
+     'math': MATHBenchmark
+ }
+
+ def get_benchmark(name: str) -> BaseBenchmark:
+     """Get benchmark instance by name"""
+     if name.lower() not in BENCHMARK_REGISTRY:
+         raise ValueError(f"Unknown benchmark: {name}. Available: {list(BENCHMARK_REGISTRY.keys())}")
+
+     return BENCHMARK_REGISTRY[name.lower()]()
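A quick sketch of the registry lookup; the returned benchmark still needs an API client and a loaded dataset before it can score anything:

from benchmarks import BENCHMARK_REGISTRY, get_benchmark

print(sorted(BENCHMARK_REGISTRY))        # ['gpqa', 'gsm8k', 'humaneval', 'math', 'mmlu']
bench = get_benchmark("GPQA")            # lookup is case-insensitive
print(bench.name, bench.dataset_name)    # GPQA Idavidrein/gpqa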
benchmarks/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (1.12 kB).
benchmarks/__pycache__/base_benchmark.cpython-312.pyc ADDED
Binary file (5.45 kB).
benchmarks/__pycache__/evaluation_utils.cpython-312.pyc ADDED
Binary file (5.64 kB).
benchmarks/__pycache__/gpqa_benchmark.cpython-312.pyc ADDED
Binary file (4.45 kB).
benchmarks/__pycache__/gsm8k_benchmark.cpython-312.pyc ADDED
Binary file (4.91 kB).
benchmarks/__pycache__/humaneval_benchmark.cpython-312.pyc ADDED
Binary file (6.18 kB).
benchmarks/__pycache__/math_benchmark.cpython-312.pyc ADDED
Binary file (5.41 kB).
benchmarks/__pycache__/mmlu_benchmark.cpython-312.pyc ADDED
Binary file (5.7 kB).
benchmarks/__pycache__/prompt_templates.cpython-312.pyc ADDED
Binary file (4.39 kB).
benchmarks/base_benchmark.py ADDED
@@ -0,0 +1,124 @@
+ from abc import ABC, abstractmethod
+ from typing import List, Dict, Any, Optional, Tuple
+ import asyncio
+ from dataclasses import dataclass
+ import time
+ from tqdm import tqdm
+
+ @dataclass
+ class BenchmarkResult:
+     """Container for benchmark results"""
+     benchmark_name: str
+     model_name: str
+     total_questions: int
+     correct: int
+     accuracy: float
+     avg_response_time: float
+     raw_results: List[Dict[str, Any]]
+
+ class BaseBenchmark(ABC):
+     """Base class for all benchmark implementations"""
+
+     def __init__(self, name: str, dataset_name: str = None):
+         self.name = name
+         self.dataset_name = dataset_name or name
+         self.dataset = None
+         self.results = []
+
+     @abstractmethod
+     async def load_dataset(self, sample_size: Optional[int] = None, **kwargs):
+         """Load the benchmark dataset"""
+         pass
+
+     @abstractmethod
+     async def evaluate_sample(self, api, sample: Dict[str, Any], **kwargs) -> Tuple[bool, Dict[str, Any]]:
+         """Evaluate a single sample"""
+         pass
+
+     @abstractmethod
+     def format_prompt(self, sample: Dict[str, Any]) -> str:
+         """Format the prompt for the model"""
+         pass
+
+     async def run_benchmark(self, api, sample_size: Optional[int] = None, **kwargs) -> BenchmarkResult:
+         """Run the benchmark on the given API"""
+         print(f"Running {self.name} benchmark on {api.model_name}...")
+
+         # Load dataset
+         await self.load_dataset(sample_size, **kwargs)
+
+         if not self.dataset:
+             raise ValueError(f"No dataset loaded for {self.name}")
+
+         # Prepare samples
+         samples = self.dataset if sample_size is None else self.dataset[:sample_size]
+         total_samples = len(samples)
+
+         # Run evaluation
+         correct_count = 0
+         response_times = []
+         raw_results = []
+
+         # Use async semaphore for concurrent requests
+         concurrent_limit = kwargs.get('concurrent_requests', 5)
+         semaphore = asyncio.Semaphore(concurrent_limit)
+
+         async def evaluate_with_semaphore(sample, idx):
+             async with semaphore:
+                 start_time = time.time()
+                 is_correct, result = await self.evaluate_sample(api, sample, **kwargs)
+                 end_time = time.time()
+
+                 result['response_time'] = end_time - start_time
+                 result['index'] = idx
+                 return is_correct, result
+
+         # Create tasks for all samples
+         tasks = [evaluate_with_semaphore(sample, idx) for idx, sample in enumerate(samples)]
+
+         # Run with progress bar
+         # Add imports needed for progress saving
+         import json
+         import os
+
+         with tqdm(total=total_samples, desc=f"{self.name}") as pbar:
+             for coro in asyncio.as_completed(tasks):
+                 is_correct, result = await coro
+
+                 if is_correct:
+                     correct_count += 1
+
+                 response_times.append(result['response_time'])
+                 raw_results.append(result)
+                 pbar.update(1)
+
+                 # --- START: REAL-TIME PROGRESS SAVING ---
+                 # Every 10 samples, save the progress to a file
+                 if pbar.n > 0 and pbar.n % 10 == 0:
+                     # Ensure results directory exists
+                     results_dir = kwargs.get('output_dir', 'results')
+                     os.makedirs(results_dir, exist_ok=True)
+
+                     # Lowercase the benchmark name so the dashboard (app.py) finds results/gpqa_progress.json
+                     progress_path = os.path.join(results_dir, f'{self.name.lower()}_progress.json')
+                     # Sort results by index before saving
+                     sorted_progress = sorted(raw_results, key=lambda x: x['index'])
+                     try:
+                         with open(progress_path, 'w') as f:
+                             json.dump(sorted_progress, f, indent=2)
+                     except Exception as e:
+                         print(f"Error saving progress: {e}")
+                 # --- END: REAL-TIME PROGRESS SAVING ---
+
+         # Calculate metrics
+         accuracy = correct_count / total_samples if total_samples > 0 else 0
+         avg_response_time = sum(response_times) / len(response_times) if response_times else 0
+
+         return BenchmarkResult(
+             benchmark_name=self.name,
+             model_name=api.model_name,
+             total_questions=total_samples,
+             correct=correct_count,
+             accuracy=accuracy,
+             avg_response_time=avg_response_time,
+             raw_results=sorted(raw_results, key=lambda x: x['index'])
+         )
benchmarks/evaluation_utils.py ADDED
@@ -0,0 +1,160 @@
+ """Evaluation utilities matching standard implementations"""
+
+ import re
+ from typing import Optional, Union
+ import numpy as np
+ try:
+     import sympy
+     from sympy.parsing.latex import parse_latex
+     SYMPY_AVAILABLE = True
+ except ImportError:
+     SYMPY_AVAILABLE = False
+
+ def normalize_math_answer(answer: str) -> str:
+     """Normalize mathematical answers following lm-eval's approach"""
+     if not answer:
+         return ""
+
+     # Extract content after equals sign
+     if '=' in answer:
+         answer = answer.split('=')[-1]
+
+     # Remove dollar signs and spaces
+     answer = answer.strip()
+     answer = answer.strip('$')
+
+     # Remove text{} and textbf{}
+     answer = re.sub(r'\\text\{([^}]*)\}', r'\1', answer)
+     answer = re.sub(r'\\textbf\{([^}]*)\}', r'\1', answer)
+
+     # Fix \fracab -> \frac{a}{b}
+     answer = re.sub(r'\\frac([0-9a-zA-Z])([0-9a-zA-Z])', r'\\frac{\1}{\2}', answer)
+
+     # Remove commas from numbers
+     answer = re.sub(r'(\d),', r'\1', answer)
+
+     # Remove specific words
+     for word in ['square', 'units', 'integers', 'dollars', 'mph', 'inches', 'feet', 'minutes', 'cm', 'gm', 'pounds', 'meters', 'meals', 'edges', 'students', 'childrentickets', 'multiples', 'hours', 'degrees', 'ounces', 'bits', 'factorization', 'greenmarbles', 'redmarbles', 'bluemarbles']:
+         answer = answer.replace(word, '')
+
+     # Remove extra spaces
+     answer = ' '.join(answer.split())
+
+     return answer.strip()
+
+ def extract_answer_gsm8k(response: str) -> Optional[float]:
+     """Extract answer from GSM8K response following official format"""
+     # Look for the last number in the response
+     numbers = re.findall(r'[-+]?\d*\.?\d+', response)
+     if numbers:
+         try:
+             return float(numbers[-1])
+         except:
+             pass
+     return None
+
+ def extract_answer_mmlu(response: str) -> Optional[str]:
+     """Extract MMLU answer following official format"""
+     # Clean response
+     response = response.strip()
+
+     # Look for single letter answer
+     if len(response) == 1 and response in 'ABCD':
+         return response
+
+     # Look for letter followed by parenthesis or period
+     match = re.search(r'^([ABCD])[).\s]', response)
+     if match:
+         return match.group(1)
+
+     # Look for "answer is X" pattern
+     match = re.search(r'answer is ([ABCD])', response, re.IGNORECASE)
+     if match:
+         return match.group(1).upper()
+
+     # Look for first occurrence of A, B, C, or D
+     match = re.search(r'[ABCD]', response)
+     if match:
+         return match.group(0)
+
+     return None
+
+ def calculate_accuracy_with_confidence(results: list) -> dict:
+     """Calculate accuracy with confidence intervals"""
+     correct = sum(1 for r in results if r.get('is_correct', False))
+     total = len(results)
+
+     if total == 0:
+         return {
+             'accuracy': 0.0,
+             'correct': 0,
+             'total': 0,
+             'confidence_interval': (0.0, 0.0)
+         }
+
+     accuracy = correct / total
+
+     # Wilson score interval for binomial proportion
+     z = 1.96  # 95% confidence
+     n = total
+     p = accuracy
+
+     denominator = 1 + z**2 / n
+     center = (p + z**2 / (2*n)) / denominator
+     margin = z * np.sqrt(p * (1-p) / n + z**2 / (4*n**2)) / denominator
+
+     lower = max(0, center - margin)
+     upper = min(1, center + margin)
+
+     return {
+         'accuracy': accuracy,
+         'correct': correct,
+         'total': total,
+         'confidence_interval': (lower, upper)
+     }
+
+ def is_math_equiv(pred: str, gold: str) -> bool:
+     """Check mathematical equivalence using SymPy (matching lm-eval)"""
+     # First normalize both answers
+     pred_norm = normalize_math_answer(pred)
+     gold_norm = normalize_math_answer(gold)
+
+     # Quick string comparison
+     if pred_norm == gold_norm:
+         return True
+
+     if not SYMPY_AVAILABLE:
+         # Fallback to string comparison
+         return pred_norm == gold_norm
+
+     try:
+         # Try to parse as LaTeX
+         try:
+             pred_expr = parse_latex(pred_norm)
+             gold_expr = parse_latex(gold_norm)
+         except:
+             # Try parsing as regular SymPy expression
+             pred_expr = sympy.sympify(pred_norm)
+             gold_expr = sympy.sympify(gold_norm)
+
+         # Check if expressions are equivalent
+         diff = sympy.simplify(pred_expr - gold_expr)
+         return diff == 0 or diff.is_zero
+
+     except Exception:
+         # If parsing fails, fall back to string comparison
+         return pred_norm == gold_norm
+
+ def is_gsm8k_correct(pred: str, gold: str) -> bool:
+     """Check GSM8K answer correctness"""
+     if pred == gold:
+         return True
+
+     try:
+         # Try numeric comparison
+         pred_num = float(pred)
+         gold_num = float(gold)
+         # GSM8K uses exact match, but we allow tiny floating point errors
+         return abs(pred_num - gold_num) < 1e-9
+     except:
+         return False
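A short sanity check of the extraction and confidence helpers on toy inputs (not real evaluation data):

from benchmarks.evaluation_utils import extract_answer_mmlu, calculate_accuracy_with_confidence

# Letter extraction falls through several patterns; "C) ..." hits the second one.
print(extract_answer_mmlu("C) Quantum tunneling"))        # -> C

# Wilson 95% interval around a toy 7/10 result.
results = [{"is_correct": True}] * 7 + [{"is_correct": False}] * 3
stats = calculate_accuracy_with_confidence(results)
print(stats["accuracy"], stats["confidence_interval"])    # 0.7, roughly (0.40, 0.89)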
benchmarks/gpqa_benchmark.py ADDED
@@ -0,0 +1,126 @@
+ from .base_benchmark import BaseBenchmark
+ from typing import Dict, Any, Optional, Tuple
+ from datasets import load_dataset
+ import re
+ import random
+ from .evaluation_utils import extract_answer_mmlu
+
+ class GPQABenchmark(BaseBenchmark):
+     """GPQA (Graduate-Level Google-Proof Q&A) benchmark"""
+
+     def __init__(self):
+         super().__init__(name="GPQA", dataset_name="Idavidrein/gpqa")
+
+     async def load_dataset(self, sample_size: Optional[int] = None, **kwargs):
+         """Load GPQA dataset"""
+         # GPQA has different subsets: gpqa_main, gpqa_diamond, gpqa_extended
+         subset = kwargs.get('subset', 'gpqa_main')
+
+         try:
+             # Set HF token if available
+             import os
+             hf_token = os.getenv('HF_TOKEN') or os.getenv('HUGGING_FACE_HUB_TOKEN')
+             if hf_token:
+                 dataset = load_dataset(self.dataset_name, subset, split='train', token=hf_token)
+             else:
+                 dataset = load_dataset(self.dataset_name, subset, split='train')
+         except Exception as e:
+             if "gated dataset" in str(e) or "authentication" in str(e).lower():
+                 raise Exception(
+                     "GPQA dataset requires authentication. Please:\n"
+                     "1. Set HF_TOKEN environment variable\n"
+                     "2. Request access at https://huggingface.co/datasets/Idavidrein/gpqa\n"
+                     f"Original error: {e}"
+                 )
+             # Fallback to main if subset not found
+             try:
+                 dataset = load_dataset(self.dataset_name, 'gpqa_main', split='train')
+             except:
+                 raise e
+
+         self.dataset = []
+         for sample in dataset:
+             # GPQA has these fields: Question, Correct Answer, Incorrect Answer 1-3
+             choices = [
+                 sample.get('Correct Answer', ''),
+                 sample.get('Incorrect Answer 1', ''),
+                 sample.get('Incorrect Answer 2', ''),
+                 sample.get('Incorrect Answer 3', '')
+             ]
+
+             # Shuffle choices and track correct index
+             import random
+             indices = list(range(4))
+             random.shuffle(indices)
+             shuffled_choices = [choices[i] for i in indices]
+             correct_index = indices.index(0)  # 0 was the correct answer position
+
+             self.dataset.append({
+                 'question': sample['Question'],
+                 'choices': shuffled_choices,
+                 'correct_index': correct_index,
+                 'subject': sample.get('Subdomain', 'Unknown'),
+                 'raw_sample': sample
+             })
+
+         # Shuffle dataset
+         random.shuffle(self.dataset)
+
+         if sample_size and len(self.dataset) > sample_size:
+             self.dataset = self.dataset[:sample_size]
+
+     def format_prompt(self, sample: Dict[str, Any]) -> str:
+         """Format GPQA question as prompt matching official format"""
+         question = sample['question']
+         choices = sample['choices']
+
+         # GPQA uses a simpler format in lm-eval
+         prompt = f"""What is the correct answer to this question: {question}
+
+ Choices:
+ (A) {choices[0]}
+ (B) {choices[1]}
+ (C) {choices[2]}
+ (D) {choices[3]}
+
+ Answer:"""
+         return prompt
+
+     async def evaluate_sample(self, api, sample: Dict[str, Any], **kwargs) -> Tuple[bool, Dict[str, Any]]:
+         """Evaluate a single GPQA sample"""
+         prompt = self.format_prompt(sample)
+
+         try:
+             response = await api.generate_with_retry(prompt, **kwargs)
+
+             # Extract answer from response using standard extraction
+             predicted_letter = extract_answer_mmlu(response)
+
+             if predicted_letter:
+                 predicted_index = ord(predicted_letter) - ord('A')
+             else:
+                 # If no clear answer, mark as incorrect
+                 predicted_index = -1
+
+             correct_index = sample['correct_index']
+             is_correct = predicted_index == correct_index
+
+             result = {
+                 'question': sample['question'],
+                 'choices': sample['choices'],
+                 'correct_answer': correct_index,
+                 'predicted_answer': predicted_index,
+                 'model_response': response,
+                 'is_correct': is_correct,
+                 'subject': sample['subject']
+             }
+
+             return is_correct, result
+
+         except Exception as e:
+             result = {
+                 'question': sample['question'],
+                 'error': str(e),
+                 'is_correct': False
+             }
+             return False, result
benchmarks/gsm8k_benchmark.py ADDED
@@ -0,0 +1,115 @@
+ from .base_benchmark import BaseBenchmark
+ from typing import Dict, Any, Optional, Tuple
+ from datasets import load_dataset
+ import re
+ from .prompt_templates import get_gsm8k_cot_prompt
+
+ class GSM8KBenchmark(BaseBenchmark):
+     """GSM8K (Grade School Math 8K) benchmark"""
+
+     def __init__(self):
+         super().__init__(name="GSM8K", dataset_name="gsm8k")
+
+     async def load_dataset(self, sample_size: Optional[int] = None, **kwargs):
+         """Load GSM8K dataset"""
+         dataset = load_dataset(self.dataset_name, 'main', split='test')
+
+         self.dataset = []
+         for sample in dataset:
+             self.dataset.append({
+                 'question': sample['question'],
+                 'answer': sample['answer'],
+                 'raw_sample': sample
+             })
+
+         # Shuffle dataset
+         import random
+         random.shuffle(self.dataset)
+
+         if sample_size and len(self.dataset) > sample_size:
+             self.dataset = self.dataset[:sample_size]
+
+     def extract_answer_from_solution(self, solution: str) -> Optional[str]:
+         """Extract numerical answer from GSM8K solution string"""
+         # GSM8K answers are in format: "... #### number"
+         match = re.search(r'#### ([\-0-9\.\,]+)', solution)
+         if match:
+             answer_str = match.group(1).replace(',', '')
+             return answer_str
+         return None
+
+     def extract_number_from_response(self, response: str) -> Optional[str]:
+         """Extract the final numerical answer from model response"""
+         # Official lm-eval uses these patterns in order:
+
+         # 1. Look for "The answer is X" pattern (CoT standard)
+         match = re.search(r'The answer is ([\-0-9\.\,]+)\.?', response, re.IGNORECASE)
+         if match:
+             # Strip commas and any trailing period captured by the character class
+             return match.group(1).replace(',', '').rstrip('.')
+
+         # 2. Look for #### format (if model knows GSM8K format)
+         match = re.search(r'#### ([\-0-9\.\,]+)', response)
+         if match:
+             return match.group(1).replace(',', '')
+
+         # 3. Flexible extraction: find all numbers and take the last one
+         # This matches lm-eval's flexible-extract with group_select: -1
+         numbers = re.findall(r'(-?[$0-9.,]{2,})|(-?[0-9]+)', response)
+         if numbers:
+             # Flatten tuples and get last non-empty match
+             flat_numbers = [n for group in numbers for n in group if n]
+             if flat_numbers:
+                 last_number = flat_numbers[-1]
+                 # Clean the number
+                 cleaned = last_number.replace('$', '').replace(',', '')
+                 try:
+                     # Validate it's a proper number
+                     float(cleaned)
+                     return cleaned
+                 except:
+                     pass
+
+         return None
+
+     def format_prompt(self, sample: Dict[str, Any]) -> str:
+         """Format GSM8K question as prompt with CoT examples"""
+         # Use the standard CoT prompt from lm-eval
+         return get_gsm8k_cot_prompt(sample['question'])
+
+     async def evaluate_sample(self, api, sample: Dict[str, Any], **kwargs) -> Tuple[bool, Dict[str, Any]]:
+         """Evaluate a single GSM8K sample"""
+         prompt = self.format_prompt(sample)
+
+         try:
+             response = await api.generate_with_retry(prompt, **kwargs)
+
+             # Extract correct answer
+             correct_answer = self.extract_answer_from_solution(sample['answer'])
+
+             # Extract model's answer
+             model_answer = self.extract_number_from_response(response)
+
+             # Check if answers match (exact string match after normalization)
+             is_correct = False
+             if correct_answer is not None and model_answer is not None:
+                 # GSM8K uses exact match on normalized strings
+                 is_correct = correct_answer == model_answer
+
+             result = {
+                 'question': sample['question'],
+                 'correct_answer': correct_answer,
+                 'model_answer': model_answer,
+                 'model_response': response,
+                 'is_correct': is_correct,
+                 'solution': sample['answer']
+             }
+
+             return is_correct, result
+
+         except Exception as e:
+             result = {
+                 'question': sample['question'],
+                 'error': str(e),
+                 'is_correct': False
+             }
+             return False, result
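For illustration, the extraction helpers applied to toy strings (assuming the module as shown above):

from benchmarks.gsm8k_benchmark import GSM8KBenchmark

bench = GSM8KBenchmark()
reply = "She spent 5 * 3 = 15 dollars, so 23 - 15 = 8 dollars are left. The answer is 8."
print(bench.extract_number_from_response(reply))              # -> 8
print(bench.extract_answer_from_solution("... #### 1,234"))   # -> 1234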
benchmarks/humaneval_benchmark.py ADDED
@@ -0,0 +1,134 @@
+ from .base_benchmark import BaseBenchmark
+ from typing import Dict, Any, Optional, Tuple
+ from datasets import load_dataset
+ import subprocess
+ import tempfile
+ import os
+ import sys
+ import re
+
+ class HumanEvalBenchmark(BaseBenchmark):
+     """HumanEval code generation benchmark"""
+
+     def __init__(self):
+         super().__init__(name="HumanEval", dataset_name="openai_humaneval")
+
+     async def load_dataset(self, sample_size: Optional[int] = None, **kwargs):
+         """Load HumanEval dataset"""
+         dataset = load_dataset(self.dataset_name, split='test')
+
+         self.dataset = []
+         for sample in dataset:
+             self.dataset.append({
+                 'task_id': sample['task_id'],
+                 'prompt': sample['prompt'],
+                 'canonical_solution': sample['canonical_solution'],
+                 'test': sample['test'],
+                 'entry_point': sample['entry_point'],
+                 'raw_sample': sample
+             })
+
+         if sample_size and len(self.dataset) > sample_size:
+             self.dataset = self.dataset[:sample_size]
+
+     def format_prompt(self, sample: Dict[str, Any]) -> str:
+         """Format HumanEval problem as prompt"""
+         # lm-eval uses just the raw prompt without additional instructions
+         return sample['prompt']
+
+     def extract_code(self, response: str, entry_point: str, prompt: str) -> str:
+         """Extract code from model response"""
+         # Clean the response - handle markdown code blocks
+         code = response.strip()
+
+         # Remove markdown code block markers
+         if code.startswith('```python'):
+             code = code[9:]  # Remove ```python
+         elif code.startswith('```'):
+             code = code[3:]  # Remove ```
+
+         if code.endswith('```'):
+             code = code[:-3]  # Remove trailing ```
+
+         code = code.strip()
+
+         # If the response contains the complete function, use it directly
+         if f"def {entry_point}" in code:
+             return code
+         else:
+             # Fallback: assume it's completion to be added after prompt
+             stop_sequences = ['\nclass', '\ndef', '\n#', '\nif __name__']
+
+             for stop in stop_sequences:
+                 pos = code.find(stop)
+                 if pos > 0:
+                     code = code[:pos]
+                     break
+
+             return prompt + code
+
+     def run_test(self, code: str, test_code: str, entry_point: str) -> Tuple[bool, str]:
+         """Run the test code and return success status and output"""
+         with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as f:
+             # Write the program plus its tests; HumanEval's test code defines
+             # check() but never calls it, so invoke it explicitly here
+             f.write(code + '\n\n' + test_code + f'\n\ncheck({entry_point})\n')
+             f.flush()
+
+             try:
+                 # Run the test
+                 result = subprocess.run(
+                     [sys.executable, f.name],
+                     capture_output=True,
+                     text=True,
+                     timeout=10
+                 )
+
+                 if result.returncode == 0:
+                     return True, result.stdout
+                 else:
+                     return False, result.stderr
+
+             except subprocess.TimeoutExpired:
+                 return False, "Timeout: Code execution took too long"
+             except Exception as e:
+                 return False, f"Error running test: {str(e)}"
+             finally:
+                 # Clean up
+                 try:
+                     os.unlink(f.name)
+                 except:
+                     pass
+
+     async def evaluate_sample(self, api, sample: Dict[str, Any], **kwargs) -> Tuple[bool, Dict[str, Any]]:
+         """Evaluate a single HumanEval sample"""
+         prompt = self.format_prompt(sample)
+
+         try:
+             response = await api.generate_with_retry(prompt, **kwargs)
+
+             # Extract code from response
+             code = self.extract_code(response, sample['entry_point'], sample['prompt'])
+
+             # Run the test
+             is_correct, test_output = self.run_test(code, sample['test'], sample['entry_point'])
+
+             result = {
+                 'task_id': sample['task_id'],
+                 'prompt': sample['prompt'],
+                 'model_response': response,
+                 'extracted_code': code,
+                 'is_correct': is_correct,
+                 'test_output': test_output,
+                 'entry_point': sample['entry_point']
+             }
+
+             return is_correct, result
+
+         except Exception as e:
+             result = {
+                 'task_id': sample['task_id'],
+                 'prompt': sample['prompt'],
+                 'error': str(e),
+                 'is_correct': False
+             }
+             return False, result
benchmarks/math_benchmark.py ADDED
@@ -0,0 +1,125 @@
+ from .base_benchmark import BaseBenchmark
+ from typing import Dict, Any, Optional, Tuple
+ from datasets import load_dataset
+ import re
+ from .evaluation_utils import normalize_math_answer, is_math_equiv
+
+ class MATHBenchmark(BaseBenchmark):
+     """MATH (Mathematics) benchmark for competition-level problems"""
+
+     LEVELS = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5']
+     TYPES = ['Algebra', 'Counting & Probability', 'Geometry', 'Intermediate Algebra',
+              'Number Theory', 'Prealgebra', 'Precalculus']
+
+     def __init__(self):
+         super().__init__(name="MATH", dataset_name="hendrycks/competition_math")
+
+     async def load_dataset(self, sample_size: Optional[int] = None, **kwargs):
+         """Load MATH dataset"""
+         dataset = load_dataset(self.dataset_name, split='test')
+
+         # Filter by difficulty level if specified
+         difficulty_levels = kwargs.get('difficulty', ['all'])
+         if 'all' not in difficulty_levels:
+             dataset = dataset.filter(lambda x: x['level'] in difficulty_levels)
+
+         self.dataset = []
+         for sample in dataset:
+             self.dataset.append({
+                 'problem': sample['problem'],
+                 'solution': sample['solution'],
+                 'level': sample['level'],
+                 'type': sample['type'],
+                 'raw_sample': sample
+             })
+
+         # Shuffle dataset
+         import random
+         random.shuffle(self.dataset)
+
+         if sample_size and len(self.dataset) > sample_size:
+             self.dataset = self.dataset[:sample_size]
+
+     def extract_answer(self, solution: str) -> Optional[str]:
+         """Extract the final answer from MATH solution using lm-eval's method"""
+         # Find all boxed content
+         boxed_matches = re.findall(r'\\boxed\{([^{}]*)\}', solution)
+         fbox_matches = re.findall(r'\\fbox\{([^{}]*)\}', solution)
+
+         all_matches = boxed_matches + fbox_matches
+
+         if all_matches:
+             # Return the last boxed answer
+             return all_matches[-1].strip()
+
+         return None
+
+     def extract_model_answer(self, response: str) -> Optional[str]:
+         """Extract answer from model response"""
+         # Try to find boxed answer first
+         answer = self.extract_answer(response)
+         if answer:
+             return answer
+
+         # If no boxed answer, look for common patterns
+         # "The answer is X"
+         match = re.search(r'answer is[\s:]*([^.\n]+)', response, re.IGNORECASE)
+         if match:
+             return match.group(1).strip()
+
+         # "Therefore, X"
+         match = re.search(r'therefore[,\s]+([^.\n]+)', response, re.IGNORECASE)
+         if match:
+             return match.group(1).strip()
+
+         return None
+
+     def format_prompt(self, sample: Dict[str, Any]) -> str:
+         """Format MATH problem as prompt"""
+         prompt = f"""Solve the following mathematics problem step by step. Show all your work and put your final answer in the format \\boxed{{answer}}.
+
+ Problem: {sample['problem']}
+
+ Solution:"""
+         return prompt
+
+     async def evaluate_sample(self, api, sample: Dict[str, Any], **kwargs) -> Tuple[bool, Dict[str, Any]]:
+         """Evaluate a single MATH sample"""
+         prompt = self.format_prompt(sample)
+
+         try:
+             response = await api.generate_with_retry(prompt, **kwargs)
+
+             # Extract correct answer
+             correct_answer = self.extract_answer(sample['solution'])
+
+             # Extract model's answer
+             model_answer = self.extract_model_answer(response)
+
+             # Compare answers using mathematical equivalence
+             is_correct = False
+             if correct_answer and model_answer:
+                 # Use the official equivalence checking
+                 is_correct = is_math_equiv(model_answer, correct_answer)
+
+             result = {
+                 'problem': sample['problem'],
+                 'level': sample['level'],
+                 'type': sample['type'],
+                 'correct_answer': correct_answer,
+                 'model_answer': model_answer,
+                 'model_response': response,
+                 'is_correct': is_correct
+             }
+
+             return is_correct, result
+
+         except Exception as e:
+             result = {
+                 'problem': sample['problem'],
+                 'level': sample['level'],
+                 'type': sample['type'],
+                 'error': str(e),
+                 'is_correct': False
+             }
+             return False, result
benchmarks/mmlu_benchmark.py ADDED
@@ -0,0 +1,134 @@
+ from .base_benchmark import BaseBenchmark
+ from typing import Dict, Any, Optional, Tuple, List
+ from datasets import load_dataset
+ import re
+ import random
+ from .prompt_templates import get_mmlu_prompt
+ from .evaluation_utils import extract_answer_mmlu
+
+ class MMLUBenchmark(BaseBenchmark):
+     """MMLU (Massive Multitask Language Understanding) benchmark"""
+
+     SUBJECTS = [
+         'abstract_algebra', 'anatomy', 'astronomy', 'business_ethics',
+         'clinical_knowledge', 'college_biology', 'college_chemistry',
+         'college_computer_science', 'college_mathematics', 'college_medicine',
+         'college_physics', 'computer_security', 'conceptual_physics',
+         'econometrics', 'electrical_engineering', 'elementary_mathematics',
+         'formal_logic', 'global_facts', 'high_school_biology',
+         'high_school_chemistry', 'high_school_computer_science',
+         'high_school_european_history', 'high_school_geography',
+         'high_school_government_and_politics', 'high_school_macroeconomics',
+         'high_school_mathematics', 'high_school_microeconomics',
+         'high_school_physics', 'high_school_psychology', 'high_school_statistics',
+         'high_school_us_history', 'high_school_world_history', 'human_aging',
+         'human_sexuality', 'international_law', 'jurisprudence',
+         'logical_fallacies', 'machine_learning', 'management', 'marketing',
+         'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios',
+         'nutrition', 'philosophy', 'prehistory', 'professional_accounting',
+         'professional_law', 'professional_medicine', 'professional_psychology',
+         'public_relations', 'security_studies', 'sociology', 'us_foreign_policy',
+         'virology', 'world_religions'
+     ]
+
+     def __init__(self):
+         super().__init__(name="MMLU", dataset_name="cais/mmlu")
+
+     async def load_dataset(self, sample_size: Optional[int] = None, **kwargs):
+         """Load MMLU dataset"""
+         subjects = kwargs.get('subjects', ['all'])
+
+         if 'all' in subjects:
+             subjects = self.SUBJECTS
+         else:
+             subjects = [s for s in subjects if s in self.SUBJECTS]
+
+         self.dataset = []
+         self.few_shot_examples = {}  # Store few-shot examples per subject
+
+         for subject in subjects:
+             try:
+                 # Load dev split for few-shot examples
+                 dev_ds = load_dataset(self.dataset_name, subject, split='dev')
+                 # Standard MMLU uses 5-shot
+                 self.few_shot_examples[subject] = [
+                     {
+                         'question': ex['question'],
+                         'choices': ex['choices'],
+                         'answer': ex['answer']
+                     }
+                     for ex in list(dev_ds)[:5]
+                 ]
+
+                 # Load test split for evaluation
+                 test_ds = load_dataset(self.dataset_name, subject, split='test')
+
+                 for sample in test_ds:
+                     self.dataset.append({
+                         'subject': subject,
+                         'question': sample['question'],
+                         'choices': sample['choices'],
+                         'answer': sample['answer'],  # 0-3 index
+                         'raw_sample': sample
+                     })
+             except Exception as e:
+                 print(f"Error loading {subject}: {e}")
+                 continue
+
+         # Shuffle dataset
+         random.shuffle(self.dataset)
+
+         if sample_size and len(self.dataset) > sample_size:
+             self.dataset = self.dataset[:sample_size]
+
+     def format_prompt(self, sample: Dict[str, Any]) -> str:
+         """Format MMLU question as prompt with few-shot examples"""
+         subject = sample['subject']
+         few_shot_examples = self.few_shot_examples.get(subject, [])
+
+         return get_mmlu_prompt(
+             question=sample['question'],
+             choices=sample['choices'],
+             subject=subject.replace('_', ' ').title(),
+             few_shot_examples=few_shot_examples
+         )
+
+     async def evaluate_sample(self, api, sample: Dict[str, Any], **kwargs) -> Tuple[bool, Dict[str, Any]]:
+         """Evaluate a single MMLU sample"""
+         prompt = self.format_prompt(sample)
+
+         try:
+             response = await api.generate_with_retry(prompt, **kwargs)
+
+             # Extract answer from response using standard extraction
+             predicted_letter = extract_answer_mmlu(response)
+
+             if predicted_letter:
+                 predicted_index = ord(predicted_letter) - ord('A')
+             else:
+                 # If no clear answer, mark as incorrect
+                 predicted_index = -1
+
+             correct_index = sample['answer']
+             is_correct = predicted_index == correct_index
+
+             result = {
+                 'subject': sample['subject'],
+                 'question': sample['question'],
+                 'choices': sample['choices'],
+                 'correct_answer': correct_index,
+                 'predicted_answer': predicted_index,
+                 'model_response': response,
+                 'is_correct': is_correct
+             }
+
+             return is_correct, result
+
+         except Exception as e:
+             result = {
+                 'subject': sample['subject'],
+                 'question': sample['question'],
+                 'error': str(e),
+                 'is_correct': False
+             }
+             return False, result
benchmarks/prompt_templates.py ADDED
@@ -0,0 +1,86 @@
+ """Standard prompt templates matching lm-eval implementation"""
+
+ MMLU_PROMPT_TEMPLATE = """The following are multiple choice questions (with answers) about {subject}.
+
+ {few_shot_examples}
+ Question: {question}
+ A) {choice_a}
+ B) {choice_b}
+ C) {choice_c}
+ D) {choice_d}
+ Answer:"""
+
+ MMLU_FEW_SHOT_TEMPLATE = """Question: {question}
+ A) {choice_a}
+ B) {choice_b}
+ C) {choice_c}
+ D) {choice_d}
+ Answer: {answer}
+
+ """
+
+ GSM8K_PROMPT_TEMPLATE = """Question: {question}
+ Let's think step by step.
+ """
+
+ GSM8K_COT_TEMPLATE = """Q: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?
+ A: We start with 15 trees. Later we have 21 trees. The difference must be the number of trees they planted. So, they must have planted 21 - 15 = 6 trees. The answer is 6.
+
+ Q: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?
+ A: There are 3 cars in the parking lot already. 2 more arrive. Now there are 3 + 2 = 5 cars. The answer is 5.
+
+ Q: Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?
+ A: Leah had 32 chocolates and Leah's sister had 42. That means there were originally 32 + 42 = 74 chocolates. 35 have been eaten. So in total they still have 74 - 35 = 39 chocolates. The answer is 39.
+
+ Q: Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. How many lollipops did Jason give to Denny?
+ A: Jason had 20 lollipops. Since he only has 12 now, he must have given the rest to Denny. The number of lollipops he gave to Denny must have been 20 - 12 = 8 lollipops. The answer is 8.
+
+ Q: Shawn has five toys. For Christmas, he got two toys each from his mom and dad. How many toys does he have now?
+ A: He has 5 toys. He got 2 from mom, so after that he has 5 + 2 = 7 toys. Then he got 2 more from dad, so in total he has 7 + 2 = 9 toys. The answer is 9.
+
+ Q: There were nine computers in the server room. Five more computers were installed each day, from monday to thursday. How many computers are now in the server room?
+ A: There are 4 days from monday to thursday. 5 computers were added each day. That means in total 4 * 5 = 20 computers were added. There were 9 computers in the beginning, so now there are 9 + 20 = 29 computers. The answer is 29.
+
+ Q: Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?
+ A: Michael initially had 58 balls. He lost 23 on Tuesday, so after that he has 58 - 23 = 35 balls. On Wednesday he lost 2 more so now he has 35 - 2 = 33 balls. The answer is 33.
+
+ Q: Olivia has $23. She bought five bagels for $3 each. How much money does she have left?
+ A: She bought 5 bagels for $3 each. This means she spent 5 * $3 = $15 on the bagels. She had $23 in beginning, so now she has $23 - $15 = $8. The answer is 8.
+
+ Q: {question}
+ A:"""
+
+ HUMANEVAL_PROMPT_TEMPLATE = """Complete the following Python function:
+
+ {prompt}"""
+
+ def get_mmlu_prompt(question, choices, subject="", few_shot_examples=None):
+     """Generate MMLU prompt with few-shot examples"""
+     if not few_shot_examples:
+         few_shot_examples = []
+
+     # Format few-shot examples
+     examples = ""
+     for ex in few_shot_examples:
+         examples += MMLU_FEW_SHOT_TEMPLATE.format(
+             question=ex['question'],
+             choice_a=ex['choices'][0],
+             choice_b=ex['choices'][1],
+             choice_c=ex['choices'][2],
+             choice_d=ex['choices'][3],
+             answer=chr(ord('A') + ex['answer'])
+         )
+
+     return MMLU_PROMPT_TEMPLATE.format(
+         subject=subject,
+         few_shot_examples=examples.rstrip(),  # Remove trailing newline
+         question=question,
+         choice_a=choices[0],
+         choice_b=choices[1],
+         choice_c=choices[2],
+         choice_d=choices[3]
+     )
+
+ def get_gsm8k_cot_prompt(question):
+     """Generate GSM8K prompt with Chain-of-Thought examples"""
+     return GSM8K_COT_TEMPLATE.format(question=question)
check_deployment.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Pre-deployment checklist for HF Space
4
+ """
5
+
6
+ import os
7
+ import sys
8
+ from pathlib import Path
9
+
10
+ def check_deployment_ready():
11
+ """Check if everything is ready for HF deployment"""
12
+
13
+ print("πŸ” Pre-deployment checklist:\n")
14
+
15
+ checks = []
16
+
17
+ # Check files exist
18
+ required_files = [
19
+ "app.py",
20
+ "run_evaluation.py",
21
+ "requirements.txt",
22
+ ".env.example",
23
+ "run_hf_space.py",
24
+ "official_config.yaml"
25
+ ]
26
+
27
+ for file in required_files:
28
+ if Path(file).exists():
29
+ checks.append((f"βœ… {file} exists", True))
30
+ else:
31
+ checks.append((f"❌ {file} missing", False))
32
+
33
+ # Check API directories
34
+ if Path("apis").is_dir() and list(Path("apis").glob("*.py")):
35
+ checks.append(("βœ… APIs directory configured", True))
36
+ else:
37
+ checks.append(("❌ APIs directory missing or empty", False))
38
+
39
+ # Check benchmarks directory
40
+ if Path("benchmarks").is_dir() and Path("benchmarks/gpqa_benchmark.py").exists():
41
+ checks.append(("βœ… GPQA benchmark implementation found", True))
42
+ else:
43
+ checks.append(("❌ GPQA benchmark missing", False))
44
+
45
+ # Check for sensitive data
46
+ if Path(".env").exists():
47
+ checks.append(("⚠️ .env file exists - make sure it's in .gitignore!", None))
48
+
49
+ # Print results
50
+ for check, status in checks:
51
+ print(check)
52
+
53
+ all_good = all(status is not False for _, status in checks)
54
+
55
+ if all_good:
56
+ print("\nβœ… Ready for deployment!")
57
+ print("\nNext steps:")
58
+ print("1. Set GROK_API_KEY and HF_TOKEN in HF Space secrets")
59
+ print("2. Make sure you have GPQA dataset access")
60
+ print("3. Push to Hugging Face")
61
+ else:
62
+ print("\n❌ Issues found - please fix before deploying")
63
+
64
+ return all_good
65
+
66
+ if __name__ == "__main__":
67
+ check_deployment_ready()
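
The script uses a simple (label, status) convention: True/False feeds the final verdict, while None marks a warning. A possible extension, sketched under that same convention, would also warn when the secrets named elsewhere in this repo are missing locally (GROK_API_KEY and HF_TOKEN come from the surrounding files; the helper itself is hypothetical):

    import os

    def check_secrets(checks):
        # Append (label, status) tuples; None keeps the item as a warning only,
        # since on Hugging Face the values come from Space secrets, not .env.
        for var in ("GROK_API_KEY", "HF_TOKEN"):
            if os.getenv(var):
                checks.append((f"βœ… {var} is set", True))
            else:
                checks.append((f"⚠️ {var} not set locally (set it in HF Space secrets)", None))
        return checks
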
official_config.yaml ADDED
@@ -0,0 +1,77 @@
1
+ # Official benchmark configuration matching lm-eval settings
2
+ models:
3
+ openai:
4
+ api_key: "${OPENAI_API_KEY}"
5
+ models:
6
+ - "gpt-4o"
7
+ - "gpt-4-turbo"
8
+ - "gpt-3.5-turbo"
9
+ anthropic:
10
+ api_key: "${ANTHROPIC_API_KEY}"
11
+ models:
12
+ - "claude-3-5-sonnet-20241022"
13
+ - "claude-3-opus-20240229"
14
+ - "claude-3-haiku-20240307"
15
+ grok:
16
+ api_key: "${GROK_API_KEY}"
17
+ base_url: "https://api.x.ai/v1"
18
+ models:
19
+ - "grok-4-0709"
20
+ - "grok-beta"
21
+ - "grok-2-latest"
22
+
23
+ benchmarks:
24
+ mmlu:
25
+ enabled: true
26
+ sample_size: null # Use full dataset
27
+ subjects: ["all"]
28
+ # Official settings
29
+ num_fewshot: 5
30
+ doc_to_choice: ["A", "B", "C", "D"]
31
+
32
+ gsm8k:
33
+ enabled: true
34
+ sample_size: null # Full test set (1319 samples)
35
+ # Official settings
36
+ num_fewshot: 8 # 8-shot CoT
37
+ use_cot: true
38
+
39
+ humaneval:
40
+ enabled: true
41
+ sample_size: null # Full test set (164 samples)
42
+ # Official settings
43
+ pass_at_k: [1] # Calculate Pass@1
44
+ do_sample: false # Deterministic generation
45
+
46
+ gpqa:
47
+ enabled: true
48
+ sample_size: null
49
+ subset: "gpqa_main" # or "gpqa_diamond" for harder subset
50
+
51
+ math:
52
+ enabled: true
53
+ sample_size: null # Full test set (5000 samples)
54
+ # Official settings
55
+ use_sympy: true # Use SymPy for equivalence checking
56
+
57
+ evaluation:
58
+ # Generation settings matching lm-eval
59
+ temperature: 0.0 # Deterministic for evaluation
60
+ max_tokens: 2048
61
+ top_p: 1.0
62
+
63
+ # For HumanEval code generation
64
+ humaneval_max_tokens: 1024
65
+
66
+ # System settings
67
+ timeout: 60 # Increased for complex problems
68
+ max_retries: 3
69
+ concurrent_requests: 5
70
+ rate_limit_delay: 0.5
71
+
72
+ output:
73
+ save_results: true
74
+ results_dir: "results"
75
+ generate_report: true
76
+ plot_graphs: true
77
+ save_raw_outputs: true # Save all model outputs for debugging
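
The "${...}" values above are plain strings in the YAML; they are resolved against environment variables when the config is loaded (load_config in run_evaluation.py below does exactly this). A minimal stand-alone sketch of that substitution, assuming official_config.yaml sits in the working directory:

    import os
    import yaml

    def expand_env(obj):
        # Replace "${VAR}" strings with os.environ values, falling back to the literal placeholder.
        if isinstance(obj, str) and obj.startswith("${") and obj.endswith("}"):
            return os.getenv(obj[2:-1], obj)
        if isinstance(obj, dict):
            return {k: expand_env(v) for k, v in obj.items()}
        if isinstance(obj, list):
            return [expand_env(v) for v in obj]
        return obj

    with open("official_config.yaml") as f:
        config = expand_env(yaml.safe_load(f))

    print(config["models"]["grok"]["base_url"])                                    # https://api.x.ai/v1
    print([name for name, b in config["benchmarks"].items() if b.get("enabled")])  # enabled benchmarks
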
requirements.txt ADDED
@@ -0,0 +1,17 @@
1
+ openai>=1.0.0
2
+ anthropic>=0.20.0
3
+ requests>=2.28.0
4
+ numpy>=1.21.0
5
+ pandas>=1.3.0
6
+ tqdm>=4.62.0
7
+ pyyaml>=6.0
8
+ datasets>=2.0.0
9
+ transformers>=4.20.0
10
+ scipy>=1.7.0
11
+ matplotlib>=3.5.0
12
+ seaborn>=0.11.0
13
+ python-dotenv>=0.19.0
14
+ aiohttp>=3.8.0
15
+ sympy>=1.11.0
16
+ gradio>=4.31.0
17
+ huggingface_hub>=0.20.0
run_evaluation.py ADDED
@@ -0,0 +1,225 @@
1
+ """Main script to run AI model evaluation benchmarks"""
2
+
3
+ import argparse
4
+ import asyncio
5
+ import json
6
+ import os
7
+ import yaml
8
+ from datetime import datetime
9
+ from typing import List, Dict, Any
10
+ from dotenv import load_dotenv
11
+ import pandas as pd
12
+
13
+ from apis.api_factory import APIFactory
14
+ from benchmarks import get_benchmark, BenchmarkResult
15
+
16
+ # Load environment variables
17
+ load_dotenv()
18
+
19
+ def load_config(config_path: str = 'official_config.yaml') -> dict:
20
+ """Load configuration from YAML file"""
21
+ with open(config_path, 'r') as f:
22
+ config = yaml.safe_load(f)
23
+
24
+ # Replace environment variables
25
+ def replace_env_vars(obj):
26
+ if isinstance(obj, str) and obj.startswith('${') and obj.endswith('}'):
27
+ env_var = obj[2:-1]
28
+ return os.getenv(env_var, obj)
29
+ elif isinstance(obj, dict):
30
+ return {k: replace_env_vars(v) for k, v in obj.items()}
31
+ elif isinstance(obj, list):
32
+ return [replace_env_vars(item) for item in obj]
33
+ return obj
34
+
35
+ return replace_env_vars(config)
36
+
37
+ def save_results(results: List[BenchmarkResult], output_dir: str):
38
+ """Save evaluation results"""
39
+ os.makedirs(output_dir, exist_ok=True)
40
+
41
+ # Create timestamp for this run
42
+ timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
43
+
44
+ # Save detailed results as JSON
45
+ detailed_results = []
46
+ for result in results:
47
+ detailed_results.append({
48
+ 'benchmark': result.benchmark_name,
49
+ 'model': result.model_name,
50
+ 'total_questions': result.total_questions,
51
+ 'correct': result.correct,
52
+ 'accuracy': result.accuracy,
53
+ 'avg_response_time': result.avg_response_time,
54
+ 'timestamp': timestamp
55
+ })
56
+
57
+ json_path = os.path.join(output_dir, f'results_{timestamp}.json')
58
+ with open(json_path, 'w') as f:
59
+ json.dump(detailed_results, f, indent=2)
60
+
61
+ # Save summary as CSV
62
+ df = pd.DataFrame(detailed_results)
63
+ csv_path = os.path.join(output_dir, f'summary_{timestamp}.csv')
64
+ df.to_csv(csv_path, index=False)
65
+
66
+ # Save raw results for debugging
67
+ for result in results:
68
+ raw_path = os.path.join(output_dir, f'{result.model_name}_{result.benchmark_name}_{timestamp}_raw.json')
69
+ with open(raw_path, 'w') as f:
70
+ json.dump(result.raw_results, f, indent=2)
71
+
72
+ return json_path, csv_path
73
+
74
+ def print_results_table(results: List[BenchmarkResult]):
75
+ """Print results in a nice table format"""
76
+ if not results:
77
+ return
78
+
79
+ # Group by model
80
+ model_results = {}
81
+ for result in results:
82
+ if result.model_name not in model_results:
83
+ model_results[result.model_name] = {}
84
+ model_results[result.model_name][result.benchmark_name] = result
85
+
86
+ # Print header
87
+ benchmarks = list(set(r.benchmark_name for r in results))
88
+ benchmarks.sort()
89
+
90
+ print("\n" + "="*80)
91
+ print("EVALUATION RESULTS")
92
+ print("="*80)
93
+
94
+ # Create table
95
+ header = ["Model"] + benchmarks + ["Average"]
96
+ print(f"{'Model':<20}", end="")
97
+ for bench in benchmarks:
98
+ print(f"{bench:<15}", end="")
99
+ print(f"{'Average':<10}")
100
+ print("-"*80)
101
+
102
+ # Print results for each model
103
+ for model, bench_results in model_results.items():
104
+ print(f"{model:<20}", end="")
105
+ scores = []
106
+
107
+ for bench in benchmarks:
108
+ if bench in bench_results:
109
+ score = bench_results[bench].accuracy * 100
110
+ scores.append(score)
111
+ print(f"{score:>6.1f}% ", end="")
112
+ else:
113
+ print(f"{'N/A':<15}", end="")
114
+
115
+ # Calculate average
116
+ if scores:
117
+ avg = sum(scores) / len(scores)
118
+ print(f"{avg:>6.1f}%")
119
+ else:
120
+ print("N/A")
121
+
122
+ print("="*80)
123
+
124
+ async def run_single_evaluation(api, benchmark_name: str, config: dict) -> BenchmarkResult:
125
+ """Run a single benchmark evaluation"""
126
+ benchmark = get_benchmark(benchmark_name)
127
+
128
+ # Get benchmark-specific config
129
+ bench_config = config['benchmarks'].get(benchmark_name, {})
130
+ eval_config = config['evaluation']
131
+
132
+ # Merge configs
133
+ kwargs = {
134
+ **eval_config,
135
+ 'concurrent_requests': eval_config.get('concurrent_requests', 5)
136
+ }
137
+
138
+ # Add benchmark-specific configs but exclude sample_size
139
+ for key, value in bench_config.items():
140
+ if key != 'sample_size':
141
+ kwargs[key] = value
142
+
143
+ # Run benchmark
144
+ result = await benchmark.run_benchmark(
145
+ api,
146
+ sample_size=bench_config.get('sample_size'),
147
+ **kwargs
148
+ )
149
+
150
+ return result
151
+
152
+ async def main():
153
+ parser = argparse.ArgumentParser(description='Run AI benchmark evaluation')
154
+ parser.add_argument('--models', nargs='+', help='Models to evaluate (e.g., gpt-4o claude-3-opus)')
155
+ parser.add_argument('--benchmarks', nargs='+', help='Benchmarks to run (e.g., mmlu gsm8k)')
156
+ parser.add_argument('--config', default='official_config.yaml', help='Config file path')
157
+ parser.add_argument('--output-dir', default='results', help='Output directory for results')
158
+ parser.add_argument('--no-save', action='store_true', help='Do not save results to files')
159
+
160
+ args = parser.parse_args()
161
+
162
+ # Load configuration
163
+ config = load_config(args.config)
164
+
165
+ # Determine which models to evaluate
166
+ if args.models:
167
+ models_to_eval = args.models
168
+ else:
169
+ # Get all models from config
170
+ models_to_eval = []
171
+ for provider, provider_config in config['models'].items():
172
+ for model in provider_config.get('models', []):
173
+ models_to_eval.append(model)
174
+
175
+ # Determine which benchmarks to run
176
+ if args.benchmarks:
177
+ benchmarks_to_run = args.benchmarks
178
+ else:
179
+ # Get enabled benchmarks from config
180
+ benchmarks_to_run = [
181
+ name for name, bench_config in config['benchmarks'].items()
182
+ if bench_config.get('enabled', True)
183
+ ]
184
+
185
+ print(f"Models to evaluate: {models_to_eval}")
186
+ print(f"Benchmarks to run: {benchmarks_to_run}")
187
+
188
+ # Run evaluations
189
+ all_results = []
190
+
191
+ for model_name in models_to_eval:
192
+ print(f"\n{'='*60}")
193
+ print(f"Evaluating model: {model_name}")
194
+ print(f"{'='*60}")
195
+
196
+ try:
197
+ # Create API instance
198
+ api = APIFactory.create_api(model_name, config)
199
+
200
+ # Run each benchmark
201
+ for benchmark_name in benchmarks_to_run:
202
+ print(f"\nRunning {benchmark_name} benchmark...")
203
+ try:
204
+ result = await run_single_evaluation(api, benchmark_name, config)
205
+ all_results.append(result)
206
+ print(f"[OK] {benchmark_name}: {result.accuracy*100:.1f}% accuracy")
207
+ except Exception as e:
208
+ print(f"[ERROR] {benchmark_name}: Error - {e}")
209
+
210
+ except Exception as e:
211
+ print(f"Failed to create API for {model_name}: {e}")
212
+ continue
213
+
214
+ # Print results table
215
+ print_results_table(all_results)
216
+
217
+ # Save results
218
+ if not args.no_save and all_results:
219
+ json_path, csv_path = save_results(all_results, args.output_dir)
220
+ print(f"\nResults saved to:")
221
+ print(f" - {json_path}")
222
+ print(f" - {csv_path}")
223
+
224
+ if __name__ == "__main__":
225
+ asyncio.run(main())
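
For a single model/benchmark pair, the same helpers can be reused without the CLI. A sketch under the assumptions visible above (APIFactory.create_api takes a model name plus the loaded config, and official_config.yaml defines grok-4-0709 and enables gpqa):

    import asyncio

    from apis.api_factory import APIFactory
    from run_evaluation import load_config, run_single_evaluation, save_results

    async def run_one():
        config = load_config("official_config.yaml")
        api = APIFactory.create_api("grok-4-0709", config)
        result = await run_single_evaluation(api, "gpqa", config)
        print(f"gpqa accuracy: {result.accuracy * 100:.1f}%")
        save_results([result], "results")

    if __name__ == "__main__":
        asyncio.run(run_one())
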
run_hf_space.py ADDED
@@ -0,0 +1,39 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Hugging Face Space entry point for GPQA evaluation
4
+ """
5
+
6
+ import os
7
+ import sys
8
+ from dotenv import load_dotenv
9
+
10
+ # Load environment variables
11
+ load_dotenv()
12
+
13
+ # Set HF token if available
14
+ hf_token = os.getenv('HF_TOKEN')
15
+ if hf_token:
16
+ os.environ['HUGGING_FACE_HUB_TOKEN'] = hf_token
17
+ print("βœ… HF Token configured")
18
+
19
+ # Import and run the app
20
+ from app import create_ui, start_evaluation_safe, check_environment
21
+
22
+ if __name__ == "__main__":
23
+ # Check environment
24
+ issues = check_environment()
25
+
26
+ if issues:
27
+ print("\n⚠️ Configuration issues:")
28
+ for issue in issues:
29
+ print(f" - {issue}")
30
+ print("\nThe app will run in demo mode.")
31
+ print("To enable GPQA evaluation, please set the required secrets in HF Space settings.")
32
+ else:
33
+ print("βœ… All environment variables configured")
34
+ # Start evaluation in background
35
+ start_evaluation_safe()
36
+
37
+ # Create and launch UI
38
+ ui = create_ui()
39
+ ui.launch()
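
app.py is not included in this upload section; the entry point above only relies on check_environment() returning a (possibly empty) list of human-readable issue strings, plus start_evaluation_safe() and create_ui() existing. A hypothetical stand-in that satisfies the check_environment contract, for reference:

    import os

    def check_environment() -> list[str]:
        # Hypothetical: one message per missing requirement, empty list when ready.
        issues = []
        if not os.getenv("GROK_API_KEY"):
            issues.append("GROK_API_KEY is not set (required for Grok API calls)")
        if not os.getenv("HF_TOKEN"):
            issues.append("HF_TOKEN is not set (required for GPQA dataset access)")
        return issues
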
setup_hf_space.py ADDED
@@ -0,0 +1,244 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Setup script for Hugging Face Space deployment
4
+ Ensures GPQA benchmark can run successfully on HF
5
+ """
6
+
7
+ import os
8
+ import sys
9
+ import subprocess
10
+ from pathlib import Path
11
+
12
+ def create_deployment_files():
13
+ """Create necessary files for HF deployment"""
14
+
15
+ print("πŸš€ Setting up Hugging Face Space deployment...")
16
+
17
+ # 1. Update requirements.txt with HF dependencies
18
+ requirements_path = Path("requirements.txt")
19
+ existing_reqs = requirements_path.read_text() if requirements_path.exists() else ""
20
+
21
+ hf_deps = [
22
+ "huggingface_hub>=0.20.0",
23
+ "gradio>=4.31.0",
24
+ "python-dotenv>=0.19.0"
25
+ ]
26
+
27
+ for dep in hf_deps:
28
+ if dep.split(">=")[0] not in existing_reqs:
29
+ existing_reqs += f"\n{dep}"
30
+
31
+ requirements_path.write_text(existing_reqs.strip() + "\n")
32
+ print("βœ… Updated requirements.txt")
33
+
34
+ # 2. Create .env.example
35
+ env_example = """# Hugging Face Space Configuration
36
+ # Copy this to .env or set in HF Secrets
37
+
38
+ # Required: Your Grok API key from x.ai
39
+ GROK_API_KEY=your_grok_api_key_here
40
+
41
+ # Required: Your Hugging Face token for GPQA dataset access
42
+ # Get it from: https://huggingface.co/settings/tokens
43
+ HF_TOKEN=your_hugging_face_token_here
44
+
45
+ # Optional: OpenAI and Anthropic keys for comparison
46
+ # OPENAI_API_KEY=your_openai_key_here
47
+ # ANTHROPIC_API_KEY=your_anthropic_key_here
48
+ """
49
+
50
+ with open(".env.example", "w") as f:
51
+ f.write(env_example)
52
+ print("βœ… Created .env.example")
53
+
54
+ # 3. Create HF-specific run script
55
+ run_script = """#!/usr/bin/env python3
56
+ \"\"\"
57
+ Hugging Face Space entry point for GPQA evaluation
58
+ \"\"\"
59
+
60
+ import os
61
+ import sys
62
+ from dotenv import load_dotenv
63
+
64
+ # Load environment variables
65
+ load_dotenv()
66
+
67
+ # Set HF token if available
68
+ hf_token = os.getenv('HF_TOKEN')
69
+ if hf_token:
70
+ os.environ['HUGGING_FACE_HUB_TOKEN'] = hf_token
71
+ print("βœ… HF Token configured")
72
+
73
+ # Import and run the app
74
+ from app import create_ui, start_evaluation_safe, check_environment
75
+
76
+ if __name__ == "__main__":
77
+ # Check environment
78
+ issues = check_environment()
79
+
80
+ if issues:
81
+ print("\\n⚠️ Configuration issues:")
82
+ for issue in issues:
83
+ print(f" - {issue}")
84
+ print("\\nThe app will run in demo mode.")
85
+ print("To enable GPQA evaluation, please set the required secrets in HF Space settings.")
86
+ else:
87
+ print("βœ… All environment variables configured")
88
+ # Start evaluation in background
89
+ start_evaluation_safe()
90
+
91
+ # Create and launch UI
92
+ ui = create_ui()
93
+ ui.launch()
94
+ """
95
+
96
+ with open("run_hf_space.py", "w") as f:
97
+ f.write(run_script)
98
+ os.chmod("run_hf_space.py", 0o755)
99
+ print("βœ… Created run_hf_space.py")
100
+
101
+ # 4. Create README for HF Space
102
+ readme_content = """---
103
+ title: Grok-4 GPQA Evaluation
104
+ emoji: 🧠
105
+ colorFrom: blue
106
+ colorTo: green
107
+ sdk: gradio
108
+ sdk_version: "4.31.0"
109
+ app_file: run_hf_space.py
110
+ pinned: false
111
+ ---
112
+
113
+ # Grok-4 GPQA Evaluation Dashboard
114
+
115
+ Real-time evaluation of Grok-4 model on the GPQA (Graduate-Level Google-Proof Q&A) benchmark.
116
+
117
+ ## πŸ”§ Configuration
118
+
119
+ This Space requires the following secrets to be set in your HF Space settings:
120
+
121
+ 1. **GROK_API_KEY** (Required)
122
+ - Get from: https://x.ai
123
+ - Your Grok API key for running evaluations
124
+
125
+ 2. **HF_TOKEN** (Required)
126
+ - Get from: https://huggingface.co/settings/tokens
127
+ - Required for accessing the GPQA dataset
128
+ - Make sure you have requested access to: https://huggingface.co/datasets/Idavidrein/gpqa
129
+
130
+ ## πŸ“Š Features
131
+
132
+ - Real-time progress tracking
133
+ - Accuracy metrics and performance stats
134
+ - Detailed results export
135
+ - Support for full GPQA dataset (448 questions)
136
+
137
+ ## πŸš€ Quick Start
138
+
139
+ 1. Fork this Space
140
+ 2. Set the required secrets in your Space settings
141
+ 3. The evaluation will start automatically
142
+ 4. Monitor progress in the dashboard
143
+
144
+ ## ⚠️ Known Issues
145
+
146
+ - GPQA dataset requires access approval (usually 1-2 days)
147
+ - Grok-4-0709 uses extensive reasoning tokens (~2500-3000 per question)
148
+ - Full evaluation takes ~3-4 hours due to model response times
149
+
150
+ ## πŸ“ˆ Expected Performance
151
+
152
+ Based on our testing:
153
+ - Accuracy: ~80-90% (excluding timeouts)
154
+ - Avg Response Time: ~50s per question
155
+ - Total Runtime: ~3-4 hours for full dataset
156
+ """
157
+
158
+ with open("README_HF.md", "w") as f:
159
+ f.write(readme_content)
160
+ print("βœ… Created README_HF.md")
161
+
162
+ # 5. Create pre-flight check script
163
+ check_script = """#!/usr/bin/env python3
164
+ \"\"\"
165
+ Pre-deployment checklist for HF Space
166
+ \"\"\"
167
+
168
+ import os
169
+ import sys
170
+ from pathlib import Path
171
+
172
+ def check_deployment_ready():
173
+ \"\"\"Check if everything is ready for HF deployment\"\"\"
174
+
175
+ print("πŸ” Pre-deployment checklist:\\n")
176
+
177
+ checks = []
178
+
179
+ # Check files exist
180
+ required_files = [
181
+ "app.py",
182
+ "run_evaluation.py",
183
+ "requirements.txt",
184
+ ".env.example",
185
+ "run_hf_space.py",
186
+ "official_config.yaml"
187
+ ]
188
+
189
+ for file in required_files:
190
+ if Path(file).exists():
191
+ checks.append((f"βœ… {file} exists", True))
192
+ else:
193
+ checks.append((f"❌ {file} missing", False))
194
+
195
+ # Check API directories
196
+ if Path("apis").is_dir() and list(Path("apis").glob("*.py")):
197
+ checks.append(("βœ… APIs directory configured", True))
198
+ else:
199
+ checks.append(("❌ APIs directory missing or empty", False))
200
+
201
+ # Check benchmarks directory
202
+ if Path("benchmarks").is_dir() and Path("benchmarks/gpqa_benchmark.py").exists():
203
+ checks.append(("βœ… GPQA benchmark implementation found", True))
204
+ else:
205
+ checks.append(("❌ GPQA benchmark missing", False))
206
+
207
+ # Check for sensitive data
208
+ if Path(".env").exists():
209
+ checks.append(("⚠️ .env file exists - make sure it's in .gitignore!", None))
210
+
211
+ # Print results
212
+ for check, status in checks:
213
+ print(check)
214
+
215
+ all_good = all(status is not False for _, status in checks)
216
+
217
+ if all_good:
218
+ print("\\nβœ… Ready for deployment!")
219
+ print("\\nNext steps:")
220
+ print("1. Set GROK_API_KEY and HF_TOKEN in HF Space secrets")
221
+ print("2. Make sure you have GPQA dataset access")
222
+ print("3. Push to Hugging Face")
223
+ else:
224
+ print("\\n❌ Issues found - please fix before deploying")
225
+
226
+ return all_good
227
+
228
+ if __name__ == "__main__":
229
+ check_deployment_ready()
230
+ """
231
+
232
+ with open("check_deployment.py", "w") as f:
233
+ f.write(check_script)
234
+ os.chmod("check_deployment.py", 0o755)
235
+ print("βœ… Created check_deployment.py")
236
+
237
+ print("\nπŸŽ‰ Deployment files created successfully!")
238
+ print("\nNext steps:")
239
+ print("1. Run: python check_deployment.py")
240
+ print("2. Set your API keys in HF Space secrets")
241
+ print("3. Push to Hugging Face")
242
+
243
+ if __name__ == "__main__":
244
+ create_deployment_files()