TeddyYao committed on
Commit 8474f02 · verified · 1 Parent(s): cd16551

Upload 38 files

Files changed (38)
  1. README.md +29 -8
  2. apis/__init__.py +0 -0
  3. apis/__pycache__/__init__.cpython-312.pyc +0 -0
  4. apis/__pycache__/anthropic_api.cpython-312.pyc +0 -0
  5. apis/__pycache__/api_factory.cpython-312.pyc +0 -0
  6. apis/__pycache__/base_api.cpython-312.pyc +0 -0
  7. apis/__pycache__/grok_api.cpython-312.pyc +0 -0
  8. apis/__pycache__/openai_api.cpython-312.pyc +0 -0
  9. apis/anthropic_api.py +30 -0
  10. apis/api_factory.py +71 -0
  11. apis/base_api.py +54 -0
  12. apis/grok_api.py +42 -0
  13. apis/openai_api.py +32 -0
  14. app.py +124 -0
  15. benchmarks/__init__.py +21 -0
  16. benchmarks/__pycache__/__init__.cpython-312.pyc +0 -0
  17. benchmarks/__pycache__/base_benchmark.cpython-312.pyc +0 -0
  18. benchmarks/__pycache__/evaluation_utils.cpython-312.pyc +0 -0
  19. benchmarks/__pycache__/gpqa_benchmark.cpython-312.pyc +0 -0
  20. benchmarks/__pycache__/gsm8k_benchmark.cpython-312.pyc +0 -0
  21. benchmarks/__pycache__/humaneval_benchmark.cpython-312.pyc +0 -0
  22. benchmarks/__pycache__/math_benchmark.cpython-312.pyc +0 -0
  23. benchmarks/__pycache__/mmlu_benchmark.cpython-312.pyc +0 -0
  24. benchmarks/__pycache__/prompt_templates.cpython-312.pyc +0 -0
  25. benchmarks/base_benchmark.py +124 -0
  26. benchmarks/evaluation_utils.py +160 -0
  27. benchmarks/gpqa_benchmark.py +126 -0
  28. benchmarks/gsm8k_benchmark.py +115 -0
  29. benchmarks/humaneval_benchmark.py +134 -0
  30. benchmarks/math_benchmark.py +125 -0
  31. benchmarks/mmlu_benchmark.py +134 -0
  32. benchmarks/prompt_templates.py +86 -0
  33. check_deployment.py +67 -0
  34. official_config.yaml +77 -0
  35. requirements.txt +17 -0
  36. run_evaluation.py +225 -0
  37. run_hf_space.py +39 -0
  38. setup_hf_space.py +244 -0
README.md CHANGED
@@ -1,13 +1,34 @@
  ---
- title: Grok4 Gpqa Eval
- emoji: 🏒
- colorFrom: red
- colorTo: blue
+ title: Grok-4 GPQA Evaluation
+ emoji: 🧠
+ colorFrom: blue
+ colorTo: green
  sdk: gradio
- sdk_version: 5.38.0
- app_file: app.py
+ sdk_version: "4.31.0"
+ app_file: run_hf_space.py
  pinned: false
- license: mit
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # Grok-4 GPQA Evaluation Dashboard
+
+ Real-time evaluation of the Grok-4 model on the GPQA benchmark.
+
+ ## ⚙️ Configuration Required
+
+ Please set these secrets in your Space settings:
+ - **GROK_API_KEY**: Your Grok API key from x.ai
+ - **HF_TOKEN**: Your Hugging Face token (for GPQA dataset access)
+
+ ## 📊 Features
+
+ - Real-time progress tracking
+ - Accuracy metrics and performance stats
+ - Detailed results export
+ - Support for the full GPQA dataset (448 questions)
+
+ ## 🚀 Getting Started
+
+ 1. Set the required secrets in Space settings
+ 2. Make sure your account has access to the gated GPQA dataset
+ 3. The evaluation will start automatically
+ 4. Monitor progress in the dashboard
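Both secrets can also be pushed to the Space from a local machine instead of through the web UI. A minimal sketch, assuming the Space id `your-username/grok4-gpqa-eval` (a placeholder) and that the installed `huggingface_hub` version exposes `add_space_secret`:

import os

from huggingface_hub import HfApi

SPACE_ID = "your-username/grok4-gpqa-eval"   # placeholder - use your own Space id

api = HfApi(token=os.environ["HF_TOKEN"])    # token needs write access to the Space
# The secret names match what app.py's check_environment() looks for.
api.add_space_secret(repo_id=SPACE_ID, key="GROK_API_KEY", value=os.environ["GROK_API_KEY"])
api.add_space_secret(repo_id=SPACE_ID, key="HF_TOKEN", value=os.environ["HF_TOKEN"])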
apis/__init__.py ADDED
File without changes
apis/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (150 Bytes).
apis/__pycache__/anthropic_api.cpython-312.pyc ADDED
Binary file (2.11 kB).
apis/__pycache__/api_factory.cpython-312.pyc ADDED
Binary file (2.47 kB).
apis/__pycache__/base_api.cpython-312.pyc ADDED
Binary file (3.36 kB).
apis/__pycache__/grok_api.cpython-312.pyc ADDED
Binary file (2.48 kB).
apis/__pycache__/openai_api.cpython-312.pyc ADDED
Binary file (2.21 kB).
apis/anthropic_api.py ADDED
@@ -0,0 +1,30 @@
+ import anthropic
+ from .base_api import BaseAPI
+
+ class AnthropicAPI(BaseAPI):
+     """Anthropic API implementation"""
+
+     def __init__(self, api_key: str, model_name: str, **kwargs):
+         super().__init__(api_key, model_name, **kwargs)
+         self.client = anthropic.AsyncAnthropic(api_key=api_key)
+
+     async def generate_response(self, prompt: str, **kwargs) -> str:
+         """Generate response using Anthropic API"""
+         try:
+             response = await self.client.messages.create(
+                 model=self.model_name,
+                 max_tokens=kwargs.get('max_tokens', 2048),
+                 temperature=kwargs.get('temperature', 0.0),
+                 messages=[{"role": "user", "content": prompt}]
+             )
+             return response.content[0].text
+         except Exception as e:
+             raise Exception(f"Anthropic API error: {str(e)}")
+
+     def get_model_info(self) -> dict:
+         """Get model information"""
+         return {
+             "provider": "Anthropic",
+             "model": self.model_name,
+             "api_version": "2023-06-01"
+         }
apis/api_factory.py ADDED
@@ -0,0 +1,71 @@
+ from typing import Dict, Any
+ from .openai_api import OpenAIAPI
+ from .anthropic_api import AnthropicAPI
+ from .grok_api import GrokAPI
+ from .base_api import BaseAPI
+
+ class APIFactory:
+     """Factory class to create API instances based on model name"""
+
+     # Model to provider mapping
+     MODEL_PROVIDERS = {
+         # OpenAI models
+         'gpt-4o': 'openai',
+         'gpt-4-turbo': 'openai',
+         'gpt-3.5-turbo': 'openai',
+
+         # Anthropic models
+         'claude-3-5-sonnet-20241022': 'anthropic',
+         'claude-3-opus-20240229': 'anthropic',
+         'claude-3-haiku-20240307': 'anthropic',
+
+         # Grok models
+         'grok-4-0709': 'grok',
+         'grok-beta': 'grok',
+         'grok-2-latest': 'grok',
+         'grok-vision-beta': 'grok',
+     }
+
+     # Provider to API class mapping
+     PROVIDER_APIS = {
+         'openai': OpenAIAPI,
+         'anthropic': AnthropicAPI,
+         'grok': GrokAPI,
+     }
+
+     @classmethod
+     def create_api(cls, model_name: str, config: Dict[str, Any]) -> BaseAPI:
+         """Create an API instance for the given model"""
+
+         # Determine provider
+         provider = cls.MODEL_PROVIDERS.get(model_name)
+         if not provider:
+             raise ValueError(f"Unknown model: {model_name}")
+
+         # Get provider config
+         provider_config = config['models'].get(provider)
+         if not provider_config:
+             raise ValueError(f"No configuration found for provider: {provider}")
+
+         # Get API key
+         api_key = provider_config.get('api_key')
+         if not api_key:
+             raise ValueError(f"No API key found for provider: {provider}")
+
+         # Get API class
+         api_class = cls.PROVIDER_APIS.get(provider)
+         if not api_class:
+             raise ValueError(f"No API implementation for provider: {provider}")
+
+         # Create API instance with provider-specific kwargs
+         kwargs = {
+             'rate_limit_delay': config['evaluation'].get('rate_limit_delay', 1.0),
+             'max_retries': config['evaluation'].get('max_retries', 3),
+             'timeout': config['evaluation'].get('timeout', 30),
+         }
+
+         # Add provider-specific config
+         if provider == 'grok':
+             kwargs['base_url'] = provider_config.get('base_url', 'https://api.x.ai/v1')
+
+         return api_class(api_key, model_name, **kwargs)
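A usage sketch of the factory. The config dict below only mirrors the shape of official_config.yaml after environment variables have been substituted; the key lookup and prompt string are illustrative:

import asyncio
import os

from apis.api_factory import APIFactory

# Illustrative config shaped like official_config.yaml after env-var substitution.
config = {
    "models": {"grok": {"api_key": os.environ["GROK_API_KEY"], "base_url": "https://api.x.ai/v1"}},
    "evaluation": {"rate_limit_delay": 0.5, "max_retries": 3, "timeout": 60},
}

api = APIFactory.create_api("grok-4-0709", config)
print(api.get_model_info())
print(asyncio.run(api.generate_with_retry("What is 2 + 2?")))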
apis/base_api.py ADDED
@@ -0,0 +1,54 @@
+ from abc import ABC, abstractmethod
+ import time
+ import asyncio
+ from typing import List, Dict, Any, Optional
+
+ class BaseAPI(ABC):
+     """Base class for all API implementations"""
+
+     def __init__(self, api_key: str, model_name: str, **kwargs):
+         self.api_key = api_key
+         self.model_name = model_name
+         self.rate_limit_delay = kwargs.get('rate_limit_delay', 1.0)
+         self.max_retries = kwargs.get('max_retries', 3)
+         self.timeout = kwargs.get('timeout', 30)
+
+     @abstractmethod
+     async def generate_response(self, prompt: str, **kwargs) -> str:
+         """Generate a response from the model"""
+         pass
+
+     async def generate_with_retry(self, prompt: str, **kwargs) -> str:
+         """Generate response with retry logic"""
+         for attempt in range(self.max_retries):
+             try:
+                 response = await self.generate_response(prompt, **kwargs)
+                 return response
+             except Exception as e:
+                 error_str = str(e).lower()
+
+                 # Check if it's a timeout error
+                 if 'timeout' in error_str or 'timed out' in error_str:
+                     # For timeout errors, use longer backoff
+                     max_retries = min(self.max_retries + 2, 5)  # Allow more retries for timeouts
+                     if attempt < max_retries - 1:
+                         backoff = min(60, 5 * (2 ** attempt))  # Max 60 seconds wait
+                         print(f"Timeout error, retrying in {backoff}s... (attempt {attempt + 1}/{max_retries})")
+                         await asyncio.sleep(backoff)
+                         continue
+
+                 # For other errors, use standard backoff
+                 if attempt == self.max_retries - 1:
+                     raise e
+
+                 backoff = min(30, 2 ** attempt)  # Max 30 seconds for other errors
+                 await asyncio.sleep(backoff)
+
+     async def batch_generate(self, prompts: List[str], **kwargs) -> List[str]:
+         """Generate responses for multiple prompts"""
+         responses = []
+         for prompt in prompts:
+             response = await self.generate_with_retry(prompt, **kwargs)
+             responses.append(response)
+             await asyncio.sleep(self.rate_limit_delay)
+         return responses
apis/grok_api.py ADDED
@@ -0,0 +1,42 @@
+ import openai
+ from .base_api import BaseAPI
+
+ class GrokAPI(BaseAPI):
+     """Grok API implementation (uses OpenAI-compatible interface)"""
+
+     def __init__(self, api_key: str, model_name: str, **kwargs):
+         super().__init__(api_key, model_name, **kwargs)
+         self.base_url = kwargs.get('base_url', 'https://api.x.ai/v1')
+         self.client = openai.AsyncOpenAI(
+             api_key=api_key,
+             base_url=self.base_url
+         )
+
+     async def generate_response(self, prompt: str, **kwargs) -> str:
+         """Generate response using Grok API"""
+         try:
+             # Build parameters
+             params = {
+                 "model": self.model_name,
+                 "messages": [{"role": "user", "content": prompt}],
+                 "temperature": kwargs.get('temperature', 0.0),
+                 "timeout": self.timeout
+             }
+
+             # For grok-4-0709, don't set max_tokens to allow full reasoning
+             if self.model_name != 'grok-4-0709':
+                 params['max_tokens'] = kwargs.get('max_tokens', 2048)
+
+             response = await self.client.chat.completions.create(**params)
+             return response.choices[0].message.content
+         except Exception as e:
+             raise Exception(f"Grok API error: {str(e)}")
+
+     def get_model_info(self) -> dict:
+         """Get model information"""
+         return {
+             "provider": "Grok",
+             "model": self.model_name,
+             "api_version": "v1",
+             "base_url": self.base_url
+         }
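A small usage sketch of the shared retry/batch helpers from base_api.py through the Grok client above; the prompt strings and environment lookup are illustrative:

import asyncio
import os

from apis.grok_api import GrokAPI

async def main():
    # GROK_API_KEY is assumed to be set in the environment.
    api = GrokAPI(os.environ["GROK_API_KEY"], "grok-4-0709", rate_limit_delay=0.5, timeout=60)
    # batch_generate() runs the prompts sequentially, retrying each one and
    # sleeping rate_limit_delay seconds between calls.
    answers = await api.batch_generate(["What is 2 + 2?", "Name a noble gas."])
    for answer in answers:
        print(answer)

asyncio.run(main())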
apis/openai_api.py ADDED
@@ -0,0 +1,32 @@
+ import openai
+ from .base_api import BaseAPI
+ import asyncio
+
+ class OpenAIAPI(BaseAPI):
+     """OpenAI API implementation"""
+
+     def __init__(self, api_key: str, model_name: str, **kwargs):
+         super().__init__(api_key, model_name, **kwargs)
+         self.client = openai.AsyncOpenAI(api_key=api_key)
+
+     async def generate_response(self, prompt: str, **kwargs) -> str:
+         """Generate response using OpenAI API"""
+         try:
+             response = await self.client.chat.completions.create(
+                 model=self.model_name,
+                 messages=[{"role": "user", "content": prompt}],
+                 temperature=kwargs.get('temperature', 0.0),
+                 max_tokens=kwargs.get('max_tokens', 2048),
+                 timeout=self.timeout
+             )
+             return response.choices[0].message.content
+         except Exception as e:
+             raise Exception(f"OpenAI API error: {str(e)}")
+
+     def get_model_info(self) -> dict:
+         """Get model information"""
+         return {
+             "provider": "OpenAI",
+             "model": self.model_name,
+             "api_version": "v1"
+         }
app.py ADDED
@@ -0,0 +1,124 @@
+ import gradio as gr
+ import pandas as pd
+ import json
+ import os
+ from datetime import datetime
+ from dotenv import load_dotenv
+ import time
+
+ # Load environment variables
+ load_dotenv()
+
+ RESULTS_DIR = "results"
+ PROGRESS_FILE = os.path.join(RESULTS_DIR, "gpqa_progress.json")
+
+ def load_progress():
+     if not os.path.exists(PROGRESS_FILE):
+         return pd.DataFrame(), "No progress file found. The evaluation might be starting up.", "N/A"
+
+     try:
+         df = pd.read_json(PROGRESS_FILE)
+         if df.empty:
+             return pd.DataFrame(), "Progress file is empty.", "N/A"
+
+         # Calculate metrics
+         total_questions = len(df)
+         correct_answers = df['is_correct'].sum()
+         accuracy = (correct_answers / total_questions) * 100 if total_questions > 0 else 0
+         avg_response_time = df['response_time'].mean()
+
+         summary_text = f"""
+ ## Evaluation Progress
+ - **Questions Processed:** {total_questions} / 448
+ - **Current Accuracy:** {accuracy:.2f}%
+ - **Correct Answers:** {correct_answers}
+ - **Average Response Time:** {avg_response_time:.2f} seconds/question
+ """
+
+         # Get last modified time
+         last_modified_time = datetime.fromtimestamp(os.path.getmtime(PROGRESS_FILE)).strftime('%Y-%m-%d %H:%M:%S')
+
+         return df, summary_text, f"Last updated: {last_modified_time}"
+     except Exception as e:
+         return pd.DataFrame(), f"Error loading progress file: {e}", "N/A"
+
+ def create_ui():
+     df, summary, last_updated = load_progress()
+
+     with gr.Blocks(theme=gr.themes.Soft(), title="GPQA Evaluation Progress") as demo:
+         gr.Markdown("# Real-Time GPQA Evaluation Dashboard")
+         gr.Markdown("This dashboard shows the progress of the GPQA benchmark evaluation for the `grok-4-0709` model.")
+
+         with gr.Row():
+             summary_box = gr.Markdown(summary)
+             last_updated_box = gr.Markdown(last_updated)
+
+         with gr.Row():
+             # Create a simple plot: number of correct vs incorrect answers
+             if not df.empty:
+                 # Build a two-column frame so BarPlot finds the "Answer Status" and "Count" columns
+                 correct_counts = df['is_correct'].value_counts().rename({True: 'Correct', False: 'Incorrect'}).rename_axis("Answer Status").reset_index(name="Count")
+                 plot = gr.BarPlot(correct_counts, x="Answer Status", y="Count", title="Correct vs. Incorrect Answers", interactive=False)
+
+         gr.Markdown("## Raw Results")
+         gr.DataFrame(df, wrap=True)
+
+     return demo
+
+ def check_environment():
+     """Check if all required environment variables are set"""
+     issues = []
+
+     if not os.getenv('GROK_API_KEY'):
+         issues.append("GROK_API_KEY not found in environment")
+
+     if not os.getenv('HF_TOKEN'):
+         issues.append("HF_TOKEN not found (required for GPQA dataset access)")
+
+     return issues
+
+ def start_evaluation_safe():
+     """Safely start the evaluation process with error handling"""
+     issues = check_environment()
+     if issues:
+         print("⚠️ Environment issues detected:")
+         for issue in issues:
+             print(f" - {issue}")
+         print("\nPlease set the required environment variables in .env or Hugging Face Secrets")
+         return None
+
+     import subprocess
+     import sys
+
+     print("Starting background evaluation process...")
+     command = [
+         sys.executable,
+         "run_evaluation.py",
+         "--config", "official_config.yaml",
+         "--models", "grok-4-0709",
+         "--benchmarks", "gpqa"
+     ]
+
+     try:
+         # Use Popen to run in the background
+         process = subprocess.Popen(command)
+         print(f"Evaluation process started with PID: {process.pid}")
+         return process
+     except Exception as e:
+         print(f"Failed to start evaluation: {e}")
+         return None
+
+ if __name__ == "__main__":
+     # Check environment first
+     issues = check_environment()
+
+     if issues:
+         # Create UI with warning message
+         ui = create_ui()
+         print("\n⚠️ Running in demo mode due to missing configuration")
+     else:
+         # Start evaluation process
+         process = start_evaluation_safe()
+         ui = create_ui()
+
+     # Launch the UI
+     ui.launch(server_name="0.0.0.0", server_port=7860)
benchmarks/__init__.py ADDED
@@ -0,0 +1,21 @@
+ from .base_benchmark import BaseBenchmark, BenchmarkResult
+ from .mmlu_benchmark import MMLUBenchmark
+ from .gsm8k_benchmark import GSM8KBenchmark
+ from .humaneval_benchmark import HumanEvalBenchmark
+ from .gpqa_benchmark import GPQABenchmark
+ from .math_benchmark import MATHBenchmark
+
+ BENCHMARK_REGISTRY = {
+     'mmlu': MMLUBenchmark,
+     'gsm8k': GSM8KBenchmark,
+     'humaneval': HumanEvalBenchmark,
+     'gpqa': GPQABenchmark,
+     'math': MATHBenchmark
+ }
+
+ def get_benchmark(name: str) -> BaseBenchmark:
+     """Get benchmark instance by name"""
+     if name.lower() not in BENCHMARK_REGISTRY:
+         raise ValueError(f"Unknown benchmark: {name}. Available: {list(BENCHMARK_REGISTRY.keys())}")
+
+     return BENCHMARK_REGISTRY[name.lower()]()
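A quick sketch of the registry lookup; the returned benchmark still needs an API client and a loaded dataset before it can score anything:

from benchmarks import BENCHMARK_REGISTRY, get_benchmark

print(sorted(BENCHMARK_REGISTRY))        # ['gpqa', 'gsm8k', 'humaneval', 'math', 'mmlu']
bench = get_benchmark("GPQA")            # lookup is case-insensitive
print(bench.name, bench.dataset_name)    # GPQA Idavidrein/gpqa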
benchmarks/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (1.12 kB).
benchmarks/__pycache__/base_benchmark.cpython-312.pyc ADDED
Binary file (5.45 kB).
benchmarks/__pycache__/evaluation_utils.cpython-312.pyc ADDED
Binary file (5.64 kB).
benchmarks/__pycache__/gpqa_benchmark.cpython-312.pyc ADDED
Binary file (4.45 kB).
benchmarks/__pycache__/gsm8k_benchmark.cpython-312.pyc ADDED
Binary file (4.91 kB).
benchmarks/__pycache__/humaneval_benchmark.cpython-312.pyc ADDED
Binary file (6.18 kB).
benchmarks/__pycache__/math_benchmark.cpython-312.pyc ADDED
Binary file (5.41 kB).
benchmarks/__pycache__/mmlu_benchmark.cpython-312.pyc ADDED
Binary file (5.7 kB).
benchmarks/__pycache__/prompt_templates.cpython-312.pyc ADDED
Binary file (4.39 kB).
benchmarks/base_benchmark.py ADDED
@@ -0,0 +1,124 @@
+ from abc import ABC, abstractmethod
+ from typing import List, Dict, Any, Optional, Tuple
+ import asyncio
+ from dataclasses import dataclass
+ import time
+ from tqdm import tqdm
+
+ @dataclass
+ class BenchmarkResult:
+     """Container for benchmark results"""
+     benchmark_name: str
+     model_name: str
+     total_questions: int
+     correct: int
+     accuracy: float
+     avg_response_time: float
+     raw_results: List[Dict[str, Any]]
+
+ class BaseBenchmark(ABC):
+     """Base class for all benchmark implementations"""
+
+     def __init__(self, name: str, dataset_name: str = None):
+         self.name = name
+         self.dataset_name = dataset_name or name
+         self.dataset = None
+         self.results = []
+
+     @abstractmethod
+     async def load_dataset(self, sample_size: Optional[int] = None, **kwargs):
+         """Load the benchmark dataset"""
+         pass
+
+     @abstractmethod
+     async def evaluate_sample(self, api, sample: Dict[str, Any], **kwargs) -> Tuple[bool, Dict[str, Any]]:
+         """Evaluate a single sample"""
+         pass
+
+     @abstractmethod
+     def format_prompt(self, sample: Dict[str, Any]) -> str:
+         """Format the prompt for the model"""
+         pass
+
+     async def run_benchmark(self, api, sample_size: Optional[int] = None, **kwargs) -> BenchmarkResult:
+         """Run the benchmark on the given API"""
+         print(f"Running {self.name} benchmark on {api.model_name}...")
+
+         # Load dataset
+         await self.load_dataset(sample_size, **kwargs)
+
+         if not self.dataset:
+             raise ValueError(f"No dataset loaded for {self.name}")
+
+         # Prepare samples
+         samples = self.dataset if sample_size is None else self.dataset[:sample_size]
+         total_samples = len(samples)
+
+         # Run evaluation
+         correct_count = 0
+         response_times = []
+         raw_results = []
+
+         # Use async semaphore for concurrent requests
+         concurrent_limit = kwargs.get('concurrent_requests', 5)
+         semaphore = asyncio.Semaphore(concurrent_limit)
+
+         async def evaluate_with_semaphore(sample, idx):
+             async with semaphore:
+                 start_time = time.time()
+                 is_correct, result = await self.evaluate_sample(api, sample, **kwargs)
+                 end_time = time.time()
+
+                 result['response_time'] = end_time - start_time
+                 result['index'] = idx
+                 return is_correct, result
+
+         # Create tasks for all samples
+         tasks = [evaluate_with_semaphore(sample, idx) for idx, sample in enumerate(samples)]
+
+         # Run with progress bar
+         # Add imports needed for progress saving
+         import json
+         import os
+
+         with tqdm(total=total_samples, desc=f"{self.name}") as pbar:
+             for coro in asyncio.as_completed(tasks):
+                 is_correct, result = await coro
+
+                 if is_correct:
+                     correct_count += 1
+
+                 response_times.append(result['response_time'])
+                 raw_results.append(result)
+                 pbar.update(1)
+
+                 # --- START: REAL-TIME PROGRESS SAVING ---
+                 # Every 10 samples, save the progress to a file
+                 if pbar.n > 0 and pbar.n % 10 == 0:
+                     # Ensure results directory exists
+                     results_dir = kwargs.get('output_dir', 'results')
+                     os.makedirs(results_dir, exist_ok=True)
+
+                     # Lowercase the benchmark name so the dashboard (app.py) finds results/gpqa_progress.json
+                     progress_path = os.path.join(results_dir, f'{self.name.lower()}_progress.json')
+                     # Sort results by index before saving
+                     sorted_progress = sorted(raw_results, key=lambda x: x['index'])
+                     try:
+                         with open(progress_path, 'w') as f:
+                             json.dump(sorted_progress, f, indent=2)
+                     except Exception as e:
+                         print(f"Error saving progress: {e}")
+                 # --- END: REAL-TIME PROGRESS SAVING ---
+
+         # Calculate metrics
+         accuracy = correct_count / total_samples if total_samples > 0 else 0
+         avg_response_time = sum(response_times) / len(response_times) if response_times else 0
+
+         return BenchmarkResult(
+             benchmark_name=self.name,
+             model_name=api.model_name,
+             total_questions=total_samples,
+             correct=correct_count,
+             accuracy=accuracy,
+             avg_response_time=avg_response_time,
+             raw_results=sorted(raw_results, key=lambda x: x['index'])
+         )
benchmarks/evaluation_utils.py ADDED
@@ -0,0 +1,160 @@
+ """Evaluation utilities matching standard implementations"""
+
+ import re
+ from typing import Optional, Union
+ import numpy as np
+ try:
+     import sympy
+     from sympy.parsing.latex import parse_latex
+     SYMPY_AVAILABLE = True
+ except ImportError:
+     SYMPY_AVAILABLE = False
+
+ def normalize_math_answer(answer: str) -> str:
+     """Normalize mathematical answers following lm-eval's approach"""
+     if not answer:
+         return ""
+
+     # Extract content after equals sign
+     if '=' in answer:
+         answer = answer.split('=')[-1]
+
+     # Remove dollar signs and spaces
+     answer = answer.strip()
+     answer = answer.strip('$')
+
+     # Remove text{} and textbf{}
+     answer = re.sub(r'\\text\{([^}]*)\}', r'\1', answer)
+     answer = re.sub(r'\\textbf\{([^}]*)\}', r'\1', answer)
+
+     # Fix \fracab -> \frac{a}{b}
+     answer = re.sub(r'\\frac([0-9a-zA-Z])([0-9a-zA-Z])', r'\\frac{\1}{\2}', answer)
+
+     # Remove commas from numbers
+     answer = re.sub(r'(\d),', r'\1', answer)
+
+     # Remove specific words
+     for word in ['square', 'units', 'integers', 'dollars', 'mph', 'inches', 'feet', 'minutes', 'cm', 'gm', 'pounds', 'meters', 'meals', 'edges', 'students', 'childrentickets', 'multiples', 'hours', 'degrees', 'ounces', 'bits', 'factorization', 'greenmarbles', 'redmarbles', 'bluemarbles']:
+         answer = answer.replace(word, '')
+
+     # Remove extra spaces
+     answer = ' '.join(answer.split())
+
+     return answer.strip()
+
+ def extract_answer_gsm8k(response: str) -> Optional[float]:
+     """Extract answer from GSM8K response following official format"""
+     # Look for the last number in the response
+     numbers = re.findall(r'[-+]?\d*\.?\d+', response)
+     if numbers:
+         try:
+             return float(numbers[-1])
+         except:
+             pass
+     return None
+
+ def extract_answer_mmlu(response: str) -> Optional[str]:
+     """Extract MMLU answer following official format"""
+     # Clean response
+     response = response.strip()
+
+     # Look for single letter answer
+     if len(response) == 1 and response in 'ABCD':
+         return response
+
+     # Look for letter followed by parenthesis or period
+     match = re.search(r'^([ABCD])[).\s]', response)
+     if match:
+         return match.group(1)
+
+     # Look for "answer is X" pattern
+     match = re.search(r'answer is ([ABCD])', response, re.IGNORECASE)
+     if match:
+         return match.group(1).upper()
+
+     # Look for first occurrence of A, B, C, or D
+     match = re.search(r'[ABCD]', response)
+     if match:
+         return match.group(0)
+
+     return None
+
+ def calculate_accuracy_with_confidence(results: list) -> dict:
+     """Calculate accuracy with confidence intervals"""
+     correct = sum(1 for r in results if r.get('is_correct', False))
+     total = len(results)
+
+     if total == 0:
+         return {
+             'accuracy': 0.0,
+             'correct': 0,
+             'total': 0,
+             'confidence_interval': (0.0, 0.0)
+         }
+
+     accuracy = correct / total
+
+     # Wilson score interval for binomial proportion
+     z = 1.96  # 95% confidence
+     n = total
+     p = accuracy
+
+     denominator = 1 + z**2 / n
+     center = (p + z**2 / (2*n)) / denominator
+     margin = z * np.sqrt(p * (1-p) / n + z**2 / (4*n**2)) / denominator
+
+     lower = max(0, center - margin)
+     upper = min(1, center + margin)
+
+     return {
+         'accuracy': accuracy,
+         'correct': correct,
+         'total': total,
+         'confidence_interval': (lower, upper)
+     }
+
+ def is_math_equiv(pred: str, gold: str) -> bool:
+     """Check mathematical equivalence using SymPy (matching lm-eval)"""
+     # First normalize both answers
+     pred_norm = normalize_math_answer(pred)
+     gold_norm = normalize_math_answer(gold)
+
+     # Quick string comparison
+     if pred_norm == gold_norm:
+         return True
+
+     if not SYMPY_AVAILABLE:
+         # Fallback to string comparison
+         return pred_norm == gold_norm
+
+     try:
+         # Try to parse as LaTeX
+         try:
+             pred_expr = parse_latex(pred_norm)
+             gold_expr = parse_latex(gold_norm)
+         except:
+             # Try parsing as regular SymPy expression
+             pred_expr = sympy.sympify(pred_norm)
+             gold_expr = sympy.sympify(gold_norm)
+
+         # Check if expressions are equivalent
+         diff = sympy.simplify(pred_expr - gold_expr)
+         return diff == 0 or diff.is_zero
+
+     except Exception:
+         # If parsing fails, fall back to string comparison
+         return pred_norm == gold_norm
+
+ def is_gsm8k_correct(pred: str, gold: str) -> bool:
+     """Check GSM8K answer correctness"""
+     if pred == gold:
+         return True
+
+     try:
+         # Try numeric comparison
+         pred_num = float(pred)
+         gold_num = float(gold)
+         # GSM8K uses exact match, but we allow tiny floating point errors
+         return abs(pred_num - gold_num) < 1e-9
+     except:
+         return False
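A short sanity check of the extraction and confidence helpers on toy inputs (not real evaluation data):

from benchmarks.evaluation_utils import extract_answer_mmlu, calculate_accuracy_with_confidence

# Letter extraction falls through several patterns; "C) ..." hits the second one.
print(extract_answer_mmlu("C) Quantum tunneling"))        # -> C

# Wilson 95% interval around a toy 7/10 result.
results = [{"is_correct": True}] * 7 + [{"is_correct": False}] * 3
stats = calculate_accuracy_with_confidence(results)
print(stats["accuracy"], stats["confidence_interval"])    # 0.7, roughly (0.40, 0.89)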
benchmarks/gpqa_benchmark.py ADDED
@@ -0,0 +1,126 @@
+ from .base_benchmark import BaseBenchmark
+ from typing import Dict, Any, Optional, Tuple
+ from datasets import load_dataset
+ import re
+ import random
+ from .evaluation_utils import extract_answer_mmlu
+
+ class GPQABenchmark(BaseBenchmark):
+     """GPQA (Graduate-Level Google-Proof Q&A) benchmark"""
+
+     def __init__(self):
+         super().__init__(name="GPQA", dataset_name="Idavidrein/gpqa")
+
+     async def load_dataset(self, sample_size: Optional[int] = None, **kwargs):
+         """Load GPQA dataset"""
+         # GPQA has different subsets: gpqa_main, gpqa_diamond, gpqa_extended
+         subset = kwargs.get('subset', 'gpqa_main')
+
+         try:
+             # Set HF token if available
+             import os
+             hf_token = os.getenv('HF_TOKEN') or os.getenv('HUGGING_FACE_HUB_TOKEN')
+             if hf_token:
+                 dataset = load_dataset(self.dataset_name, subset, split='train', token=hf_token)
+             else:
+                 dataset = load_dataset(self.dataset_name, subset, split='train')
+         except Exception as e:
+             if "gated dataset" in str(e) or "authentication" in str(e).lower():
+                 raise Exception(
+                     "GPQA dataset requires authentication. Please:\n"
+                     "1. Set HF_TOKEN environment variable\n"
+                     "2. Request access at https://huggingface.co/datasets/Idavidrein/gpqa\n"
+                     f"Original error: {e}"
+                 )
+             # Fallback to main if subset not found
+             try:
+                 dataset = load_dataset(self.dataset_name, 'gpqa_main', split='train')
+             except:
+                 raise e
+
+         self.dataset = []
+         for sample in dataset:
+             # GPQA has these fields: Question, Correct Answer, Incorrect Answer 1-3
+             choices = [
+                 sample.get('Correct Answer', ''),
+                 sample.get('Incorrect Answer 1', ''),
+                 sample.get('Incorrect Answer 2', ''),
+                 sample.get('Incorrect Answer 3', '')
+             ]
+
+             # Shuffle choices and track correct index
+             import random
+             indices = list(range(4))
+             random.shuffle(indices)
+             shuffled_choices = [choices[i] for i in indices]
+             correct_index = indices.index(0)  # 0 was the correct answer position
+
+             self.dataset.append({
+                 'question': sample['Question'],
+                 'choices': shuffled_choices,
+                 'correct_index': correct_index,
+                 'subject': sample.get('Subdomain', 'Unknown'),
+                 'raw_sample': sample
+             })
+
+         # Shuffle dataset
+         random.shuffle(self.dataset)
+
+         if sample_size and len(self.dataset) > sample_size:
+             self.dataset = self.dataset[:sample_size]
+
+     def format_prompt(self, sample: Dict[str, Any]) -> str:
+         """Format GPQA question as prompt matching official format"""
+         question = sample['question']
+         choices = sample['choices']
+
+         # GPQA uses a simpler format in lm-eval
+         prompt = f"""What is the correct answer to this question: {question}
+
+ Choices:
+ (A) {choices[0]}
+ (B) {choices[1]}
+ (C) {choices[2]}
+ (D) {choices[3]}
+
+ Answer:"""
+         return prompt
+
+     async def evaluate_sample(self, api, sample: Dict[str, Any], **kwargs) -> Tuple[bool, Dict[str, Any]]:
+         """Evaluate a single GPQA sample"""
+         prompt = self.format_prompt(sample)
+
+         try:
+             response = await api.generate_with_retry(prompt, **kwargs)
+
+             # Extract answer from response using standard extraction
+             predicted_letter = extract_answer_mmlu(response)
+
+             if predicted_letter:
+                 predicted_index = ord(predicted_letter) - ord('A')
+             else:
+                 # If no clear answer, mark as incorrect
+                 predicted_index = -1
+
+             correct_index = sample['correct_index']
+             is_correct = predicted_index == correct_index
+
+             result = {
+                 'question': sample['question'],
+                 'choices': sample['choices'],
+                 'correct_answer': correct_index,
+                 'predicted_answer': predicted_index,
+                 'model_response': response,
+                 'is_correct': is_correct,
+                 'subject': sample['subject']
+             }
+
+             return is_correct, result
+
+         except Exception as e:
+             result = {
+                 'question': sample['question'],
+                 'error': str(e),
+                 'is_correct': False
+             }
+             return False, result
benchmarks/gsm8k_benchmark.py ADDED
@@ -0,0 +1,115 @@
+ from .base_benchmark import BaseBenchmark
+ from typing import Dict, Any, Optional, Tuple
+ from datasets import load_dataset
+ import re
+ from .prompt_templates import get_gsm8k_cot_prompt
+
+ class GSM8KBenchmark(BaseBenchmark):
+     """GSM8K (Grade School Math 8K) benchmark"""
+
+     def __init__(self):
+         super().__init__(name="GSM8K", dataset_name="gsm8k")
+
+     async def load_dataset(self, sample_size: Optional[int] = None, **kwargs):
+         """Load GSM8K dataset"""
+         dataset = load_dataset(self.dataset_name, 'main', split='test')
+
+         self.dataset = []
+         for sample in dataset:
+             self.dataset.append({
+                 'question': sample['question'],
+                 'answer': sample['answer'],
+                 'raw_sample': sample
+             })
+
+         # Shuffle dataset
+         import random
+         random.shuffle(self.dataset)
+
+         if sample_size and len(self.dataset) > sample_size:
+             self.dataset = self.dataset[:sample_size]
+
+     def extract_answer_from_solution(self, solution: str) -> Optional[str]:
+         """Extract numerical answer from GSM8K solution string"""
+         # GSM8K answers are in format: "... #### number"
+         match = re.search(r'#### ([\-0-9\.\,]+)', solution)
+         if match:
+             answer_str = match.group(1).replace(',', '')
+             return answer_str
+         return None
+
+     def extract_number_from_response(self, response: str) -> Optional[str]:
+         """Extract the final numerical answer from model response"""
+         # Official lm-eval uses these patterns in order:
+
+         # 1. Look for "The answer is X" pattern (CoT standard)
+         match = re.search(r'The answer is ([\-0-9\.\,]+)\.?', response, re.IGNORECASE)
+         if match:
+             # Strip commas and any trailing period captured by the character class
+             return match.group(1).replace(',', '').rstrip('.')
+
+         # 2. Look for #### format (if model knows GSM8K format)
+         match = re.search(r'#### ([\-0-9\.\,]+)', response)
+         if match:
+             return match.group(1).replace(',', '')
+
+         # 3. Flexible extraction: find all numbers and take the last one
+         # This matches lm-eval's flexible-extract with group_select: -1
+         numbers = re.findall(r'(-?[$0-9.,]{2,})|(-?[0-9]+)', response)
+         if numbers:
+             # Flatten tuples and get last non-empty match
+             flat_numbers = [n for group in numbers for n in group if n]
+             if flat_numbers:
+                 last_number = flat_numbers[-1]
+                 # Clean the number
+                 cleaned = last_number.replace('$', '').replace(',', '')
+                 try:
+                     # Validate it's a proper number
+                     float(cleaned)
+                     return cleaned
+                 except:
+                     pass
+
+         return None
+
+     def format_prompt(self, sample: Dict[str, Any]) -> str:
+         """Format GSM8K question as prompt with CoT examples"""
+         # Use the standard CoT prompt from lm-eval
+         return get_gsm8k_cot_prompt(sample['question'])
+
+     async def evaluate_sample(self, api, sample: Dict[str, Any], **kwargs) -> Tuple[bool, Dict[str, Any]]:
+         """Evaluate a single GSM8K sample"""
+         prompt = self.format_prompt(sample)
+
+         try:
+             response = await api.generate_with_retry(prompt, **kwargs)
+
+             # Extract correct answer
+             correct_answer = self.extract_answer_from_solution(sample['answer'])
+
+             # Extract model's answer
+             model_answer = self.extract_number_from_response(response)
+
+             # Check if answers match (exact string match after normalization)
+             is_correct = False
+             if correct_answer is not None and model_answer is not None:
+                 # GSM8K uses exact match on normalized strings
+                 is_correct = correct_answer == model_answer
+
+             result = {
+                 'question': sample['question'],
+                 'correct_answer': correct_answer,
+                 'model_answer': model_answer,
+                 'model_response': response,
+                 'is_correct': is_correct,
+                 'solution': sample['answer']
+             }
+
+             return is_correct, result
+
+         except Exception as e:
+             result = {
+                 'question': sample['question'],
+                 'error': str(e),
+                 'is_correct': False
+             }
+             return False, result
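For illustration, the extraction helpers applied to toy strings (assuming the module as shown above):

from benchmarks.gsm8k_benchmark import GSM8KBenchmark

bench = GSM8KBenchmark()
reply = "She spent 5 * 3 = 15 dollars, so 23 - 15 = 8 dollars are left. The answer is 8."
print(bench.extract_number_from_response(reply))              # -> 8
print(bench.extract_answer_from_solution("... #### 1,234"))   # -> 1234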
benchmarks/humaneval_benchmark.py ADDED
@@ -0,0 +1,134 @@
+ from .base_benchmark import BaseBenchmark
+ from typing import Dict, Any, Optional, Tuple
+ from datasets import load_dataset
+ import subprocess
+ import tempfile
+ import os
+ import sys
+ import re
+
+ class HumanEvalBenchmark(BaseBenchmark):
+     """HumanEval code generation benchmark"""
+
+     def __init__(self):
+         super().__init__(name="HumanEval", dataset_name="openai_humaneval")
+
+     async def load_dataset(self, sample_size: Optional[int] = None, **kwargs):
+         """Load HumanEval dataset"""
+         dataset = load_dataset(self.dataset_name, split='test')
+
+         self.dataset = []
+         for sample in dataset:
+             self.dataset.append({
+                 'task_id': sample['task_id'],
+                 'prompt': sample['prompt'],
+                 'canonical_solution': sample['canonical_solution'],
+                 'test': sample['test'],
+                 'entry_point': sample['entry_point'],
+                 'raw_sample': sample
+             })
+
+         if sample_size and len(self.dataset) > sample_size:
+             self.dataset = self.dataset[:sample_size]
+
+     def format_prompt(self, sample: Dict[str, Any]) -> str:
+         """Format HumanEval problem as prompt"""
+         # lm-eval uses just the raw prompt without additional instructions
+         return sample['prompt']
+
+     def extract_code(self, response: str, entry_point: str, prompt: str) -> str:
+         """Extract code from model response"""
+         # Clean the response - handle markdown code blocks
+         code = response.strip()
+
+         # Remove markdown code block markers
+         if code.startswith('```python'):
+             code = code[9:]  # Remove ```python
+         elif code.startswith('```'):
+             code = code[3:]  # Remove ```
+
+         if code.endswith('```'):
+             code = code[:-3]  # Remove trailing ```
+
+         code = code.strip()
+
+         # If the response contains the complete function, use it directly
+         if f"def {entry_point}" in code:
+             return code
+         else:
+             # Fallback: assume it's completion to be added after prompt
+             stop_sequences = ['\nclass', '\ndef', '\n#', '\nif __name__']
+
+             for stop in stop_sequences:
+                 pos = code.find(stop)
+                 if pos > 0:
+                     code = code[:pos]
+                     break
+
+             return prompt + code
+
+     def run_test(self, code: str, test_code: str, entry_point: str) -> Tuple[bool, str]:
+         """Run the test code and return success status and output"""
+         with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as f:
+             # Write the program plus its tests; HumanEval's test code defines
+             # check() but never calls it, so invoke it explicitly here
+             f.write(code + '\n\n' + test_code + f'\n\ncheck({entry_point})\n')
+             f.flush()
+
+             try:
+                 # Run the test
+                 result = subprocess.run(
+                     [sys.executable, f.name],
+                     capture_output=True,
+                     text=True,
+                     timeout=10
+                 )
+
+                 if result.returncode == 0:
+                     return True, result.stdout
+                 else:
+                     return False, result.stderr
+
+             except subprocess.TimeoutExpired:
+                 return False, "Timeout: Code execution took too long"
+             except Exception as e:
+                 return False, f"Error running test: {str(e)}"
+             finally:
+                 # Clean up
+                 try:
+                     os.unlink(f.name)
+                 except:
+                     pass
+
+     async def evaluate_sample(self, api, sample: Dict[str, Any], **kwargs) -> Tuple[bool, Dict[str, Any]]:
+         """Evaluate a single HumanEval sample"""
+         prompt = self.format_prompt(sample)
+
+         try:
+             response = await api.generate_with_retry(prompt, **kwargs)
+
+             # Extract code from response
+             code = self.extract_code(response, sample['entry_point'], sample['prompt'])
+
+             # Run the test
+             is_correct, test_output = self.run_test(code, sample['test'], sample['entry_point'])
+
+             result = {
+                 'task_id': sample['task_id'],
+                 'prompt': sample['prompt'],
+                 'model_response': response,
+                 'extracted_code': code,
+                 'is_correct': is_correct,
+                 'test_output': test_output,
+                 'entry_point': sample['entry_point']
+             }
+
+             return is_correct, result
+
+         except Exception as e:
+             result = {
+                 'task_id': sample['task_id'],
+                 'prompt': sample['prompt'],
+                 'error': str(e),
+                 'is_correct': False
+             }
+             return False, result
benchmarks/math_benchmark.py ADDED
@@ -0,0 +1,125 @@
+ from .base_benchmark import BaseBenchmark
+ from typing import Dict, Any, Optional, Tuple
+ from datasets import load_dataset
+ import re
+ from .evaluation_utils import normalize_math_answer, is_math_equiv
+
+ class MATHBenchmark(BaseBenchmark):
+     """MATH (Mathematics) benchmark for competition-level problems"""
+
+     LEVELS = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5']
+     TYPES = ['Algebra', 'Counting & Probability', 'Geometry', 'Intermediate Algebra',
+              'Number Theory', 'Prealgebra', 'Precalculus']
+
+     def __init__(self):
+         super().__init__(name="MATH", dataset_name="hendrycks/competition_math")
+
+     async def load_dataset(self, sample_size: Optional[int] = None, **kwargs):
+         """Load MATH dataset"""
+         dataset = load_dataset(self.dataset_name, split='test')
+
+         # Filter by difficulty level if specified
+         difficulty_levels = kwargs.get('difficulty', ['all'])
+         if 'all' not in difficulty_levels:
+             dataset = dataset.filter(lambda x: x['level'] in difficulty_levels)
+
+         self.dataset = []
+         for sample in dataset:
+             self.dataset.append({
+                 'problem': sample['problem'],
+                 'solution': sample['solution'],
+                 'level': sample['level'],
+                 'type': sample['type'],
+                 'raw_sample': sample
+             })
+
+         # Shuffle dataset
+         import random
+         random.shuffle(self.dataset)
+
+         if sample_size and len(self.dataset) > sample_size:
+             self.dataset = self.dataset[:sample_size]
+
+     def extract_answer(self, solution: str) -> Optional[str]:
+         """Extract the final answer from MATH solution using lm-eval's method"""
+         # Find all boxed content
+         boxed_matches = re.findall(r'\\boxed\{([^{}]*)\}', solution)
+         fbox_matches = re.findall(r'\\fbox\{([^{}]*)\}', solution)
+
+         all_matches = boxed_matches + fbox_matches
+
+         if all_matches:
+             # Return the last boxed answer
+             return all_matches[-1].strip()
+
+         return None
+
+     def extract_model_answer(self, response: str) -> Optional[str]:
+         """Extract answer from model response"""
+         # Try to find boxed answer first
+         answer = self.extract_answer(response)
+         if answer:
+             return answer
+
+         # If no boxed answer, look for common patterns
+         # "The answer is X"
+         match = re.search(r'answer is[\s:]*([^.\n]+)', response, re.IGNORECASE)
+         if match:
+             return match.group(1).strip()
+
+         # "Therefore, X"
+         match = re.search(r'therefore[,\s]+([^.\n]+)', response, re.IGNORECASE)
+         if match:
+             return match.group(1).strip()
+
+         return None
+
+     def format_prompt(self, sample: Dict[str, Any]) -> str:
+         """Format MATH problem as prompt"""
+         prompt = f"""Solve the following mathematics problem step by step. Show all your work and put your final answer in the format \\boxed{{answer}}.
+
+ Problem: {sample['problem']}
+
+ Solution:"""
+         return prompt
+
+     async def evaluate_sample(self, api, sample: Dict[str, Any], **kwargs) -> Tuple[bool, Dict[str, Any]]:
+         """Evaluate a single MATH sample"""
+         prompt = self.format_prompt(sample)
+
+         try:
+             response = await api.generate_with_retry(prompt, **kwargs)
+
+             # Extract correct answer
+             correct_answer = self.extract_answer(sample['solution'])
+
+             # Extract model's answer
+             model_answer = self.extract_model_answer(response)
+
+             # Compare answers using mathematical equivalence
+             is_correct = False
+             if correct_answer and model_answer:
+                 # Use the official equivalence checking
+                 is_correct = is_math_equiv(model_answer, correct_answer)
+
+             result = {
+                 'problem': sample['problem'],
+                 'level': sample['level'],
+                 'type': sample['type'],
+                 'correct_answer': correct_answer,
+                 'model_answer': model_answer,
+                 'model_response': response,
+                 'is_correct': is_correct
+             }
+
+             return is_correct, result
+
+         except Exception as e:
+             result = {
+                 'problem': sample['problem'],
+                 'level': sample['level'],
+                 'type': sample['type'],
+                 'error': str(e),
+                 'is_correct': False
+             }
+             return False, result
benchmarks/mmlu_benchmark.py ADDED
@@ -0,0 +1,134 @@
+ from .base_benchmark import BaseBenchmark
+ from typing import Dict, Any, Optional, Tuple, List
+ from datasets import load_dataset
+ import re
+ import random
+ from .prompt_templates import get_mmlu_prompt
+ from .evaluation_utils import extract_answer_mmlu
+
+ class MMLUBenchmark(BaseBenchmark):
+     """MMLU (Massive Multitask Language Understanding) benchmark"""
+
+     SUBJECTS = [
+         'abstract_algebra', 'anatomy', 'astronomy', 'business_ethics',
+         'clinical_knowledge', 'college_biology', 'college_chemistry',
+         'college_computer_science', 'college_mathematics', 'college_medicine',
+         'college_physics', 'computer_security', 'conceptual_physics',
+         'econometrics', 'electrical_engineering', 'elementary_mathematics',
+         'formal_logic', 'global_facts', 'high_school_biology',
+         'high_school_chemistry', 'high_school_computer_science',
+         'high_school_european_history', 'high_school_geography',
+         'high_school_government_and_politics', 'high_school_macroeconomics',
+         'high_school_mathematics', 'high_school_microeconomics',
+         'high_school_physics', 'high_school_psychology', 'high_school_statistics',
+         'high_school_us_history', 'high_school_world_history', 'human_aging',
+         'human_sexuality', 'international_law', 'jurisprudence',
+         'logical_fallacies', 'machine_learning', 'management', 'marketing',
+         'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios',
+         'nutrition', 'philosophy', 'prehistory', 'professional_accounting',
+         'professional_law', 'professional_medicine', 'professional_psychology',
+         'public_relations', 'security_studies', 'sociology', 'us_foreign_policy',
+         'virology', 'world_religions'
+     ]
+
+     def __init__(self):
+         super().__init__(name="MMLU", dataset_name="cais/mmlu")
+
+     async def load_dataset(self, sample_size: Optional[int] = None, **kwargs):
+         """Load MMLU dataset"""
+         subjects = kwargs.get('subjects', ['all'])
+
+         if 'all' in subjects:
+             subjects = self.SUBJECTS
+         else:
+             subjects = [s for s in subjects if s in self.SUBJECTS]
+
+         self.dataset = []
+         self.few_shot_examples = {}  # Store few-shot examples per subject
+
+         for subject in subjects:
+             try:
+                 # Load dev split for few-shot examples
+                 dev_ds = load_dataset(self.dataset_name, subject, split='dev')
+                 # Standard MMLU uses 5-shot
+                 self.few_shot_examples[subject] = [
+                     {
+                         'question': ex['question'],
+                         'choices': ex['choices'],
+                         'answer': ex['answer']
+                     }
+                     for ex in list(dev_ds)[:5]
+                 ]
+
+                 # Load test split for evaluation
+                 test_ds = load_dataset(self.dataset_name, subject, split='test')
+
+                 for sample in test_ds:
+                     self.dataset.append({
+                         'subject': subject,
+                         'question': sample['question'],
+                         'choices': sample['choices'],
+                         'answer': sample['answer'],  # 0-3 index
+                         'raw_sample': sample
+                     })
+             except Exception as e:
+                 print(f"Error loading {subject}: {e}")
+                 continue
+
+         # Shuffle dataset
+         random.shuffle(self.dataset)
+
+         if sample_size and len(self.dataset) > sample_size:
+             self.dataset = self.dataset[:sample_size]
+
+     def format_prompt(self, sample: Dict[str, Any]) -> str:
+         """Format MMLU question as prompt with few-shot examples"""
+         subject = sample['subject']
+         few_shot_examples = self.few_shot_examples.get(subject, [])
+
+         return get_mmlu_prompt(
+             question=sample['question'],
+             choices=sample['choices'],
+             subject=subject.replace('_', ' ').title(),
+             few_shot_examples=few_shot_examples
+         )
+
+     async def evaluate_sample(self, api, sample: Dict[str, Any], **kwargs) -> Tuple[bool, Dict[str, Any]]:
+         """Evaluate a single MMLU sample"""
+         prompt = self.format_prompt(sample)
+
+         try:
+             response = await api.generate_with_retry(prompt, **kwargs)
+
+             # Extract answer from response using standard extraction
+             predicted_letter = extract_answer_mmlu(response)
+
+             if predicted_letter:
+                 predicted_index = ord(predicted_letter) - ord('A')
+             else:
+                 # If no clear answer, mark as incorrect
+                 predicted_index = -1
+
+             correct_index = sample['answer']
+             is_correct = predicted_index == correct_index
+
+             result = {
+                 'subject': sample['subject'],
+                 'question': sample['question'],
+                 'choices': sample['choices'],
+                 'correct_answer': correct_index,
+                 'predicted_answer': predicted_index,
+                 'model_response': response,
+                 'is_correct': is_correct
+             }
+
+             return is_correct, result
+
+         except Exception as e:
+             result = {
+                 'subject': sample['subject'],
+                 'question': sample['question'],
+                 'error': str(e),
+                 'is_correct': False
+             }
+             return False, result
benchmarks/prompt_templates.py ADDED
@@ -0,0 +1,86 @@
+ """Standard prompt templates matching lm-eval implementation"""
+
+ MMLU_PROMPT_TEMPLATE = """The following are multiple choice questions (with answers) about {subject}.
+
+ {few_shot_examples}
+ Question: {question}
+ A) {choice_a}
+ B) {choice_b}
+ C) {choice_c}
+ D) {choice_d}
+ Answer:"""
+
+ MMLU_FEW_SHOT_TEMPLATE = """Question: {question}
+ A) {choice_a}
+ B) {choice_b}
+ C) {choice_c}
+ D) {choice_d}
+ Answer: {answer}
+
+ """
+
+ GSM8K_PROMPT_TEMPLATE = """Question: {question}
+ Let's think step by step.
+ """
+
+ GSM8K_COT_TEMPLATE = """Q: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?
+ A: We start with 15 trees. Later we have 21 trees. The difference must be the number of trees they planted. So, they must have planted 21 - 15 = 6 trees. The answer is 6.
+
+ Q: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?
+ A: There are 3 cars in the parking lot already. 2 more arrive. Now there are 3 + 2 = 5 cars. The answer is 5.
+
+ Q: Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?
+ A: Leah had 32 chocolates and Leah's sister had 42. That means there were originally 32 + 42 = 74 chocolates. 35 have been eaten. So in total they still have 74 - 35 = 39 chocolates. The answer is 39.
+
+ Q: Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. How many lollipops did Jason give to Denny?
+ A: Jason had 20 lollipops. Since he only has 12 now, he must have given the rest to Denny. The number of lollipops he gave to Denny must have been 20 - 12 = 8 lollipops. The answer is 8.
+
+ Q: Shawn has five toys. For Christmas, he got two toys each from his mom and dad. How many toys does he have now?
+ A: He has 5 toys. He got 2 from mom, so after that he has 5 + 2 = 7 toys. Then he got 2 more from dad, so in total he has 7 + 2 = 9 toys. The answer is 9.
+
+ Q: There were nine computers in the server room. Five more computers were installed each day, from monday to thursday. How many computers are now in the server room?
+ A: There are 4 days from monday to thursday. 5 computers were added each day. That means in total 4 * 5 = 20 computers were added. There were 9 computers in the beginning, so now there are 9 + 20 = 29 computers. The answer is 29.
+
+ Q: Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?
+ A: Michael initially had 58 balls. He lost 23 on Tuesday, so after that he has 58 - 23 = 35 balls. On Wednesday he lost 2 more so now he has 35 - 2 = 33 balls. The answer is 33.
+
+ Q: Olivia has $23. She bought five bagels for $3 each. How much money does she have left?
+ A: She bought 5 bagels for $3 each. This means she spent 5 * $3 = $15 on the bagels. She had $23 in beginning, so now she has $23 - $15 = $8. The answer is 8.
+
+ Q: {question}
+ A:"""
+
+ HUMANEVAL_PROMPT_TEMPLATE = """Complete the following Python function:
+
+ {prompt}"""
+
+ def get_mmlu_prompt(question, choices, subject="", few_shot_examples=None):
+     """Generate MMLU prompt with few-shot examples"""
+     if not few_shot_examples:
+         few_shot_examples = []
+
+     # Format few-shot examples
+     examples = ""
+     for ex in few_shot_examples:
+         examples += MMLU_FEW_SHOT_TEMPLATE.format(
+             question=ex['question'],
+             choice_a=ex['choices'][0],
+             choice_b=ex['choices'][1],
+             choice_c=ex['choices'][2],
+             choice_d=ex['choices'][3],
+             answer=chr(ord('A') + ex['answer'])
+         )
+
+     return MMLU_PROMPT_TEMPLATE.format(
+         subject=subject,
+         few_shot_examples=examples.rstrip(),  # Remove trailing newline
+         question=question,
+         choice_a=choices[0],
+         choice_b=choices[1],
+         choice_c=choices[2],
+         choice_d=choices[3]
+     )
+
+ def get_gsm8k_cot_prompt(question):
+     """Generate GSM8K prompt with Chain-of-Thought examples"""
+     return GSM8K_COT_TEMPLATE.format(question=question)
check_deployment.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Pre-deployment checklist for HF Space
4
+ """
5
+
6
+ import os
7
+ import sys
8
+ from pathlib import Path
9
+
10
+ def check_deployment_ready():
11
+ """Check if everything is ready for HF deployment"""
12
+
13
+ print("πŸ” Pre-deployment checklist:\n")
14
+
15
+ checks = []
16
+
17
+ # Check files exist
18
+ required_files = [
19
+ "app.py",
20
+ "run_evaluation.py",
21
+ "requirements.txt",
22
+ ".env.example",
23
+ "run_hf_space.py",
24
+ "official_config.yaml"
25
+ ]
26
+
27
+ for file in required_files:
28
+ if Path(file).exists():
29
+ checks.append((f"βœ… {file} exists", True))
30
+ else:
31
+ checks.append((f"❌ {file} missing", False))
32
+
33
+ # Check API directories
34
+ if Path("apis").is_dir() and list(Path("apis").glob("*.py")):
35
+ checks.append(("βœ… APIs directory configured", True))
36
+ else:
37
+ checks.append(("❌ APIs directory missing or empty", False))
38
+
39
+ # Check benchmarks directory
40
+ if Path("benchmarks").is_dir() and Path("benchmarks/gpqa_benchmark.py").exists():
41
+ checks.append(("βœ… GPQA benchmark implementation found", True))
42
+ else:
43
+ checks.append(("❌ GPQA benchmark missing", False))
44
+
45
+ # Check for sensitive data
46
+ if Path(".env").exists():
47
+ checks.append(("⚠️ .env file exists - make sure it's in .gitignore!", None))
48
+
49
+ # Print results
50
+ for check, status in checks:
51
+ print(check)
52
+
53
+ all_good = all(status is not False for _, status in checks)
54
+
55
+ if all_good:
56
+ print("\nβœ… Ready for deployment!")
57
+ print("\nNext steps:")
58
+ print("1. Set GROK_API_KEY and HF_TOKEN in HF Space secrets")
59
+ print("2. Make sure you have GPQA dataset access")
60
+ print("3. Push to Hugging Face")
61
+ else:
62
+ print("\n❌ Issues found - please fix before deploying")
63
+
64
+ return all_good
65
+
66
+ if __name__ == "__main__":
67
+ check_deployment_ready()
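
The script uses a simple (label, status) convention: True/False feeds the final verdict, while None marks a warning. A possible extension, sketched under that same convention, would also warn when the secrets named elsewhere in this repo are missing locally (GROK_API_KEY and HF_TOKEN come from the surrounding files; the helper itself is hypothetical):

    import os

    def check_secrets(checks):
        # Append (label, status) tuples; None keeps the item as a warning only,
        # since on Hugging Face the values come from Space secrets, not .env.
        for var in ("GROK_API_KEY", "HF_TOKEN"):
            if os.getenv(var):
                checks.append((f"βœ… {var} is set", True))
            else:
                checks.append((f"⚠️ {var} not set locally (set it in HF Space secrets)", None))
        return checks
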
official_config.yaml ADDED
@@ -0,0 +1,77 @@
1
+ # Official benchmark configuration matching lm-eval settings
2
+ models:
3
+ openai:
4
+ api_key: "${OPENAI_API_KEY}"
5
+ models:
6
+ - "gpt-4o"
7
+ - "gpt-4-turbo"
8
+ - "gpt-3.5-turbo"
9
+ anthropic:
10
+ api_key: "${ANTHROPIC_API_KEY}"
11
+ models:
12
+ - "claude-3-5-sonnet-20241022"
13
+ - "claude-3-opus-20240229"
14
+ - "claude-3-haiku-20240307"
15
+ grok:
16
+ api_key: "${GROK_API_KEY}"
17
+ base_url: "https://api.x.ai/v1"
18
+ models:
19
+ - "grok-4-0709"
20
+ - "grok-beta"
21
+ - "grok-2-latest"
22
+
23
+ benchmarks:
24
+ mmlu:
25
+ enabled: true
26
+ sample_size: null # Use full dataset
27
+ subjects: ["all"]
28
+ # Official settings
29
+ num_fewshot: 5
30
+ doc_to_choice: ["A", "B", "C", "D"]
31
+
32
+ gsm8k:
33
+ enabled: true
34
+ sample_size: null # Full test set (1319 samples)
35
+ # Official settings
36
+ num_fewshot: 8 # 8-shot CoT
37
+ use_cot: true
38
+
39
+ humaneval:
40
+ enabled: true
41
+ sample_size: null # Full test set (164 samples)
42
+ # Official settings
43
+ pass_at_k: [1] # Calculate Pass@1
44
+ do_sample: false # Deterministic generation
45
+
46
+ gpqa:
47
+ enabled: true
48
+ sample_size: null
49
+ subset: "gpqa_main" # or "gpqa_diamond" for harder subset
50
+
51
+ math:
52
+ enabled: true
53
+ sample_size: null # Full test set (5000 samples)
54
+ # Official settings
55
+ use_sympy: true # Use SymPy for equivalence checking
56
+
57
+ evaluation:
58
+ # Generation settings matching lm-eval
59
+ temperature: 0.0 # Deterministic for evaluation
60
+ max_tokens: 2048
61
+ top_p: 1.0
62
+
63
+ # For HumanEval code generation
64
+ humaneval_max_tokens: 1024
65
+
66
+ # System settings
67
+ timeout: 60 # Increased for complex problems
68
+ max_retries: 3
69
+ concurrent_requests: 5
70
+ rate_limit_delay: 0.5
71
+
72
+ output:
73
+ save_results: true
74
+ results_dir: "results"
75
+ generate_report: true
76
+ plot_graphs: true
77
+ save_raw_outputs: true # Save all model outputs for debugging
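
The "${...}" values above are plain strings in the YAML; they are resolved against environment variables when the config is loaded (load_config in run_evaluation.py below does exactly this). A minimal stand-alone sketch of that substitution, assuming official_config.yaml sits in the working directory:

    import os
    import yaml

    def expand_env(obj):
        # Replace "${VAR}" strings with os.environ values, falling back to the literal placeholder.
        if isinstance(obj, str) and obj.startswith("${") and obj.endswith("}"):
            return os.getenv(obj[2:-1], obj)
        if isinstance(obj, dict):
            return {k: expand_env(v) for k, v in obj.items()}
        if isinstance(obj, list):
            return [expand_env(v) for v in obj]
        return obj

    with open("official_config.yaml") as f:
        config = expand_env(yaml.safe_load(f))

    print(config["models"]["grok"]["base_url"])                                    # https://api.x.ai/v1
    print([name for name, b in config["benchmarks"].items() if b.get("enabled")])  # enabled benchmarks
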
requirements.txt ADDED
@@ -0,0 +1,17 @@
1
+ openai>=1.0.0
2
+ anthropic>=0.20.0
3
+ requests>=2.28.0
4
+ numpy>=1.21.0
5
+ pandas>=1.3.0
6
+ tqdm>=4.62.0
7
+ pyyaml>=6.0
8
+ datasets>=2.0.0
9
+ transformers>=4.20.0
10
+ scipy>=1.7.0
11
+ matplotlib>=3.5.0
12
+ seaborn>=0.11.0
13
+ python-dotenv>=0.19.0
14
+ aiohttp>=3.8.0
15
+ sympy>=1.11.0
16
+ gradio>=4.31.0
17
+ huggingface_hub>=0.20.0
run_evaluation.py ADDED
@@ -0,0 +1,225 @@
1
+ """Main script to run AI model evaluation benchmarks"""
2
+
3
+ import argparse
4
+ import asyncio
5
+ import json
6
+ import os
7
+ import yaml
8
+ from datetime import datetime
9
+ from typing import List, Dict, Any
10
+ from dotenv import load_dotenv
11
+ import pandas as pd
12
+
13
+ from apis.api_factory import APIFactory
14
+ from benchmarks import get_benchmark, BenchmarkResult
15
+
16
+ # Load environment variables
17
+ load_dotenv()
18
+
19
+ def load_config(config_path: str = 'official_config.yaml') -> dict:
20
+ """Load configuration from YAML file"""
21
+ with open(config_path, 'r') as f:
22
+ config = yaml.safe_load(f)
23
+
24
+ # Replace environment variables
25
+ def replace_env_vars(obj):
26
+ if isinstance(obj, str) and obj.startswith('${') and obj.endswith('}'):
27
+ env_var = obj[2:-1]
28
+ return os.getenv(env_var, obj)
29
+ elif isinstance(obj, dict):
30
+ return {k: replace_env_vars(v) for k, v in obj.items()}
31
+ elif isinstance(obj, list):
32
+ return [replace_env_vars(item) for item in obj]
33
+ return obj
34
+
35
+ return replace_env_vars(config)
36
+
37
+ def save_results(results: List[BenchmarkResult], output_dir: str):
38
+ """Save evaluation results"""
39
+ os.makedirs(output_dir, exist_ok=True)
40
+
41
+ # Create timestamp for this run
42
+ timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
43
+
44
+ # Save detailed results as JSON
45
+ detailed_results = []
46
+ for result in results:
47
+ detailed_results.append({
48
+ 'benchmark': result.benchmark_name,
49
+ 'model': result.model_name,
50
+ 'total_questions': result.total_questions,
51
+ 'correct': result.correct,
52
+ 'accuracy': result.accuracy,
53
+ 'avg_response_time': result.avg_response_time,
54
+ 'timestamp': timestamp
55
+ })
56
+
57
+ json_path = os.path.join(output_dir, f'results_{timestamp}.json')
58
+ with open(json_path, 'w') as f:
59
+ json.dump(detailed_results, f, indent=2)
60
+
61
+ # Save summary as CSV
62
+ df = pd.DataFrame(detailed_results)
63
+ csv_path = os.path.join(output_dir, f'summary_{timestamp}.csv')
64
+ df.to_csv(csv_path, index=False)
65
+
66
+ # Save raw results for debugging
67
+ for result in results:
68
+ raw_path = os.path.join(output_dir, f'{result.model_name}_{result.benchmark_name}_{timestamp}_raw.json')
69
+ with open(raw_path, 'w') as f:
70
+ json.dump(result.raw_results, f, indent=2)
71
+
72
+ return json_path, csv_path
73
+
74
+ def print_results_table(results: List[BenchmarkResult]):
75
+ """Print results in a nice table format"""
76
+ if not results:
77
+ return
78
+
79
+ # Group by model
80
+ model_results = {}
81
+ for result in results:
82
+ if result.model_name not in model_results:
83
+ model_results[result.model_name] = {}
84
+ model_results[result.model_name][result.benchmark_name] = result
85
+
86
+ # Print header
87
+ benchmarks = list(set(r.benchmark_name for r in results))
88
+ benchmarks.sort()
89
+
90
+ print("\n" + "="*80)
91
+ print("EVALUATION RESULTS")
92
+ print("="*80)
93
+
94
+ # Create table
95
+ header = ["Model"] + benchmarks + ["Average"]
96
+ print(f"{'Model':<20}", end="")
97
+ for bench in benchmarks:
98
+ print(f"{bench:<15}", end="")
99
+ print(f"{'Average':<10}")
100
+ print("-"*80)
101
+
102
+ # Print results for each model
103
+ for model, bench_results in model_results.items():
104
+ print(f"{model:<20}", end="")
105
+ scores = []
106
+
107
+ for bench in benchmarks:
108
+ if bench in bench_results:
109
+ score = bench_results[bench].accuracy * 100
110
+ scores.append(score)
111
+ print(f"{score:>6.1f}% ", end="")
112
+ else:
113
+ print(f"{'N/A':<15}", end="")
114
+
115
+ # Calculate average
116
+ if scores:
117
+ avg = sum(scores) / len(scores)
118
+ print(f"{avg:>6.1f}%")
119
+ else:
120
+ print("N/A")
121
+
122
+ print("="*80)
123
+
124
+ async def run_single_evaluation(api, benchmark_name: str, config: dict) -> BenchmarkResult:
125
+ """Run a single benchmark evaluation"""
126
+ benchmark = get_benchmark(benchmark_name)
127
+
128
+ # Get benchmark-specific config
129
+ bench_config = config['benchmarks'].get(benchmark_name, {})
130
+ eval_config = config['evaluation']
131
+
132
+ # Merge configs
133
+ kwargs = {
134
+ **eval_config,
135
+ 'concurrent_requests': eval_config.get('concurrent_requests', 5)
136
+ }
137
+
138
+ # Add benchmark-specific configs but exclude sample_size
139
+ for key, value in bench_config.items():
140
+ if key != 'sample_size':
141
+ kwargs[key] = value
142
+
143
+ # Run benchmark
144
+ result = await benchmark.run_benchmark(
145
+ api,
146
+ sample_size=bench_config.get('sample_size'),
147
+ **kwargs
148
+ )
149
+
150
+ return result
151
+
152
+ async def main():
153
+ parser = argparse.ArgumentParser(description='Run AI benchmark evaluation')
154
+ parser.add_argument('--models', nargs='+', help='Models to evaluate (e.g., gpt-4o claude-3-opus)')
155
+ parser.add_argument('--benchmarks', nargs='+', help='Benchmarks to run (e.g., mmlu gsm8k)')
156
+ parser.add_argument('--config', default='official_config.yaml', help='Config file path')
157
+ parser.add_argument('--output-dir', default='results', help='Output directory for results')
158
+ parser.add_argument('--no-save', action='store_true', help='Do not save results to files')
159
+
160
+ args = parser.parse_args()
161
+
162
+ # Load configuration
163
+ config = load_config(args.config)
164
+
165
+ # Determine which models to evaluate
166
+ if args.models:
167
+ models_to_eval = args.models
168
+ else:
169
+ # Get all models from config
170
+ models_to_eval = []
171
+ for provider, provider_config in config['models'].items():
172
+ for model in provider_config.get('models', []):
173
+ models_to_eval.append(model)
174
+
175
+ # Determine which benchmarks to run
176
+ if args.benchmarks:
177
+ benchmarks_to_run = args.benchmarks
178
+ else:
179
+ # Get enabled benchmarks from config
180
+ benchmarks_to_run = [
181
+ name for name, bench_config in config['benchmarks'].items()
182
+ if bench_config.get('enabled', True)
183
+ ]
184
+
185
+ print(f"Models to evaluate: {models_to_eval}")
186
+ print(f"Benchmarks to run: {benchmarks_to_run}")
187
+
188
+ # Run evaluations
189
+ all_results = []
190
+
191
+ for model_name in models_to_eval:
192
+ print(f"\n{'='*60}")
193
+ print(f"Evaluating model: {model_name}")
194
+ print(f"{'='*60}")
195
+
196
+ try:
197
+ # Create API instance
198
+ api = APIFactory.create_api(model_name, config)
199
+
200
+ # Run each benchmark
201
+ for benchmark_name in benchmarks_to_run:
202
+ print(f"\nRunning {benchmark_name} benchmark...")
203
+ try:
204
+ result = await run_single_evaluation(api, benchmark_name, config)
205
+ all_results.append(result)
206
+ print(f"[OK] {benchmark_name}: {result.accuracy*100:.1f}% accuracy")
207
+ except Exception as e:
208
+ print(f"[ERROR] {benchmark_name}: Error - {e}")
209
+
210
+ except Exception as e:
211
+ print(f"Failed to create API for {model_name}: {e}")
212
+ continue
213
+
214
+ # Print results table
215
+ print_results_table(all_results)
216
+
217
+ # Save results
218
+ if not args.no_save and all_results:
219
+ json_path, csv_path = save_results(all_results, args.output_dir)
220
+ print(f"\nResults saved to:")
221
+ print(f" - {json_path}")
222
+ print(f" - {csv_path}")
223
+
224
+ if __name__ == "__main__":
225
+ asyncio.run(main())
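
For a single model/benchmark pair, the same helpers can be reused without the CLI. A sketch under the assumptions visible above (APIFactory.create_api takes a model name plus the loaded config, and official_config.yaml defines grok-4-0709 and enables gpqa):

    import asyncio

    from apis.api_factory import APIFactory
    from run_evaluation import load_config, run_single_evaluation, save_results

    async def run_one():
        config = load_config("official_config.yaml")
        api = APIFactory.create_api("grok-4-0709", config)
        result = await run_single_evaluation(api, "gpqa", config)
        print(f"gpqa accuracy: {result.accuracy * 100:.1f}%")
        save_results([result], "results")

    if __name__ == "__main__":
        asyncio.run(run_one())
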
run_hf_space.py ADDED
@@ -0,0 +1,39 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Hugging Face Space entry point for GPQA evaluation
4
+ """
5
+
6
+ import os
7
+ import sys
8
+ from dotenv import load_dotenv
9
+
10
+ # Load environment variables
11
+ load_dotenv()
12
+
13
+ # Set HF token if available
14
+ hf_token = os.getenv('HF_TOKEN')
15
+ if hf_token:
16
+ os.environ['HUGGING_FACE_HUB_TOKEN'] = hf_token
17
+ print("βœ… HF Token configured")
18
+
19
+ # Import and run the app
20
+ from app import create_ui, start_evaluation_safe, check_environment
21
+
22
+ if __name__ == "__main__":
23
+ # Check environment
24
+ issues = check_environment()
25
+
26
+ if issues:
27
+ print("\n⚠️ Configuration issues:")
28
+ for issue in issues:
29
+ print(f" - {issue}")
30
+ print("\nThe app will run in demo mode.")
31
+ print("To enable GPQA evaluation, please set the required secrets in HF Space settings.")
32
+ else:
33
+ print("βœ… All environment variables configured")
34
+ # Start evaluation in background
35
+ start_evaluation_safe()
36
+
37
+ # Create and launch UI
38
+ ui = create_ui()
39
+ ui.launch()
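
app.py is not included in this upload section; the entry point above only relies on check_environment() returning a (possibly empty) list of human-readable issue strings, plus start_evaluation_safe() and create_ui() existing. A hypothetical stand-in that satisfies the check_environment contract, for reference:

    import os

    def check_environment() -> list[str]:
        # Hypothetical: one message per missing requirement, empty list when ready.
        issues = []
        if not os.getenv("GROK_API_KEY"):
            issues.append("GROK_API_KEY is not set (required for Grok API calls)")
        if not os.getenv("HF_TOKEN"):
            issues.append("HF_TOKEN is not set (required for GPQA dataset access)")
        return issues
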
setup_hf_space.py ADDED
@@ -0,0 +1,244 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Setup script for Hugging Face Space deployment
4
+ Ensures GPQA benchmark can run successfully on HF
5
+ """
6
+
7
+ import os
8
+ import sys
9
+ import subprocess
10
+ from pathlib import Path
11
+
12
+ def create_deployment_files():
13
+ """Create necessary files for HF deployment"""
14
+
15
+ print("πŸš€ Setting up Hugging Face Space deployment...")
16
+
17
+ # 1. Update requirements.txt with HF dependencies
18
+ requirements_path = Path("requirements.txt")
19
+ existing_reqs = requirements_path.read_text() if requirements_path.exists() else ""
20
+
21
+ hf_deps = [
22
+ "huggingface_hub>=0.20.0",
23
+ "gradio>=4.31.0",
24
+ "python-dotenv>=0.19.0"
25
+ ]
26
+
27
+ for dep in hf_deps:
28
+ if dep.split(">=")[0] not in existing_reqs:
29
+ existing_reqs += f"\n{dep}"
30
+
31
+ requirements_path.write_text(existing_reqs.strip() + "\n")
32
+ print("βœ… Updated requirements.txt")
33
+
34
+ # 2. Create .env.example
35
+ env_example = """# Hugging Face Space Configuration
36
+ # Copy this to .env or set in HF Secrets
37
+
38
+ # Required: Your Grok API key from x.ai
39
+ GROK_API_KEY=your_grok_api_key_here
40
+
41
+ # Required: Your Hugging Face token for GPQA dataset access
42
+ # Get it from: https://huggingface.co/settings/tokens
43
+ HF_TOKEN=your_hugging_face_token_here
44
+
45
+ # Optional: OpenAI and Anthropic keys for comparison
46
+ # OPENAI_API_KEY=your_openai_key_here
47
+ # ANTHROPIC_API_KEY=your_anthropic_key_here
48
+ """
49
+
50
+ with open(".env.example", "w") as f:
51
+ f.write(env_example)
52
+ print("βœ… Created .env.example")
53
+
54
+ # 3. Create HF-specific run script
55
+ run_script = """#!/usr/bin/env python3
56
+ \"\"\"
57
+ Hugging Face Space entry point for GPQA evaluation
58
+ \"\"\"
59
+
60
+ import os
61
+ import sys
62
+ from dotenv import load_dotenv
63
+
64
+ # Load environment variables
65
+ load_dotenv()
66
+
67
+ # Set HF token if available
68
+ hf_token = os.getenv('HF_TOKEN')
69
+ if hf_token:
70
+ os.environ['HUGGING_FACE_HUB_TOKEN'] = hf_token
71
+ print("βœ… HF Token configured")
72
+
73
+ # Import and run the app
74
+ from app import create_ui, start_evaluation_safe, check_environment
75
+
76
+ if __name__ == "__main__":
77
+ # Check environment
78
+ issues = check_environment()
79
+
80
+ if issues:
81
+ print("\\n⚠️ Configuration issues:")
82
+ for issue in issues:
83
+ print(f" - {issue}")
84
+ print("\\nThe app will run in demo mode.")
85
+ print("To enable GPQA evaluation, please set the required secrets in HF Space settings.")
86
+ else:
87
+ print("βœ… All environment variables configured")
88
+ # Start evaluation in background
89
+ start_evaluation_safe()
90
+
91
+ # Create and launch UI
92
+ ui = create_ui()
93
+ ui.launch()
94
+ """
95
+
96
+ with open("run_hf_space.py", "w") as f:
97
+ f.write(run_script)
98
+ os.chmod("run_hf_space.py", 0o755)
99
+ print("βœ… Created run_hf_space.py")
100
+
101
+ # 4. Create README for HF Space
102
+ readme_content = """---
103
+ title: Grok-4 GPQA Evaluation
104
+ emoji: 🧠
105
+ colorFrom: blue
106
+ colorTo: green
107
+ sdk: gradio
108
+ sdk_version: "4.31.0"
109
+ app_file: run_hf_space.py
110
+ pinned: false
111
+ ---
112
+
113
+ # Grok-4 GPQA Evaluation Dashboard
114
+
115
+ Real-time evaluation of Grok-4 model on the GPQA (Graduate-Level Google-Proof Q&A) benchmark.
116
+
117
+ ## πŸ”§ Configuration
118
+
119
+ This Space requires the following secrets to be set in your HF Space settings:
120
+
121
+ 1. **GROK_API_KEY** (Required)
122
+ - Get from: https://x.ai
123
+ - Your Grok API key for running evaluations
124
+
125
+ 2. **HF_TOKEN** (Required)
126
+ - Get from: https://huggingface.co/settings/tokens
127
+ - Required for accessing the GPQA dataset
128
+ - Make sure you have requested access to: https://huggingface.co/datasets/Idavidrein/gpqa
129
+
130
+ ## πŸ“Š Features
131
+
132
+ - Real-time progress tracking
133
+ - Accuracy metrics and performance stats
134
+ - Detailed results export
135
+ - Support for full GPQA dataset (448 questions)
136
+
137
+ ## πŸš€ Quick Start
138
+
139
+ 1. Fork this Space
140
+ 2. Set the required secrets in your Space settings
141
+ 3. The evaluation will start automatically
142
+ 4. Monitor progress in the dashboard
143
+
144
+ ## ⚠️ Known Issues
145
+
146
+ - GPQA dataset requires access approval (usually 1-2 days)
147
+ - Grok-4-0709 uses extensive reasoning tokens (~2500-3000 per question)
148
+ - Full evaluation takes ~3-4 hours due to model response times
149
+
150
+ ## πŸ“ˆ Expected Performance
151
+
152
+ Based on our testing:
153
+ - Accuracy: ~80-90% (excluding timeouts)
154
+ - Avg Response Time: ~50s per question
155
+ - Total Runtime: ~3-4 hours for full dataset
156
+ """
157
+
158
+ with open("README_HF.md", "w") as f:
159
+ f.write(readme_content)
160
+ print("βœ… Created README_HF.md")
161
+
162
+ # 5. Create pre-flight check script
163
+ check_script = """#!/usr/bin/env python3
164
+ \"\"\"
165
+ Pre-deployment checklist for HF Space
166
+ \"\"\"
167
+
168
+ import os
169
+ import sys
170
+ from pathlib import Path
171
+
172
+ def check_deployment_ready():
173
+ \"\"\"Check if everything is ready for HF deployment\"\"\"
174
+
175
+ print("πŸ” Pre-deployment checklist:\\n")
176
+
177
+ checks = []
178
+
179
+ # Check files exist
180
+ required_files = [
181
+ "app.py",
182
+ "run_evaluation.py",
183
+ "requirements.txt",
184
+ ".env.example",
185
+ "run_hf_space.py",
186
+ "official_config.yaml"
187
+ ]
188
+
189
+ for file in required_files:
190
+ if Path(file).exists():
191
+ checks.append((f"βœ… {file} exists", True))
192
+ else:
193
+ checks.append((f"❌ {file} missing", False))
194
+
195
+ # Check API directories
196
+ if Path("apis").is_dir() and list(Path("apis").glob("*.py")):
197
+ checks.append(("βœ… APIs directory configured", True))
198
+ else:
199
+ checks.append(("❌ APIs directory missing or empty", False))
200
+
201
+ # Check benchmarks directory
202
+ if Path("benchmarks").is_dir() and Path("benchmarks/gpqa_benchmark.py").exists():
203
+ checks.append(("βœ… GPQA benchmark implementation found", True))
204
+ else:
205
+ checks.append(("❌ GPQA benchmark missing", False))
206
+
207
+ # Check for sensitive data
208
+ if Path(".env").exists():
209
+ checks.append(("⚠️ .env file exists - make sure it's in .gitignore!", None))
210
+
211
+ # Print results
212
+ for check, status in checks:
213
+ print(check)
214
+
215
+ all_good = all(status is not False for _, status in checks)
216
+
217
+ if all_good:
218
+ print("\\nβœ… Ready for deployment!")
219
+ print("\\nNext steps:")
220
+ print("1. Set GROK_API_KEY and HF_TOKEN in HF Space secrets")
221
+ print("2. Make sure you have GPQA dataset access")
222
+ print("3. Push to Hugging Face")
223
+ else:
224
+ print("\\n❌ Issues found - please fix before deploying")
225
+
226
+ return all_good
227
+
228
+ if __name__ == "__main__":
229
+ check_deployment_ready()
230
+ """
231
+
232
+ with open("check_deployment.py", "w") as f:
233
+ f.write(check_script)
234
+ os.chmod("check_deployment.py", 0o755)
235
+ print("βœ… Created check_deployment.py")
236
+
237
+ print("\nπŸŽ‰ Deployment files created successfully!")
238
+ print("\nNext steps:")
239
+ print("1. Run: python check_deployment.py")
240
+ print("2. Set your API keys in HF Space secrets")
241
+ print("3. Push to Hugging Face")
242
+
243
+ if __name__ == "__main__":
244
+ create_deployment_files()