from .base_benchmark import BaseBenchmark
from typing import Dict, Any, Optional, Tuple
from datasets import load_dataset
import subprocess
import tempfile
import os
import sys


class HumanEvalBenchmark(BaseBenchmark):
    """HumanEval code generation benchmark"""

    def __init__(self):
        super().__init__(name="HumanEval", dataset_name="openai_humaneval")

    async def load_dataset(self, sample_size: Optional[int] = None, **kwargs):
        """Load the HumanEval dataset"""
        dataset = load_dataset(self.dataset_name, split='test')
        self.dataset = []
        for sample in dataset:
            self.dataset.append({
                'task_id': sample['task_id'],
                'prompt': sample['prompt'],
                'canonical_solution': sample['canonical_solution'],
                'test': sample['test'],
                'entry_point': sample['entry_point'],
                'raw_sample': sample
            })
        if sample_size and len(self.dataset) > sample_size:
            self.dataset = self.dataset[:sample_size]

    def format_prompt(self, sample: Dict[str, Any]) -> str:
        """Format a HumanEval problem as a prompt"""
        # lm-eval uses just the raw prompt without additional instructions
        return sample['prompt']

    def extract_code(self, response: str, entry_point: str, prompt: str) -> str:
        """Extract executable code from a model response"""
        code = response.strip()

        # Remove markdown code block markers if present
        if code.startswith('```python'):
            code = code[len('```python'):]
        elif code.startswith('```'):
            code = code[len('```'):]
        if code.endswith('```'):
            code = code[:-3]
        code = code.strip()

        # If the response contains the complete function, use it directly
        if f"def {entry_point}" in code:
            return code

        # Otherwise treat it as a completion to append after the prompt,
        # truncated at the first top-level statement that follows it
        stop_sequences = ['\nclass', '\ndef', '\n#', '\nif __name__']
        for stop in stop_sequences:
            pos = code.find(stop)
            if pos > 0:
                code = code[:pos]
                break
        return prompt + code

    def run_test(self, code: str, test_code: str) -> Tuple[bool, str]:
        """Run the test code and return (passed, output)"""
        with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as f:
            # Write the candidate code followed by the test harness
            f.write(code + '\n\n' + test_code)
            f.flush()
            try:
                result = subprocess.run(
                    [sys.executable, f.name],
                    capture_output=True,
                    text=True,
                    timeout=10
                )
                if result.returncode == 0:
                    return True, result.stdout
                return False, result.stderr
            except subprocess.TimeoutExpired:
                return False, "Timeout: Code execution took too long"
            except Exception as e:
                return False, f"Error running test: {str(e)}"
            finally:
                # Clean up the temporary file
                try:
                    os.unlink(f.name)
                except OSError:
                    pass

    async def evaluate_sample(self, api, sample: Dict[str, Any], **kwargs) -> Tuple[bool, Dict[str, Any]]:
        """Evaluate a single HumanEval sample"""
        prompt = self.format_prompt(sample)
        try:
            response = await api.generate_with_retry(prompt, **kwargs)

            # Extract code from the response
            code = self.extract_code(response, sample['entry_point'], sample['prompt'])

            # The HumanEval 'test' field only defines check(); it must be
            # called explicitly against the entry point for the asserts to run
            test_code = sample['test'] + f"\n\ncheck({sample['entry_point']})\n"
            is_correct, test_output = self.run_test(code, test_code)

            result = {
                'task_id': sample['task_id'],
                'prompt': sample['prompt'],
                'model_response': response,
                'extracted_code': code,
                'is_correct': is_correct,
                'test_output': test_output,
                'entry_point': sample['entry_point']
            }
            return is_correct, result
        except Exception as e:
            result = {
                'task_id': sample['task_id'],
                'prompt': sample['prompt'],
                'error': str(e),
                'is_correct': False
            }
            return False, result
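

# --- Usage sketch (illustrative, not part of the original module) ---
# Minimal example of driving the benchmark end to end. The real `api` object
# is assumed to expose an async `generate_with_retry(prompt, **kwargs)` method,
# as used in evaluate_sample above; the stub below simply echoes the canonical
# solution so the harness can be exercised without a live model.
if __name__ == "__main__":
    import asyncio

    class _EchoAPI:
        """Hypothetical stand-in for a real model API client."""

        def __init__(self, benchmark):
            # Map each prompt to its reference implementation
            self._solutions = {
                s['prompt']: s['prompt'] + s['canonical_solution']
                for s in benchmark.dataset
            }

        async def generate_with_retry(self, prompt, **kwargs):
            return self._solutions[prompt]

    async def _demo():
        bench = HumanEvalBenchmark()
        await bench.load_dataset(sample_size=3)
        passed, result = await bench.evaluate_sample(_EchoAPI(bench), bench.dataset[0])
        print(result['task_id'], 'passed' if passed else 'failed')

    asyncio.run(_demo())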