from .base_benchmark import BaseBenchmark
from typing import Dict, Any, Optional, Tuple
from datasets import load_dataset
import subprocess
import tempfile
import os
import sys

class HumanEvalBenchmark(BaseBenchmark):
    """HumanEval code generation benchmark"""
    
    def __init__(self):
        super().__init__(name="HumanEval", dataset_name="openai_humaneval")
        
    async def load_dataset(self, sample_size: Optional[int] = None, **kwargs):
        """Load HumanEval dataset"""
        dataset = load_dataset(self.dataset_name, split='test')
        
        self.dataset = []
        for sample in dataset:
            self.dataset.append({
                'task_id': sample['task_id'],
                'prompt': sample['prompt'],
                'canonical_solution': sample['canonical_solution'],
                'test': sample['test'],
                'entry_point': sample['entry_point'],
                'raw_sample': sample
            })
        
        if sample_size and len(self.dataset) > sample_size:
            self.dataset = self.dataset[:sample_size]
    
    def format_prompt(self, sample: Dict[str, Any]) -> str:
        """Format HumanEval problem as prompt"""
        # lm-eval uses just the raw prompt without additional instructions
        return sample['prompt']
    
    def extract_code(self, response: str, entry_point: str, prompt: str) -> str:
        """Extract code from model response"""
        # Clean the response - handle markdown code blocks
        code = response.strip()
        
        # Remove markdown code block markers
        if code.startswith('```python'):
            code = code[9:]  # Remove ```python
        elif code.startswith('```'):
            code = code[3:]   # Remove ```
        
        if code.endswith('```'):
            code = code[:-3]  # Remove trailing ```
        
        code = code.strip()
        
        # If the response contains the complete function, use it directly
        if f"def {entry_point}" in code:
            return code
        else:
            # Fallback: assume the response is a completion to be appended to the prompt
            stop_sequences = ['\nclass', '\ndef', '\n#', '\nif __name__']
            
            for stop in stop_sequences:
                pos = code.find(stop)
                if pos > 0:
                    code = code[:pos]
                    break
            
            return prompt + code
    
    def run_test(self, code: str, test_code: str) -> Tuple[bool, str]:
        """Run the test code and return success status and output"""
        # Write the complete test file, then close it so the subprocess can
        # open it even on platforms that lock files held open by the parent.
        with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as f:
            f.write(code + '\n\n' + test_code)
            test_path = f.name
        
        try:
            # Run the generated test file in a fresh interpreter
            result = subprocess.run(
                [sys.executable, test_path],
                capture_output=True,
                text=True,
                timeout=10
            )
            
            if result.returncode == 0:
                return True, result.stdout
            else:
                return False, result.stderr
                
        except subprocess.TimeoutExpired:
            return False, "Timeout: Code execution took too long"
        except Exception as e:
            return False, f"Error running test: {str(e)}"
        finally:
            # Clean up the temporary file
            try:
                os.unlink(test_path)
            except OSError:
                pass
    
    async def evaluate_sample(self, api, sample: Dict[str, Any], **kwargs) -> Tuple[bool, Dict[str, Any]]:
        """Evaluate a single HumanEval sample"""
        prompt = self.format_prompt(sample)
        
        try:
            response = await api.generate_with_retry(prompt, **kwargs)
            
            # Extract code from response
            code = self.extract_code(response, sample['entry_point'], sample['prompt'])
            
            # Run the test
            is_correct, test_output = self.run_test(code, sample['test'])
            
            result = {
                'task_id': sample['task_id'],
                'prompt': sample['prompt'],
                'model_response': response,
                'extracted_code': code,
                'is_correct': is_correct,
                'test_output': test_output,
                'entry_point': sample['entry_point']
            }
            
            return is_correct, result
            
        except Exception as e:
            result = {
                'task_id': sample['task_id'],
                'prompt': sample['prompt'],
                'error': str(e),
                'is_correct': False
            }
            return False, result
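

# Usage sketch (illustrative, not part of the benchmark API): this shows one way
# the class above could be driven end to end. The `api` object is assumed to
# expose the async `generate_with_retry(prompt, **kwargs)` method used in
# evaluate_sample(); its concrete type is defined elsewhere in the project.
#
#   import asyncio
#
#   async def run_humaneval(api, sample_size=10):
#       benchmark = HumanEvalBenchmark()
#       await benchmark.load_dataset(sample_size=sample_size)
#       passed = 0
#       for sample in benchmark.dataset:
#           ok, result = await benchmark.evaluate_sample(api, sample)
#           passed += int(ok)
#           print(result['task_id'], 'pass' if ok else 'fail')
#       print(f"accuracy: {passed / len(benchmark.dataset):.2%}")
#
#   # asyncio.run(run_humaneval(api))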