Trouter-Library committed · verified
Commit 9cc73a0 · 1 Parent(s): b39a3fb

Create evaluate_model.py

Files changed (1)
  1. evaluate_model.py +571 -0
evaluate_model.py ADDED
@@ -0,0 +1,571 @@
"""
Model Evaluation Script for Troviku-1.1

Comprehensive evaluation suite for testing the model's performance
on various coding benchmarks and tasks.
"""

import json
import time
from typing import List, Dict, Any, Optional, Tuple
from dataclasses import dataclass, asdict
from collections import defaultdict
import statistics


@dataclass
class EvaluationResult:
    """Result from a single evaluation."""
    task_id: str
    task_type: str
    language: str
    passed: bool
    score: float
    execution_time: float
    error_message: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        return asdict(self)


@dataclass
class BenchmarkResults:
    """Aggregated benchmark results."""
    benchmark_name: str
    total_tasks: int
    passed_tasks: int
    failed_tasks: int
    average_score: float
    pass_rate: float
    average_execution_time: float
    results_by_language: Dict[str, Dict[str, float]]

    def to_dict(self) -> Dict[str, Any]:
        return asdict(self)


class CodeEvaluator:
    """
    Evaluator for Troviku-1.1 model performance.

    Runs various benchmarks and coding tasks to assess model capabilities.
    """

    def __init__(self, api_key: str, model: str = "OpenTrouter/Troviku-1.1"):
        """
        Initialize the evaluator.

        Args:
            api_key: OpenTrouter API key
            model: Model identifier to evaluate
        """
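        # Deferred import: the client library is only needed once an
        # evaluator is actually constructed.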
        from troviku_client import TrovikuClient

        self.client = TrovikuClient(api_key=api_key, model=model)
        self.results: List[EvaluationResult] = []

    def evaluate_humaneval(self, problems: List[Dict[str, Any]]) -> BenchmarkResults:
        """
        Evaluate on HumanEval benchmark.

        Args:
            problems: List of HumanEval problems

        Returns:
            BenchmarkResults with aggregated scores
        """
        print("Evaluating HumanEval benchmark...")
        start_index = len(self.results)

        for problem in problems:
            task_id = problem['task_id']
            prompt = problem['prompt']
            # HumanEval supplies a single test string; wrap it in a list so
            # _execute_tests runs it as one block
            test_cases = [problem['test']]

            try:
                start_time = time.time()
                response = self.client.generate(prompt, language="python")
                execution_time = time.time() - start_time

                # Execute test cases
                passed, error = self._execute_tests(response.code, test_cases)

                result = EvaluationResult(
                    task_id=task_id,
                    task_type="code_generation",
                    language="python",
                    passed=passed,
                    score=1.0 if passed else 0.0,
                    execution_time=execution_time,
                    error_message=error
                )

                self.results.append(result)
                print(f"  {task_id}: {'PASS' if passed else 'FAIL'}")

            except Exception as e:
                print(f"  {task_id}: ERROR - {str(e)}")
                result = EvaluationResult(
                    task_id=task_id,
                    task_type="code_generation",
                    language="python",
                    passed=False,
                    score=0.0,
                    execution_time=0.0,
                    error_message=str(e)
                )
                self.results.append(result)

        return self._aggregate_results("HumanEval", self.results[start_index:])

    def evaluate_mbpp(self, problems: List[Dict[str, Any]]) -> BenchmarkResults:
        """
        Evaluate on MBPP (Mostly Basic Python Problems) benchmark.

        Args:
            problems: List of MBPP problems

        Returns:
            BenchmarkResults with aggregated scores
        """
        print("Evaluating MBPP benchmark...")
        start_index = len(self.results)

        for problem in problems:
            task_id = str(problem['task_id'])
            prompt = problem['text']
            test_cases = problem['test_list']

            try:
                start_time = time.time()
                response = self.client.generate(prompt, language="python")
                execution_time = time.time() - start_time

                passed, error = self._execute_tests(response.code, test_cases)

                result = EvaluationResult(
                    task_id=task_id,
                    task_type="code_generation",
                    language="python",
                    passed=passed,
                    score=1.0 if passed else 0.0,
                    execution_time=execution_time,
                    error_message=error
                )

                self.results.append(result)
                print(f"  Task {task_id}: {'PASS' if passed else 'FAIL'}")

            except Exception as e:
                print(f"  Task {task_id}: ERROR - {str(e)}")

        return self._aggregate_results("MBPP", self.results[start_index:])

    def evaluate_code_translation(
        self,
        test_cases: List[Dict[str, Any]]
    ) -> BenchmarkResults:
        """
        Evaluate code translation between languages.

        Args:
            test_cases: List of translation test cases

        Returns:
            BenchmarkResults with translation accuracy
        """
        print("Evaluating code translation...")
        start_index = len(self.results)

        for test_case in test_cases:
            task_id = test_case['id']
            source_code = test_case['source_code']
            source_lang = test_case['source_language']
            target_lang = test_case['target_language']
            expected_behavior = test_case.get('expected_behavior')

            try:
                start_time = time.time()
                response = self.client.translate(
                    code=source_code,
                    source_language=source_lang,
                    target_language=target_lang
                )
                execution_time = time.time() - start_time

                # Validate translation (simplified - would need actual execution)
                score = self._validate_translation(
                    response.code,
                    target_lang,
                    expected_behavior
                )

                result = EvaluationResult(
                    task_id=task_id,
                    task_type="code_translation",
                    language=f"{source_lang}_to_{target_lang}",
                    passed=score >= 0.8,
                    score=score,
                    execution_time=execution_time
                )

                self.results.append(result)
                print(f"  {task_id}: Score {score:.2f}")

            except Exception as e:
                print(f"  {task_id}: ERROR - {str(e)}")

        return self._aggregate_results("Code Translation", self.results[start_index:])

    def evaluate_code_explanation(
        self,
        test_cases: List[Dict[str, Any]]
    ) -> BenchmarkResults:
        """
        Evaluate code explanation quality.

        Args:
            test_cases: List of explanation test cases

        Returns:
            BenchmarkResults with explanation scores
        """
        print("Evaluating code explanation...")
        start_index = len(self.results)

        for test_case in test_cases:
            task_id = test_case['id']
            code = test_case['code']
            language = test_case['language']
            key_concepts = test_case.get('key_concepts', [])

            try:
                start_time = time.time()
                explanation = self.client.explain(code, language)
                execution_time = time.time() - start_time

                # Score explanation based on coverage of key concepts
                score = self._score_explanation(explanation, key_concepts)

                result = EvaluationResult(
                    task_id=task_id,
                    task_type="code_explanation",
                    language=language,
                    passed=score >= 0.7,
                    score=score,
                    execution_time=execution_time
                )

                self.results.append(result)
                print(f"  {task_id}: Score {score:.2f}")

            except Exception as e:
                print(f"  {task_id}: ERROR - {str(e)}")

        return self._aggregate_results("Code Explanation", self.results[start_index:])

    def evaluate_bug_detection(
        self,
        test_cases: List[Dict[str, Any]]
    ) -> BenchmarkResults:
        """
        Evaluate bug detection and fixing capabilities.

        Args:
            test_cases: List of buggy code samples

        Returns:
            BenchmarkResults with bug fix success rate
        """
        print("Evaluating bug detection and fixing...")
        start_index = len(self.results)

        for test_case in test_cases:
            task_id = test_case['id']
            buggy_code = test_case['buggy_code']
            error_message = test_case['error_message']
            language = test_case['language']
            tests = test_case.get('tests', [])

            try:
                start_time = time.time()
                response = self.client.debug(buggy_code, error_message, language)
                execution_time = time.time() - start_time

                # Test if fixed code passes tests
                passed, error = self._execute_tests(response.code, tests)

                result = EvaluationResult(
                    task_id=task_id,
                    task_type="bug_fixing",
                    language=language,
                    passed=passed,
                    score=1.0 if passed else 0.0,
                    execution_time=execution_time,
                    error_message=error
                )

                self.results.append(result)
                print(f"  {task_id}: {'FIXED' if passed else 'FAILED'}")

            except Exception as e:
                print(f"  {task_id}: ERROR - {str(e)}")

        return self._aggregate_results("Bug Detection", self.results[start_index:])

    def _execute_tests(
        self,
        code: str,
        test_cases: List[str]
    ) -> Tuple[bool, Optional[str]]:
        """
        Execute test cases against generated code.

        Args:
            code: Generated code to test
            test_cases: List of test case strings

        Returns:
            Tuple of (passed, error_message)
        """
        try:
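            # Note: exec() runs the generated code directly in this process,
            # with no sandboxing or timeout; only use it on trusted benchmark
            # data or inside an isolated environment.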
            # Create execution environment
            namespace = {}
            exec(code, namespace)

            # Run test cases
            for test in test_cases:
                exec(test, namespace)

            return True, None

        except Exception as e:
            return False, str(e)

    def _validate_translation(
        self,
        translated_code: str,
        target_language: str,
        expected_behavior: Optional[Dict[str, Any]]
    ) -> float:
        """
        Validate translated code quality.

        Args:
            translated_code: Translated code
            target_language: Target language
            expected_behavior: Expected behavior specification

        Returns:
            Quality score (0.0 to 1.0)
        """
        # Simplified validation - in practice this would need language-specific execution
        score = 0.0

        # Non-empty output earns a base score
        if len(translated_code.strip()) > 0:
            score += 0.3

        # Rough proxy: the target language's name appears in the output
        if target_language.lower() in translated_code.lower():
            score += 0.2

        # If an expected behavior specification is provided, score higher
        if expected_behavior:
            score += 0.5

        return min(score, 1.0)

    def _score_explanation(
        self,
        explanation: str,
        key_concepts: List[str]
    ) -> float:
        """
        Score explanation quality based on concept coverage.

        Args:
            explanation: Generated explanation
            key_concepts: List of key concepts that should be covered

        Returns:
            Quality score (0.0 to 1.0)
        """
        if not key_concepts:
            # Base score for a reasonable-length explanation
            return 0.8 if len(explanation) > 100 else 0.5

        explanation_lower = explanation.lower()
        covered = sum(1 for concept in key_concepts
                      if concept.lower() in explanation_lower)

        coverage_score = covered / len(key_concepts)
        length_score = min(len(explanation) / 500, 1.0)

        return (coverage_score * 0.7 + length_score * 0.3)

    def _aggregate_results(
        self,
        benchmark_name: str,
        benchmark_results: List[EvaluationResult]
    ) -> BenchmarkResults:
        """
        Aggregate evaluation results for a benchmark.

        Args:
            benchmark_name: Name of the benchmark
            benchmark_results: Results produced by this benchmark run

        Returns:
            BenchmarkResults with aggregated statistics
        """
        if not benchmark_results:
            return BenchmarkResults(
                benchmark_name=benchmark_name,
                total_tasks=0,
                passed_tasks=0,
                failed_tasks=0,
                average_score=0.0,
                pass_rate=0.0,
                average_execution_time=0.0,
                results_by_language={}
            )

        total = len(benchmark_results)
        passed = sum(1 for r in benchmark_results if r.passed)
        failed = total - passed
        avg_score = statistics.mean(r.score for r in benchmark_results)
        pass_rate = passed / total if total > 0 else 0.0
        avg_time = statistics.mean(r.execution_time for r in benchmark_results)

        # Aggregate by language
        by_language = defaultdict(lambda: {"passed": 0, "total": 0, "score": []})
        for result in benchmark_results:
            lang = result.language
            by_language[lang]["total"] += 1
            if result.passed:
                by_language[lang]["passed"] += 1
            by_language[lang]["score"].append(result.score)

        results_by_language = {
            lang: {
                "pass_rate": stats["passed"] / stats["total"],
                "average_score": statistics.mean(stats["score"])
            }
            for lang, stats in by_language.items()
        }

        return BenchmarkResults(
            benchmark_name=benchmark_name,
            total_tasks=total,
            passed_tasks=passed,
            failed_tasks=failed,
            average_score=avg_score,
            pass_rate=pass_rate,
            average_execution_time=avg_time,
            results_by_language=results_by_language
        )

    def save_results(self, filepath: str):
        """
        Save evaluation results to JSON file.

        Args:
            filepath: Path to save results
        """
        results_data = {
            "individual_results": [r.to_dict() for r in self.results],
            "summary": self.get_summary()
        }

        with open(filepath, 'w') as f:
            json.dump(results_data, f, indent=2)

        print(f"\nResults saved to {filepath}")

    def get_summary(self) -> Dict[str, Any]:
        """
        Get summary of all evaluation results.

        Returns:
            Dictionary with summary statistics
        """
        if not self.results:
            return {"message": "No results available"}

        total = len(self.results)
        passed = sum(1 for r in self.results if r.passed)

        return {
            "total_tasks": total,
            "passed_tasks": passed,
            "failed_tasks": total - passed,
            "overall_pass_rate": passed / total,
            "average_score": statistics.mean(r.score for r in self.results),
            "average_execution_time": statistics.mean(r.execution_time for r in self.results),
            "by_task_type": self._group_by_field("task_type"),
            "by_language": self._group_by_field("language")
        }

    def _group_by_field(self, field: str) -> Dict[str, Dict[str, float]]:
        """Group results by a specific field."""
        grouped = defaultdict(lambda: {"passed": 0, "total": 0, "scores": []})

        for result in self.results:
            value = getattr(result, field)
            grouped[value]["total"] += 1
            if result.passed:
                grouped[value]["passed"] += 1
            grouped[value]["scores"].append(result.score)

        return {
            key: {
                "pass_rate": stats["passed"] / stats["total"],
                "average_score": statistics.mean(stats["scores"])
            }
            for key, stats in grouped.items()
        }

    def print_summary(self):
        """Print evaluation summary to console."""
        summary = self.get_summary()

        print("\n" + "=" * 60)
        print("EVALUATION SUMMARY")
        print("=" * 60)
        print(f"Total Tasks: {summary['total_tasks']}")
        print(f"Passed: {summary['passed_tasks']}")
        print(f"Failed: {summary['failed_tasks']}")
        print(f"Overall Pass Rate: {summary['overall_pass_rate']:.2%}")
        print(f"Average Score: {summary['average_score']:.2f}")
        print(f"Average Execution Time: {summary['average_execution_time']:.2f}s")

        print("\nBy Task Type:")
        for task_type, stats in summary['by_task_type'].items():
            print(f"  {task_type}:")
            print(f"    Pass Rate: {stats['pass_rate']:.2%}")
            print(f"    Avg Score: {stats['average_score']:.2f}")

        print("\nBy Language:")
        for language, stats in summary['by_language'].items():
            print(f"  {language}:")
            print(f"    Pass Rate: {stats['pass_rate']:.2%}")
            print(f"    Avg Score: {stats['average_score']:.2f}")

        print("=" * 60)


# Example usage
if __name__ == "__main__":
    # Initialize evaluator
    evaluator = CodeEvaluator(api_key="your_api_key_here")

    # Example HumanEval problems (simplified)
    humaneval_problems = [
        {
            "task_id": "HumanEval/0",
            "prompt": "Write a function has_zero_sum(numbers) that takes a list of numbers and returns True if the list contains a pair of numbers that sum to zero.",
            "test": "assert has_zero_sum([1, -1, 2]) == True\nassert has_zero_sum([1, 2, 3]) == False"
        }
    ]

    # Run evaluation
    results = evaluator.evaluate_humaneval(humaneval_problems)

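    # A second, MBPP-style example, using the field names that evaluate_mbpp()
    # reads ('task_id', 'text', 'test_list'); the task itself is illustrative.
    mbpp_problems = [
        {
            "task_id": 1,
            "text": "Write a function add_numbers(a, b) that returns the sum of a and b.",
            "test_list": [
                "assert add_numbers(1, 2) == 3",
                "assert add_numbers(-1, 1) == 0"
            ]
        }
    ]
    evaluator.evaluate_mbpp(mbpp_problems)
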
    # Print and save results
    evaluator.print_summary()
    evaluator.save_results("evaluation_results.json")