Create evaluate_model.py
evaluate_model.py +571 -0
evaluate_model.py
ADDED
@@ -0,0 +1,571 @@
"""
Model Evaluation Script for Troviku-1.1

Comprehensive evaluation suite for testing the model's performance
on various coding benchmarks and tasks.
"""

import json
import time
from typing import List, Dict, Any, Optional, Tuple
from dataclasses import dataclass, asdict
from collections import defaultdict
import statistics


@dataclass
class EvaluationResult:
    """Result from a single evaluation."""
    task_id: str
    task_type: str
    language: str
    passed: bool
    score: float
    execution_time: float
    error_message: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        return asdict(self)


@dataclass
class BenchmarkResults:
    """Aggregated benchmark results."""
    benchmark_name: str
    total_tasks: int
    passed_tasks: int
    failed_tasks: int
    average_score: float
    pass_rate: float
    average_execution_time: float
    results_by_language: Dict[str, Dict[str, float]]

    def to_dict(self) -> Dict[str, Any]:
        return asdict(self)


class CodeEvaluator:
    """
    Evaluator for Troviku-1.1 model performance.

    Runs various benchmarks and coding tasks to assess model capabilities.
    """

    def __init__(self, api_key: str, model: str = "OpenTrouter/Troviku-1.1"):
        """
        Initialize the evaluator.

        Args:
            api_key: OpenTrouter API key
            model: Model identifier to evaluate
        """
        from troviku_client import TrovikuClient

        self.client = TrovikuClient(api_key=api_key, model=model)
        self.results: List[EvaluationResult] = []

    def evaluate_humaneval(self, problems: List[Dict[str, Any]]) -> BenchmarkResults:
        """
        Evaluate on HumanEval benchmark.

        Args:
            problems: List of HumanEval problems

        Returns:
            BenchmarkResults with aggregated scores
        """
        print("Evaluating HumanEval benchmark...")
        # Remember where this benchmark's results start so aggregation
        # only covers the tasks evaluated in this run.
        start_index = len(self.results)

        for problem in problems:
            task_id = problem['task_id']
            prompt = problem['prompt']
            test_cases = problem['test']

            try:
                start_time = time.time()
                response = self.client.generate(prompt, language="python")
                execution_time = time.time() - start_time

                # Execute test cases
                passed, error = self._execute_tests(response.code, test_cases)

                result = EvaluationResult(
                    task_id=task_id,
                    task_type="code_generation",
                    language="python",
                    passed=passed,
                    score=1.0 if passed else 0.0,
                    execution_time=execution_time,
                    error_message=error
                )

                self.results.append(result)
                print(f"  {task_id}: {'PASS' if passed else 'FAIL'}")

            except Exception as e:
                print(f"  {task_id}: ERROR - {str(e)}")
                result = EvaluationResult(
                    task_id=task_id,
                    task_type="code_generation",
                    language="python",
                    passed=False,
                    score=0.0,
                    execution_time=0.0,
                    error_message=str(e)
                )
                self.results.append(result)

        return self._aggregate_results("HumanEval", self.results[start_index:])

    def evaluate_mbpp(self, problems: List[Dict[str, Any]]) -> BenchmarkResults:
        """
        Evaluate on MBPP (Mostly Basic Python Problems) benchmark.

        Args:
            problems: List of MBPP problems

        Returns:
            BenchmarkResults with aggregated scores
        """
        print("Evaluating MBPP benchmark...")
        start_index = len(self.results)

        for problem in problems:
            task_id = str(problem['task_id'])
            prompt = problem['text']
            test_cases = problem['test_list']

            try:
                start_time = time.time()
                response = self.client.generate(prompt, language="python")
                execution_time = time.time() - start_time

                passed, error = self._execute_tests(response.code, test_cases)

                result = EvaluationResult(
                    task_id=task_id,
                    task_type="code_generation",
                    language="python",
                    passed=passed,
                    score=1.0 if passed else 0.0,
                    execution_time=execution_time,
                    error_message=error
                )

                self.results.append(result)
                print(f"  Task {task_id}: {'PASS' if passed else 'FAIL'}")

            except Exception as e:
                print(f"  Task {task_id}: ERROR - {str(e)}")

        return self._aggregate_results("MBPP", self.results[start_index:])

    def evaluate_code_translation(
        self,
        test_cases: List[Dict[str, Any]]
    ) -> BenchmarkResults:
        """
        Evaluate code translation between languages.

        Args:
            test_cases: List of translation test cases

        Returns:
            BenchmarkResults with translation accuracy
        """
        print("Evaluating code translation...")
        start_index = len(self.results)

        for test_case in test_cases:
            task_id = test_case['id']
            source_code = test_case['source_code']
            source_lang = test_case['source_language']
            target_lang = test_case['target_language']
            expected_behavior = test_case.get('expected_behavior')

            try:
                start_time = time.time()
                response = self.client.translate(
                    code=source_code,
                    source_language=source_lang,
                    target_language=target_lang
                )
                execution_time = time.time() - start_time

                # Validate translation (simplified - would need actual execution)
                score = self._validate_translation(
                    response.code,
                    target_lang,
                    expected_behavior
                )

                result = EvaluationResult(
                    task_id=task_id,
                    task_type="code_translation",
                    language=f"{source_lang}_to_{target_lang}",
                    passed=score >= 0.8,
                    score=score,
                    execution_time=execution_time
                )

                self.results.append(result)
                print(f"  {task_id}: Score {score:.2f}")

            except Exception as e:
                print(f"  {task_id}: ERROR - {str(e)}")

        return self._aggregate_results("Code Translation", self.results[start_index:])

    def evaluate_code_explanation(
        self,
        test_cases: List[Dict[str, Any]]
    ) -> BenchmarkResults:
        """
        Evaluate code explanation quality.

        Args:
            test_cases: List of explanation test cases

        Returns:
            BenchmarkResults with explanation scores
        """
        print("Evaluating code explanation...")
        start_index = len(self.results)

        for test_case in test_cases:
            task_id = test_case['id']
            code = test_case['code']
            language = test_case['language']
            key_concepts = test_case.get('key_concepts', [])

            try:
                start_time = time.time()
                explanation = self.client.explain(code, language)
                execution_time = time.time() - start_time

                # Score explanation based on coverage of key concepts
                score = self._score_explanation(explanation, key_concepts)

                result = EvaluationResult(
                    task_id=task_id,
                    task_type="code_explanation",
                    language=language,
                    passed=score >= 0.7,
                    score=score,
                    execution_time=execution_time
                )

                self.results.append(result)
                print(f"  {task_id}: Score {score:.2f}")

            except Exception as e:
                print(f"  {task_id}: ERROR - {str(e)}")

        return self._aggregate_results("Code Explanation", self.results[start_index:])

    def evaluate_bug_detection(
        self,
        test_cases: List[Dict[str, Any]]
    ) -> BenchmarkResults:
        """
        Evaluate bug detection and fixing capabilities.

        Args:
            test_cases: List of buggy code samples

        Returns:
            BenchmarkResults with bug fix success rate
        """
        print("Evaluating bug detection and fixing...")
        start_index = len(self.results)

        for test_case in test_cases:
            task_id = test_case['id']
            buggy_code = test_case['buggy_code']
            error_message = test_case['error_message']
            language = test_case['language']
            tests = test_case.get('tests', [])

            try:
                start_time = time.time()
                response = self.client.debug(buggy_code, error_message, language)
                execution_time = time.time() - start_time

                # Test if fixed code passes tests
                passed, error = self._execute_tests(response.code, tests)

                result = EvaluationResult(
                    task_id=task_id,
                    task_type="bug_fixing",
                    language=language,
                    passed=passed,
                    score=1.0 if passed else 0.0,
                    execution_time=execution_time,
                    error_message=error
                )

                self.results.append(result)
                print(f"  {task_id}: {'FIXED' if passed else 'FAILED'}")

            except Exception as e:
                print(f"  {task_id}: ERROR - {str(e)}")

        return self._aggregate_results("Bug Detection", self.results[start_index:])

    def _execute_tests(
        self,
        code: str,
        test_cases: List[str]
    ) -> Tuple[bool, Optional[str]]:
        """
        Execute test cases against generated code.

        Args:
            code: Generated code to test
            test_cases: Test case string, or list of test case strings

        Returns:
            Tuple of (passed, error_message)
        """
        # A single test string (e.g. HumanEval's "test" field) is treated as
        # one test case rather than iterated character by character.
        if isinstance(test_cases, str):
            test_cases = [test_cases]

        try:
            # Create execution environment
            namespace = {}
            exec(code, namespace)

            # Run test cases
            for test in test_cases:
                exec(test, namespace)

            return True, None

        except Exception as e:
            return False, str(e)

    def _validate_translation(
        self,
        translated_code: str,
        target_language: str,
        expected_behavior: Optional[Dict[str, Any]]
    ) -> float:
        """
        Validate translated code quality.

        Args:
            translated_code: Translated code
            target_language: Target language
            expected_behavior: Expected behavior specification

        Returns:
            Quality score (0.0 to 1.0)
        """
        # Simplified validation - in practice would need language-specific execution
        score = 0.0

        # Check for syntax validity (simplified)
        if len(translated_code.strip()) > 0:
            score += 0.3

        # Check for language-specific keywords
        if target_language.lower() in translated_code.lower():
            score += 0.2

        # If expected behavior is specified, score higher
        if expected_behavior:
            score += 0.5

        return min(score, 1.0)

    def _score_explanation(
        self,
        explanation: str,
        key_concepts: List[str]
    ) -> float:
        """
        Score explanation quality based on concept coverage.

        Args:
            explanation: Generated explanation
            key_concepts: List of key concepts that should be covered

        Returns:
            Quality score (0.0 to 1.0)
        """
        if not key_concepts:
            # Base score for reasonable length explanation
            return 0.8 if len(explanation) > 100 else 0.5

        explanation_lower = explanation.lower()
        covered = sum(1 for concept in key_concepts
                      if concept.lower() in explanation_lower)

        coverage_score = covered / len(key_concepts)
        length_score = min(len(explanation) / 500, 1.0)

        return (coverage_score * 0.7 + length_score * 0.3)

    def _aggregate_results(
        self,
        benchmark_name: str,
        benchmark_results: Optional[List[EvaluationResult]] = None
    ) -> BenchmarkResults:
        """
        Aggregate evaluation results for a benchmark.

        Args:
            benchmark_name: Name of the benchmark
            benchmark_results: Results produced by the benchmark run;
                defaults to all collected results

        Returns:
            BenchmarkResults with aggregated statistics
        """
        if benchmark_results is None:
            benchmark_results = list(self.results)

        if not benchmark_results:
            return BenchmarkResults(
                benchmark_name=benchmark_name,
                total_tasks=0,
                passed_tasks=0,
                failed_tasks=0,
                average_score=0.0,
                pass_rate=0.0,
                average_execution_time=0.0,
                results_by_language={}
            )

        total = len(benchmark_results)
        passed = sum(1 for r in benchmark_results if r.passed)
        failed = total - passed
        avg_score = statistics.mean(r.score for r in benchmark_results)
        pass_rate = passed / total if total > 0 else 0.0
        avg_time = statistics.mean(r.execution_time for r in benchmark_results)

        # Aggregate by language
        by_language = defaultdict(lambda: {"passed": 0, "total": 0, "score": []})
        for result in benchmark_results:
            lang = result.language
            by_language[lang]["total"] += 1
            if result.passed:
                by_language[lang]["passed"] += 1
            by_language[lang]["score"].append(result.score)

        results_by_language = {
            lang: {
                "pass_rate": stats["passed"] / stats["total"],
                "average_score": statistics.mean(stats["score"])
            }
            for lang, stats in by_language.items()
        }

        return BenchmarkResults(
            benchmark_name=benchmark_name,
            total_tasks=total,
            passed_tasks=passed,
            failed_tasks=failed,
            average_score=avg_score,
            pass_rate=pass_rate,
            average_execution_time=avg_time,
            results_by_language=results_by_language
        )

    def save_results(self, filepath: str):
        """
        Save evaluation results to JSON file.

        Args:
            filepath: Path to save results
        """
        results_data = {
            "individual_results": [r.to_dict() for r in self.results],
            "summary": self.get_summary()
        }

        with open(filepath, 'w') as f:
            json.dump(results_data, f, indent=2)

        print(f"\nResults saved to {filepath}")

    def get_summary(self) -> Dict[str, Any]:
        """
        Get summary of all evaluation results.

        Returns:
            Dictionary with summary statistics
        """
        if not self.results:
            return {"message": "No results available"}

        total = len(self.results)
        passed = sum(1 for r in self.results if r.passed)

        return {
            "total_tasks": total,
            "passed_tasks": passed,
            "failed_tasks": total - passed,
            "overall_pass_rate": passed / total,
            "average_score": statistics.mean(r.score for r in self.results),
            "average_execution_time": statistics.mean(r.execution_time for r in self.results),
            "by_task_type": self._group_by_field("task_type"),
            "by_language": self._group_by_field("language")
        }

    def _group_by_field(self, field: str) -> Dict[str, Dict[str, float]]:
        """Group results by a specific field."""
        grouped = defaultdict(lambda: {"passed": 0, "total": 0, "scores": []})

        for result in self.results:
            value = getattr(result, field)
            grouped[value]["total"] += 1
            if result.passed:
                grouped[value]["passed"] += 1
            grouped[value]["scores"].append(result.score)

        return {
            key: {
                "pass_rate": stats["passed"] / stats["total"],
                "average_score": statistics.mean(stats["scores"])
            }
            for key, stats in grouped.items()
        }

    def print_summary(self):
        """Print evaluation summary to console."""
        summary = self.get_summary()

        print("\n" + "=" * 60)
        print("EVALUATION SUMMARY")
        print("=" * 60)
        print(f"Total Tasks: {summary['total_tasks']}")
        print(f"Passed: {summary['passed_tasks']}")
        print(f"Failed: {summary['failed_tasks']}")
        print(f"Overall Pass Rate: {summary['overall_pass_rate']:.2%}")
        print(f"Average Score: {summary['average_score']:.2f}")
        print(f"Average Execution Time: {summary['average_execution_time']:.2f}s")

        print("\nBy Task Type:")
        for task_type, stats in summary['by_task_type'].items():
            print(f"  {task_type}:")
            print(f"    Pass Rate: {stats['pass_rate']:.2%}")
            print(f"    Avg Score: {stats['average_score']:.2f}")

        print("\nBy Language:")
        for language, stats in summary['by_language'].items():
            print(f"  {language}:")
            print(f"    Pass Rate: {stats['pass_rate']:.2%}")
            print(f"    Avg Score: {stats['average_score']:.2f}")

        print("=" * 60)


# Example usage
if __name__ == "__main__":
    # Initialize evaluator
    evaluator = CodeEvaluator(api_key="your_api_key_here")

    # Example HumanEval problems (simplified)
    humaneval_problems = [
        {
            "task_id": "HumanEval/0",
            "prompt": "Write a function that takes a list of numbers and returns True if the list contains a pair of numbers that sum to zero.",
            "test": "assert has_zero_sum([1, -1, 2]) == True\nassert has_zero_sum([1, 2, 3]) == False"
        }
    ]

    # Run evaluation
    results = evaluator.evaluate_humaneval(humaneval_problems)

    # Print and save results
    evaluator.print_summary()
    evaluator.save_results("evaluation_results.json")
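
The __main__ block above only exercises the HumanEval path. The other evaluators are driven the same way; the sketch below is illustrative and not part of the committed file. The test-case dictionaries are hypothetical examples that use only the keys each evaluator reads, and the API key is a placeholder.

from evaluate_model import CodeEvaluator

evaluator = CodeEvaluator(api_key="your_api_key_here")

# Hypothetical translation case; keys match what evaluate_code_translation reads.
translation_cases = [
    {
        "id": "translate/0",
        "source_code": "def add(a, b):\n    return a + b",
        "source_language": "python",
        "target_language": "javascript",
        "expected_behavior": {"add(2, 3)": 5},
    }
]

# Hypothetical bug-fixing case; keys match what evaluate_bug_detection reads.
bug_cases = [
    {
        "id": "bugfix/0",
        "buggy_code": "def square(x):\n    return x * 2",
        "error_message": "square(3) returned 6, expected 9",
        "language": "python",
        "tests": ["assert square(3) == 9"],
    }
]

evaluator.evaluate_code_translation(translation_cases)
evaluator.evaluate_bug_detection(bug_cases)
evaluator.print_summary()
evaluator.save_results("evaluation_results.json")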