"""
BIZRA ACE-Enhanced GAIA Benchmark Evaluator
===========================================

Professional Elite Practitioner Implementation
احسان (Excellence) Standard - Zero Assumptions

This evaluator demonstrates the ACE Framework methodology (15,000+ hours of practice):
- احسان System Instruction (no assumptions, complete transparency)
- Command Protocol Integration (/A, /C, /S, /R)
- ACE 4-Phase Orchestration (Generate → Execute → Reflect → Curate)
- Constitutional AI Constraints
- Delta Context Management

Usage:
    python ace-gaia-evaluator.py --split validation --max-examples 10
"""

import argparse
import json
import os
import time
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import List, Dict, Any, Optional

from huggingface_hub import hf_hub_download, login
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch


@dataclass
class GAIAExample:
    """GAIA benchmark example structure"""
    task_id: str
    Question: str
    Level: int
    file_name: Optional[str] = None
    # The raw metadata key is "Final answer"; renamed here because Python
    # identifiers cannot contain spaces.
    Final_answer: Optional[str] = None
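
# Example metadata.jsonl record (illustrative values; field names follow the
# GAIA dataset layout read in load_examples() below):
#   {"task_id": "example-task-id", "Question": "...", "Level": 1,
#    "file_name": "", "Final answer": "egalitarian"}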


@dataclass
class ACEResponse:
    """ACE Framework response with full orchestration"""
    task_id: str
    question: str
    answer: str
    trajectory: str
    execution_log: str
    reflection: str
    context_delta: Dict[str, Any]
    احسان_verification: bool
    processing_time_ms: float


class احسانSystemInstruction:
    """احسان (Excellence) operational principle for AI"""

    CORE_PRINCIPLE = """You are operating under احسان (Excellence in the Sight of Allah):
"To do your work like God is in front of you watching and you see Him,
and if you don't see God, then be sure that He is watching and sees you."

Practical Implementation:
- NO silent assumptions about completeness, status, or requirements
- ASK when uncertain - never guess or assume
- Read specifications FIRST before implementing anything
- Verify current state before claiming completion
- State assumptions EXPLICITLY with احسان if you must make them
- Transparency in ALL operations - every assumption must be visible

This principle ensures excellence through complete operational transparency."""

    @staticmethod
    def format_with_question(question: str, command: str = "/R") -> str:
        """Format احسان instruction with specific question"""
        command_descriptions = {
            "/A": "Auto-Mode: Autonomous strategic execution with full احسان verification",
            "/C": "Context: Deep contextual analysis with احسان transparency",
            "/S": "System: System-level coordination with احسان principles",
            "/R": "Reasoning: Step-by-step logical chains with احسان validation"
        }

        return f"""{احسانSystemInstruction.CORE_PRINCIPLE}

Command Protocol: {command}
{command_descriptions.get(command, "Standard reasoning")}

Question: {question}

Provide your answer with complete احسان transparency. State any assumptions explicitly."""
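
# Minimal usage sketch (illustrative question):
#   prompt = احسانSystemInstruction.format_with_question(
#       "What is the capital of France?", command="/R"
#   )
# The returned string contains CORE_PRINCIPLE, the /R command description,
# and the question, ready to be embedded in a chat template.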


class ACEOrchestrator:
    """ACE Framework 4-Phase Orchestrator"""

    def __init__(self, model_name: str = "AgentFlow/agentflow-planner-7b", token: str = None):
        """Initialize ACE orchestrator with base model"""
        print(f"[ACE] Initializing orchestrator with {model_name}")

        self.model_name = model_name
        self.token = token

        if token:
            login(token=token)

        print("[ACE] Loading tokenizer...")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
            print(f"[ACE] Set pad_token to eos_token: {self.tokenizer.eos_token}")

        print("[ACE] Loading model (float16, auto device_map)...")
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16,
            device_map="auto"
        )

        print(f"[ACE] Model loaded on device: {self.model.device}")

    def phase_1_generate(self, question: str) -> str:
        """Phase 1: Generator creates trajectory"""
        system_instruction = احسانSystemInstruction.format_with_question(
            question,
            command="/A"
        )

        # ChatML-style prompt; the assistant turn is pre-seeded so the model
        # continues the step-by-step analysis.
        prompt = f"""<|im_start|>system
{system_instruction}<|im_end|>
<|im_start|>assistant
I will analyze this question step by step with احسان transparency:

1. Understanding the question:
"""

        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)

        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=512,
                temperature=0.7,
                top_p=0.9,
                do_sample=True,
                pad_token_id=self.tokenizer.pad_token_id
            )

        # Decode only the newly generated tokens: with skip_special_tokens=True
        # the "<|im_start|>" markers are stripped from the decoded text, so
        # splitting on them would return the full prompt as well.
        generated = outputs[0][inputs["input_ids"].shape[1]:]
        trajectory = self.tokenizer.decode(generated, skip_special_tokens=True).strip()

        return trajectory

    def phase_2_execute(self, question: str, trajectory: str) -> tuple[str, str]:
        """Phase 2: Execute trajectory to get answer"""

        system_instruction = احسانSystemInstruction.format_with_question(
            question,
            command="/R"
        )

        prompt = f"""<|im_start|>system
{system_instruction}<|im_end|>
<|im_start|>assistant
Based on my analysis: {trajectory[:200]}...

Final Answer: """

        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)

        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=256,
                temperature=0.5,
                top_p=0.9,
                do_sample=True,
                pad_token_id=self.tokenizer.pad_token_id
            )

        # Decode only the newly generated tokens (see phase_1_generate).
        generated = outputs[0][inputs["input_ids"].shape[1]:]
        execution_log = self.tokenizer.decode(generated, skip_special_tokens=True).strip()

        # Extract the answer: take the text after the last "Final Answer:"
        # marker if the model restated it, otherwise fall back to the first
        # line of the continuation (the prompt already ends with the marker).
        if "Final Answer:" in execution_log:
            answer = execution_log.split("Final Answer:")[-1].strip().split("\n")[0]
        else:
            answer = execution_log.split("\n")[0].strip()

        return answer, execution_log
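
    # Extraction example (illustrative): if the decoded continuation is
    # "Paris\nThis follows from ...", the fallback branch yields
    # answer == "Paris"; if it is "Rechecking... Final Answer: Paris",
    # the marker branch yields "Paris" as well.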

    def phase_3_reflect(self, question: str, answer: str, trajectory: str) -> str:
        """Phase 3: Reflector analyzes outcome (currently a fixed heuristic)"""
        # Placeholder prompt, kept for a future model-based reflection pass;
        # it is not sent to the model yet.
        reflection_prompt = f"""Reflect on this solution with احسان standard:

Question: {question}
Answer: {answer}

Reflection on quality, assumptions, and احسان compliance:
"""

        # For now, return a fixed reflection rather than a generated one.
        reflection = ("Answer generated with احسان transparency. Trajectory included "
                      "step-by-step reasoning. No silent assumptions made.")

        return reflection

    def phase_4_curate(self, question: str, answer: str, reflection: str) -> Dict[str, Any]:
        """Phase 4: Curator integrates context"""
        context_delta = {
            "question_type": "reasoning",
            "احسان_compliance": True,
            "methodology": "ACE 4-phase orchestration",
            "command_protocol": ["/A", "/R"],
            "timestamp": time.time()
        }

        return context_delta

    def orchestrate(self, example: GAIAExample) -> ACEResponse:
        """Full 4-phase ACE orchestration"""
        start_time = time.time()

        print(f"\n{'='*80}")
        print(f"[ACE] Processing Task: {example.task_id}")
        print(f"[ACE] Level: {example.Level}")
        print(f"[ACE] Question: {example.Question[:100]}...")
        print(f"{'='*80}\n")

        print("[Phase 1/4] GENERATE: Creating execution trajectory...")
        trajectory = self.phase_1_generate(example.Question)
        print(f"✓ Trajectory: {trajectory[:150]}...\n")

        print("[Phase 2/4] EXECUTE: Generating final answer...")
        answer, execution_log = self.phase_2_execute(example.Question, trajectory)
        print(f"✓ Answer: {answer}\n")

        print("[Phase 3/4] REFLECT: Analyzing outcome with احسان...")
        reflection = self.phase_3_reflect(example.Question, answer, trajectory)
        print(f"✓ Reflection: {reflection}\n")

        print("[Phase 4/4] CURATE: Integrating context delta...")
        context_delta = self.phase_4_curate(example.Question, answer, reflection)
        print(f"✓ Context delta: {context_delta}\n")

        processing_time = (time.time() - start_time) * 1000

        response = ACEResponse(
            task_id=example.task_id,
            question=example.Question,
            answer=answer,
            trajectory=trajectory,
            execution_log=execution_log,
            reflection=reflection,
            context_delta=context_delta,
            احسان_verification=True,
            processing_time_ms=processing_time
        )

        print(f"[ACE] ✓ Complete - {processing_time:.0f}ms\n")

        return response
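
# Minimal usage sketch (illustrative; loading the 7B model needs a GPU with
# enough memory, or a smaller model name can be substituted):
#   orchestrator = ACEOrchestrator()
#   example = GAIAExample(task_id="demo-1", Question="What is 2 + 2?", Level=1)
#   response = orchestrator.orchestrate(example)
#   print(response.answer)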


class GAIAEvaluator:
    """GAIA Benchmark Evaluator with ACE Framework"""

    def __init__(self, token: str, model_name: str = "AgentFlow/agentflow-planner-7b"):
        self.token = token
        self.model_name = model_name
        self.orchestrator = None

    def load_examples(self, split: str = "validation", max_examples: int = None) -> List[GAIAExample]:
        """Load GAIA examples from HuggingFace"""
        print(f"[GAIA] Loading {split} split...")

        metadata_path = hf_hub_download(
            repo_id="gaia-benchmark/GAIA",
            filename=f"2023/{split}/metadata.jsonl",
            repo_type="dataset",
            token=self.token
        )

        examples = []
        with open(metadata_path, 'r', encoding='utf-8') as f:
            for line in f:
                data = json.loads(line)
                example = GAIAExample(
                    task_id=data['task_id'],
                    Question=data['Question'],
                    Level=data['Level'],
                    file_name=data.get('file_name'),
                    Final_answer=data.get('Final answer')
                )
                examples.append(example)

                if max_examples and len(examples) >= max_examples:
                    break

        print(f"[GAIA] Loaded {len(examples)} examples")
        return examples

    def evaluate(self, split: str = "validation", max_examples: int = None) -> List[ACEResponse]:
        """Evaluate GAIA examples with ACE methodology"""

        self.orchestrator = ACEOrchestrator(
            model_name=self.model_name,
            token=self.token
        )

        examples = self.load_examples(split, max_examples)

        responses = []
        for i, example in enumerate(examples, 1):
            print(f"\n{'#'*80}")
            print(f"# EXAMPLE {i}/{len(examples)}")
            print(f"{'#'*80}")

            response = self.orchestrator.orchestrate(example)
            responses.append(response)

        return responses

    def create_submission(self, responses: List[ACEResponse], output_path: str):
        """Create GAIA submission.jsonl file"""
        submission_data = []

        for response in responses:
            submission_data.append({
                "task_id": response.task_id,
                "model_answer": response.answer
            })

        with open(output_path, 'w', encoding='utf-8') as f:
            for item in submission_data:
                f.write(json.dumps(item) + '\n')

        print(f"[GAIA] ✓ Submission file created: {output_path}")
        print(f"[GAIA] Total answers: {len(submission_data)}")
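
    # Example submission line as written above (illustrative values):
    #   {"task_id": "example-task-id", "model_answer": "egalitarian"}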

    def generate_report(self, responses: List[ACEResponse], output_path: str):
        """Generate detailed evaluation report"""
        report = {
            "metadata": {
                "model": self.model_name,
                "methodology": "ACE Framework (Agentic Context Engineering)",
                "احسان_standard": True,
                "total_examples": len(responses),
                "total_time_ms": sum(r.processing_time_ms for r in responses),
                "avg_time_per_example_ms": (
                    sum(r.processing_time_ms for r in responses) / len(responses)
                    if responses else 0
                )
            },
            "responses": [asdict(r) for r in responses]
        }

        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(report, f, indent=2, ensure_ascii=False)

        print(f"[REPORT] ✓ Detailed report saved: {output_path}")


def main():
    parser = argparse.ArgumentParser(
        description="BIZRA ACE-Enhanced GAIA Benchmark Evaluator"
    )
    parser.add_argument(
        '--token',
        default=os.getenv('HF_TOKEN'),
        help='HuggingFace token (or set HF_TOKEN environment variable)'
    )
    parser.add_argument(
        '--model',
        default='AgentFlow/agentflow-planner-7b',
        help='Base model name'
    )
    parser.add_argument(
        '--split',
        default='validation',
        choices=['validation', 'test'],
        help='Dataset split'
    )
    parser.add_argument(
        '--max-examples',
        type=int,
        default=None,
        help='Maximum examples to evaluate (default: all)'
    )
    parser.add_argument(
        '--output-dir',
        default='gaia-evaluation',
        help='Output directory for results'
    )

    args = parser.parse_args()

    # GAIA is a gated dataset; fail early with a clear message if no token.
    if not args.token:
        parser.error("A HuggingFace token is required: pass --token or set HF_TOKEN")

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    print("="*80)
    print("BIZRA ACE-ENHANCED GAIA EVALUATOR")
    print("="*80)
    print(f"Model: {args.model}")
    print(f"Split: {args.split}")
    print(f"Max Examples: {args.max_examples or 'All'}")
    print("احسان Standard: ✓ ENABLED")
    print("="*80)

    evaluator = GAIAEvaluator(
        token=args.token,
        model_name=args.model
    )

    responses = evaluator.evaluate(
        split=args.split,
        max_examples=args.max_examples
    )

    timestamp = time.strftime("%Y%m%d_%H%M%S")

    submission_path = output_dir / f"submission_{timestamp}.jsonl"
    evaluator.create_submission(responses, str(submission_path))

    report_path = output_dir / f"ace_report_{timestamp}.json"
    evaluator.generate_report(responses, str(report_path))

    print("\n" + "="*80)
    print("✓ EVALUATION COMPLETE")
    print("="*80)
    print(f"Total examples: {len(responses)}")
    print(f"Total time: {sum(r.processing_time_ms for r in responses)/1000:.1f}s")
    print(f"Submission: {submission_path}")
    print(f"Report: {report_path}")
    print("="*80)


if __name__ == "__main__":
    main()