Spaces:
Running
π Add comprehensive answer validation interface for accuracy monitoring
Browse files
**Restored Missing Validation Column:**
- Added "Correct Answer" column displaying expected answers from validation data
- Added "Match" column with visual indicators:
- ✅ = Exact match (case-insensitive)
- 🟡 = Partial match (substring matching)
- ❌ = No match or error
**Enhanced Validation Features:**
- Loads validation data from gaia_validation_metadata.jsonl
- Real-time answer comparison during processing
- Detailed match logging for performance analysis
- Graceful fallback when validation data unavailable
**Interface Improvements:**
- Updated results table label to "Detailed Question Results with Validation"
- Added validation legend to user instructions
- Applied to both root and deployment app versions consistently
**Technical Details:**
- Case-insensitive string matching for robustness
- Substring matching for partial credit detection
- Supports multiple validation file locations
- JSON parsing with error handling
This restores the validation functionality from commit 7724e0ec51435b950a6ea341fee67a3fce051261
and enables real-time accuracy monitoring during evaluation runs.
π§ Generated with [Claude Code](https://claude.ai/code)
Co-Authored-By: Claude <noreply@anthropic.com>
- app.py +47 -4
- app/app.py +47 -4
@@ -297,7 +297,7 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
|
|
297 |
agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id else "https://github.com/your-repo"
|
298 |
print(f"π Agent code available at: {agent_code}")
|
299 |
|
300 |
-
# 2. Fetch Questions
|
301 |
print(f"π₯ Fetching questions from: {questions_url}")
|
302 |
try:
|
303 |
response = requests.get(questions_url, timeout=15)
|
@@ -316,6 +316,28 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
|
|
316 |
except Exception as e:
|
317 |
print(f"β Unexpected error fetching questions: {e}")
|
318 |
return f"An unexpected error occurred fetching questions: {e}", None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
319 |
|
320 |
# 3. Run Advanced GAIA Agent
|
321 |
results_log = []
|
@@ -338,21 +360,39 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
|
|
338 |
submitted_answer = agent(question_text)
|
339 |
question_time = time.time() - question_start
|
340 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
341 |
answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
|
342 |
results_log.append({
|
343 |
"Task ID": task_id[:12] + "..." if len(task_id) > 12 else task_id,
|
344 |
"Question": question_text[:100] + "..." if len(question_text) > 100 else question_text,
|
345 |
"Submitted Answer": submitted_answer,
|
|
|
|
|
346 |
"Processing Time (s)": f"{question_time:.2f}"
|
347 |
})
|
348 |
-
print(f"β
Completed in {question_time:.2f}s")
|
349 |
|
350 |
except Exception as e:
|
351 |
print(f"β Error running agent on task {task_id}: {e}")
|
|
|
352 |
results_log.append({
|
353 |
"Task ID": task_id[:12] + "..." if len(task_id) > 12 else task_id,
|
354 |
"Question": question_text[:100] + "..." if len(question_text) > 100 else question_text,
|
355 |
"Submitted Answer": f"AGENT ERROR: {e}",
|
|
|
|
|
356 |
"Processing Time (s)": "Error"
|
357 |
})
|
358 |
|
@@ -459,7 +499,10 @@ with gr.Blocks(title="Advanced GAIA Agent Evaluation", theme=gr.themes.Soft()) a
|
|
459 |
|
460 |
1. **Login**: Use the Hugging Face login button below
|
461 |
2. **Submit**: Click "Run Advanced GAIA Agent" to process all questions
|
462 |
-
3. **Results**: View detailed results
|
|
|
|
|
|
|
463 |
|
464 |
---
|
465 |
|
@@ -489,7 +532,7 @@ with gr.Blocks(title="Advanced GAIA Agent Evaluation", theme=gr.themes.Soft()) a
|
|
489 |
)
|
490 |
|
491 |
results_table = gr.DataFrame(
|
492 |
-
label="π Detailed Question Results",
|
493 |
wrap=True,
|
494 |
interactive=False
|
495 |
)
|
|
|
297 |
agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id else "https://github.com/your-repo"
|
298 |
print(f"π Agent code available at: {agent_code}")
|
299 |
|
300 |
+
# 2. Fetch Questions and Load Validation Data
|
301 |
print(f"π₯ Fetching questions from: {questions_url}")
|
302 |
try:
|
303 |
response = requests.get(questions_url, timeout=15)
|
|
|
316 |
except Exception as e:
|
317 |
print(f"β Unexpected error fetching questions: {e}")
|
318 |
return f"An unexpected error occurred fetching questions: {e}", None
|
319 |
+
|
320 |
+
# Load validation data for correct answers
|
321 |
+
validation_data = {}
|
322 |
+
validation_files = [
|
323 |
+
"/home/user/gaia_validation_metadata.jsonl",
|
324 |
+
"/home/user/app/gaia_validation_metadata.jsonl"
|
325 |
+
]
|
326 |
+
|
327 |
+
for validation_file in validation_files:
|
328 |
+
try:
|
329 |
+
if os.path.exists(validation_file):
|
330 |
+
print(f"π Loading validation data from: {validation_file}")
|
331 |
+
with open(validation_file, 'r') as f:
|
332 |
+
for line in f:
|
333 |
+
if line.strip():
|
334 |
+
entry = json.loads(line.strip())
|
335 |
+
validation_data[entry['task_id']] = entry.get('Final answer', 'N/A')
|
336 |
+
print(f"β
Loaded validation data for {len(validation_data)} questions")
|
337 |
+
break
|
338 |
+
except Exception as e:
|
339 |
+
print(f"β οΈ Could not load validation data from {validation_file}: {e}")
|
340 |
+
continue
|
341 |
|
342 |
# 3. Run Advanced GAIA Agent
|
343 |
results_log = []
|
|
|
360 |
submitted_answer = agent(question_text)
|
361 |
question_time = time.time() - question_start
|
362 |
|
363 |
+
# Get correct answer for validation
|
364 |
+
correct_answer = validation_data.get(task_id, "N/A")
|
365 |
+
|
366 |
+
# Check if submitted answer matches correct answer (case-insensitive, trimmed)
|
367 |
+
is_correct = "β"
|
368 |
+
if correct_answer != "N/A":
|
369 |
+
submitted_clean = str(submitted_answer).strip().lower()
|
370 |
+
correct_clean = str(correct_answer).strip().lower()
|
371 |
+
if submitted_clean == correct_clean:
|
372 |
+
is_correct = "β
"
|
373 |
+
elif submitted_clean in correct_clean or correct_clean in submitted_clean:
|
374 |
+
is_correct = "π‘" # Partial match
|
375 |
+
|
376 |
answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
|
377 |
results_log.append({
|
378 |
"Task ID": task_id[:12] + "..." if len(task_id) > 12 else task_id,
|
379 |
"Question": question_text[:100] + "..." if len(question_text) > 100 else question_text,
|
380 |
"Submitted Answer": submitted_answer,
|
381 |
+
"Correct Answer": correct_answer,
|
382 |
+
"Match": is_correct,
|
383 |
"Processing Time (s)": f"{question_time:.2f}"
|
384 |
})
|
385 |
+
print(f"β
Completed in {question_time:.2f}s - Match: {is_correct}")
|
386 |
|
387 |
except Exception as e:
|
388 |
print(f"β Error running agent on task {task_id}: {e}")
|
389 |
+
correct_answer = validation_data.get(task_id, "N/A")
|
390 |
results_log.append({
|
391 |
"Task ID": task_id[:12] + "..." if len(task_id) > 12 else task_id,
|
392 |
"Question": question_text[:100] + "..." if len(question_text) > 100 else question_text,
|
393 |
"Submitted Answer": f"AGENT ERROR: {e}",
|
394 |
+
"Correct Answer": correct_answer,
|
395 |
+
"Match": "β",
|
396 |
"Processing Time (s)": "Error"
|
397 |
})
|
398 |
|
|
|
499 |
|
500 |
1. **Login**: Use the Hugging Face login button below
|
501 |
2. **Submit**: Click "Run Advanced GAIA Agent" to process all questions
|
502 |
+
3. **Results**: View detailed results with validation against correct answers
|
503 |
+
- β
= Exact match
|
504 |
+
- π‘ = Partial match
|
505 |
+
- β = No match
|
506 |
|
507 |
---
|
508 |
|
|
|
532 |
)
|
533 |
|
534 |
results_table = gr.DataFrame(
|
535 |
+
label="π Detailed Question Results with Validation",
|
536 |
wrap=True,
|
537 |
interactive=False
|
538 |
)
|
@@ -297,7 +297,7 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
|
|
297 |
agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id else "https://github.com/your-repo"
|
298 |
print(f"π Agent code available at: {agent_code}")
|
299 |
|
300 |
-
# 2. Fetch Questions
|
301 |
print(f"π₯ Fetching questions from: {questions_url}")
|
302 |
try:
|
303 |
response = requests.get(questions_url, timeout=15)
|
@@ -316,6 +316,28 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
|
|
316 |
except Exception as e:
|
317 |
print(f"β Unexpected error fetching questions: {e}")
|
318 |
return f"An unexpected error occurred fetching questions: {e}", None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
319 |
|
320 |
# 3. Run Advanced GAIA Agent
|
321 |
results_log = []
|
@@ -338,21 +360,39 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
|
|
338 |
submitted_answer = agent(question_text)
|
339 |
question_time = time.time() - question_start
|
340 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
341 |
answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
|
342 |
results_log.append({
|
343 |
"Task ID": task_id[:12] + "..." if len(task_id) > 12 else task_id,
|
344 |
"Question": question_text[:100] + "..." if len(question_text) > 100 else question_text,
|
345 |
"Submitted Answer": submitted_answer,
|
|
|
|
|
346 |
"Processing Time (s)": f"{question_time:.2f}"
|
347 |
})
|
348 |
-
print(f"β
Completed in {question_time:.2f}s")
|
349 |
|
350 |
except Exception as e:
|
351 |
print(f"β Error running agent on task {task_id}: {e}")
|
|
|
352 |
results_log.append({
|
353 |
"Task ID": task_id[:12] + "..." if len(task_id) > 12 else task_id,
|
354 |
"Question": question_text[:100] + "..." if len(question_text) > 100 else question_text,
|
355 |
"Submitted Answer": f"AGENT ERROR: {e}",
|
|
|
|
|
356 |
"Processing Time (s)": "Error"
|
357 |
})
|
358 |
|
@@ -459,7 +499,10 @@ with gr.Blocks(title="Advanced GAIA Agent Evaluation", theme=gr.themes.Soft()) a
|
|
459 |
|
460 |
1. **Login**: Use the Hugging Face login button below
|
461 |
2. **Submit**: Click "Run Advanced GAIA Agent" to process all questions
|
462 |
-
3. **Results**: View detailed results
|
|
|
|
|
|
|
463 |
|
464 |
---
|
465 |
|
@@ -489,7 +532,7 @@ with gr.Blocks(title="Advanced GAIA Agent Evaluation", theme=gr.themes.Soft()) a
|
|
489 |
)
|
490 |
|
491 |
results_table = gr.DataFrame(
|
492 |
-
label="π Detailed Question Results",
|
493 |
wrap=True,
|
494 |
interactive=False
|
495 |
)
|
|
|
297 |
agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id else "https://github.com/your-repo"
|
298 |
print(f"π Agent code available at: {agent_code}")
|
299 |
|
300 |
+
# 2. Fetch Questions and Load Validation Data
|
301 |
print(f"π₯ Fetching questions from: {questions_url}")
|
302 |
try:
|
303 |
response = requests.get(questions_url, timeout=15)
|
|
|
316 |
except Exception as e:
|
317 |
print(f"β Unexpected error fetching questions: {e}")
|
318 |
return f"An unexpected error occurred fetching questions: {e}", None
|
319 |
+
|
320 |
+
# Load validation data for correct answers
|
321 |
+
validation_data = {}
|
322 |
+
validation_files = [
|
323 |
+
"/home/user/gaia_validation_metadata.jsonl",
|
324 |
+
"/home/user/app/gaia_validation_metadata.jsonl"
|
325 |
+
]
|
326 |
+
|
327 |
+
for validation_file in validation_files:
|
328 |
+
try:
|
329 |
+
if os.path.exists(validation_file):
|
330 |
+
print(f"π Loading validation data from: {validation_file}")
|
331 |
+
with open(validation_file, 'r') as f:
|
332 |
+
for line in f:
|
333 |
+
if line.strip():
|
334 |
+
entry = json.loads(line.strip())
|
335 |
+
validation_data[entry['task_id']] = entry.get('Final answer', 'N/A')
|
336 |
+
print(f"β
Loaded validation data for {len(validation_data)} questions")
|
337 |
+
break
|
338 |
+
except Exception as e:
|
339 |
+
print(f"β οΈ Could not load validation data from {validation_file}: {e}")
|
340 |
+
continue
|
341 |
|
342 |
# 3. Run Advanced GAIA Agent
|
343 |
results_log = []
|
|
|
360 |
submitted_answer = agent(question_text)
|
361 |
question_time = time.time() - question_start
|
362 |
|
363 |
+
# Get correct answer for validation
|
364 |
+
correct_answer = validation_data.get(task_id, "N/A")
|
365 |
+
|
366 |
+
# Check if submitted answer matches correct answer (case-insensitive, trimmed)
|
367 |
+
is_correct = "β"
|
368 |
+
if correct_answer != "N/A":
|
369 |
+
submitted_clean = str(submitted_answer).strip().lower()
|
370 |
+
correct_clean = str(correct_answer).strip().lower()
|
371 |
+
if submitted_clean == correct_clean:
|
372 |
+
is_correct = "β
"
|
373 |
+
elif submitted_clean in correct_clean or correct_clean in submitted_clean:
|
374 |
+
is_correct = "π‘" # Partial match
|
375 |
+
|
376 |
answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
|
377 |
results_log.append({
|
378 |
"Task ID": task_id[:12] + "..." if len(task_id) > 12 else task_id,
|
379 |
"Question": question_text[:100] + "..." if len(question_text) > 100 else question_text,
|
380 |
"Submitted Answer": submitted_answer,
|
381 |
+
"Correct Answer": correct_answer,
|
382 |
+
"Match": is_correct,
|
383 |
"Processing Time (s)": f"{question_time:.2f}"
|
384 |
})
|
385 |
+
print(f"β
Completed in {question_time:.2f}s - Match: {is_correct}")
|
386 |
|
387 |
except Exception as e:
|
388 |
print(f"β Error running agent on task {task_id}: {e}")
|
389 |
+
correct_answer = validation_data.get(task_id, "N/A")
|
390 |
results_log.append({
|
391 |
"Task ID": task_id[:12] + "..." if len(task_id) > 12 else task_id,
|
392 |
"Question": question_text[:100] + "..." if len(question_text) > 100 else question_text,
|
393 |
"Submitted Answer": f"AGENT ERROR: {e}",
|
394 |
+
"Correct Answer": correct_answer,
|
395 |
+
"Match": "β",
|
396 |
"Processing Time (s)": "Error"
|
397 |
})
|
398 |
|
|
|
499 |
|
500 |
1. **Login**: Use the Hugging Face login button below
|
501 |
2. **Submit**: Click "Run Advanced GAIA Agent" to process all questions
|
502 |
+
3. **Results**: View detailed results with validation against correct answers
|
503 |
+
- β
= Exact match
|
504 |
+
- π‘ = Partial match
|
505 |
+
- β = No match
|
506 |
|
507 |
---
|
508 |
|
|
|
532 |
)
|
533 |
|
534 |
results_table = gr.DataFrame(
|
535 |
+
label="π Detailed Question Results with Validation",
|
536 |
wrap=True,
|
537 |
interactive=False
|
538 |
)
|