GAIA Developer Claude committed on
Commit
e09f605
·
1 Parent(s): aebabc5

📊 Add comprehensive answer validation interface for accuracy monitoring

Browse files

**Restored Missing Validation Column:**
- Added "Correct Answer" column displaying expected answers from validation data
- Added "Match" column with visual indicators:
  - ✅ = Exact match (case-insensitive)
  - 🟡 = Partial match (substring matching)
  - ❌ = No match or error

**Enhanced Validation Features:**
- Loads validation data from gaia_validation_metadata.jsonl
- Real-time answer comparison during processing
- Detailed match logging for performance analysis
- Graceful fallback when validation data unavailable

**Interface Improvements:**
- Updated results table label to "Detailed Question Results with Validation"
- Added validation legend to user instructions
- Applied to both root and deployment app versions consistently

**Technical Details:**
- Case-insensitive string matching for robustness
- Substring matching for partial credit detection
- Supports multiple validation file locations
- JSON parsing with error handling

This restores the validation functionality from commit 7724e0ec51435b950a6ea341fee67a3fce051261
and enables real-time accuracy monitoring during evaluation runs.

🔧 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>

Files changed (2) hide show
  1. app.py +47 -4
  2. app/app.py +47 -4
app.py CHANGED
@@ -297,7 +297,7 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
297
  agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id else "https://github.com/your-repo"
298
  print(f"πŸ“‹ Agent code available at: {agent_code}")
299
 
300
- # 2. Fetch Questions
301
  print(f"πŸ“₯ Fetching questions from: {questions_url}")
302
  try:
303
  response = requests.get(questions_url, timeout=15)
@@ -316,6 +316,28 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
316
  except Exception as e:
317
  print(f"❌ Unexpected error fetching questions: {e}")
318
  return f"An unexpected error occurred fetching questions: {e}", None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
319
 
320
  # 3. Run Advanced GAIA Agent
321
  results_log = []
@@ -338,21 +360,39 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
338
  submitted_answer = agent(question_text)
339
  question_time = time.time() - question_start
340
 
 
 
 
 
 
 
 
 
 
 
 
 
 
341
  answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
342
  results_log.append({
343
  "Task ID": task_id[:12] + "..." if len(task_id) > 12 else task_id,
344
  "Question": question_text[:100] + "..." if len(question_text) > 100 else question_text,
345
  "Submitted Answer": submitted_answer,
 
 
346
  "Processing Time (s)": f"{question_time:.2f}"
347
  })
348
- print(f"βœ… Completed in {question_time:.2f}s")
349
 
350
  except Exception as e:
351
  print(f"❌ Error running agent on task {task_id}: {e}")
 
352
  results_log.append({
353
  "Task ID": task_id[:12] + "..." if len(task_id) > 12 else task_id,
354
  "Question": question_text[:100] + "..." if len(question_text) > 100 else question_text,
355
  "Submitted Answer": f"AGENT ERROR: {e}",
 
 
356
  "Processing Time (s)": "Error"
357
  })
358
 
@@ -459,7 +499,10 @@ with gr.Blocks(title="Advanced GAIA Agent Evaluation", theme=gr.themes.Soft()) a
459
 
460
  1. **Login**: Use the Hugging Face login button below
461
  2. **Submit**: Click "Run Advanced GAIA Agent" to process all questions
462
- 3. **Results**: View detailed results and performance metrics
 
 
 
463
 
464
  ---
465
 
@@ -489,7 +532,7 @@ with gr.Blocks(title="Advanced GAIA Agent Evaluation", theme=gr.themes.Soft()) a
489
  )
490
 
491
  results_table = gr.DataFrame(
492
- label="πŸ“‹ Detailed Question Results",
493
  wrap=True,
494
  interactive=False
495
  )
 
297
  agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id else "https://github.com/your-repo"
298
  print(f"πŸ“‹ Agent code available at: {agent_code}")
299
 
300
+ # 2. Fetch Questions and Load Validation Data
301
  print(f"πŸ“₯ Fetching questions from: {questions_url}")
302
  try:
303
  response = requests.get(questions_url, timeout=15)
 
316
  except Exception as e:
317
  print(f"❌ Unexpected error fetching questions: {e}")
318
  return f"An unexpected error occurred fetching questions: {e}", None
319
+
320
+ # Load validation data for correct answers
321
+ validation_data = {}
322
+ validation_files = [
323
+ "/home/user/gaia_validation_metadata.jsonl",
324
+ "/home/user/app/gaia_validation_metadata.jsonl"
325
+ ]
326
+
327
+ for validation_file in validation_files:
328
+ try:
329
+ if os.path.exists(validation_file):
330
+ print(f"πŸ“‹ Loading validation data from: {validation_file}")
331
+ with open(validation_file, 'r') as f:
332
+ for line in f:
333
+ if line.strip():
334
+ entry = json.loads(line.strip())
335
+ validation_data[entry['task_id']] = entry.get('Final answer', 'N/A')
336
+ print(f"βœ… Loaded validation data for {len(validation_data)} questions")
337
+ break
338
+ except Exception as e:
339
+ print(f"⚠️ Could not load validation data from {validation_file}: {e}")
340
+ continue
341
 
342
  # 3. Run Advanced GAIA Agent
343
  results_log = []
 
360
  submitted_answer = agent(question_text)
361
  question_time = time.time() - question_start
362
 
363
+ # Get correct answer for validation
364
+ correct_answer = validation_data.get(task_id, "N/A")
365
+
366
+ # Check if submitted answer matches correct answer (case-insensitive, trimmed)
367
+ is_correct = "❌"
368
+ if correct_answer != "N/A":
369
+ submitted_clean = str(submitted_answer).strip().lower()
370
+ correct_clean = str(correct_answer).strip().lower()
371
+ if submitted_clean == correct_clean:
372
+ is_correct = "βœ…"
373
+ elif submitted_clean in correct_clean or correct_clean in submitted_clean:
374
+ is_correct = "🟑" # Partial match
375
+
376
  answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
377
  results_log.append({
378
  "Task ID": task_id[:12] + "..." if len(task_id) > 12 else task_id,
379
  "Question": question_text[:100] + "..." if len(question_text) > 100 else question_text,
380
  "Submitted Answer": submitted_answer,
381
+ "Correct Answer": correct_answer,
382
+ "Match": is_correct,
383
  "Processing Time (s)": f"{question_time:.2f}"
384
  })
385
+ print(f"βœ… Completed in {question_time:.2f}s - Match: {is_correct}")
386
 
387
  except Exception as e:
388
  print(f"❌ Error running agent on task {task_id}: {e}")
389
+ correct_answer = validation_data.get(task_id, "N/A")
390
  results_log.append({
391
  "Task ID": task_id[:12] + "..." if len(task_id) > 12 else task_id,
392
  "Question": question_text[:100] + "..." if len(question_text) > 100 else question_text,
393
  "Submitted Answer": f"AGENT ERROR: {e}",
394
+ "Correct Answer": correct_answer,
395
+ "Match": "❌",
396
  "Processing Time (s)": "Error"
397
  })
398
 
 
499
 
500
  1. **Login**: Use the Hugging Face login button below
501
  2. **Submit**: Click "Run Advanced GAIA Agent" to process all questions
502
+ 3. **Results**: View detailed results with validation against correct answers
503
+ - βœ… = Exact match
504
+ - 🟑 = Partial match
505
+ - ❌ = No match
506
 
507
  ---
508
 
 
532
  )
533
 
534
  results_table = gr.DataFrame(
535
+ label="πŸ“‹ Detailed Question Results with Validation",
536
  wrap=True,
537
  interactive=False
538
  )
app/app.py CHANGED
@@ -297,7 +297,7 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
297
  agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id else "https://github.com/your-repo"
298
  print(f"πŸ“‹ Agent code available at: {agent_code}")
299
 
300
- # 2. Fetch Questions
301
  print(f"πŸ“₯ Fetching questions from: {questions_url}")
302
  try:
303
  response = requests.get(questions_url, timeout=15)
@@ -316,6 +316,28 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
316
  except Exception as e:
317
  print(f"❌ Unexpected error fetching questions: {e}")
318
  return f"An unexpected error occurred fetching questions: {e}", None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
319
 
320
  # 3. Run Advanced GAIA Agent
321
  results_log = []
@@ -338,21 +360,39 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
338
  submitted_answer = agent(question_text)
339
  question_time = time.time() - question_start
340
 
 
 
 
 
 
 
 
 
 
 
 
 
 
341
  answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
342
  results_log.append({
343
  "Task ID": task_id[:12] + "..." if len(task_id) > 12 else task_id,
344
  "Question": question_text[:100] + "..." if len(question_text) > 100 else question_text,
345
  "Submitted Answer": submitted_answer,
 
 
346
  "Processing Time (s)": f"{question_time:.2f}"
347
  })
348
- print(f"βœ… Completed in {question_time:.2f}s")
349
 
350
  except Exception as e:
351
  print(f"❌ Error running agent on task {task_id}: {e}")
 
352
  results_log.append({
353
  "Task ID": task_id[:12] + "..." if len(task_id) > 12 else task_id,
354
  "Question": question_text[:100] + "..." if len(question_text) > 100 else question_text,
355
  "Submitted Answer": f"AGENT ERROR: {e}",
 
 
356
  "Processing Time (s)": "Error"
357
  })
358
 
@@ -459,7 +499,10 @@ with gr.Blocks(title="Advanced GAIA Agent Evaluation", theme=gr.themes.Soft()) a
459
 
460
  1. **Login**: Use the Hugging Face login button below
461
  2. **Submit**: Click "Run Advanced GAIA Agent" to process all questions
462
- 3. **Results**: View detailed results and performance metrics
 
 
 
463
 
464
  ---
465
 
@@ -489,7 +532,7 @@ with gr.Blocks(title="Advanced GAIA Agent Evaluation", theme=gr.themes.Soft()) a
489
  )
490
 
491
  results_table = gr.DataFrame(
492
- label="πŸ“‹ Detailed Question Results",
493
  wrap=True,
494
  interactive=False
495
  )
 
297
  agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id else "https://github.com/your-repo"
298
  print(f"πŸ“‹ Agent code available at: {agent_code}")
299
 
300
+ # 2. Fetch Questions and Load Validation Data
301
  print(f"πŸ“₯ Fetching questions from: {questions_url}")
302
  try:
303
  response = requests.get(questions_url, timeout=15)
 
316
  except Exception as e:
317
  print(f"❌ Unexpected error fetching questions: {e}")
318
  return f"An unexpected error occurred fetching questions: {e}", None
319
+
320
+ # Load validation data for correct answers
321
+ validation_data = {}
322
+ validation_files = [
323
+ "/home/user/gaia_validation_metadata.jsonl",
324
+ "/home/user/app/gaia_validation_metadata.jsonl"
325
+ ]
326
+
327
+ for validation_file in validation_files:
328
+ try:
329
+ if os.path.exists(validation_file):
330
+ print(f"πŸ“‹ Loading validation data from: {validation_file}")
331
+ with open(validation_file, 'r') as f:
332
+ for line in f:
333
+ if line.strip():
334
+ entry = json.loads(line.strip())
335
+ validation_data[entry['task_id']] = entry.get('Final answer', 'N/A')
336
+ print(f"βœ… Loaded validation data for {len(validation_data)} questions")
337
+ break
338
+ except Exception as e:
339
+ print(f"⚠️ Could not load validation data from {validation_file}: {e}")
340
+ continue
341
 
342
  # 3. Run Advanced GAIA Agent
343
  results_log = []
 
360
  submitted_answer = agent(question_text)
361
  question_time = time.time() - question_start
362
 
363
+ # Get correct answer for validation
364
+ correct_answer = validation_data.get(task_id, "N/A")
365
+
366
+ # Check if submitted answer matches correct answer (case-insensitive, trimmed)
367
+ is_correct = "❌"
368
+ if correct_answer != "N/A":
369
+ submitted_clean = str(submitted_answer).strip().lower()
370
+ correct_clean = str(correct_answer).strip().lower()
371
+ if submitted_clean == correct_clean:
372
+ is_correct = "βœ…"
373
+ elif submitted_clean in correct_clean or correct_clean in submitted_clean:
374
+ is_correct = "🟑" # Partial match
375
+
376
  answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
377
  results_log.append({
378
  "Task ID": task_id[:12] + "..." if len(task_id) > 12 else task_id,
379
  "Question": question_text[:100] + "..." if len(question_text) > 100 else question_text,
380
  "Submitted Answer": submitted_answer,
381
+ "Correct Answer": correct_answer,
382
+ "Match": is_correct,
383
  "Processing Time (s)": f"{question_time:.2f}"
384
  })
385
+ print(f"βœ… Completed in {question_time:.2f}s - Match: {is_correct}")
386
 
387
  except Exception as e:
388
  print(f"❌ Error running agent on task {task_id}: {e}")
389
+ correct_answer = validation_data.get(task_id, "N/A")
390
  results_log.append({
391
  "Task ID": task_id[:12] + "..." if len(task_id) > 12 else task_id,
392
  "Question": question_text[:100] + "..." if len(question_text) > 100 else question_text,
393
  "Submitted Answer": f"AGENT ERROR: {e}",
394
+ "Correct Answer": correct_answer,
395
+ "Match": "❌",
396
  "Processing Time (s)": "Error"
397
  })
398
 
 
499
 
500
  1. **Login**: Use the Hugging Face login button below
501
  2. **Submit**: Click "Run Advanced GAIA Agent" to process all questions
502
+ 3. **Results**: View detailed results with validation against correct answers
503
+ - βœ… = Exact match
504
+ - 🟑 = Partial match
505
+ - ❌ = No match
506
 
507
  ---
508
 
 
532
  )
533
 
534
  results_table = gr.DataFrame(
535
+ label="πŸ“‹ Detailed Question Results with Validation",
536
  wrap=True,
537
  interactive=False
538
  )