Spaces:
GAIA Developer
Claude committed · Commit 520f8ca
1 Parent(s): b58a59f
🔧 Fix web interface accuracy by removing redundant answer extraction
Fixed critical issue where solve_question() output was being double-processed,
causing accuracy to drop from 90% to 30%. The solve_question method already
returns clean, processed answers, so removed redundant _extract_answer() call.
Also fixed import paths to ensure GAIASolver initializes properly.
🤖 Generated with [Claude Code](https://claude.ai/code)
Co-Authored-By: Claude <noreply@anthropic.com>
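
To make the double-processing failure concrete: an extraction heuristic tuned for raw LLM transcripts will often mangle input that is already a final answer. Below is a minimal, self-contained sketch; the _extract_answer body and the sample strings are hypothetical illustrations, not the Space's actual code:

import re

def _extract_answer(raw_output: str) -> str:
    # Hypothetical heuristic for raw LLM transcripts: take the text after
    # a "FINAL ANSWER:" marker, else fall back to the first sentence.
    match = re.search(r"FINAL ANSWER:\s*(.+)", raw_output)
    if match:
        return match.group(1).strip()
    return raw_output.split(".")[0].strip()

raw = "The city was renamed twice.\nFINAL ANSWER: St. Petersburg"
first = _extract_answer(raw)     # "St. Petersburg" -- correct on raw output
second = _extract_answer(first)  # "St" -- a second pass clips the clean answer
print(first, "->", second)

Under a heuristic like this, any clean answer containing a period, marker keyword, or similar trigger gets clipped on the second pass, which would account for an accuracy drop of the magnitude reported above.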
- app/app.py  +16 -16

app/app.py CHANGED
@@ -17,6 +17,7 @@ from pathlib import Path
 
 # Add current directory to Python path to find main modules
 sys.path.insert(0, '/home/user/app')
+sys.path.insert(0, '/home/user')
 
 # --- Constants ---
 DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
@@ -138,21 +139,19 @@ class AdvancedGAIAAgent:
                 "question": question,
                 "file_name": ""
             }
-            result = self.solver.solve_question(question_data)
-            answer = self._extract_answer(result)
+            # solve_question already returns a clean, processed answer string
+            answer = self.solver.solve_question(question_data)
         elif self.solver == "refactored":
             # For refactored architecture
             try:
                 from main_refactored import main as refactored_main
-                result = refactored_main(question)
-                answer = self._extract_answer(result)
+                answer = refactored_main(question)
             except Exception as e:
                 print(f"Refactored solver error: {e}")
                 answer = f"Refactored solver error: {e}"
         elif hasattr(self.solver, '__call__'):
             # Generic callable solver
-            result = self.solver(question)
-            answer = self._extract_answer(result)
+            answer = self.solver(question)
         else:
             # Last resort
             answer = "Unable to process question with current solver"
@@ -260,9 +259,8 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
                 "Our Answer": submitted_answer[:50] + "..." if len(str(submitted_answer)) > 50 else submitted_answer,
                 "Expected Answer": correct_answer,
                 "Result": f"{validation_result['icon']} {validation_result['status']}",
-                "…
-                "…
-                "Time (s)": f"{question_time:.2f}"
+                "Time (s)": f"{question_time:.2f}",
+                "_score": validation_result['score']  # Keep for calculation but don't display
             })
             print(f"✅ Completed in {question_time:.2f}s - {validation_result['icon']} {validation_result['status']}")
 
@@ -274,9 +272,8 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
                 "Our Answer": f"ERROR: {e}",
                 "Expected Answer": correct_answers.get(task_id, {}).get('answer', 'Not available'),
                 "Result": "❌ ERROR",
-                "…
-                "…
-                "Time (s)": "Error"
+                "Time (s)": "Error",
+                "_score": 0.0  # Keep for calculation but don't display
            })
 
         total_time = time.time() - start_time
@@ -289,12 +286,12 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
 
     for result in results_log:
         try:
-            score = float(result.get('…
+            score = float(result.get('_score', 0.0))
             total_score += score
             validated_count += 1
             if score >= 1.0:
                 correct_count += 1
-        except ValueError:
+        except (ValueError, TypeError):
             pass
 
     local_accuracy = (total_score / validated_count * 100) if validated_count > 0 else 0
@@ -306,7 +303,8 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
 
     if not answers_payload:
         print("❌ Agent did not produce any answers to submit.")
-        return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
+        display_results = [{k: v for k, v in result.items() if not k.startswith('_')} for result in results_log]
+        return "Agent did not produce any answers to submit.", pd.DataFrame(display_results)
 
     # 4. Prepare Submission
     submission_data = {
@@ -347,7 +345,9 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
             f"- Features: Enhanced reasoning, 42 specialized tools, domain expertise"
         )
         print("✅ Submission successful.")
-        results_df = pd.DataFrame(results_log)
+        # Create DataFrame excluding hidden score field
+        display_results = [{k: v for k, v in result.items() if not k.startswith('_')} for result in results_log]
+        results_df = pd.DataFrame(display_results)
         return final_status, results_df
 
     except requests.exceptions.HTTPError as e:
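
For reference, the hidden-column pattern this commit introduces (store a machine-readable _score beside the display fields, aggregate from it, then strip underscore-prefixed keys before building the DataFrame) works standalone. A small sketch with invented sample rows:

import pandas as pd

# Rows shaped like the results_log entries in the diff (sample values invented)
results_log = [
    {"Result": "✅ CORRECT", "Time (s)": "1.20", "_score": 1.0},
    {"Result": "❌ ERROR", "Time (s)": "Error", "_score": 0.0},
]

# Aggregate from the hidden field, tolerating malformed values
total_score = 0.0
for result in results_log:
    try:
        total_score += float(result.get('_score', 0.0))
    except (ValueError, TypeError):
        pass

# Drop underscore-prefixed keys so they never reach the UI table
display_results = [{k: v for k, v in r.items() if not k.startswith('_')} for r in results_log]
results_df = pd.DataFrame(display_results)

print(total_score)               # 1.0
print(list(results_df.columns))  # ['Result', 'Time (s)']

Prefixing internal fields with an underscore keeps a single source of truth per row while the display layer stays purely cosmetic.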
|