File size: 9,022 Bytes
37cadfb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
#!/usr/bin/env python3
"""
GAIA Question Loader - Web API version
Fetch questions directly from GAIA API instead of local files
"""

import json
import time
import logging
from typing import List, Dict, Optional
import requests
from dotenv import load_dotenv
import os

# Load environment variables at import time (python-dotenv); the loader class
# below reads GAIA_API_BASE and GAIA_USERNAME through os.getenv.
load_dotenv()

# Module-level logger named after this module. No handlers or levels are
# configured here.
logger = logging.getLogger(__name__)


def retry_with_backoff(max_retries: int = 3, initial_delay: float = 1.0, backoff_factor: float = 2.0):
    """Return a decorator that retries transient ``requests`` failures with exponential backoff.

    Retried failures are ``requests.exceptions.Timeout``,
    ``requests.exceptions.ConnectionError`` and ``requests.exceptions.HTTPError``
    whose response status is 500, 502, 503 or 504. Any other ``HTTPError`` is
    re-raised immediately; other exception types are never caught.

    Args:
        max_retries: Maximum number of attempts. When zero or negative the
            wrapped function is called exactly once with no retry handling.
        initial_delay: Seconds to sleep before the second attempt.
        backoff_factor: Multiplier applied to the delay after each sleep.

    Returns:
        A decorator. The decorated function returns whatever the original
        returns, or raises the last retryable exception once the attempts
        are exhausted.
    """
    import functools  # local import keeps this block self-contained

    retryable_statuses = (500, 502, 503, 504)

    def decorator(func):
        @functools.wraps(func)  # preserve the wrapped function's name/docstring
        def wrapper(*args, **kwargs):
            retries = 0
            delay = initial_delay

            while retries < max_retries:
                try:
                    return func(*args, **kwargs)
                except (requests.exceptions.Timeout, requests.exceptions.ConnectionError) as exc:
                    error = exc
                    reason = type(exc).__name__
                except requests.exceptions.HTTPError as exc:
                    response = exc.response
                    # Compare against None explicitly: a requests.Response is
                    # falsy for every 4xx/5xx status, so a truthiness test
                    # would never let a server error be retried.
                    if response is None or response.status_code not in retryable_statuses:
                        raise
                    error = exc
                    reason = f"HTTP {response.status_code}"

                # Only reached after a retryable failure.
                retries += 1
                if retries >= max_retries:
                    logger.error(f"Max retries reached for {func.__name__}")
                    raise error
                logger.warning(f"Retry {retries}/{max_retries} for {func.__name__} due to {reason}. Delaying {delay:.2f}s")
                time.sleep(delay)
                delay *= backoff_factor

            # The loop body always returns or raises on its last iteration, so
            # this line runs only when max_retries <= 0: call once, no retries.
            return func(*args, **kwargs)
        return wrapper
    return decorator


class GAIAQuestionLoaderWeb:
    """Load and manage GAIA questions from the web API.

    The full question list is fetched from ``<api_base>/questions`` when the
    instance is created and cached in ``self.questions``. If that fetch fails
    the list is left empty instead of raising. The query helpers
    (``get_question_by_id``, ``get_questions_by_level``, ...) work on the
    cached list only.
    """

    def __init__(self, api_base: Optional[str] = None, username: Optional[str] = None):
        """Store the API settings and fetch the question list.

        Args:
            api_base: Base URL of the API. Falls back to the ``GAIA_API_BASE``
                environment variable, then to the built-in default URL.
            username: User name reported by ``summary()``. Falls back to the
                ``GAIA_USERNAME`` environment variable, then to the built-in
                default.
        """
        self.api_base = api_base or os.getenv("GAIA_API_BASE", "https://agents-course-unit4-scoring.hf.space")
        self.username = username or os.getenv("GAIA_USERNAME", "tonthatthienvu")
        self.questions: List[Dict] = []
        self._load_questions()

    @staticmethod
    def _safe_filename(name: str, fallback: str = "download") -> str:
        """Reduce *name* to a bare file name that cannot escape a directory.

        Everything up to the last ``/`` or ``\\`` is dropped; an empty result
        or a ``.``/``..`` component is replaced by *fallback*.
        """
        leaf = name.replace("\\", "/").rsplit("/", 1)[-1].strip()
        return fallback if leaf in ("", ".", "..") else leaf

    @retry_with_backoff()
    def _make_request(self, method: str, endpoint: str, params: Optional[Dict] = None,
                     payload: Optional[Dict] = None, timeout: int = 15) -> requests.Response:
        """Send one HTTP request to the API, with retries from ``retry_with_backoff``.

        Args:
            method: HTTP method name (e.g. ``"get"``).
            endpoint: Path relative to ``self.api_base``; a leading ``/`` is ignored.
            params: Optional query-string parameters.
            payload: Optional JSON body.
            timeout: Per-request timeout in seconds.

        Returns:
            The response, already checked with ``raise_for_status()``.

        Raises:
            requests.exceptions.HTTPError: On a 4xx/5xx response.
            requests.exceptions.Timeout: When the request times out.
            requests.exceptions.ConnectionError: When the connection fails.
        """
        # Strip the trailing slash so a base like "https://host/" does not
        # produce "https://host//endpoint".
        url = f"{self.api_base.rstrip('/')}/{endpoint.lstrip('/')}"
        logger.info(f"Request: {method.upper()} {url}")

        try:
            response = requests.request(method, url, params=params, json=payload, timeout=timeout)
            response.raise_for_status()
            return response
        except requests.exceptions.HTTPError as e:
            # A requests.Response is falsy for 4xx/5xx, so test against None
            # explicitly; otherwise the response body would never be logged.
            if e.response is not None:
                logger.error(f"HTTPError: {e.response.status_code} for {method.upper()} {url}")
                logger.error(f"Response: {e.response.text[:200]}")
            else:
                logger.error(f"HTTPError: {e} for {method.upper()} {url}")
            raise
        except requests.exceptions.Timeout:
            logger.error(f"Timeout: Request to {url} timed out after {timeout}s")
            raise
        except requests.exceptions.ConnectionError as e:
            logger.error(f"ConnectionError: Could not connect to {url}. Details: {e}")
            raise

    def _load_questions(self):
        """Fetch all questions from the GAIA API into ``self.questions``.

        Any request or parsing failure is logged and printed, and leaves
        ``self.questions`` as an empty list; nothing is raised.
        """
        try:
            logger.info(f"Fetching questions from GAIA API: {self.api_base}/questions")
            response = self._make_request("get", "questions", timeout=15)
            data = response.json()
            # The query helpers iterate over a list of dicts; reject any other
            # payload shape instead of failing later.
            if not isinstance(data, list):
                raise ValueError(f"expected a JSON list of questions, got {type(data).__name__}")
            self.questions = data
            print(f"✅ Loaded {len(self.questions)} GAIA questions from web API")
            logger.info(f"Successfully retrieved {len(self.questions)} questions from API")
        except requests.exceptions.RequestException as e:
            logger.error(f"Failed to fetch questions from API: {e}")
            print(f"❌ Failed to load questions from web API: {e}")
            self.questions = []
        except ValueError as e:
            # json.JSONDecodeError is a ValueError subclass; catching the base
            # class also covers the payload-shape check above.
            logger.error(f"Failed to parse JSON response: {e}")
            print(f"❌ Failed to parse questions from web API: {e}")
            self.questions = []

    def get_random_question(self) -> Optional[Dict]:
        """Get a random question from the API.

        Returns:
            The question returned by ``<api_base>/random-question``. If the
            request fails, a random entry of the cached list (or ``None`` when
            the cache is empty). ``None`` if the response cannot be parsed.
        """
        import random

        try:
            logger.info(f"Getting random question from: {self.api_base}/random-question")
            response = self._make_request("get", "random-question", timeout=15)
            question = response.json()
            task_id = question.get('task_id', 'Unknown') if isinstance(question, dict) else 'Unknown'
            logger.info(f"Successfully retrieved random question: {task_id}")
            return question
        except requests.exceptions.RequestException as e:
            logger.error(f"Failed to get random question: {e}")
            # Fallback to local random selection
            return random.choice(self.questions) if self.questions else None
        except ValueError as e:
            logger.error(f"Failed to parse random question response: {e}")
            return None

    def get_question_by_id(self, task_id: str) -> Optional[Dict]:
        """Return the first cached question whose ``task_id`` matches, or ``None``."""
        return next((q for q in self.questions if q.get('task_id') == task_id), None)

    def get_questions_by_level(self, level: str) -> List[Dict]:
        """Return all cached questions whose ``Level`` equals *level*."""
        return [q for q in self.questions if q.get('Level') == level]

    def get_questions_with_files(self) -> List[Dict]:
        """Return all cached questions with a non-empty ``file_name``."""
        return [q for q in self.questions if q.get('file_name')]

    def get_questions_without_files(self) -> List[Dict]:
        """Return all cached questions with a missing or empty ``file_name``."""
        return [q for q in self.questions if not q.get('file_name')]

    def count_by_level(self) -> Dict[str, int]:
        """Count cached questions per ``Level``; a missing level counts as ``'Unknown'``."""
        levels: Dict[str, int] = {}
        for q in self.questions:
            level = q.get('Level', 'Unknown')
            levels[level] = levels.get(level, 0) + 1
        return levels

    def summary(self) -> Dict:
        """Return totals, file/no-file counts, per-level counts and the API settings."""
        return {
            'total_questions': len(self.questions),
            'with_files': len(self.get_questions_with_files()),
            'without_files': len(self.get_questions_without_files()),
            'by_level': self.count_by_level(),
            'api_base': self.api_base,
            'username': self.username
        }

    def download_file(self, task_id: str, save_dir: str = "./downloads") -> Optional[str]:
        """Download the file associated with a question.

        Args:
            task_id: Task whose file is requested from ``files/<task_id>``.
            save_dir: Directory to save into; created (with parents) if missing.

        Returns:
            The path of the saved file as a string, or ``None`` if the
            download or the write failed (the error is logged).
        """
        try:
            import re
            from pathlib import Path

            # Create download directory, including missing parent directories
            target_dir = Path(save_dir)
            target_dir.mkdir(parents=True, exist_ok=True)

            logger.info(f"Downloading file for task: {task_id}")
            response = self._make_request("get", f"files/{task_id}", timeout=30)

            # Default to the task id; prefer the name the server advertises.
            filename = self._safe_filename(task_id)
            disposition = response.headers.get('content-disposition')
            if disposition:
                # Stop at ';' so trailing header parameters are not captured.
                match = re.search(r'filename="?([^";]+)"?', disposition)
                if match:
                    # The header is server-controlled: keep only the final
                    # path component so it cannot write outside save_dir.
                    filename = self._safe_filename(match.group(1), fallback=filename)

            # Save file
            file_path = target_dir / filename
            file_path.write_bytes(response.content)

            logger.info(f"File downloaded successfully: {file_path}")
            return str(file_path)

        except requests.exceptions.RequestException as e:
            logger.error(f"Failed to download file for task {task_id}: {e}")
            return None
        except Exception as e:
            # Deliberate best-effort: any other failure (e.g. filesystem
            # errors) is logged and reported as None.
            logger.error(f"Error saving file for task {task_id}: {e}")
            return None

    def test_api_connection(self) -> bool:
        """Return ``True`` if a GET to ``questions`` succeeds, ``False`` otherwise."""
        try:
            logger.info(f"Testing API connection to: {self.api_base}")
            self._make_request("get", "questions", timeout=10)
            logger.info("✅ API connection successful")
            return True
        except Exception as e:
            logger.error(f"❌ API connection failed: {e}")
            return False