mihirinamdar committed on
Commit
0c71790
·
verified ·
1 Parent(s): c588451

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +109 -26
app.py CHANGED
@@ -15,6 +15,7 @@ import tempfile
15
  import shutil
16
  import gc
17
  import time
 
18
 
19
  # Core ML libraries
20
  import torch
@@ -246,25 +247,108 @@ class OptimizedRagSystem:
246
  raise
247
 
248
  def search_arxiv(self, query: str, max_results: int = 15, categories: List[str] = None) -> List[Paper]:
249
- """Search ArXiv with error handling and rate limiting"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
250
  try:
251
- papers = []
252
- search_query = query
253
-
254
- if categories:
255
- category_filter = " OR ".join([f"cat:{cat.strip()}" for cat in categories])
256
- search_query = f"({query}) AND ({category_filter})"
257
-
258
- logger.info(f"Searching ArXiv for: {search_query}")
259
-
260
- search = arxiv.Search(
261
- query=search_query,
262
- max_results=max_results,
263
- sort_by=arxiv.SortCriterion.Relevance,
264
  sort_order=arxiv.SortOrder.Descending
265
  )
266
 
267
- for result in search.results():
 
 
 
268
  try:
269
  paper = Paper(
270
  id=result.entry_id.split('/')[-1],
@@ -276,20 +360,19 @@ class OptimizedRagSystem:
276
  url=result.entry_id
277
  )
278
  papers.append(paper)
279
-
280
- # Rate limiting
281
- time.sleep(0.1)
282
-
283
  except Exception as e:
284
- logger.warning(f"Error processing paper: {e}")
285
  continue
286
-
287
- logger.info(f"Found {len(papers)} papers")
288
- return papers
289
-
 
290
  except Exception as e:
291
- logger.error(f"ArXiv search error: {e}")
292
- return []
 
 
293
 
294
  def create_chunks(self, papers: List[Paper]) -> List[Chunk]:
295
  """Create text chunks from papers"""
 
15
  import shutil
16
  import gc
17
  import time
18
+ import signal
19
 
20
  # Core ML libraries
21
  import torch
 
247
  raise
248
 
249
  def search_arxiv(self, query: str, max_results: int = 15, categories: List[str] = None) -> List[Paper]:
250
+ """Search ArXiv with enhanced error handling and retry logic"""
251
+ max_retries = 3
252
+ retry_delay = 1.0
253
+
254
+ for attempt in range(max_retries):
255
+ try:
256
+ papers = []
257
+ search_query = query.strip()
258
+
259
+ # Simple query validation
260
+ if not search_query or len(search_query) < 2:
261
+ logger.warning("Query too short, using default search")
262
+ search_query = "machine learning"
263
+
264
+ if categories and len(categories) > 0:
265
+ category_filter = " OR ".join([f"cat:{cat.strip()}" for cat in categories if cat.strip()])
266
+ if category_filter:
267
+ search_query = f"({search_query}) AND ({category_filter})"
268
+
269
+ logger.info(f"πŸ” ArXiv search attempt {attempt + 1}: '{search_query}'")
270
+
271
+ # Create search with timeout and retry settings
272
+ search = arxiv.Search(
273
+ query=search_query,
274
+ max_results=min(max_results, 50), # Limit to prevent API issues
275
+ sort_by=arxiv.SortCriterion.Relevance,
276
+ sort_order=arxiv.SortOrder.Descending
277
+ )
278
+
279
+ # Set a reasonable timeout
280
+ def timeout_handler(signum, frame):
281
+ raise TimeoutError("ArXiv search timeout")
282
+
283
+ signal.signal(signal.SIGALRM, timeout_handler)
284
+ signal.alarm(30) # 30 second timeout
285
+
286
+ try:
287
+ result_count = 0
288
+ for result in search.results():
289
+ try:
290
+ # Basic validation of result
291
+ if not result.title or not result.summary:
292
+ logger.warning("Skipping paper with missing title/abstract")
293
+ continue
294
+
295
+ paper = Paper(
296
+ id=result.entry_id.split('/')[-1] if result.entry_id else f"unknown_{result_count}",
297
+ title=result.title.strip(),
298
+ abstract=result.summary.strip(),
299
+ authors=[author.name for author in (result.authors or [])],
300
+ categories=result.categories or [],
301
+ published=result.published or datetime.now(),
302
+ url=result.entry_id or f"https://arxiv.org/abs/{result_count}"
303
+ )
304
+ papers.append(paper)
305
+ result_count += 1
306
+
307
+ # Rate limiting to be nice to ArXiv API
308
+ time.sleep(0.1)
309
+
310
+ # Break if we have enough papers
311
+ if len(papers) >= max_results:
312
+ break
313
+
314
+ except Exception as e:
315
+ logger.warning(f"Error processing individual paper: {e}")
316
+ continue
317
+
318
+ finally:
319
+ signal.alarm(0) # Cancel the alarm
320
+
321
+ if papers:
322
+ logger.info(f"βœ… Successfully found {len(papers)} papers")
323
+ return papers
324
+ else:
325
+ logger.warning(f"No papers found on attempt {attempt + 1}")
326
+
327
+ except TimeoutError:
328
+ logger.warning(f"ArXiv search timeout on attempt {attempt + 1}")
329
+ except Exception as e:
330
+ logger.error(f"ArXiv search error on attempt {attempt + 1}: {type(e).__name__}: {e}")
331
+
332
+ # Wait before retry
333
+ if attempt < max_retries - 1:
334
+ logger.info(f"Retrying in {retry_delay} seconds...")
335
+ time.sleep(retry_delay)
336
+ retry_delay *= 2 # Exponential backoff
337
+
338
+ # If all attempts failed, try a simple fallback search
339
+ logger.warning("All search attempts failed, trying fallback search...")
340
  try:
341
+ fallback_search = arxiv.Search(
342
+ query="artificial intelligence", # Simple fallback
343
+ max_results=5,
344
+ sort_by=arxiv.SortCriterion.SubmittedDate,
 
 
 
 
 
 
 
 
 
345
  sort_order=arxiv.SortOrder.Descending
346
  )
347
 
348
+ papers = []
349
+ for i, result in enumerate(fallback_search.results()):
350
+ if i >= 5: # Limit fallback results
351
+ break
352
  try:
353
  paper = Paper(
354
  id=result.entry_id.split('/')[-1],
 
360
  url=result.entry_id
361
  )
362
  papers.append(paper)
 
 
 
 
363
  except Exception as e:
364
+ logger.warning(f"Error in fallback paper processing: {e}")
365
  continue
366
+
367
+ if papers:
368
+ logger.info(f"πŸ”„ Fallback search returned {len(papers)} papers")
369
+ return papers
370
+
371
  except Exception as e:
372
+ logger.error(f"Even fallback search failed: {e}")
373
+
374
+ logger.error("❌ All ArXiv search methods failed")
375
+ return []
376
 
377
  def create_chunks(self, papers: List[Paper]) -> List[Chunk]:
378
  """Create text chunks from papers"""