Spaces:

mihirinamdar
/

arxiv-rag-optimized

Running

App Files Community

mihirinamdar committed on Jun 12

Commit

0c71790

verified ·

1 Parent(s): c588451

Update app.py

Browse files

Files changed (1) hide show

app.py +109 -26

app.py CHANGED Viewed

@@ -15,6 +15,7 @@ import tempfile
 import shutil
 import gc
 import time
 # Core ML libraries
 import torch
@@ -246,25 +247,108 @@ class OptimizedRagSystem:
             raise
     def search_arxiv(self, query: str, max_results: int = 15, categories: List[str] = None) -> List[Paper]:
-        """Search ArXiv with error handling and rate limiting"""
         try:
-            papers = []
-            search_query = query
-            if categories:
-                category_filter = " OR ".join([f"cat:{cat.strip()}" for cat in categories])
-                search_query = f"({query}) AND ({category_filter})"
-            logger.info(f"Searching ArXiv for: {search_query}")
-            search = arxiv.Search(
-                query=search_query,
-                max_results=max_results,
-                sort_by=arxiv.SortCriterion.Relevance,
                 sort_order=arxiv.SortOrder.Descending
             )
-            for result in search.results():
                 try:
                     paper = Paper(
                         id=result.entry_id.split('/')[-1],
@@ -276,20 +360,19 @@ class OptimizedRagSystem:
                         url=result.entry_id
                     )
                     papers.append(paper)
-                    # Rate limiting
-                    time.sleep(0.1)
                 except Exception as e:
-                    logger.warning(f"Error processing paper: {e}")
                     continue
-            logger.info(f"Found {len(papers)} papers")
-            return papers
         except Exception as e:
-            logger.error(f"ArXiv search error: {e}")
-            return []
     def create_chunks(self, papers: List[Paper]) -> List[Chunk]:
         """Create text chunks from papers"""

 import shutil
 import gc
 import time
+import signal
 # Core ML libraries
 import torch
             raise
     def search_arxiv(self, query: str, max_results: int = 15, categories: List[str] = None) -> List[Paper]:
+        """Search ArXiv with enhanced error handling and retry logic"""
+        max_retries = 3
+        retry_delay = 1.0
+        for attempt in range(max_retries):
+            try:
+                papers = []
+                search_query = query.strip()
+                # Simple query validation
+                if not search_query or len(search_query) < 2:
+                    logger.warning("Query too short, using default search")
+                    search_query = "machine learning"
+                if categories and len(categories) > 0:
+                    category_filter = " OR ".join([f"cat:{cat.strip()}" for cat in categories if cat.strip()])
+                    if category_filter:
+                        search_query = f"({search_query}) AND ({category_filter})"
+                logger.info(f"🔍 ArXiv search attempt {attempt + 1}: '{search_query}'")
+                # Create search with timeout and retry settings
+                search = arxiv.Search(
+                    query=search_query,
+                    max_results=min(max_results, 50),  # Limit to prevent API issues
+                    sort_by=arxiv.SortCriterion.Relevance,
+                    sort_order=arxiv.SortOrder.Descending
+                )
+                # Set a reasonable timeout
+                def timeout_handler(signum, frame):
+                    raise TimeoutError("ArXiv search timeout")
+                signal.signal(signal.SIGALRM, timeout_handler)
+                signal.alarm(30)  # 30 second timeout
+                try:
+                    result_count = 0
+                    for result in search.results():
+                        try:
+                            # Basic validation of result
+                            if not result.title or not result.summary:
+                                logger.warning("Skipping paper with missing title/abstract")
+                                continue
+                            paper = Paper(
+                                id=result.entry_id.split('/')[-1] if result.entry_id else f"unknown_{result_count}",
+                                title=result.title.strip(),
+                                abstract=result.summary.strip(),
+                                authors=[author.name for author in (result.authors or [])],
+                                categories=result.categories or [],
+                                published=result.published or datetime.now(),
+                                url=result.entry_id or f"https://arxiv.org/abs/{result_count}"
+                            )
+                            papers.append(paper)
+                            result_count += 1
+                            # Rate limiting to be nice to ArXiv API
+                            time.sleep(0.1)
+                            # Break if we have enough papers
+                            if len(papers) >= max_results:
+                                break
+                        except Exception as e:
+                            logger.warning(f"Error processing individual paper: {e}")
+                            continue
+                finally:
+                    signal.alarm(0)  # Cancel the alarm
+                if papers:
+                    logger.info(f"✅ Successfully found {len(papers)} papers")
+                    return papers
+                else:
+                    logger.warning(f"No papers found on attempt {attempt + 1}")
+            except TimeoutError:
+                logger.warning(f"ArXiv search timeout on attempt {attempt + 1}")
+            except Exception as e:
+                logger.error(f"ArXiv search error on attempt {attempt + 1}: {type(e).__name__}: {e}")
+            # Wait before retry
+            if attempt < max_retries - 1:
+                logger.info(f"Retrying in {retry_delay} seconds...")
+                time.sleep(retry_delay)
+                retry_delay *= 2  # Exponential backoff
+        # If all attempts failed, try a simple fallback search
+        logger.warning("All search attempts failed, trying fallback search...")
         try:
+            fallback_search = arxiv.Search(
+                query="artificial intelligence",  # Simple fallback
+                max_results=5,
+                sort_by=arxiv.SortCriterion.SubmittedDate,
                 sort_order=arxiv.SortOrder.Descending
             )
+            papers = []
+            for i, result in enumerate(fallback_search.results()):
+                if i >= 5:  # Limit fallback results
+                    break
                 try:
                     paper = Paper(
                         id=result.entry_id.split('/')[-1],
                         url=result.entry_id
                     )
                     papers.append(paper)
                 except Exception as e:
+                    logger.warning(f"Error in fallback paper processing: {e}")
                     continue
+            if papers:
+                logger.info(f"🔄 Fallback search returned {len(papers)} papers")
+                return papers
         except Exception as e:
+            logger.error(f"Even fallback search failed: {e}")
+        logger.error("❌ All ArXiv search methods failed")
+        return []
     def create_chunks(self, papers: List[Paper]) -> List[Chunk]:
         """Create text chunks from papers"""