open-notebooklm

Running on Zero

App Files Files Community

SlouchyBuffalo committed on May 12

Commit

1c4ff89

verified ·

1 Parent(s): 872db68

Delete papers.py

Browse files

Files changed (1) hide show

papers.py +0 -116

papers.py DELETED Viewed

@@ -1,116 +0,0 @@
-import os
-import requests
-import tempfile
-from datetime import datetime, timezone
-import base64
-from tqdm.auto import tqdm
-import pymupdf
-DAILY_PAPERS_API_URL = "https://huggingface.co/api/daily_papers"
-class PaperManager:
-    """
-    Fetch the Hugging Face daily-papers listing, rank it by "rising score", and extract paper text.
-    Typical use is ``PaperManager().get_top_content()``, which returns ``{title: full_text}``
-    for the highest-ranked papers.
-    """
-    # Seconds to wait on any single HTTP request; without a timeout ``requests`` can block forever.
-    REQUEST_TIMEOUT = 30
-    def __init__(self, papers_per_page=30):
-        # Recorded for callers only: no method of this class paginates.
-        self.papers_per_page = papers_per_page
-        self.papers = []  # Ranked selection, filled by filter_top_papers()
-        self.raw_papers = []  # To store fetched data
-    def calculate_rising_score(self, paper):
-        """
-        Calculate the rising score of a paper.
-        The score is ``upvotes / (age_in_hours + 1)``, so upvotes gathered quickly weigh more.
-        A missing, non-string or unparsable ``publishedAt`` is treated as "published now",
-        a timestamp without a UTC offset is assumed to be UTC, and a timestamp in the
-        future counts as age zero (so the divisor is never smaller than 1).
-        """
-        now = datetime.now(timezone.utc)
-        upvotes = (paper.get('paper') or {}).get('upvotes') or 0
-        try:
-            published_time = datetime.fromisoformat(paper.get('publishedAt').replace('Z', '+00:00'))
-        except (AttributeError, TypeError, ValueError):
-            published_time = now
-        if published_time.tzinfo is None:
-            # Naive and aware datetimes cannot be subtracted; assume UTC.
-            published_time = published_time.replace(tzinfo=timezone.utc)
-        # Clamp so clock skew / future dates cannot yield a negative age or a zero divisor.
-        time_diff_hours = max((now - published_time).total_seconds(), 0.0) / 3600
-        return upvotes / (time_diff_hours + 1)
-    def fetch_papers(self):
-        """
-        Download up to 100 entries from the daily-papers API into ``self.raw_papers``.
-        Returns True on success. On a request failure, HTTP error status, unparsable body,
-        or an empty / non-list payload, a message is printed, ``self.raw_papers`` is left
-        untouched and False is returned.
-        """
-        try:
-            response = requests.get(f"{DAILY_PAPERS_API_URL}?limit=100", timeout=self.REQUEST_TIMEOUT)
-            response.raise_for_status()
-            data = response.json()
-        except requests.RequestException as e:
-            print(f"Error fetching papers: {e}")
-            return False
-        except Exception as e:
-            # Deliberately broad: callers get a boolean result rather than an exception.
-            print(f"Unexpected error: {e}")
-            return False
-        if not data:
-            print("No data received from API.")
-            return False
-        if not isinstance(data, list):
-            # An error payload (e.g. a JSON object) would break the per-paper code downstream.
-            print(f"Unexpected response format from API: {type(data).__name__}")
-            return False
-        self.raw_papers = data  # Store raw data
-        return True
-    def filter_top_papers(self, threshold_general=2.0, threshold_agent=0.7, top_n=2):
-        """
-        Rank ``self.raw_papers`` and keep the ``top_n`` best in ``self.papers`` (also returned).
-        Papers are ordered by rising score, tripled when the title contains "agent"
-        (case-insensitive); ties keep their original order.
-        ``threshold_general`` and ``threshold_agent`` are accepted for backward compatibility
-        but are not applied: no score-threshold filtering is performed.
-        """
-        def boosted_score(paper):
-            # Fall back to the nested title so an entry without a top-level one cannot raise KeyError.
-            title = paper.get('title') or (paper.get('paper') or {}).get('title') or ''
-            return self.calculate_rising_score(paper) * (3 if 'agent' in title.lower() else 1)
-        candidates = [paper for paper in self.raw_papers if isinstance(paper, dict)]
-        # sorted() evaluates the key once per paper, so every paper is scored at a single instant.
-        self.papers = sorted(candidates, key=boosted_score, reverse=True)[:max(top_n, 0)]
-        return self.papers
-    def get_paper_text(self, paper_id):
-        """
-        Download ``https://arxiv.org/pdf/<paper_id>.pdf`` and return the text of all its pages.
-        Raises Exception when the HTTP status is not 200; errors raised by ``requests`` or
-        ``pymupdf`` propagate unchanged.
-        """
-        url = f"https://arxiv.org/pdf/{paper_id}.pdf"
-        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
-        if response.status_code != 200:
-            raise Exception(f"Failed to download PDF: {response.status_code}")
-        # Parse from memory: a fixed "temp.pdf" in the working directory is never cleaned up
-        # and is overwritten by any concurrent call.
-        with pymupdf.open(stream=response.content, filetype="pdf") as doc:
-            return "".join(page.get_text() for page in doc)
-    def get_top_content(self):
-        """
-        Fetch, rank and download the top papers; return ``{paper title: extracted text}``.
-        If fetching fails, whatever ``self.raw_papers`` already holds is ranked instead
-        (an empty dict results when nothing was fetched before). Papers sharing a title
-        overwrite each other. A failed PDF download raises and aborts the whole call.
-        """
-        self.fetch_papers()
-        self.filter_top_papers()
-        contents = {}
-        print(f"Processing {len(self.papers)} papers:")
-        for paper in tqdm(self.papers):
-            details = paper["paper"]
-            contents[details['title']] = self.get_paper_text(details['id'])
-        return contents