SlouchyBuffalo committed on
Commit
1c4ff89
·
verified ·
1 Parent(s): 872db68

Delete papers.py

Browse files
Files changed (1) hide show
  1. papers.py +0 -116
papers.py DELETED
@@ -1,116 +0,0 @@
1
- import os
2
- import requests
3
- import tempfile
4
- from datetime import datetime, timezone
5
- import base64
6
- from tqdm.auto import tqdm
7
- import pymupdf
8
-
9
# Hugging Face endpoint that lists the papers featured on the daily-papers page.
DAILY_PAPERS_API_URL = "https://huggingface.co/api/daily_papers"
11
class PaperManager:
    """Fetch Hugging Face daily papers, rank them by a "rising" score,
    and extract the full text of the top-ranked ones from arXiv PDFs.
    """

    def __init__(self, papers_per_page=30):
        # Bug fix: papers_per_page was accepted but never stored.
        self.papers_per_page = papers_per_page
        self.papers = []      # ranked subset produced by filter_top_papers()
        self.raw_papers = []  # unfiltered API payload from fetch_papers()

    def calculate_rising_score(self, paper):
        """Return the paper's rising score: upvotes per hour since publication.

        The +1 hour smoothing keeps the divisor positive and dampens papers
        published seconds ago; the score therefore favors papers that are
        accumulating upvotes quickly.
        """
        upvotes = paper.get('paper', {}).get('upvotes', 0)
        published_at_str = paper.get('publishedAt', datetime.now(timezone.utc).isoformat())
        try:
            published_time = datetime.fromisoformat(published_at_str.replace('Z', '+00:00'))
        except ValueError:
            # Malformed timestamp: treat the paper as brand new.
            published_time = datetime.now(timezone.utc)

        time_diff_hours = (datetime.now(timezone.utc) - published_time).total_seconds() / 3600
        return upvotes / (time_diff_hours + 1)

    def fetch_papers(self):
        """Populate self.raw_papers from the daily-papers API.

        Returns True on success, False on any error (errors are printed,
        not raised, so callers can degrade gracefully).
        """
        try:
            # Bug fix: added a timeout so a stalled connection cannot hang forever.
            response = requests.get(f"{DAILY_PAPERS_API_URL}?limit=100", timeout=30)
            response.raise_for_status()
            data = response.json()

            if not data:
                print("No data received from API.")
                return False

            self.raw_papers = data  # Store raw data
            return True

        except requests.RequestException as e:
            print(f"Error fetching papers: {e}")
            return False
        except Exception as e:
            print(f"Unexpected error: {e}")
            return False

    def filter_top_papers(self, threshold_general=2.0, threshold_agent=0.7):
        """Rank self.raw_papers by rising score and keep the top two.

        Papers whose title mentions 'agent' get a 3x score boost.
        NOTE: the threshold filter is intentionally disabled (it was commented
        out in the original); the threshold parameters are kept only for
        interface compatibility and are currently unused.
        """
        self.papers = sorted(
            list(self.raw_papers),
            key=lambda p: self.calculate_rising_score(p) * (3 if 'agent' in p['title'].lower() else 1),
            reverse=True,
        )[:2]
        return self.papers

    def get_paper_text(self, paper_id):
        """Download the arXiv PDF for *paper_id* and return its plain text.

        Raises Exception if the download does not return HTTP 200.
        """
        url = f"https://arxiv.org/pdf/{paper_id}.pdf"
        response = requests.get(url, timeout=60)

        if response.status_code != 200:
            raise Exception(f"Failed to download PDF: {response.status_code}")

        # Bug fix: the original wrote a hard-coded "temp.pdf" into the CWD and
        # never deleted it (race-prone, leaks files). Use a private temp file
        # and remove it when done.
        fd, tmp_path = tempfile.mkstemp(suffix=".pdf")
        try:
            with os.fdopen(fd, "wb") as f:
                f.write(response.content)
            with pymupdf.open(tmp_path) as doc:
                # join() avoids the quadratic `text +=` loop of the original.
                return "".join(page.get_text() for page in doc)
        finally:
            os.unlink(tmp_path)

    def get_top_content(self):
        """Fetch, rank, and return {title: full_text} for the top papers."""
        self.fetch_papers()
        self.filter_top_papers()
        contents = {}
        print(f"Processing {len(self.papers)} papers:")
        for paper in tqdm(self.papers):
            paper_id = paper["paper"]['id']
            contents[paper["paper"]['title']] = self.get_paper_text(paper_id)
        return contents