arXiv / app.py
Omar ID EL MOUMEN
Attempt of fix #2
52717e9
from fastapi import FastAPI, Header
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import FileResponse
from fastapi.staticfiles import StaticFiles
from contextlib import asynccontextmanager
import xml.etree.ElementTree as xmlparser
import requests
from pydantic import BaseModel
import sys
import fitz
import re,os,json
from io import BytesIO
from datetime import datetime
def remove_in_betweens(text):
    """Replace bracketed and parenthesised spans with a single space.

    Square-bracket spans are removed first, then round-bracket spans. Each
    match is non-greedy and does not cross line breaks (``.`` does not match
    a newline here), so a span split over two lines is left untouched.
    """
    cleaned = text
    # Order matters: "[...]" is stripped before "(...)".
    for span_pattern in (r'\[.*?\]', r'\(.*?\)'):
        cleaned = re.sub(span_pattern, ' ', cleaned)
    return cleaned
def remove_punctuations(text):
    """Delete a fixed set of punctuation characters from *text*.

    Removed characters: , ; : ? ! ' ’ " ( ) { } [ ] / \\ and *.
    Everything else (including '.' and '-') is kept as is.
    """
    punctuation_class = r"[\,\;\:\?\!\'\’\"\(\)\{\}\[\]\/\\\*]"
    stripped = re.sub(punctuation_class, '', text)
    return stripped
def receive_signal(signalNumber, frame):
    """Signal handler: log the received signal number, then exit the process.

    ``frame`` is part of the handler signature and is not used.
    """
    print('Received:', signalNumber)
    # sys.exit() with no argument raises SystemExit with a None exit code.
    sys.exit()
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan hook: install the SIGINT handler before serving.

    Nothing is done after the ``yield`` (no shutdown work).
    """
    # Imported locally, as in the original, to keep the module namespace small.
    import signal as signal_module

    signal_module.signal(signal_module.SIGINT, receive_signal)
    yield
# Application instance; the lifespan hook runs once at startup.
app = FastAPI(lifespan=lifespan)

# Expose the local "static" directory under the /static URL prefix.
app.mount("/static", StaticFiles(directory="static"), name="static")

# CORS: every origin, method and header is accepted.
# NOTE(review): wildcard origins combined with allow_credentials=True is
# discouraged by the FastAPI CORS documentation — confirm whether credentials
# are really needed by the front end.
origins = ["*"]
app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
@app.get("/")
async def root():
    """Serve the front-end entry page located at templates/index.html."""
    index_page = os.path.join("templates", "index.html")
    return FileResponse(index_page)
# Request body of POST /search.
class Query(BaseModel):
    # Search term placed in the arXiv `all:` query field.
    keyword: str
    # Value sent to arXiv as `max_results`.
    limit: int
# Request body of POST /extract_pdf/arxiv_id.
class DocumentID(BaseModel):
    # Identifier appended to http://arxiv.org/pdf/ to download the PDF.
    doc_id: str
# Request body of POST /extract_pdf/url.
class PDF(BaseModel):
    # Address the PDF is downloaded from.
    url: str
    # Number of leading pages to read; -1 (the default) means every page.
    page_num: int = -1
@app.post("/search")
async def get_articles(query: Query):
    """Search arXiv for ``query.keyword`` and return the matching publications.

    Returns ``{"error": False, "message": {arxiv_id: {...}}}`` where each entry
    holds ``title``, ``authors`` (joined with " and "), ``date`` (dd/mm/YYYY),
    ``abstract`` and ``pdf`` (download URL). Any failure (network, HTTP
    status, XML parsing, missing field) is reported as
    ``{"error": True, "message": <reason>}`` instead of raising.
    """
    XML_NAMESPACE = "{http://www.w3.org/2005/Atom}"
    content = {}
    try:
        # Let requests build the query string: a keyword containing spaces,
        # '&' or '#' is percent-encoded instead of corrupting the URL.
        # NOTE(review): verify=False disables TLS certificate verification; it
        # is kept from the original — confirm whether it is still required.
        arxiv_search_result = requests.get(
            "http://export.arxiv.org/api/query",
            params={
                "search_query": f"all:{query.keyword}",
                "max_results": query.limit,
            },
            verify=False,
            timeout=30,  # never hang the endpoint on an unresponsive server
        )
        # Surface HTTP failures explicitly rather than parsing an error page.
        arxiv_search_result.raise_for_status()
        response = xmlparser.fromstring(arxiv_search_result.text)
        for pub in response.findall(f"{XML_NAMESPACE}entry"):
            raw_id = pub.find(f"{XML_NAMESPACE}id").text
            if "/abs/" in raw_id:
                # Keep everything after "/abs/" so that old-style identifiers
                # such as "hep-th/9901001v1" retain their archive prefix.
                id_pub = raw_id.split("/abs/", 1)[-1]
            else:
                id_pub = raw_id.split("/")[-1]
            title_pub = pub.find(f"{XML_NAMESPACE}title").text
            authors = " and ".join(
                author.find(f"{XML_NAMESPACE}name").text
                for author in pub.findall(f"{XML_NAMESPACE}author")
            )
            published = pub.find(f"{XML_NAMESPACE}published").text
            pub_date = datetime.strptime(published, "%Y-%m-%dT%H:%M:%SZ").strftime("%d/%m/%Y")
            abstract = pub.find(f"{XML_NAMESPACE}summary").text
            content[id_pub] = {
                "title": title_pub,
                "authors": authors,
                "date": pub_date,
                "abstract": abstract,
                "pdf": f"http://arxiv.org/pdf/{id_pub}",
            }
        return {"error": False, "message": content}
    except Exception as e:
        # Endpoint boundary: report the failure to the client, do not crash.
        print(f"Error while downloading data : {str(e)}")
        return {"error": True, "message": str(e)}
@app.post("/extract_pdf/arxiv_id")
async def extract_arxiv_pdf(document: DocumentID):
    """Download an arXiv PDF by id and return its cleaned plain text.

    The text of all pages is joined, cut at the first case-insensitive
    occurrence of "references", stripped of bracketed/parenthesised spans and
    of punctuation, and whitespace-normalised.

    Returns ``{"error": False, "message": {"pub_id": ..., "text": ...}}`` on
    success, or ``{"error": True, "message": <reason>}`` when the download or
    the PDF parsing fails.
    """
    pdf_url = f"http://arxiv.org/pdf/{document.doc_id}"
    try:
        # NOTE(review): verify=False disables TLS certificate verification; it
        # is kept from the original — confirm whether it is still required.
        pdf_req = requests.get(pdf_url, verify=False, timeout=60)
    except requests.RequestException as e:
        # Network failures are reported like HTTP failures, not as a crash.
        print("ID: " + document.doc_id)
        print("URL: " + pdf_url)
        print(f"Error while downloading PDF: {str(e)}")
        return {"error": True, "message": "Error while downloading PDF: " + str(e)}

    if pdf_req.status_code != 200:
        print("ID: " + document.doc_id)
        print("URL: " + pdf_url)
        print("Status code: " + str(pdf_req.status_code))
        return {"error": True, "message": "Error while downloading PDF: HTTP/" + str(pdf_req.status_code)}

    try:
        # The context manager closes the document once the text is read.
        with fitz.open(stream=BytesIO(pdf_req.content), filetype="pdf") as doc:
            pdf_text = " ".join([page.get_text("text") for page in doc])
    except Exception as e:
        # Endpoint boundary: a corrupt or non-PDF payload becomes an error reply.
        print(f"Error while reading PDF: {str(e)}")
        return {"error": True, "message": "Error while reading PDF: " + str(e)}

    # Drop everything from the first "references" occurrence onwards.
    ref_match = re.search(r"REFERENCES", pdf_text, re.IGNORECASE)
    if ref_match is not None:
        pdf_text = pdf_text[:ref_match.start()]

    postprocess_text = remove_in_betweens(pdf_text)
    postprocess_text = remove_punctuations(postprocess_text)
    postprocess_text = re.sub(r"\s+", " ", postprocess_text)
    postprocess_text = postprocess_text.strip()
    return {"error": False, "message": {"pub_id": document.doc_id, "text": postprocess_text}}
@app.post("/extract_pdf/url")
async def extract_pdf(pdf: PDF):
    """Download a PDF from ``pdf.url`` and return its title and cleaned text.

    ``pdf.page_num`` is the number of leading pages to read; ``-1`` reads the
    whole document. Values beyond the page count are clamped to it and
    negative values other than ``-1`` read no page.

    Returns ``{"error": False, "message": {"title": ..., "text": ...}}`` on
    success, or ``{"error": True, "message": <reason>}`` when the download or
    the PDF parsing fails.
    """
    # NOTE(review): the URL comes straight from the client, so this endpoint
    # can be pointed at internal addresses — consider restricting targets.
    try:
        pdf_req = requests.get(pdf.url, timeout=60)
    except requests.RequestException as e:
        # Network failures are reported like HTTP failures, not as a crash.
        print("URL: " + pdf.url)
        print(f"Error while downloading PDF: {str(e)}")
        return {"error": True, "message": "Error while downloading PDF: " + str(e)}

    if pdf_req.status_code != 200:
        print("URL: " + pdf.url)
        print("Status code: " + str(pdf_req.status_code))
        return {"error": True, "message": "Error while downloading PDF: HTTP/" + str(pdf_req.status_code)}

    try:
        # The context manager closes the document once everything is read.
        with fitz.open(stream=BytesIO(pdf_req.content), filetype="pdf") as doc:
            if pdf.page_num == -1:
                page_limit = doc.page_count
            else:
                # Clamp so an over-large request cannot index past the last page.
                page_limit = min(max(pdf.page_num, 0), doc.page_count)
            pdf_text = " ".join([doc[page].get_text("text") for page in range(page_limit)])
            # Metadata can be missing altogether; fall back to an empty dict.
            pdf_metadata = doc.metadata or {}
    except Exception as e:
        # Endpoint boundary: a corrupt or non-PDF payload becomes an error reply.
        print(f"Error while reading PDF: {str(e)}")
        return {"error": True, "message": "Error while reading PDF: " + str(e)}

    print(pdf_metadata)
    # The "title" key may be present with a None or empty value, so `or` is
    # used instead of a .get() default, which would never apply in that case.
    title = (pdf_metadata.get("title") or "No title found").strip()

    postprocess_text = remove_in_betweens(pdf_text)
    postprocess_text = remove_punctuations(postprocess_text)
    postprocess_text = re.sub(r"\s+", " ", postprocess_text)
    postprocess_text = postprocess_text.strip()
    return {"error": False, "message": {"title": title, "text": postprocess_text}}
def researcher(model, user, token):
    """Ask the Groq chat-completions API for a research plan.

    Args:
        model: Model name sent in the ``model`` field of the request.
        user: Text sent as the content of the user message.
        token: API key sent as ``Authorization: Bearer <token>``.

    Returns:
        The stripped text content of the first choice of the reply, or
        ``None`` when the request fails, the status is not 200, or the reply
        does not have the expected shape. The system prompt asks the model to
        answer with a JSON list of steps, but the content is returned as-is.
    """
    url = 'https://api.groq.com/openai/v1/chat/completions'
    headers = {
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {token}',
    }
    # The prompt below is runtime text: the lines inside the triple-quoted
    # strings deliberately start at column 0 so no indentation leaks into it.
    system_msg = {
        'role': 'system',
        'content': (
            'You are an experience PhD professor with 20 years experience in research. You help the user build their research plan based on the following examples. build the plan according to the examples without further questions. provide the steps of the plan in a form of research requests to search engines of public document publisher or web searching purposes, nothing else:\n'
            '''<example>
<search-request>
Help me research recent AI-powered marketing campaigns to benchmark for 2025 planning
</search-request>
<search-plan>
Help me research recent AI-powered marketing campaigns to benchmark for 2025 planning by:
(1) Find articles and case studies on AI-powered marketing campaigns in 2024.
(2) Find information on the specific AI technologies used in these campaigns (e.g., generative AI, predictive analytics).
(3) Find data on the results of these campaigns (e.g., ROI, customer engagement).
(4) Find information on the challenges and limitations of using AI in marketing.
(5) Find information on emerging trends in AI-powered marketing for 2025.
(6) Based on the above information, create a report summarizing key takeaways.
(7) Create a SWOT analysis of AI-powered marketing campaigns.
</search-plan>
</example>\n'''
            '''<example>
<search-request>
Research AI models and compare them per use cases for a guide on which model to use for which use case
</search-request>
<search-plan>
Research AI models and compare them per use cases for a guide on which model to use for which use case by:
(1) Find a list of popular AI models and categorize them by type (e.g., image generation, language processing, etc.).
(2) For each AI model, find information on its strengths, weaknesses, and common use cases.
(3) Compare and contrast the AI models within each category based on their performance, ease of use, and cost.
(4) Find real-world examples of how each AI model is being used in different industries and applications.
(5) Create a guide that recommends specific AI models for different use cases, taking into account factors such as accuracy, speed, and cost.
(6) Include a disclaimer in the guide stating that the recommendations are based on current knowledge and may change as AI technology evolves.
</search-plan>
</example>\n'''
            '''<example>
<search-request>
research Open source threat or opportunities to 6G standardization bodies such as 3GPP for thought leadership paper
</search-request>
<search-plan>
Research Open source threat or opportunities to 6G standardization bodies such as 3GPP for thought leadership paper:
(1) Find information on the role of 3GPP in 6G standardization.
(2) Find information on open source initiatives in the 6G space.
(3) Find articles or reports discussing the potential impact of open source on 6G standardization.
(4) Find information on the benefits and challenges of open source for 6G standardization.
(5) Find information on how 3GPP is addressing the challenges of open source.
(6) Find examples of successful open source initiatives in other technology domains.
(7) Based on your research, develop a point of view on the threat or opportunity of open source to 6G standardization bodies like 3GPP.
</search-plan>
</example>\n'''
            '''<example>
<search-request>
research vodafone activities in 6G for a competition analysis include a SWOT analysis
</search-request>
<search-plan>
Research Vodafone's activities in 6G for a competition analysis, including a SWOT analysis, by:
(1) Find Vodafone's public statements and press releases about their 6G research and development efforts.
(2) Find news articles and industry reports about Vodafone's 6G activities.
(3) Find information about Vodafone's partnerships and collaborations in the 6G space.
(4) Find information about Vodafone's investments in 6G infrastructure and technology.
(5) Based on the information gathered, create a SWOT analysis of Vodafone's position in the 6G landscape:
(a) Strengths: Vodafone's existing infrastructure, expertise, and partnerships.
(b) Weaknesses: Vodafone's potential challenges in competing with larger or more established players in the 6G space.
(c) Opportunities: Emerging 6G technologies and market trends that Vodafone could leverage.
(d) Threats: Competition from other companies, regulatory hurdles, and technological uncertainties.
(6) Compare Vodafone's 6G activities to those of its main competitors (e.g., Ericsson, Nokia, Huawei, Samsung) to identify areas of strength and weakness.
(7) Consider any recent developments or announcements in the 6G space that could impact Vodafone's competitive position.
</search-plan>
</example>\n'''
            '''<example>
<search-request>
report on researches on 6G energy efficiency, how to achieve it, assess the reality of these findings or solutions for a technical paper on 6G environmental impact
</search-request>
<search-plan>
Report on researches on 6G energy efficiency, how to achieve it, assess the reality of these findings or solutions for a technical paper on 6G environmental impact by:
(1) Find research papers and articles on 6G energy efficiency.
(2) Summarize the findings of these researches on 6G energy efficiency.
(3) Find proposed solutions to achieve 6G energy efficiency.
(4) Assess the feasibility and potential impact of these solutions.
(5) Find any existing case studies or pilot projects implementing these solutions.
(6) Find information on the potential environmental benefits of 6G energy efficiency.
(7) Find information on the challenges and limitations of achieving 6G energy efficiency.
(8) Find information on the potential economic benefits of 6G energy efficiency.
</search-plan>
</example>\n'''
            '''<example>
<search-request>
research Authentication and Identity Management:
Study lightweight, low-energy authentication methods for IoT and other connected devices.
Explore advancements in identity and access management for 6G networks.
</search-request>
<search-plan>
Research Authentication and Identity Management: Study lightweight, low-energy authentication methods for IoT and other connected devices. Explore advancements in identity and access management for 6G networks by:
(1) Find research papers and articles on lightweight, low-energy authentication methods for IoT and other connected devices.
(2) Find research papers and articles on advancements in identity and access management for 6G networks.
(3) Find information on the current state of authentication and identity management for IoT and other connected devices.
(4) Find information on the challenges and opportunities in authentication and identity management for 6G networks.
(5) Find information on the different types of authentication methods available for IoT and other connected devices.
(6) Find information on the different types of identity and access management systems available for 6G networks.
(7) Compare and contrast the different authentication methods and identity and access management systems.
(8) Identify potential areas for future research in authentication and identity management for IoT and other connected devices, as well as for 6G networks.
</search-plan>
</example>\n'''
            '''<example>
<search-request>
research Global Market Dynamics:
Assess which stakeholders (e.g., operators, tech companies, governments) are likely to drive investment in 6G infrastructure.
Investigate the role of new entrants, such as GAFAM (Google, Amazon, Facebook, Apple, Microsoft), in shaping the 6G ecosystem.
Explore how regions like China, the EU, and the US are positioning themselves for 6G leadership.
</search-request>
<search-plan>
Research Global Market Dynamics: Assess which stakeholders (e.g., operators, tech companies, governments) are likely to drive investment in 6G infrastructure. Investigate the role of new entrants, such as GAFAM (Google, Amazon, Facebook, Apple, Microsoft), in shaping the 6G ecosystem. Explore how regions like China, the EU, and the US are positioning themselves for 6G leadership by:
(1) Find research reports and articles on the 6G market and its potential stakeholders.
(2) Find information on the current investments and initiatives of major telecom operators in 6G. If there are too many, limit to several that are most relevant.
(3) Find information on the R&D efforts of major tech companies, including GAFAM, in 6G technologies.
(4) Find information on government policies and funding initiatives related to 6G in China, the EU, and the US.
(5) Find expert opinions and analysis on the potential drivers of 6G investment and the role of new entrants.
(6) Find information on the potential impact of 6G on various industries and sectors.
(7) Find information on the potential challenges and barriers to 6G adoption and deployment.
(8) Find information on the potential timeline for 6G commercialization and deployment.
</search-plan>
</example>\n'''
            '''<example>
<search-request>
research how new business models beyond ROI could benefit societal impact of 6G
</search-request>
<search-plan>
research how new business models beyond ROI could benefit societal impact of 6G by:
(1) Find articles and research papers discussing the potential societal impact of 6G technology.
(2) Find articles and research papers discussing current business models used in the telecommunications industry and their limitations.
(3) Find articles and research papers discussing alternative business models that could be used to fund and deploy 6G technology.
(4) Find articles and research papers discussing how new business models could be used to maximize the societal benefits of 6G technology.
(5) Find case studies of companies or organizations that have successfully implemented alternative business models to achieve social impact goals.
(6) Find information on potential risks and challenges associated with new business models for 6G technology.
</search-plan>
</example>\n'''
            'Optionally, do not precise the sources, as we search on every websites that we possibly can. Take note that sometimes, the user will send you keywords only, just provide report of them.\n'
            'For the response format, you must send a JSON of this format : [{"step_index": The step number, "step_text": What we have to do, "keywords": The important keywords separated by spaces (no comma) (important: the keywords that we will use for search engines and APIs, so get rid of `research papers`, `articles`, ... keywords), `privilegie les abbreviations`}, ...] Take those for examples :\n'
            '''<example><search-request>Provide a plan for 6G challenges</search-request><search-plan>[
{
"step_index": 1,
"step_text": "Find information on technical challenges in 6G development",
"keywords": "6G technical challenges development hurdles"
},
{
"step_index": 2,
"step_text": " Identify key challenges in 6G standardization",
"keywords": "6G standardization challenges 3GPP"
},
{
"step_index": 3,
"step_text": "Investigate security challenges in 6G networks",
"keywords": "6G security threats vulnerabilities"
},
{
"step_index": 4,
"step_text": "Explore challenges in 6G deployment and implementation",
"keywords": "6G deployment implementation rollout"
},
{
"step_index": 5,
"step_text": "Find information on energy efficiency challenges in 6G",
"keywords": "6G energy efficiency power consumption"
},
{
"step_index": 6,
"step_text": "Investigate challenges in 6G spectrum management",
"keywords": "6G spectrum frequency management"
},
{
"step_index": 7,
"step_text": "Analyze challenges in 6G device and hardware development",
"keywords": "6G devices hardware UE"
},
{
"step_index": 8,
"step_text": "Explore challenges in 6G network architecture and design",
"keywords": "6G network architecture network design RAN"
},
{
"step_index": 9,
"step_text": "Find information on challenges in 6G testing and validation",
"keywords": "6G testing validation trial"
},
{
"step_index": 10,
"step_text": "Investigate challenges in 6G regulation and policy-making",
"keywords": "6G regulation policy governance"
}
]</search-plan></example>'''
        )
    }
    user_msg = {
        'role': 'user',
        'content': user
    }
    data = {
        'model': model,
        'messages': [system_msg, user_msg]
    }
    try:
        # NOTE(review): verify=False disables TLS certificate verification
        # while a bearer token is being sent; it is kept from the original —
        # confirm whether it is still required.
        response = requests.post(
            url,
            headers=headers,
            data=json.dumps(data),
            verify=False,
            timeout=120,  # never block the caller forever on a stalled request
        )
    except requests.RequestException as e:
        # Honour the "None on failure" contract for network errors too.
        print(f"Groq API error on post: {str(e)}")
        return None
    if response.status_code != 200:
        print(f"Groq API error on post: {response.status_code}")
        return None
    try:
        response_data = response.json()
        raw_content = response_data['choices'][0]['message']['content'].strip()
        return raw_content
    except (ValueError, KeyError, IndexError, TypeError, AttributeError) as e:
        # Invalid JSON body, or a reply without choices/message/content text.
        print(f"Groq API error after post: {str(e)}")
        return None
# Request body of the /search/plan endpoints.
class GroqRequest(BaseModel):
    # Model name forwarded in the `model` field of the Groq request.
    model: str
    # Text forwarded as the content of the user message.
    user: str
@app.post("/search/plan")
async def get_research_plan(infos: GroqRequest, api_key: str = Header(None, alias="GROQ_TOKEN")):
    """Generate a research plan with Groq and return it as parsed JSON.

    The API key is read from the ``GROQ_TOKEN`` request header.

    Returns ``{"error": False, "message": {"plan": <parsed JSON>}}`` on
    success, or ``{"error": True, "message": <reason>}`` when the key is
    missing, the generation fails, or the model reply is not valid JSON.
    """
    if api_key is None:
        return {"error": True, "message": "Missing API key"}

    plan = researcher(infos.model, infos.user, api_key)
    if plan is None:
        return {"error": True, "message": "Error while generating the research plan"}

    try:
        # Collapse whitespace runs before parsing, as the original did.
        plan = json.loads(re.sub(r"\s+", " ", plan))
    except json.JSONDecodeError as e:
        # The model is only asked to answer in JSON; nothing guarantees it.
        print(f"Error while parsing the research plan: {str(e)}")
        return {"error": True, "message": "Error while parsing the research plan: the model did not return valid JSON"}
    return {"error": False, "message": {"plan": plan}}
@app.post("/search/plan/arxiv")
async def get_arxiv_research_plan(infos: GroqRequest, api_key: str = Header(None, alias="GROQ_TOKEN")):
    """Generate a research plan, then search arXiv for each of its steps.

    Returns ``{"error": False, "message": [{"step_id", "request",
    "articles"}, ...]}`` where ``articles`` lists the arXiv ids found for the
    step's keywords (at most 5 per step). Steps whose search fails, or that
    are malformed, are skipped. Plan-generation errors are passed through as
    ``{"error": True, "message": <reason>}``.
    """
    # get_research_plan is a coroutine function: it must be awaited, and its
    # plan is nested under "message" in the returned dict.
    plan_result = await get_research_plan(infos, api_key)
    if plan_result["error"]:
        return plan_result

    plan = plan_result["message"]["plan"]
    if not plan or not isinstance(plan, list):
        return {"error": True, "message": "Error while generating the research plan"}

    plan_articles = []
    for step in plan:
        # Read the fields by name instead of relying on key order and count.
        if not isinstance(step, dict) or "keywords" not in step:
            print(f"Skipping malformed plan step: {step}")
            continue
        data = await get_articles(Query(keyword=str(step["keywords"]), limit=5))
        if data["error"]:
            print(data["message"])
            continue
        publications = data["message"]
        plan_articles.append({
            'step_id': step.get("step_index"),
            'request': step.get("step_text"),
            'articles': list(publications),
        })
    return {"error": False, "message": plan_articles}