Omar ID EL MOUMEN committed on
Commit
61b2353
·
1 Parent(s): 664de9e

First commit - add application and Dockerfile

Browse files
Files changed (3) hide show
  1. Dockerfile +13 -0
  2. app.py +101 -0
  3. requirements.txt +7 -0
Dockerfile ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Base image: official Python 3.9.
FROM python:3.9

# Create an unprivileged account with UID 1000 and run every later step as it.
RUN useradd -m -u 1000 user
USER user
# Put the user's local bin directory on PATH (presumably where pip places
# console scripts such as `uvicorn` for a non-root install).
ENV PATH="/home/user/.local/bin:$PATH"

WORKDIR /app

# Copy the dependency list on its own, before the rest of the sources, so the
# install step only depends on requirements.txt.
COPY --chown=user ./requirements.txt requirements.txt
# NOTE(review): the three --trusted-host flags tell pip to skip HTTPS
# certificate validation for PyPI hosts; confirm they are still required in the
# build environment, as they weaken download integrity.
RUN pip install --trusted-host pypi.org --trusted-host pypi.python.org --trusted-host files.pythonhosted.org --no-cache-dir --upgrade -r requirements.txt

# Copy the application sources, owned by the unprivileged user.
COPY --chown=user . /app
# Serve the `app` object from app.py with uvicorn on all interfaces, port 7860.
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI
2
+ from fastapi.middleware.cors import CORSMiddleware
3
+ from contextlib import asynccontextmanager
4
+ import xml.etree.ElementTree as xmlparser
5
+ import requests
6
+ import sys
7
+ import random
8
+ import fitz
9
+ import re
10
+ from io import BytesIO
11
+ from datetime import datetime
12
+
13
+
14
def receive_signal(signalNumber, frame):
    """Signal handler: report the received signal number, then exit.

    Args:
        signalNumber: Number of the signal delivered to the process.
        frame: Stack frame active when the signal arrived (unused).

    Raises:
        SystemExit: Always, to terminate the interpreter.
    """
    print('Received:', signalNumber)
    # Equivalent to ``sys.exit()`` with no argument.
    raise SystemExit
17
+
18
+
19
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan hook: install the SIGINT handler at startup.

    Everything before ``yield`` runs when the context is entered; nothing is
    done after it, so there is no shutdown step.

    Args:
        app: The FastAPI application this lifespan is attached to (unused).
    """
    from signal import SIGINT, signal as install_handler

    # Route SIGINT (Ctrl+C) to ``receive_signal``.
    install_handler(SIGINT, receive_signal)
    yield
24
+
25
# Application instance; ``lifespan`` is run by FastAPI around the app's lifetime.
app = FastAPI(lifespan=lifespan)

# Every origin may call this API.
origins = [
    "*",
]

app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    # Credentialed requests (cookies, Authorization headers) must not be
    # combined with wildcard origins: the CORS rules forbid it and it would
    # let any website issue authenticated calls. The routes in this module use
    # no credentials, so the flag stays off.
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
38
+
39
@app.get("/")
async def root():
    """Landing endpoint: return a fixed message confirming the API is up."""
    status = {"message": "API started successfully"}
    return status
42
+
43
@app.get("/search/{keyword}/{limit}")
async def get_articles(keyword: str, limit: int):
    """Search arXiv for ``keyword`` and return at most ``limit`` publications.

    Args:
        keyword: Search term, sent to arXiv as ``all:<keyword>``.
        limit: Value passed to arXiv as ``max_results``.

    Returns:
        ``{"error": False, "message": {...}}`` on success, where the inner
        dict maps each arXiv identifier to its ``title``, ``authors`` (names
        joined with " and "), ``date`` (``DD/MM/YYYY``) and ``abstract``.
        ``{"error": True, "message": <reason>}`` when the request or the
        parsing of the response fails.
    """
    # Local import: only needed to off-load the blocking HTTP call.
    import asyncio

    XML_NAMESPACE = "{http://www.w3.org/2005/Atom}"
    content = {}
    try:
        # ``requests`` is synchronous; run it in a worker thread so the event
        # loop keeps serving other requests. Passing ``params`` lets requests
        # URL-encode the keyword, so spaces, "&" or "#" can neither break the
        # query nor inject extra parameters. The timeout avoids hanging
        # forever on an unresponsive server.
        # NOTE(review): verify=False disables TLS certificate checks if the
        # request is redirected to HTTPS; confirm it is still required.
        arxiv_search_result = await asyncio.to_thread(
            requests.get,
            "http://export.arxiv.org/api/query",
            params={"search_query": f"all:{keyword}", "max_results": limit},
            verify=False,
            timeout=30,
        )
        # Fail explicitly on HTTP errors instead of parsing an error payload.
        arxiv_search_result.raise_for_status()

        response = xmlparser.fromstring(arxiv_search_result.text)
        for pub in response.findall(f"{XML_NAMESPACE}entry"):
            # The entry id is a URL; its last path segment is the arXiv id.
            id_pub = pub.find(f"{XML_NAMESPACE}id").text.split("/")[-1]
            authors = " and ".join(
                author.find(f"{XML_NAMESPACE}name").text
                for author in pub.findall(f"{XML_NAMESPACE}author")
            )
            published = pub.find(f"{XML_NAMESPACE}published").text
            pub_date = datetime.strptime(
                published, "%Y-%m-%dT%H:%M:%SZ"
            ).strftime("%d/%m/%Y")
            content[id_pub] = {
                "title": pub.find(f"{XML_NAMESPACE}title").text,
                "authors": authors,
                "date": pub_date,
                "abstract": pub.find(f"{XML_NAMESPACE}summary").text,
            }
        return {"error": False, "message": content}
    except Exception as e:
        # Endpoint boundary: report any failure to the client as an error
        # payload rather than an unhandled server error.
        print(f"Error while downloading data : {str(e)}")
        return {"error": True, "message": str(e)}
67
+
68
def _find_section_titles(pdf_text):
    """Return the lines of ``pdf_text`` that look like numbered section titles.

    The text is cut at the first occurrence of "references" (any case), then
    bracketed/parenthesised spans and punctuation are blanked out, and lines
    ending with ``<roman or arabic number>. <UPPERCASE TITLE>`` are collected.
    If "references" never occurs, the whole text is scanned.
    """
    ref_match = re.search(r"REFERENCES", pdf_text, re.IGNORECASE)
    body_text = pdf_text[:ref_match.start()] if ref_match else pdf_text

    # Drop citations such as "[12]" and parenthesised asides.
    body_text = re.sub(r'\[.*?\]', ' ', body_text)
    body_text = re.sub(r'\(.*?\)', ' ', body_text)
    # Blank out remaining punctuation so it cannot interrupt a title match.
    body_text = re.sub(r"[\,\;\:\?\!\'\’\"\(\)\{\}\[\]\/\\\*\-]", ' ', body_text)

    regex_titles = r"(?:[IVX]+|[0-9]+)\.\s[A-Z0-9\s]+$"
    return re.findall(regex_titles, body_text, flags=re.MULTILINE)


@app.get("/extract/{id_doc}")
async def extract_text_pdf(id_doc: str):
    """Download an arXiv PDF and return its section titles.

    Args:
        id_doc: arXiv identifier of the document to fetch.

    Returns:
        ``{"error": False, "message": titles}`` where ``titles`` is the PDF's
        embedded table of contents when it has one, otherwise the list of
        title-like lines found in the text before the references.
        ``{"error": True, "message": <reason>}`` when the download fails, the
        server does not answer with HTTP 200, or the PDF cannot be read.
    """
    # Local import: only needed to off-load the blocking HTTP call.
    import asyncio

    try:
        # Run the synchronous download in a worker thread so the event loop
        # is not blocked, and bound it with a timeout.
        # NOTE(review): verify=False disables TLS certificate checks if the
        # request is redirected to HTTPS; confirm it is still required.
        pdf_req = await asyncio.to_thread(
            requests.get,
            f"http://arxiv.org/pdf/{id_doc}",
            verify=False,
            timeout=60,
        )
    except requests.RequestException as e:
        print(f"Error while downloading data : {str(e)}")
        return {"error": True, "message": str(e)}

    if pdf_req.status_code != 200:
        return {
            "error": True,
            "message": f"Unexpected HTTP status {pdf_req.status_code} for document {id_doc}",
        }

    try:
        # The context manager closes the document even if extraction fails.
        with fitz.open(stream=BytesIO(pdf_req.content), filetype="pdf") as doc:
            titles = doc.get_toc()
            if not titles:
                # No embedded TOC: fall back to scanning the page text.
                pdf_text = " ".join(page.get_text("text") for page in doc)
                titles = _find_section_titles(pdf_text)
    except Exception as e:
        # Endpoint boundary: an unreadable PDF becomes an error payload.
        print(f"Error while reading PDF : {str(e)}")
        return {"error": True, "message": str(e)}

    return {"error": False, "message": titles}
97
+
98
@app.get("/extract/random/{keyword}/{limit}")
async def extract_random_pdf(keyword: str, limit: int):
    """Search arXiv and extract the section titles of one random result.

    Args:
        keyword: Search term forwarded to ``get_articles``.
        limit: Maximum number of search results to choose from.

    Returns:
        The payload of ``extract_text_pdf`` for the chosen article, or an
        ``{"error": True, "message": <reason>}`` payload when the search
        failed or returned no article.
    """
    pubs = await get_articles(keyword, limit)
    # Propagate search failures unchanged instead of picking from them.
    if pubs["error"]:
        return pubs

    # The article ids are the keys of the "message" dict, not of the wrapper;
    # ``random.choice`` needs a sequence, so materialise them as a list.
    article_ids = list(pubs["message"])
    if not article_ids:
        return {"error": True, "message": f"No article found for keyword '{keyword}'"}

    return await extract_text_pdf(random.choice(article_ids))
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn
3
+ huggingface_hub
4
+ transformers
5
+ sentencepiece
6
+ requests
7
+ pymupdf