Lucas ARRIESSE
committed on
Commit
·
d2dc29e
1
Parent(s):
8ac47d4
Remove usages of NLTK (that is unused actually)
Browse files
- api/docs.py +3 -16
- app.py +0 -6
- requirements.txt +0 -1
api/docs.py
CHANGED
@@ -2,9 +2,7 @@ import asyncio
|
|
2 |
from typing import Dict, List, Literal, Tuple
|
3 |
from fastapi.routing import APIRouter
|
4 |
import logging
|
5 |
-
import string
|
6 |
import io
|
7 |
-
import traceback
|
8 |
import zipfile
|
9 |
import json
|
10 |
import os
|
@@ -15,10 +13,7 @@ import subprocess
|
|
15 |
import pandas as pd
|
16 |
import re
|
17 |
from lxml import etree
|
18 |
-
from nltk.tokenize import word_tokenize
|
19 |
from bs4 import BeautifulSoup
|
20 |
-
from nltk.corpus import stopwords
|
21 |
-
from nltk.stem import WordNetLemmatizer
|
22 |
from fastapi import Depends, BackgroundTasks, HTTPException, Request
|
23 |
from dependencies import DOC_FINDER_BASE_URL, get_http_client, get_llm_router
|
24 |
from fastapi.responses import StreamingResponse
|
@@ -30,21 +25,12 @@ from schemas import DataRequest, DataResponse, DocRequirements, DocDownloadReque
|
|
30 |
router = APIRouter(tags=["document extraction"])
|
31 |
|
32 |
# ==================================================== Utilities =================================================================
|
33 |
-
|
34 |
-
lemmatizer = WordNetLemmatizer()
|
35 |
-
|
36 |
NSMAP = {
|
37 |
'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
|
38 |
'v': 'urn:schemas-microsoft-com:vml'
|
39 |
}
|
40 |
|
41 |
-
|
42 |
-
def lemma(text: str):
|
43 |
-
stop_words = set(stopwords.words('english'))
|
44 |
-
txt = text.translate(str.maketrans('', '', string.punctuation)).strip()
|
45 |
-
tokens = [token for token in word_tokenize(
|
46 |
-
txt.lower()) if token not in stop_words]
|
47 |
-
return [lemmatizer.lemmatize(token) for token in tokens]
|
48 |
|
49 |
|
50 |
def get_docx_archive(url: str) -> zipfile.ZipFile:
|
@@ -358,7 +344,8 @@ async def gen_reqs(req: ExtractRequirementsRequest, llm_router: Router = Depends
|
|
358 |
documents = req.documents
|
359 |
n_docs = len(documents)
|
360 |
|
361 |
-
logging.info(
|
|
|
362 |
|
363 |
# limit max concurrency of LLM requests to prevent a huge pile of errors because of small rate limits
|
364 |
concurrency_sema = asyncio.Semaphore(4)
|
|
|
2 |
from typing import Dict, List, Literal, Tuple
|
3 |
from fastapi.routing import APIRouter
|
4 |
import logging
|
|
|
5 |
import io
|
|
|
6 |
import zipfile
|
7 |
import json
|
8 |
import os
|
|
|
13 |
import pandas as pd
|
14 |
import re
|
15 |
from lxml import etree
|
|
|
16 |
from bs4 import BeautifulSoup
|
|
|
|
|
17 |
from fastapi import Depends, BackgroundTasks, HTTPException, Request
|
18 |
from dependencies import DOC_FINDER_BASE_URL, get_http_client, get_llm_router
|
19 |
from fastapi.responses import StreamingResponse
|
|
|
25 |
router = APIRouter(tags=["document extraction"])
|
26 |
|
27 |
# ==================================================== Utilities =================================================================
|
|
|
|
|
|
|
28 |
NSMAP = {
|
29 |
'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
|
30 |
'v': 'urn:schemas-microsoft-com:vml'
|
31 |
}
|
32 |
|
33 |
+
# ================================== Converting of files to .txt ====================================
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
|
35 |
|
36 |
def get_docx_archive(url: str) -> zipfile.ZipFile:
|
|
|
344 |
documents = req.documents
|
345 |
n_docs = len(documents)
|
346 |
|
347 |
+
logging.info(
|
348 |
+
"Generating requirements for documents: {}".format(req.documents))
|
349 |
|
350 |
# limit max concurrency of LLM requests to prevent a huge pile of errors because of small rate limits
|
351 |
concurrency_sema = asyncio.Semaphore(4)
|
app.py
CHANGED
@@ -3,7 +3,6 @@ import logging
|
|
3 |
from dotenv import load_dotenv
|
4 |
from typing import Literal
|
5 |
from jinja2 import Environment, TemplateNotFound
|
6 |
-
import nltk
|
7 |
import warnings
|
8 |
import os
|
9 |
from fastapi import Depends, FastAPI, BackgroundTasks, HTTPException, Request, Response
|
@@ -30,11 +29,6 @@ logging.basicConfig(
|
|
30 |
# Initialize global dependencies
|
31 |
init_dependencies()
|
32 |
|
33 |
-
# Download required packages for NLTK
|
34 |
-
nltk.download('stopwords')
|
35 |
-
nltk.download('punkt_tab')
|
36 |
-
nltk.download('wordnet')
|
37 |
-
|
38 |
warnings.filterwarnings("ignore")
|
39 |
|
40 |
app = FastAPI(title="Requirements Extractor", docs_url="/apidocs")
|
|
|
3 |
from dotenv import load_dotenv
|
4 |
from typing import Literal
|
5 |
from jinja2 import Environment, TemplateNotFound
|
|
|
6 |
import warnings
|
7 |
import os
|
8 |
from fastapi import Depends, FastAPI, BackgroundTasks, HTTPException, Request, Response
|
|
|
29 |
# Initialize global dependencies
|
30 |
init_dependencies()
|
31 |
|
|
|
|
|
|
|
|
|
|
|
32 |
warnings.filterwarnings("ignore")
|
33 |
|
34 |
app = FastAPI(title="Requirements Extractor", docs_url="/apidocs")
|
requirements.txt
CHANGED
@@ -9,6 +9,5 @@ lxml
|
|
9 |
openpyxl
|
10 |
beautifulsoup4
|
11 |
aiolimiter
|
12 |
-
nltk
|
13 |
httpx
|
14 |
Jinja2
|
|
|
9 |
openpyxl
|
10 |
beautifulsoup4
|
11 |
aiolimiter
|
|
|
12 |
httpx
|
13 |
Jinja2
|