Lucas ARRIESSE committed on
Commit d2dc29e · 1 Parent(s): 8ac47d4

Remove usages of NLTK (which is actually unused)

Files changed (3)
  1. api/docs.py +3 -16
  2. app.py +0 -6
  3. requirements.txt +0 -1
api/docs.py CHANGED
@@ -2,9 +2,7 @@ import asyncio
 from typing import Dict, List, Literal, Tuple
 from fastapi.routing import APIRouter
 import logging
-import string
 import io
-import traceback
 import zipfile
 import json
 import os
@@ -15,10 +13,7 @@ import subprocess
 import pandas as pd
 import re
 from lxml import etree
-from nltk.tokenize import word_tokenize
 from bs4 import BeautifulSoup
-from nltk.corpus import stopwords
-from nltk.stem import WordNetLemmatizer
 from fastapi import Depends, BackgroundTasks, HTTPException, Request
 from dependencies import DOC_FINDER_BASE_URL, get_http_client, get_llm_router
 from fastapi.responses import StreamingResponse
@@ -30,21 +25,12 @@ from schemas import DataRequest, DataResponse, DocRequirements, DocDownloadReque
 router = APIRouter(tags=["document extraction"])
 
 # ==================================================== Utilities =================================================================
-
-lemmatizer = WordNetLemmatizer()
-
 NSMAP = {
     'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
     'v': 'urn:schemas-microsoft-com:vml'
 }
 
-
-def lemma(text: str):
-    stop_words = set(stopwords.words('english'))
-    txt = text.translate(str.maketrans('', '', string.punctuation)).strip()
-    tokens = [token for token in word_tokenize(
-        txt.lower()) if token not in stop_words]
-    return [lemmatizer.lemmatize(token) for token in tokens]
+# ================================== Converting of files to .txt ====================================
 
 
 def get_docx_archive(url: str) -> zipfile.ZipFile:
@@ -358,7 +344,8 @@ async def gen_reqs(req: ExtractRequirementsRequest, llm_router: Router = Depends
     documents = req.documents
     n_docs = len(documents)
 
-    logging.info("Generating requirements for documents: {}".format(req.documents))
+    logging.info(
+        "Generating requirements for documents: {}".format(req.documents))
 
     # limit max concurrency of LLM requests to prevent a huge pile of errors because of small rate limits
     concurrency_sema = asyncio.Semaphore(4)
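Note: the deleted lemma helper was the only NLTK consumer in this module; it stripped punctuation, dropped English stopwords, and lemmatized the remaining tokens. If similar preprocessing were ever wanted again without the NLTK dependency, a rough stdlib-only approximation could look like the sketch below — plain str.split() tokenization, a hand-rolled stopword set, and no real lemmatization. The names and the tiny stopword list are illustrative, not part of this commit.

import string

# Hypothetical stand-in for the removed NLTK-based lemma() helper.
# Uses str.split() instead of word_tokenize and skips lemmatization entirely.
STOP_WORDS = {"a", "an", "and", "the", "of", "to", "in", "is", "that"}  # tiny illustrative list

def simple_tokens(text: str) -> list[str]:
    # drop punctuation, lowercase, split on whitespace, filter stopwords
    txt = text.translate(str.maketrans('', '', string.punctuation)).strip()
    return [tok for tok in txt.lower().split() if tok not in STOP_WORDS]

print(simple_tokens("Remove the usages of NLTK, that is unused actually."))
# ['remove', 'usages', 'nltk', 'unused', 'actually']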
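For context on the unchanged lines at the end of this hunk: asyncio.Semaphore(4) is the standard way to cap how many coroutines execute a guarded section at once, which is what keeps gen_reqs from firing every LLM request simultaneously against a small rate limit. A minimal self-contained sketch of the pattern follows; the worker and its fake LLM call are hypothetical stand-ins, not code from this repository.

import asyncio

concurrency_sema = asyncio.Semaphore(4)  # at most 4 tasks inside the guarded block at once

async def fake_llm_call(doc: str) -> str:
    # hypothetical stand-in for the real LLM router request
    await asyncio.sleep(0.1)
    return f"requirements for {doc}"

async def process_doc(doc: str) -> str:
    # acquire one of the 4 slots; remaining tasks wait here until a slot frees up
    async with concurrency_sema:
        return await fake_llm_call(doc)

async def main() -> None:
    docs = [f"doc-{i}" for i in range(10)]
    # gather schedules all 10 tasks, but the semaphore keeps only 4 calls in flight
    print(await asyncio.gather(*(process_doc(d) for d in docs)))

asyncio.run(main())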
app.py CHANGED
@@ -3,7 +3,6 @@ import logging
 from dotenv import load_dotenv
 from typing import Literal
 from jinja2 import Environment, TemplateNotFound
-import nltk
 import warnings
 import os
 from fastapi import Depends, FastAPI, BackgroundTasks, HTTPException, Request, Response
@@ -30,11 +29,6 @@ logging.basicConfig(
 # Initialize global dependencies
 init_dependencies()
 
-# Download required packages for NLTK
-nltk.download('stopwords')
-nltk.download('punkt_tab')
-nltk.download('wordnet')
-
 warnings.filterwarnings("ignore")
 
 app = FastAPI(title="Requirements Extractor", docs_url="/apidocs")
requirements.txt CHANGED
@@ -9,6 +9,5 @@ lxml
 openpyxl
 beautifulsoup4
 aiolimiter
-nltk
 httpx
 Jinja2