Lucas ARRIESSE
committed on
Commit
·
1b57e39
1
Parent(s):
4e54efb
Fix TDocs downloading not working
Browse files
- app.py +22 -18
- static/script.js +26 -16
app.py
CHANGED
@@ -1,20 +1,24 @@
|
|
1 |
-
|
2 |
-
|
|
|
|
|
3 |
import warnings
|
4 |
import io
|
|
|
5 |
import zipfile
|
6 |
-
|
7 |
import os
|
8 |
-
from dotenv import load_dotenv
|
9 |
import requests
|
10 |
import subprocess
|
11 |
-
import
|
|
|
|
|
|
|
|
|
12 |
from nltk.tokenize import word_tokenize
|
|
|
13 |
from nltk.corpus import stopwords
|
14 |
from nltk.stem import WordNetLemmatizer
|
15 |
-
from concurrent.futures import ThreadPoolExecutor, as_completed
|
16 |
-
import json
|
17 |
-
import traceback
|
18 |
from fastapi import FastAPI, BackgroundTasks, HTTPException, Request
|
19 |
from fastapi.staticfiles import StaticFiles
|
20 |
from schemas import *
|
@@ -22,11 +26,6 @@ from fastapi.middleware.cors import CORSMiddleware
|
|
22 |
from fastapi.responses import FileResponse, StreamingResponse
|
23 |
from litellm.router import Router
|
24 |
from aiolimiter import AsyncLimiter
|
25 |
-
import pandas as pd
|
26 |
-
import asyncio
|
27 |
-
import logging
|
28 |
-
import re
|
29 |
-
import nltk
|
30 |
|
31 |
load_dotenv()
|
32 |
|
@@ -36,6 +35,7 @@ logging.basicConfig(
|
|
36 |
datefmt='%Y-%m-%d %H:%M:%S'
|
37 |
)
|
38 |
|
|
|
39 |
nltk.download('stopwords')
|
40 |
nltk.download('punkt_tab')
|
41 |
nltk.download('wordnet')
|
@@ -227,11 +227,14 @@ def get_meetings(req: MeetingsRequest):
|
|
227 |
working_group = req.working_group
|
228 |
tsg = re.sub(r"\d+", "", working_group)
|
229 |
wg_number = re.search(r"\d", working_group).group(0)
|
|
|
230 |
logging.debug(tsg, wg_number)
|
231 |
url = "https://www.3gpp.org/ftp/tsg_" + tsg
|
232 |
logging.debug(url)
|
|
|
233 |
resp = requests.get(url, verify=False)
|
234 |
soup = BeautifulSoup(resp.text, "html.parser")
|
|
|
235 |
meeting_folders = []
|
236 |
all_meetings = []
|
237 |
wg_folders = [item.get_text() for item in soup.select("tr td a")]
|
@@ -309,15 +312,18 @@ def download_tdocs(req: DownloadRequest):
|
|
309 |
data=json.dumps({"doc_id": doc_id}),
|
310 |
verify=False
|
311 |
)
|
312 |
-
|
|
|
313 |
url = url.json()['url']
|
314 |
-
|
|
|
315 |
try:
|
316 |
txt = "\n".join(docx_to_txt(doc_id, url))
|
317 |
except Exception as e:
|
318 |
txt = f"Document {doc_id} text extraction failed: {e}"
|
319 |
return doc_id, txt.encode("utf-8")
|
320 |
|
|
|
321 |
def process_batch(batch):
|
322 |
results = {}
|
323 |
for doc in batch:
|
@@ -420,8 +426,6 @@ async def gen_reqs(req: RequirementsRequest, background_tasks: BackgroundTasks):
|
|
420 |
|
421 |
# ======================================================================================================================================================================================
|
422 |
|
423 |
-
SUBPROCESS_SEMAPHORE = asyncio.Semaphore(32)
|
424 |
-
|
425 |
|
426 |
class ProgressUpdate(BaseModel):
|
427 |
"""Defines the structure of a single SSE message."""
|
@@ -431,7 +435,7 @@ class ProgressUpdate(BaseModel):
|
|
431 |
processed_docs: int
|
432 |
|
433 |
|
434 |
-
@app.post("/generate_requirements/
|
435 |
async def gen_reqs(req: RequirementsRequest, con: Request):
|
436 |
"""Extract requirements from the specified TDocs using a LLM and returns SSE events about the progress of ongoing operations"""
|
437 |
|
|
|
1 |
+
import asyncio
|
2 |
+
import logging
|
3 |
+
import nltk
|
4 |
+
import string
|
5 |
import warnings
|
6 |
import io
|
7 |
+
import traceback
|
8 |
import zipfile
|
9 |
+
import json
|
10 |
import os
|
|
|
11 |
import requests
|
12 |
import subprocess
|
13 |
+
import pandas as pd
|
14 |
+
import re
|
15 |
+
from lxml import etree
|
16 |
+
from typing import Literal
|
17 |
+
from dotenv import load_dotenv
|
18 |
from nltk.tokenize import word_tokenize
|
19 |
+
from bs4 import BeautifulSoup
|
20 |
from nltk.corpus import stopwords
|
21 |
from nltk.stem import WordNetLemmatizer
|
|
|
|
|
|
|
22 |
from fastapi import FastAPI, BackgroundTasks, HTTPException, Request
|
23 |
from fastapi.staticfiles import StaticFiles
|
24 |
from schemas import *
|
|
|
26 |
from fastapi.responses import FileResponse, StreamingResponse
|
27 |
from litellm.router import Router
|
28 |
from aiolimiter import AsyncLimiter
|
|
|
|
|
|
|
|
|
|
|
29 |
|
30 |
load_dotenv()
|
31 |
|
|
|
35 |
datefmt='%Y-%m-%d %H:%M:%S'
|
36 |
)
|
37 |
|
38 |
+
# Download required packages for NLTK
|
39 |
nltk.download('stopwords')
|
40 |
nltk.download('punkt_tab')
|
41 |
nltk.download('wordnet')
|
|
|
227 |
working_group = req.working_group
|
228 |
tsg = re.sub(r"\d+", "", working_group)
|
229 |
wg_number = re.search(r"\d", working_group).group(0)
|
230 |
+
|
231 |
logging.debug(tsg, wg_number)
|
232 |
url = "https://www.3gpp.org/ftp/tsg_" + tsg
|
233 |
logging.debug(url)
|
234 |
+
|
235 |
resp = requests.get(url, verify=False)
|
236 |
soup = BeautifulSoup(resp.text, "html.parser")
|
237 |
+
|
238 |
meeting_folders = []
|
239 |
all_meetings = []
|
240 |
wg_folders = [item.get_text() for item in soup.select("tr td a")]
|
|
|
312 |
data=json.dumps({"doc_id": doc_id}),
|
313 |
verify=False
|
314 |
)
|
315 |
+
logging.info(
|
316 |
+
f"Retrieving URL for doc {doc_id} returned http status {url.status_code}")
|
317 |
url = url.json()['url']
|
318 |
+
logging.debug(f"Doc URL for {doc_id} is {url}")
|
319 |
+
|
320 |
try:
|
321 |
txt = "\n".join(docx_to_txt(doc_id, url))
|
322 |
except Exception as e:
|
323 |
txt = f"Document {doc_id} text extraction failed: {e}"
|
324 |
return doc_id, txt.encode("utf-8")
|
325 |
|
326 |
+
# PERF: use asyncio?
|
327 |
def process_batch(batch):
|
328 |
results = {}
|
329 |
for doc in batch:
|
|
|
426 |
|
427 |
# ======================================================================================================================================================================================
|
428 |
|
|
|
|
|
429 |
|
430 |
class ProgressUpdate(BaseModel):
|
431 |
"""Defines the structure of a single SSE message."""
|
|
|
435 |
processed_docs: int
|
436 |
|
437 |
|
438 |
+
@app.post("/generate_requirements/sse")
|
439 |
async def gen_reqs(req: RequirementsRequest, con: Request):
|
440 |
"""Extract requirements from the specified TDocs using a LLM and returns SSE events about the progress of ongoing operations"""
|
441 |
|
static/script.js
CHANGED
@@ -520,7 +520,7 @@ async function downloadTDocs() {
|
|
520 |
const blob = await response.blob();
|
521 |
downloadBlob(blob, generateDownloadFilename());
|
522 |
} catch (error) {
|
523 |
-
console.error(
|
524 |
alert('Erreur lors du téléchargement des TDocs');
|
525 |
} finally {
|
526 |
hideLoadingOverlay();
|
@@ -535,19 +535,29 @@ async function downloadTDocs() {
|
|
535 |
function generateDownloadFilename() {
|
536 |
let filename = document.getElementById('meeting-select').value || 'documents';
|
537 |
|
538 |
-
const
|
539 |
-
const
|
540 |
-
const docType =
|
541 |
|
542 |
-
|
543 |
-
|
|
|
|
|
|
|
544 |
}
|
545 |
-
|
546 |
-
|
|
|
|
|
|
|
|
|
547 |
}
|
548 |
-
|
|
|
|
|
549 |
filename = `${docType}_${filename}`;
|
550 |
}
|
|
|
551 |
if (hasRequirementsExtracted) {
|
552 |
filename = `requirements_${filename}`;
|
553 |
}
|
@@ -585,7 +595,7 @@ async function extractRequirements() {
|
|
585 |
toggleElementsEnabled(['extract-requirements-btn'], false);
|
586 |
|
587 |
try {
|
588 |
-
const response = await postWithSSE('/generate_requirements/
|
589 |
onMessage: (msg) => {
|
590 |
console.log("SSE message:");
|
591 |
console.log(msg);
|
@@ -599,11 +609,11 @@ async function extractRequirements() {
|
|
599 |
});
|
600 |
|
601 |
|
602 |
-
|
603 |
-
|
604 |
-
|
605 |
-
|
606 |
-
|
607 |
|
608 |
const data = response.data; // data in the SSE message contains the requirements response
|
609 |
requirements = data.requirements;
|
@@ -619,7 +629,7 @@ async function extractRequirements() {
|
|
619 |
req_id++;
|
620 |
})
|
621 |
})
|
622 |
-
|
623 |
displayRequirements(requirements);
|
624 |
|
625 |
toggleContainersVisibility(['requirements-container', 'query-requirements-container'], true);
|
|
|
520 |
const blob = await response.blob();
|
521 |
downloadBlob(blob, generateDownloadFilename());
|
522 |
} catch (error) {
|
523 |
+
console.error(error);
|
524 |
alert('Erreur lors du téléchargement des TDocs');
|
525 |
} finally {
|
526 |
hideLoadingOverlay();
|
|
|
535 |
function generateDownloadFilename() {
|
536 |
let filename = document.getElementById('meeting-select').value || 'documents';
|
537 |
|
538 |
+
const agendaItems = selectedAgenda;
|
539 |
+
const docStatuses = selectedStatus
|
540 |
+
const docType = selectedType;
|
541 |
|
542 |
+
// empty set means "Tous" is selected
|
543 |
+
if (agendaItems) {
|
544 |
+
for (aItem of agendaItems) {
|
545 |
+
filename += `_${aItem}`;
|
546 |
+
}
|
547 |
}
|
548 |
+
|
549 |
+
// empty set means "Tous" is selected
|
550 |
+
if (docStatuses) {
|
551 |
+
for (docStatus of docStatuses) {
|
552 |
+
filename += `_${docStatus}`;
|
553 |
+
}
|
554 |
}
|
555 |
+
|
556 |
+
// empty means "Tous"
|
557 |
+
if (docType && docType !== "") {
|
558 |
filename = `${docType}_${filename}`;
|
559 |
}
|
560 |
+
|
561 |
if (hasRequirementsExtracted) {
|
562 |
filename = `requirements_${filename}`;
|
563 |
}
|
|
|
595 |
toggleElementsEnabled(['extract-requirements-btn'], false);
|
596 |
|
597 |
try {
|
598 |
+
const response = await postWithSSE('/generate_requirements/sse', { documents: selectedData }, {
|
599 |
onMessage: (msg) => {
|
600 |
console.log("SSE message:");
|
601 |
console.log(msg);
|
|
|
609 |
});
|
610 |
|
611 |
|
612 |
+
// const response = await fetch('/generate_requirements/', {
|
613 |
+
// method: 'POST',
|
614 |
+
// headers: { 'Content-Type': 'application/json' },
|
615 |
+
// body: req
|
616 |
+
// });
|
617 |
|
618 |
const data = response.data; // data in the SSE message contains the requirements response
|
619 |
requirements = data.requirements;
|
|
|
629 |
req_id++;
|
630 |
})
|
631 |
})
|
632 |
+
|
633 |
displayRequirements(requirements);
|
634 |
|
635 |
toggleContainersVisibility(['requirements-container', 'query-requirements-container'], true);
|