om4r932 committed
Commit 48e4b27 · 1 Parent(s): 8114183

Files changed (5):
  1. app.py +166 -537
  2. indexed_docs.json +0 -8
  3. requirements.txt +8 -17
  4. schemas.py +40 -0
  5. static/script.js +1 -26
app.py CHANGED
@@ -1,462 +1,209 @@
- from io import StringIO
- import bm25s
- import numpy as np
- import pandas as pd
- import faiss
- import requests
- from bs4 import BeautifulSoup
- import json
- import os
- import traceback
- import uuid
- import zipfile
- import io
- import openai
- import httpx
- import subprocess
- import os
- import re
  import time
  from datetime import datetime
  from dotenv import load_dotenv
- import warnings
  from fastapi import FastAPI, HTTPException
  from fastapi.middleware.cors import CORSMiddleware
- import nltk
- from nltk.stem import WordNetLemmatizer
  from fastapi.responses import FileResponse
  from fastapi.staticfiles import StaticFiles
- from pydantic import BaseModel
- from typing import Any, Dict, List, Literal, Optional
- os.environ['CURL_CA_BUNDLE'] = ''
- from sentence_transformers import SentenceTransformer
- import warnings
-
- warnings.filterwarnings("ignore")
-
- from sklearn.preprocessing import MinMaxScaler
- nltk.download("wordnet")
- load_dotenv()
 
- warnings.filterwarnings("ignore")
 
- app = FastAPI(title="3GPP Document Finder API",
-               description="API to find 3GPP documents based on TSG document IDs")
 
- app.mount("/static", StaticFiles(directory="static"), name="static")
 
- origins = [
-     "*",
- ]
 
- regex = r"^(\d+[a-z]?(?:\.\d+)*)\t[\ \S]+$"
 
  app.add_middleware(
      CORSMiddleware,
-     allow_origins=origins,
      allow_credentials=True,
      allow_methods=["*"],
      allow_headers=["*"],
  )
 
- def get_text(specification: str, version: str):
-     """Retrieve the PDF bytes from a specification and a version."""
-     doc_id = specification
-     series = doc_id.split(".")[0]
-
-     response = requests.get(
-         f"https://www.3gpp.org/ftp/Specs/archive/{series}_series/{doc_id}/{doc_id.replace('.', '')}-{version}.zip",
-         verify=False,
-         headers={"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
-     )
-
-     if response.status_code != 200:
-         raise Exception(f"ZIP download failed for {specification}-{version}")
-
-     zip_bytes = io.BytesIO(response.content)
-
-     with zipfile.ZipFile(zip_bytes) as zf:
-         for file_name in zf.namelist():
-             if file_name.endswith("zip"):
-                 print("Another ZIP !")
-                 zip_bytes = io.BytesIO(zf.read(file_name))
-                 zf = zipfile.ZipFile(zip_bytes)
-                 for file_name2 in zf.namelist():
-                     if file_name2.endswith("doc") or file_name2.endswith("docx"):
-                         if "cover" in file_name2.lower():
-                             print("COVER !")
-                             continue
-                         ext = file_name2.split(".")[-1]
-                         doc_bytes = zf.read(file_name2)
-                         temp_id = str(uuid.uuid4())
-                         input_path = f"/tmp/{temp_id}.{ext}"
-                         output_path = f"/tmp/{temp_id}.txt"
-
-                         with open(input_path, "wb") as f:
-                             f.write(doc_bytes)
-
-                         subprocess.run([
-                             "libreoffice",
-                             "--headless",
-                             "--convert-to", "txt",
-                             "--outdir", "/tmp",
-                             input_path
-                         ], check=True)
-
-                         with open(output_path, "r") as f:
-                             txt_data = [line.strip() for line in f if line.strip()]
-
-                         os.remove(input_path)
-                         os.remove(output_path)
-                         return txt_data
-             elif file_name.endswith("doc") or file_name.endswith("docx"):
-                 if "cover" in file_name.lower():
-                     print("COVER !")
-                     continue
-                 ext = file_name.split(".")[-1]
-                 doc_bytes = zf.read(file_name)
-                 temp_id = str(uuid.uuid4())
-                 input_path = f"/tmp/{temp_id}.{ext}"
-                 output_path = f"/tmp/{temp_id}.txt"
-
-                 print("Writing")
-                 with open(input_path, "wb") as f:
-                     f.write(doc_bytes)
-
-                 print("Converting")
-                 subprocess.run([
-                     "libreoffice",
-                     "--headless",
-                     "--convert-to", "txt",
-                     "--outdir", "/tmp",
-                     input_path
-                 ], check=True)
-
-                 print("Writing TXT")
-                 with open(output_path, "r", encoding="utf-8") as f:
-                     txt_data = [line.strip() for line in f if line.strip()]
-
-                 os.remove(input_path)
-                 os.remove(output_path)
-                 return txt_data
-
-     raise Exception(f"No .doc/.docx file found in the ZIP for {specification}-{version}")
-
- def get_scope(specification: str, version: str):
-     try:
-         spec_text = get_text(specification, version)
-         scp_i = 0
-         nxt_i = 0
-         for x in range(len(spec_text)):
-             text = spec_text[x]
-             if re.search(r"scope$", text, flags=re.IGNORECASE):
-                 scp_i = x
-                 nxt_i = scp_i + 10
-             if re.search(r"references$", text, flags=re.IGNORECASE):
-                 nxt_i = x
-
-         return re.sub(r"\s+", " ", " ".join(spec_text[scp_i+1:nxt_i])) if len(spec_text[scp_i+1:nxt_i]) < 2 else "Not found"
-     except Exception as e:
-         traceback.print_exception(e)
-         return "Not found (error)"
 
- def get_spec_content(specification: str, version: str):
-     text = get_text(specification, version)
-     forewords = []
-     for x in range(len(text)):
-         line = text[x]
-         if "Foreword" in line:
-             forewords.append(x)
-         if len(forewords) >= 2:
              break
 
-     toc_brut = text[forewords[1]:]
-     chapters = []
-     for line in toc_brut:
-         x = line.split("\t")
-         m = re.search(regex, line)
-         if m and any(line in c for c in text[forewords[0]:forewords[1]]):
-             chapters.append(line)
-             print(line)
-
-     real_toc_indexes = {}
-
-     for chapter in chapters:
-         x = text.index(chapter)
-         real_toc_indexes[chapter] = x
-
-     document = {}
-     toc = list(real_toc_indexes.keys())
-     index_toc = list(real_toc_indexes.values())
-     curr_index = 0
-     for x in range(1, len(toc)):
-         document[toc[curr_index].replace("\t", " ")] = re.sub(r"[\ \t]+", " ", "\n".join(text[index_toc[curr_index]+1:index_toc[x]]))
-         curr_index = x
-
-     document[toc[curr_index].replace("\t", " ")] = re.sub(r"\s+", " ", " ".join(text[index_toc[curr_index]+1:]))
-     print(len(toc)-1, toc[curr_index], curr_index)
-     return document
-
- def caseSensitive(string: str, sensitive: bool):
-     return string if sensitive else string.lower()
-
- class DocRequest(BaseModel):
-     doc_id: str
-     release: Optional[int] = None
-
- class DocResponse(BaseModel):
-     doc_id: str
-     url: str
-     version: str
-     scope: Optional[str] = None
-     search_time: float
-
- class BatchDocRequest(BaseModel):
-     doc_ids: List[str]
-     release: Optional[int] = None
-
- class BatchDocResponse(BaseModel):
-     results: Dict[str, str]
-     missing: List[str]
-     search_time: float
-
- class KeywordRequest2(BaseModel):
-     keywords: Optional[str] = ""
-     threshold: Optional[int] = 60
-     release: Optional[str] = None
-     working_group: Optional[str] = None
-     spec_type: Optional[Literal["TS", "TR"]] = None
-
- class KeywordRequest(BaseModel):
-     keywords: Optional[str] = ""
-     search_mode: Literal["quick", "deep"]
-     case_sensitive: Optional[bool] = False
-     release: Optional[str] = None
-     working_group: Optional[str] = None
-     spec_type: Optional[Literal["TS", "TR"]] = None
-     mode: Optional[Literal["and", "or"]] = "and"
-
- class KeywordResponse(BaseModel):
-     results: List[Dict[str, Any]]
-     search_time: float
-
- class TsgDocFinder:
-     def __init__(self):
-         self.main_ftp_url = "https://www.3gpp.org/ftp"
-         self.indexer_file = "indexed_docs.json"
-         self.indexer, self.last_indexer_date = self.load_indexer()
-
-     def load_indexer(self):
-         """Load existing index if available"""
-         if os.path.exists(self.indexer_file):
-             with open(self.indexer_file, "r", encoding="utf-8") as f:
-                 x = json.load(f)
-                 return x["docs"], x["last_indexed_date"]
-         return {}, None
-
-     def save_indexer(self):
-         """Save the updated index"""
-         self.last_indexer_date = today.strftime("%d/%m/%Y-%H:%M:%S")
-         with open(self.indexer_file, "w", encoding="utf-8") as f:
-             today = datetime.today()
-             output = {"docs": self.indexer, "last_indexed_date": self.last_indexer_date}
-             json.dump(output, f, indent=4, ensure_ascii=False)
-
-     def get_workgroup(self, doc):
-         main_tsg = "tsg_ct" if doc[0] == "C" else "tsg_sa" if doc[0] == "S" else "tsg_ran" if doc[0] == "R" else None
-         if main_tsg is None:
-             return None, None, None
-         workgroup = f"WG{int(doc[1])}" if doc[1].isnumeric() else main_tsg.upper()
-         return main_tsg, workgroup, doc
-
-     def find_workgroup_url(self, main_tsg, workgroup):
-         """Find the URL for the specific workgroup"""
-         response = requests.get(f"{self.main_ftp_url}/{main_tsg}", verify=False)
-         soup = BeautifulSoup(response.text, 'html.parser')
-
-         for item in soup.find_all("tr"):
-             link = item.find("a")
-             if link and workgroup in link.get_text():
-                 return f"{self.main_ftp_url}/{main_tsg}/{link.get_text()}"
-
-         return f"{self.main_ftp_url}/{main_tsg}/{workgroup}"
 
-     def get_docs_from_url(self, url):
-         """Get list of documents/directories from a URL"""
-         try:
-             response = requests.get(url, verify=False, timeout=10)
-             soup = BeautifulSoup(response.text, "html.parser")
-             return [item.get_text() for item in soup.select("tr td a")]
-         except Exception as e:
-             print(f"Error accessing {url}: {e}")
-             return []
 
-     def search_document(self, doc_id: str, release=None):
-         original_id = doc_id
-
-         if original_id in self.indexer:
-             return self.indexer[original_id]
-         for doc in self.indexer:
-             if doc.startswith(original_id):
-                 return self.indexer[doc]
-
-         # 2. "Classic" live search (TSG/CT)
-         main_tsg, workgroup, doc = self.get_workgroup(doc_id)
-         if main_tsg:
-             wg_url = self.find_workgroup_url(main_tsg, workgroup)
-             if wg_url:
-                 meeting_folders = self.get_docs_from_url(wg_url)
-                 for folder in meeting_folders:
-                     meeting_url = f"{wg_url}/{folder}"
-                     meeting_contents = self.get_docs_from_url(meeting_url)
-                     key = "docs" if "docs" in [x.lower() for x in meeting_contents] else "tdocs" if "tdocs" in [x.lower() for x in meeting_contents] else "tdoc" if "tdoc" in [x.lower() for x in meeting_contents] else None
-                     if key is not None:
-                         docs_url = f"{meeting_url}/{key}"
-                         files = self.get_docs_from_url(docs_url)
-                         for file in files:
-                             if doc in file.lower() or original_id in file:
-                                 doc_url = f"{docs_url}/{file}"
-                                 self.indexer[original_id] = doc_url
-                                 return doc_url
-                         # ZIP subfolder
-                         if "zip" in [x for x in files]:
-                             zip_url = f"{docs_url}/zip"
-                             zip_files = self.get_docs_from_url(zip_url)
-                             for file in zip_files:
-                                 if doc in file.lower() or original_id in file:
-                                     doc_url = f"{zip_url}/{file}"
-                                     self.indexer[original_id] = doc_url
-                                     self.save_indexer()
-                                     return doc_url
-
-         # 3. Last resort: try /ftp/workshop (live search)
-         workshop_url = f"{self.main_ftp_url}/workshop"
-         meetings = self.get_docs_from_url(workshop_url)
-         for meeting in meetings:
-             if meeting in ['./', '../']:
-                 continue
-             meeting_url = f"{workshop_url}/{meeting}"
-             contents = self.get_docs_from_url(meeting_url)
-             for sub in contents:
-                 if sub.lower() in ['docs', 'tdocs']:
-                     docs_url = f"{meeting_url}/{sub}"
-                     files = self.get_docs_from_url(docs_url)
-                     for file in files:
-                         if doc_id.lower() in file.lower() or original_id in file:
-                             doc_url = f"{docs_url}/{file}"
-                             self.indexer[original_id] = doc_url
-                             self.save_indexer()
-                             return doc_url
-                     if "zip" in [x.lower() for x in files]:
-                         zip_url = f"{docs_url}/zip"
-                         zip_files = self.get_docs_from_url(zip_url)
-                         for file in zip_files:
-                             if doc_id.lower() in file.lower() or original_id in file:
-                                 doc_url = f"{zip_url}/{file}"
-                                 self.indexer[original_id] = doc_url
-                                 self.save_indexer()
-                                 return doc_url
 
-         return f"Document {doc_id} not found"
 
-
 
- class SpecDocFinder:
-     def __init__(self):
-         self.chars = "0123456789abcdefghijklmnopqrstuvwxyz"
-         self.indexer_file = "indexed_specifications.json"
-         self.doc_zip = "indexed_docs_content.zip"
-         self.indexer_specs, self.indexer_scopes, self.last_indexer_date = self.load_indexer()
-         self.indexer_documents = self.load_documents()
 
-     def load_indexer(self):
-         """Load existing index if available"""
-         if os.path.exists(self.indexer_file):
-             with open(self.indexer_file, "r", encoding="utf-8") as f:
-                 x = json.load(f)
-                 return x["specs"], x["scopes"], x["last_indexed_date"]
-         return {}, {}, None
-
-     def load_documents(self):
-         if os.path.exists(self.doc_zip):
-             with zipfile.ZipFile(open(self.doc_zip, "rb")) as zf:
-                 for file_name in zf.namelist():
-                     if file_name.endswith(".json"):
-                         doc_bytes = zf.read(file_name)
-                         try:
-                             doc_data = json.loads(doc_bytes.decode("utf-8"))
-                             print("Documents loaded successfully !")
-                             return doc_data
-                         except json.JSONDecodeError as e:
-                             print(f"Error while decoding the JSON file {file_name}: {e}")
-         print("Failed !")
-         return {}
-
-     def get_document(self, spec, version):
-         doc = self.indexer_documents.get(spec)
-         if doc:
-             return doc
-         else:
-             return get_spec_content(spec, version)
 
-     def get_section(self, doc, chapter):
-         return doc[chapter]
 
-     def save_indexer(self):
-         """Save the updated index"""
-         self.last_indexer_date = today.strftime("%d/%m/%Y-%H:%M:%S")
-         with open(self.indexer_file, "w", encoding="utf-8") as f:
-             today = datetime.today()
-             output = {"specs": self.indexer_specs, "scopes": self.indexer_scopes, "last_indexed_date": self.last_indexer_date}
-             json.dump(output, f, indent=4, ensure_ascii=False)
 
-     def get_docs_from_url(self, url):
-         """Get list of documents/directories from a URL"""
-         try:
-             response = requests.get(url, verify=False, timeout=10)
-             soup = BeautifulSoup(response.text, "html.parser")
-             return [item.get_text() for item in soup.select("tr td a")]
-         except Exception as e:
-             print(f"Error accessing {url}: {e}")
-             return []
-
-     def search_document(self, document, release):
-         series = document.split(".")[0].zfill(2)
-         url = f"https://www.3gpp.org/ftp/Specs/archive/{series}_series/{document}"
-         versions = self.get_docs_from_url(url)
-         return url + "/" + versions[-1] if versions != [] else f"Specification {document} not found"
-
- finder_tsg = TsgDocFinder()
- finder_spec = SpecDocFinder()
- lemmatizer = WordNetLemmatizer()
- model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", backend="onnx")
-
- if os.path.exists("bm25s.zip"):
-     with zipfile.ZipFile("bm25s.zip", 'r') as zip_ref:
-         zip_ref.extractall(".")
- bm25_engine = bm25s.BM25.load("3gpp_bm25_docs", load_corpus=True)
-
- @app.get("/")
- async def main_menu():
-     return FileResponse(os.path.join("templates", "index.html"))
-
  @app.post("/search-spec/experimental", response_model=KeywordResponse)
- def search_spec_bm25(request: KeywordRequest2):
      start_time = time.time()
-     release = request.release
      working_group = request.working_group
      spec_type = request.spec_type
      threshold = request.threshold
-     query = lemmatizer.lemmatize(request.keywords)
 
      results_out = []
      query_tokens = bm25s.tokenize(query)
-     results, scores = bm25_engine.retrieve(query_tokens, k=len(bm25_engine.corpus))
 
      def calculate_boosted_score(metadata, score, query):
-         title = {lemmatizer.lemmatize(metadata['title']).lower()}
-         q = {query.lower()}
-         spec_id_presence = 0.5 if len(q & {metadata['id']}) > 0 else 0
          booster = len(q & title) * 0.5
          return score + spec_id_presence + booster
 
@@ -500,12 +247,10 @@ def search_spec_bm25(request: KeywordRequest2):
          spec_details[spec]["normalized_score"] = normalized_scores[spec]
 
      unique_specs = sorted(normalized_scores.keys(), key=lambda x: normalized_scores[x], reverse=True)
-
      for rank, spec in enumerate(unique_specs, 1):
          details = spec_details[spec]
          metadata = details['doc']['metadata']
-         if metadata.get('version', None) is None or (release is not None and metadata["version"].split(".")[0] != str(release)):
-             continue
          if metadata.get('type', None) is None or (spec_type is not None and metadata["type"] != spec_type):
              continue
          if metadata.get('working_group', None) is None or (working_group is not None and metadata["working_group"] != working_group):
@@ -520,120 +265,4 @@ def search_spec_bm25(request: KeywordRequest2):
              search_time=time.time() - start_time
          )
      else:
-         raise HTTPException(status_code=404, detail="Specifications not found")
-
- @app.post("/search-spec", response_model=KeywordResponse)
- def search_spec(request: KeywordRequest):
-     start_time = time.time()
-     booleanLowered = request.case_sensitive
-     search_mode = request.search_mode
-     release = request.release
-     working_group = request.working_group
-     spec_type = request.spec_type
-     kws = [caseSensitive(_, booleanLowered) for _ in request.keywords.split(",")]
-     print(kws)
-     unique_specs = set()
-     results = []
-
-     if kws == [""] and search_mode == "deep":
-         raise HTTPException(status_code=400, detail="You must enter keywords in deep search mode!")
-
-     for string, spec in finder_spec.indexer_specs.items():
-         put = False
-         if spec['id'] in unique_specs:
-             continue
-         if spec.get('version', None) is None or (release is not None and spec["version"].split(".")[0] != str(release)):
-             continue
-         if spec.get('type', None) is None or (spec_type is not None and spec["type"] != spec_type):
-             continue
-         if spec.get('working_group', None) is None or (working_group is not None and spec["working_group"] != working_group):
-             continue
-
-         if kws != "":
-             if search_mode == "deep":
-                 contents = []
-                 version = finder_spec.search_document(spec['id'], spec['release']).split("/")[-1].replace(".zip", "").split("-")[-1]
-                 doc = finder_spec.get_document(spec['id'], version)
-                 docValid = not isinstance(doc, str)
-
-             if request.mode == "and":
-                 if all(kw in caseSensitive(string, booleanLowered) for kw in kws):
-                     put = True
-                 if search_mode == "deep":
-                     if docValid:
-                         for chapter in list(doc.keys())[1:]:
-                             if "references" not in chapter.lower() and "void" not in chapter.lower() and "annex" not in doc[chapter].lower():
-                                 if all(kw in caseSensitive(doc[chapter], booleanLowered) for kw in kws):
-                                     put = True
-                                     contents.append(chapter)
-             elif request.mode == "or":
-                 if any(kw in caseSensitive(string, booleanLowered) for kw in kws):
-                     put = True
-                 if search_mode == "deep":
-                     if docValid:
-                         for chapter in list(doc.keys())[1:]:
-                             if "references" not in chapter.lower() and "void" not in chapter.lower() and "annex" not in doc[chapter].lower():
-                                 if any(kw in caseSensitive(doc[chapter], booleanLowered) for kw in kws):
-                                     put = True
-                                     contents.append(chapter)
-         else:
-             put = True
-
-         if put:
-             spec_content = spec
-             if search_mode == "deep":
-                 spec_content["contains"] = {chap: doc[chap] for chap in contents}
-             results.append(spec_content)
-         else:
-             unique_specs.add(spec['id'])
-     if len(results) > 0:
-         return KeywordResponse(
-             results=results,
-             search_time=time.time() - start_time
-         )
-     else:
-         raise HTTPException(status_code=404, detail="Specifications not found")
-
- @app.post("/find", response_model=DocResponse)
- def find_document(request: DocRequest):
-     start_time = time.time()
-     finder = finder_tsg if request.doc_id[0].isalpha() else finder_spec
-     result = finder.search_document(request.doc_id, request.release)
-
-     if "not found" not in result and "Could not" not in result and "Unable" not in result:
-         version = result.split("/")[-1].replace(".zip", "").split("-")[-1]
-         return DocResponse(
-             doc_id=request.doc_id,
-             version=version,
-             url=result,
-             search_time=time.time() - start_time
-         ) if isinstance(finder, TsgDocFinder) else DocResponse(
-             doc_id=request.doc_id,
-             version=version,
-             url=result,
-             search_time=time.time() - start_time,
-             scope=finder.indexer_scopes[request.doc_id] if request.doc_id in finder.indexer_scopes else get_scope(request.doc_id, version)
-         )
-     else:
-         raise HTTPException(status_code=404, detail=result)
-
- @app.post("/batch", response_model=BatchDocResponse)
- def find_documents_batch(request: BatchDocRequest):
-     start_time = time.time()
-
-     results = {}
-     missing = []
-
-     for doc_id in request.doc_ids:
-         finder = finder_tsg if doc_id[0].isalpha() else finder_spec
-         result = finder.search_document(doc_id)
-         if "not found" not in result and "Could not" not in result and "Unable" not in result:
-             results[doc_id] = result
-         else:
-             missing.append(doc_id)
-
-     return BatchDocResponse(
-         results=results,
-         missing=missing,
-         search_time=time.time() - start_time
-     )
  import time
  from datetime import datetime
+ import os, warnings, nltk, json, subprocess
+ import numpy as np
+ from nltk.stem import WordNetLemmatizer
  from dotenv import load_dotenv
+ from sklearn.preprocessing import MinMaxScaler
+
+ os.environ['CURL_CA_BUNDLE'] = ""
+ warnings.filterwarnings('ignore')
+ nltk.download('wordnet')
+ load_dotenv()
+
+ from datasets import load_dataset
+ import bm25s
+ from bm25s.hf import BM25HF
+
  from fastapi import FastAPI, HTTPException
  from fastapi.middleware.cors import CORSMiddleware
  from fastapi.responses import FileResponse
  from fastapi.staticfiles import StaticFiles
+ from schemas import *
 
+ from bs4 import BeautifulSoup
+ import requests
 
+ lemmatizer = WordNetLemmatizer()
 
+ spec_metadatas = load_dataset("OrganizedProgrammers/3GPPSpecMetadata", token=os.environ["HF_TOKEN"])
+ spec_contents = load_dataset("OrganizedProgrammers/3GPPSpecContent", token=os.environ["HF_TOKEN"])
+ tdoc_locations = load_dataset("OrganizedProgrammers/3GPPTDocLocation", token=os.environ["HF_TOKEN"])
+ bm25_index = BM25HF.load_from_hub("OrganizedProgrammers/3GPPBM25IndexSingle", load_corpus=True, token=os.environ["HF_TOKEN"])
 
+ spec_metadatas = spec_metadatas["train"].to_list()
+ spec_contents = spec_contents["train"].to_list()
+ tdoc_locations = tdoc_locations["train"].to_list()
 
+ def get_docs_from_url(url):
+     """Get list of documents/directories from a URL"""
+     try:
+         response = requests.get(url, verify=False, timeout=10)
+         soup = BeautifulSoup(response.text, "html.parser")
+         return [item.get_text() for item in soup.select("tr td a")]
+     except Exception as e:
+         print(f"Error accessing {url}: {e}")
+         return []
 
+ def get_tdoc_url(doc_id):
+     for tdoc in tdoc_locations:
+         if tdoc["doc_id"] == doc_id:
+             return tdoc["url"]
+
+ def get_spec_url(document):
+     series = document.split(".")[0].zfill(2)
+     url = f"https://www.3gpp.org/ftp/Specs/archive/{series}_series/{document}"
+     versions = get_docs_from_url(url)
+     return url + "/" + versions[-1] if versions != [] else f"Specification {document} not found"
+
+ def get_document(spec_id: str, spec_title: str):
+     text = [f"{spec_id} - {spec_title}"]
+     for section in spec_contents:
+         if spec_id == section["doc_id"]:
+             text.extend([section['section'], section['content']])
+     return text
+
+ app = FastAPI(title="3GPP Document Finder Back-End", description="Backend for 3GPPDocFinder - Searching technical documents & specifications from 3GPP FTP server")
+ app.mount("/static", StaticFiles(directory="static"), name="static")
  app.add_middleware(
      CORSMiddleware,
+     allow_origins=["*"],
      allow_credentials=True,
      allow_methods=["*"],
      allow_headers=["*"],
  )
 
+ @app.get("/")
+ def index():
+     return FileResponse(os.path.join('templates', 'index.html'))
 
+ @app.post("/find", response_model=DocResponse)
+ def find_document(request: DocRequest):
+     start_time = time.time()
+     document = request.doc_id
+     url = get_tdoc_url(document) if document[0].isalpha() else get_spec_url(document)
+     if "Specification" in url or "Document" in url:
+         raise HTTPException(status_code=404, detail=url)
+
+     version = url.split("/")[-1].replace(".zip", "").split("-")[-1]
+     scope = None
+     for spec in spec_metadatas:
+         if spec['id'] == document:
+             scope = spec.get('scope')
              break
+     return DocResponse(
+         doc_id=document,
+         version=version,
+         url=url,
+         search_time=time.time() - start_time,
+         scope=scope
+     )
 
+ @app.post("/batch", response_model=BatchDocResponse)
+ def find_multiple_documents(request: BatchDocRequest):
+     start_time = time.time()
+     documents = request.doc_ids
+     results = {}
+     missing = []
 
+     for document in documents:
+         url = get_tdoc_url(document) if document[0].isalpha() else get_spec_url(document)
+         if "Specification" not in url and "Document" not in url:
+             results[document] = url
+         else:
+             missing.append(document)
 
+     return BatchDocResponse(
+         results=results,
+         missing=missing,
+         search_time=time.time()-start_time
+     )
 
+ @app.post("/search-spec", response_model=KeywordResponse)
+ def search_specification_by_keywords(request: KeywordRequest):
+     start_time = time.time()
+     boolSensitiveCase = request.case_sensitive
+     search_mode = request.search_mode
+     working_group = request.working_group
+     spec_type = request.spec_type
+     keywords = [string if boolSensitiveCase else string.lower() for string in request.keywords.split(",")]
+     print(keywords)
+     unique_specs = set()
+     results = []
 
+     if keywords == [""] and search_mode == "deep":
+         raise HTTPException(status_code=400, detail="You must enter keywords in deep search mode!")
 
+     for spec in spec_metadatas:
+         valid = False
+         if spec['id'] in unique_specs: continue
+         if spec.get('type', None) is None or (spec_type is not None and spec["type"] != spec_type): continue
+         if spec.get('working_group', None) is None or (working_group is not None and spec["working_group"] != working_group): continue
 
+         if search_mode == "deep":
+             contents = []
+             doc = get_document(spec["id"], spec["title"])
+             docValid = len(doc) > 1
 
+         if request.mode == "and":
+             string = f"{spec['id']}+-+{spec['title']}+-+{spec['type']}+-+{spec['version']}+-+{spec['working_group']}"
+             if all(keyword in (string if boolSensitiveCase else string.lower()) for keyword in keywords):
+                 valid = True
+             if search_mode == "deep":
+                 if docValid:
+                     for x in range(1, len(doc) - 1, 2):
+                         section_title = doc[x]
+                         section_content = doc[x+1]
+                         if "reference" not in section_title.lower() and "void" not in section_title.lower() and "annex" not in section_content.lower():
+                             if all(keyword in (section_content if boolSensitiveCase else section_content.lower()) for keyword in keywords):
+                                 valid = True
+                                 contents.append({section_title: section_content})
+         elif request.mode == "or":
+             string = f"{spec['id']}+-+{spec['title']}+-+{spec['type']}+-+{spec['version']}+-+{spec['working_group']}"
+             if any(keyword in (string if boolSensitiveCase else string.lower()) for keyword in keywords):
+                 valid = True
+             if search_mode == "deep":
+                 if docValid:
+                     for x in range(1, len(doc) - 1, 2):
+                         section_title = doc[x]
+                         section_content = doc[x+1]
+                         if "reference" not in section_title.lower() and "void" not in section_title.lower() and "annex" not in section_content.lower():
+                             if any(keyword in (section_content if boolSensitiveCase else section_content.lower()) for keyword in keywords):
+                                 valid = True
+                                 contents.append({section_title: section_content})
+         if valid:
+             spec_content = spec
+             if search_mode == "deep":
+                 spec_content["contains"] = {k: v for d in contents for k, v in d.items()}
+             results.append(spec_content)
+         else:
+             unique_specs.add(spec['id'])
 
+     if len(results) > 0:
+         return KeywordResponse(
+             results=results,
+             search_time=time.time() - start_time
+         )
+     else:
+         raise HTTPException(status_code=404, detail="Specifications not found")
 
  @app.post("/search-spec/experimental", response_model=KeywordResponse)
+ def bm25_search_specification(request: BM25KeywordRequest):
      start_time = time.time()
      working_group = request.working_group
      spec_type = request.spec_type
      threshold = request.threshold
+     query = request.keywords
 
      results_out = []
      query_tokens = bm25s.tokenize(query)
+     results, scores = bm25_index.retrieve(query_tokens, k=len(bm25_index.corpus))
+     print("BM25 raw scores:", scores)
 
      def calculate_boosted_score(metadata, score, query):
+         title = set(metadata['title'].lower().split())
+         q = set(query.lower().split())
+         spec_id_presence = 0.5 if metadata['id'].lower() in q else 0
          booster = len(q & title) * 0.5
          return score + spec_id_presence + booster
 
@@ -500,12 +247,10 @@
          spec_details[spec]["normalized_score"] = normalized_scores[spec]
 
      unique_specs = sorted(normalized_scores.keys(), key=lambda x: normalized_scores[x], reverse=True)
+
      for rank, spec in enumerate(unique_specs, 1):
          details = spec_details[spec]
          metadata = details['doc']['metadata']
          if metadata.get('type', None) is None or (spec_type is not None and metadata["type"] != spec_type):
              continue
          if metadata.get('working_group', None) is None or (working_group is not None and metadata["working_group"] != working_group):
@@ -520,120 +265,4 @@
              search_time=time.time() - start_time
          )
      else:
+         raise HTTPException(status_code=404, detail="Specifications not found")
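To try the refactored endpoints end to end, a minimal client sketch follows (assumptions: the app is served locally, e.g. with uvicorn app:app --port 8000, and "23.501" is only an example specification number; the payload fields come from schemas.py):

    import requests

    BASE = "http://localhost:8000"  # assumed local dev server

    # /find: numeric IDs are resolved against the 3GPP spec archive,
    # letter-prefixed IDs against the TDoc location dataset
    print(requests.post(f"{BASE}/find", json={"doc_id": "23.501"}).json())

    # /search-spec: "quick" matches metadata only; "deep" also scans section contents
    payload = {"keywords": "network slicing", "search_mode": "quick", "mode": "and"}
    print(requests.post(f"{BASE}/search-spec", json=payload).json())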
indexed_docs.json DELETED
@@ -1,8 +0,0 @@
- {
-     "S4-110084": "https://www.3gpp.org/ftp/tsg_sa/WG4_CODEC/TSGS4_62/Docs/S4-110084.zip",
-     "SP-000182": "https://www.3gpp.org/ftp/tsg_sa/TSG_SA/TSGS_08/Docs/ZIP/SP-000182.zip",
-     "SP-000183": "https://www.3gpp.org/ftp/tsg_sa/TSG_SA/TSGS_08/Docs/ZIP/SP-000183.zip",
-     "SP-000184": "https://www.3gpp.org/ftp/tsg_sa/TSG_SA/TSGS_08/Docs/ZIP/SP-000184.zip",
-     "SP-000185": "https://www.3gpp.org/ftp/tsg_sa/TSG_SA/TSGS_08/Docs/ZIP/SP-000185.zip",
-     "SP-090017": "https://www.3gpp.org/ftp/tsg_sa/TSG_SA/TSGS_43/Docs/SP-090017.zip"
- }
requirements.txt CHANGED
@@ -1,21 +1,12 @@
- fastapi
- uvicorn[standard]
- requests
- beautifulsoup4
- pydantic
- psycopg2-binary
  numpy
- pandas
- pymupdf
  python-dotenv
- lxml
+ scikit-learn
  nltk
  bm25s[full]
- scikit-learn
- faiss-cpu
- sentence-transformers[onnx]
- transformers
- accelerate
- peft
- huggingface_hub
- openai
+ jax[cpu]
+ datasets
+ fastapi
+ uvicorn[standard]
+ beautifulsoup4
+ requests
+ pydantic
schemas.py ADDED
@@ -0,0 +1,40 @@
+ from pydantic import BaseModel
+ from typing import *
+
+ class DocRequest(BaseModel):
+     doc_id: str
+
+ class DocResponse(BaseModel):
+     doc_id: str
+     url: str
+     version: str
+     scope: Optional[str] = None
+     search_time: float
+
+ class BatchDocRequest(BaseModel):
+     doc_ids: List[str]
+
+ class BatchDocResponse(BaseModel):
+     results: Dict[str, str]
+     missing: List[str]
+     search_time: float
+
+ class BM25KeywordRequest(BaseModel):
+     keywords: Optional[str] = ""
+     threshold: Optional[int] = 60
+     release: Optional[str] = None
+     working_group: Optional[str] = None
+     spec_type: Optional[Literal["TS", "TR"]] = None
+
+ class KeywordRequest(BaseModel):
+     keywords: Optional[str] = ""
+     search_mode: Literal["quick", "deep"]
+     case_sensitive: Optional[bool] = False
+     release: Optional[str] = None
+     working_group: Optional[str] = None
+     spec_type: Optional[Literal["TS", "TR"]] = None
+     mode: Optional[Literal["and", "or"]] = "and"
+
+ class KeywordResponse(BaseModel):
+     results: List[Dict[str, Any]]
+     search_time: float
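A quick way to sanity-check the new models is to instantiate them directly, since validation runs at construction time (the values below are illustrative, and model_dump() assumes pydantic v2 — use .dict() on v1):

    from schemas import KeywordRequest, BatchDocRequest

    req = KeywordRequest(keywords="5G,handover", search_mode="deep", mode="or")
    print(req.model_dump())  # pydantic v2; on v1 use req.dict()

    batch = BatchDocRequest(doc_ids=["23.501", "S4-110084"])
    print(batch.doc_ids)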
static/script.js CHANGED
@@ -354,7 +354,6 @@ function displayKeywordResults(data, mode) {
      <div class="result-url">
          <p>Title: ${spec.title}</p>
          <p>Type: ${spec.type}</p>
-         <p>Release: ${spec.release}</p>
          <p>Version: ${spec.version}</p>
          <p>WG: ${spec.working_group}</p>
          <p>URL: <a target="_blank" href="${spec.url}">${spec.url}</a></p>
@@ -430,30 +429,6 @@ function openSectionPopup(specId, sections) {
      newTab.document.open();
      newTab.document.write(htmlContent);
      newTab.document.close()
-     // popupTitle.textContent = `Sections of specification ${specId}`;
-
-     // popupTextareas.innerHTML = '';
-     // Object.entries(sections).forEach(([section, content], index) => {
-     //     const container = document.createElement("div");
-     //     container.className = "textarea-container";
-
-     //     const textarea = document.createElement("textarea");
-     //     textarea.id = `section-${index}`;
-     //     textarea.value = `${section}\n\n${content}`
-     //     textarea.readOnly = true;
-
-     //     const copyBtn = document.createElement('button');
-     //     copyBtn.className = 'copy-btn';
-     //     copyBtn.textContent = 'Copy';
-     //     copyBtn.onclick = () => copyTextarea(`section-${index}`);
-
-     //     container.appendChild(textarea);
-     //     container.appendChild(copyBtn);
-     //     popupTextareas.appendChild(container);
-     // });
-
-     // sectionPopup.style.display = 'block';
-     // document.body.style.overflow = 'hidden';
  }
 
  // Display batch results
@@ -534,6 +509,6 @@ keywordInput.addEventListener('keypress', (event)=>{
 
  expKeywordInput.addEventListener('keypress', (event)=>{
      if (event.key === "Enter"){
-         keywordSearchBtn.click();
+         expKeywordSearchBtn.click();
      }
  })