om4r932 committed
Commit 48e4b27 · 1 Parent(s): 8114183

Files changed (5):
  1. app.py +166 -537
  2. indexed_docs.json +0 -8
  3. requirements.txt +8 -17
  4. schemas.py +40 -0
  5. static/script.js +1 -26
app.py CHANGED
@@ -1,462 +1,209 @@
- from io import StringIO
- import bm25s
- import numpy as np
- import pandas as pd
- import faiss
- import requests
- from bs4 import BeautifulSoup
- import json
- import os
- import traceback
- import uuid
- import zipfile
- import io
- import openai
- import httpx
- import subprocess
- import os
- import re
  import time
  from datetime import datetime
  from dotenv import load_dotenv
- import warnings
  from fastapi import FastAPI, HTTPException
  from fastapi.middleware.cors import CORSMiddleware
- import nltk
- from nltk.stem import WordNetLemmatizer
  from fastapi.responses import FileResponse
  from fastapi.staticfiles import StaticFiles
- from pydantic import BaseModel
- from typing import Any, Dict, List, Literal, Optional
- os.environ['CURL_CA_BUNDLE'] = ''
- from sentence_transformers import SentenceTransformer
- import warnings
-
- warnings.filterwarnings("ignore")
-
- from sklearn.preprocessing import MinMaxScaler
- nltk.download("wordnet")
- load_dotenv()
 
- warnings.filterwarnings("ignore")
 
- app = FastAPI(title="3GPP Document Finder API",
-               description="API to find 3GPP documents based on TSG document IDs")
 
- app.mount("/static", StaticFiles(directory="static"), name="static")
 
- origins = [
-     "*",
- ]
 
- regex = r"^(\d+[a-z]?(?:\.\d+)*)\t[\ \S]+$"
 
  app.add_middleware(
      CORSMiddleware,
-     allow_origins=origins,
      allow_credentials=True,
      allow_methods=["*"],
      allow_headers=["*"],
  )
 
- def get_text(specification: str, version: str):
-     """Retrieve the PDF bytes from a specification and a version."""
-     doc_id = specification
-     series = doc_id.split(".")[0]
-
-     response = requests.get(
-         f"https://www.3gpp.org/ftp/Specs/archive/{series}_series/{doc_id}/{doc_id.replace('.', '')}-{version}.zip",
-         verify=False,
-         headers={"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
-     )
-
-     if response.status_code != 200:
-         raise Exception(f"ZIP download failed for {specification}-{version}")
-
-     zip_bytes = io.BytesIO(response.content)
-
-     with zipfile.ZipFile(zip_bytes) as zf:
-         for file_name in zf.namelist():
-             if file_name.endswith("zip"):
-                 print("Another ZIP !")
-                 zip_bytes = io.BytesIO(zf.read(file_name))
-                 zf = zipfile.ZipFile(zip_bytes)
-                 for file_name2 in zf.namelist():
-                     if file_name2.endswith("doc") or file_name2.endswith("docx"):
-                         if "cover" in file_name2.lower():
-                             print("COVER !")
-                             continue
-                         ext = file_name2.split(".")[-1]
-                         doc_bytes = zf.read(file_name2)
-                         temp_id = str(uuid.uuid4())
-                         input_path = f"/tmp/{temp_id}.{ext}"
-                         output_path = f"/tmp/{temp_id}.txt"
-
-                         with open(input_path, "wb") as f:
-                             f.write(doc_bytes)
-
-                         subprocess.run([
-                             "libreoffice",
-                             "--headless",
-                             "--convert-to", "txt",
-                             "--outdir", "/tmp",
-                             input_path
-                         ], check=True)
-
-                         with open(output_path, "r") as f:
-                             txt_data = [line.strip() for line in f if line.strip()]
-
-                         os.remove(input_path)
-                         os.remove(output_path)
-                         return txt_data
-             elif file_name.endswith("doc") or file_name.endswith("docx"):
-                 if "cover" in file_name.lower():
-                     print("COVER !")
-                     continue
-                 ext = file_name.split(".")[-1]
-                 doc_bytes = zf.read(file_name)
-                 temp_id = str(uuid.uuid4())
-                 input_path = f"/tmp/{temp_id}.{ext}"
-                 output_path = f"/tmp/{temp_id}.txt"
-
-                 print("Writing")
-                 with open(input_path, "wb") as f:
-                     f.write(doc_bytes)
-
-                 print("Converting")
-                 subprocess.run([
-                     "libreoffice",
-                     "--headless",
-                     "--convert-to", "txt",
-                     "--outdir", "/tmp",
-                     input_path
-                 ], check=True)
-
-                 print("Writing TXT")
-                 with open(output_path, "r", encoding="utf-8") as f:
-                     txt_data = [line.strip() for line in f if line.strip()]
-
-                 os.remove(input_path)
-                 os.remove(output_path)
-                 return txt_data
-
-     raise Exception(f"No .doc/.docx file found in the ZIP for {specification}-{version}")
-
- def get_scope(specification: str, version: str):
-     try:
-         spec_text = get_text(specification, version)
-         scp_i = 0
-         nxt_i = 0
-         for x in range(len(spec_text)):
-             text = spec_text[x]
-             if re.search(r"scope$", text, flags=re.IGNORECASE):
-                 scp_i = x
-                 nxt_i = scp_i + 10
-             if re.search(r"references$", text, flags=re.IGNORECASE):
-                 nxt_i = x
-
-         return re.sub(r"\s+", " ", " ".join(spec_text[scp_i+1:nxt_i])) if len(spec_text[scp_i+1:nxt_i]) < 2 else "Not found"
-     except Exception as e:
-         traceback.print_exception(e)
-         return "Not found (error)"
 
- def get_spec_content(specification: str, version: str):
-     text = get_text(specification, version)
-     forewords = []
-     for x in range(len(text)):
-         line = text[x]
-         if "Foreword" in line:
-             forewords.append(x)
-         if len(forewords) >= 2:
              break
 
-     toc_brut = text[forewords[1]:]
-     chapters = []
-     for line in toc_brut:
-         x = line.split("\t")
-         m = re.search(regex, line)
-         if m and any(line in c for c in text[forewords[0]:forewords[1]]):
-             chapters.append(line)
-             print(line)
-
-     real_toc_indexes = {}
-
-     for chapter in chapters:
-         x = text.index(chapter)
-         real_toc_indexes[chapter] = x
-
-     document = {}
-     toc = list(real_toc_indexes.keys())
-     index_toc = list(real_toc_indexes.values())
-     curr_index = 0
-     for x in range(1, len(toc)):
-         document[toc[curr_index].replace("\t", " ")] = re.sub(r"[\ \t]+", " ", "\n".join(text[index_toc[curr_index]+1:index_toc[x]]))
-         curr_index = x
-
-     document[toc[curr_index].replace("\t", " ")] = re.sub(r"\s+", " ", " ".join(text[index_toc[curr_index]+1:]))
-     print(len(toc)-1, toc[curr_index], curr_index)
-     return document
-
- def caseSensitive(string: str, sensitive: bool):
-     return string if sensitive else string.lower()
-
- class DocRequest(BaseModel):
-     doc_id: str
-     release: Optional[int] = None
-
- class DocResponse(BaseModel):
-     doc_id: str
-     url: str
-     version: str
-     scope: Optional[str] = None
-     search_time: float
-
- class BatchDocRequest(BaseModel):
-     doc_ids: List[str]
-     release: Optional[int] = None
-
- class BatchDocResponse(BaseModel):
-     results: Dict[str, str]
-     missing: List[str]
-     search_time: float
-
- class KeywordRequest2(BaseModel):
-     keywords: Optional[str] = ""
-     threshold: Optional[int] = 60
-     release: Optional[str] = None
-     working_group: Optional[str] = None
-     spec_type: Optional[Literal["TS", "TR"]] = None
-
- class KeywordRequest(BaseModel):
-     keywords: Optional[str] = ""
-     search_mode: Literal["quick", "deep"]
-     case_sensitive: Optional[bool] = False
-     release: Optional[str] = None
-     working_group: Optional[str] = None
-     spec_type: Optional[Literal["TS", "TR"]] = None
-     mode: Optional[Literal["and", "or"]] = "and"
-
- class KeywordResponse(BaseModel):
-     results: List[Dict[str, Any]]
-     search_time: float
-
- class TsgDocFinder:
-     def __init__(self):
-         self.main_ftp_url = "https://www.3gpp.org/ftp"
-         self.indexer_file = "indexed_docs.json"
-         self.indexer, self.last_indexer_date = self.load_indexer()
-
-     def load_indexer(self):
-         """Load existing index if available"""
-         if os.path.exists(self.indexer_file):
-             with open(self.indexer_file, "r", encoding="utf-8") as f:
-                 x = json.load(f)
-                 return x["docs"], x["last_indexed_date"]
-         return {}, None
-
-     def save_indexer(self):
-         """Save the updated index"""
-         self.last_indexer_date = today.strftime("%d/%m/%Y-%H:%M:%S")
-         with open(self.indexer_file, "w", encoding="utf-8") as f:
-             today = datetime.today()
-             output = {"docs": self.indexer, "last_indexed_date": self.last_indexer_date}
-             json.dump(output, f, indent=4, ensure_ascii=False)
-
-     def get_workgroup(self, doc):
-         main_tsg = "tsg_ct" if doc[0] == "C" else "tsg_sa" if doc[0] == "S" else "tsg_ran" if doc[0] == "R" else None
-         if main_tsg is None:
-             return None, None, None
-         workgroup = f"WG{int(doc[1])}" if doc[1].isnumeric() else main_tsg.upper()
-         return main_tsg, workgroup, doc
-
-     def find_workgroup_url(self, main_tsg, workgroup):
-         """Find the URL for the specific workgroup"""
-         response = requests.get(f"{self.main_ftp_url}/{main_tsg}", verify=False)
-         soup = BeautifulSoup(response.text, 'html.parser')
-
-         for item in soup.find_all("tr"):
-             link = item.find("a")
-             if link and workgroup in link.get_text():
-                 return f"{self.main_ftp_url}/{main_tsg}/{link.get_text()}"
-
-         return f"{self.main_ftp_url}/{main_tsg}/{workgroup}"
 
-     def get_docs_from_url(self, url):
-         """Get list of documents/directories from a URL"""
-         try:
-             response = requests.get(url, verify=False, timeout=10)
-             soup = BeautifulSoup(response.text, "html.parser")
-             return [item.get_text() for item in soup.select("tr td a")]
-         except Exception as e:
-             print(f"Error accessing {url}: {e}")
-             return []
 
-     def search_document(self, doc_id: str, release=None):
-         original_id = doc_id
-
-         if original_id in self.indexer:
-             return self.indexer[original_id]
-         for doc in self.indexer:
-             if doc.startswith(original_id):
-                 return self.indexer[doc]
-
-         # 2. "Classic" live search (TSG/CT)
-         main_tsg, workgroup, doc = self.get_workgroup(doc_id)
-         if main_tsg:
-             wg_url = self.find_workgroup_url(main_tsg, workgroup)
-             if wg_url:
-                 meeting_folders = self.get_docs_from_url(wg_url)
-                 for folder in meeting_folders:
-                     meeting_url = f"{wg_url}/{folder}"
-                     meeting_contents = self.get_docs_from_url(meeting_url)
-                     key = "docs" if "docs" in [x.lower() for x in meeting_contents] else "tdocs" if "tdocs" in [x.lower() for x in meeting_contents] else "tdoc" if "tdoc" in [x.lower() for x in meeting_contents] else None
-                     if key is not None:
-                         docs_url = f"{meeting_url}/{key}"
-                         files = self.get_docs_from_url(docs_url)
-                         for file in files:
-                             if doc in file.lower() or original_id in file:
-                                 doc_url = f"{docs_url}/{file}"
-                                 self.indexer[original_id] = doc_url
-                                 return doc_url
-                         # ZIP subfolder
-                         if "zip" in [x for x in files]:
-                             zip_url = f"{docs_url}/zip"
-                             zip_files = self.get_docs_from_url(zip_url)
-                             for file in zip_files:
-                                 if doc in file.lower() or original_id in file:
-                                     doc_url = f"{zip_url}/{file}"
-                                     self.indexer[original_id] = doc_url
-                                     self.save_indexer()
-                                     return doc_url
-
-         # 3. Last resort: try /ftp/workshop (live search)
-         workshop_url = f"{self.main_ftp_url}/workshop"
-         meetings = self.get_docs_from_url(workshop_url)
-         for meeting in meetings:
-             if meeting in ['./', '../']:
-                 continue
-             meeting_url = f"{workshop_url}/{meeting}"
-             contents = self.get_docs_from_url(meeting_url)
-             for sub in contents:
-                 if sub.lower() in ['docs', 'tdocs']:
-                     docs_url = f"{meeting_url}/{sub}"
-                     files = self.get_docs_from_url(docs_url)
-                     for file in files:
-                         if doc_id.lower() in file.lower() or original_id in file:
-                             doc_url = f"{docs_url}/{file}"
-                             self.indexer[original_id] = doc_url
-                             self.save_indexer()
-                             return doc_url
-                     if "zip" in [x.lower() for x in files]:
-                         zip_url = f"{docs_url}/zip"
-                         zip_files = self.get_docs_from_url(zip_url)
-                         for file in zip_files:
-                             if doc_id.lower() in file.lower() or original_id in file:
-                                 doc_url = f"{zip_url}/{file}"
-                                 self.indexer[original_id] = doc_url
-                                 self.save_indexer()
-                                 return doc_url
 
-         return f"Document {doc_id} not found"
 
-
 
- class SpecDocFinder:
-     def __init__(self):
-         self.chars = "0123456789abcdefghijklmnopqrstuvwxyz"
-         self.indexer_file = "indexed_specifications.json"
-         self.doc_zip = "indexed_docs_content.zip"
-         self.indexer_specs, self.indexer_scopes, self.last_indexer_date = self.load_indexer()
-         self.indexer_documents = self.load_documents()
 
-     def load_indexer(self):
-         """Load existing index if available"""
-         if os.path.exists(self.indexer_file):
-             with open(self.indexer_file, "r", encoding="utf-8") as f:
-                 x = json.load(f)
-                 return x["specs"], x["scopes"], x["last_indexed_date"]
-         return {}, {}, None
-
-     def load_documents(self):
-         if os.path.exists(self.doc_zip):
-             with zipfile.ZipFile(open(self.doc_zip, "rb")) as zf:
-                 for file_name in zf.namelist():
-                     if file_name.endswith(".json"):
-                         doc_bytes = zf.read(file_name)
-                         try:
-                             doc_data = json.loads(doc_bytes.decode("utf-8"))
-                             print("Documents loaded successfully !")
-                             return doc_data
-                         except json.JSONDecodeError as e:
-                             print(f"Error while decoding the JSON file {file_name}: {e}")
-         print("Failed !")
-         return {}
-
-     def get_document(self, spec, version):
-         doc = self.indexer_documents.get(spec)
-         if doc:
-             return doc
-         else:
-             return get_spec_content(spec, version)
 
-     def get_section(self, doc, chapter):
-         return doc[chapter]
 
-     def save_indexer(self):
-         """Save the updated index"""
-         self.last_indexer_date = today.strftime("%d/%m/%Y-%H:%M:%S")
-         with open(self.indexer_file, "w", encoding="utf-8") as f:
-             today = datetime.today()
-             output = {"specs": self.indexer_specs, "scopes": self.indexer_scopes, "last_indexed_date": self.last_indexer_date}
-             json.dump(output, f, indent=4, ensure_ascii=False)
 
-     def get_docs_from_url(self, url):
-         """Get list of documents/directories from a URL"""
-         try:
-             response = requests.get(url, verify=False, timeout=10)
-             soup = BeautifulSoup(response.text, "html.parser")
-             return [item.get_text() for item in soup.select("tr td a")]
-         except Exception as e:
-             print(f"Error accessing {url}: {e}")
-             return []
-
-     def search_document(self, document, release):
-         series = document.split(".")[0].zfill(2)
-         url = f"https://www.3gpp.org/ftp/Specs/archive/{series}_series/{document}"
-         versions = self.get_docs_from_url(url)
-         return url + "/" + versions[-1] if versions != [] else f"Specification {document} not found"
-
- finder_tsg = TsgDocFinder()
- finder_spec = SpecDocFinder()
- lemmatizer = WordNetLemmatizer()
- model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", backend="onnx")
-
- if os.path.exists("bm25s.zip"):
-     with zipfile.ZipFile("bm25s.zip", 'r') as zip_ref:
-         zip_ref.extractall(".")
- bm25_engine = bm25s.BM25.load("3gpp_bm25_docs", load_corpus=True)
-
- @app.get("/")
- async def main_menu():
-     return FileResponse(os.path.join("templates", "index.html"))
-
  @app.post("/search-spec/experimental", response_model=KeywordResponse)
- def search_spec_bm25(request: KeywordRequest2):
      start_time = time.time()
-     release = request.release
      working_group = request.working_group
      spec_type = request.spec_type
      threshold = request.threshold
-     query = lemmatizer.lemmatize(request.keywords)
 
      results_out = []
      query_tokens = bm25s.tokenize(query)
-     results, scores = bm25_engine.retrieve(query_tokens, k=len(bm25_engine.corpus))
 
      def calculate_boosted_score(metadata, score, query):
-         title = {lemmatizer.lemmatize(metadata['title']).lower()}
-         q = {query.lower()}
-         spec_id_presence = 0.5 if len(q & {metadata['id']}) > 0 else 0
          booster = len(q & title) * 0.5
          return score + spec_id_presence + booster
 
@@ -500,12 +247,10 @@ def search_spec_bm25(request: KeywordRequest2):
          spec_details[spec]["normalized_score"] = normalized_scores[spec]
 
      unique_specs = sorted(normalized_scores.keys(), key=lambda x: normalized_scores[x], reverse=True)
-
      for rank, spec in enumerate(unique_specs, 1):
          details = spec_details[spec]
          metadata = details['doc']['metadata']
-         if metadata.get('version', None) is None or (release is not None and metadata["version"].split(".")[0] != str(release)):
-             continue
          if metadata.get('type', None) is None or (spec_type is not None and metadata["type"] != spec_type):
              continue
          if metadata.get('working_group', None) is None or (working_group is not None and metadata["working_group"] != working_group):
@@ -520,120 +265,4 @@ def search_spec_bm25(request: KeywordRequest2):
              search_time=time.time() - start_time
          )
      else:
-         raise HTTPException(status_code=404, detail="Specifications not found")
-
- @app.post("/search-spec", response_model=KeywordResponse)
- def search_spec(request: KeywordRequest):
-     start_time = time.time()
-     booleanLowered = request.case_sensitive
-     search_mode = request.search_mode
-     release = request.release
-     working_group = request.working_group
-     spec_type = request.spec_type
-     kws = [caseSensitive(_, booleanLowered) for _ in request.keywords.split(",")]
-     print(kws)
-     unique_specs = set()
-     results = []
-
-     if kws == [""] and search_mode == "deep":
-         raise HTTPException(status_code=400, detail="You must enter keywords in deep search mode!")
-
-     for string, spec in finder_spec.indexer_specs.items():
-         put = False
-         if spec['id'] in unique_specs:
-             continue
-         if spec.get('version', None) is None or (release is not None and spec["version"].split(".")[0] != str(release)):
-             continue
-         if spec.get('type', None) is None or (spec_type is not None and spec["type"] != spec_type):
-             continue
-         if spec.get('working_group', None) is None or (working_group is not None and spec["working_group"] != working_group):
-             continue
-
-         if kws != "":
-             if search_mode == "deep":
-                 contents = []
-                 version = finder_spec.search_document(spec['id'], spec['release']).split("/")[-1].replace(".zip", "").split("-")[-1]
-                 doc = finder_spec.get_document(spec['id'], version)
-                 docValid = not isinstance(doc, str)
-
-             if request.mode == "and":
-                 if all(kw in caseSensitive(string, booleanLowered) for kw in kws):
-                     put = True
-                 if search_mode == "deep":
-                     if docValid:
-                         for chapter in list(doc.keys())[1:]:
-                             if "references" not in chapter.lower() and "void" not in chapter.lower() and "annex" not in doc[chapter].lower():
-                                 if all(kw in caseSensitive(doc[chapter], booleanLowered) for kw in kws):
-                                     put = True
-                                     contents.append(chapter)
-             elif request.mode == "or":
-                 if any(kw in caseSensitive(string, booleanLowered) for kw in kws):
-                     put = True
-                 if search_mode == "deep":
-                     if docValid:
-                         for chapter in list(doc.keys())[1:]:
-                             if "references" not in chapter.lower() and "void" not in chapter.lower() and "annex" not in doc[chapter].lower():
-                                 if any(kw in caseSensitive(doc[chapter], booleanLowered) for kw in kws):
-                                     put = True
-                                     contents.append(chapter)
-         else:
-             put = True
-
-         if put:
-             spec_content = spec
-             if search_mode == "deep":
-                 spec_content["contains"] = {chap: doc[chap] for chap in contents}
-             results.append(spec_content)
-         else:
-             unique_specs.add(spec['id'])
-     if len(results) > 0:
-         return KeywordResponse(
-             results=results,
-             search_time=time.time() - start_time
-         )
-     else:
-         raise HTTPException(status_code=404, detail="Specifications not found")
-
- @app.post("/find", response_model=DocResponse)
- def find_document(request: DocRequest):
-     start_time = time.time()
-     finder = finder_tsg if request.doc_id[0].isalpha() else finder_spec
-     result = finder.search_document(request.doc_id, request.release)
-
-     if "not found" not in result and "Could not" not in result and "Unable" not in result:
-         version = result.split("/")[-1].replace(".zip", "").split("-")[-1]
-         return DocResponse(
-             doc_id=request.doc_id,
-             version=version,
-             url=result,
-             search_time=time.time() - start_time
-         ) if isinstance(finder, TsgDocFinder) else DocResponse(
-             doc_id=request.doc_id,
-             version=version,
-             url=result,
-             search_time=time.time() - start_time,
-             scope=finder.indexer_scopes[request.doc_id] if request.doc_id in finder.indexer_scopes else get_scope(request.doc_id, version)
-         )
-     else:
-         raise HTTPException(status_code=404, detail=result)
-
- @app.post("/batch", response_model=BatchDocResponse)
- def find_documents_batch(request: BatchDocRequest):
-     start_time = time.time()
-
-     results = {}
-     missing = []
-
-     for doc_id in request.doc_ids:
-         finder = finder_tsg if doc_id[0].isalpha() else finder_spec
-         result = finder.search_document(doc_id)
-         if "not found" not in result and "Could not" not in result and "Unable" not in result:
-             results[doc_id] = result
-         else:
-             missing.append(doc_id)
-
-     return BatchDocResponse(
-         results=results,
-         missing=missing,
-         search_time=time.time() - start_time
-     )
  import time
  from datetime import datetime
+ import os, warnings, nltk, json, subprocess
+ import numpy as np
+ from nltk.stem import WordNetLemmatizer
  from dotenv import load_dotenv
+ from sklearn.preprocessing import MinMaxScaler
+
+ os.environ['CURL_CA_BUNDLE'] = ""
+ warnings.filterwarnings('ignore')
+ nltk.download('wordnet')
+ load_dotenv()
+
+ from datasets import load_dataset
+ import bm25s
+ from bm25s.hf import BM25HF
+
  from fastapi import FastAPI, HTTPException
  from fastapi.middleware.cors import CORSMiddleware
  from fastapi.responses import FileResponse
  from fastapi.staticfiles import StaticFiles
+ from schemas import *
 
+ from bs4 import BeautifulSoup
+ import requests
 
+ lemmatizer = WordNetLemmatizer()
 
+ spec_metadatas = load_dataset("OrganizedProgrammers/3GPPSpecMetadata", token=os.environ["HF_TOKEN"])
+ spec_contents = load_dataset("OrganizedProgrammers/3GPPSpecContent", token=os.environ["HF_TOKEN"])
+ tdoc_locations = load_dataset("OrganizedProgrammers/3GPPTDocLocation", token=os.environ["HF_TOKEN"])
+ bm25_index = BM25HF.load_from_hub("OrganizedProgrammers/3GPPBM25IndexSingle", load_corpus=True, token=os.environ["HF_TOKEN"])
 
+ spec_metadatas = spec_metadatas["train"].to_list()
+ spec_contents = spec_contents["train"].to_list()
+ tdoc_locations = tdoc_locations["train"].to_list()
 
+ def get_docs_from_url(url):
+     """Get list of documents/directories from a URL"""
+     try:
+         response = requests.get(url, verify=False, timeout=10)
+         soup = BeautifulSoup(response.text, "html.parser")
+         return [item.get_text() for item in soup.select("tr td a")]
+     except Exception as e:
+         print(f"Error accessing {url}: {e}")
+         return []
 
+ def get_tdoc_url(doc_id):
+     for tdoc in tdoc_locations:
+         if tdoc["doc_id"] == doc_id:
+             return tdoc["url"]
+
+ def get_spec_url(document):
+     series = document.split(".")[0].zfill(2)
+     url = f"https://www.3gpp.org/ftp/Specs/archive/{series}_series/{document}"
+     versions = get_docs_from_url(url)
+     return url + "/" + versions[-1] if versions != [] else f"Specification {document} not found"
+
+ def get_document(spec_id: str, spec_title: str):
+     text = [f"{spec_id} - {spec_title}"]
+     for section in spec_contents:
+         if spec_id == section["doc_id"]:
+             text.extend([section['section'], section['content']])
+     return text
+
+ app = FastAPI(title="3GPP Document Finder Back-End", description="Backend for 3GPPDocFinder - Searching technical documents & specifications from 3GPP FTP server")
+ app.mount("/static", StaticFiles(directory="static"), name="static")
  app.add_middleware(
      CORSMiddleware,
+     allow_origins=["*"],
      allow_credentials=True,
      allow_methods=["*"],
      allow_headers=["*"],
  )
 
+ @app.get("/")
+ def index():
+     return FileResponse(os.path.join('templates', 'index.html'))
 
+ @app.post("/find", response_model=DocResponse)
+ def find_document(request: DocRequest):
+     start_time = time.time()
+     document = request.doc_id
+     url = get_tdoc_url(document) if document[0].isalpha() else get_spec_url(document)
+     if "Specification" in url or "Document" in url:
+         raise HTTPException(status_code=404, detail=url)
+
+     version = url.split("/")[-1].replace(".zip", "").split("-")[-1]
+     scope = None
+     for spec in spec_metadatas:
+         if spec['id'] == document:
+             scope = spec.get('scope')
              break
+     return DocResponse(
+         doc_id=document,
+         version=version,
+         url=url,
+         search_time=time.time() - start_time,
+         scope=scope
+     )
 
+ @app.post("/batch", response_model=BatchDocResponse)
+ def find_multiple_documents(request: BatchDocRequest):
+     start_time = time.time()
+     documents = request.doc_ids
+     results = {}
+     missing = []
 
+     for document in documents:
+         url = get_tdoc_url(document) if document[0].isalpha() else get_spec_url(document)
+         if "Specification" not in url and "Document" not in url:
+             results[document] = url
+         else:
+             missing.append(document)
 
+     return BatchDocResponse(
+         results=results,
+         missing=missing,
+         search_time=time.time()-start_time
+     )
 
+ @app.post("/search-spec", response_model=KeywordResponse)
+ def search_specification_by_keywords(request: KeywordRequest):
+     start_time = time.time()
+     boolSensitiveCase = request.case_sensitive
+     search_mode = request.search_mode
+     working_group = request.working_group
+     spec_type = request.spec_type
+     keywords = [string if boolSensitiveCase else string.lower() for string in request.keywords.split(",")]
+     print(keywords)
+     unique_specs = set()
+     results = []
 
+     if keywords == [""] and search_mode == "deep":
+         raise HTTPException(status_code=400, detail="You must enter keywords in deep search mode!")
 
+     for spec in spec_metadatas:
+         valid = False
+         if spec['id'] in unique_specs: continue
+         if spec.get('type', None) is None or (spec_type is not None and spec["type"] != spec_type): continue
+         if spec.get('working_group', None) is None or (working_group is not None and spec["working_group"] != working_group): continue
 
+         if search_mode == "deep":
+             contents = []
+             doc = get_document(spec["id"], spec["title"])
+             docValid = len(doc) > 1
 
+         if request.mode == "and":
+             string = f"{spec['id']}+-+{spec['title']}+-+{spec['type']}+-+{spec['version']}+-+{spec['working_group']}"
+             if all(keyword in (string if boolSensitiveCase else string.lower()) for keyword in keywords):
+                 valid = True
+             if search_mode == "deep":
+                 if docValid:
+                     for x in range(1, len(doc) - 1, 2):
+                         section_title = doc[x]
+                         section_content = doc[x+1]
+                         if "reference" not in section_title.lower() and "void" not in section_title.lower() and "annex" not in section_content.lower():
+                             if all(keyword in (section_content if boolSensitiveCase else section_content.lower()) for keyword in keywords):
+                                 valid = True
+                                 contents.append({section_title: section_content})
+         elif request.mode == "or":
+             string = f"{spec['id']}+-+{spec['title']}+-+{spec['type']}+-+{spec['version']}+-+{spec['working_group']}"
+             if any(keyword in (string if boolSensitiveCase else string.lower()) for keyword in keywords):
+                 valid = True
+             if search_mode == "deep":
+                 if docValid:
+                     for x in range(1, len(doc) - 1, 2):
+                         section_title = doc[x]
+                         section_content = doc[x+1]
+                         if "reference" not in section_title.lower() and "void" not in section_title.lower() and "annex" not in section_content.lower():
+                             if any(keyword in (section_content if boolSensitiveCase else section_content.lower()) for keyword in keywords):
+                                 valid = True
+                                 contents.append({section_title: section_content})
+         if valid:
+             spec_content = spec
+             if search_mode == "deep":
+                 spec_content["contains"] = {k: v for d in contents for k, v in d.items()}
+             results.append(spec_content)
+         else:
+             unique_specs.add(spec['id'])
 
+     if len(results) > 0:
+         return KeywordResponse(
+             results=results,
+             search_time=time.time() - start_time
+         )
+     else:
+         raise HTTPException(status_code=404, detail="Specifications not found")
 
  @app.post("/search-spec/experimental", response_model=KeywordResponse)
+ def bm25_search_specification(request: BM25KeywordRequest):
      start_time = time.time()
      working_group = request.working_group
      spec_type = request.spec_type
      threshold = request.threshold
+     query = request.keywords
 
      results_out = []
      query_tokens = bm25s.tokenize(query)
+     results, scores = bm25_index.retrieve(query_tokens, k=len(bm25_index.corpus))
+     print("BM25 raw scores:", scores)
 
      def calculate_boosted_score(metadata, score, query):
+         title = set(metadata['title'].lower().split())
+         q = set(query.lower().split())
+         spec_id_presence = 0.5 if metadata['id'].lower() in q else 0
          booster = len(q & title) * 0.5
          return score + spec_id_presence + booster
 
@@ -500,12 +247,10 @@
          spec_details[spec]["normalized_score"] = normalized_scores[spec]
 
      unique_specs = sorted(normalized_scores.keys(), key=lambda x: normalized_scores[x], reverse=True)
+
      for rank, spec in enumerate(unique_specs, 1):
          details = spec_details[spec]
          metadata = details['doc']['metadata']
          if metadata.get('type', None) is None or (spec_type is not None and metadata["type"] != spec_type):
              continue
          if metadata.get('working_group', None) is None or (working_group is not None and metadata["working_group"] != working_group):
@@ -520,120 +265,4 @@
              search_time=time.time() - start_time
          )
      else:
+         raise HTTPException(status_code=404, detail="Specifications not found")
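To try the refactored endpoints end to end, a minimal client sketch follows (assumptions: the app is served locally, e.g. with uvicorn app:app --port 8000, and "23.501" is only an example specification number; the payload fields come from schemas.py):

    import requests

    BASE = "http://localhost:8000"  # assumed local dev server

    # /find: numeric IDs are resolved against the 3GPP spec archive,
    # letter-prefixed IDs against the TDoc location dataset
    print(requests.post(f"{BASE}/find", json={"doc_id": "23.501"}).json())

    # /search-spec: "quick" matches metadata only; "deep" also scans section contents
    payload = {"keywords": "network slicing", "search_mode": "quick", "mode": "and"}
    print(requests.post(f"{BASE}/search-spec", json=payload).json())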
indexed_docs.json DELETED
@@ -1,8 +0,0 @@
- {
-     "S4-110084": "https://www.3gpp.org/ftp/tsg_sa/WG4_CODEC/TSGS4_62/Docs/S4-110084.zip",
-     "SP-000182": "https://www.3gpp.org/ftp/tsg_sa/TSG_SA/TSGS_08/Docs/ZIP/SP-000182.zip",
-     "SP-000183": "https://www.3gpp.org/ftp/tsg_sa/TSG_SA/TSGS_08/Docs/ZIP/SP-000183.zip",
-     "SP-000184": "https://www.3gpp.org/ftp/tsg_sa/TSG_SA/TSGS_08/Docs/ZIP/SP-000184.zip",
-     "SP-000185": "https://www.3gpp.org/ftp/tsg_sa/TSG_SA/TSGS_08/Docs/ZIP/SP-000185.zip",
-     "SP-090017": "https://www.3gpp.org/ftp/tsg_sa/TSG_SA/TSGS_43/Docs/SP-090017.zip"
- }
requirements.txt CHANGED
@@ -1,21 +1,12 @@
- fastapi
- uvicorn[standard]
- requests
- beautifulsoup4
- pydantic
- psycopg2-binary
  numpy
- pandas
- pymupdf
  python-dotenv
- lxml
+ scikit-learn
  nltk
  bm25s[full]
- scikit-learn
- faiss-cpu
- sentence-transformers[onnx]
- transformers
- accelerate
- peft
- huggingface_hub
- openai
+ jax[cpu]
+ datasets
+ fastapi
+ uvicorn[standard]
+ beautifulsoup4
+ requests
+ pydantic
schemas.py ADDED
@@ -0,0 +1,40 @@
+ from pydantic import BaseModel
+ from typing import *
+
+ class DocRequest(BaseModel):
+     doc_id: str
+
+ class DocResponse(BaseModel):
+     doc_id: str
+     url: str
+     version: str
+     scope: Optional[str] = None
+     search_time: float
+
+ class BatchDocRequest(BaseModel):
+     doc_ids: List[str]
+
+ class BatchDocResponse(BaseModel):
+     results: Dict[str, str]
+     missing: List[str]
+     search_time: float
+
+ class BM25KeywordRequest(BaseModel):
+     keywords: Optional[str] = ""
+     threshold: Optional[int] = 60
+     release: Optional[str] = None
+     working_group: Optional[str] = None
+     spec_type: Optional[Literal["TS", "TR"]] = None
+
+ class KeywordRequest(BaseModel):
+     keywords: Optional[str] = ""
+     search_mode: Literal["quick", "deep"]
+     case_sensitive: Optional[bool] = False
+     release: Optional[str] = None
+     working_group: Optional[str] = None
+     spec_type: Optional[Literal["TS", "TR"]] = None
+     mode: Optional[Literal["and", "or"]] = "and"
+
+ class KeywordResponse(BaseModel):
+     results: List[Dict[str, Any]]
+     search_time: float
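A quick way to sanity-check the new models is to instantiate them directly, since validation runs at construction time (the values below are illustrative, and model_dump() assumes pydantic v2 — use .dict() on v1):

    from schemas import KeywordRequest, BatchDocRequest

    req = KeywordRequest(keywords="5G,handover", search_mode="deep", mode="or")
    print(req.model_dump())  # pydantic v2; on v1 use req.dict()

    batch = BatchDocRequest(doc_ids=["23.501", "S4-110084"])
    print(batch.doc_ids)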
static/script.js CHANGED
@@ -354,7 +354,6 @@ function displayKeywordResults(data, mode) {
      <div class="result-url">
          <p>Title: ${spec.title}</p>
          <p>Type: ${spec.type}</p>
-         <p>Release: ${spec.release}</p>
          <p>Version: ${spec.version}</p>
          <p>WG: ${spec.working_group}</p>
          <p>URL: <a target="_blank" href="${spec.url}">${spec.url}</a></p>
@@ -430,30 +429,6 @@ function openSectionPopup(specId, sections) {
      newTab.document.open();
      newTab.document.write(htmlContent);
      newTab.document.close()
-     // popupTitle.textContent = `Sections of specification ${specId}`;
-
-     // popupTextareas.innerHTML = '';
-     // Object.entries(sections).forEach(([section, content], index) => {
-     //     const container = document.createElement("div");
-     //     container.className = "textarea-container";
-
-     //     const textarea = document.createElement("textarea");
-     //     textarea.id = `section-${index}`;
-     //     textarea.value = `${section}\n\n${content}`
-     //     textarea.readOnly = true;
-
-     //     const copyBtn = document.createElement('button');
-     //     copyBtn.className = 'copy-btn';
-     //     copyBtn.textContent = 'Copy';
-     //     copyBtn.onclick = () => copyTextarea(`section-${index}`);
-
-     //     container.appendChild(textarea);
-     //     container.appendChild(copyBtn);
-     //     popupTextareas.appendChild(container);
-     // });
-
-     // sectionPopup.style.display = 'block';
-     // document.body.style.overflow = 'hidden';
  }
 
  // Display batch results
@@ -534,6 +509,6 @@ keywordInput.addEventListener('keypress', (event)=>{
 
  expKeywordInput.addEventListener('keypress', (event)=>{
      if (event.key === "Enter"){
-         keywordSearchBtn.click();
+         expKeywordSearchBtn.click();
      }
  })