Rivalcoder committed on
Commit 192b91e · 0 Parent(s)

Add First basic Version

Files changed (10)
  1. .dockerignore +26 -0
  2. .gitignore +61 -0
  3. Dockerfile +32 -0
  4. README.md +10 -0
  5. app.py +260 -0
  6. embedder.py +56 -0
  7. llm.py +82 -0
  8. pdf_parser.py +50 -0
  9. requirements.txt +12 -0
  10. retriever.py +9 -0
.dockerignore ADDED
@@ -0,0 +1,26 @@
+ .git
+ .gitignore
+ README.md
+ DEPLOYMENT.md
+ render.yaml
+ start.sh
+ __pycache__
+ *.pyc
+ *.pyo
+ *.pyd
+ .Python
+ env
+ pip-log.txt
+ pip-delete-this-directory.txt
+ .tox
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.log
+ .git
+ .mypy_cache
+ .pytest_cache
+ .hypothesis
.gitignore ADDED
@@ -0,0 +1,61 @@
+ # Environment variables
+ .env
+ .env.local
+ .env.production
+
+ # Python
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+ .cache
+ # Virtual environments
+ venv/
+ env/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # IDE
+ .vscode/
+ .idea/
+ *.swp
+ *.swo
+ *~
+
+ # OS
+ .DS_Store
+ Thumbs.db
+
+ # Logs
+ *.log
+
+ # Temporary files
+ *.tmp
+ *.temp
+
+ # FAISS index files
+ *.index
+ *.faiss
+
+ # PDF files (if you don't want to commit them)
+ *.pdf
+
+ DEPLOYMENT.md
Dockerfile ADDED
@@ -0,0 +1,32 @@
+ FROM python:3.9-slim
+
+ WORKDIR /app
+
+ # Install system dependencies
+ RUN apt-get update && apt-get install -y \
+     build-essential \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Create a non-root user
+ RUN useradd --create-home --shell /bin/bash appuser
+
+ # Copy requirements first for better caching
+ COPY requirements.txt .
+
+ # Install Python dependencies
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Copy application code
+ COPY . .
+
+ # Create cache directory with proper permissions
+ RUN mkdir -p /app/.cache && chown -R appuser:appuser /app
+
+ # Switch to non-root user
+ USER appuser
+
+ # Expose port
+ EXPOSE 7860
+
+ # Run the application
+ CMD ["python", "app.py"]
README.md ADDED
@@ -0,0 +1,10 @@
+ ---
+ title: Insurance Agent Rag
+ emoji: 💻
+ colorFrom: red
+ colorTo: pink
+ sdk: docker
+ pinned: false
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,260 @@
+ import os
+ import warnings
+ import logging
+ import time
+ from datetime import datetime
+
+ from fastapi import FastAPI, Request, HTTPException, Depends, Header
+ from fastapi.middleware.cors import CORSMiddleware
+ from pydantic import BaseModel
+
+ from pdf_parser import parse_pdf_from_url_multithreaded as parse_pdf_from_url, parse_pdf_from_file_multithreaded as parse_pdf_from_file
+ from embedder import build_pinecone_index, preload_model
+ from retriever import retrieve_chunks
+ from llm import query_gemini
+
+ import uvicorn
+
+ # Set up cache directory for HuggingFace models
+ cache_dir = os.path.join(os.getcwd(), ".cache")
+ os.makedirs(cache_dir, exist_ok=True)
+ os.environ['HF_HOME'] = cache_dir
+ os.environ['TRANSFORMERS_CACHE'] = cache_dir
+
+ # Suppress TensorFlow warnings
+ os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
+ os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
+ os.environ['TF_LOGGING_LEVEL'] = 'ERROR'
+ os.environ['TF_ENABLE_DEPRECATION_WARNINGS'] = '0'
+ warnings.filterwarnings('ignore', category=DeprecationWarning, module='tensorflow')
+ logging.getLogger('tensorflow').setLevel(logging.ERROR)
+
+ app = FastAPI(title="HackRx Insurance Policy Assistant", version="1.0.0")
+
+ # Add CORS middleware
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=["*"],
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+ # Preload the model at startup
+ @app.on_event("startup")
+ async def startup_event():
+     print("Starting up HackRx Insurance Policy Assistant...")
+     print("Preloading sentence transformer model...")
+     preload_model()
+     print("Model preloading completed. API is ready to serve requests!")
+
+ @app.get("/")
+ async def root():
+     return {"message": "HackRx Insurance Policy Assistant API is running!"}
+
+ @app.get("/health")
+ async def health_check():
+     return {"status": "healthy", "message": "API is ready to process requests"}
+
+ class QueryRequest(BaseModel):
+     documents: str
+     questions: list[str]
+
+ class LocalQueryRequest(BaseModel):
+     document_path: str
+     questions: list[str]
+
+ def verify_token(authorization: str = Header(None)):
+     if not authorization or not authorization.startswith("Bearer "):
+         raise HTTPException(status_code=401, detail="Invalid authorization header")
+     token = authorization.replace("Bearer ", "")
+     # For demo purposes, accept any token. In production, validate against a database
+     if not token:
+         raise HTTPException(status_code=401, detail="Invalid token")
+     return token
+
+ @app.post("/api/v1/hackrx/run")
+ async def run_query(request: QueryRequest, token: str = Depends(verify_token)):
+     start_time = time.time()
+     timing_data = {}
+     try:
+         print(f"\n=== INPUT JSON ===")
+         print(f"Documents: {request.documents}")
+         print(f"Questions: {request.questions}")
+         print(f"==================\n")
+
+         print(f"Processing {len(request.questions)} questions...")
+
+         # Time PDF parsing
+         pdf_start = time.time()
+         text_chunks = parse_pdf_from_url(request.documents)
+         pdf_time = time.time() - pdf_start
+         timing_data['pdf_parsing'] = round(pdf_time, 2)
+
+         print(f"Extracted {len(text_chunks)} text chunks from PDF")
+
+         # Time Pinecone index building/upsert
+         index_start = time.time()
+         pinecone_index = build_pinecone_index(text_chunks)
+         index_time = time.time() - index_start
+         timing_data['pinecone_index_building'] = round(index_time, 2)
+         texts = text_chunks  # for retrieve_chunks
+
+         # Time chunk retrieval for all questions
+         retrieval_start = time.time()
+         all_chunks = set()
+         for i, question in enumerate(request.questions):
+             question_start = time.time()
+             top_chunks = retrieve_chunks(pinecone_index, texts, question)
+             question_time = time.time() - question_start
+             all_chunks.update(top_chunks)
+         retrieval_time = time.time() - retrieval_start
+         timing_data['chunk_retrieval'] = round(retrieval_time, 2)
+
+         print(f"Retrieved {len(all_chunks)} unique chunks")
+
+         # Time LLM processing
+         llm_start = time.time()
+         print(f"Processing all {len(request.questions)} questions in batch...")
+         response = query_gemini(request.questions, list(all_chunks))
+         llm_time = time.time() - llm_start
+         timing_data['llm_processing'] = round(llm_time, 2)
+
+         # Time response processing
+         response_start = time.time()
+         # Extract answers from the JSON response
+         if isinstance(response, dict) and "answers" in response:
+             answers = response["answers"]
+             while len(answers) < len(request.questions):
+                 answers.append("Not Found")
+             answers = answers[:len(request.questions)]
+         else:
+             answers = [response] if isinstance(response, str) else []
+             while len(answers) < len(request.questions):
+                 answers.append("Not Found")
+             answers = answers[:len(request.questions)]
+
+         response_time = time.time() - response_start
+         timing_data['response_processing'] = round(response_time, 2)
+
+         print(f"Generated {len(answers)} answers")
+
+         # Calculate total time
+         total_time = time.time() - start_time
+         timing_data['total_time'] = round(total_time, 2)
+
+         print(f"\n=== TIMING BREAKDOWN ===")
+         print(f"PDF Parsing: {timing_data['pdf_parsing']}s")
+         print(f"Pinecone Index Building: {timing_data['pinecone_index_building']}s")
+         print(f"Chunk Retrieval: {timing_data['chunk_retrieval']}s")
+         print(f"LLM Processing: {timing_data['llm_processing']}s")
+         print(f"Response Processing: {timing_data['response_processing']}s")
+         print(f"TOTAL TIME: {timing_data['total_time']}s")
+         print(f"=======================\n")
+
+         result = {"answers": answers}
+
+         print(f"=== OUTPUT JSON ===")
+         print(f"{result}")
+         print(f"==================\n")
+
+         return result
+
+     except Exception as e:
+         total_time = time.time() - start_time
+         print(f"Error after {total_time:.2f} seconds: {str(e)}")
+         raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
+
+ @app.post("/api/v1/hackrx/local")
+ async def run_local_query(request: LocalQueryRequest):
+     start_time = time.time()
+     timing_data = {}
+     try:
+         print(f"\n=== INPUT JSON ===")
+         print(f"Document Path: {request.document_path}")
+         print(f"Questions: {request.questions}")
+         print(f"==================\n")
+
+         print(f"Processing local document: {request.document_path}")
+         print(f"Processing {len(request.questions)} questions...")
+
+         # Time local PDF parsing
+         pdf_start = time.time()
+         text_chunks = parse_pdf_from_file(request.document_path)
+         pdf_time = time.time() - pdf_start
+         timing_data['pdf_parsing'] = round(pdf_time, 2)
+
+         print(f"Extracted {len(text_chunks)} text chunks from local PDF")
+
+         # Time Pinecone index building/upsert
+         index_start = time.time()
+         pinecone_index = build_pinecone_index(text_chunks)
+         index_time = time.time() - index_start
+         timing_data['pinecone_index_building'] = round(index_time, 2)
+         texts = text_chunks
+
+         # Time chunk retrieval for all questions
+         retrieval_start = time.time()
+         all_chunks = set()
+         for i, question in enumerate(request.questions):
+             question_start = time.time()
+             top_chunks = retrieve_chunks(pinecone_index, texts, question)
+             question_time = time.time() - question_start
+             all_chunks.update(top_chunks)
+         retrieval_time = time.time() - retrieval_start
+         timing_data['chunk_retrieval'] = round(retrieval_time, 2)
+
+         print(f"Retrieved {len(all_chunks)} unique chunks")
+
+         # Time LLM processing
+         llm_start = time.time()
+         print(f"Processing all {len(request.questions)} questions in batch...")
+         response = query_gemini(request.questions, list(all_chunks))
+         llm_time = time.time() - llm_start
+         timing_data['llm_processing'] = round(llm_time, 2)
+
+         # Time response processing
+         response_start = time.time()
+         if isinstance(response, dict) and "answers" in response:
+             answers = response["answers"]
+             while len(answers) < len(request.questions):
+                 answers.append("Not Found")
+             answers = answers[:len(request.questions)]
+         else:
+             answers = [response] if isinstance(response, str) else []
+             while len(answers) < len(request.questions):
+                 answers.append("Not Found")
+             answers = answers[:len(request.questions)]
+
+         response_time = time.time() - response_start
+         timing_data['response_processing'] = round(response_time, 2)
+
+         print(f"Generated {len(answers)} answers")
+
+         total_time = time.time() - start_time
+         timing_data['total_time'] = round(total_time, 2)
+
+         print(f"\n=== TIMING BREAKDOWN ===")
+         print(f"PDF Parsing: {timing_data['pdf_parsing']}s")
+         print(f"Pinecone Index Building: {timing_data['pinecone_index_building']}s")
+         print(f"Chunk Retrieval: {timing_data['chunk_retrieval']}s")
+         print(f"LLM Processing: {timing_data['llm_processing']}s")
+         print(f"Response Processing: {timing_data['response_processing']}s")
+         print(f"TOTAL TIME: {timing_data['total_time']}s")
+         print(f"=======================\n")
+
+         result = {"answers": answers}
+
+         print(f"=== OUTPUT JSON ===")
+         print(f"{result}")
+         print(f"==================\n")
+
+         return result
+     except Exception as e:
+         total_time = time.time() - start_time
+         print(f"Error after {total_time:.2f} seconds: {str(e)}")
+         raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
+
+ if __name__ == "__main__":
+     port = int(os.environ.get("PORT", 7860))
+     uvicorn.run("app:app", host="0.0.0.0", port=port)
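
For reference, a minimal client sketch for the /api/v1/hackrx/run endpoint. It assumes the app is running locally on port 7860; the bearer token and document URL below are placeholders, since the demo verify_token accepts any non-empty token.

import requests

# Placeholder values: adjust host/port, token, and document URL for your deployment.
BASE_URL = "http://localhost:7860"
headers = {"Authorization": "Bearer demo-token"}  # verify_token only requires a non-empty bearer token
payload = {
    "documents": "https://example.com/sample-policy.pdf",  # placeholder PDF URL
    "questions": [
        "What is the waiting period for pre-existing diseases?",
        "Is maternity cover included?",
    ],
}

resp = requests.post(f"{BASE_URL}/api/v1/hackrx/run", json=payload, headers=headers, timeout=300)
resp.raise_for_status()
print(resp.json()["answers"])  # one answer string per question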
embedder.py ADDED
@@ -0,0 +1,56 @@
+ import os
+ from pinecone import Pinecone, ServerlessSpec
+ from sentence_transformers import SentenceTransformer
+ from dotenv import load_dotenv
+
+ load_dotenv()
+ cache_dir = os.path.join(os.getcwd(), ".cache")
+ os.makedirs(cache_dir, exist_ok=True)
+ os.environ['HF_HOME'] = cache_dir
+ os.environ['TRANSFORMERS_CACHE'] = cache_dir
+
+ PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
+ PINECONE_ENV = os.getenv("PINECONE_ENV")  # Not used in new SDK, keep cloud+region below instead
+ PINECONE_INDEX_NAME = 'bajaj-rag-assistant'
+ PINECONE_CLOUD = 'aws'  # or 'gcp', or your choice, must match Pinecone project
+ PINECONE_REGION = 'us-east-1'  # or your choice, must match Pinecone project
+
+ # Create Pinecone client globally
+ pc = Pinecone(api_key=PINECONE_API_KEY)
+
+ _model = None
+
+ def preload_model(model_name="paraphrase-MiniLM-L3-v2"):
+     global _model
+     if _model is not None:
+         return _model
+     _model = SentenceTransformer(model_name, cache_folder=cache_dir)
+     return _model
+
+ def get_model():
+     return preload_model()
+
+ def build_pinecone_index(chunks, index_name=PINECONE_INDEX_NAME):
+     model = get_model()
+     embeddings = model.encode(
+         chunks,
+         batch_size=128,
+         convert_to_numpy=True,
+         normalize_embeddings=True
+     )
+     # Create index if it doesn't exist
+     if index_name not in pc.list_indexes().names():
+         pc.create_index(
+             name=index_name,
+             dimension=embeddings.shape[1],
+             metric='cosine',
+             spec=ServerlessSpec(
+                 cloud=PINECONE_CLOUD,
+                 region=PINECONE_REGION
+             )
+         )
+     index = pc.Index(index_name)
+     # Upsert embeddings in Pinecone
+     vectors = [(f"id-{i}", emb.tolist(), {"text": chunk}) for i, (emb, chunk) in enumerate(zip(embeddings, chunks))]
+     index.upsert(vectors)
+     return index
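
One thing worth noting about build_pinecone_index: it always upserts ids "id-0", "id-1", ... into the same fixed index, so vectors left over from a longer, previously indexed document are never removed and can surface in later retrievals. Below is a minimal cleanup sketch (not part of this commit) that assumes the standard Pinecone client's delete(delete_all=True) call and reuses the index name from above.

import os
from pinecone import Pinecone

# Hypothetical cleanup step before re-indexing a new document.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
index = pc.Index("bajaj-rag-assistant")  # same index name used in embedder.py
try:
    index.delete(delete_all=True)  # drop all existing vectors before the next upsert
except Exception:
    pass  # a brand-new or empty index may reject delete_all; ignore in this sketch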
llm.py ADDED
@@ -0,0 +1,82 @@
+ import google.generativeai as genai
+ import os
+ import json
+ from dotenv import load_dotenv
+
+ load_dotenv()
+
+ api_key = os.getenv("GOOGLE_API_KEY")
+ if not api_key:
+     raise ValueError("GOOGLE_API_KEY environment variable is not set. Please add it to your .env file")
+
+ print(f"Google API Key loaded: {api_key[:10]}..." if api_key else "No API key found")
+ genai.configure(api_key=api_key)
+
+ def query_gemini(questions, contexts):
+     try:
+         context = "\n\n".join(contexts)
+         questions_text = "\n".join([f"{i+1}. {q}" for i, q in enumerate(questions)])
+         prompt = f"""
+ You are an expert insurance assistant generating formal yet user-facing answers to policy questions and other general questions. Your goal is to write professional, structured answers that reflect the language of policy documents — but are still human-readable and easy to understand.
+
+ 🧠 FORMAT & TONE GUIDELINES:
+ - Write in professional third-person language (no "you", no "we").
+ - Use clear sentence structure with proper punctuation and spacing.
+ - Do NOT write in legalese or robotic passive constructions.
+ - Include eligibility, limits, and waiting periods explicitly where relevant.
+ - Keep it factual, neutral, and easy to follow.
+ - First, try to answer each question using information from the provided context.
+ - If the question is not covered by the context, fall back to general knowledge and give a correct, concise answer to the question.
+ - Limit each answer to 2–3 sentences, and do not repeat unnecessary information.
+ - If a question can be answered with a simple "Yes", "No", "Can apply", or "Cannot apply", begin the answer with that phrase, followed by a short supporting statement in a natural, human-like tone, so the answer is both direct and accurate.
+ - Avoid long, theory-heavy answers; keep them short, specific, and reasonable.
+
+ 🛑 DO NOT:
+ - Use words like "context", "document", or "text".
+ - Output markdown, bullets, emojis, or markdown code blocks.
+ - Say "helpful", "available", "allowed", "indemnified", "excluded", etc.
+ - Use overly robotic passive constructions like "shall be indemnified".
+ - Include phrases such as "Based on the context" or "Nothing is referred to in the context"; answer the question directly instead.
+
+ ✅ DO:
+ - Write in clean, informative language.
+ - Give complete answers in 2–3 sentences maximum.
+
+
+
+
+ 📤 OUTPUT FORMAT (strict):
+ Respond with only the following JSON — no explanations, no comments, no markdown:
+
+ {{
+   "answers": [
+     "Answer to question 1",
+     "Answer to question 2",
+     ...
+   ]
+ }}
+
+ 📚 CONTEXT:
+ {context}
+
+ ❓ QUESTIONS:
+ {questions_text}
+
+ Your task: For each question, provide a complete, professional, and clearly written answer in 2–3 sentences using a formal but readable tone.
+ """
+         model = genai.GenerativeModel('gemini-2.5-flash-lite')
+         response = model.generate_content(prompt)
+         response_text = response.text.strip()
+         try:
+             if response_text.startswith("```json"):
+                 response_text = response_text.replace("```json", "").replace("```", "").strip()
+             elif response_text.startswith("```"):
+                 response_text = response_text.replace("```", "").strip()
+             parsed_response = json.loads(response_text)
+             return parsed_response
+         except json.JSONDecodeError:
+             print(f"Failed to parse JSON response: {response_text}")
+             return {"answers": ["Error parsing response"] * len(questions)}
+     except Exception as e:
+         print(f"Error in query_gemini: {str(e)}")
+         return {"answers": [f"Error generating response: {str(e)}"] * len(questions)}
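
A quick way to exercise query_gemini on its own; the contexts and questions below are made-up examples, and a valid GOOGLE_API_KEY in .env is still required.

from llm import query_gemini

# Minimal local check with fabricated policy snippets.
contexts = [
    "Pre-existing diseases are covered after a waiting period of 36 months.",
    "Maternity expenses are covered after 24 months of continuous coverage.",
]
questions = [
    "What is the waiting period for pre-existing diseases?",
    "Is maternity covered?",
]
result = query_gemini(questions, contexts)
print(result["answers"])  # list with one answer per question, or error strings on failure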
pdf_parser.py ADDED
@@ -0,0 +1,50 @@
+ import fitz  # PyMuPDF
+ import requests
+ from io import BytesIO
+ from concurrent.futures import ThreadPoolExecutor
+
+ def _extract_text(page):
+     text = page.get_text()
+     return text.strip() if text and text.strip() else None
+
+ def parse_pdf_from_url_multithreaded(url, max_workers=2, chunk_size=1):
+     """
+     Download PDF from URL, extract text in parallel, optionally chunk pages.
+     """
+     res = requests.get(url)
+     with fitz.open(stream=BytesIO(res.content), filetype="pdf") as doc:
+         num_pages = len(doc)
+         pages = list(doc)
+         # Step 1: Parallel text extraction
+         with ThreadPoolExecutor(max_workers=max_workers) as executor:
+             texts = list(executor.map(_extract_text, pages))
+         # Step 2: Optional chunking
+         if chunk_size > 1:
+             chunks = []
+             for i in range(0, len(texts), chunk_size):
+                 chunk = ' '.join([t for t in texts[i:i+chunk_size] if t])
+                 if chunk:
+                     chunks.append(chunk)
+             return chunks
+         # Default: return one chunk per page
+         return [t for t in texts if t]
+
+ def parse_pdf_from_file_multithreaded(file_path, max_workers=2, chunk_size=1):
+     """
+     Parse a local PDF file, extract text in parallel, optionally chunk pages.
+     """
+     with fitz.open(file_path) as doc:
+         num_pages = len(doc)
+         pages = list(doc)
+         # Step 1: Parallel text extraction
+         with ThreadPoolExecutor(max_workers=max_workers) as executor:
+             texts = list(executor.map(_extract_text, pages))
+         # Step 2: Optional chunking
+         if chunk_size > 1:
+             chunks = []
+             for i in range(0, len(texts), chunk_size):
+                 chunk = ' '.join([t for t in texts[i:i+chunk_size] if t])
+                 if chunk:
+                     chunks.append(chunk)
+             return chunks
+         return [t for t in texts if t]
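
Both parsers can be called directly; a small usage sketch follows, where the file name and URL are placeholders.

from pdf_parser import parse_pdf_from_file_multithreaded, parse_pdf_from_url_multithreaded

# chunk_size=1 (default) returns one chunk per page;
# chunk_size=3 joins every three pages into a single chunk (fewer, larger embeddings).
pages = parse_pdf_from_file_multithreaded("policy.pdf")
chunks = parse_pdf_from_url_multithreaded("https://example.com/policy.pdf", chunk_size=3)
print(len(pages), len(chunks))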
requirements.txt ADDED
@@ -0,0 +1,12 @@
+ fastapi
+ uvicorn
+ requests
+ faiss-cpu
+ sentence-transformers
+ PyMuPDF
+ python-dotenv
+ tf-keras
+ google-generativeai
+ pinecone
+
+
retriever.py ADDED
@@ -0,0 +1,9 @@
+ from embedder import get_model
+
+ def retrieve_chunks(index, texts, question, top_k=15):
+     model = get_model()
+     q_embedding = model.encode([question], convert_to_numpy=True, normalize_embeddings=True)[0]
+     # Use Pinecone v3 index query
+     res = index.query(vector=q_embedding.tolist(), top_k=top_k, include_metadata=True)
+     selected_texts = [match['metadata']['text'] for match in res['matches']]
+     return selected_texts
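
A minimal end-to-end sketch combining the embedder and retriever, assuming a valid PINECONE_API_KEY and reusing the helpers from this commit; the chunks and question are made-up examples.

from embedder import build_pinecone_index
from retriever import retrieve_chunks

# Note: freshly upserted vectors can take a moment to become queryable,
# so an immediate query may return fewer matches than expected.
chunks = [
    "Room rent is capped at 1% of the sum insured per day.",
    "Cataract surgery has a waiting period of 24 months.",
]
index = build_pinecone_index(chunks)
top = retrieve_chunks(index, chunks, "Is there a limit on room rent?", top_k=2)
print(top)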