Rivalcoder committed
Commit eb87b3b · 1 Parent(s): c89a7bc

Update The Model issues and Prompt

Files changed (23)
  1. .cache/chunks_6635d94cf9023c83521982b3043ec70c.pkl +3 -0
  2. .cache/embeddings_b24811e7d333cc7d5047e52b357abd7e.pkl +3 -0
  3. .cache/hub/models--sentence-transformers--all-MiniLM-L6-v2/refs/main +1 -0
  4. .cache/hub/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/1_Pooling/config.json +7 -0
  5. .cache/hub/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/README.md +173 -0
  6. .cache/hub/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/config_sentence_transformers.json +7 -0
  7. .cache/hub/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/modules.json +20 -0
  8. .cache/hub/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/sentence_bert_config.json +4 -0
  9. .cache/{.locks/models--sentence-transformers--all-MiniLM-L6-v2/53aa51172d142c89d9012cce15ae4d6cc0ca6895895114379cacb4fab128d9db.lock → models--sentence-transformers--all-MiniLM-L6-v2/.no_exist/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/added_tokens.json} +0 -0
  10. .cache/models--sentence-transformers--all-MiniLM-L6-v2/{blobs/53aa51172d142c89d9012cce15ae4d6cc0ca6895895114379cacb4fab128d9db.incomplete → .no_exist/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/chat_template.jinja} +0 -0
  11. .cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/1_Pooling/config.json +7 -0
  12. .cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/model.safetensors +3 -0
  13. .cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/special_tokens_map.json +1 -0
  14. .cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/tokenizer.json +0 -0
  15. .cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/tokenizer_config.json +1 -0
  16. .cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/vocab.txt +0 -0
  17. .cache/response_2ab720ffccd688afdc790db13e338c83.pkl +3 -0
  18. app.py +119 -12
  19. embedder.py +40 -2
  20. llm.py +69 -54
  21. main.py +119 -12
  22. parser.py +23 -0
  23. retriever.py +28 -3
.cache/chunks_6635d94cf9023c83521982b3043ec70c.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a4cef2cc09ef9d4ef7d8649bb78ec868e356dcfecbcd6dde23442a90497d407e
+ size 124546
.cache/embeddings_b24811e7d333cc7d5047e52b357abd7e.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:475523b57f8f6b89e62e668efef73309193b05f0f05bbeffb7f012ee952024f0
+ size 347400
.cache/hub/models--sentence-transformers--all-MiniLM-L6-v2/refs/main ADDED
@@ -0,0 +1 @@
+ c9745ed1d9f207416be6d2e6f8de32d1f16199bf
.cache/hub/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/1_Pooling/config.json ADDED
@@ -0,0 +1,7 @@
+ {
+ "word_embedding_dimension": 384,
+ "pooling_mode_cls_token": false,
+ "pooling_mode_mean_tokens": true,
+ "pooling_mode_max_tokens": false,
+ "pooling_mode_mean_sqrt_len_tokens": false
+ }
.cache/hub/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/README.md ADDED
@@ -0,0 +1,173 @@
+ ---
+ language: en
+ license: apache-2.0
+ library_name: sentence-transformers
+ tags:
+ - sentence-transformers
+ - feature-extraction
+ - sentence-similarity
+ - transformers
+ datasets:
+ - s2orc
+ - flax-sentence-embeddings/stackexchange_xml
+ - ms_marco
+ - gooaq
+ - yahoo_answers_topics
+ - code_search_net
+ - search_qa
+ - eli5
+ - snli
+ - multi_nli
+ - wikihow
+ - natural_questions
+ - trivia_qa
+ - embedding-data/sentence-compression
+ - embedding-data/flickr30k-captions
+ - embedding-data/altlex
+ - embedding-data/simple-wiki
+ - embedding-data/QQP
+ - embedding-data/SPECTER
+ - embedding-data/PAQ_pairs
+ - embedding-data/WikiAnswers
+ pipeline_tag: sentence-similarity
+ ---
+
+
+ # all-MiniLM-L6-v2
+ This is a [sentence-transformers](https://www.SBERT.net) model: It maps sentences & paragraphs to a 384 dimensional dense vector space and can be used for tasks like clustering or semantic search.
+
+ ## Usage (Sentence-Transformers)
+ Using this model becomes easy when you have [sentence-transformers](https://www.SBERT.net) installed:
+
+ ```
+ pip install -U sentence-transformers
+ ```
+
+ Then you can use the model like this:
+ ```python
+ from sentence_transformers import SentenceTransformer
+ sentences = ["This is an example sentence", "Each sentence is converted"]
+
+ model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
+ embeddings = model.encode(sentences)
+ print(embeddings)
+ ```
+
+ ## Usage (HuggingFace Transformers)
+ Without [sentence-transformers](https://www.SBERT.net), you can use the model like this: First, you pass your input through the transformer model, then you have to apply the right pooling-operation on-top of the contextualized word embeddings.
+
+ ```python
+ from transformers import AutoTokenizer, AutoModel
+ import torch
+ import torch.nn.functional as F
+
+ #Mean Pooling - Take attention mask into account for correct averaging
+ def mean_pooling(model_output, attention_mask):
+     token_embeddings = model_output[0] #First element of model_output contains all token embeddings
+     input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+     return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+
+
+ # Sentences we want sentence embeddings for
+ sentences = ['This is an example sentence', 'Each sentence is converted']
+
+ # Load model from HuggingFace Hub
+ tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
+ model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
+
+ # Tokenize sentences
+ encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
+
+ # Compute token embeddings
+ with torch.no_grad():
+     model_output = model(**encoded_input)
+
+ # Perform pooling
+ sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
+
+ # Normalize embeddings
+ sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
+
+ print("Sentence embeddings:")
+ print(sentence_embeddings)
+ ```
+
+ ------
+
+ ## Background
+
+ The project aims to train sentence embedding models on very large sentence-level datasets using a self-supervised
+ contrastive learning objective. We used the pretrained [`nreimers/MiniLM-L6-H384-uncased`](https://huggingface.co/nreimers/MiniLM-L6-H384-uncased) model and fine-tuned it on a
+ 1B sentence-pairs dataset. We use a contrastive learning objective: given a sentence from the pair, the model should predict which out of a set of randomly sampled other sentences was actually paired with it in our dataset.
+
+ We developed this model during the
+ [Community week using JAX/Flax for NLP & CV](https://discuss.huggingface.co/t/open-to-the-community-community-week-using-jax-flax-for-nlp-cv/7104),
+ organized by Hugging Face. We developed this model as part of the project:
+ [Train the Best Sentence Embedding Model Ever with 1B Training Pairs](https://discuss.huggingface.co/t/train-the-best-sentence-embedding-model-ever-with-1b-training-pairs/7354). We benefited from efficient hardware infrastructure to run the project: 7 TPUs v3-8, as well as support from Google's Flax, JAX, and Cloud team members on efficient deep learning frameworks.
+
+ ## Intended uses
+
+ Our model is intended to be used as a sentence and short paragraph encoder. Given an input text, it outputs a vector which captures
+ the semantic information. The sentence vector may be used for information retrieval, clustering or sentence similarity tasks.
+
+ By default, input text longer than 256 word pieces is truncated.
+
+
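A minimal sketch of that similarity use case with the sentence-transformers `util.cos_sim` helper (illustrative only; the example sentences and variable names are not from this repository):

```python
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
model.max_seq_length = 256  # inputs longer than 256 word pieces are truncated

queries = ["What is the grace period for premium payment?"]
passages = [
    "A grace period of thirty days is provided for premium payment.",
    "Cataract surgery has a waiting period of two years.",
]

# Encode to 384-dimensional vectors and score every query against every passage
query_embeddings = model.encode(queries)
passage_embeddings = model.encode(passages)
print(util.cos_sim(query_embeddings, passage_embeddings))  # higher = more similar
```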
+ ## Training procedure
+
+ ### Pre-training
+
+ We use the pretrained [`nreimers/MiniLM-L6-H384-uncased`](https://huggingface.co/nreimers/MiniLM-L6-H384-uncased) model. Please refer to the model card for more detailed information about the pre-training procedure.
+
+ ### Fine-tuning
+
+ We fine-tune the model using a contrastive objective. Formally, we compute the cosine similarity for every possible sentence pair in the batch.
+ We then apply a cross-entropy loss over these similarities, using the true pairs as the correct classes.
+
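A rough sketch of that in-batch objective (illustrative PyTorch, not the referenced `train_script.py`; the scaling factor is an assumption):

```python
import torch
import torch.nn.functional as F

def in_batch_contrastive_loss(anchor_emb, positive_emb, scale=20.0):
    """anchor_emb, positive_emb: (batch, dim) embeddings of paired sentences."""
    anchor_emb = F.normalize(anchor_emb, dim=1)
    positive_emb = F.normalize(positive_emb, dim=1)
    # Cosine similarity between every anchor and every candidate in the batch
    scores = anchor_emb @ positive_emb.T * scale  # shape: (batch, batch)
    # For anchor i the true pair is candidate i; all other columns act as negatives
    labels = torch.arange(scores.size(0), device=scores.device)
    return F.cross_entropy(scores, labels)
```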
+ #### Hyper parameters
+
+ We trained our model on a TPU v3-8. We trained the model for 100k steps using a batch size of 1024 (128 per TPU core).
+ We used a learning rate warm-up of 500 steps. The sequence length was limited to 128 tokens. We used the AdamW optimizer with
+ a 2e-5 learning rate. The full training script is accessible in this current repository: `train_script.py`.
+
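One way those hyperparameters could be wired up with the sentence-transformers training API (a sketch assuming the pre-3.0 `fit()` interface and a tiny placeholder dataset, not the actual `train_script.py`):

```python
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, InputExample, losses

# Placeholder pairs; the real run sampled ~1B pairs per data_config.json
train_examples = [
    InputExample(texts=["how long is the grace period?",
                        "a thirty day grace period applies to premium payment"]),
]
loader = DataLoader(train_examples, shuffle=True, batch_size=1024)

model = SentenceTransformer("nreimers/MiniLM-L6-H384-uncased")
model.max_seq_length = 128
loss = losses.MultipleNegativesRankingLoss(model)  # in-batch contrastive loss

model.fit(
    train_objectives=[(loader, loss)],
    epochs=1,
    warmup_steps=500,
    optimizer_params={"lr": 2e-5},  # AdamW-style optimizer with a 2e-5 learning rate
)
```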
+ #### Training data
+
+ We used the concatenation of multiple datasets to fine-tune our model. The total number of sentence pairs is above 1 billion.
+ We sampled each dataset with a weighted probability, the configuration of which is detailed in the `data_config.json` file.
+
+
+ | Dataset | Paper | Number of training tuples |
+ |--------------------------------------------------------|:----------------------------------------:|:--------------------------:|
+ | [Reddit comments (2015-2018)](https://github.com/PolyAI-LDN/conversational-datasets/tree/master/reddit) | [paper](https://arxiv.org/abs/1904.06472) | 726,484,430 |
+ | [S2ORC](https://github.com/allenai/s2orc) Citation pairs (Abstracts) | [paper](https://aclanthology.org/2020.acl-main.447/) | 116,288,806 |
+ | [WikiAnswers](https://github.com/afader/oqa#wikianswers-corpus) Duplicate question pairs | [paper](https://doi.org/10.1145/2623330.2623677) | 77,427,422 |
+ | [PAQ](https://github.com/facebookresearch/PAQ) (Question, Answer) pairs | [paper](https://arxiv.org/abs/2102.07033) | 64,371,441 |
+ | [S2ORC](https://github.com/allenai/s2orc) Citation pairs (Titles) | [paper](https://aclanthology.org/2020.acl-main.447/) | 52,603,982 |
+ | [S2ORC](https://github.com/allenai/s2orc) (Title, Abstract) | [paper](https://aclanthology.org/2020.acl-main.447/) | 41,769,185 |
+ | [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) (Title, Body) pairs | - | 25,316,456 |
+ | [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) (Title+Body, Answer) pairs | - | 21,396,559 |
+ | [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) (Title, Answer) pairs | - | 21,396,559 |
+ | [MS MARCO](https://microsoft.github.io/msmarco/) triplets | [paper](https://doi.org/10.1145/3404835.3462804) | 9,144,553 |
+ | [GOOAQ: Open Question Answering with Diverse Answer Types](https://github.com/allenai/gooaq) | [paper](https://arxiv.org/pdf/2104.08727.pdf) | 3,012,496 |
+ | [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Title, Answer) | [paper](https://proceedings.neurips.cc/paper/2015/hash/250cf8b51c773f3f8dc8b4be867a9a02-Abstract.html) | 1,198,260 |
+ | [Code Search](https://huggingface.co/datasets/code_search_net) | - | 1,151,414 |
+ | [COCO](https://cocodataset.org/#home) Image captions | [paper](https://link.springer.com/chapter/10.1007%2F978-3-319-10602-1_48) | 828,395 |
+ | [SPECTER](https://github.com/allenai/specter) citation triplets | [paper](https://doi.org/10.18653/v1/2020.acl-main.207) | 684,100 |
+ | [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Question, Answer) | [paper](https://proceedings.neurips.cc/paper/2015/hash/250cf8b51c773f3f8dc8b4be867a9a02-Abstract.html) | 681,164 |
+ | [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Title, Question) | [paper](https://proceedings.neurips.cc/paper/2015/hash/250cf8b51c773f3f8dc8b4be867a9a02-Abstract.html) | 659,896 |
+ | [SearchQA](https://huggingface.co/datasets/search_qa) | [paper](https://arxiv.org/abs/1704.05179) | 582,261 |
+ | [Eli5](https://huggingface.co/datasets/eli5) | [paper](https://doi.org/10.18653/v1/p19-1346) | 325,475 |
+ | [Flickr 30k](https://shannon.cs.illinois.edu/DenotationGraph/) | [paper](https://transacl.org/ojs/index.php/tacl/article/view/229/33) | 317,695 |
+ | [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) Duplicate questions (titles) | | 304,525 |
+ | AllNLI ([SNLI](https://nlp.stanford.edu/projects/snli/) and [MultiNLI](https://cims.nyu.edu/~sbowman/multinli/)) | [paper SNLI](https://doi.org/10.18653/v1/d15-1075), [paper MultiNLI](https://doi.org/10.18653/v1/n18-1101) | 277,230 |
+ | [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) Duplicate questions (bodies) | | 250,519 |
+ | [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) Duplicate questions (titles+bodies) | | 250,460 |
+ | [Sentence Compression](https://github.com/google-research-datasets/sentence-compression) | [paper](https://www.aclweb.org/anthology/D13-1155/) | 180,000 |
+ | [Wikihow](https://github.com/pvl/wikihow_pairs_dataset) | [paper](https://arxiv.org/abs/1810.09305) | 128,542 |
+ | [Altlex](https://github.com/chridey/altlex/) | [paper](https://aclanthology.org/P16-1135.pdf) | 112,696 |
+ | [Quora Question Triplets](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) | - | 103,663 |
+ | [Simple Wikipedia](https://cs.pomona.edu/~dkauchak/simplification/) | [paper](https://www.aclweb.org/anthology/P11-2117/) | 102,225 |
+ | [Natural Questions (NQ)](https://ai.google.com/research/NaturalQuestions) | [paper](https://transacl.org/ojs/index.php/tacl/article/view/1455) | 100,231 |
+ | [SQuAD2.0](https://rajpurkar.github.io/SQuAD-explorer/) | [paper](https://aclanthology.org/P18-2124.pdf) | 87,599 |
+ | [TriviaQA](https://huggingface.co/datasets/trivia_qa) | - | 73,346 |
+ | **Total** | | **1,170,060,424** |
.cache/hub/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/config_sentence_transformers.json ADDED
@@ -0,0 +1,7 @@
+ {
+ "__version__": {
+ "sentence_transformers": "2.0.0",
+ "transformers": "4.6.1",
+ "pytorch": "1.8.1"
+ }
+ }
.cache/hub/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/modules.json ADDED
@@ -0,0 +1,20 @@
+ [
+ {
+ "idx": 0,
+ "name": "0",
+ "path": "",
+ "type": "sentence_transformers.models.Transformer"
+ },
+ {
+ "idx": 1,
+ "name": "1",
+ "path": "1_Pooling",
+ "type": "sentence_transformers.models.Pooling"
+ },
+ {
+ "idx": 2,
+ "name": "2",
+ "path": "2_Normalize",
+ "type": "sentence_transformers.models.Normalize"
+ }
+ ]
.cache/hub/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/sentence_bert_config.json ADDED
@@ -0,0 +1,4 @@
+ {
+ "max_seq_length": 256,
+ "do_lower_case": false
+ }
.cache/{.locks/models--sentence-transformers--all-MiniLM-L6-v2/53aa51172d142c89d9012cce15ae4d6cc0ca6895895114379cacb4fab128d9db.lock → models--sentence-transformers--all-MiniLM-L6-v2/.no_exist/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/added_tokens.json} RENAMED
File without changes
.cache/models--sentence-transformers--all-MiniLM-L6-v2/{blobs/53aa51172d142c89d9012cce15ae4d6cc0ca6895895114379cacb4fab128d9db.incomplete → .no_exist/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/chat_template.jinja} RENAMED
File without changes
.cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/1_Pooling/config.json ADDED
@@ -0,0 +1,7 @@
+ {
+ "word_embedding_dimension": 384,
+ "pooling_mode_cls_token": false,
+ "pooling_mode_mean_tokens": true,
+ "pooling_mode_max_tokens": false,
+ "pooling_mode_mean_sqrt_len_tokens": false
+ }
.cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:53aa51172d142c89d9012cce15ae4d6cc0ca6895895114379cacb4fab128d9db
+ size 90868376
.cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
.cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
.cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"do_lower_case": true, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "name_or_path": "nreimers/MiniLM-L6-H384-uncased", "do_basic_tokenize": true, "never_split": null, "tokenizer_class": "BertTokenizer", "model_max_length": 512}
.cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
.cache/response_2ab720ffccd688afdc790db13e338c83.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1c5853e52bd3fdc0bdf05ca5b73769bc17fe8f44fe56271a78a87f155c5de6da
+ size 429
app.py CHANGED
@@ -1,6 +1,8 @@
1
  import os
2
  import warnings
3
  import logging
 
 
4
 
5
  # Set up cache directory for HuggingFace models
6
  cache_dir = os.path.join(os.getcwd(), ".cache")
@@ -22,7 +24,7 @@ from fastapi import FastAPI, Request, HTTPException, Depends, Header
22
  from fastapi.middleware.cors import CORSMiddleware
23
  from pydantic import BaseModel
24
  from parser import parse_pdf_from_url, parse_pdf_from_file
25
- from embedder import build_faiss_index
26
  from retriever import retrieve_chunks
27
  from llm import query_gemini
28
  import uvicorn
@@ -38,6 +40,14 @@ app.add_middleware(
38
  allow_headers=["*"],
39
  )
40
41
  @app.get("/")
42
  async def root():
43
  return {"message": "HackRx Insurance Policy Assistant API is running!"}
@@ -67,24 +77,52 @@ def verify_token(authorization: str = Header(None)):
67
 
68
  @app.post("/api/v1/hackrx/run")
69
  async def run_query(request: QueryRequest, token: str = Depends(verify_token)):
 
 
 
70
  try:
71
  print(f"Processing {len(request.questions)} questions...")
72
 
 
 
73
  text_chunks = parse_pdf_from_url(request.documents)
 
 
 
74
  print(f"Extracted {len(text_chunks)} text chunks from PDF")
75
 
 
 
76
  index, texts = build_faiss_index(text_chunks)
 
 
 
77
 
78
- # Get relevant chunks for all questions at once
 
79
  all_chunks = set()
80
- for question in request.questions:
 
81
  top_chunks = retrieve_chunks(index, texts, question)
 
 
82
  all_chunks.update(top_chunks)
83
 
84
- # Process all questions in a single LLM call
85
  print(f"Processing all {len(request.questions)} questions in batch...")
86
  response = query_gemini(request.questions, list(all_chunks))
 
 
 
87
 
 
 
88
  # Extract answers from the JSON response
89
  if isinstance(response, dict) and "answers" in response:
90
  answers = response["answers"]
@@ -100,35 +138,83 @@ async def run_query(request: QueryRequest, token: str = Depends(verify_token)):
100
  answers.append("Not Found")
101
  answers = answers[:len(request.questions)]
102
 
 
 
 
103
  print(f"Generated {len(answers)} answers")
104
- return { "answers": answers }
105
 
106
  except Exception as e:
107
- print(f"Error: {str(e)}")
 
108
  raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
109
 
110
  @app.post("/api/v1/hackrx/local")
111
  async def run_local_query(request: LocalQueryRequest):
 
 
 
112
  try:
113
  print(f"Processing local document: {request.document_path}")
114
  print(f"Processing {len(request.questions)} questions...")
115
 
116
- # Parse local PDF file
 
117
  text_chunks = parse_pdf_from_file(request.document_path)
 
 
 
118
  print(f"Extracted {len(text_chunks)} text chunks from local PDF")
119
 
 
 
120
  index, texts = build_faiss_index(text_chunks)
 
 
 
121
 
122
- # Get relevant chunks for all questions at once
 
123
  all_chunks = set()
124
- for question in request.questions:
 
125
  top_chunks = retrieve_chunks(index, texts, question)
 
 
126
  all_chunks.update(top_chunks)
127
 
128
- # Process all questions in a single LLM call
129
  print(f"Processing all {len(request.questions)} questions in batch...")
130
  response = query_gemini(request.questions, list(all_chunks))
 
 
 
131
 
 
 
132
  # Extract answers from the JSON response
133
  if isinstance(response, dict) and "answers" in response:
134
  answers = response["answers"]
@@ -144,11 +230,32 @@ async def run_local_query(request: LocalQueryRequest):
144
  answers.append("Not Found")
145
  answers = answers[:len(request.questions)]
146
 
 
 
 
147
  print(f"Generated {len(answers)} answers")
148
- return { "answers": answers }
149
 
150
  except Exception as e:
151
- print(f"Error: {str(e)}")
 
152
  raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
153
 
154
  if __name__ == "__main__":
 
1
  import os
2
  import warnings
3
  import logging
4
+ import time
5
+ from datetime import datetime
6
 
7
  # Set up cache directory for HuggingFace models
8
  cache_dir = os.path.join(os.getcwd(), ".cache")
 
24
  from fastapi.middleware.cors import CORSMiddleware
25
  from pydantic import BaseModel
26
  from parser import parse_pdf_from_url, parse_pdf_from_file
27
+ from embedder import build_faiss_index, preload_model
28
  from retriever import retrieve_chunks
29
  from llm import query_gemini
30
  import uvicorn
 
40
  allow_headers=["*"],
41
  )
42
 
43
+ # Preload the model at startup
44
+ @app.on_event("startup")
45
+ async def startup_event():
46
+ print("Starting up HackRx Insurance Policy Assistant...")
47
+ print("Preloading sentence transformer model...")
48
+ preload_model()
49
+ print("Model preloading completed. API is ready to serve requests!")
50
+
51
  @app.get("/")
52
  async def root():
53
  return {"message": "HackRx Insurance Policy Assistant API is running!"}
 
77
 
78
  @app.post("/api/v1/hackrx/run")
79
  async def run_query(request: QueryRequest, token: str = Depends(verify_token)):
80
+ start_time = time.time()
81
+ timing_data = {}
82
+
83
  try:
84
  print(f"Processing {len(request.questions)} questions...")
85
 
86
+ # Time PDF parsing
87
+ pdf_start = time.time()
88
  text_chunks = parse_pdf_from_url(request.documents)
89
+ pdf_time = time.time() - pdf_start
90
+ timing_data['pdf_parsing'] = round(pdf_time, 2)
91
+ print(f"PDF Parsing took: {pdf_time:.2f} seconds")
92
  print(f"Extracted {len(text_chunks)} text chunks from PDF")
93
 
94
+ # Time FAISS index building
95
+ index_start = time.time()
96
  index, texts = build_faiss_index(text_chunks)
97
+ index_time = time.time() - index_start
98
+ timing_data['faiss_index_building'] = round(index_time, 2)
99
+ print(f"FAISS Index Building took: {index_time:.2f} seconds")
100
 
101
+ # Time chunk retrieval for all questions
102
+ retrieval_start = time.time()
103
  all_chunks = set()
104
+ for i, question in enumerate(request.questions):
105
+ question_start = time.time()
106
  top_chunks = retrieve_chunks(index, texts, question)
107
+ question_time = time.time() - question_start
108
+ print(f"Question {i+1} retrieval took: {question_time:.2f} seconds")
109
  all_chunks.update(top_chunks)
110
 
111
+ retrieval_time = time.time() - retrieval_start
112
+ timing_data['chunk_retrieval'] = round(retrieval_time, 2)
113
+ print(f"Total Chunk Retrieval took: {retrieval_time:.2f} seconds")
114
+ print(f"Retrieved {len(all_chunks)} unique chunks")
115
+
116
+ # Time LLM processing
117
+ llm_start = time.time()
118
  print(f"Processing all {len(request.questions)} questions in batch...")
119
  response = query_gemini(request.questions, list(all_chunks))
120
+ llm_time = time.time() - llm_start
121
+ timing_data['llm_processing'] = round(llm_time, 2)
122
+ print(f"LLM Processing took: {llm_time:.2f} seconds")
123
 
124
+ # Time response processing
125
+ response_start = time.time()
126
  # Extract answers from the JSON response
127
  if isinstance(response, dict) and "answers" in response:
128
  answers = response["answers"]
 
138
  answers.append("Not Found")
139
  answers = answers[:len(request.questions)]
140
 
141
+ response_time = time.time() - response_start
142
+ timing_data['response_processing'] = round(response_time, 2)
143
+ print(f"Response Processing took: {response_time:.2f} seconds")
144
  print(f"Generated {len(answers)} answers")
145
+
146
+ # Calculate total time
147
+ total_time = time.time() - start_time
148
+ timing_data['total_time'] = round(total_time, 2)
149
+ timing_data['timestamp'] = datetime.now().isoformat()
150
+
151
+ print(f"\n=== TIMING BREAKDOWN ===")
152
+ print(f"PDF Parsing: {timing_data['pdf_parsing']}s")
153
+ print(f"FAISS Index Building: {timing_data['faiss_index_building']}s")
154
+ print(f"Chunk Retrieval: {timing_data['chunk_retrieval']}s")
155
+ print(f"LLM Processing: {timing_data['llm_processing']}s")
156
+ print(f"Response Processing: {timing_data['response_processing']}s")
157
+ print(f"TOTAL TIME: {timing_data['total_time']}s")
158
+ print(f"=======================\n")
159
+
160
+ return {
161
+ "answers": answers
162
+ }
163
 
164
  except Exception as e:
165
+ total_time = time.time() - start_time
166
+ print(f"Error after {total_time:.2f} seconds: {str(e)}")
167
  raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
168
 
169
  @app.post("/api/v1/hackrx/local")
170
  async def run_local_query(request: LocalQueryRequest):
171
+ start_time = time.time()
172
+ timing_data = {}
173
+
174
  try:
175
  print(f"Processing local document: {request.document_path}")
176
  print(f"Processing {len(request.questions)} questions...")
177
 
178
+ # Time local PDF parsing
179
+ pdf_start = time.time()
180
  text_chunks = parse_pdf_from_file(request.document_path)
181
+ pdf_time = time.time() - pdf_start
182
+ timing_data['pdf_parsing'] = round(pdf_time, 2)
183
+ print(f"Local PDF Parsing took: {pdf_time:.2f} seconds")
184
  print(f"Extracted {len(text_chunks)} text chunks from local PDF")
185
 
186
+ # Time FAISS index building
187
+ index_start = time.time()
188
  index, texts = build_faiss_index(text_chunks)
189
+ index_time = time.time() - index_start
190
+ timing_data['faiss_index_building'] = round(index_time, 2)
191
+ print(f"FAISS Index Building took: {index_time:.2f} seconds")
192
 
193
+ # Time chunk retrieval for all questions
194
+ retrieval_start = time.time()
195
  all_chunks = set()
196
+ for i, question in enumerate(request.questions):
197
+ question_start = time.time()
198
  top_chunks = retrieve_chunks(index, texts, question)
199
+ question_time = time.time() - question_start
200
+ print(f"Question {i+1} retrieval took: {question_time:.2f} seconds")
201
  all_chunks.update(top_chunks)
202
 
203
+ retrieval_time = time.time() - retrieval_start
204
+ timing_data['chunk_retrieval'] = round(retrieval_time, 2)
205
+ print(f"Total Chunk Retrieval took: {retrieval_time:.2f} seconds")
206
+ print(f"Retrieved {len(all_chunks)} unique chunks")
207
+
208
+ # Time LLM processing
209
+ llm_start = time.time()
210
  print(f"Processing all {len(request.questions)} questions in batch...")
211
  response = query_gemini(request.questions, list(all_chunks))
212
+ llm_time = time.time() - llm_start
213
+ timing_data['llm_processing'] = round(llm_time, 2)
214
+ print(f"LLM Processing took: {llm_time:.2f} seconds")
215
 
216
+ # Time response processing
217
+ response_start = time.time()
218
  # Extract answers from the JSON response
219
  if isinstance(response, dict) and "answers" in response:
220
  answers = response["answers"]
 
230
  answers.append("Not Found")
231
  answers = answers[:len(request.questions)]
232
 
233
+ response_time = time.time() - response_start
234
+ timing_data['response_processing'] = round(response_time, 2)
235
+ print(f"Response Processing took: {response_time:.2f} seconds")
236
  print(f"Generated {len(answers)} answers")
237
+
238
+ # Calculate total time
239
+ total_time = time.time() - start_time
240
+ timing_data['total_time'] = round(total_time, 2)
241
+ timing_data['timestamp'] = datetime.now().isoformat()
242
+
243
+ print(f"\n=== TIMING BREAKDOWN ===")
244
+ print(f"PDF Parsing: {timing_data['pdf_parsing']}s")
245
+ print(f"FAISS Index Building: {timing_data['faiss_index_building']}s")
246
+ print(f"Chunk Retrieval: {timing_data['chunk_retrieval']}s")
247
+ print(f"LLM Processing: {timing_data['llm_processing']}s")
248
+ print(f"Response Processing: {timing_data['response_processing']}s")
249
+ print(f"TOTAL TIME: {timing_data['total_time']}s")
250
+ print(f"=======================\n")
251
+
252
+ return {
253
+ "answers": answers
254
+ }
255
 
256
  except Exception as e:
257
+ total_time = time.time() - start_time
258
+ print(f"Error after {total_time:.2f} seconds: {str(e)}")
259
  raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
260
 
261
  if __name__ == "__main__":
embedder.py CHANGED
@@ -2,6 +2,7 @@ import faiss
2
  from sentence_transformers import SentenceTransformer
3
  import numpy as np
4
  import os
 
5
 
6
  # Set up cache directory in a writable location
7
  cache_dir = os.path.join(os.getcwd(), ".cache")
@@ -12,26 +13,63 @@ os.environ['TRANSFORMERS_CACHE'] = cache_dir
12
  # Initialize model as None - will be loaded lazily
13
  _model = None
14
 
15
- def get_model():
16
- """Get the sentence transformer model, loading it lazily if needed"""
17
  global _model
18
  if _model is None:
 
 
19
  try:
20
  _model = SentenceTransformer("all-MiniLM-L6-v2", cache_folder=cache_dir)
 
 
21
  except Exception as e:
22
  print(f"Error loading model: {e}")
23
  # Fallback to a different model if the first one fails
24
  try:
25
  _model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", cache_folder=cache_dir)
 
 
26
  except Exception as e2:
27
  print(f"Error loading fallback model: {e2}")
28
  raise
29
  return _model
30
 
 
 
 
 
 
 
 
 
 
31
  def build_faiss_index(chunks):
 
 
 
 
 
32
  model = get_model()
 
 
 
 
 
33
  embeddings = model.encode(chunks)
 
 
 
 
 
 
34
  dimension = embeddings.shape[1]
35
  index = faiss.IndexFlatL2(dimension)
36
  index.add(np.array(embeddings))
 
 
 
 
 
 
37
  return index, chunks
 
2
  from sentence_transformers import SentenceTransformer
3
  import numpy as np
4
  import os
5
+ import time
6
 
7
  # Set up cache directory in a writable location
8
  cache_dir = os.path.join(os.getcwd(), ".cache")
 
13
  # Initialize model as None - will be loaded lazily
14
  _model = None
15
 
16
+ def preload_model():
17
+ """Preload the sentence transformer model at startup"""
18
  global _model
19
  if _model is None:
20
+ model_start = time.time()
21
+ print("Preloading sentence transformer model...")
22
  try:
23
  _model = SentenceTransformer("all-MiniLM-L6-v2", cache_folder=cache_dir)
24
+ model_time = time.time() - model_start
25
+ print(f"Model preloading completed in {model_time:.2f} seconds")
26
  except Exception as e:
27
  print(f"Error loading model: {e}")
28
  # Fallback to a different model if the first one fails
29
  try:
30
  _model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", cache_folder=cache_dir)
31
+ model_time = time.time() - model_start
32
+ print(f"Fallback model preloading completed in {model_time:.2f} seconds")
33
  except Exception as e2:
34
  print(f"Error loading fallback model: {e2}")
35
  raise
36
  return _model
37
 
38
+ def get_model():
39
+ """Get the sentence transformer model, loading it lazily if needed"""
40
+ global _model
41
+ if _model is None:
42
+ # If model is not preloaded, load it now (should not happen in production)
43
+ print("Warning: Model not preloaded, loading now...")
44
+ return preload_model()
45
+ return _model
46
+
47
  def build_faiss_index(chunks):
48
+ start_time = time.time()
49
+ print(f"Building FAISS index for {len(chunks)} chunks...")
50
+
51
+ # Time model retrieval (should be instant now)
52
+ model_start = time.time()
53
  model = get_model()
54
+ model_time = time.time() - model_start
55
+ print(f"Model retrieval took: {model_time:.3f} seconds")
56
+
57
+ # Time embedding generation
58
+ embed_start = time.time()
59
  embeddings = model.encode(chunks)
60
+ embed_time = time.time() - embed_start
61
+ print(f"Embedding generation took: {embed_time:.2f} seconds")
62
+ print(f"Generated embeddings shape: {embeddings.shape}")
63
+
64
+ # Time FAISS index creation
65
+ index_start = time.time()
66
  dimension = embeddings.shape[1]
67
  index = faiss.IndexFlatL2(dimension)
68
  index.add(np.array(embeddings))
69
+ index_time = time.time() - index_start
70
+ print(f"FAISS index creation took: {index_time:.2f} seconds")
71
+
72
+ total_time = time.time() - start_time
73
+ print(f"Total FAISS index building took: {total_time:.2f} seconds")
74
+
75
  return index, chunks
llm.py CHANGED
@@ -1,6 +1,7 @@
1
  import google.generativeai as genai
2
  import os
3
  import json
 
4
  from dotenv import load_dotenv
5
  load_dotenv()
6
 
@@ -12,78 +13,79 @@ print(f"Google API Key loaded: {api_key[:10]}..." if api_key else "No API key fo
12
  genai.configure(api_key=api_key)
13
 
14
  def query_gemini(questions, contexts):
 
 
 
15
  try:
 
 
16
  context = "\n\n".join(contexts)
 
 
 
17
 
 
 
18
  # Create a numbered list of questions
19
  questions_text = "\n".join([f"{i+1}. {q}" for i, q in enumerate(questions)])
20
 
21
- prompt = f"""You are an insurance policy assistant. Based on the below document snippets, answer the following questions precisely.
 
22
 
23
- IMPORTANT INSTRUCTIONS:
24
- 1. Only respond based on the context provided. If information is not found in the context, respond with "Not Found".
25
- 2. Provide clear, concise answers that directly address each question.
26
- 3. Return your response in the exact JSON format shown below.
27
- 4. Give complete, informative responses based on the provided context.
28
- 5. Answer each question in the order provided.
29
 
30
- Context:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  {context}
32
 
33
- Questions:
34
  {questions_text}
35
 
36
- Response Should Good And Refined And Details But also Not So Large For Small Things and Too small Also Not Recommended With One Informative Line.
37
- Below I Provided For Current Context Coming How I Can need The Sentences Phrases And Words like is Given For Understanding and Reference:
38
- -Old_Ai_Response_Format
39
- {{
40
- "answers": [
41
- "The grace period for premium payment is thirty days.",
42
- "Expenses related to the treatment of a Pre-Existing Disease (PED) and its direct complications shall be excluded until the expiry of thirty six (36) months of continuous coverage after the date of inception of the first policy.",
43
- "Yes, the company shall indemnify Maternity Expenses as described in section 3.1.14 for any female Insured Person, and also Pre-Natal and Post-Natal Hospitalisation expenses per delivery, including expenses for necessary vaccination for New Born Baby, subject to the limit as shown in the Table of Benefits. The female Insured Person should have been continuously covered for at least 24 months before availing this benefit.",
44
- "Cataract surgery has a waiting period of two years.",
45
- "Yes, the Company shall indemnify the Medical Expenses incurred in respect of an organ donor’s Hospitalisation during the Policy Period for harvesting of the organ donated to an Insured Person, provided that certain conditions are met as outlined in section 3.1.7.",
46
- "On renewal of policies with a term of one year, a NCD of flat 5% shall be allowed on the * base premium, provided claims are not reported in the expiring Policy.\nOn renewal of policies with a term exceeding one year, the NCD amount with respect to each claim free policy year shall be aggregated and allowed on renewal. Aggregate amount of NCD allowed shall not exceed flat 5% of the total base premium for the term of the policy.",
47
- "Yes, expenses of health check up shall be reimbursed (irrespective of past claims) at the end of a block of two continuous policy years, provided the Policy has been continuously renewed with the Company without a break. Expenses payable are subject to the limit stated in the Table of Benefits.",
48
- "Hospital means any institution established for in-patient care and day care treatment of disease/ injuries and which has been registered as a hospital with the local authorities under the Clinical Establishments (Registration and Regulation) Act, 2010 or under the enactments specified under Schedule of Section 56(1) of the said Act, OR complies with all minimum criteria as under:\ni. has qualified nursing staff under its employment round the clock;\nii. has at least ten inpatient beds, in those towns having a population of less than ten lacs and fifteen inpatient beds in all other places;\niii. has qualified medical practitioner (s) in charge round the clock;\niv. has a fully equipped operation theatre of its own where surgical procedures are carried out \nv. maintains daily records of patients and shall make these accessible to the Company’s authorized personnel.",
49
- "The Company shall indemnify Medical Expenses incurred for Inpatient Care treatment under Ayurveda, Yoga and Naturopathy, Unani, Siddha and Homeopathy systems of medicines during each Policy Period up to the limit of Sum Insured as specified in the Policy Schedule in any AYUSH Hospital.",
50
- "For Plan A, Room Charges are limited to Up to 1% of SI or actual, whichever is lower and ICU Charges are limited to Up to 2% of SI or actual, whichever is lower, per day per insured person."
51
- ]
52
- }}
53
-
54
- -New_Ai_Response_Format_And Wordings Given Like
55
- {{
56
- "answers": [
57
- "A grace period of thirty days is provided for premium payment after the due date to renew or continue the policy without losing continuity benefits.",
58
- "There is a waiting period of thirty-six (36) months of continuous coverage from the first policy inception for pre-existing diseases and their direct complications to be covered.",
59
- "Yes, the policy covers maternity expenses, including childbirth and lawful medical termination of pregnancy. To be eligible, the female insured person must have been continuously covered for at least 24 months. The benefit is limited to two deliveries or terminations during the policy period.",
60
- "The policy has a specific waiting period of two (2) years for cataract surgery.",
61
- "Yes, the policy indemnifies the medical expenses for the organ donor's hospitalization for the purpose of harvesting the organ, provided the organ is for an insured person and the donation complies with the Transplantation of Human Organs Act, 1994.",
62
- "A No Claim Discount of 5% on the base premium is offered on renewal for a one-year policy term if no claims were made in the preceding year. The maximum aggregate NCD is capped at 5% of the total base premium.",
63
- "Yes, the policy reimburses expenses for health check-ups at the end of every block of two continuous policy years, provided the policy has been renewed without a break. The amount is subject to the limits specified in the Table of Benefits.",
64
- "A hospital is defined as an institution with at least 10 inpatient beds (in towns with a population below ten lakhs) or 15 beds (in all other places), with qualified nursing staff and medical practitioners available 24/7, a fully equipped operation theatre, and which maintains daily records of patients.",
65
- "The policy covers medical expenses for inpatient treatment under Ayurveda, Yoga, Naturopathy, Unani, Siddha, and Homeopathy systems up to the Sum Insured limit, provided the treatment is taken in an AYUSH Hospital.",
66
- "Yes, for Plan A, the daily room rent is capped at 1% of the Sum Insured, and ICU charges are capped at 2% of the Sum Insured. These limits do not apply if the treatment is for a listed procedure in a Preferred Provider Network (PPN)."
67
- ]
68
- }}
69
 
70
- ## The Above Is Reference How Can Give Output Wordings Back To The Question is Given For References
71
 
72
- Return your response in this exact JSON format:
73
- {{
74
- "answers": [
75
- "Answer to question 1",
76
- "Answer to question 2",
77
- "Answer to question 3",
78
- ...
79
- ]
80
- }}
81
 
82
- Ensure each answer is comprehensive and directly addresses the corresponding question. If information is not found in the context for any question, respond with "Not Found" for that question."""
83
 
 
 
 
 
 
 
84
  model = genai.GenerativeModel('gemini-2.0-flash-exp')
85
  response = model.generate_content(prompt)
 
 
 
 
 
86
  response_text = response.text.strip()
 
87
 
88
  # Try to parse the response as JSON
89
  try:
@@ -94,12 +96,25 @@ Ensure each answer is comprehensive and directly addresses the corresponding que
94
  response_text = response_text.replace("```", "").strip()
95
 
96
  parsed_response = json.loads(response_text)
 
 
 
 
 
 
97
  return parsed_response
98
  except json.JSONDecodeError:
99
  # If JSON parsing fails, return a structured response
 
 
100
  print(f"Failed to parse JSON response: {response_text}")
 
 
 
 
101
  return {"answers": ["Error parsing response"] * len(questions)}
102
 
103
  except Exception as e:
104
- print(f"Error in query_gemini: {str(e)}")
 
105
  return {"answers": [f"Error generating response: {str(e)}"] * len(questions)}
 
1
  import google.generativeai as genai
2
  import os
3
  import json
4
+ import time
5
  from dotenv import load_dotenv
6
  load_dotenv()
7
 
 
13
  genai.configure(api_key=api_key)
14
 
15
  def query_gemini(questions, contexts):
16
+ start_time = time.time()
17
+ print(f"Starting LLM processing for {len(questions)} questions with {len(contexts)} context chunks")
18
+
19
  try:
20
+ # Time context preparation
21
+ context_start = time.time()
22
  context = "\n\n".join(contexts)
23
+ context_time = time.time() - context_start
24
+ print(f"Context preparation took: {context_time:.2f} seconds")
25
+ print(f"Total context length: {len(context)} characters")
26
 
27
+ # Time prompt preparation
28
+ prompt_start = time.time()
29
  # Create a numbered list of questions
30
  questions_text = "\n".join([f"{i+1}. {q}" for i, q in enumerate(questions)])
31
 
32
+ prompt = f"""
33
+ You are an intelligent insurance assistant trained to answer questions using insurance documents. Based on the context provided below, respond to each question with a **well-informed, complete, and professionally worded answer**.
34
 
35
+ 🎯 SCORING & OUTPUT GOAL:
36
+ - Responses are part of an evaluated system.
37
+ - Each answer should be **accurate**, **complete**, and **well-phrased** — ideally around **1–2 full sentences**.
38
+ - Avoid short/fragmented answers or long multi-paragraph explanations.
39
+ - Always write like an insurance advisor addressing a customer clearly.
 
40
 
41
+ 📘 INSTRUCTIONS:
42
+ 1. **Only use the provided context** to answer each question. If the answer is not found, respond with exactly: `"Not Found"`.
43
+ 2. Keep answers concise **but not vague**. Include all **key points** (such as limits, durations, conditions) in one or two complete sentences.
44
+ 3. DO NOT use bullet points, partial phrases, or excessive legal text. DO NOT repeat the question in the answer.
45
+ 4. Match the tone and format of these examples:
46
+ - "A grace period of thirty days is provided for premium payment after the due date to renew or continue the policy without losing continuity benefits."
47
+ - "Yes, the policy covers maternity expenses, including childbirth and lawful medical termination of pregnancy. To be eligible, the female insured person must have been continuously covered for at least 24 months. The benefit is limited to two deliveries or terminations during the policy period."
48
+ - "Yes, the policy indemnifies the medical expenses for the organ donor's hospitalization for the purpose of harvesting the organ, provided the organ is for an insured person and the donation complies with the Transplantation of Human Organs Act, 1994."
49
+ - "Not Found"
50
+
51
+ 📤 RETURN FORMAT:
52
+ Respond strictly using this JSON structure:
53
+
54
+ {{
55
+ "answers": [
56
+ "Answer to question 1",
57
+ "Answer to question 2",
58
+ ...
59
+ ]
60
+ }}
61
+
62
+ 📚 CONTEXT:
63
  {context}
64
 
65
+ ❓ QUESTIONS:
66
  {questions_text}
67
 
68
+ Your task: Provide accurate, refined answers based on the document context above. Use the tone and structure shown. Be concise but thorough. Only include what is supported in the context. Use "Not Found" if the answer is missing.
69
+ """
74
+ prompt_time = time.time() - prompt_start
75
+ print(f"Prompt preparation took: {prompt_time:.2f} seconds")
76
+ print(f"Total prompt length: {len(prompt)} characters")
77
+
78
+ # Time model initialization and API call
79
+ api_start = time.time()
80
  model = genai.GenerativeModel('gemini-2.0-flash-exp')
81
  response = model.generate_content(prompt)
82
+ api_time = time.time() - api_start
83
+ print(f"Gemini API call took: {api_time:.2f} seconds")
84
+
85
+ # Time response processing
86
+ process_start = time.time()
87
  response_text = response.text.strip()
88
+ print(f"Raw response length: {len(response_text)} characters")
89
 
90
  # Try to parse the response as JSON
91
  try:
 
96
  response_text = response_text.replace("```", "").strip()
97
 
98
  parsed_response = json.loads(response_text)
99
+ process_time = time.time() - process_start
100
+ print(f"Response processing took: {process_time:.2f} seconds")
101
+
102
+ total_time = time.time() - start_time
103
+ print(f"Total LLM processing took: {total_time:.2f} seconds")
104
+
105
  return parsed_response
106
  except json.JSONDecodeError:
107
  # If JSON parsing fails, return a structured response
108
+ process_time = time.time() - process_start
109
+ print(f"Response processing took: {process_time:.2f} seconds (JSON parsing failed)")
110
  print(f"Failed to parse JSON response: {response_text}")
111
+
112
+ total_time = time.time() - start_time
113
+ print(f"Total LLM processing took: {total_time:.2f} seconds")
114
+
115
  return {"answers": ["Error parsing response"] * len(questions)}
116
 
117
  except Exception as e:
118
+ total_time = time.time() - start_time
119
+ print(f"Error in query_gemini after {total_time:.2f} seconds: {str(e)}")
120
  return {"answers": [f"Error generating response: {str(e)}"] * len(questions)}
main.py CHANGED
@@ -1,6 +1,8 @@
1
  import os
2
  import warnings
3
  import logging
 
 
4
 
5
  # Suppress TensorFlow warnings
6
  os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
@@ -16,7 +18,7 @@ from fastapi import FastAPI, Request, HTTPException, Depends, Header
16
  from fastapi.middleware.cors import CORSMiddleware
17
  from pydantic import BaseModel
18
  from parser import parse_pdf_from_url, parse_pdf_from_file
19
- from embedder import build_faiss_index
20
  from retriever import retrieve_chunks
21
  from llm import query_gemini
22
  import uvicorn
@@ -32,6 +34,14 @@ app.add_middleware(
32
  allow_headers=["*"],
33
  )
34
35
  @app.get("/")
36
  async def root():
37
  return {"message": "HackRx Insurance Policy Assistant API is running!"}
@@ -61,24 +71,52 @@ def verify_token(authorization: str = Header(None)):
61
 
62
  @app.post("/api/v1/hackrx/run")
63
  async def run_query(request: QueryRequest, token: str = Depends(verify_token)):
 
 
 
64
  try:
65
  print(f"Processing {len(request.questions)} questions...")
66
 
 
 
67
  text_chunks = parse_pdf_from_url(request.documents)
 
 
 
68
  print(f"Extracted {len(text_chunks)} text chunks from PDF")
69
 
 
 
70
  index, texts = build_faiss_index(text_chunks)
 
 
 
71
 
72
- # Get relevant chunks for all questions at once
 
73
  all_chunks = set()
74
- for question in request.questions:
 
75
  top_chunks = retrieve_chunks(index, texts, question)
 
 
76
  all_chunks.update(top_chunks)
77
 
78
- # Process all questions in a single LLM call
79
  print(f"Processing all {len(request.questions)} questions in batch...")
80
  response = query_gemini(request.questions, list(all_chunks))
 
 
 
81
 
 
 
82
  # Extract answers from the JSON response
83
  if isinstance(response, dict) and "answers" in response:
84
  answers = response["answers"]
@@ -94,35 +132,83 @@ async def run_query(request: QueryRequest, token: str = Depends(verify_token)):
94
  answers.append("Not Found")
95
  answers = answers[:len(request.questions)]
96
 
 
 
 
97
  print(f"Generated {len(answers)} answers")
98
- return { "answers": answers }
99
 
100
  except Exception as e:
101
- print(f"Error: {str(e)}")
 
102
  raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
103
 
104
  @app.post("/api/v1/hackrx/local")
105
  async def run_local_query(request: LocalQueryRequest):
 
 
 
106
  try:
107
  print(f"Processing local document: {request.document_path}")
108
  print(f"Processing {len(request.questions)} questions...")
109
 
110
- # Parse local PDF file
 
111
  text_chunks = parse_pdf_from_file(request.document_path)
 
 
 
112
  print(f"Extracted {len(text_chunks)} text chunks from local PDF")
113
 
 
 
114
  index, texts = build_faiss_index(text_chunks)
 
 
 
115
 
116
- # Get relevant chunks for all questions at once
 
117
  all_chunks = set()
118
- for question in request.questions:
 
119
  top_chunks = retrieve_chunks(index, texts, question)
 
 
120
  all_chunks.update(top_chunks)
121
 
122
- # Process all questions in a single LLM call
123
  print(f"Processing all {len(request.questions)} questions in batch...")
124
  response = query_gemini(request.questions, list(all_chunks))
 
 
 
125
 
 
 
126
  # Extract answers from the JSON response
127
  if isinstance(response, dict) and "answers" in response:
128
  answers = response["answers"]
@@ -138,11 +224,32 @@ async def run_local_query(request: LocalQueryRequest):
138
  answers.append("Not Found")
139
  answers = answers[:len(request.questions)]
140
 
 
 
 
141
  print(f"Generated {len(answers)} answers")
142
- return { "answers": answers }
143
 
144
  except Exception as e:
145
- print(f"Error: {str(e)}")
 
146
  raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
147
 
148
  if __name__ == "__main__":
 
1
  import os
2
  import warnings
3
  import logging
4
+ import time
5
+ from datetime import datetime
6
 
7
  # Suppress TensorFlow warnings
8
  os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
 
18
  from fastapi.middleware.cors import CORSMiddleware
19
  from pydantic import BaseModel
20
  from parser import parse_pdf_from_url, parse_pdf_from_file
21
+ from embedder import build_faiss_index, preload_model
22
  from retriever import retrieve_chunks
23
  from llm import query_gemini
24
  import uvicorn
 
34
  allow_headers=["*"],
35
  )
36
 
37
+ # Preload the model at startup
38
+ @app.on_event("startup")
39
+ async def startup_event():
40
+ print("Starting up HackRx Insurance Policy Assistant...")
41
+ print("Preloading sentence transformer model...")
42
+ preload_model()
43
+ print("Model preloading completed. API is ready to serve requests!")
44
+
45
  @app.get("/")
46
  async def root():
47
  return {"message": "HackRx Insurance Policy Assistant API is running!"}
 
 
 @app.post("/api/v1/hackrx/run")
 async def run_query(request: QueryRequest, token: str = Depends(verify_token)):
+    start_time = time.time()
+    timing_data = {}
+
     try:
         print(f"Processing {len(request.questions)} questions...")
 
+        # Time PDF parsing
+        pdf_start = time.time()
         text_chunks = parse_pdf_from_url(request.documents)
+        pdf_time = time.time() - pdf_start
+        timing_data['pdf_parsing'] = round(pdf_time, 2)
+        print(f"PDF Parsing took: {pdf_time:.2f} seconds")
         print(f"Extracted {len(text_chunks)} text chunks from PDF")
 
+        # Time FAISS index building
+        index_start = time.time()
         index, texts = build_faiss_index(text_chunks)
+        index_time = time.time() - index_start
+        timing_data['faiss_index_building'] = round(index_time, 2)
+        print(f"FAISS Index Building took: {index_time:.2f} seconds")
 
+        # Time chunk retrieval for all questions
+        retrieval_start = time.time()
         all_chunks = set()
+        for i, question in enumerate(request.questions):
+            question_start = time.time()
             top_chunks = retrieve_chunks(index, texts, question)
+            question_time = time.time() - question_start
+            print(f"Question {i+1} retrieval took: {question_time:.2f} seconds")
             all_chunks.update(top_chunks)
 
+        retrieval_time = time.time() - retrieval_start
+        timing_data['chunk_retrieval'] = round(retrieval_time, 2)
+        print(f"Total Chunk Retrieval took: {retrieval_time:.2f} seconds")
+        print(f"Retrieved {len(all_chunks)} unique chunks")
+
+        # Time LLM processing
+        llm_start = time.time()
         print(f"Processing all {len(request.questions)} questions in batch...")
         response = query_gemini(request.questions, list(all_chunks))
+        llm_time = time.time() - llm_start
+        timing_data['llm_processing'] = round(llm_time, 2)
+        print(f"LLM Processing took: {llm_time:.2f} seconds")
 
+        # Time response processing
+        response_start = time.time()
         # Extract answers from the JSON response
         if isinstance(response, dict) and "answers" in response:
             answers = response["answers"]
 
             answers.append("Not Found")
         answers = answers[:len(request.questions)]
 
+        response_time = time.time() - response_start
+        timing_data['response_processing'] = round(response_time, 2)
+        print(f"Response Processing took: {response_time:.2f} seconds")
         print(f"Generated {len(answers)} answers")
+
+        # Calculate total time
+        total_time = time.time() - start_time
+        timing_data['total_time'] = round(total_time, 2)
+        timing_data['timestamp'] = datetime.now().isoformat()
+
+        print(f"\n=== TIMING BREAKDOWN ===")
+        print(f"PDF Parsing: {timing_data['pdf_parsing']}s")
+        print(f"FAISS Index Building: {timing_data['faiss_index_building']}s")
+        print(f"Chunk Retrieval: {timing_data['chunk_retrieval']}s")
+        print(f"LLM Processing: {timing_data['llm_processing']}s")
+        print(f"Response Processing: {timing_data['response_processing']}s")
+        print(f"TOTAL TIME: {timing_data['total_time']}s")
+        print(f"=======================\n")
+
+        return {
+            "answers": answers
+        }
 
     except Exception as e:
+        total_time = time.time() - start_time
+        print(f"Error after {total_time:.2f} seconds: {str(e)}")
         raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
 
 @app.post("/api/v1/hackrx/local")
 async def run_local_query(request: LocalQueryRequest):
+    start_time = time.time()
+    timing_data = {}
+
     try:
         print(f"Processing local document: {request.document_path}")
         print(f"Processing {len(request.questions)} questions...")
 
+        # Time local PDF parsing
+        pdf_start = time.time()
         text_chunks = parse_pdf_from_file(request.document_path)
+        pdf_time = time.time() - pdf_start
+        timing_data['pdf_parsing'] = round(pdf_time, 2)
+        print(f"Local PDF Parsing took: {pdf_time:.2f} seconds")
         print(f"Extracted {len(text_chunks)} text chunks from local PDF")
 
+        # Time FAISS index building
+        index_start = time.time()
         index, texts = build_faiss_index(text_chunks)
+        index_time = time.time() - index_start
+        timing_data['faiss_index_building'] = round(index_time, 2)
+        print(f"FAISS Index Building took: {index_time:.2f} seconds")
 
+        # Time chunk retrieval for all questions
+        retrieval_start = time.time()
         all_chunks = set()
+        for i, question in enumerate(request.questions):
+            question_start = time.time()
             top_chunks = retrieve_chunks(index, texts, question)
+            question_time = time.time() - question_start
+            print(f"Question {i+1} retrieval took: {question_time:.2f} seconds")
             all_chunks.update(top_chunks)
 
+        retrieval_time = time.time() - retrieval_start
+        timing_data['chunk_retrieval'] = round(retrieval_time, 2)
+        print(f"Total Chunk Retrieval took: {retrieval_time:.2f} seconds")
+        print(f"Retrieved {len(all_chunks)} unique chunks")
+
+        # Time LLM processing
+        llm_start = time.time()
         print(f"Processing all {len(request.questions)} questions in batch...")
         response = query_gemini(request.questions, list(all_chunks))
+        llm_time = time.time() - llm_start
+        timing_data['llm_processing'] = round(llm_time, 2)
+        print(f"LLM Processing took: {llm_time:.2f} seconds")
 
+        # Time response processing
+        response_start = time.time()
         # Extract answers from the JSON response
         if isinstance(response, dict) and "answers" in response:
             answers = response["answers"]
 
             answers.append("Not Found")
         answers = answers[:len(request.questions)]
 
+        response_time = time.time() - response_start
+        timing_data['response_processing'] = round(response_time, 2)
+        print(f"Response Processing took: {response_time:.2f} seconds")
         print(f"Generated {len(answers)} answers")
+
+        # Calculate total time
+        total_time = time.time() - start_time
+        timing_data['total_time'] = round(total_time, 2)
+        timing_data['timestamp'] = datetime.now().isoformat()
+
+        print(f"\n=== TIMING BREAKDOWN ===")
+        print(f"PDF Parsing: {timing_data['pdf_parsing']}s")
+        print(f"FAISS Index Building: {timing_data['faiss_index_building']}s")
+        print(f"Chunk Retrieval: {timing_data['chunk_retrieval']}s")
+        print(f"LLM Processing: {timing_data['llm_processing']}s")
+        print(f"Response Processing: {timing_data['response_processing']}s")
+        print(f"TOTAL TIME: {timing_data['total_time']}s")
+        print(f"=======================\n")
+
+        return {
+            "answers": answers
+        }
 
     except Exception as e:
+        total_time = time.time() - start_time
+        print(f"Error after {total_time:.2f} seconds: {str(e)}")
         raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
 
 if __name__ == "__main__":
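
Note that timing_data is only logged on the server; the response body still carries just the answers list. For reference, a minimal client call might look like the sketch below. The host, port, token value, and document URL are placeholders rather than part of this commit; only the request fields (documents, questions), the Bearer-style auth implied by verify_token, and the {"answers": [...]} response shape are taken from the handler above.

# Hypothetical client sketch; URL, port, and token are placeholders.
import requests

payload = {
    "documents": "https://example.com/sample-policy.pdf",  # placeholder document URL
    "questions": ["What is the waiting period for pre-existing diseases?"],
}
resp = requests.post(
    "http://localhost:8000/api/v1/hackrx/run",
    json=payload,
    headers={"Authorization": "Bearer <YOUR_TOKEN>"},  # assumes verify_token checks a Bearer token
)
print(resp.json()["answers"])
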
parser.py CHANGED
@@ -1,19 +1,37 @@
 import fitz  # PyMuPDF
 import requests
 from io import BytesIO
+import time
 
 def parse_pdf_from_url(url):
+    start_time = time.time()
+    print(f"Starting PDF download and parsing from URL...")
+
+    download_start = time.time()
     res = requests.get(url)
+    download_time = time.time() - download_start
+    print(f"PDF Download took: {download_time:.2f} seconds")
+
+    parse_start = time.time()
     doc = fitz.open(stream=BytesIO(res.content), filetype="pdf")
     chunks = []
     for page in doc:
         text = page.get_text()
         if text.strip():
             chunks.append(text)
+    doc.close()
+    parse_time = time.time() - parse_start
+    print(f"PDF Text Extraction took: {parse_time:.2f} seconds")
+
+    total_time = time.time() - start_time
+    print(f"Total PDF parsing from URL took: {total_time:.2f} seconds")
     return chunks
 
 def parse_pdf_from_file(file_path):
     """Parse a local PDF file and extract text chunks"""
+    start_time = time.time()
+    print(f"Starting PDF parsing from local file: {file_path}")
+
     try:
         doc = fitz.open(file_path)
         chunks = []
@@ -22,6 +40,11 @@ def parse_pdf_from_file(file_path):
             if text.strip():
                 chunks.append(text)
         doc.close()
+
+        total_time = time.time() - start_time
+        print(f"Total PDF parsing from file took: {total_time:.2f} seconds")
         return chunks
     except Exception as e:
+        total_time = time.time() - start_time
+        print(f"Error parsing PDF file after {total_time:.2f} seconds: {str(e)}")
         raise Exception(f"Error parsing PDF file {file_path}: {str(e)}")
retriever.py CHANGED
@@ -1,9 +1,34 @@
 from sentence_transformers import SentenceTransformer
 import numpy as np
+import time
+from embedder import get_model
 
-model = SentenceTransformer("all-MiniLM-L6-v2")
-
+# Use the preloaded model from embedder instead of creating a new instance
 def retrieve_chunks(index, texts, query, k=5):
+    start_time = time.time()
+    print(f"Retrieving chunks for query: '{query[:50]}...'")
+
+    # Time query embedding
+    embed_start = time.time()
+    model = get_model()  # Use the preloaded model
     query_vec = model.encode([query])
+    embed_time = time.time() - embed_start
+    print(f"Query embedding took: {embed_time:.3f} seconds")
+
+    # Time FAISS search
+    search_start = time.time()
     distances, indices = index.search(np.array(query_vec), k)
-    return [texts[i] for i in indices[0]]
+    search_time = time.time() - search_start
+    print(f"FAISS search took: {search_time:.3f} seconds")
+
+    # Time result processing
+    process_start = time.time()
+    results = [texts[i] for i in indices[0]]
+    process_time = time.time() - process_start
+    print(f"Result processing took: {process_time:.3f} seconds")
+
+    total_time = time.time() - start_time
+    print(f"Total chunk retrieval took: {total_time:.3f} seconds")
+    print(f"Retrieved {len(results)} chunks")
+
+    return results
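
With the module-level SentenceTransformer removed, retrieve_chunks now reuses the model cached by embedder.get_model(), so the embedding weights load once per process instead of once per module. The retrieval path can be exercised on its own as in the sketch below; the sample strings are illustrative, and the import location of build_faiss_index is assumed from its use in main.py rather than stated in this commit.

# Minimal retrieval sketch with toy data (assumes build_faiss_index lives in embedder.py).
from embedder import build_faiss_index
from retriever import retrieve_chunks

docs = [
    "Maternity expenses are covered after a 24-month waiting period.",  # toy chunk
    "Cataract surgery has a two-year waiting period.",                   # toy chunk
]
index, texts = build_faiss_index(docs)
print(retrieve_chunks(index, texts, "Is cataract surgery covered?", k=1))
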