Spaces: Running
Rivalcoder committed
Commit · 7acce36
1 Parent(s): 0d10b91
Update Prompt
Browse files
- .cache/chunks_6635d94cf9023c83521982b3043ec70c.pkl +0 -3
- .cache/embeddings_b24811e7d333cc7d5047e52b357abd7e.pkl +0 -3
- .cache/hub/models--sentence-transformers--all-MiniLM-L6-v2/refs/main +0 -1
- .cache/hub/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/1_Pooling/config.json +0 -7
- .cache/hub/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/README.md +0 -173
- .cache/hub/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/config_sentence_transformers.json +0 -7
- .cache/hub/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/modules.json +0 -20
- .cache/hub/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/sentence_bert_config.json +0 -4
- .cache/models--sentence-transformers--all-MiniLM-L6-v2/.no_exist/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/adapter_config.json +0 -0
- .cache/models--sentence-transformers--all-MiniLM-L6-v2/.no_exist/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/added_tokens.json +0 -0
- .cache/models--sentence-transformers--all-MiniLM-L6-v2/.no_exist/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/chat_template.jinja +0 -0
- .cache/models--sentence-transformers--all-MiniLM-L6-v2/refs/main +0 -1
- .cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/1_Pooling/config.json +0 -7
- .cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/README.md +0 -173
- .cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/config.json +0 -24
- .cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/config_sentence_transformers.json +0 -7
- .cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/model.safetensors +0 -3
- .cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/modules.json +0 -20
- .cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/sentence_bert_config.json +0 -4
- .cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/special_tokens_map.json +0 -1
- .cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/tokenizer.json +0 -0
- .cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/tokenizer_config.json +0 -1
- .cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/vocab.txt +0 -0
- .cache/models--sentence-transformers--paraphrase-MiniLM-L3-v2/.no_exist/4ca70771034acceecb2e72475f72050fcdde4ddc/adapter_config.json +0 -0
- .cache/models--sentence-transformers--paraphrase-MiniLM-L3-v2/.no_exist/4ca70771034acceecb2e72475f72050fcdde4ddc/added_tokens.json +0 -0
- .cache/models--sentence-transformers--paraphrase-MiniLM-L3-v2/.no_exist/4ca70771034acceecb2e72475f72050fcdde4ddc/chat_template.jinja +0 -0
- .cache/models--sentence-transformers--paraphrase-MiniLM-L3-v2/refs/main +0 -1
- .cache/models--sentence-transformers--paraphrase-MiniLM-L3-v2/snapshots/4ca70771034acceecb2e72475f72050fcdde4ddc/1_Pooling/config.json +0 -7
- .cache/models--sentence-transformers--paraphrase-MiniLM-L3-v2/snapshots/4ca70771034acceecb2e72475f72050fcdde4ddc/README.md +0 -114
- .cache/models--sentence-transformers--paraphrase-MiniLM-L3-v2/snapshots/4ca70771034acceecb2e72475f72050fcdde4ddc/config.json +0 -24
- .cache/models--sentence-transformers--paraphrase-MiniLM-L3-v2/snapshots/4ca70771034acceecb2e72475f72050fcdde4ddc/config_sentence_transformers.json +0 -7
- .cache/models--sentence-transformers--paraphrase-MiniLM-L3-v2/snapshots/4ca70771034acceecb2e72475f72050fcdde4ddc/model.safetensors +0 -3
- .cache/models--sentence-transformers--paraphrase-MiniLM-L3-v2/snapshots/4ca70771034acceecb2e72475f72050fcdde4ddc/modules.json +0 -14
- .cache/models--sentence-transformers--paraphrase-MiniLM-L3-v2/snapshots/4ca70771034acceecb2e72475f72050fcdde4ddc/sentence_bert_config.json +0 -4
- .cache/models--sentence-transformers--paraphrase-MiniLM-L3-v2/snapshots/4ca70771034acceecb2e72475f72050fcdde4ddc/special_tokens_map.json +0 -1
- .cache/models--sentence-transformers--paraphrase-MiniLM-L3-v2/snapshots/4ca70771034acceecb2e72475f72050fcdde4ddc/tokenizer.json +0 -0
- .cache/models--sentence-transformers--paraphrase-MiniLM-L3-v2/snapshots/4ca70771034acceecb2e72475f72050fcdde4ddc/tokenizer_config.json +0 -1
- .cache/models--sentence-transformers--paraphrase-MiniLM-L3-v2/snapshots/4ca70771034acceecb2e72475f72050fcdde4ddc/vocab.txt +0 -0
- .cache/response_2ab720ffccd688afdc790db13e338c83.pkl +0 -3
- .gitattributes +0 -35
- HUGGINGFACE_DEPLOYMENT.md +0 -112
- README_HF.md +0 -112
- embedder.py +35 -31
- main.py +0 -260
- pdf_parser.py +40 -33
- start.sh +0 -12
- test_deployment.py +0 -75
- test_model_loading.py +0 -34
.cache/chunks_6635d94cf9023c83521982b3043ec70c.pkl
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:a4cef2cc09ef9d4ef7d8649bb78ec868e356dcfecbcd6dde23442a90497d407e
-size 124546
.cache/embeddings_b24811e7d333cc7d5047e52b357abd7e.pkl
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:475523b57f8f6b89e62e668efef73309193b05f0f05bbeffb7f012ee952024f0
-size 347400
.cache/hub/models--sentence-transformers--all-MiniLM-L6-v2/refs/main
DELETED
@@ -1 +0,0 @@
-c9745ed1d9f207416be6d2e6f8de32d1f16199bf
.cache/hub/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/1_Pooling/config.json
DELETED
@@ -1,7 +0,0 @@
-{
-    "word_embedding_dimension": 384,
-    "pooling_mode_cls_token": false,
-    "pooling_mode_mean_tokens": true,
-    "pooling_mode_max_tokens": false,
-    "pooling_mode_mean_sqrt_len_tokens": false
-}
.cache/hub/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/README.md
DELETED
@@ -1,173 +0,0 @@
----
-language: en
-license: apache-2.0
-library_name: sentence-transformers
-tags:
-- sentence-transformers
-- feature-extraction
-- sentence-similarity
-- transformers
-datasets:
-- s2orc
-- flax-sentence-embeddings/stackexchange_xml
-- ms_marco
-- gooaq
-- yahoo_answers_topics
-- code_search_net
-- search_qa
-- eli5
-- snli
-- multi_nli
-- wikihow
-- natural_questions
-- trivia_qa
-- embedding-data/sentence-compression
-- embedding-data/flickr30k-captions
-- embedding-data/altlex
-- embedding-data/simple-wiki
-- embedding-data/QQP
-- embedding-data/SPECTER
-- embedding-data/PAQ_pairs
-- embedding-data/WikiAnswers
-pipeline_tag: sentence-similarity
----
-
-
-# all-MiniLM-L6-v2
-This is a [sentence-transformers](https://www.SBERT.net) model: It maps sentences & paragraphs to a 384 dimensional dense vector space and can be used for tasks like clustering or semantic search.
-
-## Usage (Sentence-Transformers)
-Using this model becomes easy when you have [sentence-transformers](https://www.SBERT.net) installed:
-
-```
-pip install -U sentence-transformers
-```
-
-Then you can use the model like this:
-```python
-from sentence_transformers import SentenceTransformer
-sentences = ["This is an example sentence", "Each sentence is converted"]
-
-model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
-embeddings = model.encode(sentences)
-print(embeddings)
-```
-
-## Usage (HuggingFace Transformers)
-Without [sentence-transformers](https://www.SBERT.net), you can use the model like this: First, you pass your input through the transformer model, then you have to apply the right pooling-operation on-top of the contextualized word embeddings.
-
-```python
-from transformers import AutoTokenizer, AutoModel
-import torch
-import torch.nn.functional as F
-
-#Mean Pooling - Take attention mask into account for correct averaging
-def mean_pooling(model_output, attention_mask):
-    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
-    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
-    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
-
-
-# Sentences we want sentence embeddings for
-sentences = ['This is an example sentence', 'Each sentence is converted']
-
-# Load model from HuggingFace Hub
-tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
-model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
-
-# Tokenize sentences
-encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
-
-# Compute token embeddings
-with torch.no_grad():
-    model_output = model(**encoded_input)
-
-# Perform pooling
-sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
-
-# Normalize embeddings
-sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
-
-print("Sentence embeddings:")
-print(sentence_embeddings)
-```
-
-------
-
-## Background
-
-The project aims to train sentence embedding models on very large sentence level datasets using a self-supervised
-contrastive learning objective. We used the pretrained [`nreimers/MiniLM-L6-H384-uncased`](https://huggingface.co/nreimers/MiniLM-L6-H384-uncased) model and fine-tuned in on a
-1B sentence pairs dataset. We use a contrastive learning objective: given a sentence from the pair, the model should predict which out of a set of randomly sampled other sentences, was actually paired with it in our dataset.
-
-We developed this model during the
-[Community week using JAX/Flax for NLP & CV](https://discuss.huggingface.co/t/open-to-the-community-community-week-using-jax-flax-for-nlp-cv/7104),
-organized by Hugging Face. We developed this model as part of the project:
-[Train the Best Sentence Embedding Model Ever with 1B Training Pairs](https://discuss.huggingface.co/t/train-the-best-sentence-embedding-model-ever-with-1b-training-pairs/7354). We benefited from efficient hardware infrastructure to run the project: 7 TPUs v3-8, as well as intervention from Googles Flax, JAX, and Cloud team member about efficient deep learning frameworks.
-
-## Intended uses
-
-Our model is intended to be used as a sentence and short paragraph encoder. Given an input text, it outputs a vector which captures
-the semantic information. The sentence vector may be used for information retrieval, clustering or sentence similarity tasks.
-
-By default, input text longer than 256 word pieces is truncated.
-
-
-## Training procedure
-
-### Pre-training
-
-We use the pretrained [`nreimers/MiniLM-L6-H384-uncased`](https://huggingface.co/nreimers/MiniLM-L6-H384-uncased) model. Please refer to the model card for more detailed information about the pre-training procedure.
-
-### Fine-tuning
-
-We fine-tune the model using a contrastive objective. Formally, we compute the cosine similarity from each possible sentence pairs from the batch.
-We then apply the cross entropy loss by comparing with true pairs.
-
-#### Hyper parameters
-
-We trained our model on a TPU v3-8. We train the model during 100k steps using a batch size of 1024 (128 per TPU core).
-We use a learning rate warm up of 500. The sequence length was limited to 128 tokens. We used the AdamW optimizer with
-a 2e-5 learning rate. The full training script is accessible in this current repository: `train_script.py`.
-
-#### Training data
-
-We use the concatenation from multiple datasets to fine-tune our model. The total number of sentence pairs is above 1 billion sentences.
-We sampled each dataset given a weighted probability which configuration is detailed in the `data_config.json` file.
-
-
-| Dataset | Paper | Number of training tuples |
-|--------------------------------------------------------|:----------------------------------------:|:--------------------------:|
-| [Reddit comments (2015-2018)](https://github.com/PolyAI-LDN/conversational-datasets/tree/master/reddit) | [paper](https://arxiv.org/abs/1904.06472) | 726,484,430 |
-| [S2ORC](https://github.com/allenai/s2orc) Citation pairs (Abstracts) | [paper](https://aclanthology.org/2020.acl-main.447/) | 116,288,806 |
-| [WikiAnswers](https://github.com/afader/oqa#wikianswers-corpus) Duplicate question pairs | [paper](https://doi.org/10.1145/2623330.2623677) | 77,427,422 |
-| [PAQ](https://github.com/facebookresearch/PAQ) (Question, Answer) pairs | [paper](https://arxiv.org/abs/2102.07033) | 64,371,441 |
-| [S2ORC](https://github.com/allenai/s2orc) Citation pairs (Titles) | [paper](https://aclanthology.org/2020.acl-main.447/) | 52,603,982 |
-| [S2ORC](https://github.com/allenai/s2orc) (Title, Abstract) | [paper](https://aclanthology.org/2020.acl-main.447/) | 41,769,185 |
-| [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) (Title, Body) pairs | - | 25,316,456 |
-| [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) (Title+Body, Answer) pairs | - | 21,396,559 |
-| [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) (Title, Answer) pairs | - | 21,396,559 |
-| [MS MARCO](https://microsoft.github.io/msmarco/) triplets | [paper](https://doi.org/10.1145/3404835.3462804) | 9,144,553 |
-| [GOOAQ: Open Question Answering with Diverse Answer Types](https://github.com/allenai/gooaq) | [paper](https://arxiv.org/pdf/2104.08727.pdf) | 3,012,496 |
-| [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Title, Answer) | [paper](https://proceedings.neurips.cc/paper/2015/hash/250cf8b51c773f3f8dc8b4be867a9a02-Abstract.html) | 1,198,260 |
-| [Code Search](https://huggingface.co/datasets/code_search_net) | - | 1,151,414 |
-| [COCO](https://cocodataset.org/#home) Image captions | [paper](https://link.springer.com/chapter/10.1007%2F978-3-319-10602-1_48) | 828,395|
-| [SPECTER](https://github.com/allenai/specter) citation triplets | [paper](https://doi.org/10.18653/v1/2020.acl-main.207) | 684,100 |
-| [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Question, Answer) | [paper](https://proceedings.neurips.cc/paper/2015/hash/250cf8b51c773f3f8dc8b4be867a9a02-Abstract.html) | 681,164 |
-| [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Title, Question) | [paper](https://proceedings.neurips.cc/paper/2015/hash/250cf8b51c773f3f8dc8b4be867a9a02-Abstract.html) | 659,896 |
-| [SearchQA](https://huggingface.co/datasets/search_qa) | [paper](https://arxiv.org/abs/1704.05179) | 582,261 |
-| [Eli5](https://huggingface.co/datasets/eli5) | [paper](https://doi.org/10.18653/v1/p19-1346) | 325,475 |
-| [Flickr 30k](https://shannon.cs.illinois.edu/DenotationGraph/) | [paper](https://transacl.org/ojs/index.php/tacl/article/view/229/33) | 317,695 |
-| [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) Duplicate questions (titles) | | 304,525 |
-| AllNLI ([SNLI](https://nlp.stanford.edu/projects/snli/) and [MultiNLI](https://cims.nyu.edu/~sbowman/multinli/) | [paper SNLI](https://doi.org/10.18653/v1/d15-1075), [paper MultiNLI](https://doi.org/10.18653/v1/n18-1101) | 277,230 |
-| [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) Duplicate questions (bodies) | | 250,519 |
-| [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) Duplicate questions (titles+bodies) | | 250,460 |
-| [Sentence Compression](https://github.com/google-research-datasets/sentence-compression) | [paper](https://www.aclweb.org/anthology/D13-1155/) | 180,000 |
-| [Wikihow](https://github.com/pvl/wikihow_pairs_dataset) | [paper](https://arxiv.org/abs/1810.09305) | 128,542 |
-| [Altlex](https://github.com/chridey/altlex/) | [paper](https://aclanthology.org/P16-1135.pdf) | 112,696 |
-| [Quora Question Triplets](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) | - | 103,663 |
-| [Simple Wikipedia](https://cs.pomona.edu/~dkauchak/simplification/) | [paper](https://www.aclweb.org/anthology/P11-2117/) | 102,225 |
-| [Natural Questions (NQ)](https://ai.google.com/research/NaturalQuestions) | [paper](https://transacl.org/ojs/index.php/tacl/article/view/1455) | 100,231 |
-| [SQuAD2.0](https://rajpurkar.github.io/SQuAD-explorer/) | [paper](https://aclanthology.org/P18-2124.pdf) | 87,599 |
-| [TriviaQA](https://huggingface.co/datasets/trivia_qa) | - | 73,346 |
-| **Total** | | **1,170,060,424** |
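The fine-tuning recipe this deleted model card describes (cosine similarity over every candidate sentence pair in the batch, then cross-entropy against the true pairings) amounts to an in-batch contrastive loss. A minimal PyTorch sketch of that objective, for orientation only: the real training code is the `train_script.py` the card mentions, and the scale factor here is an assumed temperature, not a value from the card.

```python
import torch
import torch.nn.functional as F

def in_batch_contrastive_loss(emb_a: torch.Tensor, emb_b: torch.Tensor,
                              scale: float = 20.0) -> torch.Tensor:
    """emb_a[i] and emb_b[i] embed a true pair; every other emb_b[j]
    in the batch serves as a random negative for emb_a[i]."""
    # (B, B) matrix of pairwise cosine similarities, temperature-scaled
    scores = F.cosine_similarity(emb_a.unsqueeze(1), emb_b.unsqueeze(0), dim=-1) * scale
    # the true pair for row i sits on the diagonal, at column i
    labels = torch.arange(scores.size(0), device=scores.device)
    return F.cross_entropy(scores, labels)
```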
.cache/hub/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/config_sentence_transformers.json
DELETED
@@ -1,7 +0,0 @@
-{
-  "__version__": {
-    "sentence_transformers": "2.0.0",
-    "transformers": "4.6.1",
-    "pytorch": "1.8.1"
-  }
-}
.cache/hub/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/modules.json
DELETED
@@ -1,20 +0,0 @@
-[
-  {
-    "idx": 0,
-    "name": "0",
-    "path": "",
-    "type": "sentence_transformers.models.Transformer"
-  },
-  {
-    "idx": 1,
-    "name": "1",
-    "path": "1_Pooling",
-    "type": "sentence_transformers.models.Pooling"
-  },
-  {
-    "idx": 2,
-    "name": "2",
-    "path": "2_Normalize",
-    "type": "sentence_transformers.models.Normalize"
-  }
-]
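Note what this modules.json encodes: the all-MiniLM-L6-v2 pipeline ends with a `2_Normalize` module, so `encode()` returns unit-length vectors and cosine similarity reduces to a plain dot product. A quick, illustrative check:

```python
from sentence_transformers import SentenceTransformer
import numpy as np

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
emb = model.encode(["An example sentence"])
print(np.linalg.norm(emb[0]))  # ~1.0, because of the 2_Normalize module
```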
.cache/hub/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/sentence_bert_config.json
DELETED
@@ -1,4 +0,0 @@
-{
-  "max_seq_length": 256,
-  "do_lower_case": false
-}
.cache/models--sentence-transformers--all-MiniLM-L6-v2/.no_exist/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/adapter_config.json
DELETED
File without changes
.cache/models--sentence-transformers--all-MiniLM-L6-v2/.no_exist/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/added_tokens.json
DELETED
File without changes
.cache/models--sentence-transformers--all-MiniLM-L6-v2/.no_exist/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/chat_template.jinja
DELETED
File without changes
.cache/models--sentence-transformers--all-MiniLM-L6-v2/refs/main
DELETED
@@ -1 +0,0 @@
-c9745ed1d9f207416be6d2e6f8de32d1f16199bf
.cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/1_Pooling/config.json
DELETED
@@ -1,7 +0,0 @@
-{
-    "word_embedding_dimension": 384,
-    "pooling_mode_cls_token": false,
-    "pooling_mode_mean_tokens": true,
-    "pooling_mode_max_tokens": false,
-    "pooling_mode_mean_sqrt_len_tokens": false
-}
.cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/README.md
DELETED
@@ -1,173 +0,0 @@
(The 173 deleted lines are identical to the copy of this model card removed from .cache/hub/ above.)
.cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/config.json
DELETED
@@ -1,24 +0,0 @@
-{
-  "_name_or_path": "nreimers/MiniLM-L6-H384-uncased",
-  "architectures": [
-    "BertModel"
-  ],
-  "attention_probs_dropout_prob": 0.1,
-  "gradient_checkpointing": false,
-  "hidden_act": "gelu",
-  "hidden_dropout_prob": 0.1,
-  "hidden_size": 384,
-  "initializer_range": 0.02,
-  "intermediate_size": 1536,
-  "layer_norm_eps": 1e-12,
-  "max_position_embeddings": 512,
-  "model_type": "bert",
-  "num_attention_heads": 12,
-  "num_hidden_layers": 6,
-  "pad_token_id": 0,
-  "position_embedding_type": "absolute",
-  "transformers_version": "4.8.2",
-  "type_vocab_size": 2,
-  "use_cache": true,
-  "vocab_size": 30522
-}
|
.cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/config_sentence_transformers.json
DELETED
@@ -1,7 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"__version__": {
|
3 |
-
"sentence_transformers": "2.0.0",
|
4 |
-
"transformers": "4.6.1",
|
5 |
-
"pytorch": "1.8.1"
|
6 |
-
}
|
7 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
.cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/model.safetensors
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:53aa51172d142c89d9012cce15ae4d6cc0ca6895895114379cacb4fab128d9db
-size 90868376
.cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/modules.json
DELETED
@@ -1,20 +0,0 @@
-[
-  {
-    "idx": 0,
-    "name": "0",
-    "path": "",
-    "type": "sentence_transformers.models.Transformer"
-  },
-  {
-    "idx": 1,
-    "name": "1",
-    "path": "1_Pooling",
-    "type": "sentence_transformers.models.Pooling"
-  },
-  {
-    "idx": 2,
-    "name": "2",
-    "path": "2_Normalize",
-    "type": "sentence_transformers.models.Normalize"
-  }
-]
.cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/sentence_bert_config.json
DELETED
@@ -1,4 +0,0 @@
-{
-  "max_seq_length": 256,
-  "do_lower_case": false
-}
.cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/special_tokens_map.json
DELETED
@@ -1 +0,0 @@
-{"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
.cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/tokenizer.json
DELETED
The diff for this file is too large to render.
.cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/tokenizer_config.json
DELETED
@@ -1 +0,0 @@
-{"do_lower_case": true, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "name_or_path": "nreimers/MiniLM-L6-H384-uncased", "do_basic_tokenize": true, "never_split": null, "tokenizer_class": "BertTokenizer", "model_max_length": 512}
.cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/vocab.txt
DELETED
The diff for this file is too large to render.
.cache/models--sentence-transformers--paraphrase-MiniLM-L3-v2/.no_exist/4ca70771034acceecb2e72475f72050fcdde4ddc/adapter_config.json
DELETED
File without changes
.cache/models--sentence-transformers--paraphrase-MiniLM-L3-v2/.no_exist/4ca70771034acceecb2e72475f72050fcdde4ddc/added_tokens.json
DELETED
File without changes
.cache/models--sentence-transformers--paraphrase-MiniLM-L3-v2/.no_exist/4ca70771034acceecb2e72475f72050fcdde4ddc/chat_template.jinja
DELETED
File without changes
.cache/models--sentence-transformers--paraphrase-MiniLM-L3-v2/refs/main
DELETED
@@ -1 +0,0 @@
-4ca70771034acceecb2e72475f72050fcdde4ddc
.cache/models--sentence-transformers--paraphrase-MiniLM-L3-v2/snapshots/4ca70771034acceecb2e72475f72050fcdde4ddc/1_Pooling/config.json
DELETED
@@ -1,7 +0,0 @@
-{
-    "word_embedding_dimension": 384,
-    "pooling_mode_cls_token": false,
-    "pooling_mode_mean_tokens": true,
-    "pooling_mode_max_tokens": false,
-    "pooling_mode_mean_sqrt_len_tokens": false
-}
.cache/models--sentence-transformers--paraphrase-MiniLM-L3-v2/snapshots/4ca70771034acceecb2e72475f72050fcdde4ddc/README.md
DELETED
@@ -1,114 +0,0 @@
----
-license: apache-2.0
-library_name: sentence-transformers
-tags:
-- sentence-transformers
-- feature-extraction
-- sentence-similarity
-- transformers
-datasets:
-- flax-sentence-embeddings/stackexchange_xml
-- s2orc
-- ms_marco
-- wiki_atomic_edits
-- snli
-- multi_nli
-- embedding-data/altlex
-- embedding-data/simple-wiki
-- embedding-data/flickr30k-captions
-- embedding-data/coco_captions
-- embedding-data/sentence-compression
-- embedding-data/QQP
-- yahoo_answers_topics
-pipeline_tag: sentence-similarity
----
-
-# sentence-transformers/paraphrase-MiniLM-L3-v2
-
-This is a [sentence-transformers](https://www.SBERT.net) model: It maps sentences & paragraphs to a 384 dimensional dense vector space and can be used for tasks like clustering or semantic search.
-
-
-
-## Usage (Sentence-Transformers)
-
-Using this model becomes easy when you have [sentence-transformers](https://www.SBERT.net) installed:
-
-```
-pip install -U sentence-transformers
-```
-
-Then you can use the model like this:
-
-```python
-from sentence_transformers import SentenceTransformer
-sentences = ["This is an example sentence", "Each sentence is converted"]
-
-model = SentenceTransformer('sentence-transformers/paraphrase-MiniLM-L3-v2')
-embeddings = model.encode(sentences)
-print(embeddings)
-```
-
-
-
-## Usage (HuggingFace Transformers)
-Without [sentence-transformers](https://www.SBERT.net), you can use the model like this: First, you pass your input through the transformer model, then you have to apply the right pooling-operation on-top of the contextualized word embeddings.
-
-```python
-from transformers import AutoTokenizer, AutoModel
-import torch
-
-
-#Mean Pooling - Take attention mask into account for correct averaging
-def mean_pooling(model_output, attention_mask):
-    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
-    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
-    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
-
-
-# Sentences we want sentence embeddings for
-sentences = ['This is an example sentence', 'Each sentence is converted']
-
-# Load model from HuggingFace Hub
-tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/paraphrase-MiniLM-L3-v2')
-model = AutoModel.from_pretrained('sentence-transformers/paraphrase-MiniLM-L3-v2')
-
-# Tokenize sentences
-encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
-
-# Compute token embeddings
-with torch.no_grad():
-    model_output = model(**encoded_input)
-
-# Perform pooling. In this case, max pooling.
-sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
-
-print("Sentence embeddings:")
-print(sentence_embeddings)
-```
-
-
-
-## Full Model Architecture
-```
-SentenceTransformer(
-  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel
-  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
-)
-```
-
-## Citing & Authors
-
-This model was trained by [sentence-transformers](https://www.sbert.net/).
-
-If you find this model helpful, feel free to cite our publication [Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks](https://arxiv.org/abs/1908.10084):
-```bibtex
-@inproceedings{reimers-2019-sentence-bert,
-    title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
-    author = "Reimers, Nils and Gurevych, Iryna",
-    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
-    month = "11",
-    year = "2019",
-    publisher = "Association for Computational Linguistics",
-    url = "http://arxiv.org/abs/1908.10084",
-}
-```
.cache/models--sentence-transformers--paraphrase-MiniLM-L3-v2/snapshots/4ca70771034acceecb2e72475f72050fcdde4ddc/config.json
DELETED
@@ -1,24 +0,0 @@
-{
-  "_name_or_path": "old_models/paraphrase-MiniLM-L3-v2/0_Transformer",
-  "architectures": [
-    "BertModel"
-  ],
-  "attention_probs_dropout_prob": 0.1,
-  "gradient_checkpointing": false,
-  "hidden_act": "gelu",
-  "hidden_dropout_prob": 0.1,
-  "hidden_size": 384,
-  "initializer_range": 0.02,
-  "intermediate_size": 1536,
-  "layer_norm_eps": 1e-12,
-  "max_position_embeddings": 512,
-  "model_type": "bert",
-  "num_attention_heads": 12,
-  "num_hidden_layers": 3,
-  "pad_token_id": 0,
-  "position_embedding_type": "absolute",
-  "transformers_version": "4.7.0",
-  "type_vocab_size": 2,
-  "use_cache": true,
-  "vocab_size": 30522
-}
.cache/models--sentence-transformers--paraphrase-MiniLM-L3-v2/snapshots/4ca70771034acceecb2e72475f72050fcdde4ddc/config_sentence_transformers.json
DELETED
@@ -1,7 +0,0 @@
-{
-  "__version__": {
-    "sentence_transformers": "2.0.0",
-    "transformers": "4.7.0",
-    "pytorch": "1.9.0+cu102"
-  }
-}
.cache/models--sentence-transformers--paraphrase-MiniLM-L3-v2/snapshots/4ca70771034acceecb2e72475f72050fcdde4ddc/model.safetensors
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:cf1e4e2d420c664973037c3c73125d7a8fc69952495093ef8f50596f8943a433
-size 69569488
.cache/models--sentence-transformers--paraphrase-MiniLM-L3-v2/snapshots/4ca70771034acceecb2e72475f72050fcdde4ddc/modules.json
DELETED
@@ -1,14 +0,0 @@
-[
-  {
-    "idx": 0,
-    "name": "0",
-    "path": "",
-    "type": "sentence_transformers.models.Transformer"
-  },
-  {
-    "idx": 1,
-    "name": "1",
-    "path": "1_Pooling",
-    "type": "sentence_transformers.models.Pooling"
-  }
-]
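Unlike the all-MiniLM-L6-v2 pipeline above, this modules.json stops at pooling, with no `2_Normalize` stage, so paraphrase-MiniLM-L3-v2 embeddings are not unit-length. If you want inner-product search to behave like cosine similarity, normalize the vectors yourself; a small illustrative sketch:

```python
import numpy as np

def l2_normalize(embeddings: np.ndarray) -> np.ndarray:
    # Required before dot-product (cosine) search, since this model
    # has no built-in Normalize module.
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    return embeddings / np.clip(norms, 1e-12, None)
```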
.cache/models--sentence-transformers--paraphrase-MiniLM-L3-v2/snapshots/4ca70771034acceecb2e72475f72050fcdde4ddc/sentence_bert_config.json
DELETED
@@ -1,4 +0,0 @@
-{
-  "max_seq_length": 128,
-  "do_lower_case": false
-}
.cache/models--sentence-transformers--paraphrase-MiniLM-L3-v2/snapshots/4ca70771034acceecb2e72475f72050fcdde4ddc/special_tokens_map.json
DELETED
@@ -1 +0,0 @@
-{"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
.cache/models--sentence-transformers--paraphrase-MiniLM-L3-v2/snapshots/4ca70771034acceecb2e72475f72050fcdde4ddc/tokenizer.json
DELETED
The diff for this file is too large to render.
.cache/models--sentence-transformers--paraphrase-MiniLM-L3-v2/snapshots/4ca70771034acceecb2e72475f72050fcdde4ddc/tokenizer_config.json
DELETED
@@ -1 +0,0 @@
-{"do_lower_case": true, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "name_or_path": "nreimers/MiniLM-L3-H384-uncased", "do_basic_tokenize": true, "never_split": null, "model_max_length": 512}
.cache/models--sentence-transformers--paraphrase-MiniLM-L3-v2/snapshots/4ca70771034acceecb2e72475f72050fcdde4ddc/vocab.txt
DELETED
The diff for this file is too large to render.
.cache/response_2ab720ffccd688afdc790db13e338c83.pkl
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:1c5853e52bd3fdc0bdf05ca5b73769bc17fe8f44fe56271a78a87f155c5de6da
-size 429
.gitattributes
DELETED
@@ -1,35 +0,0 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text
HUGGINGFACE_DEPLOYMENT.md
DELETED
@@ -1,112 +0,0 @@
-# Hugging Face Spaces Deployment Guide
-
-This guide will help you deploy your HackRx Insurance Policy Assistant to Hugging Face Spaces.
-
-## Prerequisites
-
-1. A Hugging Face account (free at https://huggingface.co)
-2. A Google Gemini API key
-3. Your code pushed to a Git repository (GitHub, GitLab, etc.)
-
-## Step 1: Prepare Your Repository
-
-Your repository should contain the following files:
-- `app.py` - Main application entry point
-- `Dockerfile` - Docker configuration
-- `requirements.txt` - Python dependencies
-- `parser.py`, `embedder.py`, `retriever.py`, `llm.py` - Application modules
-- `.dockerignore` - Docker build optimization
-
-## Step 2: Create a Hugging Face Space
-
-1. Go to https://huggingface.co/spaces
-2. Click "Create new Space"
-3. Choose the following settings:
-   - **Owner**: Your username
-   - **Space name**: `hackrx-insurance-assistant` (or your preferred name)
-   - **Space SDK**: `Docker`
-   - **License**: Choose appropriate license
-   - **Visibility**: Public or Private (your choice)
-
-## Step 3: Connect Your Repository
-
-1. In your new Space, go to the "Settings" tab
-2. Under "Repository", click "Connect to existing repository"
-3. Select your Git provider (GitHub, GitLab, etc.)
-4. Choose your repository
-5. Click "Connect"
-
-## Step 4: Configure Environment Variables
-
-1. In your Space settings, go to the "Repository secrets" section
-2. Add the following secret:
-   - **Name**: `GOOGLE_API_KEY`
-   - **Value**: Your Google Gemini API key
-
-## Step 5: Deploy
-
-1. Push your code to your Git repository
-2. Hugging Face Spaces will automatically detect the changes and start building
-3. You can monitor the build progress in the "Logs" tab
-4. Once built successfully, your API will be available at `https://your-space-name.hf.space`
-
-## Step 6: Test Your Deployment
-
-### Health Check
-```bash
-curl https://your-space-name.hf.space/
-```
-
-### Test API Endpoint
-```bash
-curl -X POST https://your-space-name.hf.space/api/v1/hackrx/run \
-  -H "Content-Type: application/json" \
-  -H "Authorization: Bearer your_token_here" \
-  -d '{
-    "documents": "https://example.com/insurance-policy.pdf",
-    "questions": ["What is the coverage amount?"]
-  }'
-```
-
-## Troubleshooting
-
-### Common Issues
-
-1. **Build Fails**: Check the logs in the "Logs" tab for error messages
-2. **Environment Variable Not Set**: Ensure `GOOGLE_API_KEY` is set in Space secrets
-3. **Port Issues**: The application runs on port 7860 (default for Hugging Face Spaces)
-4. **Memory Issues**: If you encounter memory issues, consider optimizing the Dockerfile
-
-### Debugging
-
-1. Check the build logs in the "Logs" tab
-2. Monitor the application logs for runtime errors
-3. Test locally first to ensure everything works
-
-## API Documentation
-
-Once deployed, your API will have the following endpoints:
-
-- `GET /` - Health check
-- `GET /health` - API status
-- `POST /api/v1/hackrx/run` - Process PDF from URL
-- `POST /api/v1/hackrx/local` - Process local PDF file
-
-## Cost Considerations
-
-- Hugging Face Spaces offers free hosting for public spaces
-- Private spaces may have usage limits
-- Consider the cost of Google Gemini API calls
-
-## Security Notes
-
-- Keep your API keys secure
-- Use appropriate authentication for production use
-- Consider rate limiting for public APIs
-
-## Updates
-
-To update your deployment:
-1. Push changes to your Git repository
-2. Hugging Face Spaces will automatically rebuild and deploy
-3. Monitor the build process in the "Logs" tab
README_HF.md
DELETED
@@ -1,112 +0,0 @@
-# HackRx Insurance Policy Assistant
-
-A FastAPI application that processes PDF documents and answers questions using AI, deployed on Hugging Face Spaces.
-
-## Features
-
-- PDF document parsing and text extraction
-- Vector-based document search using FAISS
-- AI-powered question answering using Google Gemini
-- RESTful API endpoints for document processing
-
-## API Endpoints
-
-### Health Check
-- `GET /` - Root endpoint
-- `GET /health` - API status check
-
-### Process PDF from URL
-- `POST /api/v1/hackrx/run`
-- **Headers**: `Authorization: Bearer <your_token>`
-- **Body**:
-```json
-{
-  "documents": "https://example.com/document.pdf",
-  "questions": ["What is the coverage amount?", "What are the exclusions?"]
-}
-```
-
-### Process Local PDF File
-- `POST /api/v1/hackrx/local`
-- **Body**:
-```json
-{
-  "document_path": "/app/files/document.pdf",
-  "questions": ["What is the coverage amount?", "What are the exclusions?"]
-}
-```
-
-## Environment Variables
-
-Set these in your Hugging Face Space settings:
-
-- `GOOGLE_API_KEY` - Your Google Gemini API key
-
-## Usage Examples
-
-### Using curl
-
-```bash
-# Health check
-curl https://your-space-name.hf.space/
-
-# Process PDF from URL
-curl -X POST https://your-space-name.hf.space/api/v1/hackrx/run \
-  -H "Content-Type: application/json" \
-  -H "Authorization: Bearer your_token_here" \
-  -d '{
-    "documents": "https://example.com/insurance-policy.pdf",
-    "questions": ["What is the coverage amount?", "What are the exclusions?"]
-  }'
-```
-
-### Using Python
-
-```python
-import requests
-
-# Health check
-response = requests.get("https://your-space-name.hf.space/")
-print(response.json())
-
-# Process PDF
-url = "https://your-space-name.hf.space/api/v1/hackrx/run"
-headers = {
-    "Content-Type": "application/json",
-    "Authorization": "Bearer your_token_here"
-}
-data = {
-    "documents": "https://example.com/insurance-policy.pdf",
-    "questions": ["What is the coverage amount?", "What are the exclusions?"]
-}
-
-response = requests.post(url, headers=headers, json=data)
-print(response.json())
-```
-
-## Local Development
-
-To run the application locally:
-
-```bash
-pip install -r requirements.txt
-python app.py
-```
-
-The API will be available at `http://localhost:7860`
-
-## Deployment
-
-This application is configured for deployment on Hugging Face Spaces using Docker. The following files are included:
-
-- `app.py` - Main application entry point
-- `Dockerfile` - Docker configuration
-- `.dockerignore` - Docker build optimization
-- `requirements.txt` - Python dependencies
-
-## Model Information
-
-- **Framework**: FastAPI
-- **AI Model**: Google Gemini
-- **Vector Database**: FAISS
-- **Document Processing**: PyMuPDF
embedder.py
CHANGED
@@ -1,48 +1,52 @@
The file after this change:

import faiss
import numpy as np
import os
from sentence_transformers import SentenceTransformer

# Use a local cache for transformer downloads
cache_dir = os.path.join(os.getcwd(), ".cache")
os.makedirs(cache_dir, exist_ok=True)
os.environ['HF_HOME'] = cache_dir
os.environ['TRANSFORMERS_CACHE'] = cache_dir

# Lazy-loaded model
_model = None

def preload_model(model_name="all-MiniLM-L6-v2"):
    global _model
    if _model is not None:
        return _model

    print("Preloading sentence transformer model...")

    try:
        _model = SentenceTransformer(model_name, cache_folder=cache_dir)
    except Exception as e:
        print(f"Primary model load failed: {e}")
        fallback_name = "sentence-transformers/" + model_name
        print(f"Trying fallback: {fallback_name}")
        _model = SentenceTransformer(fallback_name, cache_folder=cache_dir)

    print("✅ Model ready.")
    return _model

def get_model():
    return preload_model()

def build_faiss_index(chunks, batch_size=128, show_progress_bar=False):
    model = get_model()

    # Encode using batching for speed
    embeddings = model.encode(
        chunks,
        batch_size=batch_size,
        show_progress_bar=show_progress_bar,
        convert_to_numpy=True,
        normalize_embeddings=True  # Helps FAISS L2 perform better
    )

    dim = embeddings.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(embeddings)

    return index, chunks
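A short sketch of how the revised module is driven end to end (the sample chunks are placeholders):

```python
from embedder import preload_model, build_faiss_index

preload_model()  # one-time load; subsequent calls reuse the cached model
index, texts = build_faiss_index(
    ["First sample chunk.", "Second sample chunk."],
    batch_size=128,
)
print(index.ntotal, "vectors indexed")  # -> 2 vectors indexed
```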
main.py
DELETED
@@ -1,260 +0,0 @@
import os
import warnings
import logging
import time
from datetime import datetime

# Suppress TensorFlow warnings
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
os.environ['TF_LOGGING_LEVEL'] = 'ERROR'
os.environ['TF_ENABLE_DEPRECATION_WARNINGS'] = '0'

# Suppress specific TensorFlow deprecation warnings
warnings.filterwarnings('ignore', category=DeprecationWarning, module='tensorflow')
logging.getLogger('tensorflow').setLevel(logging.ERROR)

from fastapi import FastAPI, Request, HTTPException, Depends, Header
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from pdf_parser import parse_pdf_from_url, parse_pdf_from_file
from embedder import build_faiss_index, preload_model
from retriever import retrieve_chunks
from llm import query_gemini
import uvicorn

app = FastAPI(title="HackRx Insurance Policy Assistant", version="1.0.0")

# Add CORS middleware
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Preload the model at startup
@app.on_event("startup")
async def startup_event():
    print("Starting up HackRx Insurance Policy Assistant...")
    print("Preloading sentence transformer model...")
    preload_model()
    print("Model preloading completed. API is ready to serve requests!")

@app.get("/")
async def root():
    return {"message": "HackRx Insurance Policy Assistant API is running!"}

@app.get("/health")
async def health_check():
    return {"status": "healthy", "message": "API is ready to process requests"}

class QueryRequest(BaseModel):
    documents: str
    questions: list[str]

class LocalQueryRequest(BaseModel):
    document_path: str
    questions: list[str]

def verify_token(authorization: str = Header(None)):
    if not authorization or not authorization.startswith("Bearer "):
        raise HTTPException(status_code=401, detail="Invalid authorization header")

    token = authorization.replace("Bearer ", "")
    # For demo purposes, accept any token. In production, validate against a database
    if not token:
        raise HTTPException(status_code=401, detail="Invalid token")

    return token

@app.post("/api/v1/hackrx/run")
async def run_query(request: QueryRequest, token: str = Depends(verify_token)):
    start_time = time.time()
    timing_data = {}

    try:
        print(f"\n=== INPUT JSON ===")
        print(f"Documents: {request.documents}")
        print(f"Questions: {request.questions}")
        print(f"==================\n")

        print(f"Processing {len(request.questions)} questions...")

        # Time PDF parsing
        pdf_start = time.time()
        text_chunks = parse_pdf_from_url(request.documents)
        pdf_time = time.time() - pdf_start
        timing_data['pdf_parsing'] = round(pdf_time, 2)
        print(f"Extracted {len(text_chunks)} text chunks from PDF")

        # Time FAISS index building
        index_start = time.time()
        index, texts = build_faiss_index(text_chunks)
        index_time = time.time() - index_start
        timing_data['faiss_index_building'] = round(index_time, 2)

        # Time chunk retrieval for all questions
        retrieval_start = time.time()
        all_chunks = set()
        for i, question in enumerate(request.questions):
            question_start = time.time()
            top_chunks = retrieve_chunks(index, texts, question)
            question_time = time.time() - question_start
            all_chunks.update(top_chunks)

        retrieval_time = time.time() - retrieval_start
        timing_data['chunk_retrieval'] = round(retrieval_time, 2)
        print(f"Retrieved {len(all_chunks)} unique chunks")

        # Time LLM processing
        llm_start = time.time()
        print(f"Processing all {len(request.questions)} questions in batch...")
        response = query_gemini(request.questions, list(all_chunks))
        llm_time = time.time() - llm_start
        timing_data['llm_processing'] = round(llm_time, 2)

        # Time response processing
        response_start = time.time()
        # Extract answers from the JSON response
        if isinstance(response, dict) and "answers" in response:
            answers = response["answers"]
            # Ensure we have the right number of answers
            while len(answers) < len(request.questions):
                answers.append("Not Found")
            answers = answers[:len(request.questions)]
        else:
            # Fallback if response is not in expected format
            answers = [response] if isinstance(response, str) else []
            # Ensure we have the right number of answers
            while len(answers) < len(request.questions):
                answers.append("Not Found")
            answers = answers[:len(request.questions)]

        response_time = time.time() - response_start
        timing_data['response_processing'] = round(response_time, 2)
        print(f"Generated {len(answers)} answers")

        # Calculate total time
        total_time = time.time() - start_time
        timing_data['total_time'] = round(total_time, 2)

        print(f"\n=== TIMING BREAKDOWN ===")
        print(f"PDF Parsing: {timing_data['pdf_parsing']}s")
        print(f"FAISS Index Building: {timing_data['faiss_index_building']}s")
        print(f"Chunk Retrieval: {timing_data['chunk_retrieval']}s")
        print(f"LLM Processing: {timing_data['llm_processing']}s")
        print(f"Response Processing: {timing_data['response_processing']}s")
        print(f"TOTAL TIME: {timing_data['total_time']}s")
        print(f"=======================\n")

        result = {"answers": answers}
        print(f"=== OUTPUT JSON ===")
        print(f"{result}")
        print(f"==================\n")

        return result

    except Exception as e:
        total_time = time.time() - start_time
        print(f"Error after {total_time:.2f} seconds: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")

@app.post("/api/v1/hackrx/local")
async def run_local_query(request: LocalQueryRequest):
    start_time = time.time()
    timing_data = {}

    try:
        print(f"\n=== INPUT JSON ===")
        print(f"Document Path: {request.document_path}")
        print(f"Questions: {request.questions}")
        print(f"==================\n")

        print(f"Processing local document: {request.document_path}")
        print(f"Processing {len(request.questions)} questions...")

        # Time local PDF parsing
        pdf_start = time.time()
        text_chunks = parse_pdf_from_file(request.document_path)
        pdf_time = time.time() - pdf_start
        timing_data['pdf_parsing'] = round(pdf_time, 2)
        print(f"Extracted {len(text_chunks)} text chunks from local PDF")

        # Time FAISS index building
        index_start = time.time()
        index, texts = build_faiss_index(text_chunks)
        index_time = time.time() - index_start
        timing_data['faiss_index_building'] = round(index_time, 2)

        # Time chunk retrieval for all questions
        retrieval_start = time.time()
        all_chunks = set()
        for i, question in enumerate(request.questions):
            question_start = time.time()
            top_chunks = retrieve_chunks(index, texts, question)
            question_time = time.time() - question_start
            all_chunks.update(top_chunks)

        retrieval_time = time.time() - retrieval_start
        timing_data['chunk_retrieval'] = round(retrieval_time, 2)
        print(f"Retrieved {len(all_chunks)} unique chunks")

        # Time LLM processing
        llm_start = time.time()
        print(f"Processing all {len(request.questions)} questions in batch...")
        response = query_gemini(request.questions, list(all_chunks))
        llm_time = time.time() - llm_start
        timing_data['llm_processing'] = round(llm_time, 2)

        # Time response processing
        response_start = time.time()
        # Extract answers from the JSON response
        if isinstance(response, dict) and "answers" in response:
            answers = response["answers"]
            # Ensure we have the right number of answers
            while len(answers) < len(request.questions):
                answers.append("Not Found")
            answers = answers[:len(request.questions)]
        else:
            # Fallback if response is not in expected format
            answers = [response] if isinstance(response, str) else []
            # Ensure we have the right number of answers
            while len(answers) < len(request.questions):
                answers.append("Not Found")
            answers = answers[:len(request.questions)]

        response_time = time.time() - response_start
        timing_data['response_processing'] = round(response_time, 2)
        print(f"Generated {len(answers)} answers")

        # Calculate total time
        total_time = time.time() - start_time
        timing_data['total_time'] = round(total_time, 2)

        print(f"\n=== TIMING BREAKDOWN ===")
        print(f"PDF Parsing: {timing_data['pdf_parsing']}s")
        print(f"FAISS Index Building: {timing_data['faiss_index_building']}s")
        print(f"Chunk Retrieval: {timing_data['chunk_retrieval']}s")
        print(f"LLM Processing: {timing_data['llm_processing']}s")
        print(f"Response Processing: {timing_data['response_processing']}s")
        print(f"TOTAL TIME: {timing_data['total_time']}s")
        print(f"=======================\n")

        result = {"answers": answers}
        print(f"=== OUTPUT JSON ===")
        print(f"{result}")
        print(f"==================\n")

        return result

    except Exception as e:
        total_time = time.time() - start_time
        print(f"Error after {total_time:.2f} seconds: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")

if __name__ == "__main__":
    port = int(os.environ.get("PORT", 10000))
    uvicorn.run("main:app", host="0.0.0.0", port=port)
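Both endpoints in the deleted main.py duplicate the pad/truncate logic for answers. If the file is ever revived, a small helper along these lines (illustrative, not in the original) captures the same behaviour in one place:

```python
def normalize_answers(response, n_questions: int) -> list[str]:
    """Pad or truncate the LLM response to exactly one answer per question."""
    if isinstance(response, dict) and "answers" in response:
        answers = list(response["answers"])
    else:
        answers = [response] if isinstance(response, str) else []
    # Same behaviour as the original while-loop plus slice.
    answers += ["Not Found"] * (n_questions - len(answers))
    return answers[:n_questions]
```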
pdf_parser.py
CHANGED
@@ -2,42 +2,49 @@ import fitz  # PyMuPDF
The file after this change:

import fitz  # PyMuPDF
import requests
from io import BytesIO
from concurrent.futures import ThreadPoolExecutor

def _extract_text(page):
    text = page.get_text()
    return text.strip() if text and text.strip() else None

def parse_pdf_from_url_multithreaded(url, max_workers=2, chunk_size=1):
    """
    Download PDF from URL, extract text in parallel, optionally chunk pages.
    """
    res = requests.get(url)
    with fitz.open(stream=BytesIO(res.content), filetype="pdf") as doc:
        num_pages = len(doc)
        pages = list(doc)
        # Step 1: Parallel text extraction
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            texts = list(executor.map(_extract_text, pages))
        # Step 2: Optional chunking
        if chunk_size > 1:
            chunks = []
            for i in range(0, len(texts), chunk_size):
                chunk = ' '.join([t for t in texts[i:i+chunk_size] if t])
                if chunk:
                    chunks.append(chunk)
            return chunks
        # Default: return one chunk per page
        return [t for t in texts if t]

def parse_pdf_from_file_multithreaded(file_path, max_workers=2, chunk_size=1):
    """
    Parse a local PDF file, extract text in parallel, optionally chunk pages.
    """
    with fitz.open(file_path) as doc:
        num_pages = len(doc)
        pages = list(doc)
        # Step 1: Parallel text extraction
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            texts = list(executor.map(_extract_text, pages))
        # Step 2: Optional chunking
        if chunk_size > 1:
            chunks = []
            for i in range(0, len(texts), chunk_size):
                chunk = ' '.join([t for t in texts[i:i+chunk_size] if t])
                if chunk:
                    chunks.append(chunk)
            return chunks
        return [t for t in texts if t]
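A quick sketch of the two chunking modes; the URL points at a small public sample PDF:

```python
from pdf_parser import parse_pdf_from_url_multithreaded

url = "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf"

per_page = parse_pdf_from_url_multithreaded(url)                # one chunk per page
grouped = parse_pdf_from_url_multithreaded(url, chunk_size=3)   # three pages per chunk
print(len(per_page), len(grouped))
```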
start.sh
DELETED
@@ -1,12 +0,0 @@
#!/bin/bash

# Set up cache directory
mkdir -p .cache
export HF_HOME="$(pwd)/.cache"
export TRANSFORMERS_CACHE="$(pwd)/.cache"

echo "Cache directory set to: $(pwd)/.cache"
echo "Starting application..."

# Run the application
python app.py
test_deployment.py
DELETED
@@ -1,75 +0,0 @@
#!/usr/bin/env python3
"""
Test script for Hugging Face Spaces deployment
"""

import requests
import json
import sys

def test_health_check(base_url):
    """Test the health check endpoint"""
    try:
        response = requests.get(f"{base_url}/")
        print(f"Health check status: {response.status_code}")
        print(f"Response: {response.json()}")
        return response.status_code == 200
    except Exception as e:
        print(f"Health check failed: {e}")
        return False

def test_api_endpoint(base_url, api_key):
    """Test the main API endpoint"""
    try:
        url = f"{base_url}/api/v1/hackrx/run"
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {api_key}"
        }
        data = {
            "documents": "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf",
            "questions": ["What is this document about?"]
        }

        response = requests.post(url, headers=headers, json=data)
        print(f"API test status: {response.status_code}")
        print(f"Response: {response.json()}")
        return response.status_code == 200
    except Exception as e:
        print(f"API test failed: {e}")
        return False

def main():
    if len(sys.argv) < 2:
        print("Usage: python test_deployment.py <base_url> [api_key]")
        print("Example: python test_deployment.py https://your-space-name.hf.space your_api_key")
        sys.exit(1)

    base_url = sys.argv[1].rstrip('/')
    api_key = sys.argv[2] if len(sys.argv) > 2 else "test_token"

    print(f"Testing deployment at: {base_url}")
    print("=" * 50)

    # Test health check
    print("1. Testing health check...")
    health_ok = test_health_check(base_url)

    # Test API endpoint
    print("\n2. Testing API endpoint...")
    api_ok = test_api_endpoint(base_url, api_key)

    # Summary
    print("\n" + "=" * 50)
    print("DEPLOYMENT TEST SUMMARY")
    print("=" * 50)
    print(f"Health check: {'✅ PASS' if health_ok else '❌ FAIL'}")
    print(f"API endpoint: {'✅ PASS' if api_ok else '❌ FAIL'}")

    if health_ok and api_ok:
        print("\n🎉 Deployment is working correctly!")
    else:
        print("\n⚠️ Some tests failed. Check the logs above for details.")

if __name__ == "__main__":
    main()
test_model_loading.py
DELETED
@@ -1,34 +0,0 @@
#!/usr/bin/env python3
"""
Test script to verify model loading works correctly
"""
import os
import sys

# Set up cache directory
cache_dir = os.path.join(os.getcwd(), ".cache")
os.makedirs(cache_dir, exist_ok=True)
os.environ['HF_HOME'] = cache_dir
os.environ['TRANSFORMERS_CACHE'] = cache_dir

print(f"Cache directory: {cache_dir}")
print(f"Current working directory: {os.getcwd()}")

try:
    from embedder import get_model, build_faiss_index

    print("Testing model loading...")
    model = get_model()
    print("✅ Model loaded successfully!")

    # Test with some sample text
    test_chunks = ["This is a test document.", "Another test sentence."]
    print("Testing FAISS index building...")
    index, texts = build_faiss_index(test_chunks)
    print("✅ FAISS index built successfully!")

    print("All tests passed!")

except Exception as e:
    print(f"❌ Error: {e}")
    sys.exit(1)