Rivalcoder committed on
Commit
7acce36
·
1 Parent(s): 0d10b91

Update Prompt

Files changed (48)
  1. .cache/chunks_6635d94cf9023c83521982b3043ec70c.pkl +0 -3
  2. .cache/embeddings_b24811e7d333cc7d5047e52b357abd7e.pkl +0 -3
  3. .cache/hub/models--sentence-transformers--all-MiniLM-L6-v2/refs/main +0 -1
  4. .cache/hub/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/1_Pooling/config.json +0 -7
  5. .cache/hub/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/README.md +0 -173
  6. .cache/hub/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/config_sentence_transformers.json +0 -7
  7. .cache/hub/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/modules.json +0 -20
  8. .cache/hub/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/sentence_bert_config.json +0 -4
  9. .cache/models--sentence-transformers--all-MiniLM-L6-v2/.no_exist/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/adapter_config.json +0 -0
  10. .cache/models--sentence-transformers--all-MiniLM-L6-v2/.no_exist/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/added_tokens.json +0 -0
  11. .cache/models--sentence-transformers--all-MiniLM-L6-v2/.no_exist/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/chat_template.jinja +0 -0
  12. .cache/models--sentence-transformers--all-MiniLM-L6-v2/refs/main +0 -1
  13. .cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/1_Pooling/config.json +0 -7
  14. .cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/README.md +0 -173
  15. .cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/config.json +0 -24
  16. .cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/config_sentence_transformers.json +0 -7
  17. .cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/model.safetensors +0 -3
  18. .cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/modules.json +0 -20
  19. .cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/sentence_bert_config.json +0 -4
  20. .cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/special_tokens_map.json +0 -1
  21. .cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/tokenizer.json +0 -0
  22. .cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/tokenizer_config.json +0 -1
  23. .cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/vocab.txt +0 -0
  24. .cache/models--sentence-transformers--paraphrase-MiniLM-L3-v2/.no_exist/4ca70771034acceecb2e72475f72050fcdde4ddc/adapter_config.json +0 -0
  25. .cache/models--sentence-transformers--paraphrase-MiniLM-L3-v2/.no_exist/4ca70771034acceecb2e72475f72050fcdde4ddc/added_tokens.json +0 -0
  26. .cache/models--sentence-transformers--paraphrase-MiniLM-L3-v2/.no_exist/4ca70771034acceecb2e72475f72050fcdde4ddc/chat_template.jinja +0 -0
  27. .cache/models--sentence-transformers--paraphrase-MiniLM-L3-v2/refs/main +0 -1
  28. .cache/models--sentence-transformers--paraphrase-MiniLM-L3-v2/snapshots/4ca70771034acceecb2e72475f72050fcdde4ddc/1_Pooling/config.json +0 -7
  29. .cache/models--sentence-transformers--paraphrase-MiniLM-L3-v2/snapshots/4ca70771034acceecb2e72475f72050fcdde4ddc/README.md +0 -114
  30. .cache/models--sentence-transformers--paraphrase-MiniLM-L3-v2/snapshots/4ca70771034acceecb2e72475f72050fcdde4ddc/config.json +0 -24
  31. .cache/models--sentence-transformers--paraphrase-MiniLM-L3-v2/snapshots/4ca70771034acceecb2e72475f72050fcdde4ddc/config_sentence_transformers.json +0 -7
  32. .cache/models--sentence-transformers--paraphrase-MiniLM-L3-v2/snapshots/4ca70771034acceecb2e72475f72050fcdde4ddc/model.safetensors +0 -3
  33. .cache/models--sentence-transformers--paraphrase-MiniLM-L3-v2/snapshots/4ca70771034acceecb2e72475f72050fcdde4ddc/modules.json +0 -14
  34. .cache/models--sentence-transformers--paraphrase-MiniLM-L3-v2/snapshots/4ca70771034acceecb2e72475f72050fcdde4ddc/sentence_bert_config.json +0 -4
  35. .cache/models--sentence-transformers--paraphrase-MiniLM-L3-v2/snapshots/4ca70771034acceecb2e72475f72050fcdde4ddc/special_tokens_map.json +0 -1
  36. .cache/models--sentence-transformers--paraphrase-MiniLM-L3-v2/snapshots/4ca70771034acceecb2e72475f72050fcdde4ddc/tokenizer.json +0 -0
  37. .cache/models--sentence-transformers--paraphrase-MiniLM-L3-v2/snapshots/4ca70771034acceecb2e72475f72050fcdde4ddc/tokenizer_config.json +0 -1
  38. .cache/models--sentence-transformers--paraphrase-MiniLM-L3-v2/snapshots/4ca70771034acceecb2e72475f72050fcdde4ddc/vocab.txt +0 -0
  39. .cache/response_2ab720ffccd688afdc790db13e338c83.pkl +0 -3
  40. .gitattributes +0 -35
  41. HUGGINGFACE_DEPLOYMENT.md +0 -112
  42. README_HF.md +0 -112
  43. embedder.py +35 -31
  44. main.py +0 -260
  45. pdf_parser.py +40 -33
  46. start.sh +0 -12
  47. test_deployment.py +0 -75
  48. test_model_loading.py +0 -34
.cache/chunks_6635d94cf9023c83521982b3043ec70c.pkl DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:a4cef2cc09ef9d4ef7d8649bb78ec868e356dcfecbcd6dde23442a90497d407e
- size 124546
.cache/embeddings_b24811e7d333cc7d5047e52b357abd7e.pkl DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:475523b57f8f6b89e62e668efef73309193b05f0f05bbeffb7f012ee952024f0
- size 347400
.cache/hub/models--sentence-transformers--all-MiniLM-L6-v2/refs/main DELETED
@@ -1 +0,0 @@
- c9745ed1d9f207416be6d2e6f8de32d1f16199bf
.cache/hub/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/1_Pooling/config.json DELETED
@@ -1,7 +0,0 @@
- {
-   "word_embedding_dimension": 384,
-   "pooling_mode_cls_token": false,
-   "pooling_mode_mean_tokens": true,
-   "pooling_mode_max_tokens": false,
-   "pooling_mode_mean_sqrt_len_tokens": false
- }
.cache/hub/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/README.md DELETED
@@ -1,173 +0,0 @@
- ---
- language: en
- license: apache-2.0
- library_name: sentence-transformers
- tags:
- - sentence-transformers
- - feature-extraction
- - sentence-similarity
- - transformers
- datasets:
- - s2orc
- - flax-sentence-embeddings/stackexchange_xml
- - ms_marco
- - gooaq
- - yahoo_answers_topics
- - code_search_net
- - search_qa
- - eli5
- - snli
- - multi_nli
- - wikihow
- - natural_questions
- - trivia_qa
- - embedding-data/sentence-compression
- - embedding-data/flickr30k-captions
- - embedding-data/altlex
- - embedding-data/simple-wiki
- - embedding-data/QQP
- - embedding-data/SPECTER
- - embedding-data/PAQ_pairs
- - embedding-data/WikiAnswers
- pipeline_tag: sentence-similarity
- ---
-
- # all-MiniLM-L6-v2
- This is a [sentence-transformers](https://www.SBERT.net) model: it maps sentences & paragraphs to a 384-dimensional dense vector space and can be used for tasks like clustering or semantic search.
-
- ## Usage (Sentence-Transformers)
- Using this model becomes easy when you have [sentence-transformers](https://www.SBERT.net) installed:
-
- ```
- pip install -U sentence-transformers
- ```
-
- Then you can use the model like this:
- ```python
- from sentence_transformers import SentenceTransformer
- sentences = ["This is an example sentence", "Each sentence is converted"]
-
- model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
- embeddings = model.encode(sentences)
- print(embeddings)
- ```
-
- ## Usage (HuggingFace Transformers)
- Without [sentence-transformers](https://www.SBERT.net), you can use the model like this: first, you pass your input through the transformer model, then you apply the right pooling operation on top of the contextualized word embeddings.
-
- ```python
- from transformers import AutoTokenizer, AutoModel
- import torch
- import torch.nn.functional as F
-
- # Mean pooling - take the attention mask into account for correct averaging
- def mean_pooling(model_output, attention_mask):
-     token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
-     input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
-     return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
-
-
- # Sentences we want sentence embeddings for
- sentences = ['This is an example sentence', 'Each sentence is converted']
-
- # Load model from HuggingFace Hub
- tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
- model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
-
- # Tokenize sentences
- encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
-
- # Compute token embeddings
- with torch.no_grad():
-     model_output = model(**encoded_input)
-
- # Perform pooling
- sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
-
- # Normalize embeddings
- sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
-
- print("Sentence embeddings:")
- print(sentence_embeddings)
- ```
-
- ------
-
- ## Background
-
- The project aims to train sentence embedding models on very large sentence-level datasets using a self-supervised
- contrastive learning objective. We used the pretrained [`nreimers/MiniLM-L6-H384-uncased`](https://huggingface.co/nreimers/MiniLM-L6-H384-uncased) model and fine-tuned it on a
- 1B sentence-pairs dataset. We use a contrastive learning objective: given a sentence from the pair, the model should predict which out of a set of randomly sampled other sentences was actually paired with it in our dataset.
-
- We developed this model during the
- [Community week using JAX/Flax for NLP & CV](https://discuss.huggingface.co/t/open-to-the-community-community-week-using-jax-flax-for-nlp-cv/7104),
- organized by Hugging Face. We developed this model as part of the project:
- [Train the Best Sentence Embedding Model Ever with 1B Training Pairs](https://discuss.huggingface.co/t/train-the-best-sentence-embedding-model-ever-with-1b-training-pairs/7354). We benefited from efficient hardware infrastructure to run the project: 7 TPU v3-8s, as well as guidance from Google's Flax, JAX, and Cloud team members about efficient deep learning frameworks.
-
- ## Intended uses
-
- Our model is intended to be used as a sentence and short-paragraph encoder. Given an input text, it outputs a vector which captures
- the semantic information. The sentence vector may be used for information retrieval, clustering, or sentence-similarity tasks.
-
- By default, input text longer than 256 word pieces is truncated.
-
- ## Training procedure
-
- ### Pre-training
-
- We use the pretrained [`nreimers/MiniLM-L6-H384-uncased`](https://huggingface.co/nreimers/MiniLM-L6-H384-uncased) model. Please refer to the model card for more detailed information about the pre-training procedure.
-
- ### Fine-tuning
-
- We fine-tune the model using a contrastive objective. Formally, we compute the cosine similarity between each possible sentence pair in the batch.
- We then apply the cross-entropy loss by comparing with the true pairs.
-
- #### Hyperparameters
-
- We trained our model on a TPU v3-8 for 100k steps using a batch size of 1024 (128 per TPU core).
- We used a learning-rate warm-up of 500 steps. The sequence length was limited to 128 tokens. We used the AdamW optimizer with
- a 2e-5 learning rate. The full training script is accessible in this current repository: `train_script.py`.
-
- #### Training data
-
- We use the concatenation of multiple datasets to fine-tune our model. The total number of sentence pairs is above 1 billion.
- We sampled each dataset with a weighted probability; the configuration is detailed in the `data_config.json` file.
-
- | Dataset | Paper | Number of training tuples |
- |--------------------------------------------------------|:----------------------------------------:|:--------------------------:|
- | [Reddit comments (2015-2018)](https://github.com/PolyAI-LDN/conversational-datasets/tree/master/reddit) | [paper](https://arxiv.org/abs/1904.06472) | 726,484,430 |
- | [S2ORC](https://github.com/allenai/s2orc) Citation pairs (Abstracts) | [paper](https://aclanthology.org/2020.acl-main.447/) | 116,288,806 |
- | [WikiAnswers](https://github.com/afader/oqa#wikianswers-corpus) Duplicate question pairs | [paper](https://doi.org/10.1145/2623330.2623677) | 77,427,422 |
- | [PAQ](https://github.com/facebookresearch/PAQ) (Question, Answer) pairs | [paper](https://arxiv.org/abs/2102.07033) | 64,371,441 |
- | [S2ORC](https://github.com/allenai/s2orc) Citation pairs (Titles) | [paper](https://aclanthology.org/2020.acl-main.447/) | 52,603,982 |
- | [S2ORC](https://github.com/allenai/s2orc) (Title, Abstract) | [paper](https://aclanthology.org/2020.acl-main.447/) | 41,769,185 |
- | [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) (Title, Body) pairs | - | 25,316,456 |
- | [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) (Title+Body, Answer) pairs | - | 21,396,559 |
- | [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) (Title, Answer) pairs | - | 21,396,559 |
- | [MS MARCO](https://microsoft.github.io/msmarco/) triplets | [paper](https://doi.org/10.1145/3404835.3462804) | 9,144,553 |
- | [GOOAQ: Open Question Answering with Diverse Answer Types](https://github.com/allenai/gooaq) | [paper](https://arxiv.org/pdf/2104.08727.pdf) | 3,012,496 |
- | [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Title, Answer) | [paper](https://proceedings.neurips.cc/paper/2015/hash/250cf8b51c773f3f8dc8b4be867a9a02-Abstract.html) | 1,198,260 |
- | [Code Search](https://huggingface.co/datasets/code_search_net) | - | 1,151,414 |
- | [COCO](https://cocodataset.org/#home) Image captions | [paper](https://link.springer.com/chapter/10.1007%2F978-3-319-10602-1_48) | 828,395 |
- | [SPECTER](https://github.com/allenai/specter) citation triplets | [paper](https://doi.org/10.18653/v1/2020.acl-main.207) | 684,100 |
- | [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Question, Answer) | [paper](https://proceedings.neurips.cc/paper/2015/hash/250cf8b51c773f3f8dc8b4be867a9a02-Abstract.html) | 681,164 |
- | [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Title, Question) | [paper](https://proceedings.neurips.cc/paper/2015/hash/250cf8b51c773f3f8dc8b4be867a9a02-Abstract.html) | 659,896 |
- | [SearchQA](https://huggingface.co/datasets/search_qa) | [paper](https://arxiv.org/abs/1704.05179) | 582,261 |
- | [Eli5](https://huggingface.co/datasets/eli5) | [paper](https://doi.org/10.18653/v1/p19-1346) | 325,475 |
- | [Flickr 30k](https://shannon.cs.illinois.edu/DenotationGraph/) | [paper](https://transacl.org/ojs/index.php/tacl/article/view/229/33) | 317,695 |
- | [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) Duplicate questions (titles) | - | 304,525 |
- | AllNLI ([SNLI](https://nlp.stanford.edu/projects/snli/) and [MultiNLI](https://cims.nyu.edu/~sbowman/multinli/)) | [paper SNLI](https://doi.org/10.18653/v1/d15-1075), [paper MultiNLI](https://doi.org/10.18653/v1/n18-1101) | 277,230 |
- | [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) Duplicate questions (bodies) | - | 250,519 |
- | [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) Duplicate questions (titles+bodies) | - | 250,460 |
- | [Sentence Compression](https://github.com/google-research-datasets/sentence-compression) | [paper](https://www.aclweb.org/anthology/D13-1155/) | 180,000 |
- | [Wikihow](https://github.com/pvl/wikihow_pairs_dataset) | [paper](https://arxiv.org/abs/1810.09305) | 128,542 |
- | [Altlex](https://github.com/chridey/altlex/) | [paper](https://aclanthology.org/P16-1135.pdf) | 112,696 |
- | [Quora Question Triplets](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) | - | 103,663 |
- | [Simple Wikipedia](https://cs.pomona.edu/~dkauchak/simplification/) | [paper](https://www.aclweb.org/anthology/P11-2117/) | 102,225 |
- | [Natural Questions (NQ)](https://ai.google.com/research/NaturalQuestions) | [paper](https://transacl.org/ojs/index.php/tacl/article/view/1455) | 100,231 |
- | [SQuAD2.0](https://rajpurkar.github.io/SQuAD-explorer/) | [paper](https://aclanthology.org/P18-2124.pdf) | 87,599 |
- | [TriviaQA](https://huggingface.co/datasets/trivia_qa) | - | 73,346 |
- | **Total** | | **1,170,060,424** |
.cache/hub/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/config_sentence_transformers.json DELETED
@@ -1,7 +0,0 @@
- {
-   "__version__": {
-     "sentence_transformers": "2.0.0",
-     "transformers": "4.6.1",
-     "pytorch": "1.8.1"
-   }
- }
.cache/hub/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/modules.json DELETED
@@ -1,20 +0,0 @@
- [
-   {
-     "idx": 0,
-     "name": "0",
-     "path": "",
-     "type": "sentence_transformers.models.Transformer"
-   },
-   {
-     "idx": 1,
-     "name": "1",
-     "path": "1_Pooling",
-     "type": "sentence_transformers.models.Pooling"
-   },
-   {
-     "idx": 2,
-     "name": "2",
-     "path": "2_Normalize",
-     "type": "sentence_transformers.models.Normalize"
-   }
- ]
.cache/hub/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/sentence_bert_config.json DELETED
@@ -1,4 +0,0 @@
- {
-   "max_seq_length": 256,
-   "do_lower_case": false
- }
.cache/models--sentence-transformers--all-MiniLM-L6-v2/.no_exist/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/adapter_config.json DELETED
File without changes
.cache/models--sentence-transformers--all-MiniLM-L6-v2/.no_exist/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/added_tokens.json DELETED
File without changes
.cache/models--sentence-transformers--all-MiniLM-L6-v2/.no_exist/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/chat_template.jinja DELETED
File without changes
.cache/models--sentence-transformers--all-MiniLM-L6-v2/refs/main DELETED
@@ -1 +0,0 @@
- c9745ed1d9f207416be6d2e6f8de32d1f16199bf
.cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/1_Pooling/config.json DELETED
@@ -1,7 +0,0 @@
- {
-   "word_embedding_dimension": 384,
-   "pooling_mode_cls_token": false,
-   "pooling_mode_mean_tokens": true,
-   "pooling_mode_max_tokens": false,
-   "pooling_mode_mean_sqrt_len_tokens": false
- }
.cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/README.md DELETED
@@ -1,173 +0,0 @@
- ---
- language: en
- license: apache-2.0
- library_name: sentence-transformers
- tags:
- - sentence-transformers
- - feature-extraction
- - sentence-similarity
- - transformers
- datasets:
- - s2orc
- - flax-sentence-embeddings/stackexchange_xml
- - ms_marco
- - gooaq
- - yahoo_answers_topics
- - code_search_net
- - search_qa
- - eli5
- - snli
- - multi_nli
- - wikihow
- - natural_questions
- - trivia_qa
- - embedding-data/sentence-compression
- - embedding-data/flickr30k-captions
- - embedding-data/altlex
- - embedding-data/simple-wiki
- - embedding-data/QQP
- - embedding-data/SPECTER
- - embedding-data/PAQ_pairs
- - embedding-data/WikiAnswers
- pipeline_tag: sentence-similarity
- ---
-
- # all-MiniLM-L6-v2
- This is a [sentence-transformers](https://www.SBERT.net) model: it maps sentences & paragraphs to a 384-dimensional dense vector space and can be used for tasks like clustering or semantic search.
-
- ## Usage (Sentence-Transformers)
- Using this model becomes easy when you have [sentence-transformers](https://www.SBERT.net) installed:
-
- ```
- pip install -U sentence-transformers
- ```
-
- Then you can use the model like this:
- ```python
- from sentence_transformers import SentenceTransformer
- sentences = ["This is an example sentence", "Each sentence is converted"]
-
- model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
- embeddings = model.encode(sentences)
- print(embeddings)
- ```
-
- ## Usage (HuggingFace Transformers)
- Without [sentence-transformers](https://www.SBERT.net), you can use the model like this: first, you pass your input through the transformer model, then you apply the right pooling operation on top of the contextualized word embeddings.
-
- ```python
- from transformers import AutoTokenizer, AutoModel
- import torch
- import torch.nn.functional as F
-
- # Mean pooling - take the attention mask into account for correct averaging
- def mean_pooling(model_output, attention_mask):
-     token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
-     input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
-     return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
-
-
- # Sentences we want sentence embeddings for
- sentences = ['This is an example sentence', 'Each sentence is converted']
-
- # Load model from HuggingFace Hub
- tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
- model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
-
- # Tokenize sentences
- encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
-
- # Compute token embeddings
- with torch.no_grad():
-     model_output = model(**encoded_input)
-
- # Perform pooling
- sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
-
- # Normalize embeddings
- sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
-
- print("Sentence embeddings:")
- print(sentence_embeddings)
- ```
-
- ------
-
- ## Background
-
- The project aims to train sentence embedding models on very large sentence-level datasets using a self-supervised
- contrastive learning objective. We used the pretrained [`nreimers/MiniLM-L6-H384-uncased`](https://huggingface.co/nreimers/MiniLM-L6-H384-uncased) model and fine-tuned it on a
- 1B sentence-pairs dataset. We use a contrastive learning objective: given a sentence from the pair, the model should predict which out of a set of randomly sampled other sentences was actually paired with it in our dataset.
-
- We developed this model during the
- [Community week using JAX/Flax for NLP & CV](https://discuss.huggingface.co/t/open-to-the-community-community-week-using-jax-flax-for-nlp-cv/7104),
- organized by Hugging Face. We developed this model as part of the project:
- [Train the Best Sentence Embedding Model Ever with 1B Training Pairs](https://discuss.huggingface.co/t/train-the-best-sentence-embedding-model-ever-with-1b-training-pairs/7354). We benefited from efficient hardware infrastructure to run the project: 7 TPU v3-8s, as well as guidance from Google's Flax, JAX, and Cloud team members about efficient deep learning frameworks.
-
- ## Intended uses
-
- Our model is intended to be used as a sentence and short-paragraph encoder. Given an input text, it outputs a vector which captures
- the semantic information. The sentence vector may be used for information retrieval, clustering, or sentence-similarity tasks.
-
- By default, input text longer than 256 word pieces is truncated.
-
- ## Training procedure
-
- ### Pre-training
-
- We use the pretrained [`nreimers/MiniLM-L6-H384-uncased`](https://huggingface.co/nreimers/MiniLM-L6-H384-uncased) model. Please refer to the model card for more detailed information about the pre-training procedure.
-
- ### Fine-tuning
-
- We fine-tune the model using a contrastive objective. Formally, we compute the cosine similarity between each possible sentence pair in the batch.
- We then apply the cross-entropy loss by comparing with the true pairs.
-
- #### Hyperparameters
-
- We trained our model on a TPU v3-8 for 100k steps using a batch size of 1024 (128 per TPU core).
- We used a learning-rate warm-up of 500 steps. The sequence length was limited to 128 tokens. We used the AdamW optimizer with
- a 2e-5 learning rate. The full training script is accessible in this current repository: `train_script.py`.
-
- #### Training data
-
- We use the concatenation of multiple datasets to fine-tune our model. The total number of sentence pairs is above 1 billion.
- We sampled each dataset with a weighted probability; the configuration is detailed in the `data_config.json` file.
-
- | Dataset | Paper | Number of training tuples |
- |--------------------------------------------------------|:----------------------------------------:|:--------------------------:|
- | [Reddit comments (2015-2018)](https://github.com/PolyAI-LDN/conversational-datasets/tree/master/reddit) | [paper](https://arxiv.org/abs/1904.06472) | 726,484,430 |
- | [S2ORC](https://github.com/allenai/s2orc) Citation pairs (Abstracts) | [paper](https://aclanthology.org/2020.acl-main.447/) | 116,288,806 |
- | [WikiAnswers](https://github.com/afader/oqa#wikianswers-corpus) Duplicate question pairs | [paper](https://doi.org/10.1145/2623330.2623677) | 77,427,422 |
- | [PAQ](https://github.com/facebookresearch/PAQ) (Question, Answer) pairs | [paper](https://arxiv.org/abs/2102.07033) | 64,371,441 |
- | [S2ORC](https://github.com/allenai/s2orc) Citation pairs (Titles) | [paper](https://aclanthology.org/2020.acl-main.447/) | 52,603,982 |
- | [S2ORC](https://github.com/allenai/s2orc) (Title, Abstract) | [paper](https://aclanthology.org/2020.acl-main.447/) | 41,769,185 |
- | [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) (Title, Body) pairs | - | 25,316,456 |
- | [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) (Title+Body, Answer) pairs | - | 21,396,559 |
- | [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) (Title, Answer) pairs | - | 21,396,559 |
- | [MS MARCO](https://microsoft.github.io/msmarco/) triplets | [paper](https://doi.org/10.1145/3404835.3462804) | 9,144,553 |
- | [GOOAQ: Open Question Answering with Diverse Answer Types](https://github.com/allenai/gooaq) | [paper](https://arxiv.org/pdf/2104.08727.pdf) | 3,012,496 |
- | [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Title, Answer) | [paper](https://proceedings.neurips.cc/paper/2015/hash/250cf8b51c773f3f8dc8b4be867a9a02-Abstract.html) | 1,198,260 |
- | [Code Search](https://huggingface.co/datasets/code_search_net) | - | 1,151,414 |
- | [COCO](https://cocodataset.org/#home) Image captions | [paper](https://link.springer.com/chapter/10.1007%2F978-3-319-10602-1_48) | 828,395 |
- | [SPECTER](https://github.com/allenai/specter) citation triplets | [paper](https://doi.org/10.18653/v1/2020.acl-main.207) | 684,100 |
- | [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Question, Answer) | [paper](https://proceedings.neurips.cc/paper/2015/hash/250cf8b51c773f3f8dc8b4be867a9a02-Abstract.html) | 681,164 |
- | [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Title, Question) | [paper](https://proceedings.neurips.cc/paper/2015/hash/250cf8b51c773f3f8dc8b4be867a9a02-Abstract.html) | 659,896 |
- | [SearchQA](https://huggingface.co/datasets/search_qa) | [paper](https://arxiv.org/abs/1704.05179) | 582,261 |
- | [Eli5](https://huggingface.co/datasets/eli5) | [paper](https://doi.org/10.18653/v1/p19-1346) | 325,475 |
- | [Flickr 30k](https://shannon.cs.illinois.edu/DenotationGraph/) | [paper](https://transacl.org/ojs/index.php/tacl/article/view/229/33) | 317,695 |
- | [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) Duplicate questions (titles) | - | 304,525 |
- | AllNLI ([SNLI](https://nlp.stanford.edu/projects/snli/) and [MultiNLI](https://cims.nyu.edu/~sbowman/multinli/)) | [paper SNLI](https://doi.org/10.18653/v1/d15-1075), [paper MultiNLI](https://doi.org/10.18653/v1/n18-1101) | 277,230 |
- | [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) Duplicate questions (bodies) | - | 250,519 |
- | [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) Duplicate questions (titles+bodies) | - | 250,460 |
- | [Sentence Compression](https://github.com/google-research-datasets/sentence-compression) | [paper](https://www.aclweb.org/anthology/D13-1155/) | 180,000 |
- | [Wikihow](https://github.com/pvl/wikihow_pairs_dataset) | [paper](https://arxiv.org/abs/1810.09305) | 128,542 |
- | [Altlex](https://github.com/chridey/altlex/) | [paper](https://aclanthology.org/P16-1135.pdf) | 112,696 |
- | [Quora Question Triplets](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) | - | 103,663 |
- | [Simple Wikipedia](https://cs.pomona.edu/~dkauchak/simplification/) | [paper](https://www.aclweb.org/anthology/P11-2117/) | 102,225 |
- | [Natural Questions (NQ)](https://ai.google.com/research/NaturalQuestions) | [paper](https://transacl.org/ojs/index.php/tacl/article/view/1455) | 100,231 |
- | [SQuAD2.0](https://rajpurkar.github.io/SQuAD-explorer/) | [paper](https://aclanthology.org/P18-2124.pdf) | 87,599 |
- | [TriviaQA](https://huggingface.co/datasets/trivia_qa) | - | 73,346 |
- | **Total** | | **1,170,060,424** |
.cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/config.json DELETED
@@ -1,24 +0,0 @@
- {
-   "_name_or_path": "nreimers/MiniLM-L6-H384-uncased",
-   "architectures": [
-     "BertModel"
-   ],
-   "attention_probs_dropout_prob": 0.1,
-   "gradient_checkpointing": false,
-   "hidden_act": "gelu",
-   "hidden_dropout_prob": 0.1,
-   "hidden_size": 384,
-   "initializer_range": 0.02,
-   "intermediate_size": 1536,
-   "layer_norm_eps": 1e-12,
-   "max_position_embeddings": 512,
-   "model_type": "bert",
-   "num_attention_heads": 12,
-   "num_hidden_layers": 6,
-   "pad_token_id": 0,
-   "position_embedding_type": "absolute",
-   "transformers_version": "4.8.2",
-   "type_vocab_size": 2,
-   "use_cache": true,
-   "vocab_size": 30522
- }
.cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/config_sentence_transformers.json DELETED
@@ -1,7 +0,0 @@
- {
-   "__version__": {
-     "sentence_transformers": "2.0.0",
-     "transformers": "4.6.1",
-     "pytorch": "1.8.1"
-   }
- }
.cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/model.safetensors DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:53aa51172d142c89d9012cce15ae4d6cc0ca6895895114379cacb4fab128d9db
- size 90868376
.cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/modules.json DELETED
@@ -1,20 +0,0 @@
- [
-   {
-     "idx": 0,
-     "name": "0",
-     "path": "",
-     "type": "sentence_transformers.models.Transformer"
-   },
-   {
-     "idx": 1,
-     "name": "1",
-     "path": "1_Pooling",
-     "type": "sentence_transformers.models.Pooling"
-   },
-   {
-     "idx": 2,
-     "name": "2",
-     "path": "2_Normalize",
-     "type": "sentence_transformers.models.Normalize"
-   }
- ]
.cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/sentence_bert_config.json DELETED
@@ -1,4 +0,0 @@
- {
-   "max_seq_length": 256,
-   "do_lower_case": false
- }
.cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/special_tokens_map.json DELETED
@@ -1 +0,0 @@
- {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
.cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/tokenizer.json DELETED
The diff for this file is too large to render. See raw diff
 
.cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/tokenizer_config.json DELETED
@@ -1 +0,0 @@
- {"do_lower_case": true, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "name_or_path": "nreimers/MiniLM-L6-H384-uncased", "do_basic_tokenize": true, "never_split": null, "tokenizer_class": "BertTokenizer", "model_max_length": 512}
.cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/vocab.txt DELETED
The diff for this file is too large to render. See raw diff
 
.cache/models--sentence-transformers--paraphrase-MiniLM-L3-v2/.no_exist/4ca70771034acceecb2e72475f72050fcdde4ddc/adapter_config.json DELETED
File without changes
.cache/models--sentence-transformers--paraphrase-MiniLM-L3-v2/.no_exist/4ca70771034acceecb2e72475f72050fcdde4ddc/added_tokens.json DELETED
File without changes
.cache/models--sentence-transformers--paraphrase-MiniLM-L3-v2/.no_exist/4ca70771034acceecb2e72475f72050fcdde4ddc/chat_template.jinja DELETED
File without changes
.cache/models--sentence-transformers--paraphrase-MiniLM-L3-v2/refs/main DELETED
@@ -1 +0,0 @@
- 4ca70771034acceecb2e72475f72050fcdde4ddc
.cache/models--sentence-transformers--paraphrase-MiniLM-L3-v2/snapshots/4ca70771034acceecb2e72475f72050fcdde4ddc/1_Pooling/config.json DELETED
@@ -1,7 +0,0 @@
- {
-   "word_embedding_dimension": 384,
-   "pooling_mode_cls_token": false,
-   "pooling_mode_mean_tokens": true,
-   "pooling_mode_max_tokens": false,
-   "pooling_mode_mean_sqrt_len_tokens": false
- }
.cache/models--sentence-transformers--paraphrase-MiniLM-L3-v2/snapshots/4ca70771034acceecb2e72475f72050fcdde4ddc/README.md DELETED
@@ -1,114 +0,0 @@
- ---
- license: apache-2.0
- library_name: sentence-transformers
- tags:
- - sentence-transformers
- - feature-extraction
- - sentence-similarity
- - transformers
- datasets:
- - flax-sentence-embeddings/stackexchange_xml
- - s2orc
- - ms_marco
- - wiki_atomic_edits
- - snli
- - multi_nli
- - embedding-data/altlex
- - embedding-data/simple-wiki
- - embedding-data/flickr30k-captions
- - embedding-data/coco_captions
- - embedding-data/sentence-compression
- - embedding-data/QQP
- - yahoo_answers_topics
- pipeline_tag: sentence-similarity
- ---
-
- # sentence-transformers/paraphrase-MiniLM-L3-v2
-
- This is a [sentence-transformers](https://www.SBERT.net) model: it maps sentences & paragraphs to a 384-dimensional dense vector space and can be used for tasks like clustering or semantic search.
-
- ## Usage (Sentence-Transformers)
-
- Using this model becomes easy when you have [sentence-transformers](https://www.SBERT.net) installed:
-
- ```
- pip install -U sentence-transformers
- ```
-
- Then you can use the model like this:
-
- ```python
- from sentence_transformers import SentenceTransformer
- sentences = ["This is an example sentence", "Each sentence is converted"]
-
- model = SentenceTransformer('sentence-transformers/paraphrase-MiniLM-L3-v2')
- embeddings = model.encode(sentences)
- print(embeddings)
- ```
-
- ## Usage (HuggingFace Transformers)
- Without [sentence-transformers](https://www.SBERT.net), you can use the model like this: first, you pass your input through the transformer model, then you apply the right pooling operation on top of the contextualized word embeddings.
-
- ```python
- from transformers import AutoTokenizer, AutoModel
- import torch
-
-
- # Mean pooling - take the attention mask into account for correct averaging
- def mean_pooling(model_output, attention_mask):
-     token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
-     input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
-     return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
-
-
- # Sentences we want sentence embeddings for
- sentences = ['This is an example sentence', 'Each sentence is converted']
-
- # Load model from HuggingFace Hub
- tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/paraphrase-MiniLM-L3-v2')
- model = AutoModel.from_pretrained('sentence-transformers/paraphrase-MiniLM-L3-v2')
-
- # Tokenize sentences
- encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
-
- # Compute token embeddings
- with torch.no_grad():
-     model_output = model(**encoded_input)
-
- # Perform pooling. In this case, mean pooling.
- sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
-
- print("Sentence embeddings:")
- print(sentence_embeddings)
- ```
-
- ## Full Model Architecture
- ```
- SentenceTransformer(
-   (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel
-   (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
- )
- ```
-
- ## Citing & Authors
-
- This model was trained by [sentence-transformers](https://www.sbert.net/).
-
- If you find this model helpful, feel free to cite our publication [Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks](https://arxiv.org/abs/1908.10084):
- ```bibtex
- @inproceedings{reimers-2019-sentence-bert,
-     title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
-     author = "Reimers, Nils and Gurevych, Iryna",
-     booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
-     month = "11",
-     year = "2019",
-     publisher = "Association for Computational Linguistics",
-     url = "http://arxiv.org/abs/1908.10084",
- }
- ```
.cache/models--sentence-transformers--paraphrase-MiniLM-L3-v2/snapshots/4ca70771034acceecb2e72475f72050fcdde4ddc/config.json DELETED
@@ -1,24 +0,0 @@
- {
-   "_name_or_path": "old_models/paraphrase-MiniLM-L3-v2/0_Transformer",
-   "architectures": [
-     "BertModel"
-   ],
-   "attention_probs_dropout_prob": 0.1,
-   "gradient_checkpointing": false,
-   "hidden_act": "gelu",
-   "hidden_dropout_prob": 0.1,
-   "hidden_size": 384,
-   "initializer_range": 0.02,
-   "intermediate_size": 1536,
-   "layer_norm_eps": 1e-12,
-   "max_position_embeddings": 512,
-   "model_type": "bert",
-   "num_attention_heads": 12,
-   "num_hidden_layers": 3,
-   "pad_token_id": 0,
-   "position_embedding_type": "absolute",
-   "transformers_version": "4.7.0",
-   "type_vocab_size": 2,
-   "use_cache": true,
-   "vocab_size": 30522
- }
.cache/models--sentence-transformers--paraphrase-MiniLM-L3-v2/snapshots/4ca70771034acceecb2e72475f72050fcdde4ddc/config_sentence_transformers.json DELETED
@@ -1,7 +0,0 @@
- {
-   "__version__": {
-     "sentence_transformers": "2.0.0",
-     "transformers": "4.7.0",
-     "pytorch": "1.9.0+cu102"
-   }
- }
.cache/models--sentence-transformers--paraphrase-MiniLM-L3-v2/snapshots/4ca70771034acceecb2e72475f72050fcdde4ddc/model.safetensors DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:cf1e4e2d420c664973037c3c73125d7a8fc69952495093ef8f50596f8943a433
- size 69569488
.cache/models--sentence-transformers--paraphrase-MiniLM-L3-v2/snapshots/4ca70771034acceecb2e72475f72050fcdde4ddc/modules.json DELETED
@@ -1,14 +0,0 @@
- [
-   {
-     "idx": 0,
-     "name": "0",
-     "path": "",
-     "type": "sentence_transformers.models.Transformer"
-   },
-   {
-     "idx": 1,
-     "name": "1",
-     "path": "1_Pooling",
-     "type": "sentence_transformers.models.Pooling"
-   }
- ]
.cache/models--sentence-transformers--paraphrase-MiniLM-L3-v2/snapshots/4ca70771034acceecb2e72475f72050fcdde4ddc/sentence_bert_config.json DELETED
@@ -1,4 +0,0 @@
- {
-   "max_seq_length": 128,
-   "do_lower_case": false
- }
.cache/models--sentence-transformers--paraphrase-MiniLM-L3-v2/snapshots/4ca70771034acceecb2e72475f72050fcdde4ddc/special_tokens_map.json DELETED
@@ -1 +0,0 @@
- {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
.cache/models--sentence-transformers--paraphrase-MiniLM-L3-v2/snapshots/4ca70771034acceecb2e72475f72050fcdde4ddc/tokenizer.json DELETED
The diff for this file is too large to render. See raw diff
 
.cache/models--sentence-transformers--paraphrase-MiniLM-L3-v2/snapshots/4ca70771034acceecb2e72475f72050fcdde4ddc/tokenizer_config.json DELETED
@@ -1 +0,0 @@
- {"do_lower_case": true, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "name_or_path": "nreimers/MiniLM-L3-H384-uncased", "do_basic_tokenize": true, "never_split": null, "model_max_length": 512}
.cache/models--sentence-transformers--paraphrase-MiniLM-L3-v2/snapshots/4ca70771034acceecb2e72475f72050fcdde4ddc/vocab.txt DELETED
The diff for this file is too large to render. See raw diff
 
.cache/response_2ab720ffccd688afdc790db13e338c83.pkl DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:1c5853e52bd3fdc0bdf05ca5b73769bc17fe8f44fe56271a78a87f155c5de6da
- size 429
.gitattributes DELETED
@@ -1,35 +0,0 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
HUGGINGFACE_DEPLOYMENT.md DELETED
@@ -1,112 +0,0 @@
- # Hugging Face Spaces Deployment Guide
-
- This guide will help you deploy your HackRx Insurance Policy Assistant to Hugging Face Spaces.
-
- ## Prerequisites
-
- 1. A Hugging Face account (free at https://huggingface.co)
- 2. A Google Gemini API key
- 3. Your code pushed to a Git repository (GitHub, GitLab, etc.)
-
- ## Step 1: Prepare Your Repository
-
- Your repository should contain the following files:
- - `app.py` - Main application entry point
- - `Dockerfile` - Docker configuration
- - `requirements.txt` - Python dependencies
- - `parser.py`, `embedder.py`, `retriever.py`, `llm.py` - Application modules
- - `.dockerignore` - Docker build optimization
-
- ## Step 2: Create a Hugging Face Space
-
- 1. Go to https://huggingface.co/spaces
- 2. Click "Create new Space"
- 3. Choose the following settings:
-    - **Owner**: Your username
-    - **Space name**: `hackrx-insurance-assistant` (or your preferred name)
-    - **Space SDK**: `Docker`
-    - **License**: Choose an appropriate license
-    - **Visibility**: Public or Private (your choice)
-
- ## Step 3: Connect Your Repository
-
- 1. In your new Space, go to the "Settings" tab
- 2. Under "Repository", click "Connect to existing repository"
- 3. Select your Git provider (GitHub, GitLab, etc.)
- 4. Choose your repository
- 5. Click "Connect"
-
- ## Step 4: Configure Environment Variables
-
- 1. In your Space settings, go to the "Repository secrets" section
- 2. Add the following secret:
-    - **Name**: `GOOGLE_API_KEY`
-    - **Value**: Your Google Gemini API key
-
- ## Step 5: Deploy
-
- 1. Push your code to your Git repository
- 2. Hugging Face Spaces will automatically detect the changes and start building
- 3. You can monitor the build progress in the "Logs" tab
- 4. Once built successfully, your API will be available at `https://your-space-name.hf.space`
-
- ## Step 6: Test Your Deployment
-
- ### Health Check
- ```bash
- curl https://your-space-name.hf.space/
- ```
-
- ### Test API Endpoint
- ```bash
- curl -X POST https://your-space-name.hf.space/api/v1/hackrx/run \
-   -H "Content-Type: application/json" \
-   -H "Authorization: Bearer your_token_here" \
-   -d '{
-     "documents": "https://example.com/insurance-policy.pdf",
-     "questions": ["What is the coverage amount?"]
-   }'
- ```
-
- ## Troubleshooting
-
- ### Common Issues
-
- 1. **Build Fails**: Check the logs in the "Logs" tab for error messages
- 2. **Environment Variable Not Set**: Ensure `GOOGLE_API_KEY` is set in Space secrets
- 3. **Port Issues**: The application runs on port 7860 (the default for Hugging Face Spaces)
- 4. **Memory Issues**: If you encounter memory issues, consider optimizing the Dockerfile
-
- ### Debugging
-
- 1. Check the build logs in the "Logs" tab
- 2. Monitor the application logs for runtime errors
- 3. Test locally first to ensure everything works
-
- ## API Documentation
-
- Once deployed, your API will have the following endpoints:
-
- - `GET /` - Health check
- - `GET /health` - API status
- - `POST /api/v1/hackrx/run` - Process a PDF from a URL
- - `POST /api/v1/hackrx/local` - Process a local PDF file
-
- ## Cost Considerations
-
- - Hugging Face Spaces offers free hosting for public spaces
- - Private spaces may have usage limits
- - Consider the cost of Google Gemini API calls
-
- ## Security Notes
-
- - Keep your API keys secure
- - Use appropriate authentication for production use
- - Consider rate limiting for public APIs
-
- ## Updates
-
- To update your deployment:
- 1. Push changes to your Git repository
- 2. Hugging Face Spaces will automatically rebuild and deploy
- 3. Monitor the build process in the "Logs" tab
README_HF.md DELETED
@@ -1,112 +0,0 @@
- # HackRx Insurance Policy Assistant
-
- A FastAPI application that processes PDF documents and answers questions using AI, deployed on Hugging Face Spaces.
-
- ## Features
-
- - PDF document parsing and text extraction
- - Vector-based document search using FAISS
- - AI-powered question answering using Google Gemini
- - RESTful API endpoints for document processing
-
- ## API Endpoints
-
- ### Health Check
- - `GET /` - Root endpoint
- - `GET /health` - API status check
-
- ### Process PDF from URL
- - `POST /api/v1/hackrx/run`
- - **Headers**: `Authorization: Bearer <your_token>`
- - **Body**:
- ```json
- {
-   "documents": "https://example.com/document.pdf",
-   "questions": ["What is the coverage amount?", "What are the exclusions?"]
- }
- ```
-
- ### Process Local PDF File
- - `POST /api/v1/hackrx/local`
- - **Body**:
- ```json
- {
-   "document_path": "/app/files/document.pdf",
-   "questions": ["What is the coverage amount?", "What are the exclusions?"]
- }
- ```
-
- ## Environment Variables
-
- Set these in your Hugging Face Space settings:
-
- - `GOOGLE_API_KEY` - Your Google Gemini API key
-
- ## Usage Examples
-
- ### Using curl
-
- ```bash
- # Health check
- curl https://your-space-name.hf.space/
-
- # Process PDF from URL
- curl -X POST https://your-space-name.hf.space/api/v1/hackrx/run \
-   -H "Content-Type: application/json" \
-   -H "Authorization: Bearer your_token_here" \
-   -d '{
-     "documents": "https://example.com/insurance-policy.pdf",
-     "questions": ["What is the coverage amount?", "What are the exclusions?"]
-   }'
- ```
-
- ### Using Python
-
- ```python
- import requests
-
- # Health check
- response = requests.get("https://your-space-name.hf.space/")
- print(response.json())
-
- # Process PDF
- url = "https://your-space-name.hf.space/api/v1/hackrx/run"
- headers = {
-     "Content-Type": "application/json",
-     "Authorization": "Bearer your_token_here"
- }
- data = {
-     "documents": "https://example.com/insurance-policy.pdf",
-     "questions": ["What is the coverage amount?", "What are the exclusions?"]
- }
-
- response = requests.post(url, headers=headers, json=data)
- print(response.json())
- ```
-
- ## Local Development
-
- To run the application locally:
-
- ```bash
- pip install -r requirements.txt
- python app.py
- ```
-
- The API will be available at `http://localhost:7860`
-
- ## Deployment
-
- This application is configured for deployment on Hugging Face Spaces using Docker. The following files are included:
-
- - `app.py` - Main application entry point
- - `Dockerfile` - Docker configuration
- - `.dockerignore` - Docker build optimization
- - `requirements.txt` - Python dependencies
-
- ## Model Information
-
- - **Framework**: FastAPI
- - **AI Model**: Google Gemini
- - **Vector Database**: FAISS
- - **Document Processing**: PyMuPDF
embedder.py CHANGED
@@ -1,48 +1,52 @@
  import faiss
- from sentence_transformers import SentenceTransformer
  import numpy as np
  import os

- # Set up cache directory in a writable location
  cache_dir = os.path.join(os.getcwd(), ".cache")
  os.makedirs(cache_dir, exist_ok=True)
  os.environ['HF_HOME'] = cache_dir
  os.environ['TRANSFORMERS_CACHE'] = cache_dir

- # Initialize model as None - will be loaded lazily
  _model = None

- def preload_model():
-     """Preload the sentence transformer model at startup"""
      global _model
-     if _model is None:
-         print("Preloading sentence transformer model...")
-         try:
-             _model = SentenceTransformer("all-MiniLM-L6-v2", cache_folder=cache_dir)
-             print("Model preloading completed")
-         except Exception as e:
-             print(f"Error loading model: {e}")
-             # Fallback to a different model if the first one fails
-             try:
-                 _model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", cache_folder=cache_dir)
-                 print("Fallback model preloading completed")
-             except Exception as e2:
-                 print(f"Error loading fallback model: {e2}")
-                 raise
      return _model

  def get_model():
-     """Get the sentence transformer model, loading it lazily if needed"""
-     global _model
-     if _model is None:
-         print("Warning: Model not preloaded, loading now...")
-         return preload_model()
-     return _model

- def build_faiss_index(chunks):
      model = get_model()
-     embeddings = model.encode(chunks)
-     dimension = embeddings.shape[1]
-     index = faiss.IndexFlatL2(dimension)
-     index.add(np.array(embeddings))
-     return index, chunks

  import faiss
  import numpy as np
  import os
+ from sentence_transformers import SentenceTransformer

+ # Use a local cache for transformer downloads
  cache_dir = os.path.join(os.getcwd(), ".cache")
  os.makedirs(cache_dir, exist_ok=True)
  os.environ['HF_HOME'] = cache_dir
  os.environ['TRANSFORMERS_CACHE'] = cache_dir

+ # Lazy-loaded model
  _model = None

+ def preload_model(model_name="all-MiniLM-L6-v2"):
      global _model
+     if _model is not None:
+         return _model
+
+     print("Preloading sentence transformer model...")
+
+     try:
+         _model = SentenceTransformer(model_name, cache_folder=cache_dir)
+     except Exception as e:
+         print(f"Primary model load failed: {e}")
+         fallback_name = "sentence-transformers/" + model_name
+         print(f"Trying fallback: {fallback_name}")
+         _model = SentenceTransformer(fallback_name, cache_folder=cache_dir)
+
+     print("βœ… Model ready.")
      return _model

  def get_model():
+     return preload_model()

+ def build_faiss_index(chunks, batch_size=128, show_progress_bar=False):
      model = get_model()
+
+     # Encode using batching for speed
+     embeddings = model.encode(
+         chunks,
+         batch_size=batch_size,
+         show_progress_bar=show_progress_bar,
+         convert_to_numpy=True,
+         normalize_embeddings=True  # Helps FAISS L2 perform better
+     )
+
+     dim = embeddings.shape[1]
+     index = faiss.IndexFlatL2(dim)
+     index.add(embeddings)
+
+     return index, chunks
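Note: because the new `build_faiss_index` normalizes embeddings, L2 distance over the index ranks results the same way cosine similarity would. A minimal usage sketch of the new function (the chunk texts and query below are illustrative, not from the repo):

```python
# Sketch: index a few chunks, then run one nearest-neighbour query.
from embedder import build_faiss_index, get_model

chunks = [
    "Hospitalisation cover begins 30 days after the policy start date.",
    "Maternity benefits require 24 months of continuous coverage.",
]
index, texts = build_faiss_index(chunks)

# Encode the query the same way the index was built: normalized numpy vectors.
query = get_model().encode(
    ["When does hospitalisation cover begin?"],
    convert_to_numpy=True,
    normalize_embeddings=True,
)
distances, ids = index.search(query, k=1)
print(texts[ids[0][0]], float(distances[0][0]))
```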
main.py DELETED
@@ -1,260 +0,0 @@
- import os
- import warnings
- import logging
- import time
- from datetime import datetime
-
- # Suppress TensorFlow warnings
- os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
- os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
- os.environ['TF_LOGGING_LEVEL'] = 'ERROR'
- os.environ['TF_ENABLE_DEPRECATION_WARNINGS'] = '0'
-
- # Suppress specific TensorFlow deprecation warnings
- warnings.filterwarnings('ignore', category=DeprecationWarning, module='tensorflow')
- logging.getLogger('tensorflow').setLevel(logging.ERROR)
-
- from fastapi import FastAPI, Request, HTTPException, Depends, Header
- from fastapi.middleware.cors import CORSMiddleware
- from pydantic import BaseModel
- from pdf_parser import parse_pdf_from_url, parse_pdf_from_file
- from embedder import build_faiss_index, preload_model
- from retriever import retrieve_chunks
- from llm import query_gemini
- import uvicorn
-
- app = FastAPI(title="HackRx Insurance Policy Assistant", version="1.0.0")
-
- # Add CORS middleware
- app.add_middleware(
-     CORSMiddleware,
-     allow_origins=["*"],
-     allow_credentials=True,
-     allow_methods=["*"],
-     allow_headers=["*"],
- )
-
- # Preload the model at startup
- @app.on_event("startup")
- async def startup_event():
-     print("Starting up HackRx Insurance Policy Assistant...")
-     print("Preloading sentence transformer model...")
-     preload_model()
-     print("Model preloading completed. API is ready to serve requests!")
-
- @app.get("/")
- async def root():
-     return {"message": "HackRx Insurance Policy Assistant API is running!"}
-
- @app.get("/health")
- async def health_check():
-     return {"status": "healthy", "message": "API is ready to process requests"}
-
- class QueryRequest(BaseModel):
-     documents: str
-     questions: list[str]
-
- class LocalQueryRequest(BaseModel):
-     document_path: str
-     questions: list[str]
-
- def verify_token(authorization: str = Header(None)):
-     if not authorization or not authorization.startswith("Bearer "):
-         raise HTTPException(status_code=401, detail="Invalid authorization header")
-
-     token = authorization.replace("Bearer ", "")
-     # For demo purposes, accept any token. In production, validate against a database
-     if not token:
-         raise HTTPException(status_code=401, detail="Invalid token")
-
-     return token
-
- @app.post("/api/v1/hackrx/run")
- async def run_query(request: QueryRequest, token: str = Depends(verify_token)):
-     start_time = time.time()
-     timing_data = {}
-
-     try:
-         print(f"\n=== INPUT JSON ===")
-         print(f"Documents: {request.documents}")
-         print(f"Questions: {request.questions}")
-         print(f"==================\n")
-
-         print(f"Processing {len(request.questions)} questions...")
-
-         # Time PDF parsing
-         pdf_start = time.time()
-         text_chunks = parse_pdf_from_url(request.documents)
-         pdf_time = time.time() - pdf_start
-         timing_data['pdf_parsing'] = round(pdf_time, 2)
-         print(f"Extracted {len(text_chunks)} text chunks from PDF")
-
-         # Time FAISS index building
-         index_start = time.time()
-         index, texts = build_faiss_index(text_chunks)
-         index_time = time.time() - index_start
-         timing_data['faiss_index_building'] = round(index_time, 2)
-
-         # Time chunk retrieval for all questions
-         retrieval_start = time.time()
-         all_chunks = set()
-         for i, question in enumerate(request.questions):
-             question_start = time.time()
-             top_chunks = retrieve_chunks(index, texts, question)
-             question_time = time.time() - question_start
-             all_chunks.update(top_chunks)
-
-         retrieval_time = time.time() - retrieval_start
-         timing_data['chunk_retrieval'] = round(retrieval_time, 2)
-         print(f"Retrieved {len(all_chunks)} unique chunks")
-
-         # Time LLM processing
-         llm_start = time.time()
-         print(f"Processing all {len(request.questions)} questions in batch...")
-         response = query_gemini(request.questions, list(all_chunks))
-         llm_time = time.time() - llm_start
-         timing_data['llm_processing'] = round(llm_time, 2)
-
-         # Time response processing
-         response_start = time.time()
-         # Extract answers from the JSON response
-         if isinstance(response, dict) and "answers" in response:
-             answers = response["answers"]
-             # Ensure we have the right number of answers
-             while len(answers) < len(request.questions):
-                 answers.append("Not Found")
-             answers = answers[:len(request.questions)]
-         else:
-             # Fallback if response is not in expected format
-             answers = [response] if isinstance(response, str) else []
-             # Ensure we have the right number of answers
-             while len(answers) < len(request.questions):
-                 answers.append("Not Found")
-             answers = answers[:len(request.questions)]
-
-         response_time = time.time() - response_start
-         timing_data['response_processing'] = round(response_time, 2)
-         print(f"Generated {len(answers)} answers")
-
-         # Calculate total time
-         total_time = time.time() - start_time
-         timing_data['total_time'] = round(total_time, 2)
-
-         print(f"\n=== TIMING BREAKDOWN ===")
-         print(f"PDF Parsing: {timing_data['pdf_parsing']}s")
-         print(f"FAISS Index Building: {timing_data['faiss_index_building']}s")
-         print(f"Chunk Retrieval: {timing_data['chunk_retrieval']}s")
-         print(f"LLM Processing: {timing_data['llm_processing']}s")
-         print(f"Response Processing: {timing_data['response_processing']}s")
-         print(f"TOTAL TIME: {timing_data['total_time']}s")
-         print(f"=======================\n")
-
-         result = {"answers": answers}
-         print(f"=== OUTPUT JSON ===")
-         print(f"{result}")
-         print(f"==================\n")
-
-         return result
-
-     except Exception as e:
-         total_time = time.time() - start_time
-         print(f"Error after {total_time:.2f} seconds: {str(e)}")
-         raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
-
- @app.post("/api/v1/hackrx/local")
- async def run_local_query(request: LocalQueryRequest):
-     start_time = time.time()
-     timing_data = {}
-
-     try:
-         print(f"\n=== INPUT JSON ===")
-         print(f"Document Path: {request.document_path}")
-         print(f"Questions: {request.questions}")
-         print(f"==================\n")
-
-         print(f"Processing local document: {request.document_path}")
-         print(f"Processing {len(request.questions)} questions...")
-
-         # Time local PDF parsing
-         pdf_start = time.time()
-         text_chunks = parse_pdf_from_file(request.document_path)
-         pdf_time = time.time() - pdf_start
-         timing_data['pdf_parsing'] = round(pdf_time, 2)
-         print(f"Extracted {len(text_chunks)} text chunks from local PDF")
-
-         # Time FAISS index building
-         index_start = time.time()
-         index, texts = build_faiss_index(text_chunks)
-         index_time = time.time() - index_start
-         timing_data['faiss_index_building'] = round(index_time, 2)
-
-         # Time chunk retrieval for all questions
-         retrieval_start = time.time()
-         all_chunks = set()
-         for i, question in enumerate(request.questions):
-             question_start = time.time()
-             top_chunks = retrieve_chunks(index, texts, question)
-             question_time = time.time() - question_start
-             all_chunks.update(top_chunks)
-
-         retrieval_time = time.time() - retrieval_start
-         timing_data['chunk_retrieval'] = round(retrieval_time, 2)
-         print(f"Retrieved {len(all_chunks)} unique chunks")
-
-         # Time LLM processing
-         llm_start = time.time()
-         print(f"Processing all {len(request.questions)} questions in batch...")
-         response = query_gemini(request.questions, list(all_chunks))
-         llm_time = time.time() - llm_start
-         timing_data['llm_processing'] = round(llm_time, 2)
-
-         # Time response processing
-         response_start = time.time()
-         # Extract answers from the JSON response
-         if isinstance(response, dict) and "answers" in response:
-             answers = response["answers"]
-             # Ensure we have the right number of answers
-             while len(answers) < len(request.questions):
-                 answers.append("Not Found")
-             answers = answers[:len(request.questions)]
-         else:
-             # Fallback if response is not in expected format
-             answers = [response] if isinstance(response, str) else []
-             # Ensure we have the right number of answers
-             while len(answers) < len(request.questions):
-                 answers.append("Not Found")
-             answers = answers[:len(request.questions)]
-
-         response_time = time.time() - response_start
-         timing_data['response_processing'] = round(response_time, 2)
-         print(f"Generated {len(answers)} answers")
-
-         # Calculate total time
-         total_time = time.time() - start_time
-         timing_data['total_time'] = round(total_time, 2)
-
-         print(f"\n=== TIMING BREAKDOWN ===")
-         print(f"PDF Parsing: {timing_data['pdf_parsing']}s")
-         print(f"FAISS Index Building: {timing_data['faiss_index_building']}s")
-         print(f"Chunk Retrieval: {timing_data['chunk_retrieval']}s")
-         print(f"LLM Processing: {timing_data['llm_processing']}s")
-         print(f"Response Processing: {timing_data['response_processing']}s")
-         print(f"TOTAL TIME: {timing_data['total_time']}s")
-         print(f"=======================\n")
-
-         result = {"answers": answers}
-         print(f"=== OUTPUT JSON ===")
-         print(f"{result}")
-         print(f"==================\n")
-
-         return result
-
-     except Exception as e:
-         total_time = time.time() - start_time
-         print(f"Error after {total_time:.2f} seconds: {str(e)}")
-         raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
-
- if __name__ == "__main__":
-     port = int(os.environ.get("PORT", 10000))
-     uvicorn.run("main:app", host="0.0.0.0", port=port)
-
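For reference, the deleted endpoints accepted a Bearer token (any non-empty token passed `verify_token`) and a JSON body with `documents` (a PDF URL) plus `questions`, returning `{"answers": [...]}`. A client sketch of that contract, assuming the same route is still served by the app on the README's port 7860 (the PDF URL below is a placeholder):

```python
# Sketch: exercise the /api/v1/hackrx/run contract defined above.
import requests

payload = {
    "documents": "https://example.com/policy.pdf",  # placeholder URL
    "questions": ["What is the waiting period for pre-existing diseases?"],
}
resp = requests.post(
    "http://localhost:7860/api/v1/hackrx/run",
    headers={"Authorization": "Bearer test_token"},  # any non-empty token is accepted
    json=payload,
    timeout=120,
)
resp.raise_for_status()
print(resp.json()["answers"])
```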
pdf_parser.py CHANGED
@@ -2,42 +2,49 @@ import fitz # PyMuPDF
  import requests
  from io import BytesIO
  from concurrent.futures import ThreadPoolExecutor
- import os

- def extract_page_text(page):
      text = page.get_text()
-     return text if text.strip() else None
-
- def parse_pdf_from_url_multithreaded(url, max_workers=None):
-     # Automatically detect and use all available CPU cores if max_workers not set
-     if max_workers is None:
-         max_workers = os.cpu_count() or 8
      res = requests.get(url)
-     doc = fitz.open(stream=BytesIO(res.content), filetype="pdf")
-     pages = [page for page in doc]
-     chunks = [None] * len(pages)
-
-     # Process pages in parallel, preserving page order
-     with ThreadPoolExecutor(max_workers=max_workers) as executor:
-         results = list(executor.map(extract_page_text, pages))
-
-     # Keep only non-empty page results, preserving order
-     doc.close()
-     return [r for r in results if r]
-
- def parse_pdf_from_file_multithreaded(file_path, max_workers=None):
-     if max_workers is None:
-         max_workers = os.cpu_count() or 8
-
-     try:
-         doc = fitz.open(file_path)
-         pages = [page for page in doc]
-         chunks = [None] * len(pages)
-
      with ThreadPoolExecutor(max_workers=max_workers) as executor:
-         results = list(executor.map(extract_page_text, pages))
-         doc.close()
-         return [r for r in results if r]
-     except Exception as e:
-         raise Exception(f"Error parsing PDF file {file_path}: {str(e)}")

  import requests
  from io import BytesIO
  from concurrent.futures import ThreadPoolExecutor

+ def _extract_text(page):
      text = page.get_text()
+     return text.strip() if text and text.strip() else None

+ def parse_pdf_from_url_multithreaded(url, max_workers=2, chunk_size=1):
+     """
+     Download PDF from URL, extract text in parallel, optionally chunk pages.
+     """
      res = requests.get(url)
+     with fitz.open(stream=BytesIO(res.content), filetype="pdf") as doc:
+         num_pages = len(doc)
+         pages = list(doc)
+         # Step 1: Parallel text extraction
+         with ThreadPoolExecutor(max_workers=max_workers) as executor:
+             texts = list(executor.map(_extract_text, pages))
+     # Step 2: Optional chunking
+     if chunk_size > 1:
+         chunks = []
+         for i in range(0, len(texts), chunk_size):
+             chunk = ' '.join([t for t in texts[i:i+chunk_size] if t])
+             if chunk:
+                 chunks.append(chunk)
+         return chunks
+     # Default: return one chunk per page
+     return [t for t in texts if t]

+ def parse_pdf_from_file_multithreaded(file_path, max_workers=2, chunk_size=1):
+     """
+     Parse a local PDF file, extract text in parallel, optionally chunk pages.
+     """
+     with fitz.open(file_path) as doc:
+         num_pages = len(doc)
+         pages = list(doc)
+         # Step 1: Parallel text extraction
          with ThreadPoolExecutor(max_workers=max_workers) as executor:
+             texts = list(executor.map(_extract_text, pages))
+     # Step 2: Optional chunking
+     if chunk_size > 1:
+         chunks = []
+         for i in range(0, len(texts), chunk_size):
+             chunk = ' '.join([t for t in texts[i:i+chunk_size] if t])
+             if chunk:
+                 chunks.append(chunk)
+         return chunks
+     return [t for t in texts if t]
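The new `chunk_size` parameter merges consecutive pages into a single chunk, trading retrieval granularity for fewer, larger embeddings. A short usage sketch ("policy.pdf" is a placeholder path):

```python
# Sketch: parse a local PDF, merging every two pages into one chunk.
from pdf_parser import parse_pdf_from_file_multithreaded

chunks = parse_pdf_from_file_multithreaded("policy.pdf", max_workers=2, chunk_size=2)
print(f"{len(chunks)} chunks extracted")
for chunk in chunks[:2]:
    print(chunk[:80], "...")
```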
start.sh DELETED
@@ -1,12 +0,0 @@
- #!/bin/bash
-
- # Set up cache directory
- mkdir -p .cache
- export HF_HOME="$(pwd)/.cache"
- export TRANSFORMERS_CACHE="$(pwd)/.cache"
-
- echo "Cache directory set to: $(pwd)/.cache"
- echo "Starting application..."
-
- # Run the application
- python app.py
test_deployment.py DELETED
@@ -1,75 +0,0 @@
- #!/usr/bin/env python3
- """
- Test script for Hugging Face Spaces deployment
- """
-
- import requests
- import json
- import sys
-
- def test_health_check(base_url):
-     """Test the health check endpoint"""
-     try:
-         response = requests.get(f"{base_url}/")
-         print(f"Health check status: {response.status_code}")
-         print(f"Response: {response.json()}")
-         return response.status_code == 200
-     except Exception as e:
-         print(f"Health check failed: {e}")
-         return False
-
- def test_api_endpoint(base_url, api_key):
-     """Test the main API endpoint"""
-     try:
-         url = f"{base_url}/api/v1/hackrx/run"
-         headers = {
-             "Content-Type": "application/json",
-             "Authorization": f"Bearer {api_key}"
-         }
-         data = {
-             "documents": "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf",
-             "questions": ["What is this document about?"]
-         }
-
-         response = requests.post(url, headers=headers, json=data)
-         print(f"API test status: {response.status_code}")
-         print(f"Response: {response.json()}")
-         return response.status_code == 200
-     except Exception as e:
-         print(f"API test failed: {e}")
-         return False
-
- def main():
-     if len(sys.argv) < 2:
-         print("Usage: python test_deployment.py <base_url> [api_key]")
-         print("Example: python test_deployment.py https://your-space-name.hf.space your_api_key")
-         sys.exit(1)
-
-     base_url = sys.argv[1].rstrip('/')
-     api_key = sys.argv[2] if len(sys.argv) > 2 else "test_token"
-
-     print(f"Testing deployment at: {base_url}")
-     print("=" * 50)
-
-     # Test health check
-     print("1. Testing health check...")
-     health_ok = test_health_check(base_url)
-
-     # Test API endpoint
-     print("\n2. Testing API endpoint...")
-     api_ok = test_api_endpoint(base_url, api_key)
-
-     # Summary
-     print("\n" + "=" * 50)
-     print("DEPLOYMENT TEST SUMMARY")
-     print("=" * 50)
-     print(f"Health check: {'βœ… PASS' if health_ok else '❌ FAIL'}")
-     print(f"API endpoint: {'βœ… PASS' if api_ok else '❌ FAIL'}")
-
-     if health_ok and api_ok:
-         print("\nπŸŽ‰ Deployment is working correctly!")
-     else:
-         print("\n⚠️ Some tests failed. Check the logs above for details.")
-
- if __name__ == "__main__":
-     main()
test_model_loading.py DELETED
@@ -1,34 +0,0 @@
- #!/usr/bin/env python3
- """
- Test script to verify model loading works correctly
- """
- import os
- import sys
-
- # Set up cache directory
- cache_dir = os.path.join(os.getcwd(), ".cache")
- os.makedirs(cache_dir, exist_ok=True)
- os.environ['HF_HOME'] = cache_dir
- os.environ['TRANSFORMERS_CACHE'] = cache_dir
-
- print(f"Cache directory: {cache_dir}")
- print(f"Current working directory: {os.getcwd()}")
-
- try:
-     from embedder import get_model, build_faiss_index
-
-     print("Testing model loading...")
-     model = get_model()
-     print("βœ“ Model loaded successfully!")
-
-     # Test with some sample text
-     test_chunks = ["This is a test document.", "Another test sentence."]
-     print("Testing FAISS index building...")
-     index, texts = build_faiss_index(test_chunks)
-     print("βœ“ FAISS index built successfully!")
-
-     print("All tests passed!")
-
- except Exception as e:
-     print(f"βœ— Error: {e}")
-     sys.exit(1)