Spaces:
Runtime error
Runtime error
Commit
·
8c107a7
1
Parent(s):
22585fc
swap to new embedding model and handle user 'i dont know' scenario
Browse files- config/model_config_advanced.yml +1 -1
- database/mock_qna.sqlite +1 -1
- models/chroma_db_advanced/a88943fe-4428-425d-8b9c-7bb8665a0c79/link_lists.bin +0 -0
- models/chroma_db_advanced/af9795b7-8b5f-4493-adbc-40aedf3c96ed/data_level0.bin +3 -0
- models/chroma_db_advanced/{a88943fe-4428-425d-8b9c-7bb8665a0c79 → af9795b7-8b5f-4493-adbc-40aedf3c96ed}/header.bin +1 -1
- models/chroma_db_advanced/{a88943fe-4428-425d-8b9c-7bb8665a0c79/data_level0.bin → af9795b7-8b5f-4493-adbc-40aedf3c96ed/index_metadata.pickle} +2 -2
- models/chroma_db_advanced/{a88943fe-4428-425d-8b9c-7bb8665a0c79 → af9795b7-8b5f-4493-adbc-40aedf3c96ed}/length.bin +2 -2
- models/chroma_db_advanced/af9795b7-8b5f-4493-adbc-40aedf3c96ed/link_lists.bin +3 -0
- models/chroma_db_advanced/chroma.sqlite3 +2 -2
- notebooks/002_persisted-embedding-model-advanced.ipynb +228 -69
- preprocess_raw_documents.py +16 -0
- qna_prompting.py +33 -13
- streamlit_app.py +1 -1
config/model_config_advanced.yml
CHANGED
|
@@ -14,4 +14,4 @@ vector_store:
|
|
| 14 |
persisted_path: './models/chroma_db_advanced'
|
| 15 |
|
| 16 |
questionaire_data:
|
| 17 |
-
db_path: './database/
|
|
|
|
| 14 |
persisted_path: './models/chroma_db_advanced'
|
| 15 |
|
| 16 |
questionaire_data:
|
| 17 |
+
db_path: './database/mock_qna.sqlite'
|
database/mock_qna.sqlite
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 40960
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2d51005d26f568ee304005ab7cf52cdc58a55f528230ae914a11dc9b75219623
|
| 3 |
size 40960
|
models/chroma_db_advanced/a88943fe-4428-425d-8b9c-7bb8665a0c79/link_lists.bin
DELETED
|
File without changes
|
models/chroma_db_advanced/af9795b7-8b5f-4493-adbc-40aedf3c96ed/data_level0.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:453d35bee81975816ce0a286e796c4884c609c148e52d0605ac221daa46bf3d7
|
| 3 |
+
size 10056000
|
models/chroma_db_advanced/{a88943fe-4428-425d-8b9c-7bb8665a0c79 → af9795b7-8b5f-4493-adbc-40aedf3c96ed}/header.bin
RENAMED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 100
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:89bd0cf182f20a10a0d7faa81bf3304c0565bc9b6f4705056ae63c061b9269ff
|
| 3 |
size 100
|
models/chroma_db_advanced/{a88943fe-4428-425d-8b9c-7bb8665a0c79/data_level0.bin → af9795b7-8b5f-4493-adbc-40aedf3c96ed/index_metadata.pickle}
RENAMED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c5ecccac152d2deee938b41b1533b454bb8d5778a0befcd855529538a1a17bdf
|
| 3 |
+
size 346049
|
models/chroma_db_advanced/{a88943fe-4428-425d-8b9c-7bb8665a0c79 → af9795b7-8b5f-4493-adbc-40aedf3c96ed}/length.bin
RENAMED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ae1fb78e4b679db0ad051360ddb549f4584c14a8b45f99d8d052f7d67067acb3
|
| 3 |
+
size 24000
|
models/chroma_db_advanced/af9795b7-8b5f-4493-adbc-40aedf3c96ed/link_lists.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:535d672bfbbeec1181b50015d78bc1e776088cbbb0738d04bc725a76249eb744
|
| 3 |
+
size 52152
|
models/chroma_db_advanced/chroma.sqlite3
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:74c0d3543bf7cab83459feda7fad58a984a7c018fc566f79e937038b3756fcca
|
| 3 |
+
size 101720064
|
notebooks/002_persisted-embedding-model-advanced.ipynb
CHANGED
|
@@ -10,11 +10,16 @@
|
|
| 10 |
},
|
| 11 |
{
|
| 12 |
"cell_type": "code",
|
| 13 |
-
"execution_count":
|
| 14 |
"id": "7de9c591-5a77-4bbe-80f1-4897e15f0b97",
|
| 15 |
"metadata": {},
|
| 16 |
"outputs": [],
|
| 17 |
"source": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
"import chromadb\n",
|
| 19 |
"from llama_index.core import VectorStoreIndex, SimpleDirectoryReader\n",
|
| 20 |
"from llama_index.vector_stores.chroma.base import ChromaVectorStore\n",
|
|
@@ -31,27 +36,78 @@
|
|
| 31 |
"import time"
|
| 32 |
]
|
| 33 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
{
|
| 35 |
"cell_type": "code",
|
| 36 |
"execution_count": null,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
"id": "3e65dff6-77b6-4be8-8857-5cecf3a035bb",
|
| 38 |
"metadata": {},
|
| 39 |
"outputs": [],
|
| 40 |
"source": [
|
| 41 |
"# load some documents\n",
|
| 42 |
"documents = SimpleDirectoryReader(input_files=[\n",
|
| 43 |
-
" \"../raw_documents/qna.txt\",\n",
|
| 44 |
" \"../raw_documents/HI Chapter Summary Version 1.3.pdf\",\n",
|
| 45 |
" \"../raw_documents/conversation_examples.txt\",\n",
|
| 46 |
" \"../raw_documents/HI_Knowledge_Base.pdf\",\n",
|
| 47 |
-
"
|
| 48 |
-
" ]).load_data()\n",
|
| 49 |
"document = Document(text=\"\\n\\n\".join([doc.text for doc in documents]))"
|
| 50 |
]
|
| 51 |
},
|
| 52 |
{
|
| 53 |
"cell_type": "code",
|
| 54 |
-
"execution_count":
|
| 55 |
"id": "bd86b3f5-1dfc-4257-bd9c-86d34f02398d",
|
| 56 |
"metadata": {},
|
| 57 |
"outputs": [],
|
|
@@ -62,7 +118,7 @@
|
|
| 62 |
},
|
| 63 |
{
|
| 64 |
"cell_type": "code",
|
| 65 |
-
"execution_count":
|
| 66 |
"id": "f568ce7b-bcbf-455c-acf1-6c2cae129fed",
|
| 67 |
"metadata": {},
|
| 68 |
"outputs": [],
|
|
@@ -73,7 +129,7 @@
|
|
| 73 |
},
|
| 74 |
{
|
| 75 |
"cell_type": "code",
|
| 76 |
-
"execution_count":
|
| 77 |
"id": "ed0b018e-1982-46b2-b1b4-04f5c0ce8672",
|
| 78 |
"metadata": {},
|
| 79 |
"outputs": [],
|
|
@@ -92,19 +148,28 @@
|
|
| 92 |
},
|
| 93 |
{
|
| 94 |
"cell_type": "code",
|
| 95 |
-
"execution_count":
|
| 96 |
"id": "0946b6ce-96ab-44de-ad75-e424a8429f67",
|
| 97 |
"metadata": {},
|
| 98 |
-
"outputs": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
"source": [
|
| 100 |
"Settings.llm = None\n",
|
| 101 |
"Settings.chunk_size = 1024\n",
|
|
|
|
| 102 |
"Settings.embed_model = \"local:../models/fine-tuned-embeddings-advanced\""
|
| 103 |
]
|
| 104 |
},
|
| 105 |
{
|
| 106 |
"cell_type": "code",
|
| 107 |
-
"execution_count":
|
| 108 |
"id": "b8c73a2c-1129-406a-8046-085afcaf9cbb",
|
| 109 |
"metadata": {},
|
| 110 |
"outputs": [],
|
|
@@ -114,10 +179,21 @@
|
|
| 114 |
},
|
| 115 |
{
|
| 116 |
"cell_type": "code",
|
| 117 |
-
"execution_count":
|
| 118 |
"id": "75f1c76f-d3e5-4b69-818c-98865adb1457",
|
| 119 |
"metadata": {},
|
| 120 |
-
"outputs": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 121 |
"source": [
|
| 122 |
"len(nodes)"
|
| 123 |
]
|
|
@@ -132,7 +208,7 @@
|
|
| 132 |
},
|
| 133 |
{
|
| 134 |
"cell_type": "code",
|
| 135 |
-
"execution_count":
|
| 136 |
"id": "dab4c6f3-ef67-4d90-b3d5-e290c5d1b6f4",
|
| 137 |
"metadata": {},
|
| 138 |
"outputs": [],
|
|
@@ -142,7 +218,7 @@
|
|
| 142 |
},
|
| 143 |
{
|
| 144 |
"cell_type": "code",
|
| 145 |
-
"execution_count":
|
| 146 |
"id": "6a764113-ad7e-4674-aa57-ebbf405902a8",
|
| 147 |
"metadata": {},
|
| 148 |
"outputs": [],
|
|
@@ -160,7 +236,7 @@
|
|
| 160 |
},
|
| 161 |
{
|
| 162 |
"cell_type": "code",
|
| 163 |
-
"execution_count":
|
| 164 |
"id": "e492ed4a-23a3-47d6-8b50-51fb48b3aa05",
|
| 165 |
"metadata": {},
|
| 166 |
"outputs": [],
|
|
@@ -170,7 +246,7 @@
|
|
| 170 |
},
|
| 171 |
{
|
| 172 |
"cell_type": "code",
|
| 173 |
-
"execution_count":
|
| 174 |
"id": "cbd11b89-9b83-4f08-bb30-160f750f2ffb",
|
| 175 |
"metadata": {},
|
| 176 |
"outputs": [],
|
|
@@ -180,39 +256,88 @@
|
|
| 180 |
},
|
| 181 |
{
|
| 182 |
"cell_type": "code",
|
| 183 |
-
"execution_count":
|
| 184 |
-
"id": "
|
| 185 |
"metadata": {},
|
| 186 |
-
"outputs": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 187 |
"source": [
|
| 188 |
-
"
|
|
|
|
|
|
|
| 189 |
]
|
| 190 |
},
|
| 191 |
{
|
| 192 |
"cell_type": "code",
|
| 193 |
-
"execution_count":
|
| 194 |
-
"id": "
|
| 195 |
"metadata": {},
|
| 196 |
"outputs": [],
|
| 197 |
"source": [
|
| 198 |
-
"
|
| 199 |
-
"indexing_cost = indexing_cost / 60\n",
|
| 200 |
-
"print(f\"Indexing time: {indexing_cost:.1f} mins\")"
|
| 201 |
]
|
| 202 |
},
|
| 203 |
{
|
| 204 |
"cell_type": "code",
|
| 205 |
-
"execution_count":
|
| 206 |
"id": "3290e870-41d7-49c4-9c4f-cb16bd1f469e",
|
| 207 |
"metadata": {
|
| 208 |
"scrolled": true
|
| 209 |
},
|
| 210 |
-
"outputs": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 211 |
"source": [
|
| 212 |
"response = vector_query_engine.query(\"Healthcare System in Singapore consists of?\")\n",
|
| 213 |
"response"
|
| 214 |
]
|
| 215 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 216 |
{
|
| 217 |
"cell_type": "code",
|
| 218 |
"execution_count": null,
|
|
@@ -239,7 +364,7 @@
|
|
| 239 |
},
|
| 240 |
{
|
| 241 |
"cell_type": "code",
|
| 242 |
-
"execution_count":
|
| 243 |
"id": "c1a42c35-5f57-423c-8fb7-7d18b3b466b5",
|
| 244 |
"metadata": {},
|
| 245 |
"outputs": [],
|
|
@@ -269,7 +394,7 @@
|
|
| 269 |
},
|
| 270 |
{
|
| 271 |
"cell_type": "code",
|
| 272 |
-
"execution_count":
|
| 273 |
"id": "d38dc953-b923-4128-86a1-c8c6f69af0ed",
|
| 274 |
"metadata": {},
|
| 275 |
"outputs": [],
|
|
@@ -279,7 +404,7 @@
|
|
| 279 |
},
|
| 280 |
{
|
| 281 |
"cell_type": "code",
|
| 282 |
-
"execution_count":
|
| 283 |
"id": "4c83c613-2cfc-4871-9d07-c82f77a3bd5e",
|
| 284 |
"metadata": {},
|
| 285 |
"outputs": [],
|
|
@@ -289,7 +414,7 @@
|
|
| 289 |
},
|
| 290 |
{
|
| 291 |
"cell_type": "code",
|
| 292 |
-
"execution_count":
|
| 293 |
"id": "0583e9b0-d977-488c-8331-46dfa749924c",
|
| 294 |
"metadata": {},
|
| 295 |
"outputs": [],
|
|
@@ -308,7 +433,7 @@
|
|
| 308 |
},
|
| 309 |
{
|
| 310 |
"cell_type": "code",
|
| 311 |
-
"execution_count":
|
| 312 |
"id": "2159a2b6-494b-41b9-ac54-dd342bfb74ba",
|
| 313 |
"metadata": {},
|
| 314 |
"outputs": [],
|
|
@@ -318,7 +443,7 @@
|
|
| 318 |
},
|
| 319 |
{
|
| 320 |
"cell_type": "code",
|
| 321 |
-
"execution_count":
|
| 322 |
"id": "1b385644-b46e-4d13-88fa-9f4af39db405",
|
| 323 |
"metadata": {},
|
| 324 |
"outputs": [],
|
|
@@ -328,7 +453,7 @@
|
|
| 328 |
},
|
| 329 |
{
|
| 330 |
"cell_type": "code",
|
| 331 |
-
"execution_count":
|
| 332 |
"id": "93cb53d1-6b8c-4b2d-a839-53501c0d54b2",
|
| 333 |
"metadata": {},
|
| 334 |
"outputs": [],
|
|
@@ -340,7 +465,7 @@
|
|
| 340 |
},
|
| 341 |
{
|
| 342 |
"cell_type": "code",
|
| 343 |
-
"execution_count":
|
| 344 |
"id": "c40d59e1-6d42-41f0-8c9b-70aa026093ae",
|
| 345 |
"metadata": {},
|
| 346 |
"outputs": [],
|
|
@@ -362,7 +487,7 @@
|
|
| 362 |
},
|
| 363 |
{
|
| 364 |
"cell_type": "code",
|
| 365 |
-
"execution_count":
|
| 366 |
"id": "1a506940-c2b4-4d14-ad93-fd451331c582",
|
| 367 |
"metadata": {},
|
| 368 |
"outputs": [],
|
|
@@ -375,7 +500,7 @@
|
|
| 375 |
},
|
| 376 |
{
|
| 377 |
"cell_type": "code",
|
| 378 |
-
"execution_count":
|
| 379 |
"id": "3f592848-8536-4b4d-b34a-adc32d043432",
|
| 380 |
"metadata": {},
|
| 381 |
"outputs": [],
|
|
@@ -385,7 +510,7 @@
|
|
| 385 |
},
|
| 386 |
{
|
| 387 |
"cell_type": "code",
|
| 388 |
-
"execution_count":
|
| 389 |
"id": "6c7df81a-fd2f-42bf-b09c-46d7750f7252",
|
| 390 |
"metadata": {},
|
| 391 |
"outputs": [],
|
|
@@ -399,58 +524,66 @@
|
|
| 399 |
},
|
| 400 |
{
|
| 401 |
"cell_type": "code",
|
| 402 |
-
"execution_count":
|
| 403 |
-
"id": "
|
| 404 |
"metadata": {},
|
| 405 |
"outputs": [],
|
| 406 |
"source": [
|
| 407 |
-
"
|
| 408 |
-
"
|
| 409 |
-
"
|
| 410 |
-
"
|
| 411 |
-
"
|
| 412 |
-
"
|
| 413 |
-
"\"\"\""
|
| 414 |
]
|
| 415 |
},
|
| 416 |
{
|
| 417 |
"cell_type": "code",
|
| 418 |
"execution_count": null,
|
| 419 |
-
"id": "
|
| 420 |
"metadata": {},
|
| 421 |
"outputs": [],
|
| 422 |
-
"source": [
|
| 423 |
-
"res = chat_engine.chat(prompt)\n",
|
| 424 |
-
"print(res.response)"
|
| 425 |
-
]
|
| 426 |
},
|
| 427 |
{
|
| 428 |
"cell_type": "code",
|
| 429 |
-
"execution_count":
|
| 430 |
-
"id": "
|
| 431 |
"metadata": {},
|
| 432 |
"outputs": [],
|
| 433 |
-
"source": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 434 |
},
|
| 435 |
{
|
| 436 |
"cell_type": "code",
|
| 437 |
-
"execution_count":
|
| 438 |
-
"id": "
|
| 439 |
"metadata": {},
|
| 440 |
-
"outputs": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 441 |
"source": [
|
| 442 |
-
"
|
| 443 |
-
"
|
| 444 |
-
" system_prompt=system_content,\n",
|
| 445 |
-
" similarity_top_k=3,\n",
|
| 446 |
-
" streaming=True\n",
|
| 447 |
-
")"
|
| 448 |
]
|
| 449 |
},
|
| 450 |
{
|
| 451 |
"cell_type": "code",
|
| 452 |
"execution_count": null,
|
| 453 |
-
"id": "
|
| 454 |
"metadata": {},
|
| 455 |
"outputs": [],
|
| 456 |
"source": []
|
|
@@ -458,18 +591,44 @@
|
|
| 458 |
{
|
| 459 |
"cell_type": "code",
|
| 460 |
"execution_count": null,
|
| 461 |
-
"id": "
|
| 462 |
"metadata": {},
|
| 463 |
"outputs": [],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 464 |
"source": [
|
| 465 |
-
"res =
|
| 466 |
-
"print(res)"
|
| 467 |
]
|
| 468 |
},
|
| 469 |
{
|
| 470 |
"cell_type": "code",
|
| 471 |
"execution_count": null,
|
| 472 |
-
"id": "
|
| 473 |
"metadata": {},
|
| 474 |
"outputs": [],
|
| 475 |
"source": []
|
|
|
|
| 10 |
},
|
| 11 |
{
|
| 12 |
"cell_type": "code",
|
| 13 |
+
"execution_count": 1,
|
| 14 |
"id": "7de9c591-5a77-4bbe-80f1-4897e15f0b97",
|
| 15 |
"metadata": {},
|
| 16 |
"outputs": [],
|
| 17 |
"source": [
|
| 18 |
+
"import sys, os, shutil\n",
|
| 19 |
+
"sys.path.insert(0, \"../\")\n",
|
| 20 |
+
"\n",
|
| 21 |
+
"from preprocess_raw_documents import split_content\n",
|
| 22 |
+
"\n",
|
| 23 |
"import chromadb\n",
|
| 24 |
"from llama_index.core import VectorStoreIndex, SimpleDirectoryReader\n",
|
| 25 |
"from llama_index.vector_stores.chroma.base import ChromaVectorStore\n",
|
|
|
|
| 36 |
"import time"
|
| 37 |
]
|
| 38 |
},
|
| 39 |
+
{
|
| 40 |
+
"cell_type": "code",
|
| 41 |
+
"execution_count": 2,
|
| 42 |
+
"id": "978152ce-4d87-44b5-b521-dbaff60b32b0",
|
| 43 |
+
"metadata": {},
|
| 44 |
+
"outputs": [
|
| 45 |
+
{
|
| 46 |
+
"name": "stderr",
|
| 47 |
+
"output_type": "stream",
|
| 48 |
+
"text": [
|
| 49 |
+
"199it [00:00, 8821.71it/s]\n",
|
| 50 |
+
"200it [00:00, 12584.17it/s]\n"
|
| 51 |
+
]
|
| 52 |
+
}
|
| 53 |
+
],
|
| 54 |
+
"source": [
|
| 55 |
+
"split_content(filepath=\"../raw_documents/answers.txt\", \n",
|
| 56 |
+
" separator=\"\\n\\n\", \n",
|
| 57 |
+
" tmp_folder=\"../raw_documents/answers_temp\")\n",
|
| 58 |
+
"\n",
|
| 59 |
+
"split_content(filepath=\"../raw_documents/qna.txt\", \n",
|
| 60 |
+
" separator=\"\\n\\n\\n\", \n",
|
| 61 |
+
" tmp_folder=\"../raw_documents/qna_temp\")"
|
| 62 |
+
]
|
| 63 |
+
},
|
| 64 |
+
{
|
| 65 |
+
"cell_type": "code",
|
| 66 |
+
"execution_count": 5,
|
| 67 |
+
"id": "d925371b-8777-4f5b-a7f2-ec3f228ef266",
|
| 68 |
+
"metadata": {},
|
| 69 |
+
"outputs": [],
|
| 70 |
+
"source": [
|
| 71 |
+
"answers_temp_files = []\n",
|
| 72 |
+
"folder_path = \"../raw_documents/answers_temp\"\n",
|
| 73 |
+
"for f in os.listdir(folder_path):\n",
|
| 74 |
+
" fpath = os.path.join(folder_path, f)\n",
|
| 75 |
+
" answers_temp_files.append(fpath)\n",
|
| 76 |
+
" \n",
|
| 77 |
+
"qna_temp_files = []\n",
|
| 78 |
+
"folder_path = \"../raw_documents/qna_temp\"\n",
|
| 79 |
+
"for f in os.listdir(folder_path):\n",
|
| 80 |
+
" fpath = os.path.join(folder_path, f)\n",
|
| 81 |
+
" qna_temp_files.append(fpath)"
|
| 82 |
+
]
|
| 83 |
+
},
|
| 84 |
{
|
| 85 |
"cell_type": "code",
|
| 86 |
"execution_count": null,
|
| 87 |
+
"id": "e876a26b-822d-44d6-a3dd-ccdcc04933cf",
|
| 88 |
+
"metadata": {},
|
| 89 |
+
"outputs": [],
|
| 90 |
+
"source": []
|
| 91 |
+
},
|
| 92 |
+
{
|
| 93 |
+
"cell_type": "code",
|
| 94 |
+
"execution_count": 7,
|
| 95 |
"id": "3e65dff6-77b6-4be8-8857-5cecf3a035bb",
|
| 96 |
"metadata": {},
|
| 97 |
"outputs": [],
|
| 98 |
"source": [
|
| 99 |
"# load some documents\n",
|
| 100 |
"documents = SimpleDirectoryReader(input_files=[\n",
|
|
|
|
| 101 |
" \"../raw_documents/HI Chapter Summary Version 1.3.pdf\",\n",
|
| 102 |
" \"../raw_documents/conversation_examples.txt\",\n",
|
| 103 |
" \"../raw_documents/HI_Knowledge_Base.pdf\",\n",
|
| 104 |
+
" ] + answers_temp_files + qna_temp_files ).load_data()\n",
|
|
|
|
| 105 |
"document = Document(text=\"\\n\\n\".join([doc.text for doc in documents]))"
|
| 106 |
]
|
| 107 |
},
|
| 108 |
{
|
| 109 |
"cell_type": "code",
|
| 110 |
+
"execution_count": 8,
|
| 111 |
"id": "bd86b3f5-1dfc-4257-bd9c-86d34f02398d",
|
| 112 |
"metadata": {},
|
| 113 |
"outputs": [],
|
|
|
|
| 118 |
},
|
| 119 |
{
|
| 120 |
"cell_type": "code",
|
| 121 |
+
"execution_count": 9,
|
| 122 |
"id": "f568ce7b-bcbf-455c-acf1-6c2cae129fed",
|
| 123 |
"metadata": {},
|
| 124 |
"outputs": [],
|
|
|
|
| 129 |
},
|
| 130 |
{
|
| 131 |
"cell_type": "code",
|
| 132 |
+
"execution_count": 10,
|
| 133 |
"id": "ed0b018e-1982-46b2-b1b4-04f5c0ce8672",
|
| 134 |
"metadata": {},
|
| 135 |
"outputs": [],
|
|
|
|
| 148 |
},
|
| 149 |
{
|
| 150 |
"cell_type": "code",
|
| 151 |
+
"execution_count": 11,
|
| 152 |
"id": "0946b6ce-96ab-44de-ad75-e424a8429f67",
|
| 153 |
"metadata": {},
|
| 154 |
+
"outputs": [
|
| 155 |
+
{
|
| 156 |
+
"name": "stdout",
|
| 157 |
+
"output_type": "stream",
|
| 158 |
+
"text": [
|
| 159 |
+
"LLM is explicitly disabled. Using MockLLM.\n"
|
| 160 |
+
]
|
| 161 |
+
}
|
| 162 |
+
],
|
| 163 |
"source": [
|
| 164 |
"Settings.llm = None\n",
|
| 165 |
"Settings.chunk_size = 1024\n",
|
| 166 |
+
"Settings.chunk_overlap = 50\n",
|
| 167 |
"Settings.embed_model = \"local:../models/fine-tuned-embeddings-advanced\""
|
| 168 |
]
|
| 169 |
},
|
| 170 |
{
|
| 171 |
"cell_type": "code",
|
| 172 |
+
"execution_count": 12,
|
| 173 |
"id": "b8c73a2c-1129-406a-8046-085afcaf9cbb",
|
| 174 |
"metadata": {},
|
| 175 |
"outputs": [],
|
|
|
|
| 179 |
},
|
| 180 |
{
|
| 181 |
"cell_type": "code",
|
| 182 |
+
"execution_count": 13,
|
| 183 |
"id": "75f1c76f-d3e5-4b69-818c-98865adb1457",
|
| 184 |
"metadata": {},
|
| 185 |
+
"outputs": [
|
| 186 |
+
{
|
| 187 |
+
"data": {
|
| 188 |
+
"text/plain": [
|
| 189 |
+
"6814"
|
| 190 |
+
]
|
| 191 |
+
},
|
| 192 |
+
"execution_count": 13,
|
| 193 |
+
"metadata": {},
|
| 194 |
+
"output_type": "execute_result"
|
| 195 |
+
}
|
| 196 |
+
],
|
| 197 |
"source": [
|
| 198 |
"len(nodes)"
|
| 199 |
]
|
|
|
|
| 208 |
},
|
| 209 |
{
|
| 210 |
"cell_type": "code",
|
| 211 |
+
"execution_count": 14,
|
| 212 |
"id": "dab4c6f3-ef67-4d90-b3d5-e290c5d1b6f4",
|
| 213 |
"metadata": {},
|
| 214 |
"outputs": [],
|
|
|
|
| 218 |
},
|
| 219 |
{
|
| 220 |
"cell_type": "code",
|
| 221 |
+
"execution_count": 15,
|
| 222 |
"id": "6a764113-ad7e-4674-aa57-ebbf405902a8",
|
| 223 |
"metadata": {},
|
| 224 |
"outputs": [],
|
|
|
|
| 236 |
},
|
| 237 |
{
|
| 238 |
"cell_type": "code",
|
| 239 |
+
"execution_count": 16,
|
| 240 |
"id": "e492ed4a-23a3-47d6-8b50-51fb48b3aa05",
|
| 241 |
"metadata": {},
|
| 242 |
"outputs": [],
|
|
|
|
| 246 |
},
|
| 247 |
{
|
| 248 |
"cell_type": "code",
|
| 249 |
+
"execution_count": 17,
|
| 250 |
"id": "cbd11b89-9b83-4f08-bb30-160f750f2ffb",
|
| 251 |
"metadata": {},
|
| 252 |
"outputs": [],
|
|
|
|
| 256 |
},
|
| 257 |
{
|
| 258 |
"cell_type": "code",
|
| 259 |
+
"execution_count": 18,
|
| 260 |
+
"id": "d3bd848d-9985-4a3d-bdc4-ec340cc69ef3",
|
| 261 |
"metadata": {},
|
| 262 |
+
"outputs": [
|
| 263 |
+
{
|
| 264 |
+
"name": "stdout",
|
| 265 |
+
"output_type": "stream",
|
| 266 |
+
"text": [
|
| 267 |
+
"Indexing time: 2.3 mins\n"
|
| 268 |
+
]
|
| 269 |
+
}
|
| 270 |
+
],
|
| 271 |
"source": [
|
| 272 |
+
"indexing_cost = time.time() - start_time\n",
|
| 273 |
+
"indexing_cost = indexing_cost / 60\n",
|
| 274 |
+
"print(f\"Indexing time: {indexing_cost:.1f} mins\")"
|
| 275 |
]
|
| 276 |
},
|
| 277 |
{
|
| 278 |
"cell_type": "code",
|
| 279 |
+
"execution_count": 19,
|
| 280 |
+
"id": "f16cca33-71fb-437d-a033-671b9fd44054",
|
| 281 |
"metadata": {},
|
| 282 |
"outputs": [],
|
| 283 |
"source": [
|
| 284 |
+
"vector_query_engine = vector_index.as_query_engine()"
|
|
|
|
|
|
|
| 285 |
]
|
| 286 |
},
|
| 287 |
{
|
| 288 |
"cell_type": "code",
|
| 289 |
+
"execution_count": 20,
|
| 290 |
"id": "3290e870-41d7-49c4-9c4f-cb16bd1f469e",
|
| 291 |
"metadata": {
|
| 292 |
"scrolled": true
|
| 293 |
},
|
| 294 |
+
"outputs": [
|
| 295 |
+
{
|
| 296 |
+
"data": {
|
| 297 |
+
"text/plain": [
|
| 298 |
+
"Response(response='Context information is below.\\n---------------------\\nfile_path: ../raw_documents/answers_temp/answers_050.txt\\n\\nQuestion: The fundamental principle of Singapore healthcare financing is ____________.\\nAnswer: The answer is \"Individual Savings\".\\n\\nfile_path: ../raw_documents/qna_temp/qna_050.txt\\n\\nC1/5\\nQuestion: The fundamental principle of Singapore healthcare financing is ____________.\\nA. The 3βs M. That is Medisave, Medishield, Medifund.\\nB. Means Testing and Casemix.\\nC. Individual Savings.\\nD. Tax based subsidies and government subvention.\\nAnswer: C. The answer is \"Individual Savings\".\\n---------------------\\nGiven the context information and not prior knowledge, answer the query.\\nQuery: Healthcare System in Singapore consists of?\\nAnswer: ', source_nodes=[NodeWithScore(node=TextNode(id_='536fef67-6a3f-4054-a94a-cc9143599510', embedding=None, metadata={'file_path': '../raw_documents/answers_temp/answers_050.txt', 'file_name': 'answers_050.txt', 'file_type': 'text/plain', 'file_size': 130, 'creation_date': '2024-02-24', 'last_modified_date': '2024-02-24', 'last_accessed_date': '2024-02-24'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='2b0f7dad-c532-4abd-8c42-f53383a4fc76', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'file_path': '../raw_documents/answers_temp/answers_050.txt', 'file_name': 'answers_050.txt', 'file_type': 'text/plain', 'file_size': 130, 'creation_date': '2024-02-24', 'last_modified_date': '2024-02-24', 'last_accessed_date': '2024-02-24'}, hash='5b1d1dc729a663e4ccfacc0f18adf0f6644a2a7d2991490fd962d1550c83f2ff'), <NodeRelationship.PREVIOUS: '2'>: RelatedNodeInfo(node_id='6d93c092-b4cc-4b5b-b379-080d777d3908', 
node_type=<ObjectType.TEXT: '1'>, metadata={'file_path': '../raw_documents/answers_temp/answers_044.txt', 'file_name': 'answers_044.txt', 'file_type': 'text/plain', 'file_size': 164, 'creation_date': '2024-02-24', 'last_modified_date': '2024-02-24', 'last_accessed_date': '2024-02-24'}, hash='caeb59043b8daa56ed472941882947570abff951f64aa0498672aba5921fac1d'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='859a9958-6f5d-4581-95d0-39edfc950ef5', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='8416454b2fbad3e6122c5151d2b3d1eadf0afde3514ba09374c71e96baf712bc')}, text='Question: The fundamental principle of Singapore healthcare financing is ____________.\\nAnswer: The answer is \"Individual Savings\".', start_char_idx=0, end_char_idx=130, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n'), score=0.4159636550867191), NodeWithScore(node=TextNode(id_='472000ae-a0aa-4464-a200-72fe67a3fbde', embedding=None, metadata={'file_path': '../raw_documents/qna_temp/qna_050.txt', 'file_name': 'qna_050.txt', 'file_type': 'text/plain', 'file_size': 297, 'creation_date': '2024-02-24', 'last_modified_date': '2024-02-24', 'last_accessed_date': '2024-02-24'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='506fb715-d3b0-4ca7-b7ca-011a1e1a1f0d', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'file_path': '../raw_documents/qna_temp/qna_050.txt', 'file_name': 'qna_050.txt', 'file_type': 'text/plain', 'file_size': 297, 'creation_date': '2024-02-24', 'last_modified_date': '2024-02-24', 'last_accessed_date': '2024-02-24'}, hash='7461ffa12ff6729003131976b82995b7254ab10f8dc7d79c65988ec9e3b7b631'), <NodeRelationship.PREVIOUS: '2'>: 
RelatedNodeInfo(node_id='d8232b90-d641-4966-b98f-4ca0821db773', node_type=<ObjectType.TEXT: '1'>, metadata={'file_path': '../raw_documents/qna_temp/qna_044.txt', 'file_name': 'qna_044.txt', 'file_type': 'text/plain', 'file_size': 383, 'creation_date': '2024-02-24', 'last_modified_date': '2024-02-24', 'last_accessed_date': '2024-02-24'}, hash='cbeb00c29c6130548466697a862fee43ab2be92d84158cc0b69c2f5c7bbe68b1'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='e772e623-cf91-41cd-a516-50acb894eb54', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='a7583b0fd46f98d0118c712632277d81f417b779f8bcc100ab2558dae6317cde')}, text='C1/5\\nQuestion: The fundamental principle of Singapore healthcare financing is ____________.\\nA. The 3βs M. That is Medisave, Medishield, Medifund.\\nB. Means Testing and Casemix.\\nC. Individual Savings.\\nD. Tax based subsidies and government subvention.\\nAnswer: C. The answer is \"Individual Savings\".', start_char_idx=0, end_char_idx=295, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n'), score=0.4126648577998099)], metadata={'536fef67-6a3f-4054-a94a-cc9143599510': {'file_path': '../raw_documents/answers_temp/answers_050.txt', 'file_name': 'answers_050.txt', 'file_type': 'text/plain', 'file_size': 130, 'creation_date': '2024-02-24', 'last_modified_date': '2024-02-24', 'last_accessed_date': '2024-02-24'}, '472000ae-a0aa-4464-a200-72fe67a3fbde': {'file_path': '../raw_documents/qna_temp/qna_050.txt', 'file_name': 'qna_050.txt', 'file_type': 'text/plain', 'file_size': 297, 'creation_date': '2024-02-24', 'last_modified_date': '2024-02-24', 'last_accessed_date': '2024-02-24'}})"
|
| 299 |
+
]
|
| 300 |
+
},
|
| 301 |
+
"execution_count": 20,
|
| 302 |
+
"metadata": {},
|
| 303 |
+
"output_type": "execute_result"
|
| 304 |
+
}
|
| 305 |
+
],
|
| 306 |
"source": [
|
| 307 |
"response = vector_query_engine.query(\"Healthcare System in Singapore consists of?\")\n",
|
| 308 |
"response"
|
| 309 |
]
|
| 310 |
},
|
| 311 |
+
{
|
| 312 |
+
"cell_type": "code",
|
| 313 |
+
"execution_count": null,
|
| 314 |
+
"id": "aa4b9906-5f75-4003-9f4c-5cfcc7ab1eaf",
|
| 315 |
+
"metadata": {},
|
| 316 |
+
"outputs": [],
|
| 317 |
+
"source": []
|
| 318 |
+
},
|
| 319 |
+
{
|
| 320 |
+
"cell_type": "code",
|
| 321 |
+
"execution_count": 21,
|
| 322 |
+
"id": "1bb75b04-6a62-43a4-8728-d2e52e49f1c0",
|
| 323 |
+
"metadata": {},
|
| 324 |
+
"outputs": [],
|
| 325 |
+
"source": [
|
| 326 |
+
"if os.path.exists(\"../raw_documents/answers_temp\"):\n",
|
| 327 |
+
" shutil.rmtree(\"../raw_documents/answers_temp\")"
|
| 328 |
+
]
|
| 329 |
+
},
|
| 330 |
+
{
|
| 331 |
+
"cell_type": "code",
|
| 332 |
+
"execution_count": 22,
|
| 333 |
+
"id": "0ed920fb-6456-49ac-8b63-08bd86b5b39c",
|
| 334 |
+
"metadata": {},
|
| 335 |
+
"outputs": [],
|
| 336 |
+
"source": [
|
| 337 |
+
"if os.path.exists(\"../raw_documents/qna_temp\"):\n",
|
| 338 |
+
" shutil.rmtree(\"../raw_documents/qna_temp\")"
|
| 339 |
+
]
|
| 340 |
+
},
|
| 341 |
{
|
| 342 |
"cell_type": "code",
|
| 343 |
"execution_count": null,
|
|
|
|
| 364 |
},
|
| 365 |
{
|
| 366 |
"cell_type": "code",
|
| 367 |
+
"execution_count": 1,
|
| 368 |
"id": "c1a42c35-5f57-423c-8fb7-7d18b3b466b5",
|
| 369 |
"metadata": {},
|
| 370 |
"outputs": [],
|
|
|
|
| 394 |
},
|
| 395 |
{
|
| 396 |
"cell_type": "code",
|
| 397 |
+
"execution_count": 2,
|
| 398 |
"id": "d38dc953-b923-4128-86a1-c8c6f69af0ed",
|
| 399 |
"metadata": {},
|
| 400 |
"outputs": [],
|
|
|
|
| 404 |
},
|
| 405 |
{
|
| 406 |
"cell_type": "code",
|
| 407 |
+
"execution_count": 3,
|
| 408 |
"id": "4c83c613-2cfc-4871-9d07-c82f77a3bd5e",
|
| 409 |
"metadata": {},
|
| 410 |
"outputs": [],
|
|
|
|
| 414 |
},
|
| 415 |
{
|
| 416 |
"cell_type": "code",
|
| 417 |
+
"execution_count": 4,
|
| 418 |
"id": "0583e9b0-d977-488c-8331-46dfa749924c",
|
| 419 |
"metadata": {},
|
| 420 |
"outputs": [],
|
|
|
|
| 433 |
},
|
| 434 |
{
|
| 435 |
"cell_type": "code",
|
| 436 |
+
"execution_count": 5,
|
| 437 |
"id": "2159a2b6-494b-41b9-ac54-dd342bfb74ba",
|
| 438 |
"metadata": {},
|
| 439 |
"outputs": [],
|
|
|
|
| 443 |
},
|
| 444 |
{
|
| 445 |
"cell_type": "code",
|
| 446 |
+
"execution_count": 6,
|
| 447 |
"id": "1b385644-b46e-4d13-88fa-9f4af39db405",
|
| 448 |
"metadata": {},
|
| 449 |
"outputs": [],
|
|
|
|
| 453 |
},
|
| 454 |
{
|
| 455 |
"cell_type": "code",
|
| 456 |
+
"execution_count": 7,
|
| 457 |
"id": "93cb53d1-6b8c-4b2d-a839-53501c0d54b2",
|
| 458 |
"metadata": {},
|
| 459 |
"outputs": [],
|
|
|
|
| 465 |
},
|
| 466 |
{
|
| 467 |
"cell_type": "code",
|
| 468 |
+
"execution_count": 8,
|
| 469 |
"id": "c40d59e1-6d42-41f0-8c9b-70aa026093ae",
|
| 470 |
"metadata": {},
|
| 471 |
"outputs": [],
|
|
|
|
| 487 |
},
|
| 488 |
{
|
| 489 |
"cell_type": "code",
|
| 490 |
+
"execution_count": 9,
|
| 491 |
"id": "1a506940-c2b4-4d14-ad93-fd451331c582",
|
| 492 |
"metadata": {},
|
| 493 |
"outputs": [],
|
|
|
|
| 500 |
},
|
| 501 |
{
|
| 502 |
"cell_type": "code",
|
| 503 |
+
"execution_count": 10,
|
| 504 |
"id": "3f592848-8536-4b4d-b34a-adc32d043432",
|
| 505 |
"metadata": {},
|
| 506 |
"outputs": [],
|
|
|
|
| 510 |
},
|
| 511 |
{
|
| 512 |
"cell_type": "code",
|
| 513 |
+
"execution_count": 11,
|
| 514 |
"id": "6c7df81a-fd2f-42bf-b09c-46d7750f7252",
|
| 515 |
"metadata": {},
|
| 516 |
"outputs": [],
|
|
|
|
| 524 |
},
|
| 525 |
{
|
| 526 |
"cell_type": "code",
|
| 527 |
+
"execution_count": 12,
|
| 528 |
+
"id": "c3106dff-dd6f-47a9-9454-1e61775e7539",
|
| 529 |
"metadata": {},
|
| 530 |
"outputs": [],
|
| 531 |
"source": [
|
| 532 |
+
"hi_engine = index.as_query_engine(\n",
|
| 533 |
+
" memory=memory,\n",
|
| 534 |
+
" system_prompt=system_content,\n",
|
| 535 |
+
" similarity_top_k=10,\n",
|
| 536 |
+
" streaming=True\n",
|
| 537 |
+
")"
|
|
|
|
| 538 |
]
|
| 539 |
},
|
| 540 |
{
|
| 541 |
"cell_type": "code",
|
| 542 |
"execution_count": null,
|
| 543 |
+
"id": "53a38081-4a79-44bc-bfa3-5d8653804328",
|
| 544 |
"metadata": {},
|
| 545 |
"outputs": [],
|
| 546 |
+
"source": []
|
|
|
|
|
|
|
|
|
|
| 547 |
},
|
| 548 |
{
|
| 549 |
"cell_type": "code",
|
| 550 |
+
"execution_count": 24,
|
| 551 |
+
"id": "434f0caf-8b1f-40c6-b9ec-b039cd1ca612",
|
| 552 |
"metadata": {},
|
| 553 |
"outputs": [],
|
| 554 |
+
"source": [
|
| 555 |
+
"prompt = \"\"\"\n",
|
| 556 |
+
"Question: Which is not a government healthcare philosophy? \n",
|
| 557 |
+
"A. To nurture a healthy nation by promoting good health.\n",
|
| 558 |
+
"B. To rely on competition to improve service and raise efficiency\n",
|
| 559 |
+
"C. To intervene directly whenever necessary\n",
|
| 560 |
+
"D. To provide for the care of employees\n",
|
| 561 |
+
"\"\"\""
|
| 562 |
+
]
|
| 563 |
},
|
| 564 |
{
|
| 565 |
"cell_type": "code",
|
| 566 |
+
"execution_count": 26,
|
| 567 |
+
"id": "a1c83dff-50d1-47b1-b7e9-4fc5cd08e1e8",
|
| 568 |
"metadata": {},
|
| 569 |
+
"outputs": [
|
| 570 |
+
{
|
| 571 |
+
"name": "stdout",
|
| 572 |
+
"output_type": "stream",
|
| 573 |
+
"text": [
|
| 574 |
+
"D. To provide for the care of employees\n"
|
| 575 |
+
]
|
| 576 |
+
}
|
| 577 |
+
],
|
| 578 |
"source": [
|
| 579 |
+
"res = hi_engine.query(prompt)\n",
|
| 580 |
+
"print(res)"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 581 |
]
|
| 582 |
},
|
| 583 |
{
|
| 584 |
"cell_type": "code",
|
| 585 |
"execution_count": null,
|
| 586 |
+
"id": "cedd3512-548d-4455-80fd-c6a8b2c0cd00",
|
| 587 |
"metadata": {},
|
| 588 |
"outputs": [],
|
| 589 |
"source": []
|
|
|
|
| 591 |
{
|
| 592 |
"cell_type": "code",
|
| 593 |
"execution_count": null,
|
| 594 |
+
"id": "ec53dfcf-d4c0-4d10-a24e-be2004a83656",
|
| 595 |
"metadata": {},
|
| 596 |
"outputs": [],
|
| 597 |
+
"source": []
|
| 598 |
+
},
|
| 599 |
+
{
|
| 600 |
+
"cell_type": "code",
|
| 601 |
+
"execution_count": 14,
|
| 602 |
+
"id": "78abaf95-e52d-445c-9d8e-bc51efb20f06",
|
| 603 |
+
"metadata": {},
|
| 604 |
+
"outputs": [
|
| 605 |
+
{
|
| 606 |
+
"name": "stderr",
|
| 607 |
+
"output_type": "stream",
|
| 608 |
+
"text": [
|
| 609 |
+
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
|
| 610 |
+
"To disable this warning, you can either:\n",
|
| 611 |
+
"\t- Avoid using `tokenizers` before the fork if possible\n",
|
| 612 |
+
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
|
| 613 |
+
]
|
| 614 |
+
},
|
| 615 |
+
{
|
| 616 |
+
"name": "stdout",
|
| 617 |
+
"output_type": "stream",
|
| 618 |
+
"text": [
|
| 619 |
+
"The correct answer is \"Deductibles apply for all treatments\".\n"
|
| 620 |
+
]
|
| 621 |
+
}
|
| 622 |
+
],
|
| 623 |
"source": [
|
| 624 |
+
"res = chat_engine.chat(prompt)\n",
|
| 625 |
+
"print(res.response)"
|
| 626 |
]
|
| 627 |
},
|
| 628 |
{
|
| 629 |
"cell_type": "code",
|
| 630 |
"execution_count": null,
|
| 631 |
+
"id": "1e62303c-3a00-448f-ad93-15cb6cee1f24",
|
| 632 |
"metadata": {},
|
| 633 |
"outputs": [],
|
| 634 |
"source": []
|
preprocess_raw_documents.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import shutil
|
| 3 |
+
from tqdm import tqdm
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def split_content(filepath, separator, tmp_folder):
    """Split a text file on ``separator`` and write each piece to its own file.

    The pieces are written into ``tmp_folder`` (created if missing) and are
    named ``<stem>_<NNN><ext>``, where ``<stem>`` and ``<ext>`` come from the
    base name of ``filepath`` and ``NNN`` is the zero-padded chunk index.

    Args:
        filepath: Path of the text file to split.
        separator: Delimiter string passed to ``str.split``.
        tmp_folder: Directory that receives the chunk files.

    Returns:
        None. The chunk files on disk are the only result.
    """
    os.makedirs(tmp_folder, exist_ok=True)

    # splitext copes with names that contain no dot or several dots
    # ("README", "notes.v2.txt"); unpacking ``name.split(".")`` into two
    # variables raised ValueError for anything but exactly one dot.
    # Note that ``fextn`` keeps its leading dot (or is empty).
    base_file_name = os.path.basename(filepath)
    fname, fextn = os.path.splitext(base_file_name)

    with open(filepath, "r") as fp:
        content_chunk = fp.read().split(separator)

    # enumerate() has no length, so hand tqdm the total explicitly to get a
    # real progress bar instead of a bare counter.
    for index, chunk in tqdm(enumerate(content_chunk), total=len(content_chunk)):
        new_fpath = os.path.join(tmp_folder, f"{fname}_{index:03d}{fextn}")
        with open(new_fpath, "w") as fp:
            fp.write(chunk)
|
qna_prompting.py
CHANGED
|
@@ -25,10 +25,11 @@ qna_question_data_format = """
|
|
| 25 |
Example 3: `Chapter_5` for fifth chapter
|
| 26 |
"""
|
| 27 |
qna_answer_description = """
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
|
|
|
| 32 |
|
| 33 |
If user's answer is not a single alphabet letter, but is contextually
|
| 34 |
closer to a particular answer choice, return the corresponding
|
|
@@ -122,7 +123,6 @@ def evaluate_qna_answer(user_selected_answer: str) -> str:
|
|
| 122 |
|
| 123 |
### convert to numeric type
|
| 124 |
qna_answer = int(qna_answer)
|
| 125 |
-
|
| 126 |
qna_answer_alphabet = num_mapping.get(qna_answer, "ERROR")
|
| 127 |
|
| 128 |
con = sqlite3.connect(db_path)
|
|
@@ -138,13 +138,34 @@ def evaluate_qna_answer(user_selected_answer: str) -> str:
|
|
| 138 |
con.commit()
|
| 139 |
con.close()
|
| 140 |
|
| 141 |
-
if
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
st.toast("π― yummy yummy, hooray!", icon="π")
|
| 143 |
time.sleep(2)
|
| 144 |
st.toast("π»ππ― You got it right!", icon="π")
|
| 145 |
time.sleep(2)
|
| 146 |
st.toast("π₯ You are amazing! π―π―", icon="πͺ")
|
| 147 |
st.balloons()
|
|
|
|
| 148 |
else:
|
| 149 |
st.toast("πΌ Something doesn't seem right.. π₯π π₯", icon="π")
|
| 150 |
time.sleep(2)
|
|
@@ -152,17 +173,16 @@ def evaluate_qna_answer(user_selected_answer: str) -> str:
|
|
| 152 |
time.sleep(2)
|
| 153 |
st.toast("π€π€ Nevertheless, it was a good try!! ποΈββοΈποΈββοΈ", icon="π")
|
| 154 |
st.snow()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 155 |
|
| 156 |
-
reasoning = "" if "textbook" in reasons else "Rationale is that: " + reasons
|
| 157 |
-
qna_answer_response = (
|
| 158 |
-
f"Your selected answer is `{user_selected_answer}`, "
|
| 159 |
-
f"but the actual answer is `{qna_answer_alphabet}`. " + reasoning
|
| 160 |
-
)
|
| 161 |
-
|
| 162 |
except Exception as e:
|
| 163 |
print(e)
|
| 164 |
|
| 165 |
-
return
|
| 166 |
|
| 167 |
get_qna_question_tool = FunctionTool.from_defaults(
|
| 168 |
fn=get_qna_question,
|
|
|
|
| 25 |
Example 3: `Chapter_5` for fifth chapter
|
| 26 |
"""
|
| 27 |
qna_answer_description = """
|
| 28 |
+
Not to trigger this when questions being asked, come directly from user.
|
| 29 |
+
Only use this tool to trigger the evaluation of user's provided input with the
|
| 30 |
+
correct answer of the Q&A question asked by Assistant. When user provides
|
| 31 |
+
answer to the question asked, they can reply in natural language or giving
|
| 32 |
+
the alphabet letter of which selected choice they think it's the right answer.
|
| 33 |
|
| 34 |
If user's answer is not a single alphabet letter, but is contextually
|
| 35 |
closer to a particular answer choice, return the corresponding
|
|
|
|
| 123 |
|
| 124 |
### convert to numeric type
|
| 125 |
qna_answer = int(qna_answer)
|
|
|
|
| 126 |
qna_answer_alphabet = num_mapping.get(qna_answer, "ERROR")
|
| 127 |
|
| 128 |
con = sqlite3.connect(db_path)
|
|
|
|
| 138 |
con.commit()
|
| 139 |
con.close()
|
| 140 |
|
| 141 |
+
reasoning = "" if "textbook" in reasons else f"Rationale is that: {reasons}. "
|
| 142 |
+
qna_answer_response = (
|
| 143 |
+
f"Your selected answer is `{user_selected_answer}`, "
|
| 144 |
+
f"but the actual answer is `{qna_answer_alphabet}`. "
|
| 145 |
+
)
|
| 146 |
+
qna_not_knowing_response = (
|
| 147 |
+
f"No problem! The answer is `{qna_answer_alphabet}`. "
|
| 148 |
+
)
|
| 149 |
+
to_know_more = (
|
| 150 |
+
"Let me know if you want to know more, "
|
| 151 |
+
"I can give you an explanation π»π"
|
| 152 |
+
)
|
| 153 |
+
|
| 154 |
+
if user_answer_numeric == 0:
|
| 155 |
+
st.toast("π―β couldn't find the honey? π no worries!", icon="π« ")
|
| 156 |
+
time.sleep(2)
|
| 157 |
+
st.toast("π» Let me bring it to you! π―π", icon="π")
|
| 158 |
+
time.sleep(2)
|
| 159 |
+
st.toast("β¨ You will do great next time! π", icon="π")
|
| 160 |
+
final_response = qna_not_knowing_response + reasoning + to_know_more
|
| 161 |
+
elif qna_answer == user_answer_numeric:
|
| 162 |
st.toast("π― yummy yummy, hooray!", icon="π")
|
| 163 |
time.sleep(2)
|
| 164 |
st.toast("π»ππ― You got it right!", icon="π")
|
| 165 |
time.sleep(2)
|
| 166 |
st.toast("π₯ You are amazing! π―π―", icon="πͺ")
|
| 167 |
st.balloons()
|
| 168 |
+
final_response = qna_answer_response + reasoning + to_know_more
|
| 169 |
else:
|
| 170 |
st.toast("πΌ Something doesn't seem right.. π₯π π₯", icon="π")
|
| 171 |
time.sleep(2)
|
|
|
|
| 173 |
time.sleep(2)
|
| 174 |
st.toast("π€π€ Nevertheless, it was a good try!! ποΈββοΈποΈββοΈ", icon="π")
|
| 175 |
st.snow()
|
| 176 |
+
final_response = qna_answer_response + reasoning + to_know_more
|
| 177 |
+
|
| 178 |
+
st.session_state.question_id = None
|
| 179 |
+
st.session_state.qna_answer = None
|
| 180 |
+
st.session_state.reasons = None
|
| 181 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 182 |
except Exception as e:
|
| 183 |
print(e)
|
| 184 |
|
| 185 |
+
return final_response
|
| 186 |
|
| 187 |
get_qna_question_tool = FunctionTool.from_defaults(
|
| 188 |
fn=get_qna_question,
|
streamlit_app.py
CHANGED
|
@@ -40,7 +40,7 @@ nest_asyncio.apply()
|
|
| 40 |
st.set_page_config(page_title="π»π Study Bear π―")
|
| 41 |
openai_api = os.getenv("OPENAI_API_KEY")
|
| 42 |
|
| 43 |
-
with open("./config/
|
| 44 |
model_config = yaml.safe_load(file_reader)
|
| 45 |
|
| 46 |
input_files = model_config["input_data"]["source"]
|
|
|
|
| 40 |
st.set_page_config(page_title="π»π Study Bear π―")
|
| 41 |
openai_api = os.getenv("OPENAI_API_KEY")
|
| 42 |
|
| 43 |
+
with open("./config/model_config_advanced.yml", "r") as file_reader:
|
| 44 |
model_config = yaml.safe_load(file_reader)
|
| 45 |
|
| 46 |
input_files = model_config["input_data"]["source"]
|