Update app.py
Browse files
app.py
CHANGED
@@ -13,6 +13,7 @@ from langchain_aws import ChatBedrock
|
|
13 |
from langchain_core.prompts import ChatPromptTemplate
|
14 |
from langchain_core.runnables import RunnablePassthrough
|
15 |
from langchain_core.output_parsers import StrOutputParser
|
|
|
16 |
import re
|
17 |
import json
|
18 |
|
@@ -50,10 +51,15 @@ def load_wikipedia_documents():
|
|
50 |
@st.cache_resource
|
51 |
def split_documents(_documents):
|
52 |
"""Split documents into chunks."""
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
|
|
|
|
|
|
|
|
|
|
57 |
|
58 |
@st.cache_resource
|
59 |
def initialize_embeddings():
|
@@ -72,16 +78,42 @@ def initialize_embeddings():
|
|
72 |
|
73 |
@st.cache_resource
|
74 |
def store_in_qdrant(_chunks, _embeddings):
|
75 |
-
"""Store document chunks in a hosted Qdrant instance."""
|
76 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
77 |
vector_store = Qdrant.from_documents(
|
78 |
documents=_chunks,
|
79 |
embedding=_embeddings,
|
80 |
url=os.getenv("QDRANT_URL"),
|
81 |
api_key=os.getenv("QDRANT_API_KEY"),
|
82 |
-
collection_name=
|
83 |
)
|
84 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
85 |
return vector_store
|
86 |
except Exception as e:
|
87 |
logger.error(f"Error storing in Qdrant: {e}")
|
|
|
13 |
from langchain_core.prompts import ChatPromptTemplate
|
14 |
from langchain_core.runnables import RunnablePassthrough
|
15 |
from langchain_core.output_parsers import StrOutputParser
|
16 |
+
from qdrant_client import QdrantClient
|
17 |
import re
|
18 |
import json
|
19 |
|
|
|
51 |
@st.cache_resource
def split_documents(_documents, chunk_size=500, chunk_overlap=50):
    """Split documents into chunks.

    Args:
        _documents: Documents handed to the splitter's ``split_documents``.
            Streamlit does not hash parameters whose names start with an
            underscore, so this argument is not part of the cache key.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``. Defaults to the previously hard-coded 500.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``. Defaults to the previously hard-coded 50.

    Returns:
        The chunks produced by the splitter, or an empty list when
        splitting raised an exception.
    """
    try:
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )
        chunks = splitter.split_documents(_documents)
        logger.info(f"Split into {len(chunks)} chunks")
        return chunks
    except Exception as e:
        # Best-effort boundary: surface the failure in the log (with the
        # traceback) and in the UI, then hand the caller an empty result.
        # NOTE(review): this function is wrapped in st.cache_resource, so the
        # empty list returned here is presumably cached as well -- confirm
        # whether a failed split should be retried on the next run.
        logger.exception(f"Error splitting documents: {e}")
        st.error(f"Failed to split documents: {e}")
        return []
|
63 |
|
64 |
@st.cache_resource
|
65 |
def initialize_embeddings():
|
|
|
78 |
|
79 |
@st.cache_resource
|
80 |
def store_in_qdrant(_chunks, _embeddings):
|
81 |
+
"""Store document chunks in a hosted Qdrant instance after deleting existing collection."""
|
82 |
try:
|
83 |
+
# Initialize Qdrant client
|
84 |
+
client = QdrantClient(
|
85 |
+
url=os.getenv("QDRANT_URL"),
|
86 |
+
api_key=os.getenv("QDRANT_API_KEY")
|
87 |
+
)
|
88 |
+
|
89 |
+
# Delete existing collection if it exists
|
90 |
+
collection_name = "wikipedia_chunks"
|
91 |
+
try:
|
92 |
+
client.delete_collection(collection_name)
|
93 |
+
logger.info(f"Deleted existing Qdrant collection: {collection_name}")
|
94 |
+
except Exception as e:
|
95 |
+
logger.warning(f"No existing collection {collection_name} to delete or error: {e}")
|
96 |
+
|
97 |
+
# Create and populate new collection
|
98 |
vector_store = Qdrant.from_documents(
|
99 |
documents=_chunks,
|
100 |
embedding=_embeddings,
|
101 |
url=os.getenv("QDRANT_URL"),
|
102 |
api_key=os.getenv("QDRANT_API_KEY"),
|
103 |
+
collection_name=collection_name
|
104 |
)
|
105 |
+
|
106 |
+
# Verify storage by checking collection size
|
107 |
+
collection_info = client.get_collection(collection_name)
|
108 |
+
stored_points = collection_info.points_count
|
109 |
+
logger.info(f"Stored {stored_points} chunks in Qdrant at {os.getenv('QDRANT_URL')}")
|
110 |
+
if stored_points == 0:
|
111 |
+
logger.error("No documents stored in Qdrant collection")
|
112 |
+
st.error("No documents stored in Qdrant collection")
|
113 |
+
return None
|
114 |
+
if stored_points != len(_chunks):
|
115 |
+
logger.warning(f"Expected {len(_chunks)} chunks, but stored {stored_points} in Qdrant")
|
116 |
+
|
117 |
return vector_store
|
118 |
except Exception as e:
|
119 |
logger.error(f"Error storing in Qdrant: {e}")
|