Spaces:
Sleeping
Sleeping
Update src/RAGSample.py
Browse files - src/RAGSample.py +23 -23
src/RAGSample.py
CHANGED
@@ -126,29 +126,29 @@ class SmartFAQRetriever(BaseRetriever):
|
|
126 |
|
127 |
|
128 |
def _get_relevant_documents_with_scores(self, query: str) -> List[tuple[Document, float]]:
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
|
153 |
|
154 |
def _get_relevant_documents(self, query: str) -> List[Document]:
|
|
|
126 |
|
127 |
|
128 |
def _get_relevant_documents_with_scores(self, query: str) -> List[tuple[Document, float]]:
    """Retrieve documents along with similarity scores.

    Each document is reduced to its question text (the part of
    ``page_content`` before ``"ANSWER:"`` with the ``"QUESTION:"`` marker
    removed, or the whole ``page_content`` when no ``"QUESTION:"`` marker is
    present). The query and the question texts are TF-IDF vectorized and
    compared by cosine similarity.

    Args:
        query: The user query; it is lower-cased and stripped before
            vectorization.

    Returns:
        Up to ``self._k`` ``(document, score)`` pairs ordered from highest
        to lowest score, keeping only scores strictly greater than 0.1.
        An empty list is returned when there are no documents or when
        ``self._k`` is not positive.
    """
    documents = self._documents
    # Guard the degenerate cases explicitly: an empty corpus cannot be
    # fitted or compared, and with k == 0 the slice ``[-0:]`` below would
    # select *every* document rather than none.
    if not documents or self._k <= 0:
        return []

    # Extract the question texts once; they feed both fit and transform.
    question_texts = [
        doc.page_content.split("ANSWER:")[0].replace("QUESTION:", "").strip()
        if "QUESTION:" in doc.page_content
        else doc.page_content
        for doc in documents
    ]

    # Lazily (re)build the vectorizer when it is missing or has no fitted
    # vocabulary yet.
    vectorizer = getattr(self, "_vectorizer", None)
    if vectorizer is None or not getattr(vectorizer, "vocabulary_", None):
        # NOTE(review): for a single-document corpus, max_df=0.9 combined
        # with min_df=1 may make fit() reject the settings -- confirm
        # against the scikit-learn documentation.
        self._vectorizer = TfidfVectorizer(
            max_features=3000,
            stop_words='english',
            ngram_range=(1, 2),
            min_df=1,
            max_df=0.9,
        )
        self._vectorizer.fit(question_texts)

    query_vector = self._vectorizer.transform([query.lower().strip()])
    question_vectors = self._vectorizer.transform(question_texts)
    similarities = cosine_similarity(query_vector, question_vectors).flatten()

    # argsort is ascending: take the last k indices, then reverse so the
    # best match comes first.
    top_indices = similarities.argsort()[-self._k:][::-1]
    return [
        (documents[i], float(similarities[i]))
        for i in top_indices
        if similarities[i] > 0.1
    ]
|
152 |
|
153 |
|
154 |
def _get_relevant_documents(self, query: str) -> List[Document]:
|