brendon-ai committed on
Commit
6ef0559
·
verified ·
1 Parent(s): af117dd

Update src/RAGSample.py

Browse files
Files changed (1) hide show
  1. src/RAGSample.py +23 -23
src/RAGSample.py CHANGED
@@ -126,29 +126,29 @@ class SmartFAQRetriever(BaseRetriever):
126
 
127
 
128
  def _get_relevant_documents_with_scores(self, query: str) -> List[tuple[Document, float]]:
129
- """Retrieve documents along with similarity scores."""
130
- if not hasattr(self, '_vectorizer') or self._vectorizer is None or not hasattr(self._vectorizer, 'vocabulary_') or not self._vectorizer.vocabulary_:
131
- self._vectorizer = TfidfVectorizer(
132
- max_features=3000,
133
- stop_words='english',
134
- ngram_range=(1, 2),
135
- min_df=1,
136
- max_df=0.9
137
- )
138
- questions = [doc.page_content.split("ANSWER:")[0].replace("QUESTION:", "").strip()
139
- if "QUESTION:" in doc.page_content else doc.page_content
140
- for doc in self._documents]
141
- self._vectorizer.fit(questions)
142
-
143
- query_vector = self._vectorizer.transform([query.lower().strip()])
144
- question_texts = [doc.page_content.split("ANSWER:")[0].replace("QUESTION:", "").strip()
145
- if "QUESTION:" in doc.page_content else doc.page_content
146
- for doc in self._documents]
147
- question_vectors = self._vectorizer.transform(question_texts)
148
- similarities = cosine_similarity(query_vector, question_vectors).flatten()
149
-
150
- top_indices = similarities.argsort()[-self._k:][::-1]
151
- return [(self._documents[i], float(similarities[i])) for i in top_indices if similarities[i] > 0.1]
152
 
153
 
154
  def _get_relevant_documents(self, query: str) -> List[Document]:
 
126
 
127
 
128
  def _get_relevant_documents_with_scores(self, query: str) -> List[tuple[Document, float]]:
129
+ """Retrieve documents along with similarity scores."""
130
+ if not hasattr(self, '_vectorizer') or self._vectorizer is None or not hasattr(self._vectorizer, 'vocabulary_') or not self._vectorizer.vocabulary_:
131
+ self._vectorizer = TfidfVectorizer(
132
+ max_features=3000,
133
+ stop_words='english',
134
+ ngram_range=(1, 2),
135
+ min_df=1,
136
+ max_df=0.9
137
+ )
138
+ questions = [doc.page_content.split("ANSWER:")[0].replace("QUESTION:", "").strip()
139
+ if "QUESTION:" in doc.page_content else doc.page_content
140
+ for doc in self._documents]
141
+ self._vectorizer.fit(questions)
142
+
143
+ query_vector = self._vectorizer.transform([query.lower().strip()])
144
+ question_texts = [doc.page_content.split("ANSWER:")[0].replace("QUESTION:", "").strip()
145
+ if "QUESTION:" in doc.page_content else doc.page_content
146
+ for doc in self._documents]
147
+ question_vectors = self._vectorizer.transform(question_texts)
148
+ similarities = cosine_similarity(query_vector, question_vectors).flatten()
149
+
150
+ top_indices = similarities.argsort()[-self._k:][::-1]
151
+ return [(self._documents[i], float(similarities[i])) for i in top_indices if similarities[i] > 0.1]
152
 
153
 
154
  def _get_relevant_documents(self, query: str) -> List[Document]: