|
import os |
|
import json |
|
from pathlib import Path |
|
from dotenv import load_dotenv |
|
from agno.embedder.openai import OpenAIEmbedder |
|
from agno.knowledge.pdf import PDFKnowledgeBase, PDFReader |
|
from agno.vectordb.qdrant import Qdrant |
|
from agno.document.chunking.fixed import FixedSizeChunking |
|
|
|
|
|
# Pull variables from a local .env file into the process environment before
# any of the settings below are read.
load_dotenv()

# Qdrant connection settings; either value is None when the variable is unset.
QDRANT_URL = os.environ.get("QDRANT_URL")
QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Shared embedder instance used by every vector store built in this module.
# It goes through the OpenAI-style embedder class but targets the Nebius
# endpoint given in `base_url`, authenticated with NEBIUS_API_KEY (None when
# that variable is unset).
embeddings = OpenAIEmbedder(
    id="BAAI/bge-en-icl",
    dimensions=4096,
    api_key=os.environ.get("NEBIUS_API_KEY"),
    base_url="https://api.studio.nebius.com/v1/",
)
|
|
|
|
|
class AgnoKnowledgeBase:
    """Load PDFs into a Qdrant collection and expose them as a knowledge base.

    The instance keeps the request context (query, user and thread ids) and a
    configuration dict whose ``"knowledge_base"`` entry drives
    :meth:`setup_knowledge_base`.
    """

    def __init__(self, query: str, user_id: str, thread_id: str, agno_kb_config: dict,
                 chunk_size: int = 1000, num_documents: int = 6) -> None:
        """Store the request context and the knowledge-base configuration.

        Args:
            query: Query text associated with this knowledge base.
            user_id: Identifier of the requesting user.
            thread_id: Identifier of the conversation thread.
            agno_kb_config: Configuration dict; must contain a
                ``"knowledge_base"`` mapping when
                :meth:`setup_knowledge_base` is called.
            chunk_size: Chunk size used when the configuration does not
                provide ``"chunk_size"``.
            num_documents: Document count used when the configuration does
                not provide ``"num_documents"``.
        """
        self.query = query
        self.user_id = user_id
        self.thread_id = thread_id
        self.agno_kb_config = agno_kb_config
        self.chunk_size = chunk_size
        self.num_documents = num_documents

    @staticmethod
    def _as_path_list(paths):
        """Return *paths* as a list, wrapping a lone ``str``/``Path`` value."""
        if paths is None:
            return []
        if isinstance(paths, (str, Path)):
            # Iterating a bare string would yield single characters, not paths.
            return [paths]
        return list(paths)

    def setup_knowledge_base(self) -> "PDFKnowledgeBase":
        """Load every configured PDF into Qdrant and return a knowledge base.

        Reads ``self.agno_kb_config["knowledge_base"]`` for these keys:
        ``input_data.source[*].path``, ``recreate``, ``collection_name``,
        ``chunk_size``, ``overlap``, ``num_documents`` and
        ``chunking_strategy`` (default ``"fixed"``).

        Returns:
            A ``PDFKnowledgeBase`` with ``path=None`` bound to the populated
            vector store.

        Raises:
            KeyError: If the configuration has no ``"knowledge_base"`` entry.
            ValueError: If ``chunking_strategy`` is anything but ``"fixed"``.
        """
        print(self.agno_kb_config)
        kb_config = self.agno_kb_config['knowledge_base']

        input_data = kb_config.get("input_data") or {}
        sources = input_data.get("source") or []
        recreate = kb_config.get("recreate", False)
        collection_name = kb_config.get("collection_name")
        chunking_strategy_type = kb_config.get("chunking_strategy", "fixed")

        # Fall back to the constructor defaults instead of forwarding None
        # when the configuration leaves these keys out.
        chunk_size = kb_config.get("chunk_size")
        if chunk_size is None:
            chunk_size = self.chunk_size
        overlap = kb_config.get("overlap")
        if overlap is None:
            overlap = 0
        num_documents = kb_config.get("num_documents")
        if num_documents is None:
            num_documents = self.num_documents

        if chunking_strategy_type != "fixed":
            raise ValueError(f"Unsupported chunking strategy: {chunking_strategy_type}")
        chunking_strategy = FixedSizeChunking(chunk_size=chunk_size, overlap=overlap)

        vector_db = Qdrant(
            collection=collection_name,
            embedder=embeddings,
            url=QDRANT_URL,
            api_key=QDRANT_API_KEY,
        )

        # Recreate the collection at most once. Passing recreate=True on every
        # load would rebuild it per PDF and keep only the last document.
        # NOTE(review): relies on load(recreate=True) dropping the collection;
        # confirm against the installed agno version.
        recreate_pending = bool(recreate)
        for source in sources:
            for path in self._as_path_list(source.get("path")):
                print(f"Loading PDF into Qdrant: {path}")
                knowledge_base = PDFKnowledgeBase(
                    path=path,
                    vector_db=vector_db,
                    reader=PDFReader(),
                    chunking_strategy=chunking_strategy,
                    num_documents=num_documents,
                )
                knowledge_base.load(recreate=recreate_pending)
                recreate_pending = False

        # Path-less knowledge base over the same collection, handed back to
        # the caller after loading.
        return PDFKnowledgeBase(
            path=None,
            vector_db=vector_db,
            reader=PDFReader(),
            chunking_strategy=chunking_strategy,
            num_documents=num_documents,
        )
|
|