File size: 2,860 Bytes
b15be4b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 |
import os
import json
from pathlib import Path
from dotenv import load_dotenv
from agno.embedder.openai import OpenAIEmbedder
from agno.knowledge.pdf import PDFKnowledgeBase, PDFReader
from agno.vectordb.qdrant import Qdrant
from agno.document.chunking.fixed import FixedSizeChunking
# Load environment variables (python-dotenv) before any of them are read below.
load_dotenv()

# Qdrant connection settings; each is None when the variable is not set.
QDRANT_URL = os.environ.get("QDRANT_URL")
QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY")

# Alternative embedder configuration, kept for reference:
# embeddings = OpenAIEmbedder(
#     id="text-embedding-3-large",
#     dimensions=3072,
#     api_key=os.getenv("OPENAI_API_KEY")
# )

# Module-wide embedder shared by every knowledge base built in this file.
# It uses the OpenAI-style embedder class with a custom base URL and API key.
embeddings = OpenAIEmbedder(
    base_url="https://api.studio.nebius.com/v1/",
    api_key=os.environ.get("NEBIUS_API_KEY"),
    id="BAAI/bge-en-icl",
    dimensions=4096,
)
class AgnoKnowledgeBase:
    """Build a Qdrant-backed PDF knowledge base from a configuration dict.

    The configuration must contain a ``"knowledge_base"`` section. The keys
    read from that section are ``input_data`` (whose ``source`` list holds
    dicts with a ``path`` entry), ``recreate``, ``collection_name``,
    ``chunk_size``, ``overlap``, ``num_documents`` and ``chunking_strategy``.
    """

    def __init__(self, query: str, user_id: str, thread_id: str, agno_kb_config: dict,
                 chunk_size: int = 1000, num_documents: int = 6) -> None:
        """Store the request context and the knowledge-base configuration.

        Args:
            query: Stored as ``self.query``; not used by this class itself.
            user_id: Stored as ``self.user_id``; not used by this class itself.
            thread_id: Stored as ``self.thread_id``; not used by this class itself.
            agno_kb_config: Configuration dict containing a
                ``"knowledge_base"`` section.
            chunk_size: Fallback chunk size used when the configuration does
                not provide ``chunk_size``.
            num_documents: Fallback document count used when the
                configuration does not provide ``num_documents``.
        """
        self.query = query
        self.user_id = user_id
        self.thread_id = thread_id
        self.agno_kb_config = agno_kb_config
        self.chunk_size = chunk_size
        self.num_documents = num_documents

    def _resolve_settings(self) -> dict:
        """Return the ``knowledge_base`` settings with fallbacks applied.

        Missing (or ``None``) ``chunk_size`` / ``num_documents`` fall back to
        the values given to the constructor; a missing ``overlap`` falls back
        to 0. Explicit values, including 0, are kept as configured.

        Raises:
            KeyError: If the configuration has no ``"knowledge_base"`` section.
        """
        section = self.agno_kb_config['knowledge_base']
        # Tolerate an explicit None for input_data / source as "no sources".
        input_data = section.get("input_data") or {}

        chunk_size = section.get("chunk_size")
        overlap = section.get("overlap")
        num_documents = section.get("num_documents")

        return {
            "sources": input_data.get("source") or [],
            "recreate": section.get("recreate", False),
            "collection_name": section.get("collection_name"),
            "chunk_size": self.chunk_size if chunk_size is None else chunk_size,
            "overlap": 0 if overlap is None else overlap,
            "num_documents": self.num_documents if num_documents is None else num_documents,
            "chunking_strategy": section.get("chunking_strategy", "fixed"),
        }

    @staticmethod
    def _iter_pdf_paths(sources):
        """Yield every PDF path listed under the ``path`` key of each source."""
        for source in sources:
            paths = source.get("path") or []
            # A lone path would otherwise be iterated character by character.
            if isinstance(paths, (str, os.PathLike)):
                paths = [paths]
            yield from paths

    def setup_knowledge_base(self) -> "PDFKnowledgeBase":
        """Load every configured PDF into Qdrant and return a query handle.

        Returns:
            A ``PDFKnowledgeBase`` with ``path=None`` bound to the populated
            vector store.

        Raises:
            KeyError: If the configuration has no ``"knowledge_base"`` section.
            ValueError: If ``chunking_strategy`` is anything other than
                ``"fixed"``.
        """
        print(self.agno_kb_config)
        settings = self._resolve_settings()

        strategy_name = settings["chunking_strategy"]
        if strategy_name != "fixed":
            raise ValueError(f"Unsupported chunking strategy: {strategy_name}")
        chunking_strategy = FixedSizeChunking(
            chunk_size=settings["chunk_size"],
            overlap=settings["overlap"],
        )

        vector_db = Qdrant(
            collection=settings["collection_name"],
            embedder=embeddings,
            url=QDRANT_URL,
            api_key=QDRANT_API_KEY,
        )

        # Honour `recreate` on the first load only: repeating it for every
        # PDF would rebuild the collection each time and discard the
        # documents loaded by earlier iterations.
        recreate_pending = settings["recreate"]
        for path in self._iter_pdf_paths(settings["sources"]):
            print(f"Loading PDF into Qdrant: {path}")
            loader = PDFKnowledgeBase(
                path=path,
                vector_db=vector_db,
                reader=PDFReader(),
                chunking_strategy=chunking_strategy,
                num_documents=settings["num_documents"],
            )
            loader.load(recreate=recreate_pending)
            recreate_pending = False

        # Path-less knowledge base over the same vector store, for querying.
        return PDFKnowledgeBase(
            path=None,
            vector_db=vector_db,
            reader=PDFReader(),
            chunking_strategy=chunking_strategy,
            num_documents=settings["num_documents"],
        )
|