File size: 2,860 Bytes
b15be4b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import os
import json
from pathlib import Path
from dotenv import load_dotenv
from agno.embedder.openai import OpenAIEmbedder
from agno.knowledge.pdf import PDFKnowledgeBase, PDFReader
from agno.vectordb.qdrant import Qdrant
from agno.document.chunking.fixed import FixedSizeChunking

# Load variables from a local .env file (if present) into the process
# environment so the os.getenv() calls below can see them.
load_dotenv()

# Qdrant connection settings. os.getenv() returns None when a variable is
# unset, so both values may be None at import time.
QDRANT_URL = os.getenv("QDRANT_URL")
QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")

# Alternative embedder configuration (OpenAI-hosted model), currently disabled:
# embeddings = OpenAIEmbedder(
#     id="text-embedding-3-large",
#     dimensions=3072,
#     api_key=os.getenv("OPENAI_API_KEY")
# )

# Active embedder, shared by every knowledge base built in this module.
# It uses the OpenAIEmbedder client class but points it at the Nebius
# endpoint via base_url and authenticates with NEBIUS_API_KEY.
# NOTE(review): presumably an existing Qdrant collection must have been
# created with the same vector size (4096) -- confirm before switching models.
embeddings = OpenAIEmbedder(
    id="BAAI/bge-en-icl",
    dimensions=4096,
    api_key=os.getenv("NEBIUS_API_KEY"),
    base_url="https://api.studio.nebius.com/v1/"
)


class AgnoKnowledgeBase:
    """Build a Qdrant-backed PDF knowledge base from a configuration dict.

    The configuration is expected to carry a ``'knowledge_base'`` section
    with the optional keys ``input_data`` (``{"source": [{"path": [...]}]}``),
    ``recreate``, ``collection_name``, ``chunk_size``, ``overlap``,
    ``num_documents`` and ``chunking_strategy``.
    """

    def __init__(self, query: str, user_id: str, thread_id: str, agno_kb_config: dict,
                 chunk_size: int = 1000, num_documents: int = 6):
        """Store the request context and the knowledge-base configuration.

        Args:
            query: Stored on the instance; not read by ``setup_knowledge_base``.
            user_id: Stored on the instance; not read by ``setup_knowledge_base``.
            thread_id: Stored on the instance; not read by ``setup_knowledge_base``.
            agno_kb_config: Configuration dict containing a ``'knowledge_base'``
                section.
            chunk_size: Fallback chunk size used when the configuration does
                not provide ``chunk_size``.
            num_documents: Fallback value used when the configuration does not
                provide ``num_documents``.
        """
        self.query = query
        self.user_id = user_id
        self.thread_id = thread_id
        self.agno_kb_config = agno_kb_config
        self.chunk_size = chunk_size
        self.num_documents = num_documents

    def _resolve_settings(self) -> dict:
        """Return the effective settings, applying constructor fallbacks.

        Raises:
            KeyError: If the configuration has no ``'knowledge_base'`` section.
        """
        kb_config = self.agno_kb_config['knowledge_base']
        input_data = kb_config.get("input_data") or {}

        def pick(key, fallback):
            # A missing key and an explicit None both mean "not configured".
            value = kb_config.get(key)
            return fallback if value is None else value

        return {
            "sources": input_data.get("source") or [],
            "recreate": kb_config.get("recreate", False),
            "collection_name": kb_config.get("collection_name"),
            "chunk_size": pick("chunk_size", self.chunk_size),
            "overlap": pick("overlap", 0),
            "num_documents": pick("num_documents", self.num_documents),
            "chunking_strategy": kb_config.get("chunking_strategy", "fixed"),
        }

    @staticmethod
    def _plan_loads(sources, recreate) -> list:
        """Flatten ``sources`` into ``(path, recreate_flag)`` pairs.

        Only the first pair may carry ``recreate_flag=True``. A ``path`` entry
        given as a single string or ``Path`` is treated as a one-element list.
        """
        plan = []
        for source in sources:
            paths = source.get("path") or []
            if isinstance(paths, (str, Path)):
                # Iterating a bare string would yield single characters.
                paths = [paths]
            for path in paths:
                plan.append((path, bool(recreate) and not plan))
        return plan

    def setup_knowledge_base(self) -> "PDFKnowledgeBase":
        """Load every configured PDF into Qdrant and return a knowledge base.

        Returns:
            A ``PDFKnowledgeBase`` with ``path=None`` bound to the same vector
            database, chunking strategy and ``num_documents``.

        Raises:
            KeyError: If the configuration has no ``'knowledge_base'`` section.
            ValueError: If ``chunking_strategy`` is anything other than
                ``"fixed"``.
        """
        print(self.agno_kb_config)
        settings = self._resolve_settings()

        strategy_type = settings["chunking_strategy"]
        if strategy_type != "fixed":
            raise ValueError(f"Unsupported chunking strategy: {strategy_type}")
        chunking_strategy = FixedSizeChunking(
            chunk_size=settings["chunk_size"],
            overlap=settings["overlap"],
        )

        vector_db = Qdrant(
            collection=settings["collection_name"],
            embedder=embeddings,
            url=QDRANT_URL,
            api_key=QDRANT_API_KEY
        )
        num_documents = settings["num_documents"]

        # recreate is forwarded to the first load only. Assuming
        # load(recreate=True) rebuilds the collection (see the agno docs),
        # passing it on every iteration would discard the PDFs ingested by
        # the previous iterations.
        for path, recreate in self._plan_loads(settings["sources"], settings["recreate"]):
            print(f"Loading PDF into Qdrant: {path}")
            knowledge_base = PDFKnowledgeBase(
                path=path,
                vector_db=vector_db,
                reader=PDFReader(),
                chunking_strategy=chunking_strategy,
                num_documents=num_documents
            )
            knowledge_base.load(recreate=recreate)

        return PDFKnowledgeBase(
            path=None,
            vector_db=vector_db,
            reader=PDFReader(),
            chunking_strategy=chunking_strategy,
            num_documents=num_documents
        )