Shamik committed on
Commit 3c99d4f · unverified · 1 Parent(s): 635a34b

feat: adding project files.

pyproject.toml ADDED
@@ -0,0 +1,36 @@
+ [project]
+ name = "agent-hackathon"
+ version = "0.1.0"
+ description = "Agent hackathon"
+ readme = "README.md"
+ authors = [
+     { name = "shamik", email = "39588365+Shamik-07@users.noreply.github.com" }
+ ]
+ requires-python = ">=3.12"
+ dependencies = [
+     "arxiv>=2.2.0",
+     "flagembedding>=1.3.5",
+     "httpx>=0.28.1",
+     "huggingface-hub[hf-xet]>=0.32.4",
+     "llama-hub>=0.0.79.post1",
+     "llama-index-embeddings-huggingface>=0.5.4",
+     "llama-index-embeddings-huggingface-api>=0.3.1",
+     "llama-index-llms-huggingface>=0.5.0",
+     "llama-index-llms-huggingface-api>=0.5.0",
+     "llama-index-vector-stores-milvus>=0.8.4",
+     "openai>=1.84.0",
+     "pyprojroot>=0.3.0",
+     "python-dotenv>=1.1.0",
+     "smolagents>=1.17.0",
+ ]
+
+
+ [dependency-groups]
+ dev = [
+     "google-generativeai>=0.8.5",
+     "ipykernel>=6.29.5",
+     "ipywidgets>=8.1.7",
+     "marimo>=0.13.15",
+     "nbformat>=5.10.4",
+     "ruff>=0.11.13",
+ ]
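
As a quick sanity check that the resolved environment exposes what the modules below need, a minimal import smoke test (a sketch; it assumes the environment has been synced from the committed uv.lock, and uses only import names that appear in this commit's files):

from dotenv import load_dotenv  # provided by python-dotenv
from huggingface_hub import InferenceClient, login
from llama_index.core import StorageContext, VectorStoreIndex
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.milvus import MilvusVectorStore
from pyprojroot import find_root, has_file

print("core dependencies import cleanly")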
src/agent_hackathon/__init__.py ADDED
@@ -0,0 +1,2 @@
+ def hello() -> str:
+     return "Hello from agent-hackathon!"
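
A trivial usage sketch, assuming the src layout installs the package under the import name agent_hackathon:

from agent_hackathon import hello

print(hello())  # -> Hello from agent-hackathon!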
src/agent_hackathon/consts.py ADDED
@@ -0,0 +1,3 @@
+ from pyprojroot import find_root, has_file
+
+ PROJECT_ROOT_DIR = find_root(criterion=has_file(file="README.md"))
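
This constant is consumed throughout the commit for building paths relative to the project root; a minimal sketch mirroring those uses:

from src.agent_hackathon.consts import PROJECT_ROOT_DIR

# Mirrors the paths used by the other modules in this commit.
log_dir = PROJECT_ROOT_DIR / "logs"
data_path = f"{PROJECT_ROOT_DIR}/data/cs_data_arxiv.json"
print(log_dir, data_path)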
src/agent_hackathon/create_vector_db.py ADDED
@@ -0,0 +1,149 @@
+ import json
+ from copy import deepcopy
+
+ from dotenv import find_dotenv, load_dotenv
+ from llama_index.core import StorageContext, VectorStoreIndex
+ from llama_index.core.node_parser import SentenceSplitter
+ from llama_index.core.schema import Document
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
+ from llama_index.vector_stores.milvus import MilvusVectorStore
+ from llama_index.vector_stores.milvus.utils import BGEM3SparseEmbeddingFunction
+
+ from src.agent_hackathon.consts import PROJECT_ROOT_DIR
+ from src.agent_hackathon.logger import get_logger
+
+ logger = get_logger(log_name="create_vector_db", log_dir=PROJECT_ROOT_DIR / "logs")
+
+
+ class VectorDBCreator:
+     """Handles creation of a Milvus vector database from arXiv data."""
+
+     def __init__(
+         self,
+         data_path: str,
+         db_uri: str,
+         embedding_model: str = "Qwen/Qwen3-Embedding-0.6B",
+         chunk_size: int = 20_000,
+         chunk_overlap: int = 0,
+         vector_dim: int = 1024,
+         insert_batch_size: int = 8192,
+     ) -> None:
+         """
+         Initialize the VectorDBCreator.
+
+         Args:
+             data_path: Path to the JSON data file.
+             db_uri: URI for the Milvus database.
+             embedding_model: Name of the embedding model.
+             chunk_size: Size of text chunks for splitting.
+             chunk_overlap: Overlap between text chunks.
+             vector_dim: Dimension of the embedding vectors.
+             insert_batch_size: Batch size for insertion.
+         """
+         self.data_path = data_path
+         self.db_uri = db_uri
+         self.embedding_model = embedding_model
+         self.chunk_size = chunk_size
+         self.chunk_overlap = chunk_overlap
+         self.vector_dim = vector_dim
+         self.insert_batch_size = insert_batch_size
+         self.embed_model = HuggingFaceEmbedding(
+             model_name=self.embedding_model, device="cpu"
+         )
+         self.sent_splitter = SentenceSplitter(
+             chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap
+         )
+         logger.info("VectorDBCreator initialized.")
+
+     def load_data(self) -> list[dict]:
+         """
+         Load and return data from the JSON file.
+
+         Returns:
+             List of dictionaries containing arXiv data.
+         """
+         logger.info(f"Loading data from {self.data_path}")
+         with open(file=self.data_path) as f:
+             data = json.load(fp=f)
+         logger.info("Data loaded successfully.")
+         return deepcopy(x=data)
+
+     def prepare_documents(self, data: list[dict]) -> list[Document]:
+         """
+         Convert raw data into a list of Document objects.
+
+         Args:
+             data: List of dictionaries with arXiv data.
+
+         Returns:
+             List of Document objects.
+         """
+         logger.info("Preparing documents from data.")
+         # Each abstract becomes the document text; the remaining fields
+         # travel along as metadata.
+         docs = [Document(text=d.pop("abstract"), metadata=d) for d in data]
+         logger.info(f"Prepared {len(docs)} documents.")
+         return docs
+
+     def create_vector_store(self) -> MilvusVectorStore:
+         """
+         Create and return a MilvusVectorStore instance.
+
+         Returns:
+             Configured MilvusVectorStore.
+         """
+         logger.info(f"Creating MilvusVectorStore at {self.db_uri}")
+         store = MilvusVectorStore(
+             uri=self.db_uri,
+             dim=self.vector_dim,
+             enable_sparse=True,
+             sparse_embedding_function=BGEM3SparseEmbeddingFunction(),
+         )
+         logger.info("MilvusVectorStore created.")
+         return store
+
+     def build_index(
+         self, docs_list: list[Document], vector_store: MilvusVectorStore
+     ) -> VectorStoreIndex:
+         """
+         Build and return a VectorStoreIndex from documents.
+
+         Args:
+             docs_list: List of Document objects.
+             vector_store: MilvusVectorStore instance.
+
+         Returns:
+             VectorStoreIndex object.
+         """
+         logger.info("Building VectorStoreIndex.")
+         storage_context = StorageContext.from_defaults(vector_store=vector_store)
+         index = VectorStoreIndex.from_documents(
+             documents=docs_list,
+             storage_context=storage_context,
+             embed_model=self.embed_model,
+             transformations=[self.sent_splitter],
+             show_progress=True,
+             insert_batch_size=self.insert_batch_size,
+         )
+         logger.info("VectorStoreIndex built.")
+         return index
+
+     def run(self) -> None:
+         """
+         Execute the full pipeline: load data, prepare documents, create vector store, and build index.
+         """
+         logger.info("Running full vector DB creation pipeline.")
+         data = self.load_data()
+         docs_list = self.prepare_documents(data=data)
+         vector_store = self.create_vector_store()
+         self.build_index(docs_list=docs_list, vector_store=vector_store)
+         logger.info("Pipeline finished.")
+
+
+ if __name__ == "__main__":
+     logger.info("Script started.")
+     # Load environment variables; a .env file is required here since
+     # raise_error_if_not_found=True.
+     _ = load_dotenv(dotenv_path=find_dotenv(raise_error_if_not_found=True))
+     creator = VectorDBCreator(
+         data_path=f"{PROJECT_ROOT_DIR}/data/cs_data_arxiv.json", db_uri="arxiv_docs.db"
+     )
+     creator.run()
+     logger.info("Script finished.")
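
The class is also reused later in this commit purely to open an existing store, without loading any data; a minimal sketch of that read-only pattern, with the db path taken from generate_arxiv_responses.py:

from src.agent_hackathon.consts import PROJECT_ROOT_DIR
from src.agent_hackathon.create_vector_db import VectorDBCreator

# data_path is never read when only create_vector_store() is called,
# so an Ellipsis placeholder is passed, as generate_arxiv_responses.py does.
store = VectorDBCreator(
    data_path=..., db_uri=(PROJECT_ROOT_DIR / "db/arxiv_docs.db").as_posix()
).create_vector_store()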
src/agent_hackathon/generate_arxiv_responses.py ADDED
@@ -0,0 +1,105 @@
+ import json
+ from pathlib import Path
+ from typing import Any
+
+ from huggingface_hub import InferenceClient
+
+ from src.agent_hackathon.consts import PROJECT_ROOT_DIR
+ from src.agent_hackathon.create_vector_db import VectorDBCreator
+ from src.agent_hackathon.logger import get_logger
+ from src.agent_hackathon.query_vector_db import RetrieverEngineBuilder
+
+ logger = get_logger(log_name="arxiv_responses", log_dir=PROJECT_ROOT_DIR / "logs")
+
+
+ class ArxivResponseGenerator:
+     """
+     Handles retrieval and formatting of arXiv papers using a vector database and LLM.
+     """
+
+     def __init__(self, vector_store_path: Path) -> None:
+         """Initializes the ArxivResponseGenerator."""
+         self.vector_store_path = vector_store_path
+         self.client = self._initialise_client()
+         logger.info("ArxivResponseGenerator initialized.")
+
+     def _initialise_retriever(self) -> tuple[Any, RetrieverEngineBuilder]:
+         """
+         Initializes and returns a retriever engine along with its builder.
+
+         Returns:
+             tuple[Any, RetrieverEngineBuilder]: The retriever engine and the
+             builder that owns the underlying vector store.
+         """
+         logger.info("Initializing retriever engine.")
+         # Only create_vector_store() is used here, so data_path is never
+         # read; an Ellipsis placeholder stands in for it.
+         vector_db_creator = VectorDBCreator(
+             data_path=..., db_uri=self.vector_store_path.as_posix()
+         )
+         vector_store = vector_db_creator.create_vector_store()
+         retriever_class = RetrieverEngineBuilder(
+             vector_store=vector_store,
+         )
+         retriever = retriever_class.build_retriever_engine()
+         logger.info("Retriever engine initialized.")
+         return retriever, retriever_class
+
+     def _initialise_client(self) -> InferenceClient:
+         """
+         Initializes and returns an InferenceClient.
+
+         Returns:
+             InferenceClient: HuggingFace InferenceClient instance.
+         """
+         logger.info("Initializing InferenceClient.")
+         client = InferenceClient(
+             provider="auto",
+             bill_to="VitalNest",
+         )
+         logger.info("InferenceClient initialized.")
+         return client
+
+     def retrieve_arxiv_papers(self, query: str) -> str:
+         """
+         Retrieves and formats arXiv papers for a given query.
+
+         Args:
+             query (str): The search query.
+
+         Returns:
+             str: Formatted response from the LLM.
+         """
+         logger.info(f"Retrieving arXiv papers for query: {query}")
+         retriever, retriever_class = self._initialise_retriever()
+         retrieved_content = json.dumps(
+             obj=[(i.get_content(), i.metadata) for i in retriever.retrieve(query)]
+         )
+         logger.info("Retrieved content from vector DB.")
+         completion = self.client.chat.completions.create(
+             model="meta-llama/Llama-4-Scout-17B-16E-Instruct",
+             temperature=0.1,
+             messages=[
+                 {
+                     "role": "user",
+                     "content": [
+                         {
+                             "type": "text",
+                             "text": f"Format the following output neatly: {retrieved_content}. Return only the output.",
+                         },
+                     ],
+                 }
+             ],
+         )
+         logger.info("Received completion from LLM.")
+         retriever_class.vector_store.client.close()
+         logger.info("Closed vector store client.")
+         return completion.choices[0].message.content
+
+
+ if __name__ == "__main__":
+     logger.info("Script started.")
+     generator = ArxivResponseGenerator(
+         vector_store_path=PROJECT_ROOT_DIR / "db/arxiv_docs.db"
+     )
+     query = "deep learning for NLP"  # Example query; replace as needed.
+     result = generator.retrieve_arxiv_papers(query=query)
+     print(result)
+     logger.info("Script finished.")
src/agent_hackathon/logger.py ADDED
@@ -0,0 +1,45 @@
+ import logging
+ from datetime import datetime
+ from pathlib import Path
+
+ from rich.logging import RichHandler
+
+
+ def get_logger(log_name: str, log_dir: Path) -> logging.Logger:
+     """
+     Returns a logger with RichHandler and file handler.
+
+     Args:
+         log_name (str): Name prefix for the log file.
+         log_dir (Path): Directory to store log files.
+
+     Returns:
+         logging.Logger: Configured logger instance.
+     """
+     log_dir.mkdir(parents=True, exist_ok=True)
+     date_str = datetime.now().strftime(format="%m_%d_%Y")
+     log_file = log_dir / f"{log_name}_{date_str}.log"
+
+     logger = logging.getLogger(name=log_name)
+     logger.setLevel(level=logging.INFO)
+     logger.handlers.clear()
+
+     # Rich console handler
+     rich_handler = RichHandler(
+         rich_tracebacks=True, show_time=True, show_level=True, show_path=True
+     )
+     rich_handler.setLevel(level=logging.INFO)
+
+     # File handler
+     file_handler = logging.FileHandler(filename=log_file, encoding="utf-8")
+     file_handler.setLevel(level=logging.INFO)
+     formatter = logging.Formatter(
+         fmt="%(asctime)s | %(levelname)s | %(name)s | %(message)s"
+     )
+     file_handler.setFormatter(formatter)
+
+     logger.addHandler(rich_handler)
+     logger.addHandler(file_handler)
+     logger.propagate = False
+
+     return logger
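
Usage follows the other modules in this commit. One caveat: rich is not listed in pyproject.toml's dependencies, so this import presumably relies on it arriving transitively (for example via smolagents); that is an assumption worth pinning explicitly. A minimal sketch:

from src.agent_hackathon.consts import PROJECT_ROOT_DIR
from src.agent_hackathon.logger import get_logger

# Same call pattern as create_vector_db.py and query_vector_db.py;
# "example" is an illustrative log name.
logger = get_logger(log_name="example", log_dir=PROJECT_ROOT_DIR / "logs")
logger.info("logging to console and to logs/example_<date>.log")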
src/agent_hackathon/py.typed ADDED
File without changes
src/agent_hackathon/query_vector_db.py ADDED
@@ -0,0 +1,87 @@
+ import os
+ from typing import Any
+
+ from dotenv import find_dotenv, load_dotenv
+ from huggingface_hub import login
+ from llama_index.core import VectorStoreIndex
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
+ from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI
+ from llama_index.vector_stores.milvus import MilvusVectorStore
+
+ from src.agent_hackathon.consts import PROJECT_ROOT_DIR
+ from src.agent_hackathon.logger import get_logger
+
+ logger = get_logger(log_name="query_vector_db", log_dir=PROJECT_ROOT_DIR / "logs")
+
+
+ class RetrieverEngineBuilder:
+     """
+     Handles the creation of a retriever engine for a vector database using HuggingFace and LlamaIndex.
+     """
+
+     def __init__(
+         self,
+         hf_token_env: str = "HF_TOKEN",
+         embedding_model: str = "Qwen/Qwen3-Embedding-0.6B",
+         llm_model: str = "meta-llama/Llama-4-Scout-17B-16E-Instruct",
+         vector_store: MilvusVectorStore | None = None,
+         device: str = "cpu",
+     ) -> None:
+         """
+         Initialize the RetrieverEngineBuilder.
+
+         Args:
+             hf_token_env: Environment variable name for the HuggingFace token.
+             embedding_model: Name of the embedding model.
+             llm_model: Name of the LLM model.
+             vector_store: An instance of MilvusVectorStore.
+             device: Device to run the embedding model on.
+         """
+         self.hf_token_env = hf_token_env
+         self.embedding_model = embedding_model
+         self.llm_model = llm_model
+         self.vector_store = vector_store
+         self.device = device
+
+         logger.info("Initializing RetrieverEngineBuilder.")
+         # Load the .env file first so the HuggingFace token is in the
+         # environment before logging in.
+         self._load_env()
+         self._login_huggingface()
+
+         self.embed_model = HuggingFaceEmbedding(
+             model_name=self.embedding_model, device=self.device
+         )
+         self.llm = HuggingFaceInferenceAPI(
+             model=self.llm_model,
+             provider="auto",
+         )
+         logger.info("RetrieverEngineBuilder initialized.")
+
+     def _login_huggingface(self) -> None:
+         """Login to HuggingFace using the token from the environment variable."""
+         logger.info("Logging in to HuggingFace.")
+         login(token=os.getenv(key=self.hf_token_env))
+         logger.info("Logged in to HuggingFace.")
+
+     def _load_env(self) -> None:
+         """Load environment variables from the .env file."""
+         logger.info("Loading environment variables.")
+         _ = load_dotenv(dotenv_path=find_dotenv(raise_error_if_not_found=True))
+         logger.info("Environment variables loaded.")
+
+     def build_retriever_engine(self) -> Any:
+         """
+         Build and return the retriever engine.
+
+         Returns:
+             Retriever engine object.
+         """
+         logger.info("Building retriever engine.")
+         index = VectorStoreIndex.from_vector_store(
+             vector_store=self.vector_store, embed_model=self.embed_model
+         )
+         retriever = index.as_retriever(
+             vector_store_query_mode="hybrid",
+             similarity_top_k=5,
+         )
+         logger.info("Retriever engine built.")
+         return retriever
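
Putting the pieces together, a minimal retrieval sketch (the store is opened via the read-only pattern shown after create_vector_db.py above; the query string is just an example):

from src.agent_hackathon.consts import PROJECT_ROOT_DIR
from src.agent_hackathon.create_vector_db import VectorDBCreator
from src.agent_hackathon.query_vector_db import RetrieverEngineBuilder

store = VectorDBCreator(
    data_path=..., db_uri=(PROJECT_ROOT_DIR / "db/arxiv_docs.db").as_posix()
).create_vector_store()
retriever = RetrieverEngineBuilder(vector_store=store).build_retriever_engine()

# Hybrid (dense + sparse) retrieval over the stored arXiv abstracts.
for node in retriever.retrieve("deep learning for NLP"):
    print(node.metadata, node.get_content()[:80])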
uv.lock ADDED
The diff for this file is too large to render.