shamik committed
Commit f896763 · unverified · 1 Parent(s): 4fb2578

feat: adding project files.
.gitignore ADDED
@@ -0,0 +1,194 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # UV
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ #uv.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+ .pdm.toml
+ .pdm-python
+ .pdm-build/
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
+
+ # Abstra
+ # Abstra is an AI-powered process automation framework.
+ # Ignore directories containing user credentials, local state, and settings.
+ # Learn more at https://abstra.io/docs
+ .abstra/
+
+ # Visual Studio Code
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
+ # you could uncomment the following to ignore the entire vscode folder
+ # .vscode/
+
+ # Ruff stuff:
+ .ruff_cache/
+
+ # PyPI configuration file
+ .pypirc
+
+ # Cursor
+ # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
+ # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
+ # refer to https://docs.cursor.com/context/ignore-files
+ .cursorignore
+ .cursorindexingignore
README.md CHANGED
@@ -1,12 +1,20 @@
  ---
- title: Ml Research Agent
- emoji:
- colorFrom: indigo
- colorTo: red
+ title: Ml Research Assistant And Tutor
+ emoji: 👁
+ colorFrom: blue
+ colorTo: purple
  sdk: gradio
  sdk_version: 5.33.1
  app_file: app.py
+ tags: [agent-demo-track]
  pinned: false
+ license: mit
+ short_description: Agentic system for ML research and tutoring
+ python_version: 3.11.6
+ preload_from_hub:
+ - Shamik/arxiv_cs_2020_07_2025 arxiv_docs.db
+
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ Check out the configuration reference at <https://huggingface.co/docs/hub/spaces-config-reference>
+ ---
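The `preload_from_hub` entry bakes the `arxiv_docs.db` file from the `Shamik/arxiv_cs_2020_07_2025` dataset repo into the Space at build time. Outside a Space, the same file can be fetched manually; a minimal sketch, assuming only the repo id and filename given in the front matter:

```python
# Hypothetical standalone equivalent of the preload_from_hub entry.
from huggingface_hub import hf_hub_download

db_path = hf_hub_download(
    repo_id="Shamik/arxiv_cs_2020_07_2025",
    filename="arxiv_docs.db",
    repo_type="dataset",  # assumption: the repo is a dataset, per the README link
)
print(db_path)  # local cache path of the Milvus Lite database file
```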
app.py ADDED
@@ -0,0 +1,78 @@
+ import asyncio
+
+ import gradio as gr
+ import nest_asyncio
+ from huggingface_hub import login
+
+ from src.agent_hackathon.consts import PROJECT_ROOT_DIR
+ from src.agent_hackathon.logger import get_logger
+ from src.agent_hackathon.multiagent import MultiAgentWorkflow
+
+ nest_asyncio.apply()
+
+ logger = get_logger(log_name="multiagent", log_dir=PROJECT_ROOT_DIR / "logs")
+
+ PRIMARY_HEADING = """# ML Topics Deep Research"""
+ SECONDARY_HEADING = """### This multi-agent framework queries a DB containing arXiv ML research papers from Jan 2020 - Jun 6th 2025 for select categories, and finds events/conferences related to the user's query.
+
+ For more details on the filtered arXiv dataset, refer [here](https://huggingface.co/datasets/Shamik/arxiv_cs_2020_07_2025)
+ """
+ workflow = MultiAgentWorkflow()
+
+ _login_done = False
+
+
+ def run(
+     query: str, api_key: str, chat_history: list[dict[str, str | None]]
+ ) -> tuple[str, list[dict[str, str | None]]] | None:
+     global _login_done
+     if not api_key or not api_key.startswith("hf"):
+         raise ValueError("Incorrect HuggingFace Inference API Key")
+     if not _login_done:
+         login(token=api_key)
+         _login_done = True
+     try:
+         result = asyncio.run(workflow.run(user_query=query))
+         chat_history.append({"role": "user", "content": query})
+         chat_history.append({"role": "assistant", "content": result})
+         return "", chat_history
+     except Exception as err:
+         logger.error(f"Error during workflow execution: {err}")
+         return None
+
+
+ with gr.Blocks(fill_height=True) as demo:
+     gr.Markdown(value=PRIMARY_HEADING)
+     gr.Markdown(value=SECONDARY_HEADING)
+     gr.Markdown(
+         value="""<span style="color:red"> Please use a 🤗 Inference API Key </span>"""
+     )
+     api_key = gr.Textbox(
+         placeholder="Enter your HuggingFace Inference API KEY HERE",
+         label="🤗 Inference API Key",
+         show_label=True,
+         type="password",
+     )
+     chatbot = gr.Chatbot(
+         type="messages", label="DeepResearch", show_label=True, height=500,
+         show_copy_all_button=True, show_copy_button=True
+     )
+     msg = gr.Textbox(
+         placeholder="Type your message here and press enter...",
+         show_label=True,
+         label="Input",
+         submit_btn=True,
+         stop_btn=True,
+     )
+     clear = gr.ClearButton(components=[msg, chatbot])
+     msg.submit(fn=run, inputs=[msg, api_key, chatbot], outputs=[msg, chatbot])
+
+ demo.queue(max_size=1).launch(share=False)
+
+ # if __name__ == "__main__":
+ #     demo.queue(max_size=1).launch(share=False)
+
+
+ # example queries
+ # tell me about reinforcement learning in robotics
+ # give me event details on reinforcement learning & robotics
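`run()` calls `asyncio.run()` from inside Gradio's already-running event loop; that is the pattern `nest_asyncio.apply()` makes legal. A minimal, self-contained sketch of just that mechanism (the function names are illustrative):

```python
import asyncio

import nest_asyncio

nest_asyncio.apply()  # patches the running loop so asyncio.run() can nest


async def inner() -> str:
    return "done"


async def outer() -> str:
    # Without nest_asyncio this raises
    # "RuntimeError: asyncio.run() cannot be called from a running event loop".
    return asyncio.run(inner())


print(asyncio.run(outer()))  # -> done
```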
requirements.txt ADDED
@@ -0,0 +1,587 @@
+ # This file was autogenerated by uv via the following command:
+ #    uv pip compile pyproject.toml -o requirements.txt
+ accelerate==1.7.0
+     # via
+     #   flagembedding
+     #   peft
+     #   transformers
+ aiofiles==24.1.0
+     # via gradio
+ aiohappyeyeballs==2.6.1
+     # via aiohttp
+ aiohttp==3.12.11
+     # via
+     #   fsspec
+     #   huggingface-hub
+     #   llama-index-core
+ aiosignal==1.3.2
+     # via aiohttp
+ aiosqlite==0.21.0
+     # via llama-index-core
+ annotated-types==0.7.0
+     # via pydantic
+ anyio==4.9.0
+     # via
+     #   gradio
+     #   httpx
+     #   openai
+     #   starlette
+ arxiv==2.2.0
+     # via agent-hackathon (pyproject.toml)
+ attrs==25.3.0
+     # via aiohttp
+ banks==2.1.2
+     # via llama-index-core
+ beautifulsoup4==4.13.4
+     # via
+     #   ir-datasets
+     #   llama-index-readers-file
+ cbor==1.0.0
+     # via trec-car-tools
+ certifi==2025.4.26
+     # via
+     #   httpcore
+     #   httpx
+     #   llama-cloud
+     #   requests
+ charset-normalizer==3.4.2
+     # via requests
+ click==8.2.1
+     # via
+     #   duckduckgo-search
+     #   llama-cloud-services
+     #   nltk
+     #   typer
+     #   uvicorn
+ colorama==0.4.6
+     # via griffe
+ dataclasses-json==0.6.7
+     # via llama-index-core
+ datasets==3.6.0
+     # via flagembedding
+ deprecated==1.2.18
+     # via
+     #   banks
+     #   llama-index-core
+ dill==0.3.8
+     # via
+     #   datasets
+     #   multiprocess
+ dirtyjson==1.0.8
+     # via llama-index-core
+ distro==1.9.0
+     # via openai
+ duckduckgo-search==6.4.2
+     # via llama-index-tools-duckduckgo
+ fastapi==0.115.12
+     # via gradio
+ feedparser==6.0.11
+     # via arxiv
+ ffmpy==0.6.0
+     # via gradio
+ filelock==3.18.0
+     # via
+     #   datasets
+     #   huggingface-hub
+     #   torch
+     #   transformers
+ filetype==1.2.0
+     # via llama-index-core
+ flagembedding==1.3.5
+     # via agent-hackathon (pyproject.toml)
+ frozenlist==1.6.2
+     # via
+     #   aiohttp
+     #   aiosignal
+ fsspec==2025.3.0
+     # via
+     #   datasets
+     #   gradio-client
+     #   huggingface-hub
+     #   llama-index-core
+     #   torch
+ gradio==5.33.1
+     # via agent-hackathon (pyproject.toml)
+ gradio-client==1.10.3
+     # via gradio
+ greenlet==3.2.3
+     # via sqlalchemy
+ griffe==1.7.3
+     # via banks
+ groovy==0.1.2
+     # via gradio
+ grpcio==1.67.1
+     # via pymilvus
+ h11==0.16.0
+     # via
+     #   httpcore
+     #   uvicorn
+ hf-xet==1.1.3
+     # via huggingface-hub
+ html2text==2025.4.15
+     # via llama-hub
+ httpcore==1.0.9
+     # via httpx
+ httpx==0.28.1
+     # via
+     #   agent-hackathon (pyproject.toml)
+     #   gradio
+     #   gradio-client
+     #   llama-cloud
+     #   llama-index-core
+     #   openai
+     #   safehttpx
+ huggingface-hub==0.32.4
+     # via
+     #   agent-hackathon (pyproject.toml)
+     #   accelerate
+     #   datasets
+     #   gradio
+     #   gradio-client
+     #   llama-index-embeddings-huggingface
+     #   llama-index-embeddings-huggingface-api
+     #   llama-index-llms-huggingface-api
+     #   llama-index-utils-huggingface
+     #   peft
+     #   sentence-transformers
+     #   smolagents
+     #   tokenizers
+     #   transformers
+ idna==3.10
+     # via
+     #   anyio
+     #   httpx
+     #   requests
+     #   yarl
+ ijson==3.4.0
+     # via ir-datasets
+ inquirerpy==0.3.4
+     # via huggingface-hub
+ inscriptis==2.6.0
+     # via ir-datasets
+ ir-datasets==0.5.10
+     # via flagembedding
+ jinja2==3.1.6
+     # via
+     #   banks
+     #   gradio
+     #   smolagents
+     #   torch
+ jiter==0.10.0
+     # via openai
+ joblib==1.5.1
+     # via
+     #   nltk
+     #   scikit-learn
+ llama-cloud==0.1.23
+     # via
+     #   llama-cloud-services
+     #   llama-index-indices-managed-llama-cloud
+ llama-cloud-services==0.6.30
+     # via llama-parse
+ llama-hub==0.0.79.post1
+     # via agent-hackathon (pyproject.toml)
+ llama-index==0.12.41
+     # via llama-hub
+ llama-index-agent-openai==0.4.9
+     # via
+     #   llama-index
+     #   llama-index-program-openai
+ llama-index-cli==0.4.3
+     # via llama-index
+ llama-index-core==0.12.41
+     # via
+     #   llama-cloud-services
+     #   llama-index
+     #   llama-index-agent-openai
+     #   llama-index-cli
+     #   llama-index-embeddings-huggingface
+     #   llama-index-embeddings-huggingface-api
+     #   llama-index-embeddings-openai
+     #   llama-index-indices-managed-llama-cloud
+     #   llama-index-llms-huggingface
+     #   llama-index-llms-huggingface-api
+     #   llama-index-llms-nebius
+     #   llama-index-llms-openai
+     #   llama-index-llms-openai-like
+     #   llama-index-multi-modal-llms-openai
+     #   llama-index-program-openai
+     #   llama-index-question-gen-openai
+     #   llama-index-readers-file
+     #   llama-index-readers-llama-parse
+     #   llama-index-tools-duckduckgo
+     #   llama-index-utils-huggingface
+     #   llama-index-vector-stores-milvus
+ llama-index-embeddings-huggingface==0.5.4
+     # via agent-hackathon (pyproject.toml)
+ llama-index-embeddings-huggingface-api==0.3.1
+     # via agent-hackathon (pyproject.toml)
+ llama-index-embeddings-openai==0.3.1
+     # via
+     #   llama-index
+     #   llama-index-cli
+ llama-index-indices-managed-llama-cloud==0.7.4
+     # via llama-index
+ llama-index-llms-huggingface==0.5.0
+     # via agent-hackathon (pyproject.toml)
+ llama-index-llms-huggingface-api==0.5.0
+     # via agent-hackathon (pyproject.toml)
+ llama-index-llms-nebius==0.1.2
+     # via agent-hackathon (pyproject.toml)
+ llama-index-llms-openai==0.4.4
+     # via
+     #   llama-index
+     #   llama-index-agent-openai
+     #   llama-index-cli
+     #   llama-index-llms-openai-like
+     #   llama-index-multi-modal-llms-openai
+     #   llama-index-program-openai
+     #   llama-index-question-gen-openai
+ llama-index-llms-openai-like==0.4.0
+     # via llama-index-llms-nebius
+ llama-index-multi-modal-llms-openai==0.5.1
+     # via llama-index
+ llama-index-program-openai==0.3.2
+     # via
+     #   llama-index
+     #   llama-index-question-gen-openai
+ llama-index-question-gen-openai==0.3.1
+     # via llama-index
+ llama-index-readers-file==0.4.8
+     # via llama-index
+ llama-index-readers-llama-parse==0.4.0
+     # via llama-index
+ llama-index-tools-duckduckgo==0.3.0
+     # via agent-hackathon (pyproject.toml)
+ llama-index-utils-huggingface==0.3.0
+     # via llama-index-embeddings-huggingface-api
+ llama-index-vector-stores-milvus==0.8.4
+     # via agent-hackathon (pyproject.toml)
+ llama-parse==0.6.30
+     # via llama-index-readers-llama-parse
+ lxml==5.4.0
+     # via
+     #   inscriptis
+     #   ir-datasets
+ lz4==4.4.4
+     # via ir-datasets
+ markdown-it-py==3.0.0
+     # via rich
+ markupsafe==3.0.2
+     # via
+     #   gradio
+     #   jinja2
+ marshmallow==3.26.1
+     # via dataclasses-json
+ mdurl==0.1.2
+     # via markdown-it-py
+ milvus-lite==2.4.12
+     # via pymilvus
+ mpmath==1.3.0
+     # via sympy
+ multidict==6.4.4
+     # via
+     #   aiohttp
+     #   yarl
+ multiprocess==0.70.16
+     # via datasets
+ mypy-extensions==1.1.0
+     # via typing-inspect
+ nest-asyncio==1.6.0
+     # via llama-index-core
+ networkx==3.4.2
+     # via
+     #   llama-index-core
+     #   torch
+ nltk==3.9.1
+     # via
+     #   llama-index
+     #   llama-index-core
+ numpy==2.2.6
+     # via
+     #   accelerate
+     #   datasets
+     #   gradio
+     #   ir-datasets
+     #   llama-index-core
+     #   pandas
+     #   peft
+     #   scikit-learn
+     #   scipy
+     #   transformers
+     #   trec-car-tools
+ openai==1.84.0
+     # via
+     #   agent-hackathon (pyproject.toml)
+     #   llama-index-agent-openai
+     #   llama-index-embeddings-openai
+     #   llama-index-llms-openai
+ orjson==3.10.18
+     # via gradio
+ packaging==25.0
+     # via
+     #   accelerate
+     #   datasets
+     #   gradio
+     #   gradio-client
+     #   huggingface-hub
+     #   marshmallow
+     #   peft
+     #   transformers
+ pandas==2.3.0
+     # via
+     #   datasets
+     #   gradio
+     #   llama-index-readers-file
+     #   pymilvus
+ peft==0.15.2
+     # via flagembedding
+ pfzy==0.3.4
+     # via inquirerpy
+ pillow==11.2.1
+     # via
+     #   gradio
+     #   llama-index-core
+     #   sentence-transformers
+     #   smolagents
+ pip==25.1.1
+     # via agent-hackathon (pyproject.toml)
+ platformdirs==4.3.8
+     # via
+     #   banks
+     #   llama-cloud-services
+ primp==0.15.0
+     # via duckduckgo-search
+ prompt-toolkit==3.0.51
+     # via inquirerpy
+ propcache==0.3.1
+     # via
+     #   aiohttp
+     #   yarl
+ protobuf==6.31.1
+     # via
+     #   flagembedding
+     #   pymilvus
+ psutil==7.0.0
+     # via
+     #   accelerate
+     #   llama-hub
+     #   peft
+ pyaml==23.12.0
+     # via llama-hub
+ pyarrow==20.0.0
+     # via
+     #   datasets
+     #   ir-datasets
+ pydantic==2.11.5
+     # via
+     #   banks
+     #   fastapi
+     #   gradio
+     #   llama-cloud
+     #   llama-cloud-services
+     #   llama-index-core
+     #   openai
+ pydantic-core==2.33.2
+     # via pydantic
+ pydub==0.25.1
+     # via gradio
+ pygments==2.19.1
+     # via rich
+ pymilvus==2.5.10
+     # via llama-index-vector-stores-milvus
+ pypdf==5.6.0
+     # via llama-index-readers-file
+ pyprojroot==0.3.0
+     # via agent-hackathon (pyproject.toml)
+ python-dateutil==2.9.0.post0
+     # via pandas
+ python-dotenv==1.1.0
+     # via
+     #   agent-hackathon (pyproject.toml)
+     #   llama-cloud-services
+     #   pymilvus
+     #   smolagents
+ python-multipart==0.0.20
+     # via gradio
+ pytz==2025.2
+     # via pandas
+ pyyaml==6.0.2
+     # via
+     #   accelerate
+     #   datasets
+     #   gradio
+     #   huggingface-hub
+     #   ir-datasets
+     #   llama-index-core
+     #   peft
+     #   pyaml
+     #   transformers
+ regex==2024.11.6
+     # via
+     #   nltk
+     #   tiktoken
+     #   transformers
+ requests==2.32.3
+     # via
+     #   arxiv
+     #   datasets
+     #   huggingface-hub
+     #   inscriptis
+     #   ir-datasets
+     #   llama-index-core
+     #   smolagents
+     #   tiktoken
+     #   transformers
+ retrying==1.3.4
+     # via llama-hub
+ rich==14.0.0
+     # via
+     #   smolagents
+     #   typer
+ ruff==0.11.13
+     # via gradio
+ safehttpx==0.1.6
+     # via gradio
+ safetensors==0.5.3
+     # via
+     #   accelerate
+     #   peft
+     #   transformers
+ scikit-learn==1.7.0
+     # via sentence-transformers
+ scipy==1.15.3
+     # via
+     #   scikit-learn
+     #   sentence-transformers
+ semantic-version==2.10.0
+     # via gradio
+ sentence-transformers==4.1.0
+     # via
+     #   flagembedding
+     #   llama-index-embeddings-huggingface
+ sentencepiece==0.2.0
+     # via flagembedding
+ setuptools==80.9.0
+     # via pymilvus
+ sgmllib3k==1.0.0
+     # via feedparser
+ shellingham==1.5.4
+     # via typer
+ six==1.17.0
+     # via
+     #   python-dateutil
+     #   retrying
+ smolagents==1.17.0
+     # via agent-hackathon (pyproject.toml)
+ sniffio==1.3.1
+     # via
+     #   anyio
+     #   openai
+ soupsieve==2.7
+     # via beautifulsoup4
+ sqlalchemy==2.0.41
+     # via llama-index-core
+ starlette==0.46.2
+     # via
+     #   fastapi
+     #   gradio
+ striprtf==0.0.26
+     # via llama-index-readers-file
+ sympy==1.13.1
+     # via torch
+ tenacity==9.1.2
+     # via llama-index-core
+ threadpoolctl==3.6.0
+     # via scikit-learn
+ tiktoken==0.9.0
+     # via llama-index-core
+ tokenizers==0.21.1
+     # via transformers
+ tomlkit==0.13.3
+     # via gradio
+ torch==2.6.0
+     # via
+     #   accelerate
+     #   flagembedding
+     #   llama-index-llms-huggingface
+     #   peft
+     #   sentence-transformers
+     #   transformers
+ tqdm==4.67.1
+     # via
+     #   datasets
+     #   huggingface-hub
+     #   ir-datasets
+     #   llama-index-core
+     #   milvus-lite
+     #   nltk
+     #   openai
+     #   peft
+     #   sentence-transformers
+     #   transformers
+ transformers==4.52.4
+     # via
+     #   flagembedding
+     #   llama-index-llms-huggingface
+     #   llama-index-llms-openai-like
+     #   peft
+     #   sentence-transformers
+ trec-car-tools==2.6
+     # via ir-datasets
+ typer==0.16.0
+     # via gradio
+ typing-extensions==4.14.0
+     # via
+     #   aiosqlite
+     #   anyio
+     #   beautifulsoup4
+     #   fastapi
+     #   gradio
+     #   gradio-client
+     #   huggingface-hub
+     #   llama-index-core
+     #   openai
+     #   pydantic
+     #   pydantic-core
+     #   pyprojroot
+     #   sentence-transformers
+     #   sqlalchemy
+     #   torch
+     #   typer
+     #   typing-inspect
+     #   typing-inspection
+ typing-inspect==0.9.0
+     # via
+     #   dataclasses-json
+     #   llama-index-core
+ typing-inspection==0.4.1
+     # via pydantic
+ tzdata==2025.2
+     # via pandas
+ ujson==5.10.0
+     # via pymilvus
+ unlzw3==0.2.3
+     # via ir-datasets
+ urllib3==2.4.0
+     # via requests
+ uvicorn==0.34.3
+     # via gradio
+ warc3-wet==0.2.5
+     # via ir-datasets
+ warc3-wet-clueweb09==0.2.5
+     # via ir-datasets
+ wcwidth==0.2.13
+     # via prompt-toolkit
+ websockets==15.0.1
+     # via gradio-client
+ wrapt==1.17.2
+     # via
+     #   deprecated
+     #   llama-index-core
+ xxhash==3.5.0
+     # via datasets
+ yarl==1.20.0
+     # via aiohttp
+ zlib-state==0.1.9
+     # via ir-datasets
src/agent_hackathon/__init__.py ADDED
@@ -0,0 +1,2 @@
+ def hello() -> str:
+     return "Hello from agent-hackathon!"
src/agent_hackathon/consts.py ADDED
@@ -0,0 +1,3 @@
+ from pyprojroot import find_root, has_file
+
+ PROJECT_ROOT_DIR = find_root(criterion=has_file(file="README.md"))
src/agent_hackathon/create_vector_db.py ADDED
@@ -0,0 +1,149 @@
+ import json
+ from copy import deepcopy
+
+ from dotenv import find_dotenv, load_dotenv
+ from llama_index.core import StorageContext, VectorStoreIndex
+ from llama_index.core.node_parser import SentenceSplitter
+ from llama_index.core.schema import Document
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
+ from llama_index.vector_stores.milvus import MilvusVectorStore
+ from llama_index.vector_stores.milvus.utils import BGEM3SparseEmbeddingFunction
+
+ from src.agent_hackathon.consts import PROJECT_ROOT_DIR
+ from src.agent_hackathon.logger import get_logger
+
+ logger = get_logger(log_name="create_vector_db", log_dir=PROJECT_ROOT_DIR / "logs")
+
+
+ class VectorDBCreator:
+     """Handles creation of a Milvus vector database from arXiv data."""
+
+     def __init__(
+         self,
+         data_path: str,
+         db_uri: str,
+         embedding_model: str = "Qwen/Qwen3-Embedding-0.6B",
+         chunk_size: int = 20_000,
+         chunk_overlap: int = 0,
+         vector_dim: int = 1024,
+         insert_batch_size: int = 8192,
+     ) -> None:
+         """
+         Initialize the VectorDBCreator.
+
+         Args:
+             data_path: Path to the JSON data file.
+             db_uri: URI for the Milvus database.
+             embedding_model: Name of the embedding model.
+             chunk_size: Size of text chunks for splitting.
+             chunk_overlap: Overlap between text chunks.
+             vector_dim: Dimension of the embedding vectors.
+             insert_batch_size: Batch size for insertion.
+         """
+         self.data_path = data_path
+         self.db_uri = db_uri
+         self.embedding_model = embedding_model
+         self.chunk_size = chunk_size
+         self.chunk_overlap = chunk_overlap
+         self.vector_dim = vector_dim
+         self.insert_batch_size = insert_batch_size
+         self.embed_model = HuggingFaceEmbedding(
+             model_name=self.embedding_model, device="cpu"
+         )
+         self.sent_splitter = SentenceSplitter(
+             chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap
+         )
+         logger.info("VectorDBCreator initialized.")
+
+     def load_data(self) -> list[dict]:
+         """
+         Load and return data from the JSON file.
+
+         Returns:
+             List of dictionaries containing arXiv data.
+         """
+         logger.info(f"Loading data from {self.data_path}")
+         with open(file=self.data_path) as f:
+             data = json.load(fp=f)
+         logger.info("Data loaded successfully.")
+         return deepcopy(x=data)
+
+     def prepare_documents(self, data: list[dict]) -> list[Document]:
+         """
+         Convert raw data into a list of Document objects.
+
+         Args:
+             data: List of dictionaries with arXiv data.
+
+         Returns:
+             List of Document objects.
+         """
+         logger.info("Preparing documents from data.")
+         docs = [Document(text=d.pop("abstract"), metadata=d) for d in data]
+         logger.info(f"Prepared {len(docs)} documents.")
+         return docs
+
+     def create_vector_store(self) -> MilvusVectorStore:
+         """
+         Create and return a MilvusVectorStore instance.
+
+         Returns:
+             Configured MilvusVectorStore.
+         """
+         logger.info(f"Creating MilvusVectorStore at {self.db_uri}")
+         store = MilvusVectorStore(
+             uri=self.db_uri,
+             dim=self.vector_dim,
+             enable_sparse=True,
+             sparse_embedding_function=BGEM3SparseEmbeddingFunction(),
+         )
+         logger.info("MilvusVectorStore created.")
+         return store
+
+     def build_index(
+         self, docs_list: list[Document], vector_store: MilvusVectorStore
+     ) -> VectorStoreIndex:
+         """
+         Build and return a VectorStoreIndex from documents.
+
+         Args:
+             docs_list: List of Document objects.
+             vector_store: MilvusVectorStore instance.
+
+         Returns:
+             VectorStoreIndex object.
+         """
+         logger.info("Building VectorStoreIndex.")
+         storage_context = StorageContext.from_defaults(vector_store=vector_store)
+         index = VectorStoreIndex.from_documents(
+             documents=docs_list,
+             storage_context=storage_context,
+             embed_model=self.embed_model,
+             transformations=[self.sent_splitter],
+             show_progress=True,
+             insert_batch_size=self.insert_batch_size,
+         )
+         logger.info("VectorStoreIndex built.")
+         return index
+
+     def run(self) -> None:
+         """
+         Execute the full pipeline: load data, prepare documents, create vector store, and build index.
+         """
+         logger.info("Running full vector DB creation pipeline.")
+         data = self.load_data()
+         docs_list = self.prepare_documents(data=data)
+         vector_store = self.create_vector_store()
+         self.build_index(docs_list=docs_list, vector_store=vector_store)
+         logger.info("Pipeline finished.")
+
+
+ # if __name__ == "__main__":
+ #     logger.info("Script started.")
+ #     # Optionally load environment variables if needed
+ #     _ = load_dotenv(dotenv_path=find_dotenv(raise_error_if_not_found=True))
+ #     creator = VectorDBCreator(
+ #         data_path=f"{PROJECT_ROOT_DIR}/data/cs_data_arxiv.json", db_uri="arxiv_docs.db"
+ #     )
+ #     creator.run()
+ #     logger.info("Script finished.")
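`prepare_documents()` moves each record's `abstract` into the node text via `dict.pop`, so everything left in the dict becomes queryable metadata. A tiny illustration with a hypothetical record:

```python
# Hypothetical record; mirrors prepare_documents() one element at a time.
from llama_index.core.schema import Document

record = {"title": "An Example Paper", "abstract": "We study...", "year": 2024}
doc = Document(text=record.pop("abstract"), metadata=record)

print(doc.text)      # "We study..."
print(doc.metadata)  # {"title": "An Example Paper", "year": 2024}
```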
src/agent_hackathon/generate_arxiv_responses.py ADDED
@@ -0,0 +1,108 @@
+ import json
+ from pathlib import Path
+ from typing import Any
+
+ from huggingface_hub import InferenceClient
+
+ from src.agent_hackathon.consts import PROJECT_ROOT_DIR
+ from src.agent_hackathon.create_vector_db import VectorDBCreator
+ from src.agent_hackathon.logger import get_logger
+ from src.agent_hackathon.query_vector_db import RetrieverEngineBuilder
+
+ logger = get_logger(log_name="arxiv_responses", log_dir=PROJECT_ROOT_DIR / "logs")
+
+
+ class ArxivResponseGenerator:
+     """
+     Handles retrieval and formatting of arXiv papers using a vector database and LLM.
+     """
+
+     def __init__(self, vector_store_path: Path) -> None:
+         """Initializes the ArxivResponseGenerator."""
+         self.vector_store_path = vector_store_path
+         self.client = self._initialise_client()
+         self.retriever = self._initialise_retriever()
+         logger.info("ArxivResponseGenerator initialized.")
+
+     def _initialise_retriever(self) -> Any:
+         """
+         Initializes and returns a retriever engine.
+
+         Returns:
+             Any: Retriever engine object.
+         """
+         logger.info("Initializing retriever engine.")
+         # data_path is never read on this path: only create_vector_store() is
+         # called, which connects to the existing Milvus database at db_uri.
+         vector_db_creator = VectorDBCreator(
+             data_path=..., db_uri=self.vector_store_path.as_posix()
+         )
+         vector_store = vector_db_creator.create_vector_store()
+         retriever_class = RetrieverEngineBuilder(
+             vector_store=vector_store,
+         )
+         retriever = retriever_class.build_retriever_engine()
+         logger.info("Retriever engine initialized.")
+         return retriever
+
+     def _initialise_client(self) -> InferenceClient:
+         """
+         Initializes and returns an InferenceClient.
+
+         Returns:
+             InferenceClient: HuggingFace InferenceClient instance.
+         """
+         logger.info("Initializing InferenceClient.")
+         client = InferenceClient(
+             provider="auto",
+             # bill_to="VitalNest",
+         )
+         logger.info("InferenceClient initialized.")
+         return client
+
+     def retrieve_arxiv_papers(self, query: str) -> str:
+         """
+         Retrieves and formats arXiv papers for a given query.
+
+         Args:
+             query (str): The search query.
+
+         Returns:
+             str: Formatted response from the LLM.
+         """
+         logger.info(f"Retrieving arXiv papers for query: {query}")
+
+         try:
+             retrieved_content = json.dumps(
+                 obj=[(i.get_content(), i.metadata) for i in self.retriever.retrieve(query)]
+             )
+             logger.info("Retrieved content from vector DB.")
+         except Exception as err:
+             logger.error(f"Error retrieving from vector DB: {err}")
+             raise
+         completion = self.client.chat.completions.create(
+             model="meta-llama/Llama-4-Scout-17B-16E-Instruct",
+             temperature=0.1,
+             messages=[
+                 {
+                     "role": "user",
+                     "content": [
+                         {
+                             "type": "text",
+                             "text": f"Format the following output neatly: {retrieved_content}. Return only the output.",
+                         },
+                     ],
+                 }
+             ],
+         )
+         logger.info("Received completion from LLM.")
+         return completion.choices[0].message.content
+
+
+ # if __name__ == "__main__":
+ #     logger.info("Script started.")
+ #     generator = ArxivResponseGenerator(
+ #         vector_store_path=PROJECT_ROOT_DIR / "db/arxiv_docs.db"
+ #     )
+ #     query = "deep learning for NLP"  # Example query, replace as needed
+ #     result = generator.retrieve_arxiv_papers(query=query)
+ #     print(result)
+ #     logger.info("Script finished.")
src/agent_hackathon/logger.py ADDED
@@ -0,0 +1,45 @@
+ import logging
+ from datetime import datetime
+ from pathlib import Path
+
+ from rich.logging import RichHandler
+
+
+ def get_logger(log_name: str, log_dir: Path) -> logging.Logger:
+     """
+     Returns a logger with RichHandler and file handler.
+
+     Args:
+         log_name (str): Name prefix for the log file.
+         log_dir (Path): Directory to store log files.
+
+     Returns:
+         logging.Logger: Configured logger instance.
+     """
+     log_dir.mkdir(parents=True, exist_ok=True)
+     date_str = datetime.now().strftime(format="%m_%d_%Y")
+     log_file = log_dir / f"{log_name}_{date_str}.log"
+
+     logger = logging.getLogger(name=log_name)
+     logger.setLevel(level=logging.INFO)
+     logger.handlers.clear()
+
+     # Rich console handler
+     rich_handler = RichHandler(
+         rich_tracebacks=True, show_time=True, show_level=True, show_path=True
+     )
+     rich_handler.setLevel(level=logging.INFO)
+
+     # File handler
+     file_handler = logging.FileHandler(filename=log_file, encoding="utf-8")
+     file_handler.setLevel(level=logging.INFO)
+     formatter = logging.Formatter(
+         fmt="%(asctime)s | %(levelname)s | %(name)s | %(message)s"
+     )
+     file_handler.setFormatter(formatter)
+
+     logger.addHandler(rich_handler)
+     logger.addHandler(file_handler)
+     logger.propagate = False
+
+     return logger
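A minimal usage sketch (the `log_name` and `log_dir` values are illustrative). Because `get_logger` clears existing handlers before adding new ones, calling it repeatedly with the same name does not duplicate log lines:

```python
from pathlib import Path

from src.agent_hackathon.logger import get_logger

log = get_logger(log_name="demo", log_dir=Path("logs"))
log.info("goes to the console (RichHandler) and to logs/demo_<MM_DD_YYYY>.log")
log = get_logger(log_name="demo", log_dir=Path("logs"))  # safe: no duplicate handlers
```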
src/agent_hackathon/multiagent.py ADDED
@@ -0,0 +1,146 @@
+ # import asyncio
+ from datetime import date
+
+ import nest_asyncio
+ from llama_index.core.agent.workflow import AgentWorkflow, ReActAgent
+ from llama_index.core.tools import FunctionTool
+ from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI
+ from llama_index.tools.duckduckgo import DuckDuckGoSearchToolSpec
+
+ from src.agent_hackathon.consts import PROJECT_ROOT_DIR
+
+ # from dotenv import find_dotenv, load_dotenv
+ from src.agent_hackathon.generate_arxiv_responses import ArxivResponseGenerator
+ from src.agent_hackathon.logger import get_logger
+
+ nest_asyncio.apply()
+
+ # _ = load_dotenv(dotenv_path=find_dotenv(raise_error_if_not_found=False), override=True)
+
+ logger = get_logger(log_name="multiagent", log_dir=PROJECT_ROOT_DIR / "logs")
+
+
+ class MultiAgentWorkflow:
+     """Multi-agent workflow for retrieving research papers and related events."""
+
+     def __init__(self) -> None:
+         """Initialize the workflow with LLM, tools, and generator."""
+         logger.info("Initializing MultiAgentWorkflow.")
+         self.llm = HuggingFaceInferenceAPI(
+             model="meta-llama/Llama-3.3-70B-Instruct",
+             provider="auto",
+             # provider="nebius",
+             temperature=0.1,
+             top_p=0.95,
+             # api_key=os.getenv(key="NEBIUS_API_KEY"),
+             # base_url="https://api.studio.nebius.com/v1/",
+             system_prompt="Don't just plan, but execute the plan until failure.",
+         )
+         self._generator = ArxivResponseGenerator(
+             vector_store_path=PROJECT_ROOT_DIR / "db/arxiv_docs.db"
+         )
+         self._arxiv_rag_tool = FunctionTool.from_defaults(
+             fn=self._arxiv_rag,
+             name="arxiv_rag",
+             description="Retrieves arxiv research papers.",
+             return_direct=False,
+         )
+         self._duckduckgo_search_tool = [
+             tool
+             for tool in DuckDuckGoSearchToolSpec().to_tool_list()
+             if tool.metadata.name == "duckduckgo_full_search"
+         ]
+         self._arxiv_agent = ReActAgent(
+             name="arxiv_agent",
+             description="Retrieves information about arxiv research papers",
+             system_prompt="You are an arXiv research paper agent who retrieves information "
+             "about arxiv research papers.",
+             tools=[self._arxiv_rag_tool],
+             llm=self.llm,
+         )
+         self._websearch_agent = ReActAgent(
+             name="web_search",
+             description="Searches the web",
+             system_prompt="You are a search engine that searches the web using the duckduckgo tool",
+             tools=self._duckduckgo_search_tool,
+             llm=self.llm,
+         )
+
+         self._workflow = AgentWorkflow(
+             agents=[self._arxiv_agent, self._websearch_agent],
+             root_agent="arxiv_agent",
+             timeout=180,
+         )
+         # AgentWorkflow.from_tools_or_functions(
+         #     tools_or_functions=self._duckduckgo_search_tool,
+         #     llm=self.llm,
+         #     system_prompt="You are an expert that "
+         #     "searches for any corresponding events related to the "
+         #     "user query "
+         #     "using the duckduckgo_search_tool and returns the final results."
+         #     "Don't return the steps but execute the necessary tools that you have "
+         #     "access to and return the results.",
+         #     timeout=180,
+         # )
+
+         logger.info("MultiAgentWorkflow initialized.")
+
+     def _arxiv_rag(self, query: str) -> str:
+         """Retrieve research papers from arXiv based on the query.
+
+         Args:
+             query (str): The search query.
+
+         Returns:
+             str: Retrieved research papers as a string.
+         """
+         return self._generator.retrieve_arxiv_papers(query=query)
+
+     def _clean_response(self, result: str) -> str:
+         """Removes the think tags.
+
+         Args:
+             result (str): The result with the <think></think> content.
+
+         Returns:
+             str: The result without the <think></think> content.
+         """
+         # str.find() returns -1 when the tag is absent, and -1 is truthy,
+         # so compare against -1 explicitly instead of relying on truthiness.
+         if result.find("</think>") != -1:
+             result = result[result.find("</think>") + len("</think>") :]
+         return result
+
+     async def run(self, user_query: str) -> str:
+         """Run the multi-agent workflow for a given user query.
+
+         Args:
+             user_query (str): The user's search query.
+
+         Returns:
+             str: The output string.
+         """
+         logger.info("Running multi-agent workflow.")
+         try:
+             user_msg = (
+                 f"First, give me arxiv research papers about: {user_query}. "
+                 f"Then search with the web search agent for any events related to: {user_query}.\n"
+                 f"The web search results should be relevant to the current year: {date.today().year}. "
+                 "Return all the content from all the agents."
+             )
+             results = await self._workflow.run(user_msg=user_msg)
+             logger.info("Workflow run completed successfully.")
+             return results
+         except Exception as err:
+             logger.error(f"Workflow run failed: {err}")
+             raise
+
+
+ # if __name__ == "__main__":
+ #     USER_QUERY = "i want to learn more about nlp"
+ #     workflow = MultiAgentWorkflow()
+ #     logger.info("Starting workflow for user query.")
+ #     try:
+ #         result = asyncio.run(workflow.run(user_query=USER_QUERY))
+ #         logger.info("Workflow finished. Output below:")
+ #         print(result)
+ #     except Exception as err:
+ #         logger.error(f"Error during workflow execution: {err}")
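The explicit `!= -1` check in `_clean_response` matters because `str.find` signals absence with `-1`, which is truthy; a bare truthiness check would slice `len("</think>") - 1 = 7` characters off every tag-free response. A standalone sketch of the fixed logic:

```python
# Standalone version of the fixed _clean_response logic.
def clean_response(result: str) -> str:
    tag = "</think>"
    idx = result.find(tag)
    if idx != -1:  # str.find returns -1 (truthy!) when the tag is absent
        result = result[idx + len(tag):]
    return result

assert clean_response("<think>plan steps</think>final answer") == "final answer"
assert clean_response("no think tags here") == "no think tags here"
```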
src/agent_hackathon/py.typed ADDED
File without changes
src/agent_hackathon/query_vector_db.py ADDED
@@ -0,0 +1,79 @@
+ import os
+ from typing import Any
+
+ from dotenv import find_dotenv, load_dotenv
+ from huggingface_hub import login
+ from llama_index.core import VectorStoreIndex
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
+ from llama_index.vector_stores.milvus import MilvusVectorStore
+
+ from src.agent_hackathon.consts import PROJECT_ROOT_DIR
+ from src.agent_hackathon.logger import get_logger
+
+ logger = get_logger(log_name="query_vector_db", log_dir=PROJECT_ROOT_DIR / "logs")
+
+
+ class RetrieverEngineBuilder:
+     """
+     Handles the creation of a retriever engine for a vector database using HuggingFace and LlamaIndex.
+     """
+
+     def __init__(
+         self,
+         hf_token_env: str = "HF_TOKEN",
+         embedding_model: str = "Qwen/Qwen3-Embedding-0.6B",
+         vector_store: MilvusVectorStore | None = None,
+         device: str = "cpu",
+     ) -> None:
+         """
+         Initialize the RetrieverEngineBuilder.
+
+         Args:
+             hf_token_env: Environment variable name for HuggingFace token.
+             embedding_model: Name of the embedding model.
+             vector_store: An instance of MilvusVectorStore.
+             device: Device to run the embedding model on.
+         """
+         self.hf_token_env = hf_token_env
+         self.embedding_model = embedding_model
+         self.vector_store = vector_store
+         self.device = device
+
+         logger.info("Initializing RetrieverEngineBuilder.")
+         # self._login_huggingface()
+         # self._load_env()
+
+         self.embed_model = HuggingFaceEmbedding(
+             model_name=self.embedding_model, device=self.device
+         )
+         logger.info("RetrieverEngineBuilder initialized.")
+
+     def _login_huggingface(self) -> None:
+         """Login to HuggingFace using the token from environment variable."""
+         logger.info("Logging in to HuggingFace.")
+         login(token=os.getenv(key=self.hf_token_env))
+         logger.info("Logged in to HuggingFace.")
+
+     def _load_env(self) -> None:
+         """Load environment variables from .env file."""
+         logger.info("Loading environment variables.")
+         _ = load_dotenv(dotenv_path=find_dotenv(raise_error_if_not_found=False))
+         logger.info("Environment variables loaded.")
+
+     def build_retriever_engine(self) -> Any:
+         """
+         Build and return the retriever engine.
+
+         Returns:
+             Retriever engine object.
+         """
+         logger.info("Building retriever engine.")
+         index = VectorStoreIndex.from_vector_store(
+             vector_store=self.vector_store, embed_model=self.embed_model
+         )
+         retriever = index.as_retriever(
+             vector_store_query_mode="hybrid",
+             similarity_top_k=5,
+         )
+         logger.info("Retriever engine built.")
+         return retriever
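How the pieces fit together end to end: `VectorDBCreator.create_vector_store()` opens the existing Milvus Lite file, `RetrieverEngineBuilder` wraps it in a hybrid (dense + sparse) retriever, and `retrieve()` returns scored nodes. A sketch mirroring `ArxivResponseGenerator._initialise_retriever`, assuming the preloaded `db/arxiv_docs.db` is present; the Qwen embedding and BGE-M3 sparse models download on first use:

```python
# A sketch, not part of the commit: wiring the store and retriever together.
from src.agent_hackathon.create_vector_db import VectorDBCreator
from src.agent_hackathon.query_vector_db import RetrieverEngineBuilder

# data_path is unused when only opening an existing store (mirrors the source).
store = VectorDBCreator(data_path=..., db_uri="db/arxiv_docs.db").create_vector_store()
retriever = RetrieverEngineBuilder(vector_store=store).build_retriever_engine()

for node in retriever.retrieve("reinforcement learning in robotics"):
    print(node.score, node.metadata.get("title"))
```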