shamik
committed on
feat: adding project files.
- .gitignore +194 -0
- README.md +13 -5
- app.py +78 -0
- requirements.txt +587 -0
- src/agent_hackathon/__init__.py +2 -0
- src/agent_hackathon/consts.py +3 -0
- src/agent_hackathon/create_vector_db.py +149 -0
- src/agent_hackathon/generate_arxiv_responses.py +108 -0
- src/agent_hackathon/logger.py +45 -0
- src/agent_hackathon/multiagent.py +146 -0
- src/agent_hackathon/py.typed +0 -0
- src/agent_hackathon/query_vector_db.py +79 -0
.gitignore
ADDED
@@ -0,0 +1,194 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# UV
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
#uv.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

# Abstra
# Abstra is an AI-powered process automation framework.
# Ignore directories containing user credentials, local state, and settings.
# Learn more at https://abstra.io/docs
.abstra/

# Visual Studio Code
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
# and can be added to the global gitignore or merged into this file. However, if you prefer,
# you could uncomment the following to ignore the entire vscode folder
# .vscode/

# Ruff stuff:
.ruff_cache/

# PyPI configuration file
.pypirc

# Cursor
# Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
# exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
# refer to https://docs.cursor.com/context/ignore-files
.cursorignore
.cursorindexingignore
README.md
CHANGED
@@ -1,12 +1,20 @@
 ---
-title: Ml Research
-emoji:
-colorFrom:
-colorTo:
+title: Ml Research Assistant And Tutor
+emoji: 👁
+colorFrom: blue
+colorTo: purple
 sdk: gradio
 sdk_version: 5.33.1
 app_file: app.py
+tags: [agent-demo-track]
 pinned: false
+license: mit
+short_description: Agentic system for ML research and tutoring
+python_version: 3.11.6
+preload_from_hub:
+- Shamik/arxiv_cs_2020_07_2025 arxiv_docs.db
+
 ---
 
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+Check out the configuration reference at <https://huggingface.co/docs/hub/spaces-config-reference>
+---
app.py
ADDED
@@ -0,0 +1,78 @@
import asyncio

import gradio as gr
import nest_asyncio
from huggingface_hub import login

from src.agent_hackathon.consts import PROJECT_ROOT_DIR
from src.agent_hackathon.logger import get_logger
from src.agent_hackathon.multiagent import MultiAgentWorkflow

nest_asyncio.apply()

logger = get_logger(log_name="multiagent", log_dir=PROJECT_ROOT_DIR / "logs")

PRIMARY_HEADING = """# ML Topics Deep Research"""
SECONDARY_HEADING = """### This multi agent framework queries a DB containing arxiv ML research papers from Jan 2020 - Jun 6th 2025 for select categories, and finds events/conferences related to the user's query.

For more details on the filtered arxiv ds refer [here](https://huggingface.co/datasets/Shamik/arxiv_cs_2020_07_2025)
"""
workflow = MultiAgentWorkflow()

_login_done = False


def run(
    query: str, api_key: str, chat_history: list[dict[str, str | None]]
) -> tuple[str, list[dict[str, str | None]]] | None:
    global _login_done
    if not api_key or not api_key.startswith("hf"):
        raise ValueError("Incorrect HuggingFace Inference API Key")
    if not _login_done:
        login(token=api_key)
        _login_done = True
    try:
        result = asyncio.run(workflow.run(user_query=query))
        chat_history.append({"role": "user", "content": query})
        chat_history.append({"role": "assistant", "content": result})
        return "", chat_history
    except Exception as err:
        logger.error(f"Error during workflow execution: {err}")
        return None


with gr.Blocks(fill_height=True) as demo:
    gr.Markdown(value=PRIMARY_HEADING)
    gr.Markdown(value=SECONDARY_HEADING)
    gr.Markdown(
        value="""<span style="color:red"> Please use a 🤗 Inference API Key </span>"""
    )
    api_key = gr.Textbox(
        placeholder="Enter your HuggingFace Inference API KEY HERE",
        label="🤗 Inference API Key",
        show_label=True,
        type="password",
    )
    chatbot = gr.Chatbot(
        type="messages", label="DeepResearch", show_label=True, height=500,
        show_copy_all_button=True, show_copy_button=True
    )
    msg = gr.Textbox(
        placeholder="Type your message here and press enter...",
        show_label=True,
        label="Input",
        submit_btn=True,
        stop_btn=True,
    )
    clear = gr.ClearButton(components=[msg, chatbot])
    msg.submit(fn=run, inputs=[msg, api_key, chatbot], outputs=[msg, chatbot])

demo.queue(max_size=1).launch(share=False)

# if __name__ == "__main__":
#     demo.queue(max_size=1).launch(share=False)


# example queries
# tell me about reinforcement learning in robotics
# give me event details on reinforcement learning & robotics
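A minimal sketch of how the run() callback above behaves when called directly, outside Gradio; the token string and the example query are placeholders, not values from this commit:

# Hypothetical direct call of app.py's run() callback.
history: list[dict[str, str | None]] = []
cleared_box, history = run(
    query="tell me about reinforcement learning in robotics",
    api_key="hf_xxx",  # placeholder; a real HF Inference API key is required
    chat_history=history,
)
# cleared_box is "" (it resets the textbox) and history now ends with the
# user turn plus the assistant answer produced by MultiAgentWorkflow.run().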
requirements.txt
ADDED
@@ -0,0 +1,587 @@
# This file was autogenerated by uv via the following command:
#    uv pip compile pyproject.toml -o requirements.txt
accelerate==1.7.0
    # via
    #   flagembedding
    #   peft
    #   transformers
aiofiles==24.1.0
    # via gradio
aiohappyeyeballs==2.6.1
    # via aiohttp
aiohttp==3.12.11
    # via
    #   fsspec
    #   huggingface-hub
    #   llama-index-core
aiosignal==1.3.2
    # via aiohttp
aiosqlite==0.21.0
    # via llama-index-core
annotated-types==0.7.0
    # via pydantic
anyio==4.9.0
    # via
    #   gradio
    #   httpx
    #   openai
    #   starlette
arxiv==2.2.0
    # via agent-hackathon (pyproject.toml)
attrs==25.3.0
    # via aiohttp
banks==2.1.2
    # via llama-index-core
beautifulsoup4==4.13.4
    # via
    #   ir-datasets
    #   llama-index-readers-file
cbor==1.0.0
    # via trec-car-tools
certifi==2025.4.26
    # via
    #   httpcore
    #   httpx
    #   llama-cloud
    #   requests
charset-normalizer==3.4.2
    # via requests
click==8.2.1
    # via
    #   duckduckgo-search
    #   llama-cloud-services
    #   nltk
    #   typer
    #   uvicorn
colorama==0.4.6
    # via griffe
dataclasses-json==0.6.7
    # via llama-index-core
datasets==3.6.0
    # via flagembedding
deprecated==1.2.18
    # via
    #   banks
    #   llama-index-core
dill==0.3.8
    # via
    #   datasets
    #   multiprocess
dirtyjson==1.0.8
    # via llama-index-core
distro==1.9.0
    # via openai
duckduckgo-search==6.4.2
    # via llama-index-tools-duckduckgo
fastapi==0.115.12
    # via gradio
feedparser==6.0.11
    # via arxiv
ffmpy==0.6.0
    # via gradio
filelock==3.18.0
    # via
    #   datasets
    #   huggingface-hub
    #   torch
    #   transformers
filetype==1.2.0
    # via llama-index-core
flagembedding==1.3.5
    # via agent-hackathon (pyproject.toml)
frozenlist==1.6.2
    # via
    #   aiohttp
    #   aiosignal
fsspec==2025.3.0
    # via
    #   datasets
    #   gradio-client
    #   huggingface-hub
    #   llama-index-core
    #   torch
gradio==5.33.1
    # via agent-hackathon (pyproject.toml)
gradio-client==1.10.3
    # via gradio
greenlet==3.2.3
    # via sqlalchemy
griffe==1.7.3
    # via banks
groovy==0.1.2
    # via gradio
grpcio==1.67.1
    # via pymilvus
h11==0.16.0
    # via
    #   httpcore
    #   uvicorn
hf-xet==1.1.3
    # via huggingface-hub
html2text==2025.4.15
    # via llama-hub
httpcore==1.0.9
    # via httpx
httpx==0.28.1
    # via
    #   agent-hackathon (pyproject.toml)
    #   gradio
    #   gradio-client
    #   llama-cloud
    #   llama-index-core
    #   openai
    #   safehttpx
huggingface-hub==0.32.4
    # via
    #   agent-hackathon (pyproject.toml)
    #   accelerate
    #   datasets
    #   gradio
    #   gradio-client
    #   llama-index-embeddings-huggingface
    #   llama-index-embeddings-huggingface-api
    #   llama-index-llms-huggingface-api
    #   llama-index-utils-huggingface
    #   peft
    #   sentence-transformers
    #   smolagents
    #   tokenizers
    #   transformers
idna==3.10
    # via
    #   anyio
    #   httpx
    #   requests
    #   yarl
ijson==3.4.0
    # via ir-datasets
inquirerpy==0.3.4
    # via huggingface-hub
inscriptis==2.6.0
    # via ir-datasets
ir-datasets==0.5.10
    # via flagembedding
jinja2==3.1.6
    # via
    #   banks
    #   gradio
    #   smolagents
    #   torch
jiter==0.10.0
    # via openai
joblib==1.5.1
    # via
    #   nltk
    #   scikit-learn
llama-cloud==0.1.23
    # via
    #   llama-cloud-services
    #   llama-index-indices-managed-llama-cloud
llama-cloud-services==0.6.30
    # via llama-parse
llama-hub==0.0.79.post1
    # via agent-hackathon (pyproject.toml)
llama-index==0.12.41
    # via llama-hub
llama-index-agent-openai==0.4.9
    # via
    #   llama-index
    #   llama-index-program-openai
llama-index-cli==0.4.3
    # via llama-index
llama-index-core==0.12.41
    # via
    #   llama-cloud-services
    #   llama-index
    #   llama-index-agent-openai
    #   llama-index-cli
    #   llama-index-embeddings-huggingface
    #   llama-index-embeddings-huggingface-api
    #   llama-index-embeddings-openai
    #   llama-index-indices-managed-llama-cloud
    #   llama-index-llms-huggingface
    #   llama-index-llms-huggingface-api
    #   llama-index-llms-nebius
    #   llama-index-llms-openai
    #   llama-index-llms-openai-like
    #   llama-index-multi-modal-llms-openai
    #   llama-index-program-openai
    #   llama-index-question-gen-openai
    #   llama-index-readers-file
    #   llama-index-readers-llama-parse
    #   llama-index-tools-duckduckgo
    #   llama-index-utils-huggingface
    #   llama-index-vector-stores-milvus
llama-index-embeddings-huggingface==0.5.4
    # via agent-hackathon (pyproject.toml)
llama-index-embeddings-huggingface-api==0.3.1
    # via agent-hackathon (pyproject.toml)
llama-index-embeddings-openai==0.3.1
    # via
    #   llama-index
    #   llama-index-cli
llama-index-indices-managed-llama-cloud==0.7.4
    # via llama-index
llama-index-llms-huggingface==0.5.0
    # via agent-hackathon (pyproject.toml)
llama-index-llms-huggingface-api==0.5.0
    # via agent-hackathon (pyproject.toml)
llama-index-llms-nebius==0.1.2
    # via agent-hackathon (pyproject.toml)
llama-index-llms-openai==0.4.4
    # via
    #   llama-index
    #   llama-index-agent-openai
    #   llama-index-cli
    #   llama-index-llms-openai-like
    #   llama-index-multi-modal-llms-openai
    #   llama-index-program-openai
    #   llama-index-question-gen-openai
llama-index-llms-openai-like==0.4.0
    # via llama-index-llms-nebius
llama-index-multi-modal-llms-openai==0.5.1
    # via llama-index
llama-index-program-openai==0.3.2
    # via
    #   llama-index
    #   llama-index-question-gen-openai
llama-index-question-gen-openai==0.3.1
    # via llama-index
llama-index-readers-file==0.4.8
    # via llama-index
llama-index-readers-llama-parse==0.4.0
    # via llama-index
llama-index-tools-duckduckgo==0.3.0
    # via agent-hackathon (pyproject.toml)
llama-index-utils-huggingface==0.3.0
    # via llama-index-embeddings-huggingface-api
llama-index-vector-stores-milvus==0.8.4
    # via agent-hackathon (pyproject.toml)
llama-parse==0.6.30
    # via llama-index-readers-llama-parse
lxml==5.4.0
    # via
    #   inscriptis
    #   ir-datasets
lz4==4.4.4
    # via ir-datasets
markdown-it-py==3.0.0
    # via rich
markupsafe==3.0.2
    # via
    #   gradio
    #   jinja2
marshmallow==3.26.1
    # via dataclasses-json
mdurl==0.1.2
    # via markdown-it-py
milvus-lite==2.4.12
    # via pymilvus
mpmath==1.3.0
    # via sympy
multidict==6.4.4
    # via
    #   aiohttp
    #   yarl
multiprocess==0.70.16
    # via datasets
mypy-extensions==1.1.0
    # via typing-inspect
nest-asyncio==1.6.0
    # via llama-index-core
networkx==3.4.2
    # via
    #   llama-index-core
    #   torch
nltk==3.9.1
    # via
    #   llama-index
    #   llama-index-core
numpy==2.2.6
    # via
    #   accelerate
    #   datasets
    #   gradio
    #   ir-datasets
    #   llama-index-core
    #   pandas
    #   peft
    #   scikit-learn
    #   scipy
    #   transformers
    #   trec-car-tools
openai==1.84.0
    # via
    #   agent-hackathon (pyproject.toml)
    #   llama-index-agent-openai
    #   llama-index-embeddings-openai
    #   llama-index-llms-openai
orjson==3.10.18
    # via gradio
packaging==25.0
    # via
    #   accelerate
    #   datasets
    #   gradio
    #   gradio-client
    #   huggingface-hub
    #   marshmallow
    #   peft
    #   transformers
pandas==2.3.0
    # via
    #   datasets
    #   gradio
    #   llama-index-readers-file
    #   pymilvus
peft==0.15.2
    # via flagembedding
pfzy==0.3.4
    # via inquirerpy
pillow==11.2.1
    # via
    #   gradio
    #   llama-index-core
    #   sentence-transformers
    #   smolagents
pip==25.1.1
    # via agent-hackathon (pyproject.toml)
platformdirs==4.3.8
    # via
    #   banks
    #   llama-cloud-services
primp==0.15.0
    # via duckduckgo-search
prompt-toolkit==3.0.51
    # via inquirerpy
propcache==0.3.1
    # via
    #   aiohttp
    #   yarl
protobuf==6.31.1
    # via
    #   flagembedding
    #   pymilvus
psutil==7.0.0
    # via
    #   accelerate
    #   llama-hub
    #   peft
pyaml==23.12.0
    # via llama-hub
pyarrow==20.0.0
    # via
    #   datasets
    #   ir-datasets
pydantic==2.11.5
    # via
    #   banks
    #   fastapi
    #   gradio
    #   llama-cloud
    #   llama-cloud-services
    #   llama-index-core
    #   openai
pydantic-core==2.33.2
    # via pydantic
pydub==0.25.1
    # via gradio
pygments==2.19.1
    # via rich
pymilvus==2.5.10
    # via llama-index-vector-stores-milvus
pypdf==5.6.0
    # via llama-index-readers-file
pyprojroot==0.3.0
    # via agent-hackathon (pyproject.toml)
python-dateutil==2.9.0.post0
    # via pandas
python-dotenv==1.1.0
    # via
    #   agent-hackathon (pyproject.toml)
    #   llama-cloud-services
    #   pymilvus
    #   smolagents
python-multipart==0.0.20
    # via gradio
pytz==2025.2
    # via pandas
pyyaml==6.0.2
    # via
    #   accelerate
    #   datasets
    #   gradio
    #   huggingface-hub
    #   ir-datasets
    #   llama-index-core
    #   peft
    #   pyaml
    #   transformers
regex==2024.11.6
    # via
    #   nltk
    #   tiktoken
    #   transformers
requests==2.32.3
    # via
    #   arxiv
    #   datasets
    #   huggingface-hub
    #   inscriptis
    #   ir-datasets
    #   llama-index-core
    #   smolagents
    #   tiktoken
    #   transformers
retrying==1.3.4
    # via llama-hub
rich==14.0.0
    # via
    #   smolagents
    #   typer
ruff==0.11.13
    # via gradio
safehttpx==0.1.6
    # via gradio
safetensors==0.5.3
    # via
    #   accelerate
    #   peft
    #   transformers
scikit-learn==1.7.0
    # via sentence-transformers
scipy==1.15.3
    # via
    #   scikit-learn
    #   sentence-transformers
semantic-version==2.10.0
    # via gradio
sentence-transformers==4.1.0
    # via
    #   flagembedding
    #   llama-index-embeddings-huggingface
sentencepiece==0.2.0
    # via flagembedding
setuptools==80.9.0
    # via pymilvus
sgmllib3k==1.0.0
    # via feedparser
shellingham==1.5.4
    # via typer
six==1.17.0
    # via
    #   python-dateutil
    #   retrying
smolagents==1.17.0
    # via agent-hackathon (pyproject.toml)
sniffio==1.3.1
    # via
    #   anyio
    #   openai
soupsieve==2.7
    # via beautifulsoup4
sqlalchemy==2.0.41
    # via llama-index-core
starlette==0.46.2
    # via
    #   fastapi
    #   gradio
striprtf==0.0.26
    # via llama-index-readers-file
sympy==1.13.1
    # via torch
tenacity==9.1.2
    # via llama-index-core
threadpoolctl==3.6.0
    # via scikit-learn
tiktoken==0.9.0
    # via llama-index-core
tokenizers==0.21.1
    # via transformers
tomlkit==0.13.3
    # via gradio
torch==2.6.0
    # via
    #   accelerate
    #   flagembedding
    #   llama-index-llms-huggingface
    #   peft
    #   sentence-transformers
    #   transformers
tqdm==4.67.1
    # via
    #   datasets
    #   huggingface-hub
    #   ir-datasets
    #   llama-index-core
    #   milvus-lite
    #   nltk
    #   openai
    #   peft
    #   sentence-transformers
    #   transformers
transformers==4.52.4
    # via
    #   flagembedding
    #   llama-index-llms-huggingface
    #   llama-index-llms-openai-like
    #   peft
    #   sentence-transformers
trec-car-tools==2.6
    # via ir-datasets
typer==0.16.0
    # via gradio
typing-extensions==4.14.0
    # via
    #   aiosqlite
    #   anyio
    #   beautifulsoup4
    #   fastapi
    #   gradio
    #   gradio-client
    #   huggingface-hub
    #   llama-index-core
    #   openai
    #   pydantic
    #   pydantic-core
    #   pyprojroot
    #   sentence-transformers
    #   sqlalchemy
    #   torch
    #   typer
    #   typing-inspect
    #   typing-inspection
typing-inspect==0.9.0
    # via
    #   dataclasses-json
    #   llama-index-core
typing-inspection==0.4.1
    # via pydantic
tzdata==2025.2
    # via pandas
ujson==5.10.0
    # via pymilvus
unlzw3==0.2.3
    # via ir-datasets
urllib3==2.4.0
    # via requests
uvicorn==0.34.3
    # via gradio
warc3-wet==0.2.5
    # via ir-datasets
warc3-wet-clueweb09==0.2.5
    # via ir-datasets
wcwidth==0.2.13
    # via prompt-toolkit
websockets==15.0.1
    # via gradio-client
wrapt==1.17.2
    # via
    #   deprecated
    #   llama-index-core
xxhash==3.5.0
    # via datasets
yarl==1.20.0
    # via aiohttp
zlib-state==0.1.9
    # via ir-datasets
src/agent_hackathon/__init__.py
ADDED
@@ -0,0 +1,2 @@
def hello() -> str:
    return "Hello from agent-hackathon!"
src/agent_hackathon/consts.py
ADDED
@@ -0,0 +1,3 @@
from pyprojroot import find_root, has_file

PROJECT_ROOT_DIR = find_root(criterion=has_file(file="README.md"))
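A small sketch of how this constant is used elsewhere in the commit; pyprojroot walks up from the current directory until it finds a folder containing README.md and returns it as a pathlib.Path, so derived paths work regardless of the working directory:

from src.agent_hackathon.consts import PROJECT_ROOT_DIR

log_dir = PROJECT_ROOT_DIR / "logs"              # used by get_logger() in every module
db_path = PROJECT_ROOT_DIR / "db/arxiv_docs.db"  # Milvus Lite file used by the agents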
src/agent_hackathon/create_vector_db.py
ADDED
@@ -0,0 +1,149 @@
import json
from copy import deepcopy

from dotenv import find_dotenv, load_dotenv
from llama_index.core import StorageContext, VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.schema import Document
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.milvus import MilvusVectorStore
from llama_index.vector_stores.milvus.utils import BGEM3SparseEmbeddingFunction

from src.agent_hackathon.consts import PROJECT_ROOT_DIR
from src.agent_hackathon.logger import get_logger

logger = get_logger(log_name="create_vector_db", log_dir=PROJECT_ROOT_DIR / "logs")


class VectorDBCreator:
    """Handles creation of a Milvus vector database from arXiv data."""

    def __init__(
        self,
        data_path: str,
        db_uri: str,
        embedding_model: str = "Qwen/Qwen3-Embedding-0.6B",
        chunk_size: int = 20_000,
        chunk_overlap: int = 0,
        vector_dim: int = 1024,
        insert_batch_size: int = 8192,
    ) -> None:
        """
        Initialize the VectorDBCreator.

        Args:
            data_path: Path to the JSON data file.
            db_uri: URI for the Milvus database.
            embedding_model: Name of the embedding model.
            chunk_size: Size of text chunks for splitting.
            chunk_overlap: Overlap between text chunks.
            vector_dim: Dimension of the embedding vectors.
            insert_batch_size: Batch size for insertion.
        """
        self.data_path = data_path
        self.db_uri = db_uri
        self.embedding_model = embedding_model
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.vector_dim = vector_dim
        self.insert_batch_size = insert_batch_size
        self.embed_model = HuggingFaceEmbedding(
            model_name=self.embedding_model, device="cpu"
        )
        self.sent_splitter = SentenceSplitter(
            chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap
        )
        logger.info("VectorDBCreator initialized.")

    def load_data(self) -> list[dict]:
        """
        Load and return data from the JSON file.

        Returns:
            List of dictionaries containing arXiv data.
        """
        logger.info(f"Loading data from {self.data_path}")
        with open(file=self.data_path) as f:
            data = json.load(fp=f)
        logger.info("Data loaded successfully.")
        return deepcopy(x=data)

    def prepare_documents(self, data: list[dict]) -> list[Document]:
        """
        Convert raw data into a list of Document objects.

        Args:
            data: List of dictionaries with arXiv data.

        Returns:
            List of Document objects.
        """
        logger.info("Preparing documents from data.")
        docs = [Document(text=d.pop("abstract"), metadata=d) for d in data]
        logger.info(f"Prepared {len(docs)} documents.")
        return docs

    def create_vector_store(self) -> MilvusVectorStore:
        """
        Create and return a MilvusVectorStore instance.

        Returns:
            Configured MilvusVectorStore.
        """
        logger.info(f"Creating MilvusVectorStore at {self.db_uri}")
        store = MilvusVectorStore(
            uri=self.db_uri,
            dim=self.vector_dim,
            enable_sparse=True,
            sparse_embedding_function=BGEM3SparseEmbeddingFunction(),
        )
        logger.info("MilvusVectorStore created.")
        return store

    def build_index(
        self, docs_list: list[Document], vector_store: MilvusVectorStore
    ) -> VectorStoreIndex:
        """
        Build and return a VectorStoreIndex from documents.

        Args:
            docs_list: List of Document objects.
            vector_store: MilvusVectorStore instance.

        Returns:
            VectorStoreIndex object.
        """
        logger.info("Building VectorStoreIndex.")
        storage_context = StorageContext.from_defaults(vector_store=vector_store)
        index = VectorStoreIndex.from_documents(
            documents=docs_list,
            storage_context=storage_context,
            embed_model=self.embed_model,
            transformations=[self.sent_splitter],
            show_progress=True,
            insert_batch_size=self.insert_batch_size,
        )
        logger.info("VectorStoreIndex built.")
        return index

    def run(self) -> None:
        """
        Execute the full pipeline: load data, prepare documents, create vector store, and build index.
        """
        logger.info("Running full vector DB creation pipeline.")
        data = self.load_data()
        docs_list = self.prepare_documents(data=data)
        vector_store = self.create_vector_store()
        self.build_index(docs_list=docs_list, vector_store=vector_store)
        logger.info("Pipeline finished.")


# if __name__ == "__main__":
#     logger.info("Script started.")
#     # Optionally load environment variables if needed
#     _ = load_dotenv(dotenv_path=find_dotenv(raise_error_if_not_found=True))
#     creator = VectorDBCreator(
#         data_path=f"{PROJECT_ROOT_DIR}/data/cs_data_arxiv.json", db_uri="arxiv_docs.db"
#     )
#     creator.run()
#     logger.info("Script finished.")
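The loader above assumes the JSON file is an array of paper records where each record carries an "abstract" key (popped into Document.text) and every remaining key becomes metadata; a sketch of that assumed shape, with illustrative field names other than "abstract":

# Illustrative input record for VectorDBCreator.prepare_documents();
# only the "abstract" key is required by the code above, the rest is kept as metadata.
example_records = [
    {
        "abstract": "We study reinforcement learning for robotic grasping ...",
        "title": "An Example Paper",   # illustrative metadata field
        "categories": "cs.LG",         # illustrative metadata field
    }
]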
src/agent_hackathon/generate_arxiv_responses.py
ADDED
@@ -0,0 +1,108 @@
import json
from pathlib import Path
from typing import Any

from huggingface_hub import InferenceClient

from src.agent_hackathon.consts import PROJECT_ROOT_DIR
from src.agent_hackathon.create_vector_db import VectorDBCreator
from src.agent_hackathon.logger import get_logger
from src.agent_hackathon.query_vector_db import RetrieverEngineBuilder

logger = get_logger(log_name="arxiv_responses", log_dir=PROJECT_ROOT_DIR / "logs")


class ArxivResponseGenerator:
    """
    Handles retrieval and formatting of arXiv papers using a vector database and LLM.
    """

    def __init__(self, vector_store_path: Path) -> None:
        """Initializes the ArxivResponseGenerator."""
        self.vector_store_path = vector_store_path
        self.client = self._initialise_client()
        self.retriever = self._initialise_retriever()
        logger.info("ArxivResponseGenerator initialized.")

    def _initialise_retriever(self) -> Any:
        """
        Initializes and returns a retriever engine.

        Returns:
            Any: Retriever engine object.
        """
        logger.info("Initializing retriever engine.")
        vector_db_creator = VectorDBCreator(
            data_path=..., db_uri=self.vector_store_path.as_posix()
        )
        vector_store = vector_db_creator.create_vector_store()
        retriever_class = RetrieverEngineBuilder(
            vector_store=vector_store,
        )
        retriever = retriever_class.build_retriever_engine()
        logger.info("Retriever engine initialized.")
        return retriever

    def _initialise_client(self) -> InferenceClient:
        """
        Initializes and returns an InferenceClient.

        Returns:
            InferenceClient: HuggingFace InferenceClient instance.
        """
        logger.info("Initializing InferenceClient.")
        client = InferenceClient(
            provider="auto",
            # bill_to="VitalNest",
        )
        logger.info("InferenceClient initialized.")
        return client

    def retrieve_arxiv_papers(self, query: str) -> str:
        """
        Retrieves and formats arXiv papers for a given query.

        Args:
            query (str): The search query.

        Returns:
            str: Formatted response from the LLM.
        """
        logger.info(f"Retrieving arXiv papers for query: {query}")

        try:
            retrieved_content = json.dumps(
                obj=[(i.get_content(), i.metadata) for i in self.retriever.retrieve(query)]
            )
            logger.info("Retrieved content from vector DB.")
        except Exception as err:
            logger.error(f"Error retrieving from vector DB: {err}")
            raise
        completion = self.client.chat.completions.create(
            model="meta-llama/Llama-4-Scout-17B-16E-Instruct",
            temperature=0.1,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": f"Format the following output neatly:{retrieved_content}. Return only the output.",
                        },
                    ],
                }
            ],
        )
        logger.info("Received completion from LLM.")
        return completion.choices[0].message.content


# if __name__ == "__main__":
#     logger.info("Script started.")
#     generator = ArxivResponseGenerator(
#         vector_store_path=PROJECT_ROOT_DIR / "db/arxiv_docs.db"
#     )
#     query = "deep learning for NLP"  # Example query, replace as needed
#     result = generator.retrieve_arxiv_papers(query=query)
#     print(result)
#     logger.info("Script finished.")
src/agent_hackathon/logger.py
ADDED
@@ -0,0 +1,45 @@
import logging
from datetime import datetime
from pathlib import Path

from rich.logging import RichHandler


def get_logger(log_name: str, log_dir: Path) -> logging.Logger:
    """
    Returns a logger with RichHandler and file handler.

    Args:
        log_name (str): Name prefix for the log file.
        log_dir (Path): Directory to store log files.

    Returns:
        logging.Logger: Configured logger instance.
    """
    log_dir.mkdir(parents=True, exist_ok=True)
    date_str = datetime.now().strftime(format="%m_%d_%Y")
    log_file = log_dir / f"{log_name}_{date_str}.log"

    logger = logging.getLogger(name=log_name)
    logger.setLevel(level=logging.INFO)
    logger.handlers.clear()

    # Rich console handler
    rich_handler = RichHandler(
        rich_tracebacks=True, show_time=True, show_level=True, show_path=True
    )
    rich_handler.setLevel(level=logging.INFO)

    # File handler
    file_handler = logging.FileHandler(filename=log_file, encoding="utf-8")
    file_handler.setLevel(level=logging.INFO)
    formatter = logging.Formatter(
        fmt="%(asctime)s | %(levelname)s | %(name)s | %(message)s"
    )
    file_handler.setFormatter(formatter)

    logger.addHandler(rich_handler)
    logger.addHandler(file_handler)
    logger.propagate = False

    return logger
src/agent_hackathon/multiagent.py
ADDED
@@ -0,0 +1,146 @@
# import asyncio
from datetime import date

import nest_asyncio
from llama_index.core.agent.workflow import AgentWorkflow, ReActAgent
from llama_index.core.tools import FunctionTool
from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI
from llama_index.tools.duckduckgo import DuckDuckGoSearchToolSpec

from src.agent_hackathon.consts import PROJECT_ROOT_DIR

# from dotenv import find_dotenv, load_dotenv
from src.agent_hackathon.generate_arxiv_responses import ArxivResponseGenerator
from src.agent_hackathon.logger import get_logger

nest_asyncio.apply()

# _ = load_dotenv(dotenv_path=find_dotenv(raise_error_if_not_found=False), override=True)

logger = get_logger(log_name="multiagent", log_dir=PROJECT_ROOT_DIR / "logs")


class MultiAgentWorkflow:
    """Multi-agent workflow for retrieving research papers and related events."""

    def __init__(self) -> None:
        """Initialize the workflow with LLM, tools, and generator."""
        logger.info("Initializing MultiAgentWorkflow.")
        self.llm = HuggingFaceInferenceAPI(
            model="meta-llama/Llama-3.3-70B-Instruct",
            provider="auto",
            # provider="nebius",
            temperature=0.1,
            top_p=0.95,
            # api_key=os.getenv(key="NEBIUS_API_KEY"),
            # base_url="https://api.studio.nebius.com/v1/",
            system_prompt="Don't just plan, but execute the plan until failure.",
        )
        self._generator = ArxivResponseGenerator(
            vector_store_path=PROJECT_ROOT_DIR / "db/arxiv_docs.db"
        )
        self._arxiv_rag_tool = FunctionTool.from_defaults(
            fn=self._arxiv_rag,
            name="arxiv_rag",
            description="Retrieves arxiv research papers.",
            return_direct=False,
        )
        self._duckduckgo_search_tool = [
            tool
            for tool in DuckDuckGoSearchToolSpec().to_tool_list()
            if tool.metadata.name == "duckduckgo_full_search"
        ]
        self._arxiv_agent = ReActAgent(
            name="arxiv_agent",
            description="Retrieves information about arxiv research papers",
            system_prompt="You are arxiv research paper agent, who retrieves information "
            "about arxiv research papers.",
            tools=[self._arxiv_rag_tool],
            llm=self.llm,
        )
        self._websearch_agent = ReActAgent(
            name="web_search",
            description="Searches the web",
            system_prompt="You are search engine who searches the web using duckduckgo tool",
            tools=self._duckduckgo_search_tool,
            llm=self.llm,
        )

        self._workflow = AgentWorkflow(
            agents=[self._arxiv_agent, self._websearch_agent],
            root_agent="arxiv_agent",
            timeout=180,
        )
        # AgentWorkflow.from_tools_or_functions(
        #     tools_or_functions=self._duckduckgo_search_tool,
        #     llm=self.llm,
        #     system_prompt="You are an expert that "
        #     "searches for any corresponding events related to the "
        #     "user query "
        #     "using the duckduckgo_search_tool and returns the final results."
        #     "Don't return the steps but execute the necessary tools that you have "
        #     "access to and return the results.",
        #     timeout=180,
        # )

        logger.info("MultiAgentWorkflow initialized.")

    def _arxiv_rag(self, query: str) -> str:
        """Retrieve research papers from arXiv based on the query.

        Args:
            query (str): The search query.

        Returns:
            str: Retrieved research papers as a string.
        """
        return self._generator.retrieve_arxiv_papers(query=query)

    def _clean_response(self, result: str) -> str:
        """Removes the think tags.

        Args:
            result (str): The result with the <think></think> content.

        Returns:
            str: The result without the <think></think> content.
        """
        if result.find("</think>") != -1:
            result = result[result.find("</think>") + len("</think>") :]
        return result

    async def run(self, user_query: str) -> str:
        """Run the multi-agent workflow for a given user query.

        Args:
            user_query (str): The user's search query.

        Returns:
            str: The output string.
        """
        logger.info("Running multi-agent workflow.")
        try:
            user_msg = (
                f"First, give me arxiv research papers about: {user_query}. "
                f"Then search with web search agent for any events related to: {user_query}.\n"
                f"The web search results should be relevant to the current year: {date.today().year}. "
                "Return all the content from all the agents."
            )
            results = await self._workflow.run(user_msg=user_msg)
            logger.info("Workflow run completed successfully.")
            return results
        except Exception as err:
            logger.error(f"Workflow run failed: {err}")
            raise


# if __name__ == "__main__":
#     USER_QUERY = "i want to learn more about nlp"
#     workflow = MultiAgentWorkflow()
#     logger.info("Starting workflow for user query.")
#     try:
#         result = asyncio.run(workflow.run(user_query=USER_QUERY))
#         logger.info("Workflow finished. Output below:")
#         print(result)
#     except Exception as err:
#         logger.error(f"Error during workflow execution: {err}")
src/agent_hackathon/py.typed
ADDED
File without changes
src/agent_hackathon/query_vector_db.py
ADDED
@@ -0,0 +1,79 @@
import os
from typing import Any

from dotenv import find_dotenv, load_dotenv
from huggingface_hub import login
from llama_index.core import VectorStoreIndex
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.milvus import MilvusVectorStore

from src.agent_hackathon.consts import PROJECT_ROOT_DIR
from src.agent_hackathon.logger import get_logger

logger = get_logger(log_name="query_vector_db", log_dir=PROJECT_ROOT_DIR / "logs")


class RetrieverEngineBuilder:
    """
    Handles the creation of a query engine for a vector database using HuggingFace and LlamaIndex.
    """

    def __init__(
        self,
        hf_token_env: str = "HF_TOKEN",
        embedding_model: str = "Qwen/Qwen3-Embedding-0.6B",
        vector_store: MilvusVectorStore = None,
        device: str = "cpu",
    ) -> None:
        """
        Initialize the QueryEngineBuilder.

        Args:
            hf_token_env: Environment variable name for HuggingFace token.
            embedding_model: Name of the embedding model.
            vector_store: An instance of MilvusVectorStore.
            device: Device to run the embedding model on.
        """
        self.hf_token_env = hf_token_env
        self.embedding_model = embedding_model
        self.vector_store = vector_store
        self.device = device

        logger.info("Initializing RetrieverEngineBuilder.")
        # self._login_huggingface()
        # self._load_env()

        self.embed_model = HuggingFaceEmbedding(
            model_name=self.embedding_model, device=self.device
        )
        logger.info("RetrieverEngineBuilder initialized.")

    def _login_huggingface(self) -> None:
        """Login to HuggingFace using the token from environment variable."""
        logger.info("Logging in to HuggingFace.")
        login(token=os.getenv(key=self.hf_token_env))
        logger.info("Logged in to HuggingFace.")

    def _load_env(self) -> None:
        """Load environment variables from .env file."""
        logger.info("Loading environment variables.")
        _ = load_dotenv(dotenv_path=find_dotenv(raise_error_if_not_found=False))
        logger.info("Environment variables loaded.")

    def build_retriever_engine(self) -> Any:
        """
        Build and return the retriever engine.

        Returns:
            Retriever engine object.
        """
        logger.info("Building retriever engine.")
        index = VectorStoreIndex.from_vector_store(
            vector_store=self.vector_store, embed_model=self.embed_model
        )
        retriever = index.as_retriever(
            vector_store_query_mode="hybrid",
            similarity_top_k=5,
        )
        logger.info("Retriever engine built.")
        return retriever