Spaces:
Running
Running
File size: 4,148 Bytes
2eb41d7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 |
import os
import datasets
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
# from langchain_community.document_loaders import PyPDFLoader
from langchain_huggingface import HuggingFaceEmbeddings
from tqdm import tqdm
from transformers import AutoTokenizer
# from langchain_openai import OpenAIEmbeddings
from smolagents import LiteLLMModel, Tool
from smolagents.agents import CodeAgent
# from smolagents.agents import ToolCallingAgent
knowledge_base = datasets.load_dataset("m-ric/huggingface_doc", split="train")
source_docs = [
Document(page_content=doc["text"], metadata={"source": doc["source"].split("/")[1]}) for doc in knowledge_base
]
## For your own PDFs, you can use the following code to load them into source_docs
# pdf_directory = "pdfs"
# pdf_files = [
# os.path.join(pdf_directory, f)
# for f in os.listdir(pdf_directory)
# if f.endswith(".pdf")
# ]
# source_docs = []
# for file_path in pdf_files:
# loader = PyPDFLoader(file_path)
# docs.extend(loader.load())
text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
AutoTokenizer.from_pretrained("thenlper/gte-small"),
chunk_size=200,
chunk_overlap=20,
add_start_index=True,
strip_whitespace=True,
separators=["\n\n", "\n", ".", " ", ""],
)
# Split docs and keep only unique ones
print("Splitting documents...")
docs_processed = []
unique_texts = {}
for doc in tqdm(source_docs):
new_docs = text_splitter.split_documents([doc])
for new_doc in new_docs:
if new_doc.page_content not in unique_texts:
unique_texts[new_doc.page_content] = True
docs_processed.append(new_doc)
print("Embedding documents... This should take a few minutes (5 minutes on MacBook with M1 Pro)")
# Initialize embeddings and ChromaDB vector store
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
# embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
vector_store = Chroma.from_documents(docs_processed, embeddings, persist_directory="./chroma_db")
class RetrieverTool(Tool):
name = "retriever"
description = (
"Uses semantic search to retrieve the parts of documentation that could be most relevant to answer your query."
)
inputs = {
"query": {
"type": "string",
"description": "The query to perform. This should be semantically close to your target documents. Use the affirmative form rather than a question.",
}
}
output_type = "string"
def __init__(self, vector_store, **kwargs):
super().__init__(**kwargs)
self.vector_store = vector_store
def forward(self, query: str) -> str:
assert isinstance(query, str), "Your search query must be a string"
docs = self.vector_store.similarity_search(query, k=3)
return "\nRetrieved documents:\n" + "".join(
[f"\n\n===== Document {str(i)} =====\n" + doc.page_content for i, doc in enumerate(docs)]
)
retriever_tool = RetrieverTool(vector_store)
# Choose which LLM engine to use!
# from smolagents import HfApiModel
# model = HfApiModel(model_id="meta-llama/Llama-3.3-70B-Instruct")
# from smolagents import TransformersModel
# model = TransformersModel(model_id="meta-llama/Llama-3.2-2B-Instruct")
# For anthropic: change model_id below to 'anthropic/claude-3-5-sonnet-20240620' and also change 'os.environ.get("ANTHROPIC_API_KEY")'
model = LiteLLMModel(
model_id="groq/llama-3.3-70b-versatile",
api_key=os.environ.get("GROQ_API_KEY"),
)
# # You can also use the ToolCallingAgent class
# agent = ToolCallingAgent(
# tools=[retriever_tool],
# model=model,
# verbose=True,
# )
agent = CodeAgent(
tools=[retriever_tool],
model=model,
max_steps=4,
verbosity_level=2,
)
agent_output = agent.run("How can I push a model to the Hub?")
print("Final output:")
print(agent_output)
|