import os

import datasets
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma

# from langchain_community.document_loaders import PyPDFLoader
from langchain_huggingface import HuggingFaceEmbeddings
from tqdm import tqdm
from transformers import AutoTokenizer

# from langchain_openai import OpenAIEmbeddings
from smolagents import LiteLLMModel, Tool
from smolagents.agents import CodeAgent

# from smolagents.agents import ToolCallingAgent

knowledge_base = datasets.load_dataset("m-ric/huggingface_doc", split="train")

source_docs = [
    Document(page_content=doc["text"], metadata={"source": doc["source"].split("/")[1]})
    for doc in knowledge_base
]

## For your own PDFs, you can use the following code to load them into source_docs
# pdf_directory = "pdfs"
# pdf_files = [
#     os.path.join(pdf_directory, f)
#     for f in os.listdir(pdf_directory)
#     if f.endswith(".pdf")
# ]
# source_docs = []
# for file_path in pdf_files:
#     loader = PyPDFLoader(file_path)
#     source_docs.extend(loader.load())  # note: extend source_docs, not the undefined `docs`

text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
    AutoTokenizer.from_pretrained("thenlper/gte-small"),
    chunk_size=200,
    chunk_overlap=20,
    add_start_index=True,
    strip_whitespace=True,
    separators=["\n\n", "\n", ".", " ", ""],
)

# Split docs and keep only unique chunks
print("Splitting documents...")
docs_processed = []
unique_texts = {}
for doc in tqdm(source_docs):
    new_docs = text_splitter.split_documents([doc])
    for new_doc in new_docs:
        if new_doc.page_content not in unique_texts:
            unique_texts[new_doc.page_content] = True
            docs_processed.append(new_doc)

print("Embedding documents... This should take a few minutes (about 5 minutes on a MacBook with an M1 Pro)")

# Initialize embeddings and the ChromaDB vector store
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
# embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
vector_store = Chroma.from_documents(docs_processed, embeddings, persist_directory="./chroma_db")


class RetrieverTool(Tool):
    name = "retriever"
    description = (
        "Uses semantic search to retrieve the parts of the documentation that could be most relevant to answer your query."
    )
    inputs = {
        "query": {
            "type": "string",
            "description": "The query to perform. This should be semantically close to your target documents. Use the affirmative form rather than a question.",
        }
    }
    output_type = "string"

    def __init__(self, vector_store, **kwargs):
        super().__init__(**kwargs)
        self.vector_store = vector_store

    def forward(self, query: str) -> str:
        assert isinstance(query, str), "Your search query must be a string"

        docs = self.vector_store.similarity_search(query, k=3)
        return "\nRetrieved documents:\n" + "".join(
            f"\n\n===== Document {i} =====\n" + doc.page_content for i, doc in enumerate(docs)
        )


retriever_tool = RetrieverTool(vector_store)
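# Optional sanity check: smolagents Tool instances are callable, so you can
# query the index directly before handing the tool to an agent. (A quick
# sketch; the example query is illustrative.) Uncomment to try it:
# print(retriever_tool(query="push a model to the Hub"))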
# Choose which LLM engine to use!

# from smolagents import HfApiModel
# model = HfApiModel(model_id="meta-llama/Llama-3.3-70B-Instruct")

# from smolagents import TransformersModel
# model = TransformersModel(model_id="meta-llama/Llama-3.2-3B-Instruct")  # Llama 3.2 ships in 1B/3B instruct variants

# For Anthropic: change model_id below to 'anthropic/claude-3-5-sonnet-20240620'
# and pass api_key=os.environ.get("ANTHROPIC_API_KEY")
model = LiteLLMModel(
    model_id="groq/llama-3.3-70b-versatile",
    api_key=os.environ.get("GROQ_API_KEY"),
)

# # You can also use the ToolCallingAgent class instead of CodeAgent
# agent = ToolCallingAgent(
#     tools=[retriever_tool],
#     model=model,
#     verbose=True,
# )

agent = CodeAgent(
    tools=[retriever_tool],
    model=model,
    max_steps=4,
    verbosity_level=2,
)

agent_output = agent.run("How can I push a model to the Hub?")

print("Final output:")
print(agent_output)
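# On later runs you can skip re-splitting and re-embedding by reloading the
# persisted index instead of calling Chroma.from_documents again. (A sketch;
# assumes the "./chroma_db" directory created above and the same embedding model.)
# vector_store = Chroma(persist_directory="./chroma_db", embedding_function=embeddings)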