import os
import textwrap

import streamlit as st
from langchain.llms import HuggingFacePipeline, OpenAI
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.document_loaders import GoogleDriveLoader

#from datasets import load_dataset
#dataset = load_dataset("heyal/carbon_data")


def create_vectorstore(embedding, texts, db_name='chromadb') -> Chroma:
    "Embed the text chunks, store them in a persistent Chroma directory, and return the vector store."
    persist_directory = db_name
    print("Creating vector store.")
    vectordb = Chroma.from_documents(documents=texts,
                                     embedding=embedding,
                                     persist_directory=persist_directory)
    return vectordb


def load_chunk(data_dir):
    "Load PDF documents from a directory and split them into small text chunks."
    loader = DirectoryLoader(data_dir, glob="./*.pdf", loader_cls=PyPDFLoader)
    #loader = GoogleDriveLoader(folder_id=data_dir, glob="./*.pdf", loader_cls=PyPDFLoader, credentials_path='googlecreds.json')
    documents = loader.load()
    #documents = dataset
    print(f"{len(documents)} documents were loaded.")
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000,
                                                   chunk_overlap=20,
                                                   length_function=len,
                                                   separators=["\n\n", "\n", " ", ""])
    text_chunks = text_splitter.split_documents(documents)
    print(f"{len(text_chunks)} text chunks were split from the documents.")
    return text_chunks


def format_result(text, width=100):
    "Wrap long lines so the text is readable."
    lines = text.split('\n')
    wrapped_lines = '\n'.join([textwrap.fill(line, width=width) for line in lines])
    return wrapped_lines
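
# A minimal sketch (not part of the original app): since create_vectorstore
# persists the embeddings to disk, a later run could reopen that directory
# instead of re-embedding every document. Assumes the same embedding model
# and the default 'chromadb' persist directory.
def load_vectorstore(embedding, db_name='chromadb') -> Chroma:
    "Reload an existing persisted Chroma vector store."
    return Chroma(persist_directory=db_name, embedding_function=embedding)
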
def postprocess_response(llm_response):
    "Print the formatted query, LLM answer, and the retrieved source chunks to the console."
    print(f"Query : {format_result(llm_response['query'])} \n")
    print(f"Result : {format_result(llm_response['result'])} \n")
    print('=' * 90)
    print('\nRetrieved docs (text chunks from PDFs): \n\n')
    for source in llm_response["source_documents"]:
        print(f"Source PDF : {source.metadata['source']} \n\n")
        print(format_result(source.page_content))
        print('-' * 90)


def postprocess_response_in_app(llm_response):
    "Render the formatted LLM answer inside the Streamlit app."
    st.write(format_result(llm_response['result']))


def init_embedding(model_name: str):
    "Initialize the text embedding model."
    embeddings = HuggingFaceEmbeddings(model_name=model_name,
                                       model_kwargs={"device": "cuda"})
    return embeddings


def init_LLM(model_name: str):
    "Initialize an LLM for text generation."
    llm = HuggingFacePipeline.from_model_id(model_id=model_name,
                                            task="text2text-generation",
                                            device=0,
                                            model_kwargs={"temperature": 0, "max_length": 512})
    return llm


#llm_model_id = "google/flan-t5-large"
# For embeddings
text_model_id = "all-mpnet-base-v2"
text_embeddings = init_embedding(text_model_id)
#llm_model = init_LLM(llm_model_id)

# Read the OpenAI key from the environment rather than hardcoding a secret in source.
API = os.environ["OPENAI_API_KEY"]
llm_model = OpenAI(temperature=0.7, openai_api_key=API)


def generate_context(llm_model, vectordb, query: str, top_k: int):
    "Answer the query using the top-k most similar chunks as context."
    # Fetch similar docs using similarity search.
    retriever = vectordb.as_retriever(search_kwargs={"k": top_k})
    # Generate text grounded in the retrieved docs.
    qa_chain = RetrievalQA.from_chain_type(llm=llm_model,
                                           chain_type="stuff",
                                           retriever=retriever,
                                           return_source_documents=True)
    results = qa_chain(query)
    return results


# App
st.title("Omdena-Transitry Carbon Project Demo")
st.write("Mounting Google Drive")
from google.colab import drive
drive.mount('/content/drive/')
st.write("Loading documents")
data_dir = '/content/drive/My Drive/carbon_data'
#data_dir = 'https://drive.google.com/drive/folders/1sSZGhGzXw6oqC8sxKtPwIuaDvx_PfMlh'
#data_dir = '1sSZGhGzXw6oqC8sxKtPwIuaDvx_PfMlh'
texts = load_chunk(data_dir)
st.write("Creating vector store")
vectordb = create_vectorstore(text_embeddings, texts)
user_question = st.text_input(
    "Enter your question: ",
    placeholder="Cyanobacteria can perform photosynthesis; are they considered plants?",
)
#query = "Can I develop a project whose purpose is to increase biodiversity? If so, how could biodiversity result in carbon credits?"
query = user_question
if query:  # Only run the chain once the user has entered a question.
    results = generate_context(llm_model, vectordb, query, 3)
    postprocess_response(results)
    # Also render the answer in the Streamlit UI (print output only reaches the console).
    postprocess_response_in_app(results)
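

# A possible improvement (not part of the original app): Streamlit re-executes the
# whole script on every interaction, so the PDFs above are re-chunked and
# re-embedded on each rerun. Caching the expensive build step would avoid that;
# this is a sketch only, not wired into the flow above, and assumes Streamlit's
# st.cache_resource decorator (available in recent Streamlit releases).
@st.cache_resource
def build_vectorstore_cached(data_dir: str):
    "Build the vector store once per process and reuse it across reruns."
    return create_vectorstore(text_embeddings, load_chunk(data_dir))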