# carbon_demo/app.py
import streamlit as st
import os
import torch
import transformers
from transformers import pipeline
from langchain.llms import HuggingFacePipeline
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader, PyPDFLoader, DirectoryLoader
from langchain.document_loaders import GoogleDriveLoader
#from datasets import load_dataset
#dataset = load_dataset("heyal/carbon_data")
def create_vectorstore(embedding, texts, db_name='chromadb'):
    """Embed the text chunks, persist them to a Chroma directory, and return the vector store."""
    persist_directory = db_name
    print("Creating vector store.")
    vectordb = Chroma.from_documents(documents=texts,
                                     embedding=embedding,
                                     persist_directory=persist_directory)
    return vectordb
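# Sketch (assumption): a store persisted to the default 'chromadb' directory can be
# reloaded on a later run instead of rebuilding it from the PDFs, e.g.
#   vectordb = Chroma(persist_directory='chromadb', embedding_function=text_embeddings)
# where text_embeddings is the embedding model initialized further below.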
#"Load and chunk from documents to small text chunks."
def load_chunk(data_dir):
loader = DirectoryLoader(data_dir , glob="./*.pdf", loader_cls=PyPDFLoader)
#loader = GoogleDriveLoader(folder_id = data_dir, glob="./*.pdf", loader_cls=PyPDFLoader, credentials_path='googlecreds.json')
documents = loader.load()
#documents = dataset
print(f"{len(documents)} documents are loaded.")
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000,
chunk_overlap=20,
length_function = len,
separators=["\n\n", "\n", " ", ""])
text_chunks = text_splitter.split_documents(documents)
print(f"{len(text_chunks)} are splitted from documents.")
return text_chunks
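# Note: chunk_size and chunk_overlap are measured in characters (length_function=len),
# so, roughly, a 3,000-character page yields about three ~1,000-character chunks
# with 20 characters of overlap between neighbouring chunks.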
import textwrap
def format_result(text, width=100):
    """Wrap each line of text to the given width for readable console output."""
    lines = text.split('\n')
    wrapped_lines = '\n'.join([textwrap.fill(line, width=width) for line in lines])
    return wrapped_lines
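# Example (sketch):
#   print(format_result("one long answer line that would otherwise overflow the console", width=40))
# re-wraps each original line to at most 40 characters while preserving existing line breaks.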
def postprocess_response(llm_response):
    """Print the query, the LLM answer, and the retrieved source chunks to the console."""
    print(f"Query : {format_result(llm_response['query'])} \n")
    print(f"Result : {format_result(llm_response['result'])} \n")
    print('=' * 90)
    print('\nRetrieved docs (text chunks from PDFs): \n\n')
    for source in llm_response["source_documents"]:
        print(f"Source PDF : {source.metadata['source']} \n\n")
        print(format_result(source.page_content))
        print('-' * 90)
def postprocess_response_in_app(llm_response):
    """Render the LLM answer inside the Streamlit app."""
    st.write(format_result(llm_response['result']))
from langchain.embeddings import HuggingFaceEmbeddings
def init_embedding(model_name: str):
    """Initialize the sentence-embedding model used to vectorize text chunks."""
    embeddings = HuggingFaceEmbeddings(model_name=model_name,
                                       model_kwargs={"device": "cuda"})
    return embeddings
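# Note (assumption): model_kwargs={"device": "cuda"} requires a GPU; on a CPU-only
# host the same call would need {"device": "cpu"}. The short model name
# "all-mpnet-base-v2" used below is resolved through sentence-transformers.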
def init_LLM(model_name: str):
    """Initialize a local Hugging Face LLM for text generation."""
    llm = HuggingFacePipeline.from_model_id(model_id=model_name,
                                            task="text2text-generation",
                                            device=0,
                                            model_kwargs={"temperature": 0,
                                                          "max_length": 512})
    return llm
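# Example (sketch): llm = init_LLM("google/flan-t5-large") would load the seq2seq
# model on GPU 0 through a Transformers text2text-generation pipeline; this local
# path is currently commented out below in favour of the OpenAI LLM.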
from langchain.llms import OpenAI
#llm_model_id = "google/flan-t5-large"
#for embeddings
text_model_id = "all-mpnet-base-v2"
text_embeddings = init_embedding(text_model_id)
#llm_model = init_LLM(llm_model_id)
# Read the OpenAI key from the environment rather than hard-coding it in the source.
API = os.environ.get("OPENAI_API_KEY")
llm_model = OpenAI(temperature=0.7, openai_api_key=API)
def generate_context(llm_model, vectordb, query: str, top_k: int):
    """Answer the query with context retrieved from the vector store."""
    # fetch similar docs using similarity search
    retriever = vectordb.as_retriever(search_kwargs={"k": top_k})
    # generate text using the retrieved docs
    qa_chain = RetrievalQA.from_chain_type(llm=llm_model,
                                           chain_type="stuff",
                                           retriever=retriever,
                                           return_source_documents=True)
    results = qa_chain(query)
    return results
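# Example (sketch):
#   results = generate_context(llm_model, vectordb, "What is a carbon credit?", top_k=3)
# With return_source_documents=True, results["result"] holds the generated answer
# and results["source_documents"] the top_k retrieved chunks.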
#app
st.title("Omdena-Transitry Carbon Project Demo")
st.write("Mounting Google drive")
from google.colab import drive
drive.mount('/content/drive/')
st.write("Loading documents")
data_dir = '/content/drive/My Drive/carbon_data'
#data_dir = 'https://drive.google.com/drive/folders/1sSZGhGzXw6oqC8sxKtPwIuaDvx_PfMlh'
#data_dir = '1sSZGhGzXw6oqC8sxKtPwIuaDvx_PfMlh'
texts = load_chunk(data_dir)
st.write("Creating vector store")
vectordb = create_vectorstore(text_embeddings, texts)
user_question = st.text_input(
    "Enter your question: ",
    placeholder="Cyanobacteria can perform photosynthesis, are they considered plants?",
)
#query = f"Can I develop a project whose purpose is to increase biodiversity? if so, how could biodiversity result in carbon credits?"
query = user_question
if query:
    results = generate_context(llm_model, vectordb, query, 3)
    postprocess_response(results)          # log full details to the console
    postprocess_response_in_app(results)   # show the answer in the app
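# Run with:  streamlit run app.py
# (sketch of assumptions: a Colab-style Google Drive mount is available at the
# path above and OPENAI_API_KEY is set in the environment)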