import streamlit as st
import os
import torch
import transformers
from transformers import pipeline
from langchain.llms import HuggingFacePipeline
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader, PyPDFLoader, DirectoryLoader
from langchain.document_loaders import GoogleDriveLoader

#from datasets import load_dataset
#dataset = load_dataset("heyal/carbon_data")

def create_vectorstore(embedding, texts, db_name='chromadb') -> Chroma:
    "Embed the text chunks, store them in a persistent directory, and return the vector store."
    persist_directory = db_name
    print("Creating vector store.")
    vectordb = Chroma.from_documents(documents=texts,
                                     embedding=embedding,
                                     persist_directory=persist_directory)
    return vectordb
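# Illustrative reload (a sketch, assuming a 'chromadb' directory was created by a
# previous run of create_vectorstore with the same embedding model):
#   vectordb = Chroma(persist_directory='chromadb', embedding_function=embedding)
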
#"Load and chunk from documents to small text chunks." | |
def load_chunk(data_dir): | |
loader = DirectoryLoader(data_dir , glob="./*.pdf", loader_cls=PyPDFLoader) | |
#loader = GoogleDriveLoader(folder_id = data_dir, glob="./*.pdf", loader_cls=PyPDFLoader, credentials_path='googlecreds.json') | |
documents = loader.load() | |
#documents = dataset | |
print(f"{len(documents)} documents are loaded.") | |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, | |
chunk_overlap=20, | |
length_function = len, | |
separators=["\n\n", "\n", " ", ""]) | |
text_chunks = text_splitter.split_documents(documents) | |
print(f"{len(text_chunks)} are splitted from documents.") | |
return text_chunks | |
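# Illustrative usage (a sketch; './pdfs' is a hypothetical local directory of PDFs):
#   chunks = load_chunk('./pdfs')
#   print(chunks[0].page_content[:200])  # preview the first chunk's text
#   print(chunks[0].metadata)            # source file and page number
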
import textwrap

def format_result(text, width=100):
    "Wrap each line of the text to a readable width."
    lines = text.split('\n')
    wrapped_lines = '\n'.join([textwrap.fill(line, width=width) for line in lines])
    return wrapped_lines
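# Illustrative usage: a single long line comes back reflowed to 40-column lines.
#   format_result("word " * 50, width=40)
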
def postprocess_response(llm_response):
    "Print the query, the LLM result, and the retrieved semantic-search documents."
    print(f"Query : {format_result(llm_response['query'])} \n")
    print(f"Result : {format_result(llm_response['result'])} \n")
    print('=' * 90)
    print('\nRetrieved docs (text chunks from PDFs): \n\n')
    for source in llm_response["source_documents"]:
        print(f"Source PDF : {source.metadata['source']} \n\n")
        print(format_result(source.page_content))
        print('-' * 90)
def postprocess_response_in_app(llm_response):
    "Show the LLM result inside the Streamlit app."
    st.write(format_result(llm_response['result']))
from langchain.embeddings import HuggingFaceEmbeddings

def init_embedding(model_name: str):
    "Initialize the text embedding model."
    embeddings = HuggingFaceEmbeddings(model_name=model_name,
                                       model_kwargs={"device": "cuda"})
    return embeddings
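# Illustrative check (a sketch, assuming the sentence-transformers model downloads):
#   vec = text_embeddings.embed_query("carbon credits")
#   print(len(vec))  # embedding dimensionality; 768 for all-mpnet-base-v2
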
def init_LLM(model_name: str):
    "Initialize the LLM for text generation."
    llm = HuggingFacePipeline.from_model_id(model_id=model_name,
                                            task="text2text-generation",
                                            device=0,
                                            model_kwargs={"temperature": 0,
                                                          "max_length": 512})
    return llm
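# Illustrative usage (a sketch, assuming a GPU at device 0 and the commented-out
# "google/flan-t5-large" model below):
#   local_llm = init_LLM("google/flan-t5-large")
#   print(local_llm("What is a carbon credit?"))
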
from langchain.llms import OpenAI

#llm_model_id = "google/flan-t5-large"
#for embeddings
text_model_id = "all-mpnet-base-v2"
text_embeddings = init_embedding(text_model_id)
#llm_model = init_LLM(llm_model_id)

# Read the API key from the environment rather than hard-coding a secret in source.
API = os.environ.get("OPENAI_API_KEY")
llm_model = OpenAI(temperature=0.7, openai_api_key=API)
def generate_context(llm_model, vectordb, query: str, top_k: int):
    "Answer the query using context retrieved from the vector store."
    # fetch similar docs using similarity search
    retriever = vectordb.as_retriever(search_kwargs={"k": top_k})
    # generate an answer grounded in the retrieved docs
    qa_chain = RetrievalQA.from_chain_type(llm=llm_model,
                                           chain_type="stuff",
                                           retriever=retriever,
                                           return_source_documents=True)
    results = qa_chain(query)
    return results
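# Illustrative usage (a sketch, assuming the vector store and LLM above are initialized):
#   out = generate_context(llm_model, vectordb, "What is a carbon credit?", 3)
#   out["result"]            # generated answer
#   out["source_documents"]  # the top_k retrieved chunks
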
#app
st.title("Omdena-Transitry Carbon Project Demo")
st.write("Mounting Google drive")
# NOTE: mounting Drive this way assumes a Colab runtime with access to the data folder.
from google.colab import drive
drive.mount('/content/drive/')

st.write("Loading documents")
data_dir = '/content/drive/My Drive/carbon_data'
#data_dir = 'https://drive.google.com/drive/folders/1sSZGhGzXw6oqC8sxKtPwIuaDvx_PfMlh'
#data_dir = '1sSZGhGzXw6oqC8sxKtPwIuaDvx_PfMlh'
texts = load_chunk(data_dir)

st.write("Creating vector store")
vectordb = create_vectorstore(text_embeddings, texts)

user_question = st.text_input(
    "Enter Your Question : ",
    placeholder="Cyanobacteria can perform photosynthesis, are they considered plants?",
)

#query = f"Can I develop a project whose purpose is to increase biodiversity? if so, how could biodiversity result in carbon credits?"
query = user_question
# Only run the chain once the user has entered a question.
if query:
    results = generate_context(llm_model, vectordb, query, 3)
    postprocess_response(results)
    postprocess_response_in_app(results)