import os

import streamlit as st
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
# from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.llms import HuggingFaceHub
from htmlTemplates import css, bot_template, user_template
# from transformers import T5Tokenizer, T5ForConditionalGeneration
# from langchain.callbacks import get_openai_callback

# Load variables from a local .env file before reading the token, so that
# HUGGINGFACE_HUB_TOKEN can come from either .env or the environment.
load_dotenv()
hub_token = os.environ["HUGGINGFACE_HUB_TOKEN"]


def split_pdf(pdf_doc):
    """Splits a PDF document into chunks of pages.

    Args:
        pdf_doc: A path to (or file-like object for) a PDF document.

    Returns:
        A list of chunks, where each chunk is a list of up to 10 PyPDF2 page objects.
    """
    pdf_chunks = [[]]

    # Read the PDF and iterate over its pages.
    pdf_reader = PdfReader(pdf_doc)
    for pdf_page in pdf_reader.pages:
        # Add the page to the current chunk.
        pdf_chunks[-1].append(pdf_page)

        # If the chunk is full, start a new one.
        if len(pdf_chunks[-1]) >= 10:
            pdf_chunks.append([])

    # Drop a trailing empty chunk, if any.
    if not pdf_chunks[-1]:
        pdf_chunks.pop()

    return pdf_chunks


def generate_response(pdf_chunks, llm_model, user_question):
    """Generates a response to a question using chunks of PDF pages and an LLM.

    Args:
        pdf_chunks: A list of chunks, where each chunk is a list of PyPDF2 page objects.
        llm_model: A LangChain LLM (here, HuggingFaceHub).
        user_question: The question to answer.

    Returns:
        The model's answer as a string.
    """
    # Summarize each chunk of pages.
    pdf_summaries = []
    for pdf_chunk in pdf_chunks:
        # Extract the raw text of the pages in this chunk.
        chunk_text = "\n".join(page.extract_text() or "" for page in pdf_chunk)

        # Ask the LLM for a summary of the chunk.
        pdf_summary = llm_model(f"Summarize the following text:\n{chunk_text}")
        pdf_summaries.append(pdf_summary)

    # Answer the question using the chunk summaries as context.
    response = llm_model(
        "Answer the following question using the following summaries:\n"
        + "\n".join(pdf_summaries)
        + f"\n\nQuestion: {user_question}"
    )

    return response


def main():
    st.set_page_config(page_title="Chat with multiple PDFs", page_icon=":books:")
    st.write(css, unsafe_allow_html=True)

    # Load the LLM. Generation parameters such as max_new_tokens are passed via
    # model_kwargs, since HuggingFaceHub does not accept them per call.
    llm_model = HuggingFaceHub(
        repo_id="mistralai/Mistral-7B-v0.1",
        huggingfacehub_api_token=hub_token,
        model_kwargs={"max_new_tokens": 200},
        verbose=True,
    )

    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = None

    st.header("Chat with multiple PDFs :books:")
    user_question = st.text_input("Ask a question about your documents:")

    # If the user asked a question, generate and display a response.
    if user_question:
        # Split the PDF document into chunks of pages.
        pdf_chunks = split_pdf("Geeta.pdf")

        # Generate a response to the question.
        response = generate_response(pdf_chunks, llm_model, user_question)
        st.write(response)


if __name__ == "__main__":
    main()
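
# ---------------------------------------------------------------------------
# Sketch (not called by main): the currently unused imports above
# (CharacterTextSplitter, HuggingFaceEmbeddings, FAISS, ConversationBufferMemory,
# ConversationalRetrievalChain) point at a retrieval-based flow. A minimal
# wiring could look like the following; the chunk sizes and embedding model
# name are assumptions, not part of the original script.
#
# def build_conversation_chain(raw_text, llm_model):
#     splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=200)
#     chunks = splitter.split_text(raw_text)
#     embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
#     vectorstore = FAISS.from_texts(texts=chunks, embedding=embeddings)
#     memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
#     return ConversationalRetrievalChain.from_llm(
#         llm=llm_model,
#         retriever=vectorstore.as_retriever(),
#         memory=memory,
#     )
#
# To run the app locally (assuming this file is saved as app.py and a .env file
# provides HUGGINGFACE_HUB_TOKEN):
#   streamlit run app.py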