Spaces:

bacancydataprophets
/

MeDocChat

Runtime error

App Files Files Community

akash015 committed on Jun 27, 2024

Commit

6e0b1c1

verified ·

1 Parent(s): 3be45b9

Update app.py

Browse files

Files changed (1) hide show

app.py +184 -117

app.py CHANGED Viewed

@@ -1,115 +1,202 @@
-import re
-import PyPDF2
-from langchain_community.embeddings import OllamaEmbeddings
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain_community.vectorstores import Chroma
-from langchain.chains import ConversationalRetrievalChain
-from langchain_community.chat_models import ChatOllama
-from langchain_groq import ChatGroq
-from langchain.memory import ChatMessageHistory, ConversationBufferMemory
-import chainlit as cl
-from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer
-import logging
-import pypandoc
-import pdfkit
-from paddleocr import PaddleOCR
-import fitz
-import asyncio
-from langchain_nomic.embeddings import NomicEmbeddings
-llm_groq = ChatGroq(
-            model_name='llama3-70b-8192'
-    )
-# Initialize anonymizer
-anonymizer = PresidioReversibleAnonymizer(analyzed_fields=['PERSON', 'EMAIL_ADDRESS', 'PHONE_NUMBER', 'IBAN_CODE', 'CREDIT_CARD', 'CRYPTO', 'IP_ADDRESS', 'LOCATION', 'DATE_TIME', 'NRP', 'MEDICAL_LICENSE', 'URL'], faker_seed=18)
-def extract_text_from_pdf(file_path):
-    pdf = PyPDF2.PdfReader(file_path)
-    pdf_text = ""
-    for page in pdf.pages:
-        pdf_text += page.extract_text()
-    return pdf_text
-def has_sufficient_selectable_text(page, threshold=50):
-    text = page.extract_text()
-    if len(text.strip()) > threshold:
-        return True
-    return False
-async def get_text(file_path):
-    text = ""
-    try:
-        logging.info("Starting OCR process for file: %s", file_path)
-        extension = file_path.split(".")[-1].lower()
-        allowed_extension = ["jpg", "jpeg", "png", "pdf", "docx"]
-        if extension not in allowed_extension:
-            error = "Not a valid File. Allowed Format are jpg, jpeg, png, pdf, docx"
-            logging.error(error)
-            return {"error": error}
-        if extension == "docx":
-            file_path = convert_docx_to_pdf(file_path)
-        ocr = PaddleOCR(use_angle_cls=True, lang='en')
-        result = ocr.ocr(file_path, cls=True)
-        for idx in range(len(result)):
-            res = result[idx]
-            for line in res:
-                text += line[1][0] + " "
-        logging.info("OCR process completed successfully for file: %s", file_path)
-    except Exception as e:
-        logging.error("Error occurred during OCR process for file %s: %s", file_path, e)
-        text = "Error occurred during OCR process."
-    logging.info("Extracted text: %s", text)
-    return text
-def convert_docx_to_pdf(input_path):
-    html_path = input_path.replace('.docx', '.html')
-    output_path = ".".join(input_path.split(".")[:-1]) + ".pdf"
-    pypandoc.convert_file(input_path, 'html', outputfile=html_path)
-    pdfkit.from_file(html_path, output_path)
-    logging.info("DOCX Format Handled")
-    return output_path
-async def extract_text_from_mixed_pdf(file_path):
-    pdf = PyPDF2.PdfReader(file_path)
-    ocr = PaddleOCR(use_angle_cls=True, lang='en')
-    pdf_text = ""
-    for i, page in enumerate(pdf.pages):
-        text = page.extract_text()
-        if not has_sufficient_selectable_text(page):
-            logging.info(f"Page {i+1} has insufficient selectable text, performing OCR.")
-            pdf_document = fitz.open(file_path)
-            pdf_page = pdf_document.load_page(i)
-            pix = pdf_page.get_pixmap()
-            image_path = f"page_{i+1}.png"
-            pix.save(image_path)
-            result = ocr.ocr(image_path, cls=True)
-            for idx in range(len(result)):
-                res = result[idx]
-                for line in res:
-                    text += line[1][0] + " "
-        pdf_text += text
-    return pdf_text
 @cl.on_chat_start
 async def on_chat_start():
-    files = None # Initialize variable to store uploaded files
     # Wait for the user to upload a file
     while files is None:
         files = await cl.AskFileMessage(
             content="Please upload a pdf file to begin!",
-            # accept=["application/pdf"],
             accept=["application/pdf", "image/jpeg", "image/png", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"],
             max_size_mb=100,
             timeout=180,
         ).send()
-    file = files[0] # Get the first uploaded file
     # Inform the user that processing has started
     msg = cl.Message(content=f"Processing `{file.name}`...")
     await msg.send()
@@ -130,7 +217,6 @@ async def on_chat_start():
     docsearch = await cl.make_async(Chroma.from_texts)(
         [anonymized_text], embeddings, metadatas=[{"source": "0-pl"}]
     )
-    # }
     # Initialize message history for conversation
     message_history = ChatMessageHistory()
@@ -155,26 +241,7 @@ async def on_chat_start():
     # Let the user know that the system is ready
     msg.content = f"Processing `{file.name}` done. You can now ask questions!"
     await msg.update()
     # Store the chain in user session
     cl.user_session.set("chain", chain)
-@cl.on_message
-async def main(message: cl.Message):
-    # Retrieve the chain from user session
-    chain = cl.user_session.get("chain")
-    # Callbacks happen asynchronously/parallel
-    cb = cl.AsyncLangchainCallbackHandler()
-    # Call the chain with user's message content
-    res = await chain.ainvoke(message.content, callbacks=[cb])
-    answer = anonymizer.deanonymize(
-        res["answer"]
-    )
-    text_elements = []
-    # Return results
-    await cl.Message(content=answer, elements=text_elements).send()

+# import re
+# import PyPDF2
+# from langchain_community.embeddings import OllamaEmbeddings
+# from langchain.text_splitter import RecursiveCharacterTextSplitter
+# from langchain_community.vectorstores import Chroma
+# from langchain.chains import ConversationalRetrievalChain
+# from langchain_community.chat_models import ChatOllama
+# from langchain_groq import ChatGroq
+# from langchain.memory import ChatMessageHistory, ConversationBufferMemory
+# import chainlit as cl
+# from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer
+# import logging
+# import pypandoc
+# import pdfkit
+# from paddleocr import PaddleOCR
+# import fitz
+# import asyncio
+# from langchain_nomic.embeddings import NomicEmbeddings
+# llm_groq = ChatGroq(
+#             model_name='llama3-70b-8192'
+#     )
+# # Initialize anonymizer
+# anonymizer = PresidioReversibleAnonymizer(analyzed_fields=['PERSON', 'EMAIL_ADDRESS', 'PHONE_NUMBER', 'IBAN_CODE', 'CREDIT_CARD', 'CRYPTO', 'IP_ADDRESS', 'LOCATION', 'DATE_TIME', 'NRP', 'MEDICAL_LICENSE', 'URL'], faker_seed=18)
+# def extract_text_from_pdf(file_path):
+#     pdf = PyPDF2.PdfReader(file_path)
+#     pdf_text = ""
+#     for page in pdf.pages:
+#         pdf_text += page.extract_text()
+#     return pdf_text
+# def has_sufficient_selectable_text(page, threshold=50):
+#     text = page.extract_text()
+#     if len(text.strip()) > threshold:
+#         return True
+#     return False
+# async def get_text(file_path):
+#     text = ""
+#     try:
+#         logging.info("Starting OCR process for file: %s", file_path)
+#         extension = file_path.split(".")[-1].lower()
+#         allowed_extension = ["jpg", "jpeg", "png", "pdf", "docx"]
+#         if extension not in allowed_extension:
+#             error = "Not a valid File. Allowed Format are jpg, jpeg, png, pdf, docx"
+#             logging.error(error)
+#             return {"error": error}
+#         if extension == "docx":
+#             file_path = convert_docx_to_pdf(file_path)
+#         ocr = PaddleOCR(use_angle_cls=True, lang='en')
+#         result = ocr.ocr(file_path, cls=True)
+#         for idx in range(len(result)):
+#             res = result[idx]
+#             for line in res:
+#                 text += line[1][0] + " "
+#         logging.info("OCR process completed successfully for file: %s", file_path)
+#     except Exception as e:
+#         logging.error("Error occurred during OCR process for file %s: %s", file_path, e)
+#         text = "Error occurred during OCR process."
+#     logging.info("Extracted text: %s", text)
+#     return text
+# def convert_docx_to_pdf(input_path):
+#     html_path = input_path.replace('.docx', '.html')
+#     output_path = ".".join(input_path.split(".")[:-1]) + ".pdf"
+#     pypandoc.convert_file(input_path, 'html', outputfile=html_path)
+#     pdfkit.from_file(html_path, output_path)
+#     logging.info("DOCX Format Handled")
+#     return output_path
+# async def extract_text_from_mixed_pdf(file_path):
+#     pdf = PyPDF2.PdfReader(file_path)
+#     ocr = PaddleOCR(use_angle_cls=True, lang='en')
+#     pdf_text = ""
+#     for i, page in enumerate(pdf.pages):
+#         text = page.extract_text()
+#         if not has_sufficient_selectable_text(page):
+#             logging.info(f"Page {i+1} has insufficient selectable text, performing OCR.")
+#             pdf_document = fitz.open(file_path)
+#             pdf_page = pdf_document.load_page(i)
+#             pix = pdf_page.get_pixmap()
+#             image_path = f"page_{i+1}.png"
+#             pix.save(image_path)
+#             result = ocr.ocr(image_path, cls=True)
+#             for idx in range(len(result)):
+#                 res = result[idx]
+#                 for line in res:
+#                     text += line[1][0] + " "
+#         pdf_text += text
+#     return pdf_text
+# @cl.on_chat_start
+# async def on_chat_start():
+#     files = None # Initialize variable to store uploaded files
+#     # Wait for the user to upload a file
+#     while files is None:
+#         files = await cl.AskFileMessage(
+#             content="Please upload a pdf file to begin!",
+#             # accept=["application/pdf"],
+#             accept=["application/pdf", "image/jpeg", "image/png", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"],
+#             max_size_mb=100,
+#             timeout=180,
+#         ).send()
+#     file = files[0] # Get the first uploaded file
+#     # Inform the user that processing has started
+#     msg = cl.Message(content=f"Processing `{file.name}`...")
+#     await msg.send()
+#     # Extract text from PDF, checking for selectable and handwritten text
+#     if file.name.endswith('.pdf'):
+#         pdf_text = await extract_text_from_mixed_pdf(file.path)
+#     else:
+#         pdf_text = await get_text(file.path)
+#     # Anonymize the text
+#     anonymized_text = anonymizer.anonymize(
+#         pdf_text
+#     )
+#     embeddings = NomicEmbeddings(model="nomic-embed-text-v1.5")
+#     docsearch = await cl.make_async(Chroma.from_texts)(
+#         [anonymized_text], embeddings, metadatas=[{"source": "0-pl"}]
+#     )
+#     # }
+#     # Initialize message history for conversation
+#     message_history = ChatMessageHistory()
+#     # Memory for conversational context
+#     memory = ConversationBufferMemory(
+#         memory_key="chat_history",
+#         output_key="answer",
+#         chat_memory=message_history,
+#         return_messages=True,
+#     )
+#     # Create a chain that uses the Chroma vector store
+#     chain = ConversationalRetrievalChain.from_llm(
+#         llm = llm_groq,
+#         chain_type="stuff",
+#         retriever=docsearch.as_retriever(),
+#         memory=memory,
+#         return_source_documents=True,
+#     )
+#     # Let the user know that the system is ready
+#     msg.content = f"Processing `{file.name}` done. You can now ask questions!"
+#     await msg.update()
+#     # Store the chain in user session
+#     cl.user_session.set("chain", chain)
+# @cl.on_message
+# async def main(message: cl.Message):
+#     # Retrieve the chain from user session
+#     chain = cl.user_session.get("chain")
+#     # Callbacks happen asynchronously/parallel
+#     cb = cl.AsyncLangchainCallbackHandler()
+#     # Call the chain with user's message content
+#     res = await chain.ainvoke(message.content, callbacks=[cb])
+#     answer = anonymizer.deanonymize(
+#         res["answer"]
+#     )
+#     text_elements = []
+#     # Return results
+#     await cl.Message(content=answer, elements=text_elements).send()
+# v2:
 @cl.on_chat_start
 async def on_chat_start():
+    files = None  # Initialize variable to store uploaded files
     # Wait for the user to upload a file
     while files is None:
         files = await cl.AskFileMessage(
             content="Please upload a pdf file to begin!",
             accept=["application/pdf", "image/jpeg", "image/png", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"],
             max_size_mb=100,
             timeout=180,
         ).send()
+    file = files[0]  # Get the first uploaded file
     # Inform the user that processing has started
     msg = cl.Message(content=f"Processing `{file.name}`...")
     await msg.send()
     docsearch = await cl.make_async(Chroma.from_texts)(
         [anonymized_text], embeddings, metadatas=[{"source": "0-pl"}]
     )
     # Initialize message history for conversation
     message_history = ChatMessageHistory()
     # Let the user know that the system is ready
     msg.content = f"Processing `{file.name}` done. You can now ask questions!"
     await msg.update()
     # Store the chain in user session
     cl.user_session.set("chain", chain)