Spaces:

Rivalcoder
/

Issurance_Agent_Rag

Running

App Files Community

Rivalcoder committed 4 days ago

Commit

7d0e6b0

1 Parent(s): afd28fa

[Edit] Update of Image Data Handling

Browse files

Files changed (4) hide show

Dockerfile +7 -0
llm.py +8 -0
pdf_parser.py +69 -33
requirements.txt +2 -0

Dockerfile CHANGED Viewed

@@ -5,6 +5,13 @@ WORKDIR /app
 # Install system dependencies
 RUN apt-get update && apt-get install -y \
     build-essential \
     && rm -rf /var/lib/apt/lists/*
 # Create a non-root user

 # Install system dependencies
 RUN apt-get update && apt-get install -y \
     build-essential \
+    tesseract-ocr \
+    libglib2.0-0 \
+    libsm6 \
+    libxext6 \
+    libxrender-dev \
+    poppler-utils \
+    && apt-get clean \
     && rm -rf /var/lib/apt/lists/*
 # Create a non-root user

llm.py CHANGED Viewed

@@ -30,6 +30,9 @@ You are an expert insurance assistant generating formal yet user-facing answers
 - Limit each answer to 2–3 sentences, and do not repeat unnecessary information.
 - If a question can be answered with a simple "Yes", "No", "Can apply", or "Cannot apply", then begin the answer with that phrase, followed by a short supporting Statement In Natural Human Like response.So Give A Good Answer For The Question With Correct Information.
 - Avoid giving  theory Based Long Long answers Try to Give Short Good Reasonable Answers.
 🛑 DO NOT:
 - Use words like "context", "document", or "text".
@@ -37,12 +40,17 @@ You are an expert insurance assistant generating formal yet user-facing answers
 - Say "helpful", "available", "allowed", "indemnified", "excluded", etc.
 - Use overly robotic passive constructions like "shall be indemnified".
 - Dont Give In Message Like "Based On The Context "Or "Nothing Refered In The context" Like That Dont Give In Response Try To Give Answer For The Question Alone
 ✅ DO:
 - Write in clean, informative language.
 - Give complete answers in 2–3 sentences maximum.
 📤 OUTPUT FORMAT (strict):

 - Limit each answer to 2–3 sentences, and do not repeat unnecessary information.
 - If a question can be answered with a simple "Yes", "No", "Can apply", or "Cannot apply", then begin the answer with that phrase, followed by a short supporting Statement In Natural Human Like response.So Give A Good Answer For The Question With Correct Information.
 - Avoid giving  theory Based Long Long answers Try to Give Short Good Reasonable Answers.
+- Dont Give Long theory Like Response Very Large Response Just Give Short And Good Response For The Question.
+- If the question is general (math, code, tech, etc.) and No Matches With Context, answer normally without referencing the document.
+- Avoid Saying  “Not found” or “Out of scope” For The Answer of The Question Try to Give Basic General Response For The Question.
 🛑 DO NOT:
 - Use words like "context", "document", or "text".
 - Say "helpful", "available", "allowed", "indemnified", "excluded", etc.
 - Use overly robotic passive constructions like "shall be indemnified".
 - Dont Give In Message Like "Based On The Context "Or "Nothing Refered In The context" Like That Dont Give In Response Try To Give Answer For The Question Alone
+- Over-explain or give long theory answers.
 ✅ DO:
 - Write in clean, informative language.
 - Give complete answers in 2–3 sentences maximum.
+📝 EXAMPLE ANSWERS:
+- "Yes, the policy covers damage to personal property caused by fire, up to a limit of $50,000."
+- "No, the policy does not cover pre-existing conditions."
+- "The waiting period for coverage to begin is 30 days from the start date of the policy."
 📤 OUTPUT FORMAT (strict):

pdf_parser.py CHANGED Viewed

@@ -2,49 +2,85 @@ import fitz  # PyMuPDF
 import requests
 from io import BytesIO
 from concurrent.futures import ThreadPoolExecutor
 def _extract_text(page):
     text = page.get_text()
     return text.strip() if text and text.strip() else None
 def parse_pdf_from_url_multithreaded(url, max_workers=2, chunk_size=1):
     """
-    Download PDF from URL, extract text in parallel, optionally chunk pages.
     """
-    res = requests.get(url)
-    with fitz.open(stream=BytesIO(res.content), filetype="pdf") as doc:
-        num_pages = len(doc)
-        pages = list(doc)
-        # Step 1: Parallel text extraction
-        with ThreadPoolExecutor(max_workers=max_workers) as executor:
-            texts = list(executor.map(_extract_text, pages))
-        # Step 2: Optional chunking
-        if chunk_size > 1:
-            chunks = []
-            for i in range(0, len(texts), chunk_size):
-                chunk = ' '.join([t for t in texts[i:i+chunk_size] if t])
-                if chunk:
-                    chunks.append(chunk)
-            return chunks
-        # Default: return one chunk per page
-        return [t for t in texts if t]
 def parse_pdf_from_file_multithreaded(file_path, max_workers=2, chunk_size=1):
     """
     Parse a local PDF file, extract text in parallel, optionally chunk pages.
     """
-    with fitz.open(file_path) as doc:
-        num_pages = len(doc)
-        pages = list(doc)
-        # Step 1: Parallel text extraction
-        with ThreadPoolExecutor(max_workers=max_workers) as executor:
-            texts = list(executor.map(_extract_text, pages))
-        # Step 2: Optional chunking
-        if chunk_size > 1:
-            chunks = []
-            for i in range(0, len(texts), chunk_size):
-                chunk = ' '.join([t for t in texts[i:i+chunk_size] if t])
-                if chunk:
-                    chunks.append(chunk)
-            return chunks
-        return [t for t in texts if t]

 import requests
 from io import BytesIO
 from concurrent.futures import ThreadPoolExecutor
+from PIL import Image
+import pytesseract
+import imghdr
 def _extract_text(page):
     text = page.get_text()
     return text.strip() if text and text.strip() else None
+def is_image(content):
+    return imghdr.what(None, h=content) in ["jpeg", "png", "bmp", "gif", "tiff", "webp"]
+def extract_text_from_image_bytes(image_bytes):
+    image = Image.open(BytesIO(image_bytes))
+    return pytesseract.image_to_string(image).strip()
 def parse_pdf_from_url_multithreaded(url, max_workers=2, chunk_size=1):
     """
+    Download document (PDF or Image) from URL, extract text accordingly.
+    Gracefully return fallback message if unsupported or failed.
     """
+    try:
+        res = requests.get(url)
+        content = res.content
+        content_type = res.headers.get("content-type", "").lower()
+    except Exception as e:
+        print(f"❌ Failed to download: {str(e)}")
+        return [f"No data found in this document (download error)"]
+    # Check for unsupported content
+    if "zip" in content_type or url.endswith(".zip"):
+        return ["No data found in this document (zip)"]
+    if "octet-stream" in content_type or url.endswith(".bin"):
+        return ["No data found in this document (bin)"]
+    # OCR for image files
+    if "image" in content_type or is_image(content):
+        print("📷 Detected image file. Using OCR...")
+        try:
+            text = extract_text_from_image_bytes(content)
+            return [text] if text else ["No data found in this document (image empty)"]
+        except Exception as e:
+            print(f"❌ OCR failed: {str(e)}")
+            return [f"No data found in this document (image/OCR error)"]
+    # Try PDF fallback
+    try:
+        with fitz.open(stream=BytesIO(content), filetype="pdf") as doc:
+            pages = list(doc)
+            with ThreadPoolExecutor(max_workers=max_workers) as executor:
+                texts = list(executor.map(_extract_text, pages))
+            if chunk_size > 1:
+                chunks = []
+                for i in range(0, len(texts), chunk_size):
+                    chunk = ' '.join([t for t in texts[i:i+chunk_size] if t])
+                    if chunk:
+                        chunks.append(chunk)
+                return chunks if chunks else ["No data found in this document (empty PDF)"]
+            return [t for t in texts if t] or ["No data found in this document (empty PDF)"]
+    except Exception as e:
+        print(f"❌ Failed to parse as PDF: {str(e)}")
+        return [f"No data found in this document (not PDF or corrupted)"]
 def parse_pdf_from_file_multithreaded(file_path, max_workers=2, chunk_size=1):
     """
     Parse a local PDF file, extract text in parallel, optionally chunk pages.
     """
+    try:
+        with fitz.open(file_path) as doc:
+            pages = list(doc)
+            with ThreadPoolExecutor(max_workers=max_workers) as executor:
+                texts = list(executor.map(_extract_text, pages))
+            if chunk_size > 1:
+                chunks = []
+                for i in range(0, len(texts), chunk_size):
+                    chunk = ' '.join([t for t in texts[i:i+chunk_size] if t])
+                    if chunk:
+                        chunks.append(chunk)
+                return chunks if chunks else ["No data found in this document (local PDF empty)"]
+            return [t for t in texts if t] or ["No data found in this document (local PDF empty)"]
+    except Exception as e:
+        print(f"❌ Failed to open local file: {str(e)}")
+        return [f"No data found in this document (local file error)"]

requirements.txt CHANGED Viewed

@@ -7,4 +7,6 @@ PyMuPDF
 python-dotenv
 tf-keras
 google-generativeai

 python-dotenv
 tf-keras
 google-generativeai
+pytesseract
+Pillow