Spaces:
Running
Running
Rivalcoder
committed on
Commit
·
7d0e6b0
1
Parent(s):
afd28fa
[Edit] Update of Image Data Handling
Browse files
- Dockerfile +7 -0
- llm.py +8 -0
- pdf_parser.py +69 -33
- requirements.txt +2 -0
Dockerfile
CHANGED
@@ -5,6 +5,13 @@ WORKDIR /app
|
|
5 |
# Install system dependencies
|
6 |
RUN apt-get update && apt-get install -y \
|
7 |
build-essential \
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
&& rm -rf /var/lib/apt/lists/*
|
9 |
|
10 |
# Create a non-root user
|
|
|
5 |
# Install system dependencies
|
6 |
RUN apt-get update && apt-get install -y \
|
7 |
build-essential \
|
8 |
+
tesseract-ocr \
|
9 |
+
libglib2.0-0 \
|
10 |
+
libsm6 \
|
11 |
+
libxext6 \
|
12 |
+
libxrender-dev \
|
13 |
+
poppler-utils \
|
14 |
+
&& apt-get clean \
|
15 |
&& rm -rf /var/lib/apt/lists/*
|
16 |
|
17 |
# Create a non-root user
|
llm.py
CHANGED
@@ -30,6 +30,9 @@ You are an expert insurance assistant generating formal yet user-facing answers
|
|
30 |
- Limit each answer to 2–3 sentences, and do not repeat unnecessary information.
|
31 |
- If a question can be answered with a simple "Yes", "No", "Can apply", or "Cannot apply", then begin the answer with that phrase, followed by a short supporting Statement In Natural Human Like response.So Give A Good Answer For The Question With Correct Information.
|
32 |
- Avoid giving theory Based Long Long answers Try to Give Short Good Reasonable Answers.
|
|
|
|
|
|
|
33 |
|
34 |
🛑 DO NOT:
|
35 |
- Use words like "context", "document", or "text".
|
@@ -37,12 +40,17 @@ You are an expert insurance assistant generating formal yet user-facing answers
|
|
37 |
- Say "helpful", "available", "allowed", "indemnified", "excluded", etc.
|
38 |
- Use overly robotic passive constructions like "shall be indemnified".
|
39 |
- Dont Give In Message Like "Based On The Context "Or "Nothing Refered In The context" Like That Dont Give In Response Try To Give Answer For The Question Alone
|
|
|
40 |
|
41 |
✅ DO:
|
42 |
- Write in clean, informative language.
|
43 |
- Give complete answers in 2–3 sentences maximum.
|
44 |
|
45 |
|
|
|
|
|
|
|
|
|
46 |
|
47 |
|
48 |
📤 OUTPUT FORMAT (strict):
|
|
|
30 |
- Limit each answer to 2–3 sentences, and do not repeat unnecessary information.
|
31 |
- If a question can be answered with a simple "Yes", "No", "Can apply", or "Cannot apply", then begin the answer with that phrase, followed by a short supporting Statement In Natural Human Like response.So Give A Good Answer For The Question With Correct Information.
|
32 |
- Avoid giving theory Based Long Long answers Try to Give Short Good Reasonable Answers.
|
33 |
+
- Dont Give Long theory Like Response Very Large Response Just Give Short And Good Response For The Question.
|
34 |
+
- If the question is general (math, code, tech, etc.) and No Matches With Context, answer normally without referencing the document.
|
35 |
+
- Avoid Saying “Not found” or “Out of scope” For The Answer of The Question Try to Give Basic General Response For The Question.
|
36 |
|
37 |
🛑 DO NOT:
|
38 |
- Use words like "context", "document", or "text".
|
|
|
40 |
- Say "helpful", "available", "allowed", "indemnified", "excluded", etc.
|
41 |
- Use overly robotic passive constructions like "shall be indemnified".
|
42 |
- Dont Give In Message Like "Based On The Context "Or "Nothing Refered In The context" Like That Dont Give In Response Try To Give Answer For The Question Alone
|
43 |
+
- Over-explain or give long theory answers.
|
44 |
|
45 |
✅ DO:
|
46 |
- Write in clean, informative language.
|
47 |
- Give complete answers in 2–3 sentences maximum.
|
48 |
|
49 |
|
50 |
+
📝 EXAMPLE ANSWERS:
|
51 |
+
- "Yes, the policy covers damage to personal property caused by fire, up to a limit of $50,000."
|
52 |
+
- "No, the policy does not cover pre-existing conditions."
|
53 |
+
- "The waiting period for coverage to begin is 30 days from the start date of the policy."
|
54 |
|
55 |
|
56 |
📤 OUTPUT FORMAT (strict):
|
pdf_parser.py
CHANGED
@@ -2,49 +2,85 @@ import fitz # PyMuPDF
|
|
2 |
import requests
|
3 |
from io import BytesIO
|
4 |
from concurrent.futures import ThreadPoolExecutor
|
|
|
|
|
|
|
5 |
|
6 |
def _extract_text(page):
|
7 |
text = page.get_text()
|
8 |
return text.strip() if text and text.strip() else None
|
9 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
def parse_pdf_from_url_multithreaded(url, max_workers=2, chunk_size=1):
|
11 |
"""
|
12 |
-
Download PDF from URL, extract text
|
|
|
13 |
"""
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
31 |
|
32 |
def parse_pdf_from_file_multithreaded(file_path, max_workers=2, chunk_size=1):
|
33 |
"""
|
34 |
Parse a local PDF file, extract text in parallel, optionally chunk pages.
|
35 |
"""
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
if
|
48 |
-
|
49 |
-
|
50 |
-
|
|
|
|
2 |
import requests
|
3 |
from io import BytesIO
|
4 |
from concurrent.futures import ThreadPoolExecutor
|
5 |
+
from PIL import Image
|
6 |
+
import pytesseract
|
7 |
+
import imghdr
|
8 |
|
9 |
def _extract_text(page):
|
10 |
text = page.get_text()
|
11 |
return text.strip() if text and text.strip() else None
|
12 |
|
13 |
+
def is_image(content):
|
14 |
+
return imghdr.what(None, h=content) in ["jpeg", "png", "bmp", "gif", "tiff", "webp"]
|
15 |
+
|
16 |
+
def extract_text_from_image_bytes(image_bytes):
|
17 |
+
image = Image.open(BytesIO(image_bytes))
|
18 |
+
return pytesseract.image_to_string(image).strip()
|
19 |
+
|
20 |
def parse_pdf_from_url_multithreaded(url, max_workers=2, chunk_size=1):
|
21 |
"""
|
22 |
+
Download document (PDF or Image) from URL, extract text accordingly.
|
23 |
+
Gracefully return fallback message if unsupported or failed.
|
24 |
"""
|
25 |
+
try:
|
26 |
+
res = requests.get(url)
|
27 |
+
content = res.content
|
28 |
+
content_type = res.headers.get("content-type", "").lower()
|
29 |
+
except Exception as e:
|
30 |
+
print(f"❌ Failed to download: {str(e)}")
|
31 |
+
return [f"No data found in this document (download error)"]
|
32 |
+
|
33 |
+
# Check for unsupported content
|
34 |
+
if "zip" in content_type or url.endswith(".zip"):
|
35 |
+
return ["No data found in this document (zip)"]
|
36 |
+
if "octet-stream" in content_type or url.endswith(".bin"):
|
37 |
+
return ["No data found in this document (bin)"]
|
38 |
+
|
39 |
+
# OCR for image files
|
40 |
+
if "image" in content_type or is_image(content):
|
41 |
+
print("📷 Detected image file. Using OCR...")
|
42 |
+
try:
|
43 |
+
text = extract_text_from_image_bytes(content)
|
44 |
+
return [text] if text else ["No data found in this document (image empty)"]
|
45 |
+
except Exception as e:
|
46 |
+
print(f"❌ OCR failed: {str(e)}")
|
47 |
+
return [f"No data found in this document (image/OCR error)"]
|
48 |
+
|
49 |
+
# Try PDF fallback
|
50 |
+
try:
|
51 |
+
with fitz.open(stream=BytesIO(content), filetype="pdf") as doc:
|
52 |
+
pages = list(doc)
|
53 |
+
with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
54 |
+
texts = list(executor.map(_extract_text, pages))
|
55 |
+
if chunk_size > 1:
|
56 |
+
chunks = []
|
57 |
+
for i in range(0, len(texts), chunk_size):
|
58 |
+
chunk = ' '.join([t for t in texts[i:i+chunk_size] if t])
|
59 |
+
if chunk:
|
60 |
+
chunks.append(chunk)
|
61 |
+
return chunks if chunks else ["No data found in this document (empty PDF)"]
|
62 |
+
return [t for t in texts if t] or ["No data found in this document (empty PDF)"]
|
63 |
+
except Exception as e:
|
64 |
+
print(f"❌ Failed to parse as PDF: {str(e)}")
|
65 |
+
return [f"No data found in this document (not PDF or corrupted)"]
|
66 |
|
67 |
def parse_pdf_from_file_multithreaded(file_path, max_workers=2, chunk_size=1):
|
68 |
"""
|
69 |
Parse a local PDF file, extract text in parallel, optionally chunk pages.
|
70 |
"""
|
71 |
+
try:
|
72 |
+
with fitz.open(file_path) as doc:
|
73 |
+
pages = list(doc)
|
74 |
+
with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
75 |
+
texts = list(executor.map(_extract_text, pages))
|
76 |
+
if chunk_size > 1:
|
77 |
+
chunks = []
|
78 |
+
for i in range(0, len(texts), chunk_size):
|
79 |
+
chunk = ' '.join([t for t in texts[i:i+chunk_size] if t])
|
80 |
+
if chunk:
|
81 |
+
chunks.append(chunk)
|
82 |
+
return chunks if chunks else ["No data found in this document (local PDF empty)"]
|
83 |
+
return [t for t in texts if t] or ["No data found in this document (local PDF empty)"]
|
84 |
+
except Exception as e:
|
85 |
+
print(f"❌ Failed to open local file: {str(e)}")
|
86 |
+
return [f"No data found in this document (local file error)"]
|
requirements.txt
CHANGED
@@ -7,4 +7,6 @@ PyMuPDF
|
|
7 |
python-dotenv
|
8 |
tf-keras
|
9 |
google-generativeai
|
|
|
|
|
10 |
|
|
|
7 |
python-dotenv
|
8 |
tf-keras
|
9 |
google-generativeai
|
10 |
+
pytesseract
|
11 |
+
Pillow
|
12 |
|