Rivalcoder committed on
Commit
7d0e6b0
·
1 Parent(s): afd28fa

[Edit] Update of Image Data Handling

Browse files
Files changed (4) hide show
  1. Dockerfile +7 -0
  2. llm.py +8 -0
  3. pdf_parser.py +69 -33
  4. requirements.txt +2 -0
Dockerfile CHANGED
@@ -5,6 +5,13 @@ WORKDIR /app
5
  # Install system dependencies
6
  RUN apt-get update && apt-get install -y \
7
  build-essential \
 
 
 
 
 
 
 
8
  && rm -rf /var/lib/apt/lists/*
9
 
10
  # Create a non-root user
 
5
  # Install system dependencies
6
  RUN apt-get update && apt-get install -y \
7
  build-essential \
8
+ tesseract-ocr \
9
+ libglib2.0-0 \
10
+ libsm6 \
11
+ libxext6 \
12
+ libxrender-dev \
13
+ poppler-utils \
14
+ && apt-get clean \
15
  && rm -rf /var/lib/apt/lists/*
16
 
17
  # Create a non-root user
llm.py CHANGED
@@ -30,6 +30,9 @@ You are an expert insurance assistant generating formal yet user-facing answers
30
  - Limit each answer to 2–3 sentences, and do not repeat unnecessary information.
31
  - If a question can be answered with a simple "Yes", "No", "Can apply", or "Cannot apply", then begin the answer with that phrase, followed by a short supporting Statement In Natural Human Like response.So Give A Good Answer For The Question With Correct Information.
32
  - Avoid giving theory Based Long Long answers Try to Give Short Good Reasonable Answers.
 
 
 
33
 
34
  🛑 DO NOT:
35
  - Use words like "context", "document", or "text".
@@ -37,12 +40,17 @@ You are an expert insurance assistant generating formal yet user-facing answers
37
  - Say "helpful", "available", "allowed", "indemnified", "excluded", etc.
38
  - Use overly robotic passive constructions like "shall be indemnified".
39
  - Dont Give In Message Like "Based On The Context "Or "Nothing Refered In The context" Like That Dont Give In Response Try To Give Answer For The Question Alone
 
40
 
41
  ✅ DO:
42
  - Write in clean, informative language.
43
  - Give complete answers in 2–3 sentences maximum.
44
 
45
 
 
 
 
 
46
 
47
 
48
  📤 OUTPUT FORMAT (strict):
 
30
  - Limit each answer to 2–3 sentences, and do not repeat unnecessary information.
31
  - If a question can be answered with a simple "Yes", "No", "Can apply", or "Cannot apply", then begin the answer with that phrase, followed by a short supporting Statement In Natural Human Like response.So Give A Good Answer For The Question With Correct Information.
32
  - Avoid giving theory Based Long Long answers Try to Give Short Good Reasonable Answers.
33
+ - Dont Give Long theory Like Response Very Large Response Just Give Short And Good Response For The Question.
34
+ - If the question is general (math, code, tech, etc.) and No Matches With Context, answer normally without referencing the document.
35
+ - Avoid Saying “Not found” or “Out of scope” For The Answer of The Question Try to Give Basic General Response For The Question.
36
 
37
  🛑 DO NOT:
38
  - Use words like "context", "document", or "text".
 
40
  - Say "helpful", "available", "allowed", "indemnified", "excluded", etc.
41
  - Use overly robotic passive constructions like "shall be indemnified".
42
  - Dont Give In Message Like "Based On The Context "Or "Nothing Refered In The context" Like That Dont Give In Response Try To Give Answer For The Question Alone
43
+ - Over-explain or give long theory answers.
44
 
45
  ✅ DO:
46
  - Write in clean, informative language.
47
  - Give complete answers in 2–3 sentences maximum.
48
 
49
 
50
+ 📝 EXAMPLE ANSWERS:
51
+ - "Yes, the policy covers damage to personal property caused by fire, up to a limit of $50,000."
52
+ - "No, the policy does not cover pre-existing conditions."
53
+ - "The waiting period for coverage to begin is 30 days from the start date of the policy."
54
 
55
 
56
  📤 OUTPUT FORMAT (strict):
pdf_parser.py CHANGED
@@ -2,49 +2,85 @@ import fitz # PyMuPDF
2
  import requests
3
  from io import BytesIO
4
  from concurrent.futures import ThreadPoolExecutor
 
 
 
5
 
6
  def _extract_text(page):
7
  text = page.get_text()
8
  return text.strip() if text and text.strip() else None
9
 
 
 
 
 
 
 
 
10
  def parse_pdf_from_url_multithreaded(url, max_workers=2, chunk_size=1):
11
  """
12
- Download PDF from URL, extract text in parallel, optionally chunk pages.
 
13
  """
14
- res = requests.get(url)
15
- with fitz.open(stream=BytesIO(res.content), filetype="pdf") as doc:
16
- num_pages = len(doc)
17
- pages = list(doc)
18
- # Step 1: Parallel text extraction
19
- with ThreadPoolExecutor(max_workers=max_workers) as executor:
20
- texts = list(executor.map(_extract_text, pages))
21
- # Step 2: Optional chunking
22
- if chunk_size > 1:
23
- chunks = []
24
- for i in range(0, len(texts), chunk_size):
25
- chunk = ' '.join([t for t in texts[i:i+chunk_size] if t])
26
- if chunk:
27
- chunks.append(chunk)
28
- return chunks
29
- # Default: return one chunk per page
30
- return [t for t in texts if t]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
 
32
  def parse_pdf_from_file_multithreaded(file_path, max_workers=2, chunk_size=1):
33
  """
34
  Parse a local PDF file, extract text in parallel, optionally chunk pages.
35
  """
36
- with fitz.open(file_path) as doc:
37
- num_pages = len(doc)
38
- pages = list(doc)
39
- # Step 1: Parallel text extraction
40
- with ThreadPoolExecutor(max_workers=max_workers) as executor:
41
- texts = list(executor.map(_extract_text, pages))
42
- # Step 2: Optional chunking
43
- if chunk_size > 1:
44
- chunks = []
45
- for i in range(0, len(texts), chunk_size):
46
- chunk = ' '.join([t for t in texts[i:i+chunk_size] if t])
47
- if chunk:
48
- chunks.append(chunk)
49
- return chunks
50
- return [t for t in texts if t]
 
 
2
  import requests
3
  from io import BytesIO
4
  from concurrent.futures import ThreadPoolExecutor
5
+ from PIL import Image
6
+ import pytesseract
7
+ import imghdr
8
 
9
def _extract_text(page):
    """Return the stripped text of a single page, or ``None`` if it has none.

    Args:
        page: Any object exposing ``get_text()``; in this module it is a
            PyMuPDF page.

    Returns:
        The text returned by ``page.get_text()`` with surrounding whitespace
        removed, or ``None`` when that text is falsy or whitespace-only.
    """
    text = page.get_text()
    if not text:
        return None
    # Strip once and reuse the result rather than stripping twice.
    stripped = text.strip()
    return stripped or None
12
 
13
def is_image(content):
    """Report whether ``content`` starts with a known raster-image signature.

    The check inspects only the leading bytes, so it does not depend on the
    ``imghdr`` module (deprecated by PEP 594 and removed in Python 3.13).

    Recognised formats: JPEG, PNG, GIF, BMP, TIFF (classic and BigTIFF) and
    WebP.

    Args:
        content: Bytes-like payload to inspect (e.g. an HTTP response body).

    Returns:
        ``True`` if the payload begins with one of the recognised signatures,
        ``False`` otherwise (including for empty or ``None`` input).
    """
    if not content:
        return False

    head = bytes(content[:12])
    prefixes = (
        b"\xff\xd8\xff",         # JPEG (any APPn/DQT marker may follow)
        b"\x89PNG\r\n\x1a\n",    # PNG
        b"GIF87a",               # GIF
        b"GIF89a",
        b"BM",                   # BMP (two-byte magic only; inherently weak)
        b"II*\x00",              # TIFF, little-endian
        b"MM\x00*",              # TIFF, big-endian
        b"II+\x00",              # BigTIFF, little-endian
        b"MM\x00+",              # BigTIFF, big-endian
    )
    if head.startswith(prefixes):
        return True

    # WebP is a RIFF container: "RIFF" <4-byte size> "WEBP".
    return head[:4] == b"RIFF" and head[8:12] == b"WEBP"
15
+
16
def extract_text_from_image_bytes(image_bytes):
    """Run OCR over an encoded image and return the recognised text.

    Args:
        image_bytes: The raw bytes of an image file in a format Pillow can
            open.

    Returns:
        The string produced by ``pytesseract.image_to_string`` with leading
        and trailing whitespace removed (may be empty).

    Raises:
        Whatever ``Image.open`` or ``pytesseract.image_to_string`` raise;
        nothing is caught here, so callers decide how to handle failures.
    """
    # The context manager closes the image (and its underlying buffer) as soon
    # as OCR finishes instead of leaving that to garbage collection.
    with Image.open(BytesIO(image_bytes)) as image:
        return pytesseract.image_to_string(image).strip()
19
+
20
def parse_pdf_from_url_multithreaded(url, max_workers=2, chunk_size=1):
    """Download a document (PDF or image) from ``url`` and return its text.

    Images are sent through OCR; everything else is parsed as a PDF. The
    function never raises for download/parse problems: it prints a diagnostic
    and returns a one-element list holding a
    ``"No data found in this document (...)"`` placeholder instead.

    Args:
        url: Location of the document to fetch.
        max_workers: Retained for backward compatibility and ignored. Pages
            are extracted sequentially because PyMuPDF documents that it does
            not support use from multiple threads.
        chunk_size: When greater than 1, the texts of every ``chunk_size``
            consecutive pages are joined with a single space into one chunk.
            Otherwise one chunk per non-empty page is returned.

    Returns:
        A non-empty list of strings: the extracted text chunks, or a single
        placeholder message describing why nothing could be extracted.
    """
    # Local import keeps this block self-contained (stdlib only).
    from urllib.parse import urlparse

    try:
        # (connect, read) timeouts: without them a stalled server would block
        # the caller indefinitely.
        res = requests.get(url, timeout=(10, 60))
        # Treat HTTP error statuses as a download failure rather than trying
        # to parse an error page as a document.
        res.raise_for_status()
        content = res.content
        content_type = res.headers.get("content-type", "").lower()
    except Exception as e:  # deliberate best-effort boundary
        print(f"❌ Failed to download: {e}")
        return ["No data found in this document (download error)"]

    # Compare against the URL *path* so query strings ("file.zip?sig=...")
    # and upper-case extensions do not defeat the suffix checks.
    url_path = urlparse(url).path.lower()

    # Sniff the payload itself: many servers deliver PDFs and images as
    # "application/octet-stream", so the header alone is not a reliable
    # reason to reject a document.
    looks_like_pdf = b"%PDF-" in content[:1024]
    looks_like_image = "image" in content_type or is_image(content)

    # Reject archive / opaque binary payloads only when the bytes do not
    # identify themselves as something we can actually read.
    if not (looks_like_pdf or looks_like_image):
        if "zip" in content_type or url_path.endswith(".zip"):
            return ["No data found in this document (zip)"]
        if "octet-stream" in content_type or url_path.endswith(".bin"):
            return ["No data found in this document (bin)"]

    # OCR for image files
    if looks_like_image:
        print("📷 Detected image file. Using OCR...")
        try:
            text = extract_text_from_image_bytes(content)
        except Exception as e:  # deliberate best-effort boundary
            print(f"❌ OCR failed: {e}")
            return ["No data found in this document (image/OCR error)"]
        return [text] if text else ["No data found in this document (image empty)"]

    # Anything else is attempted as a PDF.
    try:
        with fitz.open(stream=BytesIO(content), filetype="pdf") as doc:
            # Extract while the document is still open, on this thread only.
            texts = [_extract_text(page) for page in doc]

        if chunk_size > 1:
            # Group by page position first, then drop empty pages inside each
            # group, so chunk boundaries always follow page numbers.
            joined = (
                " ".join(t for t in texts[start:start + chunk_size] if t)
                for start in range(0, len(texts), chunk_size)
            )
            chunks = [chunk for chunk in joined if chunk]
        else:
            chunks = [t for t in texts if t]
        return chunks or ["No data found in this document (empty PDF)"]
    except Exception as e:  # deliberate best-effort boundary
        print(f"❌ Failed to parse as PDF: {e}")
        return ["No data found in this document (not PDF or corrupted)"]
66
 
67
def parse_pdf_from_file_multithreaded(file_path, max_workers=2, chunk_size=1):
    """Parse a local PDF file and return its text, optionally chunked by page.

    The function never raises for open/parse problems: it prints a diagnostic
    and returns a one-element list holding a
    ``"No data found in this document (...)"`` placeholder instead.

    Args:
        file_path: Path of the PDF file to open.
        max_workers: Retained for backward compatibility and ignored. Pages
            are extracted sequentially because PyMuPDF documents that it does
            not support use from multiple threads.
        chunk_size: When greater than 1, the texts of every ``chunk_size``
            consecutive pages are joined with a single space into one chunk.
            Otherwise one chunk per non-empty page is returned.

    Returns:
        A non-empty list of strings: the extracted text chunks, or a single
        placeholder message when the file is empty or cannot be read.
    """
    try:
        with fitz.open(file_path) as doc:
            # Extract while the document is still open, on this thread only.
            texts = [_extract_text(page) for page in doc]

        if chunk_size > 1:
            # Group by page position first, then drop empty pages inside each
            # group, so chunk boundaries always follow page numbers.
            joined = (
                " ".join(t for t in texts[start:start + chunk_size] if t)
                for start in range(0, len(texts), chunk_size)
            )
            chunks = [chunk for chunk in joined if chunk]
        else:
            chunks = [t for t in texts if t]
        return chunks or ["No data found in this document (local PDF empty)"]
    except Exception as e:  # deliberate best-effort boundary
        print(f"❌ Failed to open local file: {e}")
        return ["No data found in this document (local file error)"]
requirements.txt CHANGED
@@ -7,4 +7,6 @@ PyMuPDF
7
  python-dotenv
8
  tf-keras
9
  google-generativeai
 
 
10
 
 
7
  python-dotenv
8
  tf-keras
9
  google-generativeai
10
+ pytesseract
11
+ Pillow
12