WebashalarForML commited on
Commit
b2ae2a5
·
verified ·
1 Parent(s): 5fb93c3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +127 -104
app.py CHANGED
@@ -31,7 +31,7 @@ logger = logging.getLogger("health-agent")
31
  # --- Environment & config -------------------------------------------------
32
  load_dotenv()
33
  from pathlib import Path
34
- REPORTS_ROOT = Path(os.getenv("REPORTS_ROOT", "app/reports")).resolve() # e.g. /app/reports/<patient_id>/<file.pdf>
35
  SSRI_FILE = Path(os.getenv("SSRI_FILE", "app/medicationCategories/SSRI_list.txt")).resolve()
36
  MISC_FILE = Path(os.getenv("MISC_FILE", "app/medicationCategories/MISC_list.txt")).resolve()
37
  GROQ_API_KEY = os.getenv("GROQ_API_KEY", None)
@@ -59,64 +59,59 @@ Fix missing quotes, trailing commas, unescaped newlines, stray assistant labels,
59
 
60
  # -------------------- JSON extraction / sanitizer ---------------------------
61
  def extract_json_from_llm_response(raw_response: str) -> dict:
62
- """
63
- Try extracting a JSON object from raw LLM text. Performs common cleanups seen in LLM outputs.
64
- Raises JSONDecodeError if parsing still fails.
65
- """
66
- # --- 1) Pull out the JSON code-block if present ---
67
- md = re.search(r"```(?:json)?\s*([\s\S]*?)\s*```", raw_response)
68
- json_string = md.group(1).strip() if md else raw_response
69
-
70
- # --- 2) Trim to the outermost { … } so we drop any prefix/suffix junk ---
71
- first, last = json_string.find('{'), json_string.rfind('}')
72
- if 0 <= first < last:
73
- json_string = json_string[first:last+1]
74
-
75
- # --- 3) PRE-CLEANUP: remove rogue assistant labels, fix boolean quotes ---
76
- json_string = re.sub(r'\b\w+\s*{', '{', json_string)
77
- json_string = re.sub(r'"assistant"\s*:', '', json_string)
78
- json_string = re.sub(r'\b(false|true)"', r'\1', json_string)
79
-
80
- # --- 4) Escape embedded quotes in long string fields (best-effort) ---
81
- def _esc(m):
82
- prefix, body = m.group(1), m.group(2)
83
- return prefix + body.replace('"', r'\"')
84
- json_string = re.sub(
85
- r'("logic"\s*:\s*")([\s\S]+?)(?=",\s*"[A-Za-z_]\w*"\s*:\s*)',
86
- _esc,
87
- json_string
88
- )
89
-
90
- # --- 5) Remove trailing commas before } or ] ---
91
- json_string = re.sub(r',\s*(?=[}\],])', '', json_string)
92
- json_string = re.sub(r',\s*,', ',', json_string)
93
-
94
- # --- 6) Balance braces if obvious excess ---
95
- ob, cb = json_string.count('{'), json_string.count('}')
96
- if cb > ob:
97
- excess = cb - ob
98
- json_string = json_string.rstrip()[:-excess]
99
-
100
- # --- 7) Escape literal newlines inside strings so json.loads can parse ---
101
- def _escape_newlines_in_strings(s: str) -> str:
102
- return re.sub(
103
- r'"((?:[^"\\]|\\.)*?)"',
104
- lambda m: '"' + m.group(1).replace('\n', '\\n').replace('\r', '\\r') + '"',
105
- s,
106
- flags=re.DOTALL
107
  )
108
- json_string = _escape_newlines_in_strings(json_string)
109
 
110
- # Final parse
111
- return json.loads(json_string)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
 
113
  # -------------------- Utility: Bloatectomy wrapper ------------------------
114
  def clean_notes_with_bloatectomy(text: str, style: str = "remov") -> str:
115
- """
116
- Uses the bloatectomy class to remove duplicates.
117
- style: 'highlight'|'bold'|'remov' ; we use 'remov' to delete duplicates.
118
- Returns cleaned text (single string).
119
- """
120
  try:
121
  b = bloatectomy(text, style=style, output="html")
122
  tokens = getattr(b, "tokens", None)
@@ -129,55 +124,67 @@ def clean_notes_with_bloatectomy(text: str, style: str = "remov") -> str:
129
 
130
  # --------------- Utility: medication extraction (adapted) -----------------
131
  def readDrugs_from_file(path: Path):
132
- if not path.exists():
 
 
 
 
 
 
 
 
 
133
  return {}, []
134
- txt = path.read_text(encoding="utf-8", errors="ignore")
135
- generics = re.findall(r"^(.*?)\|", txt, re.MULTILINE)
136
- generics = [g.lower() for g in generics if g]
137
- lines = [ln.strip().lower() for ln in txt.splitlines() if ln.strip()]
138
- return dict(zip(generics, lines)), generics
139
 
140
  def addToDrugs_line(line: str, drugs_flags: List[int], listing: Dict[str,str], genList: List[str]) -> List[int]:
141
- gen_index = {g:i for i,g in enumerate(genList)}
142
- for generic, pattern_line in listing.items():
143
- try:
144
- if re.search(pattern_line, line, re.I):
145
- idx = gen_index.get(generic)
146
- if idx is not None:
147
- drugs_flags[idx] = 1
148
- except re.error:
149
- continue
150
- return drugs_flags
 
 
 
 
151
 
152
  def extract_medications_from_text(text: str) -> List[str]:
153
- ssri_map, ssri_generics = readDrugs_from_file(SSRI_FILE)
154
- misc_map, misc_generics = readDrugs_from_file(MISC_FILE)
155
- combined_map = {**ssri_map, **misc_map}
156
- combined_generics = []
157
- if ssri_generics:
158
- combined_generics.extend(ssri_generics)
159
- if misc_generics:
160
- combined_generics.extend(misc_generics)
161
-
162
- flags = [0]* len(combined_generics)
163
- meds_found = set()
164
- for ln in text.splitlines():
165
- ln = ln.strip()
166
- if not ln:
167
- continue
168
- if combined_map:
169
- flags = addToDrugs_line(ln, flags, combined_map, combined_generics)
170
- m = re.search(r"\b(Rx|Drug|Medication|Prescribed|Tablet)\s*[:\-]?\s*([A-Za-z0-9\-\s/\.]+)", ln, re.I)
171
- if m:
172
- meds_found.add(m.group(2).strip())
173
- m2 = re.findall(r"\b([A-Z][a-z0-9\-]{2,}\s*(?:[0-9]{1,4}\s*(?:mg|mcg|g|IU))?)", ln)
174
- for s in m2:
175
- if re.search(r"\b(mg|mcg|g|IU)\b", s, re.I):
176
- meds_found.add(s.strip())
177
- for i, f in enumerate(flags):
178
- if f == 1:
179
- meds_found.add(combined_generics[i])
180
- return list(meds_found)
 
 
 
 
181
 
182
  # -------------------- Node prompts --------------------------
183
  PATIENT_NODE_PROMPT = """
@@ -393,6 +400,7 @@ graph_builder.add_edge("condition_loop", END)
393
 
394
  graph = graph_builder.compile()
395
 
 
396
  # -------------------- Flask app & endpoints -------------------------------
397
  BASE_DIR = Path(__file__).resolve().parent
398
  static_folder = BASE_DIR / "static"
@@ -404,12 +412,18 @@ CORS(app) # dev convenience; lock down in production
404
  def serve_frontend():
405
  try:
406
  return app.send_static_file("frontend.html")
407
- except Exception:
 
408
  return "<h3>frontend.html not found in static/ — drop your frontend.html there.</h3>", 404
409
 
410
  @app.route("/process_reports", methods=["POST"])
411
  def process_reports():
412
- data = request.get_json(force=True)
 
 
 
 
 
413
  patient_id = data.get("patient_id")
414
  filenames = data.get("filenames", [])
415
  extra_patient_meta = data.get("patientDetails", {})
@@ -433,9 +447,13 @@ def process_reports():
433
  elements = partition_pdf(filename=str(file_path))
434
  page_text = "\n".join([el.text for el in elements if hasattr(el, "text") and el.text])
435
  except Exception:
436
- logger.exception("Failed to parse PDF %s", file_path)
437
  page_text = ""
438
- cleaned = clean_notes_with_bloatectomy(page_text, style="remov")
 
 
 
 
439
  documents.append({
440
  "filename": fname,
441
  "raw_text": page_text,
@@ -447,7 +465,11 @@ def process_reports():
447
  return jsonify({"error": "no valid documents found"}), 400
448
 
449
  combined_text = "\n\n".join(combined_text_parts)
450
- meds = extract_medications_from_text(combined_text)
 
 
 
 
451
 
452
  initial_state = {
453
  "patient_meta": extra_patient_meta,
@@ -462,7 +484,7 @@ def process_reports():
462
  # Validate and fill placeholders if needed
463
  if not result_state.get("valid", True):
464
  missing = result_state.get("missing", [])
465
- logger.info("Validation failed; missing keys: %s", missing)
466
  if "patientDetails" in missing:
467
  result_state["patientDetails"] = extra_patient_meta or {"name": "", "age": "", "sex": "", "pid": patient_id}
468
  if "reports" in missing:
@@ -497,3 +519,4 @@ def ping():
497
  if __name__ == "__main__":
498
  port = int(os.getenv("PORT", 7860))
499
  app.run(host="0.0.0.0", port=port, debug=True)
 
 
31
  # --- Environment & config -------------------------------------------------
32
  load_dotenv()
33
  from pathlib import Path
34
+ REPORTS_ROOT = Path(os.getenv("REPORTS_ROOT", "reports")).resolve() # e.g. /app/reports/<patient_id>/<file.pdf>
35
  SSRI_FILE = Path(os.getenv("SSRI_FILE", "app/medicationCategories/SSRI_list.txt")).resolve()
36
  MISC_FILE = Path(os.getenv("MISC_FILE", "app/medicationCategories/MISC_list.txt")).resolve()
37
  GROQ_API_KEY = os.getenv("GROQ_API_KEY", None)
 
59
 
60
  # -------------------- JSON extraction / sanitizer ---------------------------
61
def extract_json_from_llm_response(raw_response: str) -> dict:
    """Best-effort extraction of a JSON object from raw LLM output.

    Strips a markdown code fence if present, trims to the outermost
    ``{...}`` span, repairs common LLM artifacts (stray role labels,
    quoted booleans, trailing commas, unescaped newlines inside strings),
    then parses the result.  Logs and re-raises on any failure.
    """
    try:
        # Prefer the fenced ```json ... ``` section when one exists.
        fence = re.search(r"```(?:json)?\s*([\s\S]*?)\s*```", raw_response)
        candidate = fence.group(1).strip() if fence else raw_response

        # Keep only the outermost {...} span, discarding prefix/suffix junk.
        start, end = candidate.find('{'), candidate.rfind('}')
        if 0 <= start < end:
            candidate = candidate[start:end + 1]

        # Drop rogue labels glued onto an opening brace and unquote booleans.
        candidate = re.sub(r'\b\w+\s*{', '{', candidate)
        candidate = re.sub(r'"assistant"\s*:', '', candidate)
        candidate = re.sub(r'\b(false|true)"', r'\1', candidate)

        # Escape embedded quotes inside the long "logic" field (best effort).
        def _escape_quotes(match):
            return match.group(1) + match.group(2).replace('"', r'\"')

        candidate = re.sub(
            r'("logic"\s*:\s*")([\s\S]+?)(?=",\s*"[A-Za-z_]\w*"\s*:\s*)',
            _escape_quotes,
            candidate,
        )

        # Remove trailing and doubled commas that break strict JSON.
        candidate = re.sub(r',\s*(?=[}\],])', '', candidate)
        candidate = re.sub(r',\s*,', ',', candidate)

        # Trim obviously unbalanced closing braces from the tail.
        opens, closes = candidate.count('{'), candidate.count('}')
        if closes > opens:
            candidate = candidate.rstrip()[:-(closes - opens)]

        # Escape literal newlines inside string values so json.loads accepts them.
        candidate = re.sub(
            r'"((?:[^"\\]|\\.)*?)"',
            lambda m: '"' + m.group(1).replace('\n', '\\n').replace('\r', '\\r') + '"',
            candidate,
            flags=re.DOTALL,
        )

        return json.loads(candidate)
    except Exception as e:
        logger.error(f"Failed to extract JSON from LLM response: {e}")
        raise
112
 
113
  # -------------------- Utility: Bloatectomy wrapper ------------------------
114
  def clean_notes_with_bloatectomy(text: str, style: str = "remov") -> str:
 
 
 
 
 
115
  try:
116
  b = bloatectomy(text, style=style, output="html")
117
  tokens = getattr(b, "tokens", None)
 
124
 
125
  # --------------- Utility: medication extraction (adapted) -----------------
126
def readDrugs_from_file(path: Path):
    """Load a drug-list file into a lookup structure.

    Each meaningful line has the form ``generic|pattern|...``.  Returns a
    tuple ``(mapping, generics)`` where ``mapping`` maps each lowercased
    generic name to its full lowercased line (later used as a regex
    alternation by the matcher) and ``generics`` preserves file order.

    Returns ``({}, [])`` for a missing file or on any read error.
    """
    try:
        if not path.exists():
            return {}, []
        txt = path.read_text(encoding="utf-8", errors="ignore")
        mapping: Dict[str, str] = {}
        generics: List[str] = []
        # Pair generic and pattern line per-iteration.  The previous
        # dict(zip(generics, lines)) built the two lists independently, so
        # any stray line without a "|" separator shifted every subsequent
        # generic onto the wrong pattern line.
        for raw_line in txt.splitlines():
            line = raw_line.strip().lower()
            if not line or "|" not in line:
                continue
            generic = line.split("|", 1)[0]
            if not generic:
                continue
            mapping[generic] = line
            generics.append(generic)
        return mapping, generics
    except Exception:
        logger.exception(f"Failed to read drugs from file: {path}")
        return {}, []
 
 
 
 
 
138
 
139
def addToDrugs_line(line: str, drugs_flags: List[int], listing: Dict[str, str], genList: List[str]) -> List[int]:
    """Set the flag for every generic whose pattern line matches *line*.

    *listing* maps a generic name to a pattern line used as a regex;
    *genList* fixes each generic's position in *drugs_flags*.  Malformed
    patterns are skipped, and the (possibly updated) flag list is always
    returned, even on unexpected errors.
    """
    try:
        positions = {name: pos for pos, name in enumerate(genList)}
        for name, pattern in listing.items():
            try:
                hit = re.search(pattern, line, re.I)
            except re.error:
                # Malformed pattern in the source file — ignore it.
                continue
            if hit is None:
                continue
            pos = positions.get(name)
            if pos is not None:
                drugs_flags[pos] = 1
        return drugs_flags
    except Exception:
        logger.exception("Error in addToDrugs_line")
        return drugs_flags
154
 
155
def extract_medications_from_text(text: str) -> List[str]:
    """Collect candidate medication names from free-text clinical notes.

    Combines three heuristics: the curated SSRI/MISC drug lists, explicit
    "Rx:/Drug:/Medication:"-style labels, and capitalized tokens followed
    by a dosage unit (mg/mcg/g/IU).  Returns a de-duplicated list; on any
    unexpected failure an empty list is returned instead of raising.
    """
    try:
        ssri_map, ssri_generics = readDrugs_from_file(SSRI_FILE)
        misc_map, misc_generics = readDrugs_from_file(MISC_FILE)
        combined_map = {**ssri_map, **misc_map}
        combined_generics: List[str] = []
        combined_generics.extend(ssri_generics or [])
        combined_generics.extend(misc_generics or [])

        flags = [0] * len(combined_generics)
        found = set()
        for raw in text.splitlines():
            line = raw.strip()
            if not line:
                continue
            # Heuristic 1: curated drug-list patterns.
            if combined_map:
                flags = addToDrugs_line(line, flags, combined_map, combined_generics)
            # Heuristic 2: explicit medication labels.
            labeled = re.search(
                r"\b(Rx|Drug|Medication|Prescribed|Tablet)\s*[:\-]?\s*([A-Za-z0-9\-\s/\.]+)",
                line,
                re.I,
            )
            if labeled:
                found.add(labeled.group(2).strip())
            # Heuristic 3: capitalized words carrying a dosage unit.
            for token in re.findall(r"\b([A-Z][a-z0-9\-]{2,}\s*(?:[0-9]{1,4}\s*(?:mg|mcg|g|IU))?)", line):
                if re.search(r"\b(mg|mcg|g|IU)\b", token, re.I):
                    found.add(token.strip())
        for pos, flag in enumerate(flags):
            if flag == 1:
                found.add(combined_generics[pos])
        return list(found)
    except Exception:
        logger.exception("Failed to extract medications from text")
        return []
188
 
189
  # -------------------- Node prompts --------------------------
190
  PATIENT_NODE_PROMPT = """
 
400
 
401
  graph = graph_builder.compile()
402
 
403
+ # -------------------- Flask app & endpoints -------------------------------
404
  # -------------------- Flask app & endpoints -------------------------------
405
  BASE_DIR = Path(__file__).resolve().parent
406
  static_folder = BASE_DIR / "static"
 
412
  def serve_frontend():
413
  try:
414
  return app.send_static_file("frontend.html")
415
+ except Exception as e:
416
+ logger.error(f"Failed to serve frontend.html: {e}")
417
  return "<h3>frontend.html not found in static/ — drop your frontend.html there.</h3>", 404
418
 
419
  @app.route("/process_reports", methods=["POST"])
420
  def process_reports():
421
+ try:
422
+ data = request.get_json(force=True)
423
+ except Exception as e:
424
+ logger.error(f"Failed to parse JSON request: {e}")
425
+ return jsonify({"error": "Invalid JSON request"}), 400
426
+
427
  patient_id = data.get("patient_id")
428
  filenames = data.get("filenames", [])
429
  extra_patient_meta = data.get("patientDetails", {})
 
447
  elements = partition_pdf(filename=str(file_path))
448
  page_text = "\n".join([el.text for el in elements if hasattr(el, "text") and el.text])
449
  except Exception:
450
+ logger.exception(f"Failed to parse PDF {file_path}")
451
  page_text = ""
452
+ try:
453
+ cleaned = clean_notes_with_bloatectomy(page_text, style="remov")
454
+ except Exception:
455
+ logger.exception("Failed to clean notes with bloatectomy")
456
+ cleaned = page_text
457
  documents.append({
458
  "filename": fname,
459
  "raw_text": page_text,
 
465
  return jsonify({"error": "no valid documents found"}), 400
466
 
467
  combined_text = "\n\n".join(combined_text_parts)
468
+ try:
469
+ meds = extract_medications_from_text(combined_text)
470
+ except Exception:
471
+ logger.exception("Failed to extract medications")
472
+ meds = []
473
 
474
  initial_state = {
475
  "patient_meta": extra_patient_meta,
 
484
  # Validate and fill placeholders if needed
485
  if not result_state.get("valid", True):
486
  missing = result_state.get("missing", [])
487
+ logger.info(f"Validation failed; missing keys: {missing}")
488
  if "patientDetails" in missing:
489
  result_state["patientDetails"] = extra_patient_meta or {"name": "", "age": "", "sex": "", "pid": patient_id}
490
  if "reports" in missing:
 
519
  if __name__ == "__main__":
520
  port = int(os.getenv("PORT", 7860))
521
  app.run(host="0.0.0.0", port=port, debug=True)
522
+