Spaces:
Sleeping
Sleeping
#!/usr/bin/python | |
#-------------------------------- | |
# Written by Marzyeh Ghassemi, CSAIL, MIT | |
# Sept 21, 2012 | |
# Updated for Python 3, added Notebook, db connection | |
# by Tom J. Pollard 13 Nov, 2017 | |
# Please contact the author with errors found. | |
# mghassem {AT} mit {DOT} edu | |
#-------------------------------- | |
from __future__ import with_statement | |
import nltk | |
import os | |
import os.path | |
import re | |
import string | |
import sys | |
import time | |
def addToDrugs(line, drugs, listing, genList): | |
""" | |
###### function addToDrugs | |
# line: line of text to search | |
# drugs: array to modify | |
# listing: list of search terms in (generic:search list) form | |
# genList: list of all generic keys being searched for | |
# | |
# Searches the provided line for drugs that are listed. Inserts | |
# a 1 in the drugs array provided at the location which maps | |
# the found key to the generics list | |
""" | |
genList = dict(enumerate(genList)) | |
genList = dict((v,k) for k, v in genList.items()) | |
for (generic, names) in listing.items(): | |
if re.search(names, line, re.I): | |
drugs[genList[generic]] = 1 | |
return drugs | |
def readDrugs(f, genList): | |
""" | |
###### function readDrugs | |
# f: file | |
# genList: list of search terms in (generic:search list) form | |
# | |
# Converts lines of the form "generic|brand1|brand2" to a | |
# dictionary keyed by "generic" with value "generic|brand1|brand2 | |
""" | |
lines = f.read() | |
generics = re.findall("^(.*?)\|", lines, re.MULTILINE) | |
generics = [x.lower() for x in generics] | |
lines = lines.split("\n") | |
lines = [x.lower() for x in lines] | |
genList.append(generics) | |
return dict(zip(generics, lines)) | |
def search(NOTES, | |
SSRI_FILE = os.path.join(os.getcwd(), "SSRI_list.txt"), | |
MISC_FILE = os.path.join(os.getcwd(), "MISC_list.txt"), | |
SUMMARY_FILE = "output.csv", | |
VERBOSE = False): | |
""" | |
###### Search the notes | |
# NOTES: dataframe loaded from the noteevents table | |
# SSRI_FILE: list of SSRI drugs to search for | |
# MISC_FILE: list of additional drugs to search for | |
# | |
# NB: files should have a line for each distinct drug type, | |
# and drugs should be separated by a vertical bar '|' | |
# | |
# LIMIT FOR PARSING: max number of notes to search. | |
# OUTPUT: name of the output file. | |
""" | |
if os.path.isfile(SUMMARY_FILE): | |
print('The output file already exists.\n\nRemove the following file or save with a different filename:') | |
print(os.path.join(os.getcwd(), SUMMARY_FILE)) | |
return | |
starttime = time.time() | |
# Keep a list of all generics we are looking for | |
genList = [] | |
# Get the drugs into a structure we can use | |
with open(SSRI_FILE) as f: | |
SSRI = readDrugs(f, genList) | |
print("Using drugs from {}".format(SSRI_FILE)) | |
try: | |
with open(MISC_FILE) as f: | |
MISC = readDrugs(f, genList) | |
print("Using additional drugs from {}".format(MISC_FILE)) | |
except: | |
MISC = None | |
flatList = [item for sublist in genList for item in sublist] | |
# Create indices for the flat list | |
# This allows us to understand which "types" are being used | |
lengths = [len(type) for type in genList] | |
prevLeng = 0 | |
starts = [] | |
ends = [] | |
for leng in lengths: | |
starts.append(prevLeng) | |
ends.append(prevLeng + leng - 1) | |
prevLeng = prevLeng + leng | |
# Limit the analysis to discharge summaries | |
# Comment out because limitation is now in SQL query | |
# NOTES = NOTES[NOTES['category'] == 'Discharge summary'] | |
# Write heads and notes to new doc | |
with open(SUMMARY_FILE, 'a') as f_out: | |
f_out.write('"ROW_ID","SUBJECT_ID","HADM_ID","HIST_FOUND","DEPRESSION","ADMIT_FOUND","DIS_FOUND","GEN_DEPRESS_MEDS_FOUND","GROUP","SSRI","MISC","' \ | |
+ '","'.join(flatList) + '"\n') | |
# Parse each patient record | |
print("Reading documents...") | |
for note in NOTES.itertuples(): | |
if note.Index % 100 == 0: | |
print("...index: {}. row_id: {}. subject_id: {}. hadm_id: {}. \n".format(note.Index, note.row_id, note.subject_id, note.hadm_id)) | |
sys.stdout.flush() | |
# Reset some per-patient variables | |
section = "" | |
newSection = "" | |
admitFound = 0 # admission note found | |
dischargeFound = 0 # discharge summary found | |
histFound = 0 # medical history found | |
depressionHist = 0; | |
drugsAdmit = [0]*len(flatList) | |
drugsDis = [0]*len(flatList) | |
general_depression_drugs = 0 | |
# Read through lines sequentially | |
# If this looks like a section header, start looking for drugs | |
for line in note.text.split("\n"): | |
# Searches for a section header based on heuristics | |
m = re.search("""^((\d|[A-Z])(\.|\)))?\s*([a-zA-Z',\.\-\*\d\[\]\(\) ]+)(:| WERE | IS | ARE |INCLUDED|INCLUDING)""", line, re.I) | |
if m: | |
newSection = "" | |
# Past Medical History Section | |
if re.search('med(ical)?\s+hist(ory)?', line, re.I): | |
newSection = "hist" | |
histFound = 1 | |
# Discharge Medication Section | |
elif re.search('medication|meds', line, re.I) and re.search('disch(arge)?', line, re.I): | |
newSection = "discharge" | |
dischargeFound = 1 | |
# Admitting Medication Section | |
elif re.search('admission|admitting|home|nh|nmeds|pre(\-|\s)?(hosp|op)|current|previous|outpatient|outpt|outside|^[^a-zA-Z]*med(ication)?(s)?', line, re.I) \ | |
and (section == "admit" or re.search('medication|meds', line, re.I)): | |
newSection = "admit" | |
admitFound = 1 | |
# Med section ended, now in non-meds section | |
if section != newSection: | |
section = newSection | |
# If in history section, search for depression | |
if 'hist' in section: | |
if re.search('depression', line, re.I): | |
depressionHist = 1 | |
# If in meds section, look at each line for specific drugs | |
elif 'admit' in section: | |
drugsAdmit = addToDrugs(line, drugsAdmit, SSRI, flatList) | |
if MISC: | |
drugsAdmit = addToDrugs(line, drugsAdmit, MISC, flatList) | |
## Section just has something like 'Depression meds' | |
if re.search('depression\s+med(ication)?(s)?', line, re.I): | |
general_depression_drugs = 1 | |
## Already in meds section, look at each line for specific drugs | |
elif 'discharge' in section: | |
drugsDis = addToDrugs(line, drugsDis, SSRI, flatList) | |
if MISC: | |
drugsDis = addToDrugs(line, drugsDis, MISC, flatList) | |
# A line with information which we are uncertain about... | |
elif re.search('medication|meds', line, re.I) and re.search('admission|discharge|transfer', line, re.I): | |
if VERBOSE: | |
print('?? {}'.format(line)) | |
pass | |
group = 0 | |
# Group 0: Patient has no medications on admission section (or no targeted meds) | |
# and medications on discharge from the list | |
if dischargeFound == 1 and (1 in drugsDis) and (admitFound == 0 or not(1 in drugsAdmit)): | |
group = 0 | |
# Group 1: Patient has a medications on admission section with no targeted meds | |
# and no medications on discharge | |
elif admitFound == 1 and not(1 in drugsAdmit) and (dischargeFound == 0) and general_depression_drugs == 0: | |
group = 1 | |
# Group 2: Patient has medications on admission section, but none from the list | |
# and no medications on discharge from the list | |
elif admitFound == 1 and not(1 in drugsAdmit) and dischargeFound == 1 and not(1 in drugsDis) and general_depression_drugs == 0: | |
group = 2 | |
# Group 3: Patient has medications on admission (at least one from the list) | |
elif (1 in drugsAdmit): | |
group = 3 | |
else: | |
if VERBOSE: | |
print('Uncertain about group type for row_id = {}'.format(note.row_id)) | |
pass | |
if VERBOSE: | |
print('group is {}'.format(group)) | |
# Combine the admit and discharge drugs lists | |
combined = [w or x for w, x in zip(drugsAdmit, drugsDis)] | |
# Count the types of each drug | |
member = [] | |
member = [int(1 in drugsAdmit[s:e+1]) for s, e in zip(starts, ends)] | |
# save items to csv | |
f_out.write(str(note.row_id) + "," + str(note.subject_id) + "," + str(note.hadm_id) + "," + str(histFound) + "," \ | |
+ str(depressionHist) + "," + str(admitFound) + "," + str(dischargeFound) + "," \ | |
+ str(general_depression_drugs) + "," + str(group) + "," + ",".join(map(str, member)) \ | |
+ "," + ",".join(map(str, drugsAdmit)) + "\n") | |
# Print summary of analysis | |
stoptime = time.time() | |
print("Done analyzing {} documents in {} seconds ({} docs/sec)".format(len(NOTES), | |
round(stoptime - starttime, 2), round(len(NOTES) / (stoptime - starttime), 2))) | |
print("Summary file is in {}".format(os.getcwd())) | |