#!/usr/bin/python #-------------------------------- # Written by Marzyeh Ghassemi, CSAIL, MIT # Sept 21, 2012 # Updated for Python 3, added Notebook, db connection # by Tom J. Pollard 13 Nov, 2017 # Please contact the author with errors found. # mghassem {AT} mit {DOT} edu #-------------------------------- from __future__ import with_statement import nltk import os import os.path import re import string import sys import time def addToDrugs(line, drugs, listing, genList): """ ###### function addToDrugs # line: line of text to search # drugs: array to modify # listing: list of search terms in (generic:search list) form # genList: list of all generic keys being searched for # # Searches the provided line for drugs that are listed. Inserts # a 1 in the drugs array provided at the location which maps # the found key to the generics list """ genList = dict(enumerate(genList)) genList = dict((v,k) for k, v in genList.items()) for (generic, names) in listing.items(): if re.search(names, line, re.I): drugs[genList[generic]] = 1 return drugs def readDrugs(f, genList): """ ###### function readDrugs # f: file # genList: list of search terms in (generic:search list) form # # Converts lines of the form "generic|brand1|brand2" to a # dictionary keyed by "generic" with value "generic|brand1|brand2 """ lines = f.read() generics = re.findall("^(.*?)\|", lines, re.MULTILINE) generics = [x.lower() for x in generics] lines = lines.split("\n") lines = [x.lower() for x in lines] genList.append(generics) return dict(zip(generics, lines)) def search(NOTES, SSRI_FILE = os.path.join(os.getcwd(), "SSRI_list.txt"), MISC_FILE = os.path.join(os.getcwd(), "MISC_list.txt"), SUMMARY_FILE = "output.csv", VERBOSE = False): """ ###### Search the notes # NOTES: dataframe loaded from the noteevents table # SSRI_FILE: list of SSRI drugs to search for # MISC_FILE: list of additional drugs to search for # # NB: files should have a line for each distinct drug type, # and drugs should be separated by a vertical bar '|' # # LIMIT FOR PARSING: max number of notes to search. # OUTPUT: name of the output file. """ if os.path.isfile(SUMMARY_FILE): print('The output file already exists.\n\nRemove the following file or save with a different filename:') print(os.path.join(os.getcwd(), SUMMARY_FILE)) return starttime = time.time() # Keep a list of all generics we are looking for genList = [] # Get the drugs into a structure we can use with open(SSRI_FILE) as f: SSRI = readDrugs(f, genList) print("Using drugs from {}".format(SSRI_FILE)) try: with open(MISC_FILE) as f: MISC = readDrugs(f, genList) print("Using additional drugs from {}".format(MISC_FILE)) except: MISC = None flatList = [item for sublist in genList for item in sublist] # Create indices for the flat list # This allows us to understand which "types" are being used lengths = [len(type) for type in genList] prevLeng = 0 starts = [] ends = [] for leng in lengths: starts.append(prevLeng) ends.append(prevLeng + leng - 1) prevLeng = prevLeng + leng # Limit the analysis to discharge summaries # Comment out because limitation is now in SQL query # NOTES = NOTES[NOTES['category'] == 'Discharge summary'] # Write heads and notes to new doc with open(SUMMARY_FILE, 'a') as f_out: f_out.write('"ROW_ID","SUBJECT_ID","HADM_ID","HIST_FOUND","DEPRESSION","ADMIT_FOUND","DIS_FOUND","GEN_DEPRESS_MEDS_FOUND","GROUP","SSRI","MISC","' \ + '","'.join(flatList) + '"\n') # Parse each patient record print("Reading documents...") for note in NOTES.itertuples(): if note.Index % 100 == 0: print("...index: {}. row_id: {}. subject_id: {}. hadm_id: {}. \n".format(note.Index, note.row_id, note.subject_id, note.hadm_id)) sys.stdout.flush() # Reset some per-patient variables section = "" newSection = "" admitFound = 0 # admission note found dischargeFound = 0 # discharge summary found histFound = 0 # medical history found depressionHist = 0; drugsAdmit = [0]*len(flatList) drugsDis = [0]*len(flatList) general_depression_drugs = 0 # Read through lines sequentially # If this looks like a section header, start looking for drugs for line in note.text.split("\n"): # Searches for a section header based on heuristics m = re.search("""^((\d|[A-Z])(\.|\)))?\s*([a-zA-Z',\.\-\*\d\[\]\(\) ]+)(:| WERE | IS | ARE |INCLUDED|INCLUDING)""", line, re.I) if m: newSection = "" # Past Medical History Section if re.search('med(ical)?\s+hist(ory)?', line, re.I): newSection = "hist" histFound = 1 # Discharge Medication Section elif re.search('medication|meds', line, re.I) and re.search('disch(arge)?', line, re.I): newSection = "discharge" dischargeFound = 1 # Admitting Medication Section elif re.search('admission|admitting|home|nh|nmeds|pre(\-|\s)?(hosp|op)|current|previous|outpatient|outpt|outside|^[^a-zA-Z]*med(ication)?(s)?', line, re.I) \ and (section == "admit" or re.search('medication|meds', line, re.I)): newSection = "admit" admitFound = 1 # Med section ended, now in non-meds section if section != newSection: section = newSection # If in history section, search for depression if 'hist' in section: if re.search('depression', line, re.I): depressionHist = 1 # If in meds section, look at each line for specific drugs elif 'admit' in section: drugsAdmit = addToDrugs(line, drugsAdmit, SSRI, flatList) if MISC: drugsAdmit = addToDrugs(line, drugsAdmit, MISC, flatList) ## Section just has something like 'Depression meds' if re.search('depression\s+med(ication)?(s)?', line, re.I): general_depression_drugs = 1 ## Already in meds section, look at each line for specific drugs elif 'discharge' in section: drugsDis = addToDrugs(line, drugsDis, SSRI, flatList) if MISC: drugsDis = addToDrugs(line, drugsDis, MISC, flatList) # A line with information which we are uncertain about... elif re.search('medication|meds', line, re.I) and re.search('admission|discharge|transfer', line, re.I): if VERBOSE: print('?? {}'.format(line)) pass group = 0 # Group 0: Patient has no medications on admission section (or no targeted meds) # and medications on discharge from the list if dischargeFound == 1 and (1 in drugsDis) and (admitFound == 0 or not(1 in drugsAdmit)): group = 0 # Group 1: Patient has a medications on admission section with no targeted meds # and no medications on discharge elif admitFound == 1 and not(1 in drugsAdmit) and (dischargeFound == 0) and general_depression_drugs == 0: group = 1 # Group 2: Patient has medications on admission section, but none from the list # and no medications on discharge from the list elif admitFound == 1 and not(1 in drugsAdmit) and dischargeFound == 1 and not(1 in drugsDis) and general_depression_drugs == 0: group = 2 # Group 3: Patient has medications on admission (at least one from the list) elif (1 in drugsAdmit): group = 3 else: if VERBOSE: print('Uncertain about group type for row_id = {}'.format(note.row_id)) pass if VERBOSE: print('group is {}'.format(group)) # Combine the admit and discharge drugs lists combined = [w or x for w, x in zip(drugsAdmit, drugsDis)] # Count the types of each drug member = [] member = [int(1 in drugsAdmit[s:e+1]) for s, e in zip(starts, ends)] # save items to csv f_out.write(str(note.row_id) + "," + str(note.subject_id) + "," + str(note.hadm_id) + "," + str(histFound) + "," \ + str(depressionHist) + "," + str(admitFound) + "," + str(dischargeFound) + "," \ + str(general_depression_drugs) + "," + str(group) + "," + ",".join(map(str, member)) \ + "," + ",".join(map(str, drugsAdmit)) + "\n") # Print summary of analysis stoptime = time.time() print("Done analyzing {} documents in {} seconds ({} docs/sec)".format(len(NOTES), round(stoptime - starttime, 2), round(len(NOTES) / (stoptime - starttime), 2))) print("Summary file is in {}".format(os.getcwd()))