Spaces:
Sleeping
Sleeping
File size: 10,356 Bytes
f5863be |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 |
#!/usr/bin/python
#--------------------------------
# Written by Marzyeh Ghassemi, CSAIL, MIT
# Sept 21, 2012
# Updated for Python 3, added Notebook, db connection
# by Tom J. Pollard 13 Nov, 2017
# Please contact the author with errors found.
# mghassem {AT} mit {DOT} edu
#--------------------------------
from __future__ import with_statement
import nltk
import os
import os.path
import re
import string
import sys
import time
def addToDrugs(line, drugs, listing, genList):
    """
    ###### function addToDrugs
    # line:    text to scan for drug mentions
    # drugs:   list of flags, one slot per entry of genList (modified in place)
    # listing: dict of generic name -> search pattern ("generic|brand1|...")
    # genList: sequence of every generic name being tracked; the position
    #          of a name in this sequence is its slot in the drugs list
    #
    # Tests each pattern in listing against the line (case-insensitive
    # regular-expression search). For every pattern that matches, the slot
    # of the corresponding generic in drugs is set to 1. Returns the same
    # drugs list that was passed in.
    """
    # Map each generic name to its slot; if a name occurs more than once
    # the last occurrence wins.
    position = {}
    for slot, name in enumerate(genList):
        position[name] = slot

    for generic in listing:
        pattern = listing[generic]
        if re.search(pattern, line, re.IGNORECASE) is not None:
            drugs[position[generic]] = 1

    return drugs
def readDrugs(f, genList):
    """
    ###### function readDrugs
    # f:       open text file with one drug per line, written as
    #          "generic|brand1|brand2"
    # genList: list that collects, per file read, the list of generic
    #          names found in that file (one new sub-list is appended)
    #
    # Converts lines of the form "generic|brand1|brand2" to a dictionary
    # keyed by "generic" with value "generic|brand1|brand2". Everything is
    # lower-cased. The value is later used as a regular expression, so the
    # vertical bars act as alternation.
    #
    # Each line is parsed on its own, so blank lines and lines holding only
    # a generic name (no '|') cannot shift generics onto the wrong pattern.
    # Names are stripped of surrounding whitespace and empty names are
    # dropped: an empty alternative (e.g. from a trailing '|') would
    # otherwise match every line of text.
    """
    listing = {}
    generics = []
    for raw_line in f.read().splitlines():
        names = [name.strip() for name in raw_line.lower().split("|")]
        names = [name for name in names if name]
        if not names:
            # Blank line (or only separators): nothing to search for.
            continue
        generic = names[0]
        generics.append(generic)
        listing[generic] = "|".join(names)
    genList.append(generics)
    return listing
def search(NOTES,
           SSRI_FILE = os.path.join(os.getcwd(), "SSRI_list.txt"),
           MISC_FILE = os.path.join(os.getcwd(), "MISC_list.txt"),
           SUMMARY_FILE = "output.csv",
           VERBOSE = False):
    """
    ###### Search the notes
    # NOTES:        dataframe loaded from the noteevents table. Rows are read
    #               with itertuples() and must expose row_id, subject_id,
    #               hadm_id and text.
    # SSRI_FILE:    list of SSRI drugs to search for (required)
    # MISC_FILE:    list of additional drugs to search for (optional; if the
    #               file cannot be opened only SSRI_FILE is used)
    # SUMMARY_FILE: name of the output csv file. Nothing is done if this
    #               file already exists.
    # VERBOSE:      print uncertain lines and the group chosen for each note
    #
    # NB: files should have a line for each distinct drug type,
    # and drugs should be separated by a vertical bar '|'
    #
    # NB: the default file paths are built from the working directory at the
    # time this function is defined, not at the time it is called.
    #
    # Writes one csv row per note: the section flags, the group, one flag
    # per drug list (SSRI, MISC) and one flag per generic drug. The per-list
    # and per-drug flags reflect drugs found in the admission medication
    # section only. Returns None.
    """
    if os.path.isfile(SUMMARY_FILE):
        print('The output file already exists.\n\nRemove the following file or save with a different filename:')
        print(os.path.join(os.getcwd(), SUMMARY_FILE))
        return

    starttime = time.time()

    # Keep a list of all generics we are looking for (one sub-list per file)
    genList = []

    # Get the drugs into a structure we can use
    with open(SSRI_FILE) as f:
        SSRI = readDrugs(f, genList)
        print("Using drugs from {}".format(SSRI_FILE))

    # The additional list is optional: only a file that cannot be opened or
    # read is tolerated, any other error is a real problem and propagates.
    try:
        with open(MISC_FILE) as f:
            MISC = readDrugs(f, genList)
    except (IOError, OSError):
        MISC = None
        print("No additional drugs loaded from {}".format(MISC_FILE))
    else:
        print("Using additional drugs from {}".format(MISC_FILE))

    flatList = [item for sublist in genList for item in sublist]

    # Create [start, stop) indices into the flat list for each drug list.
    # This allows us to understand which "types" are being used.
    type_bounds = []
    offset = 0
    for generics in genList:
        type_bounds.append((offset, offset + len(generics)))
        offset += len(generics)

    # Patterns are compiled once here rather than on every line of every note.
    header_re = re.compile(r"""^((\d|[A-Z])(\.|\)))?\s*([a-zA-Z',\.\-\*\d\[\]\(\) ]+)(:| WERE | IS | ARE |INCLUDED|INCLUDING)""", re.I)
    history_re = re.compile(r'med(ical)?\s+hist(ory)?', re.I)
    meds_re = re.compile(r'medication|meds', re.I)
    discharge_re = re.compile(r'disch(arge)?', re.I)
    admit_re = re.compile(r'admission|admitting|home|nh|nmeds|pre(\-|\s)?(hosp|op)|current|previous|outpatient|outpt|outside|^[^a-zA-Z]*med(ication)?(s)?', re.I)
    depression_re = re.compile(r'depression', re.I)
    depression_meds_re = re.compile(r'depression\s+med(ication)?(s)?', re.I)
    uncertain_re = re.compile(r'admission|discharge|transfer', re.I)

    # Limit the analysis to discharge summaries
    # Comment out because limitation is now in SQL query
    # NOTES = NOTES[NOTES['category'] == 'Discharge summary']

    # Write heads and notes to new doc
    with open(SUMMARY_FILE, 'a') as f_out:
        f_out.write('"ROW_ID","SUBJECT_ID","HADM_ID","HIST_FOUND","DEPRESSION","ADMIT_FOUND","DIS_FOUND","GEN_DEPRESS_MEDS_FOUND","GROUP","SSRI","MISC","' \
            + '","'.join(flatList) + '"\n')

        # Parse each patient record
        print("Reading documents...")
        for note in NOTES.itertuples():
            if note.Index % 100 == 0:
                print("...index: {}. row_id: {}. subject_id: {}. hadm_id: {}. \n".format(note.Index, note.row_id, note.subject_id, note.hadm_id))
                sys.stdout.flush()

            # Reset some per-patient variables
            section = ""
            admitFound = 0      # admission note found
            dischargeFound = 0  # discharge summary found
            histFound = 0       # medical history found
            depressionHist = 0
            drugsAdmit = [0]*len(flatList)
            drugsDis = [0]*len(flatList)
            general_depression_drugs = 0

            # A missing note text (e.g. NaN) has no lines to search.
            text = note.text
            lines = text.split("\n") if hasattr(text, "split") else []

            # Read through lines sequentially
            # If this looks like a section header, start looking for drugs
            for line in lines:
                # Searches for a section header based on heuristics
                if header_re.search(line):
                    newSection = ""
                    # Past Medical History Section
                    if history_re.search(line):
                        newSection = "hist"
                        histFound = 1
                    # Discharge Medication Section
                    elif meds_re.search(line) and discharge_re.search(line):
                        newSection = "discharge"
                        dischargeFound = 1
                    # Admitting Medication Section (a header inside an
                    # admission section does not need to mention meds)
                    elif admit_re.search(line) \
                            and (section == "admit" or meds_re.search(line)):
                        newSection = "admit"
                        admitFound = 1
                    # Any other header ends the current section
                    section = newSection

                # If in history section, search for depression
                if 'hist' in section:
                    if depression_re.search(line):
                        depressionHist = 1
                # If in meds section, look at each line for specific drugs
                elif 'admit' in section:
                    drugsAdmit = addToDrugs(line, drugsAdmit, SSRI, flatList)
                    if MISC:
                        drugsAdmit = addToDrugs(line, drugsAdmit, MISC, flatList)
                    ## Section just has something like 'Depression meds'
                    if depression_meds_re.search(line):
                        general_depression_drugs = 1
                ## Already in meds section, look at each line for specific drugs
                elif 'discharge' in section:
                    drugsDis = addToDrugs(line, drugsDis, SSRI, flatList)
                    if MISC:
                        drugsDis = addToDrugs(line, drugsDis, MISC, flatList)
                # A line with information which we are uncertain about...
                elif meds_re.search(line) and uncertain_re.search(line):
                    if VERBOSE:
                        print('?? {}'.format(line))

            admit_drug_found = 1 in drugsAdmit
            discharge_drug_found = 1 in drugsDis

            # NOTE: 0 is also the value left in place when no rule applies.
            group = 0
            # Group 0: Patient has no medications on admission section (or no targeted meds)
            # and medications on discharge from the list
            if dischargeFound == 1 and discharge_drug_found and (admitFound == 0 or not admit_drug_found):
                group = 0
            # Group 1: Patient has a medications on admission section with no targeted meds
            # and no medications on discharge
            elif admitFound == 1 and not admit_drug_found and dischargeFound == 0 and general_depression_drugs == 0:
                group = 1
            # Group 2: Patient has medications on admission section, but none from the list
            # and no medications on discharge from the list
            elif admitFound == 1 and not admit_drug_found and dischargeFound == 1 and not discharge_drug_found and general_depression_drugs == 0:
                group = 2
            # Group 3: Patient has medications on admission (at least one from the list)
            elif admit_drug_found:
                group = 3
            else:
                if VERBOSE:
                    print('Uncertain about group type for row_id = {}'.format(note.row_id))

            if VERBOSE:
                print('group is {}'.format(group))

            # Flag each drug list (SSRI, MISC) that has at least one drug
            # on admission
            member = [int(1 in drugsAdmit[start:stop]) for start, stop in type_bounds]
            if MISC is None:
                # The header always has a MISC column; fill it so the row
                # stays aligned when no additional list was loaded.
                member.append(0)

            # save items to csv
            leading = [note.row_id, note.subject_id, note.hadm_id, histFound,
                       depressionHist, admitFound, dischargeFound,
                       general_depression_drugs, group]
            f_out.write(",".join(map(str, leading)) + "," \
                + ",".join(map(str, member)) + "," \
                + ",".join(map(str, drugsAdmit)) + "\n")

    # Print summary of analysis
    stoptime = time.time()
    elapsed = stoptime - starttime
    # Guard against a zero interval on coarse clocks or empty input.
    rate = round(len(NOTES) / elapsed, 2) if elapsed > 0 else "n/a"
    print("Done analyzing {} documents in {} seconds ({} docs/sec)".format(len(NOTES),
        round(elapsed, 2), rate))
    print("Summary file is in {}".format(os.path.dirname(os.path.abspath(SUMMARY_FILE))))
|