File size: 10,356 Bytes
f5863be
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
#!/usr/bin/python

#--------------------------------
# Written by Marzyeh Ghassemi, CSAIL, MIT 
# Sept 21, 2012
# Updated for Python 3, added Notebook, db connection
# by Tom J. Pollard 13 Nov, 2017
# Please contact the author with errors found. 
# mghassem {AT} mit {DOT} edu
#--------------------------------

from __future__ import with_statement
import nltk
import os
import os.path
import re
import string
import sys
import time

def addToDrugs(line, drugs, listing, genList):
    """
    Flag every listed drug that is mentioned in a line of text.

    line:    line of text to search
    drugs:   list of 0/1 flags, one slot per entry of genList; modified
             in place and also returned
    listing: dict mapping a generic name to its search pattern, a
             regular expression of the form "generic|brand1|brand2"
    genList: sequence of all generic names being searched for; the
             position of a name in this sequence is the slot in `drugs`
             that gets set

    Each pattern in `listing` is searched for case-insensitively in
    `line`. On a match, the slot of `drugs` belonging to that generic is
    set to 1. Slots are never reset to 0 here.
    """
    # Map each generic name to its position. If a name occurs more than
    # once, the last position wins.
    position = {}
    for idx, name in enumerate(genList):
        position[name] = idx

    for generic in listing:
        pattern = listing[generic]
        if re.search(pattern, line, re.I) is not None:
            drugs[position[generic]] = 1
    return drugs

def readDrugs(f, genList):
    """
    Read a drug list file into a {generic: search pattern} dictionary.

    f:       open text file (or file-like object with a read() method)
             holding one drug per line in the form
             "generic|brand1|brand2"
    genList: list that collects the generic names; the list of generics
             found in this file is appended to it as ONE element, so
             genList ends up holding one sub-list per file read

    Returns a dictionary keyed by the lower-cased generic name (the
    first name on the line) whose value is the lower-cased
    "generic|brand1|brand2" string, usable as a regular expression.

    The generic name and its pattern are taken from the same line, so
    blank lines cannot shift the pairing. Whitespace around each name
    and empty names (e.g. from a trailing '|') are dropped, because an
    empty alternative in the pattern would match every line of text.
    A line holding a single name without any '|' is accepted as a drug
    whose only search term is its generic name.
    """
    listing = {}
    generics = []
    for raw_line in f.read().split("\n"):
        names = [name.strip() for name in raw_line.lower().split("|")]
        names = [name for name in names if name]
        if not names:
            # Blank line (or nothing but separators): no drug here.
            continue
        generic = names[0]
        generics.append(generic)
        listing[generic] = "|".join(names)
    genList.append(generics)
    return listing

def search(NOTES,
           SSRI_FILE = os.path.join(os.getcwd(), "SSRI_list.txt"),
           MISC_FILE = os.path.join(os.getcwd(), "MISC_list.txt"),
           SUMMARY_FILE = "output.csv",
           VERBOSE = False):
    """
    Search clinical notes for listed drugs and write a per-note summary CSV.

    NOTES:        dataframe loaded from the noteevents table. It is
                  iterated with itertuples(), and each row must expose
                  row_id, subject_id, hadm_id and text.
    SSRI_FILE:    list of SSRI drugs to search for (required).
    MISC_FILE:    list of additional drugs to search for (optional; if
                  it cannot be opened the search runs with SSRIs only).
    SUMMARY_FILE: name of the output CSV file. If it already exists,
                  a message is printed and nothing is done.
    VERBOSE:      if True, print extra diagnostics while parsing.

    NB: drug files should have a line for each distinct drug type,
        and drugs should be separated by a vertical bar '|'

    Each note is scanned line by line. Lines that look like section
    headers switch the current section between past medical history,
    admission medications, discharge medications, or none. Depression
    is looked for inside the history section, and the listed drugs
    inside the two medication sections.

    One CSV row is written per note: identifiers, the section/depression
    flags, a group code, one flag per drug file (SSRI, MISC) and one
    flag per generic drug. The per-file and per-drug flags reflect the
    ADMISSION medications only.

    Returns None.
    """

    if os.path.isfile(SUMMARY_FILE):
        print('The output file already exists.\n\nRemove the following file or save with a different filename:')
        print(os.path.join(os.getcwd(), SUMMARY_FILE))
        return

    starttime = time.time()

    # Keep a list of all generics we are looking for (one sub-list per file)
    genList = []

    # Get the drugs into a structure we can use
    with open(SSRI_FILE) as f:
        SSRI = readDrugs(f, genList)
        print("Using drugs from {}".format(SSRI_FILE))

    # The additional drug list is optional: only a file that cannot be
    # opened or read is tolerated, anything else is a real error.
    try:
        with open(MISC_FILE) as f:
            MISC = readDrugs(f, genList)
            print("Using additional drugs from {}".format(MISC_FILE))
    except (IOError, OSError):
        MISC = None
        if VERBOSE:
            print("No additional drugs loaded from {}".format(MISC_FILE))
    flatList = [item for sublist in genList for item in sublist]

    # Create indices for the flat list
    # This allows us to understand which "types" are being used
    starts = []
    ends = []
    prevLeng = 0
    for leng in [len(names) for names in genList]:
        starts.append(prevLeng)
        ends.append(prevLeng + leng - 1)
        prevLeng = prevLeng + leng

    # Compile the patterns once instead of once per line of every note.
    # Heuristic for a section header: optional "1." / "A)" style prefix,
    # a short title, then ':' or a linking word.
    header_re = re.compile(
        r"^((\d|[A-Z])(\.|\)))?\s*([a-zA-Z',\.\-\*\d\[\]\(\) ]+)(:| WERE | IS | ARE |INCLUDED|INCLUDING)",
        re.I)
    hist_re = re.compile(r'med(ical)?\s+hist(ory)?', re.I)
    meds_re = re.compile(r'medication|meds', re.I)
    disch_re = re.compile(r'disch(arge)?', re.I)
    admit_re = re.compile(
        r'admission|admitting|home|nh|nmeds|pre(\-|\s)?(hosp|op)|current|previous|outpatient|outpt|outside|^[^a-zA-Z]*med(ication)?(s)?',
        re.I)
    depression_re = re.compile(r'depression', re.I)
    depression_meds_re = re.compile(r'depression\s+med(ication)?(s)?', re.I)
    uncertain_re = re.compile(r'admission|discharge|transfer', re.I)

    # Limit the analysis to discharge summaries
    # Comment out because limitation is now in SQL query
    # NOTES = NOTES[NOTES['category'] == 'Discharge summary']

    # Write heads and notes to new doc
    with open(SUMMARY_FILE, 'a') as f_out:
        f_out.write('"ROW_ID","SUBJECT_ID","HADM_ID","HIST_FOUND","DEPRESSION","ADMIT_FOUND","DIS_FOUND","GEN_DEPRESS_MEDS_FOUND","GROUP","SSRI","MISC","' \
            + '","'.join(flatList) + '"\n')

        # Parse each patient record
        print("Reading documents...")

        for note in NOTES.itertuples():
            if note.Index % 100 == 0:
                print("...index: {}. row_id: {}. subject_id: {}. hadm_id: {}. \n".format(note.Index, note.row_id, note.subject_id, note.hadm_id))
                sys.stdout.flush()

            # Reset some per-patient variables
            section = ""
            admitFound = 0 # admission medication section found
            dischargeFound = 0 # discharge medication section found
            histFound = 0 # medical history found
            depressionHist = 0
            drugsAdmit = [0]*len(flatList)
            drugsDis = [0]*len(flatList)
            general_depression_drugs = 0

            # Read through lines sequentially
            # If this looks like a section header, start looking for drugs
            for line in note.text.split("\n"):

                if header_re.search(line):
                    # A header that is none of the sections below ends the
                    # current section (newSection stays empty).
                    newSection = ""

                    # Past Medical History Section
                    if hist_re.search(line):
                        newSection = "hist"
                        histFound = 1

                    # Discharge Medication Section
                    elif meds_re.search(line) and disch_re.search(line):
                        newSection = "discharge"
                        dischargeFound = 1

                    # Admitting Medication Section
                    elif admit_re.search(line) \
                    and (section == "admit" or meds_re.search(line)):
                        newSection = "admit"
                        admitFound = 1

                    section = newSection

                # If in history section, search for depression
                if section == "hist":
                    if depression_re.search(line):
                        depressionHist = 1

                # If in meds section, look at each line for specific drugs
                elif section == "admit":
                    drugsAdmit = addToDrugs(line, drugsAdmit, SSRI, flatList)
                    if MISC:
                        drugsAdmit = addToDrugs(line, drugsAdmit, MISC, flatList)

                    ## Section just has something like 'Depression meds'
                    if depression_meds_re.search(line):
                        general_depression_drugs = 1

                ## Already in meds section, look at each line for specific drugs
                elif section == "discharge":
                    drugsDis = addToDrugs(line, drugsDis, SSRI, flatList)
                    if MISC:
                        drugsDis = addToDrugs(line, drugsDis, MISC, flatList)

                # A line with information which we are uncertain about...
                elif meds_re.search(line) and uncertain_re.search(line):
                    if VERBOSE:
                        print('?? {}'.format(line))

            admitDrugFound = 1 in drugsAdmit
            disDrugFound = 1 in drugsDis

            # NOTE: group stays 0 both for a genuine "Group 0" note and for
            # notes that match none of the rules below (the "uncertain" case).
            group = 0
            # Group 0: Patient has no medications on admission section (or no targeted meds)
            #          and medications on discharge from the list
            if dischargeFound == 1 and disDrugFound and (admitFound == 0 or not admitDrugFound):
                group = 0

            # Group 1: Patient has a medications on admission section with no targeted meds
            #          and no medications on discharge
            elif admitFound == 1 and not admitDrugFound and dischargeFound == 0 and general_depression_drugs == 0:
                group = 1

            # Group 2: Patient has medications on admission section, but none from the list
            #          and no medications on discharge from the list
            elif admitFound == 1 and not admitDrugFound and dischargeFound == 1 and not disDrugFound and general_depression_drugs == 0:
                group = 2

            # Group 3: Patient has medications on admission (at least one from the list)
            elif admitDrugFound:
                group = 3

            else:
                if VERBOSE:
                    print('Uncertain about group type for row_id = {}'.format(note.row_id))

            if VERBOSE:
                print('group is {}'.format(group))

            # Count the types of each drug (one flag per drug file), based
            # on the admission medications.
            member = [int(1 in drugsAdmit[s:e+1]) for s, e in zip(starts, ends)]
            # The header always has both an SSRI and a MISC column; when the
            # MISC file was not loaded, write 0 so later columns stay aligned.
            member.extend([0] * (2 - len(member)))

            # save items to csv
            leading = [note.row_id, note.subject_id, note.hadm_id, histFound,
                       depressionHist, admitFound, dischargeFound,
                       general_depression_drugs, group]
            f_out.write(",".join(map(str, leading)) + "," \
                + ",".join(map(str, member)) + "," \
                + ",".join(map(str, drugsAdmit)) + "\n")

    # Print summary of analysis
    stoptime = time.time()
    elapsed = stoptime - starttime
    # Guard against a zero elapsed time (coarse timers, empty input).
    rate = round(len(NOTES) / elapsed, 2) if elapsed > 0 else float("inf")
    print("Done analyzing {} documents in {} seconds ({} docs/sec)".format(len(NOTES),
        round(elapsed, 2), rate))
    print("Summary file is in {}".format(os.getcwd()))