NLP_Project_Code.ipynb 14.5 KB
Newer Older
Zaid A Ali's avatar
Zaid A Ali committed
1
2
3
4
5
6
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
7
    "For this project, our goal is to create an NLP model that automatically assigns ICD-9 codes, given the clinical notes for each encounter."
8
9
10
11
   ]
  },
  {
   "cell_type": "code",
12
   "execution_count": 16,
13
   "metadata": {},
14
15
16
17
18
19
20
21
22
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "All modules imported successfully\n"
     ]
    }
   ],
23
24
25
   "source": [
    "#imports\n",
    "import pandas as pd\n",
26
27
28
29
30
31
32
33
34
35
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.naive_bayes import MultinomialNB\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.svm import LinearSVC\n",
    "from sklearn.model_selection import cross_val_score\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import confusion_matrix\n",
    "\n",
    "\n",
36
37
38
39
40
    "print(\"All modules imported successfully\")"
   ]
  },
  {
   "cell_type": "code",
41
   "execution_count": 37,
42
   "metadata": {},
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Fetching data\n",
      "Done fetching all the data\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>TEXT</th>\n",
       "      <th>ICD9_CODE</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Admission Date:  [**2141-9-18**]              ...</td>\n",
       "      <td>40301</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>PATIENT/TEST INFORMATION:\\nIndication: Pericar...</td>\n",
       "      <td>40301</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Sinus rhythm\\nRightward axis\\nSince previous t...</td>\n",
       "      <td>40301</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Chief Complaint:  hypotension, altered mental ...</td>\n",
       "      <td>40301</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Chief Complaint:  hypotension, altered mental ...</td>\n",
       "      <td>40301</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                TEXT ICD9_CODE\n",
       "0  Admission Date:  [**2141-9-18**]              ...     40301\n",
       "1  PATIENT/TEST INFORMATION:\\nIndication: Pericar...     40301\n",
       "2  Sinus rhythm\\nRightward axis\\nSince previous t...     40301\n",
       "3  Chief Complaint:  hypotension, altered mental ...     40301\n",
       "4  Chief Complaint:  hypotension, altered mental ...     40301"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
121
   "source": [
122
123
124
    "print(\"Fetching data\")\n",
    "\"\"\"\n",
    "\n",
125
126
127
128
129
130
131
132
133
    "diagnoses = pd.read_csv(\"DIAGNOSES_ICD.csv\")\n",
    "diagnoses_clean = diagnoses[[\"SUBJECT_ID\", \"HADM_ID\", \"ICD9_CODE\"]]\n",
    "\n",
    "note_events = pd.read_csv(\"NOTEEVENTS.csv\", engine=\"python\", on_bad_lines='skip')\n",
    "note_events_clean = note_events[[\"SUBJECT_ID\", \"HADM_ID\",\"DESCRIPTION\", \"TEXT\"]]\n",
    "\n",
    "full_dataset = pd.merge(diagnoses_clean, note_events_clean, on =[\"HADM_ID\", \"SUBJECT_ID\"])\n",
    "full_dataset = full_dataset[:40000]\n",
    "\n",
134
135
136
137
138
139
    "\"\"\"\n",
    "\n",
    "# Read the cached extract, keep only the note text and its ICD-9 label,\n",
    "# and limit the working set to the first 1000 rows.\n",
    "icd_extract = pd.read_csv(\"ICDdata40k.csv\")\n",
    "full_dataset = icd_extract[[\"TEXT\", \"ICD9_CODE\"]].iloc[:1000]\n",
    "print(\"Done fetching all the data\")\n",
    "full_dataset.head()"
140
141
142
143
144
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
145
   "metadata": {},
146
147
148
149
150
151
152
153
154
   "outputs": [],
   "source": [
    "import nltk"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
155
156
157
158
159
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
160
161
162
      "Transforming descriptions into TF-IDF vectors\n",
      "(1000, 9878)\n",
      "Done transforming data\n"
163
164
165
166
     ]
    }
   ],
   "source": [
167
168
169
170
171
172
173
174
175
    "# TF-IDF vectorization of the note text\n",
    "print(\"Transforming descriptions into TF-IDF vectors\")\n",
    "texts = full_dataset.TEXT\n",
    "icd_codes = full_dataset.ICD9_CODE\n",
    "\n",
    "# Unigrams + bigrams, English stop words removed, terms kept only if they occur in\n",
    "# at least 5 notes; sublinear_tf uses 1 + log(tf) and each row is L2-normalised.\n",
    "tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')\n",
    "\n",
    "# fit_transform returns a scipy sparse matrix. It is deliberately NOT densified with\n",
    "# .toarray(): a dense copy needs n_docs * n_terms floats, which does not scale past a\n",
    "# small sample, and cross_val_score, train_test_split and the classifiers used below\n",
    "# all accept sparse input.\n",
    "feature_vectors = tfidf.fit_transform(texts)\n",
    "print(feature_vectors.shape)\n",
    "print(\"Done transforming data\")\n"
176
177
178
179
   ]
  },
  {
   "cell_type": "code",
180
   "execution_count": 39,
181
   "metadata": {},
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Model</th>\n",
       "      <th>Average_Accuracy</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>LinearSVC</td>\n",
       "      <td>0.038</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>MultinomialNB</td>\n",
       "      <td>0.041</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>LogisticRegression</td>\n",
       "      <td>0.038</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>RandomForestClassifier</td>\n",
       "      <td>0.041</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                    Model  Average_Accuracy\n",
       "0               LinearSVC             0.038\n",
       "1           MultinomialNB             0.041\n",
       "2      LogisticRegression             0.038\n",
       "3  RandomForestClassifier             0.041"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
246
   "source": [
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
    "# Compare candidate classifiers by their mean cross-validated accuracy on the TF-IDF features\n",
    "models = [\n",
    "    LinearSVC(),\n",
    "    MultinomialNB(),\n",
    "    LogisticRegression(random_state=0),\n",
    "    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),\n",
    "]\n",
    "\n",
    "# Number of cross-validation folds passed to cross_val_score\n",
    "CV = 2\n",
    "\n",
    "# Collect one record per model and build the results frame once after the loop.\n",
    "# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, and it\n",
    "# copied the whole frame on every call.\n",
    "cv_records = []\n",
    "for model in models:\n",
    "    accuracies = cross_val_score(model, feature_vectors, icd_codes, scoring='accuracy', cv=CV)\n",
    "    cv_records.append({\"Model\": model.__class__.__name__, \"Average_Accuracy\": accuracies.mean()})\n",
    "\n",
    "cross_val_results = pd.DataFrame(cv_records, columns=[\"Model\", \"Average_Accuracy\"])\n",
Zaid A Ali's avatar
Zaid A Ali committed
266
    "\n",
267
268
    "\n",
    "cross_val_results"
Zaid A Ali's avatar
Zaid A Ali committed
269
270
271
272
   ]
  },
  {
   "cell_type": "code",
273
   "execution_count": 40,
Zaid A Ali's avatar
Zaid A Ali committed
274
   "metadata": {},
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Done exploring Random Forest Classifier\n"
     ]
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAQEAAAECCAYAAAD+eGJTAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8+yak3AAAACXBIWXMAAAsTAAALEwEAmpwYAAAN00lEQVR4nO3dX4xc5X3G8efZXRuzQBqsDZbrQhP+Rs5F7WpF0hYVEG3q5ga4icpFZVWRnLYgJVJUCeUmuanETUJbqY3kFIovEtpIiQMXURtkRaZVU7ebAMEOpKDUKLaMHYtUcYv/sr9e7EFdyK7n3Zl3zjmzv+9HsmbmzLvv+Z0942fPmXnnPY4IAchrqusCAHSLEACSIwSA5AgBIDlCAEiOEACS6ywEbO+y/SPbr9p+uKs6RmH7qO0XbT9ve6HrekrYftz2KduHly3bbPsZ2680t9d2WePlrFL/520fb/bD87Y/1mWNl2P7etvfsf1D20dsf6pZ3tk+6CQEbE9L+mtJvy9pu6QHbG/vopYK7o6IHREx33UhhZ6QtOtdyx6WdCAibpF0oHncV0/oF+uXpEeb/bAjIr7Vck1rcUnSZyJiu6SPSHqwee13tg+6OhK4XdKrEfHjiLgg6e8l3dtRLalExLOS3njX4nsl7Wvu75N0X5s1rcUq9U+MiDgREd9v7p+R9JKkbepwH3QVAtsk/WTZ42PNskkTkr5t+3u293RdzAi2RMSJ5v7rkrZ0WcyQHrL9g+Z0obenM8vZfr+knZIOqcN9wBuDo7kjIn5dS6c1D9r+7a4LGlUsjSOftLHkX5J0k6Qdkk5I+kKn1RSwfbWkr0v6dET8fPlzbe+DrkLguKTrlz3+lWbZRImI483tKUn7tXSaM4lO2t4qSc3tqY7rWZOIOBkRb0XEoqQvq+f7wfYGLQXAVyLiG83izvZBVyHwH5Jusf0B2xsl/YGkpzuqZSi2r7J9zdv3JX1U0uHL/1RvPS1pd3N/t6SnOqxlzd7+z9O4Xz3eD7Yt6TFJL0XEF5c91dk+cFffImw+xvkLSdOSHo+IP++kkCHZvlFLf/0laUbSVydhG2w/KekuSXOSTkr6nKRvSvqapBskvSbp4xHRyzffVqn/Li2dCoSko5I+uez8ulds3yHpnyW9KGmxWfxZLb0v0Mk+6CwEAPQDbwwCyRECQHKEAJAcIQAkRwgAyXUeAhM+3Hbi65cmfxsmvX6p223oPAQkTfoOnPT6pcnfhkmvX+pwG/oQAgA61OpgoY2+Ijbpqncsu6jz2qArWqthNd5UVkOcO/+OxyvV75mZsr4uXSorbsza2IaS/obt68LiWW2cunLo/opc/Yv9r+h/zg7V/bj/H5zT/+pCnPdKz5Xt6VXY3iXpL7U09PdvI+KRy7XfpKv0Yd8zyirHZvrm24ravXXkR4P7mruurK+T/f2eTu1tKOmvZl9r6a/E4vzOonZTB5+rts6aDsWBVZ8b+nRgnc0OBKQ1ynsCzA4ErAOjhMB6mR0ISG2k9wRKNJ9/7pGkTZod9+oArNEoRwJFswNFxN6ImI+I+T58CgDgnUYJgYmfHQjACKcDEXHJ9kOS/kn/PzvQkWqVAWjFSO8JNBd56POFHoqVfP5f3FePP/8vVXsbavZXPJ5gS73xBK/uni7q69aDRc16hWHDQHKEAJAcIQAkRwgAyRECQHKEAJAcIQAkRwgAyY39C0QZ1RykUnu962EgU6ma2/qeFzdW66tvOBIAkiMEgOQIASA5QgBIjhAAkiMEgOQIASA5QgBIjhAAkmPE4BhkGpWXxQt/9jdF7X7v0R3jLWQMOBIAkiMEgOQIASA5QgBIjhAAkiMEgOQIASA5QgBIjhAAkmt1xKA3XaHpm28b2K7mxUG7MP2hwdso9Xs7a29DSX81+5Iknf5ZWbsCN/3DHxe1u1n/Vm2dbeFIAEiOEACSIwSA5AgBIDlCAEiOEACSIwSA5AgBIDlCAEiu1RGD
ce58r0fJVVNxpFpnKm9Dzf1ePLKw6tWhbyzqaxJxJAAkN9KRgO2jks5IekvSpYiYr1EUgPbUOB24OyJOV+gHQAc4HQCSGzUEQtK3bX/P9p6VGtjeY3vB9sJFnR9xdQBqG/V04I6IOG77OknP2H45Ip5d3iAi9kraK0nv8eYYcX0AKhvpSCAijje3pyTtl3R7jaIAtGfoELB9le1r3r4v6aOSDtcqDEA7Rjkd2CJpv+23+/lqRPzj5X7AMzOanhs8gKOLC3ou3rmzqN3UwecGtlkPFyS9+MFtRe2mCre1i+nFUgxMq2DoEIiIH0v6tYq1AOgAHxECyRECQHKEAJAcIQAkRwgAyRECQHKEAJAcIQAk1+r0Yll0NaKt5kjFDaffLFtnYX8X52YHtunzX6Qbv7l+vwHb5987gBYQAkByhACQHCEAJEcIAMkRAkByhACQHCEAJEcIAMm1O2JwZlqau3Zwuw7m6Du3eWNRu8Hj3pjbbiUbXj4+sE3p6MMuLvha8/XRNxwJAMkRAkByhACQHCEAJEcIAMkRAkByhACQHCEAJEcIAMkxx+AYvPFHv1HUbvPffXfMlaxTJaNOJU1XXOXMn75e1nB/xZW2hCMBIDlCAEiOEACSIwSA5AgBIDlCAEiOEACSIwSA5FodLBTnzvd26q3Z/Yeq9fW+f3+jqF3xdFod6PPFUrt4DR39yXxRu1v12pgrqY8jASC5gSFg+3Hbp2wfXrZss+1nbL/S3JaN4wTQOyVHAk9I2vWuZQ9LOhARt0g60DwGMIEGhkBEPCvp3Se590ra19zfJ+m+umUBaMuw7wlsiYgTzf3XJW2pVA+Alo38xmBEhKRY7Xnbe2wv2F64qPOjrg5AZcOGwEnbWyWpuV3185+I2BsR8xExv0FXDLk6AOMybAg8LWl3c3+3pKfqlAOgbSUfET4p6buSbrN9zPYnJD0i6XdtvyLpd5rHACbQwBGDEfHAKk/ds9aVeWZG03PXDWxXc3RZqcU7dxa1mzr43MA2P719c1Ffm48UNSs2/aHbBrbpasRmzdqmtwx+Da1Fyett9pfOVl1nnzBiEEiOEACSIwSA5AgBIDlCAEiOEACSIwSA5AgBIDlCAEiu3TkGL13qZDRgiZKRgKXWwxyDtdUcqdjX19Ck4kgASI4QAJIjBIDkCAEgOUIASI4QAJIjBIDkCAEgOUIASK7VEYN9nmOwdN66ktrO3Preor5mK88x2NcrPtdWc1+V9vfLf7WxqK9JxJEAkBwhACRHCADJEQJAcoQAkBwhACRHCADJEQJAckwv1qhZ1+z+Q9X6Wos+X5C0ptqvoZL+jv3JzUV93XBw1Grax5EAkBwhACRHCADJEQJAcoQAkBwhACRHCADJEQJAcoQAkBzTizVKRttJZSPuava1FjX7qz6F14SPZpx7Yf1ePpYjASC5gSFg+3Hbp2wfXrbs87aP236++fex8ZYJYFxKjgSekLRrheWPRsSO5t+36pYFoC0DQyAinpX0Rgu1AOjAKO8JPGT7B83pwrXVKgLQqmFD4EuSbpK0Q9IJSV9YraHtPbYXbC9cWDw75OoAjMtQIRARJyPirYhYlPRlSbdfpu3eiJiPiPmNU1cOWyeAMRkqBGxvXfbwfkmHV2sLoN8GDhay/aSkuyTN2T4m6XOS7rK9Q1JIOirpk+MrEcA4DQyBiHhghcWPDbOyXs8xWHO02umf1etrDUpG+ZX+/qvP49fj0YDZMWIQSI4QAJIjBIDkCAEgOUIASI4QAJIjBIDkCAEgOUIASK7VOQZ19ZVanN85sNnUwedaKOadqs6pN1f4zeoOrq7blcU76+332vMflrjmP/+7bJ3V1tgejgSA5AgBIDlCAEiOEACSIwSA5AgBIDlCAEiOEACSa/eCpOcuasPLxwe262TARcUBPmdufW9RV7NHylZZquZFP0sG90jlA3w2nH5zYJvi/V64r6YLuysZVHRxbraor0n8qzqJNQOoiBAAkiMEgOQIASA5
QgBIjhAAkiMEgOQIASA5QgBIrtURg1kuSHrNv/5X2TqrrbG+kpGdUvk21Pz9dnFx02N3X1nU7oaDYy5kDDgSAJIjBIDkCAEgOUIASI4QAJIjBIDkCAEgOUIASI4QAJJrd47BmRlNzw2+mGQXowrfvP/DRe1m9x8a2ObiB7cV9TVV+4KkNUfSdXRR1RJdXJB07oU+j+8czcAjAdvX2/6O7R/aPmL7U83yzbafsf1Kc1v4qgHQJyWnA5ckfSYitkv6iKQHbW+X9LCkAxFxi6QDzWMAE2ZgCETEiYj4fnP/jKSXJG2TdK+kfU2zfZLuG1ONAMZoTW8M2n6/pJ2SDknaEhEnmqdel7SlbmkA2lAcAravlvR1SZ+OiJ8vfy4iQlKs8nN7bC/YXriweHakYgHUVxQCtjdoKQC+EhHfaBaftL21eX6rpBXfio2IvRExHxHzG6fKvpMNoD0lnw5Y0mOSXoqILy576mlJu5v7uyU9Vb88AONWMk7gtyT9oaQXbT/fLPuspEckfc32JyS9JunjY6kQwFgNDIGI+BdJXuXpe+qWA6BtrY4Y7LP1MC9gn5WM8isd4Vd7RGZJbev59cF3B4DkCAEgOUIASI4QAJIjBIDkCAEgOUIASI4QAJJrd7DQzHTZtFUdTFl15jc/UNRudv/g2mpfzHNdqLjfq18stWC9i3fuLOqr9pRxbeBIAEiOEACSIwSA5AgBIDlCAEiOEACSIwSA5AgBIDlCAEjOS5cMaGll9k+1NCnpcnOSTrdWRH2TXr80+dsw6fVL49+GX42I9630RKshsGIB9kJEzHdaxAgmvX5p8rdh0uuXut0GTgeA5AgBILk+hMDergsY0aTXL03+Nkx6/VKH29D5ewIAutWHIwEAHSIEgOQIASA5QgBIjhAAkvs/WRLh0wdQMI0AAAAASUVORK5CYII=\n",
      "text/plain": [
       "<Figure size 288x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
296
   "source": [
297
298
299
300
301
    "#Random Forest Classifier ties with MultinomialNB for the highest average accuracy above (slightly ahead of the other two models), so it is time to explore it in more depth\n",
    "from matplotlib.pyplot import matshow\n",
    "from seaborn import heatmap\n",
    "model = RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0)\n",
    "X_train, X_test, y_train, y_test = train_test_split(feature_vectors, icd_codes, test_size=0.33, random_state=0)\n",
302
    "\n",
303
304
    "model.fit(X_train, y_train)\n",
    "y_pred = model.predict(X_test)\n",
305
    "\n",
306
    "conf_mat = confusion_matrix(y_test, y_pred)\n",
307
    "\n",
308
309
310
    "matshow(conf_mat)\n",
    "#heatmap(conf_mat)\n",
    "print(\"Done exploring Random Forest Classifier\")\n"
311
   ]
312
313
314
315
316
317
318
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
Zaid A Ali's avatar
Zaid A Ali committed
319
320
321
322
  }
 ],
 "metadata": {
  "kernelspec": {
323
   "display_name": "Python 3",
Zaid A Ali's avatar
Zaid A Ali committed
324
   "language": "python",
325
   "name": "python3"
Zaid A Ali's avatar
Zaid A Ali committed
326
327
328
329
330
331
332
333
334
335
336
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
337
   "version": "3.8.8"
Zaid A Ali's avatar
Zaid A Ali committed
338
339
340
  }
 },
 "nbformat": 4,
341
 "nbformat_minor": 4
Zaid A Ali's avatar
Zaid A Ali committed
342
}