Skip to content
GitLab
Menu
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
Chia Ying Chiu
NLP Group Project
Commits
647c190f
Commit
647c190f
authored
Nov 28, 2021
by
Zaid A Ali
Browse files
Clean up code to make it more readable and add metrics
parent
dd422fef
Changes
1
Show whitespace changes
Inline
Side-by-side
NLP_Project_Code.ipynb
View file @
647c190f
...
...
@@ -9,7 +9,7 @@
},
{
"cell_type": "code",
"execution_count": 1
6
,
"execution_count": 1
0
,
"metadata": {},
"outputs": [
{
...
...
@@ -31,6 +31,8 @@
"from sklearn.model_selection import cross_val_score\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.metrics import confusion_matrix\n",
"import warnings\n",
"warnings.filterwarnings('ignore')\n",
"\n",
"\n",
"print(\"All modules imported successfully\")"
...
...
@@ -38,7 +40,7 @@
},
{
"cell_type": "code",
"execution_count":
37
,
"execution_count":
2
,
"metadata": {},
"outputs": [
{
...
...
@@ -113,7 +115,7 @@
"4 Chief Complaint: hypotension, altered mental ... 40301"
]
},
"execution_count":
37
,
"execution_count":
2
,
"metadata": {},
"output_type": "execute_result"
}
...
...
@@ -121,16 +123,18 @@
"source": [
"print(\"Fetching data\")\n",
"\"\"\"\n",
"#This code is to import data from the MIMIC-III files.\n",
"\n",
"diagnoses = pd.read_csv(\"DIAGNOSES_ICD.csv\")\n",
"diagnoses_clean = diagnoses[[\"SUBJECT_ID\", \"HADM_ID\", \"ICD9_CODE\"]]\n",
"#For performance reasons, this code has been commented out\n",
"\n",
"diagnoses = pd.read_csv(\"DIAGNOSES_ICD.csv\")\n",
"note_events = pd.read_csv(\"NOTEEVENTS.csv\", engine=\"python\", on_bad_lines='skip')\n",
"note_events_clean = note_events[[\"SUBJECT_ID\", \"HADM_ID\",\"DESCRIPTION\", \"TEXT\"]]\n",
"\n",
"full_dataset = pd.merge(diagnoses_clean, note_events_clean, on =[\"HADM_ID\", \"SUBJECT_ID\"])\n",
"full_dataset = pd.merge(diagnoses, note_events, on =[\"HADM_ID\", \"SUBJECT_ID\"])\n",
"full_dataset = full_dataset[:40000]\n",
"\n",
"print(full_dataset.head())\n",
"full_dataset.to_csv(\"ICDdata40k.csv\")\n",
"\n",
"\"\"\"\n",
"\n",
"full_dataset = pd.read_csv(\"ICDdata40k.csv\")[[ \"TEXT\", \"ICD9_CODE\"]]\n",
...
...
@@ -150,7 +154,7 @@
},
{
"cell_type": "code",
"execution_count": 3
8
,
"execution_count": 3,
"metadata": {},
"outputs": [
{
...
...
@@ -177,7 +181,7 @@
},
{
"cell_type": "code",
"execution_count":
39
,
"execution_count":
4
,
"metadata": {},
"outputs": [
{
...
...
@@ -238,7 +242,7 @@
"3 RandomForestClassifier 0.041"
]
},
"execution_count":
39
,
"execution_count":
4
,
"metadata": {},
"output_type": "execute_result"
}
...
...
@@ -270,7 +274,7 @@
},
{
"cell_type": "code",
"execution_count":
40
,
"execution_count":
8
,
"metadata": {},
"outputs": [
{
...
...
@@ -282,7 +286,7 @@
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAQEAAAECCAYAAAD+eGJTAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8+yak3AAAACXBIWXMAAAsTAAALEwEAmpwYAAA
N00lEQVR4nO3dX4xc5X3G8efZXRuzQBqsDZbrQhP+Rs5F7WpF0hYVEG3q5ga4icpFZVWRnLYgJVJUCeUmuanETUJbqY3kFIovEtpIiQMXURtkRaZVU7ebAMEOpKDUKLaMHYtUcYv/sr9e7EFdyK7n3Zl3zjmzv+9HsmbmzLvv+Z0942fPmXnnPY4IAchrqusCAHSLEACSIwSA5AgBIDlCAEiOEACS6ywEbO+y/SPbr9p+uKs6RmH7qO0XbT9ve6HrekrYftz2KduHly3bbPsZ2680t9d2WePlrFL/520fb/bD87Y/1mWNl2P7etvfsf1D20dsf6pZ3tk+6CQEbE9L+mtJvy9pu6QHbG/vopYK7o6IHREx33UhhZ6QtOtdyx6WdCAibpF0oHncV0/oF+uXpEeb/bAjIr7Vck1rcUnSZyJiu6SPSHqwee13tg+6OhK4XdKrEfHjiLgg6e8l3dtRLalExLOS3njX4nsl7Wvu75N0X5s1rcUq9U+MiDgREd9v7p+R9JKkbepwH3QVAtsk/WTZ42PNskkTkr5t+3u293RdzAi2RMSJ5v7rkrZ0WcyQHrL9g+Z0obenM8vZfr+knZIOqcN9wBuDo7kjIn5dS6c1D9r+7a4LGlUsjSOftLHkX5J0k6Qdkk5I+kKn1RSwfbWkr0v6dET8fPlzbe+DrkLguKTrlz3+lWbZRImI483tKUn7tXSaM4lO2t4qSc3tqY7rWZOIOBkRb0XEoqQvq+f7wfYGLQXAVyLiG83izvZBVyHwH5Jusf0B2xsl/YGkpzuqZSi2r7J9zdv3JX1U0uHL/1RvPS1pd3N/t6SnOqxlzd7+z9O4Xz3eD7Yt6TFJL0XEF5c91dk+cFffImw+xvkLSdOSHo+IP++kkCHZvlFLf/0laUbSVydhG2w/KekuSXOSTkr6nKRvSvqapBskvSbp4xHRyzffVqn/Li2dCoSko5I+uez8ulds3yHpnyW9KGmxWfxZLb0v0Mk+6CwEAPQDbwwCyRECQHKEAJAcIQAkRwgAyXUeAhM+3Hbi65cmfxsmvX6p223oPAQkTfoOnPT6pcnfhkmvX+pwG/oQAgA61OpgoY2+Ijbpqncsu6jz2qArWqthNd5UVkOcO/+OxyvV75mZsr4uXSorbsza2IaS/obt68LiWW2cunLo/opc/Yv9r+h/zg7V/bj/H5zT/+pCnPdKz5Xt6VXY3iXpL7U09PdvI+KRy7XfpKv0Yd8zyirHZvrm24ravXXkR4P7mruurK+T/f2eTu1tKOmvZl9r6a/E4vzOonZTB5+rts6aDsWBVZ8b+nRgnc0OBKQ1ynsCzA4ErAOjhMB6mR0ISG2k9wRKNJ9/7pGkTZod9+oArNEoRwJFswNFxN6ImI+I+T58CgDgnUYJgYmfHQjACKcDEXHJ9kOS/kn/PzvQkWqVAWjFSO8JNBd56POFHoqVfP5f3FePP/8vVXsbavZXPJ5gS73xBK/uni7q69aDRc16hWHDQHKEAJAcIQAkRwgAyRECQHKEAJAcIQAkRwgAyY39C0QZ1RykUnu962EgU6ma2/qeFzdW66tvOBIAkiMEgOQIASA5QgBIjhAAkiMEgOQIASA5QgBIjhAAkmPE4BhkGpWXxQt/9jdF7X7v0R3jLWQMOBIAkiMEgOQIASA5QgBIjhAAkiMEgOQIASA5QgBIjhAAkmt1xKA3XaHpm28b2K7mxUG7MP2hwdso9Xs7a29DSX81+5Iknf5ZWbsCN/3DHxe1u1n/Vm2dbeFIAEiOEACSIwSA5AgBIDlCAEiOEACSIwSA5AgBIDlCAEiu1RGDce58r0fJVVNxpFpnKm9Dzf1ePLKw6tWhbyzqaxJxJAAkN9KRgO2jks5IekvSpYiYr1EUgPbUOB24OyJOV+gHQAc4HQCSGzUEQtK3bX/P9p6VGtjeY3vB9sJFnR9xdQBqG/V04I6IOG77OknP2H45Ip5d3iAi9kraK0nv8eYYcX0AKhvpSCAijje3pyT
tl3R7jaIAtGfoELB9le1r3r4v6aOSDtcqDEA7Rjkd2CJpv+23+/lqRPzj5X7AMzOanhs8gKOLC3ou3rmzqN3UwecGtlkPFyS9+MFtRe2mCre1i+nFUgxMq2DoEIiIH0v6tYq1AOgAHxECyRECQHKEAJAcIQAkRwgAyRECQHKEAJAcIQAk1+r0Yll0NaKt5kjFDaffLFtnYX8X52YHtunzX6Qbv7l+vwHb5987gBYQAkByhACQHCEAJEcIAMkRAkByhACQHCEAJEcIAMm1O2JwZlqau3Zwuw7m6Du3eWNRu8Hj3pjbbiUbXj4+sE3p6MMuLvha8/XRNxwJAMkRAkByhACQHCEAJEcIAMkRAkByhACQHCEAJEcIAMkxx+AYvPFHv1HUbvPffXfMlaxTJaNOJU1XXOXMn75e1nB/xZW2hCMBIDlCAEiOEACSIwSA5AgBIDlCAEiOEACSIwSA5FodLBTnzvd26q3Z/Yeq9fW+f3+jqF3xdFod6PPFUrt4DR39yXxRu1v12pgrqY8jASC5gSFg+3Hbp2wfXrZss+1nbL/S3JaN4wTQOyVHAk9I2vWuZQ9LOhARt0g60DwGMIEGhkBEPCvp3Se590ra19zfJ+m+umUBaMuw7wlsiYgTzf3XJW2pVA+Alo38xmBEhKRY7Xnbe2wv2F64qPOjrg5AZcOGwEnbWyWpuV3185+I2BsR8xExv0FXDLk6AOMybAg8LWl3c3+3pKfqlAOgbSUfET4p6buSbrN9zPYnJD0i6XdtvyLpd5rHACbQwBGDEfHAKk/ds9aVeWZG03PXDWxXc3RZqcU7dxa1mzr43MA2P719c1Ffm48UNSs2/aHbBrbpasRmzdqmtwx+Da1Fyett9pfOVl1nnzBiEEiOEACSIwSA5AgBIDlCAEiOEACSIwSA5AgBIDlCAEiu3TkGL13qZDRgiZKRgKXWwxyDtdUcqdjX19Ck4kgASI4QAJIjBIDkCAEgOUIASI4QAJIjBIDkCAEgOUIASK7VEYN9nmOwdN66ktrO3Preor5mK88x2NcrPtdWc1+V9vfLf7WxqK9JxJEAkBwhACRHCADJEQJAcoQAkBwhACRHCADJEQJAckwv1qhZ1+z+Q9X6Wos+X5C0ptqvoZL+jv3JzUV93XBw1Grax5EAkBwhACRHCADJEQJAcoQAkBwhACRHCADJEQJAcoQAkBzTizVKRttJZSPuava1FjX7qz6F14SPZpx7Yf1ePpYjASC5gSFg+3Hbp2wfXrbs87aP236++fex8ZYJYFxKjgSekLRrheWPRsSO5t+36pYFoC0DQyAinpX0Rgu1AOjAKO8JPGT7B83pwrXVKgLQqmFD4EuSbpK0Q9IJSV9YraHtPbYXbC9cWDw75OoAjMtQIRARJyPirYhYlPRlSbdfpu3eiJiPiPmNU1cOWyeAMRkqBGxvXfbwfkmHV2sLoN8GDhay/aSkuyTN2T4m6XOS7rK9Q1JIOirpk+MrEcA4DQyBiHhghcWPDbOyXs8xWHO02umf1etrDUpG+ZX+/qvP49fj0YDZMWIQSI4QAJIjBIDkCAEgOUIASI4QAJIjBIDkCAEgOUIASK7VOQZ19ZVanN85sNnUwedaKOadqs6pN1f4zeoOrq7blcU76+332vMflrjmP/+7bJ3V1tgejgSA5AgBIDlCAEiOEACSIwSA5AgBIDlCAEiOEACSa/eCpOcuasPLxwe262TARcUBPmdufW9RV7NHylZZquZFP0sG90jlA3w2nH5zYJvi/V64r6YLuysZVHRxbraor0n8qzqJNQOoiBAAkiMEgOQIASA5QgBIjhAAkiMEgOQIASA5QgBIrtURg1kuSHrNv/5X2TqrrbG+kpGdUvk21Pz9dnFx02N3X1nU7oaDYy5kDDgSAJIjBIDkCAEgOUIASI4QAJIjBIDkCAEgOUIASI4QAJJrd47BmRlNzw2+mGQXowrfvP/DRe1m9x8a2ObiB7cV9TVV+4KkNUfSdXRR1RJ
dXJB07oU+j+8czcAjAdvX2/6O7R/aPmL7U83yzbafsf1Kc1v4qgHQJyWnA5ckfSYitkv6iKQHbW+X9LCkAxFxi6QDzWMAE2ZgCETEiYj4fnP/jKSXJG2TdK+kfU2zfZLuG1ONAMZoTW8M2n6/pJ2SDknaEhEnmqdel7SlbmkA2lAcAravlvR1SZ+OiJ8vfy4iQlKs8nN7bC/YXriweHakYgHUVxQCtjdoKQC+EhHfaBaftL21eX6rpBXfio2IvRExHxHzG6fKvpMNoD0lnw5Y0mOSXoqILy576mlJu5v7uyU9Vb88AONWMk7gtyT9oaQXbT/fLPuspEckfc32JyS9JunjY6kQwFgNDIGI+BdJXuXpe+qWA6BtrY4Y7LP1MC9gn5WM8isd4Vd7RGZJbev59cF3B4DkCAEgOUIASI4QAJIjBIDkCAEgOUIASI4QAJJrd7DQzHTZtFUdTFl15jc/UNRudv/g2mpfzHNdqLjfq18stWC9i3fuLOqr9pRxbeBIAEiOEACSIwSA5AgBIDlCAEiOEACSIwSA5AgBIDlCAEjOS5cMaGll9k+1NCnpcnOSTrdWRH2TXr80+dsw6fVL49+GX42I9630RKshsGIB9kJEzHdaxAgmvX5p8rdh0uuXut0GTgeA5AgBILk+hMDergsY0aTXL03+Nkx6/VKH29D5ewIAutWHIwEAHSIEgOQIASA5QgBIjhAAkvs/WRLh0wdQMI0AAAAASUVORK5CYII=
\n",
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAQEAAAECCAYAAAD+eGJTAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8+yak3AAAACXBIWXMAAAsTAAALEwEAmpwYAAA
MCUlEQVR4nO3dT4wW9R3H8c+ngBgQE4ktpVRrNaQJl651RdOaBqW11At4sdW0oYkJHjTRxEOJF7000YPaHhoTVAIHsTVRKwdTJasJNm2Iq1kVpQ2GYCrlTw0HCKbI4reHHdLV7vLMPs88M8883/crIc/zzMw+851n4MPMPN/9jSNCAPL6StMFAGgWIQAkRwgAyRECQHKEAJAcIQAk11gI2F5n+x+2P7S9uak6emH7oO33bE/YHm+6njJsb7V9zPbeadOW2t5le3/xeEmTNZ7PLPU/ZPtQsR8mbN/SZI3nY/sy26/b/sD2+7bvLaY3tg8aCQHb8yT9XtJPJa2SdLvtVU3UUoEbI2IkIkabLqSkbZLWfWnaZkljEbFS0ljxelBt0//XL0mPF/thJCJerrmmuZiUdH9ErJJ0vaS7i7/7je2Dpo4EVkv6MCIORMRnkv4gaX1DtaQSEbslHf/S5PWSthfPt0vaUGdNczFL/a0REYcj4u3i+UlJ+yStUIP7oKkQWCHpn9Nef1xMa5uQ9Krtt2xvarqYHiyLiMPF8yOSljVZTJfusf1ucbowsKcz09m+QtLVkvaowX3AhcHe3BAR39PUac3dtn/YdEG9iqk+8rb1kj8h6SpJI5IOS3q00WpKsH2RpOcl3RcRJ6bPq3sfNBUChyRdNu31N4tprRIRh4rHY5Je1NRpThsdtb1ckorHYw3XMycRcTQizkbE55Ke1IDvB9sLNBUAz0TEC8XkxvZBUyHwpqSVtr9t+wJJP5e0s6FaumJ7se0l555LulnS3vP/1MDaKWlj8XyjpJcarGXOzv3jKdyqAd4Pti3paUn7IuKxabMa2wdu6rcIi69xfitpnqStEfGbRgrpku0rNfW/vyTNl7SjDdtg+1lJayRdKumopAcl/UnSc5Iul/SRpNsiYiAvvs1S/xpNnQqEpIOS7pp2fj1QbN8g6Q1J70n6vJj8gKauCzSyDxoLAQCDgQuDQHKEAJAcIQAkRwgAyRECQHKNh0DL221bX7/U/m1oe/1Ss9vQeAhIavsObHv9Uvu3oe31Sw1uwyCEAIAG1dosdIEXxoVa/IVpZ3RaC7Swthqq1vb6pZm3IZYsKvWzPvlpP0qakzr2Qb8/j35vw390Sp/Fac80b34vb2x7naTfaar196mIePh8y1+oxbrOa3tZJWoyufqaUsvNf+2tPlcyGNr+eeyJsVnndX06MGSjAwFp9XJNgNGBgCHQSwgMy+hAQGo9XRMoo/j+c5MkXahyF1cA1KeXI4FSowNFxJaIGI2I0bZfRQeGUS8h0PrRgQD0cDoQEZO275H0iv43OtD7lVUGoBY9XRMobvIwyDd6aMTkTe3+Tlka7NpQLdqGgeQIASA5QgBIjhAAkiMEgOQIASA5QgBIjhAAkuv7LxC1xYEdI6WWu/KOiY7L0GgzfIZ5n3IkACRHCADJEQJAcoQAkBwhACRHCADJEQJAcoQAkBwhACRHx2DhZ6vKdYS9qXkdl7l24my59xrp/F4YDMMwZNxsOBIAkiMEgOQIASA5QgBIjhAAkiMEgOQIASA5QgBIjhAAkqNjsFBl994fPyjXXXalJipbp1Suq62NHW39VrYbsAl17FOOBIDkCAEgOUIASI4QAJIjBIDkCAEgOUIASI4QAJIjBIDk6BgsHPr190stt+KRv3ZcZuE7i3otpyt0A3ZnkD+3OmrjSABIrqcjAdsHJZ2UdFbSZESMVlEUgPpUcTpwY0R8UsH7AGgApwNAcr2GQEh61fZbtjfNtIDtTbbHbY+f0ekeVwegar2eDtwQEYdsf03SLtt/j4jd0xeIiC2StkjSxV4aPa4PQMV6OhKIiEPF4zFJL0paXUVRAOrTdQjYXmx7ybnnkm6WtLeqwgDUo5fTgWWSXrR97n12RMSfK6mq5co0FPVDlUNRDfMNONukjuHFug6BiDgg6bs9rR1A4/iKEEiOEACSIwSA5AgBIDlCAEiOEACSIwSA5AgBIDmGFytU2eV
X5VBlc0H3Xv8M8z7lSABIjhAAkiMEgOQIASA5QgBIjhAAkiMEgOQIASA5QgBIjo5BoIRhGDdyNhwJAMkRAkByhACQHCEAJEcIAMkRAkByhACQHCEAJEcIAMnV2jEYSxZpcnX/O6C6UeVdeDfc/kap93rzkXmllmvC0WsXllpuxWt9LmRAMMYggKFFCADJEQJAcoQAkBwhACRHCADJEQJAcoQAkJwjoraVXeylcZ3X1ra+uWiqGQSow54Y04k47pnmcSQAJNcxBGxvtX3M9t5p05ba3mV7f/F4SX/LBNAvZY4Etkla96VpmyWNRcRKSWPFawAt1DEEImK3pONfmrxe0vbi+XZJG6otC0Bdur0msCwiDhfPj0haVlE9AGrW84XBmPp6YdavGGxvsj1ue/yMTve6OgAV6zYEjtpeLknF47HZFoyILRExGhGjC1Tud9QB1KfbENgpaWPxfKOkl6opB0DdynxF+Kykv0n6ju2Pbd8p6WFJP7a9X9KPitcAWqjj8GIRcfssswaz9W8AvPKviVLL/eQbI32tAyiDjkEgOUIASI4QAJIjBIDkCAEgOUIASI4QAJIjBIDkCAEguVpvSDrIqhw7cO0v7iy13HzVf+NVdKfKG9YOGo4EgOQIASA5QgBIjhAAkiMEgOQIASA5QgBIjhAAkiMEgORq7RiMJYs0ubpz51Ubu66mO3ptuaHVV7zW50J6wF2av6jtfyfPhyMBIDlCAEiOEACSIwSA5AgBIDlCAEiOEACSIwSA5BwRta3sYi+N6zyY9zEd5uGjgD0xphNx3DPN40gASI4QAJIjBIDkCAEgOUIASI4QAJIjBIDkCAEgOUIASK7W4cXOfH2xDv2q87BVTQxZVWUnYFNDc5XpeqTjsTvDvE85EgCS6xgCtrfaPmZ777RpD9k+ZHui+HNLf8sE0C9ljgS2SVo3w/THI2Kk+PNytWUBqEvHEIiI3ZKO11ALgAb0ck3gHtvvFqcLl1RWEYBadRsCT0i6StKIpMOSHp1tQdubbI/bHp/89FSXqwPQL12FQEQcjYizEfG5pCclrT7PslsiYjQiRucvWtxtnQD6pKsQsL182stbJe2dbVkAg61js5DtZyWtkXSp7Y8lPShpje0RSSHpoKS7+lcigH5ijMFClWMMMl4hBg1jDAKYFSEAJEcIAMkRAkByhACQHCEAJEcIAMkRAkByhACQXK1jDA4yuveQFUcCQHKEAJAcIQAkRwgAyRECQHKEAJAcIQAkRwgAyXFD0sKBHSOllrvyjomOy9B4hDbhSABIjhAAkiMEgOQIASA5QgBIjhAAkiMEgOQIASA5QgBIrtaOwQVHTjXSDVjGwncWVfZeVXYfAv3GkQCQHCEAJEcIAMkRAkByhACQHCEAJEcIAMkRAkByhACQXK0dg7FkkSZXX9NxuSbG6Kuyk/Hyp+ZV9l5zMXlTdZ9tmfeay/u13TB/Hh2PBGxfZvt12x/Yft/2vcX0pbZ32d5fPF7S/3IBVK3M6cCkpPsjYpWk6yXdbXuVpM2SxiJipaSx4jWAlukYAhFxOCLeLp6flLRP0gpJ6yVtLxbbLmlDn2oE0EdzujBo+wpJV0vaI2lZRBwuZh2RtKza0gDUoXQI2L5I0vOS7ouIE9PnRURIill+bpPtcdvjZ86c6qlYANUrFQK2F2gqAJ6JiBeKyUdtLy/mL5d0bKafjYgtETEaEaMLFiyuomYAFSrz7YAlPS1pX0Q8Nm3WTkkbi+cbJb1UfXkA+q1Mn8APJP1S0nu2J4ppD0h6WNJztu+U9JGk2/pSIYC+6hgCEfEXSZ5l9tpqywFQt1o7BrNoqmusyvW2sfOtn4b58+B3B4DkCAEgOUIASI4QAJIjBIDkCAEgOUIASI4QAJKjWagPhnkoqqyG+SazHAkAyRECQHKEAJAcIQAkRwgAyRECQHKEAJAcIQAkRwgAyXnqlgE1rcz+t6YGJZ3uUkmf1FZE9dpev9T+bWh7/VL/t+FbEfHVmWb
UGgIzFmCPR8Roo0X0oO31S+3fhrbXLzW7DZwOAMkRAkBygxACW5ouoEdtr19q/za0vX6pwW1o/JoAgGYNwpEAgAYRAkByhACQHCEAJEcIAMn9F9vWLzlOtuIrAAAAAElFTkSuQmCC
\n",
"text/plain": [
"<Figure size 288x288 with 1 Axes>"
]
...
...
@@ -296,9 +300,8 @@
"source": [
"#Random Forest Classifier appears to have a slight edge over the other models, so it is time to explore it in more depth\n",
"from matplotlib.pyplot import matshow\n",
"from seaborn import heatmap\n",
"model = RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0)\n",
"X_train, X_test, y_train, y_test = train_test_split(feature_vectors, icd_codes, test_size=0.
33
, random_state=0)\n",
"X_train, X_test, y_train, y_test = train_test_split(feature_vectors, icd_codes, test_size=0.
10
, random_state=0)\n",
"\n",
"model.fit(X_train, y_train)\n",
"y_pred = model.predict(X_test)\n",
...
...
@@ -306,10 +309,57 @@
"conf_mat = confusion_matrix(y_test, y_pred)\n",
"\n",
"matshow(conf_mat)\n",
"#heatmap(conf_mat)\n",
"print(\"Done exploring Random Forest Classifier\")\n"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" precision recall f1-score support\n",
"\n",
" 40301 0.00 0.00 0.00 2.0\n",
" 486 0.00 0.00 0.00 2.0\n",
" 58281 0.00 0.00 0.00 3.0\n",
" 5855 0.00 0.00 0.00 9.0\n",
" 4254 0.00 0.00 0.00 7.0\n",
" 2762 0.00 0.00 0.00 4.0\n",
" 7100 0.00 0.00 0.00 5.0\n",
" 2767 0.00 0.00 0.00 5.0\n",
" 7243 0.00 0.00 0.00 4.0\n",
" 45829 0.00 0.00 0.00 10.0\n",
" 2875 0.00 0.00 0.00 2.0\n",
" 28521 0.00 0.00 0.00 7.0\n",
" 28529 0.00 0.00 0.00 2.0\n",
" 27541 0.00 0.00 0.00 5.0\n",
" 5856 0.00 0.00 0.00 1.0\n",
" 58381 0.00 0.00 0.00 8.0\n",
" 5589 0.00 0.00 0.00 2.0\n",
" 32723 0.00 0.00 0.00 1.0\n",
" 22804 0.00 0.00 0.00 5.0\n",
" 33829 0.00 0.00 0.00 5.0\n",
" 78900 0.00 0.00 0.00 5.0\n",
" 79092 0.00 0.00 0.00 1.0\n",
" V4511 0.00 0.00 0.00 5.0\n",
"\n",
" accuracy 0.00 100.0\n",
" macro avg 0.00 0.00 0.00 100.0\n",
"weighted avg 0.00 0.00 0.00 100.0\n",
"\n"
]
}
],
"source": [
"from sklearn import metrics\n",
"#Display metrics on Random Forest Classifier\n",
"print(metrics.classification_report(y_test, y_pred, target_names=full_dataset['ICD9_CODE'].unique()))"
]
},
{
"cell_type": "code",
"execution_count": null,
...
...
%% Cell type:markdown id: tags:
For this project, our goal is to create an NLP model that automatically assigns ICD-9 codes, given the clinical notes for each encounter.
%% Cell type:code id: tags:
```
python
#imports
import
pandas
as
pd
from
sklearn.feature_extraction.text
import
TfidfVectorizer
from
sklearn.naive_bayes
import
MultinomialNB
from
sklearn.linear_model
import
LogisticRegression
from
sklearn.ensemble
import
RandomForestClassifier
from
sklearn.svm
import
LinearSVC
from
sklearn.model_selection
import
cross_val_score
from
sklearn.model_selection
import
train_test_split
from
sklearn.metrics
import
confusion_matrix
import
warnings
warnings
.
filterwarnings
(
'ignore'
)
print
(
"All modules imported successfully"
)
```
%% Output
All modules imported successfully
%% Cell type:code id: tags:
```
python
print("Fetching data")

# How the cached sample "ICDdata40k.csv" was built from the MIMIC-III files.
# For performance reasons this code is disabled; the cached CSV is loaded
# below instead.
#
# diagnoses = pd.read_csv("DIAGNOSES_ICD.csv")
# note_events = pd.read_csv("NOTEEVENTS.csv", engine="python", on_bad_lines='skip')
#
# full_dataset = pd.merge(diagnoses, note_events, on =["HADM_ID", "SUBJECT_ID"])
# full_dataset = full_dataset[:40000]
#
# print(full_dataset.head())
# full_dataset.to_csv("ICDdata40k.csv")

# Only two columns and the first 1000 rows are used, so ask the parser for
# exactly that instead of loading every row and then slicing.
# ICD9_CODE is forced to str: the codes are identifiers (some are alphanumeric,
# e.g. "V4511"), and dtype inference could otherwise turn numeric-looking codes
# into integers.
sample_columns = ["TEXT", "ICD9_CODE"]
full_dataset = pd.read_csv(
    "ICDdata40k.csv",
    usecols=sample_columns,
    nrows=1000,
    dtype={"ICD9_CODE": str},
)[sample_columns]  # usecols keeps file order; re-index to get TEXT first

print("Done fetching all the data")
full_dataset.head()
```
%% Output
Fetching data
Done fetching all the data
TEXT ICD9_CODE
0 Admission Date: [**2141-9-18**] ... 40301
1 PATIENT/TEST INFORMATION:\nIndication: Pericar... 40301
2 Sinus rhythm\nRightward axis\nSince previous t... 40301
3 Chief Complaint: hypotension, altered mental ... 40301
4 Chief Complaint: hypotension, altered mental ... 40301
%% Cell type:code id: tags:
```
python
import
nltk
```
%% Cell type:code id: tags:
```
python
# Turn each clinical note into a TF-IDF feature vector and pull out the labels.
print("Transforming descriptions into TF-IDF vectors")

texts = full_dataset["TEXT"]
icd_codes = full_dataset["ICD9_CODE"]

tfidf = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    encoding='latin-1',
    norm='l2',
    min_df=5,
    sublinear_tf=True,
)

# Fit the vocabulary on all notes, then convert the sparse result to a dense
# array, which is what the later cells consume.
feature_vectors = tfidf.fit_transform(texts).toarray()

print(feature_vectors.shape)
print("Done transforming data")
```
%% Output
Transforming descriptions into TF-IDF vectors
(1000, 9878)
Done transforming data
%% Cell type:code id: tags:
```
python
#Evaluating different models
models = [
    LinearSVC(),
    MultinomialNB(),
    LogisticRegression(random_state=0),
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
]
CV = 2  # number of cross-validation folds

# Gather one record per model and build the results table once at the end.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and it
# copied the whole frame on every call.
result_rows = []
for model in models:
    model_name = model.__class__.__name__
    accuracies = cross_val_score(
        model, feature_vectors, icd_codes, scoring='accuracy', cv=CV
    )
    avg_accuracy = sum(accuracies) / len(accuracies)
    result_rows.append({"Model": model_name, "Average_Accuracy": avg_accuracy})

cross_val_results = pd.DataFrame(result_rows, columns=["Model", "Average_Accuracy"])
cross_val_results
```
%% Output
Model Average_Accuracy
0 LinearSVC 0.038
1 MultinomialNB 0.041
2 LogisticRegression 0.038
3 RandomForestClassifier 0.041
%% Cell type:code id: tags:
```
python
#Random Forest Classifier was one of the best-scoring models in the comparison, so it is time to explore it in more depth
from matplotlib.pyplot import matshow
from seaborn import heatmap

model = RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0)

# Hold out 10% of the samples for evaluation. The split is done exactly once;
# random_state pins the shuffle so the result is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    feature_vectors, icd_codes, test_size=0.10, random_state=0
)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Confusion matrix of true vs. predicted codes on the held-out samples.
conf_mat = confusion_matrix(y_test, y_pred)

matshow(conf_mat)
#heatmap(conf_mat)
print("Done exploring Random Forest Classifier")
```
%% Output
Done exploring Random Forest Classifier
%% Cell type:code id: tags:
```
python
from sklearn import metrics

#Display metrics on Random Forest Classifier
# Let classification_report label the rows itself. It orders classes by sorted
# label value, so display names supplied in any other order (for example
# Series.unique(), which is order of first appearance) would attach the
# metrics to the wrong ICD-9 codes, or raise when the counts differ.
print(metrics.classification_report(y_test, y_pred))
```
%% Output
precision recall f1-score support
40301 0.00 0.00 0.00 2.0
486 0.00 0.00 0.00 2.0
58281 0.00 0.00 0.00 3.0
5855 0.00 0.00 0.00 9.0
4254 0.00 0.00 0.00 7.0
2762 0.00 0.00 0.00 4.0
7100 0.00 0.00 0.00 5.0
2767 0.00 0.00 0.00 5.0
7243 0.00 0.00 0.00 4.0
45829 0.00 0.00 0.00 10.0
2875 0.00 0.00 0.00 2.0
28521 0.00 0.00 0.00 7.0
28529 0.00 0.00 0.00 2.0
27541 0.00 0.00 0.00 5.0
5856 0.00 0.00 0.00 1.0
58381 0.00 0.00 0.00 8.0
5589 0.00 0.00 0.00 2.0
32723 0.00 0.00 0.00 1.0
22804 0.00 0.00 0.00 5.0
33829 0.00 0.00 0.00 5.0
78900 0.00 0.00 0.00 5.0
79092 0.00 0.00 0.00 1.0
V4511 0.00 0.00 0.00 5.0
accuracy 0.00 100.0
macro avg 0.00 0.00 0.00 100.0
weighted avg 0.00 0.00 0.00 100.0
%% Cell type:code id: tags:
```
python
```
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment