Commit 1f5289e0 authored by Zaid A Ali's avatar Zaid A Ali
Browse files

Add titles to Jupyter Notebook

parent a43f6a1b
%% Cell type:markdown id: tags:
For this project, our goal is to create an NLP model that automatically assigns ICD-9 codes, given the clinical notes for each encounter.
%% Cell type:markdown id: tags:
## Imports
%% Cell type:code id: tags:
``` python
#imports
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import warnings
warnings.filterwarnings('ignore')
print("All modules imported successfully")
```
%%%% Output: stream
All modules imported successfully
%% Cell type:markdown id: tags:
## Fetching Data from File in order to prepare it for processing
%% Cell type:code id: tags:
``` python
print("Fetching data")
# The commented-out code below rebuilds ICDdata40k.csv from the raw MIMIC-III
# files. It is disabled for performance reasons; the cached CSV is read instead.
#
# diagnoses = pd.read_csv("DIAGNOSES_ICD.csv")
# note_events = pd.read_csv("NOTEEVENTS.csv", engine="python", on_bad_lines='skip')
# full_dataset = pd.merge(diagnoses, note_events, on =["HADM_ID", "SUBJECT_ID"])
# full_dataset = full_dataset[:40000]
# print(full_dataset.head())
# full_dataset.to_csv("ICDdata40k.csv")

# Only two columns and the first 1000 rows are used downstream, so ask
# read_csv for exactly those instead of parsing the whole file and slicing.
# usecols keeps the file's column order, hence the explicit re-selection to
# get TEXT first, as before.
# NOTE(review): ICD-9 codes may carry leading zeros; consider
# dtype={"ICD9_CODE": str} if the codes should be kept as strings -- confirm.
full_dataset = pd.read_csv(
    "ICDdata40k.csv",
    usecols=["TEXT", "ICD9_CODE"],
    nrows=1000,
)[["TEXT", "ICD9_CODE"]]
print("Done fetching all the data")
full_dataset.head()
```
%%%% Output: stream
Fetching data
Done fetching all the data
%%%% Output: execute_result
TEXT ICD9_CODE
0 Admission Date: [**2141-9-18**] ... 40301
1 PATIENT/TEST INFORMATION:\nIndication: Pericar... 40301
2 Sinus rhythm\nRightward axis\nSince previous t... 40301
3 Chief Complaint: hypotension, altered mental ... 40301
4 Chief Complaint: hypotension, altered mental ... 40301
%% Cell type:markdown id: tags:
## Converting clinical notes to tf-idf vectors
%% Cell type:code id: tags:
``` python
import nltk
```
%% Cell type:code id: tags:
``` python
# Turn each clinical note into a TF-IDF feature vector; the ICD-9 code of
# the same row is the classification label.
print("Transforming descriptions into TF-IDF vectors")
texts = full_dataset["TEXT"]
icd_codes = full_dataset["ICD9_CODE"]

tfidf = TfidfVectorizer(
    sublinear_tf=True,        # use 1 + log(tf) instead of raw term counts
    min_df=5,                 # drop terms seen in fewer than 5 notes
    norm='l2',
    encoding='latin-1',
    ngram_range=(1, 2),       # unigrams and bigrams
    stop_words='english',
)
# fit_transform returns a sparse matrix; it is converted to a dense array
# because the later cells work on `feature_vectors` as a plain ndarray.
feature_vectors = tfidf.fit_transform(texts).toarray()

print(feature_vectors.shape)
print("Done transforming data")
```
%%%% Output: stream
Transforming descriptions into TF-IDF vectors
(1000, 9878)
Done transforming data
%% Cell type:markdown id: tags:
## Passing the tf-idf vectors into multiple ML models in order to evaluate which one is the best model
%% Cell type:code id: tags:
``` python
# Compare several off-the-shelf classifiers on the TF-IDF features using
# cross-validated accuracy.
models = [
    LinearSVC(),
    MultinomialNB(),
    LogisticRegression(random_state=0),
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
]
CV = 2  # number of cross-validation folds handed to cross_val_score

# Collect one result row per model and build the DataFrame once at the end.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0,
# so appending inside the loop fails on current pandas.
result_rows = []
for model in models:
    model_name = model.__class__.__name__
    accuracies = cross_val_score(model, feature_vectors, icd_codes, scoring='accuracy', cv=CV)
    avg_accuracy = accuracies.mean()
    result_rows.append({"Model": model_name, "Average_Accuracy": avg_accuracy})

cross_val_results = pd.DataFrame(result_rows, columns=["Model", "Average_Accuracy"])
cross_val_results
```
%%%% Output: execute_result
Model Average_Accuracy
0 LinearSVC 0.038
1 MultinomialNB 0.041
2 LogisticRegression 0.038
3 RandomForestClassifier 0.041
%% Cell type:markdown id: tags:
## Further exploring the Random Forest Classifier by creating a confusion matrix
%% Cell type:code id: tags:
``` python
# Take a closer look at the Random Forest classifier: hold out 10% of the
# rows, fit on the remainder, and plot the confusion matrix of the held-out
# predictions.
from matplotlib.pyplot import matshow

X_train, X_test, y_train, y_test = train_test_split(
    feature_vectors,
    icd_codes,
    test_size=0.10,
    random_state=0,
)

# Same hyperparameters as the Random Forest entry in the model comparison.
model = RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0)
y_pred = model.fit(X_train, y_train).predict(X_test)  # fit() returns the estimator

conf_mat = confusion_matrix(y_test, y_pred)
matshow(conf_mat)
print("Done exploring Random Forest Classifier")
```
%%%% Output: stream
Done exploring Random Forest Classifier
%%%% Output: display_data
![](data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAQEAAAECCAYAAAD+eGJTAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8+yak3AAAACXBIWXMAAAsTAAALEwEAmpwYAAAMCUlEQVR4nO3dT4wW9R3H8c+ngBgQE4ktpVRrNaQJl651RdOaBqW11At4sdW0oYkJHjTRxEOJF7000YPaHhoTVAIHsTVRKwdTJasJNm2Iq1kVpQ2GYCrlTw0HCKbI4reHHdLV7vLMPs88M8883/crIc/zzMw+851n4MPMPN/9jSNCAPL6StMFAGgWIQAkRwgAyRECQHKEAJAcIQAk11gI2F5n+x+2P7S9uak6emH7oO33bE/YHm+6njJsb7V9zPbeadOW2t5le3/xeEmTNZ7PLPU/ZPtQsR8mbN/SZI3nY/sy26/b/sD2+7bvLaY3tg8aCQHb8yT9XtJPJa2SdLvtVU3UUoEbI2IkIkabLqSkbZLWfWnaZkljEbFS0ljxelBt0//XL0mPF/thJCJerrmmuZiUdH9ErJJ0vaS7i7/7je2Dpo4EVkv6MCIORMRnkv4gaX1DtaQSEbslHf/S5PWSthfPt0vaUGdNczFL/a0REYcj4u3i+UlJ+yStUIP7oKkQWCHpn9Nef1xMa5uQ9Krtt2xvarqYHiyLiMPF8yOSljVZTJfusf1ucbowsKcz09m+QtLVkvaowX3AhcHe3BAR39PUac3dtn/YdEG9iqk+8rb1kj8h6SpJI5IOS3q00WpKsH2RpOcl3RcRJ6bPq3sfNBUChyRdNu31N4tprRIRh4rHY5Je1NRpThsdtb1ckorHYw3XMycRcTQizkbE55Ke1IDvB9sLNBUAz0TEC8XkxvZBUyHwpqSVtr9t+wJJP5e0s6FaumJ7se0l555LulnS3vP/1MDaKWlj8XyjpJcarGXOzv3jKdyqAd4Pti3paUn7IuKxabMa2wdu6rcIi69xfitpnqStEfGbRgrpku0rNfW/vyTNl7SjDdtg+1lJayRdKumopAcl/UnSc5Iul/SRpNsiYiAvvs1S/xpNnQqEpIOS7pp2fj1QbN8g6Q1J70n6vJj8gKauCzSyDxoLAQCDgQuDQHKEAJAcIQAkRwgAyRECQHKNh0DL221bX7/U/m1oe/1Ss9vQeAhIavsObHv9Uvu3oe31Sw1uwyCEAIAG1dosdIEXxoVa/IVpZ3RaC7Swthqq1vb6pZm3IZYsKvWzPvlpP0qakzr2Qb8/j35vw390Sp/Fac80b34vb2x7naTfaar196mIePh8y1+oxbrOa3tZJWoyufqaUsvNf+2tPlcyGNr+eeyJsVnndX06MGSjAwFp9XJNgNGBgCHQSwgMy+hAQGo9XRMoo/j+c5MkXahyF1cA1KeXI4FSowNFxJaIGI2I0bZfRQeGUS8h0PrRgQD0cDoQEZO275H0iv43OtD7lVUGoBY9XRMobvIwyDd6aMTkTe3+Tlka7NpQLdqGgeQIASA5QgBIjhAAkiMEgOQIASA5QgBIjhAAkuv7LxC1xYEdI6WWu/KOiY7L0GgzfIZ5n3IkACRHCADJEQJAcoQAkBwhACRHCADJEQJAcoQAkBwhACRHx2DhZ6vKdYS9qXkdl7l24my59xrp/F4YDMMwZNxsOBIAkiMEgOQIASA5QgBIjhAAkiMEgOQIASA5QgBIjhAAkqNjsFBl994fPyjXXXalJipbp1Suq62NHW39VrYbsAl17FOOBIDkCAEgOUIASI4QAJIjBIDkCAEgOUIASI4QAJIjBIDk6BgsHPr190stt+KRv3ZcZuE7i3otpyt0A3ZnkD+3OmrjSABIrqcjAdsHJZ2UdFbSZESMVlEUgPpUcTpwY0R8UsH7AGgApwNAcr2GQEh61fZbtjfNtIDtTbbHbY+f0ekeVwegar2eDtwQEYdsf03SLt
t/j4jd0xeIiC2StkjSxV4aPa4PQMV6OhKIiEPF4zFJL0paXUVRAOrTdQjYXmx7ybnnkm6WtLeqwgDUo5fTgWWSXrR97n12RMSfK6mq5co0FPVDlUNRDfMNONukjuHFug6BiDgg6bs9rR1A4/iKEEiOEACSIwSA5AgBIDlCAEiOEACSIwSA5AgBIDmGFytU2eVX5VBlc0H3Xv8M8z7lSABIjhAAkiMEgOQIASA5QgBIjhAAkiMEgOQIASA5QgBIjo5BoIRhGDdyNhwJAMkRAkByhACQHCEAJEcIAMkRAkByhACQHCEAJEcIAMnV2jEYSxZpcnX/O6C6UeVdeDfc/kap93rzkXmllmvC0WsXllpuxWt9LmRAMMYggKFFCADJEQJAcoQAkBwhACRHCADJEQJAcoQAkJwjoraVXeylcZ3X1ra+uWiqGQSow54Y04k47pnmcSQAJNcxBGxvtX3M9t5p05ba3mV7f/F4SX/LBNAvZY4Etkla96VpmyWNRcRKSWPFawAt1DEEImK3pONfmrxe0vbi+XZJG6otC0Bdur0msCwiDhfPj0haVlE9AGrW84XBmPp6YdavGGxvsj1ue/yMTve6OgAV6zYEjtpeLknF47HZFoyILRExGhGjC1Tud9QB1KfbENgpaWPxfKOkl6opB0DdynxF+Kykv0n6ju2Pbd8p6WFJP7a9X9KPitcAWqjj8GIRcfssswaz9W8AvPKviVLL/eQbI32tAyiDjkEgOUIASI4QAJIjBIDkCAEgOUIASI4QAJIjBIDkCAEguVpvSDrIqhw7cO0v7iy13HzVf+NVdKfKG9YOGo4EgOQIASA5QgBIjhAAkiMEgOQIASA5QgBIjhAAkiMEgORq7RiMJYs0ubpz51Ubu66mO3ptuaHVV7zW50J6wF2av6jtfyfPhyMBIDlCAEiOEACSIwSA5AgBIDlCAEiOEACSIwSA5BwRta3sYi+N6zyY9zEd5uGjgD0xphNx3DPN40gASI4QAJIjBIDkCAEgOUIASI4QAJIjBIDkCAEgOUIASK7W4cXOfH2xDv2q87BVTQxZVWUnYFNDc5XpeqTjsTvDvE85EgCS6xgCtrfaPmZ777RpD9k+ZHui+HNLf8sE0C9ljgS2SVo3w/THI2Kk+PNytWUBqEvHEIiI3ZKO11ALgAb0ck3gHtvvFqcLl1RWEYBadRsCT0i6StKIpMOSHp1tQdubbI/bHp/89FSXqwPQL12FQEQcjYizEfG5pCclrT7PslsiYjQiRucvWtxtnQD6pKsQsL182stbJe2dbVkAg61js5DtZyWtkXSp7Y8lPShpje0RSSHpoKS7+lcigH5ijMFClWMMMl4hBg1jDAKYFSEAJEcIAMkRAkByhACQHCEAJEcIAMkRAkByhACQXK1jDA4yuveQFUcCQHKEAJAcIQAkRwgAyRECQHKEAJAcIQAkRwgAyXFD0sKBHSOllrvyjomOy9B4hDbhSABIjhAAkiMEgOQIASA5QgBIjhAAkiMEgOQIASA5QgBIrtaOwQVHTjXSDVjGwncWVfZeVXYfAv3GkQCQHCEAJEcIAMkRAkByhACQHCEAJEcIAMkRAkByhACQXK0dg7FkkSZXX9NxuSbG6Kuyk/Hyp+ZV9l5zMXlTdZ9tmfeay/u13TB/Hh2PBGxfZvt12x/Yft/2vcX0pbZ32d5fPF7S/3IBVK3M6cCkpPsjYpWk6yXdbXuVpM2SxiJipaSx4jWAlukYAhFxOCLeLp6flLRP0gpJ6yVtLxbbLmlDn2oE0EdzujBo+wpJV0vaI2lZRBwuZh2RtKza0gDUoXQI2L5I0vOS7ouIE9PnRURIill+bpPtcdvjZ86c6qlYANUrFQK2F2gqAJ6JiBeKyUdtLy/mL5d0bKafjYgtETEaEaMLFiyuomYAFSrz7YAlPS1pX0Q8Nm3WTkkbi+cbJb1UfXkA+q1Mn8APJP1S0nu2J4ppD0h6WNJztu+U9JGk2/pSIYC+6hgCEfEXSZ
5l9tpqywFQt1o7BrNoqmusyvW2sfOtn4b58+B3B4DkCAEgOUIASI4QAJIjBIDkCAEgOUIASI4QAJKjWagPhnkoqqyG+SazHAkAyRECQHKEAJAcIQAkRwgAyRECQHKEAJAcIQAkRwgAyXnqlgE1rcz+t6YGJZ3uUkmf1FZE9dpev9T+bWh7/VL/t+FbEfHVmWbUGgIzFmCPR8Roo0X0oO31S+3fhrbXLzW7DZwOAMkRAkBygxACW5ouoEdtr19q/za0vX6pwW1o/JoAgGYNwpEAgAYRAkByhACQHCEAJEcIAMn9F9vWLzlOtuIrAAAAAElFTkSuQmCC)
%% Cell type:markdown id: tags:
## Create final classification report for the Random Forest classifier
%% Cell type:code id: tags:
``` python
from sklearn import metrics
# Display per-class precision / recall / F1 for the Random Forest predictions.
# target_names is intentionally not passed: classification_report pairs
# target_names positionally with the *sorted* labels found in y_test/y_pred,
# whereas full_dataset['ICD9_CODE'].unique() is in order of first appearance
# over the whole dataset. Passing it labels rows with the wrong ICD-9 codes
# (and raises ValueError when the number of names differs from the number of
# labels). Without it, each row is labelled with its actual code.
print(metrics.classification_report(y_test, y_pred))
```
%%%% Output: stream
precision recall f1-score support
40301 0.00 0.00 0.00 2.0
486 0.00 0.00 0.00 2.0
58281 0.00 0.00 0.00 3.0
5855 0.00 0.00 0.00 9.0
4254 0.00 0.00 0.00 7.0
2762 0.00 0.00 0.00 4.0
7100 0.00 0.00 0.00 5.0
2767 0.00 0.00 0.00 5.0
7243 0.00 0.00 0.00 4.0
45829 0.00 0.00 0.00 10.0
2875 0.00 0.00 0.00 2.0
28521 0.00 0.00 0.00 7.0
28529 0.00 0.00 0.00 2.0
27541 0.00 0.00 0.00 5.0
5856 0.00 0.00 0.00 1.0
58381 0.00 0.00 0.00 8.0
5589 0.00 0.00 0.00 2.0
32723 0.00 0.00 0.00 1.0
22804 0.00 0.00 0.00 5.0
33829 0.00 0.00 0.00 5.0
78900 0.00 0.00 0.00 5.0
79092 0.00 0.00 0.00 1.0
V4511 0.00 0.00 0.00 5.0
accuracy 0.00 100.0
macro avg 0.00 0.00 0.00 100.0
weighted avg 0.00 0.00 0.00 100.0
%% Cell type:code id: tags:
%% Cell type:markdown id: tags:
``` python
```
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment