Commit 1f5289e0 authored by Zaid A Ali's avatar Zaid A Ali
Browse files

Add titles to Jupyter Notebook

parent a43f6a1b
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
For this project, our goal is to create an NLP model that automatically assigns ICD-9 codes, given the clinical notes for each encounter. For this project, our goal is to create an NLP model that automatically assigns ICD-9 codes, given the clinical notes for each encounter.
%% Cell type:markdown id: tags:
## Imports
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
#imports #imports
import pandas as pd import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix from sklearn.metrics import confusion_matrix
import warnings import warnings
warnings.filterwarnings('ignore') warnings.filterwarnings('ignore')
print("All modules imported successfully") print("All modules imported successfully")
``` ```
%% Output %% Output
All modules imported successfully All modules imported successfully
%% Cell type:markdown id: tags:
## Fetching Data from File in order to prepare it for processing
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
print("Fetching data")

# One-off extraction of the working subset from the MIMIC-III files.
# For performance reasons this code is kept disabled; the cached
# ICDdata40k.csv it produces is what gets loaded below.
#
# diagnoses = pd.read_csv("DIAGNOSES_ICD.csv")
# note_events = pd.read_csv("NOTEEVENTS.csv", engine="python", on_bad_lines='skip')
# full_dataset = pd.merge(diagnoses, note_events, on =["HADM_ID", "SUBJECT_ID"])
# full_dataset = full_dataset[:40000]
# print(full_dataset.head())
# full_dataset.to_csv("ICDdata40k.csv")

# Load the cached subset, keeping only the note text and its ICD-9 code.
cached = pd.read_csv("ICDdata40k.csv")
full_dataset = cached[["TEXT", "ICD9_CODE"]]

# Work on the first 1000 rows only.
full_dataset = full_dataset.head(1000)
print("Done fetching all the data")

# Last expression of the cell: the notebook renders it as a preview table.
full_dataset.head()
``` ```
%% Output %% Output
Fetching data Fetching data
Done fetching all the data Done fetching all the data
TEXT ICD9_CODE TEXT ICD9_CODE
0 Admission Date: [**2141-9-18**] ... 40301 0 Admission Date: [**2141-9-18**] ... 40301
1 PATIENT/TEST INFORMATION:\nIndication: Pericar... 40301 1 PATIENT/TEST INFORMATION:\nIndication: Pericar... 40301
2 Sinus rhythm\nRightward axis\nSince previous t... 40301 2 Sinus rhythm\nRightward axis\nSince previous t... 40301
3 Chief Complaint: hypotension, altered mental ... 40301 3 Chief Complaint: hypotension, altered mental ... 40301
4 Chief Complaint: hypotension, altered mental ... 40301 4 Chief Complaint: hypotension, altered mental ... 40301
%% Cell type:markdown id: tags:
## Converting clinical notes to tf-idf vectors
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
import nltk import nltk
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Turn every clinical note into a TF-IDF feature vector.
print("Transforming descriptions into TF-IDF vectors")

texts = full_dataset["TEXT"]
icd_codes = full_dataset["ICD9_CODE"]

# Unigrams and bigrams, English stop words removed, sublinear term-frequency
# scaling, L2-normalised rows; min_df=5 sets the minimum document frequency
# a term needs in order to be kept.
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    min_df=5,
    norm='l2',
    encoding='latin-1',
    ngram_range=(1, 2),
    stop_words='english',
)

# fit_transform returns a sparse matrix; the cells below work on the dense
# array, so convert it right away.
feature_vectors = tfidf.fit_transform(texts).toarray()

print(feature_vectors.shape)
print("Done transforming data")
``` ```
%% Output %% Output
Transforming descriptions into TF-IDF vectors Transforming descriptions into TF-IDF vectors
(1000, 9878) (1000, 9878)
Done transforming data Done transforming data
%% Cell type:markdown id: tags:
## Passing the TF-IDF vectors into multiple ML models to evaluate which one performs best
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Compare several classifiers on the TF-IDF features by their mean
# cross-validated accuracy.
models = [
    LinearSVC(),
    MultinomialNB(),
    LogisticRegression(random_state=0),
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
]
CV = 2  # number of cross-validation folds

# Collect one record per model and build the DataFrame once at the end.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0,
# and appending inside a loop copied the whole frame on every iteration.
records = []
for model in models:
    model_name = model.__class__.__name__
    accuracies = cross_val_score(model, feature_vectors, icd_codes, scoring='accuracy', cv=CV)
    avg_accuracy = accuracies.mean()
    records.append({"Model": model_name, "Average_Accuracy": avg_accuracy})

# Passing the columns explicitly keeps the column order (and an empty but
# well-formed frame if the model list is ever emptied).
cross_val_results = pd.DataFrame(records, columns=["Model", "Average_Accuracy"])

# Last expression of the cell: the notebook renders it as a table.
cross_val_results
``` ```
%% Output %% Output
Model Average_Accuracy Model Average_Accuracy
0 LinearSVC 0.038 0 LinearSVC 0.038
1 MultinomialNB 0.041 1 MultinomialNB 0.041
2 LogisticRegression 0.038 2 LogisticRegression 0.038
3 RandomForestClassifier 0.041 3 RandomForestClassifier 0.041
%% Cell type:markdown id: tags:
## Further exploring the Random Forest Classifier by creating a confusion matrix
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# The Random Forest classifier scored among the best in the comparison above,
# so look at its predictions in more detail on a held-out split.
from matplotlib.pyplot import matshow

# Hold out 10% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    feature_vectors,
    icd_codes,
    test_size=0.10,
    random_state=0,
)

# Same hyper-parameters as the Random Forest used in the model comparison.
model = RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0)

# fit() returns the fitted estimator itself, so prediction can be chained.
y_pred = model.fit(X_train, y_train).predict(X_test)

# Rows are true classes, columns are predicted classes.
conf_mat = confusion_matrix(y_test, y_pred)
matshow(conf_mat)
print("Done exploring Random Forest Classifier")
``` ```
%% Output %% Output
Done exploring Random Forest Classifier Done exploring Random Forest Classifier
%% Cell type:markdown id: tags:
## Create final classification report for the Random Forest classifier
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
from sklearn import metrics

# Display per-class precision/recall/F1 for the Random Forest predictions.
#
# target_names is deliberately NOT passed. classification_report orders its
# rows by the sorted labels that occur in y_test / y_pred, whereas
# full_dataset['ICD9_CODE'].unique() lists every code of the whole dataset in
# order of first appearance. Using it as target_names attached the wrong ICD-9
# code to each row, and recent scikit-learn versions raise a ValueError when
# the two lengths differ. Without target_names each row is labelled with the
# actual class value it describes.
print(metrics.classification_report(y_test, y_pred))
``` ```
%% Output %% Output
precision recall f1-score support precision recall f1-score support
40301 0.00 0.00 0.00 2.0 40301 0.00 0.00 0.00 2.0
486 0.00 0.00 0.00 2.0 486 0.00 0.00 0.00 2.0
58281 0.00 0.00 0.00 3.0 58281 0.00 0.00 0.00 3.0
5855 0.00 0.00 0.00 9.0 5855 0.00 0.00 0.00 9.0
4254 0.00 0.00 0.00 7.0 4254 0.00 0.00 0.00 7.0
2762 0.00 0.00 0.00 4.0 2762 0.00 0.00 0.00 4.0
7100 0.00 0.00 0.00 5.0 7100 0.00 0.00 0.00 5.0
2767 0.00 0.00 0.00 5.0 2767 0.00 0.00 0.00 5.0
7243 0.00 0.00 0.00 4.0 7243 0.00 0.00 0.00 4.0
45829 0.00 0.00 0.00 10.0 45829 0.00 0.00 0.00 10.0
2875 0.00 0.00 0.00 2.0 2875 0.00 0.00 0.00 2.0
28521 0.00 0.00 0.00 7.0 28521 0.00 0.00 0.00 7.0
28529 0.00 0.00 0.00 2.0 28529 0.00 0.00 0.00 2.0
27541 0.00 0.00 0.00 5.0 27541 0.00 0.00 0.00 5.0
5856 0.00 0.00 0.00 1.0 5856 0.00 0.00 0.00 1.0
58381 0.00 0.00 0.00 8.0 58381 0.00 0.00 0.00 8.0
5589 0.00 0.00 0.00 2.0 5589 0.00 0.00 0.00 2.0
32723 0.00 0.00 0.00 1.0 32723 0.00 0.00 0.00 1.0
22804 0.00 0.00 0.00 5.0 22804 0.00 0.00 0.00 5.0
33829 0.00 0.00 0.00 5.0 33829 0.00 0.00 0.00 5.0
78900 0.00 0.00 0.00 5.0 78900 0.00 0.00 0.00 5.0
79092 0.00 0.00 0.00 1.0 79092 0.00 0.00 0.00 1.0
V4511 0.00 0.00 0.00 5.0 V4511 0.00 0.00 0.00 5.0
accuracy 0.00 100.0 accuracy 0.00 100.0
macro avg 0.00 0.00 0.00 100.0 macro avg 0.00 0.00 0.00 100.0
weighted avg 0.00 0.00 0.00 100.0 weighted avg 0.00 0.00 0.00 100.0
%% Cell type:code id: tags: %% Cell type:markdown id: tags:
``` python
```
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment