Commit 647c190f authored by Zaid A Ali's avatar Zaid A Ali
Browse files

Clean up code to make it more readable and add metrics

parent dd422fef
%% Cell type:markdown id: tags:
For this project, our goal is to create an NLP model that automatically assigns ICD-9 codes, given the clinical notes for each encounter.
%% Cell type:code id: tags:
``` python
#imports
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import warnings
# Install a blanket "ignore" filter so that no warning is shown by the cells that follow.
# NOTE(review): this also hides deprecation and convergence warnings raised by
# pandas / scikit-learn in later cells — consider narrowing the filter.
warnings.filterwarnings('ignore')
# Simple banner confirming that every import above succeeded.
print("All modules imported successfully")
```
%% Output
All modules imported successfully
%% Cell type:code id: tags:
``` python
# Load the cached sample of (clinical note text, ICD-9 code) pairs.
print("Fetching data")
# Recipe that produced ICDdata40k.csv from the MIMIC-III files.
# For performance reasons it is not executed here; the cached CSV is read instead.
# NOTE(review): the recipe reads DIAGNOSES_ICD.csv twice and merges twice; the
# second merge (raw tables) overwrites the first (column-trimmed tables) —
# confirm which variant was actually used to build the cache.
#
#   diagnoses = pd.read_csv("DIAGNOSES_ICD.csv")
#   diagnoses_clean = diagnoses[["SUBJECT_ID", "HADM_ID", "ICD9_CODE"]]
#   diagnoses = pd.read_csv("DIAGNOSES_ICD.csv")
#   note_events = pd.read_csv("NOTEEVENTS.csv", engine="python", on_bad_lines='skip')
#   note_events_clean = note_events[["SUBJECT_ID", "HADM_ID","DESCRIPTION", "TEXT"]]
#   full_dataset = pd.merge(diagnoses_clean, note_events_clean, on =["HADM_ID", "SUBJECT_ID"])
#   full_dataset = pd.merge(diagnoses, note_events, on =["HADM_ID", "SUBJECT_ID"])
#   full_dataset = full_dataset[:40000]
#   print(full_dataset.head())
#   full_dataset.to_csv("ICDdata40k.csv")

# Only the first 1000 rows and the two modelling columns are needed, so let the
# parser skip everything else instead of loading the whole file and slicing.
# ICD-9 codes are identifiers, not numbers: reading them as strings keeps
# leading zeros and avoids a column that mixes ints with codes such as "V4511".
full_dataset = pd.read_csv(
    "ICDdata40k.csv",
    usecols=["TEXT", "ICD9_CODE"],
    dtype={"ICD9_CODE": str},
    nrows=1000,
)[["TEXT", "ICD9_CODE"]]  # usecols keeps file order; re-select to fix column order
print("Done fetching all the data")
full_dataset.head()
```
%% Output
Fetching data
Done fetching all the data
TEXT ICD9_CODE
0 Admission Date: [**2141-9-18**] ... 40301
1 PATIENT/TEST INFORMATION:\nIndication: Pericar... 40301
2 Sinus rhythm\nRightward axis\nSince previous t... 40301
3 Chief Complaint: hypotension, altered mental ... 40301
4 Chief Complaint: hypotension, altered mental ... 40301
%% Cell type:code id: tags:
``` python
import nltk
```
%% Cell type:code id: tags:
``` python
# TF-IDF vectorization: turn each clinical note into a weighted unigram/bigram vector.
print("Transforming descriptions into TF-IDF vectors")
texts = full_dataset.TEXT
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')
# Keep the result as the sparse matrix returned by fit_transform. The estimators,
# cross_val_score and train_test_split used in this notebook all accept sparse
# input, and a dense copy of a (n_notes x n_terms) matrix that is almost entirely
# zeros only wastes memory.
# NOTE(review): the vectorizer is fitted on every row before any train/test
# split, so the vocabulary and IDF weights also reflect the held-out notes.
feature_vectors = tfidf.fit_transform(texts)
# Target labels, aligned row-for-row with feature_vectors.
icd_codes = full_dataset.ICD9_CODE
print(feature_vectors.shape)
print("Done transforming data")
```
%% Output
Transforming descriptions into TF-IDF vectors
(1000, 9878)
Done transforming data
%% Cell type:code id: tags:
``` python
# Evaluating different models: compare mean cross-validated accuracy of each candidate.
models = [
    LinearSVC(),
    MultinomialNB(),
    LogisticRegression(random_state=0),
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
]
CV = 2  # number of cross-validation folds

# Collect one row per model and build the DataFrame once at the end.
# DataFrame.append was removed in pandas 2.0, and appending row by row copied
# the whole frame on every iteration anyway.
cv_rows = []
for model in models:
    model_name = model.__class__.__name__
    accuracies = cross_val_score(model, feature_vectors, icd_codes, scoring='accuracy', cv=CV)
    avg_accuracy = accuracies.mean()
    cv_rows.append({"Model": model_name, "Average_Accuracy": avg_accuracy})
cross_val_results = pd.DataFrame(cv_rows, columns=["Model", "Average_Accuracy"])
cross_val_results
```
%% Output
Model Average_Accuracy
0 LinearSVC 0.038
1 MultinomialNB 0.041
2 LogisticRegression 0.038
3 RandomForestClassifier 0.041
%% Cell type:code id: tags:
``` python
# RandomForestClassifier tied with MultinomialNB for the highest cross-validated
# accuracy in the comparison above, so explore it in more depth on a single
# hold-out split.
from matplotlib.pyplot import matshow
from seaborn import heatmap
model = RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0)
# Hold out 10% of the rows for evaluation. An earlier 33% split used to be
# computed first and immediately overwritten by this one; that dead call is gone.
X_train, X_test, y_train, y_test = train_test_split(feature_vectors, icd_codes, test_size=0.10, random_state=0)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Rows are true codes, columns are predicted codes.
conf_mat = confusion_matrix(y_test, y_pred)
matshow(conf_mat)
# Alternative rendering of the same matrix: heatmap(conf_mat)
print("Done exploring Random Forest Classifier")
```
%% Output
Done exploring Random Forest Classifier
%% Cell type:code id: tags:
``` python
from sklearn import metrics
# Display per-code precision / recall / F1 for the Random Forest Classifier.
# target_names is deliberately not passed: classification_report pairs those
# names positionally with the *sorted* labels found in y_test / y_pred, so
# feeding it the dataset's codes in order of appearance mislabels the rows (or
# raises when the counts differ). Without it, each row is labelled with its
# actual ICD-9 code.
# zero_division=0 reports 0.0 for codes that were never predicted.
print(metrics.classification_report(y_test, y_pred, zero_division=0))
```
%% Output
precision recall f1-score support
40301 0.00 0.00 0.00 2.0
486 0.00 0.00 0.00 2.0
58281 0.00 0.00 0.00 3.0
5855 0.00 0.00 0.00 9.0
4254 0.00 0.00 0.00 7.0
2762 0.00 0.00 0.00 4.0
7100 0.00 0.00 0.00 5.0
2767 0.00 0.00 0.00 5.0
7243 0.00 0.00 0.00 4.0
45829 0.00 0.00 0.00 10.0
2875 0.00 0.00 0.00 2.0
28521 0.00 0.00 0.00 7.0
28529 0.00 0.00 0.00 2.0
27541 0.00 0.00 0.00 5.0
5856 0.00 0.00 0.00 1.0
58381 0.00 0.00 0.00 8.0
5589 0.00 0.00 0.00 2.0
32723 0.00 0.00 0.00 1.0
22804 0.00 0.00 0.00 5.0
33829 0.00 0.00 0.00 5.0
78900 0.00 0.00 0.00 5.0
79092 0.00 0.00 0.00 1.0
V4511 0.00 0.00 0.00 5.0
accuracy 0.00 100.0
macro avg 0.00 0.00 0.00 100.0
weighted avg 0.00 0.00 0.00 100.0
%% Cell type:code id: tags:
``` python
```
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment