Commit dd422fef authored by Zaid A Ali's avatar Zaid A Ali
Browse files

Add tf-idf vectorization steps and then compare different ML models. Evaluate...

Add TF-IDF vectorization steps and compare different ML models. Evaluate the best ML model using a confusion matrix.
parent 8b348613
%% Cell type:markdown id: tags:
For this project, our goal is to create an NLP model that automatically assigns ICD-9 codes, given the clinical notes for each encounter.
%% Cell type:code id: tags:
``` python
#imports
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
print("All modules imported successfully")
```
%% Output
All modules imported successfully
%% Cell type:code id: tags:
``` python
print("Fetching data")
# Original MIMIC-III preprocessing that produced ICDdata40k.csv, kept as a
# string literal for reference (merging diagnoses with note events on the
# encounter/patient keys, then truncating to 40k rows).
"""
diagnoses = pd.read_csv("DIAGNOSES_ICD.csv")
diagnoses_clean = diagnoses[["SUBJECT_ID", "HADM_ID", "ICD9_CODE"]]
note_events = pd.read_csv("NOTEEVENTS.csv", engine="python", on_bad_lines='skip')
note_events_clean = note_events[["SUBJECT_ID", "HADM_ID","DESCRIPTION", "TEXT"]]
full_dataset = pd.merge(diagnoses_clean, note_events_clean, on =["HADM_ID", "SUBJECT_ID"])
full_dataset = full_dataset[:40000]
print(full_dataset)
"""
# Load the pre-merged extract and keep only the note text and its label,
# restricted to the first 1000 rows to keep experiments fast.
wanted_columns = [ "TEXT", "ICD9_CODE"]
full_dataset = pd.read_csv("ICDdata40k.csv")[wanted_columns].head(1000)
print("Done fetching all the data")
full_dataset.head()
```
%% Output
Fetching data
Done fetching all the data
TEXT ICD9_CODE
0 Admission Date: [**2141-9-18**] ... 40301
1 PATIENT/TEST INFORMATION:\nIndication: Pericar... 40301
2 Sinus rhythm\nRightward axis\nSince previous t... 40301
3 Chief Complaint: hypotension, altered mental ... 40301
4 Chief Complaint: hypotension, altered mental ... 40301
%% Cell type:code id: tags:
``` python
import nltk
```
%% Cell type:code id: tags:
``` python
# TF-IDF vectorization of the clinical note text.
print("Transforming descriptions into TF-IDF vectors")
texts = full_dataset.TEXT
# sublinear_tf dampens raw term counts; min_df=5 drops very rare terms;
# unigrams + bigrams with English stop words removed; L2-normalized rows.
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')
# Keep the sparse matrix returned by fit_transform: calling .toarray() would
# densify it (n_docs x n_features floats — prohibitive at the intended 40k
# rows), and the sklearn estimators used below all accept sparse input.
feature_vectors = tfidf.fit_transform(texts)
icd_codes = full_dataset.ICD9_CODE
print(feature_vectors.shape)
print("Done transforming data")
```
%% Output
NLTK Downloader
---------------------------------------------------------------------------
d) Download l) List u) Update c) Config h) Help q) Quit
---------------------------------------------------------------------------
Transforming descriptions into TF-IDF vectors
(1000, 9878)
Done transforming data
%% Cell type:code id: tags:
``` python
# Compare candidate classifiers with k-fold cross-validation.
# NOTE(review): the diff had text-preprocessing lines (lowercasing,
# punctuation stripping) interleaved inside the list literal below, which is
# a syntax error; they are removed here. TfidfVectorizer already lowercases
# and tokenizes away punctuation, so no behavior is lost.
models = [
    LinearSVC(),
    MultinomialNB(),
    LogisticRegression(random_state=0),
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0)
]
# Only 2 folds: with 1000 rows and many rare ICD-9 classes, larger k leaves
# too few examples of each class per fold.
CV = 2
# Collect one row per model, then build the DataFrame once at the end —
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
result_rows = []
for model in models:
    model_name = model.__class__.__name__
    accuracies = cross_val_score(model, feature_vectors, icd_codes, scoring='accuracy', cv=CV)
    avg_accuracy = sum(accuracies) / len(accuracies)
    result_rows.append({"Model": model_name, "Average_Accuracy": avg_accuracy})
cross_val_results = pd.DataFrame(result_rows, columns=["Model", "Average_Accuracy"])
cross_val_results
```
%% Output
Model Average_Accuracy
0 LinearSVC 0.038
1 MultinomialNB 0.041
2 LogisticRegression 0.038
3 RandomForestClassifier 0.041
%% Cell type:code id: tags:
``` python
# Random Forest had a slight edge in cross-validation, so evaluate it in
# more depth: train on a held-out split and inspect the confusion matrix.
# NOTE(review): the diff had NLTK tokenization lines interleaved here that
# called word_tokenize(full_dataset_p) on a variable that is never defined
# in this notebook; they are removed. The import is kept in case a later
# cell (outside this view) uses it.
from nltk import word_tokenize
from matplotlib.pyplot import matshow
from seaborn import heatmap
model = RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0)
# 33% held out for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(feature_vectors, icd_codes, test_size=0.33, random_state=0)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Rows = true ICD-9 codes, columns = predicted codes.
conf_mat = confusion_matrix(y_test, y_pred)
matshow(conf_mat)
#heatmap(conf_mat)
print("Done exploring Random Forest Classifier")
```
%% Output
#stopword filtering
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
filtered_words = [word for word in words if word not in stop_words]
#print(filtered_words)
Done exploring Random Forest Classifier
#stemming
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
stemmed = [porter.stem(word) for word in filtered_words]
#print(stemmed)
#POS
from nltk import pos_tag
pos = pos_tag(filtered_words)
print(pos)
%% Cell type:code id: tags:
``` python
```
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment