Commit dd422fef authored by Zaid A Ali's avatar Zaid A Ali
Browse files

Add tf-idf vectorization steps and then compare different ML models. Evaluate...

Add tf-idf vectorization steps and then compare different ML models. Evaluate the best ML model and evaluate using a confusion matrix
parent 8b348613
%% Cell type:markdown id: tags:
For this project, our goal is to create an NLP model that automatically assigns ICD-9 codes, given the clinical notes for each encounter.
%% Cell type:code id: tags:
``` python
#imports
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
print("All modules imported successfully")
```
%%%% Output: stream
All modules imported successfully
%% Cell type:code id: tags:
``` python
# Load the table of clinical-note text and ICD-9 labels that the later cells use.
print("Fetching data")
# The string literal below is a disabled copy of the original pipeline that
# merged DIAGNOSES_ICD.csv with NOTEEVENTS.csv on HADM_ID/SUBJECT_ID and kept
# the first 40000 rows. It is a bare string expression, so it is never executed.
# NOTE(review): presumably ICDdata40k.csv is the saved result of that merge — confirm.
"""
diagnoses = pd.read_csv("DIAGNOSES_ICD.csv")
diagnoses_clean = diagnoses[["SUBJECT_ID", "HADM_ID", "ICD9_CODE"]]
note_events = pd.read_csv("NOTEEVENTS.csv", engine="python", on_bad_lines='skip')
note_events_clean = note_events[["SUBJECT_ID", "HADM_ID","DESCRIPTION", "TEXT"]]
full_dataset = pd.merge(diagnoses_clean, note_events_clean, on =["HADM_ID", "SUBJECT_ID"])
full_dataset = full_dataset[:40000]
print(full_dataset)
"""
# Keep only the note text and its ICD-9 label.
full_dataset = pd.read_csv("ICDdata40k.csv")[[ "TEXT", "ICD9_CODE"]]
# Restrict the working set to the first 1000 rows.
full_dataset= full_dataset[:1000]
print("Done fetching all the data")
# Last expression of the cell: the notebook displays the first rows.
full_dataset.head()
```
%%%% Output: stream
Fetching data
Done fetching all the data
%%%% Output: execute_result
TEXT ICD9_CODE
0 Admission Date: [**2141-9-18**] ... 40301
1 PATIENT/TEST INFORMATION:\nIndication: Pericar... 40301
2 Sinus rhythm\nRightward axis\nSince previous t... 40301
3 Chief Complaint: hypotension, altered mental ... 40301
4 Chief Complaint: hypotension, altered mental ... 40301
%% Cell type:code id: tags:
``` python
import nltk
```
%% Cell type:code id: tags:
``` python
#tf-idf vectorization
print("Transforming descriptions into TF-IDF vectors")

# Model inputs (raw note text) and targets (one ICD-9 code per row).
texts = full_dataset["TEXT"]
icd_codes = full_dataset["ICD9_CODE"]

# Unigrams and bigrams, English stop words removed, terms that occur in
# fewer than 5 documents dropped, rows L2-normalised.
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    min_df=5,
    norm='l2',
    encoding='latin-1',
    ngram_range=(1, 2),
    stop_words='english',
)

# Fit on every note and convert the sparse result to a dense array in one step.
feature_vectors = tfidf.fit_transform(texts).toarray()

print(feature_vectors.shape)
print("Done transforming data")
```
%%%% Output: stream
NLTK Downloader
---------------------------------------------------------------------------
d) Download l) List u) Update c) Config h) Help q) Quit
---------------------------------------------------------------------------
Transforming descriptions into TF-IDF vectors
(1000, 9878)
Done transforming data
%% Cell type:code id: tags:
``` python
# Text clean-up (lowercasing + punctuation removal) followed by a quick
# cross-validated accuracy comparison of several classifiers on the TF-IDF
# features built earlier in the notebook.
import string

#change to lowercase
# A DataFrame has no .lower(); lowercase the TEXT column through the .str
# accessor. assign() returns a new frame, so the sliced frame created by the
# loading cell is not modified in place.
full_dataset = full_dataset.assign(TEXT=full_dataset["TEXT"].str.lower())

#removing punctuation
# The punctuation table lives in the standard `string` module, not on the frame.
print(string.punctuation)
# `text` was never defined: build it from all notes, one note per line.
text = "\n".join(full_dataset["TEXT"])
# str.translate drops every punctuation character in a single pass and keeps
# full_dataset_p a plain string for the tokenization step that consumes it.
full_dataset_p = text.translate(str.maketrans("", "", string.punctuation))
print(full_dataset_p)

#Evaluating different models
models = [
    LinearSVC(),
    MultinomialNB(),
    LogisticRegression(random_state=0),
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
]
CV = 2  # number of cross-validation folds

# DataFrame.append was removed in pandas 2.0: collect one dict per model and
# build the result frame once at the end.
result_rows = []
for model in models:
    model_name = model.__class__.__name__
    accuracies = cross_val_score(model, feature_vectors, icd_codes, scoring='accuracy', cv=CV)
    avg_accuracy = sum(accuracies) / len(accuracies)
    result_rows.append({"Model": model_name, "Average_Accuracy": avg_accuracy})

cross_val_results = pd.DataFrame(result_rows, columns=["Model", "Average_Accuracy"])
# Last expression of the cell: the notebook displays the comparison table.
cross_val_results
```
%%%% Output: execute_result
Model Average_Accuracy
0 LinearSVC 0.038
1 MultinomialNB 0.041
2 LogisticRegression 0.038
3 RandomForestClassifier 0.041
%% Cell type:code id: tags:
``` python
# Tokenise the cleaned corpus, then fit one RandomForestClassifier on a
# train/test split and plot its confusion matrix.
#tokenization
from nltk import word_tokenize
# full_dataset_p is the punctuation-stripped text built by an earlier cell.
words = word_tokenize(full_dataset_p)
#print(words)
#Random Forest Classifier appears to have a slight edge over the other models, so it is time to explore it in more depth
from matplotlib.pyplot import matshow
# heatmap is only referenced by the commented-out alternative plot below.
from seaborn import heatmap
# Same hyper-parameters as the RandomForestClassifier entry in the model comparison.
model = RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0)
# Hold out 33% of the samples for evaluation; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(feature_vectors, icd_codes, test_size=0.33, random_state=0)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Compare held-out labels with the model's predictions.
conf_mat = confusion_matrix(y_test, y_pred)
# Render the matrix as an image.
matshow(conf_mat)
#heatmap(conf_mat)
print("Done exploring Random Forest Classifier")
```
%%%% Output: stream
#stopword filtering
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
# Testing membership against the list is linear per token; look tokens up in
# a frozenset instead. stop_words itself stays a list for any other user.
stop_word_lookup = frozenset(stop_words)
filtered_words = [word for word in words if word not in stop_word_lookup]
#print(filtered_words)
Done exploring Random Forest Classifier
#stemming
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
# One Porter stem per filtered token, in the original token order.
stemmed = list(map(porter.stem, filtered_words))
#print(stemmed)
%%%% Output: display_data
#POS
# Part-of-speech tag the stop-word-filtered tokens and print the result.
# NOTE(review): pos_tag presumably needs an NLTK tagger resource to be downloaded first — confirm.
from nltk import pos_tag
pos = pos_tag(filtered_words)
print(pos)
![](data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAQEAAAECCAYAAAD+eGJTAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8+yak3AAAACXBIWXMAAAsTAAALEwEAmpwYAAAN00lEQVR4nO3dX4xc5X3G8efZXRuzQBqsDZbrQhP+Rs5F7WpF0hYVEG3q5ga4icpFZVWRnLYgJVJUCeUmuanETUJbqY3kFIovEtpIiQMXURtkRaZVU7ebAMEOpKDUKLaMHYtUcYv/sr9e7EFdyK7n3Zl3zjmzv+9HsmbmzLvv+Z0942fPmXnnPY4IAchrqusCAHSLEACSIwSA5AgBIDlCAEiOEACS6ywEbO+y/SPbr9p+uKs6RmH7qO0XbT9ve6HrekrYftz2KduHly3bbPsZ2680t9d2WePlrFL/520fb/bD87Y/1mWNl2P7etvfsf1D20dsf6pZ3tk+6CQEbE9L+mtJvy9pu6QHbG/vopYK7o6IHREx33UhhZ6QtOtdyx6WdCAibpF0oHncV0/oF+uXpEeb/bAjIr7Vck1rcUnSZyJiu6SPSHqwee13tg+6OhK4XdKrEfHjiLgg6e8l3dtRLalExLOS3njX4nsl7Wvu75N0X5s1rcUq9U+MiDgREd9v7p+R9JKkbepwH3QVAtsk/WTZ42PNskkTkr5t+3u293RdzAi2RMSJ5v7rkrZ0WcyQHrL9g+Z0obenM8vZfr+knZIOqcN9wBuDo7kjIn5dS6c1D9r+7a4LGlUsjSOftLHkX5J0k6Qdkk5I+kKn1RSwfbWkr0v6dET8fPlzbe+DrkLguKTrlz3+lWbZRImI483tKUn7tXSaM4lO2t4qSc3tqY7rWZOIOBkRb0XEoqQvq+f7wfYGLQXAVyLiG83izvZBVyHwH5Jusf0B2xsl/YGkpzuqZSi2r7J9zdv3JX1U0uHL/1RvPS1pd3N/t6SnOqxlzd7+z9O4Xz3eD7Yt6TFJL0XEF5c91dk+cFffImw+xvkLSdOSHo+IP++kkCHZvlFLf/0laUbSVydhG2w/KekuSXOSTkr6nKRvSvqapBskvSbp4xHRyzffVqn/Li2dCoSko5I+uez8ulds3yHpnyW9KGmxWfxZLb0v0Mk+6CwEAPQDbwwCyRECQHKEAJAcIQAkRwgAyXUeAhM+3Hbi65cmfxsmvX6p223oPAQkTfoOnPT6pcnfhkmvX+pwG/oQAgA61OpgoY2+Ijbpqncsu6jz2qArWqthNd5UVkOcO/+OxyvV75mZsr4uXSorbsza2IaS/obt68LiWW2cunLo/opc/Yv9r+h/zg7V/bj/H5zT/+pCnPdKz5Xt6VXY3iXpL7U09PdvI+KRy7XfpKv0Yd8zyirHZvrm24ravXXkR4P7mruurK+T/f2eTu1tKOmvZl9r6a/E4vzOonZTB5+rts6aDsWBVZ8b+nRgnc0OBKQ1ynsCzA4ErAOjhMB6mR0ISG2k9wRKNJ9/7pGkTZod9+oArNEoRwJFswNFxN6ImI+I+T58CgDgnUYJgYmfHQjACKcDEXHJ9kOS/kn/PzvQkWqVAWjFSO8JNBd56POFHoqVfP5f3FePP/8vVXsbavZXPJ5gS73xBK/uni7q69aDRc16hWHDQHKEAJAcIQAkRwgAyRECQHKEAJAcIQAkRwgAyY39C0QZ1RykUnu962EgU6ma2/qeFzdW66tvOBIAkiMEgOQIASA5QgBIjhAAkiMEgOQIASA5QgBIjhAAkmPE4BhkGpWXxQt/9jdF7X7v0R3jLWQMOBIAkiMEgOQIASA5QgBIjhAAkiMEgOQIASA5QgBIjhAAkmt1xKA3XaHpm28b2K7mxUG7MP2hwdso9Xs7a29DSX81+5Iknf5ZWbsCN/3DHxe1u1n/Vm2dbeFIAEiOEACSIwSA5AgBIDlCAEiOEACSIwSA5AgBIDlCAE
iu1RGDce58r0fJVVNxpFpnKm9Dzf1ePLKw6tWhbyzqaxJxJAAkN9KRgO2jks5IekvSpYiYr1EUgPbUOB24OyJOV+gHQAc4HQCSGzUEQtK3bX/P9p6VGtjeY3vB9sJFnR9xdQBqG/V04I6IOG77OknP2H45Ip5d3iAi9kraK0nv8eYYcX0AKhvpSCAijje3pyTtl3R7jaIAtGfoELB9le1r3r4v6aOSDtcqDEA7Rjkd2CJpv+23+/lqRPzj5X7AMzOanhs8gKOLC3ou3rmzqN3UwecGtlkPFyS9+MFtRe2mCre1i+nFUgxMq2DoEIiIH0v6tYq1AOgAHxECyRECQHKEAJAcIQAkRwgAyRECQHKEAJAcIQAk1+r0Yll0NaKt5kjFDaffLFtnYX8X52YHtunzX6Qbv7l+vwHb5987gBYQAkByhACQHCEAJEcIAMkRAkByhACQHCEAJEcIAMm1O2JwZlqau3Zwuw7m6Du3eWNRu8Hj3pjbbiUbXj4+sE3p6MMuLvha8/XRNxwJAMkRAkByhACQHCEAJEcIAMkRAkByhACQHCEAJEcIAMkxx+AYvPFHv1HUbvPffXfMlaxTJaNOJU1XXOXMn75e1nB/xZW2hCMBIDlCAEiOEACSIwSA5AgBIDlCAEiOEACSIwSA5FodLBTnzvd26q3Z/Yeq9fW+f3+jqF3xdFod6PPFUrt4DR39yXxRu1v12pgrqY8jASC5gSFg+3Hbp2wfXrZss+1nbL/S3JaN4wTQOyVHAk9I2vWuZQ9LOhARt0g60DwGMIEGhkBEPCvp3Se590ra19zfJ+m+umUBaMuw7wlsiYgTzf3XJW2pVA+Alo38xmBEhKRY7Xnbe2wv2F64qPOjrg5AZcOGwEnbWyWpuV3185+I2BsR8xExv0FXDLk6AOMybAg8LWl3c3+3pKfqlAOgbSUfET4p6buSbrN9zPYnJD0i6XdtvyLpd5rHACbQwBGDEfHAKk/ds9aVeWZG03PXDWxXc3RZqcU7dxa1mzr43MA2P719c1Ffm48UNSs2/aHbBrbpasRmzdqmtwx+Da1Fyett9pfOVl1nnzBiEEiOEACSIwSA5AgBIDlCAEiOEACSIwSA5AgBIDlCAEiu3TkGL13qZDRgiZKRgKXWwxyDtdUcqdjX19Ck4kgASI4QAJIjBIDkCAEgOUIASI4QAJIjBIDkCAEgOUIASK7VEYN9nmOwdN66ktrO3Preor5mK88x2NcrPtdWc1+V9vfLf7WxqK9JxJEAkBwhACRHCADJEQJAcoQAkBwhACRHCADJEQJAckwv1qhZ1+z+Q9X6Wos+X5C0ptqvoZL+jv3JzUV93XBw1Grax5EAkBwhACRHCADJEQJAcoQAkBwhACRHCADJEQJAcoQAkBzTizVKRttJZSPuava1FjX7qz6F14SPZpx7Yf1ePpYjASC5gSFg+3Hbp2wfXrbs87aP236++fex8ZYJYFxKjgSekLRrheWPRsSO5t+36pYFoC0DQyAinpX0Rgu1AOjAKO8JPGT7B83pwrXVKgLQqmFD4EuSbpK0Q9IJSV9YraHtPbYXbC9cWDw75OoAjMtQIRARJyPirYhYlPRlSbdfpu3eiJiPiPmNU1cOWyeAMRkqBGxvXfbwfkmHV2sLoN8GDhay/aSkuyTN2T4m6XOS7rK9Q1JIOirpk+MrEcA4DQyBiHhghcWPDbOyXs8xWHO02umf1etrDUpG+ZX+/qvP49fj0YDZMWIQSI4QAJIjBIDkCAEgOUIASI4QAJIjBIDkCAEgOUIASK7VOQZ19ZVanN85sNnUwedaKOadqs6pN1f4zeoOrq7blcU76+332vMflrjmP/+7bJ3V1tgejgSA5AgBIDlCAEiOEACSIwSA5AgBIDlCAEiOEACSa/eCpOcuasPLxwe262TARcUBPmdufW9RV7NHylZZquZFP0sG90jlA3w2nH5zYJvi/V64r6YLuysZVHRxbraor0n8qzqJNQOoiBAAkiMEgO
QIASA5QgBIjhAAkiMEgOQIASA5QgBIrtURg1kuSHrNv/5X2TqrrbG+kpGdUvk21Pz9dnFx02N3X1nU7oaDYy5kDDgSAJIjBIDkCAEgOUIASI4QAJIjBIDkCAEgOUIASI4QAJJrd47BmRlNzw2+mGQXowrfvP/DRe1m9x8a2ObiB7cV9TVV+4KkNUfSdXRR1RJdXJB07oU+j+8czcAjAdvX2/6O7R/aPmL7U83yzbafsf1Kc1v4qgHQJyWnA5ckfSYitkv6iKQHbW+X9LCkAxFxi6QDzWMAE2ZgCETEiYj4fnP/jKSXJG2TdK+kfU2zfZLuG1ONAMZoTW8M2n6/pJ2SDknaEhEnmqdel7SlbmkA2lAcAravlvR1SZ+OiJ8vfy4iQlKs8nN7bC/YXriweHakYgHUVxQCtjdoKQC+EhHfaBaftL21eX6rpBXfio2IvRExHxHzG6fKvpMNoD0lnw5Y0mOSXoqILy576mlJu5v7uyU9Vb88AONWMk7gtyT9oaQXbT/fLPuspEckfc32JyS9JunjY6kQwFgNDIGI+BdJXuXpe+qWA6BtrY4Y7LP1MC9gn5WM8isd4Vd7RGZJbev59cF3B4DkCAEgOUIASI4QAJIjBIDkCAEgOUIASI4QAJJrd7DQzHTZtFUdTFl15jc/UNRudv/g2mpfzHNdqLjfq18stWC9i3fuLOqr9pRxbeBIAEiOEACSIwSA5AgBIDlCAEiOEACSIwSA5AgBIDlCAEjOS5cMaGll9k+1NCnpcnOSTrdWRH2TXr80+dsw6fVL49+GX42I9630RKshsGIB9kJEzHdaxAgmvX5p8rdh0uuXut0GTgeA5AgBILk+hMDergsY0aTXL03+Nkx6/VKH29D5ewIAutWHIwEAHSIEgOQIASA5QgBIjhAAkvs/WRLh0wdQMI0AAAAASUVORK5CYII=)
%% Cell type:code id: tags:
``` python
```
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment