Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
Chia Ying Chiu
NLP Group Project
Commits
1f5289e0
Commit
1f5289e0
authored
Dec 02, 2021
by
Zaid A Ali
Browse files
Add titles to Jupyter Notebook
parent
a43f6a1b
Changes
1
Hide whitespace changes
Inline
Side-by-side
NLP_Project_Code.ipynb
View file @
1f5289e0
%% Cell type:markdown id: tags:
For this project, our goal is to create an NLP model that automatically assigns ICD-9 codes, given the clinical notes for each encounter.
%% Cell type:markdown id: tags:
## Imports
%% Cell type:code id: tags:
```
python
#imports
import
pandas
as
pd
from
sklearn.feature_extraction.text
import
TfidfVectorizer
from
sklearn.naive_bayes
import
MultinomialNB
from
sklearn.linear_model
import
LogisticRegression
from
sklearn.ensemble
import
RandomForestClassifier
from
sklearn.svm
import
LinearSVC
from
sklearn.model_selection
import
cross_val_score
from
sklearn.model_selection
import
train_test_split
from
sklearn.metrics
import
confusion_matrix
import
warnings
# Suppress all warnings for the rest of the session, then confirm that the
# import cell ran to completion.
warnings.filterwarnings(action='ignore')
print("All modules imported successfully")
```
%%%% Output: stream
All modules imported successfully
%% Cell type:markdown id: tags:
## Fetching Data from File in order to prepare it for processing
%% Cell type:code id: tags:
```
python
print("Fetching data")

# The cached extract "ICDdata40k.csv" was generated once from the MIMIC-III
# files with the snippet below. For performance reasons it is kept commented
# out and the cached extract is read instead.
#
#   diagnoses = pd.read_csv("DIAGNOSES_ICD.csv")
#   note_events = pd.read_csv("NOTEEVENTS.csv", engine="python", on_bad_lines='skip')
#   full_dataset = pd.merge(diagnoses, note_events, on =["HADM_ID", "SUBJECT_ID"])
#   full_dataset = full_dataset[:40000]
#   print(full_dataset.head())
#   full_dataset.to_csv("ICDdata40k.csv")

# Only the note text and its ICD-9 label are needed, and only the first 1000
# rows are used, so let the parser skip everything else instead of loading
# the whole file and slicing afterwards.
# ICD-9 codes are identifiers, not numbers: read them as strings so that
# leading zeros are preserved and the label column never ends up with a mix
# of integers and strings (e.g. next to codes such as "V4511").
full_dataset = pd.read_csv(
    "ICDdata40k.csv",
    usecols=["TEXT", "ICD9_CODE"],
    dtype={"ICD9_CODE": str},
    nrows=1000,
)[["TEXT", "ICD9_CODE"]]  # usecols keeps file order; enforce the column order explicitly

print("Done fetching all the data")
full_dataset.head()
```
%%%% Output: stream
Fetching data
Done fetching all the data
%%%% Output: execute_result
TEXT ICD9_CODE
0 Admission Date: [**2141-9-18**] ... 40301
1 PATIENT/TEST INFORMATION:\nIndication: Pericar... 40301
2 Sinus rhythm\nRightward axis\nSince previous t... 40301
3 Chief Complaint: hypotension, altered mental ... 40301
4 Chief Complaint: hypotension, altered mental ... 40301
%% Cell type:markdown id: tags:
## Converting clinical notes to tf-idf vectors
%% Cell type:code id: tags:
```
python
import
nltk
```
%% Cell type:code id: tags:
```
python
# Turn every clinical note into a TF-IDF feature vector.
print("Transforming descriptions into TF-IDF vectors")

texts = full_dataset["TEXT"]

# Unigrams and bigrams, English stop words removed, terms must occur in at
# least 5 documents; term frequencies are sublinearly scaled and each row is
# L2-normalised.
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    min_df=5,
    norm='l2',
    encoding='latin-1',
    ngram_range=(1, 2),
    stop_words='english',
)

# Fit the vocabulary on all notes and materialise the result as a dense array.
feature_vectors = tfidf.fit_transform(texts).toarray()

# Target labels, one ICD-9 code per note.
icd_codes = full_dataset["ICD9_CODE"]

print(feature_vectors.shape)
print("Done transforming data")
```
%%%% Output: stream
Transforming descriptions into TF-IDF vectors
(1000, 9878)
Done transforming data
%% Cell type:markdown id: tags:
## Passing the tf-idf vectors into multiple ML models in order to evaluate which one is the best model
%% Cell type:code id: tags:
```
python
# Evaluate several classifiers on the TF-IDF features with k-fold
# cross-validation and tabulate the mean accuracy of each one.
models = [
    LinearSVC(),
    MultinomialNB(),
    LogisticRegression(random_state=0),
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
]

# Number of cross-validation folds.
CV = 2

# Collect one row per model and build the DataFrame once at the end.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0,
# and it copied the whole frame on every iteration.
result_rows = []
for model in models:
    model_name = model.__class__.__name__
    accuracies = cross_val_score(
        model, feature_vectors, icd_codes, scoring='accuracy', cv=CV
    )
    avg_accuracy = accuracies.mean()
    result_rows.append({"Model": model_name, "Average_Accuracy": avg_accuracy})

cross_val_results = pd.DataFrame(result_rows, columns=["Model", "Average_Accuracy"])
cross_val_results
```
%%%% Output: execute_result
Model Average_Accuracy
0 LinearSVC 0.038
1 MultinomialNB 0.041
2 LogisticRegression 0.038
3 RandomForestClassifier 0.041
%% Cell type:markdown id: tags:
## Further exploring the Random Forest Classifier by creating a confusion matrix
%% Cell type:code id: tags:
```
python
# The Random Forest Classifier scored among the best in cross-validation,
# so explore it in more depth on a single hold-out split.
from matplotlib.pyplot import matshow

# Hold out 10% of the notes for testing; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    feature_vectors,
    icd_codes,
    test_size=0.10,
    random_state=0,
)

# Same hyper-parameters as in the model comparison above.
model = RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Visualise true vs. predicted codes as a matrix image.
conf_mat = confusion_matrix(y_test, y_pred)
matshow(conf_mat)

print("Done exploring Random Forest Classifier")
```
%%%% Output: stream
Done exploring Random Forest Classifier
%%%% Output: display_data

%% Cell type:markdown id: tags:
## Create final classification report for the Random Forest classifier
%% Cell type:code id: tags:
```
python
from sklearn import metrics

# Display per-class precision / recall / F1 for the Random Forest Classifier.
#
# No target_names are passed: scikit-learn then labels each row with the
# actual class value, taken from the sorted labels present in y_test/y_pred.
# Passing full_dataset['ICD9_CODE'].unique() was wrong on two counts: its
# first-appearance order does not match that sorted order, so rows were
# labelled with the wrong ICD-9 code, and the call raises whenever the number
# of codes in the whole dataset differs from the number present in the split.
print(metrics.classification_report(y_test, y_pred))
```
%%%% Output: stream
precision recall f1-score support
40301 0.00 0.00 0.00 2.0
486 0.00 0.00 0.00 2.0
58281 0.00 0.00 0.00 3.0
5855 0.00 0.00 0.00 9.0
4254 0.00 0.00 0.00 7.0
2762 0.00 0.00 0.00 4.0
7100 0.00 0.00 0.00 5.0
2767 0.00 0.00 0.00 5.0
7243 0.00 0.00 0.00 4.0
45829 0.00 0.00 0.00 10.0
2875 0.00 0.00 0.00 2.0
28521 0.00 0.00 0.00 7.0
28529 0.00 0.00 0.00 2.0
27541 0.00 0.00 0.00 5.0
5856 0.00 0.00 0.00 1.0
58381 0.00 0.00 0.00 8.0
5589 0.00 0.00 0.00 2.0
32723 0.00 0.00 0.00 1.0
22804 0.00 0.00 0.00 5.0
33829 0.00 0.00 0.00 5.0
78900 0.00 0.00 0.00 5.0
79092 0.00 0.00 0.00 1.0
V4511 0.00 0.00 0.00 5.0
accuracy 0.00 100.0
macro avg 0.00 0.00 0.00 100.0
weighted avg 0.00 0.00 0.00 100.0
%% Cell type:
code
id: tags:
%% Cell type:
markdown
id: tags:
```
python
```
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment