Skip to content
GitLab
Menu
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
Chia Ying Chiu
NLP Group Project
Commits
4433bb78
Commit
4433bb78
authored
Nov 26, 2021
by
Chia Ying Chiu
Browse files
Replace NLP_Project_Code.ipynb
parent
cb059b1e
Changes
1
Hide whitespace changes
Inline
Side-by-side
NLP_Project_Code.ipynb
View file @
4433bb78
...
...
@@ -4,22 +4,14 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"For this project, our goal is create an NLP model to automatically assign ICD-9 encodings, given the clinical notes
at
each encounter)."
"For this project, our goal is create an NLP model to automatically assign ICD-9 encodings, given the clinical notes
for
each encounter)."
]
},
{
"cell_type": "code",
"execution_count":
1
,
"execution_count":
null
,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"All modules imported successfully\n"
]
}
],
"outputs": [],
"source": [
"#imports\n",
"import pandas as pd\n",
...
...
@@ -28,82 +20,56 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"diagnoses = pd.read_csv(\"DIAGNOSES_ICD.csv\")\n",
"diagnoses_clean = diagnoses[[\"SUBJECT_ID\", \"HADM_ID\", \"ICD9_CODE\"]]\n",
"\n",
"note_events = pd.read_csv(\"NOTEEVENTS.csv\", engine=\"python\", on_bad_lines='skip')\n",
"note_events_clean = note_events[[\"SUBJECT_ID\", \"HADM_ID\",\"DESCRIPTION\", \"TEXT\"]]\n",
"\n",
"full_dataset = pd.merge(diagnoses_clean, note_events_clean, on =[\"HADM_ID\", \"SUBJECT_ID\"])\n",
"full_dataset = full_dataset[:40000]\n",
"\n",
"print(full_dataset)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ROW_ID_x SUBJECT_ID HADM_ID SEQ_NUM ICD9_CODE ROW_ID_y CHARTDATE \\\n",
"0 1297 109 172335 1.0 40301 14797 2141-09-24 \n",
"1 1297 109 172335 1.0 40301 72706 2141-09-21 \n",
"2 1297 109 172335 1.0 40301 170207 2141-09-18 \n",
"3 1297 109 172335 1.0 40301 341513 2141-09-21 \n",
"4 1297 109 172335 1.0 40301 341514 2141-09-21 \n",
"... ... ... ... ... ... ... ... \n",
"39995 801 101 175533 9.0 2762 15782 2196-10-12 \n",
"39996 801 101 175533 9.0 2762 170036 2196-09-26 \n",
"39997 801 101 175533 9.0 2762 170037 2196-09-26 \n",
"39998 801 101 175533 9.0 2762 170038 2196-09-26 \n",
"39999 801 101 175533 9.0 2762 173709 2196-09-30 \n",
"\n",
" CHARTTIME STORETIME CATEGORY \\\n",
"0 NaN NaN Discharge summary \n",
"1 NaN NaN Echo \n",
"2 NaN NaN ECG \n",
"3 2141-09-21 02:49:00 2141-09-21 02:49:45 Physician \n",
"4 2141-09-21 02:49:00 2141-09-21 02:57:11 Physician \n",
"... ... ... ... \n",
"39995 NaN NaN Discharge summary \n",
"39996 NaN NaN ECG \n",
"39997 NaN NaN ECG \n",
"39998 NaN NaN ECG \n",
"39999 NaN NaN ECG \n",
"\n",
" DESCRIPTION CGID ISERROR \\\n",
"0 Report NaN NaN \n",
"1 Report NaN NaN \n",
"2 Report NaN NaN \n",
"3 Physician Resident Admission Note 17650.0 NaN \n",
"4 Physician Resident Admission Note 17650.0 NaN \n",
"... ... ... ... \n",
"39995 Report NaN NaN \n",
"39996 Report NaN NaN \n",
"39997 Report NaN NaN \n",
"39998 Report NaN NaN \n",
"39999 Report NaN NaN \n",
"\n",
" TEXT \n",
"0 Admission Date: [**2141-9-18**] ... \n",
"1 PATIENT/TEST INFORMATION:\\nIndication: Pericar... \n",
"2 Sinus rhythm\\nRightward axis\\nSince previous t... \n",
"3 Chief Complaint: hypotension, altered mental ... \n",
"4 Chief Complaint: hypotension, altered mental ... \n",
"... ... \n",
"39995 Admission Date: [**2196-9-26**] Discharge... \n",
"39996 Baseline artifact\\nSinus rhythm\\nGeneralized l... \n",
"39997 Baseline artifact\\nProbable atrial flutter wit... \n",
"39998 Baseline artifact\\nProbable atrial flutter wit... \n",
"39999 Wide complex tachycardia with a right bundle-b... \n",
"\n",
"[40000 rows x 14 columns]\n"
"NLTK Downloader\n",
"---------------------------------------------------------------------------\n",
" d) Download l) List u) Update c) Config h) Help q) Quit\n",
"---------------------------------------------------------------------------\n"
]
}
],
"source": [
"import nltk"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#change to lowercase\n",
"full_dataset = full_dataset.lower()\n",
"\n",
"\n",
"\n",
" \n",
"\n",
"\n",
"diagnoses = pd.read_csv(\"DIAGNOSES_ICD.csv\")\n",
"note_events = pd.read_csv(\"NOTEEVENTS.csv\", engine=\"python\", on_bad_lines='skip')\n",
"full_dataset = pd.merge(diagnoses, note_events, on =[\"HADM_ID\", \"SUBJECT_ID\"])\n",
"full_dataset = full_dataset[:40000]\n",
"\n",
"print(full_dataset)\n"
"#removing punctuation\n",
"import string\n",
"print(full_dataset.punctuation)\n",
"full_dataset_p = \"\".join([char for char in text if char not in full_dataset.punctuation])\n",
"print(full_dataset_p)"
]
},
{
...
...
@@ -111,7 +77,29 @@
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
"source": [
"#tokenization\n",
"from nltk import word_tokenize\n",
"words = word_tokenize(full_dataset_p)\n",
"#print(words)\n",
"\n",
"#stopword filtering\n",
"from nltk.corpus import stopwords\n",
"stop_words = stopwords.words('english')\n",
"filtered_words = [word for word in words if word not in stop_words]\n",
"#print(filtered_words)\n",
"\n",
"#stemming\n",
"from nltk.stem.porter import PorterStemmer\n",
"porter = PorterStemmer()\n",
"stemmed = [porter.stem(word) for word in filtered_words]\n",
"#print(stemmed)\n",
"\n",
"#POS\n",
"from nltk import pos_tag\n",
"pos = pos_tag(filtered_words)\n",
"print(pos)"
]
}
],
"metadata": {
...
...
%% Cell type:markdown id: tags:
For this project, our goal is create an NLP model to automatically assign ICD-9 encodings, given the clinical notes
at
each encounter).
For this project, our goal is to create an NLP model to automatically assign ICD-9 codes, given the clinical notes
for
each encounter.
%% Cell type:code id: tags:
```
python
#imports
import
pandas
as
pd
print
(
"All modules imported successfully"
)
```
%% Output
All modules imported successfully
%% Cell type:code id: tags:
```
python
diagnoses
=
pd
.
read_csv
(
"DIAGNOSES_ICD.csv"
)
diagnoses_clean
=
diagnoses
[[
"SUBJECT_ID"
,
"HADM_ID"
,
"ICD9_CODE"
]]
note_events
=
pd
.
read_csv
(
"NOTEEVENTS.csv"
,
engine
=
"python"
,
on_bad_lines
=
'skip'
)
full_dataset
=
pd
.
merge
(
diagnoses
,
note_events
,
on
=
[
"HADM_ID"
,
"SUBJECT_ID"
])
note_events_clean
=
note_events
[[
"SUBJECT_ID"
,
"HADM_ID"
,
"DESCRIPTION"
,
"TEXT"
]]
full_dataset
=
pd
.
merge
(
diagnoses_clean
,
note_events_clean
,
on
=
[
"HADM_ID"
,
"SUBJECT_ID"
])
full_dataset
=
full_dataset
[:
40000
]
print
(
full_dataset
)
```
%% Cell type:code id: tags:
```
python
import
nltk
```
%% Output
ROW_ID_x SUBJECT_ID HADM_ID SEQ_NUM ICD9_CODE ROW_ID_y CHARTDATE \
0 1297 109 172335 1.0 40301 14797 2141-09-24
1 1297 109 172335 1.0 40301 72706 2141-09-21
2 1297 109 172335 1.0 40301 170207 2141-09-18
3 1297 109 172335 1.0 40301 341513 2141-09-21
4 1297 109 172335 1.0 40301 341514 2141-09-21
... ... ... ... ... ... ... ...
39995 801 101 175533 9.0 2762 15782 2196-10-12
39996 801 101 175533 9.0 2762 170036 2196-09-26
39997 801 101 175533 9.0 2762 170037 2196-09-26
39998 801 101 175533 9.0 2762 170038 2196-09-26
39999 801 101 175533 9.0 2762 173709 2196-09-30
CHARTTIME STORETIME CATEGORY \
0 NaN NaN Discharge summary
1 NaN NaN Echo
2 NaN NaN ECG
3 2141-09-21 02:49:00 2141-09-21 02:49:45 Physician
4 2141-09-21 02:49:00 2141-09-21 02:57:11 Physician
... ... ... ...
39995 NaN NaN Discharge summary
39996 NaN NaN ECG
39997 NaN NaN ECG
39998 NaN NaN ECG
39999 NaN NaN ECG
DESCRIPTION CGID ISERROR \
0 Report NaN NaN
1 Report NaN NaN
2 Report NaN NaN
3 Physician Resident Admission Note 17650.0 NaN
4 Physician Resident Admission Note 17650.0 NaN
... ... ... ...
39995 Report NaN NaN
39996 Report NaN NaN
39997 Report NaN NaN
39998 Report NaN NaN
39999 Report NaN NaN
TEXT
0 Admission Date: [**2141-9-18**] ...
1 PATIENT/TEST INFORMATION:\nIndication: Pericar...
2 Sinus rhythm\nRightward axis\nSince previous t...
3 Chief Complaint: hypotension, altered mental ...
4 Chief Complaint: hypotension, altered mental ...
... ...
39995 Admission Date: [**2196-9-26**] Discharge...
39996 Baseline artifact\nSinus rhythm\nGeneralized l...
39997 Baseline artifact\nProbable atrial flutter wit...
39998 Baseline artifact\nProbable atrial flutter wit...
39999 Wide complex tachycardia with a right bundle-b...
[40000 rows x 14 columns]
NLTK Downloader
---------------------------------------------------------------------------
d) Download l) List u) Update c) Config h) Help q) Quit
---------------------------------------------------------------------------
%% Cell type:code id: tags:
```
python
#change to lowercase
full_dataset
=
full_dataset
.
lower
()
#removing punctuation
import
string
print
(
full_dataset
.
punctuation
)
full_dataset_p
=
""
.
join
([
char
for
char
in
text
if
char
not
in
full_dataset
.
punctuation
])
print
(
full_dataset_p
)
```
%% Cell type:code id: tags:
```
python
#tokenization
from
nltk
import
word_tokenize
words
=
word_tokenize
(
full_dataset_p
)
#print(words)
#stopword filtering
from
nltk.corpus
import
stopwords
stop_words
=
stopwords
.
words
(
'english'
)
filtered_words
=
[
word
for
word
in
words
if
word
not
in
stop_words
]
#print(filtered_words)
#stemming
from
nltk.stem.porter
import
PorterStemmer
porter
=
PorterStemmer
()
stemmed
=
[
porter
.
stem
(
word
)
for
word
in
filtered_words
]
#print(stemmed)
#POS
from
nltk
import
pos_tag
pos
=
pos_tag
(
filtered_words
)
print
(
pos
)
```
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment