Commit 9c3d401e authored by Ryan Melvin, committed by Tarun karthik kumar Mamidi

Python models

parent a9b9f934
*.csv
# Created by https://www.toptal.com/developers/gitignore/api/python
# Edit at https://www.toptal.com/developers/gitignore?templates=python
### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
pytestdebug.log
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
doc/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
#poetry.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
# .env
.env/
.venv/
env/
venv/
ENV/
env.bak/
venv.bak/
pythonenv*
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# operating system-related files
# file properties cache/storage on macOS
*.DS_Store
# thumbnail cache on Windows
Thumbs.db
# profiling data
.prof
# End of https://www.toptal.com/developers/gitignore/api/python
# COVID-19_RISK_PREDICTOR
To develop a model that takes in demographics, living style and symptoms/conditions to predict risk for patients.
### Directory structure used to read parsed data
```
encoded-2-week-filter.csv
```
### Create conda environment before running anything
```
conda env create -n rico -f configs/environment.yaml
conda activate rico
```
### Run model training
```
python src/Model.py
```
Output files are saved in the top-level directory (see the list below).
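For reference, `src/Model.py` as committed here writes the following files:
```
2week_training_roc.png
2week_testing_roc.png
2week_dist.png
2week_lasso_card_df.csv
scores_lasso_2week.csv
```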
name: rico
channels:
  - conda-forge
  - defaults
dependencies:
  - pip:
      - matplotlib==3.3.4
      - researchpy==0.2.3
      - scorecardpy==0.1.9.2
      - xverse==1.0.5
      - scikit-learn==0.23.2
      - pandas==1.2.1
      - numpy==1.19.2
########### 6/11/20 ########################
Project - Develop a model that takes in symptoms/conditions and COVID test results and predicts/scores patient risk. We might also develop an app that gives at-risk patients something like a credit score.
Data - N3C data downloaded from i2b2.
################## Data processing ######################################################################
First, filter the data down to patients who were tested for coronavirus.
Files involved -
measurement.csv - tests and results
condition_occurrence.csv - condition/diagnosis records
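A rough sketch of this first filtering step (not part of this commit) is below; the column names (PERSON_ID, MEASUREMENT_CONCEPT_ID) and the COVID-19 test concept IDs are assumptions that need to be checked against the actual N3C/OMOP extract.
```
import pandas as pd

measurement = pd.read_csv('measurement.csv')

# assumed OMOP concept IDs for SARS-CoV-2 lab tests; replace with the real concept set
covid_test_concept_ids = [706163, 706170]

# keep only rows corresponding to coronavirus tests
covid_tests = measurement[measurement['MEASUREMENT_CONCEPT_ID'].isin(covid_test_concept_ids)]

# patients with at least one coronavirus test on record
tested_person_ids = covid_tests['PERSON_ID'].unique()
```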
#!/usr/bin/env python
#libraries
import pandas as pd
import numpy as np
import xverse
from xverse.transformer import WOE
import scorecardpy as sc
import sklearn
from sklearn.model_selection import train_test_split, StratifiedKFold
import statsmodels.api as sm
from matplotlib import pyplot
# %matplotlib inline  # IPython magic; only valid in a notebook, not when run as a script
from joblib import dump, load
# Functions for computing AUC CI using Delong's method
"""
AUC DeLong CI
Created on Tue Nov 6 10:06:52 2018
@author: yandexdataschool
Original Code found in:
https://github.com/yandexdataschool/roc_comparison
updated: Raul Sanchez-Vazquez
"""
import numpy as np
import scipy.stats
from scipy import stats
# AUC comparison adapted from
# https://github.com/Netflix/vmaf/
def compute_midrank(x):
"""Computes midranks.
Args:
x - a 1D numpy array
Returns:
array of midranks
"""
J = np.argsort(x)
Z = x[J]
N = len(x)
T = np.zeros(N, dtype=np.float)
i = 0
while i < N:
j = i
while j < N and Z[j] == Z[i]:
j += 1
T[i:j] = 0.5*(i + j - 1)
i = j
T2 = np.empty(N, dtype=np.float)
# Note(kazeevn) +1 is due to Python using 0-based indexing
# instead of 1-based in the AUC formula in the paper
T2[J] = T + 1
return T2
def compute_midrank_weight(x, sample_weight):
"""Computes midranks.
Args:
x - a 1D numpy array
Returns:
array of midranks
"""
J = np.argsort(x)
Z = x[J]
cumulative_weight = np.cumsum(sample_weight[J])
N = len(x)
T = np.zeros(N, dtype=np.float)
i = 0
while i < N:
j = i
while j < N and Z[j] == Z[i]:
j += 1
T[i:j] = cumulative_weight[i:j].mean()
i = j
T2 = np.empty(N, dtype=np.float)
T2[J] = T
return T2
def fastDeLong(predictions_sorted_transposed, label_1_count, sample_weight):
if sample_weight is None:
return fastDeLong_no_weights(predictions_sorted_transposed, label_1_count)
else:
return fastDeLong_weights(predictions_sorted_transposed, label_1_count, sample_weight)
def fastDeLong_weights(predictions_sorted_transposed, label_1_count, sample_weight):
"""
The fast version of DeLong's method for computing the covariance of
unadjusted AUC.
Args:
predictions_sorted_transposed: a 2D numpy.array[n_classifiers, n_examples]
sorted such as the examples with label "1" are first
Returns:
(AUC value, DeLong covariance)
Reference:
@article{sun2014fast,
title={Fast Implementation of DeLong's Algorithm for
Comparing the Areas Under Correlated Receiver Operating Characteristic Curves},
author={Xu Sun and Weichao Xu},
journal={IEEE Signal Processing Letters},
volume={21},
number={11},
pages={1389--1393},
year={2014},
publisher={IEEE}
}
"""
# Short variables are named as they are in the paper
m = label_1_count
n = predictions_sorted_transposed.shape[1] - m
positive_examples = predictions_sorted_transposed[:, :m]
negative_examples = predictions_sorted_transposed[:, m:]
k = predictions_sorted_transposed.shape[0]
tx = np.empty([k, m], dtype=np.float)
ty = np.empty([k, n], dtype=np.float)
tz = np.empty([k, m + n], dtype=np.float)
for r in range(k):
tx[r, :] = compute_midrank_weight(positive_examples[r, :], sample_weight[:m])
ty[r, :] = compute_midrank_weight(negative_examples[r, :], sample_weight[m:])
tz[r, :] = compute_midrank_weight(predictions_sorted_transposed[r, :], sample_weight)
total_positive_weights = sample_weight[:m].sum()
total_negative_weights = sample_weight[m:].sum()
pair_weights = np.dot(sample_weight[:m, np.newaxis], sample_weight[np.newaxis, m:])
total_pair_weights = pair_weights.sum()
aucs = (sample_weight[:m]*(tz[:, :m] - tx)).sum(axis=1) / total_pair_weights
v01 = (tz[:, :m] - tx[:, :]) / total_negative_weights
v10 = 1. - (tz[:, m:] - ty[:, :]) / total_positive_weights
sx = np.cov(v01)
sy = np.cov(v10)
delongcov = sx / m + sy / n
return aucs, delongcov
def fastDeLong_no_weights(predictions_sorted_transposed, label_1_count):
"""
The fast version of DeLong's method for computing the covariance of
unadjusted AUC.
Args:
predictions_sorted_transposed: a 2D numpy.array[n_classifiers, n_examples]
sorted such as the examples with label "1" are first
Returns:
(AUC value, DeLong covariance)
Reference:
@article{sun2014fast,
title={Fast Implementation of DeLong's Algorithm for
Comparing the Areas Under Correlated Receiver Operating
Characteristic Curves},
author={Xu Sun and Weichao Xu},
journal={IEEE Signal Processing Letters},
volume={21},
number={11},
pages={1389--1393},
year={2014},
publisher={IEEE}
}
"""
# Short variables are named as they are in the paper
m = label_1_count
n = predictions_sorted_transposed.shape[1] - m
positive_examples = predictions_sorted_transposed[:, :m]
negative_examples = predictions_sorted_transposed[:, m:]
k = predictions_sorted_transposed.shape[0]
tx = np.empty([k, m], dtype=np.float)
ty = np.empty([k, n], dtype=np.float)
tz = np.empty([k, m + n], dtype=np.float)
for r in range(k):
tx[r, :] = compute_midrank(positive_examples[r, :])
ty[r, :] = compute_midrank(negative_examples[r, :])
tz[r, :] = compute_midrank(predictions_sorted_transposed[r, :])
aucs = tz[:, :m].sum(axis=1) / m / n - float(m + 1.0) / 2.0 / n
v01 = (tz[:, :m] - tx[:, :]) / n
v10 = 1.0 - (tz[:, m:] - ty[:, :]) / m
sx = np.cov(v01)
sy = np.cov(v10)
delongcov = sx / m + sy / n
return aucs, delongcov
def calc_pvalue(aucs, sigma):
"""Computes log(10) of p-values.
Args:
aucs: 1D array of AUCs
sigma: AUC DeLong covariances
Returns:
log10(pvalue)
"""
l = np.array([[1, -1]])
z = np.abs(np.diff(aucs)) / np.sqrt(np.dot(np.dot(l, sigma), l.T))
return np.log10(2) + scipy.stats.norm.logsf(z, loc=0, scale=1) / np.log(10)
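# Example (hypothetical, not executed in this script): comparing the AUCs of two
# correlated classifiers with DeLong's test, where `y_true`, `preds_model_a`, and
# `preds_model_b` are assumed 1-D arrays over the same samples:
#
#   order, label_1_count, ordered_weight = compute_ground_truth_statistics(y_true, None)
#   stacked = np.vstack((preds_model_a, preds_model_b))[:, order]
#   aucs, delong_cov = fastDeLong(stacked, label_1_count, ordered_weight)
#   log10_p = calc_pvalue(aucs, delong_cov)  # log10 of the two-sided p-value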
def compute_ground_truth_statistics(ground_truth, sample_weight):
assert np.array_equal(np.unique(ground_truth), [0, 1])
order = (-ground_truth).argsort()
label_1_count = int(ground_truth.sum())
if sample_weight is None:
ordered_sample_weight = None
else:
ordered_sample_weight = sample_weight[order]
return order, label_1_count, ordered_sample_weight
def delong_roc_variance(ground_truth, predictions, sample_weight=None):
"""
Computes ROC AUC variance for a single set of predictions
Args:
ground_truth: np.array of 0 and 1
predictions: np.array of floats of the probability of being class 1
"""
order, label_1_count, ordered_sample_weight = compute_ground_truth_statistics(
ground_truth, sample_weight)
predictions_sorted_transposed = predictions[np.newaxis, order]
aucs, delongcov = fastDeLong(predictions_sorted_transposed, label_1_count, ordered_sample_weight)
assert len(aucs) == 1, "There is a bug in the code, please forward this to the developers"
return aucs[0], delongcov
# Data setup
# Read, filter based on missingness and identical limits,
# train/test split, and perform WoE transform.
# load data
encoded = pd.read_csv('encoded-2-week-filter.csv')
encoded = encoded.drop(['PERSON_ID'],axis=1)
# filter variable via missing rate, iv, identical value rate
encoded_f = sc.var_filter(encoded
, y="class"
, positive='negative'
, identical_limit = 0.95
, iv_limit = 0
, missing_limit=0.95
, return_rm_reason=False # if True, returns a dict with the filtered df and the removal reasons
, var_kp=['f_R06'
, 'f_R05'
, 'f_R50'
, 'f_R53'
, 'f_M79'
, 'f_R09'
, 'f_R51'
, 'f_J44'
, 'f_E11'
, 'f_I25'
, 'f_I10'
]
, var_rm = [
'f_BMI-unknown'
, 'f_Unknown'
]
)
# breaking dt into train and test
train, test = sc.split_df(encoded_f, 'class').values()
# woe binning ------
bins = sc.woebin(encoded_f, y="class")
# converting train and test into woe values
train_woe = sc.woebin_ply(train, bins)
test_woe = sc.woebin_ply(test, bins)
# get xs and ys
y_train = train_woe.loc[:,'class']
X_train = train_woe.loc[:,train_woe.columns != 'class']
y_test = test_woe.loc[:,'class']
X_test = test_woe.loc[:,test_woe.columns != 'class']
# Lasso-based regression
# Determine a lambda for Lasso (l1) regularization using
# 10-fold cross validation, get predictions from best model, score, and make scorecard
# logistic regression ------
# lasso
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
lasso_cv = LogisticRegressionCV(penalty='l1'
, Cs = 100
, solver='saga'
, cv = StratifiedKFold(10)
, n_jobs=-1
, max_iter = 10000
, scoring = 'neg_log_loss'
, class_weight = 'balanced'
)
lasso_cv.fit(X_train, y_train)
# plot training ROC
sklearn.metrics.plot_roc_curve(lasso_cv, X_train, y_train)
pyplot.plot([0, 1], [0, 1], color='black', lw=2, linestyle='--')
pyplot.title('2-Week LASSO Training ROC')
axes = pyplot.gca()
axes.set_facecolor("white")
axes.set_clip_on(False)
pyplot.savefig('2week_training_roc.png')
# plot testing ROC
sklearn.metrics.plot_roc_curve(lasso_cv, X_test, y_test)
pyplot.plot([0, 1], [0, 1], color='black', lw=2, linestyle='--')
pyplot.title('2-Week LASSO Testing ROC')
axes = pyplot.gca()
axes.set_facecolor("white")
axes.set_clip_on(False)
pyplot.savefig('2week_testing_roc.png')
# predicted probability
train_pred = lasso_cv.predict_proba(X_train)[:,1]
train_pred_class = lasso_cv.predict(X_train)
test_pred = lasso_cv.predict_proba(X_test)[:,1]
test_pred_class = lasso_cv.predict(X_test)
# Make scorecard
card = sc.scorecard(bins, lasso_cv, X_train.columns)
# credit score
train_score = sc.scorecard_ply(train, card, print_step=0)
test_score = sc.scorecard_ply(test, card, print_step=0)
# psi
pyplot.rcParams["font.size"] = "18"
fig = sc.perf_psi(
score = {'train':train_score, 'test':test_score},
label = {'train':y_train, 'test':y_test},
x_tick_break=50
)
fig['pic']['score'].set_size_inches(18.5, 10.5)
fig['pic']['score'].savefig('2week_dist.png')
card_df = pd.concat(card)
card_df.to_csv('2week_lasso_card_df.csv')
scores_lasso_2week = sc.scorecard_ply(encoded, card, only_total_score=True, print_step=0, replace_blank_na=True)
scores_lasso_2week.to_csv('scores_lasso_2week.csv')
# Training Metrics and AUC CI
print("Training Metrics")
# calculate accuracy
acc = sklearn.metrics.accuracy_score(y_train, train_pred_class)
print('Accuracy: %.3f' % acc)
auc_score = sklearn.metrics.roc_auc_score(y_train, train_pred)
print('AUC: %.3f' % auc_score)
f_score = sklearn.metrics.f1_score(y_train, train_pred_class)
print('FS: %.3f' % f_score)
# delong ci
delong_alpha = 0.95
auc, auc_cov = delong_roc_variance(
np.ravel(y_train),
np.ravel(train_pred))
auc_std = np.sqrt(auc_cov)
lower_upper_q = np.abs(np.array([0, 1]) - (1 - delong_alpha) / 2)
ci = stats.norm.ppf(
lower_upper_q,
loc=auc_score,
scale=auc_std)
ci[ci > 1] = 1
print('AUC COV:', round(auc_cov,2))
print('95% AUC CI:', np.round(ci,2))
# Testing Metrics and AUC CI
print("Testing Metrics")
# calculate accuracy
acc = sklearn.metrics.accuracy_score(y_test, test_pred_class)
print('Accuracy: %.3f' % acc)
auc_score = sklearn.metrics.roc_auc_score(y_test, test_pred)
print('AUC: %.3f' % auc_score)
f_score = sklearn.metrics.f1_score(y_test, test_pred_class)
print('FS: %.3f' % f_score)
# delong ci
delong_alpha = 0.95
auc, auc_cov = delong_roc_variance(
np.ravel(y_test),
np.ravel(test_pred))
auc_std = np.sqrt(auc_cov)
lower_upper_q = np.abs(np.array([0, 1]) - (1 - delong_alpha) / 2)
ci = stats.norm.ppf(
lower_upper_q,
loc=auc_score,
scale=auc_std)
ci[ci > 1] = 1
print('AUC COV:', round(auc_cov,2))
print('95% AUC CI:', np.round(ci,2))