Commit d045e387 authored by Matthew K Defenderfer's avatar Matthew K Defenderfer
Browse files

Initial Commit????

parents
# ignore the models. They are very large and don't need to be sent up. They can be recreated through code on the user end if necessary
models/
# ignore any and all job files and outputs. these include cross-validation, train-test prediction, among others. Just extra bloat that can be recreated later on
cv-jobs/
cv-out/
train-test-predict-jobs/
# ignore the original data. this can be transferred via some other method than github. also prevents HCP restricted information from being uploaded
data/
# ctmodel-ml
This project is dedicated to using various linear and nonlinear machine learning algorithms to create a model of cortical thickness as a function of various anatomic and retinotopic properties in early visual cortex.
\ No newline at end of file
name: ctmodel-ml
channels:
- conda-forge
- anaconda
- defaults
dependencies:
- _tflow_1100_select=0.0.1=gpu
- absl-py=0.4.1=py36_0
- astor=0.7.1=py36_0
- bleach=1.5.0=py36_0
- cudatoolkit=9.0=h13b8566_0
- cudnn=7.1.2=cuda9.0_0
- cupti=9.0.176=0
- gast=0.2.0=py36_0
- grpcio=1.12.1=py36hdbcaa40_0
- hdf5=1.10.2=hba1933b_1
- html5lib=0.9999999=py36_0
- libprotobuf=3.6.0=hdbcaa40_0
- markdown=2.6.11=py36_0
- pyyaml=3.13=py36h14c3975_0
- termcolor=1.1.0=py36_1
- werkzeug=0.14.1=py36_0
- yaml=0.1.7=h96e3832_1
- nibabel=2.3.0=pyh24bf2e0_1
- pydicom=1.1.0=py_0
- appdirs=1.4.3=py36h28b3542_0
- asn1crypto=0.24.0=py36_0
- attrs=18.2.0=py36h28b3542_0
- automat=0.7.0=py36_0
- backcall=0.1.0=py36_0
- blas=1.0=mkl
- ca-certificates=2018.03.07=0
- certifi=2018.10.15=py36_0
- cffi=1.11.5=py36he75722e_1
- constantly=15.1.0=py36h28b3542_0
- cryptography=2.3.1=py36hc365091_0
- cycler=0.10.0=py36_0
- cython=0.28.5=py36hf484d3e_0
- dbus=1.13.2=h714fa37_1
- decorator=4.3.0=py36_0
- entrypoints=0.2.3=py36_2
- expat=2.2.6=he6710b0_0
- fontconfig=2.13.0=h9420a91_0
- freetype=2.9.1=h8a8886c_1
- future=0.16.0=py36_0
- glib=2.56.2=hd408876_0
- gmp=6.1.2=h6c8ec71_1
- gst-plugins-base=1.14.0=hbbd80ab_1
- gstreamer=1.14.0=hb453b48_1
- h5py=2.8.0=py36h989c5e5_3
- hyperlink=18.0.0=py36_0
- icu=58.2=h9c2bf20_1
- idna=2.7=py36_0
- incremental=17.5.0=py36_0
- intel-openmp=2019.0=118
- ipykernel=4.9.0=py36_0
- ipython=6.5.0=py36_0
- ipython_genutils=0.2.0=py36_0
- jedi=0.12.1=py36_0
- jinja2=2.10=py36_0
- jpeg=9b=h024ee3a_2
- jsonschema=2.6.0=py36_0
- jupyter_client=5.2.3=py36_0
- jupyter_core=4.4.0=py36_0
- keras-applications=1.0.4=py36_1
- keras-base=2.2.2=py36_0
- keras-preprocessing=1.0.2=py36_1
- kiwisolver=1.0.1=py36hf484d3e_0
- libedit=3.1.20170329=h6b74fdf_2
- libffi=3.2.1=hd88cf55_4
- libgcc-ng=8.2.0=hdf63c60_1
- libgfortran-ng=7.3.0=hdf63c60_0
- libpng=1.6.34=hb9fc6fc_0
- libsodium=1.0.16=h1bed415_0
- libstdcxx-ng=8.2.0=hdf63c60_1
- libuuid=1.0.3=h1bed415_2
- libxcb=1.13=h1bed415_1
- libxml2=2.9.8=h26e45fe_1
- markupsafe=1.0=py36h14c3975_1
- matplotlib=2.2.3=py36hb69df0a_0
- mistune=0.8.3=py36h14c3975_1
- mkl=2019.0=118
- mkl_fft=1.0.4=py36h4414c95_1
- mkl_random=1.0.1=py36h4414c95_1
- nbconvert=5.3.1=py36_0
- nbformat=4.4.0=py36_0
- ncurses=6.1=hf484d3e_0
- notebook=5.6.0=py36_0
- numpy=1.14.5=py36h1b885b7_4
- numpy-base=1.14.5=py36hdbf6ddf_4
- openssl=1.0.2p=h14c3975_0
- pandas=0.23.4=py36h04863e7_0
- pandoc=2.2.3.2=0
- pandocfilters=1.4.2=py36_1
- parso=0.3.1=py36_0
- patsy=0.5.0=py36_0
- pcre=8.42=h439df22_0
- pexpect=4.6.0=py36_0
- pickleshare=0.7.4=py36_0
- pip=10.0.1=py36_0
- prometheus_client=0.3.1=py36h28b3542_0
- prompt_toolkit=1.0.15=py36_0
- protobuf=3.6.0=py36hf484d3e_0
- ptyprocess=0.6.0=py36_0
- pyasn1=0.4.4=py36h28b3542_0
- pyasn1-modules=0.2.2=py36_0
- pycparser=2.18=py36_1
- pydot=1.2.4=py36_0
- pygments=2.2.0=py36_0
- pyopenssl=18.0.0=py36_0
- pyparsing=2.2.0=py36_1
- pyqt=5.9.2=py36h05f1152_2
- python=3.6.6=hc3d631a_0
- python-dateutil=2.7.3=py36_0
- pytz=2018.5=py36_0
- pyzmq=17.1.2=py36h14c3975_0
- qt=5.9.6=h8703b6f_2
- readline=7.0=h7b6447c_5
- scikit-learn=0.20.0=py36h4989274_1
- scipy=1.1.0=py36hd20e5f9_0
- seaborn=0.9.0=py36_0
- send2trash=1.5.0=py36_0
- service_identity=17.0.0=py36h28b3542_0
- setuptools=39.1.0=py36_0
- simplegeneric=0.8.1=py36_2
- sip=4.19.8=py36hf484d3e_0
- six=1.11.0=py36_1
- sqlite=3.24.0=h84994c4_0
- statsmodels=0.9.0=py36h035aef0_0
- tensorboard=1.10.0=py36hf484d3e_0
- tensorflow=1.10.0=gpu_py36h97a2126_0
- tensorflow-base=1.10.0=gpu_py36h6ecc378_0
- tensorflow-gpu=1.10.0=hf154084_0
- terminado=0.8.1=py36_1
- testpath=0.3.1=py36_0
- tk=8.6.8=hbc83047_0
- tornado=5.1=py36h14c3975_0
- traitlets=4.3.2=py36_0
- twisted=18.7.0=py36h14c3975_1
- wcwidth=0.1.7=py36_0
- webencodings=0.5.1=py36_1
- wheel=0.31.1=py36_0
- xz=5.2.4=h14c3975_4
- zeromq=4.2.5=hf484d3e_1
- zlib=1.2.11=ha838bed_2
- zope=1.0=py36_1
- zope.interface=4.5.0=py36h14c3975_0
- pip:
- blinker==1.4
- chardet==3.0.4
- cloudpickle==0.5.6
- configparser==3.5.0
- dask==0.19.1
- kaggle==1.4.7.1
- keras==2.2.2
- keras-vis==0.4.1
- networkx==2.2rc1
- packaging==17.1
- pillow==5.2.0
- pyhamcrest==1.9.0
- python-slugify==1.2.6
- pywavelets==1.0.0
- requests==2.19.1
- scikit-image==0.14.0
- simpleitk==1.1.0
- toolz==0.9.0
- tqdm==4.26.0
- unidecode==1.0.22
- urllib3==1.22
- xgboost==0.90
%% Cell type:markdown id: tags:
# Library and Data Setup
This section sets up the libraries used to import, manipulate, and model the data with linear models from sklearn. It also defines a function, calculate_vif_, that will be used to find and remove any predictor variables that are considered collinear. A VIF cutoff of 10 will be used for feature removal. Models will be created for each of the different smoothing levels (0, 2, 5, and 10 mm kernels). For all models, unsmoothed Sulc values will be used.
%% Cell type:code id: tags:
``` python
# math and models
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from patsy import dmatrices
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model
from sklearn import metrics
from sklearn.model_selection import cross_validate
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.externals import joblib
```
%% Cell type:code id: tags:
``` python
# Define a function to repeatedly calculate variance inflation factor and drop
def calculate_vif_(df_x, df_y, thresh=10.0):
    """Iteratively remove collinear predictors by variance inflation factor.

    Repeatedly fits the design matrix for ``response ~ p1 + p2 + ...`` and
    drops the predictor with the largest VIF until every remaining VIF is
    at or below ``thresh``.

    Parameters
    ----------
    df_x : pandas.DataFrame
        Predictor columns.
    df_y : pandas.DataFrame
        Single-column response; its column name is used in the patsy formula.
    thresh : float, optional
        VIF cutoff above which the worst predictor is dropped (default 10.0).
        BUG FIX: the original body re-assigned ``thresh = 10`` inside the
        loop, silently overriding this argument; the parameter is now honored.

    Returns
    -------
    pandas.DataFrame
        ``df_x`` restricted to the predictors that survived the pruning.
    """
    dropped = True
    while dropped:
        # get a list of the remaining predictor names
        variables = list(df_x.columns)
        # concatenate the column names into a patsy formula RHS: 'p1+p2+...'
        features = '+'.join(variables)
        # re-concatenate the dataframes for creating the design matrices
        df_tmp = pd.concat([df_x, df_y], axis=1)
        y, X = dmatrices(df_y.columns[0] + ' ~ ' + features, df_tmp, return_type='dataframe')
        # VIF for every design-matrix column; drop index 0 (the intercept)
        vif = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
        vif = vif[1:]
        maxloc = vif.index(max(vif))
        if max(vif) > thresh:
            print('dropping \'' + variables[maxloc] +
                  '\' at index: ' + str(maxloc) + ' with VIF:' + str(vif[maxloc]))
            df_x = df_x.drop(variables[maxloc], axis=1)
        else:
            dropped = False
    print('Remaining variables:')
    print(df_x.columns)
    return df_x
```
%% Cell type:code id: tags:
``` python
# Analysis options for the rest of the script: cortical region to model
# and the smoothing-kernel label (mm), both as strings.
region, sm = 'V1-withRet', '0'
```
%% Cell type:code id: tags:
``` python
# Load the cleaned thickness dataset. Any region label containing 'V1'
# (e.g. 'V1-withRet') shares the single V1 file; other regions use their
# own '<region>_Thick_cleaned.csv'.
df = pd.read_csv('/data/user/mdefende/Projects/ctmodel-ml/data/'
                 + ('V1' if 'V1' in region else region)
                 + '_Thick_cleaned.csv')
```
%% Cell type:code id: tags:
``` python
# Build the predictor table for this smoothing kernel: start from every
# column whose name contains sm, then discard thickness (the response) and
# sulcal-depth columns.
df_pred = df.filter(like=sm)
for unwanted in ('Thick', 'Sulc'):
    df_pred = df_pred.drop(columns=list(df_pred.filter(like=unwanted)))
# filter(like='0') also matches the 10 mm columns, so strip those when sm is '0'
if sm == '0':
    df_pred = df_pred.drop(columns=list(df_pred.filter(regex='10')))
```
%% Cell type:code id: tags:
``` python
# Assemble final predictors and response. Retinotopic ('withRet') models add
# eccentricity and polar angle; unsmoothed sulcal depth (Sulc0) and age are
# included for every model.
df_x = pd.concat(
    [df[['Age', 'Ecc', 'Pol', 'Sulc0'] if 'withRet' in region else ['Age', 'Sulc0']],
     df_pred],
    axis=1)
df_y = df[['Thick' + sm]]
del df_pred  # no longer needed; free the intermediate table
```
%% Cell type:code id: tags:
``` python
# The automated VIF pruning is disabled; 'MidArea0' is dropped directly
# instead — presumably it was previously flagged as collinear (confirm
# against a calculate_vif_ run before changing).
# df_x = calculate_vif_(df_x, df_y)
df_x = df_x.drop(columns='MidArea0')
```
%% Cell type:code id: tags:
``` python
# Expand the predictors into a full interaction design matrix. With
# interaction_only=True and degree equal to the number of predictors, the
# transform emits the intercept, every predictor, and every product of
# distinct predictors (no squared terms).
interaction = PolynomialFeatures(
    degree=len(df_x.columns),
    interaction_only=True,
)
X = interaction.fit_transform(df_x.values)
```
%% Cell type:code id: tags:
``` python
# inspect the design-matrix dimensions (observations, interaction columns)
X.shape
```
%% Output
(6787085, 512)
%% Cell type:code id: tags:
``` python
# Configure an ordinary least-squares model: fit an intercept, normalize the
# predictors before fitting, and use every available core. The actual fit is
# run in the next cell.
reg = linear_model.LinearRegression(
    fit_intercept=True,
    normalize=True,
    n_jobs=-1,
)
# reg.fit(X, df_y)
```
%% Cell type:code id: tags:
``` python
# fit the linear model on the full interaction design matrix
reg.fit(X,df_y)
```
%% Output
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=-1, normalize=True)
%% Cell type:code id: tags:
``` python
# 5-fold cross-validation of the model, scoring MSE and MAE on both the
# train and test folds (sklearn reports them negated); one worker per fold.
scores = cross_validate(
    reg,
    X,
    y=df_y,
    cv=5,
    scoring=('neg_mean_squared_error', 'neg_mean_absolute_error'),
    return_train_score=True,
    verbose=True,
    n_jobs=5,
)
```
%% Output
[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done 2 out of 5 | elapsed: 21.7s remaining: 32.5s
[Parallel(n_jobs=5)]: Done 5 out of 5 | elapsed: 26.6s finished
%% Cell type:code id: tags:
``` python
formula = df_y.columns[0] + ' ~ ' + '*'.join(df_x.columns)
# first, we want to change the scores from a dict to dataframe and drop the time variables
scores_df = pd.DataFrame.from_dict(scores)
scores_df = scores_df.drop(columns = ['fit_time','score_time'])
# second, we should rename the columns to something more manageable
scores_df = scores_df.rename(columns = {'test_neg_mean_squared_error':'test_mse',
'train_neg_mean_squared_error':'train_mse',