Commit d3b0ca14 authored by Matthew K Defenderfer's avatar Matthew K Defenderfer
Browse files

Major overhaul of script storage. Dividing scripts by group in the overall processing pipeline

parent fc971ad0
function outlabel = avgOverROI(subdir,subj,labelpath,G,max_step,cutoff)
%{
This function a region of interest average over all vertices from a
freesurfer label file. It does this by finding all vertices within
max_steps steps of the current vertex, averages all values assigned to
those vertices that are in the label, and assigns the average back to
that vertex.
For example, averaging thickness values within 5 steps from vertex 10 in V1
for lh.V1-withRet.resid.label would look for all vertices within 5 steps of
vertex 10, average the thickness values at those vertices, and assign the
average back to vertex 10.
The function creates an adjacency matrix using the vertex and face
information from the given subject's ?h.white surface. Calculation of step
distances for each vertex is trivial.
Inputs:
subdir <string>: subjects directory
subj <string>: subject name
surfpath <string>: path to surface to apply the function to
G <graph>: graph object of the adjacency matrix.
max_step <double>: maximum step size to include in the ROI
cutoff <double>: number between 0 and 1. Vertices whose ROIs contained
less than cutoff * number of vertices from median ROI size are
removed. Default 0.
%}
%% General Setup
setenv('SUBJECTS_DIR',subdir)
if isempty(cutoff)
cutoff = 0;
end
%% loop through vertices
% load the surface of interest. create another object to keep track of
% the vertex numbers
parts = strsplit(labelpath,'/label/');
inlabel = read_label(subj,parts{2}(1:end-6));
% initialize the output surface for later as well as a surface for how many
% vertices went into the ROI, given the correct number of
% vertices here
outlabel = inlabel;
% initialize the avg variable. Loop through each vertex, find all
% vertices within max_step, remove vertices that aren't still a part of
% verts, average the remaining values
n = zeros(1,length(inlabel));
for vind = 1:length(inlabel)
d = distances(G,inlabel(vind,1)+1);
roivert = find(d' <= max_step) - 1;
outlabel(vind,5) = mean(inlabel(ismember(inlabel(:,1),roivert),5));
n(vind) = length(roivert);
end
outlabel(n < cutoff*median(n),5) = -999;
end
\ No newline at end of file
function avgThickResid(subdir,subj,labeldir,max_step,cutoff,outdir)
%{
This function controls the avgOverROI function, defining which surfaces
that function will be applied to for a given subject. The graph object for
a given hemisphere will be created here, and then passed into the
avgOverROI function to save time, as that process takes quite a while.
Inputs:
subdir <string>: subjects directory
subj <string>: subject name
labeldir <string>: full path to the directory with the labels
max_step <double>:
cutoff <double>:
outdir <string>: full path to the directory to put the labels in
%}
%% General Setup
% add toolbox path to get createAdjacency function
addpath(genpath('/home/mdefende/Documents/mkd-toolbox'))
% This part is poorly coded and should be changed later
% get path to the surfaces we are applying the avgOverROI function to.
labels = dir(fullfile(labeldir,'*label'));
% sort surfs so all lh files come first. That way we only have to calculate
% the graph object twice, once per hemisphere.
labels = struct2table(labels);
labels = sortrows(labels,'name');
% create the lh graph object if the first file contains lh. keep it until
% we switch to rh files
if contains(labels.name{1},'lh')
[~,faces] = freesurfer_read_surf(fullfile(subdir,subj,'surf','lh.white'));
adj = createAdjacency(faces);
G = graph(adj);
clearvars adj faces
end
isrh = false;
%% Loop over the surfs
for ii = 1:height(labels)
% Create graph object for rh, if we have switch to rh surfaces
if strcmp(labels.name{ii}(1:2),'rh') && ~isrh
clearvars G % clear it first to free up that memory
[~,faces] = freesurfer_read_surf(fullfile(subdir,subj,'surf','rh.white'));
adj = createAdjacency(faces);
G = graph(adj);
clearvars adj faces
isrh = true;
end
% create full path for this surface
labelpath = fullfile(labels.folder{ii},labels.name{ii});
outlabel = avgOverROI(subdir,subj,labelpath,G,max_step,cutoff);
outname = [labels.name{ii}(1:end-6) '-step' num2str(max_step) '.label'];
if ~exist(outdir,'dir')
mkdir(outdir)
end
write_label(outlabel(:,1),outlabel(:,2:4),outlabel(:,5),fullfile(outdir,outname),subj);
end
end
\ No newline at end of file
% this short script cleans the thickness labels for each participant and
% fsaverage in a dataset folder. the current pipeline creates thickness
% files for both V1-withRet and V1-noRet where thickness values are the
% same. this will delete the -noRet files and remove the -withRet tag from
% the others. This will overall save storage space by removing 25% of the
% thickness labels
dataset = 'HCPA';
subdir = ['/data/user/mdefende/datasets/' dataset '/subs'];
if strcmp(dataset,'HCP1200')
T = readtable('/data/user/mdefende/Projects/ctmodel-ml/data/HCP1200/HCP1200-train-test-split.csv');
T(~strcmp(T.group,'test'),:) = [];
else
T = readtable('/data/user/mdefende/Projects/ctmodel-ml/data/HCPA/HCPA-subj-list.csv');
end
if isnumeric(T.Subject)
T.Subject = cellstr(num2str(T.Subject));
end
% do for subjects
for ii = 1:height(T)
withret = dir(fullfile(subdir,T.Subject{ii},'label','MKD_labels','thick*','*V1-withRet*thick*'));
for jj = 1:length(withret)
newname = strrep(withret(jj).name,'-withRet','');
movefile(fullfile(withret(jj).folder,withret(jj).name),fullfile(withret(jj).folder,newname));
end
noret = dir(fullfile(subdir,T.Subject{ii},'label','MKD_labels','thick*','*V1-noRet*thick*'));
for jj = 1:length(noret)
delete(fullfile(noret(jj).folder,noret(jj).name))
end
end
% do for fsaverage
withret = dir(fullfile(subdir,'fsaverage','label','MKD_labels','thick*','*V1-withRet*thick*'));
for jj = 1:length(withret)
newname = strrep(withret(jj).name,'-withRet','');
movefile(fullfile(withret(jj).folder,withret(jj).name),fullfile(withret(jj).folder,newname));
end
noret = dir(fullfile(subdir,'fsaverage','label','MKD_labels','thick*','*V1-noRet*thick*'));
for jj = 1:length(noret)
delete(fullfile(noret(jj).folder,noret(jj).name))
end
\ No newline at end of file
# This script reads in many output cv files for the gbdt creation, collates them
# into a single data frame, and chooses the hyperparameter set for each
# combination of region smoothing kernel that creates the model with the least
# 10-fold cv error. These are then written to 'best-gbdt.csv'
cv.path <- '/data/user/mdefende/Projects/ctmodel-ml/results/HCP1200/gbdt/cv/withAge/out/'
list.files(cv.path, pattern = '*.csv', recursive = TRUE, full.names = TRUE) %>%
map(read_csv) %>%
bind_rows() %>%
select(-X1) %>%
group_by(area,sm) %>%
filter(test_mae == min(test_mae)) %>%
arrange(area,sm) %>%
write_csv(path = file.path(cv.path,'best-gbdt.csv'))
# This script organizes the different prediction files into a single csv, adding
# variables describing the region and smoothing kernel as well as calculating
# the residuals for each predicted data point. All data is in individual space
library(tidyverse)
# dataset string. set a string to be appending to the beginning of the file names to say what dataset the data came from
dataset.str <- 'HCP1200'
age.str <- 'noAge'
# where are the csv files. can be a folder with another group of folders, csv files found recursively
pred.path <- paste0('/data/user/mdefende/Projects/ctmodel-ml/results/HCP1200/gbdt/predict/',age.str,'/out')
files <- list.files(pred.path, pattern = '*.csv', full.names = TRUE, recursive = TRUE)
# custom function to perform reorganization steps on each csv individually
reorg <- function(file.name){
file.name %>%
read_csv() %>% # read in the file as a csv
as_tibble() %>% # transform to tibble
select(-X1) %>% # remove first column
rename_all(~str_remove(.,str_extract(file.name,'sm[0-9]{1,2}') %>% str_remove('sm'))) %>% # remove sm value from column names
mutate(Region = str_extract(file.name,'out/(.*?)/') %>% str_remove_all('out|/'), # extract region name
Sm = str_extract(file.name,'sm[0-9]{1,2}') %>% str_remove('sm'), # extract and store sm value
Resid = Thick-PredThick) %>% # calculate residual
{if ('Age' %in% colnames(.))
select(.,Subject, Age, Region, Sm, Hemi, VertexInd,everything())
else
select(.,Subject, Region, Sm, Hemi, VertexInd, everything())} # reorder columns
}
# perform custom reorganization function on each csv file
df <- files %>%
map(reorg)
# for list elements with BA45 as the region, bind rows for those elements and write to a csv file
df[str_detect(files,'BA45')] %>%
bind_rows() %>%
write_csv(path = file.path(pred.path,paste0(dataset.str,'-BA45-',age.str,'-resid-thick.csv')))
# same for dACC
df[str_detect(files,'dACC')] %>%
bind_rows() %>%
write_csv(path = file.path(pred.path,paste0(dataset.str,'-dACC-',age.str,'-resid-thick.csv')))
# same for V1-noRet
df[str_detect(files,'V1-noRet')] %>%
bind_rows() %>%
write_csv(path = file.path(pred.path,paste0(dataset.str,'-V1-noRet-',age.str,'-resid-thick.csv')))
# same for V1-withRet
df[str_detect(files,'V1-withRet')] %>%
bind_rows() %>%
write_csv(path = file.path(pred.path,paste0(dataset.str,'-V1-withRet-',age.str,'-resid-thick.csv')))
%{
This script takes residual and thickness label files in individual space
that only have data computed for single vertices and performs vertex-wise
averaging to simulate values from larger ROIs centered at each vertex. The
results are then stored in separate label files
%}
dataset = 'HCP1200';
outdir = 'resid-roi/noAge';
indir = 'resid/noAge';
max_step = 5;
cutoff = 0;
subdir = ['/data/user/mdefende/datasets/' dataset '/subs'];
if strcmp(dataset,'HCP1200')
T = readtable('/data/user/mdefende/Projects/ctmodel-ml/data/HCP1200/HCP1200-train-test-split.csv');
T(~strcmp(T.group,'test'),:) = [];
else
T = readtable('/data/user/mdefende/Projects/ctmodel-ml/data/HCPA/HCPA-subj-list.csv');
end
if isnumeric(T.Subject)
T.Subject = cellstr(num2str(T.Subject));
end
scriptdir = ['/data/user/mdefende/datasets/' dataset '/jobs/roi-jobs/' outdir];
if ~exist(scriptdir,'dir')
mkdir(scriptdir)
end
for ii = 1:height(T)
subj = T.Subject{ii};
scriptname = fullfile(scriptdir,[subj '-roi-step' num2str(max_step) '.sh']);
fid = fopen(scriptname,'w');
fprintf(fid,'#!/bin/bash\n');
fprintf(fid,'#SBATCH --partition=short\n');
fprintf(fid,'#SBATCH --time=3:00:00\n');
fprintf(fid,['#SBATCH --job-name=' subj '-roi-step' num2str(max_step) '\n']);
fprintf(fid,'#SBATCH --ntasks=1\n');
fprintf(fid,'#SBATCH --cpus-per-task=1\n');
fprintf(fid,'#SBATCH --mem-per-cpu=40000\n');
fprintf(fid,['#SBATCH -o ' fullfile(scriptdir,[subj '-roi-step' num2str(max_step) '.out\n'])]);
fprintf(fid,'module load rc/matlab/R2018a\n');
jobstring = ['matlab -nodisplay -nosplash -nodesktop -r "' ...
'addpath(genpath(''/data/user/mdefende/Projects/ctmodel-ml/misc-scripts''));'];
jobstring = [jobstring 'avgThickResid(''' subdir ''',''' subj ''','];
labeldir = fullfile(subdir,subj,'label','MKD_labels',indir);
jobstring = [jobstring '''' labeldir ''',' num2str(max_step) ','];
fulloutdir = fullfile(subdir,subj,'label','MKD_labels',outdir);
if ~exist(fulloutdir,'dir')
mkdir(fulloutdir)
end
jobstring = [jobstring num2str(cutoff) ',''' fulloutdir ''');"'];
fprintf(fid,jobstring);
fclose(fid);
end
\ No newline at end of file
dataset = 'HCP1200';
labelpath = 'resid-roi/noAge';
tag = 'resid-roi';
subdir = ['/data/user/mdefende/datasets/' dataset '/subs'];
scriptdir = ['/data/user/mdefende/datasets/' dataset '/jobs/roi-jobs/label2fsaverage/' tag];
if ~exist(scriptdir,'dir')
mkdir(scriptdir)
end
if strcmp(dataset,'HCP1200')
T = readtable('/data/user/mdefende/Projects/ctmodel-ml/data/HCP1200/HCP1200-train-test-split.csv');
T(~strcmp(T.group,'test'),:) = [];
else
T = readtable('/data/user/mdefende/Projects/ctmodel-ml/data/HCPA/HCPA-subj-list.csv');
end
if isnumeric(T.Subject)
T.Subject = cellstr(num2str(T.Subject));
end
for ii = 1:height(T)
subj = T.Subject{ii};
scriptname = fullfile(scriptdir,[subj '-' tag '.sh']);
fid = fopen(scriptname,'w');
fprintf(fid,'#!/bin/bash\n');
fprintf(fid,'#SBATCH --partition=express\n');
fprintf(fid,'#SBATCH --time=30:00\n');
fprintf(fid,['#SBATCH --job-name=' subj '-' tag '\n']);
fprintf(fid,'#SBATCH --ntasks=1\n');
fprintf(fid,'#SBATCH --cpus-per-task=1\n');
fprintf(fid,'#SBATCH --mem-per-cpu=4000\n');
fprintf(fid,['#SBATCH -o ' fullfile(scriptdir,[subj '-' tag '.out\n'])]);
fprintf(fid,'module load rc/matlab/R2018a\n');
jobstring = ['matlab -nodisplay -nosplash -nodesktop -r "' ...
'addpath(genpath(''/data/user/mdefende/Projects/ctmodel-ml/code/data-organization''));'];
jobstring = [jobstring 'label2fsaverage(''' subdir ''',''' subj ''',''' labelpath ''');"'];
fprintf(fid,jobstring);
fclose(fid);
end
\ No newline at end of file
%% Set basic parameters
region = {'BA45','dACC','V1-withRet','V1-noRet'};
sm = {'0','2','5','10'};
dataset = 'HCP1200';
ageStr = 'withAge';
thickOrResid = 'both';
scriptdir = ['/data/user/mdefende/datasets/' dataset '/jobs/residToLabel/' ageStr];
subdir = ['/data/user/mdefende/datasets/' dataset '/subs'];
if ~exist(scriptdir,'dir')
mkdir(scriptdir)
end
if strcmp(dataset,'HCP1200')
T = readtable('/data/user/mdefende/Projects/ctmodel-ml/data/HCP1200/HCP1200-train-test-split.csv');
T(~strcmp(T.group, 'test'),:) = [];
else
T = readtable('/data/user/mdefende/Projects/ctmodel-ml/data/HCPA/HCPA-subj-list.csv');
end
if isnumeric(T.Subject)
T.Subject = num2str(T.Subject);
elseif iscell(T.Subject)
T.Subject = cell2mat(T.Subject);
end
for ii = 1:height(T)
scriptname = fullfile(scriptdir,[T.Subject(ii,:) '-' ageStr '-residToLabel.sh']);
fid = fopen(scriptname,'w');
fprintf(fid,'#!/bin/bash\n');
fprintf(fid,'#SBATCH --partition=express\n');
fprintf(fid,'#SBATCH --time=30:00\n');
fprintf(fid,['#SBATCH --job-name=' T.Subject(ii,:) '-' ageStr '-residToLabel\n']);
fprintf(fid,'#SBATCH --ntasks=1\n');
fprintf(fid,'#SBATCH --cpus-per-task=1\n');
fprintf(fid,'#SBATCH --mem-per-cpu=8000\n');
fprintf(fid,['#SBATCH -o ' fullfile(scriptdir,[T.Subject(ii,:) '-' ageStr '-residToLabel.out\n'])]);
fprintf(fid,'module load rc/matlab/R2018a\n');
% jobstring begins with basic matlab call, and adding the path to the
% misc scripts directory
jobstring = ['matlab -nodisplay -nosplash -nodesktop -r "' ...
'addpath(genpath(''/data/user/mdefende/Projects/ctmodel-ml/code/data-organization''));'];
% add function call with subdir, subject, and dataset variables
jobstring = [jobstring ' residToLabel(''' subdir ''',''' T.Subject(ii,:) ''',''' dataset ''''];
% add withAge or noAge to jobstring
jobstring = [jobstring ',''' ageStr ''''];
% add each region to jobstring
jobstring = [jobstring ',{'];
for jj = 1:length(region)
if jj ~= length(region)
jobstring = [jobstring '''' region{jj} ''','];
else
jobstring = [jobstring '''' region{jj} '''}'];
end
end
% add sm to jobstring
jobstring = [jobstring ',{'];
for jj = 1:length(sm)
if jj ~= length(sm)
jobstring = [jobstring '''' sm{jj} ''','];
else
jobstring = [jobstring '''' sm{jj} '''}'];
end
end
% add thick or resid to jobstring
jobstring = [jobstring ',''' thickOrResid ''');"\n'];
fprintf(fid,jobstring);
fclose(fid);
end
\ No newline at end of file
function label2fsaverage(subdir,subj,labelpath)
setenv('SUBJECTS_DIR',subdir)
labels = dir(fullfile(subdir,subj,'label','MKD_labels',labelpath,'*.label'));
outdir = fullfile(subdir,'fsaverage','label','MKD_labels',labelpath);
if ~exist(outdir,'dir')
mkdir(outdir)
end
for ii = 1:length(labels)
if contains(labels(ii).name,'lh')
hemi = 'lh';
else
hemi = 'rh';
end
jobstring = ['mri_label2label --srclabel ' fullfile(labels(ii).folder,labels(ii).name) ' --srcsubject ' subj ...
' --trglabel ' fullfile(outdir,labels(ii).name) ' --trgsubject fsaverage --hemi ' hemi ' --regmethod surface'];
system(jobstring)
end
end
\ No newline at end of file
# This script reads in labels containing residuals and thickness values for each
# subject in fsaverage space, restructures the dataframes, and collates them all
# together.
library(freesurfer)
library(magrittr)
library(tidyverse)
# restructuring function
df.setup <- function(label){
label %<>%
read_fs_label() %>%
rename(r = 'r_coord',
a = 'a_coord',
s = 's_coord',
vertex = 'vertex_num') %>%
mutate(sm = str_extract(label,'sm[0-9]{1,2}') %>% str_remove('sm'),
region = str_extract(label,'h\\.(.*?)\\.') %>% str_remove_all('h\\.|\\.'),
hemi = str_extract(label,'[lr]h'),
subject = str_extract(label,'\\.HCA[0-9]*-|\\.[0-9]{6}-') %>% str_remove_all('\\.|-'),
step = ifelse(str_detect(label,'step'),
str_extract(label,'step[0-9]{1,2}') %>% str_remove('step'),
'0')) %>%
select(subject, region, sm, step, hemi, vertex:value)
}
# Say which dataset we are collating. Either HCPA or HCP1200
dataset <- 'HCP1200'
# Get residual label names, read them in, do restructuring, and collate them
labels <- c(list.files(paste0('/data/user/mdefende/datasets/',dataset,'/subs/fsaverage/label/MKD_labels/resid/noAge'),pattern = '*label', full.names = TRUE),
list.files(paste0('/data/user/mdefende/datasets/',dataset,'/subs/fsaverage/label/MKD_labels/resid-roi/noAge'),pattern = '*label', full.names = TRUE))
fs.resid <- labels %>%
map(df.setup) %>%
bind_rows() %>%
rename(resid = 'value') %>%
mutate_at(vars(sm,step,vertex,resid), as.numeric)
# Do the same for thickness values
labels <- c(list.files(paste0('/data/user/mdefende/datasets/',dataset,'/subs/fsaverage/label/MKD_labels/thick'),pattern = '*label', full.names = TRUE),
list.files(paste0('/data/user/mdefende/datasets/',dataset,'/subs/fsaverage/label/MKD_labels/thick-roi'),pattern = '*label', full.names = TRUE))
fs.thick <- labels %>%
map(df.setup) %>%
bind_rows() %>%
rename(thick = 'value') %>%
mutate_at(vars(sm,step,vertex), as.numeric)
fs.thick %<>%
select(-region,-r,-a,-s)
# left join thickness values to residuals, matching on subject, sm, step, hemi, and vertex
fs <- left_join(fs.resid,fs.thick, by = c('subject','sm','step','hemi','vertex'))
write_csv(fs,path = paste0('/data/user/mdefende/Projects/ctmodel-ml/data/',dataset,'/', dataset,'-fsaverage-resid-thick.csv'))
function residToLabel(subdir,subj,dataset,ageStr,region,sm,thickOrResid)
setenv('SUBJECTS_DIR',subdir)
lhcortex = read_label(subj, 'lh.cortex');
rhcortex = read_label(subj, 'rh.cortex');
for jj = 1:length(region)
csv = readtable(['/data/user/mdefende/Projects/ctmodel-ml/results/' dataset '/gbdt/predict/' ageStr '/out/' dataset '-' region{jj} '-' ageStr '-resid-thick.csv']);
if isnumeric(csv.Subject)
csv.Subject = cellstr(num2str(csv.Subject));
end
csv(~strcmp(csv.Subject,subj),:) = [];
for kk = 1:length(sm)
lhind = strcmp([csv.Hemi],'LH') & csv.Sm == str2double(sm{kk});
rhind = strcmp([csv.Hemi],'RH') & csv.Sm == str2double(sm{kk});
lhvert = csv.VertexInd(lhind) - 1;
rhvert = csv.VertexInd(rhind) - 1;
lhresid = csv.Resid(lhind);
rhresid = csv.Resid(rhind);
lhthick = csv.Thick(lhind);
rhthick = csv.Thick(rhind);
if strcmp(thickOrResid,'thick')
makelabel(subdir,subj,sm{kk},region{jj},lhvert,rhvert,lhthick,rhthick,lhcortex,rhcortex,ageStr,'thick');
elseif strcmp(thickOrResid,'resid')
makelabel(subdir,subj,sm{kk},region{jj},lhvert,rhvert,lhresid,rhresid,lhcortex,rhcortex,ageStr,'resid');
elseif strcmp(thickOrResid,'both')
makelabel(subdir,subj,sm{kk},region{jj},lhvert,rhvert,lhthick,rhthick,lhcortex,rhcortex,ageStr,'thick');
makelabel(subdir,subj,sm{kk},region{jj},lhvert,rhvert,lhresid,rhresid,lhcortex,rhcortex,ageStr,'resid');
end
end
end
end
function makelabel(subdir,subj,sm,region,lhvert,rhvert,lhval,rhval,lhcortex,rhcortex,ageStr,tag)
lhlabel = [lhcortex(ismember(lhcortex(:,1),lhvert),1:4),lhval];
rhlabel = [rhcortex(ismember(rhcortex(:,1),rhvert),1:4),rhval];
labelbase = [subj '-sm' sm '-' tag];