Alignment built, needed to revise preprocessing

4f59bb0d · Ryan Godwin · 0c82090d · 4f59bb0d · 4f59bb0d · 4f59bb0d
Commit 4f59bb0d authored 2 years ago by Ryan Godwin
--- a/src/config/args_ecg_preproc.json
+++ b/src/config/args_ecg_preproc.json
 {
-    "sampling_freq": 1000
+    "sampling_freq": 1000,
+    "sampling_freq_units": "Hz",
+    "intermediate_filename": "clean_ecg.csv",
+    "downsample_freq":500
 }
\ No newline at end of file
--- a/src/config/args_meg_preproc.json
+++ b/src/config/args_meg_preproc.json
@@ -4,5 +4,7 @@
    "notch_filter_freq": 60,
    "bandpass_lowfreq_cutoff": 0.5,
    "bandpass_highfreq_cutoff": 100,
-    "downsample_freq":500
+    "sampling_freq": 1200,
+    "downsample_freq":500,
+    "intermediate_filename":"heartifact.csv"
 }
\ No newline at end of file
--- a/src/config/config.py
+++ b/src/config/config.py
@@ -20,8 +20,17 @@ RESULTS_DIR = Path(DATA_DIR, "results")
 MEG_DATA_DIR = Path(RAW_DATA, "MEG_Data")
 # A Folder containing the RAW ECG data from biopack (.ACQ files converted to text)
 ECG_DATA_DIR = Path(RAW_DATA, "BioPac_Data", "TechDevScans")
+#The TF model for classifying MEG artifacts
 MODEL_FILE = Path(BASE_DIR, "model", "MEGnet_final_model.h5")

+SIGNAL_TUPS = [
+              tuple(("MEGandECG_DWG-Rest_20210816_01", "Subject1")),
+              tuple(("MEGandECG_DWG-Rest_20210816_02", "Subject2")),
+              tuple(("MEGandECG_DWG-Rest_20210817_02", "Subject3")),
+              tuple(("MEGandECG_DWG-Rest_20210817_03", "Subject4")),
+              tuple(("MEGandECG_DWG-Rest_20210817_04", "Subject5")),
+]
+
 ICA_HEADERS = [
    "Time",
    "IC1",

--- a/src/hrvmeg/compare.py
+++ b/src/hrvmeg/compare.py
+from pathlib import Path 
 import matplotlib.pyplot as plt
 import mpld3
 import numpy as np
 import pandas as pd
-import scipy.signal as signal
+from scipy.signal import correlate, correlation_lags
+import plotter

 from hrvmeg import data


 class CompareMEGICAtoECG:
-    def __init__(self, output_sub_dir, input_ecg, input_cardiac_ic):
-        """
-        The function takes in the subject_id, subject_path, output_sub_dir, and ICA_value and assigns
-        them to the class.
-
-        Args:
-        output_sub_dir: The directory where the output files will be saved.
-        subject_id: The subject ID
-        subject_path: The path to the subject's folder
-        ICA_value: The cardiac independent component from the ICA.
-        """
-        self.subject_path = subject_path
-        self.subject_id = subject_id
+    def __init__(self,
+                 input_ecg,
+                 ecg_name,
+                 input_cardiac_ic,
+                 meg_name,
+                 output_sub_dir
+                 ):
+        print('output directory - ', output_sub_dir)
        self.output_sub_dir = output_sub_dir
+        print('ecg input - ', input_ecg)
+        self.ecg_data = input_ecg
+        self.ecg_dir = ecg_name
+        print('ic input - ', input_cardiac_ic)
+        self.ica_data = input_cardiac_ic
+        self.meg_dir = Path(self.output_sub_dir,meg_name)
+        Path(self.meg_dir).mkdir(parents=True, exist_ok=True)
+        
+    def time_align(self):
+        #plot unaligned
+        plotter.plot_signals_comp(self.ica_data,
+                                  self.ecg_data,
+                                  Path(self.meg_dir, 'pre-align-both.png')
+                                )

-    def get_ica_rr(self):
-        ica_rr = data.GetRPeaks(
-            output_sub_dir=self.output_sub_dir,
-            subject_id=self.subject_id,
-            subject_path=self.subject_path,
-        )
+        correlation = correlate(self.ica_data, 
+                                self.ecg_data, 
+                                mode='full', 
+                                method='fft'
+                                )
+        
+        
+        print("ECG - ", self.ecg_data.shape)
+        print("MEG - ", self.ica_data.shape)
+        lags = correlation_lags(self.ica_data.size, self.ecg_data.size, mode = 'full')
+        lag = lags[np.argmax(correlation)]
+        print('Signal lag = ', lag)
+        
+    # def get_ica_rr(self):
+    #     ica_rr = data.GetRPeaks(
+    #         output_sub_dir=self.output_sub_dir,
+    #         subject_id=self.subject_id,
+    #         subject_path=self.subject_path,
+    #     )
--- a/src/hrvmeg/main.py
+++ b/src/hrvmeg/main.py
+from time import time
 import warnings
 from pathlib import Path
 from typing import Dict

+import numpy as np
 import matplotlib.pyplot as plt
 import mlflow
-import mpld3
 import pandas as pd
 import typer
-from sklearn.preprocessing import StandardScaler
+

 from config import config
-from config.config import INTERMEDIATE_DIR, logger
-from hrvmeg import compare, meg_to_ica, predict
-from hrvmeg.plotter import PlotCardiacIC
+from config.config import logger
+from hrvmeg import compare, meg_to_ica, predict, preprocess_ecg, plotter
 from py_utils import config_utils, get_data, print_info

 # Experiment Name
@@ -24,14 +24,14 @@ warnings.filterwarnings("ignore")


 @app.command()
-def extract_ic_from_meg(
+def preprocess_megs(
    args_fp: str = "config/args_meg_preproc.json",
    experiment_name: str = experiment_name,
    run_name: str = "extract_ica_from_meg",
    test_run: bool = False,
 ) -> None:
    """
-    `extract_ic_from_meg` takes in a `scan_file` (a .ds file), and outputs a .csv file with the ICA
+    `preprocess_megs` takes in a `scan_file` (a .ds file), and outputs a .csv file with the ICA
    components

    :param args_fp: str = "config/args.json", defaults to config/args.json
@@ -136,13 +136,13 @@ def label_cardiac_ic(
            IC_col = str(df.columns[1])

            output_png = Path(config.INTERMEDIATE_DIR, fp.parent.name, "Heartifact.png")
-            plot_cardiac_ic = PlotCardiacIC(
+            plot_cardiac_ic = plotter.PlotCardiacIC(
                output_file=output_png.absolute(), x_values=df[time_col], y_values=df[IC_col]
            )
            plot_out = plot_cardiac_ic()
            
            mlflow.set_experiment(experiment_name)
-            with mlflow.start_run(run_name=run_name) as run:
+            with mlflow.start_run(run_name=run_name):
                run_id = mlflow.active_run().info.run_id
                mlflow.log_params(args)
                mlflow.log_artifact(plot_out)
@@ -153,30 +153,40 @@ def label_cardiac_ic(
 @app.command()
 def preprocess_ecgs(
    experiment_name: str = experiment_name,
+    args_ecg: str = "config/args_ecg_preproc.json",
    run_name: str = "preprocess ECG",
    test_run: bool = True,
 ):
+    """
+    Find the ECG text files under `config.ECG_DATA_DIR`, preprocess each one, and save the result as `clean_ecg.csv` in a per-scan folder under `config.INTERMEDIATE_DIR`.
+    
+    Args:
+      experiment_name (str): The name of the experiment. This is used to create a folder in the
+    `config.RESULTS_DIR` directory.
+      args_ecg (str): Path to the JSON file of ECG preprocessing arguments. Defaults to "config/args_ecg_preproc.json"
+      run_name (str): The name of the run. This is used to create a folder in the experiment directory.
+    Defaults to preprocess ECG
+      test_run (bool): bool = True. Defaults to True
+    """
    ecg_scan_folders = get_data.get_folders(config.ECG_DATA_DIR, ".txt")
-
-    # TODO: move this to a "get_files" in py_utils
    ecg_to_proc = get_data.get_files(ecg_scan_folders, "*.txt")
-
+    args = dict(config_utils.load_dict(filepath=args_ecg))
+    fs = args["sampling_freq"]
    for file in ecg_to_proc:
        fp = Path(file)
-        output_file = Path(config.INTERMEDIATE_DIR, fp.parent.name, "clean_ecg.csv")
-
-        ecg_dat = pd.read_csv(fp)
-
-        scaler = StandardScaler()
-        scaled_ecg = scaler.fit_transform(ecg_dat)
-
-        print(scaled_ecg)
+        output_folder = Path(config.INTERMEDIATE_DIR, fp.parent.name)
+        output_file = Path(output_folder, "clean_ecg.csv")
        
+        Path(output_folder).mkdir(parents=True, exist_ok=True)
+        data_to_write= preprocess_ecg.preproc_ecg(fp, fs, args["downsample_freq"],output_folder)
+        np.savetxt(output_file, data_to_write, delimiter=',')


 @app.command()
-def compare(
+def compare_signals(
    args_rr: str = "config/args_rr.json",
+    args_meg: str = "config/args_meg_preproc.json",
+    args_ecg: str = "config/args_ecg_preproc.json",
    experiment_name: str = "compare ECG + MEG",
    run_name: str = "comparison 1",
    test_run: bool = True,
@@ -193,30 +203,48 @@ def compare(
    directory. Defaults to comparison 1
      test_run (bool): bool = True. Defaults to True
    """
+    args_rr = dict(config_utils.load_dict(filepath=args_rr))
+    args_meg = dict(config_utils.load_dict(filepath=args_meg))
+    args_ecg = dict(config_utils.load_dict(filepath=args_ecg))

-    meg_scan_folders = get_data.get_folders(config.ECG_DATA_DIR, ".csv")
-    meg_to_proc = []
-
-    for file in meg_scan_folders:
-        text_file_generators = file.glob("heartifact.csv")
-        for text_file in text_file_generators:
-            meg_to_proc.append(text_file)
+    meg_scan_folders = get_data.get_folders(config.INTERMEDIATE_DIR, args_meg["intermediate_filename"])
+    meg_to_proc = get_data.get_files(meg_scan_folders,args_meg["intermediate_filename"])

    meg_dfs = {}
    for meg_scan in meg_to_proc:
-        meg_dfs[meg_scan] = pd.read_csv(meg_scan)
-
-    # #Now can use ecg_to_proc to index ecg_dfs
-    # if len(ecg_dfs)==len(meg_dfs):
-    #     for i in range(len(ecg_dfs)):
-    #         #do stuff with both meg and ecg data
-    #         #preproc ecg
-    #         pass
-
-    # else:
-    #     raise (IndexError)
-
-    args = dict(config_utils.load_dict(filepath=args_rr))
+        meg_dfs[Path(meg_scan).parent.name] = pd.read_csv(meg_scan)
+        
+    
+    ecg_scan_folders = get_data.get_folders(config.INTERMEDIATE_DIR, args_ecg["intermediate_filename"])
+    ecg_to_proc = get_data.get_files(ecg_scan_folders,args_ecg["intermediate_filename"])
+
+    ecg_dfs = {}
+    for ecg_scan in ecg_to_proc:
+        ecg_dfs[Path(ecg_scan).parent.name] = pd.read_csv(ecg_scan)
+
+    
+    for idx, value in enumerate(config.SIGNAL_TUPS):
+        meg_name = value[0]
+        ecg_name = value[1]
+        meg_data = meg_dfs[meg_name]
+        ecg_data = ecg_dfs[ecg_name]
+        print(meg_data.shape)
+        print(ecg_data.shape)
+        comp = compare.CompareMEGICAtoECG(ecg_data,
+                                          ecg_name,
+                                          meg_data,
+                                          meg_name,
+                                          config.RESULTS_DIR
+                                          )
+        #Do all the direct comparison between the MEG and ECG here
+        
+        comp.time_align()
+        #print('Correlation - ', corr_out)
+        
+ 
+    print(args_rr)
+    print(args_meg)
+    print(args_ecg)


 def load_artifacts(run_id: str = None) -> Dict:

--- a/src/hrvmeg/plotter.py
+++ b/src/hrvmeg/plotter.py
@@ -3,59 +3,48 @@ from pathlib import Path
 import matplotlib.pyplot as plt
 import mpld3

-
-class Plotter(object):
-    def __init__(self, output_file: Path(), x_values, y_values, fig_size=(12, 5)):
-        self._x = x_values
-        self._y = y_values
-        self._output_file = output_file
-        self._fig_size = fig_size
-        plt.close()
-
-
-    def __call__(self):
-        pass
-
-    def make_html(self, fig):
-        # save figure as HTML file
-        html_str = mpld3.fig_to_html(fig)
-        output_html = self._output_file.rename(self._output_file.with_suffix(".html"))
-        HTML_file = open(output_html, "w", encoding="utf8")
-        HTML_file.write(html_str)
-        HTML_file.close()
-        return output_html
-
-
-class PlotCardiacIC(Plotter):
-    def __init__(self, output_file, x_values, y_values, fig_size=(12, 5)):
-        super().__init__(output_file, x_values, y_values, fig_size)
-
-    def __call__(self):
-        plt.plot(self._x, self._y, linewidth=0.3)
-        plt.xlabel("Time (s)")
-        plt.ylabel("Cardiac Component")
-        plt.title("Selected Cardiac Component")
-      
-        plt.savefig(Path(self._output_file), dpi=330)
-        
-        #TODO- pull off the IC # and add to title
-        print('outfile - ', Path(self._output_file))  
-        #output_html = super().make_html(plt.gcf())
-        return self._output_file
-
-
-class PlotECG(Plotter):
-    def __init__(self, output_file, x_values, y_values, fig_size=(12, 5)):
-        super().__init__(output_file, x_values, y_values, fig_size)
-
-    def __call__(self):
-        plt.subplots(1, 1, figsize=(12, 5))
-        plt.plot(self._x, self._y, linewidth=0.2)
-        plt.xlabel("Time (s)")
-        plt.ylabel("ECG Signal")
-        plt.title("Scaled Signal")
-        plt.savefig(str(self._output_file), dpi=330)
-               
-        output_html = super().make_html(plt.gcf())
-        return output_html
-
+def make_html(fig):
+    # save figure as HTML file
+    html_str = mpld3.fig_to_html(fig)
+    output_html = self._output_file.rename(self._output_file.with_suffix(".html"))
+    HTML_file = open(output_html, "w", encoding="utf8")
+    HTML_file.write(html_str)
+    HTML_file.close()
+
+
+def plot_signals_comp(meg_values, ecg_values, output_file):
+    fig, ax = plt.subplot(211)
+    
+    print('Plotting Cardiac IC...')
+    fig.plot(meg_values.iloc[:,0],meg_values.iloc[:,1], linewidth=0.2)
+    ax.xlabel("Time (s)")
+    ax.ylabel("Cardiac Component")
+    ax.title("Selected Cardiac Component")
+    fig.savefig(Path(output_file), dpi=330)
+
+    # output_html = super().make_html(plt.gcf())
+    
+    fig.subplot(212)
+    print('Plotting ECG...')
+    fig.plot(ecg_values.iloc[:,0],ecg_values.iloc[:,1], linewidth=0.2)
+    ax.xlabel("Time (s)")
+    ax.ylabel("ECG Signal")
+    ax.title("Scaled ECG Signal")
+    fig.savefig(str(output_file), dpi=330)
+    print("outfile - ", Path(output_file))
+
+    make_html(fig)
+    plt.close()
+    
+def plot_ecg(ecg_values, output_file):
+
+    print('Plotting ECG...')
+    plt.plot(ecg_values.iloc[:,0],ecg_values.iloc[:,1], linewidth=0.2)
+    plt.xlabel("Time (s)")
+    plt.ylabel("ECG Signal")
+    plt.title("ECG Signal")
+    plt.savefig(str(output_file), dpi=330)
+    print("outfile - ", Path(output_file))
+
+    make_html(plt.gcf())
+    plt.close()
\ No newline at end of file
--- a/src/hrvmeg/predict.py
+++ b/src/hrvmeg/predict.py
 import numpy as np
 import pandas as pd
-import tensorflow_addons as tfa
 from tensorflow import keras

 from config import config
@@ -25,7 +24,7 @@ class label_ICA_components:

    The outputs are saved by numpy in a text file, that is easliy human readable and can be loaded using 
    np.loadtxt('/path/to/ICA_component_lables.txt')
-
+    
    example usage:
    python Label_ICA_Components.py --input_path example_data/ICA202DDisc \\
        --output_dir example_data/ICA202DDisc --output_type list

--- a/src/hrvmeg/preprocess_ecg.py
+++ b/src/hrvmeg/preprocess_ecg.py
+from email.mime import base
+from scipy import signal
+from pathlib import Path
+from sklearn.preprocessing import StandardScaler
+import pandas as pd
+import numpy as np
+from py_utils import baseline_wander_removal
+import plotter
+
+def preproc_ecg(file_path, fs, downsample_freq, output_folder):   
+    ecg_data = pd.read_csv(file_path)
+    #Determining the time domain of the raw signal from the length of data and sampling freq
+    time_s = np.arange(0, len(ecg_data)/fs, 1/fs)    
+    
+    #spot checking
+    output_file = Path(output_folder, 'raw_data.png')
+    plotter.plot_ecg(ecg_data,output_file)
+    
+    #Baseline Drift Removal
+    baseline, ecg_out = baseline_wander_removal.bwr(ecg_data)
+    #spot check 2
+    output_file = Path(output_folder, 'bwr_data.png')
+    plotter.plot_ecg(ecg_out,output_file)
+    #Perform a standard scaling of the data ((x - mu) / sigma)
+    scaler = StandardScaler()
+    scaled_ecg = scaler.fit_transform(ecg_out)
+    #spot check 3
+    output_file = Path(output_folder, 'scaled_data.png')
+    plotter.plot_ecg(ecg_data,scaled_ecg)
+    #Need to downsample to match the MEG data
+    secs = len(scaled_ecg)/fs
+    sampls = secs*downsample_freq
+    print('int - ', int(sampls)) 
+    scaled_down_ecg = signal.resample(scaled_ecg, int(sampls), t = time_s)
+    #spot check 4
+    output_file = Path(output_folder, 'scaled_downsampled_data.png')
+    plotter.plot_ecg(ecg_data,scaled_down_ecg)
+    #Combining time data with signal data
+    print(scaled_down_ecg[0])
+    scaled_down_ecg_wtime = np.insert(scaled_down_ecg[0], 0, scaled_down_ecg[1], axis=1)
+
+    return scaled_down_ecg_wtime
+    
\ No newline at end of file
--- a/py_utils @ a8c6aebd
+++ b/py_utils @ a8c6aebd
-Subproject commit b550045afcb48f6a72cbd7ea8cd28353ba0996ff
+Subproject commit a8c6aebd5cd5e0ac79c58ee9d978bc7c64e093f9