diff --git a/.gitignore b/.gitignore index 0628429..f684eec 100755 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,5 @@ GuPPy/runFiberPhotometryAnalysis.ipynb .clinerules/ testing_data/ + +CLAUDE.md diff --git a/src/guppy/analysis/__init__.py b/src/guppy/analysis/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/guppy/analysis/artifact_removal.py b/src/guppy/analysis/artifact_removal.py new file mode 100644 index 0000000..d3da042 --- /dev/null +++ b/src/guppy/analysis/artifact_removal.py @@ -0,0 +1,222 @@ +import logging + +import numpy as np + +logger = logging.getLogger(__name__) + + +def remove_artifacts( + timeForLightsTurnOn, + storesList, + pair_name_to_tsNew, + pair_name_to_sampling_rate, + pair_name_to_coords, + name_to_data, + compound_name_to_ttl_timestamps, + method, +): + if method == "concatenate": + name_to_corrected_data, pair_name_to_corrected_timestamps, compound_name_to_corrected_ttl_timestamps = ( + processTimestampsForArtifacts( + timeForLightsTurnOn, + storesList, + pair_name_to_tsNew, + pair_name_to_sampling_rate, + pair_name_to_coords, + name_to_data, + compound_name_to_ttl_timestamps, + ) + ) + logger.info("Artifacts removed using concatenate method.") + elif method == "replace with NaN": + name_to_corrected_data, compound_name_to_corrected_ttl_timestamps = addingNaNtoChunksWithArtifacts( + storesList, + pair_name_to_tsNew, + pair_name_to_coords, + name_to_data, + compound_name_to_ttl_timestamps, + ) + pair_name_to_corrected_timestamps = None + logger.info("Artifacts removed using NaN replacement method.") + else: + logger.error("Invalid artifact removal method specified.") + raise ValueError("Invalid artifact removal method specified.") + + return name_to_corrected_data, pair_name_to_corrected_timestamps, compound_name_to_corrected_ttl_timestamps + + +def addingNaNtoChunksWithArtifacts( + storesList, pair_name_to_tsNew, pair_name_to_coords, name_to_data, compound_name_to_ttl_timestamps +): + logger.debug("Replacing chunks with artifacts by NaN values.") + names_for_storenames = storesList[1, :] + pair_names = pair_name_to_tsNew.keys() + + name_to_corrected_data = {} + compound_name_to_corrected_ttl_timestamps = {} + for pair_name in pair_names: + tsNew = pair_name_to_tsNew[pair_name] + coords = pair_name_to_coords[pair_name] + for i in range(len(names_for_storenames)): + if ( + "control_" + pair_name.lower() in names_for_storenames[i].lower() + or "signal_" + pair_name.lower() in names_for_storenames[i].lower() + ): # changes done + data = name_to_data[names_for_storenames[i]].reshape(-1) + data = addingNaNValues(data=data, ts=tsNew, coords=coords) + name_to_corrected_data[names_for_storenames[i]] = data + else: + if "control" in names_for_storenames[i].lower() or "signal" in names_for_storenames[i].lower(): + continue + ttl_name = names_for_storenames[i] + compound_name = ttl_name + "_" + pair_name + ts = compound_name_to_ttl_timestamps[compound_name].reshape(-1) + ts = removeTTLs(ts=ts, coords=coords) + compound_name_to_corrected_ttl_timestamps[compound_name] = ts + logger.info("Chunks with artifacts are replaced by NaN values.") + + return name_to_corrected_data, compound_name_to_corrected_ttl_timestamps + + +# main function to align timestamps for control, signal and event timestamps for artifacts removal +def processTimestampsForArtifacts( + timeForLightsTurnOn, + storesList, + pair_name_to_tsNew, + pair_name_to_sampling_rate, + pair_name_to_coords, + name_to_data, + compound_name_to_ttl_timestamps, +): + logger.debug("Processing timestamps 
to get rid of artifacts using concatenate method...") + names_for_storenames = storesList[1, :] + pair_names = pair_name_to_tsNew.keys() + + name_to_corrected_data = {} + pair_name_to_corrected_timestamps = {} + compound_name_to_corrected_ttl_timestamps = {} + for pair_name in pair_names: + sampling_rate = pair_name_to_sampling_rate[pair_name] + tsNew = pair_name_to_tsNew[pair_name] + coords = pair_name_to_coords[pair_name] + + for i in range(len(names_for_storenames)): + if ( + "control_" + pair_name.lower() in names_for_storenames[i].lower() + or "signal_" + pair_name.lower() in names_for_storenames[i].lower() + ): # changes done + data = name_to_data[names_for_storenames[i]] + data, timestampNew = eliminateData( + data=data, + ts=tsNew, + coords=coords, + timeForLightsTurnOn=timeForLightsTurnOn, + sampling_rate=sampling_rate, + ) + name_to_corrected_data[names_for_storenames[i]] = data + pair_name_to_corrected_timestamps[pair_name] = timestampNew + else: + if "control" in names_for_storenames[i].lower() or "signal" in names_for_storenames[i].lower(): + continue + compound_name = names_for_storenames[i] + "_" + pair_name + ts = compound_name_to_ttl_timestamps[compound_name] + ts = eliminateTs( + ts=ts, + tsNew=tsNew, + coords=coords, + timeForLightsTurnOn=timeForLightsTurnOn, + sampling_rate=sampling_rate, + ) + compound_name_to_corrected_ttl_timestamps[compound_name] = ts + + logger.info("Timestamps processed, artifacts are removed and good chunks are concatenated.") + + return ( + name_to_corrected_data, + pair_name_to_corrected_timestamps, + compound_name_to_corrected_ttl_timestamps, + ) + + +# helper function to process control and signal timestamps +def eliminateData(*, data, ts, coords, timeForLightsTurnOn, sampling_rate): + + if (data == 0).all() == True: + data = np.zeros(ts.shape[0]) + + arr = np.array([]) + ts_arr = np.array([]) + for i in range(coords.shape[0]): + + index = np.where((ts > coords[i, 0]) & (ts < coords[i, 1]))[0] + + if len(arr) == 0: + arr = np.concatenate((arr, data[index])) + sub = ts[index][0] - timeForLightsTurnOn + new_ts = ts[index] - sub + ts_arr = np.concatenate((ts_arr, new_ts)) + else: + temp = data[index] + # new = temp + (arr[-1]-temp[0]) + temp_ts = ts[index] + new_ts = temp_ts - (temp_ts[0] - ts_arr[-1]) + arr = np.concatenate((arr, temp)) + ts_arr = np.concatenate((ts_arr, new_ts + (1 / sampling_rate))) + + # logger.info(arr.shape, ts_arr.shape) + return arr, ts_arr + + +# helper function to align event timestamps with the control and signal timestamps +def eliminateTs(*, ts, tsNew, coords, timeForLightsTurnOn, sampling_rate): + + ts_arr = np.array([]) + tsNew_arr = np.array([]) + for i in range(coords.shape[0]): + tsNew_index = np.where((tsNew > coords[i, 0]) & (tsNew < coords[i, 1]))[0] + ts_index = np.where((ts > coords[i, 0]) & (ts < coords[i, 1]))[0] + + if len(tsNew_arr) == 0: + sub = tsNew[tsNew_index][0] - timeForLightsTurnOn + tsNew_arr = np.concatenate((tsNew_arr, tsNew[tsNew_index] - sub)) + ts_arr = np.concatenate((ts_arr, ts[ts_index] - sub)) + else: + temp_tsNew = tsNew[tsNew_index] + temp_ts = ts[ts_index] + new_ts = temp_ts - (temp_tsNew[0] - tsNew_arr[-1]) + new_tsNew = temp_tsNew - (temp_tsNew[0] - tsNew_arr[-1]) + tsNew_arr = np.concatenate((tsNew_arr, new_tsNew + (1 / sampling_rate))) + ts_arr = np.concatenate((ts_arr, new_ts + (1 / sampling_rate))) + + return ts_arr + + +# adding nan values to removed chunks +# when using artifacts removal method - replace with NaN +def addingNaNValues(*, data, ts, coords): + + if (data == 
0).all() == True: + data = np.zeros(ts.shape[0]) + + arr = np.array([]) + ts_index = np.arange(ts.shape[0]) + for i in range(coords.shape[0]): + + index = np.where((ts > coords[i, 0]) & (ts < coords[i, 1]))[0] + arr = np.concatenate((arr, index)) + + nan_indices = list(set(ts_index).symmetric_difference(arr)) + data[nan_indices] = np.nan + + return data + + +# remove event TTLs which falls in the removed chunks +# when using artifacts removal method - replace with NaN +def removeTTLs(*, ts, coords): + ts_arr = np.array([]) + for i in range(coords.shape[0]): + ts_index = np.where((ts > coords[i, 0]) & (ts < coords[i, 1]))[0] + ts_arr = np.concatenate((ts_arr, ts[ts_index])) + + return ts_arr diff --git a/src/guppy/analysis/combine_data.py b/src/guppy/analysis/combine_data.py new file mode 100644 index 0000000..6ccddc0 --- /dev/null +++ b/src/guppy/analysis/combine_data.py @@ -0,0 +1,128 @@ +import logging +import os + +import numpy as np + +from .io_utils import ( + decide_naming_convention, + read_hdf5, + write_hdf5, +) + +logger = logging.getLogger(__name__) + + +def eliminateData(filepath_to_timestamps, filepath_to_data, timeForLightsTurnOn, sampling_rate): + + arr = np.array([]) + ts_arr = np.array([]) + filepaths = list(filepath_to_timestamps.keys()) + for filepath in filepaths: + ts = filepath_to_timestamps[filepath] + data = filepath_to_data[filepath] + + if len(arr) == 0: + arr = np.concatenate((arr, data)) + sub = ts[0] - timeForLightsTurnOn + new_ts = ts - sub + ts_arr = np.concatenate((ts_arr, new_ts)) + else: + temp = data + temp_ts = ts + new_ts = temp_ts - (temp_ts[0] - ts_arr[-1]) + arr = np.concatenate((arr, temp)) + ts_arr = np.concatenate((ts_arr, new_ts + (1 / sampling_rate))) + + return arr, ts_arr + + +def eliminateTs(filepath_to_timestamps, filepath_to_ttl_timestamps, timeForLightsTurnOn, sampling_rate): + + ts_arr = np.array([]) + tsNew_arr = np.array([]) + filepaths = list(filepath_to_timestamps.keys()) + for filepath in filepaths: + ts = filepath_to_timestamps[filepath] + tsNew = filepath_to_ttl_timestamps[filepath] + if len(tsNew_arr) == 0: + sub = tsNew[0] - timeForLightsTurnOn + tsNew_arr = np.concatenate((tsNew_arr, tsNew - sub)) + ts_arr = np.concatenate((ts_arr, ts - sub)) + else: + temp_tsNew = tsNew + temp_ts = ts + new_ts = temp_ts - (temp_tsNew[0] - tsNew_arr[-1]) + new_tsNew = temp_tsNew - (temp_tsNew[0] - tsNew_arr[-1]) + tsNew_arr = np.concatenate((tsNew_arr, new_tsNew + (1 / sampling_rate))) + ts_arr = np.concatenate((ts_arr, new_ts + (1 / sampling_rate))) + + # logger.info(event) + # logger.info(ts_arr) + return ts_arr + + +def combine_data(filepath: list[list[str]], timeForLightsTurnOn, names_for_storenames, sampling_rate): + # filepath = [[folder1_output_0, folder2_output_0], [folder1_output_1, folder2_output_1], ...] + + logger.debug("Processing timestamps for combining data...") + + names_for_storenames = names_for_storenames[1, :] + + for single_output_filepaths in filepath: + # single_output_filepaths = [folder1_output_i, folder2_output_i, ...] 
+ + path = decide_naming_convention(single_output_filepaths[0]) + + pair_name_to_tsNew = {} + for j in range(path.shape[1]): + name_1 = ((os.path.basename(path[0, j])).split(".")[0]).split("_")[-1] + name_2 = ((os.path.basename(path[1, j])).split(".")[0]).split("_")[-1] + if name_1 != name_2: + logger.error("Error in naming convention of files or Error in storesList file") + raise Exception("Error in naming convention of files or Error in storesList file") + pair_name = name_1 + + for i in range(len(names_for_storenames)): + if ( + "control_" + pair_name.lower() in names_for_storenames[i].lower() + or "signal_" + pair_name.lower() in names_for_storenames[i].lower() + ): + filepath_to_timestamps = {} + filepath_to_data = {} + for filepath in single_output_filepaths: + ts = read_hdf5("timeCorrection_" + pair_name, filepath, "timestampNew") + data = read_hdf5(names_for_storenames[i], filepath, "data").reshape(-1) + filepath_to_timestamps[filepath] = ts + filepath_to_data[filepath] = data + + data, timestampNew = eliminateData( + filepath_to_timestamps, + filepath_to_data, + timeForLightsTurnOn, + sampling_rate, + ) + write_hdf5(data, names_for_storenames[i], single_output_filepaths[0], "data") + pair_name_to_tsNew[pair_name] = timestampNew + else: + if "control" in names_for_storenames[i].lower() or "signal" in names_for_storenames[i].lower(): + continue + filepath_to_timestamps = {} + filepath_to_ttl_timestamps = {} + for filepath in single_output_filepaths: + tsNew = read_hdf5("timeCorrection_" + pair_name, filepath, "timestampNew") + if os.path.exists(os.path.join(filepath, names_for_storenames[i] + "_" + pair_name + ".hdf5")): + ts = read_hdf5(names_for_storenames[i] + "_" + pair_name, filepath, "ts").reshape(-1) + else: + ts = np.array([]) + filepath_to_timestamps[filepath] = tsNew + filepath_to_ttl_timestamps[filepath] = ts + + ts = eliminateTs( + filepath_to_timestamps, + filepath_to_ttl_timestamps, + timeForLightsTurnOn, + sampling_rate, + ) + write_hdf5(ts, names_for_storenames[i] + "_" + pair_name, single_output_filepaths[0], "ts") + for pair_name, tsNew in pair_name_to_tsNew.items(): + write_hdf5(tsNew, "timeCorrection_" + pair_name, single_output_filepaths[0], "timestampNew") diff --git a/src/guppy/analysis/control_channel.py b/src/guppy/analysis/control_channel.py new file mode 100644 index 0000000..605bd17 --- /dev/null +++ b/src/guppy/analysis/control_channel.py @@ -0,0 +1,122 @@ +import logging +import os +import shutil + +import numpy as np +import pandas as pd +from scipy import signal as ss +from scipy.optimize import curve_fit + +from .io_utils import ( + read_hdf5, + write_hdf5, +) + +logger = logging.getLogger(__name__) + + +# This function just creates placeholder Control-HDF5 files that are then immediately overwritten later on in the pipeline. +# TODO: Refactor this function to avoid unnecessary file creation. 
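+# NOTE (illustrative): `arr` below is the two-row storesList array; row 0 holds the raw
+# store names and row 1 the user-assigned names ("control_<region>", "signal_<region>",
+# or a TTL/event name). Example layout (store and region names here are made up):
+#     [["Dv1A",        "Dv2A",       "PrtN"        ],
+#      ["control_dms", "signal_dms", "RewardedPort"]]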
+# function to add a control channel when there is no isosbestic
+# control channel present, and to update the storesList file
+def add_control_channel(filepath, arr):
+
+    storenames = arr[0, :]
+    storesList = np.char.lower(arr[1, :])
+
+    keep_control = np.array([])
+    # check the case where an isosbestic control channel is already present
+    for i in range(storesList.shape[0]):
+        if "control" in storesList[i].lower():
+            name = storesList[i].split("_")[-1]
+            new_str = "signal_" + str(name).lower()
+            find_signal = [True for i in storesList if i == new_str]
+            if len(find_signal) > 1:
+                logger.error("Error in naming convention of files or Error in storesList file")
+                raise Exception("Error in naming convention of files or Error in storesList file")
+            if len(find_signal) == 0:
+                logger.error(
+                    "Isosbestic control channel parameter is set to False and still "
+                    "the storesList file shows there is a control channel present"
+                )
+                raise Exception(
+                    "Isosbestic control channel parameter is set to False and still "
+                    "the storesList file shows there is a control channel present"
+                )
+        else:
+            continue
+
+    for i in range(storesList.shape[0]):
+        if "signal" in storesList[i].lower():
+            name = storesList[i].split("_")[-1]
+            new_str = "control_" + str(name).lower()
+            find_signal = [True for i in storesList if i == new_str]
+            if len(find_signal) == 0:
+                src, dst = os.path.join(filepath, arr[0, i] + ".hdf5"), os.path.join(
+                    filepath, "cntrl" + str(i) + ".hdf5"
+                )
+                shutil.copyfile(src, dst)
+                arr = np.concatenate((arr, [["cntrl" + str(i)], ["control_" + str(arr[1, i].split("_")[-1])]]), axis=1)
+
+    np.savetxt(os.path.join(filepath, "storesList.csv"), arr, delimiter=",", fmt="%s")
+
+    return arr
+
+
+# main function to create a control channel from the
+# signal channel and save it to a file
+def create_control_channel(filepath, arr, window=5001):
+
+    storenames = arr[0, :]
+    storesList = arr[1, :]
+
+    for i in range(storesList.shape[0]):
+        event_name, event = storesList[i], storenames[i]
+        if "control" in event_name.lower() and "cntrl" in event.lower():
+            logger.debug("Creating control channel from signal channel using curve-fitting")
+            name = event_name.split("_")[-1]
+            signal = read_hdf5("signal_" + name, filepath, "data")
+            timestampNew = read_hdf5("timeCorrection_" + name, filepath, "timestampNew")
+            sampling_rate = np.full(timestampNew.shape, np.nan)
+            sampling_rate[0] = read_hdf5("timeCorrection_" + name, filepath, "sampling_rate")[0]
+
+            control = helper_create_control_channel(signal, timestampNew, window)
+
+            write_hdf5(control, event_name, filepath, "data")
+            d = {"timestamps": timestampNew, "data": control, "sampling_rate": sampling_rate}
+            df = pd.DataFrame(d)
+            df.to_csv(os.path.join(os.path.dirname(filepath), event.lower() + ".csv"), index=False)
+            logger.info("Control channel from signal channel created using curve-fitting")
+
+
+# TODO: figure out why a control channel is created for both timestamp correction and z-score steps.
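The helper below fits the smoothed signal to an exponential decay and uses the fitted curve as a stand-in control. A minimal, self-contained sketch of that idea (all names and constants here are illustrative, not part of the module):

import numpy as np
from scipy.optimize import curve_fit


def expo(t, a, b, c):
    return a + b * np.exp(-(1 / c) * t)


t = np.linspace(0, 600, 6001)  # ~10 minutes of made-up timestamps
signal = 2 + 5 * np.exp(-t / 120) + 0.1 * np.random.randn(t.size)  # decaying "bleaching" trace
popt, _ = curve_fit(expo, t, signal, p0=[5, 50, 60])  # same initial guess as curveFitFn below
synthetic_control = expo(t, *popt)  # smooth baseline used in place of an isosbestic control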
+# helper function to create a control channel from the signal channel
+# by curve fitting the signal channel to an exponential function,
+# when no isosbestic control channel is present
+def helper_create_control_channel(signal, timestamps, window):
+    # if the window is longer than the signal, shrink it to roughly half the
+    # signal length and make it odd, as required by savgol_filter
+    if window > signal.shape[0]:
+        window = int((signal.shape[0] + 1) / 2) + 1
+        if window % 2 == 0:
+            window = window + 1
+
+    filtered_signal = ss.savgol_filter(signal, window_length=window, polyorder=3)
+
+    p0 = [5, 50, 60]
+
+    try:
+        popt, pcov = curve_fit(curveFitFn, timestamps, filtered_signal, p0)
+    except Exception as e:
+        logger.error(str(e))
+        # re-raise: without a successful fit there is no control channel to return
+        raise
+
+    # logger.info('Curve Fit Parameters : ', popt)
+    control = curveFitFn(timestamps, *popt)
+
+    return control
+
+
+# curve fit exponential function
+def curveFitFn(x, a, b, c):
+    return a + (b * np.exp(-(1 / c) * x))
diff --git a/src/guppy/analysis/io_utils.py b/src/guppy/analysis/io_utils.py
new file mode 100644
index 0000000..b467c37
--- /dev/null
+++ b/src/guppy/analysis/io_utils.py
@@ -0,0 +1,196 @@
+import fnmatch
+import glob
+import logging
+import os
+import re
+
+import h5py
+import numpy as np
+
+logger = logging.getLogger(__name__)
+
+
+def takeOnlyDirs(paths):
+    removePaths = []
+    for p in paths:
+        if os.path.isfile(p):
+            removePaths.append(p)
+    return list(set(paths) - set(removePaths))
+
+
+# find files while ignoring case sensitivity
+def find_files(path, glob_path, ignore_case=False):
+    rule = (
+        re.compile(fnmatch.translate(glob_path), re.IGNORECASE)
+        if ignore_case
+        else re.compile(fnmatch.translate(glob_path))
+    )
+
+    no_bytes_path = os.listdir(os.path.expanduser(path))
+    str_path = []
+
+    # convert byte entries to strings; plain strings are kept as-is
+    for x in no_bytes_path:
+        try:
+            str_path.append(x.decode("utf-8"))
+        except (AttributeError, UnicodeDecodeError):
+            str_path.append(x)
+    return [os.path.join(path, n) for n in str_path if rule.match(n)]
+
+
+# check if dealing with TDT files or csv files
+def check_TDT(filepath):
+    path = glob.glob(os.path.join(filepath, "*.tsq"))
+    if len(path) > 0:
+        return True
+    else:
+        return False
+
+
+# function to read hdf5 file
+def read_hdf5(event, filepath, key):
+    if event:
+        event = event.replace("\\", "_")
+        event = event.replace("/", "_")
+        op = os.path.join(filepath, event + ".hdf5")
+    else:
+        op = filepath
+
+    if os.path.exists(op):
+        with h5py.File(op, "r") as f:
+            arr = np.asarray(f[key])
+    else:
+        logger.error(f"{event}.hdf5 file does not exist")
+        raise Exception("{}.hdf5 file does not exist".format(event))
+
+    return arr
+
+
+# function to write hdf5 file
+def write_hdf5(data, event, filepath, key):
+    event = event.replace("\\", "_")
+    event = event.replace("/", "_")
+    op = os.path.join(filepath, event + ".hdf5")
+
+    # if file does not exist create a new file
+    if not os.path.exists(op):
+        with h5py.File(op, "w") as f:
+            if type(data) is np.ndarray:
+                f.create_dataset(key, data=data, maxshape=(None,), chunks=True)
+            else:
+                f.create_dataset(key, data=data)
+
+    # if file already exists, append data to it or add a new key to it
+    else:
+        with h5py.File(op, "r+") as f:
+            if key in list(f.keys()):
+                if type(data) is np.ndarray:
+                    f[key].resize(data.shape)
+                    arr = f[key]
+                    arr[:] = data
+                else:
+                    # overwrite the existing dataset in place
+                    f[key][()] = data
+            else:
+                if type(data) is np.ndarray:
+                    f.create_dataset(key, data=data, maxshape=(None,), chunks=True)
+                else:
+                    f.create_dataset(key, data=data)
+
+
+# function to check if the naming convention for saving storeslist file was followed or not
+def decide_naming_convention(filepath):
+    path_1 = 
find_files(filepath, "control_*", ignore_case=True) # glob.glob(os.path.join(filepath, 'control*')) + + path_2 = find_files(filepath, "signal_*", ignore_case=True) # glob.glob(os.path.join(filepath, 'signal*')) + + path = sorted(path_1 + path_2, key=str.casefold) + if len(path) % 2 != 0: + logger.error("There are not equal number of Control and Signal data") + raise Exception("There are not equal number of Control and Signal data") + + path = np.asarray(path).reshape(2, -1) + + return path + + +# function to read coordinates file which was saved by selecting chunks for artifacts removal +def fetchCoords(filepath, naming, data): + + path = os.path.join(filepath, "coordsForPreProcessing_" + naming + ".npy") + + if not os.path.exists(path): + coords = np.array([0, data[-1]]) + else: + coords = np.load(os.path.join(filepath, "coordsForPreProcessing_" + naming + ".npy"))[:, 0] + + if coords.shape[0] % 2 != 0: + logger.error("Number of values in coordsForPreProcessing file is not even.") + raise Exception("Number of values in coordsForPreProcessing file is not even.") + + coords = coords.reshape(-1, 2) + + return coords + + +def get_coords(filepath, name, tsNew, removeArtifacts): # TODO: Make less redundant with fetchCoords + if removeArtifacts == True: + coords = fetchCoords(filepath, name, tsNew) + else: + dt = tsNew[1] - tsNew[0] + coords = np.array([[tsNew[0] - dt, tsNew[-1] + dt]]) + return coords + + +def get_all_stores_for_combining_data(folderNames): + op = [] + for i in range(100): + temp = [] + match = r"[\s\S]*" + "_output_" + str(i) + for j in folderNames: + temp.append(re.findall(match, j)) + temp = sorted(list(np.concatenate(temp).flatten()), key=str.casefold) + if len(temp) > 0: + op.append(temp) + + return op + + +# for combining data, reading storeslist file from both data and create a new storeslist array +def check_storeslistfile(folderNames): + storesList = np.array([[], []]) + for i in range(len(folderNames)): + filepath = folderNames[i] + storesListPath = takeOnlyDirs(glob.glob(os.path.join(filepath, "*_output_*"))) + for j in range(len(storesListPath)): + filepath = storesListPath[j] + storesList = np.concatenate( + ( + storesList, + np.genfromtxt(os.path.join(filepath, "storesList.csv"), dtype="str", delimiter=",").reshape(2, -1), + ), + axis=1, + ) + + storesList = np.unique(storesList, axis=1) + + return storesList + + +def get_control_and_signal_channel_names(storesList): + storenames = storesList[0, :] + names_for_storenames = storesList[1, :] + + channels_arr = [] + for i in range(names_for_storenames.shape[0]): + if "control" in names_for_storenames[i].lower() or "signal" in names_for_storenames[i].lower(): + channels_arr.append(names_for_storenames[i]) + + channels_arr = sorted(channels_arr, key=str.casefold) + try: + channels_arr = np.asarray(channels_arr).reshape(2, -1) + except: + logger.error("Error in saving stores list file or spelling mistake for control or signal") + raise Exception("Error in saving stores list file or spelling mistake for control or signal") + + return channels_arr diff --git a/src/guppy/analysis/standard_io.py b/src/guppy/analysis/standard_io.py new file mode 100644 index 0000000..e7fe8e0 --- /dev/null +++ b/src/guppy/analysis/standard_io.py @@ -0,0 +1,210 @@ +import logging +import os + +import numpy as np + +from .io_utils import ( + decide_naming_convention, + fetchCoords, + get_control_and_signal_channel_names, + read_hdf5, + write_hdf5, +) + +logger = logging.getLogger(__name__) + + +def read_control_and_signal(filepath, storesList): 
+ channels_arr = get_control_and_signal_channel_names(storesList) + storenames = storesList[0, :] + names_for_storenames = storesList[1, :] + + name_to_data = {} + name_to_timestamps = {} + name_to_sampling_rate = {} + name_to_npoints = {} + + for i in range(channels_arr.shape[1]): + control_name = channels_arr[0, i] + signal_name = channels_arr[1, i] + idx_c = np.where(names_for_storenames == control_name)[0] + idx_s = np.where(names_for_storenames == signal_name)[0] + control_storename = storenames[idx_c[0]] + signal_storename = storenames[idx_s[0]] + + control_data = read_hdf5(control_storename, filepath, "data") + signal_data = read_hdf5(signal_storename, filepath, "data") + control_timestamps = read_hdf5(control_storename, filepath, "timestamps") + signal_timestamps = read_hdf5(signal_storename, filepath, "timestamps") + control_sampling_rate = read_hdf5(control_storename, filepath, "sampling_rate") + signal_sampling_rate = read_hdf5(signal_storename, filepath, "sampling_rate") + try: # TODO: define npoints for csv datasets + control_npoints = read_hdf5(control_storename, filepath, "npoints") + signal_npoints = read_hdf5(signal_storename, filepath, "npoints") + except KeyError: # npoints is not defined for csv datasets + control_npoints = None + signal_npoints = None + + name_to_data[control_name] = control_data + name_to_data[signal_name] = signal_data + name_to_timestamps[control_name] = control_timestamps + name_to_timestamps[signal_name] = signal_timestamps + name_to_sampling_rate[control_name] = control_sampling_rate + name_to_sampling_rate[signal_name] = signal_sampling_rate + name_to_npoints[control_name] = control_npoints + name_to_npoints[signal_name] = signal_npoints + + return name_to_data, name_to_timestamps, name_to_sampling_rate, name_to_npoints + + +def read_ttl(filepath, storesList): + channels_arr = get_control_and_signal_channel_names(storesList) + storenames = storesList[0, :] + names_for_storenames = storesList[1, :] + + name_to_timestamps = {} + for storename, name in zip(storenames, names_for_storenames): + if name in channels_arr: + continue + timestamps = read_hdf5(storename, filepath, "timestamps") + name_to_timestamps[name] = timestamps + + return name_to_timestamps + + +def write_corrected_timestamps( + filepath, corrected_name_to_timestamps, name_to_timestamps, name_to_sampling_rate, name_to_correctionIndex +): + for name, correctionIndex in name_to_correctionIndex.items(): + timestamps = name_to_timestamps[name] + corrected_timestamps = corrected_name_to_timestamps[name] + sampling_rate = name_to_sampling_rate[name] + if sampling_rate.shape == (): # numpy scalar + sampling_rate = np.asarray([sampling_rate]) + name_1 = name.split("_")[-1] + write_hdf5(np.asarray([timestamps[0]]), "timeCorrection_" + name_1, filepath, "timeRecStart") + write_hdf5(corrected_timestamps, "timeCorrection_" + name_1, filepath, "timestampNew") + write_hdf5(correctionIndex, "timeCorrection_" + name_1, filepath, "correctionIndex") + write_hdf5(sampling_rate, "timeCorrection_" + name_1, filepath, "sampling_rate") + + +def write_corrected_data(filepath, name_to_corrected_data): + for name, data in name_to_corrected_data.items(): + write_hdf5(data, name, filepath, "data") + + +def write_corrected_ttl_timestamps( + filepath, + compound_name_to_corrected_ttl_timestamps, +): + logger.debug("Applying correction of timestamps to the data and event timestamps") + for compound_name, corrected_ttl_timestamps in compound_name_to_corrected_ttl_timestamps.items(): + 
write_hdf5(corrected_ttl_timestamps, compound_name, filepath, "ts") + logger.info("Timestamps corrections applied to the data and event timestamps.") + + +def read_corrected_data(control_path, signal_path, filepath, name): + control = read_hdf5("", control_path, "data").reshape(-1) + signal = read_hdf5("", signal_path, "data").reshape(-1) + tsNew = read_hdf5("timeCorrection_" + name, filepath, "timestampNew") + + return control, signal, tsNew + + +def write_zscore(filepath, name, z_score, dff, control_fit, temp_control_arr): + write_hdf5(z_score, "z_score_" + name, filepath, "data") + write_hdf5(dff, "dff_" + name, filepath, "data") + write_hdf5(control_fit, "cntrl_sig_fit_" + name, filepath, "data") + if temp_control_arr is not None: + write_hdf5(temp_control_arr, "control_" + name, filepath, "data") + + +def read_corrected_timestamps_pairwise(filepath): + pair_name_to_tsNew = {} + pair_name_to_sampling_rate = {} + path = decide_naming_convention(filepath) + for j in range(path.shape[1]): + name_1 = ((os.path.basename(path[0, j])).split(".")[0]).split("_") + name_2 = ((os.path.basename(path[1, j])).split(".")[0]).split("_") + if name_1[-1] != name_2[-1]: + logger.error("Error in naming convention of files or Error in storesList file") + raise Exception("Error in naming convention of files or Error in storesList file") + name = name_1[-1] + + tsNew = read_hdf5("timeCorrection_" + name, filepath, "timestampNew") + sampling_rate = read_hdf5("timeCorrection_" + name, filepath, "sampling_rate")[0] + pair_name_to_tsNew[name] = tsNew + pair_name_to_sampling_rate[name] = sampling_rate + return pair_name_to_tsNew, pair_name_to_sampling_rate + + +def read_coords_pairwise(filepath, pair_name_to_tsNew): + pair_name_to_coords = {} + path = decide_naming_convention(filepath) + for j in range(path.shape[1]): + name_1 = ((os.path.basename(path[0, j])).split(".")[0]).split("_") + name_2 = ((os.path.basename(path[1, j])).split(".")[0]).split("_") + if name_1[-1] != name_2[-1]: + logger.error("Error in naming convention of files or Error in storesList file") + raise Exception("Error in naming convention of files or Error in storesList file") + pair_name = name_1[-1] + + tsNew = pair_name_to_tsNew[pair_name] + coords = fetchCoords(filepath, pair_name, tsNew) + pair_name_to_coords[pair_name] = coords + return pair_name_to_coords + + +def read_corrected_data_dict(filepath, storesList): # TODO: coordinate with read_corrected_data + name_to_corrected_data = {} + storenames = storesList[0, :] + names_for_storenames = storesList[1, :] + control_and_signal_names = get_control_and_signal_channel_names(storesList) + + for storename, name in zip(storenames, names_for_storenames): + if name not in control_and_signal_names: + continue + data = read_hdf5(name, filepath, "data").reshape(-1) + name_to_corrected_data[name] = data + + return name_to_corrected_data + + +def read_corrected_ttl_timestamps(filepath, storesList): + compound_name_to_ttl_timestamps = {} + storenames = storesList[0, :] + names_for_storenames = storesList[1, :] + arr = get_control_and_signal_channel_names(storesList) + + for storename, name in zip(storenames, names_for_storenames): + if name in arr: + continue + ttl_name = name + for i in range(arr.shape[1]): + name_1 = arr[0, i].split("_")[-1] + name_2 = arr[1, i].split("_")[-1] + if name_1 != name_2: + logger.error("Error in naming convention of files or Error in storesList file") + raise Exception("Error in naming convention of files or Error in storesList file") + compound_name = ttl_name + "_" + 
name_1 + ts = read_hdf5(compound_name, filepath, "ts") + compound_name_to_ttl_timestamps[compound_name] = ts + + return compound_name_to_ttl_timestamps + + +def write_artifact_corrected_timestamps(filepath, pair_name_to_corrected_timestamps): + for pair_name, timestamps in pair_name_to_corrected_timestamps.items(): + write_hdf5(timestamps, "timeCorrection_" + pair_name, filepath, "timestampNew") + + +def write_artifact_removal( + filepath, + name_to_corrected_data, + pair_name_to_corrected_timestamps, + compound_name_to_corrected_ttl_timestamps=None, +): + write_corrected_data(filepath, name_to_corrected_data) + write_corrected_ttl_timestamps(filepath, compound_name_to_corrected_ttl_timestamps) + if pair_name_to_corrected_timestamps is not None: + write_artifact_corrected_timestamps(filepath, pair_name_to_corrected_timestamps) diff --git a/src/guppy/analysis/timestamp_correction.py b/src/guppy/analysis/timestamp_correction.py new file mode 100644 index 0000000..0806fb8 --- /dev/null +++ b/src/guppy/analysis/timestamp_correction.py @@ -0,0 +1,200 @@ +import logging + +import numpy as np + +from .io_utils import get_control_and_signal_channel_names + +logger = logging.getLogger(__name__) + + +def correct_timestamps( + timeForLightsTurnOn, + storesList, + name_to_timestamps, + name_to_data, + name_to_sampling_rate, + name_to_npoints, + name_to_timestamps_ttl, + mode, +): + name_to_corrected_timestamps, name_to_correctionIndex, name_to_corrected_data = timestampCorrection( + timeForLightsTurnOn, + storesList, + name_to_timestamps, + name_to_data, + name_to_sampling_rate, + name_to_npoints, + mode=mode, + ) + compound_name_to_corrected_ttl_timestamps = decide_naming_and_applyCorrection_ttl( + timeForLightsTurnOn, + storesList, + name_to_timestamps_ttl, + name_to_timestamps, + name_to_data, + mode=mode, + ) + + return ( + name_to_corrected_timestamps, + name_to_correctionIndex, + name_to_corrected_data, + compound_name_to_corrected_ttl_timestamps, + ) + + +# function to correct timestamps after eliminating first few seconds of the data (for csv or TDT data depending on mode) +def timestampCorrection( + timeForLightsTurnOn, + storesList, + name_to_timestamps, + name_to_data, + name_to_sampling_rate, + name_to_npoints, + mode, +): + logger.debug( + f"Correcting timestamps by getting rid of the first {timeForLightsTurnOn} seconds and convert timestamps to seconds" + ) + if mode not in ["tdt", "csv"]: + logger.error("Mode should be either 'tdt' or 'csv'") + raise ValueError("Mode should be either 'tdt' or 'csv'") + name_to_corrected_timestamps = {} + name_to_correctionIndex = {} + name_to_corrected_data = {} + storenames = storesList[0, :] + names_for_storenames = storesList[1, :] + channels_arr = get_control_and_signal_channel_names(storesList) + + indices = check_cntrl_sig_length(channels_arr, name_to_data) + + for i in range(channels_arr.shape[1]): + control_name = channels_arr[0, i] + signal_name = channels_arr[1, i] + name_1 = channels_arr[0, i].split("_")[-1] + name_2 = channels_arr[1, i].split("_")[-1] + if name_1 != name_2: + logger.error("Error in naming convention of files or Error in storesList file") + raise Exception("Error in naming convention of files or Error in storesList file") + + # dirname = os.path.dirname(path[i]) + idx = np.where(names_for_storenames == indices[i])[0] + + if idx.shape[0] == 0: + logger.error(f"{channels_arr[0,i]} does not exist in the stores list file.") + raise Exception("{} does not exist in the stores list file.".format(channels_arr[0, i])) + + name = 
names_for_storenames[idx][0] + timestamp = name_to_timestamps[name] + sampling_rate = name_to_sampling_rate[name] + npoints = name_to_npoints[name] + + if mode == "tdt": + timeRecStart = timestamp[0] + timestamps = np.subtract(timestamp, timeRecStart) + adder = np.arange(npoints) / sampling_rate + lengthAdder = adder.shape[0] + timestampNew = np.zeros((len(timestamps), lengthAdder)) + for i in range(lengthAdder): + timestampNew[:, i] = np.add(timestamps, adder[i]) + timestampNew = (timestampNew.T).reshape(-1, order="F") + correctionIndex = np.where(timestampNew >= timeForLightsTurnOn)[0] + timestampNew = timestampNew[correctionIndex] + elif mode == "csv": + correctionIndex = np.where(timestamp >= timeForLightsTurnOn)[0] + timestampNew = timestamp[correctionIndex] + + for displayName in [control_name, signal_name]: + name_to_corrected_timestamps[displayName] = timestampNew + name_to_correctionIndex[displayName] = correctionIndex + data = name_to_data[displayName] + if (data == 0).all() == True: + name_to_corrected_data[displayName] = data + else: + name_to_corrected_data[displayName] = data[correctionIndex] + + logger.info("Timestamps corrected and converted to seconds.") + return name_to_corrected_timestamps, name_to_correctionIndex, name_to_corrected_data + + +def decide_naming_and_applyCorrection_ttl( + timeForLightsTurnOn, + storesList, + name_to_timestamps_ttl, + name_to_timestamps, + name_to_data, + mode, +): + logger.debug("Applying correction of timestamps to the data and event timestamps") + storenames = storesList[0, :] + names_for_storenames = storesList[1, :] + arr = get_control_and_signal_channel_names(storesList) + indices = check_cntrl_sig_length(arr, name_to_data) + + compound_name_to_corrected_ttl_timestamps = {} + for ttl_name, ttl_timestamps in name_to_timestamps_ttl.items(): + for i in range(arr.shape[1]): + name_1 = arr[0, i].split("_")[-1] + name_2 = arr[1, i].split("_")[-1] + if name_1 != name_2: + logger.error("Error in naming convention of files or Error in storesList file") + raise Exception("Error in naming convention of files or Error in storesList file") + + idx = np.where(names_for_storenames == indices[i])[0] + if idx.shape[0] == 0: + logger.error(f"{arr[0,i]} does not exist in the stores list file.") + raise Exception("{} does not exist in the stores list file.".format(arr[0, i])) + + name = names_for_storenames[idx][0] + timestamps = name_to_timestamps[name] + timeRecStart = timestamps[0] + corrected_ttl_timestamps = applyCorrection_ttl( + timeForLightsTurnOn, + timeRecStart, + ttl_timestamps, + mode, + ) + compound_name = ttl_name + "_" + name_1 + compound_name_to_corrected_ttl_timestamps[compound_name] = corrected_ttl_timestamps + + logger.info("Timestamps corrections applied to the data and event timestamps.") + return compound_name_to_corrected_ttl_timestamps + + +def applyCorrection_ttl( + timeForLightsTurnOn, + timeRecStart, + ttl_timestamps, + mode, +): + corrected_ttl_timestamps = ttl_timestamps + if mode == "tdt": + res = (corrected_ttl_timestamps >= timeRecStart).all() + if res == True: + corrected_ttl_timestamps = np.subtract(corrected_ttl_timestamps, timeRecStart) + corrected_ttl_timestamps = np.subtract(corrected_ttl_timestamps, timeForLightsTurnOn) + else: + corrected_ttl_timestamps = np.subtract(corrected_ttl_timestamps, timeForLightsTurnOn) + elif mode == "csv": + corrected_ttl_timestamps = np.subtract(corrected_ttl_timestamps, timeForLightsTurnOn) + return corrected_ttl_timestamps + + +# function to check control and signal channel has same 
length +# if not, take a smaller length and do pre-processing +def check_cntrl_sig_length(channels_arr, name_to_data): + + indices = [] + for i in range(channels_arr.shape[1]): + control_name = channels_arr[0, i] + signal_name = channels_arr[1, i] + control = name_to_data[control_name] + signal = name_to_data[signal_name] + if control.shape[0] < signal.shape[0]: + indices.append(control_name) + elif control.shape[0] > signal.shape[0]: + indices.append(signal_name) + else: + indices.append(signal_name) + + return indices diff --git a/src/guppy/analysis/z_score.py b/src/guppy/analysis/z_score.py new file mode 100644 index 0000000..34b29ee --- /dev/null +++ b/src/guppy/analysis/z_score.py @@ -0,0 +1,148 @@ +import logging + +import numpy as np +from scipy import signal as ss + +from .control_channel import helper_create_control_channel + +logger = logging.getLogger(__name__) + + +# high-level function to compute z-score and deltaF/F +def compute_z_score( + control, + signal, + tsNew, + coords, + artifactsRemovalMethod, + filter_window, + isosbestic_control, + zscore_method, + baseline_start, + baseline_end, +): + if (control == 0).all() == True: + control = np.zeros(tsNew.shape[0]) + + z_score_arr = np.array([]) + norm_data_arr = np.full(tsNew.shape[0], np.nan) + control_fit_arr = np.full(tsNew.shape[0], np.nan) + temp_control_arr = np.full(tsNew.shape[0], np.nan) + + # for artifacts removal, each chunk which was selected by user is being processed individually and then + # z-score is calculated + for i in range(coords.shape[0]): + tsNew_index = np.where((tsNew > coords[i, 0]) & (tsNew < coords[i, 1]))[0] + if isosbestic_control == False: + control_arr = helper_create_control_channel(signal[tsNew_index], tsNew[tsNew_index], window=101) + signal_arr = signal[tsNew_index] + norm_data, control_fit = execute_controlFit_dff(control_arr, signal_arr, isosbestic_control, filter_window) + temp_control_arr[tsNew_index] = control_arr + if i < coords.shape[0] - 1: + blank_index = np.where((tsNew > coords[i, 1]) & (tsNew < coords[i + 1, 0]))[0] + temp_control_arr[blank_index] = np.full(blank_index.shape[0], np.nan) + else: + control_arr = control[tsNew_index] + signal_arr = signal[tsNew_index] + norm_data, control_fit = execute_controlFit_dff(control_arr, signal_arr, isosbestic_control, filter_window) + norm_data_arr[tsNew_index] = norm_data + control_fit_arr[tsNew_index] = control_fit + + if artifactsRemovalMethod == "concatenate": + norm_data_arr = norm_data_arr[~np.isnan(norm_data_arr)] + control_fit_arr = control_fit_arr[~np.isnan(control_fit_arr)] + z_score = z_score_computation(norm_data_arr, tsNew, zscore_method, baseline_start, baseline_end) + z_score_arr = np.concatenate((z_score_arr, z_score)) + + # handle the case if there are chunks being cut in the front and the end + if isosbestic_control == False: + coords = coords.flatten() + # front chunk + idx = np.where((tsNew >= tsNew[0]) & (tsNew < coords[0]))[0] + temp_control_arr[idx] = np.full(idx.shape[0], np.nan) + # end chunk + idx = np.where((tsNew > coords[-1]) & (tsNew <= tsNew[-1]))[0] + temp_control_arr[idx] = np.full(idx.shape[0], np.nan) + else: + temp_control_arr = None + + return z_score_arr, norm_data_arr, control_fit_arr, temp_control_arr + + +# function to filter control and signal channel, also execute above two function : controlFit and deltaFF +# function will also take care if there is only signal channel and no control channel +# if there is only signal channel, z-score will be computed using just signal channel +def 
execute_controlFit_dff(control, signal, isosbestic_control, filter_window): + + if isosbestic_control == False: + signal_smooth = filterSignal(filter_window, signal) # ss.filtfilt(b, a, signal) + control_fit = controlFit(control, signal_smooth) + norm_data = deltaFF(signal_smooth, control_fit) + else: + control_smooth = filterSignal(filter_window, control) # ss.filtfilt(b, a, control) + signal_smooth = filterSignal(filter_window, signal) # ss.filtfilt(b, a, signal) + control_fit = controlFit(control_smooth, signal_smooth) + norm_data = deltaFF(signal_smooth, control_fit) + + return norm_data, control_fit + + +# function to compute deltaF/F using fitted control channel and filtered signal channel +def deltaFF(signal, control): + + res = np.subtract(signal, control) + normData = np.divide(res, control) + # deltaFF = normData + normData = normData * 100 + + return normData + + +# function to fit control channel to signal channel +def controlFit(control, signal): + + p = np.polyfit(control, signal, 1) + arr = (p[0] * control) + p[1] + return arr + + +def filterSignal(filter_window, signal): + if filter_window == 0: + return signal + elif filter_window > 1: + b = np.divide(np.ones((filter_window,)), filter_window) + a = 1 + filtered_signal = ss.filtfilt(b, a, signal) + return filtered_signal + else: + raise Exception("Moving average filter window value is not correct.") + + +# function to compute z-score based on z-score computation method +def z_score_computation(dff, timestamps, zscore_method, baseline_start, baseline_end): + if zscore_method == "standard z-score": + numerator = np.subtract(dff, np.nanmean(dff)) + zscore = np.divide(numerator, np.nanstd(dff)) + elif zscore_method == "baseline z-score": + idx = np.where((timestamps > baseline_start) & (timestamps < baseline_end))[0] + if idx.shape[0] == 0: + logger.error( + "Baseline Window Parameters for baseline z-score computation zscore_method \ + are not correct." + ) + raise Exception( + "Baseline Window Parameters for baseline z-score computation zscore_method \ + are not correct." 
+ ) + else: + baseline_mean = np.nanmean(dff[idx]) + baseline_std = np.nanstd(dff[idx]) + numerator = np.subtract(dff, baseline_mean) + zscore = np.divide(numerator, baseline_std) + else: + median = np.median(dff) + mad = np.median(np.abs(dff - median)) + numerator = 0.6745 * (dff - median) + zscore = np.divide(numerator, mad) + + return zscore diff --git a/src/guppy/preprocess.py b/src/guppy/preprocess.py index 8b79039..0c41ae4 100755 --- a/src/guppy/preprocess.py +++ b/src/guppy/preprocess.py @@ -1,22 +1,40 @@ -import fnmatch import glob import json import logging import os -import re -import shutil import sys -import h5py import matplotlib.pyplot as plt import numpy as np -import pandas as pd -from scipy import signal as ss -from scipy.optimize import curve_fit -from .combineDataFn import processTimestampsForCombiningData - -logger = logging.getLogger(__name__) +from .analysis.artifact_removal import remove_artifacts +from .analysis.combine_data import combine_data +from .analysis.control_channel import add_control_channel, create_control_channel +from .analysis.io_utils import ( + check_storeslistfile, + check_TDT, + find_files, + get_all_stores_for_combining_data, # noqa: F401 -- Necessary for other modules that depend on preprocess.py + get_coords, + read_hdf5, + takeOnlyDirs, +) +from .analysis.standard_io import ( + read_control_and_signal, + read_coords_pairwise, + read_corrected_data, + read_corrected_data_dict, + read_corrected_timestamps_pairwise, + read_corrected_ttl_timestamps, + read_ttl, + write_artifact_removal, + write_corrected_data, + write_corrected_timestamps, + write_corrected_ttl_timestamps, + write_zscore, +) +from .analysis.timestamp_correction import correct_timestamps +from .analysis.z_score import compute_z_score logger = logging.getLogger(__name__) @@ -25,404 +43,11 @@ plt.switch_backend("TKAgg") -def takeOnlyDirs(paths): - removePaths = [] - for p in paths: - if os.path.isfile(p): - removePaths.append(p) - return list(set(paths) - set(removePaths)) - - def writeToFile(value: str): with open(os.path.join(os.path.expanduser("~"), "pbSteps.txt"), "a") as file: file.write(value) -# find files by ignoring the case sensitivity -def find_files(path, glob_path, ignore_case=False): - rule = ( - re.compile(fnmatch.translate(glob_path), re.IGNORECASE) - if ignore_case - else re.compile(fnmatch.translate(glob_path)) - ) - - no_bytes_path = os.listdir(os.path.expanduser(path)) - str_path = [] - - # converting byte object to string - for x in no_bytes_path: - try: - str_path.append(x.decode("utf-8")) - except: - str_path.append(x) - return [os.path.join(path, n) for n in str_path if rule.match(n)] - - -# curve fit exponential function -def curveFitFn(x, a, b, c): - return a + (b * np.exp(-(1 / c) * x)) - - -# helper function to create control channel using signal channel -# by curve fitting signal channel to exponential function -# when there is no isosbestic control channel is present -def helper_create_control_channel(signal, timestamps, window): - # check if window is greater than signal shape - if window > signal.shape[0]: - window = ((signal.shape[0] + 1) / 2) + 1 - if window % 2 != 0: - window = window - else: - window = window + 1 - - filtered_signal = ss.savgol_filter(signal, window_length=window, polyorder=3) - - p0 = [5, 50, 60] - - try: - popt, pcov = curve_fit(curveFitFn, timestamps, filtered_signal, p0) - except Exception as e: - logger.error(str(e)) - - # logger.info('Curve Fit Parameters : ', popt) - control = curveFitFn(timestamps, *popt) - - return control 
- - -# main function to create control channel using -# signal channel and save it to a file -def create_control_channel(filepath, arr, window=5001): - - storenames = arr[0, :] - storesList = arr[1, :] - - for i in range(storesList.shape[0]): - event_name, event = storesList[i], storenames[i] - if "control" in event_name.lower() and "cntrl" in event.lower(): - logger.debug("Creating control channel from signal channel using curve-fitting") - name = event_name.split("_")[-1] - signal = read_hdf5("signal_" + name, filepath, "data") - timestampNew = read_hdf5("timeCorrection_" + name, filepath, "timestampNew") - sampling_rate = np.full(timestampNew.shape, np.nan) - sampling_rate[0] = read_hdf5("timeCorrection_" + name, filepath, "sampling_rate")[0] - - control = helper_create_control_channel(signal, timestampNew, window) - - write_hdf5(control, event_name, filepath, "data") - d = {"timestamps": timestampNew, "data": control, "sampling_rate": sampling_rate} - df = pd.DataFrame(d) - df.to_csv(os.path.join(os.path.dirname(filepath), event.lower() + ".csv"), index=False) - logger.info("Control channel from signal channel created using curve-fitting") - - -# function to add control channel when there is no -# isosbestic control channel and update the storeslist file -def add_control_channel(filepath, arr): - - storenames = arr[0, :] - storesList = np.char.lower(arr[1, :]) - - keep_control = np.array([]) - # check a case if there is isosbestic control channel present - for i in range(storesList.shape[0]): - if "control" in storesList[i].lower(): - name = storesList[i].split("_")[-1] - new_str = "signal_" + str(name).lower() - find_signal = [True for i in storesList if i == new_str] - if len(find_signal) > 1: - logger.error("Error in naming convention of files or Error in storesList file") - raise Exception("Error in naming convention of files or Error in storesList file") - if len(find_signal) == 0: - logger.error( - "Isosbectic control channel parameter is set to False and still \ - storeslist file shows there is control channel present" - ) - raise Exception( - "Isosbectic control channel parameter is set to False and still \ - storeslist file shows there is control channel present" - ) - else: - continue - - for i in range(storesList.shape[0]): - if "signal" in storesList[i].lower(): - name = storesList[i].split("_")[-1] - new_str = "control_" + str(name).lower() - find_signal = [True for i in storesList if i == new_str] - if len(find_signal) == 0: - src, dst = os.path.join(filepath, arr[0, i] + ".hdf5"), os.path.join( - filepath, "cntrl" + str(i) + ".hdf5" - ) - shutil.copyfile(src, dst) - arr = np.concatenate((arr, [["cntrl" + str(i)], ["control_" + str(arr[1, i].split("_")[-1])]]), axis=1) - - np.savetxt(os.path.join(filepath, "storesList.csv"), arr, delimiter=",", fmt="%s") - - return arr - - -# check if dealing with TDT files or csv files -def check_TDT(filepath): - path = glob.glob(os.path.join(filepath, "*.tsq")) - if len(path) > 0: - return True - else: - return False - - -# function to read hdf5 file -def read_hdf5(event, filepath, key): - if event: - event = event.replace("\\", "_") - event = event.replace("/", "_") - op = os.path.join(filepath, event + ".hdf5") - else: - op = filepath - - if os.path.exists(op): - with h5py.File(op, "r") as f: - arr = np.asarray(f[key]) - else: - logger.error(f"{event}.hdf5 file does not exist") - raise Exception("{}.hdf5 file does not exist".format(event)) - - return arr - - -# function to write hdf5 file -def write_hdf5(data, event, filepath, key): - 
event = event.replace("\\", "_") - event = event.replace("/", "_") - op = os.path.join(filepath, event + ".hdf5") - - # if file does not exist create a new file - if not os.path.exists(op): - with h5py.File(op, "w") as f: - if type(data) is np.ndarray: - f.create_dataset(key, data=data, maxshape=(None,), chunks=True) - else: - f.create_dataset(key, data=data) - - # if file already exists, append data to it or add a new key to it - else: - with h5py.File(op, "r+") as f: - if key in list(f.keys()): - if type(data) is np.ndarray: - f[key].resize(data.shape) - arr = f[key] - arr[:] = data - else: - arr = f[key] - arr = data - else: - if type(data) is np.ndarray: - f.create_dataset(key, data=data, maxshape=(None,), chunks=True) - else: - f.create_dataset(key, data=data) - - -# function to check control and signal channel has same length -# if not, take a smaller length and do pre-processing -def check_cntrl_sig_length(filepath, channels_arr, storenames, storesList): - - indices = [] - for i in range(channels_arr.shape[1]): - idx_c = np.where(storesList == channels_arr[0, i])[0] - idx_s = np.where(storesList == channels_arr[1, i])[0] - control = read_hdf5(storenames[idx_c[0]], filepath, "data") - signal = read_hdf5(storenames[idx_s[0]], filepath, "data") - if control.shape[0] < signal.shape[0]: - indices.append(storesList[idx_c[0]]) - elif control.shape[0] > signal.shape[0]: - indices.append(storesList[idx_s[0]]) - else: - indices.append(storesList[idx_s[0]]) - - return indices - - -# function to correct timestamps after eliminating first few seconds of the data (for csv data) -def timestampCorrection_csv(filepath, timeForLightsTurnOn, storesList): - - logger.debug( - f"Correcting timestamps by getting rid of the first {timeForLightsTurnOn} seconds and convert timestamps to seconds" - ) - storenames = storesList[0, :] - storesList = storesList[1, :] - - arr = [] - for i in range(storesList.shape[0]): - if "control" in storesList[i].lower() or "signal" in storesList[i].lower(): - arr.append(storesList[i]) - - arr = sorted(arr, key=str.casefold) - try: - arr = np.asarray(arr).reshape(2, -1) - except: - logger.error("Error in saving stores list file or spelling mistake for control or signal") - raise Exception("Error in saving stores list file or spelling mistake for control or signal") - - indices = check_cntrl_sig_length(filepath, arr, storenames, storesList) - - for i in range(arr.shape[1]): - name_1 = arr[0, i].split("_")[-1] - name_2 = arr[1, i].split("_")[-1] - # dirname = os.path.dirname(path[i]) - idx = np.where(storesList == indices[i])[0] - - if idx.shape[0] == 0: - logger.error(f"{arr[0,i]} does not exist in the stores list file.") - raise Exception("{} does not exist in the stores list file.".format(arr[0, i])) - - timestamp = read_hdf5(storenames[idx][0], filepath, "timestamps") - sampling_rate = read_hdf5(storenames[idx][0], filepath, "sampling_rate") - - if name_1 == name_2: - correctionIndex = np.where(timestamp >= timeForLightsTurnOn)[0] - timestampNew = timestamp[correctionIndex] - write_hdf5(timestampNew, "timeCorrection_" + name_1, filepath, "timestampNew") - write_hdf5(correctionIndex, "timeCorrection_" + name_1, filepath, "correctionIndex") - write_hdf5(np.asarray(sampling_rate), "timeCorrection_" + name_1, filepath, "sampling_rate") - - else: - logger.error("Error in naming convention of files or Error in storesList file") - raise Exception("Error in naming convention of files or Error in storesList file") - - logger.info("Timestamps corrected and converted to seconds.") - - 
-# function to correct timestamps after eliminating first few seconds of the data (for TDT data) -def timestampCorrection_tdt(filepath, timeForLightsTurnOn, storesList): - - logger.debug( - f"Correcting timestamps by getting rid of the first {timeForLightsTurnOn} seconds and convert timestamps to seconds" - ) - storenames = storesList[0, :] - storesList = storesList[1, :] - - arr = [] - for i in range(storesList.shape[0]): - if "control" in storesList[i].lower() or "signal" in storesList[i].lower(): - arr.append(storesList[i]) - - arr = sorted(arr, key=str.casefold) - - try: - arr = np.asarray(arr).reshape(2, -1) - except: - logger.error("Error in saving stores list file or spelling mistake for control or signal") - raise Exception("Error in saving stores list file or spelling mistake for control or signal") - - indices = check_cntrl_sig_length(filepath, arr, storenames, storesList) - - for i in range(arr.shape[1]): - name_1 = arr[0, i].split("_")[-1] - name_2 = arr[1, i].split("_")[-1] - # dirname = os.path.dirname(path[i]) - idx = np.where(storesList == indices[i])[0] - - if idx.shape[0] == 0: - logger.error(f"{arr[0,i]} does not exist in the stores list file.") - raise Exception("{} does not exist in the stores list file.".format(arr[0, i])) - - timestamp = read_hdf5(storenames[idx][0], filepath, "timestamps") - npoints = read_hdf5(storenames[idx][0], filepath, "npoints") - sampling_rate = read_hdf5(storenames[idx][0], filepath, "sampling_rate") - - if name_1 == name_2: - timeRecStart = timestamp[0] - timestamps = np.subtract(timestamp, timeRecStart) - adder = np.arange(npoints) / sampling_rate - lengthAdder = adder.shape[0] - timestampNew = np.zeros((len(timestamps), lengthAdder)) - for i in range(lengthAdder): - timestampNew[:, i] = np.add(timestamps, adder[i]) - timestampNew = (timestampNew.T).reshape(-1, order="F") - correctionIndex = np.where(timestampNew >= timeForLightsTurnOn)[0] - timestampNew = timestampNew[correctionIndex] - - write_hdf5(np.asarray([timeRecStart]), "timeCorrection_" + name_1, filepath, "timeRecStart") - write_hdf5(timestampNew, "timeCorrection_" + name_1, filepath, "timestampNew") - write_hdf5(correctionIndex, "timeCorrection_" + name_1, filepath, "correctionIndex") - write_hdf5(np.asarray([sampling_rate]), "timeCorrection_" + name_1, filepath, "sampling_rate") - else: - logger.error("Error in naming convention of files or Error in storesList file") - raise Exception("Error in naming convention of files or Error in storesList file") - - logger.info("Timestamps corrected and converted to seconds.") - # return timeRecStart, correctionIndex, timestampNew - - -# function to apply correction to control, signal and event timestamps -def applyCorrection(filepath, timeForLightsTurnOn, event, displayName, naming): - - cond = check_TDT(os.path.dirname(filepath)) - - if cond == True: - timeRecStart = read_hdf5("timeCorrection_" + naming, filepath, "timeRecStart")[0] - - timestampNew = read_hdf5("timeCorrection_" + naming, filepath, "timestampNew") - correctionIndex = read_hdf5("timeCorrection_" + naming, filepath, "correctionIndex") - - if "control" in displayName.lower() or "signal" in displayName.lower(): - split_name = displayName.split("_")[-1] - if split_name == naming: - pass - else: - correctionIndex = read_hdf5("timeCorrection_" + split_name, filepath, "correctionIndex") - arr = read_hdf5(event, filepath, "data") - if (arr == 0).all() == True: - arr = arr - else: - arr = arr[correctionIndex] - write_hdf5(arr, displayName, filepath, "data") - else: - arr = 
read_hdf5(event, filepath, "timestamps") - if cond == True: - res = (arr >= timeRecStart).all() - if res == True: - arr = np.subtract(arr, timeRecStart) - arr = np.subtract(arr, timeForLightsTurnOn) - else: - arr = np.subtract(arr, timeForLightsTurnOn) - else: - arr = np.subtract(arr, timeForLightsTurnOn) - write_hdf5(arr, displayName + "_" + naming, filepath, "ts") - - # if isosbestic_control==False and 'control' in displayName.lower(): - # control = create_control_channel(filepath, displayName) - # write_hdf5(control, displayName, filepath, 'data') - - -# function to check if naming convention was followed while saving storeslist file -# and apply timestamps correction using the function applyCorrection -def decide_naming_convention_and_applyCorrection(filepath, timeForLightsTurnOn, event, displayName, storesList): - - logger.debug("Applying correction of timestamps to the data and event timestamps") - storesList = storesList[1, :] - - arr = [] - for i in range(storesList.shape[0]): - if "control" in storesList[i].lower() or "signal" in storesList[i].lower(): - arr.append(storesList[i]) - - arr = sorted(arr, key=str.casefold) - arr = np.asarray(arr).reshape(2, -1) - - for i in range(arr.shape[1]): - name_1 = arr[0, i].split("_")[-1] - name_2 = arr[1, i].split("_")[-1] - # dirname = os.path.dirname(path[i]) - if name_1 == name_2: - applyCorrection(filepath, timeForLightsTurnOn, event, displayName, name_1) - else: - logger.error("Error in naming convention of files or Error in storesList file") - raise Exception("Error in naming convention of files or Error in storesList file") - - logger.info("Timestamps corrections applied to the data and event timestamps.") - - # function to plot z_score def visualize_z_score(filepath): @@ -590,421 +215,6 @@ def visualizeControlAndSignal(filepath, removeArtifacts): visualize(filepath, ts, control, signal, cntrl_sig_fit, plot_name, removeArtifacts) -# function to check if the naming convention for saving storeslist file was followed or not -def decide_naming_convention(filepath): - path_1 = find_files(filepath, "control_*", ignore_case=True) # glob.glob(os.path.join(filepath, 'control*')) - - path_2 = find_files(filepath, "signal_*", ignore_case=True) # glob.glob(os.path.join(filepath, 'signal*')) - - path = sorted(path_1 + path_2, key=str.casefold) - if len(path) % 2 != 0: - logger.error("There are not equal number of Control and Signal data") - raise Exception("There are not equal number of Control and Signal data") - - path = np.asarray(path).reshape(2, -1) - - return path - - -# function to read coordinates file which was saved by selecting chunks for artifacts removal -def fetchCoords(filepath, naming, data): - - path = os.path.join(filepath, "coordsForPreProcessing_" + naming + ".npy") - - if not os.path.exists(path): - coords = np.array([0, data[-1]]) - else: - coords = np.load(os.path.join(filepath, "coordsForPreProcessing_" + naming + ".npy"))[:, 0] - - if coords.shape[0] % 2 != 0: - logger.error("Number of values in coordsForPreProcessing file is not even.") - raise Exception("Number of values in coordsForPreProcessing file is not even.") - - coords = coords.reshape(-1, 2) - - return coords - - -# helper function to process control and signal timestamps -def eliminateData(filepath, timeForLightsTurnOn, event, sampling_rate, naming): - - ts = read_hdf5("timeCorrection_" + naming, filepath, "timestampNew") - data = read_hdf5(event, filepath, "data").reshape(-1) - coords = fetchCoords(filepath, naming, ts) - - if (data == 0).all() == True: - data 
= np.zeros(ts.shape[0]) - - arr = np.array([]) - ts_arr = np.array([]) - for i in range(coords.shape[0]): - - index = np.where((ts > coords[i, 0]) & (ts < coords[i, 1]))[0] - - if len(arr) == 0: - arr = np.concatenate((arr, data[index])) - sub = ts[index][0] - timeForLightsTurnOn - new_ts = ts[index] - sub - ts_arr = np.concatenate((ts_arr, new_ts)) - else: - temp = data[index] - # new = temp + (arr[-1]-temp[0]) - temp_ts = ts[index] - new_ts = temp_ts - (temp_ts[0] - ts_arr[-1]) - arr = np.concatenate((arr, temp)) - ts_arr = np.concatenate((ts_arr, new_ts + (1 / sampling_rate))) - - # logger.info(arr.shape, ts_arr.shape) - return arr, ts_arr - - -# helper function to align event timestamps with the control and signal timestamps -def eliminateTs(filepath, timeForLightsTurnOn, event, sampling_rate, naming): - - tsNew = read_hdf5("timeCorrection_" + naming, filepath, "timestampNew") - ts = read_hdf5(event + "_" + naming, filepath, "ts").reshape(-1) - coords = fetchCoords(filepath, naming, tsNew) - - ts_arr = np.array([]) - tsNew_arr = np.array([]) - for i in range(coords.shape[0]): - tsNew_index = np.where((tsNew > coords[i, 0]) & (tsNew < coords[i, 1]))[0] - ts_index = np.where((ts > coords[i, 0]) & (ts < coords[i, 1]))[0] - - if len(tsNew_arr) == 0: - sub = tsNew[tsNew_index][0] - timeForLightsTurnOn - tsNew_arr = np.concatenate((tsNew_arr, tsNew[tsNew_index] - sub)) - ts_arr = np.concatenate((ts_arr, ts[ts_index] - sub)) - else: - temp_tsNew = tsNew[tsNew_index] - temp_ts = ts[ts_index] - new_ts = temp_ts - (temp_tsNew[0] - tsNew_arr[-1]) - new_tsNew = temp_tsNew - (temp_tsNew[0] - tsNew_arr[-1]) - tsNew_arr = np.concatenate((tsNew_arr, new_tsNew + (1 / sampling_rate))) - ts_arr = np.concatenate((ts_arr, new_ts + (1 / sampling_rate))) - - return ts_arr - - -# adding nan values to removed chunks -# when using artifacts removal method - replace with NaN -def addingNaNValues(filepath, event, naming): - - ts = read_hdf5("timeCorrection_" + naming, filepath, "timestampNew") - data = read_hdf5(event, filepath, "data").reshape(-1) - coords = fetchCoords(filepath, naming, ts) - - if (data == 0).all() == True: - data = np.zeros(ts.shape[0]) - - arr = np.array([]) - ts_index = np.arange(ts.shape[0]) - for i in range(coords.shape[0]): - - index = np.where((ts > coords[i, 0]) & (ts < coords[i, 1]))[0] - arr = np.concatenate((arr, index)) - - nan_indices = list(set(ts_index).symmetric_difference(arr)) - data[nan_indices] = np.nan - - return data - - -# remove event TTLs which falls in the removed chunks -# when using artifacts removal method - replace with NaN -def removeTTLs(filepath, event, naming): - tsNew = read_hdf5("timeCorrection_" + naming, filepath, "timestampNew") - ts = read_hdf5(event + "_" + naming, filepath, "ts").reshape(-1) - coords = fetchCoords(filepath, naming, tsNew) - - ts_arr = np.array([]) - for i in range(coords.shape[0]): - ts_index = np.where((ts > coords[i, 0]) & (ts < coords[i, 1]))[0] - ts_arr = np.concatenate((ts_arr, ts[ts_index])) - - return ts_arr - - -def addingNaNtoChunksWithArtifacts(filepath, events): - - logger.debug("Replacing chunks with artifacts by NaN values.") - storesList = events[1, :] - - path = decide_naming_convention(filepath) - - for j in range(path.shape[1]): - name_1 = ((os.path.basename(path[0, j])).split(".")[0]).split("_") - name_2 = ((os.path.basename(path[1, j])).split(".")[0]).split("_") - # dirname = os.path.dirname(path[i]) - if name_1[-1] == name_2[-1]: - name = name_1[-1] - sampling_rate = read_hdf5("timeCorrection_" + name, filepath, 
"sampling_rate")[0] - for i in range(len(storesList)): - if ( - "control_" + name.lower() in storesList[i].lower() - or "signal_" + name.lower() in storesList[i].lower() - ): # changes done - data = addingNaNValues(filepath, storesList[i], name) - write_hdf5(data, storesList[i], filepath, "data") - else: - if "control" in storesList[i].lower() or "signal" in storesList[i].lower(): - continue - else: - ts = removeTTLs(filepath, storesList[i], name) - write_hdf5(ts, storesList[i] + "_" + name, filepath, "ts") - - else: - logger.error("Error in naming convention of files or Error in storesList file") - raise Exception("Error in naming convention of files or Error in storesList file") - logger.info("Chunks with artifacts are replaced by NaN values.") - - -# main function to align timestamps for control, signal and event timestamps for artifacts removal -def processTimestampsForArtifacts(filepath, timeForLightsTurnOn, events): - - logger.debug("Processing timestamps to get rid of artifacts using concatenate method...") - storesList = events[1, :] - - path = decide_naming_convention(filepath) - - timestamp_dict = dict() - for j in range(path.shape[1]): - name_1 = ((os.path.basename(path[0, j])).split(".")[0]).split("_") - name_2 = ((os.path.basename(path[1, j])).split(".")[0]).split("_") - # dirname = os.path.dirname(path[i]) - if name_1[-1] == name_2[-1]: - name = name_1[-1] - sampling_rate = read_hdf5("timeCorrection_" + name, filepath, "sampling_rate")[0] - - for i in range(len(storesList)): - if ( - "control_" + name.lower() in storesList[i].lower() - or "signal_" + name.lower() in storesList[i].lower() - ): # changes done - data, timestampNew = eliminateData( - filepath, timeForLightsTurnOn, storesList[i], sampling_rate, name - ) - write_hdf5(data, storesList[i], filepath, "data") - else: - if "control" in storesList[i].lower() or "signal" in storesList[i].lower(): - continue - else: - ts = eliminateTs(filepath, timeForLightsTurnOn, storesList[i], sampling_rate, name) - write_hdf5(ts, storesList[i] + "_" + name, filepath, "ts") - - # timestamp_dict[name] = timestampNew - write_hdf5(timestampNew, "timeCorrection_" + name, filepath, "timestampNew") - else: - logger.error("Error in naming convention of files or Error in storesList file") - raise Exception("Error in naming convention of files or Error in storesList file") - logger.info("Timestamps processed, artifacts are removed and good chunks are concatenated.") - - -# function to compute deltaF/F using fitted control channel and filtered signal channel -def deltaFF(signal, control): - - res = np.subtract(signal, control) - normData = np.divide(res, control) - # deltaFF = normData - normData = normData * 100 - - return normData - - -# function to fit control channel to signal channel -def controlFit(control, signal): - - p = np.polyfit(control, signal, 1) - arr = (p[0] * control) + p[1] - return arr - - -def filterSignal(filter_window, signal): - if filter_window == 0: - return signal - elif filter_window > 1: - b = np.divide(np.ones((filter_window,)), filter_window) - a = 1 - filtered_signal = ss.filtfilt(b, a, signal) - return filtered_signal - else: - raise Exception("Moving average filter window value is not correct.") - - -# function to filter control and signal channel, also execute above two function : controlFit and deltaFF -# function will also take care if there is only signal channel and no control channel -# if there is only signal channel, z-score will be computed using just signal channel -def execute_controlFit_dff(control, 
signal, isosbestic_control, filter_window): - - if isosbestic_control == False: - signal_smooth = filterSignal(filter_window, signal) # ss.filtfilt(b, a, signal) - control_fit = controlFit(control, signal_smooth) - norm_data = deltaFF(signal_smooth, control_fit) - else: - control_smooth = filterSignal(filter_window, control) # ss.filtfilt(b, a, control) - signal_smooth = filterSignal(filter_window, signal) # ss.filtfilt(b, a, signal) - control_fit = controlFit(control_smooth, signal_smooth) - norm_data = deltaFF(signal_smooth, control_fit) - - return norm_data, control_fit - - -# function to compute z-score based on z-score computation method -def z_score_computation(dff, timestamps, inputParameters): - - zscore_method = inputParameters["zscore_method"] - baseline_start, baseline_end = inputParameters["baselineWindowStart"], inputParameters["baselineWindowEnd"] - - if zscore_method == "standard z-score": - numerator = np.subtract(dff, np.nanmean(dff)) - zscore = np.divide(numerator, np.nanstd(dff)) - elif zscore_method == "baseline z-score": - idx = np.where((timestamps > baseline_start) & (timestamps < baseline_end))[0] - if idx.shape[0] == 0: - logger.error( - "Baseline Window Parameters for baseline z-score computation zscore_method \ - are not correct." - ) - raise Exception( - "Baseline Window Parameters for baseline z-score computation zscore_method \ - are not correct." - ) - else: - baseline_mean = np.nanmean(dff[idx]) - baseline_std = np.nanstd(dff[idx]) - numerator = np.subtract(dff, baseline_mean) - zscore = np.divide(numerator, baseline_std) - else: - median = np.median(dff) - mad = np.median(np.abs(dff - median)) - numerator = 0.6745 * (dff - median) - zscore = np.divide(numerator, mad) - - return zscore - - -# helper function to compute z-score and deltaF/F -def helper_z_score(control, signal, filepath, name, inputParameters): # helper_z_score(control_smooth, signal_smooth): - - removeArtifacts = inputParameters["removeArtifacts"] - artifactsRemovalMethod = inputParameters["artifactsRemovalMethod"] - filter_window = inputParameters["filter_window"] - - isosbestic_control = inputParameters["isosbestic_control"] - tsNew = read_hdf5("timeCorrection_" + name, filepath, "timestampNew") - coords_path = os.path.join(filepath, "coordsForPreProcessing_" + name + ".npy") - - logger.info("Remove Artifacts : ", removeArtifacts) - - if (control == 0).all() == True: - control = np.zeros(tsNew.shape[0]) - - z_score_arr = np.array([]) - norm_data_arr = np.full(tsNew.shape[0], np.nan) - control_fit_arr = np.full(tsNew.shape[0], np.nan) - temp_control_arr = np.full(tsNew.shape[0], np.nan) - - if removeArtifacts == True: - coords = fetchCoords(filepath, name, tsNew) - - # for artifacts removal, each chunk which was selected by user is being processed individually and then - # z-score is calculated - for i in range(coords.shape[0]): - tsNew_index = np.where((tsNew > coords[i, 0]) & (tsNew < coords[i, 1]))[0] - if isosbestic_control == False: - control_arr = helper_create_control_channel(signal[tsNew_index], tsNew[tsNew_index], window=101) - signal_arr = signal[tsNew_index] - norm_data, control_fit = execute_controlFit_dff( - control_arr, signal_arr, isosbestic_control, filter_window - ) - temp_control_arr[tsNew_index] = control_arr - if i < coords.shape[0] - 1: - blank_index = np.where((tsNew > coords[i, 1]) & (tsNew < coords[i + 1, 0]))[0] - temp_control_arr[blank_index] = np.full(blank_index.shape[0], np.nan) - else: - control_arr = control[tsNew_index] - signal_arr = signal[tsNew_index] - 
norm_data, control_fit = execute_controlFit_dff( - control_arr, signal_arr, isosbestic_control, filter_window - ) - norm_data_arr[tsNew_index] = norm_data - control_fit_arr[tsNew_index] = control_fit - - if artifactsRemovalMethod == "concatenate": - norm_data_arr = norm_data_arr[~np.isnan(norm_data_arr)] - control_fit_arr = control_fit_arr[~np.isnan(control_fit_arr)] - z_score = z_score_computation(norm_data_arr, tsNew, inputParameters) - z_score_arr = np.concatenate((z_score_arr, z_score)) - else: - tsNew_index = np.arange(tsNew.shape[0]) - norm_data, control_fit = execute_controlFit_dff(control, signal, isosbestic_control, filter_window) - z_score = z_score_computation(norm_data, tsNew, inputParameters) - z_score_arr = np.concatenate((z_score_arr, z_score)) - norm_data_arr[tsNew_index] = norm_data # np.concatenate((norm_data_arr, norm_data)) - control_fit_arr[tsNew_index] = control_fit # np.concatenate((control_fit_arr, control_fit)) - - # handle the case if there are chunks being cut in the front and the end - if isosbestic_control == False and removeArtifacts == True: - coords = coords.flatten() - # front chunk - idx = np.where((tsNew >= tsNew[0]) & (tsNew < coords[0]))[0] - temp_control_arr[idx] = np.full(idx.shape[0], np.nan) - # end chunk - idx = np.where((tsNew > coords[-1]) & (tsNew <= tsNew[-1]))[0] - temp_control_arr[idx] = np.full(idx.shape[0], np.nan) - write_hdf5(temp_control_arr, "control_" + name, filepath, "data") - - return z_score_arr, norm_data_arr, control_fit_arr - - -# compute z-score and deltaF/F and save it to hdf5 file -def compute_z_score(filepath, inputParameters): - - logger.debug(f"Computing z-score for each of the data in {filepath}") - remove_artifacts = inputParameters["removeArtifacts"] - - path_1 = find_files(filepath, "control_*", ignore_case=True) # glob.glob(os.path.join(filepath, 'control*')) - path_2 = find_files(filepath, "signal_*", ignore_case=True) # glob.glob(os.path.join(filepath, 'signal*')) - - path = sorted(path_1 + path_2, key=str.casefold) - - b = np.divide(np.ones((100,)), 100) - a = 1 - - if len(path) % 2 != 0: - logger.error("There are not equal number of Control and Signal data") - raise Exception("There are not equal number of Control and Signal data") - - path = np.asarray(path).reshape(2, -1) - - for i in range(path.shape[1]): - name_1 = ((os.path.basename(path[0, i])).split(".")[0]).split("_") - name_2 = ((os.path.basename(path[1, i])).split(".")[0]).split("_") - # dirname = os.path.dirname(path[i]) - - if name_1[-1] == name_2[-1]: - name = name_1[-1] - control = read_hdf5("", path[0, i], "data").reshape(-1) - signal = read_hdf5("", path[1, i], "data").reshape(-1) - # control_smooth = ss.filtfilt(b, a, control) - # signal_smooth = ss.filtfilt(b, a, signal) - # _score, dff = helper_z_score(control_smooth, signal_smooth) - z_score, dff, control_fit = helper_z_score(control, signal, filepath, name, inputParameters) - if remove_artifacts == True: - write_hdf5(z_score, "z_score_" + name, filepath, "data") - write_hdf5(dff, "dff_" + name, filepath, "data") - write_hdf5(control_fit, "cntrl_sig_fit_" + name, filepath, "data") - else: - write_hdf5(z_score, "z_score_" + name, filepath, "data") - write_hdf5(dff, "dff_" + name, filepath, "data") - write_hdf5(control_fit, "cntrl_sig_fit_" + name, filepath, "data") - else: - logger.error("Error in naming convention of files or Error in storesList file") - raise Exception("Error in naming convention of files or Error in storesList file") - - logger.info(f"z-score for the data in {filepath} 
computed.") - - # function to execute timestamps corrections using functions timestampCorrection and decide_naming_convention_and_applyCorrection def execute_timestamp_correction(folderNames, inputParameters): @@ -1014,7 +224,7 @@ def execute_timestamp_correction(folderNames, inputParameters): for i in range(len(folderNames)): filepath = folderNames[i] storesListPath = takeOnlyDirs(glob.glob(os.path.join(filepath, "*_output_*"))) - cond = check_TDT(folderNames[i]) + mode = "tdt" if check_TDT(folderNames[i]) else "csv" logger.debug(f"Timestamps corrections started for {filepath}") for j in range(len(storesListPath)): filepath = storesListPath[j] @@ -1025,15 +235,36 @@ def execute_timestamp_correction(folderNames, inputParameters): if isosbestic_control == False: storesList = add_control_channel(filepath, storesList) - if cond == True: - timestampCorrection_tdt(filepath, timeForLightsTurnOn, storesList) - else: - timestampCorrection_csv(filepath, timeForLightsTurnOn, storesList) - - for k in range(storesList.shape[1]): - decide_naming_convention_and_applyCorrection( - filepath, timeForLightsTurnOn, storesList[0, k], storesList[1, k], storesList - ) + control_and_signal_dicts = read_control_and_signal(filepath, storesList) + name_to_data, name_to_timestamps, name_to_sampling_rate, name_to_npoints = control_and_signal_dicts + name_to_timestamps_ttl = read_ttl(filepath, storesList) + + timestamps_dicts = correct_timestamps( + timeForLightsTurnOn, + storesList, + name_to_timestamps, + name_to_data, + name_to_sampling_rate, + name_to_npoints, + name_to_timestamps_ttl, + mode=mode, + ) + ( + name_to_corrected_timestamps, + name_to_correctionIndex, + name_to_corrected_data, + compound_name_to_corrected_ttl_timestamps, + ) = timestamps_dicts + + write_corrected_timestamps( + filepath, + name_to_corrected_timestamps, + name_to_timestamps, + name_to_sampling_rate, + name_to_correctionIndex, + ) + write_corrected_data(filepath, name_to_corrected_data) + write_corrected_ttl_timestamps(filepath, compound_name_to_corrected_ttl_timestamps) # check if isosbestic control is false and also if new control channel is added if isosbestic_control == False: @@ -1044,45 +275,133 @@ def execute_timestamp_correction(folderNames, inputParameters): logger.info(f"Timestamps corrections finished for {filepath}") -# for combining data, reading storeslist file from both data and create a new storeslist array -def check_storeslistfile(folderNames): - storesList = np.array([[], []]) +# function to compute z-score and deltaF/F +def execute_zscore(folderNames, inputParameters): + + plot_zScore_dff = inputParameters["plot_zScore_dff"] + combine_data = inputParameters["combine_data"] + remove_artifacts = inputParameters["removeArtifacts"] + artifactsRemovalMethod = inputParameters["artifactsRemovalMethod"] + filter_window = inputParameters["filter_window"] + isosbestic_control = inputParameters["isosbestic_control"] + zscore_method = inputParameters["zscore_method"] + baseline_start, baseline_end = inputParameters["baselineWindowStart"], inputParameters["baselineWindowEnd"] + + storesListPath = [] for i in range(len(folderNames)): - filepath = folderNames[i] - storesListPath = takeOnlyDirs(glob.glob(os.path.join(filepath, "*_output_*"))) - for j in range(len(storesListPath)): - filepath = storesListPath[j] - storesList = np.concatenate( - ( - storesList, - np.genfromtxt(os.path.join(filepath, "storesList.csv"), dtype="str", delimiter=",").reshape(2, -1), - ), - axis=1, + if combine_data == True: + 
storesListPath.append([folderNames[i][0]]) + else: + filepath = folderNames[i] + storesListPath.append(takeOnlyDirs(glob.glob(os.path.join(filepath, "*_output_*")))) + storesListPath = np.concatenate(storesListPath) + + for j in range(len(storesListPath)): + filepath = storesListPath[j] + logger.debug(f"Computing z-score for each of the data in {filepath}") + path_1 = find_files(filepath, "control_*", ignore_case=True) # glob.glob(os.path.join(filepath, 'control*')) + path_2 = find_files(filepath, "signal_*", ignore_case=True) # glob.glob(os.path.join(filepath, 'signal*')) + path = sorted(path_1 + path_2, key=str.casefold) + if len(path) % 2 != 0: + logger.error("There are not equal number of Control and Signal data") + raise Exception("There are not equal number of Control and Signal data") + path = np.asarray(path).reshape(2, -1) + + for i in range(path.shape[1]): + name_1 = ((os.path.basename(path[0, i])).split(".")[0]).split("_") + name_2 = ((os.path.basename(path[1, i])).split(".")[0]).split("_") + if name_1[-1] != name_2[-1]: + logger.error("Error in naming convention of files or Error in storesList file") + raise Exception("Error in naming convention of files or Error in storesList file") + name = name_1[-1] + + control, signal, tsNew = read_corrected_data(path[0, i], path[1, i], filepath, name) + coords = get_coords(filepath, name, tsNew, remove_artifacts) + z_score, dff, control_fit, temp_control_arr = compute_z_score( + control, + signal, + tsNew, + coords, + artifactsRemovalMethod, + filter_window, + isosbestic_control, + zscore_method, + baseline_start, + baseline_end, ) + write_zscore(filepath, name, z_score, dff, control_fit, temp_control_arr) - storesList = np.unique(storesList, axis=1) + logger.info(f"z-score for the data in {filepath} computed.") - return storesList + if not remove_artifacts: + visualizeControlAndSignal(filepath, removeArtifacts=remove_artifacts) + if plot_zScore_dff == "z_score": + visualize_z_score(filepath) + if plot_zScore_dff == "dff": + visualize_dff(filepath) + if plot_zScore_dff == "Both": + visualize_z_score(filepath) + visualize_dff(filepath) -def get_all_stores_for_combining_data(folderNames): - op = [] - for i in range(100): - temp = [] - match = r"[\s\S]*" + "_output_" + str(i) - for j in folderNames: - temp.append(re.findall(match, j)) - temp = sorted(list(np.concatenate(temp).flatten()), key=str.casefold) - if len(temp) > 0: - op.append(temp) + writeToFile(str(10 + ((inputParameters["step"] + 1) * 10)) + "\n") + inputParameters["step"] += 1 + + plt.show() + logger.info("Z-score computation completed.") - return op + +# function to remove artifacts from z-score data +def execute_artifact_removal(folderNames, inputParameters): + + timeForLightsTurnOn = inputParameters["timeForLightsTurnOn"] + artifactsRemovalMethod = inputParameters["artifactsRemovalMethod"] + combine_data = inputParameters["combine_data"] + + storesListPath = [] + for i in range(len(folderNames)): + if combine_data == True: + storesListPath.append([folderNames[i][0]]) + else: + filepath = folderNames[i] + storesListPath.append(takeOnlyDirs(glob.glob(os.path.join(filepath, "*_output_*")))) + + storesListPath = np.concatenate(storesListPath) + + for j in range(len(storesListPath)): + filepath = storesListPath[j] + storesList = np.genfromtxt(os.path.join(filepath, "storesList.csv"), dtype="str", delimiter=",").reshape(2, -1) + + name_to_data = read_corrected_data_dict(filepath, storesList) + pair_name_to_tsNew, pair_name_to_sampling_rate = 
read_corrected_timestamps_pairwise(filepath) + pair_name_to_coords = read_coords_pairwise(filepath, pair_name_to_tsNew) + compound_name_to_ttl_timestamps = read_corrected_ttl_timestamps(filepath, storesList) + + logger.debug("Removing artifacts from the data...") + name_to_data, pair_name_to_timestamps, compound_name_to_ttl_timestamps = remove_artifacts( + timeForLightsTurnOn, + storesList, + pair_name_to_tsNew, + pair_name_to_sampling_rate, + pair_name_to_coords, + name_to_data, + compound_name_to_ttl_timestamps, + method=artifactsRemovalMethod, + ) + + write_artifact_removal(filepath, name_to_data, pair_name_to_timestamps, compound_name_to_ttl_timestamps) + visualizeControlAndSignal(filepath, removeArtifacts=True) + + writeToFile(str(10 + ((inputParameters["step"] + 1) * 10)) + "\n") + inputParameters["step"] += 1 + + plt.show() + logger.info("Artifact removal completed.") # function to combine data when there are two different data files for the same recording session # it will combine the data, do timestamps processing and save the combined data in the first output folder. -def combineData(folderNames, inputParameters, storesList): - +def execute_combine_data(folderNames, inputParameters, storesList): logger.debug("Combining Data from different data files...") timeForLightsTurnOn = inputParameters["timeForLightsTurnOn"] op_folder = [] @@ -1117,64 +436,12 @@ def combineData(folderNames, inputParameters, storesList): op = get_all_stores_for_combining_data(op_folder) # processing timestamps for combining the data - processTimestampsForCombiningData(op, timeForLightsTurnOn, storesList, sampling_rate[0]) + combine_data(op, timeForLightsTurnOn, storesList, sampling_rate[0]) logger.info("Data is combined from different data files.") return op -# function to compute z-score and deltaF/F using functions : compute_z_score and/or processTimestampsForArtifacts -def execute_zscore(folderNames, inputParameters): - - timeForLightsTurnOn = inputParameters["timeForLightsTurnOn"] - remove_artifacts = inputParameters["removeArtifacts"] - artifactsRemovalMethod = inputParameters["artifactsRemovalMethod"] - plot_zScore_dff = inputParameters["plot_zScore_dff"] - combine_data = inputParameters["combine_data"] - isosbestic_control = inputParameters["isosbestic_control"] - - storesListPath = [] - for i in range(len(folderNames)): - if combine_data == True: - storesListPath.append([folderNames[i][0]]) - else: - filepath = folderNames[i] - storesListPath.append(takeOnlyDirs(glob.glob(os.path.join(filepath, "*_output_*")))) - - storesListPath = np.concatenate(storesListPath) - - for j in range(len(storesListPath)): - filepath = storesListPath[j] - storesList = np.genfromtxt(os.path.join(filepath, "storesList.csv"), dtype="str", delimiter=",").reshape(2, -1) - - if remove_artifacts == True: - logger.debug("Removing Artifacts from the data and correcting timestamps...") - compute_z_score(filepath, inputParameters) - if artifactsRemovalMethod == "concatenate": - processTimestampsForArtifacts(filepath, timeForLightsTurnOn, storesList) - else: - addingNaNtoChunksWithArtifacts(filepath, storesList) - visualizeControlAndSignal(filepath, remove_artifacts) - logger.info("Artifacts from the data are removed and timestamps are corrected.") - else: - compute_z_score(filepath, inputParameters) - visualizeControlAndSignal(filepath, remove_artifacts) - - if plot_zScore_dff == "z_score": - visualize_z_score(filepath) - if plot_zScore_dff == "dff": - visualize_dff(filepath) - if plot_zScore_dff == "Both": - 
visualize_z_score(filepath) - visualize_dff(filepath) - - writeToFile(str(10 + ((inputParameters["step"] + 1) * 10)) + "\n") - inputParameters["step"] += 1 - - plt.show() - logger.info("Signal data and event timestamps are extracted.") - - def extractTsAndSignal(inputParameters): logger.debug("Extracting signal data and event timestamps...") @@ -1203,13 +470,17 @@ def extractTsAndSignal(inputParameters): writeToFile(str((pbMaxValue + 1) * 10) + "\n" + str(10) + "\n") execute_timestamp_correction(folderNames, inputParameters) execute_zscore(folderNames, inputParameters) + if remove_artifacts == True: + execute_artifact_removal(folderNames, inputParameters) else: pbMaxValue = 1 + len(folderNames) writeToFile(str((pbMaxValue) * 10) + "\n" + str(10) + "\n") execute_timestamp_correction(folderNames, inputParameters) storesList = check_storeslistfile(folderNames) - op_folder = combineData(folderNames, inputParameters, storesList) + op_folder = execute_combine_data(folderNames, inputParameters, storesList) execute_zscore(op_folder, inputParameters) + if remove_artifacts == True: + execute_artifact_removal(op_folder, inputParameters) def main(input_parameters): diff --git a/step4_data_flow_analysis.md b/step4_data_flow_analysis.md new file mode 100644 index 0000000..d86e938 --- /dev/null +++ b/step4_data_flow_analysis.md @@ -0,0 +1,348 @@ +# Step 4 (preprocess.py) Data Flow Analysis + +## Overview + +Step 4 processes timestamp-corrected photometry data and computes normalized signals (ΔF/F and z-scores). It handles artifact removal, data combination from multiple sessions, and generates quality control visualizations. + +## High-Level Data Flow + +```mermaid +flowchart TD + A[Entry: extractTsAndSignal] --> B{combine_data?} + + B -->|False| C[execute_timestamp_correction] + B -->|True| D[execute_timestamp_correction] + + C --> E[execute_zscore] + + D --> F[check_storeslistfile] + F --> G[combineData] + G --> H[execute_zscore] + + E --> I[Output: z_score, dff, cntrl_sig_fit HDF5 files] + H --> I + + style A fill:#e1f5ff + style I fill:#d4edda +``` + +## Main Processing Paths + +### Entry Point +**`extractTsAndSignal(inputParameters)`** (line 1178) is the main entry point called by the GUI or API. + +### Path 1: Normal Processing (combine_data = False) +1. `execute_timestamp_correction()` → Correct timestamps and align data +2. `execute_zscore()` → Compute z-scores and ΔF/F + +### Path 2: Combined Data Processing (combine_data = True) +1. `execute_timestamp_correction()` → Correct timestamps for each file +2. `check_storeslistfile()` → Merge store lists from multiple files +3. `combineData()` → Combine data from multiple recording sessions +4. `execute_zscore()` → Compute z-scores and ΔF/F on combined data + +## Detailed Processing Stages + +### Stage 1: Timestamp Correction + +```mermaid +flowchart LR + A[Raw HDF5 files] --> B[Read storesList.csv] + B --> C{isosbestic_control?} + C -->|No| D[add_control_channel] + C -->|Yes| E[timestampCorrection_tdt/csv] + D --> E + E --> F[Eliminate first N seconds] + F --> G[decide_naming_convention_and_applyCorrection] + G --> H[applyCorrection for each store] + H --> I{isosbestic_control?} + I -->|No| J[create_control_channel via curve fitting] + I -->|Yes| K[timeCorrection_*.hdf5 files] + J --> K + + style A fill:#e1f5ff + style K fill:#d4edda +``` + +#### Function: `execute_timestamp_correction(folderNames, inputParameters)` + +**Input:** +- Raw HDF5 files from extractors: `control_*.hdf5`, `signal_*.hdf5`, `event_*.hdf5` + +**Process:** +1. 
For each session folder: + - Read `storesList.csv` (mapping of raw names to semantic names) + - If no isosbestic control: `add_control_channel()` creates placeholder control files + - **`timestampCorrection_tdt()`** or **`timestampCorrection_csv()`**: + - Eliminates first N seconds (`timeForLightsTurnOn`) + - For TDT: expands timestamps from block timestamps + sampling rate + - For CSV: uses timestamps as-is + - Writes `timeCorrection_*.hdf5` with keys: `timestampNew`, `correctionIndex`, `sampling_rate` + - **`decide_naming_convention_and_applyCorrection()`**: + - For each store, calls `applyCorrection()` to crop data using `correctionIndex` + - For control/signal channels: crops data arrays + - For event channels: subtracts time offset from timestamps + - If no isosbestic control: **`create_control_channel()`** generates synthetic control via curve fitting + +**Output:** +- Timestamp-corrected HDF5 files with trimmed data +- `timeCorrection_*.hdf5` files containing corrected timestamps + +### Stage 2: Z-Score Computation + +```mermaid +flowchart TD + A[Timestamp-corrected HDF5] --> B[compute_z_score] + B --> C{removeArtifacts?} + + C -->|No| D[helper_z_score: full data] + C -->|Yes| E[helper_z_score: chunk-by-chunk] + + D --> F[filterSignal] + E --> F + + F --> G[controlFit: linear regression] + G --> H[deltaFF: compute ΔF/F] + H --> I[z_score_computation] + + I --> J{removeArtifacts?} + + J -->|No| K[Write z_score, dff, cntrl_sig_fit] + J -->|Yes| L{artifactsRemovalMethod?} + + L -->|concatenate| M[processTimestampsForArtifacts] + L -->|NaN| N[addingNaNtoChunksWithArtifacts] + + M --> K + N --> K + + K --> O[visualizeControlAndSignal] + + style A fill:#e1f5ff + style K fill:#d4edda + style O fill:#fff3cd +``` + +#### Function: `execute_zscore(folderNames, inputParameters)` + +**Input:** +- Timestamp-corrected HDF5 files + +**Process:** +1. 
For each output folder: + + **`compute_z_score(filepath, inputParameters)`**: + - For each control/signal pair: + - **`helper_z_score(control, signal, filepath, name, inputParameters)`**: + + **Without artifacts removal:** + - `execute_controlFit_dff()`: Filter signals → fit control to signal → compute ΔF/F + - `z_score_computation()`: Compute z-score from ΔF/F + + **With artifacts removal:** + - For each user-selected chunk (from `coordsForPreProcessing_*.npy`): + - If no isosbestic: `helper_create_control_channel()` creates synthetic control + - `execute_controlFit_dff()` on chunk + - Concatenate or NaN-fill between chunks + - `z_score_computation()` on processed data + + - Writes: `z_score_*.hdf5`, `dff_*.hdf5`, `cntrl_sig_fit_*.hdf5` + + **If artifacts removal with concatenate method:** + - **`processTimestampsForArtifacts()`**: + - `eliminateData()`: Concatenates good chunks, adjusts timestamps to be continuous + - `eliminateTs()`: Aligns event timestamps with new timeline + - Overwrites data files with concatenated versions + + **If artifacts removal with NaN method:** + - **`addingNaNtoChunksWithArtifacts()`**: + - `addingNaNValues()`: Replaces bad chunks with NaN + - `removeTTLs()`: Filters event timestamps to keep only valid times + + - **`visualizeControlAndSignal()`**: Plots control, signal, cntrl_sig_fit for QC + +**Output:** +- `z_score_*.hdf5` (z-scored signal) +- `dff_*.hdf5` (ΔF/F) +- `cntrl_sig_fit_*.hdf5` (fitted control channel) + +## Key Data Transformations + +### Signal Processing Pipeline + +```mermaid +flowchart LR + A[Raw Signal] --> B[filterSignal: Moving Average] + C[Raw Control] --> D[filterSignal: Moving Average] + + B --> E[controlFit: Linear Regression] + D --> E + + E --> F[control_fit = p0*control + p1] + F --> G[deltaFF] + + B --> G + + G --> H[ΔF/F = signal - control_fit / control_fit * 100] + H --> I[z_score_computation] + + I --> J{zscore_method?} + J -->|standard| K[z = ΔF/F - mean / std] + J -->|baseline| L[z = ΔF/F - baseline_mean / baseline_std] + J -->|robust| M[z = 0.6745 * ΔF/F - median / MAD] + + K --> N[Z-Score Output] + L --> N + M --> N + + style A fill:#e1f5ff + style C fill:#e1f5ff + style N fill:#d4edda +``` + +### Transformation Functions + +1. **`filterSignal(filter_window, signal)`** (line 822) + - Applies moving average filter with configurable window + - Uses `scipy.signal.filtfilt` for zero-phase filtering + +2. **`controlFit(control, signal)`** (line 815) + - Linear regression: fits control to signal + - Returns: `fitted_control = p[0] * control + p[1]` + +3. **`deltaFF(signal, control)`** (line 804) + - Formula: `((signal - control) / control) * 100` + - Computes normalized fluorescence change + +4. 
**`z_score_computation(dff, timestamps, inputParameters)`** (line 853) + - **Standard z-score:** `(ΔF/F - mean(ΔF/F)) / std(ΔF/F)` + - **Baseline z-score:** `(ΔF/F - mean(baseline)) / std(baseline)` + - **Robust z-score:** `0.6745 * (ΔF/F - median) / MAD` + +## Artifact Removal Workflow + +### Interactive Artifact Selection + +The `visualize()` function (line 469) provides an interactive matplotlib plot: +- **Space key:** Mark artifact boundary (vertical line drawn) +- **'d' key:** Delete last marked boundary +- **Close plot:** Save coordinates to `coordsForPreProcessing_*.npy` + +### Two Removal Methods + +**Concatenate Method:** +- Removes artifact chunks completely +- Concatenates good chunks end-to-end +- Adjusts timestamps to be continuous +- Event timestamps realigned to new timeline + +**NaN Method:** +- Replaces artifact chunks with NaN values +- Preserves original timeline +- Filters out event timestamps in artifact regions + +## Supporting Functions + +### Control Channel Creation + +**`helper_create_control_channel(signal, timestamps, window)`** (line 69) +- Used when no isosbestic control is available +- Applies Savitzky-Golay filter to signal +- Fits to exponential function: `f(x) = a + b * exp(-(1/c) * x)` +- Returns synthetic control channel + +### Data Combination + +**`combineData(folderNames, inputParameters, storesList)`** (line 1084) +- Merges data from multiple recording sessions +- Validates that sampling rates match across sessions +- Calls `processTimestampsForCombiningData()` to align timelines +- Saves combined data to first output folder + +### Coordinate Fetching + +**`fetchCoords(filepath, naming, data)`** (line 610) +- Reads `coordsForPreProcessing_*.npy` (artifact boundary coordinates) +- If file doesn't exist: uses `[0, data[-1]]` (entire recording) +- Validates even number of coordinates (pairs of boundaries) +- Returns reshaped array of coordinate pairs + +## File I/O Summary + +### Files Read + +| File Pattern | Content | Source | +|-------------|---------|--------| +| `control_*.hdf5` | Control channel data | Extractors (Step 3) | +| `signal_*.hdf5` | Signal channel data | Extractors (Step 3) | +| `event_*.hdf5` | Event timestamps | Extractors (Step 3) | +| `storesList.csv` | Channel name mapping | Step 2 | +| `coordsForPreProcessing_*.npy` | Artifact boundaries | User selection (optional) | + +### Files Written + +| File Pattern | Content | Keys | +|-------------|---------|------| +| `timeCorrection_*.hdf5` | Corrected timestamps | `timestampNew`, `correctionIndex`, `sampling_rate`, `timeRecStart` (TDT only) | +| `z_score_*.hdf5` | Z-scored signal | `data` | +| `dff_*.hdf5` | ΔF/F signal | `data` | +| `cntrl_sig_fit_*.hdf5` | Fitted control | `data` | +| `event_*_*.hdf5` | Corrected event timestamps | `ts` | + +## Key Parameters from inputParameters + +| Parameter | Purpose | Default/Options | +|-----------|---------|-----------------| +| `timeForLightsTurnOn` | Seconds to eliminate from start | 1 | +| `filter_window` | Moving average window size | 100 | +| `isosbestic_control` | Use isosbestic control channel? | True/False | +| `removeArtifacts` | Enable artifact removal? | True/False | +| `artifactsRemovalMethod` | How to handle artifacts | "concatenate" / "NaN" | +| `zscore_method` | Z-score computation method | "standard z-score" / "baseline z-score" / "robust z-score" | +| `baselineWindowStart` | Baseline window start (seconds) | 0 | +| `baselineWindowEnd` | Baseline window end (seconds) | 0 | +| `combine_data` | Combine multiple recordings? 
| True/False | + +## Architecture Notes for Refactoring + +### Current Coupling Issues + +1. **GUI Progress Tracking:** `writeToFile()` writes to `~/pbSteps.txt` for progress bar updates (lines 36-38, 1042, 1171, 1203, 1208, 1220) +2. **Interactive Plotting:** `visualize()` requires user interaction (matplotlib event handlers) +3. **File Path Assumptions:** Hard-coded path patterns (`*_output_*`, naming conventions) +4. **Mixed Responsibilities:** Single functions handle both computation and I/O + +### Recommended Separation Points + +**Backend Analysis Layer Should Include:** +- `filterSignal()` - pure signal processing +- `controlFit()` - pure regression +- `deltaFF()` - pure computation +- `z_score_computation()` - pure statistical computation +- `helper_create_control_channel()` - algorithmic control generation +- Core timestamp correction logic (separated from I/O) +- Core artifact removal logic (separated from I/O) + +**Data I/O Layer Should Include:** +- `read_hdf5()`, `write_hdf5()` - file operations +- Store list reading/writing +- Coordinate file handling +- HDF5 file discovery and path management + +**Frontend Visualization Layer Should Include:** +- `visualize()` - interactive artifact selection +- `visualizeControlAndSignal()` - QC plots +- `visualize_z_score()`, `visualize_dff()` - result visualization +- Progress tracking callbacks (replace `writeToFile()`) + +### Potential Refactoring Strategy + +1. **Extract pure computation functions** into a `signal_processing` module +2. **Create data models** (dataclasses) for: + - TimeCorrectionResult + - ProcessedSignal (with z_score, dff, control_fit) + - ArtifactRegions +3. **Separate I/O operations** into `io_utils` module with consistent interfaces +4. **Create processing pipelines** that accept data objects, return data objects +5. **Move visualization to separate module** with callbacks for progress/interaction +6. **Use dependency injection** for progress callbacks instead of hard-coded file writes diff --git a/timestamp_correction_analysis.md b/timestamp_correction_analysis.md new file mode 100644 index 0000000..121aa3f --- /dev/null +++ b/timestamp_correction_analysis.md @@ -0,0 +1,723 @@ +# Timestamp Correction Module Analysis + +## Overview + +The `timestamp_correction.py` module handles the correction of timestamps for photometry data, including: +- Eliminating the first N seconds of recording (light stabilization period) +- Expanding TDT block timestamps into continuous timestamps +- Creating synthetic control channels when no isosbestic control is present +- Applying corrections to both data channels and event markers + +## Module Structure + +### Entry Point from preprocess.py + +```python +execute_timestamp_correction(folderNames, inputParameters) # preprocess.py:212 +``` + +This orchestrator loops through all session folders and calls functions in this module. + +## Two-Phase Control Channel Creation Pattern + +### Understanding add_control_channel vs create_control_channel + +These two functions work together in a **two-phase process** to handle synthetic control channel generation. They are **not redundant** but serve distinct purposes: + +#### Phase 1: `add_control_channel` (Called BEFORE timestamp correction) + +**Execution:** Line 229 in `execute_timestamp_correction` + +**Purpose:** Create **PLACEHOLDER** control files to satisfy workflow requirements + +**What it does:** +1. Validates that if `isosbestic_control=False`, no real control channels exist +2. 
For each signal channel without a matching control: + - Copies the raw signal HDF5 file to `cntrl{i}.hdf5` (placeholder) + - Adds entry to storesList: `[["cntrl{i}"], ["control_{region}"]]` +3. Saves updated `storesList.csv` + +**Files created:** +- `cntrl0.hdf5`, `cntrl1.hdf5`, etc. (copies of **RAW** signal data) +- Updated `storesList.csv` with placeholder entries + +**Why it's needed:** +- Timestamp correction workflow expects **paired** control/signal channels in storesList +- Without placeholders, the pairing logic in `timestampCorrection_xxx` and `check_cntrl_sig_length` would fail +- The placeholder **data is never actually used** - it just satisfies structural requirements + +#### Phase 2: `create_control_channel` (Called AFTER timestamp correction) + +**Execution:** Line 243 in `execute_timestamp_correction` + +**Purpose:** Generate **ACTUAL** synthetic control via curve fitting and overwrite placeholders + +**What it does:** +1. Looks for placeholder files (checks: `"control" in event_name.lower() and "cntrl" in event.lower()`) +2. Reads the **CORRECTED** signal data: `signal_{region}.hdf5` (after timestamp correction) +3. Calls `helper_create_control_channel()` to: + - Apply Savitzky-Golay filter to cleaned signal + - Fit to exponential function: `f(x) = a + b * exp(-(1/c) * x)` +4. **OVERWRITES** the placeholder `control_{region}.hdf5` with real synthetic control +5. Also exports to CSV format (legacy) + +**Files written:** +- `control_{region}.hdf5` → `data` (replaces placeholder with curve-fitted control) +- `{raw_name}.csv` (timestamps, data, sampling_rate columns) + +**Why it's separate:** +- Requires **timestamp-corrected** signal data (doesn't exist until after lines 232-239) +- Curve fitting algorithm needs clean timestamps (first N seconds eliminated) +- Cannot be done before timestamp correction without re-correcting the synthetic control + +#### Execution Timeline + +```python +# When isosbestic_control == False: + +# ========== PHASE 1: BEFORE TIMESTAMP CORRECTION ========== +# Line 229: Create placeholders (just file copies) +storesList = add_control_channel(filepath, storesList) +# Result: storesList now has paired structure +# [["Dv1A", "cntrl0"], ["signal_dms", "control_dms"]] +# Files: cntrl0.hdf5 (copy of raw signal, never used) + +# ========== TIMESTAMP CORRECTION PHASE ========== +# Lines 232-234: Process both signal AND placeholder control +timestampCorrection_tdt(filepath, timeForLightsTurnOn, storesList) +# Result: Creates timeCorrection_dms.hdf5 with correctionIndex + +# Lines 236-239: Apply corrections to all channels +decide_naming_convention_and_applyCorrection(...) +# Result: signal_dms.hdf5 now contains corrected signal data +# control_dms.hdf5 still contains uncorrected placeholder copy + +# ========== PHASE 2: AFTER TIMESTAMP CORRECTION ========== +# Line 243: Generate REAL synthetic controls +create_control_channel(filepath, storesList, window=101) +# Result: control_dms.hdf5 OVERWRITTEN with curve-fitted synthetic control +# Now contains valid control data derived from corrected signal +``` + +#### Why This Design Exists + +This is a **chicken-and-egg problem solved with placeholders:** + +1. **Requirement:** Timestamp correction expects paired control/signal channels +2. **Constraint:** Synthetic control generation requires timestamp-corrected signal data +3. 
**Solution:** Create dummy placeholders → correct everything → replace placeholders with real data + +#### Visual Flow + +```mermaid +flowchart TD + A[isosbestic_control = False] --> B[add_control_channel] + B --> C[Copy signal.hdf5 to cntrl0.hdf5] + C --> D[Update storesList.csv] + + D --> E[timestampCorrection_xxx] + E --> F[Creates timeCorrection_dms.hdf5] + + F --> G[decide_naming_convention_and_applyCorrection] + G --> H[Corrects signal_dms.hdf5] + G --> I[Corrects control_dms.hdf5
still contains placeholder]
+
+    I --> J[create_control_channel]
+    J --> K[Read corrected signal_dms.hdf5]
+    K --> L[helper_create_control_channel curve fit]
+    L --> M[OVERWRITE control_dms.hdf5
with synthetic control] + + style C fill:#fff3cd + style I fill:#fff3cd + style M fill:#d4edda +``` + +#### Refactoring Opportunity + +This placeholder pattern is a **code smell** indicating potential design improvements: + +**Issues:** +1. **Unnecessary I/O:** Placeholder files are written and then overwritten +2. **Confusing flow:** Hard to understand that placeholders are temporary +3. **Tight coupling:** Timestamp correction assumes paired files exist +4. **Wasted computation:** Placeholder controls get timestamp-corrected unnecessarily + +**Potential Improvements:** + +**Option 1: Lazy Control Creation** +- Modify timestamp correction to handle missing controls gracefully +- Only create synthetic controls after all corrections complete +- Remove placeholder file creation entirely + +**Option 2: Data Structure Refactoring** +- Use a data structure that doesn't require physical paired files upfront +- Track "needs synthetic control" as metadata rather than file presence +- Generate and write controls only once at the end + +**Option 3: Two-Pass Workflow** +- First pass: Correct only signal channels +- Second pass: Generate synthetic controls from corrected signals +- Would require refactoring `check_cntrl_sig_length` and pairing logic + +## Function Catalog + +### 1. add_control_channel +**Location:** `timestamp_correction.py:20` +**Purpose:** Create placeholder control channel files when no isosbestic control exists + +```python +def add_control_channel(filepath, arr) -> arr +``` + +**Input:** +- `filepath`: Path to session output folder +- `arr`: 2D array `[[storenames], [storesList]]` from storesList.csv + +**Process:** +1. Validates that control/signal pairs match (raises error if mismatched) +2. For each signal channel without a matching control: + - Copies signal HDF5 file to `cntrl{i}.hdf5` (placeholder) + - Adds entry to storesList array: `[["cntrl{i}"], ["control_{region}"]]` +3. Writes updated storesList.csv + +**Output:** +- Updated `arr` with new control channel entries +- **Files Written:** Updated `storesList.csv`, copied `cntrl*.hdf5` files + +**I/O Summary:** +- **Reads:** Signal HDF5 files (via shutil.copyfile) +- **Writes:** `storesList.csv`, placeholder `cntrl*.hdf5` files + +--- + +### 2. timestampCorrection_csv +**Location:** `timestamp_correction.py:65` +**Purpose:** Correct timestamps for CSV-format data (Doric, NPM, custom CSV) + +```python +def timestampCorrection_csv(filepath, timeForLightsTurnOn, storesList) +``` + +**Input:** +- `filepath`: Path to session output folder +- `timeForLightsTurnOn`: Seconds to eliminate from start (default: 1) +- `storesList`: 2D array `[[storenames], [storesList]]` + +**Process:** +1. Filters storesList to control/signal channels only +2. Pairs control/signal channels, validates naming matches +3. Calls `check_cntrl_sig_length()` to determine which channel to use (shorter one) +4. For each control/signal pair: + - **Reads:** `timestamps` and `sampling_rate` from raw HDF5 + - **Computes:** `correctionIndex = np.where(timestamp >= timeForLightsTurnOn)` + - **Writes:** `timeCorrection_{region}.hdf5` with keys: + - `timestampNew`: Corrected timestamps + - `correctionIndex`: Indices to keep + - `sampling_rate`: Sampling rate + +**Output:** +- **Files Written:** `timeCorrection_{region}.hdf5` for each control/signal pair + +**I/O Summary:** +- **Reads:** `{storename}.hdf5` → `timestamps`, `sampling_rate` +- **Writes:** `timeCorrection_{region}.hdf5` → `timestampNew`, `correctionIndex`, `sampling_rate` + +--- + +### 3. 
timestampCorrection_tdt +**Location:** `timestamp_correction.py:115` +**Purpose:** Correct timestamps for TDT-format data (expands block timestamps) + +```python +def timestampCorrection_tdt(filepath, timeForLightsTurnOn, storesList) +``` + +**Input:** Same as `timestampCorrection_csv` + +**Process:** +1. Filters storesList to control/signal channels only +2. Pairs control/signal channels, validates naming matches +3. Calls `check_cntrl_sig_length()` to determine which channel to use +4. For each control/signal pair: + - **Reads:** `timestamps`, `npoints`, `sampling_rate` from raw HDF5 + - **TDT-specific expansion algorithm:** + ```python + timeRecStart = timestamp[0] + timestamps = np.subtract(timestamp, timeRecStart) # Zero-base + adder = np.arange(npoints) / sampling_rate # Within-block offsets + # Expand: for each block timestamp, add within-block offsets + timestampNew = np.zeros((len(timestamps), lengthAdder)) + for i in range(lengthAdder): + timestampNew[:, i] = np.add(timestamps, adder[i]) + timestampNew = (timestampNew.T).reshape(-1, order="F") # Flatten + correctionIndex = np.where(timestampNew >= timeForLightsTurnOn) + timestampNew = timestampNew[correctionIndex] + ``` + - **Writes:** `timeCorrection_{region}.hdf5` with keys: + - `timeRecStart`: Recording start time (TDT-specific) + - `timestampNew`: Expanded, corrected timestamps + - `correctionIndex`: Indices to keep + - `sampling_rate`: Sampling rate + +**Output:** +- **Files Written:** `timeCorrection_{region}.hdf5` with TDT-specific `timeRecStart` key + +**I/O Summary:** +- **Reads:** `{storename}.hdf5` → `timestamps`, `npoints`, `sampling_rate` +- **Writes:** `timeCorrection_{region}.hdf5` → `timeRecStart`, `timestampNew`, `correctionIndex`, `sampling_rate` + +--- + +### 4. check_cntrl_sig_length +**Location:** `timestamp_correction.py:273` +**Purpose:** Determine which channel (control or signal) to use as reference based on length + +```python +def check_cntrl_sig_length(filepath, channels_arr, storenames, storesList) -> indices +``` + +**Input:** +- `filepath`: Path to session output folder +- `channels_arr`: Paired control/signal array `[["control_A", "control_B"], ["signal_A", "signal_B"]]` +- `storenames`: Raw HDF5 filenames +- `storesList`: Semantic channel names + +**Process:** +1. For each control/signal pair: + - **Reads:** `data` from both control and signal HDF5 + - Compares lengths: `control.shape[0]` vs `signal.shape[0]` + - Returns the shorter one's storename (or signal if equal) + +**Output:** +- List of storenames to use for timestamp correction (one per pair) + +**I/O Summary:** +- **Reads:** `{control_storename}.hdf5` → `data`, `{signal_storename}.hdf5` → `data` + +**Note:** This is a pure analysis function but performs I/O to determine which data to use. + +--- + +### 5. decide_naming_convention_and_applyCorrection +**Location:** `timestamp_correction.py:178` +**Purpose:** Loop through all channels and apply timestamp corrections + +```python +def decide_naming_convention_and_applyCorrection(filepath, timeForLightsTurnOn, event, displayName, storesList) +``` + +**Input:** +- `filepath`: Path to session output folder +- `timeForLightsTurnOn`: Seconds eliminated from start +- `event`: Raw storename (e.g., "Dv1A") +- `displayName`: Semantic name (e.g., "control_DMS") +- `storesList`: Full storesList array + +**Process:** +1. Filters storesList to control/signal channels +2. Pairs channels and validates naming conventions +3. 
For each pair, calls `applyCorrection(filepath, timeForLightsTurnOn, event, displayName, region)` + +**Output:** +- Delegates to `applyCorrection()` (no direct I/O) + +--- + +### 6. applyCorrection +**Location:** `timestamp_correction.py:205` +**Purpose:** Apply timestamp corrections to data channels or event markers + +```python +def applyCorrection(filepath, timeForLightsTurnOn, event, displayName, naming) +``` + +**Input:** +- `filepath`: Path to session output folder +- `timeForLightsTurnOn`: Seconds eliminated from start +- `event`: Raw storename +- `displayName`: Semantic display name +- `naming`: Region identifier (e.g., "dms") + +**Process:** + +**For Control/Signal Channels:** +1. **Reads:** `timeCorrection_{naming}.hdf5` → `correctionIndex` +2. **Reads:** `{event}.hdf5` → `data` +3. **Applies:** `arr = arr[correctionIndex]` (crops data) +4. **Writes:** `{displayName}.hdf5` → `data` (overwrites with corrected data) + +**For Event Channels:** +1. Detects TDT format: `check_TDT(os.path.dirname(filepath))` +2. **Reads:** `timeCorrection_{naming}.hdf5` → `timeRecStart` (if TDT) +3. **Reads:** `{event}.hdf5` → `timestamps` +4. **Applies corrections:** + - If TDT and timestamps >= timeRecStart: subtract both `timeRecStart` and `timeForLightsTurnOn` + - Otherwise: subtract only `timeForLightsTurnOn` +5. **Writes:** `{event}_{naming}.hdf5` → `ts` (corrected event timestamps) + +**Output:** +- **Files Written:** + - `{displayName}.hdf5` → `data` (for control/signal) + - `{event}_{naming}.hdf5` → `ts` (for events) + +**I/O Summary:** +- **Reads:** `timeCorrection_{naming}.hdf5`, `{event}.hdf5` +- **Writes:** `{displayName}.hdf5` or `{event}_{naming}.hdf5` + +--- + +### 7. create_control_channel +**Location:** `timestamp_correction.py:247` +**Purpose:** Generate synthetic control channel using curve fitting (when no isosbestic control exists) + +```python +def create_control_channel(filepath, arr, window=5001) +``` + +**Input:** +- `filepath`: Path to session output folder +- `arr`: storesList array `[[storenames], [storesList]]` +- `window`: Savitzky-Golay filter window (default: 5001) + +**Process:** +1. Loops through storesList to find placeholder control channels (`cntrl` in storename) +2. 
For each placeholder: + - **Reads:** `signal_{region}.hdf5` → `data` (corrected signal) + - **Reads:** `timeCorrection_{region}.hdf5` → `timestampNew`, `sampling_rate` + - **Calls:** `helper_create_control_channel(signal, timestampNew, window)` from `control_channel.py` + - Applies Savitzky-Golay filter + - Fits to exponential: `f(x) = a + b * exp(-(1/c) * x)` + - **Writes:** `{control_name}.hdf5` → `data` (synthetic control) + - **Writes:** `{event_name}.csv` with columns: `timestamps`, `data`, `sampling_rate` + +**Output:** +- **Files Written:** + - `control_{region}.hdf5` → `data` (replaces placeholder) + - `{raw_name}.csv` (legacy format export) + +**I/O Summary:** +- **Reads:** `signal_{region}.hdf5` → `data`, `timeCorrection_{region}.hdf5` → `timestampNew`, `sampling_rate` +- **Writes:** `control_{region}.hdf5` → `data`, `{raw_name}.csv` + +--- + +## Data Flow Diagram + +### High-Level Flow (called from execute_timestamp_correction) + +```mermaid +flowchart TD + A[execute_timestamp_correction] --> B[Read storesList.csv] + B --> C{isosbestic_control?} + + C -->|False| D[add_control_channel] + C -->|True| E{Check format} + D --> E + + E -->|TDT| F[timestampCorrection_tdt] + E -->|CSV/Doric/NPM| G[timestampCorrection_csv] + + F --> H[Loop: decide_naming_convention_and_applyCorrection] + G --> H + + H --> I[For each store: applyCorrection] + + I --> J{isosbestic_control?} + J -->|False| K[create_control_channel] + J -->|True| L[Done] + K --> L + + style A fill:#e1f5ff + style L fill:#d4edda +``` + +### Detailed Flow: timestampCorrection Functions + +```mermaid +flowchart LR + A[Raw HDF5 files] --> B[check_cntrl_sig_length] + B --> C[Read control & signal data] + C --> D[Return shorter channel name] + + D --> E{Format?} + E -->|CSV| F[timestampCorrection_csv] + E -->|TDT| G[timestampCorrection_tdt] + + F --> H[Read timestamps from selected channel] + G --> I[Read timestamps, npoints, sampling_rate] + + H --> J[correctionIndex = where >= timeForLightsTurnOn] + I --> K[Expand block timestamps] + K --> J + + J --> L[Write timeCorrection_{region}.hdf5] + + style A fill:#e1f5ff + style L fill:#d4edda +``` + +### Detailed Flow: applyCorrection + +```mermaid +flowchart TD + A[applyCorrection called] --> B{Channel type?} + + B -->|control/signal| C[Read correctionIndex] + B -->|event| D[Read event timestamps] + + C --> E[Read raw data] + E --> F[data = data correctionIndex] + F --> G[Write displayName.hdf5] + + D --> H{TDT format?} + H -->|Yes| I[Read timeRecStart] + H -->|No| J[ts -= timeForLightsTurnOn] + + I --> K[ts -= timeRecStart] + K --> J + J --> L[Write event_region.hdf5] + + style A fill:#e1f5ff + style G fill:#d4edda + style L fill:#d4edda +``` + +### Detailed Flow: Control Channel Creation + +```mermaid +flowchart LR + A[add_control_channel] --> B[For each signal without control] + B --> C[Copy signal.hdf5 to cntrl_i.hdf5] + C --> D[Update storesList.csv] + + D --> E[... timestamp correction ...] 
+
+    E --> F[create_control_channel]
+    F --> G[For each cntrl_i placeholder]
+    G --> H[Read signal_{region}.hdf5]
+    H --> I[helper_create_control_channel]
+    I --> J[Savitzky-Golay filter]
+    J --> K[Curve fit to exponential]
+    K --> L[Write control_{region}.hdf5]
+    L --> M[Export to CSV]
+
+    style A fill:#fff3cd
+    style M fill:#d4edda
+```
+
+## Execution Order in execute_timestamp_correction
+
+```python
+# preprocess.py:212-247
+for each session in folderNames:
+    for each output_folder in session:
+        # Step 1: Read metadata
+        storesList = np.genfromtxt("storesList.csv")
+
+        # Step 2: Add placeholder controls if needed
+        if isosbestic_control == False:
+            storesList = add_control_channel(filepath, storesList)
+
+        # Step 3: Compute correctionIndex and timestampNew
+        if check_TDT(folderName):
+            timestampCorrection_tdt(filepath, timeForLightsTurnOn, storesList)
+        else:
+            timestampCorrection_csv(filepath, timeForLightsTurnOn, storesList)
+
+        # Step 4: Apply corrections to all channels/events
+        for each store in storesList:
+            decide_naming_convention_and_applyCorrection(
+                filepath, timeForLightsTurnOn, storename, displayName, storesList
+            )
+            # ^ This calls applyCorrection for each channel
+
+        # Step 5: Generate synthetic controls via curve fitting
+        if isosbestic_control == False:
+            create_control_channel(filepath, storesList, window=101)
+```
+
+## File I/O Summary
+
+### Files Read
+
+| Function | Files Read | Keys |
+|----------|-----------|------|
+| `add_control_channel` | `signal_*.hdf5` (for copying) | - |
+| `timestampCorrection_csv` | `{storename}.hdf5` | `timestamps`, `sampling_rate` |
+| `timestampCorrection_tdt` | `{storename}.hdf5` | `timestamps`, `npoints`, `sampling_rate` |
+| `check_cntrl_sig_length` | `control_*.hdf5`, `signal_*.hdf5` | `data` |
+| `applyCorrection` | `timeCorrection_{region}.hdf5`<br>`{event}.hdf5` | `correctionIndex`, `timeRecStart` (TDT)<br>`data` or `timestamps` |
+| `create_control_channel` | `signal_{region}.hdf5`<br>`timeCorrection_{region}.hdf5` | `data`<br>`timestampNew`, `sampling_rate` |
+
+### Files Written
+
+| Function | Files Written | Keys | Notes |
+|----------|--------------|------|-------|
+| `add_control_channel` | `storesList.csv`<br>`cntrl{i}.hdf5` | -<br>(copy of signal) | Placeholder files |
+| `timestampCorrection_csv` | `timeCorrection_{region}.hdf5` | `timestampNew`, `correctionIndex`, `sampling_rate` | One per region |
+| `timestampCorrection_tdt` | `timeCorrection_{region}.hdf5` | `timeRecStart`, `timestampNew`, `correctionIndex`, `sampling_rate` | TDT-specific |
+| `applyCorrection` | `{displayName}.hdf5`<br>`{event}_{region}.hdf5` | `data`<br>`ts` | Overwrites with corrected data |
+| `create_control_channel` | `control_{region}.hdf5`<br>`{raw_name}.csv` | `data`<br>`timestamps`, `data`, `sampling_rate` | Replaces placeholder |
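+
+To make these layouts concrete, here is a minimal read-back sketch using `h5py` directly. GuPPy has its own read/write helpers, so treat this only as an illustration of the file and key naming; the folder path and the `dms` region suffix are placeholders.
+
+```python
+import os
+import h5py
+
+filepath = "/path/to/session_output"  # hypothetical output folder
+
+# Per-region correction file written by timestampCorrection_csv/_tdt
+with h5py.File(os.path.join(filepath, "timeCorrection_dms.hdf5"), "r") as f:
+    timestampNew = f["timestampNew"][()]        # corrected timestamps
+    correctionIndex = f["correctionIndex"][()]  # indices retained after cropping
+    sampling_rate = f["sampling_rate"][()]
+
+# Corrected signal channel written by applyCorrection
+with h5py.File(os.path.join(filepath, "signal_dms.hdf5"), "r") as f:
+    signal = f["data"][()]
+```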
+
+## Key Transformations
+
+### 1. Timestamp Expansion (TDT only)
+
+**Input:** Block timestamps (one per acquisition block)
+**Algorithm:**
+```python
+timeRecStart = timestamp[0]
+timestamps = timestamp - timeRecStart  # Zero-base
+adder = np.arange(npoints) / sampling_rate  # Within-block offsets [0, 1/fs, 2/fs, ...]
+# Expand: one row per block, one column per within-block sample
+n_blocks = timestamps.size
+timestampNew = np.zeros((n_blocks, npoints))
+for i in range(npoints):
+    timestampNew[:, i] = timestamps + adder[i]
+timestampNew = timestampNew.T.reshape(-1, order='F')  # Column-major flatten
+```
+**Output:** Continuous timestamps at full sampling rate
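+
+The same expansion can be written as a single NumPy broadcast. This is an equivalent sketch (same output ordering: all samples of block 0, then block 1, and so on), not the code currently in `timestamp_correction.py`; the function name is illustrative.
+
+```python
+import numpy as np
+
+def expand_block_timestamps(timestamp, npoints, sampling_rate):
+    """Broadcasting form of the TDT timestamp expansion above."""
+    timeRecStart = timestamp[0]
+    timestamps = timestamp - timeRecStart        # zero-base block start times
+    adder = np.arange(npoints) / sampling_rate   # within-block sample offsets
+    # (n_blocks, 1) + (npoints,) broadcasts to (n_blocks, npoints);
+    # ravel() flattens block by block, matching the loop-and-transpose above.
+    return (timestamps[:, None] + adder).ravel(), timeRecStart
+```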
+
+### 2. Correction Index Computation
+
+**Input:** Timestamps array, `timeForLightsTurnOn`
+**Algorithm:**
+```python
+correctionIndex = np.where(timestamp >= timeForLightsTurnOn)[0]
+```
+**Output:** Indices of timestamps to keep (after eliminating first N seconds)
+
+### 3. Data Cropping
+
+**Applied to:** Control/signal data channels
+**Algorithm:**
+```python
+data_corrected = data[correctionIndex]
+```
+
+### 4. Event Timestamp Adjustment
+
+**Applied to:** Event markers (TTL pulses)
+**Algorithm:**
+```python
+# CSV format:
+ts_corrected = ts - timeForLightsTurnOn
+
+# TDT format (if ts >= timeRecStart):
+ts_corrected = ts - timeRecStart - timeForLightsTurnOn
+```
+
+### 5. Synthetic Control Generation
+
+**Input:** Signal channel (already corrected)
+**Algorithm:**
+1. Apply Savitzky-Golay filter: `filtered_signal = savgol_filter(signal, window, polyorder=3)`
+2. Curve fit to exponential: `control = a + b * exp(-(1/c) * t)`
+3. Return fitted curve as synthetic control
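+
+A minimal sketch of steps 1-3 using SciPy is shown below. It assumes the exponential model and `polyorder=3` described above; the real implementation is `helper_create_control_channel` in `control_channel.py` and may differ in details such as initial guesses, bounds, and window handling.
+
+```python
+import numpy as np
+from scipy.optimize import curve_fit
+from scipy.signal import savgol_filter
+
+def fit_synthetic_control(signal, timestamps, window=5001):
+    """Illustrative synthetic-control fit (not GuPPy's helper)."""
+    # 1. Smooth the signal (window must be odd and no longer than the signal).
+    filtered = savgol_filter(signal, window_length=window, polyorder=3)
+
+    # 2. Fit the smoothed signal to f(x) = a + b * exp(-(1/c) * x).
+    def exponential(x, a, b, c):
+        return a + b * np.exp(-(1.0 / c) * x)
+
+    p0 = [filtered[-1], filtered[0] - filtered[-1], max(timestamps[-1], 1.0)]  # rough guess
+    popt, _ = curve_fit(exponential, timestamps, filtered, p0=p0, maxfev=10000)
+
+    # 3. The fitted curve becomes the synthetic control channel.
+    return exponential(timestamps, *popt)
+```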
+
+## Analysis for I/O Separation
+
+### Pure Analysis Functions (Minimal I/O)
+These could be extracted with I/O injected:
+- ❌ None - all functions perform substantial I/O
+
+### Orchestration Functions (Heavy I/O, Light Analysis)
+These coordinate reading/writing and delegate computation:
+- `add_control_channel` - File copying and CSV writing
+- `decide_naming_convention_and_applyCorrection` - Loops and delegates
+- `create_control_channel` - Orchestrates read → process → write
+
+### Mixed Functions (I/O + Analysis)
+These perform both I/O and computation inline:
+- `timestampCorrection_csv` - Reads data, computes correctionIndex, writes results
+- `timestampCorrection_tdt` - Reads data, expands timestamps, computes correctionIndex, writes
+- `applyCorrection` - Reads multiple files, applies transformations, writes
+- `check_cntrl_sig_length` - Reads data just to compare lengths
+
+## Refactoring Recommendations for I/O Separation
+
+### Option 1: Extract Pure Computation Functions
+
+Create new pure functions:
+```python
+# Pure analysis (no I/O)
+def compute_correction_index(timestamps, timeForLightsTurnOn):
+    return np.where(timestamps >= timeForLightsTurnOn)[0]
+
+def expand_tdt_timestamps(block_timestamps, npoints, sampling_rate):
+    # TDT expansion algorithm
+    ...
+    return expanded_timestamps
+
+def crop_data_by_index(data, correctionIndex):
+    return data[correctionIndex]
+
+def adjust_event_timestamps(ts, timeRecStart, timeForLightsTurnOn, is_tdt):
+    # Event adjustment logic
+    ...
+    return adjusted_ts
+```
+
+Then modify existing functions to use these pure functions, keeping I/O separate.
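+
+As a quick illustration of why this helps, the pure helpers above can be exercised (and unit-tested) on synthetic arrays with no files involved; the toy data here is purely for demonstration:
+
+```python
+import numpy as np
+
+fs = 100.0
+timestamps = np.arange(0, 10, 1 / fs)                       # 10 s at 100 Hz
+data = np.random.default_rng(0).normal(size=timestamps.size)
+
+idx = compute_correction_index(timestamps, timeForLightsTurnOn=1.0)
+cropped = crop_data_by_index(data, idx)                     # drops the first second
+assert crop_data_by_index(timestamps, idx)[0] >= 1.0
+```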
+
+### Option 2: Reader/Writer Pattern
+
+Create dedicated I/O classes:
+```python
+class TimestampCorrectionReader:
+    def read_raw_timestamps(self, filepath, storename):
+        ...
+
+    def read_correction_data(self, filepath, region):
+        ...
+ +class TimestampCorrectionWriter: + def write_correction_file(self, filepath, region, data): + ... + + def write_corrected_data(self, filepath, displayName, data): + ... +``` + +### Option 3: Data Class Pattern + +Return data objects instead of writing directly: +```python +@dataclass +class TimestampCorrection: + timestampNew: np.ndarray + correctionIndex: np.ndarray + sampling_rate: float + timeRecStart: Optional[float] = None # TDT only + +def timestampCorrection_tdt(...) -> TimestampCorrection: + # Compute all values + return TimestampCorrection( + timestampNew=..., + correctionIndex=..., + sampling_rate=..., + timeRecStart=... + ) + +# Separate writer function +def write_timestamp_correction(filepath, region, correction: TimestampCorrection): + write_hdf5(correction.timestampNew, f"timeCorrection_{region}", filepath, "timestampNew") + # ... etc +``` + +## Current I/O Patterns to Refactor + +1. **Inline writes in computation functions:** + - `timestampCorrection_csv` and `timestampCorrection_tdt` compute AND write + - Should separate: compute → return data → write in caller + +2. **Reading for validation only:** + - `check_cntrl_sig_length` reads full data arrays just to compare shapes + - Could be optimized to read only array metadata/shapes + +3. **Side-effect file creation:** + - `add_control_channel` creates files as side effect + - `create_control_channel` both generates data AND writes multiple formats (HDF5 + CSV) + +4. **Mixed responsibilities in applyCorrection:** + - Handles both control/signal cropping AND event timestamp adjustment + - Could be split into two separate functions
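+
+For item 4, one possible split is sketched below. The names and signatures are illustrative only (they are not existing GuPPy functions), and the TDT branch follows the condition described for `applyCorrection` above, applied per timestamp here:
+
+```python
+import numpy as np
+
+def apply_channel_correction(data, correctionIndex):
+    """Pure: crop a control/signal channel to the retained indices."""
+    return data[correctionIndex]
+
+def apply_event_correction(ts, timeForLightsTurnOn, timeRecStart=None):
+    """Pure: shift event timestamps; timeRecStart applies only to TDT sessions."""
+    ts = np.asarray(ts, dtype=float)
+    if timeRecStart is not None:
+        # Subtract the recording start only for timestamps at or after it.
+        ts = np.where(ts >= timeRecStart, ts - timeRecStart, ts)
+    return ts - timeForLightsTurnOn
+```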