diff --git a/.gitignore b/.gitignore
index 0628429..f684eec 100755
--- a/.gitignore
+++ b/.gitignore
@@ -9,3 +9,5 @@ GuPPy/runFiberPhotometryAnalysis.ipynb
.clinerules/
testing_data/
+
+CLAUDE.md
diff --git a/src/guppy/analysis/__init__.py b/src/guppy/analysis/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/guppy/analysis/artifact_removal.py b/src/guppy/analysis/artifact_removal.py
new file mode 100644
index 0000000..d3da042
--- /dev/null
+++ b/src/guppy/analysis/artifact_removal.py
@@ -0,0 +1,222 @@
+import logging
+
+import numpy as np
+
+logger = logging.getLogger(__name__)
+
+
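+# Dispatch between the two artifact-removal strategies:
+#   "concatenate"      - drop the artifact windows and stitch the remaining good chunks
+#                        together, re-aligning data, timestamps and TTL event times
+#   "replace with NaN" - keep the original timeline, blank the artifact windows with NaN
+#                        and drop TTL events that fall inside them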
+def remove_artifacts(
+ timeForLightsTurnOn,
+ storesList,
+ pair_name_to_tsNew,
+ pair_name_to_sampling_rate,
+ pair_name_to_coords,
+ name_to_data,
+ compound_name_to_ttl_timestamps,
+ method,
+):
+ if method == "concatenate":
+ name_to_corrected_data, pair_name_to_corrected_timestamps, compound_name_to_corrected_ttl_timestamps = (
+ processTimestampsForArtifacts(
+ timeForLightsTurnOn,
+ storesList,
+ pair_name_to_tsNew,
+ pair_name_to_sampling_rate,
+ pair_name_to_coords,
+ name_to_data,
+ compound_name_to_ttl_timestamps,
+ )
+ )
+ logger.info("Artifacts removed using concatenate method.")
+ elif method == "replace with NaN":
+ name_to_corrected_data, compound_name_to_corrected_ttl_timestamps = addingNaNtoChunksWithArtifacts(
+ storesList,
+ pair_name_to_tsNew,
+ pair_name_to_coords,
+ name_to_data,
+ compound_name_to_ttl_timestamps,
+ )
+ pair_name_to_corrected_timestamps = None
+ logger.info("Artifacts removed using NaN replacement method.")
+ else:
+ logger.error("Invalid artifact removal method specified.")
+ raise ValueError("Invalid artifact removal method specified.")
+
+ return name_to_corrected_data, pair_name_to_corrected_timestamps, compound_name_to_corrected_ttl_timestamps
+
+
+def addingNaNtoChunksWithArtifacts(
+ storesList, pair_name_to_tsNew, pair_name_to_coords, name_to_data, compound_name_to_ttl_timestamps
+):
+ logger.debug("Replacing chunks with artifacts by NaN values.")
+ names_for_storenames = storesList[1, :]
+ pair_names = pair_name_to_tsNew.keys()
+
+ name_to_corrected_data = {}
+ compound_name_to_corrected_ttl_timestamps = {}
+ for pair_name in pair_names:
+ tsNew = pair_name_to_tsNew[pair_name]
+ coords = pair_name_to_coords[pair_name]
+ for i in range(len(names_for_storenames)):
+ if (
+ "control_" + pair_name.lower() in names_for_storenames[i].lower()
+ or "signal_" + pair_name.lower() in names_for_storenames[i].lower()
+ ): # changes done
+ data = name_to_data[names_for_storenames[i]].reshape(-1)
+ data = addingNaNValues(data=data, ts=tsNew, coords=coords)
+ name_to_corrected_data[names_for_storenames[i]] = data
+ else:
+ if "control" in names_for_storenames[i].lower() or "signal" in names_for_storenames[i].lower():
+ continue
+ ttl_name = names_for_storenames[i]
+ compound_name = ttl_name + "_" + pair_name
+ ts = compound_name_to_ttl_timestamps[compound_name].reshape(-1)
+ ts = removeTTLs(ts=ts, coords=coords)
+ compound_name_to_corrected_ttl_timestamps[compound_name] = ts
+ logger.info("Chunks with artifacts are replaced by NaN values.")
+
+ return name_to_corrected_data, compound_name_to_corrected_ttl_timestamps
+
+
+# main function to align control, signal and event timestamps when removing artifacts with the concatenate method
+def processTimestampsForArtifacts(
+ timeForLightsTurnOn,
+ storesList,
+ pair_name_to_tsNew,
+ pair_name_to_sampling_rate,
+ pair_name_to_coords,
+ name_to_data,
+ compound_name_to_ttl_timestamps,
+):
+ logger.debug("Processing timestamps to get rid of artifacts using concatenate method...")
+ names_for_storenames = storesList[1, :]
+ pair_names = pair_name_to_tsNew.keys()
+
+ name_to_corrected_data = {}
+ pair_name_to_corrected_timestamps = {}
+ compound_name_to_corrected_ttl_timestamps = {}
+ for pair_name in pair_names:
+ sampling_rate = pair_name_to_sampling_rate[pair_name]
+ tsNew = pair_name_to_tsNew[pair_name]
+ coords = pair_name_to_coords[pair_name]
+
+ for i in range(len(names_for_storenames)):
+ if (
+ "control_" + pair_name.lower() in names_for_storenames[i].lower()
+ or "signal_" + pair_name.lower() in names_for_storenames[i].lower()
+ ): # changes done
+ data = name_to_data[names_for_storenames[i]]
+ data, timestampNew = eliminateData(
+ data=data,
+ ts=tsNew,
+ coords=coords,
+ timeForLightsTurnOn=timeForLightsTurnOn,
+ sampling_rate=sampling_rate,
+ )
+ name_to_corrected_data[names_for_storenames[i]] = data
+ pair_name_to_corrected_timestamps[pair_name] = timestampNew
+ else:
+ if "control" in names_for_storenames[i].lower() or "signal" in names_for_storenames[i].lower():
+ continue
+ compound_name = names_for_storenames[i] + "_" + pair_name
+ ts = compound_name_to_ttl_timestamps[compound_name]
+ ts = eliminateTs(
+ ts=ts,
+ tsNew=tsNew,
+ coords=coords,
+ timeForLightsTurnOn=timeForLightsTurnOn,
+ sampling_rate=sampling_rate,
+ )
+ compound_name_to_corrected_ttl_timestamps[compound_name] = ts
+
+ logger.info("Timestamps processed, artifacts are removed and good chunks are concatenated.")
+
+ return (
+ name_to_corrected_data,
+ pair_name_to_corrected_timestamps,
+ compound_name_to_corrected_ttl_timestamps,
+ )
+
+
+# helper function to process control and signal timestamps
+def eliminateData(*, data, ts, coords, timeForLightsTurnOn, sampling_rate):
+
+ if (data == 0).all() == True:
+ data = np.zeros(ts.shape[0])
+
+ arr = np.array([])
+ ts_arr = np.array([])
+ for i in range(coords.shape[0]):
+
+ index = np.where((ts > coords[i, 0]) & (ts < coords[i, 1]))[0]
+
+ if len(arr) == 0:
+ arr = np.concatenate((arr, data[index]))
+ sub = ts[index][0] - timeForLightsTurnOn
+ new_ts = ts[index] - sub
+ ts_arr = np.concatenate((ts_arr, new_ts))
+ else:
+ temp = data[index]
+ # new = temp + (arr[-1]-temp[0])
+ temp_ts = ts[index]
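+ # shift this chunk so its first sample lands one sample period after the end of the previous chunk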
+ new_ts = temp_ts - (temp_ts[0] - ts_arr[-1])
+ arr = np.concatenate((arr, temp))
+ ts_arr = np.concatenate((ts_arr, new_ts + (1 / sampling_rate)))
+
+ # logger.info(arr.shape, ts_arr.shape)
+ return arr, ts_arr
+
+
+# helper function to align event timestamps with the control and signal timestamps
+def eliminateTs(*, ts, tsNew, coords, timeForLightsTurnOn, sampling_rate):
+
+ ts_arr = np.array([])
+ tsNew_arr = np.array([])
+ for i in range(coords.shape[0]):
+ tsNew_index = np.where((tsNew > coords[i, 0]) & (tsNew < coords[i, 1]))[0]
+ ts_index = np.where((ts > coords[i, 0]) & (ts < coords[i, 1]))[0]
+
+ if len(tsNew_arr) == 0:
+ sub = tsNew[tsNew_index][0] - timeForLightsTurnOn
+ tsNew_arr = np.concatenate((tsNew_arr, tsNew[tsNew_index] - sub))
+ ts_arr = np.concatenate((ts_arr, ts[ts_index] - sub))
+ else:
+ temp_tsNew = tsNew[tsNew_index]
+ temp_ts = ts[ts_index]
+ new_ts = temp_ts - (temp_tsNew[0] - tsNew_arr[-1])
+ new_tsNew = temp_tsNew - (temp_tsNew[0] - tsNew_arr[-1])
+ tsNew_arr = np.concatenate((tsNew_arr, new_tsNew + (1 / sampling_rate)))
+ ts_arr = np.concatenate((ts_arr, new_ts + (1 / sampling_rate)))
+
+ return ts_arr
+
+
+# add NaN values to the removed chunks
+# when using the "replace with NaN" artifact-removal method
+def addingNaNValues(*, data, ts, coords):
+
+ if (data == 0).all() == True:
+ data = np.zeros(ts.shape[0])
+
+ arr = np.array([])
+ ts_index = np.arange(ts.shape[0])
+ for i in range(coords.shape[0]):
+
+ index = np.where((ts > coords[i, 0]) & (ts < coords[i, 1]))[0]
+ arr = np.concatenate((arr, index))
+
+ nan_indices = list(set(ts_index).symmetric_difference(arr))
+ data[nan_indices] = np.nan
+
+ return data
+
+
+# remove event TTLs that fall within the removed chunks
+# when using the "replace with NaN" artifact-removal method
+def removeTTLs(*, ts, coords):
+ ts_arr = np.array([])
+ for i in range(coords.shape[0]):
+ ts_index = np.where((ts > coords[i, 0]) & (ts < coords[i, 1]))[0]
+ ts_arr = np.concatenate((ts_arr, ts[ts_index]))
+
+ return ts_arr
diff --git a/src/guppy/analysis/combine_data.py b/src/guppy/analysis/combine_data.py
new file mode 100644
index 0000000..6ccddc0
--- /dev/null
+++ b/src/guppy/analysis/combine_data.py
@@ -0,0 +1,128 @@
+import logging
+import os
+
+import numpy as np
+
+from .io_utils import (
+ decide_naming_convention,
+ read_hdf5,
+ write_hdf5,
+)
+
+logger = logging.getLogger(__name__)
+
+
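+# stitch control/signal data from multiple output folders end to end: the first folder's
+# timestamps are shifted to start at timeForLightsTurnOn, and each later folder is appended
+# one sample period after the end of the previous one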
+def eliminateData(filepath_to_timestamps, filepath_to_data, timeForLightsTurnOn, sampling_rate):
+
+ arr = np.array([])
+ ts_arr = np.array([])
+ filepaths = list(filepath_to_timestamps.keys())
+ for filepath in filepaths:
+ ts = filepath_to_timestamps[filepath]
+ data = filepath_to_data[filepath]
+
+ if len(arr) == 0:
+ arr = np.concatenate((arr, data))
+ sub = ts[0] - timeForLightsTurnOn
+ new_ts = ts - sub
+ ts_arr = np.concatenate((ts_arr, new_ts))
+ else:
+ temp = data
+ temp_ts = ts
+ new_ts = temp_ts - (temp_ts[0] - ts_arr[-1])
+ arr = np.concatenate((arr, temp))
+ ts_arr = np.concatenate((ts_arr, new_ts + (1 / sampling_rate)))
+
+ return arr, ts_arr
+
+
+def eliminateTs(filepath_to_timestamps, filepath_to_ttl_timestamps, timeForLightsTurnOn, sampling_rate):
+
+ ts_arr = np.array([])
+ tsNew_arr = np.array([])
+ filepaths = list(filepath_to_timestamps.keys())
+ for filepath in filepaths:
+ ts = filepath_to_timestamps[filepath]
+ tsNew = filepath_to_ttl_timestamps[filepath]
+ if len(tsNew_arr) == 0:
+ sub = tsNew[0] - timeForLightsTurnOn
+ tsNew_arr = np.concatenate((tsNew_arr, tsNew - sub))
+ ts_arr = np.concatenate((ts_arr, ts - sub))
+ else:
+ temp_tsNew = tsNew
+ temp_ts = ts
+ new_ts = temp_ts - (temp_tsNew[0] - tsNew_arr[-1])
+ new_tsNew = temp_tsNew - (temp_tsNew[0] - tsNew_arr[-1])
+ tsNew_arr = np.concatenate((tsNew_arr, new_tsNew + (1 / sampling_rate)))
+ ts_arr = np.concatenate((ts_arr, new_ts + (1 / sampling_rate)))
+
+ # logger.info(event)
+ # logger.info(ts_arr)
+ return ts_arr
+
+
+def combine_data(filepath: list[list[str]], timeForLightsTurnOn, names_for_storenames, sampling_rate):
+ # filepath = [[folder1_output_0, folder2_output_0], [folder1_output_1, folder2_output_1], ...]
+
+ logger.debug("Processing timestamps for combining data...")
+
+ names_for_storenames = names_for_storenames[1, :]
+
+ for single_output_filepaths in filepath:
+ # single_output_filepaths = [folder1_output_i, folder2_output_i, ...]
+
+ path = decide_naming_convention(single_output_filepaths[0])
+
+ pair_name_to_tsNew = {}
+ for j in range(path.shape[1]):
+ name_1 = ((os.path.basename(path[0, j])).split(".")[0]).split("_")[-1]
+ name_2 = ((os.path.basename(path[1, j])).split(".")[0]).split("_")[-1]
+ if name_1 != name_2:
+ logger.error("Error in naming convention of files or Error in storesList file")
+ raise Exception("Error in naming convention of files or Error in storesList file")
+ pair_name = name_1
+
+ for i in range(len(names_for_storenames)):
+ if (
+ "control_" + pair_name.lower() in names_for_storenames[i].lower()
+ or "signal_" + pair_name.lower() in names_for_storenames[i].lower()
+ ):
+ filepath_to_timestamps = {}
+ filepath_to_data = {}
+ for filepath in single_output_filepaths:
+ ts = read_hdf5("timeCorrection_" + pair_name, filepath, "timestampNew")
+ data = read_hdf5(names_for_storenames[i], filepath, "data").reshape(-1)
+ filepath_to_timestamps[filepath] = ts
+ filepath_to_data[filepath] = data
+
+ data, timestampNew = eliminateData(
+ filepath_to_timestamps,
+ filepath_to_data,
+ timeForLightsTurnOn,
+ sampling_rate,
+ )
+ write_hdf5(data, names_for_storenames[i], single_output_filepaths[0], "data")
+ pair_name_to_tsNew[pair_name] = timestampNew
+ else:
+ if "control" in names_for_storenames[i].lower() or "signal" in names_for_storenames[i].lower():
+ continue
+ filepath_to_timestamps = {}
+ filepath_to_ttl_timestamps = {}
+ for filepath in single_output_filepaths:
+ tsNew = read_hdf5("timeCorrection_" + pair_name, filepath, "timestampNew")
+ if os.path.exists(os.path.join(filepath, names_for_storenames[i] + "_" + pair_name + ".hdf5")):
+ ts = read_hdf5(names_for_storenames[i] + "_" + pair_name, filepath, "ts").reshape(-1)
+ else:
+ ts = np.array([])
+ filepath_to_timestamps[filepath] = tsNew
+ filepath_to_ttl_timestamps[filepath] = ts
+
+ ts = eliminateTs(
+ filepath_to_timestamps,
+ filepath_to_ttl_timestamps,
+ timeForLightsTurnOn,
+ sampling_rate,
+ )
+ write_hdf5(ts, names_for_storenames[i] + "_" + pair_name, single_output_filepaths[0], "ts")
+ for pair_name, tsNew in pair_name_to_tsNew.items():
+ write_hdf5(tsNew, "timeCorrection_" + pair_name, single_output_filepaths[0], "timestampNew")
diff --git a/src/guppy/analysis/control_channel.py b/src/guppy/analysis/control_channel.py
new file mode 100644
index 0000000..605bd17
--- /dev/null
+++ b/src/guppy/analysis/control_channel.py
@@ -0,0 +1,122 @@
+import logging
+import os
+import shutil
+
+import numpy as np
+import pandas as pd
+from scipy import signal as ss
+from scipy.optimize import curve_fit
+
+from .io_utils import (
+ read_hdf5,
+ write_hdf5,
+)
+
+logger = logging.getLogger(__name__)
+
+
+# This function only creates placeholder control HDF5 files that are immediately overwritten later in the pipeline.
+# TODO: Refactor this function to avoid unnecessary file creation.
+# function to add control channel when there is no
+# isosbestic control channel and update the storeslist file
+def add_control_channel(filepath, arr):
+
+ storenames = arr[0, :]
+ storesList = np.char.lower(arr[1, :])
+
+ keep_control = np.array([])
+ # check a case if there is isosbestic control channel present
+ for i in range(storesList.shape[0]):
+ if "control" in storesList[i].lower():
+ name = storesList[i].split("_")[-1]
+ new_str = "signal_" + str(name).lower()
+ find_signal = [True for i in storesList if i == new_str]
+ if len(find_signal) > 1:
+ logger.error("Error in naming convention of files or Error in storesList file")
+ raise Exception("Error in naming convention of files or Error in storesList file")
+ if len(find_signal) == 0:
+ logger.error(
+ "Isosbestic control channel parameter is set to False, but the "
+ "storesList file shows a control channel is present"
+ )
+ raise Exception(
+ "Isosbestic control channel parameter is set to False, but the "
+ "storesList file shows a control channel is present"
+ )
+ else:
+ continue
+
+ for i in range(storesList.shape[0]):
+ if "signal" in storesList[i].lower():
+ name = storesList[i].split("_")[-1]
+ new_str = "control_" + str(name).lower()
+ find_signal = [True for i in storesList if i == new_str]
+ if len(find_signal) == 0:
+ src, dst = os.path.join(filepath, arr[0, i] + ".hdf5"), os.path.join(
+ filepath, "cntrl" + str(i) + ".hdf5"
+ )
+ shutil.copyfile(src, dst)
+ arr = np.concatenate((arr, [["cntrl" + str(i)], ["control_" + str(arr[1, i].split("_")[-1])]]), axis=1)
+
+ np.savetxt(os.path.join(filepath, "storesList.csv"), arr, delimiter=",", fmt="%s")
+
+ return arr
+
+
+# main function to create control channel using
+# signal channel and save it to a file
+def create_control_channel(filepath, arr, window=5001):
+
+ storenames = arr[0, :]
+ storesList = arr[1, :]
+
+ for i in range(storesList.shape[0]):
+ event_name, event = storesList[i], storenames[i]
+ if "control" in event_name.lower() and "cntrl" in event.lower():
+ logger.debug("Creating control channel from signal channel using curve-fitting")
+ name = event_name.split("_")[-1]
+ signal = read_hdf5("signal_" + name, filepath, "data")
+ timestampNew = read_hdf5("timeCorrection_" + name, filepath, "timestampNew")
+ sampling_rate = np.full(timestampNew.shape, np.nan)
+ sampling_rate[0] = read_hdf5("timeCorrection_" + name, filepath, "sampling_rate")[0]
+
+ control = helper_create_control_channel(signal, timestampNew, window)
+
+ write_hdf5(control, event_name, filepath, "data")
+ d = {"timestamps": timestampNew, "data": control, "sampling_rate": sampling_rate}
+ df = pd.DataFrame(d)
+ df.to_csv(os.path.join(os.path.dirname(filepath), event.lower() + ".csv"), index=False)
+ logger.info("Control channel from signal channel created using curve-fitting")
+
+
+# TODO: figure out why a control channel is created for both timestamp correction and z-score steps.
+# helper function to create control channel using signal channel
+# by curve fitting signal channel to exponential function
+# when no isosbestic control channel is present
+def helper_create_control_channel(signal, timestamps, window):
+ # savgol_filter needs an odd integer window no longer than the signal,
+ # so shrink the requested window if the signal is too short
+ if window > signal.shape[0]:
+ window = ((signal.shape[0] + 1) // 2) + 1
+ if window % 2 == 0:
+ window = window + 1
+
+ filtered_signal = ss.savgol_filter(signal, window_length=window, polyorder=3)
+
+ p0 = [5, 50, 60]
+
+ try:
+ popt, pcov = curve_fit(curveFitFn, timestamps, filtered_signal, p0)
+ except Exception as e:
+ logger.error(str(e))
+ raise
+
+ # logger.info('Curve Fit Parameters : ', popt)
+ control = curveFitFn(timestamps, *popt)
+
+ return control
+
+
+# curve fit exponential function
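+# model: f(x) = a + b * exp(-x / c), a decaying exponential with offset a, amplitude b and time constant c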
+def curveFitFn(x, a, b, c):
+ return a + (b * np.exp(-(1 / c) * x))
diff --git a/src/guppy/analysis/io_utils.py b/src/guppy/analysis/io_utils.py
new file mode 100644
index 0000000..b467c37
--- /dev/null
+++ b/src/guppy/analysis/io_utils.py
@@ -0,0 +1,196 @@
+import fnmatch
+import glob
+import logging
+import os
+import re
+
+import h5py
+import numpy as np
+
+logger = logging.getLogger(__name__)
+
+
+def takeOnlyDirs(paths):
+ removePaths = []
+ for p in paths:
+ if os.path.isfile(p):
+ removePaths.append(p)
+ return list(set(paths) - set(removePaths))
+
+
+# find files matching a glob pattern, optionally ignoring case
+def find_files(path, glob_path, ignore_case=False):
+ rule = (
+ re.compile(fnmatch.translate(glob_path), re.IGNORECASE)
+ if ignore_case
+ else re.compile(fnmatch.translate(glob_path))
+ )
+
+ no_bytes_path = os.listdir(os.path.expanduser(path))
+ str_path = []
+
+ # converting byte object to string
+ for x in no_bytes_path:
+ try:
+ str_path.append(x.decode("utf-8"))
+ except (AttributeError, UnicodeDecodeError):
+ str_path.append(x)
+ return [os.path.join(path, n) for n in str_path if rule.match(n)]
+
+
+# check if dealing with TDT files or csv files
+def check_TDT(filepath):
+ path = glob.glob(os.path.join(filepath, "*.tsq"))
+ if len(path) > 0:
+ return True
+ else:
+ return False
+
+
+# function to read hdf5 file
+def read_hdf5(event, filepath, key):
+ if event:
+ event = event.replace("\\", "_")
+ event = event.replace("/", "_")
+ op = os.path.join(filepath, event + ".hdf5")
+ else:
+ op = filepath
+
+ if os.path.exists(op):
+ with h5py.File(op, "r") as f:
+ arr = np.asarray(f[key])
+ else:
+ logger.error(f"{event}.hdf5 file does not exist")
+ raise Exception("{}.hdf5 file does not exist".format(event))
+
+ return arr
+
+
+# function to write hdf5 file
+def write_hdf5(data, event, filepath, key):
+ event = event.replace("\\", "_")
+ event = event.replace("/", "_")
+ op = os.path.join(filepath, event + ".hdf5")
+
+ # if file does not exist create a new file
+ if not os.path.exists(op):
+ with h5py.File(op, "w") as f:
+ if type(data) is np.ndarray:
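+ # maxshape=(None,) keeps the dataset resizable so later writes can change its length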
+ f.create_dataset(key, data=data, maxshape=(None,), chunks=True)
+ else:
+ f.create_dataset(key, data=data)
+
+ # if file already exists, append data to it or add a new key to it
+ else:
+ with h5py.File(op, "r+") as f:
+ if key in list(f.keys()):
+ if type(data) is np.ndarray:
+ f[key].resize(data.shape)
+ arr = f[key]
+ arr[:] = data
+ else:
+ # overwrite the existing non-array dataset in place
+ f[key][()] = data
+ else:
+ if type(data) is np.ndarray:
+ f.create_dataset(key, data=data, maxshape=(None,), chunks=True)
+ else:
+ f.create_dataset(key, data=data)
+
+
+# function to check if the naming convention for saving storeslist file was followed or not
+def decide_naming_convention(filepath):
+ path_1 = find_files(filepath, "control_*", ignore_case=True) # glob.glob(os.path.join(filepath, 'control*'))
+
+ path_2 = find_files(filepath, "signal_*", ignore_case=True) # glob.glob(os.path.join(filepath, 'signal*'))
+
+ path = sorted(path_1 + path_2, key=str.casefold)
+ if len(path) % 2 != 0:
+ logger.error("There are not equal number of Control and Signal data")
+ raise Exception("There are not equal number of Control and Signal data")
+
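+ # after case-insensitive sorting all control_* paths precede signal_* paths,
+ # so reshape(2, -1) gives row 0 = control files and row 1 = the matching signal files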
+ path = np.asarray(path).reshape(2, -1)
+
+ return path
+
+
+# function to read coordinates file which was saved by selecting chunks for artifacts removal
+def fetchCoords(filepath, naming, data):
+
+ path = os.path.join(filepath, "coordsForPreProcessing_" + naming + ".npy")
+
+ if not os.path.exists(path):
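+ # no chunk selection was saved, so treat the whole recording as a single chunk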
+ coords = np.array([0, data[-1]])
+ else:
+ coords = np.load(os.path.join(filepath, "coordsForPreProcessing_" + naming + ".npy"))[:, 0]
+
+ if coords.shape[0] % 2 != 0:
+ logger.error("Number of values in coordsForPreProcessing file is not even.")
+ raise Exception("Number of values in coordsForPreProcessing file is not even.")
+
+ coords = coords.reshape(-1, 2)
+
+ return coords
+
+
+def get_coords(filepath, name, tsNew, removeArtifacts): # TODO: Make less redundant with fetchCoords
+ if removeArtifacts == True:
+ coords = fetchCoords(filepath, name, tsNew)
+ else:
+ dt = tsNew[1] - tsNew[0]
+ coords = np.array([[tsNew[0] - dt, tsNew[-1] + dt]])
+ return coords
+
+
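+# group output folder paths by their "_output_<i>" suffix so that the i-th output
+# folder of every session ends up in the same sub-list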
+def get_all_stores_for_combining_data(folderNames):
+ op = []
+ for i in range(100):
+ temp = []
+ match = r"[\s\S]*" + "_output_" + str(i)
+ for j in folderNames:
+ temp.append(re.findall(match, j))
+ temp = sorted(list(np.concatenate(temp).flatten()), key=str.casefold)
+ if len(temp) > 0:
+ op.append(temp)
+
+ return op
+
+
+# for combining data, read the storesList files from all output folders and build a combined storesList array
+def check_storeslistfile(folderNames):
+ storesList = np.array([[], []])
+ for i in range(len(folderNames)):
+ filepath = folderNames[i]
+ storesListPath = takeOnlyDirs(glob.glob(os.path.join(filepath, "*_output_*")))
+ for j in range(len(storesListPath)):
+ filepath = storesListPath[j]
+ storesList = np.concatenate(
+ (
+ storesList,
+ np.genfromtxt(os.path.join(filepath, "storesList.csv"), dtype="str", delimiter=",").reshape(2, -1),
+ ),
+ axis=1,
+ )
+
+ storesList = np.unique(storesList, axis=1)
+
+ return storesList
+
+
+def get_control_and_signal_channel_names(storesList):
+ storenames = storesList[0, :]
+ names_for_storenames = storesList[1, :]
+
+ channels_arr = []
+ for i in range(names_for_storenames.shape[0]):
+ if "control" in names_for_storenames[i].lower() or "signal" in names_for_storenames[i].lower():
+ channels_arr.append(names_for_storenames[i])
+
+ channels_arr = sorted(channels_arr, key=str.casefold)
+ try:
+ channels_arr = np.asarray(channels_arr).reshape(2, -1)
+ except:
+ logger.error("Error in saving stores list file or spelling mistake for control or signal")
+ raise Exception("Error in saving stores list file or spelling mistake for control or signal")
+
+ return channels_arr
diff --git a/src/guppy/analysis/standard_io.py b/src/guppy/analysis/standard_io.py
new file mode 100644
index 0000000..e7fe8e0
--- /dev/null
+++ b/src/guppy/analysis/standard_io.py
@@ -0,0 +1,210 @@
+import logging
+import os
+
+import numpy as np
+
+from .io_utils import (
+ decide_naming_convention,
+ fetchCoords,
+ get_control_and_signal_channel_names,
+ read_hdf5,
+ write_hdf5,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def read_control_and_signal(filepath, storesList):
+ channels_arr = get_control_and_signal_channel_names(storesList)
+ storenames = storesList[0, :]
+ names_for_storenames = storesList[1, :]
+
+ name_to_data = {}
+ name_to_timestamps = {}
+ name_to_sampling_rate = {}
+ name_to_npoints = {}
+
+ for i in range(channels_arr.shape[1]):
+ control_name = channels_arr[0, i]
+ signal_name = channels_arr[1, i]
+ idx_c = np.where(names_for_storenames == control_name)[0]
+ idx_s = np.where(names_for_storenames == signal_name)[0]
+ control_storename = storenames[idx_c[0]]
+ signal_storename = storenames[idx_s[0]]
+
+ control_data = read_hdf5(control_storename, filepath, "data")
+ signal_data = read_hdf5(signal_storename, filepath, "data")
+ control_timestamps = read_hdf5(control_storename, filepath, "timestamps")
+ signal_timestamps = read_hdf5(signal_storename, filepath, "timestamps")
+ control_sampling_rate = read_hdf5(control_storename, filepath, "sampling_rate")
+ signal_sampling_rate = read_hdf5(signal_storename, filepath, "sampling_rate")
+ try: # TODO: define npoints for csv datasets
+ control_npoints = read_hdf5(control_storename, filepath, "npoints")
+ signal_npoints = read_hdf5(signal_storename, filepath, "npoints")
+ except KeyError: # npoints is not defined for csv datasets
+ control_npoints = None
+ signal_npoints = None
+
+ name_to_data[control_name] = control_data
+ name_to_data[signal_name] = signal_data
+ name_to_timestamps[control_name] = control_timestamps
+ name_to_timestamps[signal_name] = signal_timestamps
+ name_to_sampling_rate[control_name] = control_sampling_rate
+ name_to_sampling_rate[signal_name] = signal_sampling_rate
+ name_to_npoints[control_name] = control_npoints
+ name_to_npoints[signal_name] = signal_npoints
+
+ return name_to_data, name_to_timestamps, name_to_sampling_rate, name_to_npoints
+
+
+def read_ttl(filepath, storesList):
+ channels_arr = get_control_and_signal_channel_names(storesList)
+ storenames = storesList[0, :]
+ names_for_storenames = storesList[1, :]
+
+ name_to_timestamps = {}
+ for storename, name in zip(storenames, names_for_storenames):
+ if name in channels_arr:
+ continue
+ timestamps = read_hdf5(storename, filepath, "timestamps")
+ name_to_timestamps[name] = timestamps
+
+ return name_to_timestamps
+
+
+def write_corrected_timestamps(
+ filepath, corrected_name_to_timestamps, name_to_timestamps, name_to_sampling_rate, name_to_correctionIndex
+):
+ for name, correctionIndex in name_to_correctionIndex.items():
+ timestamps = name_to_timestamps[name]
+ corrected_timestamps = corrected_name_to_timestamps[name]
+ sampling_rate = name_to_sampling_rate[name]
+ if sampling_rate.shape == (): # numpy scalar
+ sampling_rate = np.asarray([sampling_rate])
+ name_1 = name.split("_")[-1]
+ write_hdf5(np.asarray([timestamps[0]]), "timeCorrection_" + name_1, filepath, "timeRecStart")
+ write_hdf5(corrected_timestamps, "timeCorrection_" + name_1, filepath, "timestampNew")
+ write_hdf5(correctionIndex, "timeCorrection_" + name_1, filepath, "correctionIndex")
+ write_hdf5(sampling_rate, "timeCorrection_" + name_1, filepath, "sampling_rate")
+
+
+def write_corrected_data(filepath, name_to_corrected_data):
+ for name, data in name_to_corrected_data.items():
+ write_hdf5(data, name, filepath, "data")
+
+
+def write_corrected_ttl_timestamps(
+ filepath,
+ compound_name_to_corrected_ttl_timestamps,
+):
+ logger.debug("Applying correction of timestamps to the data and event timestamps")
+ for compound_name, corrected_ttl_timestamps in compound_name_to_corrected_ttl_timestamps.items():
+ write_hdf5(corrected_ttl_timestamps, compound_name, filepath, "ts")
+ logger.info("Timestamps corrections applied to the data and event timestamps.")
+
+
+def read_corrected_data(control_path, signal_path, filepath, name):
+ control = read_hdf5("", control_path, "data").reshape(-1)
+ signal = read_hdf5("", signal_path, "data").reshape(-1)
+ tsNew = read_hdf5("timeCorrection_" + name, filepath, "timestampNew")
+
+ return control, signal, tsNew
+
+
+def write_zscore(filepath, name, z_score, dff, control_fit, temp_control_arr):
+ write_hdf5(z_score, "z_score_" + name, filepath, "data")
+ write_hdf5(dff, "dff_" + name, filepath, "data")
+ write_hdf5(control_fit, "cntrl_sig_fit_" + name, filepath, "data")
+ if temp_control_arr is not None:
+ write_hdf5(temp_control_arr, "control_" + name, filepath, "data")
+
+
+def read_corrected_timestamps_pairwise(filepath):
+ pair_name_to_tsNew = {}
+ pair_name_to_sampling_rate = {}
+ path = decide_naming_convention(filepath)
+ for j in range(path.shape[1]):
+ name_1 = ((os.path.basename(path[0, j])).split(".")[0]).split("_")
+ name_2 = ((os.path.basename(path[1, j])).split(".")[0]).split("_")
+ if name_1[-1] != name_2[-1]:
+ logger.error("Error in naming convention of files or Error in storesList file")
+ raise Exception("Error in naming convention of files or Error in storesList file")
+ name = name_1[-1]
+
+ tsNew = read_hdf5("timeCorrection_" + name, filepath, "timestampNew")
+ sampling_rate = read_hdf5("timeCorrection_" + name, filepath, "sampling_rate")[0]
+ pair_name_to_tsNew[name] = tsNew
+ pair_name_to_sampling_rate[name] = sampling_rate
+ return pair_name_to_tsNew, pair_name_to_sampling_rate
+
+
+def read_coords_pairwise(filepath, pair_name_to_tsNew):
+ pair_name_to_coords = {}
+ path = decide_naming_convention(filepath)
+ for j in range(path.shape[1]):
+ name_1 = ((os.path.basename(path[0, j])).split(".")[0]).split("_")
+ name_2 = ((os.path.basename(path[1, j])).split(".")[0]).split("_")
+ if name_1[-1] != name_2[-1]:
+ logger.error("Error in naming convention of files or Error in storesList file")
+ raise Exception("Error in naming convention of files or Error in storesList file")
+ pair_name = name_1[-1]
+
+ tsNew = pair_name_to_tsNew[pair_name]
+ coords = fetchCoords(filepath, pair_name, tsNew)
+ pair_name_to_coords[pair_name] = coords
+ return pair_name_to_coords
+
+
+def read_corrected_data_dict(filepath, storesList): # TODO: coordinate with read_corrected_data
+ name_to_corrected_data = {}
+ storenames = storesList[0, :]
+ names_for_storenames = storesList[1, :]
+ control_and_signal_names = get_control_and_signal_channel_names(storesList)
+
+ for storename, name in zip(storenames, names_for_storenames):
+ if name not in control_and_signal_names:
+ continue
+ data = read_hdf5(name, filepath, "data").reshape(-1)
+ name_to_corrected_data[name] = data
+
+ return name_to_corrected_data
+
+
+def read_corrected_ttl_timestamps(filepath, storesList):
+ compound_name_to_ttl_timestamps = {}
+ storenames = storesList[0, :]
+ names_for_storenames = storesList[1, :]
+ arr = get_control_and_signal_channel_names(storesList)
+
+ for storename, name in zip(storenames, names_for_storenames):
+ if name in arr:
+ continue
+ ttl_name = name
+ for i in range(arr.shape[1]):
+ name_1 = arr[0, i].split("_")[-1]
+ name_2 = arr[1, i].split("_")[-1]
+ if name_1 != name_2:
+ logger.error("Error in naming convention of files or Error in storesList file")
+ raise Exception("Error in naming convention of files or Error in storesList file")
+ compound_name = ttl_name + "_" + name_1
+ ts = read_hdf5(compound_name, filepath, "ts")
+ compound_name_to_ttl_timestamps[compound_name] = ts
+
+ return compound_name_to_ttl_timestamps
+
+
+def write_artifact_corrected_timestamps(filepath, pair_name_to_corrected_timestamps):
+ for pair_name, timestamps in pair_name_to_corrected_timestamps.items():
+ write_hdf5(timestamps, "timeCorrection_" + pair_name, filepath, "timestampNew")
+
+
+def write_artifact_removal(
+ filepath,
+ name_to_corrected_data,
+ pair_name_to_corrected_timestamps,
+ compound_name_to_corrected_ttl_timestamps=None,
+):
+ write_corrected_data(filepath, name_to_corrected_data)
+ write_corrected_ttl_timestamps(filepath, compound_name_to_corrected_ttl_timestamps)
+ if pair_name_to_corrected_timestamps is not None:
+ write_artifact_corrected_timestamps(filepath, pair_name_to_corrected_timestamps)
diff --git a/src/guppy/analysis/timestamp_correction.py b/src/guppy/analysis/timestamp_correction.py
new file mode 100644
index 0000000..0806fb8
--- /dev/null
+++ b/src/guppy/analysis/timestamp_correction.py
@@ -0,0 +1,200 @@
+import logging
+
+import numpy as np
+
+from .io_utils import get_control_and_signal_channel_names
+
+logger = logging.getLogger(__name__)
+
+
+def correct_timestamps(
+ timeForLightsTurnOn,
+ storesList,
+ name_to_timestamps,
+ name_to_data,
+ name_to_sampling_rate,
+ name_to_npoints,
+ name_to_timestamps_ttl,
+ mode,
+):
+ name_to_corrected_timestamps, name_to_correctionIndex, name_to_corrected_data = timestampCorrection(
+ timeForLightsTurnOn,
+ storesList,
+ name_to_timestamps,
+ name_to_data,
+ name_to_sampling_rate,
+ name_to_npoints,
+ mode=mode,
+ )
+ compound_name_to_corrected_ttl_timestamps = decide_naming_and_applyCorrection_ttl(
+ timeForLightsTurnOn,
+ storesList,
+ name_to_timestamps_ttl,
+ name_to_timestamps,
+ name_to_data,
+ mode=mode,
+ )
+
+ return (
+ name_to_corrected_timestamps,
+ name_to_correctionIndex,
+ name_to_corrected_data,
+ compound_name_to_corrected_ttl_timestamps,
+ )
+
+
+# function to correct timestamps after eliminating first few seconds of the data (for csv or TDT data depending on mode)
+def timestampCorrection(
+ timeForLightsTurnOn,
+ storesList,
+ name_to_timestamps,
+ name_to_data,
+ name_to_sampling_rate,
+ name_to_npoints,
+ mode,
+):
+ logger.debug(
+ f"Correcting timestamps by getting rid of the first {timeForLightsTurnOn} seconds and convert timestamps to seconds"
+ )
+ if mode not in ["tdt", "csv"]:
+ logger.error("Mode should be either 'tdt' or 'csv'")
+ raise ValueError("Mode should be either 'tdt' or 'csv'")
+ name_to_corrected_timestamps = {}
+ name_to_correctionIndex = {}
+ name_to_corrected_data = {}
+ storenames = storesList[0, :]
+ names_for_storenames = storesList[1, :]
+ channels_arr = get_control_and_signal_channel_names(storesList)
+
+ indices = check_cntrl_sig_length(channels_arr, name_to_data)
+
+ for i in range(channels_arr.shape[1]):
+ control_name = channels_arr[0, i]
+ signal_name = channels_arr[1, i]
+ name_1 = channels_arr[0, i].split("_")[-1]
+ name_2 = channels_arr[1, i].split("_")[-1]
+ if name_1 != name_2:
+ logger.error("Error in naming convention of files or Error in storesList file")
+ raise Exception("Error in naming convention of files or Error in storesList file")
+
+ # dirname = os.path.dirname(path[i])
+ idx = np.where(names_for_storenames == indices[i])[0]
+
+ if idx.shape[0] == 0:
+ logger.error(f"{channels_arr[0,i]} does not exist in the stores list file.")
+ raise Exception("{} does not exist in the stores list file.".format(channels_arr[0, i]))
+
+ name = names_for_storenames[idx][0]
+ timestamp = name_to_timestamps[name]
+ sampling_rate = name_to_sampling_rate[name]
+ npoints = name_to_npoints[name]
+
+ if mode == "tdt":
+ timeRecStart = timestamp[0]
+ timestamps = np.subtract(timestamp, timeRecStart)
+ adder = np.arange(npoints) / sampling_rate
+ lengthAdder = adder.shape[0]
+ timestampNew = np.zeros((len(timestamps), lengthAdder))
+ for i in range(lengthAdder):
+ timestampNew[:, i] = np.add(timestamps, adder[i])
+ timestampNew = (timestampNew.T).reshape(-1, order="F")
+ correctionIndex = np.where(timestampNew >= timeForLightsTurnOn)[0]
+ timestampNew = timestampNew[correctionIndex]
+ elif mode == "csv":
+ correctionIndex = np.where(timestamp >= timeForLightsTurnOn)[0]
+ timestampNew = timestamp[correctionIndex]
+
+ for displayName in [control_name, signal_name]:
+ name_to_corrected_timestamps[displayName] = timestampNew
+ name_to_correctionIndex[displayName] = correctionIndex
+ data = name_to_data[displayName]
+ if (data == 0).all() == True:
+ name_to_corrected_data[displayName] = data
+ else:
+ name_to_corrected_data[displayName] = data[correctionIndex]
+
+ logger.info("Timestamps corrected and converted to seconds.")
+ return name_to_corrected_timestamps, name_to_correctionIndex, name_to_corrected_data
+
+
+def decide_naming_and_applyCorrection_ttl(
+ timeForLightsTurnOn,
+ storesList,
+ name_to_timestamps_ttl,
+ name_to_timestamps,
+ name_to_data,
+ mode,
+):
+ logger.debug("Applying correction of timestamps to the data and event timestamps")
+ storenames = storesList[0, :]
+ names_for_storenames = storesList[1, :]
+ arr = get_control_and_signal_channel_names(storesList)
+ indices = check_cntrl_sig_length(arr, name_to_data)
+
+ compound_name_to_corrected_ttl_timestamps = {}
+ for ttl_name, ttl_timestamps in name_to_timestamps_ttl.items():
+ for i in range(arr.shape[1]):
+ name_1 = arr[0, i].split("_")[-1]
+ name_2 = arr[1, i].split("_")[-1]
+ if name_1 != name_2:
+ logger.error("Error in naming convention of files or Error in storesList file")
+ raise Exception("Error in naming convention of files or Error in storesList file")
+
+ idx = np.where(names_for_storenames == indices[i])[0]
+ if idx.shape[0] == 0:
+ logger.error(f"{arr[0,i]} does not exist in the stores list file.")
+ raise Exception("{} does not exist in the stores list file.".format(arr[0, i]))
+
+ name = names_for_storenames[idx][0]
+ timestamps = name_to_timestamps[name]
+ timeRecStart = timestamps[0]
+ corrected_ttl_timestamps = applyCorrection_ttl(
+ timeForLightsTurnOn,
+ timeRecStart,
+ ttl_timestamps,
+ mode,
+ )
+ compound_name = ttl_name + "_" + name_1
+ compound_name_to_corrected_ttl_timestamps[compound_name] = corrected_ttl_timestamps
+
+ logger.info("Timestamps corrections applied to the data and event timestamps.")
+ return compound_name_to_corrected_ttl_timestamps
+
+
+def applyCorrection_ttl(
+ timeForLightsTurnOn,
+ timeRecStart,
+ ttl_timestamps,
+ mode,
+):
+ corrected_ttl_timestamps = ttl_timestamps
+ if mode == "tdt":
+ res = (corrected_ttl_timestamps >= timeRecStart).all()
+ if res == True:
+ corrected_ttl_timestamps = np.subtract(corrected_ttl_timestamps, timeRecStart)
+ corrected_ttl_timestamps = np.subtract(corrected_ttl_timestamps, timeForLightsTurnOn)
+ else:
+ corrected_ttl_timestamps = np.subtract(corrected_ttl_timestamps, timeForLightsTurnOn)
+ elif mode == "csv":
+ corrected_ttl_timestamps = np.subtract(corrected_ttl_timestamps, timeForLightsTurnOn)
+ return corrected_ttl_timestamps
+
+
+# function to check that the control and signal channels have the same length;
+# if not, use the shorter channel's timestamps for pre-processing
+def check_cntrl_sig_length(channels_arr, name_to_data):
+
+ indices = []
+ for i in range(channels_arr.shape[1]):
+ control_name = channels_arr[0, i]
+ signal_name = channels_arr[1, i]
+ control = name_to_data[control_name]
+ signal = name_to_data[signal_name]
+ if control.shape[0] < signal.shape[0]:
+ indices.append(control_name)
+ elif control.shape[0] > signal.shape[0]:
+ indices.append(signal_name)
+ else:
+ indices.append(signal_name)
+
+ return indices
diff --git a/src/guppy/analysis/z_score.py b/src/guppy/analysis/z_score.py
new file mode 100644
index 0000000..34b29ee
--- /dev/null
+++ b/src/guppy/analysis/z_score.py
@@ -0,0 +1,148 @@
+import logging
+
+import numpy as np
+from scipy import signal as ss
+
+from .control_channel import helper_create_control_channel
+
+logger = logging.getLogger(__name__)
+
+
+# high-level function to compute z-score and deltaF/F
+def compute_z_score(
+ control,
+ signal,
+ tsNew,
+ coords,
+ artifactsRemovalMethod,
+ filter_window,
+ isosbestic_control,
+ zscore_method,
+ baseline_start,
+ baseline_end,
+):
+ if (control == 0).all() == True:
+ control = np.zeros(tsNew.shape[0])
+
+ z_score_arr = np.array([])
+ norm_data_arr = np.full(tsNew.shape[0], np.nan)
+ control_fit_arr = np.full(tsNew.shape[0], np.nan)
+ temp_control_arr = np.full(tsNew.shape[0], np.nan)
+
+ # for artifact removal, each chunk selected by the user is processed individually
+ # and then the z-score is calculated
+ for i in range(coords.shape[0]):
+ tsNew_index = np.where((tsNew > coords[i, 0]) & (tsNew < coords[i, 1]))[0]
+ if isosbestic_control == False:
+ control_arr = helper_create_control_channel(signal[tsNew_index], tsNew[tsNew_index], window=101)
+ signal_arr = signal[tsNew_index]
+ norm_data, control_fit = execute_controlFit_dff(control_arr, signal_arr, isosbestic_control, filter_window)
+ temp_control_arr[tsNew_index] = control_arr
+ if i < coords.shape[0] - 1:
+ blank_index = np.where((tsNew > coords[i, 1]) & (tsNew < coords[i + 1, 0]))[0]
+ temp_control_arr[blank_index] = np.full(blank_index.shape[0], np.nan)
+ else:
+ control_arr = control[tsNew_index]
+ signal_arr = signal[tsNew_index]
+ norm_data, control_fit = execute_controlFit_dff(control_arr, signal_arr, isosbestic_control, filter_window)
+ norm_data_arr[tsNew_index] = norm_data
+ control_fit_arr[tsNew_index] = control_fit
+
+ if artifactsRemovalMethod == "concatenate":
+ norm_data_arr = norm_data_arr[~np.isnan(norm_data_arr)]
+ control_fit_arr = control_fit_arr[~np.isnan(control_fit_arr)]
+ z_score = z_score_computation(norm_data_arr, tsNew, zscore_method, baseline_start, baseline_end)
+ z_score_arr = np.concatenate((z_score_arr, z_score))
+
+ # handle the case if there are chunks being cut in the front and the end
+ if isosbestic_control == False:
+ coords = coords.flatten()
+ # front chunk
+ idx = np.where((tsNew >= tsNew[0]) & (tsNew < coords[0]))[0]
+ temp_control_arr[idx] = np.full(idx.shape[0], np.nan)
+ # end chunk
+ idx = np.where((tsNew > coords[-1]) & (tsNew <= tsNew[-1]))[0]
+ temp_control_arr[idx] = np.full(idx.shape[0], np.nan)
+ else:
+ temp_control_arr = None
+
+ return z_score_arr, norm_data_arr, control_fit_arr, temp_control_arr
+
+
+# function to filter the control and signal channels and run controlFit and deltaFF (defined below)
+# it also handles the case where there is no isosbestic control channel: the control passed in
+# is then the one generated from the signal channel, so deltaF/F is computed from the signal alone
+def execute_controlFit_dff(control, signal, isosbestic_control, filter_window):
+
+ if isosbestic_control == False:
+ signal_smooth = filterSignal(filter_window, signal) # ss.filtfilt(b, a, signal)
+ control_fit = controlFit(control, signal_smooth)
+ norm_data = deltaFF(signal_smooth, control_fit)
+ else:
+ control_smooth = filterSignal(filter_window, control) # ss.filtfilt(b, a, control)
+ signal_smooth = filterSignal(filter_window, signal) # ss.filtfilt(b, a, signal)
+ control_fit = controlFit(control_smooth, signal_smooth)
+ norm_data = deltaFF(signal_smooth, control_fit)
+
+ return norm_data, control_fit
+
+
+# function to compute deltaF/F using fitted control channel and filtered signal channel
+def deltaFF(signal, control):
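+ # deltaF/F (%) = 100 * (signal - control) / control, where control is the fitted control channel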
+
+ res = np.subtract(signal, control)
+ normData = np.divide(res, control)
+ # deltaFF = normData
+ normData = normData * 100
+
+ return normData
+
+
+# function to fit control channel to signal channel
+def controlFit(control, signal):
+
+ p = np.polyfit(control, signal, 1)
+ arr = (p[0] * control) + p[1]
+ return arr
+
+
+def filterSignal(filter_window, signal):
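+ # moving-average smoothing: a uniform kernel of length filter_window applied with
+ # filtfilt (forward and backward), so the filtered signal has no phase lag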
+ if filter_window == 0:
+ return signal
+ elif filter_window > 1:
+ b = np.divide(np.ones((filter_window,)), filter_window)
+ a = 1
+ filtered_signal = ss.filtfilt(b, a, signal)
+ return filtered_signal
+ else:
+ raise Exception("Moving average filter window value is not correct.")
+
+
+# function to compute z-score based on z-score computation method
+def z_score_computation(dff, timestamps, zscore_method, baseline_start, baseline_end):
+ if zscore_method == "standard z-score":
+ numerator = np.subtract(dff, np.nanmean(dff))
+ zscore = np.divide(numerator, np.nanstd(dff))
+ elif zscore_method == "baseline z-score":
+ idx = np.where((timestamps > baseline_start) & (timestamps < baseline_end))[0]
+ if idx.shape[0] == 0:
+ logger.error(
+ "Baseline window parameters for baseline z-score computation are not correct."
+ )
+ raise Exception(
+ "Baseline window parameters for baseline z-score computation are not correct."
+ )
+ else:
+ baseline_mean = np.nanmean(dff[idx])
+ baseline_std = np.nanstd(dff[idx])
+ numerator = np.subtract(dff, baseline_mean)
+ zscore = np.divide(numerator, baseline_std)
+ else:
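+ # modified (robust) z-score: 0.6745 * (x - median) / MAD, where 0.6745 makes the MAD
+ # a consistent estimate of the standard deviation for normally distributed data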
+ median = np.median(dff)
+ mad = np.median(np.abs(dff - median))
+ numerator = 0.6745 * (dff - median)
+ zscore = np.divide(numerator, mad)
+
+ return zscore
diff --git a/src/guppy/preprocess.py b/src/guppy/preprocess.py
index 8b79039..0c41ae4 100755
--- a/src/guppy/preprocess.py
+++ b/src/guppy/preprocess.py
@@ -1,22 +1,40 @@
-import fnmatch
import glob
import json
import logging
import os
-import re
-import shutil
import sys
-import h5py
import matplotlib.pyplot as plt
import numpy as np
-import pandas as pd
-from scipy import signal as ss
-from scipy.optimize import curve_fit
-from .combineDataFn import processTimestampsForCombiningData
-
-logger = logging.getLogger(__name__)
+from .analysis.artifact_removal import remove_artifacts
+from .analysis.combine_data import combine_data
+from .analysis.control_channel import add_control_channel, create_control_channel
+from .analysis.io_utils import (
+ check_storeslistfile,
+ check_TDT,
+ find_files,
+ get_all_stores_for_combining_data, # noqa: F401 -- Necessary for other modules that depend on preprocess.py
+ get_coords,
+ read_hdf5,
+ takeOnlyDirs,
+)
+from .analysis.standard_io import (
+ read_control_and_signal,
+ read_coords_pairwise,
+ read_corrected_data,
+ read_corrected_data_dict,
+ read_corrected_timestamps_pairwise,
+ read_corrected_ttl_timestamps,
+ read_ttl,
+ write_artifact_removal,
+ write_corrected_data,
+ write_corrected_timestamps,
+ write_corrected_ttl_timestamps,
+ write_zscore,
+)
+from .analysis.timestamp_correction import correct_timestamps
+from .analysis.z_score import compute_z_score
logger = logging.getLogger(__name__)
@@ -25,404 +43,11 @@
plt.switch_backend("TKAgg")
-def takeOnlyDirs(paths):
- removePaths = []
- for p in paths:
- if os.path.isfile(p):
- removePaths.append(p)
- return list(set(paths) - set(removePaths))
-
-
def writeToFile(value: str):
with open(os.path.join(os.path.expanduser("~"), "pbSteps.txt"), "a") as file:
file.write(value)
-# find files by ignoring the case sensitivity
-def find_files(path, glob_path, ignore_case=False):
- rule = (
- re.compile(fnmatch.translate(glob_path), re.IGNORECASE)
- if ignore_case
- else re.compile(fnmatch.translate(glob_path))
- )
-
- no_bytes_path = os.listdir(os.path.expanduser(path))
- str_path = []
-
- # converting byte object to string
- for x in no_bytes_path:
- try:
- str_path.append(x.decode("utf-8"))
- except:
- str_path.append(x)
- return [os.path.join(path, n) for n in str_path if rule.match(n)]
-
-
-# curve fit exponential function
-def curveFitFn(x, a, b, c):
- return a + (b * np.exp(-(1 / c) * x))
-
-
-# helper function to create control channel using signal channel
-# by curve fitting signal channel to exponential function
-# when there is no isosbestic control channel is present
-def helper_create_control_channel(signal, timestamps, window):
- # check if window is greater than signal shape
- if window > signal.shape[0]:
- window = ((signal.shape[0] + 1) / 2) + 1
- if window % 2 != 0:
- window = window
- else:
- window = window + 1
-
- filtered_signal = ss.savgol_filter(signal, window_length=window, polyorder=3)
-
- p0 = [5, 50, 60]
-
- try:
- popt, pcov = curve_fit(curveFitFn, timestamps, filtered_signal, p0)
- except Exception as e:
- logger.error(str(e))
-
- # logger.info('Curve Fit Parameters : ', popt)
- control = curveFitFn(timestamps, *popt)
-
- return control
-
-
-# main function to create control channel using
-# signal channel and save it to a file
-def create_control_channel(filepath, arr, window=5001):
-
- storenames = arr[0, :]
- storesList = arr[1, :]
-
- for i in range(storesList.shape[0]):
- event_name, event = storesList[i], storenames[i]
- if "control" in event_name.lower() and "cntrl" in event.lower():
- logger.debug("Creating control channel from signal channel using curve-fitting")
- name = event_name.split("_")[-1]
- signal = read_hdf5("signal_" + name, filepath, "data")
- timestampNew = read_hdf5("timeCorrection_" + name, filepath, "timestampNew")
- sampling_rate = np.full(timestampNew.shape, np.nan)
- sampling_rate[0] = read_hdf5("timeCorrection_" + name, filepath, "sampling_rate")[0]
-
- control = helper_create_control_channel(signal, timestampNew, window)
-
- write_hdf5(control, event_name, filepath, "data")
- d = {"timestamps": timestampNew, "data": control, "sampling_rate": sampling_rate}
- df = pd.DataFrame(d)
- df.to_csv(os.path.join(os.path.dirname(filepath), event.lower() + ".csv"), index=False)
- logger.info("Control channel from signal channel created using curve-fitting")
-
-
-# function to add control channel when there is no
-# isosbestic control channel and update the storeslist file
-def add_control_channel(filepath, arr):
-
- storenames = arr[0, :]
- storesList = np.char.lower(arr[1, :])
-
- keep_control = np.array([])
- # check a case if there is isosbestic control channel present
- for i in range(storesList.shape[0]):
- if "control" in storesList[i].lower():
- name = storesList[i].split("_")[-1]
- new_str = "signal_" + str(name).lower()
- find_signal = [True for i in storesList if i == new_str]
- if len(find_signal) > 1:
- logger.error("Error in naming convention of files or Error in storesList file")
- raise Exception("Error in naming convention of files or Error in storesList file")
- if len(find_signal) == 0:
- logger.error(
- "Isosbectic control channel parameter is set to False and still \
- storeslist file shows there is control channel present"
- )
- raise Exception(
- "Isosbectic control channel parameter is set to False and still \
- storeslist file shows there is control channel present"
- )
- else:
- continue
-
- for i in range(storesList.shape[0]):
- if "signal" in storesList[i].lower():
- name = storesList[i].split("_")[-1]
- new_str = "control_" + str(name).lower()
- find_signal = [True for i in storesList if i == new_str]
- if len(find_signal) == 0:
- src, dst = os.path.join(filepath, arr[0, i] + ".hdf5"), os.path.join(
- filepath, "cntrl" + str(i) + ".hdf5"
- )
- shutil.copyfile(src, dst)
- arr = np.concatenate((arr, [["cntrl" + str(i)], ["control_" + str(arr[1, i].split("_")[-1])]]), axis=1)
-
- np.savetxt(os.path.join(filepath, "storesList.csv"), arr, delimiter=",", fmt="%s")
-
- return arr
-
-
-# check if dealing with TDT files or csv files
-def check_TDT(filepath):
- path = glob.glob(os.path.join(filepath, "*.tsq"))
- if len(path) > 0:
- return True
- else:
- return False
-
-
-# function to read hdf5 file
-def read_hdf5(event, filepath, key):
- if event:
- event = event.replace("\\", "_")
- event = event.replace("/", "_")
- op = os.path.join(filepath, event + ".hdf5")
- else:
- op = filepath
-
- if os.path.exists(op):
- with h5py.File(op, "r") as f:
- arr = np.asarray(f[key])
- else:
- logger.error(f"{event}.hdf5 file does not exist")
- raise Exception("{}.hdf5 file does not exist".format(event))
-
- return arr
-
-
-# function to write hdf5 file
-def write_hdf5(data, event, filepath, key):
- event = event.replace("\\", "_")
- event = event.replace("/", "_")
- op = os.path.join(filepath, event + ".hdf5")
-
- # if file does not exist create a new file
- if not os.path.exists(op):
- with h5py.File(op, "w") as f:
- if type(data) is np.ndarray:
- f.create_dataset(key, data=data, maxshape=(None,), chunks=True)
- else:
- f.create_dataset(key, data=data)
-
- # if file already exists, append data to it or add a new key to it
- else:
- with h5py.File(op, "r+") as f:
- if key in list(f.keys()):
- if type(data) is np.ndarray:
- f[key].resize(data.shape)
- arr = f[key]
- arr[:] = data
- else:
- arr = f[key]
- arr = data
- else:
- if type(data) is np.ndarray:
- f.create_dataset(key, data=data, maxshape=(None,), chunks=True)
- else:
- f.create_dataset(key, data=data)
-
-
-# function to check control and signal channel has same length
-# if not, take a smaller length and do pre-processing
-def check_cntrl_sig_length(filepath, channels_arr, storenames, storesList):
-
- indices = []
- for i in range(channels_arr.shape[1]):
- idx_c = np.where(storesList == channels_arr[0, i])[0]
- idx_s = np.where(storesList == channels_arr[1, i])[0]
- control = read_hdf5(storenames[idx_c[0]], filepath, "data")
- signal = read_hdf5(storenames[idx_s[0]], filepath, "data")
- if control.shape[0] < signal.shape[0]:
- indices.append(storesList[idx_c[0]])
- elif control.shape[0] > signal.shape[0]:
- indices.append(storesList[idx_s[0]])
- else:
- indices.append(storesList[idx_s[0]])
-
- return indices
-
-
-# function to correct timestamps after eliminating first few seconds of the data (for csv data)
-def timestampCorrection_csv(filepath, timeForLightsTurnOn, storesList):
-
- logger.debug(
- f"Correcting timestamps by getting rid of the first {timeForLightsTurnOn} seconds and convert timestamps to seconds"
- )
- storenames = storesList[0, :]
- storesList = storesList[1, :]
-
- arr = []
- for i in range(storesList.shape[0]):
- if "control" in storesList[i].lower() or "signal" in storesList[i].lower():
- arr.append(storesList[i])
-
- arr = sorted(arr, key=str.casefold)
- try:
- arr = np.asarray(arr).reshape(2, -1)
- except:
- logger.error("Error in saving stores list file or spelling mistake for control or signal")
- raise Exception("Error in saving stores list file or spelling mistake for control or signal")
-
- indices = check_cntrl_sig_length(filepath, arr, storenames, storesList)
-
- for i in range(arr.shape[1]):
- name_1 = arr[0, i].split("_")[-1]
- name_2 = arr[1, i].split("_")[-1]
- # dirname = os.path.dirname(path[i])
- idx = np.where(storesList == indices[i])[0]
-
- if idx.shape[0] == 0:
- logger.error(f"{arr[0,i]} does not exist in the stores list file.")
- raise Exception("{} does not exist in the stores list file.".format(arr[0, i]))
-
- timestamp = read_hdf5(storenames[idx][0], filepath, "timestamps")
- sampling_rate = read_hdf5(storenames[idx][0], filepath, "sampling_rate")
-
- if name_1 == name_2:
- correctionIndex = np.where(timestamp >= timeForLightsTurnOn)[0]
- timestampNew = timestamp[correctionIndex]
- write_hdf5(timestampNew, "timeCorrection_" + name_1, filepath, "timestampNew")
- write_hdf5(correctionIndex, "timeCorrection_" + name_1, filepath, "correctionIndex")
- write_hdf5(np.asarray(sampling_rate), "timeCorrection_" + name_1, filepath, "sampling_rate")
-
- else:
- logger.error("Error in naming convention of files or Error in storesList file")
- raise Exception("Error in naming convention of files or Error in storesList file")
-
- logger.info("Timestamps corrected and converted to seconds.")
-
-
-# function to correct timestamps after eliminating first few seconds of the data (for TDT data)
-def timestampCorrection_tdt(filepath, timeForLightsTurnOn, storesList):
-
- logger.debug(
- f"Correcting timestamps by getting rid of the first {timeForLightsTurnOn} seconds and convert timestamps to seconds"
- )
- storenames = storesList[0, :]
- storesList = storesList[1, :]
-
- arr = []
- for i in range(storesList.shape[0]):
- if "control" in storesList[i].lower() or "signal" in storesList[i].lower():
- arr.append(storesList[i])
-
- arr = sorted(arr, key=str.casefold)
-
- try:
- arr = np.asarray(arr).reshape(2, -1)
- except:
- logger.error("Error in saving stores list file or spelling mistake for control or signal")
- raise Exception("Error in saving stores list file or spelling mistake for control or signal")
-
- indices = check_cntrl_sig_length(filepath, arr, storenames, storesList)
-
- for i in range(arr.shape[1]):
- name_1 = arr[0, i].split("_")[-1]
- name_2 = arr[1, i].split("_")[-1]
- # dirname = os.path.dirname(path[i])
- idx = np.where(storesList == indices[i])[0]
-
- if idx.shape[0] == 0:
- logger.error(f"{arr[0,i]} does not exist in the stores list file.")
- raise Exception("{} does not exist in the stores list file.".format(arr[0, i]))
-
- timestamp = read_hdf5(storenames[idx][0], filepath, "timestamps")
- npoints = read_hdf5(storenames[idx][0], filepath, "npoints")
- sampling_rate = read_hdf5(storenames[idx][0], filepath, "sampling_rate")
-
- if name_1 == name_2:
- timeRecStart = timestamp[0]
- timestamps = np.subtract(timestamp, timeRecStart)
- adder = np.arange(npoints) / sampling_rate
- lengthAdder = adder.shape[0]
- timestampNew = np.zeros((len(timestamps), lengthAdder))
- for i in range(lengthAdder):
- timestampNew[:, i] = np.add(timestamps, adder[i])
- timestampNew = (timestampNew.T).reshape(-1, order="F")
- correctionIndex = np.where(timestampNew >= timeForLightsTurnOn)[0]
- timestampNew = timestampNew[correctionIndex]
-
- write_hdf5(np.asarray([timeRecStart]), "timeCorrection_" + name_1, filepath, "timeRecStart")
- write_hdf5(timestampNew, "timeCorrection_" + name_1, filepath, "timestampNew")
- write_hdf5(correctionIndex, "timeCorrection_" + name_1, filepath, "correctionIndex")
- write_hdf5(np.asarray([sampling_rate]), "timeCorrection_" + name_1, filepath, "sampling_rate")
- else:
- logger.error("Error in naming convention of files or Error in storesList file")
- raise Exception("Error in naming convention of files or Error in storesList file")
-
- logger.info("Timestamps corrected and converted to seconds.")
- # return timeRecStart, correctionIndex, timestampNew
-
-
-# function to apply correction to control, signal and event timestamps
-def applyCorrection(filepath, timeForLightsTurnOn, event, displayName, naming):
-
- cond = check_TDT(os.path.dirname(filepath))
-
- if cond == True:
- timeRecStart = read_hdf5("timeCorrection_" + naming, filepath, "timeRecStart")[0]
-
- timestampNew = read_hdf5("timeCorrection_" + naming, filepath, "timestampNew")
- correctionIndex = read_hdf5("timeCorrection_" + naming, filepath, "correctionIndex")
-
- if "control" in displayName.lower() or "signal" in displayName.lower():
- split_name = displayName.split("_")[-1]
- if split_name == naming:
- pass
- else:
- correctionIndex = read_hdf5("timeCorrection_" + split_name, filepath, "correctionIndex")
- arr = read_hdf5(event, filepath, "data")
- if (arr == 0).all() == True:
- arr = arr
- else:
- arr = arr[correctionIndex]
- write_hdf5(arr, displayName, filepath, "data")
- else:
- arr = read_hdf5(event, filepath, "timestamps")
- if cond == True:
- res = (arr >= timeRecStart).all()
- if res == True:
- arr = np.subtract(arr, timeRecStart)
- arr = np.subtract(arr, timeForLightsTurnOn)
- else:
- arr = np.subtract(arr, timeForLightsTurnOn)
- else:
- arr = np.subtract(arr, timeForLightsTurnOn)
- write_hdf5(arr, displayName + "_" + naming, filepath, "ts")
-
- # if isosbestic_control==False and 'control' in displayName.lower():
- # control = create_control_channel(filepath, displayName)
- # write_hdf5(control, displayName, filepath, 'data')
-
-
-# function to check if naming convention was followed while saving storeslist file
-# and apply timestamps correction using the function applyCorrection
-def decide_naming_convention_and_applyCorrection(filepath, timeForLightsTurnOn, event, displayName, storesList):
-
- logger.debug("Applying correction of timestamps to the data and event timestamps")
- storesList = storesList[1, :]
-
- arr = []
- for i in range(storesList.shape[0]):
- if "control" in storesList[i].lower() or "signal" in storesList[i].lower():
- arr.append(storesList[i])
-
- arr = sorted(arr, key=str.casefold)
- arr = np.asarray(arr).reshape(2, -1)
-
- for i in range(arr.shape[1]):
- name_1 = arr[0, i].split("_")[-1]
- name_2 = arr[1, i].split("_")[-1]
- # dirname = os.path.dirname(path[i])
- if name_1 == name_2:
- applyCorrection(filepath, timeForLightsTurnOn, event, displayName, name_1)
- else:
- logger.error("Error in naming convention of files or Error in storesList file")
- raise Exception("Error in naming convention of files or Error in storesList file")
-
- logger.info("Timestamps corrections applied to the data and event timestamps.")
-
-
# function to plot z_score
def visualize_z_score(filepath):
@@ -590,421 +215,6 @@ def visualizeControlAndSignal(filepath, removeArtifacts):
visualize(filepath, ts, control, signal, cntrl_sig_fit, plot_name, removeArtifacts)
-# function to check if the naming convention for saving storeslist file was followed or not
-def decide_naming_convention(filepath):
- path_1 = find_files(filepath, "control_*", ignore_case=True) # glob.glob(os.path.join(filepath, 'control*'))
-
- path_2 = find_files(filepath, "signal_*", ignore_case=True) # glob.glob(os.path.join(filepath, 'signal*'))
-
- path = sorted(path_1 + path_2, key=str.casefold)
- if len(path) % 2 != 0:
- logger.error("There are not equal number of Control and Signal data")
- raise Exception("There are not equal number of Control and Signal data")
-
- path = np.asarray(path).reshape(2, -1)
-
- return path
-
-
-# function to read coordinates file which was saved by selecting chunks for artifacts removal
-def fetchCoords(filepath, naming, data):
-
- path = os.path.join(filepath, "coordsForPreProcessing_" + naming + ".npy")
-
- if not os.path.exists(path):
- coords = np.array([0, data[-1]])
- else:
- coords = np.load(os.path.join(filepath, "coordsForPreProcessing_" + naming + ".npy"))[:, 0]
-
- if coords.shape[0] % 2 != 0:
- logger.error("Number of values in coordsForPreProcessing file is not even.")
- raise Exception("Number of values in coordsForPreProcessing file is not even.")
-
- coords = coords.reshape(-1, 2)
-
- return coords
-
-
-# helper function to process control and signal timestamps
-def eliminateData(filepath, timeForLightsTurnOn, event, sampling_rate, naming):
-
- ts = read_hdf5("timeCorrection_" + naming, filepath, "timestampNew")
- data = read_hdf5(event, filepath, "data").reshape(-1)
- coords = fetchCoords(filepath, naming, ts)
-
- if (data == 0).all() == True:
- data = np.zeros(ts.shape[0])
-
- arr = np.array([])
- ts_arr = np.array([])
- for i in range(coords.shape[0]):
-
- index = np.where((ts > coords[i, 0]) & (ts < coords[i, 1]))[0]
-
- if len(arr) == 0:
- arr = np.concatenate((arr, data[index]))
- sub = ts[index][0] - timeForLightsTurnOn
- new_ts = ts[index] - sub
- ts_arr = np.concatenate((ts_arr, new_ts))
- else:
- temp = data[index]
- # new = temp + (arr[-1]-temp[0])
- temp_ts = ts[index]
- new_ts = temp_ts - (temp_ts[0] - ts_arr[-1])
- arr = np.concatenate((arr, temp))
- ts_arr = np.concatenate((ts_arr, new_ts + (1 / sampling_rate)))
-
- # logger.info(arr.shape, ts_arr.shape)
- return arr, ts_arr
-
-
-# helper function to align event timestamps with the control and signal timestamps
-def eliminateTs(filepath, timeForLightsTurnOn, event, sampling_rate, naming):
-
- tsNew = read_hdf5("timeCorrection_" + naming, filepath, "timestampNew")
- ts = read_hdf5(event + "_" + naming, filepath, "ts").reshape(-1)
- coords = fetchCoords(filepath, naming, tsNew)
-
- ts_arr = np.array([])
- tsNew_arr = np.array([])
- for i in range(coords.shape[0]):
- tsNew_index = np.where((tsNew > coords[i, 0]) & (tsNew < coords[i, 1]))[0]
- ts_index = np.where((ts > coords[i, 0]) & (ts < coords[i, 1]))[0]
-
- if len(tsNew_arr) == 0:
- sub = tsNew[tsNew_index][0] - timeForLightsTurnOn
- tsNew_arr = np.concatenate((tsNew_arr, tsNew[tsNew_index] - sub))
- ts_arr = np.concatenate((ts_arr, ts[ts_index] - sub))
- else:
- temp_tsNew = tsNew[tsNew_index]
- temp_ts = ts[ts_index]
- new_ts = temp_ts - (temp_tsNew[0] - tsNew_arr[-1])
- new_tsNew = temp_tsNew - (temp_tsNew[0] - tsNew_arr[-1])
- tsNew_arr = np.concatenate((tsNew_arr, new_tsNew + (1 / sampling_rate)))
- ts_arr = np.concatenate((ts_arr, new_ts + (1 / sampling_rate)))
-
- return ts_arr
-
-
-# adding nan values to removed chunks
-# when using artifacts removal method - replace with NaN
-def addingNaNValues(filepath, event, naming):
-
- ts = read_hdf5("timeCorrection_" + naming, filepath, "timestampNew")
- data = read_hdf5(event, filepath, "data").reshape(-1)
- coords = fetchCoords(filepath, naming, ts)
-
- if (data == 0).all() == True:
- data = np.zeros(ts.shape[0])
-
- arr = np.array([])
- ts_index = np.arange(ts.shape[0])
- for i in range(coords.shape[0]):
-
- index = np.where((ts > coords[i, 0]) & (ts < coords[i, 1]))[0]
- arr = np.concatenate((arr, index))
-
- nan_indices = list(set(ts_index).symmetric_difference(arr))
- data[nan_indices] = np.nan
-
- return data
-
-
-# remove event TTLs which falls in the removed chunks
-# when using artifacts removal method - replace with NaN
-def removeTTLs(filepath, event, naming):
- tsNew = read_hdf5("timeCorrection_" + naming, filepath, "timestampNew")
- ts = read_hdf5(event + "_" + naming, filepath, "ts").reshape(-1)
- coords = fetchCoords(filepath, naming, tsNew)
-
- ts_arr = np.array([])
- for i in range(coords.shape[0]):
- ts_index = np.where((ts > coords[i, 0]) & (ts < coords[i, 1]))[0]
- ts_arr = np.concatenate((ts_arr, ts[ts_index]))
-
- return ts_arr
-
-
-def addingNaNtoChunksWithArtifacts(filepath, events):
-
- logger.debug("Replacing chunks with artifacts by NaN values.")
- storesList = events[1, :]
-
- path = decide_naming_convention(filepath)
-
- for j in range(path.shape[1]):
- name_1 = ((os.path.basename(path[0, j])).split(".")[0]).split("_")
- name_2 = ((os.path.basename(path[1, j])).split(".")[0]).split("_")
- # dirname = os.path.dirname(path[i])
- if name_1[-1] == name_2[-1]:
- name = name_1[-1]
- sampling_rate = read_hdf5("timeCorrection_" + name, filepath, "sampling_rate")[0]
- for i in range(len(storesList)):
- if (
- "control_" + name.lower() in storesList[i].lower()
- or "signal_" + name.lower() in storesList[i].lower()
- ): # changes done
- data = addingNaNValues(filepath, storesList[i], name)
- write_hdf5(data, storesList[i], filepath, "data")
- else:
- if "control" in storesList[i].lower() or "signal" in storesList[i].lower():
- continue
- else:
- ts = removeTTLs(filepath, storesList[i], name)
- write_hdf5(ts, storesList[i] + "_" + name, filepath, "ts")
-
- else:
- logger.error("Error in naming convention of files or Error in storesList file")
- raise Exception("Error in naming convention of files or Error in storesList file")
- logger.info("Chunks with artifacts are replaced by NaN values.")
-
-
-# main function to align timestamps for control, signal and event timestamps for artifacts removal
-def processTimestampsForArtifacts(filepath, timeForLightsTurnOn, events):
-
- logger.debug("Processing timestamps to get rid of artifacts using concatenate method...")
- storesList = events[1, :]
-
- path = decide_naming_convention(filepath)
-
- timestamp_dict = dict()
- for j in range(path.shape[1]):
- name_1 = ((os.path.basename(path[0, j])).split(".")[0]).split("_")
- name_2 = ((os.path.basename(path[1, j])).split(".")[0]).split("_")
- # dirname = os.path.dirname(path[i])
- if name_1[-1] == name_2[-1]:
- name = name_1[-1]
- sampling_rate = read_hdf5("timeCorrection_" + name, filepath, "sampling_rate")[0]
-
- for i in range(len(storesList)):
- if (
- "control_" + name.lower() in storesList[i].lower()
- or "signal_" + name.lower() in storesList[i].lower()
- ): # changes done
- data, timestampNew = eliminateData(
- filepath, timeForLightsTurnOn, storesList[i], sampling_rate, name
- )
- write_hdf5(data, storesList[i], filepath, "data")
- else:
- if "control" in storesList[i].lower() or "signal" in storesList[i].lower():
- continue
- else:
- ts = eliminateTs(filepath, timeForLightsTurnOn, storesList[i], sampling_rate, name)
- write_hdf5(ts, storesList[i] + "_" + name, filepath, "ts")
-
- # timestamp_dict[name] = timestampNew
- write_hdf5(timestampNew, "timeCorrection_" + name, filepath, "timestampNew")
- else:
- logger.error("Error in naming convention of files or Error in storesList file")
- raise Exception("Error in naming convention of files or Error in storesList file")
- logger.info("Timestamps processed, artifacts are removed and good chunks are concatenated.")
-
-
-# function to compute deltaF/F using fitted control channel and filtered signal channel
-def deltaFF(signal, control):
-
- res = np.subtract(signal, control)
- normData = np.divide(res, control)
- # deltaFF = normData
- normData = normData * 100
-
- return normData
-
-
-# function to fit control channel to signal channel
-def controlFit(control, signal):
-
- p = np.polyfit(control, signal, 1)
- arr = (p[0] * control) + p[1]
- return arr
-
-
-def filterSignal(filter_window, signal):
- if filter_window == 0:
- return signal
- elif filter_window > 1:
- b = np.divide(np.ones((filter_window,)), filter_window)
- a = 1
- filtered_signal = ss.filtfilt(b, a, signal)
- return filtered_signal
- else:
- raise Exception("Moving average filter window value is not correct.")
-
-
-# function to filter control and signal channel, also execute above two function : controlFit and deltaFF
-# function will also take care if there is only signal channel and no control channel
-# if there is only signal channel, z-score will be computed using just signal channel
-def execute_controlFit_dff(control, signal, isosbestic_control, filter_window):
-
- if isosbestic_control == False:
- signal_smooth = filterSignal(filter_window, signal) # ss.filtfilt(b, a, signal)
- control_fit = controlFit(control, signal_smooth)
- norm_data = deltaFF(signal_smooth, control_fit)
- else:
- control_smooth = filterSignal(filter_window, control) # ss.filtfilt(b, a, control)
- signal_smooth = filterSignal(filter_window, signal) # ss.filtfilt(b, a, signal)
- control_fit = controlFit(control_smooth, signal_smooth)
- norm_data = deltaFF(signal_smooth, control_fit)
-
- return norm_data, control_fit
-
-
-# function to compute z-score based on z-score computation method
-def z_score_computation(dff, timestamps, inputParameters):
-
- zscore_method = inputParameters["zscore_method"]
- baseline_start, baseline_end = inputParameters["baselineWindowStart"], inputParameters["baselineWindowEnd"]
-
- if zscore_method == "standard z-score":
- numerator = np.subtract(dff, np.nanmean(dff))
- zscore = np.divide(numerator, np.nanstd(dff))
- elif zscore_method == "baseline z-score":
- idx = np.where((timestamps > baseline_start) & (timestamps < baseline_end))[0]
- if idx.shape[0] == 0:
- logger.error(
- "Baseline Window Parameters for baseline z-score computation zscore_method \
- are not correct."
- )
- raise Exception(
- "Baseline Window Parameters for baseline z-score computation zscore_method \
- are not correct."
- )
- else:
- baseline_mean = np.nanmean(dff[idx])
- baseline_std = np.nanstd(dff[idx])
- numerator = np.subtract(dff, baseline_mean)
- zscore = np.divide(numerator, baseline_std)
- else:
- median = np.median(dff)
- mad = np.median(np.abs(dff - median))
- numerator = 0.6745 * (dff - median)
- zscore = np.divide(numerator, mad)
-
- return zscore
-
-
-# helper function to compute z-score and deltaF/F
-def helper_z_score(control, signal, filepath, name, inputParameters): # helper_z_score(control_smooth, signal_smooth):
-
- removeArtifacts = inputParameters["removeArtifacts"]
- artifactsRemovalMethod = inputParameters["artifactsRemovalMethod"]
- filter_window = inputParameters["filter_window"]
-
- isosbestic_control = inputParameters["isosbestic_control"]
- tsNew = read_hdf5("timeCorrection_" + name, filepath, "timestampNew")
- coords_path = os.path.join(filepath, "coordsForPreProcessing_" + name + ".npy")
-
- logger.info("Remove Artifacts : ", removeArtifacts)
-
- if (control == 0).all() == True:
- control = np.zeros(tsNew.shape[0])
-
- z_score_arr = np.array([])
- norm_data_arr = np.full(tsNew.shape[0], np.nan)
- control_fit_arr = np.full(tsNew.shape[0], np.nan)
- temp_control_arr = np.full(tsNew.shape[0], np.nan)
-
- if removeArtifacts == True:
- coords = fetchCoords(filepath, name, tsNew)
-
- # for artifacts removal, each chunk which was selected by user is being processed individually and then
- # z-score is calculated
- for i in range(coords.shape[0]):
- tsNew_index = np.where((tsNew > coords[i, 0]) & (tsNew < coords[i, 1]))[0]
- if isosbestic_control == False:
- control_arr = helper_create_control_channel(signal[tsNew_index], tsNew[tsNew_index], window=101)
- signal_arr = signal[tsNew_index]
- norm_data, control_fit = execute_controlFit_dff(
- control_arr, signal_arr, isosbestic_control, filter_window
- )
- temp_control_arr[tsNew_index] = control_arr
- if i < coords.shape[0] - 1:
- blank_index = np.where((tsNew > coords[i, 1]) & (tsNew < coords[i + 1, 0]))[0]
- temp_control_arr[blank_index] = np.full(blank_index.shape[0], np.nan)
- else:
- control_arr = control[tsNew_index]
- signal_arr = signal[tsNew_index]
- norm_data, control_fit = execute_controlFit_dff(
- control_arr, signal_arr, isosbestic_control, filter_window
- )
- norm_data_arr[tsNew_index] = norm_data
- control_fit_arr[tsNew_index] = control_fit
-
- if artifactsRemovalMethod == "concatenate":
- norm_data_arr = norm_data_arr[~np.isnan(norm_data_arr)]
- control_fit_arr = control_fit_arr[~np.isnan(control_fit_arr)]
- z_score = z_score_computation(norm_data_arr, tsNew, inputParameters)
- z_score_arr = np.concatenate((z_score_arr, z_score))
- else:
- tsNew_index = np.arange(tsNew.shape[0])
- norm_data, control_fit = execute_controlFit_dff(control, signal, isosbestic_control, filter_window)
- z_score = z_score_computation(norm_data, tsNew, inputParameters)
- z_score_arr = np.concatenate((z_score_arr, z_score))
- norm_data_arr[tsNew_index] = norm_data # np.concatenate((norm_data_arr, norm_data))
- control_fit_arr[tsNew_index] = control_fit # np.concatenate((control_fit_arr, control_fit))
-
- # handle the case if there are chunks being cut in the front and the end
- if isosbestic_control == False and removeArtifacts == True:
- coords = coords.flatten()
- # front chunk
- idx = np.where((tsNew >= tsNew[0]) & (tsNew < coords[0]))[0]
- temp_control_arr[idx] = np.full(idx.shape[0], np.nan)
- # end chunk
- idx = np.where((tsNew > coords[-1]) & (tsNew <= tsNew[-1]))[0]
- temp_control_arr[idx] = np.full(idx.shape[0], np.nan)
- write_hdf5(temp_control_arr, "control_" + name, filepath, "data")
-
- return z_score_arr, norm_data_arr, control_fit_arr
-
-
-# compute z-score and deltaF/F and save it to hdf5 file
-def compute_z_score(filepath, inputParameters):
-
- logger.debug(f"Computing z-score for each of the data in {filepath}")
- remove_artifacts = inputParameters["removeArtifacts"]
-
- path_1 = find_files(filepath, "control_*", ignore_case=True) # glob.glob(os.path.join(filepath, 'control*'))
- path_2 = find_files(filepath, "signal_*", ignore_case=True) # glob.glob(os.path.join(filepath, 'signal*'))
-
- path = sorted(path_1 + path_2, key=str.casefold)
-
- b = np.divide(np.ones((100,)), 100)
- a = 1
-
- if len(path) % 2 != 0:
- logger.error("There are not equal number of Control and Signal data")
- raise Exception("There are not equal number of Control and Signal data")
-
- path = np.asarray(path).reshape(2, -1)
-
- for i in range(path.shape[1]):
- name_1 = ((os.path.basename(path[0, i])).split(".")[0]).split("_")
- name_2 = ((os.path.basename(path[1, i])).split(".")[0]).split("_")
- # dirname = os.path.dirname(path[i])
-
- if name_1[-1] == name_2[-1]:
- name = name_1[-1]
- control = read_hdf5("", path[0, i], "data").reshape(-1)
- signal = read_hdf5("", path[1, i], "data").reshape(-1)
- # control_smooth = ss.filtfilt(b, a, control)
- # signal_smooth = ss.filtfilt(b, a, signal)
- # _score, dff = helper_z_score(control_smooth, signal_smooth)
- z_score, dff, control_fit = helper_z_score(control, signal, filepath, name, inputParameters)
- if remove_artifacts == True:
- write_hdf5(z_score, "z_score_" + name, filepath, "data")
- write_hdf5(dff, "dff_" + name, filepath, "data")
- write_hdf5(control_fit, "cntrl_sig_fit_" + name, filepath, "data")
- else:
- write_hdf5(z_score, "z_score_" + name, filepath, "data")
- write_hdf5(dff, "dff_" + name, filepath, "data")
- write_hdf5(control_fit, "cntrl_sig_fit_" + name, filepath, "data")
- else:
- logger.error("Error in naming convention of files or Error in storesList file")
- raise Exception("Error in naming convention of files or Error in storesList file")
-
- logger.info(f"z-score for the data in {filepath} computed.")
-
-
# function to execute timestamps corrections using functions timestampCorrection and decide_naming_convention_and_applyCorrection
def execute_timestamp_correction(folderNames, inputParameters):
@@ -1014,7 +224,7 @@ def execute_timestamp_correction(folderNames, inputParameters):
for i in range(len(folderNames)):
filepath = folderNames[i]
storesListPath = takeOnlyDirs(glob.glob(os.path.join(filepath, "*_output_*")))
- cond = check_TDT(folderNames[i])
+ mode = "tdt" if check_TDT(folderNames[i]) else "csv"
logger.debug(f"Timestamps corrections started for {filepath}")
for j in range(len(storesListPath)):
filepath = storesListPath[j]
@@ -1025,15 +235,36 @@ def execute_timestamp_correction(folderNames, inputParameters):
if isosbestic_control == False:
storesList = add_control_channel(filepath, storesList)
- if cond == True:
- timestampCorrection_tdt(filepath, timeForLightsTurnOn, storesList)
- else:
- timestampCorrection_csv(filepath, timeForLightsTurnOn, storesList)
-
- for k in range(storesList.shape[1]):
- decide_naming_convention_and_applyCorrection(
- filepath, timeForLightsTurnOn, storesList[0, k], storesList[1, k], storesList
- )
+ control_and_signal_dicts = read_control_and_signal(filepath, storesList)
+ name_to_data, name_to_timestamps, name_to_sampling_rate, name_to_npoints = control_and_signal_dicts
+ name_to_timestamps_ttl = read_ttl(filepath, storesList)
+
+ timestamps_dicts = correct_timestamps(
+ timeForLightsTurnOn,
+ storesList,
+ name_to_timestamps,
+ name_to_data,
+ name_to_sampling_rate,
+ name_to_npoints,
+ name_to_timestamps_ttl,
+ mode=mode,
+ )
+ (
+ name_to_corrected_timestamps,
+ name_to_correctionIndex,
+ name_to_corrected_data,
+ compound_name_to_corrected_ttl_timestamps,
+ ) = timestamps_dicts
+
+ write_corrected_timestamps(
+ filepath,
+ name_to_corrected_timestamps,
+ name_to_timestamps,
+ name_to_sampling_rate,
+ name_to_correctionIndex,
+ )
+ write_corrected_data(filepath, name_to_corrected_data)
+ write_corrected_ttl_timestamps(filepath, compound_name_to_corrected_ttl_timestamps)
# check if isosbestic control is false and also if new control channel is added
if isosbestic_control == False:
@@ -1044,45 +275,133 @@ def execute_timestamp_correction(folderNames, inputParameters):
logger.info(f"Timestamps corrections finished for {filepath}")
-# for combining data, reading storeslist file from both data and create a new storeslist array
-def check_storeslistfile(folderNames):
- storesList = np.array([[], []])
+# function to compute z-score and deltaF/F
+def execute_zscore(folderNames, inputParameters):
+
+ plot_zScore_dff = inputParameters["plot_zScore_dff"]
+ combine_data = inputParameters["combine_data"]
+ remove_artifacts = inputParameters["removeArtifacts"]
+ artifactsRemovalMethod = inputParameters["artifactsRemovalMethod"]
+ filter_window = inputParameters["filter_window"]
+ isosbestic_control = inputParameters["isosbestic_control"]
+ zscore_method = inputParameters["zscore_method"]
+ baseline_start, baseline_end = inputParameters["baselineWindowStart"], inputParameters["baselineWindowEnd"]
+
+ storesListPath = []
for i in range(len(folderNames)):
- filepath = folderNames[i]
- storesListPath = takeOnlyDirs(glob.glob(os.path.join(filepath, "*_output_*")))
- for j in range(len(storesListPath)):
- filepath = storesListPath[j]
- storesList = np.concatenate(
- (
- storesList,
- np.genfromtxt(os.path.join(filepath, "storesList.csv"), dtype="str", delimiter=",").reshape(2, -1),
- ),
- axis=1,
+ if combine_data == True:
+ storesListPath.append([folderNames[i][0]])
+ else:
+ filepath = folderNames[i]
+ storesListPath.append(takeOnlyDirs(glob.glob(os.path.join(filepath, "*_output_*"))))
+ storesListPath = np.concatenate(storesListPath)
+
+ for j in range(len(storesListPath)):
+ filepath = storesListPath[j]
+ logger.debug(f"Computing z-score for each of the data in {filepath}")
+ path_1 = find_files(filepath, "control_*", ignore_case=True) # glob.glob(os.path.join(filepath, 'control*'))
+ path_2 = find_files(filepath, "signal_*", ignore_case=True) # glob.glob(os.path.join(filepath, 'signal*'))
+ path = sorted(path_1 + path_2, key=str.casefold)
+ if len(path) % 2 != 0:
+ logger.error("There are not equal number of Control and Signal data")
+ raise Exception("There are not equal number of Control and Signal data")
+ path = np.asarray(path).reshape(2, -1)
+
+ for i in range(path.shape[1]):
+ name_1 = ((os.path.basename(path[0, i])).split(".")[0]).split("_")
+ name_2 = ((os.path.basename(path[1, i])).split(".")[0]).split("_")
+ if name_1[-1] != name_2[-1]:
+ logger.error("Error in naming convention of files or Error in storesList file")
+ raise Exception("Error in naming convention of files or Error in storesList file")
+ name = name_1[-1]
+
+ control, signal, tsNew = read_corrected_data(path[0, i], path[1, i], filepath, name)
+ coords = get_coords(filepath, name, tsNew, remove_artifacts)
+ z_score, dff, control_fit, temp_control_arr = compute_z_score(
+ control,
+ signal,
+ tsNew,
+ coords,
+ artifactsRemovalMethod,
+ filter_window,
+ isosbestic_control,
+ zscore_method,
+ baseline_start,
+ baseline_end,
)
+ write_zscore(filepath, name, z_score, dff, control_fit, temp_control_arr)
- storesList = np.unique(storesList, axis=1)
+ logger.info(f"z-score for the data in {filepath} computed.")
- return storesList
+ if not remove_artifacts:
+ visualizeControlAndSignal(filepath, removeArtifacts=remove_artifacts)
+ if plot_zScore_dff == "z_score":
+ visualize_z_score(filepath)
+ if plot_zScore_dff == "dff":
+ visualize_dff(filepath)
+ if plot_zScore_dff == "Both":
+ visualize_z_score(filepath)
+ visualize_dff(filepath)
-def get_all_stores_for_combining_data(folderNames):
- op = []
- for i in range(100):
- temp = []
- match = r"[\s\S]*" + "_output_" + str(i)
- for j in folderNames:
- temp.append(re.findall(match, j))
- temp = sorted(list(np.concatenate(temp).flatten()), key=str.casefold)
- if len(temp) > 0:
- op.append(temp)
+ writeToFile(str(10 + ((inputParameters["step"] + 1) * 10)) + "\n")
+ inputParameters["step"] += 1
+
+ plt.show()
+ logger.info("Z-score computation completed.")
- return op
+
+# function to remove artifacts from z-score data
+def execute_artifact_removal(folderNames, inputParameters):
+
+ timeForLightsTurnOn = inputParameters["timeForLightsTurnOn"]
+ artifactsRemovalMethod = inputParameters["artifactsRemovalMethod"]
+ combine_data = inputParameters["combine_data"]
+
+ storesListPath = []
+ for i in range(len(folderNames)):
+ if combine_data == True:
+ storesListPath.append([folderNames[i][0]])
+ else:
+ filepath = folderNames[i]
+ storesListPath.append(takeOnlyDirs(glob.glob(os.path.join(filepath, "*_output_*"))))
+
+ storesListPath = np.concatenate(storesListPath)
+
+ for j in range(len(storesListPath)):
+ filepath = storesListPath[j]
+ storesList = np.genfromtxt(os.path.join(filepath, "storesList.csv"), dtype="str", delimiter=",").reshape(2, -1)
+
+ name_to_data = read_corrected_data_dict(filepath, storesList)
+ pair_name_to_tsNew, pair_name_to_sampling_rate = read_corrected_timestamps_pairwise(filepath)
+ pair_name_to_coords = read_coords_pairwise(filepath, pair_name_to_tsNew)
+ compound_name_to_ttl_timestamps = read_corrected_ttl_timestamps(filepath, storesList)
+
+ logger.debug("Removing artifacts from the data...")
+ name_to_data, pair_name_to_timestamps, compound_name_to_ttl_timestamps = remove_artifacts(
+ timeForLightsTurnOn,
+ storesList,
+ pair_name_to_tsNew,
+ pair_name_to_sampling_rate,
+ pair_name_to_coords,
+ name_to_data,
+ compound_name_to_ttl_timestamps,
+ method=artifactsRemovalMethod,
+ )
+
+ write_artifact_removal(filepath, name_to_data, pair_name_to_timestamps, compound_name_to_ttl_timestamps)
+ visualizeControlAndSignal(filepath, removeArtifacts=True)
+
+ writeToFile(str(10 + ((inputParameters["step"] + 1) * 10)) + "\n")
+ inputParameters["step"] += 1
+
+ plt.show()
+ logger.info("Artifact removal completed.")
# function to combine data when there are two different data files for the same recording session
# it will combine the data, do timestamps processing and save the combined data in the first output folder.
-def combineData(folderNames, inputParameters, storesList):
-
+def execute_combine_data(folderNames, inputParameters, storesList):
logger.debug("Combining Data from different data files...")
timeForLightsTurnOn = inputParameters["timeForLightsTurnOn"]
op_folder = []
@@ -1117,64 +436,12 @@ def combineData(folderNames, inputParameters, storesList):
op = get_all_stores_for_combining_data(op_folder)
# processing timestamps for combining the data
- processTimestampsForCombiningData(op, timeForLightsTurnOn, storesList, sampling_rate[0])
+ combine_data(op, timeForLightsTurnOn, storesList, sampling_rate[0])
logger.info("Data is combined from different data files.")
return op
-# function to compute z-score and deltaF/F using functions : compute_z_score and/or processTimestampsForArtifacts
-def execute_zscore(folderNames, inputParameters):
-
- timeForLightsTurnOn = inputParameters["timeForLightsTurnOn"]
- remove_artifacts = inputParameters["removeArtifacts"]
- artifactsRemovalMethod = inputParameters["artifactsRemovalMethod"]
- plot_zScore_dff = inputParameters["plot_zScore_dff"]
- combine_data = inputParameters["combine_data"]
- isosbestic_control = inputParameters["isosbestic_control"]
-
- storesListPath = []
- for i in range(len(folderNames)):
- if combine_data == True:
- storesListPath.append([folderNames[i][0]])
- else:
- filepath = folderNames[i]
- storesListPath.append(takeOnlyDirs(glob.glob(os.path.join(filepath, "*_output_*"))))
-
- storesListPath = np.concatenate(storesListPath)
-
- for j in range(len(storesListPath)):
- filepath = storesListPath[j]
- storesList = np.genfromtxt(os.path.join(filepath, "storesList.csv"), dtype="str", delimiter=",").reshape(2, -1)
-
- if remove_artifacts == True:
- logger.debug("Removing Artifacts from the data and correcting timestamps...")
- compute_z_score(filepath, inputParameters)
- if artifactsRemovalMethod == "concatenate":
- processTimestampsForArtifacts(filepath, timeForLightsTurnOn, storesList)
- else:
- addingNaNtoChunksWithArtifacts(filepath, storesList)
- visualizeControlAndSignal(filepath, remove_artifacts)
- logger.info("Artifacts from the data are removed and timestamps are corrected.")
- else:
- compute_z_score(filepath, inputParameters)
- visualizeControlAndSignal(filepath, remove_artifacts)
-
- if plot_zScore_dff == "z_score":
- visualize_z_score(filepath)
- if plot_zScore_dff == "dff":
- visualize_dff(filepath)
- if plot_zScore_dff == "Both":
- visualize_z_score(filepath)
- visualize_dff(filepath)
-
- writeToFile(str(10 + ((inputParameters["step"] + 1) * 10)) + "\n")
- inputParameters["step"] += 1
-
- plt.show()
- logger.info("Signal data and event timestamps are extracted.")
-
-
def extractTsAndSignal(inputParameters):
logger.debug("Extracting signal data and event timestamps...")
@@ -1203,13 +470,17 @@ def extractTsAndSignal(inputParameters):
writeToFile(str((pbMaxValue + 1) * 10) + "\n" + str(10) + "\n")
execute_timestamp_correction(folderNames, inputParameters)
execute_zscore(folderNames, inputParameters)
+ if remove_artifacts == True:
+ execute_artifact_removal(folderNames, inputParameters)
else:
pbMaxValue = 1 + len(folderNames)
writeToFile(str((pbMaxValue) * 10) + "\n" + str(10) + "\n")
execute_timestamp_correction(folderNames, inputParameters)
storesList = check_storeslistfile(folderNames)
- op_folder = combineData(folderNames, inputParameters, storesList)
+ op_folder = execute_combine_data(folderNames, inputParameters, storesList)
execute_zscore(op_folder, inputParameters)
+ if remove_artifacts == True:
+ execute_artifact_removal(op_folder, inputParameters)
def main(input_parameters):
diff --git a/step4_data_flow_analysis.md b/step4_data_flow_analysis.md
new file mode 100644
index 0000000..d86e938
--- /dev/null
+++ b/step4_data_flow_analysis.md
@@ -0,0 +1,348 @@
+# Step 4 (preprocess.py) Data Flow Analysis
+
+## Overview
+
+Step 4 processes timestamp-corrected photometry data and computes normalized signals (ΔF/F and z-scores). It handles artifact removal and data combination from multiple sessions, and it generates quality-control visualizations.
+
+## High-Level Data Flow
+
+```mermaid
+flowchart TD
+ A[Entry: extractTsAndSignal] --> B{combine_data?}
+
+ B -->|False| C[execute_timestamp_correction]
+ B -->|True| D[execute_timestamp_correction]
+
+ C --> E[execute_zscore]
+
+ D --> F[check_storeslistfile]
+ F --> G[combineData]
+ G --> H[execute_zscore]
+
+ E --> I[Output: z_score, dff, cntrl_sig_fit HDF5 files]
+ H --> I
+
+ style A fill:#e1f5ff
+ style I fill:#d4edda
+```
+
+## Main Processing Paths
+
+### Entry Point
+**`extractTsAndSignal(inputParameters)`** (line 1178) is the main entry point called by the GUI or API.
+
+### Path 1: Normal Processing (combine_data = False)
+1. `execute_timestamp_correction()` → Correct timestamps and align data
+2. `execute_zscore()` → Compute z-scores and ΔF/F
+
+### Path 2: Combined Data Processing (combine_data = True)
+1. `execute_timestamp_correction()` → Correct timestamps for each file
+2. `check_storeslistfile()` → Merge store lists from multiple files
+3. `combineData()` → Combine data from multiple recording sessions
+4. `execute_zscore()` → Compute z-scores and ΔF/F on combined data
+
+## Detailed Processing Stages
+
+### Stage 1: Timestamp Correction
+
+```mermaid
+flowchart LR
+ A[Raw HDF5 files] --> B[Read storesList.csv]
+ B --> C{isosbestic_control?}
+ C -->|No| D[add_control_channel]
+ C -->|Yes| E[timestampCorrection_tdt/csv]
+ D --> E
+ E --> F[Eliminate first N seconds]
+ F --> G[decide_naming_convention_and_applyCorrection]
+ G --> H[applyCorrection for each store]
+ H --> I{isosbestic_control?}
+ I -->|No| J[create_control_channel via curve fitting]
+ I -->|Yes| K[timeCorrection_*.hdf5 files]
+ J --> K
+
+ style A fill:#e1f5ff
+ style K fill:#d4edda
+```
+
+#### Function: `execute_timestamp_correction(folderNames, inputParameters)`
+
+**Input:**
+- Raw HDF5 files from extractors: `control_*.hdf5`, `signal_*.hdf5`, `event_*.hdf5`
+
+**Process:**
+1. For each session folder:
+ - Read `storesList.csv` (mapping of raw names to semantic names)
+ - If no isosbestic control: `add_control_channel()` creates placeholder control files
+ - **`timestampCorrection_tdt()`** or **`timestampCorrection_csv()`**:
+ - Eliminates first N seconds (`timeForLightsTurnOn`)
+ - For TDT: expands timestamps from block timestamps + sampling rate
+ - For CSV: uses timestamps as-is
+ - Writes `timeCorrection_*.hdf5` with keys: `timestampNew`, `correctionIndex`, `sampling_rate`
+ - **`decide_naming_convention_and_applyCorrection()`**:
+ - For each store, calls `applyCorrection()` to crop data using `correctionIndex`
+ - For control/signal channels: crops data arrays
+ - For event channels: subtracts time offset from timestamps
+ - If no isosbestic control: **`create_control_channel()`** generates synthetic control via curve fitting
+
+**Output:**
+- Timestamp-corrected HDF5 files with trimmed data
+- `timeCorrection_*.hdf5` files containing corrected timestamps
+
+### Stage 2: Z-Score Computation
+
+```mermaid
+flowchart TD
+ A[Timestamp-corrected HDF5] --> B[compute_z_score]
+ B --> C{removeArtifacts?}
+
+ C -->|No| D[helper_z_score: full data]
+ C -->|Yes| E[helper_z_score: chunk-by-chunk]
+
+ D --> F[filterSignal]
+ E --> F
+
+ F --> G[controlFit: linear regression]
+ G --> H[deltaFF: compute ΔF/F]
+ H --> I[z_score_computation]
+
+ I --> J{removeArtifacts?}
+
+ J -->|No| K[Write z_score, dff, cntrl_sig_fit]
+ J -->|Yes| L{artifactsRemovalMethod?}
+
+ L -->|concatenate| M[processTimestampsForArtifacts]
+ L -->|NaN| N[addingNaNtoChunksWithArtifacts]
+
+ M --> K
+ N --> K
+
+ K --> O[visualizeControlAndSignal]
+
+ style A fill:#e1f5ff
+ style K fill:#d4edda
+ style O fill:#fff3cd
+```
+
+#### Function: `execute_zscore(folderNames, inputParameters)`
+
+**Input:**
+- Timestamp-corrected HDF5 files
+
+**Process:**
+1. For each output folder:
+
+ **`compute_z_score(filepath, inputParameters)`**:
+ - For each control/signal pair:
+ - **`helper_z_score(control, signal, filepath, name, inputParameters)`**:
+
+ **Without artifacts removal:**
+ - `execute_controlFit_dff()`: Filter signals → fit control to signal → compute ΔF/F
+ - `z_score_computation()`: Compute z-score from ΔF/F
+
+ **With artifacts removal:**
+ - For each user-selected chunk (from `coordsForPreProcessing_*.npy`):
+ - If no isosbestic: `helper_create_control_channel()` creates synthetic control
+ - `execute_controlFit_dff()` on chunk
+ - Concatenate or NaN-fill between chunks
+ - `z_score_computation()` on processed data
+
+ - Writes: `z_score_*.hdf5`, `dff_*.hdf5`, `cntrl_sig_fit_*.hdf5`
+
+ **If artifacts removal with concatenate method:**
+ - **`processTimestampsForArtifacts()`**:
+ - `eliminateData()`: Concatenates good chunks, adjusts timestamps to be continuous
+ - `eliminateTs()`: Aligns event timestamps with new timeline
+ - Overwrites data files with concatenated versions
+
+ **If artifacts removal with NaN method:**
+ - **`addingNaNtoChunksWithArtifacts()`**:
+ - `addingNaNValues()`: Replaces bad chunks with NaN
+ - `removeTTLs()`: Filters event timestamps to keep only valid times
+
+ - **`visualizeControlAndSignal()`**: Plots control, signal, cntrl_sig_fit for QC
+
+**Output:**
+- `z_score_*.hdf5` (z-scored signal)
+- `dff_*.hdf5` (ΔF/F)
+- `cntrl_sig_fit_*.hdf5` (fitted control channel)
+
+## Key Data Transformations
+
+### Signal Processing Pipeline
+
+```mermaid
+flowchart LR
+ A[Raw Signal] --> B[filterSignal: Moving Average]
+ C[Raw Control] --> D[filterSignal: Moving Average]
+
+ B --> E[controlFit: Linear Regression]
+ D --> E
+
+ E --> F[control_fit = p0*control + p1]
+ F --> G[deltaFF]
+
+ B --> G
+
+    G --> H["ΔF/F = ((signal - control_fit) / control_fit) * 100"]
+ H --> I[z_score_computation]
+
+ I --> J{zscore_method?}
+    J -->|standard| K["z = (ΔF/F - mean) / std"]
+    J -->|baseline| L["z = (ΔF/F - baseline_mean) / baseline_std"]
+    J -->|robust| M["z = 0.6745 * (ΔF/F - median) / MAD"]
+
+ K --> N[Z-Score Output]
+ L --> N
+ M --> N
+
+ style A fill:#e1f5ff
+ style C fill:#e1f5ff
+ style N fill:#d4edda
+```
+
+### Transformation Functions
+
+1. **`filterSignal(filter_window, signal)`** (line 822)
+ - Applies moving average filter with configurable window
+ - Uses `scipy.signal.filtfilt` for zero-phase filtering
+
+2. **`controlFit(control, signal)`** (line 815)
+ - Linear regression: fits control to signal
+ - Returns: `fitted_control = p[0] * control + p[1]`
+
+3. **`deltaFF(signal, control)`** (line 804)
+ - Formula: `((signal - control) / control) * 100`
+ - Computes normalized fluorescence change
+
+4. **`z_score_computation(dff, timestamps, inputParameters)`** (line 853)
+ - **Standard z-score:** `(ΔF/F - mean(ΔF/F)) / std(ΔF/F)`
+ - **Baseline z-score:** `(ΔF/F - mean(baseline)) / std(baseline)`
+ - **Robust z-score:** `0.6745 * (ΔF/F - median) / MAD`
+
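+A minimal sketch of these four transformations, assuming only NumPy/SciPy (illustrative helpers, not the module's exact signatures):
+
+```python
+import numpy as np
+import scipy.signal as ss
+
+
+def moving_average(signal, window):
+    # zero-phase moving-average filter (what filterSignal does for window > 1)
+    b = np.ones(window) / window
+    return ss.filtfilt(b, 1, signal)
+
+
+def fit_control(control, signal):
+    # degree-1 least-squares fit of control onto signal (controlFit)
+    p = np.polyfit(control, signal, 1)
+    return p[0] * control + p[1]
+
+
+def delta_ff(signal, control_fit):
+    # ((signal - fitted control) / fitted control) * 100
+    return (signal - control_fit) / control_fit * 100
+
+
+def z_score(dff, method="standard z-score", baseline_mask=None):
+    if method == "standard z-score":
+        return (dff - np.nanmean(dff)) / np.nanstd(dff)
+    if method == "baseline z-score":
+        base = dff[baseline_mask]
+        return (dff - np.nanmean(base)) / np.nanstd(base)
+    # any other value falls through to the robust (median/MAD) form
+    median = np.median(dff)
+    mad = np.median(np.abs(dff - median))
+    return 0.6745 * (dff - median) / mad
+```
+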
+## Artifact Removal Workflow
+
+### Interactive Artifact Selection
+
+The `visualize()` function (line 469) provides an interactive matplotlib plot:
+- **Space key:** Mark artifact boundary (vertical line drawn)
+- **'d' key:** Delete last marked boundary
+- **Close plot:** Save coordinates to `coordsForPreProcessing_*.npy`
+
+### Two Removal Methods
+
+**Concatenate Method:**
+- Removes artifact chunks completely
+- Concatenates good chunks end-to-end
+- Adjusts timestamps to be continuous
+- Event timestamps realigned to new timeline
+
+**NaN Method:**
+- Replaces artifact chunks with NaN values
+- Preserves original timeline
+- Filters out event timestamps in artifact regions
+
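+A schematic comparison of the two methods, assuming each row of `coords` is a `(start, end)` pair of a chunk to **keep** (the shape `fetchCoords` returns) and `ts`/`data` are the corrected timestamps and samples. This illustrates the bookkeeping only, not the module's exact implementation:
+
+```python
+import numpy as np
+
+
+def keep_mask(ts, coords):
+    # True for samples that fall inside any user-selected (good) chunk
+    mask = np.zeros(ts.shape[0], dtype=bool)
+    for start, end in coords:
+        mask |= (ts > start) & (ts < end)
+    return mask
+
+
+def nan_method(ts, data, coords):
+    # keep the original timeline; blank out everything outside the kept chunks
+    out = data.astype(float)
+    out[~keep_mask(ts, coords)] = np.nan
+    return ts, out
+
+
+def concatenate_method(ts, data, coords, sampling_rate):
+    # drop the bad regions and shift each kept chunk so time stays continuous
+    # (for simplicity the first chunk is re-zeroed to t=0 here; the module
+    # re-zeroes it to timeForLightsTurnOn instead)
+    new_ts, new_data, t0 = [], [], 0.0
+    for start, end in coords:
+        idx = np.where((ts > start) & (ts < end))[0]
+        if idx.size == 0:
+            continue
+        chunk_ts = ts[idx] - ts[idx][0] + t0
+        new_ts.append(chunk_ts)
+        new_data.append(data[idx])
+        t0 = chunk_ts[-1] + 1.0 / sampling_rate
+    return np.concatenate(new_ts), np.concatenate(new_data)
+```
+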
+## Supporting Functions
+
+### Control Channel Creation
+
+**`helper_create_control_channel(signal, timestamps, window)`** (line 69)
+- Used when no isosbestic control is available
+- Applies Savitzky-Golay filter to signal
+- Fits to exponential function: `f(x) = a + b * exp(-(1/c) * x)`
+- Returns synthetic control channel
+
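+A minimal sketch of that idea with SciPy; the `polyorder=3` and the initial guess `p0` are assumptions for illustration, not the module's own values:
+
+```python
+import numpy as np
+from scipy.optimize import curve_fit
+from scipy.signal import savgol_filter
+
+
+def synthetic_control(signal, timestamps, window=5001, polyorder=3):
+    # smooth the signal, then fit a decaying exponential to capture slow drift
+    smoothed = savgol_filter(signal, window_length=window, polyorder=polyorder)
+
+    def decay(x, a, b, c):
+        return a + b * np.exp(-(1.0 / c) * x)
+
+    p0 = (smoothed.min(), smoothed.max() - smoothed.min(), timestamps[-1])
+    params, _ = curve_fit(decay, timestamps, smoothed, p0=p0, maxfev=10000)
+    return decay(timestamps, *params)
+```
+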
+### Data Combination
+
+**`combineData(folderNames, inputParameters, storesList)`** (line 1084)
+- Merges data from multiple recording sessions
+- Validates that sampling rates match across sessions
+- Calls `processTimestampsForCombiningData()` to align timelines
+- Saves combined data to first output folder
+
+### Coordinate Fetching
+
+**`fetchCoords(filepath, naming, data)`** (line 610)
+- Reads `coordsForPreProcessing_*.npy` (artifact boundary coordinates)
+- If file doesn't exist: uses `[0, data[-1]]` (entire recording)
+- Validates even number of coordinates (pairs of boundaries)
+- Returns reshaped array of coordinate pairs
+
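+Illustratively (mirroring the behaviour described above):
+
+```python
+import os
+
+import numpy as np
+
+
+def fetch_coords(filepath, naming, ts):
+    path = os.path.join(filepath, f"coordsForPreProcessing_{naming}.npy")
+    if not os.path.exists(path):
+        coords = np.array([0.0, ts[-1]])  # default: keep the whole recording
+    else:
+        coords = np.load(path)[:, 0]      # x-coordinates of the marked boundaries
+    if coords.shape[0] % 2 != 0:
+        raise ValueError("coordsForPreProcessing file must contain boundary pairs")
+    return coords.reshape(-1, 2)          # one (start, end) row per kept chunk
+```
+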
+## File I/O Summary
+
+### Files Read
+
+| File Pattern | Content | Source |
+|-------------|---------|--------|
+| `control_*.hdf5` | Control channel data | Extractors (Step 3) |
+| `signal_*.hdf5` | Signal channel data | Extractors (Step 3) |
+| `event_*.hdf5` | Event timestamps | Extractors (Step 3) |
+| `storesList.csv` | Channel name mapping | Step 2 |
+| `coordsForPreProcessing_*.npy` | Artifact boundaries | User selection (optional) |
+
+### Files Written
+
+| File Pattern | Content | Keys |
+|-------------|---------|------|
+| `timeCorrection_*.hdf5` | Corrected timestamps | `timestampNew`, `correctionIndex`, `sampling_rate`, `timeRecStart` (TDT only) |
+| `z_score_*.hdf5` | Z-scored signal | `data` |
+| `dff_*.hdf5` | ΔF/F signal | `data` |
+| `cntrl_sig_fit_*.hdf5` | Fitted control | `data` |
+| `event_*_*.hdf5` | Corrected event timestamps | `ts` |
+
+## Key Parameters from inputParameters
+
+| Parameter | Purpose | Default/Options |
+|-----------|---------|-----------------|
+| `timeForLightsTurnOn` | Seconds to eliminate from start | 1 |
+| `filter_window` | Moving average window size | 100 |
+| `isosbestic_control` | Use isosbestic control channel? | True/False |
+| `removeArtifacts` | Enable artifact removal? | True/False |
+| `artifactsRemovalMethod` | How to handle artifacts | "concatenate" / "replace with NaN" |
+| `zscore_method` | Z-score computation method | "standard z-score" / "baseline z-score" / "robust z-score" |
+| `baselineWindowStart` | Baseline window start (seconds) | 0 |
+| `baselineWindowEnd` | Baseline window end (seconds) | 0 |
+| `combine_data` | Combine multiple recordings? | True/False |
+
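+For orientation, a minimal `inputParameters` dictionary covering the keys used in this step (the values are examples taken from the table above, not enforced defaults):
+
+```python
+inputParameters = {
+    "timeForLightsTurnOn": 1,
+    "filter_window": 100,
+    "isosbestic_control": True,
+    "removeArtifacts": False,
+    "artifactsRemovalMethod": "concatenate",  # or "replace with NaN"
+    "zscore_method": "standard z-score",      # "baseline z-score"; anything else -> robust
+    "baselineWindowStart": 0,
+    "baselineWindowEnd": 0,
+    "combine_data": False,
+    "plot_zScore_dff": "Both",                # "z_score" / "dff" / "Both"
+    "step": 0,                                # progress-bar counter incremented by this step
+}
+```
+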
+## Architecture Notes for Refactoring
+
+### Current Coupling Issues
+
+1. **GUI Progress Tracking:** `writeToFile()` writes to `~/pbSteps.txt` for progress bar updates (lines 36-38, 1042, 1171, 1203, 1208, 1220)
+2. **Interactive Plotting:** `visualize()` requires user interaction (matplotlib event handlers)
+3. **File Path Assumptions:** Hard-coded path patterns (`*_output_*`, naming conventions)
+4. **Mixed Responsibilities:** Single functions handle both computation and I/O
+
+### Recommended Separation Points
+
+**Backend Analysis Layer Should Include:**
+- `filterSignal()` - pure signal processing
+- `controlFit()` - pure regression
+- `deltaFF()` - pure computation
+- `z_score_computation()` - pure statistical computation
+- `helper_create_control_channel()` - algorithmic control generation
+- Core timestamp correction logic (separated from I/O)
+- Core artifact removal logic (separated from I/O)
+
+**Data I/O Layer Should Include:**
+- `read_hdf5()`, `write_hdf5()` - file operations
+- Store list reading/writing
+- Coordinate file handling
+- HDF5 file discovery and path management
+
+**Frontend Visualization Layer Should Include:**
+- `visualize()` - interactive artifact selection
+- `visualizeControlAndSignal()` - QC plots
+- `visualize_z_score()`, `visualize_dff()` - result visualization
+- Progress tracking callbacks (replace `writeToFile()`)
+
+### Potential Refactoring Strategy
+
+1. **Extract pure computation functions** into a `signal_processing` module
+2. **Create data models** (dataclasses; sketched after this list) for:
+ - TimeCorrectionResult
+ - ProcessedSignal (with z_score, dff, control_fit)
+ - ArtifactRegions
+3. **Separate I/O operations** into `io_utils` module with consistent interfaces
+4. **Create processing pipelines** that accept data objects, return data objects
+5. **Move visualization to separate module** with callbacks for progress/interaction
+6. **Use dependency injection** for progress callbacks instead of hard-coded file writes
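+
+A sketch of the data models named in item 2, with purely illustrative field names:
+
+```python
+from dataclasses import dataclass
+from typing import Optional
+
+import numpy as np
+
+
+@dataclass
+class TimeCorrectionResult:
+    timestamp_new: np.ndarray
+    correction_index: np.ndarray
+    sampling_rate: float
+    time_rec_start: Optional[float] = None  # present for TDT recordings only
+
+
+@dataclass
+class ProcessedSignal:
+    z_score: np.ndarray
+    dff: np.ndarray
+    control_fit: np.ndarray
+
+
+@dataclass
+class ArtifactRegions:
+    coords: np.ndarray  # (n_chunks, 2) start/end times of the chunks to keep
+```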
diff --git a/timestamp_correction_analysis.md b/timestamp_correction_analysis.md
new file mode 100644
index 0000000..121aa3f
--- /dev/null
+++ b/timestamp_correction_analysis.md
@@ -0,0 +1,723 @@
+# Timestamp Correction Module Analysis
+
+## Overview
+
+The `timestamp_correction.py` module handles the correction of timestamps for photometry data, including:
+- Eliminating the first N seconds of recording (light stabilization period)
+- Expanding TDT block timestamps into continuous timestamps
+- Creating synthetic control channels when no isosbestic control is present
+- Applying corrections to both data channels and event markers
+
+## Module Structure
+
+### Entry Point from preprocess.py
+
+```python
+execute_timestamp_correction(folderNames, inputParameters) # preprocess.py:212
+```
+
+This orchestrator loops through all session folders and calls functions in this module.
+
+## Two-Phase Control Channel Creation Pattern
+
+### Understanding add_control_channel vs create_control_channel
+
+These two functions work together in a **two-phase process** to handle synthetic control channel generation. They are **not redundant** but serve distinct purposes:
+
+#### Phase 1: `add_control_channel` (Called BEFORE timestamp correction)
+
+**Execution:** Line 229 in `execute_timestamp_correction`
+
+**Purpose:** Create **PLACEHOLDER** control files to satisfy workflow requirements
+
+**What it does:**
+1. Validates that if `isosbestic_control=False`, no real control channels exist
+2. For each signal channel without a matching control:
+ - Copies the raw signal HDF5 file to `cntrl{i}.hdf5` (placeholder)
+ - Adds entry to storesList: `[["cntrl{i}"], ["control_{region}"]]`
+3. Saves updated `storesList.csv`
+
+**Files created:**
+- `cntrl0.hdf5`, `cntrl1.hdf5`, etc. (copies of **RAW** signal data)
+- Updated `storesList.csv` with placeholder entries
+
+**Why it's needed:**
+- Timestamp correction workflow expects **paired** control/signal channels in storesList
+- Without placeholders, the pairing logic in `timestampCorrection_xxx` and `check_cntrl_sig_length` would fail
+- The placeholder **data is never actually used** - it just satisfies structural requirements
+
+#### Phase 2: `create_control_channel` (Called AFTER timestamp correction)
+
+**Execution:** Line 243 in `execute_timestamp_correction`
+
+**Purpose:** Generate **ACTUAL** synthetic control via curve fitting and overwrite placeholders
+
+**What it does:**
+1. Looks for placeholder files (checks: `"control" in event_name.lower() and "cntrl" in event.lower()`)
+2. Reads the **CORRECTED** signal data: `signal_{region}.hdf5` (after timestamp correction)
+3. Calls `helper_create_control_channel()` to:
+ - Apply Savitzky-Golay filter to cleaned signal
+ - Fit to exponential function: `f(x) = a + b * exp(-(1/c) * x)`
+4. **OVERWRITES** the placeholder `control_{region}.hdf5` with real synthetic control
+5. Also exports to CSV format (legacy)
+
+**Files written:**
+- `control_{region}.hdf5` → `data` (replaces placeholder with curve-fitted control)
+- `{raw_name}.csv` (timestamps, data, sampling_rate columns)
+
+**Why it's separate:**
+- Requires **timestamp-corrected** signal data (doesn't exist until after lines 232-239)
+- Curve fitting algorithm needs clean timestamps (first N seconds eliminated)
+- Cannot be done before timestamp correction without re-correcting the synthetic control
+
+#### Execution Timeline
+
+```python
+# When isosbestic_control == False:
+
+# ========== PHASE 1: BEFORE TIMESTAMP CORRECTION ==========
+# Line 229: Create placeholders (just file copies)
+storesList = add_control_channel(filepath, storesList)
+# Result: storesList now has paired structure
+# [["Dv1A", "cntrl0"], ["signal_dms", "control_dms"]]
+# Files: cntrl0.hdf5 (copy of raw signal, never used)
+
+# ========== TIMESTAMP CORRECTION PHASE ==========
+# Lines 232-234: Process both signal AND placeholder control
+timestampCorrection_tdt(filepath, timeForLightsTurnOn, storesList)
+# Result: Creates timeCorrection_dms.hdf5 with correctionIndex
+
+# Lines 236-239: Apply corrections to all channels
+decide_naming_convention_and_applyCorrection(...)
+# Result: signal_dms.hdf5 now contains corrected signal data
+# control_dms.hdf5 still contains uncorrected placeholder copy
+
+# ========== PHASE 2: AFTER TIMESTAMP CORRECTION ==========
+# Line 243: Generate REAL synthetic controls
+create_control_channel(filepath, storesList, window=101)
+# Result: control_dms.hdf5 OVERWRITTEN with curve-fitted synthetic control
+# Now contains valid control data derived from corrected signal
+```
+
+#### Why This Design Exists
+
+This is a **chicken-and-egg problem solved with placeholders:**
+
+1. **Requirement:** Timestamp correction expects paired control/signal channels
+2. **Constraint:** Synthetic control generation requires timestamp-corrected signal data
+3. **Solution:** Create dummy placeholders → correct everything → replace placeholders with real data
+
+#### Visual Flow
+
+```mermaid
+flowchart TD
+ A[isosbestic_control = False] --> B[add_control_channel]
+ B --> C[Copy signal.hdf5 to cntrl0.hdf5]
+ C --> D[Update storesList.csv]
+
+ D --> E[timestampCorrection_xxx]
+ E --> F[Creates timeCorrection_dms.hdf5]
+
+ F --> G[decide_naming_convention_and_applyCorrection]
+ G --> H[Corrects signal_dms.hdf5]
+    G --> I[Corrects control_dms.hdf5<br/>still contains placeholder]
+
+ I --> J[create_control_channel]
+ J --> K[Read corrected signal_dms.hdf5]
+    K --> L[helper_create_control_channel<br/>curve fit]
+    L --> M[OVERWRITE control_dms.hdf5<br/>with synthetic control]
+
+ style C fill:#fff3cd
+ style I fill:#fff3cd
+ style M fill:#d4edda
+```
+
+#### Refactoring Opportunity
+
+This placeholder pattern is a **code smell** indicating potential design improvements:
+
+**Issues:**
+1. **Unnecessary I/O:** Placeholder files are written and then overwritten
+2. **Confusing flow:** Hard to understand that placeholders are temporary
+3. **Tight coupling:** Timestamp correction assumes paired files exist
+4. **Wasted computation:** Placeholder controls get timestamp-corrected unnecessarily
+
+**Potential Improvements:**
+
+**Option 1: Lazy Control Creation**
+- Modify timestamp correction to handle missing controls gracefully
+- Only create synthetic controls after all corrections complete
+- Remove placeholder file creation entirely
+
+**Option 2: Data Structure Refactoring**
+- Use a data structure that doesn't require physical paired files upfront
+- Track "needs synthetic control" as metadata rather than file presence
+- Generate and write controls only once at the end
+
+**Option 3: Two-Pass Workflow**
+- First pass: Correct only signal channels
+- Second pass: Generate synthetic controls from corrected signals
+- Would require refactoring `check_cntrl_sig_length` and pairing logic
+
+## Function Catalog
+
+### 1. add_control_channel
+**Location:** `timestamp_correction.py:20`
+**Purpose:** Create placeholder control channel files when no isosbestic control exists
+
+```python
+def add_control_channel(filepath, arr) -> arr
+```
+
+**Input:**
+- `filepath`: Path to session output folder
+- `arr`: 2D array `[[storenames], [storesList]]` from storesList.csv
+
+**Process:**
+1. Validates that control/signal pairs match (raises error if mismatched)
+2. For each signal channel without a matching control:
+ - Copies signal HDF5 file to `cntrl{i}.hdf5` (placeholder)
+ - Adds entry to storesList array: `[["cntrl{i}"], ["control_{region}"]]`
+3. Writes updated storesList.csv
+
+**Output:**
+- Updated `arr` with new control channel entries
+- **Files Written:** Updated `storesList.csv`, copied `cntrl*.hdf5` files
+
+**I/O Summary:**
+- **Reads:** Signal HDF5 files (via shutil.copyfile)
+- **Writes:** `storesList.csv`, placeholder `cntrl*.hdf5` files
+
+---
+
+### 2. timestampCorrection_csv
+**Location:** `timestamp_correction.py:65`
+**Purpose:** Correct timestamps for CSV-format data (Doric, NPM, custom CSV)
+
+```python
+def timestampCorrection_csv(filepath, timeForLightsTurnOn, storesList)
+```
+
+**Input:**
+- `filepath`: Path to session output folder
+- `timeForLightsTurnOn`: Seconds to eliminate from start (default: 1)
+- `storesList`: 2D array `[[storenames], [storesList]]`
+
+**Process:**
+1. Filters storesList to control/signal channels only
+2. Pairs control/signal channels, validates naming matches
+3. Calls `check_cntrl_sig_length()` to determine which channel to use (shorter one)
+4. For each control/signal pair:
+ - **Reads:** `timestamps` and `sampling_rate` from raw HDF5
+   - **Computes:** `correctionIndex = np.where(timestamp >= timeForLightsTurnOn)[0]`
+ - **Writes:** `timeCorrection_{region}.hdf5` with keys:
+ - `timestampNew`: Corrected timestamps
+ - `correctionIndex`: Indices to keep
+ - `sampling_rate`: Sampling rate
+
+**Output:**
+- **Files Written:** `timeCorrection_{region}.hdf5` for each control/signal pair
+
+**I/O Summary:**
+- **Reads:** `{storename}.hdf5` → `timestamps`, `sampling_rate`
+- **Writes:** `timeCorrection_{region}.hdf5` → `timestampNew`, `correctionIndex`, `sampling_rate`
+
+---
+
+### 3. timestampCorrection_tdt
+**Location:** `timestamp_correction.py:115`
+**Purpose:** Correct timestamps for TDT-format data (expands block timestamps)
+
+```python
+def timestampCorrection_tdt(filepath, timeForLightsTurnOn, storesList)
+```
+
+**Input:** Same as `timestampCorrection_csv`
+
+**Process:**
+1. Filters storesList to control/signal channels only
+2. Pairs control/signal channels, validates naming matches
+3. Calls `check_cntrl_sig_length()` to determine which channel to use
+4. For each control/signal pair:
+ - **Reads:** `timestamps`, `npoints`, `sampling_rate` from raw HDF5
+ - **TDT-specific expansion algorithm:**
+ ```python
+ timeRecStart = timestamp[0]
+ timestamps = np.subtract(timestamp, timeRecStart) # Zero-base
+ adder = np.arange(npoints) / sampling_rate # Within-block offsets
+ # Expand: for each block timestamp, add within-block offsets
+      lengthAdder = adder.shape[0]
+      timestampNew = np.zeros((len(timestamps), lengthAdder))
+ for i in range(lengthAdder):
+ timestampNew[:, i] = np.add(timestamps, adder[i])
+ timestampNew = (timestampNew.T).reshape(-1, order="F") # Flatten
+      correctionIndex = np.where(timestampNew >= timeForLightsTurnOn)[0]
+ timestampNew = timestampNew[correctionIndex]
+ ```
+ - **Writes:** `timeCorrection_{region}.hdf5` with keys:
+ - `timeRecStart`: Recording start time (TDT-specific)
+ - `timestampNew`: Expanded, corrected timestamps
+ - `correctionIndex`: Indices to keep
+ - `sampling_rate`: Sampling rate
+
+**Output:**
+- **Files Written:** `timeCorrection_{region}.hdf5` with TDT-specific `timeRecStart` key
+
+**I/O Summary:**
+- **Reads:** `{storename}.hdf5` → `timestamps`, `npoints`, `sampling_rate`
+- **Writes:** `timeCorrection_{region}.hdf5` → `timeRecStart`, `timestampNew`, `correctionIndex`, `sampling_rate`
+
+---
+
+### 4. check_cntrl_sig_length
+**Location:** `timestamp_correction.py:273`
+**Purpose:** Determine which channel (control or signal) to use as reference based on length
+
+```python
+def check_cntrl_sig_length(filepath, channels_arr, storenames, storesList) -> indices
+```
+
+**Input:**
+- `filepath`: Path to session output folder
+- `channels_arr`: Paired control/signal array `[["control_A", "control_B"], ["signal_A", "signal_B"]]`
+- `storenames`: Raw HDF5 filenames
+- `storesList`: Semantic channel names
+
+**Process:**
+1. For each control/signal pair:
+ - **Reads:** `data` from both control and signal HDF5
+ - Compares lengths: `control.shape[0]` vs `signal.shape[0]`
+ - Returns the shorter one's storename (or signal if equal)
+
+**Output:**
+- List of storenames to use for timestamp correction (one per pair)
+
+**I/O Summary:**
+- **Reads:** `{control_storename}.hdf5` → `data`, `{signal_storename}.hdf5` → `data`
+
+**Note:** Conceptually this is an analysis helper, but it performs I/O to decide which channel's data to use as the reference.
+
+---
+
+### 5. decide_naming_convention_and_applyCorrection
+**Location:** `timestamp_correction.py:178`
+**Purpose:** Loop through all channels and apply timestamp corrections
+
+```python
+def decide_naming_convention_and_applyCorrection(filepath, timeForLightsTurnOn, event, displayName, storesList)
+```
+
+**Input:**
+- `filepath`: Path to session output folder
+- `timeForLightsTurnOn`: Seconds eliminated from start
+- `event`: Raw storename (e.g., "Dv1A")
+- `displayName`: Semantic name (e.g., "control_DMS")
+- `storesList`: Full storesList array
+
+**Process:**
+1. Filters storesList to control/signal channels
+2. Pairs channels and validates naming conventions
+3. For each pair, calls `applyCorrection(filepath, timeForLightsTurnOn, event, displayName, region)`
+
+**Output:**
+- Delegates to `applyCorrection()` (no direct I/O)
+
+---
+
+### 6. applyCorrection
+**Location:** `timestamp_correction.py:205`
+**Purpose:** Apply timestamp corrections to data channels or event markers
+
+```python
+def applyCorrection(filepath, timeForLightsTurnOn, event, displayName, naming)
+```
+
+**Input:**
+- `filepath`: Path to session output folder
+- `timeForLightsTurnOn`: Seconds eliminated from start
+- `event`: Raw storename
+- `displayName`: Semantic display name
+- `naming`: Region identifier (e.g., "dms")
+
+**Process:**
+
+**For Control/Signal Channels:**
+1. **Reads:** `timeCorrection_{naming}.hdf5` → `correctionIndex`
+2. **Reads:** `{event}.hdf5` → `data`
+3. **Applies:** `arr = arr[correctionIndex]` (crops data)
+4. **Writes:** `{displayName}.hdf5` → `data` (overwrites with corrected data)
+
+**For Event Channels:**
+1. Detects TDT format: `check_TDT(os.path.dirname(filepath))`
+2. **Reads:** `timeCorrection_{naming}.hdf5` → `timeRecStart` (if TDT)
+3. **Reads:** `{event}.hdf5` → `timestamps`
+4. **Applies corrections:**
+ - If TDT and timestamps >= timeRecStart: subtract both `timeRecStart` and `timeForLightsTurnOn`
+ - Otherwise: subtract only `timeForLightsTurnOn`
+5. **Writes:** `{event}_{naming}.hdf5` → `ts` (corrected event timestamps)
+
+**Output:**
+- **Files Written:**
+ - `{displayName}.hdf5` → `data` (for control/signal)
+ - `{event}_{naming}.hdf5` → `ts` (for events)
+
+**I/O Summary:**
+- **Reads:** `timeCorrection_{naming}.hdf5`, `{event}.hdf5`
+- **Writes:** `{displayName}.hdf5` or `{event}_{naming}.hdf5`
+
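+Schematically, the two branches reduce to the following (hypothetical helper names; the HDF5 reads/writes listed above are elided):
+
+```python
+def crop_data_channel(data, correction_index):
+    # control/signal channels: drop samples recorded before the lights-on window
+    return data if (data == 0).all() else data[correction_index]
+
+
+def shift_event_timestamps(ts, time_for_lights_turn_on, time_rec_start=None):
+    # event channels: move timestamps onto the corrected, zero-based timeline
+    if time_rec_start is not None and (ts >= time_rec_start).all():
+        ts = ts - time_rec_start  # TDT: re-zero to the recording start first
+    return ts - time_for_lights_turn_on
+```
+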
+---
+
+### 7. create_control_channel
+**Location:** `timestamp_correction.py:247`
+**Purpose:** Generate synthetic control channel using curve fitting (when no isosbestic control exists)
+
+```python
+def create_control_channel(filepath, arr, window=5001)
+```
+
+**Input:**
+- `filepath`: Path to session output folder
+- `arr`: storesList array `[[storenames], [storesList]]`
+- `window`: Savitzky-Golay filter window (default: 5001)
+
+**Process:**
+1. Loops through storesList to find placeholder control channels (`cntrl` in storename)
+2. For each placeholder:
+ - **Reads:** `signal_{region}.hdf5` → `data` (corrected signal)
+ - **Reads:** `timeCorrection_{region}.hdf5` → `timestampNew`, `sampling_rate`
+ - **Calls:** `helper_create_control_channel(signal, timestampNew, window)` from `control_channel.py`
+ - Applies Savitzky-Golay filter
+ - Fits to exponential: `f(x) = a + b * exp(-(1/c) * x)`
+ - **Writes:** `{control_name}.hdf5` → `data` (synthetic control)
+ - **Writes:** `{event_name}.csv` with columns: `timestamps`, `data`, `sampling_rate`
+
+**Output:**
+- **Files Written:**
+ - `control_{region}.hdf5` → `data` (replaces placeholder)
+ - `{raw_name}.csv` (legacy format export)
+
+**I/O Summary:**
+- **Reads:** `signal_{region}.hdf5` → `data`, `timeCorrection_{region}.hdf5` → `timestampNew`, `sampling_rate`
+- **Writes:** `control_{region}.hdf5` → `data`, `{raw_name}.csv`
+
+---
+
+## Data Flow Diagram
+
+### High-Level Flow (called from execute_timestamp_correction)
+
+```mermaid
+flowchart TD
+ A[execute_timestamp_correction] --> B[Read storesList.csv]
+ B --> C{isosbestic_control?}
+
+ C -->|False| D[add_control_channel]
+ C -->|True| E{Check format}
+ D --> E
+
+ E -->|TDT| F[timestampCorrection_tdt]
+ E -->|CSV/Doric/NPM| G[timestampCorrection_csv]
+
+ F --> H[Loop: decide_naming_convention_and_applyCorrection]
+ G --> H
+
+ H --> I[For each store: applyCorrection]
+
+ I --> J{isosbestic_control?}
+ J -->|False| K[create_control_channel]
+ J -->|True| L[Done]
+ K --> L
+
+ style A fill:#e1f5ff
+ style L fill:#d4edda
+```
+
+### Detailed Flow: timestampCorrection Functions
+
+```mermaid
+flowchart LR
+ A[Raw HDF5 files] --> B[check_cntrl_sig_length]
+ B --> C[Read control & signal data]
+ C --> D[Return shorter channel name]
+
+ D --> E{Format?}
+ E -->|CSV| F[timestampCorrection_csv]
+ E -->|TDT| G[timestampCorrection_tdt]
+
+ F --> H[Read timestamps from selected channel]
+ G --> I[Read timestamps, npoints, sampling_rate]
+
+ H --> J[correctionIndex = where >= timeForLightsTurnOn]
+ I --> K[Expand block timestamps]
+ K --> J
+
+    J --> L["Write timeCorrection_{region}.hdf5"]
+
+ style A fill:#e1f5ff
+ style L fill:#d4edda
+```
+
+### Detailed Flow: applyCorrection
+
+```mermaid
+flowchart TD
+ A[applyCorrection called] --> B{Channel type?}
+
+ B -->|control/signal| C[Read correctionIndex]
+ B -->|event| D[Read event timestamps]
+
+ C --> E[Read raw data]
+    E --> F[Crop data by correctionIndex]
+ F --> G[Write displayName.hdf5]
+
+ D --> H{TDT format?}
+ H -->|Yes| I[Read timeRecStart]
+ H -->|No| J[ts -= timeForLightsTurnOn]
+
+ I --> K[ts -= timeRecStart]
+ K --> J
+ J --> L[Write event_region.hdf5]
+
+ style A fill:#e1f5ff
+ style G fill:#d4edda
+ style L fill:#d4edda
+```
+
+### Detailed Flow: Control Channel Creation
+
+```mermaid
+flowchart LR
+ A[add_control_channel] --> B[For each signal without control]
+ B --> C[Copy signal.hdf5 to cntrl_i.hdf5]
+ C --> D[Update storesList.csv]
+
+ D --> E[... timestamp correction ...]
+
+ E --> F[create_control_channel]
+ F --> G[For each cntrl_i placeholder]
+    G --> H["Read signal_{region}.hdf5"]
+ H --> I[helper_create_control_channel]
+ I --> J[Savitzky-Golay filter]
+ J --> K[Curve fit to exponential]
+    K --> L["Write control_{region}.hdf5"]
+ L --> M[Export to CSV]
+
+ style A fill:#fff3cd
+ style M fill:#d4edda
+```
+
+## Execution Order in execute_timestamp_correction
+
+```python
+# preprocess.py:212-247
+for folderName in folderNames:                       # each recording session
+    for filepath in output_folders:                  # each output folder within the session
+        # Step 1: Read metadata
+        storesList = np.genfromtxt("storesList.csv", dtype=str, delimiter=",")
+
+        # Step 2: Add placeholder controls if needed
+        if not isosbestic_control:
+            storesList = add_control_channel(filepath, storesList)
+
+        # Step 3: Compute correctionIndex and timestampNew
+        if check_TDT(folderName):
+            timestampCorrection_tdt(filepath, timeForLightsTurnOn, storesList)
+        else:
+            timestampCorrection_csv(filepath, timeForLightsTurnOn, storesList)
+
+        # Step 4: Apply corrections to all channels/events
+        for storename, displayName in zip(storesList[0, :], storesList[1, :]):
+            decide_naming_convention_and_applyCorrection(
+                filepath, timeForLightsTurnOn, storename, displayName, storesList
+            )
+            # ^ This calls applyCorrection for each channel
+
+        # Step 5: Generate synthetic controls via curve fitting
+        if not isosbestic_control:
+            create_control_channel(filepath, storesList, window=101)
+```
+
+## File I/O Summary
+
+### Files Read
+
+| Function | Files Read | Keys |
+|----------|-----------|------|
+| `add_control_channel` | `signal_*.hdf5` (for copying) | - |
+| `timestampCorrection_csv` | `{storename}.hdf5` | `timestamps`, `sampling_rate` |
+| `timestampCorrection_tdt` | `{storename}.hdf5` | `timestamps`, `npoints`, `sampling_rate` |
+| `check_cntrl_sig_length` | `control_*.hdf5`, `signal_*.hdf5` | `data` |
+| `applyCorrection` | `timeCorrection_{region}.hdf5`<br>`{event}.hdf5` | `correctionIndex`, `timeRecStart` (TDT)<br>`data` or `timestamps` |
+| `create_control_channel` | `signal_{region}.hdf5`<br>`timeCorrection_{region}.hdf5` | `data`<br>`timestampNew`, `sampling_rate` |
+
+### Files Written
+
+| Function | Files Written | Keys | Notes |
+|----------|--------------|------|-------|
+| `add_control_channel` | `storesList.csv`<br>`cntrl{i}.hdf5` | -<br>(copy of signal) | Placeholder files |
+| `timestampCorrection_csv` | `timeCorrection_{region}.hdf5` | `timestampNew`, `correctionIndex`, `sampling_rate` | One per region |
+| `timestampCorrection_tdt` | `timeCorrection_{region}.hdf5` | `timeRecStart`, `timestampNew`, `correctionIndex`, `sampling_rate` | TDT-specific |
+| `applyCorrection` | `{displayName}.hdf5`<br>`{event}_{region}.hdf5` | `data`<br>`ts` | Overwrites with corrected data |
+| `create_control_channel` | `control_{region}.hdf5`<br>`{raw_name}.csv` | `data`<br>`timestamps`, `data`, `sampling_rate` | Replaces placeholder |
+
+## Key Transformations
+
+### 1. Timestamp Expansion (TDT only)
+
+**Input:** Block timestamps (one per acquisition block)
+**Algorithm:**
+```python
+timeRecStart = timestamp[0]
+timestamps = timestamp - timeRecStart # Zero-base
+adder = np.arange(npoints) / sampling_rate # Within-block offsets [0, 1/fs, 2/fs, ...]
+# Expand each block start by the within-block offsets:
+timestampNew = np.zeros((len(timestamps), npoints))  # (n_blocks, npoints)
+for i in range(npoints):
+ timestampNew[:, i] = timestamps + adder[i]
+timestampNew = timestampNew.T.reshape(-1, order='F') # Column-major flatten
+```
+**Output:** Continuous timestamps at full sampling rate
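+
+The loop above is equivalent to a single NumPy broadcast (a sketch of the same expansion, not the current implementation):
+
+```python
+# timestamps: zero-based block starts, adder: within-block offsets (as above)
+timestampNew = (timestamps[:, None] + adder[None, :]).reshape(-1)  # shape: (n_blocks * npoints,)
+```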
+
+### 2. Correction Index Computation
+
+**Input:** Timestamps array, `timeForLightsTurnOn`
+**Algorithm:**
+```python
+correctionIndex = np.where(timestamp >= timeForLightsTurnOn)[0]
+```
+**Output:** Indices of timestamps to keep (after eliminating first N seconds)
+
+### 3. Data Cropping
+
+**Applied to:** Control/signal data channels
+**Algorithm:**
+```python
+data_corrected = data[correctionIndex]
+```
+
+### 4. Event Timestamp Adjustment
+
+**Applied to:** Event markers (TTL pulses)
+**Algorithm:**
+```python
+# CSV format:
+ts_corrected = ts - timeForLightsTurnOn
+
+# TDT format (if ts >= timeRecStart):
+ts_corrected = ts - timeRecStart - timeForLightsTurnOn
+```
+
+### 5. Synthetic Control Generation
+
+**Input:** Signal channel (already corrected)
+**Algorithm:**
+1. Apply Savitzky-Golay filter: `filtered_signal = savgol_filter(signal, window, polyorder=3)`
+2. Curve fit to exponential: `control = a + b * exp(-(1/c) * t)`
+3. Return fitted curve as synthetic control
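+
+A minimal sketch of this algorithm using the standard SciPy helpers (the actual implementation lives in `helper_create_control_channel` in `control_channel.py`; the initial guess `p0` below is an assumption for illustration):
+
+```python
+import numpy as np
+from scipy.optimize import curve_fit
+from scipy.signal import savgol_filter
+
+def synthesize_control(signal, timestamps, window=5001):
+    """Smooth the signal, fit an exponential to it, and return the fit as a control trace."""
+    filtered = savgol_filter(signal, window_length=window, polyorder=3)
+
+    def exp_decay(t, a, b, c):
+        return a + b * np.exp(-(1.0 / c) * t)
+
+    # illustrative initial guess; the actual implementation may differ
+    p0 = (filtered.min(), filtered.max() - filtered.min(), max(timestamps[-1], 1.0))
+    popt, _ = curve_fit(exp_decay, timestamps, filtered, p0=p0, maxfev=10000)
+    return exp_decay(timestamps, *popt)
+```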
+
+## Analysis for I/O Separation
+
+### Pure Analysis Functions (Minimal I/O)
+These could be extracted with I/O injected:
+- ❌ None - all functions perform substantial I/O
+
+### Orchestration Functions (Heavy I/O, Light Analysis)
+These coordinate reading/writing and delegate computation:
+- `add_control_channel` - File copying and CSV writing
+- `decide_naming_convention_and_applyCorrection` - Loops and delegates
+- `create_control_channel` - Orchestrates read → process → write
+
+### Mixed Functions (I/O + Analysis)
+These perform both I/O and computation inline:
+- `timestampCorrection_csv` - Reads data, computes correctionIndex, writes results
+- `timestampCorrection_tdt` - Reads data, expands timestamps, computes correctionIndex, writes
+- `applyCorrection` - Reads multiple files, applies transformations, writes
+- `check_cntrl_sig_length` - Reads data just to compare lengths
+
+## Refactoring Recommendations for I/O Separation
+
+### Option 1: Extract Pure Computation Functions
+
+Create new pure functions:
+```python
+# Pure analysis (no I/O)
+import numpy as np
+
+def compute_correction_index(timestamps, timeForLightsTurnOn):
+    return np.where(timestamps >= timeForLightsTurnOn)[0]
+
+def expand_tdt_timestamps(block_timestamps, npoints, sampling_rate):
+    # TDT expansion algorithm (see "Key Transformations" above)
+    starts = np.asarray(block_timestamps) - block_timestamps[0]
+    adder = np.arange(npoints) / sampling_rate
+    return (starts[:, None] + adder[None, :]).reshape(-1)
+
+def crop_data_by_index(data, correctionIndex):
+    return data[correctionIndex]
+
+def adjust_event_timestamps(ts, timeRecStart, timeForLightsTurnOn, is_tdt):
+    # Event adjustment logic: TDT events are re-zeroed against the recording start
+    ts = np.asarray(ts, dtype=float)
+    if is_tdt:
+        ts = np.where(ts >= timeRecStart, ts - timeRecStart, ts)
+    return ts - timeForLightsTurnOn
+```
+
+Then modify existing functions to use these pure functions, keeping I/O separate.
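+
+For illustration, `timestampCorrection_csv` could then reduce to a thin I/O wrapper. This is a sketch only: `read_hdf5` is a hypothetical reader helper, the extra `storename`/`region` parameters and the exact contents of `timestampNew` are assumptions, and `write_hdf5` follows the call shape shown in Option 3 below.
+
+```python
+def timestampCorrection_csv(filepath, timeForLightsTurnOn, storename, region):
+    # --- I/O: read (storename chosen earlier by check_cntrl_sig_length) ---
+    timestamps = read_hdf5(filepath, storename, "timestamps")
+    sampling_rate = read_hdf5(filepath, storename, "sampling_rate")
+
+    # --- pure computation ---
+    correctionIndex = compute_correction_index(timestamps, timeForLightsTurnOn)
+    timestampNew = crop_data_by_index(timestamps, correctionIndex)  # assumption: kept timestamps
+
+    # --- I/O: write ---
+    write_hdf5(timestampNew, f"timeCorrection_{region}", filepath, "timestampNew")
+    write_hdf5(correctionIndex, f"timeCorrection_{region}", filepath, "correctionIndex")
+    write_hdf5(sampling_rate, f"timeCorrection_{region}", filepath, "sampling_rate")
+```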
+
+### Option 2: Reader/Writer Pattern
+
+Create dedicated I/O classes:
+```python
+class TimestampCorrectionReader:
+ def read_raw_timestamps(self, filepath, storename):
+ ...
+
+ def read_correction_data(self, filepath, region):
+ ...
+
+class TimestampCorrectionWriter:
+ def write_correction_file(self, filepath, region, data):
+ ...
+
+ def write_corrected_data(self, filepath, displayName, data):
+ ...
+```
+
+### Option 3: Data Class Pattern
+
+Return data objects instead of writing directly:
+```python
+from dataclasses import dataclass
+from typing import Optional
+
+import numpy as np
+
+@dataclass
+class TimestampCorrection:
+ timestampNew: np.ndarray
+ correctionIndex: np.ndarray
+ sampling_rate: float
+ timeRecStart: Optional[float] = None # TDT only
+
+def timestampCorrection_tdt(...) -> TimestampCorrection:
+ # Compute all values
+ return TimestampCorrection(
+ timestampNew=...,
+ correctionIndex=...,
+ sampling_rate=...,
+ timeRecStart=...
+ )
+
+# Separate writer function
+def write_timestamp_correction(filepath, region, correction: TimestampCorrection):
+ write_hdf5(correction.timestampNew, f"timeCorrection_{region}", filepath, "timestampNew")
+ # ... etc
+```
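+
+Example wiring under this pattern (the argument list passed to the pure function is an assumption for the sketch; the point is that the caller owns all file I/O):
+
+```python
+correction = timestampCorrection_tdt(timestamps, npoints, sampling_rate, timeForLightsTurnOn)
+write_timestamp_correction(filepath, region, correction)
+```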
+
+## Current I/O Patterns to Refactor
+
+1. **Inline writes in computation functions:**
+ - `timestampCorrection_csv` and `timestampCorrection_tdt` compute AND write
+ - Should separate: compute → return data → write in caller
+
+2. **Reading for validation only:**
+ - `check_cntrl_sig_length` reads full data arrays just to compare shapes
+ - Could be optimized to read only array metadata/shapes
+
+3. **Side-effect file creation:**
+ - `add_control_channel` creates files as side effect
+ - `create_control_channel` both generates data AND writes multiple formats (HDF5 + CSV)
+
+4. **Mixed responsibilities in applyCorrection:**
+ - Handles both control/signal cropping AND event timestamp adjustment
+ - Could be split into two separate functions