From aa4340d97f554c7434a73a58a02fc4ee994f33dc Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Mon, 17 Nov 2025 10:59:52 -0800 Subject: [PATCH 001/125] Moved readtsq to tdt_step2.py. --- src/guppy/saveStoresList.py | 21 ++------------------- src/guppy/tdt_step2.py | 26 ++++++++++++++++++++++++++ 2 files changed, 28 insertions(+), 19 deletions(-) create mode 100644 src/guppy/tdt_step2.py diff --git a/src/guppy/saveStoresList.py b/src/guppy/saveStoresList.py index ed3a7cf..a837aa9 100755 --- a/src/guppy/saveStoresList.py +++ b/src/guppy/saveStoresList.py @@ -21,6 +21,8 @@ import panel as pn from numpy import float32, float64, int32, int64, uint16 +from guppy.tdt_step2 import readtsq + # hv.extension() pn.extension() @@ -86,25 +88,6 @@ def check_header(df): return arr, check_float -# function to read 'tsq' file -def readtsq(filepath): - names = ("size", "type", "name", "chan", "sort_code", "timestamp", "fp_loc", "strobe", "format", "frequency") - formats = (int32, int32, "S4", uint16, uint16, float64, int64, float64, int32, float32) - offsets = 0, 4, 8, 12, 14, 16, 24, 24, 32, 36 - tsq_dtype = np.dtype({"names": names, "formats": formats, "offsets": offsets}, align=True) - path = glob.glob(os.path.join(filepath, "*.tsq")) - if len(path) > 1: - logger.error("Two tsq files are present at the location.") - raise Exception("Two tsq files are present at the location.") - elif len(path) == 0: - return 0 - else: - path = path[0] - tsq = np.fromfile(path, dtype=tsq_dtype) - df = pd.DataFrame(tsq) - return df - - # function to show GUI and save def saveStorenames(inputParameters, data, event_name, flag, filepath): diff --git a/src/guppy/tdt_step2.py b/src/guppy/tdt_step2.py new file mode 100644 index 0000000..09456a7 --- /dev/null +++ b/src/guppy/tdt_step2.py @@ -0,0 +1,26 @@ +import glob +import logging +import os +import numpy as np +from numpy import float32, float64, int32, int64, uint16 +import pandas as pd + +logger = logging.getLogger(__name__) + +# function to read 'tsq' file +def readtsq(filepath): + names = ("size", "type", "name", "chan", "sort_code", "timestamp", "fp_loc", "strobe", "format", "frequency") + formats = (int32, int32, "S4", uint16, uint16, float64, int64, float64, int32, float32) + offsets = 0, 4, 8, 12, 14, 16, 24, 24, 32, 36 + tsq_dtype = np.dtype({"names": names, "formats": formats, "offsets": offsets}, align=True) + path = glob.glob(os.path.join(filepath, "*.tsq")) + if len(path) > 1: + logger.error("Two tsq files are present at the location.") + raise Exception("Two tsq files are present at the location.") + elif len(path) == 0: + return 0 + else: + path = path[0] + tsq = np.fromfile(path, dtype=tsq_dtype) + df = pd.DataFrame(tsq) + return df \ No newline at end of file From c868823138945399df1fb2b043f1026b2099e3b6 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Mon, 17 Nov 2025 11:19:34 -0800 Subject: [PATCH 002/125] Moved import_np_doric_csv to np_doric_csv_step2.py. 
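
The import logic itself is unchanged; saveStoresList.py now pulls it in with
"from guppy.np_doric_csv_step2 import import_np_doric_csv". A minimal usage
sketch of the relocated function follows (the session folder path and the
argument values are illustrative only, not taken from the codebase):

    from guppy.np_doric_csv_step2 import import_np_doric_csv

    # Folder containing the raw *.csv / *.doric files for one recording session
    session_dir = "/path/to/session"

    # Returns the storenames discovered in the files and a per-file flag
    # describing how each one was interpreted (e.g. "data_np", "event_csv").
    event_names, flags = import_np_doric_csv(
        session_dir,
        isosbestic_control=False,  # forwarded unchanged by saveStoresList.execute
        num_ch=2,                  # number of interleaved channels expected in NPM data
    )
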
--- src/guppy/np_doric_csv_step2.py | 523 ++++++++++++++++++++++++++++++++ src/guppy/saveStoresList.py | 498 +----------------------------- 2 files changed, 524 insertions(+), 497 deletions(-) create mode 100644 src/guppy/np_doric_csv_step2.py diff --git a/src/guppy/np_doric_csv_step2.py b/src/guppy/np_doric_csv_step2.py new file mode 100644 index 0000000..d06dcc1 --- /dev/null +++ b/src/guppy/np_doric_csv_step2.py @@ -0,0 +1,523 @@ +import glob +import logging +import os +import tkinter as tk +from tkinter import StringVar, messagebox, ttk + +import h5py +import numpy as np +import pandas as pd +import panel as pn + +pn.extension() + +logger = logging.getLogger(__name__) + +# function to see if there are 'csv' files present +# and recognize type of 'csv' files either from +# Neurophotometrics, Doric systems or custom made 'csv' files +# and read data accordingly +def import_np_doric_csv(filepath, isosbestic_control, num_ch, inputParameters=None): + + logger.debug("If it exists, importing either NPM or Doric or csv file based on the structure of file") + # Headless configuration (used to avoid any UI prompts when running tests) + headless = bool(os.environ.get("GUPPY_BASE_DIR")) + npm_timestamp_column_name = None + npm_time_unit = None + npm_split_events = None + if isinstance(inputParameters, dict): + npm_timestamp_column_name = inputParameters.get("npm_timestamp_column_name") + npm_time_unit = inputParameters.get("npm_time_unit", "seconds") + npm_split_events = inputParameters.get("npm_split_events", True) + path = sorted(glob.glob(os.path.join(filepath, "*.csv"))) + sorted(glob.glob(os.path.join(filepath, "*.doric"))) + path_chev = glob.glob(os.path.join(filepath, "*chev*")) + path_chod = glob.glob(os.path.join(filepath, "*chod*")) + path_chpr = glob.glob(os.path.join(filepath, "*chpr*")) + path_event = glob.glob(os.path.join(filepath, "event*")) + # path_sig = glob.glob(os.path.join(filepath, 'sig*')) + path_chev_chod_event = path_chev + path_chod + path_event + path_chpr + + path = sorted(list(set(path) - set(path_chev_chod_event))) + flag = "None" + event_from_filename = [] + flag_arr = [] + for i in range(len(path)): + dirname = os.path.dirname(path[i]) + ext = os.path.basename(path[i]).split(".")[-1] + if ext == "doric": + key_names = read_doric(path[i]) + event_from_filename.extend(key_names) + flag = "doric_doric" + else: + df = pd.read_csv(path[i], header=None, nrows=2, index_col=False, dtype=str) + df = df.dropna(axis=1, how="all") + df_arr = np.array(df).flatten() + check_all_str = [] + for element in df_arr: + try: + float(element) + except: + check_all_str.append(i) + if len(check_all_str) == len(df_arr): + df = pd.read_csv(path[i], header=1, index_col=False, nrows=10) + df = df.drop(["Time(s)"], axis=1) + event_from_filename.extend(list(df.columns)) + flag = "doric_csv" + logger.info(flag) + else: + df = pd.read_csv(path[i], index_col=False) + # with warnings.catch_warnings(): + # warnings.simplefilter("error") + # try: + # df = pd.read_csv(path[i], index_col=False, dtype=float) + # except: + # df = pd.read_csv(path[i], header=1, index_col=False, nrows=10) # to make process faster reading just first 10 rows + # df = df.drop(['Time(s)'], axis=1) + # event_from_filename.extend(list(df.columns)) + # flag = 'doric_csv' + if flag == "doric_csv" or flag == "doric_doric": + continue + else: + colnames, value = check_header(df) + # logger.info(len(colnames), len(value)) + + # check dataframe structure and read data accordingly + if len(value) > 0: + columns_isstr = False + df = 
pd.read_csv(path[i], header=None) + cols = np.array(list(df.columns), dtype=str) + else: + df = df + columns_isstr = True + cols = np.array(list(df.columns), dtype=str) + # check the structure of dataframe and assign flag to the type of file + if len(cols) == 1: + if cols[0].lower() != "timestamps": + logger.error("\033[1m" + "Column name should be timestamps (all lower-cases)" + "\033[0m") + raise Exception("\033[1m" + "Column name should be timestamps (all lower-cases)" + "\033[0m") + else: + flag = "event_csv" + elif len(cols) == 3: + arr1 = np.array(["timestamps", "data", "sampling_rate"]) + arr2 = np.char.lower(np.array(cols)) + if (np.sort(arr1) == np.sort(arr2)).all() == False: + logger.error( + "\033[1m" + + "Column names should be timestamps, data and sampling_rate (all lower-cases)" + + "\033[0m" + ) + raise Exception( + "\033[1m" + + "Column names should be timestamps, data and sampling_rate (all lower-cases)" + + "\033[0m" + ) + else: + flag = "data_csv" + elif len(cols) == 2: + flag = "event_or_data_np" + elif len(cols) >= 2: + flag = "data_np" + else: + logger.error("Number of columns in csv file does not make sense.") + raise Exception("Number of columns in csv file does not make sense.") + + if columns_isstr == True and ( + "flags" in np.char.lower(np.array(cols)) or "ledstate" in np.char.lower(np.array(cols)) + ): + flag = flag + "_v2" + else: + flag = flag + + # used assigned flags to process the files and read the data + if flag == "event_or_data_np": + arr = list(df.iloc[:, 1]) + check_float = [True for i in arr if isinstance(i, float)] + if len(arr) == len(check_float) and columns_isstr == False: + flag = "data_np" + elif columns_isstr == True and ("value" in np.char.lower(np.array(cols))): + flag = "event_np" + else: + flag = "event_np" + else: + pass + + flag_arr.append(flag) + logger.info(flag) + if flag == "event_csv" or flag == "data_csv": + name = os.path.basename(path[i]).split(".")[0] + event_from_filename.append(name) + elif flag == "data_np": + file = f"file{str(i)}_" + df, indices_dict, num_channels = decide_indices(file, df, flag, num_ch) + keys = list(indices_dict.keys()) + for k in range(len(keys)): + for j in range(df.shape[1]): + if j == 0: + timestamps = df.iloc[:, j][indices_dict[keys[k]]] + # timestamps_odd = df.iloc[:,j][odd_indices] + else: + d = dict() + d["timestamps"] = timestamps + d["data"] = df.iloc[:, j][indices_dict[keys[k]]] + + df_ch = pd.DataFrame(d) + df_ch.to_csv(os.path.join(dirname, keys[k] + str(j) + ".csv"), index=False) + event_from_filename.append(keys[k] + str(j)) + + elif flag == "event_np": + type_val = np.array(df.iloc[:, 1]) + type_val_unique = np.unique(type_val) + if headless: + response = 1 if bool(npm_split_events) else 0 + else: + window = tk.Tk() + if len(type_val_unique) > 1: + response = messagebox.askyesno( + "Multiple event TTLs", + "Based on the TTL file,\ + it looks like TTLs \ + belongs to multiple behavior type. 
\ + Do you want to create multiple files for each \ + behavior type ?", + ) + else: + response = 0 + window.destroy() + if response == 1: + timestamps = np.array(df.iloc[:, 0]) + for j in range(len(type_val_unique)): + idx = np.where(type_val == type_val_unique[j]) + d = dict() + d["timestamps"] = timestamps[idx] + df_new = pd.DataFrame(d) + df_new.to_csv(os.path.join(dirname, "event" + str(type_val_unique[j]) + ".csv"), index=False) + event_from_filename.append("event" + str(type_val_unique[j])) + else: + timestamps = np.array(df.iloc[:, 0]) + d = dict() + d["timestamps"] = timestamps + df_new = pd.DataFrame(d) + df_new.to_csv(os.path.join(dirname, "event" + str(0) + ".csv"), index=False) + event_from_filename.append("event" + str(0)) + else: + file = f"file{str(i)}_" + df, ts_unit = decide_ts_unit_for_npm( + df, timestamp_column_name=npm_timestamp_column_name, time_unit=npm_time_unit, headless=headless + ) + df, indices_dict, num_channels = decide_indices(file, df, flag) + keys = list(indices_dict.keys()) + for k in range(len(keys)): + for j in range(df.shape[1]): + if j == 0: + timestamps = df.iloc[:, j][indices_dict[keys[k]]] + # timestamps_odd = df.iloc[:,j][odd_indices] + else: + d = dict() + d["timestamps"] = timestamps + d["data"] = df.iloc[:, j][indices_dict[keys[k]]] + + df_ch = pd.DataFrame(d) + df_ch.to_csv(os.path.join(dirname, keys[k] + str(j) + ".csv"), index=False) + event_from_filename.append(keys[k] + str(j)) + + path_chev = glob.glob(os.path.join(filepath, "*chev*")) + path_chod = glob.glob(os.path.join(filepath, "*chod*")) + path_chpr = glob.glob(os.path.join(filepath, "*chpr*")) + path_event = glob.glob(os.path.join(filepath, "event*")) + # path_sig = glob.glob(os.path.join(filepath, 'sig*')) + path_chev_chod_chpr = [path_chev, path_chod, path_chpr] + if ( + ("data_np_v2" in flag_arr or "data_np" in flag_arr) + and ("event_np" in flag_arr) + and (i == len(path) - 1) + ) or ( + ("data_np_v2" in flag_arr or "data_np" in flag_arr) and (i == len(path) - 1) + ): # i==len(path)-1 and or 'event_np' in flag + num_path_chev, num_path_chod, num_path_chpr = len(path_chev), len(path_chod), len(path_chpr) + arr_len, no_ch = [], [] + for i in range(len(path_chev_chod_chpr)): + if len(path_chev_chod_chpr[i]) > 0: + arr_len.append(len(path_chev_chod_chpr[i])) + else: + continue + + unique_arr_len = np.unique(np.array(arr_len)) + if "data_np_v2" in flag_arr: + if ts_unit == "seconds": + divisor = 1 + elif ts_unit == "milliseconds": + divisor = 1e3 + else: + divisor = 1e6 + else: + divisor = 1000 + + for j in range(len(path_event)): + df_event = pd.read_csv(path_event[j]) + df_chev = pd.read_csv(path_chev[0]) + df_event["timestamps"] = (df_event["timestamps"] - df_chev["timestamps"][0]) / divisor + df_event.to_csv(path_event[j], index=False) + if unique_arr_len.shape[0] == 1: + for j in range(len(path_chev)): + if file + "chev" in indices_dict.keys(): + df_chev = pd.read_csv(path_chev[j]) + df_chev["timestamps"] = (df_chev["timestamps"] - df_chev["timestamps"][0]) / divisor + df_chev["sampling_rate"] = np.full(df_chev.shape[0], np.nan) + df_chev.at[0, "sampling_rate"] = df_chev.shape[0] / ( + df_chev["timestamps"].iloc[-1] - df_chev["timestamps"].iloc[0] + ) + df_chev.to_csv(path_chev[j], index=False) + + if file + "chod" in indices_dict.keys(): + df_chod = pd.read_csv(path_chod[j]) + df_chod["timestamps"] = df_chev["timestamps"] + df_chod["sampling_rate"] = np.full(df_chod.shape[0], np.nan) + df_chod.at[0, "sampling_rate"] = df_chev["sampling_rate"][0] + df_chod.to_csv(path_chod[j], 
index=False) + + if file + "chpr" in indices_dict.keys(): + df_chpr = pd.read_csv(path_chpr[j]) + df_chpr["timestamps"] = df_chev["timestamps"] + df_chpr["sampling_rate"] = np.full(df_chpr.shape[0], np.nan) + df_chpr.at[0, "sampling_rate"] = df_chev["sampling_rate"][0] + df_chpr.to_csv(path_chpr[j], index=False) + else: + logger.error("Number of channels should be same for all regions.") + raise Exception("Number of channels should be same for all regions.") + else: + pass + logger.info("Importing of either NPM or Doric or csv file is done.") + return event_from_filename, flag_arr + +# ---------------------------------------------------------------------------------------------------------------------- +# Functions that import_np_doric_csv uses +# ---------------------------------------------------------------------------------------------------------------------- + +def read_doric(filepath): + with h5py.File(filepath, "r") as f: + if "Traces" in list(f.keys()): + keys = access_keys_doricV1(f) + elif list(f.keys()) == ["Configurations", "DataAcquisition"]: + keys = access_keys_doricV6(f) + + return keys + + +def check_header(df): + arr = list(df.columns) + check_float = [] + for i in arr: + try: + check_float.append(float(i)) + except: + pass + + return arr, check_float + +# function to decide indices of interleaved channels +# in neurophotometrics data +def decide_indices(file, df, flag, num_ch=2): + ch_name = [file + "chev", file + "chod", file + "chpr"] + if len(ch_name) < num_ch: + logger.error( + "Number of channels parameters in Input Parameters GUI is more than 3. \ + Looks like there are more than 3 channels in the file. Reading of these files\ + are not supported. Reach out to us if you get this error message." + ) + raise Exception( + "Number of channels parameters in Input Parameters GUI is more than 3. \ + Looks like there are more than 3 channels in the file. Reading of these files\ + are not supported. Reach out to us if you get this error message." 
+ ) + if flag == "data_np": + indices_dict = dict() + for i in range(num_ch): + indices_dict[ch_name[i]] = np.arange(i, df.shape[0], num_ch) + + else: + cols = np.array(list(df.columns)) + if "flags" in np.char.lower(np.array(cols)): + arr = ["FrameCounter", "Flags"] + state = np.array(df["Flags"]) + elif "ledstate" in np.char.lower(np.array(cols)): + arr = ["FrameCounter", "LedState"] + state = np.array(df["LedState"]) + else: + logger.error( + "File type shows Neurophotometrics newer version \ + data but column names does not have Flags or LedState" + ) + raise Exception( + "File type shows Neurophotometrics newer version \ + data but column names does not have Flags or LedState" + ) + + num_ch, ch = check_channels(state) + indices_dict = dict() + for i in range(num_ch): + first_occurrence = np.where(state == ch[i])[0] + indices_dict[ch_name[i]] = np.arange(first_occurrence[0], df.shape[0], num_ch) + + df = df.drop(arr, axis=1) + + return df, indices_dict, num_ch + + +# function to decide NPM timestamps unit (seconds, ms or us) +def decide_ts_unit_for_npm(df, timestamp_column_name=None, time_unit=None, headless=False): + col_names = np.array(list(df.columns)) + col_names_ts = [""] + for name in col_names: + if "timestamp" in name.lower(): + col_names_ts.append(name) + + ts_unit = "seconds" + if len(col_names_ts) > 2: + # Headless path: auto-select column/unit without any UI + if headless: + if timestamp_column_name is not None: + assert ( + timestamp_column_name in col_names_ts + ), f"Provided timestamp_column_name '{timestamp_column_name}' not found in columns {col_names_ts[1:]}" + chosen = timestamp_column_name + else: + chosen = col_names_ts[1] + df.insert(1, "Timestamp", df[chosen]) + df = df.drop(col_names_ts[1:], axis=1) + valid_units = {"seconds", "milliseconds", "microseconds"} + ts_unit = time_unit if (isinstance(time_unit, str) and time_unit in valid_units) else "seconds" + return df, ts_unit + # def comboBoxSelected(event): + # logger.info(event.widget.get()) + + window = tk.Tk() + window.title("Select appropriate options for timestamps") + window.geometry("500x200") + holdComboboxValues = dict() + + timestamps_label = ttk.Label(window, text="Select which timestamps to use : ").grid( + row=0, column=1, pady=25, padx=25 + ) + holdComboboxValues["timestamps"] = StringVar() + timestamps_combo = ttk.Combobox(window, values=col_names_ts, textvariable=holdComboboxValues["timestamps"]) + timestamps_combo.grid(row=0, column=2, pady=25, padx=25) + timestamps_combo.current(0) + # timestamps_combo.bind("<>", comboBoxSelected) + + time_unit_label = ttk.Label(window, text="Select timestamps unit : ").grid(row=1, column=1, pady=25, padx=25) + holdComboboxValues["time_unit"] = StringVar() + time_unit_combo = ttk.Combobox( + window, values=["", "seconds", "milliseconds", "microseconds"], textvariable=holdComboboxValues["time_unit"] + ) + time_unit_combo.grid(row=1, column=2, pady=25, padx=25) + time_unit_combo.current(0) + # time_unit_combo.bind("<>", comboBoxSelected) + window.lift() + window.after(500, lambda: window.lift()) + window.mainloop() + + if holdComboboxValues["timestamps"].get(): + df.insert(1, "Timestamp", df[holdComboboxValues["timestamps"].get()]) + df = df.drop(col_names_ts[1:], axis=1) + else: + messagebox.showerror( + "All options not selected", + "All the options for timestamps \ + were not selected. Please select appropriate options", + ) + logger.error( + "All the options for timestamps \ + were not selected. 
Please select appropriate options" + ) + raise Exception( + "All the options for timestamps \ + were not selected. Please select appropriate options" + ) + if holdComboboxValues["time_unit"].get(): + if holdComboboxValues["time_unit"].get() == "seconds": + ts_unit = holdComboboxValues["time_unit"].get() + elif holdComboboxValues["time_unit"].get() == "milliseconds": + ts_unit = holdComboboxValues["time_unit"].get() + else: + ts_unit = holdComboboxValues["time_unit"].get() + else: + messagebox.showerror( + "All options not selected", + "All the options for timestamps \ + were not selected. Please select appropriate options", + ) + logger.error( + "All the options for timestamps \ + were not selected. Please select appropriate options" + ) + raise Exception( + "All the options for timestamps \ + were not selected. Please select appropriate options" + ) + else: + pass + + return df, ts_unit + + +# ---------------------------------------------------------------------------------------------------------------------- +# Functions that read_doric uses +# ---------------------------------------------------------------------------------------------------------------------- + +def access_keys_doricV6(doric_file): + data = [doric_file["DataAcquisition"]] + res = [] + while len(data) != 0: + members = len(data) + while members != 0: + members -= 1 + data, last_element = separate_last_element(data) + if isinstance(last_element, h5py.Dataset) and not last_element.name.endswith("/Time"): + res.append(last_element.name) + elif isinstance(last_element, h5py.Group): + data.extend(reversed([last_element[k] for k in last_element.keys()])) + + keys = [] + for element in res: + sep_values = element.split("/") + if sep_values[-1] == "Values": + keys.append(f"{sep_values[-3]}/{sep_values[-2]}") + else: + keys.append(f"{sep_values[-2]}/{sep_values[-1]}") + + return keys + + +def access_keys_doricV1(doric_file): + keys = list(doric_file["Traces"]["Console"].keys()) + keys.remove("Time(s)") + + return keys + +# ---------------------------------------------------------------------------------------------------------------------- +# Functions that decide_indices uses +# ---------------------------------------------------------------------------------------------------------------------- + +# check flag consistency in neurophotometrics data +def check_channels(state): + state = state.astype(int) + unique_state = np.unique(state[2:12]) + if unique_state.shape[0] > 3: + logger.error( + "Looks like there are more than 3 channels in the file. Reading of these files\ + are not supported. Reach out to us if you get this error message." + ) + raise Exception( + "Looks like there are more than 3 channels in the file. Reading of these files\ + are not supported. Reach out to us if you get this error message." 
+ ) + + return unique_state.shape[0], unique_state + + +# ---------------------------------------------------------------------------------------------------------------------- +# Functions that access_keys_doricV6 uses +# ---------------------------------------------------------------------------------------------------------------------- +def separate_last_element(arr): + l = arr[-1] + return arr[:-1], l \ No newline at end of file diff --git a/src/guppy/saveStoresList.py b/src/guppy/saveStoresList.py index a837aa9..d7380ec 100755 --- a/src/guppy/saveStoresList.py +++ b/src/guppy/saveStoresList.py @@ -22,6 +22,7 @@ from numpy import float32, float64, int32, int64, uint16 from guppy.tdt_step2 import readtsq +from guppy.np_doric_csv_step2 import import_np_doric_csv # hv.extension() pn.extension() @@ -76,18 +77,6 @@ def make_dir(filepath): return op -def check_header(df): - arr = list(df.columns) - check_float = [] - for i in arr: - try: - check_float.append(float(i)) - except: - pass - - return arr, check_float - - # function to show GUI and save def saveStorenames(inputParameters, data, event_name, flag, filepath): @@ -582,491 +571,6 @@ def save_button(event=None): template.show(port=number) -# check flag consistency in neurophotometrics data -def check_channels(state): - state = state.astype(int) - unique_state = np.unique(state[2:12]) - if unique_state.shape[0] > 3: - logger.error( - "Looks like there are more than 3 channels in the file. Reading of these files\ - are not supported. Reach out to us if you get this error message." - ) - raise Exception( - "Looks like there are more than 3 channels in the file. Reading of these files\ - are not supported. Reach out to us if you get this error message." - ) - - return unique_state.shape[0], unique_state - - -# function to decide NPM timestamps unit (seconds, ms or us) -def decide_ts_unit_for_npm(df, timestamp_column_name=None, time_unit=None, headless=False): - col_names = np.array(list(df.columns)) - col_names_ts = [""] - for name in col_names: - if "timestamp" in name.lower(): - col_names_ts.append(name) - - ts_unit = "seconds" - if len(col_names_ts) > 2: - # Headless path: auto-select column/unit without any UI - if headless: - if timestamp_column_name is not None: - assert ( - timestamp_column_name in col_names_ts - ), f"Provided timestamp_column_name '{timestamp_column_name}' not found in columns {col_names_ts[1:]}" - chosen = timestamp_column_name - else: - chosen = col_names_ts[1] - df.insert(1, "Timestamp", df[chosen]) - df = df.drop(col_names_ts[1:], axis=1) - valid_units = {"seconds", "milliseconds", "microseconds"} - ts_unit = time_unit if (isinstance(time_unit, str) and time_unit in valid_units) else "seconds" - return df, ts_unit - # def comboBoxSelected(event): - # logger.info(event.widget.get()) - - window = tk.Tk() - window.title("Select appropriate options for timestamps") - window.geometry("500x200") - holdComboboxValues = dict() - - timestamps_label = ttk.Label(window, text="Select which timestamps to use : ").grid( - row=0, column=1, pady=25, padx=25 - ) - holdComboboxValues["timestamps"] = StringVar() - timestamps_combo = ttk.Combobox(window, values=col_names_ts, textvariable=holdComboboxValues["timestamps"]) - timestamps_combo.grid(row=0, column=2, pady=25, padx=25) - timestamps_combo.current(0) - # timestamps_combo.bind("<>", comboBoxSelected) - - time_unit_label = ttk.Label(window, text="Select timestamps unit : ").grid(row=1, column=1, pady=25, padx=25) - holdComboboxValues["time_unit"] = StringVar() - 
time_unit_combo = ttk.Combobox( - window, values=["", "seconds", "milliseconds", "microseconds"], textvariable=holdComboboxValues["time_unit"] - ) - time_unit_combo.grid(row=1, column=2, pady=25, padx=25) - time_unit_combo.current(0) - # time_unit_combo.bind("<>", comboBoxSelected) - window.lift() - window.after(500, lambda: window.lift()) - window.mainloop() - - if holdComboboxValues["timestamps"].get(): - df.insert(1, "Timestamp", df[holdComboboxValues["timestamps"].get()]) - df = df.drop(col_names_ts[1:], axis=1) - else: - messagebox.showerror( - "All options not selected", - "All the options for timestamps \ - were not selected. Please select appropriate options", - ) - logger.error( - "All the options for timestamps \ - were not selected. Please select appropriate options" - ) - raise Exception( - "All the options for timestamps \ - were not selected. Please select appropriate options" - ) - if holdComboboxValues["time_unit"].get(): - if holdComboboxValues["time_unit"].get() == "seconds": - ts_unit = holdComboboxValues["time_unit"].get() - elif holdComboboxValues["time_unit"].get() == "milliseconds": - ts_unit = holdComboboxValues["time_unit"].get() - else: - ts_unit = holdComboboxValues["time_unit"].get() - else: - messagebox.showerror( - "All options not selected", - "All the options for timestamps \ - were not selected. Please select appropriate options", - ) - logger.error( - "All the options for timestamps \ - were not selected. Please select appropriate options" - ) - raise Exception( - "All the options for timestamps \ - were not selected. Please select appropriate options" - ) - else: - pass - - return df, ts_unit - - -# function to decide indices of interleaved channels -# in neurophotometrics data -def decide_indices(file, df, flag, num_ch=2): - ch_name = [file + "chev", file + "chod", file + "chpr"] - if len(ch_name) < num_ch: - logger.error( - "Number of channels parameters in Input Parameters GUI is more than 3. \ - Looks like there are more than 3 channels in the file. Reading of these files\ - are not supported. Reach out to us if you get this error message." - ) - raise Exception( - "Number of channels parameters in Input Parameters GUI is more than 3. \ - Looks like there are more than 3 channels in the file. Reading of these files\ - are not supported. Reach out to us if you get this error message." 
- ) - if flag == "data_np": - indices_dict = dict() - for i in range(num_ch): - indices_dict[ch_name[i]] = np.arange(i, df.shape[0], num_ch) - - else: - cols = np.array(list(df.columns)) - if "flags" in np.char.lower(np.array(cols)): - arr = ["FrameCounter", "Flags"] - state = np.array(df["Flags"]) - elif "ledstate" in np.char.lower(np.array(cols)): - arr = ["FrameCounter", "LedState"] - state = np.array(df["LedState"]) - else: - logger.error( - "File type shows Neurophotometrics newer version \ - data but column names does not have Flags or LedState" - ) - raise Exception( - "File type shows Neurophotometrics newer version \ - data but column names does not have Flags or LedState" - ) - - num_ch, ch = check_channels(state) - indices_dict = dict() - for i in range(num_ch): - first_occurrence = np.where(state == ch[i])[0] - indices_dict[ch_name[i]] = np.arange(first_occurrence[0], df.shape[0], num_ch) - - df = df.drop(arr, axis=1) - - return df, indices_dict, num_ch - - -def separate_last_element(arr): - l = arr[-1] - return arr[:-1], l - - -def access_keys_doricV6(doric_file): - data = [doric_file["DataAcquisition"]] - res = [] - while len(data) != 0: - members = len(data) - while members != 0: - members -= 1 - data, last_element = separate_last_element(data) - if isinstance(last_element, h5py.Dataset) and not last_element.name.endswith("/Time"): - res.append(last_element.name) - elif isinstance(last_element, h5py.Group): - data.extend(reversed([last_element[k] for k in last_element.keys()])) - - keys = [] - for element in res: - sep_values = element.split("/") - if sep_values[-1] == "Values": - keys.append(f"{sep_values[-3]}/{sep_values[-2]}") - else: - keys.append(f"{sep_values[-2]}/{sep_values[-1]}") - - return keys - - -def access_keys_doricV1(doric_file): - keys = list(doric_file["Traces"]["Console"].keys()) - keys.remove("Time(s)") - - return keys - - -def read_doric(filepath): - with h5py.File(filepath, "r") as f: - if "Traces" in list(f.keys()): - keys = access_keys_doricV1(f) - elif list(f.keys()) == ["Configurations", "DataAcquisition"]: - keys = access_keys_doricV6(f) - - return keys - - -# function to see if there are 'csv' files present -# and recognize type of 'csv' files either from -# Neurophotometrics, Doric systems or custom made 'csv' files -# and read data accordingly -def import_np_doric_csv(filepath, isosbestic_control, num_ch, inputParameters=None): - - logger.debug("If it exists, importing either NPM or Doric or csv file based on the structure of file") - # Headless configuration (used to avoid any UI prompts when running tests) - headless = bool(os.environ.get("GUPPY_BASE_DIR")) - npm_timestamp_column_name = None - npm_time_unit = None - npm_split_events = None - if isinstance(inputParameters, dict): - npm_timestamp_column_name = inputParameters.get("npm_timestamp_column_name") - npm_time_unit = inputParameters.get("npm_time_unit", "seconds") - npm_split_events = inputParameters.get("npm_split_events", True) - path = sorted(glob.glob(os.path.join(filepath, "*.csv"))) + sorted(glob.glob(os.path.join(filepath, "*.doric"))) - path_chev = glob.glob(os.path.join(filepath, "*chev*")) - path_chod = glob.glob(os.path.join(filepath, "*chod*")) - path_chpr = glob.glob(os.path.join(filepath, "*chpr*")) - path_event = glob.glob(os.path.join(filepath, "event*")) - # path_sig = glob.glob(os.path.join(filepath, 'sig*')) - path_chev_chod_event = path_chev + path_chod + path_event + path_chpr - - path = sorted(list(set(path) - set(path_chev_chod_event))) - flag = "None" - 
event_from_filename = [] - flag_arr = [] - for i in range(len(path)): - dirname = os.path.dirname(path[i]) - ext = os.path.basename(path[i]).split(".")[-1] - if ext == "doric": - key_names = read_doric(path[i]) - event_from_filename.extend(key_names) - flag = "doric_doric" - else: - df = pd.read_csv(path[i], header=None, nrows=2, index_col=False, dtype=str) - df = df.dropna(axis=1, how="all") - df_arr = np.array(df).flatten() - check_all_str = [] - for element in df_arr: - try: - float(element) - except: - check_all_str.append(i) - if len(check_all_str) == len(df_arr): - df = pd.read_csv(path[i], header=1, index_col=False, nrows=10) - df = df.drop(["Time(s)"], axis=1) - event_from_filename.extend(list(df.columns)) - flag = "doric_csv" - logger.info(flag) - else: - df = pd.read_csv(path[i], index_col=False) - # with warnings.catch_warnings(): - # warnings.simplefilter("error") - # try: - # df = pd.read_csv(path[i], index_col=False, dtype=float) - # except: - # df = pd.read_csv(path[i], header=1, index_col=False, nrows=10) # to make process faster reading just first 10 rows - # df = df.drop(['Time(s)'], axis=1) - # event_from_filename.extend(list(df.columns)) - # flag = 'doric_csv' - if flag == "doric_csv" or flag == "doric_doric": - continue - else: - colnames, value = check_header(df) - # logger.info(len(colnames), len(value)) - - # check dataframe structure and read data accordingly - if len(value) > 0: - columns_isstr = False - df = pd.read_csv(path[i], header=None) - cols = np.array(list(df.columns), dtype=str) - else: - df = df - columns_isstr = True - cols = np.array(list(df.columns), dtype=str) - # check the structure of dataframe and assign flag to the type of file - if len(cols) == 1: - if cols[0].lower() != "timestamps": - logger.error("\033[1m" + "Column name should be timestamps (all lower-cases)" + "\033[0m") - raise Exception("\033[1m" + "Column name should be timestamps (all lower-cases)" + "\033[0m") - else: - flag = "event_csv" - elif len(cols) == 3: - arr1 = np.array(["timestamps", "data", "sampling_rate"]) - arr2 = np.char.lower(np.array(cols)) - if (np.sort(arr1) == np.sort(arr2)).all() == False: - logger.error( - "\033[1m" - + "Column names should be timestamps, data and sampling_rate (all lower-cases)" - + "\033[0m" - ) - raise Exception( - "\033[1m" - + "Column names should be timestamps, data and sampling_rate (all lower-cases)" - + "\033[0m" - ) - else: - flag = "data_csv" - elif len(cols) == 2: - flag = "event_or_data_np" - elif len(cols) >= 2: - flag = "data_np" - else: - logger.error("Number of columns in csv file does not make sense.") - raise Exception("Number of columns in csv file does not make sense.") - - if columns_isstr == True and ( - "flags" in np.char.lower(np.array(cols)) or "ledstate" in np.char.lower(np.array(cols)) - ): - flag = flag + "_v2" - else: - flag = flag - - # used assigned flags to process the files and read the data - if flag == "event_or_data_np": - arr = list(df.iloc[:, 1]) - check_float = [True for i in arr if isinstance(i, float)] - if len(arr) == len(check_float) and columns_isstr == False: - flag = "data_np" - elif columns_isstr == True and ("value" in np.char.lower(np.array(cols))): - flag = "event_np" - else: - flag = "event_np" - else: - pass - - flag_arr.append(flag) - logger.info(flag) - if flag == "event_csv" or flag == "data_csv": - name = os.path.basename(path[i]).split(".")[0] - event_from_filename.append(name) - elif flag == "data_np": - file = f"file{str(i)}_" - df, indices_dict, num_channels = decide_indices(file, df, 
flag, num_ch) - keys = list(indices_dict.keys()) - for k in range(len(keys)): - for j in range(df.shape[1]): - if j == 0: - timestamps = df.iloc[:, j][indices_dict[keys[k]]] - # timestamps_odd = df.iloc[:,j][odd_indices] - else: - d = dict() - d["timestamps"] = timestamps - d["data"] = df.iloc[:, j][indices_dict[keys[k]]] - - df_ch = pd.DataFrame(d) - df_ch.to_csv(os.path.join(dirname, keys[k] + str(j) + ".csv"), index=False) - event_from_filename.append(keys[k] + str(j)) - - elif flag == "event_np": - type_val = np.array(df.iloc[:, 1]) - type_val_unique = np.unique(type_val) - if headless: - response = 1 if bool(npm_split_events) else 0 - else: - window = tk.Tk() - if len(type_val_unique) > 1: - response = messagebox.askyesno( - "Multiple event TTLs", - "Based on the TTL file,\ - it looks like TTLs \ - belongs to multiple behavior type. \ - Do you want to create multiple files for each \ - behavior type ?", - ) - else: - response = 0 - window.destroy() - if response == 1: - timestamps = np.array(df.iloc[:, 0]) - for j in range(len(type_val_unique)): - idx = np.where(type_val == type_val_unique[j]) - d = dict() - d["timestamps"] = timestamps[idx] - df_new = pd.DataFrame(d) - df_new.to_csv(os.path.join(dirname, "event" + str(type_val_unique[j]) + ".csv"), index=False) - event_from_filename.append("event" + str(type_val_unique[j])) - else: - timestamps = np.array(df.iloc[:, 0]) - d = dict() - d["timestamps"] = timestamps - df_new = pd.DataFrame(d) - df_new.to_csv(os.path.join(dirname, "event" + str(0) + ".csv"), index=False) - event_from_filename.append("event" + str(0)) - else: - file = f"file{str(i)}_" - df, ts_unit = decide_ts_unit_for_npm( - df, timestamp_column_name=npm_timestamp_column_name, time_unit=npm_time_unit, headless=headless - ) - df, indices_dict, num_channels = decide_indices(file, df, flag) - keys = list(indices_dict.keys()) - for k in range(len(keys)): - for j in range(df.shape[1]): - if j == 0: - timestamps = df.iloc[:, j][indices_dict[keys[k]]] - # timestamps_odd = df.iloc[:,j][odd_indices] - else: - d = dict() - d["timestamps"] = timestamps - d["data"] = df.iloc[:, j][indices_dict[keys[k]]] - - df_ch = pd.DataFrame(d) - df_ch.to_csv(os.path.join(dirname, keys[k] + str(j) + ".csv"), index=False) - event_from_filename.append(keys[k] + str(j)) - - path_chev = glob.glob(os.path.join(filepath, "*chev*")) - path_chod = glob.glob(os.path.join(filepath, "*chod*")) - path_chpr = glob.glob(os.path.join(filepath, "*chpr*")) - path_event = glob.glob(os.path.join(filepath, "event*")) - # path_sig = glob.glob(os.path.join(filepath, 'sig*')) - path_chev_chod_chpr = [path_chev, path_chod, path_chpr] - if ( - ("data_np_v2" in flag_arr or "data_np" in flag_arr) - and ("event_np" in flag_arr) - and (i == len(path) - 1) - ) or ( - ("data_np_v2" in flag_arr or "data_np" in flag_arr) and (i == len(path) - 1) - ): # i==len(path)-1 and or 'event_np' in flag - num_path_chev, num_path_chod, num_path_chpr = len(path_chev), len(path_chod), len(path_chpr) - arr_len, no_ch = [], [] - for i in range(len(path_chev_chod_chpr)): - if len(path_chev_chod_chpr[i]) > 0: - arr_len.append(len(path_chev_chod_chpr[i])) - else: - continue - - unique_arr_len = np.unique(np.array(arr_len)) - if "data_np_v2" in flag_arr: - if ts_unit == "seconds": - divisor = 1 - elif ts_unit == "milliseconds": - divisor = 1e3 - else: - divisor = 1e6 - else: - divisor = 1000 - - for j in range(len(path_event)): - df_event = pd.read_csv(path_event[j]) - df_chev = pd.read_csv(path_chev[0]) - df_event["timestamps"] = 
(df_event["timestamps"] - df_chev["timestamps"][0]) / divisor - df_event.to_csv(path_event[j], index=False) - if unique_arr_len.shape[0] == 1: - for j in range(len(path_chev)): - if file + "chev" in indices_dict.keys(): - df_chev = pd.read_csv(path_chev[j]) - df_chev["timestamps"] = (df_chev["timestamps"] - df_chev["timestamps"][0]) / divisor - df_chev["sampling_rate"] = np.full(df_chev.shape[0], np.nan) - df_chev.at[0, "sampling_rate"] = df_chev.shape[0] / ( - df_chev["timestamps"].iloc[-1] - df_chev["timestamps"].iloc[0] - ) - df_chev.to_csv(path_chev[j], index=False) - - if file + "chod" in indices_dict.keys(): - df_chod = pd.read_csv(path_chod[j]) - df_chod["timestamps"] = df_chev["timestamps"] - df_chod["sampling_rate"] = np.full(df_chod.shape[0], np.nan) - df_chod.at[0, "sampling_rate"] = df_chev["sampling_rate"][0] - df_chod.to_csv(path_chod[j], index=False) - - if file + "chpr" in indices_dict.keys(): - df_chpr = pd.read_csv(path_chpr[j]) - df_chpr["timestamps"] = df_chev["timestamps"] - df_chpr["sampling_rate"] = np.full(df_chpr.shape[0], np.nan) - df_chpr.at[0, "sampling_rate"] = df_chev["sampling_rate"][0] - df_chpr.to_csv(path_chpr[j], index=False) - else: - logger.error("Number of channels should be same for all regions.") - raise Exception("Number of channels should be same for all regions.") - else: - pass - logger.info("Importing of either NPM or Doric or csv file is done.") - return event_from_filename, flag_arr - # function to read input parameters and run the saveStorenames function def execute(inputParameters): From a06cae4233657f52a77d5935928bcab9bceb6de7 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Mon, 17 Nov 2025 12:04:16 -0800 Subject: [PATCH 003/125] Split import_csv out from import_np_doric_csv --- src/guppy/csv_step2.py | 99 +++++++++++++++++++++++++++++++++++++ src/guppy/saveStoresList.py | 16 ++++-- 2 files changed, 111 insertions(+), 4 deletions(-) create mode 100644 src/guppy/csv_step2.py diff --git a/src/guppy/csv_step2.py b/src/guppy/csv_step2.py new file mode 100644 index 0000000..4d9b800 --- /dev/null +++ b/src/guppy/csv_step2.py @@ -0,0 +1,99 @@ +import glob +import logging +import os +import numpy as np +import pandas as pd + +logger = logging.getLogger(__name__) + +def check_header(df): + arr = list(df.columns) + check_float = [] + for i in arr: + try: + check_float.append(float(i)) + except: + pass + + return arr, check_float + +def import_csv_step2(filepath): + logger.debug("If it exists, importing either NPM or Doric or csv file based on the structure of file") + path = sorted(glob.glob(os.path.join(filepath, "*.csv"))) + + path = sorted(list(set(path))) + flag = "None" + event_from_filename = [] + flag_arr = [] + for i in range(len(path)): + ext = os.path.basename(path[i]).split(".")[-1] + assert ext == "csv", "Only .csv files are supported by import_csv function." + df = pd.read_csv(path[i], header=None, nrows=2, index_col=False, dtype=str) + df = df.dropna(axis=1, how="all") + df_arr = np.array(df).flatten() + check_all_str = [] + for element in df_arr: + try: + float(element) + except: + check_all_str.append(i) + assert len(check_all_str) != len(df_arr), "This file appears to be doric .csv. This function only supports standard .csv files." 
+ df = pd.read_csv(path[i], index_col=False) + + _, value = check_header(df) + + # check dataframe structure and read data accordingly + if len(value) > 0: + columns_isstr = False + df = pd.read_csv(path[i], header=None) + cols = np.array(list(df.columns), dtype=str) + else: + df = df + columns_isstr = True + cols = np.array(list(df.columns), dtype=str) + # check the structure of dataframe and assign flag to the type of file + if len(cols) == 1: + if cols[0].lower() != "timestamps": + logger.error("\033[1m" + "Column name should be timestamps (all lower-cases)" + "\033[0m") + raise Exception("\033[1m" + "Column name should be timestamps (all lower-cases)" + "\033[0m") + else: + flag = "event_csv" + elif len(cols) == 3: + arr1 = np.array(["timestamps", "data", "sampling_rate"]) + arr2 = np.char.lower(np.array(cols)) + if (np.sort(arr1) == np.sort(arr2)).all() == False: + logger.error( + "\033[1m" + + "Column names should be timestamps, data and sampling_rate (all lower-cases)" + + "\033[0m" + ) + raise Exception( + "\033[1m" + + "Column names should be timestamps, data and sampling_rate (all lower-cases)" + + "\033[0m" + ) + else: + flag = "data_csv" + elif len(cols) == 2: + raise ValueError("Data appears to be Neurophotometrics csv. Please use import_npm_csv function to import the data.") + elif len(cols) >= 2: + raise ValueError("Data appears to be Neurophotometrics csv. Please use import_npm_csv function to import the data.") + else: + logger.error("Number of columns in csv file does not make sense.") + raise Exception("Number of columns in csv file does not make sense.") + + if columns_isstr == True and ( + "flags" in np.char.lower(np.array(cols)) or "ledstate" in np.char.lower(np.array(cols)) + ): + flag = flag + "_v2" + else: + flag = flag + + flag_arr.append(flag) + logger.info(flag) + assert flag == "event_csv" or flag == "data_csv", "This function only supports standard event_csv and data_csv files." + name = os.path.basename(path[i]).split(".")[0] + event_from_filename.append(name) + + logger.info("Importing of csv file is done.") + return event_from_filename, flag_arr \ No newline at end of file diff --git a/src/guppy/saveStoresList.py b/src/guppy/saveStoresList.py index d7380ec..1f6bae7 100755 --- a/src/guppy/saveStoresList.py +++ b/src/guppy/saveStoresList.py @@ -21,8 +21,10 @@ import panel as pn from numpy import float32, float64, int32, int64, uint16 +from guppy.readTevTsq import import_csv from guppy.tdt_step2 import readtsq from guppy.np_doric_csv_step2 import import_np_doric_csv +from guppy.csv_step2 import import_csv_step2 # hv.extension() pn.extension() @@ -585,10 +587,16 @@ def execute(inputParameters): try: for i in folderNames: filepath = os.path.join(inputParameters["abspath"], i) - data = readtsq(filepath) - event_name, flag = import_np_doric_csv( - filepath, isosbestic_control, num_ch, inputParameters=inputParameters - ) + modality = "csv" # TODO: ask for modality from the user + if modality == "tdt": + data = readtsq(filepath) + event_name, flag = None, None + elif modality == "csv": + data = 0 + event_name, flag = import_csv_step2(filepath) + else: + raise ValueError("Modality not recognized. 
Please use 'tdt' or 'csv'.") + saveStorenames(inputParameters, data, event_name, flag, filepath) logger.info("#" * 400) except Exception as e: From 66d60e2aabf95eac48556d747dd8bbf2a26b0dd6 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Mon, 17 Nov 2025 12:58:33 -0800 Subject: [PATCH 004/125] Fixed TDT --- src/guppy/saveStoresList.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/guppy/saveStoresList.py b/src/guppy/saveStoresList.py index 1f6bae7..392c04e 100755 --- a/src/guppy/saveStoresList.py +++ b/src/guppy/saveStoresList.py @@ -587,10 +587,10 @@ def execute(inputParameters): try: for i in folderNames: filepath = os.path.join(inputParameters["abspath"], i) - modality = "csv" # TODO: ask for modality from the user + modality = "tdt" # TODO: ask for modality from the user if modality == "tdt": data = readtsq(filepath) - event_name, flag = None, None + event_name, flag = [], [] elif modality == "csv": data = 0 event_name, flag = import_csv_step2(filepath) From 4f4e1c921da919e28d5d595827f8bf397c74c5e4 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Mon, 17 Nov 2025 13:15:47 -0800 Subject: [PATCH 005/125] Split import_doric out from import_np_doric_csv --- src/guppy/doric_step2.py | 92 +++++++++++++++++++++++++++++++++++++ src/guppy/saveStoresList.py | 6 ++- 2 files changed, 97 insertions(+), 1 deletion(-) create mode 100644 src/guppy/doric_step2.py diff --git a/src/guppy/doric_step2.py b/src/guppy/doric_step2.py new file mode 100644 index 0000000..69022aa --- /dev/null +++ b/src/guppy/doric_step2.py @@ -0,0 +1,92 @@ +import glob +import logging +import os +import tkinter as tk +from tkinter import StringVar, messagebox, ttk + +import h5py +import numpy as np +import pandas as pd +import panel as pn + +pn.extension() + +logger = logging.getLogger(__name__) + +def import_doric(filepath): + + logger.debug("If it exists, importing Doric file based on the structure of file") + path = sorted(glob.glob(os.path.join(filepath, "*.csv"))) + sorted(glob.glob(os.path.join(filepath, "*.doric"))) + + path = sorted(list(set(path))) + flag = "None" + event_from_filename = [] + flag_arr = [] + for i in range(len(path)): + ext = os.path.basename(path[i]).split(".")[-1] + if ext == "doric": + key_names = read_doric(path[i]) + event_from_filename.extend(key_names) + flag = "doric_doric" + else: + df = pd.read_csv(path[i], header=None, nrows=2, index_col=False, dtype=str) + df = df.dropna(axis=1, how="all") + df_arr = np.array(df).flatten() + check_all_str = [] + for element in df_arr: + try: + float(element) + except: + check_all_str.append(i) + assert len(check_all_str) == len(df_arr), "This file appears to be standard .csv. This function only supports doric .csv files." 
+ df = pd.read_csv(path[i], header=1, index_col=False, nrows=10) + df = df.drop(["Time(s)"], axis=1) + event_from_filename.extend(list(df.columns)) + flag = "doric_csv" + logger.info(flag) + logger.info("Importing of Doric file is done.") + return event_from_filename, flag_arr + + +def read_doric(filepath): + with h5py.File(filepath, "r") as f: + if "Traces" in list(f.keys()): + keys = access_keys_doricV1(f) + elif list(f.keys()) == ["Configurations", "DataAcquisition"]: + keys = access_keys_doricV6(f) + + return keys + +def access_keys_doricV6(doric_file): + data = [doric_file["DataAcquisition"]] + res = [] + while len(data) != 0: + members = len(data) + while members != 0: + members -= 1 + data, last_element = separate_last_element(data) + if isinstance(last_element, h5py.Dataset) and not last_element.name.endswith("/Time"): + res.append(last_element.name) + elif isinstance(last_element, h5py.Group): + data.extend(reversed([last_element[k] for k in last_element.keys()])) + + keys = [] + for element in res: + sep_values = element.split("/") + if sep_values[-1] == "Values": + keys.append(f"{sep_values[-3]}/{sep_values[-2]}") + else: + keys.append(f"{sep_values[-2]}/{sep_values[-1]}") + + return keys + + +def access_keys_doricV1(doric_file): + keys = list(doric_file["Traces"]["Console"].keys()) + keys.remove("Time(s)") + + return keys + +def separate_last_element(arr): + l = arr[-1] + return arr[:-1], l \ No newline at end of file diff --git a/src/guppy/saveStoresList.py b/src/guppy/saveStoresList.py index 392c04e..26065e4 100755 --- a/src/guppy/saveStoresList.py +++ b/src/guppy/saveStoresList.py @@ -25,6 +25,7 @@ from guppy.tdt_step2 import readtsq from guppy.np_doric_csv_step2 import import_np_doric_csv from guppy.csv_step2 import import_csv_step2 +from guppy.doric_step2 import import_doric # hv.extension() pn.extension() @@ -587,13 +588,16 @@ def execute(inputParameters): try: for i in folderNames: filepath = os.path.join(inputParameters["abspath"], i) - modality = "tdt" # TODO: ask for modality from the user + modality = "doric" # TODO: ask for modality from the user if modality == "tdt": data = readtsq(filepath) event_name, flag = [], [] elif modality == "csv": data = 0 event_name, flag = import_csv_step2(filepath) + elif modality == "doric": + data = 0 + event_name, flag = import_doric(filepath) else: raise ValueError("Modality not recognized. 
Please use 'tdt' or 'csv'.") From 341d77d722844c63fdbfd4c189e446adf390c3f0 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Mon, 17 Nov 2025 13:18:35 -0800 Subject: [PATCH 006/125] Removed unnecessary imports --- src/guppy/doric_step2.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/guppy/doric_step2.py b/src/guppy/doric_step2.py index 69022aa..bf402d1 100644 --- a/src/guppy/doric_step2.py +++ b/src/guppy/doric_step2.py @@ -1,15 +1,10 @@ import glob import logging import os -import tkinter as tk -from tkinter import StringVar, messagebox, ttk import h5py import numpy as np import pandas as pd -import panel as pn - -pn.extension() logger = logging.getLogger(__name__) From 0bcd4fee319ba485519bd71f75a7c756bea36157 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Mon, 17 Nov 2025 13:40:01 -0800 Subject: [PATCH 007/125] Split import_npm out from import_np_doric_csv --- src/guppy/npm_step2.py | 408 ++++++++++++++++++++++++++++++++++++ src/guppy/saveStoresList.py | 6 +- 2 files changed, 413 insertions(+), 1 deletion(-) create mode 100644 src/guppy/npm_step2.py diff --git a/src/guppy/npm_step2.py b/src/guppy/npm_step2.py new file mode 100644 index 0000000..f0fafec --- /dev/null +++ b/src/guppy/npm_step2.py @@ -0,0 +1,408 @@ +import glob +import logging +import os +import tkinter as tk +from tkinter import StringVar, messagebox, ttk + +import numpy as np +import pandas as pd +import panel as pn + +pn.extension() + +logger = logging.getLogger(__name__) + +def import_npm(filepath, num_ch, inputParameters=None): + + logger.debug("If it exists, importing NPM file based on the structure of file") + # Headless configuration (used to avoid any UI prompts when running tests) + headless = bool(os.environ.get("GUPPY_BASE_DIR")) + npm_timestamp_column_name = None + npm_time_unit = None + npm_split_events = None + if isinstance(inputParameters, dict): + npm_timestamp_column_name = inputParameters.get("npm_timestamp_column_name") + npm_time_unit = inputParameters.get("npm_time_unit", "seconds") + npm_split_events = inputParameters.get("npm_split_events", True) + path = sorted(glob.glob(os.path.join(filepath, "*.csv"))) + sorted(glob.glob(os.path.join(filepath, "*.doric"))) + path_chev = glob.glob(os.path.join(filepath, "*chev*")) + path_chod = glob.glob(os.path.join(filepath, "*chod*")) + path_chpr = glob.glob(os.path.join(filepath, "*chpr*")) + path_event = glob.glob(os.path.join(filepath, "event*")) + # path_sig = glob.glob(os.path.join(filepath, 'sig*')) # TODO: what is this for? + path_chev_chod_event = path_chev + path_chod + path_event + path_chpr + + path = sorted(list(set(path) - set(path_chev_chod_event))) + flag = "None" + event_from_filename = [] + flag_arr = [] + for i in range(len(path)): + dirname = os.path.dirname(path[i]) + ext = os.path.basename(path[i]).split(".")[-1] + assert ext != "doric", "Doric files are not supported by import_npm function." + df = pd.read_csv(path[i], header=None, nrows=2, index_col=False, dtype=str) + df = df.dropna(axis=1, how="all") + df_arr = np.array(df).flatten() + check_all_str = [] + for element in df_arr: + try: + float(element) + except: + check_all_str.append(i) + assert len(check_all_str) != len(df_arr), "This file appears to be doric .csv. This function only supports NPM .csv files." 
+ df = pd.read_csv(path[i], index_col=False) + _, value = check_header(df) + + # check dataframe structure and read data accordingly + if len(value) > 0: + columns_isstr = False + df = pd.read_csv(path[i], header=None) + cols = np.array(list(df.columns), dtype=str) + else: + df = df + columns_isstr = True + cols = np.array(list(df.columns), dtype=str) + # check the structure of dataframe and assign flag to the type of file + assert len(cols) != 1, "File appears to be event .csv. This function only supports NPM .csv files." + assert len(cols) != 3, "File appears to be data .csv. This function only supports NPM .csv files." + if len(cols) == 2: + flag = "event_or_data_np" + elif len(cols) >= 2: + flag = "data_np" + else: + logger.error("Number of columns in csv file does not make sense.") + raise Exception("Number of columns in csv file does not make sense.") + + if columns_isstr == True and ( + "flags" in np.char.lower(np.array(cols)) or "ledstate" in np.char.lower(np.array(cols)) + ): + flag = flag + "_v2" + else: + flag = flag + + # used assigned flags to process the files and read the data + if flag == "event_or_data_np": + arr = list(df.iloc[:, 1]) + check_float = [True for i in arr if isinstance(i, float)] + if len(arr) == len(check_float) and columns_isstr == False: + flag = "data_np" + elif columns_isstr == True and ("value" in np.char.lower(np.array(cols))): + flag = "event_np" + else: + flag = "event_np" + + flag_arr.append(flag) + logger.info(flag) + if flag == "data_np": + file = f"file{str(i)}_" + df, indices_dict, _ = decide_indices(file, df, flag, num_ch) + keys = list(indices_dict.keys()) + for k in range(len(keys)): + for j in range(df.shape[1]): + if j == 0: + timestamps = df.iloc[:, j][indices_dict[keys[k]]] + # timestamps_odd = df.iloc[:,j][odd_indices] + else: + d = dict() + d["timestamps"] = timestamps + d["data"] = df.iloc[:, j][indices_dict[keys[k]]] + + df_ch = pd.DataFrame(d) + df_ch.to_csv(os.path.join(dirname, keys[k] + str(j) + ".csv"), index=False) + event_from_filename.append(keys[k] + str(j)) + + elif flag == "event_np": + type_val = np.array(df.iloc[:, 1]) + type_val_unique = np.unique(type_val) + if headless: + response = 1 if bool(npm_split_events) else 0 + else: + window = tk.Tk() + if len(type_val_unique) > 1: + response = messagebox.askyesno( + "Multiple event TTLs", + "Based on the TTL file,\ + it looks like TTLs \ + belongs to multiple behavior type. 
\ + Do you want to create multiple files for each \ + behavior type ?", + ) + else: + response = 0 + window.destroy() + if response == 1: + timestamps = np.array(df.iloc[:, 0]) + for j in range(len(type_val_unique)): + idx = np.where(type_val == type_val_unique[j]) + d = dict() + d["timestamps"] = timestamps[idx] + df_new = pd.DataFrame(d) + df_new.to_csv(os.path.join(dirname, "event" + str(type_val_unique[j]) + ".csv"), index=False) + event_from_filename.append("event" + str(type_val_unique[j])) + else: + timestamps = np.array(df.iloc[:, 0]) + d = dict() + d["timestamps"] = timestamps + df_new = pd.DataFrame(d) + df_new.to_csv(os.path.join(dirname, "event" + str(0) + ".csv"), index=False) + event_from_filename.append("event" + str(0)) + else: + file = f"file{str(i)}_" + df, ts_unit = decide_ts_unit_for_npm( + df, timestamp_column_name=npm_timestamp_column_name, time_unit=npm_time_unit, headless=headless + ) + df, indices_dict, _ = decide_indices(file, df, flag) + keys = list(indices_dict.keys()) + for k in range(len(keys)): + for j in range(df.shape[1]): + if j == 0: + timestamps = df.iloc[:, j][indices_dict[keys[k]]] + # timestamps_odd = df.iloc[:,j][odd_indices] + else: + d = dict() + d["timestamps"] = timestamps + d["data"] = df.iloc[:, j][indices_dict[keys[k]]] + + df_ch = pd.DataFrame(d) + df_ch.to_csv(os.path.join(dirname, keys[k] + str(j) + ".csv"), index=False) + event_from_filename.append(keys[k] + str(j)) + + path_chev = glob.glob(os.path.join(filepath, "*chev*")) + path_chod = glob.glob(os.path.join(filepath, "*chod*")) + path_chpr = glob.glob(os.path.join(filepath, "*chpr*")) + path_event = glob.glob(os.path.join(filepath, "event*")) + # path_sig = glob.glob(os.path.join(filepath, 'sig*')) + path_chev_chod_chpr = [path_chev, path_chod, path_chpr] + if ( + ("data_np_v2" in flag_arr or "data_np" in flag_arr) + and ("event_np" in flag_arr) + and (i == len(path) - 1) + ) or ( + ("data_np_v2" in flag_arr or "data_np" in flag_arr) and (i == len(path) - 1) + ): # i==len(path)-1 and or 'event_np' in flag + num_path_chev, num_path_chod, num_path_chpr = len(path_chev), len(path_chod), len(path_chpr) + arr_len, no_ch = [], [] + for i in range(len(path_chev_chod_chpr)): + if len(path_chev_chod_chpr[i]) > 0: + arr_len.append(len(path_chev_chod_chpr[i])) + else: + continue + + unique_arr_len = np.unique(np.array(arr_len)) + if "data_np_v2" in flag_arr: + if ts_unit == "seconds": + divisor = 1 + elif ts_unit == "milliseconds": + divisor = 1e3 + else: + divisor = 1e6 + else: + divisor = 1000 + + for j in range(len(path_event)): + df_event = pd.read_csv(path_event[j]) + df_chev = pd.read_csv(path_chev[0]) + df_event["timestamps"] = (df_event["timestamps"] - df_chev["timestamps"][0]) / divisor + df_event.to_csv(path_event[j], index=False) + if unique_arr_len.shape[0] == 1: + for j in range(len(path_chev)): + if file + "chev" in indices_dict.keys(): + df_chev = pd.read_csv(path_chev[j]) + df_chev["timestamps"] = (df_chev["timestamps"] - df_chev["timestamps"][0]) / divisor + df_chev["sampling_rate"] = np.full(df_chev.shape[0], np.nan) + df_chev.at[0, "sampling_rate"] = df_chev.shape[0] / ( + df_chev["timestamps"].iloc[-1] - df_chev["timestamps"].iloc[0] + ) + df_chev.to_csv(path_chev[j], index=False) + + if file + "chod" in indices_dict.keys(): + df_chod = pd.read_csv(path_chod[j]) + df_chod["timestamps"] = df_chev["timestamps"] + df_chod["sampling_rate"] = np.full(df_chod.shape[0], np.nan) + df_chod.at[0, "sampling_rate"] = df_chev["sampling_rate"][0] + df_chod.to_csv(path_chod[j], index=False) 
+ + if file + "chpr" in indices_dict.keys(): + df_chpr = pd.read_csv(path_chpr[j]) + df_chpr["timestamps"] = df_chev["timestamps"] + df_chpr["sampling_rate"] = np.full(df_chpr.shape[0], np.nan) + df_chpr.at[0, "sampling_rate"] = df_chev["sampling_rate"][0] + df_chpr.to_csv(path_chpr[j], index=False) + else: + logger.error("Number of channels should be same for all regions.") + raise Exception("Number of channels should be same for all regions.") + logger.info("Importing of NPM file is done.") + return event_from_filename, flag_arr + +def check_header(df): + arr = list(df.columns) + check_float = [] + for i in arr: + try: + check_float.append(float(i)) + except: + pass + + return arr, check_float + + +# function to decide indices of interleaved channels +# in neurophotometrics data +def decide_indices(file, df, flag, num_ch=2): + ch_name = [file + "chev", file + "chod", file + "chpr"] + if len(ch_name) < num_ch: + logger.error( + "Number of channels parameters in Input Parameters GUI is more than 3. \ + Looks like there are more than 3 channels in the file. Reading of these files\ + are not supported. Reach out to us if you get this error message." + ) + raise Exception( + "Number of channels parameters in Input Parameters GUI is more than 3. \ + Looks like there are more than 3 channels in the file. Reading of these files\ + are not supported. Reach out to us if you get this error message." + ) + if flag == "data_np": + indices_dict = dict() + for i in range(num_ch): + indices_dict[ch_name[i]] = np.arange(i, df.shape[0], num_ch) + + else: + cols = np.array(list(df.columns)) + if "flags" in np.char.lower(np.array(cols)): + arr = ["FrameCounter", "Flags"] + state = np.array(df["Flags"]) + elif "ledstate" in np.char.lower(np.array(cols)): + arr = ["FrameCounter", "LedState"] + state = np.array(df["LedState"]) + else: + logger.error( + "File type shows Neurophotometrics newer version \ + data but column names does not have Flags or LedState" + ) + raise Exception( + "File type shows Neurophotometrics newer version \ + data but column names does not have Flags or LedState" + ) + + num_ch, ch = check_channels(state) + indices_dict = dict() + for i in range(num_ch): + first_occurrence = np.where(state == ch[i])[0] + indices_dict[ch_name[i]] = np.arange(first_occurrence[0], df.shape[0], num_ch) + + df = df.drop(arr, axis=1) + + return df, indices_dict, num_ch + +# check flag consistency in neurophotometrics data +def check_channels(state): + state = state.astype(int) + unique_state = np.unique(state[2:12]) + if unique_state.shape[0] > 3: + logger.error( + "Looks like there are more than 3 channels in the file. Reading of these files\ + are not supported. Reach out to us if you get this error message." + ) + raise Exception( + "Looks like there are more than 3 channels in the file. Reading of these files\ + are not supported. Reach out to us if you get this error message." 
+ ) + + return unique_state.shape[0], unique_state + + +# function to decide NPM timestamps unit (seconds, ms or us) +def decide_ts_unit_for_npm(df, timestamp_column_name=None, time_unit=None, headless=False): + col_names = np.array(list(df.columns)) + col_names_ts = [""] + for name in col_names: + if "timestamp" in name.lower(): + col_names_ts.append(name) + + ts_unit = "seconds" + if len(col_names_ts) > 2: + # Headless path: auto-select column/unit without any UI + if headless: + if timestamp_column_name is not None: + assert ( + timestamp_column_name in col_names_ts + ), f"Provided timestamp_column_name '{timestamp_column_name}' not found in columns {col_names_ts[1:]}" + chosen = timestamp_column_name + else: + chosen = col_names_ts[1] + df.insert(1, "Timestamp", df[chosen]) + df = df.drop(col_names_ts[1:], axis=1) + valid_units = {"seconds", "milliseconds", "microseconds"} + ts_unit = time_unit if (isinstance(time_unit, str) and time_unit in valid_units) else "seconds" + return df, ts_unit + # def comboBoxSelected(event): + # logger.info(event.widget.get()) + + window = tk.Tk() + window.title("Select appropriate options for timestamps") + window.geometry("500x200") + holdComboboxValues = dict() + + timestamps_label = ttk.Label(window, text="Select which timestamps to use : ").grid( + row=0, column=1, pady=25, padx=25 + ) + holdComboboxValues["timestamps"] = StringVar() + timestamps_combo = ttk.Combobox(window, values=col_names_ts, textvariable=holdComboboxValues["timestamps"]) + timestamps_combo.grid(row=0, column=2, pady=25, padx=25) + timestamps_combo.current(0) + # timestamps_combo.bind("<>", comboBoxSelected) + + time_unit_label = ttk.Label(window, text="Select timestamps unit : ").grid(row=1, column=1, pady=25, padx=25) + holdComboboxValues["time_unit"] = StringVar() + time_unit_combo = ttk.Combobox( + window, values=["", "seconds", "milliseconds", "microseconds"], textvariable=holdComboboxValues["time_unit"] + ) + time_unit_combo.grid(row=1, column=2, pady=25, padx=25) + time_unit_combo.current(0) + # time_unit_combo.bind("<>", comboBoxSelected) + window.lift() + window.after(500, lambda: window.lift()) + window.mainloop() + + if holdComboboxValues["timestamps"].get(): + df.insert(1, "Timestamp", df[holdComboboxValues["timestamps"].get()]) + df = df.drop(col_names_ts[1:], axis=1) + else: + messagebox.showerror( + "All options not selected", + "All the options for timestamps \ + were not selected. Please select appropriate options", + ) + logger.error( + "All the options for timestamps \ + were not selected. Please select appropriate options" + ) + raise Exception( + "All the options for timestamps \ + were not selected. Please select appropriate options" + ) + if holdComboboxValues["time_unit"].get(): + if holdComboboxValues["time_unit"].get() == "seconds": + ts_unit = holdComboboxValues["time_unit"].get() + elif holdComboboxValues["time_unit"].get() == "milliseconds": + ts_unit = holdComboboxValues["time_unit"].get() + else: + ts_unit = holdComboboxValues["time_unit"].get() + else: + messagebox.showerror( + "All options not selected", + "All the options for timestamps \ + were not selected. Please select appropriate options", + ) + logger.error( + "All the options for timestamps \ + were not selected. Please select appropriate options" + ) + raise Exception( + "All the options for timestamps \ + were not selected. 
Please select appropriate options" + ) + else: + pass + + return df, ts_unit \ No newline at end of file diff --git a/src/guppy/saveStoresList.py b/src/guppy/saveStoresList.py index 26065e4..db9a4fc 100755 --- a/src/guppy/saveStoresList.py +++ b/src/guppy/saveStoresList.py @@ -26,6 +26,7 @@ from guppy.np_doric_csv_step2 import import_np_doric_csv from guppy.csv_step2 import import_csv_step2 from guppy.doric_step2 import import_doric +from guppy.npm_step2 import import_npm # hv.extension() pn.extension() @@ -588,7 +589,7 @@ def execute(inputParameters): try: for i in folderNames: filepath = os.path.join(inputParameters["abspath"], i) - modality = "doric" # TODO: ask for modality from the user + modality = "npm" # TODO: ask for modality from the user if modality == "tdt": data = readtsq(filepath) event_name, flag = [], [] @@ -598,6 +599,9 @@ def execute(inputParameters): elif modality == "doric": data = 0 event_name, flag = import_doric(filepath) + elif modality == "npm": + data = 0 + event_name, flag = import_npm(filepath, num_ch) else: raise ValueError("Modality not recognized. Please use 'tdt' or 'csv'.") From 7b36f64266a7b5c35b78310272f65fcecd6a6d3b Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Mon, 17 Nov 2025 13:50:35 -0800 Subject: [PATCH 008/125] Added modality selector to the GUI. --- src/guppy/saveStoresList.py | 4 ++-- src/guppy/savingInputParameters.py | 18 +++++++++++++++++- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/src/guppy/saveStoresList.py b/src/guppy/saveStoresList.py index db9a4fc..f9921f9 100755 --- a/src/guppy/saveStoresList.py +++ b/src/guppy/saveStoresList.py @@ -589,7 +589,7 @@ def execute(inputParameters): try: for i in folderNames: filepath = os.path.join(inputParameters["abspath"], i) - modality = "npm" # TODO: ask for modality from the user + modality = inputParameters.get("modality", "tdt") if modality == "tdt": data = readtsq(filepath) event_name, flag = [], [] @@ -603,7 +603,7 @@ def execute(inputParameters): data = 0 event_name, flag = import_npm(filepath, num_ch) else: - raise ValueError("Modality not recognized. Please use 'tdt' or 'csv'.") + raise ValueError("Modality not recognized. Please use 'tdt', 'csv', 'doric', or 'npm'.") saveStorenames(inputParameters, data, event_name, flag, filepath) logger.info("#" * 400) diff --git a/src/guppy/savingInputParameters.py b/src/guppy/savingInputParameters.py index cd515ab..b0a5feb 100644 --- a/src/guppy/savingInputParameters.py +++ b/src/guppy/savingInputParameters.py @@ -119,6 +119,21 @@ def readPBIncrementValues(progressBar): files_1 = pn.widgets.FileSelector(folder_path, name="folderNames", width=950) + explain_modality = pn.pane.Markdown( + """ + **Data Modality:** Select the type of data acquisition system used for your recordings: + - **tdt**: Tucker-Davis Technologies system + - **csv**: Generic CSV format + - **doric**: Doric Photometry system + - **npm**: Neurophotometrics system + """, + width=600, + ) + + modality_selector = pn.widgets.Select( + name="Data Modality", value="tdt", options=["tdt", "csv", "doric", "npm"], width=320 + ) + explain_time_artifacts = pn.pane.Markdown( """ - ***Number of cores :*** Number of cores used for analysis. 
Try to @@ -357,6 +372,7 @@ def getInputParameters(): inputParameters = { "abspath": abspath[0], "folderNames": files_1.value, + "modality": modality_selector.value, "numberOfCores": numberOfCores.value, "combine_data": combine_data.value, "isosbestic_control": isosbestic_control.value, @@ -538,7 +554,7 @@ def onclickpsth(event=None): psth_baseline_param = pn.Column(zscore_param_wd, psth_param_wd, baseline_param_wd, peak_param_wd) - widget = pn.Column(mark_down_1, files_1, pn.Row(individual_analysis_wd_2, psth_baseline_param)) + widget = pn.Column(mark_down_1, files_1, explain_modality, modality_selector, pn.Row(individual_analysis_wd_2, psth_baseline_param)) # file_selector = pn.WidgetBox(files_1) styles = dict(background="WhiteSmoke") From 100ad14058e8f07a48aee74083e6f04d46a027fa Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Mon, 17 Nov 2025 13:52:00 -0800 Subject: [PATCH 009/125] Added modality selector to the GUI. --- src/guppy/saveStoresList.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/guppy/saveStoresList.py b/src/guppy/saveStoresList.py index f9921f9..72dc604 100755 --- a/src/guppy/saveStoresList.py +++ b/src/guppy/saveStoresList.py @@ -583,13 +583,13 @@ def execute(inputParameters): folderNames = inputParameters["folderNames"] isosbestic_control = inputParameters["isosbestic_control"] num_ch = inputParameters["noChannels"] + modality = inputParameters.get("modality", "tdt") logger.info(folderNames) try: for i in folderNames: filepath = os.path.join(inputParameters["abspath"], i) - modality = inputParameters.get("modality", "tdt") if modality == "tdt": data = readtsq(filepath) event_name, flag = [], [] From ef978ec2cb8f7e51b9eacb8ce2d6f88bf73e01ea Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Mon, 17 Nov 2025 14:17:33 -0800 Subject: [PATCH 010/125] Added modality option to the api and tests --- src/guppy/testing/api.py | 7 +++++++ tests/test_step2.py | 17 +++++++++++++++-- tests/test_step3.py | 18 ++++++++++++++++-- tests/test_step4.py | 19 +++++++++++++++++-- tests/test_step5.py | 20 ++++++++++++++++++-- 5 files changed, 73 insertions(+), 8 deletions(-) diff --git a/src/guppy/testing/api.py b/src/guppy/testing/api.py index 587a022..0e16f23 100644 --- a/src/guppy/testing/api.py +++ b/src/guppy/testing/api.py @@ -68,6 +68,7 @@ def step2( base_dir: str, selected_folders: Iterable[str], storenames_map: dict[str, str], + modality: str = "tdt", npm_timestamp_column_name: str | None = None, npm_time_unit: str = "seconds", npm_split_events: bool = True, @@ -150,6 +151,9 @@ def step2( # Inject storenames mapping for headless execution input_params["storenames_map"] = dict(storenames_map) + # Inject modality + input_params["modality"] = modality + # Add npm parameters input_params["npm_timestamp_column_name"] = npm_timestamp_column_name input_params["npm_time_unit"] = npm_time_unit @@ -163,6 +167,7 @@ def step3( *, base_dir: str, selected_folders: Iterable[str], + modality: str = "tdt", npm_timestamp_column_name: str | None = None, npm_time_unit: str = "seconds", npm_split_events: bool = True, @@ -240,6 +245,7 @@ def step4( *, base_dir: str, selected_folders: Iterable[str], + modality: str = "tdt", npm_timestamp_column_name: str | None = None, npm_time_unit: str = "seconds", npm_split_events: bool = True, @@ -317,6 +323,7 @@ def step5( *, base_dir: str, selected_folders: Iterable[str], + modality: str = "tdt", npm_timestamp_column_name: str | None = None, npm_time_unit: str = "seconds", npm_split_events: bool = True, diff --git a/tests/test_step2.py 
b/tests/test_step2.py index 55181ab..34777be 100644 --- a/tests/test_step2.py +++ b/tests/test_step2.py @@ -10,7 +10,7 @@ @pytest.mark.parametrize( - "session_subdir, storenames_map", + "session_subdir, storenames_map, modality", [ ( "SampleData_csv/sample_data_csv_1", @@ -19,6 +19,7 @@ "Sample_Signal_Channel": "signal_region", "Sample_TTL": "ttl", }, + "csv", ), ( "SampleData_Doric/sample_doric_1", @@ -27,6 +28,7 @@ "AIn-2 - Raw": "signal_region", "DI--O-1": "ttl", }, + "doric", ), ( "SampleData_Doric/sample_doric_2", @@ -35,6 +37,7 @@ "AIn-1 - Dem (da)": "signal_region", "DI/O-1": "ttl", }, + "doric", ), ( "SampleData_Doric/sample_doric_3", @@ -43,6 +46,7 @@ "CAM1_EXC2/ROI01": "signal_region", "DigitalIO/CAM1": "ttl", }, + "doric", ), ( "SampleData_Doric/sample_doric_4", @@ -50,6 +54,7 @@ "Series0001/AIN01xAOUT01-LockIn": "control_region", "Series0001/AIN01xAOUT02-LockIn": "signal_region", }, + "doric", ), ( "SampleData_Doric/sample_doric_5", @@ -57,6 +62,7 @@ "Series0001/AIN01xAOUT01-LockIn": "control_region", "Series0001/AIN01xAOUT02-LockIn": "signal_region", }, + "doric", ), ( "SampleData_Clean/Photo_63_207-181030-103332", @@ -65,6 +71,7 @@ "Dv2A": "signal_dms", "PrtN": "port_entries_dms", }, + "tdt", ), ( "SampleData_with_artifacts/Photo_048_392-200728-121222", @@ -73,6 +80,7 @@ "Dv2A": "signal_dms", "PrtN": "port_entries_dms", }, + "tdt", ), # TODO: Add sampleData_NPM_1 after fixing Doric vs. NPM determination bug. ( @@ -81,6 +89,7 @@ "file0_chev6": "control_region", "file1_chev6": "signal_region", }, + "npm", ), ( "SampleData_Neurophotometrics/sampleData_NPM_3", @@ -89,6 +98,7 @@ "file0_chod3": "signal_region3", "event3": "ttl_region3", }, + "npm", ), ( "SampleData_Neurophotometrics/sampleData_NPM_4", @@ -97,6 +107,7 @@ "file0_chod1": "signal_region1", "eventTrue": "ttl_true_region1", }, + "npm", ), ( "SampleData_Neurophotometrics/sampleData_NPM_5", @@ -105,6 +116,7 @@ "file0_chod1": "signal_region1", "event0": "ttl_region1", }, + "npm", ), ], ids=[ @@ -122,7 +134,7 @@ "sample_npm_5", ], ) -def test_step2(tmp_path, session_subdir, storenames_map): +def test_step2(tmp_path, session_subdir, storenames_map, modality): """ Step 2 integration test (Save Storenames) using real sample data, isolated to a temporary workspace. 
For each dataset: @@ -170,6 +182,7 @@ def test_step2(tmp_path, session_subdir, storenames_map): base_dir=str(tmp_base), selected_folders=[str(session_copy)], storenames_map=storenames_map, + modality=modality, npm_timestamp_column_name=npm_timestamp_column_name, npm_time_unit=npm_time_unit, npm_split_events=npm_split_events, diff --git a/tests/test_step3.py b/tests/test_step3.py index 655fb10..d167585 100644 --- a/tests/test_step3.py +++ b/tests/test_step3.py @@ -20,7 +20,7 @@ def storenames_map(): @pytest.mark.parametrize( - "session_subdir, storenames_map", + "session_subdir, storenames_map, modality", [ ( "SampleData_csv/sample_data_csv_1", @@ -29,6 +29,7 @@ def storenames_map(): "Sample_Signal_Channel": "signal_region", "Sample_TTL": "ttl", }, + "csv", ), ( "SampleData_Doric/sample_doric_1", @@ -37,6 +38,7 @@ def storenames_map(): "AIn-2 - Raw": "signal_region", "DI--O-1": "ttl", }, + "doric", ), ( "SampleData_Doric/sample_doric_2", @@ -45,6 +47,7 @@ def storenames_map(): "AIn-1 - Dem (da)": "signal_region", "DI/O-1": "ttl", }, + "doric", ), ( "SampleData_Doric/sample_doric_3", @@ -53,6 +56,7 @@ def storenames_map(): "CAM1_EXC2/ROI01": "signal_region", "DigitalIO/CAM1": "ttl", }, + "doric", ), ( "SampleData_Doric/sample_doric_4", @@ -60,6 +64,7 @@ def storenames_map(): "Series0001/AIN01xAOUT01-LockIn": "control_region", "Series0001/AIN01xAOUT02-LockIn": "signal_region", }, + "doric", ), ( "SampleData_Doric/sample_doric_5", @@ -67,6 +72,7 @@ def storenames_map(): "Series0001/AIN01xAOUT01-LockIn": "control_region", "Series0001/AIN01xAOUT02-LockIn": "signal_region", }, + "doric", ), ( "SampleData_Clean/Photo_63_207-181030-103332", @@ -75,6 +81,7 @@ def storenames_map(): "Dv2A": "signal_dms", "PrtN": "port_entries_dms", }, + "tdt", ), ( "SampleData_with_artifacts/Photo_048_392-200728-121222", @@ -83,6 +90,7 @@ def storenames_map(): "Dv2A": "signal_dms", "PrtN": "port_entries_dms", }, + "tdt", ), ( "SampleData_Neurophotometrics/sampleData_NPM_2", @@ -90,6 +98,7 @@ def storenames_map(): "file0_chev6": "control_region", "file1_chev6": "signal_region", }, + "npm", ), ( "SampleData_Neurophotometrics/sampleData_NPM_3", @@ -98,6 +107,7 @@ def storenames_map(): "file0_chod3": "signal_region3", "event3": "ttl_region3", }, + "npm", ), ( "SampleData_Neurophotometrics/sampleData_NPM_4", @@ -106,6 +116,7 @@ def storenames_map(): "file0_chod1": "signal_region1", "eventTrue": "ttl_true_region1", }, + "npm", ), ( "SampleData_Neurophotometrics/sampleData_NPM_5", @@ -114,6 +125,7 @@ def storenames_map(): "file0_chod1": "signal_region1", "event0": "ttl_region1", }, + "npm", ), ], ids=[ @@ -131,7 +143,7 @@ def storenames_map(): "sample_npm_5", ], ) -def test_step3(tmp_path, storenames_map, session_subdir): +def test_step3(tmp_path, storenames_map, session_subdir, modality): """ Full integration test for Step 3 (Read Raw Data) using real CSV sample data, isolated to a temporary workspace to avoid mutating shared sample data. 
@@ -182,6 +194,7 @@ def test_step3(tmp_path, storenames_map, session_subdir): base_dir=str(tmp_base), selected_folders=[str(session_copy)], storenames_map=storenames_map, + modality=modality, npm_timestamp_column_name=npm_timestamp_column_name, npm_time_unit=npm_time_unit, npm_split_events=npm_split_events, @@ -191,6 +204,7 @@ def test_step3(tmp_path, storenames_map, session_subdir): step3( base_dir=str(tmp_base), selected_folders=[str(session_copy)], + modality=modality, npm_timestamp_column_name=npm_timestamp_column_name, npm_time_unit=npm_time_unit, npm_split_events=npm_split_events, diff --git a/tests/test_step4.py b/tests/test_step4.py index 9a2e9bb..80c2d3f 100644 --- a/tests/test_step4.py +++ b/tests/test_step4.py @@ -10,7 +10,7 @@ @pytest.mark.parametrize( - "session_subdir, storenames_map, expected_region, expected_ttl", + "session_subdir, storenames_map, expected_region, expected_ttl, modality", [ ( "SampleData_csv/sample_data_csv_1", @@ -21,6 +21,7 @@ }, "region", "ttl", + "csv", ), ( "SampleData_Doric/sample_doric_1", @@ -31,6 +32,7 @@ }, "region", "ttl", + "doric", ), ( "SampleData_Doric/sample_doric_2", @@ -41,6 +43,7 @@ }, "region", "ttl", + "doric", ), ( "SampleData_Doric/sample_doric_3", @@ -51,6 +54,7 @@ }, "region", "ttl", + "doric", ), ( "SampleData_Doric/sample_doric_4", @@ -60,6 +64,7 @@ }, "region", None, + "doric", ), ( "SampleData_Doric/sample_doric_5", @@ -69,6 +74,7 @@ }, "region", None, + "doric", ), ( "SampleData_Clean/Photo_63_207-181030-103332", @@ -79,6 +85,7 @@ }, "dms", "port_entries_dms", + "tdt", ), ( "SampleData_with_artifacts/Photo_048_392-200728-121222", @@ -89,6 +96,7 @@ }, "dms", "port_entries_dms", + "tdt", ), ( "SampleData_Neurophotometrics/sampleData_NPM_2", @@ -98,6 +106,7 @@ }, "region", None, + "npm", ), ( "SampleData_Neurophotometrics/sampleData_NPM_3", @@ -108,6 +117,7 @@ }, "region3", "ttl_region3", + "npm", ), ( "SampleData_Neurophotometrics/sampleData_NPM_4", @@ -118,6 +128,7 @@ }, "region1", "ttl_true_region1", + "npm", ), ( "SampleData_Neurophotometrics/sampleData_NPM_5", @@ -128,6 +139,7 @@ }, "region1", "ttl_region1", + "npm", ), ], ids=[ @@ -146,7 +158,7 @@ ], ) @pytest.mark.filterwarnings("ignore::UserWarning") -def test_step4(tmp_path, monkeypatch, session_subdir, storenames_map, expected_region, expected_ttl): +def test_step4(tmp_path, monkeypatch, session_subdir, storenames_map, expected_region, expected_ttl, modality): """ Full integration test for Step 4 (Extract timestamps and signal) using real CSV sample data, isolated to a temporary workspace to avoid mutating shared sample data. 
@@ -202,6 +214,7 @@ def test_step4(tmp_path, monkeypatch, session_subdir, storenames_map, expected_r base_dir=str(tmp_base), selected_folders=[str(session_copy)], storenames_map=storenames_map, + modality=modality, npm_timestamp_column_name=npm_timestamp_column_name, npm_time_unit=npm_time_unit, npm_split_events=npm_split_events, @@ -211,6 +224,7 @@ def test_step4(tmp_path, monkeypatch, session_subdir, storenames_map, expected_r step3( base_dir=str(tmp_base), selected_folders=[str(session_copy)], + modality=modality, npm_timestamp_column_name=npm_timestamp_column_name, npm_time_unit=npm_time_unit, npm_split_events=npm_split_events, @@ -220,6 +234,7 @@ def test_step4(tmp_path, monkeypatch, session_subdir, storenames_map, expected_r step4( base_dir=str(tmp_base), selected_folders=[str(session_copy)], + modality=modality, npm_timestamp_column_name=npm_timestamp_column_name, npm_time_unit=npm_time_unit, npm_split_events=npm_split_events, diff --git a/tests/test_step5.py b/tests/test_step5.py index 5593ee0..d2de1f5 100644 --- a/tests/test_step5.py +++ b/tests/test_step5.py @@ -10,7 +10,7 @@ @pytest.mark.parametrize( - "session_subdir, storenames_map, expected_region, expected_ttl", + "session_subdir, storenames_map, expected_region, expected_ttl, modality", [ ( "SampleData_csv/sample_data_csv_1", @@ -21,6 +21,7 @@ }, "region", "ttl", + "csv", ), ( "SampleData_Doric/sample_doric_1", @@ -31,6 +32,7 @@ }, "region", "ttl", + "doric", ), ( "SampleData_Doric/sample_doric_2", @@ -41,6 +43,7 @@ }, "region", "ttl", + "doric", ), ( "SampleData_Doric/sample_doric_3", @@ -51,6 +54,7 @@ }, "region", "ttl", + "doric", ), ( "SampleData_Doric/sample_doric_4", @@ -60,6 +64,7 @@ }, "region", None, + "doric", ), ( "SampleData_Doric/sample_doric_5", @@ -69,6 +74,7 @@ }, "region", None, + "doric", ), ( "SampleData_Clean/Photo_63_207-181030-103332", @@ -79,6 +85,7 @@ }, "dms", "port_entries_dms", + "tdt", ), ( "SampleData_with_artifacts/Photo_048_392-200728-121222", @@ -89,6 +96,7 @@ }, "dms", "port_entries_dms", + "tdt", ), ( "SampleData_Neurophotometrics/sampleData_NPM_2", @@ -98,6 +106,7 @@ }, "region", None, + "npm", ), ( "SampleData_Neurophotometrics/sampleData_NPM_3", @@ -108,6 +117,7 @@ }, "region3", "ttl_region3", + "npm", ), ( "SampleData_Neurophotometrics/sampleData_NPM_4", @@ -118,6 +128,7 @@ }, "region1", "ttl_true_region1", + "npm", ), ( "SampleData_Neurophotometrics/sampleData_NPM_5", @@ -128,6 +139,7 @@ }, "region1", "ttl_region1", + "npm", ), ], ids=[ @@ -146,7 +158,7 @@ ], ) @pytest.mark.filterwarnings("ignore::UserWarning") -def test_step5(tmp_path, monkeypatch, session_subdir, storenames_map, expected_region, expected_ttl): +def test_step5(tmp_path, monkeypatch, session_subdir, storenames_map, expected_region, expected_ttl, modality): """ Full integration test for Step 5 (PSTH Computation) using real CSV sample data, isolated to a temporary workspace to avoid mutating shared sample data. 
@@ -204,6 +216,7 @@ def test_step5(tmp_path, monkeypatch, session_subdir, storenames_map, expected_r base_dir=str(tmp_base), selected_folders=[str(session_copy)], storenames_map=storenames_map, + modality=modality, npm_timestamp_column_name=npm_timestamp_column_name, npm_time_unit=npm_time_unit, npm_split_events=npm_split_events, @@ -213,6 +226,7 @@ def test_step5(tmp_path, monkeypatch, session_subdir, storenames_map, expected_r step3( base_dir=str(tmp_base), selected_folders=[str(session_copy)], + modality=modality, npm_timestamp_column_name=npm_timestamp_column_name, npm_time_unit=npm_time_unit, npm_split_events=npm_split_events, @@ -222,6 +236,7 @@ def test_step5(tmp_path, monkeypatch, session_subdir, storenames_map, expected_r step4( base_dir=str(tmp_base), selected_folders=[str(session_copy)], + modality=modality, npm_timestamp_column_name=npm_timestamp_column_name, npm_time_unit=npm_time_unit, npm_split_events=npm_split_events, @@ -231,6 +246,7 @@ def test_step5(tmp_path, monkeypatch, session_subdir, storenames_map, expected_r step5( base_dir=str(tmp_base), selected_folders=[str(session_copy)], + modality=modality, npm_timestamp_column_name=npm_timestamp_column_name, npm_time_unit=npm_time_unit, npm_split_events=npm_split_events, From 6589139f61a55ac673c5867e165c3a5a4cb3d657 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Tue, 18 Nov 2025 11:15:21 -0800 Subject: [PATCH 011/125] Removed intermediate np_doric_csv_step2 module. --- src/guppy/np_doric_csv_step2.py | 523 -------------------------------- 1 file changed, 523 deletions(-) delete mode 100644 src/guppy/np_doric_csv_step2.py diff --git a/src/guppy/np_doric_csv_step2.py b/src/guppy/np_doric_csv_step2.py deleted file mode 100644 index d06dcc1..0000000 --- a/src/guppy/np_doric_csv_step2.py +++ /dev/null @@ -1,523 +0,0 @@ -import glob -import logging -import os -import tkinter as tk -from tkinter import StringVar, messagebox, ttk - -import h5py -import numpy as np -import pandas as pd -import panel as pn - -pn.extension() - -logger = logging.getLogger(__name__) - -# function to see if there are 'csv' files present -# and recognize type of 'csv' files either from -# Neurophotometrics, Doric systems or custom made 'csv' files -# and read data accordingly -def import_np_doric_csv(filepath, isosbestic_control, num_ch, inputParameters=None): - - logger.debug("If it exists, importing either NPM or Doric or csv file based on the structure of file") - # Headless configuration (used to avoid any UI prompts when running tests) - headless = bool(os.environ.get("GUPPY_BASE_DIR")) - npm_timestamp_column_name = None - npm_time_unit = None - npm_split_events = None - if isinstance(inputParameters, dict): - npm_timestamp_column_name = inputParameters.get("npm_timestamp_column_name") - npm_time_unit = inputParameters.get("npm_time_unit", "seconds") - npm_split_events = inputParameters.get("npm_split_events", True) - path = sorted(glob.glob(os.path.join(filepath, "*.csv"))) + sorted(glob.glob(os.path.join(filepath, "*.doric"))) - path_chev = glob.glob(os.path.join(filepath, "*chev*")) - path_chod = glob.glob(os.path.join(filepath, "*chod*")) - path_chpr = glob.glob(os.path.join(filepath, "*chpr*")) - path_event = glob.glob(os.path.join(filepath, "event*")) - # path_sig = glob.glob(os.path.join(filepath, 'sig*')) - path_chev_chod_event = path_chev + path_chod + path_event + path_chpr - - path = sorted(list(set(path) - set(path_chev_chod_event))) - flag = "None" - event_from_filename = [] - flag_arr = [] - for i in range(len(path)): - dirname = 
os.path.dirname(path[i]) - ext = os.path.basename(path[i]).split(".")[-1] - if ext == "doric": - key_names = read_doric(path[i]) - event_from_filename.extend(key_names) - flag = "doric_doric" - else: - df = pd.read_csv(path[i], header=None, nrows=2, index_col=False, dtype=str) - df = df.dropna(axis=1, how="all") - df_arr = np.array(df).flatten() - check_all_str = [] - for element in df_arr: - try: - float(element) - except: - check_all_str.append(i) - if len(check_all_str) == len(df_arr): - df = pd.read_csv(path[i], header=1, index_col=False, nrows=10) - df = df.drop(["Time(s)"], axis=1) - event_from_filename.extend(list(df.columns)) - flag = "doric_csv" - logger.info(flag) - else: - df = pd.read_csv(path[i], index_col=False) - # with warnings.catch_warnings(): - # warnings.simplefilter("error") - # try: - # df = pd.read_csv(path[i], index_col=False, dtype=float) - # except: - # df = pd.read_csv(path[i], header=1, index_col=False, nrows=10) # to make process faster reading just first 10 rows - # df = df.drop(['Time(s)'], axis=1) - # event_from_filename.extend(list(df.columns)) - # flag = 'doric_csv' - if flag == "doric_csv" or flag == "doric_doric": - continue - else: - colnames, value = check_header(df) - # logger.info(len(colnames), len(value)) - - # check dataframe structure and read data accordingly - if len(value) > 0: - columns_isstr = False - df = pd.read_csv(path[i], header=None) - cols = np.array(list(df.columns), dtype=str) - else: - df = df - columns_isstr = True - cols = np.array(list(df.columns), dtype=str) - # check the structure of dataframe and assign flag to the type of file - if len(cols) == 1: - if cols[0].lower() != "timestamps": - logger.error("\033[1m" + "Column name should be timestamps (all lower-cases)" + "\033[0m") - raise Exception("\033[1m" + "Column name should be timestamps (all lower-cases)" + "\033[0m") - else: - flag = "event_csv" - elif len(cols) == 3: - arr1 = np.array(["timestamps", "data", "sampling_rate"]) - arr2 = np.char.lower(np.array(cols)) - if (np.sort(arr1) == np.sort(arr2)).all() == False: - logger.error( - "\033[1m" - + "Column names should be timestamps, data and sampling_rate (all lower-cases)" - + "\033[0m" - ) - raise Exception( - "\033[1m" - + "Column names should be timestamps, data and sampling_rate (all lower-cases)" - + "\033[0m" - ) - else: - flag = "data_csv" - elif len(cols) == 2: - flag = "event_or_data_np" - elif len(cols) >= 2: - flag = "data_np" - else: - logger.error("Number of columns in csv file does not make sense.") - raise Exception("Number of columns in csv file does not make sense.") - - if columns_isstr == True and ( - "flags" in np.char.lower(np.array(cols)) or "ledstate" in np.char.lower(np.array(cols)) - ): - flag = flag + "_v2" - else: - flag = flag - - # used assigned flags to process the files and read the data - if flag == "event_or_data_np": - arr = list(df.iloc[:, 1]) - check_float = [True for i in arr if isinstance(i, float)] - if len(arr) == len(check_float) and columns_isstr == False: - flag = "data_np" - elif columns_isstr == True and ("value" in np.char.lower(np.array(cols))): - flag = "event_np" - else: - flag = "event_np" - else: - pass - - flag_arr.append(flag) - logger.info(flag) - if flag == "event_csv" or flag == "data_csv": - name = os.path.basename(path[i]).split(".")[0] - event_from_filename.append(name) - elif flag == "data_np": - file = f"file{str(i)}_" - df, indices_dict, num_channels = decide_indices(file, df, flag, num_ch) - keys = list(indices_dict.keys()) - for k in range(len(keys)): - for 
j in range(df.shape[1]): - if j == 0: - timestamps = df.iloc[:, j][indices_dict[keys[k]]] - # timestamps_odd = df.iloc[:,j][odd_indices] - else: - d = dict() - d["timestamps"] = timestamps - d["data"] = df.iloc[:, j][indices_dict[keys[k]]] - - df_ch = pd.DataFrame(d) - df_ch.to_csv(os.path.join(dirname, keys[k] + str(j) + ".csv"), index=False) - event_from_filename.append(keys[k] + str(j)) - - elif flag == "event_np": - type_val = np.array(df.iloc[:, 1]) - type_val_unique = np.unique(type_val) - if headless: - response = 1 if bool(npm_split_events) else 0 - else: - window = tk.Tk() - if len(type_val_unique) > 1: - response = messagebox.askyesno( - "Multiple event TTLs", - "Based on the TTL file,\ - it looks like TTLs \ - belongs to multiple behavior type. \ - Do you want to create multiple files for each \ - behavior type ?", - ) - else: - response = 0 - window.destroy() - if response == 1: - timestamps = np.array(df.iloc[:, 0]) - for j in range(len(type_val_unique)): - idx = np.where(type_val == type_val_unique[j]) - d = dict() - d["timestamps"] = timestamps[idx] - df_new = pd.DataFrame(d) - df_new.to_csv(os.path.join(dirname, "event" + str(type_val_unique[j]) + ".csv"), index=False) - event_from_filename.append("event" + str(type_val_unique[j])) - else: - timestamps = np.array(df.iloc[:, 0]) - d = dict() - d["timestamps"] = timestamps - df_new = pd.DataFrame(d) - df_new.to_csv(os.path.join(dirname, "event" + str(0) + ".csv"), index=False) - event_from_filename.append("event" + str(0)) - else: - file = f"file{str(i)}_" - df, ts_unit = decide_ts_unit_for_npm( - df, timestamp_column_name=npm_timestamp_column_name, time_unit=npm_time_unit, headless=headless - ) - df, indices_dict, num_channels = decide_indices(file, df, flag) - keys = list(indices_dict.keys()) - for k in range(len(keys)): - for j in range(df.shape[1]): - if j == 0: - timestamps = df.iloc[:, j][indices_dict[keys[k]]] - # timestamps_odd = df.iloc[:,j][odd_indices] - else: - d = dict() - d["timestamps"] = timestamps - d["data"] = df.iloc[:, j][indices_dict[keys[k]]] - - df_ch = pd.DataFrame(d) - df_ch.to_csv(os.path.join(dirname, keys[k] + str(j) + ".csv"), index=False) - event_from_filename.append(keys[k] + str(j)) - - path_chev = glob.glob(os.path.join(filepath, "*chev*")) - path_chod = glob.glob(os.path.join(filepath, "*chod*")) - path_chpr = glob.glob(os.path.join(filepath, "*chpr*")) - path_event = glob.glob(os.path.join(filepath, "event*")) - # path_sig = glob.glob(os.path.join(filepath, 'sig*')) - path_chev_chod_chpr = [path_chev, path_chod, path_chpr] - if ( - ("data_np_v2" in flag_arr or "data_np" in flag_arr) - and ("event_np" in flag_arr) - and (i == len(path) - 1) - ) or ( - ("data_np_v2" in flag_arr or "data_np" in flag_arr) and (i == len(path) - 1) - ): # i==len(path)-1 and or 'event_np' in flag - num_path_chev, num_path_chod, num_path_chpr = len(path_chev), len(path_chod), len(path_chpr) - arr_len, no_ch = [], [] - for i in range(len(path_chev_chod_chpr)): - if len(path_chev_chod_chpr[i]) > 0: - arr_len.append(len(path_chev_chod_chpr[i])) - else: - continue - - unique_arr_len = np.unique(np.array(arr_len)) - if "data_np_v2" in flag_arr: - if ts_unit == "seconds": - divisor = 1 - elif ts_unit == "milliseconds": - divisor = 1e3 - else: - divisor = 1e6 - else: - divisor = 1000 - - for j in range(len(path_event)): - df_event = pd.read_csv(path_event[j]) - df_chev = pd.read_csv(path_chev[0]) - df_event["timestamps"] = (df_event["timestamps"] - df_chev["timestamps"][0]) / divisor - df_event.to_csv(path_event[j], 
index=False) - if unique_arr_len.shape[0] == 1: - for j in range(len(path_chev)): - if file + "chev" in indices_dict.keys(): - df_chev = pd.read_csv(path_chev[j]) - df_chev["timestamps"] = (df_chev["timestamps"] - df_chev["timestamps"][0]) / divisor - df_chev["sampling_rate"] = np.full(df_chev.shape[0], np.nan) - df_chev.at[0, "sampling_rate"] = df_chev.shape[0] / ( - df_chev["timestamps"].iloc[-1] - df_chev["timestamps"].iloc[0] - ) - df_chev.to_csv(path_chev[j], index=False) - - if file + "chod" in indices_dict.keys(): - df_chod = pd.read_csv(path_chod[j]) - df_chod["timestamps"] = df_chev["timestamps"] - df_chod["sampling_rate"] = np.full(df_chod.shape[0], np.nan) - df_chod.at[0, "sampling_rate"] = df_chev["sampling_rate"][0] - df_chod.to_csv(path_chod[j], index=False) - - if file + "chpr" in indices_dict.keys(): - df_chpr = pd.read_csv(path_chpr[j]) - df_chpr["timestamps"] = df_chev["timestamps"] - df_chpr["sampling_rate"] = np.full(df_chpr.shape[0], np.nan) - df_chpr.at[0, "sampling_rate"] = df_chev["sampling_rate"][0] - df_chpr.to_csv(path_chpr[j], index=False) - else: - logger.error("Number of channels should be same for all regions.") - raise Exception("Number of channels should be same for all regions.") - else: - pass - logger.info("Importing of either NPM or Doric or csv file is done.") - return event_from_filename, flag_arr - -# ---------------------------------------------------------------------------------------------------------------------- -# Functions that import_np_doric_csv uses -# ---------------------------------------------------------------------------------------------------------------------- - -def read_doric(filepath): - with h5py.File(filepath, "r") as f: - if "Traces" in list(f.keys()): - keys = access_keys_doricV1(f) - elif list(f.keys()) == ["Configurations", "DataAcquisition"]: - keys = access_keys_doricV6(f) - - return keys - - -def check_header(df): - arr = list(df.columns) - check_float = [] - for i in arr: - try: - check_float.append(float(i)) - except: - pass - - return arr, check_float - -# function to decide indices of interleaved channels -# in neurophotometrics data -def decide_indices(file, df, flag, num_ch=2): - ch_name = [file + "chev", file + "chod", file + "chpr"] - if len(ch_name) < num_ch: - logger.error( - "Number of channels parameters in Input Parameters GUI is more than 3. \ - Looks like there are more than 3 channels in the file. Reading of these files\ - are not supported. Reach out to us if you get this error message." - ) - raise Exception( - "Number of channels parameters in Input Parameters GUI is more than 3. \ - Looks like there are more than 3 channels in the file. Reading of these files\ - are not supported. Reach out to us if you get this error message." 
- ) - if flag == "data_np": - indices_dict = dict() - for i in range(num_ch): - indices_dict[ch_name[i]] = np.arange(i, df.shape[0], num_ch) - - else: - cols = np.array(list(df.columns)) - if "flags" in np.char.lower(np.array(cols)): - arr = ["FrameCounter", "Flags"] - state = np.array(df["Flags"]) - elif "ledstate" in np.char.lower(np.array(cols)): - arr = ["FrameCounter", "LedState"] - state = np.array(df["LedState"]) - else: - logger.error( - "File type shows Neurophotometrics newer version \ - data but column names does not have Flags or LedState" - ) - raise Exception( - "File type shows Neurophotometrics newer version \ - data but column names does not have Flags or LedState" - ) - - num_ch, ch = check_channels(state) - indices_dict = dict() - for i in range(num_ch): - first_occurrence = np.where(state == ch[i])[0] - indices_dict[ch_name[i]] = np.arange(first_occurrence[0], df.shape[0], num_ch) - - df = df.drop(arr, axis=1) - - return df, indices_dict, num_ch - - -# function to decide NPM timestamps unit (seconds, ms or us) -def decide_ts_unit_for_npm(df, timestamp_column_name=None, time_unit=None, headless=False): - col_names = np.array(list(df.columns)) - col_names_ts = [""] - for name in col_names: - if "timestamp" in name.lower(): - col_names_ts.append(name) - - ts_unit = "seconds" - if len(col_names_ts) > 2: - # Headless path: auto-select column/unit without any UI - if headless: - if timestamp_column_name is not None: - assert ( - timestamp_column_name in col_names_ts - ), f"Provided timestamp_column_name '{timestamp_column_name}' not found in columns {col_names_ts[1:]}" - chosen = timestamp_column_name - else: - chosen = col_names_ts[1] - df.insert(1, "Timestamp", df[chosen]) - df = df.drop(col_names_ts[1:], axis=1) - valid_units = {"seconds", "milliseconds", "microseconds"} - ts_unit = time_unit if (isinstance(time_unit, str) and time_unit in valid_units) else "seconds" - return df, ts_unit - # def comboBoxSelected(event): - # logger.info(event.widget.get()) - - window = tk.Tk() - window.title("Select appropriate options for timestamps") - window.geometry("500x200") - holdComboboxValues = dict() - - timestamps_label = ttk.Label(window, text="Select which timestamps to use : ").grid( - row=0, column=1, pady=25, padx=25 - ) - holdComboboxValues["timestamps"] = StringVar() - timestamps_combo = ttk.Combobox(window, values=col_names_ts, textvariable=holdComboboxValues["timestamps"]) - timestamps_combo.grid(row=0, column=2, pady=25, padx=25) - timestamps_combo.current(0) - # timestamps_combo.bind("<>", comboBoxSelected) - - time_unit_label = ttk.Label(window, text="Select timestamps unit : ").grid(row=1, column=1, pady=25, padx=25) - holdComboboxValues["time_unit"] = StringVar() - time_unit_combo = ttk.Combobox( - window, values=["", "seconds", "milliseconds", "microseconds"], textvariable=holdComboboxValues["time_unit"] - ) - time_unit_combo.grid(row=1, column=2, pady=25, padx=25) - time_unit_combo.current(0) - # time_unit_combo.bind("<>", comboBoxSelected) - window.lift() - window.after(500, lambda: window.lift()) - window.mainloop() - - if holdComboboxValues["timestamps"].get(): - df.insert(1, "Timestamp", df[holdComboboxValues["timestamps"].get()]) - df = df.drop(col_names_ts[1:], axis=1) - else: - messagebox.showerror( - "All options not selected", - "All the options for timestamps \ - were not selected. Please select appropriate options", - ) - logger.error( - "All the options for timestamps \ - were not selected. 
Please select appropriate options" - ) - raise Exception( - "All the options for timestamps \ - were not selected. Please select appropriate options" - ) - if holdComboboxValues["time_unit"].get(): - if holdComboboxValues["time_unit"].get() == "seconds": - ts_unit = holdComboboxValues["time_unit"].get() - elif holdComboboxValues["time_unit"].get() == "milliseconds": - ts_unit = holdComboboxValues["time_unit"].get() - else: - ts_unit = holdComboboxValues["time_unit"].get() - else: - messagebox.showerror( - "All options not selected", - "All the options for timestamps \ - were not selected. Please select appropriate options", - ) - logger.error( - "All the options for timestamps \ - were not selected. Please select appropriate options" - ) - raise Exception( - "All the options for timestamps \ - were not selected. Please select appropriate options" - ) - else: - pass - - return df, ts_unit - - -# ---------------------------------------------------------------------------------------------------------------------- -# Functions that read_doric uses -# ---------------------------------------------------------------------------------------------------------------------- - -def access_keys_doricV6(doric_file): - data = [doric_file["DataAcquisition"]] - res = [] - while len(data) != 0: - members = len(data) - while members != 0: - members -= 1 - data, last_element = separate_last_element(data) - if isinstance(last_element, h5py.Dataset) and not last_element.name.endswith("/Time"): - res.append(last_element.name) - elif isinstance(last_element, h5py.Group): - data.extend(reversed([last_element[k] for k in last_element.keys()])) - - keys = [] - for element in res: - sep_values = element.split("/") - if sep_values[-1] == "Values": - keys.append(f"{sep_values[-3]}/{sep_values[-2]}") - else: - keys.append(f"{sep_values[-2]}/{sep_values[-1]}") - - return keys - - -def access_keys_doricV1(doric_file): - keys = list(doric_file["Traces"]["Console"].keys()) - keys.remove("Time(s)") - - return keys - -# ---------------------------------------------------------------------------------------------------------------------- -# Functions that decide_indices uses -# ---------------------------------------------------------------------------------------------------------------------- - -# check flag consistency in neurophotometrics data -def check_channels(state): - state = state.astype(int) - unique_state = np.unique(state[2:12]) - if unique_state.shape[0] > 3: - logger.error( - "Looks like there are more than 3 channels in the file. Reading of these files\ - are not supported. Reach out to us if you get this error message." - ) - raise Exception( - "Looks like there are more than 3 channels in the file. Reading of these files\ - are not supported. Reach out to us if you get this error message." - ) - - return unique_state.shape[0], unique_state - - -# ---------------------------------------------------------------------------------------------------------------------- -# Functions that access_keys_doricV6 uses -# ---------------------------------------------------------------------------------------------------------------------- -def separate_last_element(arr): - l = arr[-1] - return arr[:-1], l \ No newline at end of file From e7ac4d8982da9383b1d50dccfde9c50f7171e90c Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Tue, 18 Nov 2025 11:15:58 -0800 Subject: [PATCH 012/125] Split tdt_step3.py off from read_raw_data.py. 
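The TDT-specific tev reader and its helpers move out of readTevTsq.py into the
new tdt_step3.py module, and the shared write_hdf5 helper moves to
common_step3.py; readTevTsq.py now imports both, so the existing call in
readRawData is left untouched. A minimal sketch of the resulting wiring
(illustrative only; `data`, `storesList`, `op` and `numProcesses` are the
locals already defined in readRawData, not new names):

    import numpy as np

    from guppy.common_step3 import write_hdf5      # shared HDF5 writer, new home
    from guppy.tdt_step3 import execute_readtev    # tev reader + helpers, new home

    # inside readRawData (unchanged by this patch): `data` still comes from
    # readtsq(filepath) and `storesList` from the saved storesList.csv
    execute_readtev(data, filepath, np.unique(storesList[0, :]), op, numProcesses)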
--- src/guppy/common_step3.py | 51 +++++++++ src/guppy/readTevTsq.py | 204 +----------------------------------- src/guppy/saveStoresList.py | 1 - src/guppy/tdt_step3.py | 183 ++++++++++++++++++++++++++++++++ 4 files changed, 237 insertions(+), 202 deletions(-) create mode 100644 src/guppy/common_step3.py create mode 100644 src/guppy/tdt_step3.py diff --git a/src/guppy/common_step3.py b/src/guppy/common_step3.py new file mode 100644 index 0000000..4ea5c95 --- /dev/null +++ b/src/guppy/common_step3.py @@ -0,0 +1,51 @@ +import glob +import json +import logging +import multiprocessing as mp +import os +import re +import sys +import time +import warnings +from itertools import repeat + +import h5py +import numpy as np +import pandas as pd +from numpy import float32, float64, int32, int64, uint16 + +logger = logging.getLogger(__name__) + +# function to write data to a hdf5 file +def write_hdf5(data, event, filepath, key): + + # replacing \\ or / in storenames with _ (to avoid errors while saving data) + event = event.replace("\\", "_") + event = event.replace("/", "_") + + op = os.path.join(filepath, event + ".hdf5") + + # if file does not exist create a new file + if not os.path.exists(op): + with h5py.File(op, "w") as f: + if type(data) is np.ndarray: + f.create_dataset(key, data=data, maxshape=(None,), chunks=True) + else: + f.create_dataset(key, data=data) + + # if file already exists, append data to it or add a new key to it + else: + with h5py.File(op, "r+") as f: + if key in list(f.keys()): + if type(data) is np.ndarray: + f[key].resize(data.shape) + arr = f[key] + arr[:] = data + else: + arr = f[key] + arr = data + else: + if type(data) is np.ndarray: + f.create_dataset(key, data=data, maxshape=(None,), chunks=True) + else: + f.create_dataset(key, data=data) \ No newline at end of file diff --git a/src/guppy/readTevTsq.py b/src/guppy/readTevTsq.py index 6deb3b1..fe16add 100755 --- a/src/guppy/readTevTsq.py +++ b/src/guppy/readTevTsq.py @@ -14,6 +14,9 @@ import pandas as pd from numpy import float32, float64, int32, int64, uint16 +from guppy.common_step3 import write_hdf5 +from guppy.tdt_step3 import execute_readtev + logger = logging.getLogger(__name__) @@ -91,47 +94,6 @@ def check_doric(filepath): return flag_arr[0] -# check if a particular element is there in an array or not -def ismember(arr, element): - res = [1 if i == element else 0 for i in arr] - return np.asarray(res) - - -# function to write data to a hdf5 file -def write_hdf5(data, event, filepath, key): - - # replacing \\ or / in storenames with _ (to avoid errors while saving data) - event = event.replace("\\", "_") - event = event.replace("/", "_") - - op = os.path.join(filepath, event + ".hdf5") - - # if file does not exist create a new file - if not os.path.exists(op): - with h5py.File(op, "w") as f: - if type(data) is np.ndarray: - f.create_dataset(key, data=data, maxshape=(None,), chunks=True) - else: - f.create_dataset(key, data=data) - - # if file already exists, append data to it or add a new key to it - else: - with h5py.File(op, "r+") as f: - if key in list(f.keys()): - if type(data) is np.ndarray: - f[key].resize(data.shape) - arr = f[key] - arr[:] = data - else: - arr = f[key] - arr = data - else: - if type(data) is np.ndarray: - f.create_dataset(key, data=data, maxshape=(None,), chunks=True) - else: - f.create_dataset(key, data=data) - - # function to read event timestamps csv file. 
def import_csv(filepath, event, outputPath): logger.debug("\033[1m" + "Trying to read data for {} from csv file.".format(event) + "\033[0m") @@ -177,166 +139,6 @@ def import_csv(filepath, event, outputPath): return data, key -# function to save data read from tev file to hdf5 file -def save_dict_to_hdf5(S, event, outputPath): - write_hdf5(S["storename"], event, outputPath, "storename") - write_hdf5(S["sampling_rate"], event, outputPath, "sampling_rate") - write_hdf5(S["timestamps"], event, outputPath, "timestamps") - - write_hdf5(S["data"], event, outputPath, "data") - write_hdf5(S["npoints"], event, outputPath, "npoints") - write_hdf5(S["channels"], event, outputPath, "channels") - - -# function to check event data (checking whether event timestamps belongs to same event or multiple events) -def check_data(S, filepath, event, outputPath): - # logger.info("Checking event storename data for creating multiple event names from single event storename...") - new_event = event.replace("\\", "") - new_event = event.replace("/", "") - diff = np.diff(S["data"]) - arr = np.full(diff.shape[0], 1) - - storesList = np.genfromtxt(os.path.join(outputPath, "storesList.csv"), dtype="str", delimiter=",").reshape(2, -1) - - if diff.shape[0] == 0: - return 0 - - if S["sampling_rate"] == 0 and np.all(diff == diff[0]) == False: - logger.info("\033[1m" + "Data in event {} belongs to multiple behavior".format(event) + "\033[0m") - logger.debug( - "\033[1m" + "Create timestamp files for individual new event and change the stores list file." + "\033[0m" - ) - i_d = np.unique(S["data"]) - for i in range(i_d.shape[0]): - new_S = dict() - idx = np.where(S["data"] == i_d[i])[0] - new_S["timestamps"] = S["timestamps"][idx] - new_S["storename"] = new_event + str(int(i_d[i])) - new_S["sampling_rate"] = S["sampling_rate"] - new_S["data"] = S["data"] - new_S["npoints"] = S["npoints"] - new_S["channels"] = S["channels"] - storesList = np.concatenate( - (storesList, [[new_event + str(int(i_d[i]))], [new_event + "_" + str(int(i_d[i]))]]), axis=1 - ) - save_dict_to_hdf5(new_S, new_event + str(int(i_d[i])), outputPath) - - idx = np.where(storesList[0] == event)[0] - storesList = np.delete(storesList, idx, axis=1) - if not os.path.exists(os.path.join(outputPath, ".cache_storesList.csv")): - os.rename(os.path.join(outputPath, "storesList.csv"), os.path.join(outputPath, ".cache_storesList.csv")) - if idx.shape[0] == 0: - pass - else: - np.savetxt(os.path.join(outputPath, "storesList.csv"), storesList, delimiter=",", fmt="%s") - logger.info( - "\033[1m" - + "Timestamp files for individual new event are created \ - and the stores list file is changed." - + "\033[0m" - ) - - -# function to read tev file -def readtev(data, filepath, event, outputPath): - - logger.debug("Reading data for event {} ...".format(event)) - tevfilepath = glob.glob(os.path.join(filepath, "*.tev")) - if len(tevfilepath) > 1: - raise Exception("Two tev files are present at the location.") - else: - tevfilepath = tevfilepath[0] - - data["name"] = np.asarray(data["name"], dtype=str) - - allnames = np.unique(data["name"]) - - index = [] - for i in range(len(allnames)): - length = len(str(allnames[i])) - if length < 4: - index.append(i) - - allnames = np.delete(allnames, index, 0) - - eventNew = np.array(list(event)) - - # logger.info(allnames) - # logger.info(eventNew) - row = ismember(data["name"], event) - - if sum(row) == 0: - logger.info("\033[1m" + "Requested store name " + event + " not found (case-sensitive)." 
+ "\033[0m") - logger.info("\033[1m" + "File contains the following TDT store names:" + "\033[0m") - logger.info("\033[1m" + str(allnames) + "\033[0m") - logger.info("\033[1m" + "TDT store name " + str(event) + " not found." + "\033[0m") - import_csv(filepath, event, outputPath) - - return 0 - - allIndexesWhereEventIsPresent = np.where(row == 1) - first_row = allIndexesWhereEventIsPresent[0][0] - - formatNew = data["format"][first_row] + 1 - - table = np.array( - [ - [0, 0, 0, 0], - [0, "float", 1, np.float32], - [0, "long", 1, np.int32], - [0, "short", 2, np.int16], - [0, "byte", 4, np.int8], - ] - ) - - S = dict() - - S["storename"] = str(event) - S["sampling_rate"] = data["frequency"][first_row] - S["timestamps"] = np.asarray(data["timestamp"][allIndexesWhereEventIsPresent[0]]) - S["channels"] = np.asarray(data["chan"][allIndexesWhereEventIsPresent[0]]) - - fp_loc = np.asarray(data["fp_loc"][allIndexesWhereEventIsPresent[0]]) - data_size = np.asarray(data["size"]) - - if formatNew != 5: - nsample = (data_size[first_row,] - 10) * int(table[formatNew, 2]) - S["data"] = np.zeros((len(fp_loc), nsample)) - for i in range(0, len(fp_loc)): - with open(tevfilepath, "rb") as fp: - fp.seek(fp_loc[i], os.SEEK_SET) - S["data"][i, :] = np.fromfile(fp, dtype=table[formatNew, 3], count=nsample).reshape( - 1, nsample, order="F" - ) - # S['data'] = S['data'].swapaxes() - S["npoints"] = nsample - else: - S["data"] = np.asarray(data["strobe"][allIndexesWhereEventIsPresent[0]]) - S["npoints"] = 1 - S["channels"] = np.tile(1, (S["data"].shape[0],)) - - S["data"] = (S["data"].T).reshape(-1, order="F") - - save_dict_to_hdf5(S, event, outputPath) - - check_data(S, filepath, event, outputPath) - - logger.info("Data for event {} fetched and stored.".format(event)) - - -# function to execute readtev function using multiprocessing to make it faster -def execute_readtev(data, filepath, event, outputPath, numProcesses=mp.cpu_count()): - - start = time.time() - with mp.Pool(numProcesses) as p: - p.starmap(readtev, zip(repeat(data), repeat(filepath), event, repeat(outputPath))) - # p = mp.Pool(mp.cpu_count()) - # p.starmap(readtev, zip(repeat(data), repeat(filepath), event, repeat(outputPath))) - # p.close() - # p.join() - logger.info("Time taken = {0:.5f}".format(time.time() - start)) - - def execute_import_csv(filepath, event, outputPath, numProcesses=mp.cpu_count()): # logger.info("Reading data for event {} ...".format(event)) diff --git a/src/guppy/saveStoresList.py b/src/guppy/saveStoresList.py index 72dc604..c2867ba 100755 --- a/src/guppy/saveStoresList.py +++ b/src/guppy/saveStoresList.py @@ -23,7 +23,6 @@ from guppy.readTevTsq import import_csv from guppy.tdt_step2 import readtsq -from guppy.np_doric_csv_step2 import import_np_doric_csv from guppy.csv_step2 import import_csv_step2 from guppy.doric_step2 import import_doric from guppy.npm_step2 import import_npm diff --git a/src/guppy/tdt_step3.py b/src/guppy/tdt_step3.py new file mode 100644 index 0000000..04ba0dd --- /dev/null +++ b/src/guppy/tdt_step3.py @@ -0,0 +1,183 @@ +import glob +import json +import logging +import multiprocessing as mp +import os +import re +import sys +import time +import warnings +from itertools import repeat + +import h5py +import numpy as np +import pandas as pd +from numpy import float32, float64, int32, int64, uint16 + +from guppy.common_step3 import write_hdf5 + +logger = logging.getLogger(__name__) + +# function to execute readtev function using multiprocessing to make it faster +def execute_readtev(data, filepath, event, 
outputPath, numProcesses=mp.cpu_count()): + + start = time.time() + with mp.Pool(numProcesses) as p: + p.starmap(readtev, zip(repeat(data), repeat(filepath), event, repeat(outputPath))) + # p = mp.Pool(mp.cpu_count()) + # p.starmap(readtev, zip(repeat(data), repeat(filepath), event, repeat(outputPath))) + # p.close() + # p.join() + logger.info("Time taken = {0:.5f}".format(time.time() - start)) + + +# function to read tev file +def readtev(data, filepath, event, outputPath): + + logger.debug("Reading data for event {} ...".format(event)) + tevfilepath = glob.glob(os.path.join(filepath, "*.tev")) + if len(tevfilepath) > 1: + raise Exception("Two tev files are present at the location.") + else: + tevfilepath = tevfilepath[0] + + data["name"] = np.asarray(data["name"], dtype=str) + + allnames = np.unique(data["name"]) + + index = [] + for i in range(len(allnames)): + length = len(str(allnames[i])) + if length < 4: + index.append(i) + + allnames = np.delete(allnames, index, 0) + + eventNew = np.array(list(event)) + + # logger.info(allnames) + # logger.info(eventNew) + row = ismember(data["name"], event) + + if sum(row) == 0: + logger.info("\033[1m" + "Requested store name " + event + " not found (case-sensitive)." + "\033[0m") + logger.info("\033[1m" + "File contains the following TDT store names:" + "\033[0m") + logger.info("\033[1m" + str(allnames) + "\033[0m") + logger.info("\033[1m" + "TDT store name " + str(event) + " not found." + "\033[0m") + import_csv(filepath, event, outputPath) + + return 0 + + allIndexesWhereEventIsPresent = np.where(row == 1) + first_row = allIndexesWhereEventIsPresent[0][0] + + formatNew = data["format"][first_row] + 1 + + table = np.array( + [ + [0, 0, 0, 0], + [0, "float", 1, np.float32], + [0, "long", 1, np.int32], + [0, "short", 2, np.int16], + [0, "byte", 4, np.int8], + ] + ) + + S = dict() + + S["storename"] = str(event) + S["sampling_rate"] = data["frequency"][first_row] + S["timestamps"] = np.asarray(data["timestamp"][allIndexesWhereEventIsPresent[0]]) + S["channels"] = np.asarray(data["chan"][allIndexesWhereEventIsPresent[0]]) + + fp_loc = np.asarray(data["fp_loc"][allIndexesWhereEventIsPresent[0]]) + data_size = np.asarray(data["size"]) + + if formatNew != 5: + nsample = (data_size[first_row,] - 10) * int(table[formatNew, 2]) + S["data"] = np.zeros((len(fp_loc), nsample)) + for i in range(0, len(fp_loc)): + with open(tevfilepath, "rb") as fp: + fp.seek(fp_loc[i], os.SEEK_SET) + S["data"][i, :] = np.fromfile(fp, dtype=table[formatNew, 3], count=nsample).reshape( + 1, nsample, order="F" + ) + # S['data'] = S['data'].swapaxes() + S["npoints"] = nsample + else: + S["data"] = np.asarray(data["strobe"][allIndexesWhereEventIsPresent[0]]) + S["npoints"] = 1 + S["channels"] = np.tile(1, (S["data"].shape[0],)) + + S["data"] = (S["data"].T).reshape(-1, order="F") + + save_dict_to_hdf5(S, event, outputPath) + + check_data(S, filepath, event, outputPath) + + logger.info("Data for event {} fetched and stored.".format(event)) + +# check if a particular element is there in an array or not +def ismember(arr, element): + res = [1 if i == element else 0 for i in arr] + return np.asarray(res) + + +# function to save data read from tev file to hdf5 file +def save_dict_to_hdf5(S, event, outputPath): + write_hdf5(S["storename"], event, outputPath, "storename") + write_hdf5(S["sampling_rate"], event, outputPath, "sampling_rate") + write_hdf5(S["timestamps"], event, outputPath, "timestamps") + + write_hdf5(S["data"], event, outputPath, "data") + write_hdf5(S["npoints"], event, 
outputPath, "npoints") + write_hdf5(S["channels"], event, outputPath, "channels") + + +# function to check event data (checking whether event timestamps belongs to same event or multiple events) +def check_data(S, filepath, event, outputPath): + # logger.info("Checking event storename data for creating multiple event names from single event storename...") + new_event = event.replace("\\", "") + new_event = event.replace("/", "") + diff = np.diff(S["data"]) + arr = np.full(diff.shape[0], 1) + + storesList = np.genfromtxt(os.path.join(outputPath, "storesList.csv"), dtype="str", delimiter=",").reshape(2, -1) + + if diff.shape[0] == 0: + return 0 + + if S["sampling_rate"] == 0 and np.all(diff == diff[0]) == False: + logger.info("\033[1m" + "Data in event {} belongs to multiple behavior".format(event) + "\033[0m") + logger.debug( + "\033[1m" + "Create timestamp files for individual new event and change the stores list file." + "\033[0m" + ) + i_d = np.unique(S["data"]) + for i in range(i_d.shape[0]): + new_S = dict() + idx = np.where(S["data"] == i_d[i])[0] + new_S["timestamps"] = S["timestamps"][idx] + new_S["storename"] = new_event + str(int(i_d[i])) + new_S["sampling_rate"] = S["sampling_rate"] + new_S["data"] = S["data"] + new_S["npoints"] = S["npoints"] + new_S["channels"] = S["channels"] + storesList = np.concatenate( + (storesList, [[new_event + str(int(i_d[i]))], [new_event + "_" + str(int(i_d[i]))]]), axis=1 + ) + save_dict_to_hdf5(new_S, new_event + str(int(i_d[i])), outputPath) + + idx = np.where(storesList[0] == event)[0] + storesList = np.delete(storesList, idx, axis=1) + if not os.path.exists(os.path.join(outputPath, ".cache_storesList.csv")): + os.rename(os.path.join(outputPath, "storesList.csv"), os.path.join(outputPath, ".cache_storesList.csv")) + if idx.shape[0] == 0: + pass + else: + np.savetxt(os.path.join(outputPath, "storesList.csv"), storesList, delimiter=",", fmt="%s") + logger.info( + "\033[1m" + + "Timestamp files for individual new event are created \ + and the stores list file is changed." + + "\033[0m" + ) \ No newline at end of file From 2f57867030294aae62a6e931864b30c0e341c8d2 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Tue, 18 Nov 2025 11:41:03 -0800 Subject: [PATCH 013/125] Hard-coded modality to simplify read. --- src/guppy/readTevTsq.py | 50 +++++++---------------------------------- src/guppy/tdt_step3.py | 42 ++++++++++++++++++++++++++++------ 2 files changed, 43 insertions(+), 49 deletions(-) diff --git a/src/guppy/readTevTsq.py b/src/guppy/readTevTsq.py index fe16add..96fd59e 100755 --- a/src/guppy/readTevTsq.py +++ b/src/guppy/readTevTsq.py @@ -33,34 +33,6 @@ def writeToFile(value: str): file.write(value) -# function to read tsq file -def readtsq(filepath): - logger.debug("Trying to read tsq file.") - names = ("size", "type", "name", "chan", "sort_code", "timestamp", "fp_loc", "strobe", "format", "frequency") - formats = (int32, int32, "S4", uint16, uint16, float64, int64, float64, int32, float32) - offsets = 0, 4, 8, 12, 14, 16, 24, 24, 32, 36 - tsq_dtype = np.dtype({"names": names, "formats": formats, "offsets": offsets}, align=True) - path = glob.glob(os.path.join(filepath, "*.tsq")) - if len(path) > 1: - logger.error("Two tsq files are present at the location.") - raise Exception("Two tsq files are present at the location.") - elif len(path) == 0: - logger.info("\033[1m" + "tsq file not found." 
+ "\033[1m") - return 0, 0 - else: - path = path[0] - flag = "tsq" - - # reading tsq file - tsq = np.fromfile(path, dtype=tsq_dtype) - - # creating dataframe of the data - df = pd.DataFrame(tsq) - - logger.info("Data from tsq file fetched.") - return df, flag - - # function to check if doric file exists def check_doric(filepath): logger.debug("Checking if doric file exists") @@ -294,13 +266,7 @@ def readRawData(inputParameters): filepath = folderNames[i] logger.debug(f"### Reading raw data for folder {folderNames[i]}") storesListPath = takeOnlyDirs(glob.glob(os.path.join(filepath, "*_output_*"))) - # reading tsq file - data, flag = readtsq(filepath) - # checking if doric file exists - if flag == "tsq": - pass - else: - flag = check_doric(filepath) + modality = "tdt" # read data corresponding to each storename selected by user while saving the storeslist file for j in range(len(storesListPath)): @@ -314,14 +280,14 @@ def readRawData(inputParameters): 2, -1 ) - if isinstance(data, pd.DataFrame) and flag == "tsq": - execute_readtev(data, filepath, np.unique(storesList[0, :]), op, numProcesses) - elif flag == "doric_csv": - execute_import_doric(filepath, storesList, flag, op) - elif flag == "doric_doric": - execute_import_doric(filepath, storesList, flag, op) - else: + if modality == "tdt": + execute_readtev(filepath, np.unique(storesList[0, :]), op, numProcesses) + elif modality == "doric": + execute_import_doric(filepath, storesList, modality, op) + elif modality == "csv" or modality == "npm": execute_import_csv(filepath, np.unique(storesList[0, :]), op, numProcesses) + else: + raise ValueError("Modality not recognized. Please use 'tdt', 'csv', 'doric', or 'npm'.") writeToFile(str(10 + ((step + 1) * 10)) + "\n") step += 1 diff --git a/src/guppy/tdt_step3.py b/src/guppy/tdt_step3.py index 04ba0dd..bc629f0 100644 --- a/src/guppy/tdt_step3.py +++ b/src/guppy/tdt_step3.py @@ -18,8 +18,36 @@ logger = logging.getLogger(__name__) +# function to read tsq file +def readtsq(filepath): + logger.debug("Trying to read tsq file.") + names = ("size", "type", "name", "chan", "sort_code", "timestamp", "fp_loc", "strobe", "format", "frequency") + formats = (int32, int32, "S4", uint16, uint16, float64, int64, float64, int32, float32) + offsets = 0, 4, 8, 12, 14, 16, 24, 24, 32, 36 + tsq_dtype = np.dtype({"names": names, "formats": formats, "offsets": offsets}, align=True) + path = glob.glob(os.path.join(filepath, "*.tsq")) + if len(path) > 1: + logger.error("Two tsq files are present at the location.") + raise Exception("Two tsq files are present at the location.") + elif len(path) == 0: + logger.info("\033[1m" + "tsq file not found." + "\033[1m") + return 0, 0 + else: + path = path[0] + flag = "tsq" + + # reading tsq file + tsq = np.fromfile(path, dtype=tsq_dtype) + + # creating dataframe of the data + df = pd.DataFrame(tsq) + + logger.info("Data from tsq file fetched.") + return df, flag + # function to execute readtev function using multiprocessing to make it faster -def execute_readtev(data, filepath, event, outputPath, numProcesses=mp.cpu_count()): +def execute_readtev(filepath, event, outputPath, numProcesses=mp.cpu_count()): + data, _ = readtsq(filepath) start = time.time() with mp.Pool(numProcesses) as p: @@ -60,13 +88,13 @@ def readtev(data, filepath, event, outputPath): row = ismember(data["name"], event) if sum(row) == 0: - logger.info("\033[1m" + "Requested store name " + event + " not found (case-sensitive)." 
+ "\033[0m") - logger.info("\033[1m" + "File contains the following TDT store names:" + "\033[0m") - logger.info("\033[1m" + str(allnames) + "\033[0m") - logger.info("\033[1m" + "TDT store name " + str(event) + " not found." + "\033[0m") - import_csv(filepath, event, outputPath) + logger.error("\033[1m" + "Requested store name " + event + " not found (case-sensitive)." + "\033[0m") + logger.error("\033[1m" + "File contains the following TDT store names:" + "\033[0m") + logger.error("\033[1m" + str(allnames) + "\033[0m") + logger.error("\033[1m" + "TDT store name " + str(event) + " not found." + "\033[0m") + raise ValueError("Requested store name not found.") + - return 0 allIndexesWhereEventIsPresent = np.where(row == 1) first_row = allIndexesWhereEventIsPresent[0][0] From 092e1b7f40934bae192fd428b77e0486c9f516c0 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Tue, 18 Nov 2025 13:23:54 -0800 Subject: [PATCH 014/125] Split doric_step3.py off from read_raw_data.py. --- src/guppy/doric_step3.py | 126 +++++++++++++++++++++++++++++++++ src/guppy/readTevTsq.py | 146 +-------------------------------------- 2 files changed, 128 insertions(+), 144 deletions(-) create mode 100644 src/guppy/doric_step3.py diff --git a/src/guppy/doric_step3.py b/src/guppy/doric_step3.py new file mode 100644 index 0000000..792c54e --- /dev/null +++ b/src/guppy/doric_step3.py @@ -0,0 +1,126 @@ +import glob +import logging +import os +import re + +import h5py +import numpy as np +import pandas as pd + +from guppy.common_step3 import write_hdf5 + +logger = logging.getLogger(__name__) + + +def execute_import_doric(filepath, storesList, flag, outputPath): + + if flag == "doric_csv": + path = glob.glob(os.path.join(filepath, "*.csv")) + if len(path) > 1: + logger.error("An error occurred : More than one Doric csv file present at the location") + raise Exception("More than one Doric csv file present at the location") + else: + df = pd.read_csv(path[0], header=1, index_col=False) + df = df.dropna(axis=1, how="all") + df = df.dropna(axis=0, how="any") + df["Time(s)"] = df["Time(s)"] - df["Time(s)"].to_numpy()[0] + for i in range(storesList.shape[1]): + if "control" in storesList[1, i] or "signal" in storesList[1, i]: + timestamps = np.array(df["Time(s)"]) + sampling_rate = np.array([1 / (timestamps[-1] - timestamps[-2])]) + write_hdf5(sampling_rate, storesList[0, i], outputPath, "sampling_rate") + write_hdf5(df["Time(s)"].to_numpy(), storesList[0, i], outputPath, "timestamps") + write_hdf5(df[storesList[0, i]].to_numpy(), storesList[0, i], outputPath, "data") + else: + ttl = df[storesList[0, i]] + indices = np.where(ttl <= 0)[0] + diff_indices = np.where(np.diff(indices) > 1)[0] + write_hdf5( + df["Time(s)"][indices[diff_indices] + 1].to_numpy(), storesList[0, i], outputPath, "timestamps" + ) + else: + path = glob.glob(os.path.join(filepath, "*.doric")) + if len(path) > 1: + logger.error("An error occurred : More than one Doric file present at the location") + raise Exception("More than one Doric file present at the location") + else: + with h5py.File(path[0], "r") as f: + if "Traces" in list(f.keys()): + keys = access_data_doricV1(f, storesList, outputPath) + elif list(f.keys()) == ["Configurations", "DataAcquisition"]: + keys = access_data_doricV6(f, storesList, outputPath) + + + +def access_data_doricV6(doric_file, storesList, outputPath): + data = [doric_file["DataAcquisition"]] + res = [] + while len(data) != 0: + members = len(data) + while members != 0: + members -= 1 + data, last_element = separate_last_element(data) 
+ if isinstance(last_element, h5py.Dataset) and not last_element.name.endswith("/Time"): + res.append(last_element.name) + elif isinstance(last_element, h5py.Group): + data.extend(reversed([last_element[k] for k in last_element.keys()])) + + decide_path = [] + for element in res: + sep_values = element.split("/") + if sep_values[-1] == "Values": + if f"{sep_values[-3]}/{sep_values[-2]}" in storesList[0, :]: + decide_path.append(element) + else: + if f"{sep_values[-2]}/{sep_values[-1]}" in storesList[0, :]: + decide_path.append(element) + + for i in range(storesList.shape[1]): + if "control" in storesList[1, i] or "signal" in storesList[1, i]: + regex = re.compile("(.*?)" + str(storesList[0, i]) + "(.*?)") + idx = [i for i in range(len(decide_path)) if regex.match(decide_path[i])] + if len(idx) > 1: + logger.error("More than one string matched (which should not be the case)") + raise Exception("More than one string matched (which should not be the case)") + idx = idx[0] + data = np.array(doric_file[decide_path[idx]]) + timestamps = np.array(doric_file[decide_path[idx].rsplit("/", 1)[0] + "/Time"]) + sampling_rate = np.array([1 / (timestamps[-1] - timestamps[-2])]) + write_hdf5(sampling_rate, storesList[0, i], outputPath, "sampling_rate") + write_hdf5(timestamps, storesList[0, i], outputPath, "timestamps") + write_hdf5(data, storesList[0, i], outputPath, "data") + else: + regex = re.compile("(.*?)" + storesList[0, i] + "$") + idx = [i for i in range(len(decide_path)) if regex.match(decide_path[i])] + if len(idx) > 1: + logger.error("More than one string matched (which should not be the case)") + raise Exception("More than one string matched (which should not be the case)") + idx = idx[0] + ttl = np.array(doric_file[decide_path[idx]]) + timestamps = np.array(doric_file[decide_path[idx].rsplit("/", 1)[0] + "/Time"]) + indices = np.where(ttl <= 0)[0] + diff_indices = np.where(np.diff(indices) > 1)[0] + write_hdf5(timestamps[indices[diff_indices] + 1], storesList[0, i], outputPath, "timestamps") + + +def access_data_doricV1(doric_file, storesList, outputPath): + keys = list(doric_file["Traces"]["Console"].keys()) + for i in range(storesList.shape[1]): + if "control" in storesList[1, i] or "signal" in storesList[1, i]: + timestamps = np.array(doric_file["Traces"]["Console"]["Time(s)"]["Console_time(s)"]) + sampling_rate = np.array([1 / (timestamps[-1] - timestamps[-2])]) + data = np.array(doric_file["Traces"]["Console"][storesList[0, i]][storesList[0, i]]) + write_hdf5(sampling_rate, storesList[0, i], outputPath, "sampling_rate") + write_hdf5(timestamps, storesList[0, i], outputPath, "timestamps") + write_hdf5(data, storesList[0, i], outputPath, "data") + else: + timestamps = np.array(doric_file["Traces"]["Console"]["Time(s)"]["Console_time(s)"]) + ttl = np.array(doric_file["Traces"]["Console"][storesList[0, i]][storesList[0, i]]) + indices = np.where(ttl <= 0)[0] + diff_indices = np.where(np.diff(indices) > 1)[0] + write_hdf5(timestamps[indices[diff_indices] + 1], storesList[0, i], outputPath, "timestamps") + + +def separate_last_element(arr): + l = arr[-1] + return arr[:-1], l \ No newline at end of file diff --git a/src/guppy/readTevTsq.py b/src/guppy/readTevTsq.py index 96fd59e..6fdee1e 100755 --- a/src/guppy/readTevTsq.py +++ b/src/guppy/readTevTsq.py @@ -16,6 +16,7 @@ from guppy.common_step3 import write_hdf5 from guppy.tdt_step3 import execute_readtev +from guppy.doric_step3 import execute_import_doric logger = logging.getLogger(__name__) @@ -33,39 +34,6 @@ def writeToFile(value: str): 
file.write(value) -# function to check if doric file exists -def check_doric(filepath): - logger.debug("Checking if doric file exists") - path = glob.glob(os.path.join(filepath, "*.csv")) + glob.glob(os.path.join(filepath, "*.doric")) - - flag_arr = [] - for i in range(len(path)): - ext = os.path.basename(path[i]).split(".")[-1] - if ext == "csv": - with warnings.catch_warnings(): - warnings.simplefilter("error") - try: - df = pd.read_csv(path[i], index_col=False, dtype=float) - except: - df = pd.read_csv(path[i], header=1, index_col=False, nrows=10) - flag = "doric_csv" - flag_arr.append(flag) - elif ext == "doric": - flag = "doric_doric" - flag_arr.append(flag) - else: - pass - - if len(flag_arr) > 1: - logger.error("Two doric files are present at the same location") - raise Exception("Two doric files are present at the same location") - if len(flag_arr) == 0: - logger.error("\033[1m" + "Doric file not found." + "\033[1m") - return 0 - logger.info("Doric file found.") - return flag_arr[0] - - # function to read event timestamps csv file. def import_csv(filepath, event, outputPath): logger.debug("\033[1m" + "Trying to read data for {} from csv file.".format(event) + "\033[0m") @@ -120,27 +88,7 @@ def execute_import_csv(filepath, event, outputPath, numProcesses=mp.cpu_count()) logger.info("Time taken = {0:.5f}".format(time.time() - start)) -def access_data_doricV1(doric_file, storesList, outputPath): - keys = list(doric_file["Traces"]["Console"].keys()) - for i in range(storesList.shape[1]): - if "control" in storesList[1, i] or "signal" in storesList[1, i]: - timestamps = np.array(doric_file["Traces"]["Console"]["Time(s)"]["Console_time(s)"]) - sampling_rate = np.array([1 / (timestamps[-1] - timestamps[-2])]) - data = np.array(doric_file["Traces"]["Console"][storesList[0, i]][storesList[0, i]]) - write_hdf5(sampling_rate, storesList[0, i], outputPath, "sampling_rate") - write_hdf5(timestamps, storesList[0, i], outputPath, "timestamps") - write_hdf5(data, storesList[0, i], outputPath, "data") - else: - timestamps = np.array(doric_file["Traces"]["Console"]["Time(s)"]["Console_time(s)"]) - ttl = np.array(doric_file["Traces"]["Console"][storesList[0, i]][storesList[0, i]]) - indices = np.where(ttl <= 0)[0] - diff_indices = np.where(np.diff(indices) > 1)[0] - write_hdf5(timestamps[indices[diff_indices] + 1], storesList[0, i], outputPath, "timestamps") - -def separate_last_element(arr): - l = arr[-1] - return arr[:-1], l def find_string(regex, arr): @@ -149,96 +97,6 @@ def find_string(regex, arr): return i -def access_data_doricV6(doric_file, storesList, outputPath): - data = [doric_file["DataAcquisition"]] - res = [] - while len(data) != 0: - members = len(data) - while members != 0: - members -= 1 - data, last_element = separate_last_element(data) - if isinstance(last_element, h5py.Dataset) and not last_element.name.endswith("/Time"): - res.append(last_element.name) - elif isinstance(last_element, h5py.Group): - data.extend(reversed([last_element[k] for k in last_element.keys()])) - - decide_path = [] - for element in res: - sep_values = element.split("/") - if sep_values[-1] == "Values": - if f"{sep_values[-3]}/{sep_values[-2]}" in storesList[0, :]: - decide_path.append(element) - else: - if f"{sep_values[-2]}/{sep_values[-1]}" in storesList[0, :]: - decide_path.append(element) - - for i in range(storesList.shape[1]): - if "control" in storesList[1, i] or "signal" in storesList[1, i]: - regex = re.compile("(.*?)" + str(storesList[0, i]) + "(.*?)") - idx = [i for i in range(len(decide_path)) 
if regex.match(decide_path[i])] - if len(idx) > 1: - logger.error("More than one string matched (which should not be the case)") - raise Exception("More than one string matched (which should not be the case)") - idx = idx[0] - data = np.array(doric_file[decide_path[idx]]) - timestamps = np.array(doric_file[decide_path[idx].rsplit("/", 1)[0] + "/Time"]) - sampling_rate = np.array([1 / (timestamps[-1] - timestamps[-2])]) - write_hdf5(sampling_rate, storesList[0, i], outputPath, "sampling_rate") - write_hdf5(timestamps, storesList[0, i], outputPath, "timestamps") - write_hdf5(data, storesList[0, i], outputPath, "data") - else: - regex = re.compile("(.*?)" + storesList[0, i] + "$") - idx = [i for i in range(len(decide_path)) if regex.match(decide_path[i])] - if len(idx) > 1: - logger.error("More than one string matched (which should not be the case)") - raise Exception("More than one string matched (which should not be the case)") - idx = idx[0] - ttl = np.array(doric_file[decide_path[idx]]) - timestamps = np.array(doric_file[decide_path[idx].rsplit("/", 1)[0] + "/Time"]) - indices = np.where(ttl <= 0)[0] - diff_indices = np.where(np.diff(indices) > 1)[0] - write_hdf5(timestamps[indices[diff_indices] + 1], storesList[0, i], outputPath, "timestamps") - - -def execute_import_doric(filepath, storesList, flag, outputPath): - - if flag == "doric_csv": - path = glob.glob(os.path.join(filepath, "*.csv")) - if len(path) > 1: - logger.error("An error occurred : More than one Doric csv file present at the location") - raise Exception("More than one Doric csv file present at the location") - else: - df = pd.read_csv(path[0], header=1, index_col=False) - df = df.dropna(axis=1, how="all") - df = df.dropna(axis=0, how="any") - df["Time(s)"] = df["Time(s)"] - df["Time(s)"].to_numpy()[0] - for i in range(storesList.shape[1]): - if "control" in storesList[1, i] or "signal" in storesList[1, i]: - timestamps = np.array(df["Time(s)"]) - sampling_rate = np.array([1 / (timestamps[-1] - timestamps[-2])]) - write_hdf5(sampling_rate, storesList[0, i], outputPath, "sampling_rate") - write_hdf5(df["Time(s)"].to_numpy(), storesList[0, i], outputPath, "timestamps") - write_hdf5(df[storesList[0, i]].to_numpy(), storesList[0, i], outputPath, "data") - else: - ttl = df[storesList[0, i]] - indices = np.where(ttl <= 0)[0] - diff_indices = np.where(np.diff(indices) > 1)[0] - write_hdf5( - df["Time(s)"][indices[diff_indices] + 1].to_numpy(), storesList[0, i], outputPath, "timestamps" - ) - else: - path = glob.glob(os.path.join(filepath, "*.doric")) - if len(path) > 1: - logger.error("An error occurred : More than one Doric file present at the location") - raise Exception("More than one Doric file present at the location") - else: - with h5py.File(path[0], "r") as f: - if "Traces" in list(f.keys()): - keys = access_data_doricV1(f, storesList, outputPath) - elif list(f.keys()) == ["Configurations", "DataAcquisition"]: - keys = access_data_doricV6(f, storesList, outputPath) - - # function to read data from 'tsq' and 'tev' files def readRawData(inputParameters): @@ -266,7 +124,7 @@ def readRawData(inputParameters): filepath = folderNames[i] logger.debug(f"### Reading raw data for folder {folderNames[i]}") storesListPath = takeOnlyDirs(glob.glob(os.path.join(filepath, "*_output_*"))) - modality = "tdt" + modality = "doric" # read data corresponding to each storename selected by user while saving the storeslist file for j in range(len(storesListPath)): From 7abb8e09dd475ffcc1fb15156393ec04ac2c5c94 Mon Sep 17 00:00:00 2001 From: 
pauladkisson Date: Tue, 18 Nov 2025 13:31:15 -0800 Subject: [PATCH 015/125] Added check_doric to doric_step3.py. --- src/guppy/doric_step3.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/src/guppy/doric_step3.py b/src/guppy/doric_step3.py index 792c54e..2c30887 100644 --- a/src/guppy/doric_step3.py +++ b/src/guppy/doric_step3.py @@ -2,6 +2,7 @@ import logging import os import re +import warnings import h5py import numpy as np @@ -11,8 +12,39 @@ logger = logging.getLogger(__name__) +def check_doric(filepath): + logger.debug("Checking if doric file exists") + path = glob.glob(os.path.join(filepath, "*.csv")) + glob.glob(os.path.join(filepath, "*.doric")) + + flag_arr = [] + for i in range(len(path)): + ext = os.path.basename(path[i]).split(".")[-1] + if ext == "csv": + with warnings.catch_warnings(): + warnings.simplefilter("error") + try: + df = pd.read_csv(path[i], index_col=False, dtype=float) + except: + df = pd.read_csv(path[i], header=1, index_col=False, nrows=10) + flag = "doric_csv" + flag_arr.append(flag) + elif ext == "doric": + flag = "doric_doric" + flag_arr.append(flag) + else: + pass + + if len(flag_arr) > 1: + logger.error("Two doric files are present at the same location") + raise Exception("Two doric files are present at the same location") + if len(flag_arr) == 0: + logger.error("\033[1m" + "Doric file not found." + "\033[1m") + return 0 + logger.info("Doric file found.") + return flag_arr[0] def execute_import_doric(filepath, storesList, flag, outputPath): + flag = check_doric(filepath) if flag == "doric_csv": path = glob.glob(os.path.join(filepath, "*.csv")) From b653538fad3acf017085b8943d846a6b633d7d99 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Tue, 18 Nov 2025 13:45:55 -0800 Subject: [PATCH 016/125] Split csv_step3.py off from read_raw_data.py. --- src/guppy/csv_step3.py | 73 +++++++++++++++++++++++++++++++++++++ src/guppy/readTevTsq.py | 67 +--------------------------------- src/guppy/saveStoresList.py | 1 - 3 files changed, 75 insertions(+), 66 deletions(-) create mode 100644 src/guppy/csv_step3.py diff --git a/src/guppy/csv_step3.py b/src/guppy/csv_step3.py new file mode 100644 index 0000000..97d3eb5 --- /dev/null +++ b/src/guppy/csv_step3.py @@ -0,0 +1,73 @@ +import glob +import json +import logging +import multiprocessing as mp +import os +import re +import sys +import time +import warnings +from itertools import repeat + +import h5py +import numpy as np +import pandas as pd +from numpy import float32, float64, int32, int64, uint16 + +from guppy.common_step3 import write_hdf5 + +logger = logging.getLogger(__name__) + + +def execute_import_csv(filepath, event, outputPath, numProcesses=mp.cpu_count()): + # logger.info("Reading data for event {} ...".format(event)) + + start = time.time() + with mp.Pool(numProcesses) as p: + p.starmap(import_csv, zip(repeat(filepath), event, repeat(outputPath))) + logger.info("Time taken = {0:.5f}".format(time.time() - start)) + + +# function to read event timestamps csv file. 
+def import_csv(filepath, event, outputPath): + logger.debug("\033[1m" + "Trying to read data for {} from csv file.".format(event) + "\033[0m") + if not os.path.exists(os.path.join(filepath, event + ".csv")): + logger.error("\033[1m" + "No csv file found for event {}".format(event) + "\033[0m") + raise Exception("\033[1m" + "No csv file found for event {}".format(event) + "\033[0m") + + df = pd.read_csv(os.path.join(filepath, event + ".csv"), index_col=False) + data = df + key = list(df.columns) + + if len(key) == 3: + arr1 = np.array(["timestamps", "data", "sampling_rate"]) + arr2 = np.char.lower(np.array(key)) + if (np.sort(arr1) == np.sort(arr2)).all() == False: + logger.error("\033[1m" + "Column names should be timestamps, data and sampling_rate" + "\033[0m") + raise Exception("\033[1m" + "Column names should be timestamps, data and sampling_rate" + "\033[0m") + + if len(key) == 1: + if key[0].lower() != "timestamps": + logger.error("\033[1m" + "Column names should be timestamps, data and sampling_rate" + "\033[0m") + raise Exception("\033[1m" + "Column name should be timestamps" + "\033[0m") + + if len(key) != 3 and len(key) != 1: + logger.error( + "\033[1m" + + "Number of columns in csv file should be either three or one. Three columns if \ + the file is for control or signal data or one column if the file is for event TTLs." + + "\033[0m" + ) + raise Exception( + "\033[1m" + + "Number of columns in csv file should be either three or one. Three columns if \ + the file is for control or signal data or one column if the file is for event TTLs." + + "\033[0m" + ) + + for i in range(len(key)): + write_hdf5(data[key[i]].dropna(), event, outputPath, key[i].lower()) + + logger.info("\033[1m" + "Reading data for {} from csv file is completed.".format(event) + "\033[0m") + + return data, key \ No newline at end of file diff --git a/src/guppy/readTevTsq.py b/src/guppy/readTevTsq.py index 6fdee1e..c080b58 100755 --- a/src/guppy/readTevTsq.py +++ b/src/guppy/readTevTsq.py @@ -17,6 +17,7 @@ from guppy.common_step3 import write_hdf5 from guppy.tdt_step3 import execute_readtev from guppy.doric_step3 import execute_import_doric +from guppy.csv_step3 import execute_import_csv logger = logging.getLogger(__name__) @@ -33,70 +34,6 @@ def writeToFile(value: str): with open(os.path.join(os.path.expanduser("~"), "pbSteps.txt"), "a") as file: file.write(value) - -# function to read event timestamps csv file. 
-def import_csv(filepath, event, outputPath): - logger.debug("\033[1m" + "Trying to read data for {} from csv file.".format(event) + "\033[0m") - if not os.path.exists(os.path.join(filepath, event + ".csv")): - logger.error("\033[1m" + "No csv file found for event {}".format(event) + "\033[0m") - raise Exception("\033[1m" + "No csv file found for event {}".format(event) + "\033[0m") - - df = pd.read_csv(os.path.join(filepath, event + ".csv"), index_col=False) - data = df - key = list(df.columns) - - if len(key) == 3: - arr1 = np.array(["timestamps", "data", "sampling_rate"]) - arr2 = np.char.lower(np.array(key)) - if (np.sort(arr1) == np.sort(arr2)).all() == False: - logger.error("\033[1m" + "Column names should be timestamps, data and sampling_rate" + "\033[0m") - raise Exception("\033[1m" + "Column names should be timestamps, data and sampling_rate" + "\033[0m") - - if len(key) == 1: - if key[0].lower() != "timestamps": - logger.error("\033[1m" + "Column names should be timestamps, data and sampling_rate" + "\033[0m") - raise Exception("\033[1m" + "Column name should be timestamps" + "\033[0m") - - if len(key) != 3 and len(key) != 1: - logger.error( - "\033[1m" - + "Number of columns in csv file should be either three or one. Three columns if \ - the file is for control or signal data or one column if the file is for event TTLs." - + "\033[0m" - ) - raise Exception( - "\033[1m" - + "Number of columns in csv file should be either three or one. Three columns if \ - the file is for control or signal data or one column if the file is for event TTLs." - + "\033[0m" - ) - - for i in range(len(key)): - write_hdf5(data[key[i]].dropna(), event, outputPath, key[i].lower()) - - logger.info("\033[1m" + "Reading data for {} from csv file is completed.".format(event) + "\033[0m") - - return data, key - - -def execute_import_csv(filepath, event, outputPath, numProcesses=mp.cpu_count()): - # logger.info("Reading data for event {} ...".format(event)) - - start = time.time() - with mp.Pool(numProcesses) as p: - p.starmap(import_csv, zip(repeat(filepath), event, repeat(outputPath))) - logger.info("Time taken = {0:.5f}".format(time.time() - start)) - - - - - -def find_string(regex, arr): - for i in range(len(arr)): - if regex.match(arr[i]): - return i - - # function to read data from 'tsq' and 'tev' files def readRawData(inputParameters): @@ -124,7 +61,7 @@ def readRawData(inputParameters): filepath = folderNames[i] logger.debug(f"### Reading raw data for folder {folderNames[i]}") storesListPath = takeOnlyDirs(glob.glob(os.path.join(filepath, "*_output_*"))) - modality = "doric" + modality = "csv" # read data corresponding to each storename selected by user while saving the storeslist file for j in range(len(storesListPath)): diff --git a/src/guppy/saveStoresList.py b/src/guppy/saveStoresList.py index c2867ba..a432546 100755 --- a/src/guppy/saveStoresList.py +++ b/src/guppy/saveStoresList.py @@ -21,7 +21,6 @@ import panel as pn from numpy import float32, float64, int32, int64, uint16 -from guppy.readTevTsq import import_csv from guppy.tdt_step2 import readtsq from guppy.csv_step2 import import_csv_step2 from guppy.doric_step2 import import_doric From 6d661c291a389f16d7e0c109b2510622c7892289 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Tue, 18 Nov 2025 15:11:32 -0800 Subject: [PATCH 017/125] Added modality to Step 3. 
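
Step 3 no longer sniffs the recording type from the files on disk: readRawData()
now takes the modality from the input parameters, and the testing API
(step3/step4/step5) injects the same "modality" key. A minimal, runnable sketch of
the resulting dispatch — the handler bodies below are placeholders; in the real
code they are execute_readtev, execute_import_doric and execute_import_csv:

    def dispatch_by_modality(input_parameters):
        # "modality" is supplied by the caller, e.g. injected by the testing API
        modality = input_parameters["modality"]
        if modality == "tdt":
            return "read TDT .tev/.tsq files"
        elif modality == "doric":
            return "read Doric .csv/.doric files"
        elif modality == "csv" or modality == "npm":
            return "read plain csv / Neurophotometrics csv files"
        else:
            raise ValueError("Modality not recognized. Please use 'tdt', 'csv', 'doric', or 'npm'.")

    print(dispatch_by_modality({"modality": "tdt"}))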
--- src/guppy/readTevTsq.py | 2 +- src/guppy/testing/api.py | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/src/guppy/readTevTsq.py b/src/guppy/readTevTsq.py index c080b58..e0bedfa 100755 --- a/src/guppy/readTevTsq.py +++ b/src/guppy/readTevTsq.py @@ -42,6 +42,7 @@ def readRawData(inputParameters): inputParameters = inputParameters folderNames = inputParameters["folderNames"] numProcesses = inputParameters["numberOfCores"] + modality = inputParameters["modality"] storesListPath = [] if numProcesses == 0: numProcesses = mp.cpu_count() @@ -61,7 +62,6 @@ def readRawData(inputParameters): filepath = folderNames[i] logger.debug(f"### Reading raw data for folder {folderNames[i]}") storesListPath = takeOnlyDirs(glob.glob(os.path.join(filepath, "*_output_*"))) - modality = "csv" # read data corresponding to each storename selected by user while saving the storeslist file for j in range(len(storesListPath)): diff --git a/src/guppy/testing/api.py b/src/guppy/testing/api.py index 0e16f23..d7e390d 100644 --- a/src/guppy/testing/api.py +++ b/src/guppy/testing/api.py @@ -237,6 +237,9 @@ def step3( input_params["npm_time_unit"] = npm_time_unit input_params["npm_split_events"] = npm_split_events + # Inject modality + input_params["modality"] = modality + # Call the underlying Step 3 worker directly (no subprocess) readRawData(input_params) @@ -315,6 +318,9 @@ def step4( input_params["npm_time_unit"] = npm_time_unit input_params["npm_split_events"] = npm_split_events + # Inject modality + input_params["modality"] = modality + # Call the underlying Step 4 worker directly (no subprocess) extractTsAndSignal(input_params) @@ -393,6 +399,9 @@ def step5( input_params["npm_time_unit"] = npm_time_unit input_params["npm_split_events"] = npm_split_events + # Inject modality + input_params["modality"] = modality + # Call the underlying Step 5 worker directly (no subprocess) psthForEachStorename(input_params) From a4f6583ecbd3071929551ecf71df4a5716a49791 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 19 Nov 2025 09:12:07 -0800 Subject: [PATCH 018/125] Added tdtRecordingExtractor --- src/guppy/extractors/__init__.py | 1 + .../extractors/tdt_recording_extractor.py | 197 ++++++++++++++++++ src/guppy/readTevTsq.py | 11 +- src/guppy/saveStoresList.py | 5 +- 4 files changed, 211 insertions(+), 3 deletions(-) create mode 100644 src/guppy/extractors/__init__.py create mode 100644 src/guppy/extractors/tdt_recording_extractor.py diff --git a/src/guppy/extractors/__init__.py b/src/guppy/extractors/__init__.py new file mode 100644 index 0000000..249daf9 --- /dev/null +++ b/src/guppy/extractors/__init__.py @@ -0,0 +1 @@ +from .tdt_recording_extractor import TdtRecordingExtractor \ No newline at end of file diff --git a/src/guppy/extractors/tdt_recording_extractor.py b/src/guppy/extractors/tdt_recording_extractor.py new file mode 100644 index 0000000..98ae3cd --- /dev/null +++ b/src/guppy/extractors/tdt_recording_extractor.py @@ -0,0 +1,197 @@ +import glob +import logging +import os +import numpy as np +from numpy import float32, float64, int32, int64, uint16 +import pandas as pd +import multiprocessing as mp +import time +from itertools import repeat + +from guppy.common_step3 import write_hdf5 + +logger = logging.getLogger(__name__) + +class TdtRecordingExtractor: + + def __init__(self, folder_path): + self.folder_path = folder_path + self.header_df, _ = self.readtsq(folder_path) + + def readtsq(self, folder_path): + logger.debug("Trying to read tsq file.") + names = ("size", "type", "name", 
"chan", "sort_code", "timestamp", "fp_loc", "strobe", "format", "frequency") + formats = (int32, int32, "S4", uint16, uint16, float64, int64, float64, int32, float32) + offsets = 0, 4, 8, 12, 14, 16, 24, 24, 32, 36 + tsq_dtype = np.dtype({"names": names, "formats": formats, "offsets": offsets}, align=True) + path = glob.glob(os.path.join(folder_path, "*.tsq")) + if len(path) > 1: + logger.error("Two tsq files are present at the location.") + raise Exception("Two tsq files are present at the location.") + elif len(path) == 0: + logger.info("\033[1m" + "tsq file not found." + "\033[1m") + return 0, 0 + else: + path = path[0] + flag = "tsq" + + # reading tsq file + tsq = np.fromfile(path, dtype=tsq_dtype) + + # creating dataframe of the data + df = pd.DataFrame(tsq) + + logger.info("Data from tsq file fetched.") + return df, flag + + # function to execute readtev function using multiprocessing to make it faster + def execute_readtev(self, filepath, event, outputPath, numProcesses=mp.cpu_count()): + start = time.time() + with mp.Pool(numProcesses) as p: + p.starmap(self.readtev, zip(repeat(self.header_df), repeat(filepath), event, repeat(outputPath))) + logger.info("Time taken = {0:.5f}".format(time.time() - start)) + + + # function to read tev file + def readtev(self, event): + data = self.header_df + filepath = self.folder_path + + logger.debug("Reading data for event {} ...".format(event)) + tevfilepath = glob.glob(os.path.join(filepath, "*.tev")) + if len(tevfilepath) > 1: + raise Exception("Two tev files are present at the location.") + else: + tevfilepath = tevfilepath[0] + + data["name"] = np.asarray(data["name"], dtype=str) + + allnames = np.unique(data["name"]) + + index = [] + for i in range(len(allnames)): + length = len(str(allnames[i])) + if length < 4: + index.append(i) + + allnames = np.delete(allnames, index, 0) + + eventNew = np.array(list(event)) + + # logger.info(allnames) + # logger.info(eventNew) + row = self.ismember(data["name"], event) + + if sum(row) == 0: + logger.error("\033[1m" + "Requested store name " + event + " not found (case-sensitive)." + "\033[0m") + logger.error("\033[1m" + "File contains the following TDT store names:" + "\033[0m") + logger.error("\033[1m" + str(allnames) + "\033[0m") + logger.error("\033[1m" + "TDT store name " + str(event) + " not found." 
+ "\033[0m") + raise ValueError("Requested store name not found.") + + + + allIndexesWhereEventIsPresent = np.where(row == 1) + first_row = allIndexesWhereEventIsPresent[0][0] + + formatNew = data["format"][first_row] + 1 + + table = np.array( + [ + [0, 0, 0, 0], + [0, "float", 1, np.float32], + [0, "long", 1, np.int32], + [0, "short", 2, np.int16], + [0, "byte", 4, np.int8], + ] + ) + + S = dict() + + S["storename"] = str(event) + S["sampling_rate"] = data["frequency"][first_row] + S["timestamps"] = np.asarray(data["timestamp"][allIndexesWhereEventIsPresent[0]]) + S["channels"] = np.asarray(data["chan"][allIndexesWhereEventIsPresent[0]]) + + fp_loc = np.asarray(data["fp_loc"][allIndexesWhereEventIsPresent[0]]) + data_size = np.asarray(data["size"]) + + if formatNew != 5: + nsample = (data_size[first_row,] - 10) * int(table[formatNew, 2]) + S["data"] = np.zeros((len(fp_loc), nsample)) + for i in range(0, len(fp_loc)): + with open(tevfilepath, "rb") as fp: + fp.seek(fp_loc[i], os.SEEK_SET) + S["data"][i, :] = np.fromfile(fp, dtype=table[formatNew, 3], count=nsample).reshape( + 1, nsample, order="F" + ) + # S['data'] = S['data'].swapaxes() + S["npoints"] = nsample + else: + S["data"] = np.asarray(data["strobe"][allIndexesWhereEventIsPresent[0]]) + S["npoints"] = 1 + S["channels"] = np.tile(1, (S["data"].shape[0],)) + + S["data"] = (S["data"].T).reshape(-1, order="F") + + return S + + # check if a particular element is there in an array or not + def ismember(self, arr, element): # TODO: replace this function with more standard usage + res = [1 if i == element else 0 for i in arr] + return np.asarray(res) + + + # function to save data read from tev file to hdf5 file + def save_dict_to_hdf5(self, S, event, outputPath): + write_hdf5(S["storename"], event, outputPath, "storename") + write_hdf5(S["sampling_rate"], event, outputPath, "sampling_rate") + write_hdf5(S["timestamps"], event, outputPath, "timestamps") + + write_hdf5(S["data"], event, outputPath, "data") + write_hdf5(S["npoints"], event, outputPath, "npoints") + write_hdf5(S["channels"], event, outputPath, "channels") + + + # function to check event data (checking whether event timestamps belongs to same event or multiple events) + def check_data(self, S, event, outputPath): + # logger.info("Checking event storename data for creating multiple event names from single event storename...") + new_event = event.replace("\\", "") + new_event = event.replace("/", "") + diff = np.diff(S["data"]) + arr = np.full(diff.shape[0], 1) + + storesList = np.genfromtxt(os.path.join(outputPath, "storesList.csv"), dtype="str", delimiter=",").reshape(2, -1) + + if diff.shape[0] == 0: + return 0 + + if S["sampling_rate"] == 0 and np.all(diff == diff[0]) == False: + logger.info("\033[1m" + "Data in event {} belongs to multiple behavior".format(event) + "\033[0m") + logger.debug( + "\033[1m" + "Create timestamp files for individual new event and change the stores list file." 
+ "\033[0m" + ) + i_d = np.unique(S["data"]) + for i in range(i_d.shape[0]): + new_S = dict() + idx = np.where(S["data"] == i_d[i])[0] + new_S["timestamps"] = S["timestamps"][idx] + new_S["storename"] = new_event + str(int(i_d[i])) + new_S["sampling_rate"] = S["sampling_rate"] + new_S["data"] = S["data"] + new_S["npoints"] = S["npoints"] + new_S["channels"] = S["channels"] + storesList = np.concatenate( + (storesList, [[new_event + str(int(i_d[i]))], [new_event + "_" + str(int(i_d[i]))]]), axis=1 + ) + self.save_dict_to_hdf5(new_S, new_event + str(int(i_d[i])), outputPath) + + idx = np.where(storesList[0] == event)[0] + storesList = np.delete(storesList, idx, axis=1) + if not os.path.exists(os.path.join(outputPath, ".cache_storesList.csv")): + os.rename(os.path.join(outputPath, "storesList.csv"), os.path.join(outputPath, ".cache_storesList.csv")) + if idx.shape[0] == 0: + pass + else: + np.savetxt(os.path.join(outputPath, "storesList.csv"), storesList, delimiter=",", fmt="%s") + logger.info("\033[1m Timestamp files for individual new event are created and the stores list file is changed.\033[0m") \ No newline at end of file diff --git a/src/guppy/readTevTsq.py b/src/guppy/readTevTsq.py index e0bedfa..d3c9147 100755 --- a/src/guppy/readTevTsq.py +++ b/src/guppy/readTevTsq.py @@ -16,6 +16,7 @@ from guppy.common_step3 import write_hdf5 from guppy.tdt_step3 import execute_readtev +from guppy.extractors import TdtRecordingExtractor from guppy.doric_step3 import execute_import_doric from guppy.csv_step3 import execute_import_csv @@ -76,7 +77,15 @@ def readRawData(inputParameters): ) if modality == "tdt": - execute_readtev(filepath, np.unique(storesList[0, :]), op, numProcesses) + # execute_readtev(filepath, np.unique(storesList[0, :]), op, numProcesses) + extractor = TdtRecordingExtractor(folder_path=filepath) + event = np.unique(storesList[0, :]) + for e in event: + S = extractor.readtev(event=e) + extractor.save_dict_to_hdf5(S=S, event=e, outputPath=op) + extractor.check_data(S=S, event=e, outputPath=op) + logger.info("Data for event {} fetched and stored.".format(e)) + elif modality == "doric": execute_import_doric(filepath, storesList, modality, op) elif modality == "csv" or modality == "npm": diff --git a/src/guppy/saveStoresList.py b/src/guppy/saveStoresList.py index a432546..79fa71a 100755 --- a/src/guppy/saveStoresList.py +++ b/src/guppy/saveStoresList.py @@ -21,7 +21,7 @@ import panel as pn from numpy import float32, float64, int32, int64, uint16 -from guppy.tdt_step2 import readtsq +from guppy.extractors import TdtRecordingExtractor from guppy.csv_step2 import import_csv_step2 from guppy.doric_step2 import import_doric from guppy.npm_step2 import import_npm @@ -589,7 +589,8 @@ def execute(inputParameters): for i in folderNames: filepath = os.path.join(inputParameters["abspath"], i) if modality == "tdt": - data = readtsq(filepath) + extractor = TdtRecordingExtractor(folder_path=filepath) + data = extractor.header_df event_name, flag = [], [] elif modality == "csv": data = 0 From 882556e8b72fca014f51f3cd71ec2b51b9368b4d Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 19 Nov 2025 09:49:16 -0800 Subject: [PATCH 019/125] Adapted parallel execute function to use new extractor. 
--- .../extractors/tdt_recording_extractor.py | 23 +++++++++++-------- src/guppy/readTevTsq.py | 11 ++------- 2 files changed, 16 insertions(+), 18 deletions(-) diff --git a/src/guppy/extractors/tdt_recording_extractor.py b/src/guppy/extractors/tdt_recording_extractor.py index 98ae3cd..c0b01f9 100644 --- a/src/guppy/extractors/tdt_recording_extractor.py +++ b/src/guppy/extractors/tdt_recording_extractor.py @@ -12,6 +12,19 @@ logger = logging.getLogger(__name__) +# function to execute readtev function using multiprocessing to make it faster +def execute_readtev(folder_path, events, outputPath, numProcesses=mp.cpu_count()): + extractor = TdtRecordingExtractor(folder_path=folder_path) + start = time.time() + with mp.Pool(numProcesses) as p: + p.starmap(read_tdt_and_save_hdf5, zip(repeat(extractor), events, repeat(outputPath))) + logger.info("Time taken = {0:.5f}".format(time.time() - start)) + +def read_tdt_and_save_hdf5(extractor, event, outputPath): + S = extractor.readtev(event=event) + extractor.save_dict_to_hdf5(S=S, event=event, outputPath=outputPath) + logger.info("Data for event {} fetched and stored.".format(event)) + class TdtRecordingExtractor: def __init__(self, folder_path): @@ -43,14 +56,6 @@ def readtsq(self, folder_path): logger.info("Data from tsq file fetched.") return df, flag - - # function to execute readtev function using multiprocessing to make it faster - def execute_readtev(self, filepath, event, outputPath, numProcesses=mp.cpu_count()): - start = time.time() - with mp.Pool(numProcesses) as p: - p.starmap(self.readtev, zip(repeat(self.header_df), repeat(filepath), event, repeat(outputPath))) - logger.info("Time taken = {0:.5f}".format(time.time() - start)) - # function to read tev file def readtev(self, event): @@ -154,7 +159,7 @@ def save_dict_to_hdf5(self, S, event, outputPath): # function to check event data (checking whether event timestamps belongs to same event or multiple events) - def check_data(self, S, event, outputPath): + def check_data(self, S, event, outputPath): # TODO: fold this function into the main read/get function # logger.info("Checking event storename data for creating multiple event names from single event storename...") new_event = event.replace("\\", "") new_event = event.replace("/", "") diff --git a/src/guppy/readTevTsq.py b/src/guppy/readTevTsq.py index d3c9147..47b7962 100755 --- a/src/guppy/readTevTsq.py +++ b/src/guppy/readTevTsq.py @@ -16,7 +16,6 @@ from guppy.common_step3 import write_hdf5 from guppy.tdt_step3 import execute_readtev -from guppy.extractors import TdtRecordingExtractor from guppy.doric_step3 import execute_import_doric from guppy.csv_step3 import execute_import_csv @@ -77,14 +76,8 @@ def readRawData(inputParameters): ) if modality == "tdt": - # execute_readtev(filepath, np.unique(storesList[0, :]), op, numProcesses) - extractor = TdtRecordingExtractor(folder_path=filepath) - event = np.unique(storesList[0, :]) - for e in event: - S = extractor.readtev(event=e) - extractor.save_dict_to_hdf5(S=S, event=e, outputPath=op) - extractor.check_data(S=S, event=e, outputPath=op) - logger.info("Data for event {} fetched and stored.".format(e)) + events = np.unique(storesList[0, :]) + execute_readtev(filepath, events, op, numProcesses) elif modality == "doric": execute_import_doric(filepath, storesList, modality, op) From df7b9e160a46723c12193946d2aebaa156fe336c Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 19 Nov 2025 11:09:51 -0800 Subject: [PATCH 020/125] Added CsvRecordingExtractor for step 2 --- 
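Note for reviewers (kept below the --- so it stays out of the commit message): the
new extractor is consumed in saveStoresList.execute() essentially as sketched below;
the folder path is a placeholder and is assumed to contain GuPPy-style event/data
csv files:

    from guppy.extractors import CsvRecordingExtractor

    filepath = "/path/to/recording_folder"        # placeholder
    extractor = CsvRecordingExtractor(folder_path=filepath)
    event_name = extractor.events                 # store names, taken from the csv file names
    flag = extractor.flags                        # "event_csv" or "data_csv" per file
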
src/guppy/extractors/__init__.py | 3 +- .../extractors/csv_recording_extractor.py | 115 ++++++++++++++++++ src/guppy/saveStoresList.py | 15 +-- 3 files changed, 123 insertions(+), 10 deletions(-) create mode 100644 src/guppy/extractors/csv_recording_extractor.py diff --git a/src/guppy/extractors/__init__.py b/src/guppy/extractors/__init__.py index 249daf9..812622b 100644 --- a/src/guppy/extractors/__init__.py +++ b/src/guppy/extractors/__init__.py @@ -1 +1,2 @@ -from .tdt_recording_extractor import TdtRecordingExtractor \ No newline at end of file +from .tdt_recording_extractor import TdtRecordingExtractor +from .csv_recording_extractor import CsvRecordingExtractor diff --git a/src/guppy/extractors/csv_recording_extractor.py b/src/guppy/extractors/csv_recording_extractor.py new file mode 100644 index 0000000..f5a73e9 --- /dev/null +++ b/src/guppy/extractors/csv_recording_extractor.py @@ -0,0 +1,115 @@ +import glob +import logging +import os + +import numpy as np +import pandas as pd + +logger = logging.getLogger(__name__) + + +class CsvRecordingExtractor: + + def __init__(self, folder_path): + self.folder_path = folder_path + + logger.debug("If it exists, importing either NPM or Doric or csv file based on the structure of file") + path = sorted(glob.glob(os.path.join(self.folder_path, "*.csv"))) + + path = sorted(list(set(path))) + flag = "None" + event_from_filename = [] + flag_arr = [] + for i in range(len(path)): + ext = os.path.basename(path[i]).split(".")[-1] + assert ext == "csv", "Only .csv files are supported by import_csv function." + df = pd.read_csv(path[i], header=None, nrows=2, index_col=False, dtype=str) + df = df.dropna(axis=1, how="all") + df_arr = np.array(df).flatten() + check_all_str = [] + for element in df_arr: + try: + float(element) + except: + check_all_str.append(i) + assert len(check_all_str) != len( + df_arr + ), "This file appears to be doric .csv. This function only supports standard .csv files." + df = pd.read_csv(path[i], index_col=False) + + _, value = self.check_header(df) + + # check dataframe structure and read data accordingly + if len(value) > 0: + columns_isstr = False + df = pd.read_csv(path[i], header=None) + cols = np.array(list(df.columns), dtype=str) + else: + df = df + columns_isstr = True + cols = np.array(list(df.columns), dtype=str) + # check the structure of dataframe and assign flag to the type of file + if len(cols) == 1: + if cols[0].lower() != "timestamps": + logger.error("\033[1m" + "Column name should be timestamps (all lower-cases)" + "\033[0m") + raise Exception("\033[1m" + "Column name should be timestamps (all lower-cases)" + "\033[0m") + else: + flag = "event_csv" + elif len(cols) == 3: + arr1 = np.array(["timestamps", "data", "sampling_rate"]) + arr2 = np.char.lower(np.array(cols)) + if (np.sort(arr1) == np.sort(arr2)).all() == False: + logger.error( + "\033[1m" + + "Column names should be timestamps, data and sampling_rate (all lower-cases)" + + "\033[0m" + ) + raise Exception( + "\033[1m" + + "Column names should be timestamps, data and sampling_rate (all lower-cases)" + + "\033[0m" + ) + else: + flag = "data_csv" + elif len(cols) == 2: + raise ValueError( + "Data appears to be Neurophotometrics csv. Please use import_npm_csv function to import the data." + ) + elif len(cols) >= 2: + raise ValueError( + "Data appears to be Neurophotometrics csv. Please use import_npm_csv function to import the data." 
+ ) + else: + logger.error("Number of columns in csv file does not make sense.") + raise Exception("Number of columns in csv file does not make sense.") + + if columns_isstr == True and ( + "flags" in np.char.lower(np.array(cols)) or "ledstate" in np.char.lower(np.array(cols)) + ): + flag = flag + "_v2" + else: + flag = flag + + flag_arr.append(flag) + logger.info(flag) + assert ( + flag == "event_csv" or flag == "data_csv" + ), "This function only supports standard event_csv and data_csv files." + name = os.path.basename(path[i]).split(".")[0] + event_from_filename.append(name) + + logger.info("Importing of csv file is done.") + + self.events = event_from_filename + self.flags = flag_arr + + def check_header(self, df): + arr = list(df.columns) + check_float = [] + for i in arr: + try: + check_float.append(float(i)) + except: + pass + + return arr, check_float diff --git a/src/guppy/saveStoresList.py b/src/guppy/saveStoresList.py index 79fa71a..e64be8c 100755 --- a/src/guppy/saveStoresList.py +++ b/src/guppy/saveStoresList.py @@ -9,21 +9,16 @@ import logging import os import socket -import tkinter as tk from pathlib import Path from random import randint -from tkinter import StringVar, messagebox, ttk -import h5py import holoviews as hv import numpy as np import pandas as pd import panel as pn -from numpy import float32, float64, int32, int64, uint16 -from guppy.extractors import TdtRecordingExtractor -from guppy.csv_step2 import import_csv_step2 from guppy.doric_step2 import import_doric +from guppy.extractors import CsvRecordingExtractor, TdtRecordingExtractor from guppy.npm_step2 import import_npm # hv.extension() @@ -573,7 +568,6 @@ def save_button(event=None): template.show(port=number) - # function to read input parameters and run the saveStorenames function def execute(inputParameters): @@ -594,7 +588,10 @@ def execute(inputParameters): event_name, flag = [], [] elif modality == "csv": data = 0 - event_name, flag = import_csv_step2(filepath) + extractor = CsvRecordingExtractor(folder_path=filepath) + event_name = extractor.events + flag = extractor.flags + elif modality == "doric": data = 0 event_name, flag = import_doric(filepath) @@ -603,7 +600,7 @@ def execute(inputParameters): event_name, flag = import_npm(filepath, num_ch) else: raise ValueError("Modality not recognized. Please use 'tdt', 'csv', 'doric', or 'npm'.") - + saveStorenames(inputParameters, data, event_name, flag, filepath) logger.info("#" * 400) except Exception as e: From bcb78a51d52b54f3126d50260e735c4929da3a4e Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 19 Nov 2025 11:10:24 -0800 Subject: [PATCH 021/125] Installed pre-commit. 
--- src/guppy/common_step3.py | 13 ++------ src/guppy/csv_step2.py | 21 +++++++++--- src/guppy/csv_step3.py | 9 +----- src/guppy/doric_step2.py | 9 ++++-- src/guppy/doric_step3.py | 5 +-- .../extractors/tdt_recording_extractor.py | 32 +++++++++++-------- src/guppy/npm_step2.py | 13 +++++--- src/guppy/readTevTsq.py | 13 ++------ src/guppy/savingInputParameters.py | 4 ++- src/guppy/tdt_step2.py | 6 ++-- src/guppy/tdt_step3.py | 12 +++---- 11 files changed, 70 insertions(+), 67 deletions(-) diff --git a/src/guppy/common_step3.py b/src/guppy/common_step3.py index 4ea5c95..09e763f 100644 --- a/src/guppy/common_step3.py +++ b/src/guppy/common_step3.py @@ -1,21 +1,12 @@ -import glob -import json import logging -import multiprocessing as mp import os -import re -import sys -import time -import warnings -from itertools import repeat import h5py import numpy as np -import pandas as pd -from numpy import float32, float64, int32, int64, uint16 logger = logging.getLogger(__name__) + # function to write data to a hdf5 file def write_hdf5(data, event, filepath, key): @@ -48,4 +39,4 @@ def write_hdf5(data, event, filepath, key): if type(data) is np.ndarray: f.create_dataset(key, data=data, maxshape=(None,), chunks=True) else: - f.create_dataset(key, data=data) \ No newline at end of file + f.create_dataset(key, data=data) diff --git a/src/guppy/csv_step2.py b/src/guppy/csv_step2.py index 4d9b800..ba4b34f 100644 --- a/src/guppy/csv_step2.py +++ b/src/guppy/csv_step2.py @@ -1,11 +1,13 @@ import glob import logging import os + import numpy as np import pandas as pd logger = logging.getLogger(__name__) + def check_header(df): arr = list(df.columns) check_float = [] @@ -17,6 +19,7 @@ def check_header(df): return arr, check_float + def import_csv_step2(filepath): logger.debug("If it exists, importing either NPM or Doric or csv file based on the structure of file") path = sorted(glob.glob(os.path.join(filepath, "*.csv"))) @@ -37,7 +40,9 @@ def import_csv_step2(filepath): float(element) except: check_all_str.append(i) - assert len(check_all_str) != len(df_arr), "This file appears to be doric .csv. This function only supports standard .csv files." + assert len(check_all_str) != len( + df_arr + ), "This file appears to be doric .csv. This function only supports standard .csv files." df = pd.read_csv(path[i], index_col=False) _, value = check_header(df) @@ -75,9 +80,13 @@ def import_csv_step2(filepath): else: flag = "data_csv" elif len(cols) == 2: - raise ValueError("Data appears to be Neurophotometrics csv. Please use import_npm_csv function to import the data.") + raise ValueError( + "Data appears to be Neurophotometrics csv. Please use import_npm_csv function to import the data." + ) elif len(cols) >= 2: - raise ValueError("Data appears to be Neurophotometrics csv. Please use import_npm_csv function to import the data.") + raise ValueError( + "Data appears to be Neurophotometrics csv. Please use import_npm_csv function to import the data." + ) else: logger.error("Number of columns in csv file does not make sense.") raise Exception("Number of columns in csv file does not make sense.") @@ -91,9 +100,11 @@ def import_csv_step2(filepath): flag_arr.append(flag) logger.info(flag) - assert flag == "event_csv" or flag == "data_csv", "This function only supports standard event_csv and data_csv files." + assert ( + flag == "event_csv" or flag == "data_csv" + ), "This function only supports standard event_csv and data_csv files." 
name = os.path.basename(path[i]).split(".")[0] event_from_filename.append(name) logger.info("Importing of csv file is done.") - return event_from_filename, flag_arr \ No newline at end of file + return event_from_filename, flag_arr diff --git a/src/guppy/csv_step3.py b/src/guppy/csv_step3.py index 97d3eb5..985959a 100644 --- a/src/guppy/csv_step3.py +++ b/src/guppy/csv_step3.py @@ -1,18 +1,11 @@ -import glob -import json import logging import multiprocessing as mp import os -import re -import sys import time -import warnings from itertools import repeat -import h5py import numpy as np import pandas as pd -from numpy import float32, float64, int32, int64, uint16 from guppy.common_step3 import write_hdf5 @@ -70,4 +63,4 @@ def import_csv(filepath, event, outputPath): logger.info("\033[1m" + "Reading data for {} from csv file is completed.".format(event) + "\033[0m") - return data, key \ No newline at end of file + return data, key diff --git a/src/guppy/doric_step2.py b/src/guppy/doric_step2.py index bf402d1..26ab22e 100644 --- a/src/guppy/doric_step2.py +++ b/src/guppy/doric_step2.py @@ -8,6 +8,7 @@ logger = logging.getLogger(__name__) + def import_doric(filepath): logger.debug("If it exists, importing Doric file based on the structure of file") @@ -33,7 +34,9 @@ def import_doric(filepath): float(element) except: check_all_str.append(i) - assert len(check_all_str) == len(df_arr), "This file appears to be standard .csv. This function only supports doric .csv files." + assert len(check_all_str) == len( + df_arr + ), "This file appears to be standard .csv. This function only supports doric .csv files." df = pd.read_csv(path[i], header=1, index_col=False, nrows=10) df = df.drop(["Time(s)"], axis=1) event_from_filename.extend(list(df.columns)) @@ -52,6 +55,7 @@ def read_doric(filepath): return keys + def access_keys_doricV6(doric_file): data = [doric_file["DataAcquisition"]] res = [] @@ -82,6 +86,7 @@ def access_keys_doricV1(doric_file): return keys + def separate_last_element(arr): l = arr[-1] - return arr[:-1], l \ No newline at end of file + return arr[:-1], l diff --git a/src/guppy/doric_step3.py b/src/guppy/doric_step3.py index 2c30887..e9fd7cc 100644 --- a/src/guppy/doric_step3.py +++ b/src/guppy/doric_step3.py @@ -12,6 +12,7 @@ logger = logging.getLogger(__name__) + def check_doric(filepath): logger.debug("Checking if doric file exists") path = glob.glob(os.path.join(filepath, "*.csv")) + glob.glob(os.path.join(filepath, "*.doric")) @@ -43,6 +44,7 @@ def check_doric(filepath): logger.info("Doric file found.") return flag_arr[0] + def execute_import_doric(filepath, storesList, flag, outputPath): flag = check_doric(filepath) @@ -83,7 +85,6 @@ def execute_import_doric(filepath, storesList, flag, outputPath): keys = access_data_doricV6(f, storesList, outputPath) - def access_data_doricV6(doric_file, storesList, outputPath): data = [doric_file["DataAcquisition"]] res = [] @@ -155,4 +156,4 @@ def access_data_doricV1(doric_file, storesList, outputPath): def separate_last_element(arr): l = arr[-1] - return arr[:-1], l \ No newline at end of file + return arr[:-1], l diff --git a/src/guppy/extractors/tdt_recording_extractor.py b/src/guppy/extractors/tdt_recording_extractor.py index c0b01f9..1d46b1e 100644 --- a/src/guppy/extractors/tdt_recording_extractor.py +++ b/src/guppy/extractors/tdt_recording_extractor.py @@ -1,17 +1,19 @@ import glob import logging -import os -import numpy as np -from numpy import float32, float64, int32, int64, uint16 -import pandas as pd import multiprocessing as mp 
+import os import time from itertools import repeat +import numpy as np +import pandas as pd +from numpy import float32, float64, int32, int64, uint16 + from guppy.common_step3 import write_hdf5 logger = logging.getLogger(__name__) + # function to execute readtev function using multiprocessing to make it faster def execute_readtev(folder_path, events, outputPath, numProcesses=mp.cpu_count()): extractor = TdtRecordingExtractor(folder_path=folder_path) @@ -20,11 +22,13 @@ def execute_readtev(folder_path, events, outputPath, numProcesses=mp.cpu_count() p.starmap(read_tdt_and_save_hdf5, zip(repeat(extractor), events, repeat(outputPath))) logger.info("Time taken = {0:.5f}".format(time.time() - start)) + def read_tdt_and_save_hdf5(extractor, event, outputPath): S = extractor.readtev(event=event) extractor.save_dict_to_hdf5(S=S, event=event, outputPath=outputPath) logger.info("Data for event {} fetched and stored.".format(event)) + class TdtRecordingExtractor: def __init__(self, folder_path): @@ -94,8 +98,6 @@ def readtev(self, event): logger.error("\033[1m" + "TDT store name " + str(event) + " not found." + "\033[0m") raise ValueError("Requested store name not found.") - - allIndexesWhereEventIsPresent = np.where(row == 1) first_row = allIndexesWhereEventIsPresent[0][0] @@ -142,11 +144,10 @@ def readtev(self, event): return S # check if a particular element is there in an array or not - def ismember(self, arr, element): # TODO: replace this function with more standard usage + def ismember(self, arr, element): # TODO: replace this function with more standard usage res = [1 if i == element else 0 for i in arr] return np.asarray(res) - # function to save data read from tev file to hdf5 file def save_dict_to_hdf5(self, S, event, outputPath): write_hdf5(S["storename"], event, outputPath, "storename") @@ -157,16 +158,17 @@ def save_dict_to_hdf5(self, S, event, outputPath): write_hdf5(S["npoints"], event, outputPath, "npoints") write_hdf5(S["channels"], event, outputPath, "channels") - # function to check event data (checking whether event timestamps belongs to same event or multiple events) - def check_data(self, S, event, outputPath): # TODO: fold this function into the main read/get function + def check_data(self, S, event, outputPath): # TODO: fold this function into the main read/get function # logger.info("Checking event storename data for creating multiple event names from single event storename...") new_event = event.replace("\\", "") new_event = event.replace("/", "") diff = np.diff(S["data"]) arr = np.full(diff.shape[0], 1) - storesList = np.genfromtxt(os.path.join(outputPath, "storesList.csv"), dtype="str", delimiter=",").reshape(2, -1) + storesList = np.genfromtxt(os.path.join(outputPath, "storesList.csv"), dtype="str", delimiter=",").reshape( + 2, -1 + ) if diff.shape[0] == 0: return 0 @@ -174,7 +176,9 @@ def check_data(self, S, event, outputPath): # TODO: fold this function into the if S["sampling_rate"] == 0 and np.all(diff == diff[0]) == False: logger.info("\033[1m" + "Data in event {} belongs to multiple behavior".format(event) + "\033[0m") logger.debug( - "\033[1m" + "Create timestamp files for individual new event and change the stores list file." + "\033[0m" + "\033[1m" + + "Create timestamp files for individual new event and change the stores list file." 
+ + "\033[0m" ) i_d = np.unique(S["data"]) for i in range(i_d.shape[0]): @@ -199,4 +203,6 @@ def check_data(self, S, event, outputPath): # TODO: fold this function into the pass else: np.savetxt(os.path.join(outputPath, "storesList.csv"), storesList, delimiter=",", fmt="%s") - logger.info("\033[1m Timestamp files for individual new event are created and the stores list file is changed.\033[0m") \ No newline at end of file + logger.info( + "\033[1m Timestamp files for individual new event are created and the stores list file is changed.\033[0m" + ) diff --git a/src/guppy/npm_step2.py b/src/guppy/npm_step2.py index f0fafec..14b776f 100644 --- a/src/guppy/npm_step2.py +++ b/src/guppy/npm_step2.py @@ -12,6 +12,7 @@ logger = logging.getLogger(__name__) + def import_npm(filepath, num_ch, inputParameters=None): logger.debug("If it exists, importing NPM file based on the structure of file") @@ -49,7 +50,9 @@ def import_npm(filepath, num_ch, inputParameters=None): float(element) except: check_all_str.append(i) - assert len(check_all_str) != len(df_arr), "This file appears to be doric .csv. This function only supports NPM .csv files." + assert len(check_all_str) != len( + df_arr + ), "This file appears to be doric .csv. This function only supports NPM .csv files." df = pd.read_csv(path[i], index_col=False) _, value = check_header(df) @@ -174,9 +177,7 @@ def import_npm(filepath, num_ch, inputParameters=None): # path_sig = glob.glob(os.path.join(filepath, 'sig*')) path_chev_chod_chpr = [path_chev, path_chod, path_chpr] if ( - ("data_np_v2" in flag_arr or "data_np" in flag_arr) - and ("event_np" in flag_arr) - and (i == len(path) - 1) + ("data_np_v2" in flag_arr or "data_np" in flag_arr) and ("event_np" in flag_arr) and (i == len(path) - 1) ) or ( ("data_np_v2" in flag_arr or "data_np" in flag_arr) and (i == len(path) - 1) ): # i==len(path)-1 and or 'event_np' in flag @@ -234,6 +235,7 @@ def import_npm(filepath, num_ch, inputParameters=None): logger.info("Importing of NPM file is done.") return event_from_filename, flag_arr + def check_header(df): arr = list(df.columns) check_float = [] @@ -294,6 +296,7 @@ def decide_indices(file, df, flag, num_ch=2): return df, indices_dict, num_ch + # check flag consistency in neurophotometrics data def check_channels(state): state = state.astype(int) @@ -405,4 +408,4 @@ def decide_ts_unit_for_npm(df, timestamp_column_name=None, time_unit=None, headl else: pass - return df, ts_unit \ No newline at end of file + return df, ts_unit diff --git a/src/guppy/readTevTsq.py b/src/guppy/readTevTsq.py index 47b7962..b86f6a2 100755 --- a/src/guppy/readTevTsq.py +++ b/src/guppy/readTevTsq.py @@ -3,21 +3,13 @@ import logging import multiprocessing as mp import os -import re import sys -import time -import warnings -from itertools import repeat -import h5py import numpy as np -import pandas as pd -from numpy import float32, float64, int32, int64, uint16 -from guppy.common_step3 import write_hdf5 -from guppy.tdt_step3 import execute_readtev -from guppy.doric_step3 import execute_import_doric from guppy.csv_step3 import execute_import_csv +from guppy.doric_step3 import execute_import_doric +from guppy.tdt_step3 import execute_readtev logger = logging.getLogger(__name__) @@ -34,6 +26,7 @@ def writeToFile(value: str): with open(os.path.join(os.path.expanduser("~"), "pbSteps.txt"), "a") as file: file.write(value) + # function to read data from 'tsq' and 'tev' files def readRawData(inputParameters): diff --git a/src/guppy/savingInputParameters.py b/src/guppy/savingInputParameters.py 
index b0a5feb..a1bd35e 100644 --- a/src/guppy/savingInputParameters.py +++ b/src/guppy/savingInputParameters.py @@ -554,7 +554,9 @@ def onclickpsth(event=None): psth_baseline_param = pn.Column(zscore_param_wd, psth_param_wd, baseline_param_wd, peak_param_wd) - widget = pn.Column(mark_down_1, files_1, explain_modality, modality_selector, pn.Row(individual_analysis_wd_2, psth_baseline_param)) + widget = pn.Column( + mark_down_1, files_1, explain_modality, modality_selector, pn.Row(individual_analysis_wd_2, psth_baseline_param) + ) # file_selector = pn.WidgetBox(files_1) styles = dict(background="WhiteSmoke") diff --git a/src/guppy/tdt_step2.py b/src/guppy/tdt_step2.py index 09456a7..130ace8 100644 --- a/src/guppy/tdt_step2.py +++ b/src/guppy/tdt_step2.py @@ -1,12 +1,14 @@ import glob import logging import os + import numpy as np -from numpy import float32, float64, int32, int64, uint16 import pandas as pd +from numpy import float32, float64, int32, int64, uint16 logger = logging.getLogger(__name__) + # function to read 'tsq' file def readtsq(filepath): names = ("size", "type", "name", "chan", "sort_code", "timestamp", "fp_loc", "strobe", "format", "frequency") @@ -23,4 +25,4 @@ def readtsq(filepath): path = path[0] tsq = np.fromfile(path, dtype=tsq_dtype) df = pd.DataFrame(tsq) - return df \ No newline at end of file + return df diff --git a/src/guppy/tdt_step3.py b/src/guppy/tdt_step3.py index bc629f0..be92d4c 100644 --- a/src/guppy/tdt_step3.py +++ b/src/guppy/tdt_step3.py @@ -1,15 +1,10 @@ import glob -import json import logging import multiprocessing as mp import os -import re -import sys import time -import warnings from itertools import repeat -import h5py import numpy as np import pandas as pd from numpy import float32, float64, int32, int64, uint16 @@ -18,6 +13,7 @@ logger = logging.getLogger(__name__) + # function to read tsq file def readtsq(filepath): logger.debug("Trying to read tsq file.") @@ -45,6 +41,7 @@ def readtsq(filepath): logger.info("Data from tsq file fetched.") return df, flag + # function to execute readtev function using multiprocessing to make it faster def execute_readtev(filepath, event, outputPath, numProcesses=mp.cpu_count()): data, _ = readtsq(filepath) @@ -94,8 +91,6 @@ def readtev(data, filepath, event, outputPath): logger.error("\033[1m" + "TDT store name " + str(event) + " not found." + "\033[0m") raise ValueError("Requested store name not found.") - - allIndexesWhereEventIsPresent = np.where(row == 1) first_row = allIndexesWhereEventIsPresent[0][0] @@ -145,6 +140,7 @@ def readtev(data, filepath, event, outputPath): logger.info("Data for event {} fetched and stored.".format(event)) + # check if a particular element is there in an array or not def ismember(arr, element): res = [1 if i == element else 0 for i in arr] @@ -208,4 +204,4 @@ def check_data(S, filepath, event, outputPath): + "Timestamp files for individual new event are created \ and the stores list file is changed." 
+ "\033[0m" - ) \ No newline at end of file + ) From 1c8ee07e09d566578219e08e41bc87a54bb9854a Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 19 Nov 2025 11:33:31 -0800 Subject: [PATCH 022/125] Added CsvRecordingExtractor for step 3 --- src/guppy/extractors/__init__.py | 4 +- .../extractors/csv_recording_extractor.py | 65 +++++++++++++++++++ src/guppy/readTevTsq.py | 7 +- 3 files changed, 71 insertions(+), 5 deletions(-) diff --git a/src/guppy/extractors/__init__.py b/src/guppy/extractors/__init__.py index 812622b..a421290 100644 --- a/src/guppy/extractors/__init__.py +++ b/src/guppy/extractors/__init__.py @@ -1,2 +1,2 @@ -from .tdt_recording_extractor import TdtRecordingExtractor -from .csv_recording_extractor import CsvRecordingExtractor +from .tdt_recording_extractor import TdtRecordingExtractor, execute_readtev +from .csv_recording_extractor import CsvRecordingExtractor, execute_import_csv diff --git a/src/guppy/extractors/csv_recording_extractor.py b/src/guppy/extractors/csv_recording_extractor.py index f5a73e9..3df76f6 100644 --- a/src/guppy/extractors/csv_recording_extractor.py +++ b/src/guppy/extractors/csv_recording_extractor.py @@ -1,13 +1,34 @@ import glob import logging +import multiprocessing as mp import os +import time +from itertools import repeat import numpy as np import pandas as pd +from guppy.common_step3 import write_hdf5 + logger = logging.getLogger(__name__) +def execute_import_csv(filepath, events, outputPath, numProcesses=mp.cpu_count()): + logger.info("Reading data for event {} ...".format(events)) + + extractor = CsvRecordingExtractor(folder_path=filepath) + start = time.time() + with mp.Pool(numProcesses) as p: + p.starmap(read_csv_and_save_hdf5, zip(repeat(extractor), events, repeat(outputPath))) + logger.info("Time taken = {0:.5f}".format(time.time() - start)) + + +def read_csv_and_save_hdf5(extractor, event, outputPath): + df = extractor.read_csv(event=event) + extractor.save_to_hdf5(df=df, event=event, outputPath=outputPath) + logger.info("Data for event {} fetched and stored.".format(event)) + + class CsvRecordingExtractor: def __init__(self, folder_path): @@ -113,3 +134,47 @@ def check_header(self, df): pass return arr, check_float + + def read_csv(self, event): + logger.debug("\033[1m" + "Trying to read data for {} from csv file.".format(event) + "\033[0m") + if not os.path.exists(os.path.join(self.folder_path, event + ".csv")): + logger.error("\033[1m" + "No csv file found for event {}".format(event) + "\033[0m") + raise Exception("\033[1m" + "No csv file found for event {}".format(event) + "\033[0m") + + df = pd.read_csv(os.path.join(self.folder_path, event + ".csv"), index_col=False) + return df + + def save_to_hdf5(self, df, event, outputPath): + key = list(df.columns) + + # TODO: clean up these if branches + if len(key) == 3: + arr1 = np.array(["timestamps", "data", "sampling_rate"]) + arr2 = np.char.lower(np.array(key)) + if (np.sort(arr1) == np.sort(arr2)).all() == False: + logger.error("\033[1m" + "Column names should be timestamps, data and sampling_rate" + "\033[0m") + raise Exception("\033[1m" + "Column names should be timestamps, data and sampling_rate" + "\033[0m") + + if len(key) == 1: + if key[0].lower() != "timestamps": + logger.error("\033[1m" + "Column names should be timestamps, data and sampling_rate" + "\033[0m") + raise Exception("\033[1m" + "Column name should be timestamps" + "\033[0m") + + if len(key) != 3 and len(key) != 1: + logger.error( + "\033[1m" + + "Number of columns in csv file should be either three or one. 
Three columns if \ + the file is for control or signal data or one column if the file is for event TTLs." + + "\033[0m" + ) + raise Exception( + "\033[1m" + + "Number of columns in csv file should be either three or one. Three columns if \ + the file is for control or signal data or one column if the file is for event TTLs." + + "\033[0m" + ) + + for i in range(len(key)): + write_hdf5(df[key[i]].dropna(), event, outputPath, key[i].lower()) + + logger.info("\033[1m" + "Reading data for {} from csv file is completed.".format(event) + "\033[0m") diff --git a/src/guppy/readTevTsq.py b/src/guppy/readTevTsq.py index b86f6a2..c67f075 100755 --- a/src/guppy/readTevTsq.py +++ b/src/guppy/readTevTsq.py @@ -7,9 +7,8 @@ import numpy as np -from guppy.csv_step3 import execute_import_csv from guppy.doric_step3 import execute_import_doric -from guppy.tdt_step3 import execute_readtev +from guppy.extractors import execute_import_csv, execute_readtev logger = logging.getLogger(__name__) @@ -74,8 +73,10 @@ def readRawData(inputParameters): elif modality == "doric": execute_import_doric(filepath, storesList, modality, op) - elif modality == "csv" or modality == "npm": + elif modality == "csv": execute_import_csv(filepath, np.unique(storesList[0, :]), op, numProcesses) + elif modality == "npm": + raise NotImplementedError("NPM modality is not yet implemented.") else: raise ValueError("Modality not recognized. Please use 'tdt', 'csv', 'doric', or 'npm'.") From 9262a5ad3cf21497a6a489183a7b093768cd15cb Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 19 Nov 2025 11:51:24 -0800 Subject: [PATCH 023/125] Added DoricRecordingExtractor for step 2 --- src/guppy/extractors/__init__.py | 1 + src/guppy/saveStoresList.py | 12 +++++++++--- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/src/guppy/extractors/__init__.py b/src/guppy/extractors/__init__.py index a421290..ebb9fb0 100644 --- a/src/guppy/extractors/__init__.py +++ b/src/guppy/extractors/__init__.py @@ -1,2 +1,3 @@ from .tdt_recording_extractor import TdtRecordingExtractor, execute_readtev from .csv_recording_extractor import CsvRecordingExtractor, execute_import_csv +from .doric_recording_extractor import DoricRecordingExtractor diff --git a/src/guppy/saveStoresList.py b/src/guppy/saveStoresList.py index e64be8c..baec41e 100755 --- a/src/guppy/saveStoresList.py +++ b/src/guppy/saveStoresList.py @@ -17,8 +17,11 @@ import pandas as pd import panel as pn -from guppy.doric_step2 import import_doric -from guppy.extractors import CsvRecordingExtractor, TdtRecordingExtractor +from guppy.extractors import ( + CsvRecordingExtractor, + DoricRecordingExtractor, + TdtRecordingExtractor, +) from guppy.npm_step2 import import_npm # hv.extension() @@ -594,7 +597,10 @@ def execute(inputParameters): elif modality == "doric": data = 0 - event_name, flag = import_doric(filepath) + extractor = DoricRecordingExtractor(folder_path=filepath) + event_name = extractor.events + flag = extractor.flags + elif modality == "npm": data = 0 event_name, flag = import_npm(filepath, num_ch) From 9c5afced4ccabb31edc87eaafaa1b54df5d95eb9 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 19 Nov 2025 11:51:49 -0800 Subject: [PATCH 024/125] Added DoricRecordingExtractor for step 2 --- .../extractors/doric_recording_extractor.py | 94 +++++++++++++++++++ 1 file changed, 94 insertions(+) create mode 100644 src/guppy/extractors/doric_recording_extractor.py diff --git a/src/guppy/extractors/doric_recording_extractor.py b/src/guppy/extractors/doric_recording_extractor.py new file 
mode 100644 index 0000000..f45df50 --- /dev/null +++ b/src/guppy/extractors/doric_recording_extractor.py @@ -0,0 +1,94 @@ +import glob +import logging +import os + +import h5py +import numpy as np +import pandas as pd + +logger = logging.getLogger(__name__) + + +class DoricRecordingExtractor: + + def __init__(self, folder_path): + self.folder_path = folder_path + logger.debug("If it exists, importing Doric file based on the structure of file") + path = sorted(glob.glob(os.path.join(self.folder_path, "*.csv"))) + sorted( + glob.glob(os.path.join(self.folder_path, "*.doric")) + ) + + path = sorted(list(set(path))) + flag = "None" + event_from_filename = [] + flag_arr = [] + for i in range(len(path)): + ext = os.path.basename(path[i]).split(".")[-1] + if ext == "doric": + key_names = self.read_doric(path[i]) + event_from_filename.extend(key_names) + flag = "doric_doric" + else: + df = pd.read_csv(path[i], header=None, nrows=2, index_col=False, dtype=str) + df = df.dropna(axis=1, how="all") + df_arr = np.array(df).flatten() + check_all_str = [] + for element in df_arr: + try: + float(element) + except: + check_all_str.append(i) + assert len(check_all_str) == len( + df_arr + ), "This file appears to be standard .csv. This function only supports doric .csv files." + df = pd.read_csv(path[i], header=1, index_col=False, nrows=10) + df = df.drop(["Time(s)"], axis=1) + event_from_filename.extend(list(df.columns)) + flag = "doric_csv" + logger.info(flag) + logger.info("Importing of Doric file is done.") + + self.events = event_from_filename + self.flags = flag_arr + + def read_doric(self, filepath): + with h5py.File(filepath, "r") as f: + if "Traces" in list(f.keys()): + keys = self.access_keys_doricV1(f) + elif list(f.keys()) == ["Configurations", "DataAcquisition"]: + keys = self.access_keys_doricV6(f) + + return keys + + def access_keys_doricV6(self, doric_file): + data = [doric_file["DataAcquisition"]] + res = [] + while len(data) != 0: + members = len(data) + while members != 0: + members -= 1 + data, last_element = self.separate_last_element(data) + if isinstance(last_element, h5py.Dataset) and not last_element.name.endswith("/Time"): + res.append(last_element.name) + elif isinstance(last_element, h5py.Group): + data.extend(reversed([last_element[k] for k in last_element.keys()])) + + keys = [] + for element in res: + sep_values = element.split("/") + if sep_values[-1] == "Values": + keys.append(f"{sep_values[-3]}/{sep_values[-2]}") + else: + keys.append(f"{sep_values[-2]}/{sep_values[-1]}") + + return keys + + def access_keys_doricV1(self, doric_file): + keys = list(doric_file["Traces"]["Console"].keys()) + keys.remove("Time(s)") + + return keys + + def separate_last_element(self, arr): + l = arr[-1] + return arr[:-1], l From 914f23f36b7a4adc9a4edeb9af1a316c146e9586 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 19 Nov 2025 14:22:40 -0800 Subject: [PATCH 025/125] Added DoricRecordingExtractor for step 3 --- src/guppy/extractors/__init__.py | 2 +- .../extractors/doric_recording_extractor.py | 152 ++++++++++++++++++ 2 files changed, 153 insertions(+), 1 deletion(-) diff --git a/src/guppy/extractors/__init__.py b/src/guppy/extractors/__init__.py index ebb9fb0..b3c2c3a 100644 --- a/src/guppy/extractors/__init__.py +++ b/src/guppy/extractors/__init__.py @@ -1,3 +1,3 @@ from .tdt_recording_extractor import TdtRecordingExtractor, execute_readtev from .csv_recording_extractor import CsvRecordingExtractor, execute_import_csv -from .doric_recording_extractor import DoricRecordingExtractor +from 
.doric_recording_extractor import DoricRecordingExtractor, execute_import_doric diff --git a/src/guppy/extractors/doric_recording_extractor.py b/src/guppy/extractors/doric_recording_extractor.py index f45df50..cbade8b 100644 --- a/src/guppy/extractors/doric_recording_extractor.py +++ b/src/guppy/extractors/doric_recording_extractor.py @@ -1,14 +1,31 @@ import glob import logging import os +import re +import warnings import h5py import numpy as np import pandas as pd +from guppy.common_step3 import write_hdf5 + logger = logging.getLogger(__name__) +def execute_import_doric(folder_path, storesList, flag, outputPath): + extractor = DoricRecordingExtractor(folder_path=folder_path) + flag = extractor.check_doric(folder_path) + + if flag == "doric_csv": + extractor.read_doric_csv(folder_path, storesList, outputPath) + elif flag == "doric_doric": + extractor.read_doric_doric(folder_path, storesList, outputPath) + else: + logger.error("Doric file not found or not recognized.") + raise FileNotFoundError("Doric file not found or not recognized.") + + class DoricRecordingExtractor: def __init__(self, folder_path): @@ -92,3 +109,138 @@ def access_keys_doricV1(self, doric_file): def separate_last_element(self, arr): l = arr[-1] return arr[:-1], l + + def check_doric(self, filepath): + logger.debug("Checking if doric file exists") + path = glob.glob(os.path.join(filepath, "*.csv")) + glob.glob(os.path.join(filepath, "*.doric")) + + flag_arr = [] + for i in range(len(path)): + ext = os.path.basename(path[i]).split(".")[-1] + if ext == "csv": + with warnings.catch_warnings(): + warnings.simplefilter("error") + try: + df = pd.read_csv(path[i], index_col=False, dtype=float) + except: # TODO: fix this bare try-except + df = pd.read_csv(path[i], header=1, index_col=False, nrows=10) + flag = "doric_csv" + flag_arr.append(flag) + elif ext == "doric": + flag = "doric_doric" + flag_arr.append(flag) + else: + pass + + if len(flag_arr) > 1: + logger.error("Two doric files are present at the same location") + raise Exception("Two doric files are present at the same location") + if len(flag_arr) == 0: + logger.error("\033[1m" + "Doric file not found." 
+ "\033[1m") + return 0 + logger.info("Doric file found.") + return flag_arr[0] + + def read_doric_csv(self, filepath, storesList, outputPath): + path = glob.glob(os.path.join(filepath, "*.csv")) + if len(path) > 1: + logger.error("An error occurred : More than one Doric csv file present at the location") + raise Exception("More than one Doric csv file present at the location") + else: + df = pd.read_csv(path[0], header=1, index_col=False) + df = df.dropna(axis=1, how="all") + df = df.dropna(axis=0, how="any") + df["Time(s)"] = df["Time(s)"] - df["Time(s)"].to_numpy()[0] + for i in range(storesList.shape[1]): + if "control" in storesList[1, i] or "signal" in storesList[1, i]: + timestamps = np.array(df["Time(s)"]) + sampling_rate = np.array([1 / (timestamps[-1] - timestamps[-2])]) + write_hdf5(sampling_rate, storesList[0, i], outputPath, "sampling_rate") + write_hdf5(df["Time(s)"].to_numpy(), storesList[0, i], outputPath, "timestamps") + write_hdf5(df[storesList[0, i]].to_numpy(), storesList[0, i], outputPath, "data") + else: + ttl = df[storesList[0, i]] + indices = np.where(ttl <= 0)[0] + diff_indices = np.where(np.diff(indices) > 1)[0] + write_hdf5( + df["Time(s)"][indices[diff_indices] + 1].to_numpy(), storesList[0, i], outputPath, "timestamps" + ) + + def read_doric_doric(self, filepath, storesList, outputPath): + path = glob.glob(os.path.join(filepath, "*.doric")) + if len(path) > 1: + logger.error("An error occurred : More than one Doric file present at the location") + raise Exception("More than one Doric file present at the location") + else: + with h5py.File(path[0], "r") as f: + if "Traces" in list(f.keys()): + keys = self.access_data_doricV1(f, storesList, outputPath) + elif list(f.keys()) == ["Configurations", "DataAcquisition"]: + keys = self.access_data_doricV6(f, storesList, outputPath) + + def access_data_doricV6(self, doric_file, storesList, outputPath): + data = [doric_file["DataAcquisition"]] + res = [] + while len(data) != 0: + members = len(data) + while members != 0: + members -= 1 + data, last_element = self.separate_last_element(data) + if isinstance(last_element, h5py.Dataset) and not last_element.name.endswith("/Time"): + res.append(last_element.name) + elif isinstance(last_element, h5py.Group): + data.extend(reversed([last_element[k] for k in last_element.keys()])) + + decide_path = [] + for element in res: + sep_values = element.split("/") + if sep_values[-1] == "Values": + if f"{sep_values[-3]}/{sep_values[-2]}" in storesList[0, :]: + decide_path.append(element) + else: + if f"{sep_values[-2]}/{sep_values[-1]}" in storesList[0, :]: + decide_path.append(element) + + for i in range(storesList.shape[1]): + if "control" in storesList[1, i] or "signal" in storesList[1, i]: + regex = re.compile("(.*?)" + str(storesList[0, i]) + "(.*?)") + idx = [i for i in range(len(decide_path)) if regex.match(decide_path[i])] + if len(idx) > 1: + logger.error("More than one string matched (which should not be the case)") + raise Exception("More than one string matched (which should not be the case)") + idx = idx[0] + data = np.array(doric_file[decide_path[idx]]) + timestamps = np.array(doric_file[decide_path[idx].rsplit("/", 1)[0] + "/Time"]) + sampling_rate = np.array([1 / (timestamps[-1] - timestamps[-2])]) + write_hdf5(sampling_rate, storesList[0, i], outputPath, "sampling_rate") + write_hdf5(timestamps, storesList[0, i], outputPath, "timestamps") + write_hdf5(data, storesList[0, i], outputPath, "data") + else: + regex = re.compile("(.*?)" + storesList[0, i] + "$") + idx = [i 
for i in range(len(decide_path)) if regex.match(decide_path[i])] + if len(idx) > 1: + logger.error("More than one string matched (which should not be the case)") + raise Exception("More than one string matched (which should not be the case)") + idx = idx[0] + ttl = np.array(doric_file[decide_path[idx]]) + timestamps = np.array(doric_file[decide_path[idx].rsplit("/", 1)[0] + "/Time"]) + indices = np.where(ttl <= 0)[0] + diff_indices = np.where(np.diff(indices) > 1)[0] + write_hdf5(timestamps[indices[diff_indices] + 1], storesList[0, i], outputPath, "timestamps") + + def access_data_doricV1(self, doric_file, storesList, outputPath): + keys = list(doric_file["Traces"]["Console"].keys()) + for i in range(storesList.shape[1]): + if "control" in storesList[1, i] or "signal" in storesList[1, i]: + timestamps = np.array(doric_file["Traces"]["Console"]["Time(s)"]["Console_time(s)"]) + sampling_rate = np.array([1 / (timestamps[-1] - timestamps[-2])]) + data = np.array(doric_file["Traces"]["Console"][storesList[0, i]][storesList[0, i]]) + write_hdf5(sampling_rate, storesList[0, i], outputPath, "sampling_rate") + write_hdf5(timestamps, storesList[0, i], outputPath, "timestamps") + write_hdf5(data, storesList[0, i], outputPath, "data") + else: + timestamps = np.array(doric_file["Traces"]["Console"]["Time(s)"]["Console_time(s)"]) + ttl = np.array(doric_file["Traces"]["Console"][storesList[0, i]][storesList[0, i]]) + indices = np.where(ttl <= 0)[0] + diff_indices = np.where(np.diff(indices) > 1)[0] + write_hdf5(timestamps[indices[diff_indices] + 1], storesList[0, i], outputPath, "timestamps") From cd966ae4acf2c07bcae716ed69b403758d7e819f Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 19 Nov 2025 14:34:52 -0800 Subject: [PATCH 026/125] streamlined inputs --- src/guppy/extractors/doric_recording_extractor.py | 2 +- src/guppy/readTevTsq.py | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/guppy/extractors/doric_recording_extractor.py b/src/guppy/extractors/doric_recording_extractor.py index cbade8b..e5a97cb 100644 --- a/src/guppy/extractors/doric_recording_extractor.py +++ b/src/guppy/extractors/doric_recording_extractor.py @@ -13,7 +13,7 @@ logger = logging.getLogger(__name__) -def execute_import_doric(folder_path, storesList, flag, outputPath): +def execute_import_doric(folder_path, storesList, outputPath): extractor = DoricRecordingExtractor(folder_path=folder_path) flag = extractor.check_doric(folder_path) diff --git a/src/guppy/readTevTsq.py b/src/guppy/readTevTsq.py index c67f075..c5c52da 100755 --- a/src/guppy/readTevTsq.py +++ b/src/guppy/readTevTsq.py @@ -67,14 +67,13 @@ def readRawData(inputParameters): 2, -1 ) + events = np.unique(storesList[0, :]) if modality == "tdt": - events = np.unique(storesList[0, :]) execute_readtev(filepath, events, op, numProcesses) - elif modality == "doric": - execute_import_doric(filepath, storesList, modality, op) + execute_import_doric(filepath, storesList, op) elif modality == "csv": - execute_import_csv(filepath, np.unique(storesList[0, :]), op, numProcesses) + execute_import_csv(filepath, events, op, numProcesses) elif modality == "npm": raise NotImplementedError("NPM modality is not yet implemented.") else: From ac158de53025dbe370238a0080c71f1dbf9fb9d1 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 19 Nov 2025 15:14:44 -0800 Subject: [PATCH 027/125] Added NpmRecordingExtractor for step 2 --- src/guppy/extractors/__init__.py | 1 + .../extractors/npm_recording_extractor.py | 429 ++++++++++++++++++ 
src/guppy/saveStoresList.py | 6 +- 3 files changed, 434 insertions(+), 2 deletions(-) create mode 100644 src/guppy/extractors/npm_recording_extractor.py diff --git a/src/guppy/extractors/__init__.py b/src/guppy/extractors/__init__.py index b3c2c3a..b876012 100644 --- a/src/guppy/extractors/__init__.py +++ b/src/guppy/extractors/__init__.py @@ -1,3 +1,4 @@ from .tdt_recording_extractor import TdtRecordingExtractor, execute_readtev from .csv_recording_extractor import CsvRecordingExtractor, execute_import_csv from .doric_recording_extractor import DoricRecordingExtractor, execute_import_doric +from .npm_recording_extractor import NpmRecordingExtractor, execute_import_npm diff --git a/src/guppy/extractors/npm_recording_extractor.py b/src/guppy/extractors/npm_recording_extractor.py new file mode 100644 index 0000000..c15987f --- /dev/null +++ b/src/guppy/extractors/npm_recording_extractor.py @@ -0,0 +1,429 @@ +import glob +import logging +import os +import tkinter as tk +from tkinter import StringVar, messagebox, ttk + +import numpy as np +import pandas as pd +import panel as pn + +pn.extension() + +logger = logging.getLogger(__name__) + + +def execute_import_npm(): + raise NotImplementedError("This function is a placeholder for execute_import_npm functionality.") + + +class NpmRecordingExtractor: + + def __init__(self, folder_path, num_ch, inputParameters=None): + self.folder_path = folder_path + self.num_ch = num_ch + self.inputParameters = inputParameters + self.events, self.flags = self.import_npm( + folder_path=folder_path, num_ch=num_ch, inputParameters=inputParameters + ) + + def import_npm(self, folder_path, num_ch, inputParameters=None): + + logger.debug("If it exists, importing NPM file based on the structure of file") + # Headless configuration (used to avoid any UI prompts when running tests) + headless = bool(os.environ.get("GUPPY_BASE_DIR")) + npm_timestamp_column_name = None + npm_time_unit = None + npm_split_events = None + if isinstance(inputParameters, dict): + npm_timestamp_column_name = inputParameters.get("npm_timestamp_column_name") + npm_time_unit = inputParameters.get("npm_time_unit", "seconds") + npm_split_events = inputParameters.get("npm_split_events", True) + path = sorted(glob.glob(os.path.join(folder_path, "*.csv"))) + sorted( + glob.glob(os.path.join(folder_path, "*.doric")) + ) + path_chev = glob.glob(os.path.join(folder_path, "*chev*")) + path_chod = glob.glob(os.path.join(folder_path, "*chod*")) + path_chpr = glob.glob(os.path.join(folder_path, "*chpr*")) + path_event = glob.glob(os.path.join(folder_path, "event*")) + # path_sig = glob.glob(os.path.join(filepath, 'sig*')) # TODO: what is this for? + path_chev_chod_event = path_chev + path_chod + path_event + path_chpr + + path = sorted(list(set(path) - set(path_chev_chod_event))) + flag = "None" + event_from_filename = [] + flag_arr = [] + for i in range(len(path)): + dirname = os.path.dirname(path[i]) + ext = os.path.basename(path[i]).split(".")[-1] + assert ext != "doric", "Doric files are not supported by import_npm function." + df = pd.read_csv(path[i], header=None, nrows=2, index_col=False, dtype=str) + df = df.dropna(axis=1, how="all") + df_arr = np.array(df).flatten() + check_all_str = [] + for element in df_arr: + try: + float(element) + except: + check_all_str.append(i) + assert len(check_all_str) != len( + df_arr + ), "This file appears to be doric .csv. This function only supports NPM .csv files." 
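An illustration, separate from the new npm_recording_extractor.py module above and below: in the newer Neurophotometrics layout, frames from different excitation channels are interleaved row by row and tagged in a Flags/LedState column. The sketch below shows only the de-interleaving rule that check_channels and decide_indices implement further down in this file; the frame itself (the "Region0G" column, the 40 Hz timestamps, the 8-row length) is invented.

    # Channel identity comes from the unique LedState values seen in rows 2-11,
    # and each channel owns every num_ch-th row starting at its first occurrence.
    import numpy as np
    import pandas as pd

    df = pd.DataFrame(
        {
            "FrameCounter": np.arange(8),
            "Timestamp": np.arange(8) / 40.0,
            "LedState": [1, 2, 1, 2, 1, 2, 1, 2],  # two interleaved excitation channels
            "Region0G": np.random.rand(8),
        }
    )

    state = df["LedState"].to_numpy().astype(int)
    channels = np.unique(state[2:12])          # -> array([1, 2]) for this toy frame
    num_ch = channels.shape[0]

    indices = {}
    for name, ch in zip(("chev", "chod", "chpr"), channels):
        first = np.where(state == ch)[0][0]
        indices[name] = np.arange(first, len(df), num_ch)

    # indices["chev"] -> rows 0, 2, 4, 6 and indices["chod"] -> rows 1, 3, 5, 7;
    # the extractor then writes each channel's timestamps/data out as its own csv
    # (named like file0_chev<column index>.csv) for the later csv-reading step.
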
+ df = pd.read_csv(path[i], index_col=False) + _, value = self.check_header(df) + + # check dataframe structure and read data accordingly + if len(value) > 0: + columns_isstr = False + df = pd.read_csv(path[i], header=None) + cols = np.array(list(df.columns), dtype=str) + else: + df = df + columns_isstr = True + cols = np.array(list(df.columns), dtype=str) + # check the structure of dataframe and assign flag to the type of file + assert len(cols) != 1, "File appears to be event .csv. This function only supports NPM .csv files." + assert len(cols) != 3, "File appears to be data .csv. This function only supports NPM .csv files." + if len(cols) == 2: + flag = "event_or_data_np" + elif len(cols) >= 2: + flag = "data_np" + else: + logger.error("Number of columns in csv file does not make sense.") + raise Exception("Number of columns in csv file does not make sense.") + + if columns_isstr == True and ( + "flags" in np.char.lower(np.array(cols)) or "ledstate" in np.char.lower(np.array(cols)) + ): + flag = flag + "_v2" + else: + flag = flag + + # used assigned flags to process the files and read the data + if flag == "event_or_data_np": + arr = list(df.iloc[:, 1]) + check_float = [True for i in arr if isinstance(i, float)] + if len(arr) == len(check_float) and columns_isstr == False: + flag = "data_np" + elif columns_isstr == True and ("value" in np.char.lower(np.array(cols))): + flag = "event_np" + else: + flag = "event_np" + + flag_arr.append(flag) + logger.info(flag) + if flag == "data_np": + file = f"file{str(i)}_" + df, indices_dict, _ = self.decide_indices(file, df, flag, num_ch) + keys = list(indices_dict.keys()) + for k in range(len(keys)): + for j in range(df.shape[1]): + if j == 0: + timestamps = df.iloc[:, j][indices_dict[keys[k]]] + # timestamps_odd = df.iloc[:,j][odd_indices] + else: + d = dict() + d["timestamps"] = timestamps + d["data"] = df.iloc[:, j][indices_dict[keys[k]]] + + df_ch = pd.DataFrame(d) + df_ch.to_csv(os.path.join(dirname, keys[k] + str(j) + ".csv"), index=False) + event_from_filename.append(keys[k] + str(j)) + + elif flag == "event_np": + type_val = np.array(df.iloc[:, 1]) + type_val_unique = np.unique(type_val) + if headless: + response = 1 if bool(npm_split_events) else 0 + else: + window = tk.Tk() + if len(type_val_unique) > 1: + response = messagebox.askyesno( + "Multiple event TTLs", + "Based on the TTL file,\ + it looks like TTLs \ + belongs to multiple behavior type. 
\ + Do you want to create multiple files for each \ + behavior type ?", + ) + else: + response = 0 + window.destroy() + if response == 1: + timestamps = np.array(df.iloc[:, 0]) + for j in range(len(type_val_unique)): + idx = np.where(type_val == type_val_unique[j]) + d = dict() + d["timestamps"] = timestamps[idx] + df_new = pd.DataFrame(d) + df_new.to_csv(os.path.join(dirname, "event" + str(type_val_unique[j]) + ".csv"), index=False) + event_from_filename.append("event" + str(type_val_unique[j])) + else: + timestamps = np.array(df.iloc[:, 0]) + d = dict() + d["timestamps"] = timestamps + df_new = pd.DataFrame(d) + df_new.to_csv(os.path.join(dirname, "event" + str(0) + ".csv"), index=False) + event_from_filename.append("event" + str(0)) + else: + file = f"file{str(i)}_" + df, ts_unit = self.decide_ts_unit_for_npm( + df, timestamp_column_name=npm_timestamp_column_name, time_unit=npm_time_unit, headless=headless + ) + df, indices_dict, _ = self.decide_indices(file, df, flag) + keys = list(indices_dict.keys()) + for k in range(len(keys)): + for j in range(df.shape[1]): + if j == 0: + timestamps = df.iloc[:, j][indices_dict[keys[k]]] + # timestamps_odd = df.iloc[:,j][odd_indices] + else: + d = dict() + d["timestamps"] = timestamps + d["data"] = df.iloc[:, j][indices_dict[keys[k]]] + + df_ch = pd.DataFrame(d) + df_ch.to_csv(os.path.join(dirname, keys[k] + str(j) + ".csv"), index=False) + event_from_filename.append(keys[k] + str(j)) + + path_chev = glob.glob(os.path.join(folder_path, "*chev*")) + path_chod = glob.glob(os.path.join(folder_path, "*chod*")) + path_chpr = glob.glob(os.path.join(folder_path, "*chpr*")) + path_event = glob.glob(os.path.join(folder_path, "event*")) + # path_sig = glob.glob(os.path.join(filepath, 'sig*')) + path_chev_chod_chpr = [path_chev, path_chod, path_chpr] + if ( + ("data_np_v2" in flag_arr or "data_np" in flag_arr) + and ("event_np" in flag_arr) + and (i == len(path) - 1) + ) or ( + ("data_np_v2" in flag_arr or "data_np" in flag_arr) and (i == len(path) - 1) + ): # i==len(path)-1 and or 'event_np' in flag + num_path_chev, num_path_chod, num_path_chpr = len(path_chev), len(path_chod), len(path_chpr) + arr_len, no_ch = [], [] + for i in range(len(path_chev_chod_chpr)): + if len(path_chev_chod_chpr[i]) > 0: + arr_len.append(len(path_chev_chod_chpr[i])) + else: + continue + + unique_arr_len = np.unique(np.array(arr_len)) + if "data_np_v2" in flag_arr: + if ts_unit == "seconds": + divisor = 1 + elif ts_unit == "milliseconds": + divisor = 1e3 + else: + divisor = 1e6 + else: + divisor = 1000 + + for j in range(len(path_event)): + df_event = pd.read_csv(path_event[j]) + df_chev = pd.read_csv(path_chev[0]) + df_event["timestamps"] = (df_event["timestamps"] - df_chev["timestamps"][0]) / divisor + df_event.to_csv(path_event[j], index=False) + if unique_arr_len.shape[0] == 1: + for j in range(len(path_chev)): + if file + "chev" in indices_dict.keys(): + df_chev = pd.read_csv(path_chev[j]) + df_chev["timestamps"] = (df_chev["timestamps"] - df_chev["timestamps"][0]) / divisor + df_chev["sampling_rate"] = np.full(df_chev.shape[0], np.nan) + df_chev.at[0, "sampling_rate"] = df_chev.shape[0] / ( + df_chev["timestamps"].iloc[-1] - df_chev["timestamps"].iloc[0] + ) + df_chev.to_csv(path_chev[j], index=False) + + if file + "chod" in indices_dict.keys(): + df_chod = pd.read_csv(path_chod[j]) + df_chod["timestamps"] = df_chev["timestamps"] + df_chod["sampling_rate"] = np.full(df_chod.shape[0], np.nan) + df_chod.at[0, "sampling_rate"] = df_chev["sampling_rate"][0] + 
df_chod.to_csv(path_chod[j], index=False) + + if file + "chpr" in indices_dict.keys(): + df_chpr = pd.read_csv(path_chpr[j]) + df_chpr["timestamps"] = df_chev["timestamps"] + df_chpr["sampling_rate"] = np.full(df_chpr.shape[0], np.nan) + df_chpr.at[0, "sampling_rate"] = df_chev["sampling_rate"][0] + df_chpr.to_csv(path_chpr[j], index=False) + else: + logger.error("Number of channels should be same for all regions.") + raise Exception("Number of channels should be same for all regions.") + logger.info("Importing of NPM file is done.") + return event_from_filename, flag_arr + + def check_header(self, df): + arr = list(df.columns) + check_float = [] + for i in arr: + try: + check_float.append(float(i)) + except: + pass + + return arr, check_float + + # function to decide indices of interleaved channels + # in neurophotometrics data + def decide_indices(self, file, df, flag, num_ch=2): + ch_name = [file + "chev", file + "chod", file + "chpr"] + if len(ch_name) < num_ch: + logger.error( + "Number of channels parameters in Input Parameters GUI is more than 3. \ + Looks like there are more than 3 channels in the file. Reading of these files\ + are not supported. Reach out to us if you get this error message." + ) + raise Exception( + "Number of channels parameters in Input Parameters GUI is more than 3. \ + Looks like there are more than 3 channels in the file. Reading of these files\ + are not supported. Reach out to us if you get this error message." + ) + if flag == "data_np": + indices_dict = dict() + for i in range(num_ch): + indices_dict[ch_name[i]] = np.arange(i, df.shape[0], num_ch) + + else: + cols = np.array(list(df.columns)) + if "flags" in np.char.lower(np.array(cols)): + arr = ["FrameCounter", "Flags"] + state = np.array(df["Flags"]) + elif "ledstate" in np.char.lower(np.array(cols)): + arr = ["FrameCounter", "LedState"] + state = np.array(df["LedState"]) + else: + logger.error( + "File type shows Neurophotometrics newer version \ + data but column names does not have Flags or LedState" + ) + raise Exception( + "File type shows Neurophotometrics newer version \ + data but column names does not have Flags or LedState" + ) + + num_ch, ch = self.check_channels(state) + indices_dict = dict() + for i in range(num_ch): + first_occurrence = np.where(state == ch[i])[0] + indices_dict[ch_name[i]] = np.arange(first_occurrence[0], df.shape[0], num_ch) + + df = df.drop(arr, axis=1) + + return df, indices_dict, num_ch + + # check flag consistency in neurophotometrics data + def check_channels(self, state): + state = state.astype(int) + unique_state = np.unique(state[2:12]) + if unique_state.shape[0] > 3: + logger.error( + "Looks like there are more than 3 channels in the file. Reading of these files\ + are not supported. Reach out to us if you get this error message." + ) + raise Exception( + "Looks like there are more than 3 channels in the file. Reading of these files\ + are not supported. Reach out to us if you get this error message." 
+ ) + + return unique_state.shape[0], unique_state + + # function to decide NPM timestamps unit (seconds, ms or us) + def decide_ts_unit_for_npm(self, df, timestamp_column_name=None, time_unit=None, headless=False): + col_names = np.array(list(df.columns)) + col_names_ts = [""] + for name in col_names: + if "timestamp" in name.lower(): + col_names_ts.append(name) + + ts_unit = "seconds" + if len(col_names_ts) > 2: + # Headless path: auto-select column/unit without any UI + if headless: + if timestamp_column_name is not None: + assert ( + timestamp_column_name in col_names_ts + ), f"Provided timestamp_column_name '{timestamp_column_name}' not found in columns {col_names_ts[1:]}" + chosen = timestamp_column_name + else: + chosen = col_names_ts[1] + df.insert(1, "Timestamp", df[chosen]) + df = df.drop(col_names_ts[1:], axis=1) + valid_units = {"seconds", "milliseconds", "microseconds"} + ts_unit = time_unit if (isinstance(time_unit, str) and time_unit in valid_units) else "seconds" + return df, ts_unit + # def comboBoxSelected(event): + # logger.info(event.widget.get()) + + window = tk.Tk() + window.title("Select appropriate options for timestamps") + window.geometry("500x200") + holdComboboxValues = dict() + + timestamps_label = ttk.Label(window, text="Select which timestamps to use : ").grid( + row=0, column=1, pady=25, padx=25 + ) + holdComboboxValues["timestamps"] = StringVar() + timestamps_combo = ttk.Combobox(window, values=col_names_ts, textvariable=holdComboboxValues["timestamps"]) + timestamps_combo.grid(row=0, column=2, pady=25, padx=25) + timestamps_combo.current(0) + # timestamps_combo.bind("<>", comboBoxSelected) + + time_unit_label = ttk.Label(window, text="Select timestamps unit : ").grid( + row=1, column=1, pady=25, padx=25 + ) + holdComboboxValues["time_unit"] = StringVar() + time_unit_combo = ttk.Combobox( + window, + values=["", "seconds", "milliseconds", "microseconds"], + textvariable=holdComboboxValues["time_unit"], + ) + time_unit_combo.grid(row=1, column=2, pady=25, padx=25) + time_unit_combo.current(0) + # time_unit_combo.bind("<>", comboBoxSelected) + window.lift() + window.after(500, lambda: window.lift()) + window.mainloop() + + if holdComboboxValues["timestamps"].get(): + df.insert(1, "Timestamp", df[holdComboboxValues["timestamps"].get()]) + df = df.drop(col_names_ts[1:], axis=1) + else: + messagebox.showerror( + "All options not selected", + "All the options for timestamps \ + were not selected. Please select appropriate options", + ) + logger.error( + "All the options for timestamps \ + were not selected. Please select appropriate options" + ) + raise Exception( + "All the options for timestamps \ + were not selected. Please select appropriate options" + ) + if holdComboboxValues["time_unit"].get(): + if holdComboboxValues["time_unit"].get() == "seconds": + ts_unit = holdComboboxValues["time_unit"].get() + elif holdComboboxValues["time_unit"].get() == "milliseconds": + ts_unit = holdComboboxValues["time_unit"].get() + else: + ts_unit = holdComboboxValues["time_unit"].get() + else: + messagebox.showerror( + "All options not selected", + "All the options for timestamps \ + were not selected. Please select appropriate options", + ) + logger.error( + "All the options for timestamps \ + were not selected. Please select appropriate options" + ) + raise Exception( + "All the options for timestamps \ + were not selected. 
Please select appropriate options" + ) + else: + pass + + return df, ts_unit diff --git a/src/guppy/saveStoresList.py b/src/guppy/saveStoresList.py index baec41e..daf7457 100755 --- a/src/guppy/saveStoresList.py +++ b/src/guppy/saveStoresList.py @@ -20,9 +20,9 @@ from guppy.extractors import ( CsvRecordingExtractor, DoricRecordingExtractor, + NpmRecordingExtractor, TdtRecordingExtractor, ) -from guppy.npm_step2 import import_npm # hv.extension() pn.extension() @@ -603,7 +603,9 @@ def execute(inputParameters): elif modality == "npm": data = 0 - event_name, flag = import_npm(filepath, num_ch) + extractor = NpmRecordingExtractor(folder_path=filepath, num_ch=num_ch, inputParameters=inputParameters) + event_name = extractor.events + flag = extractor.flags else: raise ValueError("Modality not recognized. Please use 'tdt', 'csv', 'doric', or 'npm'.") From 6a470a1a9d11c8e1abd6a32de7eaf7390cad1472 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 19 Nov 2025 16:48:57 -0800 Subject: [PATCH 028/125] Added NpmRecordingExtractor for step 3 --- .../extractors/npm_recording_extractor.py | 65 ++++++++++++++++++- src/guppy/readTevTsq.py | 11 +++- 2 files changed, 71 insertions(+), 5 deletions(-) diff --git a/src/guppy/extractors/npm_recording_extractor.py b/src/guppy/extractors/npm_recording_extractor.py index c15987f..a8cfd98 100644 --- a/src/guppy/extractors/npm_recording_extractor.py +++ b/src/guppy/extractors/npm_recording_extractor.py @@ -1,20 +1,37 @@ import glob import logging +import multiprocessing as mp import os +import time import tkinter as tk +from itertools import repeat from tkinter import StringVar, messagebox, ttk import numpy as np import pandas as pd import panel as pn +from guppy.common_step3 import write_hdf5 + pn.extension() logger = logging.getLogger(__name__) -def execute_import_npm(): - raise NotImplementedError("This function is a placeholder for execute_import_npm functionality.") +def execute_import_npm(folder_path, num_ch, inputParameters, events, outputPath, numProcesses=mp.cpu_count()): + logger.info("Reading data for event {} ...".format(events)) + + extractor = NpmRecordingExtractor(folder_path=folder_path, num_ch=num_ch, inputParameters=inputParameters) + start = time.time() + with mp.Pool(numProcesses) as p: + p.starmap(read_npm_and_save_hdf5, zip(repeat(extractor), events, repeat(outputPath))) + logger.info("Time taken = {0:.5f}".format(time.time() - start)) + + +def read_npm_and_save_hdf5(extractor, event, outputPath): + df = extractor.read_npm(event=event) + extractor.save_to_hdf5(df=df, event=event, outputPath=outputPath) + logger.info("Data for event {} fetched and stored.".format(event)) class NpmRecordingExtractor: @@ -427,3 +444,47 @@ def decide_ts_unit_for_npm(self, df, timestamp_column_name=None, time_unit=None, pass return df, ts_unit + + def read_npm(self, event): + logger.debug("\033[1m" + "Trying to read data for {} from csv file.".format(event) + "\033[0m") + if not os.path.exists(os.path.join(self.folder_path, event + ".csv")): + logger.error("\033[1m" + "No csv file found for event {}".format(event) + "\033[0m") + raise Exception("\033[1m" + "No csv file found for event {}".format(event) + "\033[0m") + + df = pd.read_csv(os.path.join(self.folder_path, event + ".csv"), index_col=False) + return df + + def save_to_hdf5(self, df, event, outputPath): + key = list(df.columns) + + # TODO: clean up these if branches + if len(key) == 3: + arr1 = np.array(["timestamps", "data", "sampling_rate"]) + arr2 = np.char.lower(np.array(key)) + if (np.sort(arr1) == 
np.sort(arr2)).all() == False: + logger.error("\033[1m" + "Column names should be timestamps, data and sampling_rate" + "\033[0m") + raise Exception("\033[1m" + "Column names should be timestamps, data and sampling_rate" + "\033[0m") + + if len(key) == 1: + if key[0].lower() != "timestamps": + logger.error("\033[1m" + "Column names should be timestamps, data and sampling_rate" + "\033[0m") + raise Exception("\033[1m" + "Column name should be timestamps" + "\033[0m") + + if len(key) != 3 and len(key) != 1: + logger.error( + "\033[1m" + + "Number of columns in csv file should be either three or one. Three columns if \ + the file is for control or signal data or one column if the file is for event TTLs." + + "\033[0m" + ) + raise Exception( + "\033[1m" + + "Number of columns in csv file should be either three or one. Three columns if \ + the file is for control or signal data or one column if the file is for event TTLs." + + "\033[0m" + ) + + for i in range(len(key)): + write_hdf5(df[key[i]].dropna(), event, outputPath, key[i].lower()) + + logger.info("\033[1m" + "Reading data for {} from csv file is completed.".format(event) + "\033[0m") diff --git a/src/guppy/readTevTsq.py b/src/guppy/readTevTsq.py index c5c52da..f2c9419 100755 --- a/src/guppy/readTevTsq.py +++ b/src/guppy/readTevTsq.py @@ -7,8 +7,12 @@ import numpy as np -from guppy.doric_step3 import execute_import_doric -from guppy.extractors import execute_import_csv, execute_readtev +from guppy.extractors import ( + execute_import_csv, + execute_import_doric, + execute_import_npm, + execute_readtev, +) logger = logging.getLogger(__name__) @@ -35,6 +39,7 @@ def readRawData(inputParameters): folderNames = inputParameters["folderNames"] numProcesses = inputParameters["numberOfCores"] modality = inputParameters["modality"] + num_ch = inputParameters["noChannels"] storesListPath = [] if numProcesses == 0: numProcesses = mp.cpu_count() @@ -75,7 +80,7 @@ def readRawData(inputParameters): elif modality == "csv": execute_import_csv(filepath, events, op, numProcesses) elif modality == "npm": - raise NotImplementedError("NPM modality is not yet implemented.") + execute_import_npm(filepath, num_ch, inputParameters, events, op, numProcesses) else: raise ValueError("Modality not recognized. Please use 'tdt', 'csv', 'doric', or 'npm'.") From 9b88cad73cbf64fa7648b4210682aeefff9d2782 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Thu, 20 Nov 2025 16:45:18 -0800 Subject: [PATCH 029/125] Add a tdt_check_data example session to the tests. --- tests/test_step2.py | 3 ++- tests/test_step3.py | 3 ++- tests/test_step4.py | 3 ++- tests/test_step5.py | 3 ++- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/tests/test_step2.py b/tests/test_step2.py index 01d32e2..b34fe64 100644 --- a/tests/test_step2.py +++ b/tests/test_step2.py @@ -87,8 +87,9 @@ { "405R": "control_region", "490R": "signal_region", - "Tick": "ttl", + "PAB/": "ttl", }, + "tdt", ), # TODO: Add sampleData_NPM_1 after fixing Doric vs. NPM determination bug. 
( diff --git a/tests/test_step3.py b/tests/test_step3.py index cfe2294..330d017 100644 --- a/tests/test_step3.py +++ b/tests/test_step3.py @@ -88,8 +88,9 @@ def storenames_map(): { "405R": "control_region", "490R": "signal_region", - "Tick": "ttl", + "PAB/": "ttl", }, + "tdt", ), ( "SampleData_with_artifacts/Photo_048_392-200728-121222", diff --git a/tests/test_step4.py b/tests/test_step4.py index d691d06..cdaf0ec 100644 --- a/tests/test_step4.py +++ b/tests/test_step4.py @@ -92,10 +92,11 @@ { "405R": "control_region", "490R": "signal_region", - "Tick": "ttl", + "PAB/": "ttl", }, "region", "ttl", + "tdt", ), ( "SampleData_with_artifacts/Photo_048_392-200728-121222", diff --git a/tests/test_step5.py b/tests/test_step5.py index ddd6935..4bed772 100644 --- a/tests/test_step5.py +++ b/tests/test_step5.py @@ -92,10 +92,11 @@ { "405R": "control_region", "490R": "signal_region", - "Tick": "ttl", + "PAB/": "ttl", }, "region", "ttl", + "tdt", ), ( "SampleData_with_artifacts/Photo_048_392-200728-121222", From 73e6a1c3586ec361155bde7cec610729412d7041 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 21 Nov 2025 09:28:59 -0800 Subject: [PATCH 030/125] Added event-splitting to tdt --- .../extractors/tdt_recording_extractor.py | 49 +++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/src/guppy/extractors/tdt_recording_extractor.py b/src/guppy/extractors/tdt_recording_extractor.py index 1d46b1e..2cc2f15 100644 --- a/src/guppy/extractors/tdt_recording_extractor.py +++ b/src/guppy/extractors/tdt_recording_extractor.py @@ -26,6 +26,8 @@ def execute_readtev(folder_path, events, outputPath, numProcesses=mp.cpu_count() def read_tdt_and_save_hdf5(extractor, event, outputPath): S = extractor.readtev(event=event) extractor.save_dict_to_hdf5(S=S, event=event, outputPath=outputPath) + if extractor.event_needs_splitting(data=S["data"], sampling_rate=S["sampling_rate"]): + extractor.split_event_data(S, event, outputPath) logger.info("Data for event {} fetched and stored.".format(event)) @@ -148,6 +150,53 @@ def ismember(self, arr, element): # TODO: replace this function with more stand res = [1 if i == element else 0 for i in arr] return np.asarray(res) + # TODO: this is broken, and I need to fix it. + def event_needs_splitting(self, data, sampling_rate): + diff = np.diff(data) + if diff.shape[0] == 0: + return False + if sampling_rate == 0 and not (np.all(diff == diff[0])): + return True + return False + + def split_event_data(self, S, event, outputPath): + event = event.replace("\\", "") + event = event.replace("/", "") + logger.info("Checking event storename data for creating multiple event names from single event storename...") + storesList = np.genfromtxt(os.path.join(outputPath, "storesList.csv"), dtype="str", delimiter=",").reshape( + 2, -1 + ) + logger.info("\033[1m" + "Data in event {} belongs to multiple behavior".format(event) + "\033[0m") + logger.debug( + "\033[1m" + "Create timestamp files for individual new event and change the stores list file." 
+ "\033[0m" + ) + i_d = np.unique(S["data"]) + for i in range(i_d.shape[0]): + new_S = dict() + idx = np.where(S["data"] == i_d[i])[0] + new_S["timestamps"] = S["timestamps"][idx] + new_S["storename"] = event + str(int(i_d[i])) + new_S["sampling_rate"] = S["sampling_rate"] + new_S["data"] = S["data"] + new_S["npoints"] = S["npoints"] + new_S["channels"] = S["channels"] + storesList = np.concatenate( + (storesList, [[event + str(int(i_d[i]))], [event + "_" + str(int(i_d[i]))]]), axis=1 + ) + self.save_dict_to_hdf5(new_S, event + str(int(i_d[i])), outputPath) + + idx = np.where(storesList[0] == event)[0] + storesList = np.delete(storesList, idx, axis=1) + if not os.path.exists(os.path.join(outputPath, ".cache_storesList.csv")): + os.rename(os.path.join(outputPath, "storesList.csv"), os.path.join(outputPath, ".cache_storesList.csv")) + if idx.shape[0] == 0: + pass + else: + np.savetxt(os.path.join(outputPath, "storesList.csv"), storesList, delimiter=",", fmt="%s") + logger.info( + "\033[1m Timestamp files for individual new event are created and the stores list file is changed.\033[0m" + ) + # function to save data read from tev file to hdf5 file def save_dict_to_hdf5(self, S, event, outputPath): write_hdf5(S["storename"], event, outputPath, "storename") From a036090c79e166a6d454e3997d2b984867e4d469 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 21 Nov 2025 10:12:05 -0800 Subject: [PATCH 031/125] Fixed event vs. new event bug. --- src/guppy/extractors/tdt_recording_extractor.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/guppy/extractors/tdt_recording_extractor.py b/src/guppy/extractors/tdt_recording_extractor.py index 2cc2f15..527235f 100644 --- a/src/guppy/extractors/tdt_recording_extractor.py +++ b/src/guppy/extractors/tdt_recording_extractor.py @@ -160,8 +160,9 @@ def event_needs_splitting(self, data, sampling_rate): return False def split_event_data(self, S, event, outputPath): - event = event.replace("\\", "") - event = event.replace("/", "") + # Note that new_event is only used for the new storesList and event is still used for the old storesList + new_event = event.replace("\\", "") + new_event = event.replace("/", "") logger.info("Checking event storename data for creating multiple event names from single event storename...") storesList = np.genfromtxt(os.path.join(outputPath, "storesList.csv"), dtype="str", delimiter=",").reshape( 2, -1 @@ -175,15 +176,15 @@ def split_event_data(self, S, event, outputPath): new_S = dict() idx = np.where(S["data"] == i_d[i])[0] new_S["timestamps"] = S["timestamps"][idx] - new_S["storename"] = event + str(int(i_d[i])) + new_S["storename"] = new_event + str(int(i_d[i])) new_S["sampling_rate"] = S["sampling_rate"] new_S["data"] = S["data"] new_S["npoints"] = S["npoints"] new_S["channels"] = S["channels"] storesList = np.concatenate( - (storesList, [[event + str(int(i_d[i]))], [event + "_" + str(int(i_d[i]))]]), axis=1 + (storesList, [[new_event + str(int(i_d[i]))], [new_event + "_" + str(int(i_d[i]))]]), axis=1 ) - self.save_dict_to_hdf5(new_S, event + str(int(i_d[i])), outputPath) + self.save_dict_to_hdf5(new_S, new_event + str(int(i_d[i])), outputPath) idx = np.where(storesList[0] == event)[0] storesList = np.delete(storesList, idx, axis=1) From 7ecdf7809454bd5aee9b9b3a3a9164437784edd1 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 21 Nov 2025 10:12:54 -0800 Subject: [PATCH 032/125] Fixed event vs. new event bug. 
--- src/guppy/extractors/tdt_recording_extractor.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/guppy/extractors/tdt_recording_extractor.py b/src/guppy/extractors/tdt_recording_extractor.py index 527235f..71c8d29 100644 --- a/src/guppy/extractors/tdt_recording_extractor.py +++ b/src/guppy/extractors/tdt_recording_extractor.py @@ -150,7 +150,6 @@ def ismember(self, arr, element): # TODO: replace this function with more stand res = [1 if i == element else 0 for i in arr] return np.asarray(res) - # TODO: this is broken, and I need to fix it. def event_needs_splitting(self, data, sampling_rate): diff = np.diff(data) if diff.shape[0] == 0: From b87e79ff4409d889fcdb4536d328f4189043aec8 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 21 Nov 2025 12:07:35 -0800 Subject: [PATCH 033/125] Refactored save_dict_to_hdf5 to compute event from S. --- .../extractors/tdt_recording_extractor.py | 67 +++++-------------- 1 file changed, 15 insertions(+), 52 deletions(-) diff --git a/src/guppy/extractors/tdt_recording_extractor.py b/src/guppy/extractors/tdt_recording_extractor.py index 71c8d29..530ccc5 100644 --- a/src/guppy/extractors/tdt_recording_extractor.py +++ b/src/guppy/extractors/tdt_recording_extractor.py @@ -25,7 +25,7 @@ def execute_readtev(folder_path, events, outputPath, numProcesses=mp.cpu_count() def read_tdt_and_save_hdf5(extractor, event, outputPath): S = extractor.readtev(event=event) - extractor.save_dict_to_hdf5(S=S, event=event, outputPath=outputPath) + extractor.save_dict_to_hdf5(S=S, outputPath=outputPath) if extractor.event_needs_splitting(data=S["data"], sampling_rate=S["sampling_rate"]): extractor.split_event_data(S, event, outputPath) logger.info("Data for event {} fetched and stored.".format(event)) @@ -145,6 +145,17 @@ def readtev(self, event): return S + def read(self, events): + output_dicts = [] + for event in events: + S = self.readtev(event=event) + if self.event_needs_splitting(data=S["data"], sampling_rate=S["sampling_rate"]): + event_dicts = self.split_event_data(S, event, None) + else: + event_dicts = [S] + output_dicts.extend(event_dicts) + return output_dicts + # check if a particular element is there in an array or not def ismember(self, arr, element): # TODO: replace this function with more standard usage res = [1 if i == element else 0 for i in arr] @@ -183,7 +194,7 @@ def split_event_data(self, S, event, outputPath): storesList = np.concatenate( (storesList, [[new_event + str(int(i_d[i]))], [new_event + "_" + str(int(i_d[i]))]]), axis=1 ) - self.save_dict_to_hdf5(new_S, new_event + str(int(i_d[i])), outputPath) + self.save_dict_to_hdf5(new_S, outputPath) idx = np.where(storesList[0] == event)[0] storesList = np.delete(storesList, idx, axis=1) @@ -198,7 +209,8 @@ def split_event_data(self, S, event, outputPath): ) # function to save data read from tev file to hdf5 file - def save_dict_to_hdf5(self, S, event, outputPath): + def save_dict_to_hdf5(self, S, outputPath): + event = S["storename"] write_hdf5(S["storename"], event, outputPath, "storename") write_hdf5(S["sampling_rate"], event, outputPath, "sampling_rate") write_hdf5(S["timestamps"], event, outputPath, "timestamps") @@ -206,52 +218,3 @@ def save_dict_to_hdf5(self, S, event, outputPath): write_hdf5(S["data"], event, outputPath, "data") write_hdf5(S["npoints"], event, outputPath, "npoints") write_hdf5(S["channels"], event, outputPath, "channels") - - # function to check event data (checking whether event timestamps belongs to same event or multiple events) - def check_data(self, S, event, 
outputPath): # TODO: fold this function into the main read/get function - # logger.info("Checking event storename data for creating multiple event names from single event storename...") - new_event = event.replace("\\", "") - new_event = event.replace("/", "") - diff = np.diff(S["data"]) - arr = np.full(diff.shape[0], 1) - - storesList = np.genfromtxt(os.path.join(outputPath, "storesList.csv"), dtype="str", delimiter=",").reshape( - 2, -1 - ) - - if diff.shape[0] == 0: - return 0 - - if S["sampling_rate"] == 0 and np.all(diff == diff[0]) == False: - logger.info("\033[1m" + "Data in event {} belongs to multiple behavior".format(event) + "\033[0m") - logger.debug( - "\033[1m" - + "Create timestamp files for individual new event and change the stores list file." - + "\033[0m" - ) - i_d = np.unique(S["data"]) - for i in range(i_d.shape[0]): - new_S = dict() - idx = np.where(S["data"] == i_d[i])[0] - new_S["timestamps"] = S["timestamps"][idx] - new_S["storename"] = new_event + str(int(i_d[i])) - new_S["sampling_rate"] = S["sampling_rate"] - new_S["data"] = S["data"] - new_S["npoints"] = S["npoints"] - new_S["channels"] = S["channels"] - storesList = np.concatenate( - (storesList, [[new_event + str(int(i_d[i]))], [new_event + "_" + str(int(i_d[i]))]]), axis=1 - ) - self.save_dict_to_hdf5(new_S, new_event + str(int(i_d[i])), outputPath) - - idx = np.where(storesList[0] == event)[0] - storesList = np.delete(storesList, idx, axis=1) - if not os.path.exists(os.path.join(outputPath, ".cache_storesList.csv")): - os.rename(os.path.join(outputPath, "storesList.csv"), os.path.join(outputPath, ".cache_storesList.csv")) - if idx.shape[0] == 0: - pass - else: - np.savetxt(os.path.join(outputPath, "storesList.csv"), storesList, delimiter=",", fmt="%s") - logger.info( - "\033[1m Timestamp files for individual new event are created and the stores list file is changed.\033[0m" - ) From 11922663bd537b4c6dddf5f460ddb959ff1cc993 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 21 Nov 2025 12:27:33 -0800 Subject: [PATCH 034/125] Peeled split_event_storesList from split_event_data. 
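split_event_data now only builds and returns the per-behavior dictionaries, while the storesList.csv rewrite moves into the new split_event_storesList, and execute_readtev reduces to read-then-save. A rough sketch of how the pieces compose after this commit, assuming the class is importable as guppy.extractors.tdt_recording_extractor and using placeholder paths ("PAB/" is the multi-behavior event from the sample session, and a storesList.csv from Step 2 must already exist in the output folder):

    from guppy.extractors.tdt_recording_extractor import TdtRecordingExtractor

    extractor = TdtRecordingExtractor(folder_path="/path/to/tdt/session")
    # When an event needs splitting, read() also rewrites storesList.csv as a side effect.
    output_dicts = extractor.read(events=["PAB/"], outputPath="/path/to/output")
    for S in output_dicts:
        extractor.save_dict_to_hdf5(S=S, outputPath="/path/to/output")

The real entry point remains execute_readtev, which wraps the same calls with timing.
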
--- .../extractors/tdt_recording_extractor.py | 65 +++++++++++++------ 1 file changed, 46 insertions(+), 19 deletions(-) diff --git a/src/guppy/extractors/tdt_recording_extractor.py b/src/guppy/extractors/tdt_recording_extractor.py index 530ccc5..0659d3a 100644 --- a/src/guppy/extractors/tdt_recording_extractor.py +++ b/src/guppy/extractors/tdt_recording_extractor.py @@ -3,7 +3,6 @@ import multiprocessing as mp import os import time -from itertools import repeat import numpy as np import pandas as pd @@ -14,23 +13,32 @@ logger = logging.getLogger(__name__) -# function to execute readtev function using multiprocessing to make it faster +# # function to execute readtev function using multiprocessing to make it faster +# def execute_readtev(folder_path, events, outputPath, numProcesses=mp.cpu_count()): +# extractor = TdtRecordingExtractor(folder_path=folder_path) +# start = time.time() +# with mp.Pool(numProcesses) as p: +# p.starmap(read_tdt_and_save_hdf5, zip(repeat(extractor), events, repeat(outputPath))) +# logger.info("Time taken = {0:.5f}".format(time.time() - start)) + + +# def read_tdt_and_save_hdf5(extractor, event, outputPath): +# S = extractor.readtev(event=event) +# extractor.save_dict_to_hdf5(S=S, outputPath=outputPath) +# if extractor.event_needs_splitting(data=S["data"], sampling_rate=S["sampling_rate"]): +# extractor.split_event_data(S, event, outputPath) +# logger.info("Data for event {} fetched and stored.".format(event)) + + def execute_readtev(folder_path, events, outputPath, numProcesses=mp.cpu_count()): extractor = TdtRecordingExtractor(folder_path=folder_path) start = time.time() - with mp.Pool(numProcesses) as p: - p.starmap(read_tdt_and_save_hdf5, zip(repeat(extractor), events, repeat(outputPath))) + output_dicts = extractor.read(events=events, outputPath=outputPath) + for S in output_dicts: + extractor.save_dict_to_hdf5(S=S, outputPath=outputPath) logger.info("Time taken = {0:.5f}".format(time.time() - start)) -def read_tdt_and_save_hdf5(extractor, event, outputPath): - S = extractor.readtev(event=event) - extractor.save_dict_to_hdf5(S=S, outputPath=outputPath) - if extractor.event_needs_splitting(data=S["data"], sampling_rate=S["sampling_rate"]): - extractor.split_event_data(S, event, outputPath) - logger.info("Data for event {} fetched and stored.".format(event)) - - class TdtRecordingExtractor: def __init__(self, folder_path): @@ -145,12 +153,13 @@ def readtev(self, event): return S - def read(self, events): + def read(self, events, outputPath): output_dicts = [] for event in events: S = self.readtev(event=event) if self.event_needs_splitting(data=S["data"], sampling_rate=S["sampling_rate"]): - event_dicts = self.split_event_data(S, event, None) + event_dicts = self.split_event_data(S, event) + self.split_event_storesList(S, event, outputPath) else: event_dicts = [S] output_dicts.extend(event_dicts) @@ -169,19 +178,17 @@ def event_needs_splitting(self, data, sampling_rate): return True return False - def split_event_data(self, S, event, outputPath): + def split_event_data(self, S, event): # Note that new_event is only used for the new storesList and event is still used for the old storesList new_event = event.replace("\\", "") new_event = event.replace("/", "") logger.info("Checking event storename data for creating multiple event names from single event storename...") - storesList = np.genfromtxt(os.path.join(outputPath, "storesList.csv"), dtype="str", delimiter=",").reshape( - 2, -1 - ) logger.info("\033[1m" + "Data in event {} belongs to multiple 
behavior".format(event) + "\033[0m") logger.debug( "\033[1m" + "Create timestamp files for individual new event and change the stores list file." + "\033[0m" ) i_d = np.unique(S["data"]) + event_dicts = [S] for i in range(i_d.shape[0]): new_S = dict() idx = np.where(S["data"] == i_d[i])[0] @@ -191,10 +198,30 @@ def split_event_data(self, S, event, outputPath): new_S["data"] = S["data"] new_S["npoints"] = S["npoints"] new_S["channels"] = S["channels"] + event_dicts.append(new_S) + logger.info( + "\033[1m Timestamp files for individual new event are created and the stores list file is changed.\033[0m" + ) + + return event_dicts + + def split_event_storesList(self, S, event, outputPath): + # Note that new_event is only used for the new storesList and event is still used for the old storesList + new_event = event.replace("\\", "") + new_event = event.replace("/", "") + logger.info("Checking event storename data for creating multiple event names from single event storename...") + storesList = np.genfromtxt(os.path.join(outputPath, "storesList.csv"), dtype="str", delimiter=",").reshape( + 2, -1 + ) + logger.info("\033[1m" + "Data in event {} belongs to multiple behavior".format(event) + "\033[0m") + logger.debug( + "\033[1m" + "Create timestamp files for individual new event and change the stores list file." + "\033[0m" + ) + i_d = np.unique(S["data"]) + for i in range(i_d.shape[0]): storesList = np.concatenate( (storesList, [[new_event + str(int(i_d[i]))], [new_event + "_" + str(int(i_d[i]))]]), axis=1 ) - self.save_dict_to_hdf5(new_S, outputPath) idx = np.where(storesList[0] == event)[0] storesList = np.delete(storesList, idx, axis=1) From 9231f5fc01192b810eb82d216bee819a06bc934e Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 21 Nov 2025 12:31:49 -0800 Subject: [PATCH 035/125] updated logging. --- .../extractors/tdt_recording_extractor.py | 21 ++++++------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/src/guppy/extractors/tdt_recording_extractor.py b/src/guppy/extractors/tdt_recording_extractor.py index 0659d3a..4743185 100644 --- a/src/guppy/extractors/tdt_recording_extractor.py +++ b/src/guppy/extractors/tdt_recording_extractor.py @@ -171,6 +171,7 @@ def ismember(self, arr, element): # TODO: replace this function with more stand return np.asarray(res) def event_needs_splitting(self, data, sampling_rate): + logger.info("Checking event storename data for creating multiple event names from single event storename...") diff = np.diff(data) if diff.shape[0] == 0: return False @@ -182,11 +183,8 @@ def split_event_data(self, S, event): # Note that new_event is only used for the new storesList and event is still used for the old storesList new_event = event.replace("\\", "") new_event = event.replace("/", "") - logger.info("Checking event storename data for creating multiple event names from single event storename...") logger.info("\033[1m" + "Data in event {} belongs to multiple behavior".format(event) + "\033[0m") - logger.debug( - "\033[1m" + "Create timestamp files for individual new event and change the stores list file." + "\033[0m" - ) + logger.debug("\033[1m" + "Create timestamp files for individual new event." 
+ "\033[0m") i_d = np.unique(S["data"]) event_dicts = [S] for i in range(i_d.shape[0]): @@ -199,9 +197,7 @@ def split_event_data(self, S, event): new_S["npoints"] = S["npoints"] new_S["channels"] = S["channels"] event_dicts.append(new_S) - logger.info( - "\033[1m Timestamp files for individual new event are created and the stores list file is changed.\033[0m" - ) + logger.info("\033[1m Timestamp files for individual new event are created.\033[0m") return event_dicts @@ -209,14 +205,11 @@ def split_event_storesList(self, S, event, outputPath): # Note that new_event is only used for the new storesList and event is still used for the old storesList new_event = event.replace("\\", "") new_event = event.replace("/", "") - logger.info("Checking event storename data for creating multiple event names from single event storename...") storesList = np.genfromtxt(os.path.join(outputPath, "storesList.csv"), dtype="str", delimiter=",").reshape( 2, -1 ) - logger.info("\033[1m" + "Data in event {} belongs to multiple behavior".format(event) + "\033[0m") - logger.debug( - "\033[1m" + "Create timestamp files for individual new event and change the stores list file." + "\033[0m" - ) + logger.info("\033[1m" + "StoresList in event {} belongs to multiple behavior".format(event) + "\033[0m") + logger.debug("\033[1m" + "Change the stores list file for individual new event." + "\033[0m") i_d = np.unique(S["data"]) for i in range(i_d.shape[0]): storesList = np.concatenate( @@ -231,9 +224,7 @@ def split_event_storesList(self, S, event, outputPath): pass else: np.savetxt(os.path.join(outputPath, "storesList.csv"), storesList, delimiter=",", fmt="%s") - logger.info( - "\033[1m Timestamp files for individual new event are created and the stores list file is changed.\033[0m" - ) + logger.info("\033[1m The stores list file is changed.\033[0m") # function to save data read from tev file to hdf5 file def save_dict_to_hdf5(self, S, outputPath): From ddf6ae5a34effe3e835e5107b18242307dcaa42c Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 21 Nov 2025 12:37:10 -0800 Subject: [PATCH 036/125] Added high-level save --- src/guppy/extractors/tdt_recording_extractor.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/guppy/extractors/tdt_recording_extractor.py b/src/guppy/extractors/tdt_recording_extractor.py index 4743185..a503bb2 100644 --- a/src/guppy/extractors/tdt_recording_extractor.py +++ b/src/guppy/extractors/tdt_recording_extractor.py @@ -34,8 +34,7 @@ def execute_readtev(folder_path, events, outputPath, numProcesses=mp.cpu_count() extractor = TdtRecordingExtractor(folder_path=folder_path) start = time.time() output_dicts = extractor.read(events=events, outputPath=outputPath) - for S in output_dicts: - extractor.save_dict_to_hdf5(S=S, outputPath=outputPath) + extractor.save(output_dicts=output_dicts, outputPath=outputPath) logger.info("Time taken = {0:.5f}".format(time.time() - start)) @@ -236,3 +235,7 @@ def save_dict_to_hdf5(self, S, outputPath): write_hdf5(S["data"], event, outputPath, "data") write_hdf5(S["npoints"], event, outputPath, "npoints") write_hdf5(S["channels"], event, outputPath, "channels") + + def save(self, output_dicts, outputPath): + for S in output_dicts: + self.save_dict_to_hdf5(S=S, outputPath=outputPath) From 212c7c5a7cf3f22e84804d77762978493d06aa5c Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 21 Nov 2025 12:40:03 -0800 Subject: [PATCH 037/125] Added TODO --- src/guppy/extractors/tdt_recording_extractor.py | 3 +++ 1 file changed, 3 insertions(+) diff 
--git a/src/guppy/extractors/tdt_recording_extractor.py b/src/guppy/extractors/tdt_recording_extractor.py index a503bb2..b5dc670 100644 --- a/src/guppy/extractors/tdt_recording_extractor.py +++ b/src/guppy/extractors/tdt_recording_extractor.py @@ -200,6 +200,9 @@ def split_event_data(self, S, event): return event_dicts + # This function saves a new storesList.csv file, which is a bit of a side effect in the overall read path, + # which is supposed to just return a list of dictionaries. + # TODO: long term I'd like to move these storesList shenanigans somewhere else, likely outside of the extractor. def split_event_storesList(self, S, event, outputPath): # Note that new_event is only used for the new storesList and event is still used for the old storesList new_event = event.replace("\\", "") From 33682d26b074ac9f44bc8fd64f9c9bcae5171656 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 21 Nov 2025 12:55:18 -0800 Subject: [PATCH 038/125] Added multi-processing back in. --- .../extractors/tdt_recording_extractor.py | 23 +++++-------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/src/guppy/extractors/tdt_recording_extractor.py b/src/guppy/extractors/tdt_recording_extractor.py index b5dc670..58cde99 100644 --- a/src/guppy/extractors/tdt_recording_extractor.py +++ b/src/guppy/extractors/tdt_recording_extractor.py @@ -3,6 +3,7 @@ import multiprocessing as mp import os import time +from itertools import repeat import numpy as np import pandas as pd @@ -13,28 +14,16 @@ logger = logging.getLogger(__name__) -# # function to execute readtev function using multiprocessing to make it faster -# def execute_readtev(folder_path, events, outputPath, numProcesses=mp.cpu_count()): -# extractor = TdtRecordingExtractor(folder_path=folder_path) -# start = time.time() -# with mp.Pool(numProcesses) as p: -# p.starmap(read_tdt_and_save_hdf5, zip(repeat(extractor), events, repeat(outputPath))) -# logger.info("Time taken = {0:.5f}".format(time.time() - start)) - - -# def read_tdt_and_save_hdf5(extractor, event, outputPath): -# S = extractor.readtev(event=event) -# extractor.save_dict_to_hdf5(S=S, outputPath=outputPath) -# if extractor.event_needs_splitting(data=S["data"], sampling_rate=S["sampling_rate"]): -# extractor.split_event_data(S, event, outputPath) -# logger.info("Data for event {} fetched and stored.".format(event)) +def read_and_save_tdt(extractor, event, outputPath): + output_dicts = extractor.read(events=[event], outputPath=outputPath) + extractor.save(output_dicts=output_dicts, outputPath=outputPath) def execute_readtev(folder_path, events, outputPath, numProcesses=mp.cpu_count()): extractor = TdtRecordingExtractor(folder_path=folder_path) start = time.time() - output_dicts = extractor.read(events=events, outputPath=outputPath) - extractor.save(output_dicts=output_dicts, outputPath=outputPath) + with mp.Pool(numProcesses) as p: + p.starmap(read_and_save_tdt, zip(repeat(extractor), events, repeat(outputPath))) logger.info("Time taken = {0:.5f}".format(time.time() - start)) From f84c550bb181fa51a53587fd4374266746c6c88e Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 21 Nov 2025 14:47:45 -0800 Subject: [PATCH 039/125] Fixed test_step5.py for tdt_check_data --- tests/test_step5.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/test_step5.py b/tests/test_step5.py index 4bed772..870fb7c 100644 --- a/tests/test_step5.py +++ b/tests/test_step5.py @@ -95,7 +95,7 @@ "PAB/": "ttl", }, "region", - "ttl", + ["PAB_0", "PAB_16", "PAB_2064"], # This 
session has an event which gets split into three sub-events. "tdt", ), ( @@ -278,7 +278,13 @@ def test_step5(tmp_path, monkeypatch, session_subdir, storenames_map, expected_r assert os.path.exists(stores_fp), "Missing storesList.csv after Steps 2-5" # Expected PSTH outputs (defaults compute z_score PSTH) - only for datasets with TTLs - if expected_ttl is not None: + if expected_ttl is None: + expected_ttls = [] + elif isinstance(expected_ttl, str): + expected_ttls = [expected_ttl] + else: + expected_ttls = expected_ttl + for expected_ttl in expected_ttls: psth_h5 = os.path.join(out_dir, f"{expected_ttl}_{expected_region}_z_score_{expected_region}.h5") psth_baseline_uncorr_h5 = os.path.join( out_dir, f"{expected_ttl}_{expected_region}_baselineUncorrected_z_score_{expected_region}.h5" From c55a230bd8034a608d5e7cbd259bed5d20a4b282 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 21 Nov 2025 14:50:04 -0800 Subject: [PATCH 040/125] Fixed test_step4.py for tdt_check_data --- tests/test_step4.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/test_step4.py b/tests/test_step4.py index cdaf0ec..109e7da 100644 --- a/tests/test_step4.py +++ b/tests/test_step4.py @@ -95,7 +95,7 @@ "PAB/": "ttl", }, "region", - "ttl", + ["PAB_0", "PAB_16", "PAB_2064"], # This session has an event which gets split into three sub-events. "tdt", ), ( @@ -272,7 +272,13 @@ def test_step4(tmp_path, monkeypatch, session_subdir, storenames_map, expected_r assert "timestampNew" in f, f"Expected 'timestampNew' dataset in {timecorr}" # If TTLs exist, check their per-region 'ts' outputs - if expected_ttl is not None: + if expected_ttl is None: + expected_ttls = [] + elif isinstance(expected_ttl, str): + expected_ttls = [expected_ttl] + else: + expected_ttls = expected_ttl + for expected_ttl in expected_ttls: ttl_fp = os.path.join(out_dir, f"{expected_ttl}_{expected_region}.hdf5") assert os.path.exists(ttl_fp), f"Missing TTL-aligned file {ttl_fp}" with h5py.File(ttl_fp, "r") as f: From 03ffd54c7c61d449cab7ac077f3bf1e746e206fb Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 21 Nov 2025 15:00:59 -0800 Subject: [PATCH 041/125] Renamed test_case from tdt_check_data to tdt_split_event. 
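Pure rename with no behavior change: the parametrized test id now describes the split-event fixture rather than the removed check_data helper. For illustration only (an excerpt, with the surrounding ids elided), the ids list repeated across tests/test_step2.py through tests/test_step5.py now reads:

    ids = [
        "tdt_clean",
        "tdt_split_event",  # previously "tdt_check_data"
        "tdt_with_artifacts",
    ]
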
--- tests/test_step2.py | 2 +- tests/test_step3.py | 2 +- tests/test_step4.py | 2 +- tests/test_step5.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_step2.py b/tests/test_step2.py index b34fe64..f7e34d1 100644 --- a/tests/test_step2.py +++ b/tests/test_step2.py @@ -136,7 +136,7 @@ "sample_doric_4", "sample_doric_5", "tdt_clean", - "tdt_check_data", + "tdt_split_event", "tdt_with_artifacts", "sample_npm_2", "sample_npm_3", diff --git a/tests/test_step3.py b/tests/test_step3.py index 330d017..26dac14 100644 --- a/tests/test_step3.py +++ b/tests/test_step3.py @@ -145,7 +145,7 @@ def storenames_map(): "sample_doric_4", "sample_doric_5", "tdt_clean", - "tdt_check_data", + "tdt_split_event", "tdt_with_artifacts", "sample_npm_2", "sample_npm_3", diff --git a/tests/test_step4.py b/tests/test_step4.py index 109e7da..df18f75 100644 --- a/tests/test_step4.py +++ b/tests/test_step4.py @@ -161,7 +161,7 @@ "sample_doric_4", "sample_doric_5", "tdt_clean", - "tdt_check_data", + "tdt_split_event", "tdt_with_artifacts", "sample_npm_2", "sample_npm_3", diff --git a/tests/test_step5.py b/tests/test_step5.py index 870fb7c..a8cdeb4 100644 --- a/tests/test_step5.py +++ b/tests/test_step5.py @@ -161,7 +161,7 @@ "sample_doric_4", "sample_doric_5", "tdt_clean", - "tdt_check_data", + "tdt_split_event", "tdt_with_artifacts", "sample_npm_2", "sample_npm_3", From 27acc6cecee233b83dd8d6961bbf5fc5bb669a74 Mon Sep 17 00:00:00 2001 From: Paul Adkisson-Floro Date: Mon, 1 Dec 2025 19:43:14 -0500 Subject: [PATCH 042/125] Standardize read and save (#188) --- .../extractors/csv_recording_extractor.py | 23 ++- .../extractors/doric_recording_extractor.py | 139 +++++++++++------- .../extractors/npm_recording_extractor.py | 23 ++- 3 files changed, 126 insertions(+), 59 deletions(-) diff --git a/src/guppy/extractors/csv_recording_extractor.py b/src/guppy/extractors/csv_recording_extractor.py index 3df76f6..5a42bd1 100644 --- a/src/guppy/extractors/csv_recording_extractor.py +++ b/src/guppy/extractors/csv_recording_extractor.py @@ -19,13 +19,13 @@ def execute_import_csv(filepath, events, outputPath, numProcesses=mp.cpu_count() extractor = CsvRecordingExtractor(folder_path=filepath) start = time.time() with mp.Pool(numProcesses) as p: - p.starmap(read_csv_and_save_hdf5, zip(repeat(extractor), events, repeat(outputPath))) + p.starmap(read_and_save_csv, zip(repeat(extractor), events, repeat(outputPath))) logger.info("Time taken = {0:.5f}".format(time.time() - start)) -def read_csv_and_save_hdf5(extractor, event, outputPath): - df = extractor.read_csv(event=event) - extractor.save_to_hdf5(df=df, event=event, outputPath=outputPath) +def read_and_save_csv(extractor, event, outputPath): + output_dicts = extractor.read(events=[event], outputPath=outputPath) + extractor.save(output_dicts=output_dicts, outputPath=outputPath) logger.info("Data for event {} fetched and stored.".format(event)) @@ -178,3 +178,18 @@ def save_to_hdf5(self, df, event, outputPath): write_hdf5(df[key[i]].dropna(), event, outputPath, key[i].lower()) logger.info("\033[1m" + "Reading data for {} from csv file is completed.".format(event) + "\033[0m") + + def read(self, events, outputPath): + output_dicts = [] + for event in events: + df = self.read_csv(event=event) + S = df.to_dict() + S["storename"] = event + output_dicts.append(S) + return output_dicts + + def save(self, output_dicts, outputPath): + for S in output_dicts: + event = S.pop("storename") + df = pd.DataFrame.from_dict(S) + self.save_to_hdf5(df=df, event=event, 
outputPath=outputPath) diff --git a/src/guppy/extractors/doric_recording_extractor.py b/src/guppy/extractors/doric_recording_extractor.py index e5a97cb..2966ec6 100644 --- a/src/guppy/extractors/doric_recording_extractor.py +++ b/src/guppy/extractors/doric_recording_extractor.py @@ -15,18 +15,12 @@ def execute_import_doric(folder_path, storesList, outputPath): extractor = DoricRecordingExtractor(folder_path=folder_path) - flag = extractor.check_doric(folder_path) - - if flag == "doric_csv": - extractor.read_doric_csv(folder_path, storesList, outputPath) - elif flag == "doric_doric": - extractor.read_doric_doric(folder_path, storesList, outputPath) - else: - logger.error("Doric file not found or not recognized.") - raise FileNotFoundError("Doric file not found or not recognized.") + output_dicts = extractor.read(storesList=storesList) + extractor.save(output_dicts=output_dicts, outputPath=outputPath) class DoricRecordingExtractor: + # TODO: consolidate duplicate flag logic between the `__init__` and the `check_doric` method. def __init__(self, folder_path): self.folder_path = folder_path @@ -110,9 +104,9 @@ def separate_last_element(self, arr): l = arr[-1] return arr[:-1], l - def check_doric(self, filepath): + def check_doric(self): logger.debug("Checking if doric file exists") - path = glob.glob(os.path.join(filepath, "*.csv")) + glob.glob(os.path.join(filepath, "*.doric")) + path = glob.glob(os.path.join(self.folder_path, "*.csv")) + glob.glob(os.path.join(self.folder_path, "*.doric")) flag_arr = [] for i in range(len(path)): @@ -141,44 +135,50 @@ def check_doric(self, filepath): logger.info("Doric file found.") return flag_arr[0] - def read_doric_csv(self, filepath, storesList, outputPath): - path = glob.glob(os.path.join(filepath, "*.csv")) + def read_doric_csv(self, storesList): + path = glob.glob(os.path.join(self.folder_path, "*.csv")) if len(path) > 1: logger.error("An error occurred : More than one Doric csv file present at the location") raise Exception("More than one Doric csv file present at the location") - else: - df = pd.read_csv(path[0], header=1, index_col=False) - df = df.dropna(axis=1, how="all") - df = df.dropna(axis=0, how="any") - df["Time(s)"] = df["Time(s)"] - df["Time(s)"].to_numpy()[0] - for i in range(storesList.shape[1]): - if "control" in storesList[1, i] or "signal" in storesList[1, i]: - timestamps = np.array(df["Time(s)"]) - sampling_rate = np.array([1 / (timestamps[-1] - timestamps[-2])]) - write_hdf5(sampling_rate, storesList[0, i], outputPath, "sampling_rate") - write_hdf5(df["Time(s)"].to_numpy(), storesList[0, i], outputPath, "timestamps") - write_hdf5(df[storesList[0, i]].to_numpy(), storesList[0, i], outputPath, "data") - else: - ttl = df[storesList[0, i]] - indices = np.where(ttl <= 0)[0] - diff_indices = np.where(np.diff(indices) > 1)[0] - write_hdf5( - df["Time(s)"][indices[diff_indices] + 1].to_numpy(), storesList[0, i], outputPath, "timestamps" - ) - - def read_doric_doric(self, filepath, storesList, outputPath): - path = glob.glob(os.path.join(filepath, "*.doric")) + + df = pd.read_csv(path[0], header=1, index_col=False) + df = df.dropna(axis=1, how="all") + df = df.dropna(axis=0, how="any") + df["Time(s)"] = df["Time(s)"] - df["Time(s)"].to_numpy()[0] + + output_dicts = [] + for i in range(storesList.shape[1]): + if "control" in storesList[1, i] or "signal" in storesList[1, i]: + timestamps = np.array(df["Time(s)"]) + sampling_rate = np.array([1 / (timestamps[-1] - timestamps[-2])]) + data = np.array(df[storesList[0, i]]) + storename = 
storesList[0, i] + S = {"storename": storename, "sampling_rate": sampling_rate, "timestamps": timestamps, "data": data} + output_dicts.append(S) + else: + ttl = df[storesList[0, i]] + indices = np.where(ttl <= 0)[0] + diff_indices = np.where(np.diff(indices) > 1)[0] + timestamps = df["Time(s)"][indices[diff_indices] + 1].to_numpy() + storename = storesList[0, i] + S = {"storename": storename, "timestamps": timestamps} + output_dicts.append(S) + + return output_dicts + + def read_doric_doric(self, storesList): + path = glob.glob(os.path.join(self.folder_path, "*.doric")) if len(path) > 1: logger.error("An error occurred : More than one Doric file present at the location") raise Exception("More than one Doric file present at the location") - else: - with h5py.File(path[0], "r") as f: - if "Traces" in list(f.keys()): - keys = self.access_data_doricV1(f, storesList, outputPath) - elif list(f.keys()) == ["Configurations", "DataAcquisition"]: - keys = self.access_data_doricV6(f, storesList, outputPath) + with h5py.File(path[0], "r") as f: + if "Traces" in list(f.keys()): + output_dicts = self.access_data_doricV1(f, storesList) + elif list(f.keys()) == ["Configurations", "DataAcquisition"]: + output_dicts = self.access_data_doricV6(f, storesList) + return output_dicts - def access_data_doricV6(self, doric_file, storesList, outputPath): + def access_data_doricV6(self, doric_file, storesList): data = [doric_file["DataAcquisition"]] res = [] while len(data) != 0: @@ -201,6 +201,7 @@ def access_data_doricV6(self, doric_file, storesList, outputPath): if f"{sep_values[-2]}/{sep_values[-1]}" in storesList[0, :]: decide_path.append(element) + output_dicts = [] for i in range(storesList.shape[1]): if "control" in storesList[1, i] or "signal" in storesList[1, i]: regex = re.compile("(.*?)" + str(storesList[0, i]) + "(.*?)") @@ -212,9 +213,9 @@ def access_data_doricV6(self, doric_file, storesList, outputPath): data = np.array(doric_file[decide_path[idx]]) timestamps = np.array(doric_file[decide_path[idx].rsplit("/", 1)[0] + "/Time"]) sampling_rate = np.array([1 / (timestamps[-1] - timestamps[-2])]) - write_hdf5(sampling_rate, storesList[0, i], outputPath, "sampling_rate") - write_hdf5(timestamps, storesList[0, i], outputPath, "timestamps") - write_hdf5(data, storesList[0, i], outputPath, "data") + storename = storesList[0, i] + S = {"storename": storename, "sampling_rate": sampling_rate, "timestamps": timestamps, "data": data} + output_dicts.append(S) else: regex = re.compile("(.*?)" + storesList[0, i] + "$") idx = [i for i in range(len(decide_path)) if regex.match(decide_path[i])] @@ -226,21 +227,57 @@ def access_data_doricV6(self, doric_file, storesList, outputPath): timestamps = np.array(doric_file[decide_path[idx].rsplit("/", 1)[0] + "/Time"]) indices = np.where(ttl <= 0)[0] diff_indices = np.where(np.diff(indices) > 1)[0] - write_hdf5(timestamps[indices[diff_indices] + 1], storesList[0, i], outputPath, "timestamps") + timestamps = timestamps[indices[diff_indices] + 1] + storename = storesList[0, i] + S = {"storename": storename, "timestamps": timestamps} + output_dicts.append(S) - def access_data_doricV1(self, doric_file, storesList, outputPath): + return output_dicts + + def access_data_doricV1(self, doric_file, storesList): keys = list(doric_file["Traces"]["Console"].keys()) + output_dicts = [] for i in range(storesList.shape[1]): if "control" in storesList[1, i] or "signal" in storesList[1, i]: timestamps = np.array(doric_file["Traces"]["Console"]["Time(s)"]["Console_time(s)"]) sampling_rate = 
np.array([1 / (timestamps[-1] - timestamps[-2])]) data = np.array(doric_file["Traces"]["Console"][storesList[0, i]][storesList[0, i]]) - write_hdf5(sampling_rate, storesList[0, i], outputPath, "sampling_rate") - write_hdf5(timestamps, storesList[0, i], outputPath, "timestamps") - write_hdf5(data, storesList[0, i], outputPath, "data") + storename = storesList[0, i] + S = {"storename": storename, "sampling_rate": sampling_rate, "timestamps": timestamps, "data": data} + output_dicts.append(S) else: timestamps = np.array(doric_file["Traces"]["Console"]["Time(s)"]["Console_time(s)"]) ttl = np.array(doric_file["Traces"]["Console"][storesList[0, i]][storesList[0, i]]) indices = np.where(ttl <= 0)[0] diff_indices = np.where(np.diff(indices) > 1)[0] - write_hdf5(timestamps[indices[diff_indices] + 1], storesList[0, i], outputPath, "timestamps") + timestamps = timestamps[indices[diff_indices] + 1] + storename = storesList[0, i] + S = {"storename": storename, "timestamps": timestamps} + output_dicts.append(S) + + return output_dicts + + def save_dict_to_hdf5(self, S, outputPath): + event = S["storename"] + write_hdf5(S["timestamps"], event, outputPath, "timestamps") + + if "sampling_rate" in S: + write_hdf5(S["sampling_rate"], event, outputPath, "sampling_rate") + if "data" in S: + write_hdf5(S["data"], event, outputPath, "data") + + def read(self, storesList): + flag = self.check_doric() + if flag == "doric_csv": + output_dicts = self.read_doric_csv(storesList) + elif flag == "doric_doric": + output_dicts = self.read_doric_doric(storesList) + else: + logger.error("Doric file not found or not recognized.") + raise FileNotFoundError("Doric file not found or not recognized.") + + return output_dicts + + def save(self, output_dicts, outputPath): + for S in output_dicts: + self.save_dict_to_hdf5(S=S, outputPath=outputPath) diff --git a/src/guppy/extractors/npm_recording_extractor.py b/src/guppy/extractors/npm_recording_extractor.py index a8cfd98..bc9b210 100644 --- a/src/guppy/extractors/npm_recording_extractor.py +++ b/src/guppy/extractors/npm_recording_extractor.py @@ -24,13 +24,13 @@ def execute_import_npm(folder_path, num_ch, inputParameters, events, outputPath, extractor = NpmRecordingExtractor(folder_path=folder_path, num_ch=num_ch, inputParameters=inputParameters) start = time.time() with mp.Pool(numProcesses) as p: - p.starmap(read_npm_and_save_hdf5, zip(repeat(extractor), events, repeat(outputPath))) + p.starmap(read_and_save_npm, zip(repeat(extractor), events, repeat(outputPath))) logger.info("Time taken = {0:.5f}".format(time.time() - start)) -def read_npm_and_save_hdf5(extractor, event, outputPath): - df = extractor.read_npm(event=event) - extractor.save_to_hdf5(df=df, event=event, outputPath=outputPath) +def read_and_save_npm(extractor, event, outputPath): + output_dicts = extractor.read(events=[event], outputPath=outputPath) + extractor.save(output_dicts=output_dicts, outputPath=outputPath) logger.info("Data for event {} fetched and stored.".format(event)) @@ -488,3 +488,18 @@ def save_to_hdf5(self, df, event, outputPath): write_hdf5(df[key[i]].dropna(), event, outputPath, key[i].lower()) logger.info("\033[1m" + "Reading data for {} from csv file is completed.".format(event) + "\033[0m") + + def read(self, events, outputPath): + output_dicts = [] + for event in events: + df = self.read_npm(event=event) + S = df.to_dict() + S["storename"] = event + output_dicts.append(S) + return output_dicts + + def save(self, output_dicts, outputPath): + for S in output_dicts: + event = S.pop("storename") + 
df = pd.DataFrame.from_dict(S) + self.save_to_hdf5(df=df, event=event, outputPath=outputPath) From a633550144b26b2ed6cc1a4d86696f1296a6e9f1 Mon Sep 17 00:00:00 2001 From: Paul Adkisson-Floro Date: Wed, 3 Dec 2025 13:18:21 -0500 Subject: [PATCH 043/125] Remove tkinter from NPM (#189) Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- .../extractors/npm_recording_extractor.py | 293 ++++++++++-------- src/guppy/saveStoresList.py | 118 +++++++ src/guppy/testing/api.py | 78 +++-- tests/test_step2.py | 19 +- tests/test_step3.py | 22 +- tests/test_step4.py | 26 +- tests/test_step5.py | 30 +- 7 files changed, 387 insertions(+), 199 deletions(-) diff --git a/src/guppy/extractors/npm_recording_extractor.py b/src/guppy/extractors/npm_recording_extractor.py index bc9b210..ae4f540 100644 --- a/src/guppy/extractors/npm_recording_extractor.py +++ b/src/guppy/extractors/npm_recording_extractor.py @@ -3,9 +3,7 @@ import multiprocessing as mp import os import time -import tkinter as tk from itertools import repeat -from tkinter import StringVar, messagebox, ttk import numpy as np import pandas as pd @@ -36,7 +34,7 @@ def read_and_save_npm(extractor, event, outputPath): class NpmRecordingExtractor: - def __init__(self, folder_path, num_ch, inputParameters=None): + def __init__(self, folder_path, num_ch, inputParameters=None): # TODO: make inputParameters mandatory self.folder_path = folder_path self.num_ch = num_ch self.inputParameters = inputParameters @@ -44,18 +42,70 @@ def __init__(self, folder_path, num_ch, inputParameters=None): folder_path=folder_path, num_ch=num_ch, inputParameters=inputParameters ) + @classmethod + def has_multiple_event_ttls(cls, folder_path): + path = sorted(glob.glob(os.path.join(folder_path, "*.csv"))) + path_chev = glob.glob(os.path.join(folder_path, "*chev*")) + path_chod = glob.glob(os.path.join(folder_path, "*chod*")) + path_chpr = glob.glob(os.path.join(folder_path, "*chpr*")) + path_event = glob.glob(os.path.join(folder_path, "event*")) + path_chev_chod_event = path_chev + path_chod + path_event + path_chpr + + path = sorted(list(set(path) - set(path_chev_chod_event))) + multiple_event_ttls = [] + for i in range(len(path)): + df = pd.read_csv(path[i], index_col=False) + _, value = cls.check_header(df) + + # check dataframe structure and read data accordingly + if len(value) > 0: + columns_isstr = False + df = pd.read_csv(path[i], header=None) + cols = np.array(list(df.columns), dtype=str) + else: + columns_isstr = True + cols = np.array(list(df.columns), dtype=str) + if len(cols) == 2: + flag = "event_or_data_np" + elif len(cols) > 2: + flag = "data_np" + else: + logger.error("Number of columns in csv file does not make sense.") + raise Exception("Number of columns in csv file does not make sense.") + + # used assigned flags to process the files and read the data + if flag == "event_or_data_np": + arr = list(df.iloc[:, 1]) + check_float = [True for i in arr if isinstance(i, float)] + if len(arr) == len(check_float) and columns_isstr == False: + flag = "data_np" + elif columns_isstr == True and ("value" in np.char.lower(np.array(cols))): + flag = "event_np" + else: + flag = "event_np" + + if flag == "event_np": + type_val = np.array(df.iloc[:, 1]) + type_val_unique = np.unique(type_val) + if len(type_val_unique) > 1: + multiple_event_ttls.append(True) + else: + multiple_event_ttls.append(False) + else: + multiple_event_ttls.append(False) + + return multiple_event_ttls + def import_npm(self, folder_path, num_ch, inputParameters=None): logger.debug("If 
it exists, importing NPM file based on the structure of file") # Headless configuration (used to avoid any UI prompts when running tests) headless = bool(os.environ.get("GUPPY_BASE_DIR")) - npm_timestamp_column_name = None - npm_time_unit = None - npm_split_events = None if isinstance(inputParameters, dict): - npm_timestamp_column_name = inputParameters.get("npm_timestamp_column_name") - npm_time_unit = inputParameters.get("npm_time_unit", "seconds") - npm_split_events = inputParameters.get("npm_split_events", True) + npm_timestamp_column_names = inputParameters.get("npm_timestamp_column_names") + npm_time_units = inputParameters.get("npm_time_units") + # TODO: come up with a better name for npm_split_events that can be appropriately pluralized for a list + npm_split_events = inputParameters.get("npm_split_events") path = sorted(glob.glob(os.path.join(folder_path, "*.csv"))) + sorted( glob.glob(os.path.join(folder_path, "*.doric")) ) @@ -71,6 +121,20 @@ def import_npm(self, folder_path, num_ch, inputParameters=None): event_from_filename = [] flag_arr = [] for i in range(len(path)): + # TODO: validate npm_timestamp_column_names, npm_time_units, npm_split_events lengths + if npm_timestamp_column_names is None: + npm_timestamp_column_name = None + else: + npm_timestamp_column_name = npm_timestamp_column_names[i] + if npm_time_units is None: + npm_time_unit = "seconds" + else: + npm_time_unit = npm_time_units[i] + if npm_split_events is None: + split_events = False + else: + split_events = npm_split_events[i] + dirname = os.path.dirname(path[i]) ext = os.path.basename(path[i]).split(".")[-1] assert ext != "doric", "Doric files are not supported by import_npm function." @@ -103,7 +167,7 @@ def import_npm(self, folder_path, num_ch, inputParameters=None): assert len(cols) != 3, "File appears to be data .csv. This function only supports NPM .csv files." if len(cols) == 2: flag = "event_or_data_np" - elif len(cols) >= 2: + elif len(cols) > 2: flag = "data_np" else: logger.error("Number of columns in csv file does not make sense.") @@ -150,23 +214,7 @@ def import_npm(self, folder_path, num_ch, inputParameters=None): elif flag == "event_np": type_val = np.array(df.iloc[:, 1]) type_val_unique = np.unique(type_val) - if headless: - response = 1 if bool(npm_split_events) else 0 - else: - window = tk.Tk() - if len(type_val_unique) > 1: - response = messagebox.askyesno( - "Multiple event TTLs", - "Based on the TTL file,\ - it looks like TTLs \ - belongs to multiple behavior type. 
\ - Do you want to create multiple files for each \ - behavior type ?", - ) - else: - response = 0 - window.destroy() - if response == 1: + if split_events: timestamps = np.array(df.iloc[:, 0]) for j in range(len(type_val_unique)): idx = np.where(type_val == type_val_unique[j]) @@ -184,9 +232,8 @@ def import_npm(self, folder_path, num_ch, inputParameters=None): event_from_filename.append("event" + str(0)) else: file = f"file{str(i)}_" - df, ts_unit = self.decide_ts_unit_for_npm( - df, timestamp_column_name=npm_timestamp_column_name, time_unit=npm_time_unit, headless=headless - ) + ts_unit = npm_time_unit + df = self.update_df_with_timestamp_columns(df, timestamp_column_name=npm_timestamp_column_name) df, indices_dict, _ = self.decide_indices(file, df, flag) keys = list(indices_dict.keys()) for k in range(len(keys)): @@ -270,7 +317,8 @@ def import_npm(self, folder_path, num_ch, inputParameters=None): logger.info("Importing of NPM file is done.") return event_from_filename, flag_arr - def check_header(self, df): + @classmethod + def check_header(cls, df): arr = list(df.columns) check_float = [] for i in arr: @@ -283,7 +331,8 @@ def check_header(self, df): # function to decide indices of interleaved channels # in neurophotometrics data - def decide_indices(self, file, df, flag, num_ch=2): + @classmethod + def decide_indices(cls, file, df, flag, num_ch=2): ch_name = [file + "chev", file + "chod", file + "chpr"] if len(ch_name) < num_ch: logger.error( @@ -319,7 +368,7 @@ def decide_indices(self, file, df, flag, num_ch=2): data but column names does not have Flags or LedState" ) - num_ch, ch = self.check_channels(state) + num_ch, ch = cls.check_channels(state) indices_dict = dict() for i in range(num_ch): first_occurrence = np.where(state == ch[i])[0] @@ -330,7 +379,8 @@ def decide_indices(self, file, df, flag, num_ch=2): return df, indices_dict, num_ch # check flag consistency in neurophotometrics data - def check_channels(self, state): + @classmethod + def check_channels(cls, state): state = state.astype(int) unique_state = np.unique(state[2:12]) if unique_state.shape[0] > 3: @@ -345,105 +395,94 @@ def check_channels(self, state): return unique_state.shape[0], unique_state - # function to decide NPM timestamps unit (seconds, ms or us) - def decide_ts_unit_for_npm(self, df, timestamp_column_name=None, time_unit=None, headless=False): - col_names = np.array(list(df.columns)) + @classmethod + def needs_ts_unit(cls, folder_path, num_ch): + path = sorted(glob.glob(os.path.join(folder_path, "*.csv"))) + sorted( + glob.glob(os.path.join(folder_path, "*.doric")) + ) + path_chev = glob.glob(os.path.join(folder_path, "*chev*")) + path_chod = glob.glob(os.path.join(folder_path, "*chod*")) + path_chpr = glob.glob(os.path.join(folder_path, "*chpr*")) + path_event = glob.glob(os.path.join(folder_path, "event*")) + # path_sig = glob.glob(os.path.join(filepath, 'sig*')) # TODO: what is this for? 
+ path_chev_chod_event = path_chev + path_chod + path_event + path_chpr + + path = sorted(list(set(path) - set(path_chev_chod_event))) + ts_unit_needs = [] col_names_ts = [""] - for name in col_names: - if "timestamp" in name.lower(): - col_names_ts.append(name) + for i in range(len(path)): + df = pd.read_csv(path[i], index_col=False) + _, value = cls.check_header(df) - ts_unit = "seconds" - if len(col_names_ts) > 2: - # Headless path: auto-select column/unit without any UI - if headless: - if timestamp_column_name is not None: - assert ( - timestamp_column_name in col_names_ts - ), f"Provided timestamp_column_name '{timestamp_column_name}' not found in columns {col_names_ts[1:]}" - chosen = timestamp_column_name - else: - chosen = col_names_ts[1] - df.insert(1, "Timestamp", df[chosen]) - df = df.drop(col_names_ts[1:], axis=1) - valid_units = {"seconds", "milliseconds", "microseconds"} - ts_unit = time_unit if (isinstance(time_unit, str) and time_unit in valid_units) else "seconds" - return df, ts_unit - # def comboBoxSelected(event): - # logger.info(event.widget.get()) - - window = tk.Tk() - window.title("Select appropriate options for timestamps") - window.geometry("500x200") - holdComboboxValues = dict() - - timestamps_label = ttk.Label(window, text="Select which timestamps to use : ").grid( - row=0, column=1, pady=25, padx=25 - ) - holdComboboxValues["timestamps"] = StringVar() - timestamps_combo = ttk.Combobox(window, values=col_names_ts, textvariable=holdComboboxValues["timestamps"]) - timestamps_combo.grid(row=0, column=2, pady=25, padx=25) - timestamps_combo.current(0) - # timestamps_combo.bind("<>", comboBoxSelected) - - time_unit_label = ttk.Label(window, text="Select timestamps unit : ").grid( - row=1, column=1, pady=25, padx=25 - ) - holdComboboxValues["time_unit"] = StringVar() - time_unit_combo = ttk.Combobox( - window, - values=["", "seconds", "milliseconds", "microseconds"], - textvariable=holdComboboxValues["time_unit"], - ) - time_unit_combo.grid(row=1, column=2, pady=25, padx=25) - time_unit_combo.current(0) - # time_unit_combo.bind("<>", comboBoxSelected) - window.lift() - window.after(500, lambda: window.lift()) - window.mainloop() - - if holdComboboxValues["timestamps"].get(): - df.insert(1, "Timestamp", df[holdComboboxValues["timestamps"].get()]) - df = df.drop(col_names_ts[1:], axis=1) + # check dataframe structure and read data accordingly + if len(value) > 0: + df = pd.read_csv(path[i], header=None) + cols = np.array(list(df.columns), dtype=str) + columns_isstr = False else: - messagebox.showerror( - "All options not selected", - "All the options for timestamps \ - were not selected. Please select appropriate options", - ) - logger.error( - "All the options for timestamps \ - were not selected. Please select appropriate options" - ) - raise Exception( - "All the options for timestamps \ - were not selected. 
Please select appropriate options" - ) - if holdComboboxValues["time_unit"].get(): - if holdComboboxValues["time_unit"].get() == "seconds": - ts_unit = holdComboboxValues["time_unit"].get() - elif holdComboboxValues["time_unit"].get() == "milliseconds": - ts_unit = holdComboboxValues["time_unit"].get() + columns_isstr = True + cols = np.array(list(df.columns), dtype=str) + # check the structure of dataframe and assign flag to the type of file + if len(cols) == 2: + flag = "event_or_data_np" + elif len(cols) > 2: + flag = "data_np" + else: + logger.error("Number of columns in csv file does not make sense.") + raise Exception("Number of columns in csv file does not make sense.") + + if columns_isstr == True and ( + "flags" in np.char.lower(np.array(cols)) or "ledstate" in np.char.lower(np.array(cols)) + ): + flag = flag + "_v2" + + # used assigned flags to process the files and read the data + if flag == "event_or_data_np": + arr = list(df.iloc[:, 1]) + check_float = [True for i in arr if isinstance(i, float)] + if len(arr) == len(check_float) and columns_isstr == False: + flag = "data_np" + elif columns_isstr == True and ("value" in np.char.lower(np.array(cols))): + flag = "event_np" else: - ts_unit = holdComboboxValues["time_unit"].get() + flag = "event_np" + + if flag == "data_np": + file = f"file{str(i)}_" + df, _, _ = cls.decide_indices(file, df, flag, num_ch) + + if flag == "event_np" or flag == "data_np": + ts_unit_needs.append(False) + continue + + col_names = np.array(list(df.columns)) + for name in col_names: + if "timestamp" in name.lower(): + col_names_ts.append(name) + + if len(col_names_ts) > 2: + ts_unit_needs.append(True) else: - messagebox.showerror( - "All options not selected", - "All the options for timestamps \ - were not selected. Please select appropriate options", - ) - logger.error( - "All the options for timestamps \ - were not selected. Please select appropriate options" - ) - raise Exception( - "All the options for timestamps \ - were not selected. 
Please select appropriate options" - ) - else: - pass + ts_unit_needs.append(False) - return df, ts_unit + return ts_unit_needs, col_names_ts + + def update_df_with_timestamp_columns(self, df, timestamp_column_name): + col_names = np.array(list(df.columns)) + col_names_ts = [""] + for name in col_names: + if "timestamp" in name.lower(): + col_names_ts.append(name) + if len(col_names_ts) <= 2: + return df + + timestamp_column_name = timestamp_column_name if timestamp_column_name is not None else col_names_ts[1] + assert ( + timestamp_column_name in col_names_ts + ), f"Provided timestamp_column_name '{timestamp_column_name}' not found in columns {col_names_ts[1:]}" + df.insert(1, "Timestamp", df[timestamp_column_name]) + df = df.drop(col_names_ts[1:], axis=1) + return df def read_npm(self, event): logger.debug("\033[1m" + "Trying to read data for {} from csv file.".format(event) + "\033[0m") diff --git a/src/guppy/saveStoresList.py b/src/guppy/saveStoresList.py index daf7457..552d76c 100755 --- a/src/guppy/saveStoresList.py +++ b/src/guppy/saveStoresList.py @@ -9,8 +9,10 @@ import logging import os import socket +import tkinter as tk from pathlib import Path from random import randint +from tkinter import StringVar, messagebox, ttk import holoviews as hv import numpy as np @@ -602,6 +604,23 @@ def execute(inputParameters): flag = extractor.flags elif modality == "npm": + headless = bool(os.environ.get("GUPPY_BASE_DIR")) + if not headless: + # Resolve multiple event TTLs + multiple_event_ttls = NpmRecordingExtractor.has_multiple_event_ttls(folder_path=filepath) + responses = get_multi_event_responses(multiple_event_ttls) + inputParameters["npm_split_events"] = responses + + # Resolve timestamp units and columns + ts_unit_needs, col_names_ts = NpmRecordingExtractor.needs_ts_unit( + folder_path=filepath, num_ch=num_ch + ) + ts_units, npm_timestamp_column_names = get_timestamp_configuration(ts_unit_needs, col_names_ts) + inputParameters["npm_time_units"] = ts_units if ts_units else None + inputParameters["npm_timestamp_column_names"] = ( + npm_timestamp_column_names if npm_timestamp_column_names else None + ) + data = 0 extractor = NpmRecordingExtractor(folder_path=filepath, num_ch=num_ch, inputParameters=inputParameters) event_name = extractor.events @@ -614,3 +633,102 @@ def execute(inputParameters): except Exception as e: logger.error(str(e)) raise e + + +def get_multi_event_responses(multiple_event_ttls): + responses = [] + for has_multiple in multiple_event_ttls: + if not has_multiple: + responses.append(False) + continue + window = tk.Tk() + response = messagebox.askyesno( + "Multiple event TTLs", + ( + "Based on the TTL file, " + "it looks like TTLs " + "belong to multiple behavior types. " + "Do you want to create multiple files for each " + "behavior type?" 
+ ), + ) + window.destroy() + responses.append(response) + return responses + + +def get_timestamp_configuration(ts_unit_needs, col_names_ts): + ts_units, npm_timestamp_column_names = [], [] + for need in ts_unit_needs: + if not need: + ts_units.append("seconds") + npm_timestamp_column_names.append(None) + continue + window = tk.Tk() + window.title("Select appropriate options for timestamps") + window.geometry("500x200") + holdComboboxValues = dict() + + timestamps_label = ttk.Label(window, text="Select which timestamps to use : ").grid( + row=0, column=1, pady=25, padx=25 + ) + holdComboboxValues["timestamps"] = StringVar() + timestamps_combo = ttk.Combobox(window, values=col_names_ts, textvariable=holdComboboxValues["timestamps"]) + timestamps_combo.grid(row=0, column=2, pady=25, padx=25) + timestamps_combo.current(0) + # timestamps_combo.bind("<>", comboBoxSelected) + + time_unit_label = ttk.Label(window, text="Select timestamps unit : ").grid(row=1, column=1, pady=25, padx=25) + holdComboboxValues["time_unit"] = StringVar() + time_unit_combo = ttk.Combobox( + window, + values=["", "seconds", "milliseconds", "microseconds"], + textvariable=holdComboboxValues["time_unit"], + ) + time_unit_combo.grid(row=1, column=2, pady=25, padx=25) + time_unit_combo.current(0) + # time_unit_combo.bind("<>", comboBoxSelected) + window.lift() + window.after(500, lambda: window.lift()) + window.mainloop() + + if holdComboboxValues["timestamps"].get(): + npm_timestamp_column_name = holdComboboxValues["timestamps"].get() + else: + messagebox.showerror( + "All options not selected", + "All the options for timestamps \ + were not selected. Please select appropriate options", + ) + logger.error( + "All the options for timestamps \ + were not selected. Please select appropriate options" + ) + raise Exception( + "All the options for timestamps \ + were not selected. Please select appropriate options" + ) + if holdComboboxValues["time_unit"].get(): + if holdComboboxValues["time_unit"].get() == "seconds": + ts_unit = holdComboboxValues["time_unit"].get() + elif holdComboboxValues["time_unit"].get() == "milliseconds": + ts_unit = holdComboboxValues["time_unit"].get() + else: + ts_unit = holdComboboxValues["time_unit"].get() + else: + messagebox.showerror( + "All options not selected", + "All the options for timestamps \ + were not selected. Please select appropriate options", + ) + logger.error( + "All the options for timestamps \ + were not selected. Please select appropriate options" + ) + raise Exception( + "All the options for timestamps \ + were not selected. Please select appropriate options" + ) + ts_units.append(ts_unit) + npm_timestamp_column_names.append(npm_timestamp_column_name) + return ts_units, npm_timestamp_column_names diff --git a/src/guppy/testing/api.py b/src/guppy/testing/api.py index d7e390d..c647907 100644 --- a/src/guppy/testing/api.py +++ b/src/guppy/testing/api.py @@ -69,9 +69,9 @@ def step2( selected_folders: Iterable[str], storenames_map: dict[str, str], modality: str = "tdt", - npm_timestamp_column_name: str | None = None, - npm_time_unit: str = "seconds", - npm_split_events: bool = True, + npm_timestamp_column_names: list[str | None] | None = None, + npm_time_units: list[str] | None = None, + npm_split_events: list[bool] | None = None, ) -> None: """ Run pipeline Step 2 (Save Storenames) via the actual Panel-backed logic. @@ -94,6 +94,14 @@ def step2( storenames_map : dict[str, str] Mapping from raw storenames (e.g., "Dv1A") to semantic names (e.g., "control_DMS"). 
Insertion order is preserved. + modality : str + Data acquisition modality (e.g., 'tdt', 'csv', 'doric', 'npm'). + npm_timestamp_column_names : list[str | None] | None + List of timestamp column names for NPM files, one per CSV file. None if not applicable. + npm_time_units : list[str] | None + List of time units for NPM files, one per CSV file (e.g., 'seconds', 'milliseconds'). None if not applicable. + npm_split_events : list[bool] | None + List of booleans indicating whether to split events for NPM files, one per CSV file. None if not applicable. Raises ------ @@ -155,8 +163,8 @@ def step2( input_params["modality"] = modality # Add npm parameters - input_params["npm_timestamp_column_name"] = npm_timestamp_column_name - input_params["npm_time_unit"] = npm_time_unit + input_params["npm_timestamp_column_names"] = npm_timestamp_column_names + input_params["npm_time_units"] = npm_time_units input_params["npm_split_events"] = npm_split_events # Call the underlying Step 2 executor (now headless-aware) @@ -168,9 +176,9 @@ def step3( base_dir: str, selected_folders: Iterable[str], modality: str = "tdt", - npm_timestamp_column_name: str | None = None, - npm_time_unit: str = "seconds", - npm_split_events: bool = True, + npm_timestamp_column_names: list[str | None] | None = None, + npm_time_units: list[str] | None = None, + npm_split_events: list[bool] | None = None, ) -> None: """ Run pipeline Step 3 (Read Raw Data) via the actual Panel-backed logic, headlessly. @@ -188,6 +196,14 @@ def step3( must reside directly under this path. selected_folders : Iterable[str] Absolute paths to the session directories to process. + modality : str + Data acquisition modality (e.g., 'tdt', 'csv', 'doric', 'npm'). + npm_timestamp_column_names : list[str | None] | None + List of timestamp column names for NPM files, one per CSV file. None if not applicable. + npm_time_units : list[str] | None + List of time units for NPM files, one per CSV file (e.g., 'seconds', 'milliseconds'). None if not applicable. + npm_split_events : list[bool] | None + List of booleans indicating whether to split events for NPM files, one per CSV file. None if not applicable. Raises ------ @@ -232,9 +248,9 @@ def step3( template._widgets["files_1"].value = abs_sessions input_params = template._hooks["getInputParameters"]() - # Inject explicit NPM parameters (match Step 2 style) - input_params["npm_timestamp_column_name"] = npm_timestamp_column_name - input_params["npm_time_unit"] = npm_time_unit + # Inject explicit NPM parameters + input_params["npm_timestamp_column_names"] = npm_timestamp_column_names + input_params["npm_time_units"] = npm_time_units input_params["npm_split_events"] = npm_split_events # Inject modality @@ -249,9 +265,9 @@ def step4( base_dir: str, selected_folders: Iterable[str], modality: str = "tdt", - npm_timestamp_column_name: str | None = None, - npm_time_unit: str = "seconds", - npm_split_events: bool = True, + npm_timestamp_column_names: list[str | None] | None = None, + npm_time_units: list[str] | None = None, + npm_split_events: list[bool] | None = None, ) -> None: """ Run pipeline Step 4 (Extract timestamps and signal) via the Panel-backed logic, headlessly. @@ -269,6 +285,14 @@ def step4( must reside directly under this path. selected_folders : Iterable[str] Absolute paths to the session directories to process. + modality : str + Data acquisition modality (e.g., 'tdt', 'csv', 'doric', 'npm'). + npm_timestamp_column_names : list[str | None] | None + List of timestamp column names for NPM files, one per CSV file. 
None if not applicable. + npm_time_units : list[str] | None + List of time units for NPM files, one per CSV file (e.g., 'seconds', 'milliseconds'). None if not applicable. + npm_split_events : list[bool] | None + List of booleans indicating whether to split events for NPM files, one per CSV file. None if not applicable. Raises ------ @@ -313,9 +337,9 @@ def step4( template._widgets["files_1"].value = abs_sessions input_params = template._hooks["getInputParameters"]() - # Inject explicit NPM parameters (match Step 2 style) - input_params["npm_timestamp_column_name"] = npm_timestamp_column_name - input_params["npm_time_unit"] = npm_time_unit + # Inject explicit NPM parameters + input_params["npm_timestamp_column_names"] = npm_timestamp_column_names + input_params["npm_time_units"] = npm_time_units input_params["npm_split_events"] = npm_split_events # Inject modality @@ -330,9 +354,9 @@ def step5( base_dir: str, selected_folders: Iterable[str], modality: str = "tdt", - npm_timestamp_column_name: str | None = None, - npm_time_unit: str = "seconds", - npm_split_events: bool = True, + npm_timestamp_column_names: list[str | None] | None = None, + npm_time_units: list[str] | None = None, + npm_split_events: list[bool] | None = None, ) -> None: """ Run pipeline Step 5 (PSTH Computation) via the Panel-backed logic, headlessly. @@ -350,6 +374,14 @@ def step5( must reside directly under this path. selected_folders : Iterable[str] Absolute paths to the session directories to process. + modality : str + Data acquisition modality (e.g., 'tdt', 'csv', 'doric', 'npm'). + npm_timestamp_column_names : list[str | None] | None + List of timestamp column names for NPM files, one per CSV file. None if not applicable. + npm_time_units : list[str] | None + List of time units for NPM files, one per CSV file (e.g., 'seconds', 'milliseconds'). None if not applicable. + npm_split_events : list[bool] | None + List of booleans indicating whether to split events for NPM files, one per CSV file. None if not applicable. 
Raises ------ @@ -394,9 +426,9 @@ def step5( template._widgets["files_1"].value = abs_sessions input_params = template._hooks["getInputParameters"]() - # Inject explicit NPM parameters (match Step 2 style) - input_params["npm_timestamp_column_name"] = npm_timestamp_column_name - input_params["npm_time_unit"] = npm_time_unit + # Inject explicit NPM parameters + input_params["npm_timestamp_column_names"] = npm_timestamp_column_names + input_params["npm_time_units"] = npm_time_units input_params["npm_split_events"] = npm_split_events # Inject modality diff --git a/tests/test_step2.py b/tests/test_step2.py index f7e34d1..6ab85eb 100644 --- a/tests/test_step2.py +++ b/tests/test_step2.py @@ -154,16 +154,15 @@ def test_step2(tmp_path, session_subdir, storenames_map, modality): - Asserts storesList.csv exists and exactly matches the provided mapping (2xN) """ if session_subdir == "SampleData_Neurophotometrics/sampleData_NPM_3": - npm_timestamp_column_name = "ComputerTimestamp" - npm_time_unit = "milliseconds" + npm_timestamp_column_names = ["ComputerTimestamp", None] + npm_time_units = ["milliseconds", "seconds"] + npm_split_events = [False, True] else: - npm_timestamp_column_name = None - npm_time_unit = None + npm_timestamp_column_names = None + npm_time_units = None + npm_split_events = [True, True] if session_subdir == "SampleData_Neurophotometrics/sampleData_NPM_5": - npm_split_events = False - else: - npm_split_events = True - + npm_split_events = None # Source sample data src_base_dir = str(Path(".") / "testing_data") src_session = os.path.join(src_base_dir, session_subdir) @@ -193,8 +192,8 @@ def test_step2(tmp_path, session_subdir, storenames_map, modality): selected_folders=[str(session_copy)], storenames_map=storenames_map, modality=modality, - npm_timestamp_column_name=npm_timestamp_column_name, - npm_time_unit=npm_time_unit, + npm_timestamp_column_names=npm_timestamp_column_names, + npm_time_units=npm_time_units, npm_split_events=npm_split_events, ) diff --git a/tests/test_step3.py b/tests/test_step3.py index 26dac14..e4b5150 100644 --- a/tests/test_step3.py +++ b/tests/test_step3.py @@ -167,15 +167,15 @@ def test_step3(tmp_path, storenames_map, session_subdir, modality): the temp copy (never touching the original sample path). 
""" if session_subdir == "SampleData_Neurophotometrics/sampleData_NPM_3": - npm_timestamp_column_name = "ComputerTimestamp" - npm_time_unit = "milliseconds" + npm_timestamp_column_names = ["ComputerTimestamp", None] + npm_time_units = ["milliseconds", "seconds"] + npm_split_events = [False, True] else: - npm_timestamp_column_name = None - npm_time_unit = None + npm_timestamp_column_names = None + npm_time_units = None + npm_split_events = [True, True] if session_subdir == "SampleData_Neurophotometrics/sampleData_NPM_5": - npm_split_events = False - else: - npm_split_events = True + npm_split_events = None src_base_dir = str(Path(".") / "testing_data") src_session = os.path.join(src_base_dir, session_subdir) @@ -205,8 +205,8 @@ def test_step3(tmp_path, storenames_map, session_subdir, modality): selected_folders=[str(session_copy)], storenames_map=storenames_map, modality=modality, - npm_timestamp_column_name=npm_timestamp_column_name, - npm_time_unit=npm_time_unit, + npm_timestamp_column_names=npm_timestamp_column_names, + npm_time_units=npm_time_units, npm_split_events=npm_split_events, ) @@ -215,8 +215,8 @@ def test_step3(tmp_path, storenames_map, session_subdir, modality): base_dir=str(tmp_base), selected_folders=[str(session_copy)], modality=modality, - npm_timestamp_column_name=npm_timestamp_column_name, - npm_time_unit=npm_time_unit, + npm_timestamp_column_names=npm_timestamp_column_names, + npm_time_units=npm_time_units, npm_split_events=npm_split_events, ) diff --git a/tests/test_step4.py b/tests/test_step4.py index df18f75..8e5f989 100644 --- a/tests/test_step4.py +++ b/tests/test_step4.py @@ -185,15 +185,15 @@ def test_step4(tmp_path, monkeypatch, session_subdir, storenames_map, expected_r - Assertions confirm creation of key HDF5 outputs expected from Step 4. 
""" if session_subdir == "SampleData_Neurophotometrics/sampleData_NPM_3": - npm_timestamp_column_name = "ComputerTimestamp" - npm_time_unit = "milliseconds" + npm_timestamp_column_names = ["ComputerTimestamp", None] + npm_time_units = ["milliseconds", "seconds"] + npm_split_events = [False, True] else: - npm_timestamp_column_name = None - npm_time_unit = None + npm_timestamp_column_names = None + npm_time_units = None + npm_split_events = [True, True] if session_subdir == "SampleData_Neurophotometrics/sampleData_NPM_5": - npm_split_events = False - else: - npm_split_events = True + npm_split_events = None # Use the CSV sample session src_base_dir = str(Path(".") / "testing_data") @@ -227,8 +227,8 @@ def test_step4(tmp_path, monkeypatch, session_subdir, storenames_map, expected_r selected_folders=[str(session_copy)], storenames_map=storenames_map, modality=modality, - npm_timestamp_column_name=npm_timestamp_column_name, - npm_time_unit=npm_time_unit, + npm_timestamp_column_names=npm_timestamp_column_names, + npm_time_units=npm_time_units, npm_split_events=npm_split_events, ) @@ -237,8 +237,8 @@ def test_step4(tmp_path, monkeypatch, session_subdir, storenames_map, expected_r base_dir=str(tmp_base), selected_folders=[str(session_copy)], modality=modality, - npm_timestamp_column_name=npm_timestamp_column_name, - npm_time_unit=npm_time_unit, + npm_timestamp_column_names=npm_timestamp_column_names, + npm_time_units=npm_time_units, npm_split_events=npm_split_events, ) @@ -247,8 +247,8 @@ def test_step4(tmp_path, monkeypatch, session_subdir, storenames_map, expected_r base_dir=str(tmp_base), selected_folders=[str(session_copy)], modality=modality, - npm_timestamp_column_name=npm_timestamp_column_name, - npm_time_unit=npm_time_unit, + npm_timestamp_column_names=npm_timestamp_column_names, + npm_time_units=npm_time_units, npm_split_events=npm_split_events, ) diff --git a/tests/test_step5.py b/tests/test_step5.py index a8cdeb4..1837ebf 100644 --- a/tests/test_step5.py +++ b/tests/test_step5.py @@ -187,15 +187,15 @@ def test_step5(tmp_path, monkeypatch, session_subdir, storenames_map, expected_r - Defaults are used for input parameters; PSTH computation defaults to z_score. 
""" if session_subdir == "SampleData_Neurophotometrics/sampleData_NPM_3": - npm_timestamp_column_name = "ComputerTimestamp" - npm_time_unit = "milliseconds" + npm_timestamp_column_names = ["ComputerTimestamp", None] + npm_time_units = ["milliseconds", "seconds"] + npm_split_events = [False, True] else: - npm_timestamp_column_name = None - npm_time_unit = None + npm_timestamp_column_names = None + npm_time_units = None + npm_split_events = [True, True] if session_subdir == "SampleData_Neurophotometrics/sampleData_NPM_5": - npm_split_events = False - else: - npm_split_events = True + npm_split_events = None # Use the sample session src_base_dir = str(Path(".") / "testing_data") @@ -229,8 +229,8 @@ def test_step5(tmp_path, monkeypatch, session_subdir, storenames_map, expected_r selected_folders=[str(session_copy)], storenames_map=storenames_map, modality=modality, - npm_timestamp_column_name=npm_timestamp_column_name, - npm_time_unit=npm_time_unit, + npm_timestamp_column_names=npm_timestamp_column_names, + npm_time_units=npm_time_units, npm_split_events=npm_split_events, ) @@ -239,8 +239,8 @@ def test_step5(tmp_path, monkeypatch, session_subdir, storenames_map, expected_r base_dir=str(tmp_base), selected_folders=[str(session_copy)], modality=modality, - npm_timestamp_column_name=npm_timestamp_column_name, - npm_time_unit=npm_time_unit, + npm_timestamp_column_names=npm_timestamp_column_names, + npm_time_units=npm_time_units, npm_split_events=npm_split_events, ) @@ -249,8 +249,8 @@ def test_step5(tmp_path, monkeypatch, session_subdir, storenames_map, expected_r base_dir=str(tmp_base), selected_folders=[str(session_copy)], modality=modality, - npm_timestamp_column_name=npm_timestamp_column_name, - npm_time_unit=npm_time_unit, + npm_timestamp_column_names=npm_timestamp_column_names, + npm_time_units=npm_time_units, npm_split_events=npm_split_events, ) @@ -259,8 +259,8 @@ def test_step5(tmp_path, monkeypatch, session_subdir, storenames_map, expected_r base_dir=str(tmp_base), selected_folders=[str(session_copy)], modality=modality, - npm_timestamp_column_name=npm_timestamp_column_name, - npm_time_unit=npm_time_unit, + npm_timestamp_column_names=npm_timestamp_column_names, + npm_time_units=npm_time_units, npm_split_events=npm_split_events, ) From d55bba7887bfc2c94c05b6c26214dc4350495395 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 3 Dec 2025 10:59:09 -0800 Subject: [PATCH 044/125] Defined BaseRecordingExtractor. 
--- src/guppy/extractors/__init__.py | 1 + .../extractors/base_recording_extractor.py | 128 ++++++++++++++++++ 2 files changed, 129 insertions(+) create mode 100644 src/guppy/extractors/base_recording_extractor.py diff --git a/src/guppy/extractors/__init__.py b/src/guppy/extractors/__init__.py index b876012..75933c7 100644 --- a/src/guppy/extractors/__init__.py +++ b/src/guppy/extractors/__init__.py @@ -1,3 +1,4 @@ +from .base_recording_extractor import BaseRecordingExtractor from .tdt_recording_extractor import TdtRecordingExtractor, execute_readtev from .csv_recording_extractor import CsvRecordingExtractor, execute_import_csv from .doric_recording_extractor import DoricRecordingExtractor, execute_import_doric diff --git a/src/guppy/extractors/base_recording_extractor.py b/src/guppy/extractors/base_recording_extractor.py new file mode 100644 index 0000000..7058a0a --- /dev/null +++ b/src/guppy/extractors/base_recording_extractor.py @@ -0,0 +1,128 @@ +"""Base class for recording extractors.""" + +import os +from abc import ABC, abstractmethod +from typing import Any + +import h5py +import numpy as np + + +class BaseRecordingExtractor(ABC): + """ + Abstract base class for recording extractors. + + Defines the interface contract for reading and saving fiber photometry + data from various acquisition formats (TDT, Doric, CSV, NPM, etc.). + """ + + @property + @abstractmethod + def events(self) -> list[str]: + """ + List of available event/store names in the data. + + Returns + ------- + list of str + Names of all events or stores available in the dataset. + """ + pass + + @property + @abstractmethod + def flags(self) -> list: + """ + Format indicators or file type flags. + + Returns + ------- + list + Flags indicating file types or data formats. + """ + pass + + @abstractmethod + def read(self, *, events: list[str], outputPath: str, **kwargs) -> list[dict[str, Any]]: + """ + Read data from source files for specified events. + + Parameters + ---------- + events : list of str + List of event/store names to extract from the data. + outputPath : str + Path to the output directory. + **kwargs + Additional extractor-specific parameters. + + Returns + ------- + list of dict + List of dictionaries containing extracted data. Each dictionary + represents one event/store and contains keys such as 'storename', + 'timestamps', 'data', 'sampling_rate', etc. + """ + pass + + @abstractmethod + def save(self, *, output_dicts: list[dict[str, Any]], outputPath: str, **kwargs) -> None: + """ + Save extracted data dictionaries to HDF5 format. + + Parameters + ---------- + output_dicts : list of dict + List of data dictionaries from read(). + outputPath : str + Path to the output directory. + **kwargs + Additional extractor-specific parameters. + """ + pass + + @staticmethod + def _write_hdf5(data: Any, storename: str, output_path: str, key: str) -> None: + """ + Write data to HDF5 file. + + Parameters + ---------- + data : array-like + Data to write to the HDF5 file. + storename : str + Name of the store/event. + output_path : str + Directory path where HDF5 file will be written. + key : str + Key name for this data field in the HDF5 file. 
+ """ + # Replace invalid characters in storename to avoid filesystem errors + storename = storename.replace("\\", "_") + storename = storename.replace("/", "_") + + filepath = os.path.join(output_path, storename + ".hdf5") + + # Create new file if it doesn't exist + if not os.path.exists(filepath): + with h5py.File(filepath, "w") as f: + if isinstance(data, np.ndarray): + f.create_dataset(key, data=data, maxshape=(None,), chunks=True) + else: + f.create_dataset(key, data=data) + # Append to existing file + else: + with h5py.File(filepath, "r+") as f: + if key in list(f.keys()): + if isinstance(data, np.ndarray): + f[key].resize(data.shape) + arr = f[key] + arr[:] = data + else: + arr = f[key] + arr[()] = data + else: + if isinstance(data, np.ndarray): + f.create_dataset(key, data=data, maxshape=(None,), chunks=True) + else: + f.create_dataset(key, data=data) From 1689b7ef15c188e62f9f3a38fa63cc7329b08d2c Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 3 Dec 2025 11:00:47 -0800 Subject: [PATCH 045/125] Removed obsolete intermediates extractor steps --- src/guppy/csv_step2.py | 110 ----------- src/guppy/csv_step3.py | 66 ------- src/guppy/doric_step2.py | 92 --------- src/guppy/doric_step3.py | 159 --------------- src/guppy/npm_step2.py | 411 --------------------------------------- src/guppy/tdt_step2.py | 28 --- src/guppy/tdt_step3.py | 207 -------------------- 7 files changed, 1073 deletions(-) delete mode 100644 src/guppy/csv_step2.py delete mode 100644 src/guppy/csv_step3.py delete mode 100644 src/guppy/doric_step2.py delete mode 100644 src/guppy/doric_step3.py delete mode 100644 src/guppy/npm_step2.py delete mode 100644 src/guppy/tdt_step2.py delete mode 100644 src/guppy/tdt_step3.py diff --git a/src/guppy/csv_step2.py b/src/guppy/csv_step2.py deleted file mode 100644 index ba4b34f..0000000 --- a/src/guppy/csv_step2.py +++ /dev/null @@ -1,110 +0,0 @@ -import glob -import logging -import os - -import numpy as np -import pandas as pd - -logger = logging.getLogger(__name__) - - -def check_header(df): - arr = list(df.columns) - check_float = [] - for i in arr: - try: - check_float.append(float(i)) - except: - pass - - return arr, check_float - - -def import_csv_step2(filepath): - logger.debug("If it exists, importing either NPM or Doric or csv file based on the structure of file") - path = sorted(glob.glob(os.path.join(filepath, "*.csv"))) - - path = sorted(list(set(path))) - flag = "None" - event_from_filename = [] - flag_arr = [] - for i in range(len(path)): - ext = os.path.basename(path[i]).split(".")[-1] - assert ext == "csv", "Only .csv files are supported by import_csv function." - df = pd.read_csv(path[i], header=None, nrows=2, index_col=False, dtype=str) - df = df.dropna(axis=1, how="all") - df_arr = np.array(df).flatten() - check_all_str = [] - for element in df_arr: - try: - float(element) - except: - check_all_str.append(i) - assert len(check_all_str) != len( - df_arr - ), "This file appears to be doric .csv. This function only supports standard .csv files." 
- df = pd.read_csv(path[i], index_col=False) - - _, value = check_header(df) - - # check dataframe structure and read data accordingly - if len(value) > 0: - columns_isstr = False - df = pd.read_csv(path[i], header=None) - cols = np.array(list(df.columns), dtype=str) - else: - df = df - columns_isstr = True - cols = np.array(list(df.columns), dtype=str) - # check the structure of dataframe and assign flag to the type of file - if len(cols) == 1: - if cols[0].lower() != "timestamps": - logger.error("\033[1m" + "Column name should be timestamps (all lower-cases)" + "\033[0m") - raise Exception("\033[1m" + "Column name should be timestamps (all lower-cases)" + "\033[0m") - else: - flag = "event_csv" - elif len(cols) == 3: - arr1 = np.array(["timestamps", "data", "sampling_rate"]) - arr2 = np.char.lower(np.array(cols)) - if (np.sort(arr1) == np.sort(arr2)).all() == False: - logger.error( - "\033[1m" - + "Column names should be timestamps, data and sampling_rate (all lower-cases)" - + "\033[0m" - ) - raise Exception( - "\033[1m" - + "Column names should be timestamps, data and sampling_rate (all lower-cases)" - + "\033[0m" - ) - else: - flag = "data_csv" - elif len(cols) == 2: - raise ValueError( - "Data appears to be Neurophotometrics csv. Please use import_npm_csv function to import the data." - ) - elif len(cols) >= 2: - raise ValueError( - "Data appears to be Neurophotometrics csv. Please use import_npm_csv function to import the data." - ) - else: - logger.error("Number of columns in csv file does not make sense.") - raise Exception("Number of columns in csv file does not make sense.") - - if columns_isstr == True and ( - "flags" in np.char.lower(np.array(cols)) or "ledstate" in np.char.lower(np.array(cols)) - ): - flag = flag + "_v2" - else: - flag = flag - - flag_arr.append(flag) - logger.info(flag) - assert ( - flag == "event_csv" or flag == "data_csv" - ), "This function only supports standard event_csv and data_csv files." - name = os.path.basename(path[i]).split(".")[0] - event_from_filename.append(name) - - logger.info("Importing of csv file is done.") - return event_from_filename, flag_arr diff --git a/src/guppy/csv_step3.py b/src/guppy/csv_step3.py deleted file mode 100644 index 985959a..0000000 --- a/src/guppy/csv_step3.py +++ /dev/null @@ -1,66 +0,0 @@ -import logging -import multiprocessing as mp -import os -import time -from itertools import repeat - -import numpy as np -import pandas as pd - -from guppy.common_step3 import write_hdf5 - -logger = logging.getLogger(__name__) - - -def execute_import_csv(filepath, event, outputPath, numProcesses=mp.cpu_count()): - # logger.info("Reading data for event {} ...".format(event)) - - start = time.time() - with mp.Pool(numProcesses) as p: - p.starmap(import_csv, zip(repeat(filepath), event, repeat(outputPath))) - logger.info("Time taken = {0:.5f}".format(time.time() - start)) - - -# function to read event timestamps csv file. 
-def import_csv(filepath, event, outputPath): - logger.debug("\033[1m" + "Trying to read data for {} from csv file.".format(event) + "\033[0m") - if not os.path.exists(os.path.join(filepath, event + ".csv")): - logger.error("\033[1m" + "No csv file found for event {}".format(event) + "\033[0m") - raise Exception("\033[1m" + "No csv file found for event {}".format(event) + "\033[0m") - - df = pd.read_csv(os.path.join(filepath, event + ".csv"), index_col=False) - data = df - key = list(df.columns) - - if len(key) == 3: - arr1 = np.array(["timestamps", "data", "sampling_rate"]) - arr2 = np.char.lower(np.array(key)) - if (np.sort(arr1) == np.sort(arr2)).all() == False: - logger.error("\033[1m" + "Column names should be timestamps, data and sampling_rate" + "\033[0m") - raise Exception("\033[1m" + "Column names should be timestamps, data and sampling_rate" + "\033[0m") - - if len(key) == 1: - if key[0].lower() != "timestamps": - logger.error("\033[1m" + "Column names should be timestamps, data and sampling_rate" + "\033[0m") - raise Exception("\033[1m" + "Column name should be timestamps" + "\033[0m") - - if len(key) != 3 and len(key) != 1: - logger.error( - "\033[1m" - + "Number of columns in csv file should be either three or one. Three columns if \ - the file is for control or signal data or one column if the file is for event TTLs." - + "\033[0m" - ) - raise Exception( - "\033[1m" - + "Number of columns in csv file should be either three or one. Three columns if \ - the file is for control or signal data or one column if the file is for event TTLs." - + "\033[0m" - ) - - for i in range(len(key)): - write_hdf5(data[key[i]].dropna(), event, outputPath, key[i].lower()) - - logger.info("\033[1m" + "Reading data for {} from csv file is completed.".format(event) + "\033[0m") - - return data, key diff --git a/src/guppy/doric_step2.py b/src/guppy/doric_step2.py deleted file mode 100644 index 26ab22e..0000000 --- a/src/guppy/doric_step2.py +++ /dev/null @@ -1,92 +0,0 @@ -import glob -import logging -import os - -import h5py -import numpy as np -import pandas as pd - -logger = logging.getLogger(__name__) - - -def import_doric(filepath): - - logger.debug("If it exists, importing Doric file based on the structure of file") - path = sorted(glob.glob(os.path.join(filepath, "*.csv"))) + sorted(glob.glob(os.path.join(filepath, "*.doric"))) - - path = sorted(list(set(path))) - flag = "None" - event_from_filename = [] - flag_arr = [] - for i in range(len(path)): - ext = os.path.basename(path[i]).split(".")[-1] - if ext == "doric": - key_names = read_doric(path[i]) - event_from_filename.extend(key_names) - flag = "doric_doric" - else: - df = pd.read_csv(path[i], header=None, nrows=2, index_col=False, dtype=str) - df = df.dropna(axis=1, how="all") - df_arr = np.array(df).flatten() - check_all_str = [] - for element in df_arr: - try: - float(element) - except: - check_all_str.append(i) - assert len(check_all_str) == len( - df_arr - ), "This file appears to be standard .csv. This function only supports doric .csv files." 
- df = pd.read_csv(path[i], header=1, index_col=False, nrows=10) - df = df.drop(["Time(s)"], axis=1) - event_from_filename.extend(list(df.columns)) - flag = "doric_csv" - logger.info(flag) - logger.info("Importing of Doric file is done.") - return event_from_filename, flag_arr - - -def read_doric(filepath): - with h5py.File(filepath, "r") as f: - if "Traces" in list(f.keys()): - keys = access_keys_doricV1(f) - elif list(f.keys()) == ["Configurations", "DataAcquisition"]: - keys = access_keys_doricV6(f) - - return keys - - -def access_keys_doricV6(doric_file): - data = [doric_file["DataAcquisition"]] - res = [] - while len(data) != 0: - members = len(data) - while members != 0: - members -= 1 - data, last_element = separate_last_element(data) - if isinstance(last_element, h5py.Dataset) and not last_element.name.endswith("/Time"): - res.append(last_element.name) - elif isinstance(last_element, h5py.Group): - data.extend(reversed([last_element[k] for k in last_element.keys()])) - - keys = [] - for element in res: - sep_values = element.split("/") - if sep_values[-1] == "Values": - keys.append(f"{sep_values[-3]}/{sep_values[-2]}") - else: - keys.append(f"{sep_values[-2]}/{sep_values[-1]}") - - return keys - - -def access_keys_doricV1(doric_file): - keys = list(doric_file["Traces"]["Console"].keys()) - keys.remove("Time(s)") - - return keys - - -def separate_last_element(arr): - l = arr[-1] - return arr[:-1], l diff --git a/src/guppy/doric_step3.py b/src/guppy/doric_step3.py deleted file mode 100644 index e9fd7cc..0000000 --- a/src/guppy/doric_step3.py +++ /dev/null @@ -1,159 +0,0 @@ -import glob -import logging -import os -import re -import warnings - -import h5py -import numpy as np -import pandas as pd - -from guppy.common_step3 import write_hdf5 - -logger = logging.getLogger(__name__) - - -def check_doric(filepath): - logger.debug("Checking if doric file exists") - path = glob.glob(os.path.join(filepath, "*.csv")) + glob.glob(os.path.join(filepath, "*.doric")) - - flag_arr = [] - for i in range(len(path)): - ext = os.path.basename(path[i]).split(".")[-1] - if ext == "csv": - with warnings.catch_warnings(): - warnings.simplefilter("error") - try: - df = pd.read_csv(path[i], index_col=False, dtype=float) - except: - df = pd.read_csv(path[i], header=1, index_col=False, nrows=10) - flag = "doric_csv" - flag_arr.append(flag) - elif ext == "doric": - flag = "doric_doric" - flag_arr.append(flag) - else: - pass - - if len(flag_arr) > 1: - logger.error("Two doric files are present at the same location") - raise Exception("Two doric files are present at the same location") - if len(flag_arr) == 0: - logger.error("\033[1m" + "Doric file not found." 
+ "\033[1m") - return 0 - logger.info("Doric file found.") - return flag_arr[0] - - -def execute_import_doric(filepath, storesList, flag, outputPath): - flag = check_doric(filepath) - - if flag == "doric_csv": - path = glob.glob(os.path.join(filepath, "*.csv")) - if len(path) > 1: - logger.error("An error occurred : More than one Doric csv file present at the location") - raise Exception("More than one Doric csv file present at the location") - else: - df = pd.read_csv(path[0], header=1, index_col=False) - df = df.dropna(axis=1, how="all") - df = df.dropna(axis=0, how="any") - df["Time(s)"] = df["Time(s)"] - df["Time(s)"].to_numpy()[0] - for i in range(storesList.shape[1]): - if "control" in storesList[1, i] or "signal" in storesList[1, i]: - timestamps = np.array(df["Time(s)"]) - sampling_rate = np.array([1 / (timestamps[-1] - timestamps[-2])]) - write_hdf5(sampling_rate, storesList[0, i], outputPath, "sampling_rate") - write_hdf5(df["Time(s)"].to_numpy(), storesList[0, i], outputPath, "timestamps") - write_hdf5(df[storesList[0, i]].to_numpy(), storesList[0, i], outputPath, "data") - else: - ttl = df[storesList[0, i]] - indices = np.where(ttl <= 0)[0] - diff_indices = np.where(np.diff(indices) > 1)[0] - write_hdf5( - df["Time(s)"][indices[diff_indices] + 1].to_numpy(), storesList[0, i], outputPath, "timestamps" - ) - else: - path = glob.glob(os.path.join(filepath, "*.doric")) - if len(path) > 1: - logger.error("An error occurred : More than one Doric file present at the location") - raise Exception("More than one Doric file present at the location") - else: - with h5py.File(path[0], "r") as f: - if "Traces" in list(f.keys()): - keys = access_data_doricV1(f, storesList, outputPath) - elif list(f.keys()) == ["Configurations", "DataAcquisition"]: - keys = access_data_doricV6(f, storesList, outputPath) - - -def access_data_doricV6(doric_file, storesList, outputPath): - data = [doric_file["DataAcquisition"]] - res = [] - while len(data) != 0: - members = len(data) - while members != 0: - members -= 1 - data, last_element = separate_last_element(data) - if isinstance(last_element, h5py.Dataset) and not last_element.name.endswith("/Time"): - res.append(last_element.name) - elif isinstance(last_element, h5py.Group): - data.extend(reversed([last_element[k] for k in last_element.keys()])) - - decide_path = [] - for element in res: - sep_values = element.split("/") - if sep_values[-1] == "Values": - if f"{sep_values[-3]}/{sep_values[-2]}" in storesList[0, :]: - decide_path.append(element) - else: - if f"{sep_values[-2]}/{sep_values[-1]}" in storesList[0, :]: - decide_path.append(element) - - for i in range(storesList.shape[1]): - if "control" in storesList[1, i] or "signal" in storesList[1, i]: - regex = re.compile("(.*?)" + str(storesList[0, i]) + "(.*?)") - idx = [i for i in range(len(decide_path)) if regex.match(decide_path[i])] - if len(idx) > 1: - logger.error("More than one string matched (which should not be the case)") - raise Exception("More than one string matched (which should not be the case)") - idx = idx[0] - data = np.array(doric_file[decide_path[idx]]) - timestamps = np.array(doric_file[decide_path[idx].rsplit("/", 1)[0] + "/Time"]) - sampling_rate = np.array([1 / (timestamps[-1] - timestamps[-2])]) - write_hdf5(sampling_rate, storesList[0, i], outputPath, "sampling_rate") - write_hdf5(timestamps, storesList[0, i], outputPath, "timestamps") - write_hdf5(data, storesList[0, i], outputPath, "data") - else: - regex = re.compile("(.*?)" + storesList[0, i] + "$") - idx = [i for i in 
range(len(decide_path)) if regex.match(decide_path[i])] - if len(idx) > 1: - logger.error("More than one string matched (which should not be the case)") - raise Exception("More than one string matched (which should not be the case)") - idx = idx[0] - ttl = np.array(doric_file[decide_path[idx]]) - timestamps = np.array(doric_file[decide_path[idx].rsplit("/", 1)[0] + "/Time"]) - indices = np.where(ttl <= 0)[0] - diff_indices = np.where(np.diff(indices) > 1)[0] - write_hdf5(timestamps[indices[diff_indices] + 1], storesList[0, i], outputPath, "timestamps") - - -def access_data_doricV1(doric_file, storesList, outputPath): - keys = list(doric_file["Traces"]["Console"].keys()) - for i in range(storesList.shape[1]): - if "control" in storesList[1, i] or "signal" in storesList[1, i]: - timestamps = np.array(doric_file["Traces"]["Console"]["Time(s)"]["Console_time(s)"]) - sampling_rate = np.array([1 / (timestamps[-1] - timestamps[-2])]) - data = np.array(doric_file["Traces"]["Console"][storesList[0, i]][storesList[0, i]]) - write_hdf5(sampling_rate, storesList[0, i], outputPath, "sampling_rate") - write_hdf5(timestamps, storesList[0, i], outputPath, "timestamps") - write_hdf5(data, storesList[0, i], outputPath, "data") - else: - timestamps = np.array(doric_file["Traces"]["Console"]["Time(s)"]["Console_time(s)"]) - ttl = np.array(doric_file["Traces"]["Console"][storesList[0, i]][storesList[0, i]]) - indices = np.where(ttl <= 0)[0] - diff_indices = np.where(np.diff(indices) > 1)[0] - write_hdf5(timestamps[indices[diff_indices] + 1], storesList[0, i], outputPath, "timestamps") - - -def separate_last_element(arr): - l = arr[-1] - return arr[:-1], l diff --git a/src/guppy/npm_step2.py b/src/guppy/npm_step2.py deleted file mode 100644 index 14b776f..0000000 --- a/src/guppy/npm_step2.py +++ /dev/null @@ -1,411 +0,0 @@ -import glob -import logging -import os -import tkinter as tk -from tkinter import StringVar, messagebox, ttk - -import numpy as np -import pandas as pd -import panel as pn - -pn.extension() - -logger = logging.getLogger(__name__) - - -def import_npm(filepath, num_ch, inputParameters=None): - - logger.debug("If it exists, importing NPM file based on the structure of file") - # Headless configuration (used to avoid any UI prompts when running tests) - headless = bool(os.environ.get("GUPPY_BASE_DIR")) - npm_timestamp_column_name = None - npm_time_unit = None - npm_split_events = None - if isinstance(inputParameters, dict): - npm_timestamp_column_name = inputParameters.get("npm_timestamp_column_name") - npm_time_unit = inputParameters.get("npm_time_unit", "seconds") - npm_split_events = inputParameters.get("npm_split_events", True) - path = sorted(glob.glob(os.path.join(filepath, "*.csv"))) + sorted(glob.glob(os.path.join(filepath, "*.doric"))) - path_chev = glob.glob(os.path.join(filepath, "*chev*")) - path_chod = glob.glob(os.path.join(filepath, "*chod*")) - path_chpr = glob.glob(os.path.join(filepath, "*chpr*")) - path_event = glob.glob(os.path.join(filepath, "event*")) - # path_sig = glob.glob(os.path.join(filepath, 'sig*')) # TODO: what is this for? - path_chev_chod_event = path_chev + path_chod + path_event + path_chpr - - path = sorted(list(set(path) - set(path_chev_chod_event))) - flag = "None" - event_from_filename = [] - flag_arr = [] - for i in range(len(path)): - dirname = os.path.dirname(path[i]) - ext = os.path.basename(path[i]).split(".")[-1] - assert ext != "doric", "Doric files are not supported by import_npm function." 
- df = pd.read_csv(path[i], header=None, nrows=2, index_col=False, dtype=str) - df = df.dropna(axis=1, how="all") - df_arr = np.array(df).flatten() - check_all_str = [] - for element in df_arr: - try: - float(element) - except: - check_all_str.append(i) - assert len(check_all_str) != len( - df_arr - ), "This file appears to be doric .csv. This function only supports NPM .csv files." - df = pd.read_csv(path[i], index_col=False) - _, value = check_header(df) - - # check dataframe structure and read data accordingly - if len(value) > 0: - columns_isstr = False - df = pd.read_csv(path[i], header=None) - cols = np.array(list(df.columns), dtype=str) - else: - df = df - columns_isstr = True - cols = np.array(list(df.columns), dtype=str) - # check the structure of dataframe and assign flag to the type of file - assert len(cols) != 1, "File appears to be event .csv. This function only supports NPM .csv files." - assert len(cols) != 3, "File appears to be data .csv. This function only supports NPM .csv files." - if len(cols) == 2: - flag = "event_or_data_np" - elif len(cols) >= 2: - flag = "data_np" - else: - logger.error("Number of columns in csv file does not make sense.") - raise Exception("Number of columns in csv file does not make sense.") - - if columns_isstr == True and ( - "flags" in np.char.lower(np.array(cols)) or "ledstate" in np.char.lower(np.array(cols)) - ): - flag = flag + "_v2" - else: - flag = flag - - # used assigned flags to process the files and read the data - if flag == "event_or_data_np": - arr = list(df.iloc[:, 1]) - check_float = [True for i in arr if isinstance(i, float)] - if len(arr) == len(check_float) and columns_isstr == False: - flag = "data_np" - elif columns_isstr == True and ("value" in np.char.lower(np.array(cols))): - flag = "event_np" - else: - flag = "event_np" - - flag_arr.append(flag) - logger.info(flag) - if flag == "data_np": - file = f"file{str(i)}_" - df, indices_dict, _ = decide_indices(file, df, flag, num_ch) - keys = list(indices_dict.keys()) - for k in range(len(keys)): - for j in range(df.shape[1]): - if j == 0: - timestamps = df.iloc[:, j][indices_dict[keys[k]]] - # timestamps_odd = df.iloc[:,j][odd_indices] - else: - d = dict() - d["timestamps"] = timestamps - d["data"] = df.iloc[:, j][indices_dict[keys[k]]] - - df_ch = pd.DataFrame(d) - df_ch.to_csv(os.path.join(dirname, keys[k] + str(j) + ".csv"), index=False) - event_from_filename.append(keys[k] + str(j)) - - elif flag == "event_np": - type_val = np.array(df.iloc[:, 1]) - type_val_unique = np.unique(type_val) - if headless: - response = 1 if bool(npm_split_events) else 0 - else: - window = tk.Tk() - if len(type_val_unique) > 1: - response = messagebox.askyesno( - "Multiple event TTLs", - "Based on the TTL file,\ - it looks like TTLs \ - belongs to multiple behavior type. 
\ - Do you want to create multiple files for each \ - behavior type ?", - ) - else: - response = 0 - window.destroy() - if response == 1: - timestamps = np.array(df.iloc[:, 0]) - for j in range(len(type_val_unique)): - idx = np.where(type_val == type_val_unique[j]) - d = dict() - d["timestamps"] = timestamps[idx] - df_new = pd.DataFrame(d) - df_new.to_csv(os.path.join(dirname, "event" + str(type_val_unique[j]) + ".csv"), index=False) - event_from_filename.append("event" + str(type_val_unique[j])) - else: - timestamps = np.array(df.iloc[:, 0]) - d = dict() - d["timestamps"] = timestamps - df_new = pd.DataFrame(d) - df_new.to_csv(os.path.join(dirname, "event" + str(0) + ".csv"), index=False) - event_from_filename.append("event" + str(0)) - else: - file = f"file{str(i)}_" - df, ts_unit = decide_ts_unit_for_npm( - df, timestamp_column_name=npm_timestamp_column_name, time_unit=npm_time_unit, headless=headless - ) - df, indices_dict, _ = decide_indices(file, df, flag) - keys = list(indices_dict.keys()) - for k in range(len(keys)): - for j in range(df.shape[1]): - if j == 0: - timestamps = df.iloc[:, j][indices_dict[keys[k]]] - # timestamps_odd = df.iloc[:,j][odd_indices] - else: - d = dict() - d["timestamps"] = timestamps - d["data"] = df.iloc[:, j][indices_dict[keys[k]]] - - df_ch = pd.DataFrame(d) - df_ch.to_csv(os.path.join(dirname, keys[k] + str(j) + ".csv"), index=False) - event_from_filename.append(keys[k] + str(j)) - - path_chev = glob.glob(os.path.join(filepath, "*chev*")) - path_chod = glob.glob(os.path.join(filepath, "*chod*")) - path_chpr = glob.glob(os.path.join(filepath, "*chpr*")) - path_event = glob.glob(os.path.join(filepath, "event*")) - # path_sig = glob.glob(os.path.join(filepath, 'sig*')) - path_chev_chod_chpr = [path_chev, path_chod, path_chpr] - if ( - ("data_np_v2" in flag_arr or "data_np" in flag_arr) and ("event_np" in flag_arr) and (i == len(path) - 1) - ) or ( - ("data_np_v2" in flag_arr or "data_np" in flag_arr) and (i == len(path) - 1) - ): # i==len(path)-1 and or 'event_np' in flag - num_path_chev, num_path_chod, num_path_chpr = len(path_chev), len(path_chod), len(path_chpr) - arr_len, no_ch = [], [] - for i in range(len(path_chev_chod_chpr)): - if len(path_chev_chod_chpr[i]) > 0: - arr_len.append(len(path_chev_chod_chpr[i])) - else: - continue - - unique_arr_len = np.unique(np.array(arr_len)) - if "data_np_v2" in flag_arr: - if ts_unit == "seconds": - divisor = 1 - elif ts_unit == "milliseconds": - divisor = 1e3 - else: - divisor = 1e6 - else: - divisor = 1000 - - for j in range(len(path_event)): - df_event = pd.read_csv(path_event[j]) - df_chev = pd.read_csv(path_chev[0]) - df_event["timestamps"] = (df_event["timestamps"] - df_chev["timestamps"][0]) / divisor - df_event.to_csv(path_event[j], index=False) - if unique_arr_len.shape[0] == 1: - for j in range(len(path_chev)): - if file + "chev" in indices_dict.keys(): - df_chev = pd.read_csv(path_chev[j]) - df_chev["timestamps"] = (df_chev["timestamps"] - df_chev["timestamps"][0]) / divisor - df_chev["sampling_rate"] = np.full(df_chev.shape[0], np.nan) - df_chev.at[0, "sampling_rate"] = df_chev.shape[0] / ( - df_chev["timestamps"].iloc[-1] - df_chev["timestamps"].iloc[0] - ) - df_chev.to_csv(path_chev[j], index=False) - - if file + "chod" in indices_dict.keys(): - df_chod = pd.read_csv(path_chod[j]) - df_chod["timestamps"] = df_chev["timestamps"] - df_chod["sampling_rate"] = np.full(df_chod.shape[0], np.nan) - df_chod.at[0, "sampling_rate"] = df_chev["sampling_rate"][0] - df_chod.to_csv(path_chod[j], index=False) - - 
if file + "chpr" in indices_dict.keys(): - df_chpr = pd.read_csv(path_chpr[j]) - df_chpr["timestamps"] = df_chev["timestamps"] - df_chpr["sampling_rate"] = np.full(df_chpr.shape[0], np.nan) - df_chpr.at[0, "sampling_rate"] = df_chev["sampling_rate"][0] - df_chpr.to_csv(path_chpr[j], index=False) - else: - logger.error("Number of channels should be same for all regions.") - raise Exception("Number of channels should be same for all regions.") - logger.info("Importing of NPM file is done.") - return event_from_filename, flag_arr - - -def check_header(df): - arr = list(df.columns) - check_float = [] - for i in arr: - try: - check_float.append(float(i)) - except: - pass - - return arr, check_float - - -# function to decide indices of interleaved channels -# in neurophotometrics data -def decide_indices(file, df, flag, num_ch=2): - ch_name = [file + "chev", file + "chod", file + "chpr"] - if len(ch_name) < num_ch: - logger.error( - "Number of channels parameters in Input Parameters GUI is more than 3. \ - Looks like there are more than 3 channels in the file. Reading of these files\ - are not supported. Reach out to us if you get this error message." - ) - raise Exception( - "Number of channels parameters in Input Parameters GUI is more than 3. \ - Looks like there are more than 3 channels in the file. Reading of these files\ - are not supported. Reach out to us if you get this error message." - ) - if flag == "data_np": - indices_dict = dict() - for i in range(num_ch): - indices_dict[ch_name[i]] = np.arange(i, df.shape[0], num_ch) - - else: - cols = np.array(list(df.columns)) - if "flags" in np.char.lower(np.array(cols)): - arr = ["FrameCounter", "Flags"] - state = np.array(df["Flags"]) - elif "ledstate" in np.char.lower(np.array(cols)): - arr = ["FrameCounter", "LedState"] - state = np.array(df["LedState"]) - else: - logger.error( - "File type shows Neurophotometrics newer version \ - data but column names does not have Flags or LedState" - ) - raise Exception( - "File type shows Neurophotometrics newer version \ - data but column names does not have Flags or LedState" - ) - - num_ch, ch = check_channels(state) - indices_dict = dict() - for i in range(num_ch): - first_occurrence = np.where(state == ch[i])[0] - indices_dict[ch_name[i]] = np.arange(first_occurrence[0], df.shape[0], num_ch) - - df = df.drop(arr, axis=1) - - return df, indices_dict, num_ch - - -# check flag consistency in neurophotometrics data -def check_channels(state): - state = state.astype(int) - unique_state = np.unique(state[2:12]) - if unique_state.shape[0] > 3: - logger.error( - "Looks like there are more than 3 channels in the file. Reading of these files\ - are not supported. Reach out to us if you get this error message." - ) - raise Exception( - "Looks like there are more than 3 channels in the file. Reading of these files\ - are not supported. Reach out to us if you get this error message." 
- ) - - return unique_state.shape[0], unique_state - - -# function to decide NPM timestamps unit (seconds, ms or us) -def decide_ts_unit_for_npm(df, timestamp_column_name=None, time_unit=None, headless=False): - col_names = np.array(list(df.columns)) - col_names_ts = [""] - for name in col_names: - if "timestamp" in name.lower(): - col_names_ts.append(name) - - ts_unit = "seconds" - if len(col_names_ts) > 2: - # Headless path: auto-select column/unit without any UI - if headless: - if timestamp_column_name is not None: - assert ( - timestamp_column_name in col_names_ts - ), f"Provided timestamp_column_name '{timestamp_column_name}' not found in columns {col_names_ts[1:]}" - chosen = timestamp_column_name - else: - chosen = col_names_ts[1] - df.insert(1, "Timestamp", df[chosen]) - df = df.drop(col_names_ts[1:], axis=1) - valid_units = {"seconds", "milliseconds", "microseconds"} - ts_unit = time_unit if (isinstance(time_unit, str) and time_unit in valid_units) else "seconds" - return df, ts_unit - # def comboBoxSelected(event): - # logger.info(event.widget.get()) - - window = tk.Tk() - window.title("Select appropriate options for timestamps") - window.geometry("500x200") - holdComboboxValues = dict() - - timestamps_label = ttk.Label(window, text="Select which timestamps to use : ").grid( - row=0, column=1, pady=25, padx=25 - ) - holdComboboxValues["timestamps"] = StringVar() - timestamps_combo = ttk.Combobox(window, values=col_names_ts, textvariable=holdComboboxValues["timestamps"]) - timestamps_combo.grid(row=0, column=2, pady=25, padx=25) - timestamps_combo.current(0) - # timestamps_combo.bind("<>", comboBoxSelected) - - time_unit_label = ttk.Label(window, text="Select timestamps unit : ").grid(row=1, column=1, pady=25, padx=25) - holdComboboxValues["time_unit"] = StringVar() - time_unit_combo = ttk.Combobox( - window, values=["", "seconds", "milliseconds", "microseconds"], textvariable=holdComboboxValues["time_unit"] - ) - time_unit_combo.grid(row=1, column=2, pady=25, padx=25) - time_unit_combo.current(0) - # time_unit_combo.bind("<>", comboBoxSelected) - window.lift() - window.after(500, lambda: window.lift()) - window.mainloop() - - if holdComboboxValues["timestamps"].get(): - df.insert(1, "Timestamp", df[holdComboboxValues["timestamps"].get()]) - df = df.drop(col_names_ts[1:], axis=1) - else: - messagebox.showerror( - "All options not selected", - "All the options for timestamps \ - were not selected. Please select appropriate options", - ) - logger.error( - "All the options for timestamps \ - were not selected. Please select appropriate options" - ) - raise Exception( - "All the options for timestamps \ - were not selected. Please select appropriate options" - ) - if holdComboboxValues["time_unit"].get(): - if holdComboboxValues["time_unit"].get() == "seconds": - ts_unit = holdComboboxValues["time_unit"].get() - elif holdComboboxValues["time_unit"].get() == "milliseconds": - ts_unit = holdComboboxValues["time_unit"].get() - else: - ts_unit = holdComboboxValues["time_unit"].get() - else: - messagebox.showerror( - "All options not selected", - "All the options for timestamps \ - were not selected. Please select appropriate options", - ) - logger.error( - "All the options for timestamps \ - were not selected. Please select appropriate options" - ) - raise Exception( - "All the options for timestamps \ - were not selected. 
Please select appropriate options" - ) - else: - pass - - return df, ts_unit diff --git a/src/guppy/tdt_step2.py b/src/guppy/tdt_step2.py deleted file mode 100644 index 130ace8..0000000 --- a/src/guppy/tdt_step2.py +++ /dev/null @@ -1,28 +0,0 @@ -import glob -import logging -import os - -import numpy as np -import pandas as pd -from numpy import float32, float64, int32, int64, uint16 - -logger = logging.getLogger(__name__) - - -# function to read 'tsq' file -def readtsq(filepath): - names = ("size", "type", "name", "chan", "sort_code", "timestamp", "fp_loc", "strobe", "format", "frequency") - formats = (int32, int32, "S4", uint16, uint16, float64, int64, float64, int32, float32) - offsets = 0, 4, 8, 12, 14, 16, 24, 24, 32, 36 - tsq_dtype = np.dtype({"names": names, "formats": formats, "offsets": offsets}, align=True) - path = glob.glob(os.path.join(filepath, "*.tsq")) - if len(path) > 1: - logger.error("Two tsq files are present at the location.") - raise Exception("Two tsq files are present at the location.") - elif len(path) == 0: - return 0 - else: - path = path[0] - tsq = np.fromfile(path, dtype=tsq_dtype) - df = pd.DataFrame(tsq) - return df diff --git a/src/guppy/tdt_step3.py b/src/guppy/tdt_step3.py deleted file mode 100644 index be92d4c..0000000 --- a/src/guppy/tdt_step3.py +++ /dev/null @@ -1,207 +0,0 @@ -import glob -import logging -import multiprocessing as mp -import os -import time -from itertools import repeat - -import numpy as np -import pandas as pd -from numpy import float32, float64, int32, int64, uint16 - -from guppy.common_step3 import write_hdf5 - -logger = logging.getLogger(__name__) - - -# function to read tsq file -def readtsq(filepath): - logger.debug("Trying to read tsq file.") - names = ("size", "type", "name", "chan", "sort_code", "timestamp", "fp_loc", "strobe", "format", "frequency") - formats = (int32, int32, "S4", uint16, uint16, float64, int64, float64, int32, float32) - offsets = 0, 4, 8, 12, 14, 16, 24, 24, 32, 36 - tsq_dtype = np.dtype({"names": names, "formats": formats, "offsets": offsets}, align=True) - path = glob.glob(os.path.join(filepath, "*.tsq")) - if len(path) > 1: - logger.error("Two tsq files are present at the location.") - raise Exception("Two tsq files are present at the location.") - elif len(path) == 0: - logger.info("\033[1m" + "tsq file not found." 
+ "\033[1m") - return 0, 0 - else: - path = path[0] - flag = "tsq" - - # reading tsq file - tsq = np.fromfile(path, dtype=tsq_dtype) - - # creating dataframe of the data - df = pd.DataFrame(tsq) - - logger.info("Data from tsq file fetched.") - return df, flag - - -# function to execute readtev function using multiprocessing to make it faster -def execute_readtev(filepath, event, outputPath, numProcesses=mp.cpu_count()): - data, _ = readtsq(filepath) - - start = time.time() - with mp.Pool(numProcesses) as p: - p.starmap(readtev, zip(repeat(data), repeat(filepath), event, repeat(outputPath))) - # p = mp.Pool(mp.cpu_count()) - # p.starmap(readtev, zip(repeat(data), repeat(filepath), event, repeat(outputPath))) - # p.close() - # p.join() - logger.info("Time taken = {0:.5f}".format(time.time() - start)) - - -# function to read tev file -def readtev(data, filepath, event, outputPath): - - logger.debug("Reading data for event {} ...".format(event)) - tevfilepath = glob.glob(os.path.join(filepath, "*.tev")) - if len(tevfilepath) > 1: - raise Exception("Two tev files are present at the location.") - else: - tevfilepath = tevfilepath[0] - - data["name"] = np.asarray(data["name"], dtype=str) - - allnames = np.unique(data["name"]) - - index = [] - for i in range(len(allnames)): - length = len(str(allnames[i])) - if length < 4: - index.append(i) - - allnames = np.delete(allnames, index, 0) - - eventNew = np.array(list(event)) - - # logger.info(allnames) - # logger.info(eventNew) - row = ismember(data["name"], event) - - if sum(row) == 0: - logger.error("\033[1m" + "Requested store name " + event + " not found (case-sensitive)." + "\033[0m") - logger.error("\033[1m" + "File contains the following TDT store names:" + "\033[0m") - logger.error("\033[1m" + str(allnames) + "\033[0m") - logger.error("\033[1m" + "TDT store name " + str(event) + " not found." 
+ "\033[0m") - raise ValueError("Requested store name not found.") - - allIndexesWhereEventIsPresent = np.where(row == 1) - first_row = allIndexesWhereEventIsPresent[0][0] - - formatNew = data["format"][first_row] + 1 - - table = np.array( - [ - [0, 0, 0, 0], - [0, "float", 1, np.float32], - [0, "long", 1, np.int32], - [0, "short", 2, np.int16], - [0, "byte", 4, np.int8], - ] - ) - - S = dict() - - S["storename"] = str(event) - S["sampling_rate"] = data["frequency"][first_row] - S["timestamps"] = np.asarray(data["timestamp"][allIndexesWhereEventIsPresent[0]]) - S["channels"] = np.asarray(data["chan"][allIndexesWhereEventIsPresent[0]]) - - fp_loc = np.asarray(data["fp_loc"][allIndexesWhereEventIsPresent[0]]) - data_size = np.asarray(data["size"]) - - if formatNew != 5: - nsample = (data_size[first_row,] - 10) * int(table[formatNew, 2]) - S["data"] = np.zeros((len(fp_loc), nsample)) - for i in range(0, len(fp_loc)): - with open(tevfilepath, "rb") as fp: - fp.seek(fp_loc[i], os.SEEK_SET) - S["data"][i, :] = np.fromfile(fp, dtype=table[formatNew, 3], count=nsample).reshape( - 1, nsample, order="F" - ) - # S['data'] = S['data'].swapaxes() - S["npoints"] = nsample - else: - S["data"] = np.asarray(data["strobe"][allIndexesWhereEventIsPresent[0]]) - S["npoints"] = 1 - S["channels"] = np.tile(1, (S["data"].shape[0],)) - - S["data"] = (S["data"].T).reshape(-1, order="F") - - save_dict_to_hdf5(S, event, outputPath) - - check_data(S, filepath, event, outputPath) - - logger.info("Data for event {} fetched and stored.".format(event)) - - -# check if a particular element is there in an array or not -def ismember(arr, element): - res = [1 if i == element else 0 for i in arr] - return np.asarray(res) - - -# function to save data read from tev file to hdf5 file -def save_dict_to_hdf5(S, event, outputPath): - write_hdf5(S["storename"], event, outputPath, "storename") - write_hdf5(S["sampling_rate"], event, outputPath, "sampling_rate") - write_hdf5(S["timestamps"], event, outputPath, "timestamps") - - write_hdf5(S["data"], event, outputPath, "data") - write_hdf5(S["npoints"], event, outputPath, "npoints") - write_hdf5(S["channels"], event, outputPath, "channels") - - -# function to check event data (checking whether event timestamps belongs to same event or multiple events) -def check_data(S, filepath, event, outputPath): - # logger.info("Checking event storename data for creating multiple event names from single event storename...") - new_event = event.replace("\\", "") - new_event = event.replace("/", "") - diff = np.diff(S["data"]) - arr = np.full(diff.shape[0], 1) - - storesList = np.genfromtxt(os.path.join(outputPath, "storesList.csv"), dtype="str", delimiter=",").reshape(2, -1) - - if diff.shape[0] == 0: - return 0 - - if S["sampling_rate"] == 0 and np.all(diff == diff[0]) == False: - logger.info("\033[1m" + "Data in event {} belongs to multiple behavior".format(event) + "\033[0m") - logger.debug( - "\033[1m" + "Create timestamp files for individual new event and change the stores list file." 
+ "\033[0m" - ) - i_d = np.unique(S["data"]) - for i in range(i_d.shape[0]): - new_S = dict() - idx = np.where(S["data"] == i_d[i])[0] - new_S["timestamps"] = S["timestamps"][idx] - new_S["storename"] = new_event + str(int(i_d[i])) - new_S["sampling_rate"] = S["sampling_rate"] - new_S["data"] = S["data"] - new_S["npoints"] = S["npoints"] - new_S["channels"] = S["channels"] - storesList = np.concatenate( - (storesList, [[new_event + str(int(i_d[i]))], [new_event + "_" + str(int(i_d[i]))]]), axis=1 - ) - save_dict_to_hdf5(new_S, new_event + str(int(i_d[i])), outputPath) - - idx = np.where(storesList[0] == event)[0] - storesList = np.delete(storesList, idx, axis=1) - if not os.path.exists(os.path.join(outputPath, ".cache_storesList.csv")): - os.rename(os.path.join(outputPath, "storesList.csv"), os.path.join(outputPath, ".cache_storesList.csv")) - if idx.shape[0] == 0: - pass - else: - np.savetxt(os.path.join(outputPath, "storesList.csv"), storesList, delimiter=",", fmt="%s") - logger.info( - "\033[1m" - + "Timestamp files for individual new event are created \ - and the stores list file is changed." - + "\033[0m" - ) From b35e04b0db575f6ca72ea198d9db12bde06e6b68 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 3 Dec 2025 11:08:20 -0800 Subject: [PATCH 046/125] Refactored csv_recording_extractor to inherit from base_recording_extractor. --- .../extractors/csv_recording_extractor.py | 35 ++++++++++++------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/src/guppy/extractors/csv_recording_extractor.py b/src/guppy/extractors/csv_recording_extractor.py index 5a42bd1..792ad01 100644 --- a/src/guppy/extractors/csv_recording_extractor.py +++ b/src/guppy/extractors/csv_recording_extractor.py @@ -4,11 +4,12 @@ import os import time from itertools import repeat +from typing import Any import numpy as np import pandas as pd -from guppy.common_step3 import write_hdf5 +from guppy.extractors import BaseRecordingExtractor logger = logging.getLogger(__name__) @@ -29,7 +30,7 @@ def read_and_save_csv(extractor, event, outputPath): logger.info("Data for event {} fetched and stored.".format(event)) -class CsvRecordingExtractor: +class CsvRecordingExtractor(BaseRecordingExtractor): def __init__(self, folder_path): self.folder_path = folder_path @@ -58,7 +59,7 @@ def __init__(self, folder_path): ), "This file appears to be doric .csv. This function only supports standard .csv files." 
df = pd.read_csv(path[i], index_col=False) - _, value = self.check_header(df) + _, value = self._check_header(df) # check dataframe structure and read data accordingly if len(value) > 0: @@ -121,10 +122,18 @@ def __init__(self, folder_path): logger.info("Importing of csv file is done.") - self.events = event_from_filename - self.flags = flag_arr + self._events = event_from_filename + self._flags = flag_arr - def check_header(self, df): + @property + def events(self) -> list[str]: + return self._events + + @property + def flags(self) -> list: + return self._flags + + def _check_header(self, df): arr = list(df.columns) check_float = [] for i in arr: @@ -135,7 +144,7 @@ def check_header(self, df): return arr, check_float - def read_csv(self, event): + def _read_csv(self, event): logger.debug("\033[1m" + "Trying to read data for {} from csv file.".format(event) + "\033[0m") if not os.path.exists(os.path.join(self.folder_path, event + ".csv")): logger.error("\033[1m" + "No csv file found for event {}".format(event) + "\033[0m") @@ -144,7 +153,7 @@ def read_csv(self, event): df = pd.read_csv(os.path.join(self.folder_path, event + ".csv"), index_col=False) return df - def save_to_hdf5(self, df, event, outputPath): + def _save_to_hdf5(self, df, event, outputPath): key = list(df.columns) # TODO: clean up these if branches @@ -175,21 +184,21 @@ def save_to_hdf5(self, df, event, outputPath): ) for i in range(len(key)): - write_hdf5(df[key[i]].dropna(), event, outputPath, key[i].lower()) + self._write_hdf5(df[key[i]].dropna(), event, outputPath, key[i].lower()) logger.info("\033[1m" + "Reading data for {} from csv file is completed.".format(event) + "\033[0m") - def read(self, events, outputPath): + def read(self, *, events: list[str], outputPath: str, **kwargs) -> list[dict[str, Any]]: output_dicts = [] for event in events: - df = self.read_csv(event=event) + df = self._read_csv(event=event) S = df.to_dict() S["storename"] = event output_dicts.append(S) return output_dicts - def save(self, output_dicts, outputPath): + def save(self, *, output_dicts: list[dict[str, Any]], outputPath: str, **kwargs) -> None: for S in output_dicts: event = S.pop("storename") df = pd.DataFrame.from_dict(S) - self.save_to_hdf5(df=df, event=event, outputPath=outputPath) + self._save_to_hdf5(df=df, event=event, outputPath=outputPath) From b330a64b43e87ec20536e3cdfa815efcb3b7f054 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 3 Dec 2025 11:46:39 -0800 Subject: [PATCH 047/125] Refactored tdt_recording_extractor to inherit from base_recording_extractor. 
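Sketched below is how the refactored extractor is expected to be driven; the paths are placeholders, and save() is assumed to keep the keyword-only signature declared on BaseRecordingExtractor:

    from guppy.extractors import TdtRecordingExtractor

    block_dir = "/path/to/tdt_block"   # folder holding one .tsq/.tev pair (placeholder)
    output_dir = "/path/to/output"     # session output folder (placeholder)

    extractor = TdtRecordingExtractor(block_dir)
    # Store names longer than three characters parsed from the .tsq header.
    print(extractor.events)

    # read() returns one dict per store and splits a TTL store into per-behavior
    # events when its strobe values indicate multiple behavior types.
    output_dicts = extractor.read(events=extractor.events, outputPath=output_dir)
    extractor.save(output_dicts=output_dicts, outputPath=output_dir)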
--- .../extractors/tdt_recording_extractor.py | 84 +++++++++++-------- src/guppy/saveStoresList.py | 30 ++----- 2 files changed, 57 insertions(+), 57 deletions(-) diff --git a/src/guppy/extractors/tdt_recording_extractor.py b/src/guppy/extractors/tdt_recording_extractor.py index 58cde99..6e712fb 100644 --- a/src/guppy/extractors/tdt_recording_extractor.py +++ b/src/guppy/extractors/tdt_recording_extractor.py @@ -4,12 +4,13 @@ import os import time from itertools import repeat +from typing import Any import numpy as np import pandas as pd from numpy import float32, float64, int32, int64, uint16 -from guppy.common_step3 import write_hdf5 +from guppy.extractors import BaseRecordingExtractor logger = logging.getLogger(__name__) @@ -27,13 +28,37 @@ def execute_readtev(folder_path, events, outputPath, numProcesses=mp.cpu_count() logger.info("Time taken = {0:.5f}".format(time.time() - start)) -class TdtRecordingExtractor: +class TdtRecordingExtractor(BaseRecordingExtractor): def __init__(self, folder_path): self.folder_path = folder_path - self.header_df, _ = self.readtsq(folder_path) + self._header_df, _ = self._readtsq(folder_path) + + # Populate events from header_df + if isinstance(self._header_df, pd.DataFrame): + self._header_df["name"] = np.asarray(self._header_df["name"], dtype=str) + allnames = np.unique(self._header_df["name"]) + index = [] + for i in range(len(allnames)): + length = len(str(allnames[i])) + if length < 4: + index.append(i) + allnames = np.delete(allnames, index, 0) + self._events = list(allnames) + else: + self._events = [] + + self._flags = [] + + @property + def events(self) -> list[str]: + return self._events - def readtsq(self, folder_path): + @property + def flags(self) -> list: + return self._flags + + def _readtsq(self, folder_path): logger.debug("Trying to read tsq file.") names = ("size", "type", "name", "chan", "sort_code", "timestamp", "fp_loc", "strobe", "format", "frequency") formats = (int32, int32, "S4", uint16, uint16, float64, int64, float64, int32, float32) @@ -59,9 +84,8 @@ def readtsq(self, folder_path): logger.info("Data from tsq file fetched.") return df, flag - # function to read tev file - def readtev(self, event): - data = self.header_df + def _readtev(self, event): + data = self._header_df filepath = self.folder_path logger.debug("Reading data for event {} ...".format(event)) @@ -87,7 +111,7 @@ def readtev(self, event): # logger.info(allnames) # logger.info(eventNew) - row = self.ismember(data["name"], event) + row = self._ismember(data["name"], event) if sum(row) == 0: logger.error("\033[1m" + "Requested store name " + event + " not found (case-sensitive)." 
+ "\033[0m") @@ -141,24 +165,23 @@ def readtev(self, event): return S - def read(self, events, outputPath): + def read(self, *, events: list[str], outputPath: str, **kwargs) -> list[dict[str, Any]]: output_dicts = [] for event in events: - S = self.readtev(event=event) - if self.event_needs_splitting(data=S["data"], sampling_rate=S["sampling_rate"]): - event_dicts = self.split_event_data(S, event) - self.split_event_storesList(S, event, outputPath) + S = self._readtev(event=event) + if self._event_needs_splitting(data=S["data"], sampling_rate=S["sampling_rate"]): + event_dicts = self._split_event_data(S, event) + self._split_event_storesList(S, event, outputPath) else: event_dicts = [S] output_dicts.extend(event_dicts) return output_dicts - # check if a particular element is there in an array or not - def ismember(self, arr, element): # TODO: replace this function with more standard usage + def _ismember(self, arr, element): res = [1 if i == element else 0 for i in arr] return np.asarray(res) - def event_needs_splitting(self, data, sampling_rate): + def _event_needs_splitting(self, data, sampling_rate): logger.info("Checking event storename data for creating multiple event names from single event storename...") diff = np.diff(data) if diff.shape[0] == 0: @@ -167,7 +190,7 @@ def event_needs_splitting(self, data, sampling_rate): return True return False - def split_event_data(self, S, event): + def _split_event_data(self, S, event): # Note that new_event is only used for the new storesList and event is still used for the old storesList new_event = event.replace("\\", "") new_event = event.replace("/", "") @@ -189,10 +212,7 @@ def split_event_data(self, S, event): return event_dicts - # This function saves a new storesList.csv file, which is a bit of a side effect in the overall read path, - # which is supposed to just return a list of dictionaries. - # TODO: long term I'd like to move these storesList shenanigans somewhere else, likely outside of the extractor. 
- def split_event_storesList(self, S, event, outputPath): + def _split_event_storesList(self, S, event, outputPath): # Note that new_event is only used for the new storesList and event is still used for the old storesList new_event = event.replace("\\", "") new_event = event.replace("/", "") @@ -217,17 +237,15 @@ def split_event_storesList(self, S, event, outputPath): np.savetxt(os.path.join(outputPath, "storesList.csv"), storesList, delimiter=",", fmt="%s") logger.info("\033[1m The stores list file is changed.\033[0m") - # function to save data read from tev file to hdf5 file - def save_dict_to_hdf5(self, S, outputPath): + def _save_dict_to_hdf5(self, S, outputPath): event = S["storename"] - write_hdf5(S["storename"], event, outputPath, "storename") - write_hdf5(S["sampling_rate"], event, outputPath, "sampling_rate") - write_hdf5(S["timestamps"], event, outputPath, "timestamps") - - write_hdf5(S["data"], event, outputPath, "data") - write_hdf5(S["npoints"], event, outputPath, "npoints") - write_hdf5(S["channels"], event, outputPath, "channels") - - def save(self, output_dicts, outputPath): + self._write_hdf5(S["storename"], event, outputPath, "storename") + self._write_hdf5(S["sampling_rate"], event, outputPath, "sampling_rate") + self._write_hdf5(S["timestamps"], event, outputPath, "timestamps") + self._write_hdf5(S["data"], event, outputPath, "data") + self._write_hdf5(S["npoints"], event, outputPath, "npoints") + self._write_hdf5(S["channels"], event, outputPath, "channels") + + def save(self, *, output_dicts: list[dict[str, Any]], outputPath: str, **kwargs) -> None: for S in output_dicts: - self.save_dict_to_hdf5(S=S, outputPath=outputPath) + self._save_dict_to_hdf5(S=S, outputPath=outputPath) diff --git a/src/guppy/saveStoresList.py b/src/guppy/saveStoresList.py index 552d76c..74602a5 100755 --- a/src/guppy/saveStoresList.py +++ b/src/guppy/saveStoresList.py @@ -80,7 +80,7 @@ def make_dir(filepath): # function to show GUI and save -def saveStorenames(inputParameters, data, event_name, flag, filepath): +def saveStorenames(inputParameters, event_name, flag, filepath): logger.debug("Saving stores list file.") # getting input parameters @@ -96,20 +96,8 @@ def saveStorenames(inputParameters, data, event_name, flag, filepath): logger.info("Storeslist : \n" + str(arr)) return - # reading storenames from the data fetched using 'readtsq' function - if isinstance(data, pd.DataFrame): - data["name"] = np.asarray(data["name"], dtype=str) - allnames = np.unique(data["name"]) - index = [] - for i in range(len(allnames)): - length = len(str(allnames[i])) - if length < 4: - index.append(i) - allnames = np.delete(allnames, index, 0) - allnames = list(allnames) - - else: - allnames = [] + # Get storenames from extractor's events property + allnames = event_name if "data_np_v2" in flag or "data_np" in flag or "event_np" in flag: path_chev = glob.glob(os.path.join(filepath, "*chev*")) @@ -152,9 +140,6 @@ def plot(plot_select): else: pass - # finalizing all the storenames - allnames = allnames + event_name - # instructions about how to save the storeslist file mark_down = pn.pane.Markdown( """ @@ -589,16 +574,14 @@ def execute(inputParameters): filepath = os.path.join(inputParameters["abspath"], i) if modality == "tdt": extractor = TdtRecordingExtractor(folder_path=filepath) - data = extractor.header_df - event_name, flag = [], [] + event_name = extractor.events + flag = extractor.flags elif modality == "csv": - data = 0 extractor = CsvRecordingExtractor(folder_path=filepath) event_name = 
extractor.events flag = extractor.flags elif modality == "doric": - data = 0 extractor = DoricRecordingExtractor(folder_path=filepath) event_name = extractor.events flag = extractor.flags @@ -621,14 +604,13 @@ def execute(inputParameters): npm_timestamp_column_names if npm_timestamp_column_names else None ) - data = 0 extractor = NpmRecordingExtractor(folder_path=filepath, num_ch=num_ch, inputParameters=inputParameters) event_name = extractor.events flag = extractor.flags else: raise ValueError("Modality not recognized. Please use 'tdt', 'csv', 'doric', or 'npm'.") - saveStorenames(inputParameters, data, event_name, flag, filepath) + saveStorenames(inputParameters, event_name, flag, filepath) logger.info("#" * 400) except Exception as e: logger.error(str(e)) From 8af3b2be7e73eeaa326de65344bb36e8955f4207 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 3 Dec 2025 11:49:11 -0800 Subject: [PATCH 048/125] Updated parameter names for saveStoresList. --- src/guppy/saveStoresList.py | 62 ++++++++++++++++++++----------------- 1 file changed, 33 insertions(+), 29 deletions(-) diff --git a/src/guppy/saveStoresList.py b/src/guppy/saveStoresList.py index 74602a5..318bc5f 100755 --- a/src/guppy/saveStoresList.py +++ b/src/guppy/saveStoresList.py @@ -80,7 +80,7 @@ def make_dir(filepath): # function to show GUI and save -def saveStorenames(inputParameters, event_name, flag, filepath): +def saveStorenames(inputParameters, events, flags, folder_path): logger.debug("Saving stores list file.") # getting input parameters @@ -89,7 +89,7 @@ def saveStorenames(inputParameters, event_name, flag, filepath): # Headless path: if storenames_map provided, write storesList.csv without building the Panel UI storenames_map = inputParameters.get("storenames_map") if isinstance(storenames_map, dict) and len(storenames_map) > 0: - op = make_dir(filepath) + op = make_dir(folder_path) arr = np.asarray([list(storenames_map.keys()), list(storenames_map.values())], dtype=str) np.savetxt(os.path.join(op, "storesList.csv"), arr, delimiter=",", fmt="%s") logger.info(f"Storeslist file saved at {op}") @@ -97,12 +97,12 @@ def saveStorenames(inputParameters, event_name, flag, filepath): return # Get storenames from extractor's events property - allnames = event_name + allnames = events - if "data_np_v2" in flag or "data_np" in flag or "event_np" in flag: - path_chev = glob.glob(os.path.join(filepath, "*chev*")) - path_chod = glob.glob(os.path.join(filepath, "*chod*")) - path_chpr = glob.glob(os.path.join(filepath, "*chpr*")) + if "data_np_v2" in flags or "data_np" in flags or "event_np" in flags: + path_chev = glob.glob(os.path.join(folder_path, "*chev*")) + path_chod = glob.glob(os.path.join(folder_path, "*chod*")) + path_chpr = glob.glob(os.path.join(folder_path, "*chpr*")) combine_paths = path_chev + path_chod + path_chpr d = dict() for i in range(len(combine_paths)): @@ -179,7 +179,9 @@ def plot(plot_select): ) # creating GUI template - template = pn.template.BootstrapTemplate(title="Storenames GUI - {}".format(os.path.basename(filepath), mark_down)) + template = pn.template.BootstrapTemplate( + title="Storenames GUI - {}".format(os.path.basename(folder_path), mark_down) + ) # creating different buttons and selectors for the GUI cross_selector = pn.widgets.CrossSelector(name="Store Names Selection", value=[], options=allnames, width=600) @@ -253,10 +255,10 @@ def callback(target, event): # on clicking overwrite_button, following function is executed def overwrite_button_actions(event): if event.new == "over_write_file": - 
select_location.options = takeOnlyDirs(glob.glob(os.path.join(filepath, "*_output_*"))) + select_location.options = takeOnlyDirs(glob.glob(os.path.join(folder_path, "*_output_*"))) # select_location.value = select_location.options[0] else: - select_location.options = [show_dir(filepath)] + select_location.options = [show_dir(folder_path)] # select_location.value = select_location.options[0] def fetchValues(event): @@ -513,8 +515,8 @@ def save_button(event=None): # creating widgets, adding them to template and showing a GUI on a new browser window number = scanPortsAndFind(start_port=5000, end_port=5200) - if "data_np_v2" in flag or "data_np" in flag or "event_np" in flag: - widget_1 = pn.Column("# " + os.path.basename(filepath), mark_down, mark_down_np, plot_select, plot) + if "data_np_v2" in flags or "data_np" in flags or "event_np" in flags: + widget_1 = pn.Column("# " + os.path.basename(folder_path), mark_down, mark_down_np, plot_select, plot) widget_2 = pn.Column( repeat_storenames, repeat_storename_wd, @@ -535,7 +537,7 @@ def save_button(event=None): template.main.append(pn.Row(widget_1, widget_2)) else: - widget_1 = pn.Column("# " + os.path.basename(filepath), mark_down) + widget_1 = pn.Column("# " + os.path.basename(folder_path), mark_down) widget_2 = pn.Column( repeat_storenames, repeat_storename_wd, @@ -571,32 +573,32 @@ def execute(inputParameters): try: for i in folderNames: - filepath = os.path.join(inputParameters["abspath"], i) + folder_path = os.path.join(inputParameters["abspath"], i) if modality == "tdt": - extractor = TdtRecordingExtractor(folder_path=filepath) - event_name = extractor.events - flag = extractor.flags + extractor = TdtRecordingExtractor(folder_path=folder_path) + events = extractor.events + flags = extractor.flags elif modality == "csv": - extractor = CsvRecordingExtractor(folder_path=filepath) - event_name = extractor.events - flag = extractor.flags + extractor = CsvRecordingExtractor(folder_path=folder_path) + events = extractor.events + flags = extractor.flags elif modality == "doric": - extractor = DoricRecordingExtractor(folder_path=filepath) - event_name = extractor.events - flag = extractor.flags + extractor = DoricRecordingExtractor(folder_path=folder_path) + events = extractor.events + flags = extractor.flags elif modality == "npm": headless = bool(os.environ.get("GUPPY_BASE_DIR")) if not headless: # Resolve multiple event TTLs - multiple_event_ttls = NpmRecordingExtractor.has_multiple_event_ttls(folder_path=filepath) + multiple_event_ttls = NpmRecordingExtractor.has_multiple_event_ttls(folder_path=folder_path) responses = get_multi_event_responses(multiple_event_ttls) inputParameters["npm_split_events"] = responses # Resolve timestamp units and columns ts_unit_needs, col_names_ts = NpmRecordingExtractor.needs_ts_unit( - folder_path=filepath, num_ch=num_ch + folder_path=folder_path, num_ch=num_ch ) ts_units, npm_timestamp_column_names = get_timestamp_configuration(ts_unit_needs, col_names_ts) inputParameters["npm_time_units"] = ts_units if ts_units else None @@ -604,13 +606,15 @@ def execute(inputParameters): npm_timestamp_column_names if npm_timestamp_column_names else None ) - extractor = NpmRecordingExtractor(folder_path=filepath, num_ch=num_ch, inputParameters=inputParameters) - event_name = extractor.events - flag = extractor.flags + extractor = NpmRecordingExtractor( + folder_path=folder_path, num_ch=num_ch, inputParameters=inputParameters + ) + events = extractor.events + flags = extractor.flags else: raise ValueError("Modality not 
recognized. Please use 'tdt', 'csv', 'doric', or 'npm'.") - saveStorenames(inputParameters, event_name, flag, filepath) + saveStorenames(inputParameters, events, flags, folder_path) logger.info("#" * 400) except Exception as e: logger.error(str(e)) From 5dc6d78626a796bb2fde267ec9996b372f040eba Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 3 Dec 2025 13:22:03 -0800 Subject: [PATCH 049/125] Refactored npm_recording_extractor to inherit from base_recording_extractor. --- .../extractors/npm_recording_extractor.py | 35 ++++++++++++------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/src/guppy/extractors/npm_recording_extractor.py b/src/guppy/extractors/npm_recording_extractor.py index ae4f540..6d9b26a 100644 --- a/src/guppy/extractors/npm_recording_extractor.py +++ b/src/guppy/extractors/npm_recording_extractor.py @@ -4,12 +4,13 @@ import os import time from itertools import repeat +from typing import Any import numpy as np import pandas as pd import panel as pn -from guppy.common_step3 import write_hdf5 +from guppy.extractors import BaseRecordingExtractor pn.extension() @@ -32,16 +33,24 @@ def read_and_save_npm(extractor, event, outputPath): logger.info("Data for event {} fetched and stored.".format(event)) -class NpmRecordingExtractor: +class NpmRecordingExtractor(BaseRecordingExtractor): def __init__(self, folder_path, num_ch, inputParameters=None): # TODO: make inputParameters mandatory self.folder_path = folder_path self.num_ch = num_ch self.inputParameters = inputParameters - self.events, self.flags = self.import_npm( + self._events, self._flags = self._import_npm( folder_path=folder_path, num_ch=num_ch, inputParameters=inputParameters ) + @property + def events(self) -> list[str]: + return self._events + + @property + def flags(self) -> list: + return self._flags + @classmethod def has_multiple_event_ttls(cls, folder_path): path = sorted(glob.glob(os.path.join(folder_path, "*.csv"))) @@ -96,7 +105,7 @@ def has_multiple_event_ttls(cls, folder_path): return multiple_event_ttls - def import_npm(self, folder_path, num_ch, inputParameters=None): + def _import_npm(self, folder_path, num_ch, inputParameters=None): logger.debug("If it exists, importing NPM file based on the structure of file") # Headless configuration (used to avoid any UI prompts when running tests) @@ -233,7 +242,7 @@ def import_npm(self, folder_path, num_ch, inputParameters=None): else: file = f"file{str(i)}_" ts_unit = npm_time_unit - df = self.update_df_with_timestamp_columns(df, timestamp_column_name=npm_timestamp_column_name) + df = self._update_df_with_timestamp_columns(df, timestamp_column_name=npm_timestamp_column_name) df, indices_dict, _ = self.decide_indices(file, df, flag) keys = list(indices_dict.keys()) for k in range(len(keys)): @@ -467,7 +476,7 @@ def needs_ts_unit(cls, folder_path, num_ch): return ts_unit_needs, col_names_ts - def update_df_with_timestamp_columns(self, df, timestamp_column_name): + def _update_df_with_timestamp_columns(self, df, timestamp_column_name): col_names = np.array(list(df.columns)) col_names_ts = [""] for name in col_names: @@ -484,7 +493,7 @@ def update_df_with_timestamp_columns(self, df, timestamp_column_name): df = df.drop(col_names_ts[1:], axis=1) return df - def read_npm(self, event): + def _read_npm(self, event): logger.debug("\033[1m" + "Trying to read data for {} from csv file.".format(event) + "\033[0m") if not os.path.exists(os.path.join(self.folder_path, event + ".csv")): logger.error("\033[1m" + "No csv file found for event {}".format(event) + 
"\033[0m") @@ -493,7 +502,7 @@ def read_npm(self, event): df = pd.read_csv(os.path.join(self.folder_path, event + ".csv"), index_col=False) return df - def save_to_hdf5(self, df, event, outputPath): + def _save_to_hdf5(self, df, event, outputPath): key = list(df.columns) # TODO: clean up these if branches @@ -524,21 +533,21 @@ def save_to_hdf5(self, df, event, outputPath): ) for i in range(len(key)): - write_hdf5(df[key[i]].dropna(), event, outputPath, key[i].lower()) + self._write_hdf5(data=df[key[i]].dropna(), storename=event, output_path=outputPath, key=key[i].lower()) logger.info("\033[1m" + "Reading data for {} from csv file is completed.".format(event) + "\033[0m") - def read(self, events, outputPath): + def read(self, *, events: list[str], outputPath: str, **kwargs) -> list[dict[str, Any]]: output_dicts = [] for event in events: - df = self.read_npm(event=event) + df = self._read_npm(event=event) S = df.to_dict() S["storename"] = event output_dicts.append(S) return output_dicts - def save(self, output_dicts, outputPath): + def save(self, *, output_dicts: list[dict[str, Any]], outputPath: str, **kwargs) -> None: for S in output_dicts: event = S.pop("storename") df = pd.DataFrame.from_dict(S) - self.save_to_hdf5(df=df, event=event, outputPath=outputPath) + self._save_to_hdf5(df=df, event=event, outputPath=outputPath) From 861e991cbacaf8165d3023b33ada9961e0f14424 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 3 Dec 2025 13:44:03 -0800 Subject: [PATCH 050/125] Refactored doric_recording_extractor to inherit from base_recording_extractor. --- .../extractors/doric_recording_extractor.py | 128 ++++++++++-------- 1 file changed, 72 insertions(+), 56 deletions(-) diff --git a/src/guppy/extractors/doric_recording_extractor.py b/src/guppy/extractors/doric_recording_extractor.py index 2966ec6..51c22ca 100644 --- a/src/guppy/extractors/doric_recording_extractor.py +++ b/src/guppy/extractors/doric_recording_extractor.py @@ -3,23 +3,28 @@ import os import re import warnings +from typing import Any import h5py import numpy as np import pandas as pd -from guppy.common_step3 import write_hdf5 +from guppy.extractors import BaseRecordingExtractor logger = logging.getLogger(__name__) def execute_import_doric(folder_path, storesList, outputPath): + # Parse storesList into events and event_types + events = list(storesList[0, :]) + event_types = {storesList[0, i]: storesList[1, i] for i in range(storesList.shape[1])} + extractor = DoricRecordingExtractor(folder_path=folder_path) - output_dicts = extractor.read(storesList=storesList) + output_dicts = extractor.read(events=events, outputPath=outputPath, event_types=event_types) extractor.save(output_dicts=output_dicts, outputPath=outputPath) -class DoricRecordingExtractor: +class DoricRecordingExtractor(BaseRecordingExtractor): # TODO: consolidate duplicate flag logic between the `__init__` and the `check_doric` method. 
def __init__(self, folder_path): @@ -36,7 +41,7 @@ def __init__(self, folder_path): for i in range(len(path)): ext = os.path.basename(path[i]).split(".")[-1] if ext == "doric": - key_names = self.read_doric(path[i]) + key_names = self._read_doric(path[i]) event_from_filename.extend(key_names) flag = "doric_doric" else: @@ -59,26 +64,34 @@ def __init__(self, folder_path): logger.info(flag) logger.info("Importing of Doric file is done.") - self.events = event_from_filename - self.flags = flag_arr + self._events = event_from_filename + self._flags = flag_arr + + @property + def events(self) -> list[str]: + return self._events + + @property + def flags(self) -> list: + return self._flags - def read_doric(self, filepath): + def _read_doric(self, filepath): with h5py.File(filepath, "r") as f: if "Traces" in list(f.keys()): - keys = self.access_keys_doricV1(f) + keys = self._access_keys_doricV1(f) elif list(f.keys()) == ["Configurations", "DataAcquisition"]: - keys = self.access_keys_doricV6(f) + keys = self._access_keys_doricV6(f) return keys - def access_keys_doricV6(self, doric_file): + def _access_keys_doricV6(self, doric_file): data = [doric_file["DataAcquisition"]] res = [] while len(data) != 0: members = len(data) while members != 0: members -= 1 - data, last_element = self.separate_last_element(data) + data, last_element = self._separate_last_element(data) if isinstance(last_element, h5py.Dataset) and not last_element.name.endswith("/Time"): res.append(last_element.name) elif isinstance(last_element, h5py.Group): @@ -94,17 +107,17 @@ def access_keys_doricV6(self, doric_file): return keys - def access_keys_doricV1(self, doric_file): + def _access_keys_doricV1(self, doric_file): keys = list(doric_file["Traces"]["Console"].keys()) keys.remove("Time(s)") return keys - def separate_last_element(self, arr): + def _separate_last_element(self, arr): l = arr[-1] return arr[:-1], l - def check_doric(self): + def _check_doric(self): logger.debug("Checking if doric file exists") path = glob.glob(os.path.join(self.folder_path, "*.csv")) + glob.glob(os.path.join(self.folder_path, "*.doric")) @@ -135,7 +148,7 @@ def check_doric(self): logger.info("Doric file found.") return flag_arr[0] - def read_doric_csv(self, storesList): + def _read_doric_csv(self, events, event_types): path = glob.glob(os.path.join(self.folder_path, "*.csv")) if len(path) > 1: logger.error("An error occurred : More than one Doric csv file present at the location") @@ -147,45 +160,46 @@ def read_doric_csv(self, storesList): df["Time(s)"] = df["Time(s)"] - df["Time(s)"].to_numpy()[0] output_dicts = [] - for i in range(storesList.shape[1]): - if "control" in storesList[1, i] or "signal" in storesList[1, i]: + for event in events: + event_type = event_types[event] + if "control" in event_type or "signal" in event_type: timestamps = np.array(df["Time(s)"]) sampling_rate = np.array([1 / (timestamps[-1] - timestamps[-2])]) - data = np.array(df[storesList[0, i]]) - storename = storesList[0, i] + data = np.array(df[event]) + storename = event S = {"storename": storename, "sampling_rate": sampling_rate, "timestamps": timestamps, "data": data} output_dicts.append(S) else: - ttl = df[storesList[0, i]] + ttl = df[event] indices = np.where(ttl <= 0)[0] diff_indices = np.where(np.diff(indices) > 1)[0] timestamps = df["Time(s)"][indices[diff_indices] + 1].to_numpy() - storename = storesList[0, i] + storename = event S = {"storename": storename, "timestamps": timestamps} output_dicts.append(S) return output_dicts - def read_doric_doric(self, 
storesList): + def _read_doric_doric(self, events, event_types): path = glob.glob(os.path.join(self.folder_path, "*.doric")) if len(path) > 1: logger.error("An error occurred : More than one Doric file present at the location") raise Exception("More than one Doric file present at the location") with h5py.File(path[0], "r") as f: if "Traces" in list(f.keys()): - output_dicts = self.access_data_doricV1(f, storesList) + output_dicts = self._access_data_doricV1(f, events, event_types) elif list(f.keys()) == ["Configurations", "DataAcquisition"]: - output_dicts = self.access_data_doricV6(f, storesList) + output_dicts = self._access_data_doricV6(f, events, event_types) return output_dicts - def access_data_doricV6(self, doric_file, storesList): + def _access_data_doricV6(self, doric_file, events, event_types): data = [doric_file["DataAcquisition"]] res = [] while len(data) != 0: members = len(data) while members != 0: members -= 1 - data, last_element = self.separate_last_element(data) + data, last_element = self._separate_last_element(data) if isinstance(last_element, h5py.Dataset) and not last_element.name.endswith("/Time"): res.append(last_element.name) elif isinstance(last_element, h5py.Group): @@ -195,16 +209,17 @@ def access_data_doricV6(self, doric_file, storesList): for element in res: sep_values = element.split("/") if sep_values[-1] == "Values": - if f"{sep_values[-3]}/{sep_values[-2]}" in storesList[0, :]: + if f"{sep_values[-3]}/{sep_values[-2]}" in events: decide_path.append(element) else: - if f"{sep_values[-2]}/{sep_values[-1]}" in storesList[0, :]: + if f"{sep_values[-2]}/{sep_values[-1]}" in events: decide_path.append(element) output_dicts = [] - for i in range(storesList.shape[1]): - if "control" in storesList[1, i] or "signal" in storesList[1, i]: - regex = re.compile("(.*?)" + str(storesList[0, i]) + "(.*?)") + for event in events: + event_type = event_types[event] + if "control" in event_type or "signal" in event_type: + regex = re.compile("(.*?)" + str(event) + "(.*?)") idx = [i for i in range(len(decide_path)) if regex.match(decide_path[i])] if len(idx) > 1: logger.error("More than one string matched (which should not be the case)") @@ -213,11 +228,11 @@ def access_data_doricV6(self, doric_file, storesList): data = np.array(doric_file[decide_path[idx]]) timestamps = np.array(doric_file[decide_path[idx].rsplit("/", 1)[0] + "/Time"]) sampling_rate = np.array([1 / (timestamps[-1] - timestamps[-2])]) - storename = storesList[0, i] + storename = event S = {"storename": storename, "sampling_rate": sampling_rate, "timestamps": timestamps, "data": data} output_dicts.append(S) else: - regex = re.compile("(.*?)" + storesList[0, i] + "$") + regex = re.compile("(.*?)" + event + "$") idx = [i for i in range(len(decide_path)) if regex.match(decide_path[i])] if len(idx) > 1: logger.error("More than one string matched (which should not be the case)") @@ -228,56 +243,57 @@ def access_data_doricV6(self, doric_file, storesList): indices = np.where(ttl <= 0)[0] diff_indices = np.where(np.diff(indices) > 1)[0] timestamps = timestamps[indices[diff_indices] + 1] - storename = storesList[0, i] + storename = event S = {"storename": storename, "timestamps": timestamps} output_dicts.append(S) return output_dicts - def access_data_doricV1(self, doric_file, storesList): + def _access_data_doricV1(self, doric_file, events, event_types): keys = list(doric_file["Traces"]["Console"].keys()) output_dicts = [] - for i in range(storesList.shape[1]): - if "control" in storesList[1, i] or "signal" in 
storesList[1, i]: + for event in events: + event_type = event_types[event] + if "control" in event_type or "signal" in event_type: timestamps = np.array(doric_file["Traces"]["Console"]["Time(s)"]["Console_time(s)"]) sampling_rate = np.array([1 / (timestamps[-1] - timestamps[-2])]) - data = np.array(doric_file["Traces"]["Console"][storesList[0, i]][storesList[0, i]]) - storename = storesList[0, i] + data = np.array(doric_file["Traces"]["Console"][event][event]) + storename = event S = {"storename": storename, "sampling_rate": sampling_rate, "timestamps": timestamps, "data": data} output_dicts.append(S) else: timestamps = np.array(doric_file["Traces"]["Console"]["Time(s)"]["Console_time(s)"]) - ttl = np.array(doric_file["Traces"]["Console"][storesList[0, i]][storesList[0, i]]) + ttl = np.array(doric_file["Traces"]["Console"][event][event]) indices = np.where(ttl <= 0)[0] diff_indices = np.where(np.diff(indices) > 1)[0] timestamps = timestamps[indices[diff_indices] + 1] - storename = storesList[0, i] + storename = event S = {"storename": storename, "timestamps": timestamps} output_dicts.append(S) return output_dicts - def save_dict_to_hdf5(self, S, outputPath): - event = S["storename"] - write_hdf5(S["timestamps"], event, outputPath, "timestamps") - - if "sampling_rate" in S: - write_hdf5(S["sampling_rate"], event, outputPath, "sampling_rate") - if "data" in S: - write_hdf5(S["data"], event, outputPath, "data") - - def read(self, storesList): - flag = self.check_doric() + def read(self, *, events: list[str], outputPath: str, **kwargs) -> list[dict[str, Any]]: + event_types = kwargs["event_types"] + flag = self._check_doric() if flag == "doric_csv": - output_dicts = self.read_doric_csv(storesList) + output_dicts = self._read_doric_csv(events, event_types) elif flag == "doric_doric": - output_dicts = self.read_doric_doric(storesList) + output_dicts = self._read_doric_doric(events, event_types) else: logger.error("Doric file not found or not recognized.") raise FileNotFoundError("Doric file not found or not recognized.") return output_dicts - def save(self, output_dicts, outputPath): + def save(self, *, output_dicts: list[dict[str, Any]], outputPath: str, **kwargs) -> None: for S in output_dicts: - self.save_dict_to_hdf5(S=S, outputPath=outputPath) + storename = S["storename"] + self._write_hdf5(data=S["timestamps"], storename=storename, output_path=outputPath, key="timestamps") + + if "sampling_rate" in S: + self._write_hdf5( + data=S["sampling_rate"], storename=storename, output_path=outputPath, key="sampling_rate" + ) + if "data" in S: + self._write_hdf5(data=S["data"], storename=storename, output_path=outputPath, key="data") From dd40cb4aa0629e539322e100c908f05509acdd07 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 3 Dec 2025 14:23:53 -0800 Subject: [PATCH 051/125] Refactored doric_recording_extractor to use class method for events and flags. 
--- .../extractors/doric_recording_extractor.py | 87 ++++++++++++------- src/guppy/saveStoresList.py | 4 +- 2 files changed, 56 insertions(+), 35 deletions(-) diff --git a/src/guppy/extractors/doric_recording_extractor.py b/src/guppy/extractors/doric_recording_extractor.py index 51c22ca..f67e3f1 100644 --- a/src/guppy/extractors/doric_recording_extractor.py +++ b/src/guppy/extractors/doric_recording_extractor.py @@ -15,33 +15,48 @@ def execute_import_doric(folder_path, storesList, outputPath): - # Parse storesList into events and event_types events = list(storesList[0, :]) - event_types = {storesList[0, i]: storesList[1, i] for i in range(storesList.shape[1])} + event_name_to_event_type = {storesList[0, i]: storesList[1, i] for i in range(storesList.shape[1])} - extractor = DoricRecordingExtractor(folder_path=folder_path) - output_dicts = extractor.read(events=events, outputPath=outputPath, event_types=event_types) + extractor = DoricRecordingExtractor(folder_path=folder_path, event_name_to_event_type=event_name_to_event_type) + output_dicts = extractor.read(events=events, outputPath=outputPath) extractor.save(output_dicts=output_dicts, outputPath=outputPath) class DoricRecordingExtractor(BaseRecordingExtractor): # TODO: consolidate duplicate flag logic between the `__init__` and the `check_doric` method. - def __init__(self, folder_path): - self.folder_path = folder_path - logger.debug("If it exists, importing Doric file based on the structure of file") - path = sorted(glob.glob(os.path.join(self.folder_path, "*.csv"))) + sorted( - glob.glob(os.path.join(self.folder_path, "*.doric")) + @classmethod + def discover_events_and_flags(cls, folder_path): + """ + Discover available events and file format flags from Doric files. + + Parameters + ---------- + folder_path : str + Path to the folder containing Doric files + + Returns + ------- + events : list + List of discovered event names + flags : list + List of format flags (e.g., 'doric_csv', 'doric_doric') + """ + logger.debug("Discovering Doric events from file headers") + path = sorted(glob.glob(os.path.join(folder_path, "*.csv"))) + sorted( + glob.glob(os.path.join(folder_path, "*.doric")) ) path = sorted(list(set(path))) flag = "None" event_from_filename = [] flag_arr = [] + for i in range(len(path)): ext = os.path.basename(path[i]).split(".")[-1] if ext == "doric": - key_names = self._read_doric(path[i]) + key_names = cls._read_doric_file(path[i]) event_from_filename.extend(key_names) flag = "doric_doric" else: @@ -62,10 +77,14 @@ def __init__(self, folder_path): event_from_filename.extend(list(df.columns)) flag = "doric_csv" logger.info(flag) - logger.info("Importing of Doric file is done.") - self._events = event_from_filename - self._flags = flag_arr + logger.info("Doric event discovery complete.") + return event_from_filename, flag_arr + + def __init__(self, folder_path, event_name_to_event_type): + self.folder_path = folder_path + self._event_name_to_event_type = event_name_to_event_type + self._events, self._flags = self.discover_events_and_flags(folder_path) @property def events(self) -> list[str]: @@ -75,23 +94,26 @@ def events(self) -> list[str]: def flags(self) -> list: return self._flags - def _read_doric(self, filepath): + @staticmethod + def _read_doric_file(filepath): + """Static helper to read Doric file headers for event discovery.""" with h5py.File(filepath, "r") as f: if "Traces" in list(f.keys()): - keys = self._access_keys_doricV1(f) + keys = DoricRecordingExtractor._access_keys_doricV1(f) elif list(f.keys()) == 
["Configurations", "DataAcquisition"]: - keys = self._access_keys_doricV6(f) + keys = DoricRecordingExtractor._access_keys_doricV6(f) return keys - def _access_keys_doricV6(self, doric_file): + @staticmethod + def _access_keys_doricV6(doric_file): data = [doric_file["DataAcquisition"]] res = [] while len(data) != 0: members = len(data) while members != 0: members -= 1 - data, last_element = self._separate_last_element(data) + data, last_element = DoricRecordingExtractor._separate_last_element(data) if isinstance(last_element, h5py.Dataset) and not last_element.name.endswith("/Time"): res.append(last_element.name) elif isinstance(last_element, h5py.Group): @@ -107,13 +129,15 @@ def _access_keys_doricV6(self, doric_file): return keys - def _access_keys_doricV1(self, doric_file): + @staticmethod + def _access_keys_doricV1(doric_file): keys = list(doric_file["Traces"]["Console"].keys()) keys.remove("Time(s)") return keys - def _separate_last_element(self, arr): + @staticmethod + def _separate_last_element(arr): l = arr[-1] return arr[:-1], l @@ -148,7 +172,7 @@ def _check_doric(self): logger.info("Doric file found.") return flag_arr[0] - def _read_doric_csv(self, events, event_types): + def _read_doric_csv(self, events): path = glob.glob(os.path.join(self.folder_path, "*.csv")) if len(path) > 1: logger.error("An error occurred : More than one Doric csv file present at the location") @@ -161,7 +185,7 @@ def _read_doric_csv(self, events, event_types): output_dicts = [] for event in events: - event_type = event_types[event] + event_type = self._event_name_to_event_type[event] if "control" in event_type or "signal" in event_type: timestamps = np.array(df["Time(s)"]) sampling_rate = np.array([1 / (timestamps[-1] - timestamps[-2])]) @@ -180,19 +204,19 @@ def _read_doric_csv(self, events, event_types): return output_dicts - def _read_doric_doric(self, events, event_types): + def _read_doric_doric(self, events): path = glob.glob(os.path.join(self.folder_path, "*.doric")) if len(path) > 1: logger.error("An error occurred : More than one Doric file present at the location") raise Exception("More than one Doric file present at the location") with h5py.File(path[0], "r") as f: if "Traces" in list(f.keys()): - output_dicts = self._access_data_doricV1(f, events, event_types) + output_dicts = self._access_data_doricV1(f, events) elif list(f.keys()) == ["Configurations", "DataAcquisition"]: - output_dicts = self._access_data_doricV6(f, events, event_types) + output_dicts = self._access_data_doricV6(f, events) return output_dicts - def _access_data_doricV6(self, doric_file, events, event_types): + def _access_data_doricV6(self, doric_file, events): data = [doric_file["DataAcquisition"]] res = [] while len(data) != 0: @@ -217,7 +241,7 @@ def _access_data_doricV6(self, doric_file, events, event_types): output_dicts = [] for event in events: - event_type = event_types[event] + event_type = self._event_name_to_event_type[event] if "control" in event_type or "signal" in event_type: regex = re.compile("(.*?)" + str(event) + "(.*?)") idx = [i for i in range(len(decide_path)) if regex.match(decide_path[i])] @@ -249,11 +273,11 @@ def _access_data_doricV6(self, doric_file, events, event_types): return output_dicts - def _access_data_doricV1(self, doric_file, events, event_types): + def _access_data_doricV1(self, doric_file, events): keys = list(doric_file["Traces"]["Console"].keys()) output_dicts = [] for event in events: - event_type = event_types[event] + event_type = self._event_name_to_event_type[event] if "control" 
in event_type or "signal" in event_type: timestamps = np.array(doric_file["Traces"]["Console"]["Time(s)"]["Console_time(s)"]) sampling_rate = np.array([1 / (timestamps[-1] - timestamps[-2])]) @@ -274,12 +298,11 @@ def _access_data_doricV1(self, doric_file, events, event_types): return output_dicts def read(self, *, events: list[str], outputPath: str, **kwargs) -> list[dict[str, Any]]: - event_types = kwargs["event_types"] flag = self._check_doric() if flag == "doric_csv": - output_dicts = self._read_doric_csv(events, event_types) + output_dicts = self._read_doric_csv(events) elif flag == "doric_doric": - output_dicts = self._read_doric_doric(events, event_types) + output_dicts = self._read_doric_doric(events) else: logger.error("Doric file not found or not recognized.") raise FileNotFoundError("Doric file not found or not recognized.") diff --git a/src/guppy/saveStoresList.py b/src/guppy/saveStoresList.py index 318bc5f..acc62f4 100755 --- a/src/guppy/saveStoresList.py +++ b/src/guppy/saveStoresList.py @@ -584,9 +584,7 @@ def execute(inputParameters): flags = extractor.flags elif modality == "doric": - extractor = DoricRecordingExtractor(folder_path=folder_path) - events = extractor.events - flags = extractor.flags + events, flags = DoricRecordingExtractor.discover_events_and_flags(folder_path=folder_path) elif modality == "npm": headless = bool(os.environ.get("GUPPY_BASE_DIR")) From 4619964733e040b64f375254dce8d6dde99d94d1 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 3 Dec 2025 16:23:19 -0800 Subject: [PATCH 052/125] Refactored Extractors to use class method discover_events and flags instead of properties. --- .../extractors/base_recording_extractor.py | 25 +-- .../extractors/csv_recording_extractor.py | 40 +++-- .../extractors/npm_recording_extractor.py | 166 +++++++++--------- .../extractors/tdt_recording_extractor.py | 47 +++-- src/guppy/readTevTsq.py | 2 +- src/guppy/saveStoresList.py | 12 +- 6 files changed, 151 insertions(+), 141 deletions(-) diff --git a/src/guppy/extractors/base_recording_extractor.py b/src/guppy/extractors/base_recording_extractor.py index 7058a0a..76d4f3c 100644 --- a/src/guppy/extractors/base_recording_extractor.py +++ b/src/guppy/extractors/base_recording_extractor.py @@ -16,29 +16,18 @@ class BaseRecordingExtractor(ABC): data from various acquisition formats (TDT, Doric, CSV, NPM, etc.). """ - @property + @classmethod @abstractmethod - def events(self) -> list[str]: + def discover_events_and_flags(cls) -> tuple[list[str], list[str]]: """ - List of available event/store names in the data. + Discover available events and format flags from data files. Returns ------- - list of str - Names of all events or stores available in the dataset. - """ - pass - - @property - @abstractmethod - def flags(self) -> list: - """ - Format indicators or file type flags. - - Returns - ------- - list - Flags indicating file types or data formats. + events : list of str + Names of all events/stores available in the dataset. + flags : list of str + Format indicators or file type flags. 
""" pass diff --git a/src/guppy/extractors/csv_recording_extractor.py b/src/guppy/extractors/csv_recording_extractor.py index 792ad01..41ee7ab 100644 --- a/src/guppy/extractors/csv_recording_extractor.py +++ b/src/guppy/extractors/csv_recording_extractor.py @@ -32,11 +32,25 @@ def read_and_save_csv(extractor, event, outputPath): class CsvRecordingExtractor(BaseRecordingExtractor): - def __init__(self, folder_path): - self.folder_path = folder_path - + @classmethod + def discover_events_and_flags(cls, folder_path) -> tuple[list[str], list[str]]: + """ + Discover available events and format flags from CSV files. + + Parameters + ---------- + folder_path : str + Path to the folder containing CSV files. + + Returns + ------- + events : list of str + Names of all events/stores available in the dataset. + flags : list of str + Format indicators or file type flags. + """ logger.debug("If it exists, importing either NPM or Doric or csv file based on the structure of file") - path = sorted(glob.glob(os.path.join(self.folder_path, "*.csv"))) + path = sorted(glob.glob(os.path.join(folder_path, "*.csv"))) path = sorted(list(set(path))) flag = "None" @@ -59,7 +73,7 @@ def __init__(self, folder_path): ), "This file appears to be doric .csv. This function only supports standard .csv files." df = pd.read_csv(path[i], index_col=False) - _, value = self._check_header(df) + _, value = cls._check_header(df) # check dataframe structure and read data accordingly if len(value) > 0: @@ -121,19 +135,13 @@ def __init__(self, folder_path): event_from_filename.append(name) logger.info("Importing of csv file is done.") + return event_from_filename, flag_arr - self._events = event_from_filename - self._flags = flag_arr - - @property - def events(self) -> list[str]: - return self._events - - @property - def flags(self) -> list: - return self._flags + def __init__(self, folder_path): + self.folder_path = folder_path - def _check_header(self, df): + @staticmethod + def _check_header(df): arr = list(df.columns) check_float = [] for i in arr: diff --git a/src/guppy/extractors/npm_recording_extractor.py b/src/guppy/extractors/npm_recording_extractor.py index 6d9b26a..110ba56 100644 --- a/src/guppy/extractors/npm_recording_extractor.py +++ b/src/guppy/extractors/npm_recording_extractor.py @@ -17,10 +17,10 @@ logger = logging.getLogger(__name__) -def execute_import_npm(folder_path, num_ch, inputParameters, events, outputPath, numProcesses=mp.cpu_count()): +def execute_import_npm(folder_path, events, outputPath, numProcesses=mp.cpu_count()): logger.info("Reading data for event {} ...".format(events)) - extractor = NpmRecordingExtractor(folder_path=folder_path, num_ch=num_ch, inputParameters=inputParameters) + extractor = NpmRecordingExtractor(folder_path=folder_path) start = time.time() with mp.Pool(numProcesses) as p: p.starmap(read_and_save_npm, zip(repeat(extractor), events, repeat(outputPath))) @@ -35,81 +35,29 @@ def read_and_save_npm(extractor, event, outputPath): class NpmRecordingExtractor(BaseRecordingExtractor): - def __init__(self, folder_path, num_ch, inputParameters=None): # TODO: make inputParameters mandatory - self.folder_path = folder_path - self.num_ch = num_ch - self.inputParameters = inputParameters - self._events, self._flags = self._import_npm( - folder_path=folder_path, num_ch=num_ch, inputParameters=inputParameters - ) - - @property - def events(self) -> list[str]: - return self._events - - @property - def flags(self) -> list: - return self._flags - + # TODO: make inputParameters mandatory @classmethod - 
def has_multiple_event_ttls(cls, folder_path): - path = sorted(glob.glob(os.path.join(folder_path, "*.csv"))) - path_chev = glob.glob(os.path.join(folder_path, "*chev*")) - path_chod = glob.glob(os.path.join(folder_path, "*chod*")) - path_chpr = glob.glob(os.path.join(folder_path, "*chpr*")) - path_event = glob.glob(os.path.join(folder_path, "event*")) - path_chev_chod_event = path_chev + path_chod + path_event + path_chpr - - path = sorted(list(set(path) - set(path_chev_chod_event))) - multiple_event_ttls = [] - for i in range(len(path)): - df = pd.read_csv(path[i], index_col=False) - _, value = cls.check_header(df) - - # check dataframe structure and read data accordingly - if len(value) > 0: - columns_isstr = False - df = pd.read_csv(path[i], header=None) - cols = np.array(list(df.columns), dtype=str) - else: - columns_isstr = True - cols = np.array(list(df.columns), dtype=str) - if len(cols) == 2: - flag = "event_or_data_np" - elif len(cols) > 2: - flag = "data_np" - else: - logger.error("Number of columns in csv file does not make sense.") - raise Exception("Number of columns in csv file does not make sense.") - - # used assigned flags to process the files and read the data - if flag == "event_or_data_np": - arr = list(df.iloc[:, 1]) - check_float = [True for i in arr if isinstance(i, float)] - if len(arr) == len(check_float) and columns_isstr == False: - flag = "data_np" - elif columns_isstr == True and ("value" in np.char.lower(np.array(cols))): - flag = "event_np" - else: - flag = "event_np" - - if flag == "event_np": - type_val = np.array(df.iloc[:, 1]) - type_val_unique = np.unique(type_val) - if len(type_val_unique) > 1: - multiple_event_ttls.append(True) - else: - multiple_event_ttls.append(False) - else: - multiple_event_ttls.append(False) - - return multiple_event_ttls - - def _import_npm(self, folder_path, num_ch, inputParameters=None): - + def discover_events_and_flags(cls, folder_path, num_ch, inputParameters=None) -> tuple[list[str], list[str]]: + """ + Discover available events and format flags from NPM files. + + Parameters + ---------- + folder_path : str + Path to the folder containing NPM files. + num_ch : int + Number of channels in the recording. + inputParameters : dict, optional + Input parameters containing NPM-specific configuration. + + Returns + ------- + events : list of str + Names of all events/stores available in the dataset. + flags : list of str + Format indicators or file type flags. + """ logger.debug("If it exists, importing NPM file based on the structure of file") - # Headless configuration (used to avoid any UI prompts when running tests) - headless = bool(os.environ.get("GUPPY_BASE_DIR")) if isinstance(inputParameters, dict): npm_timestamp_column_names = inputParameters.get("npm_timestamp_column_names") npm_time_units = inputParameters.get("npm_time_units") @@ -160,7 +108,7 @@ def _import_npm(self, folder_path, num_ch, inputParameters=None): df_arr ), "This file appears to be doric .csv. This function only supports NPM .csv files." 
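
A simplified stand-in (synthetic csv text) for the check_header call made just below: if the column labels themselves parse as numbers, the file had no real header row, so the caller re-reads it with header=None.

import pandas as pd
from io import StringIO

def numeric_column_labels(df):                 # simplified stand-in for cls.check_header
    hits = []
    for label in df.columns:
        try:
            hits.append(float(label))
        except ValueError:
            pass
    return hits

headered = pd.read_csv(StringIO("Timestamp,Region1G\n0.0,1.0\n0.1,1.1\n"))
headerless = pd.read_csv(StringIO("0.0,1.0\n0.1,1.1\n"))
numeric_column_labels(headered)    # [] -> real header, column names kept
numeric_column_labels(headerless)  # [0.0, 1.0] -> no header; re-read with header=None
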
df = pd.read_csv(path[i], index_col=False) - _, value = self.check_header(df) + _, value = cls.check_header(df) # check dataframe structure and read data accordingly if len(value) > 0: @@ -204,7 +152,7 @@ def _import_npm(self, folder_path, num_ch, inputParameters=None): logger.info(flag) if flag == "data_np": file = f"file{str(i)}_" - df, indices_dict, _ = self.decide_indices(file, df, flag, num_ch) + df, indices_dict, _ = cls.decide_indices(file, df, flag, num_ch) keys = list(indices_dict.keys()) for k in range(len(keys)): for j in range(df.shape[1]): @@ -242,8 +190,8 @@ def _import_npm(self, folder_path, num_ch, inputParameters=None): else: file = f"file{str(i)}_" ts_unit = npm_time_unit - df = self._update_df_with_timestamp_columns(df, timestamp_column_name=npm_timestamp_column_name) - df, indices_dict, _ = self.decide_indices(file, df, flag) + df = cls._update_df_with_timestamp_columns(df, timestamp_column_name=npm_timestamp_column_name) + df, indices_dict, _ = cls.decide_indices(file, df, flag) keys = list(indices_dict.keys()) for k in range(len(keys)): for j in range(df.shape[1]): @@ -326,6 +274,63 @@ def _import_npm(self, folder_path, num_ch, inputParameters=None): logger.info("Importing of NPM file is done.") return event_from_filename, flag_arr + def __init__(self, folder_path): + self.folder_path = folder_path + + @classmethod + def has_multiple_event_ttls(cls, folder_path): + path = sorted(glob.glob(os.path.join(folder_path, "*.csv"))) + path_chev = glob.glob(os.path.join(folder_path, "*chev*")) + path_chod = glob.glob(os.path.join(folder_path, "*chod*")) + path_chpr = glob.glob(os.path.join(folder_path, "*chpr*")) + path_event = glob.glob(os.path.join(folder_path, "event*")) + path_chev_chod_event = path_chev + path_chod + path_event + path_chpr + + path = sorted(list(set(path) - set(path_chev_chod_event))) + multiple_event_ttls = [] + for i in range(len(path)): + df = pd.read_csv(path[i], index_col=False) + _, value = cls.check_header(df) + + # check dataframe structure and read data accordingly + if len(value) > 0: + columns_isstr = False + df = pd.read_csv(path[i], header=None) + cols = np.array(list(df.columns), dtype=str) + else: + columns_isstr = True + cols = np.array(list(df.columns), dtype=str) + if len(cols) == 2: + flag = "event_or_data_np" + elif len(cols) > 2: + flag = "data_np" + else: + logger.error("Number of columns in csv file does not make sense.") + raise Exception("Number of columns in csv file does not make sense.") + + # used assigned flags to process the files and read the data + if flag == "event_or_data_np": + arr = list(df.iloc[:, 1]) + check_float = [True for i in arr if isinstance(i, float)] + if len(arr) == len(check_float) and columns_isstr == False: + flag = "data_np" + elif columns_isstr == True and ("value" in np.char.lower(np.array(cols))): + flag = "event_np" + else: + flag = "event_np" + + if flag == "event_np": + type_val = np.array(df.iloc[:, 1]) + type_val_unique = np.unique(type_val) + if len(type_val_unique) > 1: + multiple_event_ttls.append(True) + else: + multiple_event_ttls.append(False) + else: + multiple_event_ttls.append(False) + + return multiple_event_ttls + @classmethod def check_header(cls, df): arr = list(df.columns) @@ -476,7 +481,8 @@ def needs_ts_unit(cls, folder_path, num_ch): return ts_unit_needs, col_names_ts - def _update_df_with_timestamp_columns(self, df, timestamp_column_name): + @staticmethod + def _update_df_with_timestamp_columns(df, timestamp_column_name): col_names = np.array(list(df.columns)) col_names_ts = 
[""] for name in col_names: diff --git a/src/guppy/extractors/tdt_recording_extractor.py b/src/guppy/extractors/tdt_recording_extractor.py index 6e712fb..949c9ec 100644 --- a/src/guppy/extractors/tdt_recording_extractor.py +++ b/src/guppy/extractors/tdt_recording_extractor.py @@ -30,35 +30,48 @@ def execute_readtev(folder_path, events, outputPath, numProcesses=mp.cpu_count() class TdtRecordingExtractor(BaseRecordingExtractor): - def __init__(self, folder_path): - self.folder_path = folder_path - self._header_df, _ = self._readtsq(folder_path) + @classmethod + def discover_events_and_flags(cls, folder_path) -> tuple[list[str], list[str]]: + """ + Discover available events and format flags from TDT files. + + Parameters + ---------- + folder_path : str + Path to the folder containing TDT files. + + Returns + ------- + events : list of str + Names of all events/stores available in the dataset. + flags : list of str + Format indicators or file type flags. + """ + header_df, _ = cls._readtsq(folder_path) # Populate events from header_df - if isinstance(self._header_df, pd.DataFrame): - self._header_df["name"] = np.asarray(self._header_df["name"], dtype=str) - allnames = np.unique(self._header_df["name"]) + if isinstance(header_df, pd.DataFrame): + header_df["name"] = np.asarray(header_df["name"], dtype=str) + allnames = np.unique(header_df["name"]) index = [] for i in range(len(allnames)): length = len(str(allnames[i])) if length < 4: index.append(i) allnames = np.delete(allnames, index, 0) - self._events = list(allnames) + events = list(allnames) else: - self._events = [] - - self._flags = [] + events = [] - @property - def events(self) -> list[str]: - return self._events + flags = [] + return events, flags - @property - def flags(self) -> list: - return self._flags + def __init__(self, folder_path): + self.folder_path = folder_path + self._header_df, _ = self._readtsq(folder_path) - def _readtsq(self, folder_path): + @staticmethod + def _readtsq(folder_path): logger.debug("Trying to read tsq file.") names = ("size", "type", "name", "chan", "sort_code", "timestamp", "fp_loc", "strobe", "format", "frequency") formats = (int32, int32, "S4", uint16, uint16, float64, int64, float64, int32, float32) diff --git a/src/guppy/readTevTsq.py b/src/guppy/readTevTsq.py index f2c9419..2ae0c59 100755 --- a/src/guppy/readTevTsq.py +++ b/src/guppy/readTevTsq.py @@ -80,7 +80,7 @@ def readRawData(inputParameters): elif modality == "csv": execute_import_csv(filepath, events, op, numProcesses) elif modality == "npm": - execute_import_npm(filepath, num_ch, inputParameters, events, op, numProcesses) + execute_import_npm(filepath, events, op, numProcesses) else: raise ValueError("Modality not recognized. 
Please use 'tdt', 'csv', 'doric', or 'npm'.") diff --git a/src/guppy/saveStoresList.py b/src/guppy/saveStoresList.py index acc62f4..20a5c94 100755 --- a/src/guppy/saveStoresList.py +++ b/src/guppy/saveStoresList.py @@ -575,13 +575,9 @@ def execute(inputParameters): for i in folderNames: folder_path = os.path.join(inputParameters["abspath"], i) if modality == "tdt": - extractor = TdtRecordingExtractor(folder_path=folder_path) - events = extractor.events - flags = extractor.flags + events, flags = TdtRecordingExtractor.discover_events_and_flags(folder_path=folder_path) elif modality == "csv": - extractor = CsvRecordingExtractor(folder_path=folder_path) - events = extractor.events - flags = extractor.flags + events, flags = CsvRecordingExtractor.discover_events_and_flags(folder_path=folder_path) elif modality == "doric": events, flags = DoricRecordingExtractor.discover_events_and_flags(folder_path=folder_path) @@ -604,11 +600,9 @@ def execute(inputParameters): npm_timestamp_column_names if npm_timestamp_column_names else None ) - extractor = NpmRecordingExtractor( + events, flags = NpmRecordingExtractor.discover_events_and_flags( folder_path=folder_path, num_ch=num_ch, inputParameters=inputParameters ) - events = extractor.events - flags = extractor.flags else: raise ValueError("Modality not recognized. Please use 'tdt', 'csv', 'doric', or 'npm'.") From beb585fab80a38d728f354364d91a7875680db0b Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 3 Dec 2025 16:34:09 -0800 Subject: [PATCH 053/125] Refactored Extractors to use class method discover_events and flags instead of properties. --- src/guppy/extractors/doric_recording_extractor.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/src/guppy/extractors/doric_recording_extractor.py b/src/guppy/extractors/doric_recording_extractor.py index f67e3f1..dd0ecdd 100644 --- a/src/guppy/extractors/doric_recording_extractor.py +++ b/src/guppy/extractors/doric_recording_extractor.py @@ -84,15 +84,6 @@ def discover_events_and_flags(cls, folder_path): def __init__(self, folder_path, event_name_to_event_type): self.folder_path = folder_path self._event_name_to_event_type = event_name_to_event_type - self._events, self._flags = self.discover_events_and_flags(folder_path) - - @property - def events(self) -> list[str]: - return self._events - - @property - def flags(self) -> list: - return self._flags @staticmethod def _read_doric_file(filepath): From 1b5e8ca6b4ea454636978e295eab8ca70a38027e Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 3 Dec 2025 16:42:36 -0800 Subject: [PATCH 054/125] Added comment about discover_events_and_flags signature --- src/guppy/extractors/base_recording_extractor.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/guppy/extractors/base_recording_extractor.py b/src/guppy/extractors/base_recording_extractor.py index 76d4f3c..c71297b 100644 --- a/src/guppy/extractors/base_recording_extractor.py +++ b/src/guppy/extractors/base_recording_extractor.py @@ -29,6 +29,11 @@ def discover_events_and_flags(cls) -> tuple[list[str], list[str]]: flags : list of str Format indicators or file type flags. """ + # NOTE: This method signature is intentionally minimal and flexible. + # Different formats have different discovery requirements: + # - TDT/CSV/Doric: need only folder_path parameter + # - NPM: needs folder_path, num_ch, and optional inputParameters for interleaved channels + # Each child class defines its own signature with the parameters it needs. 
pass @abstractmethod From 2e38ee8afd6b5c061c0f4618ba606619b1ce142c Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 3 Dec 2025 16:46:28 -0800 Subject: [PATCH 055/125] Removed unused quarks. --- src/guppy/extractors/base_recording_extractor.py | 8 ++------ src/guppy/extractors/csv_recording_extractor.py | 4 ++-- src/guppy/extractors/doric_recording_extractor.py | 4 ++-- src/guppy/extractors/npm_recording_extractor.py | 4 ++-- src/guppy/extractors/tdt_recording_extractor.py | 4 ++-- 5 files changed, 10 insertions(+), 14 deletions(-) diff --git a/src/guppy/extractors/base_recording_extractor.py b/src/guppy/extractors/base_recording_extractor.py index c71297b..839c3db 100644 --- a/src/guppy/extractors/base_recording_extractor.py +++ b/src/guppy/extractors/base_recording_extractor.py @@ -37,7 +37,7 @@ def discover_events_and_flags(cls) -> tuple[list[str], list[str]]: pass @abstractmethod - def read(self, *, events: list[str], outputPath: str, **kwargs) -> list[dict[str, Any]]: + def read(self, *, events: list[str], outputPath: str) -> list[dict[str, Any]]: """ Read data from source files for specified events. @@ -47,8 +47,6 @@ def read(self, *, events: list[str], outputPath: str, **kwargs) -> list[dict[str List of event/store names to extract from the data. outputPath : str Path to the output directory. - **kwargs - Additional extractor-specific parameters. Returns ------- @@ -60,7 +58,7 @@ def read(self, *, events: list[str], outputPath: str, **kwargs) -> list[dict[str pass @abstractmethod - def save(self, *, output_dicts: list[dict[str, Any]], outputPath: str, **kwargs) -> None: + def save(self, *, output_dicts: list[dict[str, Any]], outputPath: str) -> None: """ Save extracted data dictionaries to HDF5 format. @@ -70,8 +68,6 @@ def save(self, *, output_dicts: list[dict[str, Any]], outputPath: str, **kwargs) List of data dictionaries from read(). outputPath : str Path to the output directory. - **kwargs - Additional extractor-specific parameters. 
""" pass diff --git a/src/guppy/extractors/csv_recording_extractor.py b/src/guppy/extractors/csv_recording_extractor.py index 41ee7ab..d74cfde 100644 --- a/src/guppy/extractors/csv_recording_extractor.py +++ b/src/guppy/extractors/csv_recording_extractor.py @@ -196,7 +196,7 @@ def _save_to_hdf5(self, df, event, outputPath): logger.info("\033[1m" + "Reading data for {} from csv file is completed.".format(event) + "\033[0m") - def read(self, *, events: list[str], outputPath: str, **kwargs) -> list[dict[str, Any]]: + def read(self, *, events: list[str], outputPath: str) -> list[dict[str, Any]]: output_dicts = [] for event in events: df = self._read_csv(event=event) @@ -205,7 +205,7 @@ def read(self, *, events: list[str], outputPath: str, **kwargs) -> list[dict[str output_dicts.append(S) return output_dicts - def save(self, *, output_dicts: list[dict[str, Any]], outputPath: str, **kwargs) -> None: + def save(self, *, output_dicts: list[dict[str, Any]], outputPath: str) -> None: for S in output_dicts: event = S.pop("storename") df = pd.DataFrame.from_dict(S) diff --git a/src/guppy/extractors/doric_recording_extractor.py b/src/guppy/extractors/doric_recording_extractor.py index dd0ecdd..62a8586 100644 --- a/src/guppy/extractors/doric_recording_extractor.py +++ b/src/guppy/extractors/doric_recording_extractor.py @@ -288,7 +288,7 @@ def _access_data_doricV1(self, doric_file, events): return output_dicts - def read(self, *, events: list[str], outputPath: str, **kwargs) -> list[dict[str, Any]]: + def read(self, *, events: list[str], outputPath: str) -> list[dict[str, Any]]: flag = self._check_doric() if flag == "doric_csv": output_dicts = self._read_doric_csv(events) @@ -300,7 +300,7 @@ def read(self, *, events: list[str], outputPath: str, **kwargs) -> list[dict[str return output_dicts - def save(self, *, output_dicts: list[dict[str, Any]], outputPath: str, **kwargs) -> None: + def save(self, *, output_dicts: list[dict[str, Any]], outputPath: str) -> None: for S in output_dicts: storename = S["storename"] self._write_hdf5(data=S["timestamps"], storename=storename, output_path=outputPath, key="timestamps") diff --git a/src/guppy/extractors/npm_recording_extractor.py b/src/guppy/extractors/npm_recording_extractor.py index 110ba56..e3042c2 100644 --- a/src/guppy/extractors/npm_recording_extractor.py +++ b/src/guppy/extractors/npm_recording_extractor.py @@ -543,7 +543,7 @@ def _save_to_hdf5(self, df, event, outputPath): logger.info("\033[1m" + "Reading data for {} from csv file is completed.".format(event) + "\033[0m") - def read(self, *, events: list[str], outputPath: str, **kwargs) -> list[dict[str, Any]]: + def read(self, *, events: list[str], outputPath: str) -> list[dict[str, Any]]: output_dicts = [] for event in events: df = self._read_npm(event=event) @@ -552,7 +552,7 @@ def read(self, *, events: list[str], outputPath: str, **kwargs) -> list[dict[str output_dicts.append(S) return output_dicts - def save(self, *, output_dicts: list[dict[str, Any]], outputPath: str, **kwargs) -> None: + def save(self, *, output_dicts: list[dict[str, Any]], outputPath: str) -> None: for S in output_dicts: event = S.pop("storename") df = pd.DataFrame.from_dict(S) diff --git a/src/guppy/extractors/tdt_recording_extractor.py b/src/guppy/extractors/tdt_recording_extractor.py index 949c9ec..a877a8b 100644 --- a/src/guppy/extractors/tdt_recording_extractor.py +++ b/src/guppy/extractors/tdt_recording_extractor.py @@ -178,7 +178,7 @@ def _readtev(self, event): return S - def read(self, *, events: list[str], outputPath: str, 
**kwargs) -> list[dict[str, Any]]: + def read(self, *, events: list[str], outputPath: str) -> list[dict[str, Any]]: output_dicts = [] for event in events: S = self._readtev(event=event) @@ -259,6 +259,6 @@ def _save_dict_to_hdf5(self, S, outputPath): self._write_hdf5(S["npoints"], event, outputPath, "npoints") self._write_hdf5(S["channels"], event, outputPath, "channels") - def save(self, *, output_dicts: list[dict[str, Any]], outputPath: str, **kwargs) -> None: + def save(self, *, output_dicts: list[dict[str, Any]], outputPath: str) -> None: for S in output_dicts: self._save_dict_to_hdf5(S=S, outputPath=outputPath) From cdecf428d97d4db8e36dbc9cd44510c7d529016f Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 3 Dec 2025 16:51:26 -0800 Subject: [PATCH 056/125] Refactored NpmRecordingExtractor to inherit from CsvRecordingExtractor. --- .../extractors/npm_recording_extractor.py | 87 ++----------------- 1 file changed, 7 insertions(+), 80 deletions(-) diff --git a/src/guppy/extractors/npm_recording_extractor.py b/src/guppy/extractors/npm_recording_extractor.py index e3042c2..68d13f7 100644 --- a/src/guppy/extractors/npm_recording_extractor.py +++ b/src/guppy/extractors/npm_recording_extractor.py @@ -4,13 +4,12 @@ import os import time from itertools import repeat -from typing import Any import numpy as np import pandas as pd import panel as pn -from guppy.extractors import BaseRecordingExtractor +from guppy.extractors import CsvRecordingExtractor pn.extension() @@ -33,7 +32,9 @@ def read_and_save_npm(extractor, event, outputPath): logger.info("Data for event {} fetched and stored.".format(event)) -class NpmRecordingExtractor(BaseRecordingExtractor): +class NpmRecordingExtractor(CsvRecordingExtractor): + # Inherits from CsvRecordingExtractor to reuse identical read/save logic. + # Only overrides discover_events_and_flags() and adds NPM-specific helper methods. # TODO: make inputParameters mandatory @classmethod @@ -108,7 +109,7 @@ def discover_events_and_flags(cls, folder_path, num_ch, inputParameters=None) -> df_arr ), "This file appears to be doric .csv. This function only supports NPM .csv files." 
df = pd.read_csv(path[i], index_col=False) - _, value = cls.check_header(df) + _, value = cls._check_header(df) # check dataframe structure and read data accordingly if len(value) > 0: @@ -274,9 +275,6 @@ def discover_events_and_flags(cls, folder_path, num_ch, inputParameters=None) -> logger.info("Importing of NPM file is done.") return event_from_filename, flag_arr - def __init__(self, folder_path): - self.folder_path = folder_path - @classmethod def has_multiple_event_ttls(cls, folder_path): path = sorted(glob.glob(os.path.join(folder_path, "*.csv"))) @@ -290,7 +288,7 @@ def has_multiple_event_ttls(cls, folder_path): multiple_event_ttls = [] for i in range(len(path)): df = pd.read_csv(path[i], index_col=False) - _, value = cls.check_header(df) + _, value = cls._check_header(df) # check dataframe structure and read data accordingly if len(value) > 0: @@ -331,18 +329,6 @@ def has_multiple_event_ttls(cls, folder_path): return multiple_event_ttls - @classmethod - def check_header(cls, df): - arr = list(df.columns) - check_float = [] - for i in arr: - try: - check_float.append(float(i)) - except: - pass - - return arr, check_float - # function to decide indices of interleaved channels # in neurophotometrics data @classmethod @@ -426,7 +412,7 @@ def needs_ts_unit(cls, folder_path, num_ch): col_names_ts = [""] for i in range(len(path)): df = pd.read_csv(path[i], index_col=False) - _, value = cls.check_header(df) + _, value = cls._check_header(df) # check dataframe structure and read data accordingly if len(value) > 0: @@ -498,62 +484,3 @@ def _update_df_with_timestamp_columns(df, timestamp_column_name): df.insert(1, "Timestamp", df[timestamp_column_name]) df = df.drop(col_names_ts[1:], axis=1) return df - - def _read_npm(self, event): - logger.debug("\033[1m" + "Trying to read data for {} from csv file.".format(event) + "\033[0m") - if not os.path.exists(os.path.join(self.folder_path, event + ".csv")): - logger.error("\033[1m" + "No csv file found for event {}".format(event) + "\033[0m") - raise Exception("\033[1m" + "No csv file found for event {}".format(event) + "\033[0m") - - df = pd.read_csv(os.path.join(self.folder_path, event + ".csv"), index_col=False) - return df - - def _save_to_hdf5(self, df, event, outputPath): - key = list(df.columns) - - # TODO: clean up these if branches - if len(key) == 3: - arr1 = np.array(["timestamps", "data", "sampling_rate"]) - arr2 = np.char.lower(np.array(key)) - if (np.sort(arr1) == np.sort(arr2)).all() == False: - logger.error("\033[1m" + "Column names should be timestamps, data and sampling_rate" + "\033[0m") - raise Exception("\033[1m" + "Column names should be timestamps, data and sampling_rate" + "\033[0m") - - if len(key) == 1: - if key[0].lower() != "timestamps": - logger.error("\033[1m" + "Column names should be timestamps, data and sampling_rate" + "\033[0m") - raise Exception("\033[1m" + "Column name should be timestamps" + "\033[0m") - - if len(key) != 3 and len(key) != 1: - logger.error( - "\033[1m" - + "Number of columns in csv file should be either three or one. Three columns if \ - the file is for control or signal data or one column if the file is for event TTLs." - + "\033[0m" - ) - raise Exception( - "\033[1m" - + "Number of columns in csv file should be either three or one. Three columns if \ - the file is for control or signal data or one column if the file is for event TTLs." 
- + "\033[0m" - ) - - for i in range(len(key)): - self._write_hdf5(data=df[key[i]].dropna(), storename=event, output_path=outputPath, key=key[i].lower()) - - logger.info("\033[1m" + "Reading data for {} from csv file is completed.".format(event) + "\033[0m") - - def read(self, *, events: list[str], outputPath: str) -> list[dict[str, Any]]: - output_dicts = [] - for event in events: - df = self._read_npm(event=event) - S = df.to_dict() - S["storename"] = event - output_dicts.append(S) - return output_dicts - - def save(self, *, output_dicts: list[dict[str, Any]], outputPath: str) -> None: - for S in output_dicts: - event = S.pop("storename") - df = pd.DataFrame.from_dict(S) - self._save_to_hdf5(df=df, event=event, outputPath=outputPath) From d43670ffa39f5a7668867b0f21307d99bd240c48 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 3 Dec 2025 17:41:09 -0800 Subject: [PATCH 057/125] Updated TODO --- src/guppy/extractors/doric_recording_extractor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/guppy/extractors/doric_recording_extractor.py b/src/guppy/extractors/doric_recording_extractor.py index 62a8586..13f7fdb 100644 --- a/src/guppy/extractors/doric_recording_extractor.py +++ b/src/guppy/extractors/doric_recording_extractor.py @@ -24,7 +24,7 @@ def execute_import_doric(folder_path, storesList, outputPath): class DoricRecordingExtractor(BaseRecordingExtractor): - # TODO: consolidate duplicate flag logic between the `__init__` and the `check_doric` method. + # TODO: consolidate duplicate flag logic between the `discover_events_and_flags` and the `check_doric` method. @classmethod def discover_events_and_flags(cls, folder_path): From cd245a165ba8afea06780fbd12e007f33a99f218 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 3 Dec 2025 18:01:33 -0800 Subject: [PATCH 058/125] Centralized read_and_save_all_events and read_and_save_event functions into the base_recording_extractor and removed all duplicates. 
--- src/guppy/extractors/__init__.py | 10 ++++----- .../extractors/base_recording_extractor.py | 21 +++++++++++++++++++ .../extractors/csv_recording_extractor.py | 19 ----------------- .../extractors/doric_recording_extractor.py | 9 -------- .../extractors/npm_recording_extractor.py | 19 ----------------- .../extractors/tdt_recording_extractor.py | 16 -------------- src/guppy/readTevTsq.py | 21 ++++++++++++------- 7 files changed, 39 insertions(+), 76 deletions(-) diff --git a/src/guppy/extractors/__init__.py b/src/guppy/extractors/__init__.py index 75933c7..ca2fbe0 100644 --- a/src/guppy/extractors/__init__.py +++ b/src/guppy/extractors/__init__.py @@ -1,5 +1,5 @@ -from .base_recording_extractor import BaseRecordingExtractor -from .tdt_recording_extractor import TdtRecordingExtractor, execute_readtev -from .csv_recording_extractor import CsvRecordingExtractor, execute_import_csv -from .doric_recording_extractor import DoricRecordingExtractor, execute_import_doric -from .npm_recording_extractor import NpmRecordingExtractor, execute_import_npm +from .base_recording_extractor import BaseRecordingExtractor, read_and_save_event, read_and_save_all_events +from .tdt_recording_extractor import TdtRecordingExtractor +from .csv_recording_extractor import CsvRecordingExtractor +from .doric_recording_extractor import DoricRecordingExtractor +from .npm_recording_extractor import NpmRecordingExtractor diff --git a/src/guppy/extractors/base_recording_extractor.py b/src/guppy/extractors/base_recording_extractor.py index 839c3db..a8f274b 100644 --- a/src/guppy/extractors/base_recording_extractor.py +++ b/src/guppy/extractors/base_recording_extractor.py @@ -1,12 +1,18 @@ """Base class for recording extractors.""" +import logging +import multiprocessing as mp import os +import time from abc import ABC, abstractmethod +from itertools import repeat from typing import Any import h5py import numpy as np +logger = logging.getLogger(__name__) + class BaseRecordingExtractor(ABC): """ @@ -116,3 +122,18 @@ def _write_hdf5(data: Any, storename: str, output_path: str, key: str) -> None: f.create_dataset(key, data=data, maxshape=(None,), chunks=True) else: f.create_dataset(key, data=data) + + +def read_and_save_event(extractor, event, outputPath): + output_dicts = extractor.read(events=[event], outputPath=outputPath) + extractor.save(output_dicts=output_dicts, outputPath=outputPath) + logger.info("Data for event {} fetched and stored.".format(event)) + + +def read_and_save_all_events(extractor, events, outputPath, numProcesses=mp.cpu_count()): + logger.info("Reading data for event {} ...".format(events)) + + start = time.time() + with mp.Pool(numProcesses) as p: + p.starmap(read_and_save_event, zip(repeat(extractor), events, repeat(outputPath))) + logger.info("Time taken = {0:.5f}".format(time.time() - start)) diff --git a/src/guppy/extractors/csv_recording_extractor.py b/src/guppy/extractors/csv_recording_extractor.py index d74cfde..cfa9a8d 100644 --- a/src/guppy/extractors/csv_recording_extractor.py +++ b/src/guppy/extractors/csv_recording_extractor.py @@ -1,9 +1,6 @@ import glob import logging -import multiprocessing as mp import os -import time -from itertools import repeat from typing import Any import numpy as np @@ -14,22 +11,6 @@ logger = logging.getLogger(__name__) -def execute_import_csv(filepath, events, outputPath, numProcesses=mp.cpu_count()): - logger.info("Reading data for event {} ...".format(events)) - - extractor = CsvRecordingExtractor(folder_path=filepath) - start = time.time() - with 
mp.Pool(numProcesses) as p: - p.starmap(read_and_save_csv, zip(repeat(extractor), events, repeat(outputPath))) - logger.info("Time taken = {0:.5f}".format(time.time() - start)) - - -def read_and_save_csv(extractor, event, outputPath): - output_dicts = extractor.read(events=[event], outputPath=outputPath) - extractor.save(output_dicts=output_dicts, outputPath=outputPath) - logger.info("Data for event {} fetched and stored.".format(event)) - - class CsvRecordingExtractor(BaseRecordingExtractor): @classmethod diff --git a/src/guppy/extractors/doric_recording_extractor.py b/src/guppy/extractors/doric_recording_extractor.py index 13f7fdb..047e087 100644 --- a/src/guppy/extractors/doric_recording_extractor.py +++ b/src/guppy/extractors/doric_recording_extractor.py @@ -14,15 +14,6 @@ logger = logging.getLogger(__name__) -def execute_import_doric(folder_path, storesList, outputPath): - events = list(storesList[0, :]) - event_name_to_event_type = {storesList[0, i]: storesList[1, i] for i in range(storesList.shape[1])} - - extractor = DoricRecordingExtractor(folder_path=folder_path, event_name_to_event_type=event_name_to_event_type) - output_dicts = extractor.read(events=events, outputPath=outputPath) - extractor.save(output_dicts=output_dicts, outputPath=outputPath) - - class DoricRecordingExtractor(BaseRecordingExtractor): # TODO: consolidate duplicate flag logic between the `discover_events_and_flags` and the `check_doric` method. diff --git a/src/guppy/extractors/npm_recording_extractor.py b/src/guppy/extractors/npm_recording_extractor.py index 68d13f7..e3455b2 100644 --- a/src/guppy/extractors/npm_recording_extractor.py +++ b/src/guppy/extractors/npm_recording_extractor.py @@ -1,9 +1,6 @@ import glob import logging -import multiprocessing as mp import os -import time -from itertools import repeat import numpy as np import pandas as pd @@ -16,22 +13,6 @@ logger = logging.getLogger(__name__) -def execute_import_npm(folder_path, events, outputPath, numProcesses=mp.cpu_count()): - logger.info("Reading data for event {} ...".format(events)) - - extractor = NpmRecordingExtractor(folder_path=folder_path) - start = time.time() - with mp.Pool(numProcesses) as p: - p.starmap(read_and_save_npm, zip(repeat(extractor), events, repeat(outputPath))) - logger.info("Time taken = {0:.5f}".format(time.time() - start)) - - -def read_and_save_npm(extractor, event, outputPath): - output_dicts = extractor.read(events=[event], outputPath=outputPath) - extractor.save(output_dicts=output_dicts, outputPath=outputPath) - logger.info("Data for event {} fetched and stored.".format(event)) - - class NpmRecordingExtractor(CsvRecordingExtractor): # Inherits from CsvRecordingExtractor to reuse identical read/save logic. # Only overrides discover_events_and_flags() and adds NPM-specific helper methods. 
diff --git a/src/guppy/extractors/tdt_recording_extractor.py b/src/guppy/extractors/tdt_recording_extractor.py index a877a8b..f65f7a9 100644 --- a/src/guppy/extractors/tdt_recording_extractor.py +++ b/src/guppy/extractors/tdt_recording_extractor.py @@ -1,9 +1,6 @@ import glob import logging -import multiprocessing as mp import os -import time -from itertools import repeat from typing import Any import numpy as np @@ -15,19 +12,6 @@ logger = logging.getLogger(__name__) -def read_and_save_tdt(extractor, event, outputPath): - output_dicts = extractor.read(events=[event], outputPath=outputPath) - extractor.save(output_dicts=output_dicts, outputPath=outputPath) - - -def execute_readtev(folder_path, events, outputPath, numProcesses=mp.cpu_count()): - extractor = TdtRecordingExtractor(folder_path=folder_path) - start = time.time() - with mp.Pool(numProcesses) as p: - p.starmap(read_and_save_tdt, zip(repeat(extractor), events, repeat(outputPath))) - logger.info("Time taken = {0:.5f}".format(time.time() - start)) - - class TdtRecordingExtractor(BaseRecordingExtractor): @classmethod diff --git a/src/guppy/readTevTsq.py b/src/guppy/readTevTsq.py index 2ae0c59..19a0a4a 100755 --- a/src/guppy/readTevTsq.py +++ b/src/guppy/readTevTsq.py @@ -8,10 +8,11 @@ import numpy as np from guppy.extractors import ( - execute_import_csv, - execute_import_doric, - execute_import_npm, - execute_readtev, + CsvRecordingExtractor, + DoricRecordingExtractor, + NpmRecordingExtractor, + TdtRecordingExtractor, + read_and_save_all_events, ) logger = logging.getLogger(__name__) @@ -74,15 +75,19 @@ def readRawData(inputParameters): events = np.unique(storesList[0, :]) if modality == "tdt": - execute_readtev(filepath, events, op, numProcesses) + extractor = TdtRecordingExtractor(folder_path=filepath) elif modality == "doric": - execute_import_doric(filepath, storesList, op) + event_name_to_event_type = {storesList[0, i]: storesList[1, i] for i in range(storesList.shape[1])} + extractor = DoricRecordingExtractor( + folder_path=filepath, event_name_to_event_type=event_name_to_event_type + ) elif modality == "csv": - execute_import_csv(filepath, events, op, numProcesses) + extractor = CsvRecordingExtractor(folder_path=filepath) elif modality == "npm": - execute_import_npm(filepath, events, op, numProcesses) + extractor = NpmRecordingExtractor(folder_path=filepath) else: raise ValueError("Modality not recognized. Please use 'tdt', 'csv', 'doric', or 'npm'.") + read_and_save_all_events(extractor, events, op, numProcesses) writeToFile(str(10 + ((step + 1) * 10)) + "\n") step += 1 From 7e69cc747dfff63d93dd733ff584c6cdbd459b03 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 3 Dec 2025 18:04:56 -0800 Subject: [PATCH 059/125] Removed redundant intermediate common_step3.py. 
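Its only content was a standalone write_hdf5 helper; the recording extractors now write HDF5 output through BaseRecordingExtractor._write_hdf5, which appears to leave the separate copy unused.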
--- src/guppy/common_step3.py | 42 --------------------------------------- 1 file changed, 42 deletions(-) delete mode 100644 src/guppy/common_step3.py diff --git a/src/guppy/common_step3.py b/src/guppy/common_step3.py deleted file mode 100644 index 09e763f..0000000 --- a/src/guppy/common_step3.py +++ /dev/null @@ -1,42 +0,0 @@ -import logging -import os - -import h5py -import numpy as np - -logger = logging.getLogger(__name__) - - -# function to write data to a hdf5 file -def write_hdf5(data, event, filepath, key): - - # replacing \\ or / in storenames with _ (to avoid errors while saving data) - event = event.replace("\\", "_") - event = event.replace("/", "_") - - op = os.path.join(filepath, event + ".hdf5") - - # if file does not exist create a new file - if not os.path.exists(op): - with h5py.File(op, "w") as f: - if type(data) is np.ndarray: - f.create_dataset(key, data=data, maxshape=(None,), chunks=True) - else: - f.create_dataset(key, data=data) - - # if file already exists, append data to it or add a new key to it - else: - with h5py.File(op, "r+") as f: - if key in list(f.keys()): - if type(data) is np.ndarray: - f[key].resize(data.shape) - arr = f[key] - arr[:] = data - else: - arr = f[key] - arr = data - else: - if type(data) is np.ndarray: - f.create_dataset(key, data=data, maxshape=(None,), chunks=True) - else: - f.create_dataset(key, data=data) From 792e421ba5c6d22674e6b6558f480524a5f0c461 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 5 Dec 2025 09:27:31 -0800 Subject: [PATCH 060/125] Added Claude code docs to gitignore. --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 0628429..f684eec 100755 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,5 @@ GuPPy/runFiberPhotometryAnalysis.ipynb .clinerules/ testing_data/ + +CLAUDE.md From 60fa0bc67761ed648e08c2944f0da9a413ca5a53 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 5 Dec 2025 11:46:09 -0800 Subject: [PATCH 061/125] Pulled out analysis-specific functions and io_utils from preprocess.py. 
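With the numerical helpers split out, they can be imported without touching the plotting and GUI code that stays behind in preprocess.py. A rough standalone sketch (the arrays below are synthetic demo data, not a real recording):

```python
# Sketch only: synthetic demo arrays, not data from a real recording.
import numpy as np

from guppy.analysis.analysis import controlFit, deltaFF, filterSignal

rng = np.random.default_rng(0)
signal = 100.0 + rng.normal(size=1000)   # fabricated fluorescence trace
control = 100.0 + rng.normal(size=1000)  # fabricated isosbestic trace

signal_smooth = filterSignal(10, signal)          # zero-phase moving-average filter, 10-sample window
control_fit = controlFit(control, signal_smooth)  # least-squares linear fit of the control to the signal
dff = deltaFF(signal_smooth, control_fit)         # 100 * (signal - fit) / fit
```

preprocess.py itself now pulls these in via `from .analysis.analysis import ...` and `from .analysis.io_utils import ...`, as the diff below shows.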
--- src/guppy/analysis/__init__.py | 0 src/guppy/analysis/analysis.py | 268 ++++++++++++++++++++ src/guppy/analysis/io_utils.py | 163 ++++++++++++ src/guppy/preprocess.py | 441 +++++---------------------------- step4_data_flow_analysis.md | 348 ++++++++++++++++++++++++++ 5 files changed, 841 insertions(+), 379 deletions(-) create mode 100644 src/guppy/analysis/__init__.py create mode 100644 src/guppy/analysis/analysis.py create mode 100644 src/guppy/analysis/io_utils.py create mode 100644 step4_data_flow_analysis.md diff --git a/src/guppy/analysis/__init__.py b/src/guppy/analysis/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/guppy/analysis/analysis.py b/src/guppy/analysis/analysis.py new file mode 100644 index 0000000..4ec8960 --- /dev/null +++ b/src/guppy/analysis/analysis.py @@ -0,0 +1,268 @@ +import logging + +import numpy as np +from scipy import signal as ss +from scipy.optimize import curve_fit + +from .io_utils import fetchCoords, read_hdf5 + +logger = logging.getLogger(__name__) + + +# Category: Analysis +# Reason: Pure mathematical function for exponential curve fitting - no dependencies, pure computation +# curve fit exponential function +def curveFitFn(x, a, b, c): + return a + (b * np.exp(-(1 / c) * x)) + + +# Category: Analysis +# Reason: Pure algorithmic function - applies Savitzky-Golay filter and curve fitting to generate synthetic control channel +# helper function to create control channel using signal channel +# by curve fitting signal channel to exponential function +# when there is no isosbestic control channel is present +def helper_create_control_channel(signal, timestamps, window): + # check if window is greater than signal shape + if window > signal.shape[0]: + window = ((signal.shape[0] + 1) / 2) + 1 + if window % 2 != 0: + window = window + else: + window = window + 1 + + filtered_signal = ss.savgol_filter(signal, window_length=window, polyorder=3) + + p0 = [5, 50, 60] + + try: + popt, pcov = curve_fit(curveFitFn, timestamps, filtered_signal, p0) + except Exception as e: + logger.error(str(e)) + + # logger.info('Curve Fit Parameters : ', popt) + control = curveFitFn(timestamps, *popt) + + return control + + +# Category: Analysis +# Reason: Data validation function - compares array lengths and returns indices for processing +# function to check control and signal channel has same length +# if not, take a smaller length and do pre-processing +def check_cntrl_sig_length(filepath, channels_arr, storenames, storesList): + + indices = [] + for i in range(channels_arr.shape[1]): + idx_c = np.where(storesList == channels_arr[0, i])[0] + idx_s = np.where(storesList == channels_arr[1, i])[0] + control = read_hdf5(storenames[idx_c[0]], filepath, "data") + signal = read_hdf5(storenames[idx_s[0]], filepath, "data") + if control.shape[0] < signal.shape[0]: + indices.append(storesList[idx_c[0]]) + elif control.shape[0] > signal.shape[0]: + indices.append(storesList[idx_s[0]]) + else: + indices.append(storesList[idx_s[0]]) + + return indices + + +# Category: Analysis +# Reason: Pure algorithmic function - concatenates data chunks based on coordinate boundaries, adjusts timestamps mathematically +# helper function to process control and signal timestamps +def eliminateData(filepath, timeForLightsTurnOn, event, sampling_rate, naming): + + ts = read_hdf5("timeCorrection_" + naming, filepath, "timestampNew") + data = read_hdf5(event, filepath, "data").reshape(-1) + coords = fetchCoords(filepath, naming, ts) + + if (data == 0).all() == True: + data = 
np.zeros(ts.shape[0]) + + arr = np.array([]) + ts_arr = np.array([]) + for i in range(coords.shape[0]): + + index = np.where((ts > coords[i, 0]) & (ts < coords[i, 1]))[0] + + if len(arr) == 0: + arr = np.concatenate((arr, data[index])) + sub = ts[index][0] - timeForLightsTurnOn + new_ts = ts[index] - sub + ts_arr = np.concatenate((ts_arr, new_ts)) + else: + temp = data[index] + # new = temp + (arr[-1]-temp[0]) + temp_ts = ts[index] + new_ts = temp_ts - (temp_ts[0] - ts_arr[-1]) + arr = np.concatenate((arr, temp)) + ts_arr = np.concatenate((ts_arr, new_ts + (1 / sampling_rate))) + + # logger.info(arr.shape, ts_arr.shape) + return arr, ts_arr + + +# Category: Analysis +# Reason: Pure algorithmic function - processes event timestamps based on coordinate boundaries, aligns with data timeline +# helper function to align event timestamps with the control and signal timestamps +def eliminateTs(filepath, timeForLightsTurnOn, event, sampling_rate, naming): + + tsNew = read_hdf5("timeCorrection_" + naming, filepath, "timestampNew") + ts = read_hdf5(event + "_" + naming, filepath, "ts").reshape(-1) + coords = fetchCoords(filepath, naming, tsNew) + + ts_arr = np.array([]) + tsNew_arr = np.array([]) + for i in range(coords.shape[0]): + tsNew_index = np.where((tsNew > coords[i, 0]) & (tsNew < coords[i, 1]))[0] + ts_index = np.where((ts > coords[i, 0]) & (ts < coords[i, 1]))[0] + + if len(tsNew_arr) == 0: + sub = tsNew[tsNew_index][0] - timeForLightsTurnOn + tsNew_arr = np.concatenate((tsNew_arr, tsNew[tsNew_index] - sub)) + ts_arr = np.concatenate((ts_arr, ts[ts_index] - sub)) + else: + temp_tsNew = tsNew[tsNew_index] + temp_ts = ts[ts_index] + new_ts = temp_ts - (temp_tsNew[0] - tsNew_arr[-1]) + new_tsNew = temp_tsNew - (temp_tsNew[0] - tsNew_arr[-1]) + tsNew_arr = np.concatenate((tsNew_arr, new_tsNew + (1 / sampling_rate))) + ts_arr = np.concatenate((ts_arr, new_ts + (1 / sampling_rate))) + + return ts_arr + + +# Category: Analysis +# Reason: Pure algorithmic function - replaces specified data chunks with NaN based on coordinate boundaries +# adding nan values to removed chunks +# when using artifacts removal method - replace with NaN +def addingNaNValues(filepath, event, naming): + + ts = read_hdf5("timeCorrection_" + naming, filepath, "timestampNew") + data = read_hdf5(event, filepath, "data").reshape(-1) + coords = fetchCoords(filepath, naming, ts) + + if (data == 0).all() == True: + data = np.zeros(ts.shape[0]) + + arr = np.array([]) + ts_index = np.arange(ts.shape[0]) + for i in range(coords.shape[0]): + + index = np.where((ts > coords[i, 0]) & (ts < coords[i, 1]))[0] + arr = np.concatenate((arr, index)) + + nan_indices = list(set(ts_index).symmetric_difference(arr)) + data[nan_indices] = np.nan + + return data + + +# Category: Analysis +# Reason: Pure algorithmic function - filters event timestamps to exclude artifact regions based on coordinates +# remove event TTLs which falls in the removed chunks +# when using artifacts removal method - replace with NaN +def removeTTLs(filepath, event, naming): + tsNew = read_hdf5("timeCorrection_" + naming, filepath, "timestampNew") + ts = read_hdf5(event + "_" + naming, filepath, "ts").reshape(-1) + coords = fetchCoords(filepath, naming, tsNew) + + ts_arr = np.array([]) + for i in range(coords.shape[0]): + ts_index = np.where((ts > coords[i, 0]) & (ts < coords[i, 1]))[0] + ts_arr = np.concatenate((ts_arr, ts[ts_index])) + + return ts_arr + + +# Category: Analysis +# Reason: Pure mathematical computation - calculates deltaF/F from signal and control arrays 
with simple formula +# function to compute deltaF/F using fitted control channel and filtered signal channel +def deltaFF(signal, control): + + res = np.subtract(signal, control) + normData = np.divide(res, control) + # deltaFF = normData + normData = normData * 100 + + return normData + + +# Category: Analysis +# Reason: Pure algorithmic function - performs polynomial linear regression to fit control to signal +# function to fit control channel to signal channel +def controlFit(control, signal): + + p = np.polyfit(control, signal, 1) + arr = (p[0] * control) + p[1] + return arr + + +# Category: Analysis +# Reason: Pure signal processing function - applies moving average filter using scipy filtfilt +def filterSignal(filter_window, signal): + if filter_window == 0: + return signal + elif filter_window > 1: + b = np.divide(np.ones((filter_window,)), filter_window) + a = 1 + filtered_signal = ss.filtfilt(b, a, signal) + return filtered_signal + else: + raise Exception("Moving average filter window value is not correct.") + + +# Category: Routing +# Reason: Orchestrates signal processing pipeline - sequences calls to filterSignal, controlFit, and deltaFF with conditional logic +# function to filter control and signal channel, also execute above two function : controlFit and deltaFF +# function will also take care if there is only signal channel and no control channel +# if there is only signal channel, z-score will be computed using just signal channel +def execute_controlFit_dff(control, signal, isosbestic_control, filter_window): + + if isosbestic_control == False: + signal_smooth = filterSignal(filter_window, signal) # ss.filtfilt(b, a, signal) + control_fit = controlFit(control, signal_smooth) + norm_data = deltaFF(signal_smooth, control_fit) + else: + control_smooth = filterSignal(filter_window, control) # ss.filtfilt(b, a, control) + signal_smooth = filterSignal(filter_window, signal) # ss.filtfilt(b, a, signal) + control_fit = controlFit(control_smooth, signal_smooth) + norm_data = deltaFF(signal_smooth, control_fit) + + return norm_data, control_fit + + +# Category: Analysis +# Reason: Pure statistical computation - calculates z-score from deltaF/F using different methods (standard, baseline, robust) +# function to compute z-score based on z-score computation method +def z_score_computation(dff, timestamps, inputParameters): + + zscore_method = inputParameters["zscore_method"] + baseline_start, baseline_end = inputParameters["baselineWindowStart"], inputParameters["baselineWindowEnd"] + + if zscore_method == "standard z-score": + numerator = np.subtract(dff, np.nanmean(dff)) + zscore = np.divide(numerator, np.nanstd(dff)) + elif zscore_method == "baseline z-score": + idx = np.where((timestamps > baseline_start) & (timestamps < baseline_end))[0] + if idx.shape[0] == 0: + logger.error( + "Baseline Window Parameters for baseline z-score computation zscore_method \ + are not correct." + ) + raise Exception( + "Baseline Window Parameters for baseline z-score computation zscore_method \ + are not correct." 
+ ) + else: + baseline_mean = np.nanmean(dff[idx]) + baseline_std = np.nanstd(dff[idx]) + numerator = np.subtract(dff, baseline_mean) + zscore = np.divide(numerator, baseline_std) + else: + median = np.median(dff) + mad = np.median(np.abs(dff - median)) + numerator = 0.6745 * (dff - median) + zscore = np.divide(numerator, mad) + + return zscore diff --git a/src/guppy/analysis/io_utils.py b/src/guppy/analysis/io_utils.py new file mode 100644 index 0000000..33b6650 --- /dev/null +++ b/src/guppy/analysis/io_utils.py @@ -0,0 +1,163 @@ +import fnmatch +import glob +import logging +import os +import re + +import h5py +import numpy as np + +logger = logging.getLogger(__name__) + + +# Category: Analysis +# Reason: Utility function for path filtering - pure data transformation with no GUI or orchestration +def takeOnlyDirs(paths): + removePaths = [] + for p in paths: + if os.path.isfile(p): + removePaths.append(p) + return list(set(paths) - set(removePaths)) + + +# Category: Analysis +# Reason: File system utility for case-insensitive file discovery - pure I/O helper with no orchestration +# find files by ignoring the case sensitivity +def find_files(path, glob_path, ignore_case=False): + rule = ( + re.compile(fnmatch.translate(glob_path), re.IGNORECASE) + if ignore_case + else re.compile(fnmatch.translate(glob_path)) + ) + + no_bytes_path = os.listdir(os.path.expanduser(path)) + str_path = [] + + # converting byte object to string + for x in no_bytes_path: + try: + str_path.append(x.decode("utf-8")) + except: + str_path.append(x) + return [os.path.join(path, n) for n in str_path if rule.match(n)] + + +# Category: Analysis +# Reason: Simple file type detection utility - pure file system check with no orchestration +# check if dealing with TDT files or csv files +def check_TDT(filepath): + path = glob.glob(os.path.join(filepath, "*.tsq")) + if len(path) > 0: + return True + else: + return False + + +# Category: Analysis +# Reason: I/O utility function for reading HDF5 files - pure file access with no business logic or orchestration +# function to read hdf5 file +def read_hdf5(event, filepath, key): + if event: + event = event.replace("\\", "_") + event = event.replace("/", "_") + op = os.path.join(filepath, event + ".hdf5") + else: + op = filepath + + if os.path.exists(op): + with h5py.File(op, "r") as f: + arr = np.asarray(f[key]) + else: + logger.error(f"{event}.hdf5 file does not exist") + raise Exception("{}.hdf5 file does not exist".format(event)) + + return arr + + +# Category: Analysis +# Reason: I/O utility function for writing HDF5 files - pure file access with no business logic or orchestration +# function to write hdf5 file +def write_hdf5(data, event, filepath, key): + event = event.replace("\\", "_") + event = event.replace("/", "_") + op = os.path.join(filepath, event + ".hdf5") + + # if file does not exist create a new file + if not os.path.exists(op): + with h5py.File(op, "w") as f: + if type(data) is np.ndarray: + f.create_dataset(key, data=data, maxshape=(None,), chunks=True) + else: + f.create_dataset(key, data=data) + + # if file already exists, append data to it or add a new key to it + else: + with h5py.File(op, "r+") as f: + if key in list(f.keys()): + if type(data) is np.ndarray: + f[key].resize(data.shape) + arr = f[key] + arr[:] = data + else: + arr = f[key] + arr = data + else: + if type(data) is np.ndarray: + f.create_dataset(key, data=data, maxshape=(None,), chunks=True) + else: + f.create_dataset(key, data=data) + + +# Category: Analysis +# Reason: Validation utility - 
checks file naming conventions and returns structured path array with no orchestration +# function to check if the naming convention for saving storeslist file was followed or not +def decide_naming_convention(filepath): + path_1 = find_files(filepath, "control_*", ignore_case=True) # glob.glob(os.path.join(filepath, 'control*')) + + path_2 = find_files(filepath, "signal_*", ignore_case=True) # glob.glob(os.path.join(filepath, 'signal*')) + + path = sorted(path_1 + path_2, key=str.casefold) + if len(path) % 2 != 0: + logger.error("There are not equal number of Control and Signal data") + raise Exception("There are not equal number of Control and Signal data") + + path = np.asarray(path).reshape(2, -1) + + return path + + +# Category: Analysis +# Reason: I/O utility that loads artifact coordinates from .npy file or provides default - pure file loading with simple logic +# function to read coordinates file which was saved by selecting chunks for artifacts removal +def fetchCoords(filepath, naming, data): + + path = os.path.join(filepath, "coordsForPreProcessing_" + naming + ".npy") + + if not os.path.exists(path): + coords = np.array([0, data[-1]]) + else: + coords = np.load(os.path.join(filepath, "coordsForPreProcessing_" + naming + ".npy"))[:, 0] + + if coords.shape[0] % 2 != 0: + logger.error("Number of values in coordsForPreProcessing file is not even.") + raise Exception("Number of values in coordsForPreProcessing file is not even.") + + coords = coords.reshape(-1, 2) + + return coords + + +# Category: Routing +# Reason: Organizes output folders for data combination - loops through numbered outputs and groups related folders +def get_all_stores_for_combining_data(folderNames): + op = [] + for i in range(100): + temp = [] + match = r"[\s\S]*" + "_output_" + str(i) + for j in folderNames: + temp.append(re.findall(match, j)) + temp = sorted(list(np.concatenate(temp).flatten()), key=str.casefold) + if len(temp) > 0: + op.append(temp) + + return op diff --git a/src/guppy/preprocess.py b/src/guppy/preprocess.py index 8b79039..69616d9 100755 --- a/src/guppy/preprocess.py +++ b/src/guppy/preprocess.py @@ -1,95 +1,52 @@ -import fnmatch import glob import json import logging import os -import re import shutil import sys -import h5py import matplotlib.pyplot as plt import numpy as np import pandas as pd -from scipy import signal as ss -from scipy.optimize import curve_fit +from .analysis.analysis import ( + addingNaNValues, + check_cntrl_sig_length, + eliminateData, + eliminateTs, + execute_controlFit_dff, + helper_create_control_channel, + removeTTLs, + z_score_computation, +) +from .analysis.io_utils import ( + check_TDT, + decide_naming_convention, + fetchCoords, + find_files, + get_all_stores_for_combining_data, + read_hdf5, + takeOnlyDirs, + write_hdf5, +) from .combineDataFn import processTimestampsForCombiningData logger = logging.getLogger(__name__) -logger = logging.getLogger(__name__) - # Only set matplotlib backend if not in CI environment if not os.getenv("CI"): plt.switch_backend("TKAgg") -def takeOnlyDirs(paths): - removePaths = [] - for p in paths: - if os.path.isfile(p): - removePaths.append(p) - return list(set(paths) - set(removePaths)) - - +# Category: Visualization/User Input +# Reason: Writes progress updates to file for GUI progress bar - couples backend to GUI feedback mechanism def writeToFile(value: str): with open(os.path.join(os.path.expanduser("~"), "pbSteps.txt"), "a") as file: file.write(value) -# find files by ignoring the case sensitivity -def find_files(path, 
glob_path, ignore_case=False): - rule = ( - re.compile(fnmatch.translate(glob_path), re.IGNORECASE) - if ignore_case - else re.compile(fnmatch.translate(glob_path)) - ) - - no_bytes_path = os.listdir(os.path.expanduser(path)) - str_path = [] - - # converting byte object to string - for x in no_bytes_path: - try: - str_path.append(x.decode("utf-8")) - except: - str_path.append(x) - return [os.path.join(path, n) for n in str_path if rule.match(n)] - - -# curve fit exponential function -def curveFitFn(x, a, b, c): - return a + (b * np.exp(-(1 / c) * x)) - - -# helper function to create control channel using signal channel -# by curve fitting signal channel to exponential function -# when there is no isosbestic control channel is present -def helper_create_control_channel(signal, timestamps, window): - # check if window is greater than signal shape - if window > signal.shape[0]: - window = ((signal.shape[0] + 1) / 2) + 1 - if window % 2 != 0: - window = window - else: - window = window + 1 - - filtered_signal = ss.savgol_filter(signal, window_length=window, polyorder=3) - - p0 = [5, 50, 60] - - try: - popt, pcov = curve_fit(curveFitFn, timestamps, filtered_signal, p0) - except Exception as e: - logger.error(str(e)) - - # logger.info('Curve Fit Parameters : ', popt) - control = curveFitFn(timestamps, *popt) - - return control - - +# Category: Routing +# Reason: Orchestrates reading HDF5 files, calling helper_create_control_channel, and writing results - coordinates I/O with computation # main function to create control channel using # signal channel and save it to a file def create_control_channel(filepath, arr, window=5001): @@ -116,6 +73,8 @@ def create_control_channel(filepath, arr, window=5001): logger.info("Control channel from signal channel created using curve-fitting") +# Category: Routing +# Reason: Orchestrates validation logic, file copying, and storesList updates - coordinates multiple operations and file manipulations # function to add control channel when there is no # isosbestic control channel and update the storeslist file def add_control_channel(filepath, arr): @@ -162,86 +121,8 @@ def add_control_channel(filepath, arr): return arr -# check if dealing with TDT files or csv files -def check_TDT(filepath): - path = glob.glob(os.path.join(filepath, "*.tsq")) - if len(path) > 0: - return True - else: - return False - - -# function to read hdf5 file -def read_hdf5(event, filepath, key): - if event: - event = event.replace("\\", "_") - event = event.replace("/", "_") - op = os.path.join(filepath, event + ".hdf5") - else: - op = filepath - - if os.path.exists(op): - with h5py.File(op, "r") as f: - arr = np.asarray(f[key]) - else: - logger.error(f"{event}.hdf5 file does not exist") - raise Exception("{}.hdf5 file does not exist".format(event)) - - return arr - - -# function to write hdf5 file -def write_hdf5(data, event, filepath, key): - event = event.replace("\\", "_") - event = event.replace("/", "_") - op = os.path.join(filepath, event + ".hdf5") - - # if file does not exist create a new file - if not os.path.exists(op): - with h5py.File(op, "w") as f: - if type(data) is np.ndarray: - f.create_dataset(key, data=data, maxshape=(None,), chunks=True) - else: - f.create_dataset(key, data=data) - - # if file already exists, append data to it or add a new key to it - else: - with h5py.File(op, "r+") as f: - if key in list(f.keys()): - if type(data) is np.ndarray: - f[key].resize(data.shape) - arr = f[key] - arr[:] = data - else: - arr = f[key] - arr = data - else: - if type(data) is 
np.ndarray: - f.create_dataset(key, data=data, maxshape=(None,), chunks=True) - else: - f.create_dataset(key, data=data) - - -# function to check control and signal channel has same length -# if not, take a smaller length and do pre-processing -def check_cntrl_sig_length(filepath, channels_arr, storenames, storesList): - - indices = [] - for i in range(channels_arr.shape[1]): - idx_c = np.where(storesList == channels_arr[0, i])[0] - idx_s = np.where(storesList == channels_arr[1, i])[0] - control = read_hdf5(storenames[idx_c[0]], filepath, "data") - signal = read_hdf5(storenames[idx_s[0]], filepath, "data") - if control.shape[0] < signal.shape[0]: - indices.append(storesList[idx_c[0]]) - elif control.shape[0] > signal.shape[0]: - indices.append(storesList[idx_s[0]]) - else: - indices.append(storesList[idx_s[0]]) - - return indices - - +# Category: Routing +# Reason: Orchestrates timestamp correction workflow - loops through stores, coordinates reading/writing, calls validation and correction logic # function to correct timestamps after eliminating first few seconds of the data (for csv data) def timestampCorrection_csv(filepath, timeForLightsTurnOn, storesList): @@ -292,6 +173,8 @@ def timestampCorrection_csv(filepath, timeForLightsTurnOn, storesList): logger.info("Timestamps corrected and converted to seconds.") +# Category: Routing +# Reason: Orchestrates timestamp correction workflow for TDT format - loops through stores, coordinates timestamp expansion algorithm with I/O # function to correct timestamps after eliminating first few seconds of the data (for TDT data) def timestampCorrection_tdt(filepath, timeForLightsTurnOn, storesList): @@ -354,6 +237,8 @@ def timestampCorrection_tdt(filepath, timeForLightsTurnOn, storesList): # return timeRecStart, correctionIndex, timestampNew +# Category: Routing +# Reason: Orchestrates applying timestamp corrections - reads correction indices, applies different logic based on data type, writes results # function to apply correction to control, signal and event timestamps def applyCorrection(filepath, timeForLightsTurnOn, event, displayName, naming): @@ -395,6 +280,8 @@ def applyCorrection(filepath, timeForLightsTurnOn, event, displayName, naming): # write_hdf5(control, displayName, filepath, 'data') +# Category: Routing +# Reason: Orchestrates naming validation and correction application - loops through channel pairs and delegates to applyCorrection # function to check if naming convention was followed while saving storeslist file # and apply timestamps correction using the function applyCorrection def decide_naming_convention_and_applyCorrection(filepath, timeForLightsTurnOn, event, displayName, storesList): @@ -423,6 +310,8 @@ def decide_naming_convention_and_applyCorrection(filepath, timeForLightsTurnOn, logger.info("Timestamps corrections applied to the data and event timestamps.") +# Category: Visualization/User Input +# Reason: Creates matplotlib plots to display z-score results - pure visualization with no computation # function to plot z_score def visualize_z_score(filepath): @@ -445,6 +334,8 @@ def visualize_z_score(filepath): # plt.show() +# Category: Visualization/User Input +# Reason: Creates matplotlib plots to display deltaF/F results - pure visualization with no computation # function to plot deltaF/F def visualize_dff(filepath): name = os.path.basename(filepath) @@ -466,6 +357,8 @@ def visualize_dff(filepath): # plt.show() +# Category: Visualization/User Input +# Reason: Interactive matplotlib GUI with keyboard event handlers for 
artifact selection - core user input mechanism that saves coordinates to disk def visualize(filepath, x, y1, y2, y3, plot_name, removeArtifacts): # plotting control and signal data @@ -555,6 +448,8 @@ def plt_close_event(event): # return fig +# Category: Visualization/User Input +# Reason: Orchestrates visualization of all control/signal pairs - reads data and delegates to visualize() for user interaction # function to plot control and signal, also provide a feature to select chunks for artifacts removal def visualizeControlAndSignal(filepath, removeArtifacts): path_1 = find_files(filepath, "control_*", ignore_case=True) # glob.glob(os.path.join(filepath, 'control*')) @@ -590,141 +485,8 @@ def visualizeControlAndSignal(filepath, removeArtifacts): visualize(filepath, ts, control, signal, cntrl_sig_fit, plot_name, removeArtifacts) -# function to check if the naming convention for saving storeslist file was followed or not -def decide_naming_convention(filepath): - path_1 = find_files(filepath, "control_*", ignore_case=True) # glob.glob(os.path.join(filepath, 'control*')) - - path_2 = find_files(filepath, "signal_*", ignore_case=True) # glob.glob(os.path.join(filepath, 'signal*')) - - path = sorted(path_1 + path_2, key=str.casefold) - if len(path) % 2 != 0: - logger.error("There are not equal number of Control and Signal data") - raise Exception("There are not equal number of Control and Signal data") - - path = np.asarray(path).reshape(2, -1) - - return path - - -# function to read coordinates file which was saved by selecting chunks for artifacts removal -def fetchCoords(filepath, naming, data): - - path = os.path.join(filepath, "coordsForPreProcessing_" + naming + ".npy") - - if not os.path.exists(path): - coords = np.array([0, data[-1]]) - else: - coords = np.load(os.path.join(filepath, "coordsForPreProcessing_" + naming + ".npy"))[:, 0] - - if coords.shape[0] % 2 != 0: - logger.error("Number of values in coordsForPreProcessing file is not even.") - raise Exception("Number of values in coordsForPreProcessing file is not even.") - - coords = coords.reshape(-1, 2) - - return coords - - -# helper function to process control and signal timestamps -def eliminateData(filepath, timeForLightsTurnOn, event, sampling_rate, naming): - - ts = read_hdf5("timeCorrection_" + naming, filepath, "timestampNew") - data = read_hdf5(event, filepath, "data").reshape(-1) - coords = fetchCoords(filepath, naming, ts) - - if (data == 0).all() == True: - data = np.zeros(ts.shape[0]) - - arr = np.array([]) - ts_arr = np.array([]) - for i in range(coords.shape[0]): - - index = np.where((ts > coords[i, 0]) & (ts < coords[i, 1]))[0] - - if len(arr) == 0: - arr = np.concatenate((arr, data[index])) - sub = ts[index][0] - timeForLightsTurnOn - new_ts = ts[index] - sub - ts_arr = np.concatenate((ts_arr, new_ts)) - else: - temp = data[index] - # new = temp + (arr[-1]-temp[0]) - temp_ts = ts[index] - new_ts = temp_ts - (temp_ts[0] - ts_arr[-1]) - arr = np.concatenate((arr, temp)) - ts_arr = np.concatenate((ts_arr, new_ts + (1 / sampling_rate))) - - # logger.info(arr.shape, ts_arr.shape) - return arr, ts_arr - - -# helper function to align event timestamps with the control and signal timestamps -def eliminateTs(filepath, timeForLightsTurnOn, event, sampling_rate, naming): - - tsNew = read_hdf5("timeCorrection_" + naming, filepath, "timestampNew") - ts = read_hdf5(event + "_" + naming, filepath, "ts").reshape(-1) - coords = fetchCoords(filepath, naming, tsNew) - - ts_arr = np.array([]) - tsNew_arr = np.array([]) - for i in 
range(coords.shape[0]): - tsNew_index = np.where((tsNew > coords[i, 0]) & (tsNew < coords[i, 1]))[0] - ts_index = np.where((ts > coords[i, 0]) & (ts < coords[i, 1]))[0] - - if len(tsNew_arr) == 0: - sub = tsNew[tsNew_index][0] - timeForLightsTurnOn - tsNew_arr = np.concatenate((tsNew_arr, tsNew[tsNew_index] - sub)) - ts_arr = np.concatenate((ts_arr, ts[ts_index] - sub)) - else: - temp_tsNew = tsNew[tsNew_index] - temp_ts = ts[ts_index] - new_ts = temp_ts - (temp_tsNew[0] - tsNew_arr[-1]) - new_tsNew = temp_tsNew - (temp_tsNew[0] - tsNew_arr[-1]) - tsNew_arr = np.concatenate((tsNew_arr, new_tsNew + (1 / sampling_rate))) - ts_arr = np.concatenate((ts_arr, new_ts + (1 / sampling_rate))) - - return ts_arr - - -# adding nan values to removed chunks -# when using artifacts removal method - replace with NaN -def addingNaNValues(filepath, event, naming): - - ts = read_hdf5("timeCorrection_" + naming, filepath, "timestampNew") - data = read_hdf5(event, filepath, "data").reshape(-1) - coords = fetchCoords(filepath, naming, ts) - - if (data == 0).all() == True: - data = np.zeros(ts.shape[0]) - - arr = np.array([]) - ts_index = np.arange(ts.shape[0]) - for i in range(coords.shape[0]): - - index = np.where((ts > coords[i, 0]) & (ts < coords[i, 1]))[0] - arr = np.concatenate((arr, index)) - - nan_indices = list(set(ts_index).symmetric_difference(arr)) - data[nan_indices] = np.nan - - return data - - -# remove event TTLs which falls in the removed chunks -# when using artifacts removal method - replace with NaN -def removeTTLs(filepath, event, naming): - tsNew = read_hdf5("timeCorrection_" + naming, filepath, "timestampNew") - ts = read_hdf5(event + "_" + naming, filepath, "ts").reshape(-1) - coords = fetchCoords(filepath, naming, tsNew) - - ts_arr = np.array([]) - for i in range(coords.shape[0]): - ts_index = np.where((ts > coords[i, 0]) & (ts < coords[i, 1]))[0] - ts_arr = np.concatenate((ts_arr, ts[ts_index])) - - return ts_arr - - +# Category: Routing +# Reason: Orchestrates NaN replacement for all stores - loops through channels and coordinates calls to addingNaNValues and removeTTLs def addingNaNtoChunksWithArtifacts(filepath, events): logger.debug("Replacing chunks with artifacts by NaN values.") @@ -759,6 +521,8 @@ def addingNaNtoChunksWithArtifacts(filepath, events): logger.info("Chunks with artifacts are replaced by NaN values.") +# Category: Routing +# Reason: Orchestrates timestamp concatenation for artifact removal - loops through stores, coordinates eliminateData/eliminateTs calls and writes results # main function to align timestamps for control, signal and event timestamps for artifacts removal def processTimestampsForArtifacts(filepath, timeForLightsTurnOn, events): @@ -800,89 +564,8 @@ def processTimestampsForArtifacts(filepath, timeForLightsTurnOn, events): logger.info("Timestamps processed, artifacts are removed and good chunks are concatenated.") -# function to compute deltaF/F using fitted control channel and filtered signal channel -def deltaFF(signal, control): - - res = np.subtract(signal, control) - normData = np.divide(res, control) - # deltaFF = normData - normData = normData * 100 - - return normData - - -# function to fit control channel to signal channel -def controlFit(control, signal): - - p = np.polyfit(control, signal, 1) - arr = (p[0] * control) + p[1] - return arr - - -def filterSignal(filter_window, signal): - if filter_window == 0: - return signal - elif filter_window > 1: - b = np.divide(np.ones((filter_window,)), filter_window) - a = 1 - filtered_signal = 
ss.filtfilt(b, a, signal) - return filtered_signal - else: - raise Exception("Moving average filter window value is not correct.") - - -# function to filter control and signal channel, also execute above two function : controlFit and deltaFF -# function will also take care if there is only signal channel and no control channel -# if there is only signal channel, z-score will be computed using just signal channel -def execute_controlFit_dff(control, signal, isosbestic_control, filter_window): - - if isosbestic_control == False: - signal_smooth = filterSignal(filter_window, signal) # ss.filtfilt(b, a, signal) - control_fit = controlFit(control, signal_smooth) - norm_data = deltaFF(signal_smooth, control_fit) - else: - control_smooth = filterSignal(filter_window, control) # ss.filtfilt(b, a, control) - signal_smooth = filterSignal(filter_window, signal) # ss.filtfilt(b, a, signal) - control_fit = controlFit(control_smooth, signal_smooth) - norm_data = deltaFF(signal_smooth, control_fit) - - return norm_data, control_fit - - -# function to compute z-score based on z-score computation method -def z_score_computation(dff, timestamps, inputParameters): - - zscore_method = inputParameters["zscore_method"] - baseline_start, baseline_end = inputParameters["baselineWindowStart"], inputParameters["baselineWindowEnd"] - - if zscore_method == "standard z-score": - numerator = np.subtract(dff, np.nanmean(dff)) - zscore = np.divide(numerator, np.nanstd(dff)) - elif zscore_method == "baseline z-score": - idx = np.where((timestamps > baseline_start) & (timestamps < baseline_end))[0] - if idx.shape[0] == 0: - logger.error( - "Baseline Window Parameters for baseline z-score computation zscore_method \ - are not correct." - ) - raise Exception( - "Baseline Window Parameters for baseline z-score computation zscore_method \ - are not correct." 
- ) - else: - baseline_mean = np.nanmean(dff[idx]) - baseline_std = np.nanstd(dff[idx]) - numerator = np.subtract(dff, baseline_mean) - zscore = np.divide(numerator, baseline_std) - else: - median = np.median(dff) - mad = np.median(np.abs(dff - median)) - numerator = 0.6745 * (dff - median) - zscore = np.divide(numerator, mad) - - return zscore - - +# Category: Routing +# Reason: Orchestrates z-score computation for one channel - handles artifact removal logic, coordinates calls to execute_controlFit_dff and z_score_computation # helper function to compute z-score and deltaF/F def helper_z_score(control, signal, filepath, name, inputParameters): # helper_z_score(control_smooth, signal_smooth): @@ -957,6 +640,8 @@ def helper_z_score(control, signal, filepath, name, inputParameters): # helper_ return z_score_arr, norm_data_arr, control_fit_arr +# Category: Routing +# Reason: Orchestrates z-score computation for all channels in a session - loops through control/signal pairs, calls helper_z_score, writes results # compute z-score and deltaF/F and save it to hdf5 file def compute_z_score(filepath, inputParameters): @@ -1005,6 +690,8 @@ def compute_z_score(filepath, inputParameters): logger.info(f"z-score for the data in {filepath} computed.") +# Category: Routing +# Reason: Top-level orchestrator for timestamp correction across all sessions - loops through folders, coordinates timestamp correction workflow # function to execute timestamps corrections using functions timestampCorrection and decide_naming_convention_and_applyCorrection def execute_timestamp_correction(folderNames, inputParameters): @@ -1044,6 +731,8 @@ def execute_timestamp_correction(folderNames, inputParameters): logger.info(f"Timestamps corrections finished for {filepath}") +# Category: Routing +# Reason: Orchestrates reading and merging storeslist files from multiple sessions - loops through folders and consolidates results # for combining data, reading storeslist file from both data and create a new storeslist array def check_storeslistfile(folderNames): storesList = np.array([[], []]) @@ -1065,20 +754,8 @@ def check_storeslistfile(folderNames): return storesList -def get_all_stores_for_combining_data(folderNames): - op = [] - for i in range(100): - temp = [] - match = r"[\s\S]*" + "_output_" + str(i) - for j in folderNames: - temp.append(re.findall(match, j)) - temp = sorted(list(np.concatenate(temp).flatten()), key=str.casefold) - if len(temp) > 0: - op.append(temp) - - return op - - +# Category: Routing +# Reason: Orchestrates data combination workflow - validates sampling rates, coordinates processTimestampsForCombiningData, manages multi-session I/O # function to combine data when there are two different data files for the same recording session # it will combine the data, do timestamps processing and save the combined data in the first output folder. 
def combineData(folderNames, inputParameters, storesList): @@ -1123,6 +800,8 @@ def combineData(folderNames, inputParameters, storesList): return op +# Category: Routing +# Reason: Top-level orchestrator for z-score computation and artifact removal - coordinates compute_z_score, artifact processing, and visualization calls # function to compute z-score and deltaF/F using functions : compute_z_score and/or processTimestampsForArtifacts def execute_zscore(folderNames, inputParameters): @@ -1175,6 +854,8 @@ def execute_zscore(folderNames, inputParameters): logger.info("Signal data and event timestamps are extracted.") +# Category: Routing +# Reason: Main entry point for Step 4 - orchestrates entire preprocessing workflow including timestamp correction, data combination, and z-score computation def extractTsAndSignal(inputParameters): logger.debug("Extracting signal data and event timestamps...") @@ -1212,6 +893,8 @@ def extractTsAndSignal(inputParameters): execute_zscore(op_folder, inputParameters) +# Category: Routing +# Reason: Top-level entry point wrapper - handles error catching and calls extractTsAndSignal def main(input_parameters): try: extractTsAndSignal(input_parameters) diff --git a/step4_data_flow_analysis.md b/step4_data_flow_analysis.md new file mode 100644 index 0000000..d86e938 --- /dev/null +++ b/step4_data_flow_analysis.md @@ -0,0 +1,348 @@ +# Step 4 (preprocess.py) Data Flow Analysis + +## Overview + +Step 4 processes timestamp-corrected photometry data and computes normalized signals (ΔF/F and z-scores). It handles artifact removal, data combination from multiple sessions, and generates quality control visualizations. + +## High-Level Data Flow + +```mermaid +flowchart TD + A[Entry: extractTsAndSignal] --> B{combine_data?} + + B -->|False| C[execute_timestamp_correction] + B -->|True| D[execute_timestamp_correction] + + C --> E[execute_zscore] + + D --> F[check_storeslistfile] + F --> G[combineData] + G --> H[execute_zscore] + + E --> I[Output: z_score, dff, cntrl_sig_fit HDF5 files] + H --> I + + style A fill:#e1f5ff + style I fill:#d4edda +``` + +## Main Processing Paths + +### Entry Point +**`extractTsAndSignal(inputParameters)`** (line 1178) is the main entry point called by the GUI or API. + +### Path 1: Normal Processing (combine_data = False) +1. `execute_timestamp_correction()` → Correct timestamps and align data +2. `execute_zscore()` → Compute z-scores and ΔF/F + +### Path 2: Combined Data Processing (combine_data = True) +1. `execute_timestamp_correction()` → Correct timestamps for each file +2. `check_storeslistfile()` → Merge store lists from multiple files +3. `combineData()` → Combine data from multiple recording sessions +4. 
`execute_zscore()` → Compute z-scores and ΔF/F on combined data + +## Detailed Processing Stages + +### Stage 1: Timestamp Correction + +```mermaid +flowchart LR + A[Raw HDF5 files] --> B[Read storesList.csv] + B --> C{isosbestic_control?} + C -->|No| D[add_control_channel] + C -->|Yes| E[timestampCorrection_tdt/csv] + D --> E + E --> F[Eliminate first N seconds] + F --> G[decide_naming_convention_and_applyCorrection] + G --> H[applyCorrection for each store] + H --> I{isosbestic_control?} + I -->|No| J[create_control_channel via curve fitting] + I -->|Yes| K[timeCorrection_*.hdf5 files] + J --> K + + style A fill:#e1f5ff + style K fill:#d4edda +``` + +#### Function: `execute_timestamp_correction(folderNames, inputParameters)` + +**Input:** +- Raw HDF5 files from extractors: `control_*.hdf5`, `signal_*.hdf5`, `event_*.hdf5` + +**Process:** +1. For each session folder: + - Read `storesList.csv` (mapping of raw names to semantic names) + - If no isosbestic control: `add_control_channel()` creates placeholder control files + - **`timestampCorrection_tdt()`** or **`timestampCorrection_csv()`**: + - Eliminates first N seconds (`timeForLightsTurnOn`) + - For TDT: expands timestamps from block timestamps + sampling rate + - For CSV: uses timestamps as-is + - Writes `timeCorrection_*.hdf5` with keys: `timestampNew`, `correctionIndex`, `sampling_rate` + - **`decide_naming_convention_and_applyCorrection()`**: + - For each store, calls `applyCorrection()` to crop data using `correctionIndex` + - For control/signal channels: crops data arrays + - For event channels: subtracts time offset from timestamps + - If no isosbestic control: **`create_control_channel()`** generates synthetic control via curve fitting + +**Output:** +- Timestamp-corrected HDF5 files with trimmed data +- `timeCorrection_*.hdf5` files containing corrected timestamps + +### Stage 2: Z-Score Computation + +```mermaid +flowchart TD + A[Timestamp-corrected HDF5] --> B[compute_z_score] + B --> C{removeArtifacts?} + + C -->|No| D[helper_z_score: full data] + C -->|Yes| E[helper_z_score: chunk-by-chunk] + + D --> F[filterSignal] + E --> F + + F --> G[controlFit: linear regression] + G --> H[deltaFF: compute ΔF/F] + H --> I[z_score_computation] + + I --> J{removeArtifacts?} + + J -->|No| K[Write z_score, dff, cntrl_sig_fit] + J -->|Yes| L{artifactsRemovalMethod?} + + L -->|concatenate| M[processTimestampsForArtifacts] + L -->|NaN| N[addingNaNtoChunksWithArtifacts] + + M --> K + N --> K + + K --> O[visualizeControlAndSignal] + + style A fill:#e1f5ff + style K fill:#d4edda + style O fill:#fff3cd +``` + +#### Function: `execute_zscore(folderNames, inputParameters)` + +**Input:** +- Timestamp-corrected HDF5 files + +**Process:** +1. 
For each output folder: + + **`compute_z_score(filepath, inputParameters)`**: + - For each control/signal pair: + - **`helper_z_score(control, signal, filepath, name, inputParameters)`**: + + **Without artifacts removal:** + - `execute_controlFit_dff()`: Filter signals → fit control to signal → compute ΔF/F + - `z_score_computation()`: Compute z-score from ΔF/F + + **With artifacts removal:** + - For each user-selected chunk (from `coordsForPreProcessing_*.npy`): + - If no isosbestic: `helper_create_control_channel()` creates synthetic control + - `execute_controlFit_dff()` on chunk + - Concatenate or NaN-fill between chunks + - `z_score_computation()` on processed data + + - Writes: `z_score_*.hdf5`, `dff_*.hdf5`, `cntrl_sig_fit_*.hdf5` + + **If artifacts removal with concatenate method:** + - **`processTimestampsForArtifacts()`**: + - `eliminateData()`: Concatenates good chunks, adjusts timestamps to be continuous + - `eliminateTs()`: Aligns event timestamps with new timeline + - Overwrites data files with concatenated versions + + **If artifacts removal with NaN method:** + - **`addingNaNtoChunksWithArtifacts()`**: + - `addingNaNValues()`: Replaces bad chunks with NaN + - `removeTTLs()`: Filters event timestamps to keep only valid times + + - **`visualizeControlAndSignal()`**: Plots control, signal, cntrl_sig_fit for QC + +**Output:** +- `z_score_*.hdf5` (z-scored signal) +- `dff_*.hdf5` (ΔF/F) +- `cntrl_sig_fit_*.hdf5` (fitted control channel) + +## Key Data Transformations + +### Signal Processing Pipeline + +```mermaid +flowchart LR + A[Raw Signal] --> B[filterSignal: Moving Average] + C[Raw Control] --> D[filterSignal: Moving Average] + + B --> E[controlFit: Linear Regression] + D --> E + + E --> F[control_fit = p0*control + p1] + F --> G[deltaFF] + + B --> G + + G --> H[ΔF/F = signal - control_fit / control_fit * 100] + H --> I[z_score_computation] + + I --> J{zscore_method?} + J -->|standard| K[z = ΔF/F - mean / std] + J -->|baseline| L[z = ΔF/F - baseline_mean / baseline_std] + J -->|robust| M[z = 0.6745 * ΔF/F - median / MAD] + + K --> N[Z-Score Output] + L --> N + M --> N + + style A fill:#e1f5ff + style C fill:#e1f5ff + style N fill:#d4edda +``` + +### Transformation Functions + +1. **`filterSignal(filter_window, signal)`** (line 822) + - Applies moving average filter with configurable window + - Uses `scipy.signal.filtfilt` for zero-phase filtering + +2. **`controlFit(control, signal)`** (line 815) + - Linear regression: fits control to signal + - Returns: `fitted_control = p[0] * control + p[1]` + +3. **`deltaFF(signal, control)`** (line 804) + - Formula: `((signal - control) / control) * 100` + - Computes normalized fluorescence change + +4. 
**`z_score_computation(dff, timestamps, inputParameters)`** (line 853) + - **Standard z-score:** `(ΔF/F - mean(ΔF/F)) / std(ΔF/F)` + - **Baseline z-score:** `(ΔF/F - mean(baseline)) / std(baseline)` + - **Robust z-score:** `0.6745 * (ΔF/F - median) / MAD` + +## Artifact Removal Workflow + +### Interactive Artifact Selection + +The `visualize()` function (line 469) provides an interactive matplotlib plot: +- **Space key:** Mark artifact boundary (vertical line drawn) +- **'d' key:** Delete last marked boundary +- **Close plot:** Save coordinates to `coordsForPreProcessing_*.npy` + +### Two Removal Methods + +**Concatenate Method:** +- Removes artifact chunks completely +- Concatenates good chunks end-to-end +- Adjusts timestamps to be continuous +- Event timestamps realigned to new timeline + +**NaN Method:** +- Replaces artifact chunks with NaN values +- Preserves original timeline +- Filters out event timestamps in artifact regions + +## Supporting Functions + +### Control Channel Creation + +**`helper_create_control_channel(signal, timestamps, window)`** (line 69) +- Used when no isosbestic control is available +- Applies Savitzky-Golay filter to signal +- Fits to exponential function: `f(x) = a + b * exp(-(1/c) * x)` +- Returns synthetic control channel + +### Data Combination + +**`combineData(folderNames, inputParameters, storesList)`** (line 1084) +- Merges data from multiple recording sessions +- Validates that sampling rates match across sessions +- Calls `processTimestampsForCombiningData()` to align timelines +- Saves combined data to first output folder + +### Coordinate Fetching + +**`fetchCoords(filepath, naming, data)`** (line 610) +- Reads `coordsForPreProcessing_*.npy` (artifact boundary coordinates) +- If file doesn't exist: uses `[0, data[-1]]` (entire recording) +- Validates even number of coordinates (pairs of boundaries) +- Returns reshaped array of coordinate pairs + +## File I/O Summary + +### Files Read + +| File Pattern | Content | Source | +|-------------|---------|--------| +| `control_*.hdf5` | Control channel data | Extractors (Step 3) | +| `signal_*.hdf5` | Signal channel data | Extractors (Step 3) | +| `event_*.hdf5` | Event timestamps | Extractors (Step 3) | +| `storesList.csv` | Channel name mapping | Step 2 | +| `coordsForPreProcessing_*.npy` | Artifact boundaries | User selection (optional) | + +### Files Written + +| File Pattern | Content | Keys | +|-------------|---------|------| +| `timeCorrection_*.hdf5` | Corrected timestamps | `timestampNew`, `correctionIndex`, `sampling_rate`, `timeRecStart` (TDT only) | +| `z_score_*.hdf5` | Z-scored signal | `data` | +| `dff_*.hdf5` | ΔF/F signal | `data` | +| `cntrl_sig_fit_*.hdf5` | Fitted control | `data` | +| `event_*_*.hdf5` | Corrected event timestamps | `ts` | + +## Key Parameters from inputParameters + +| Parameter | Purpose | Default/Options | +|-----------|---------|-----------------| +| `timeForLightsTurnOn` | Seconds to eliminate from start | 1 | +| `filter_window` | Moving average window size | 100 | +| `isosbestic_control` | Use isosbestic control channel? | True/False | +| `removeArtifacts` | Enable artifact removal? | True/False | +| `artifactsRemovalMethod` | How to handle artifacts | "concatenate" / "NaN" | +| `zscore_method` | Z-score computation method | "standard z-score" / "baseline z-score" / "robust z-score" | +| `baselineWindowStart` | Baseline window start (seconds) | 0 | +| `baselineWindowEnd` | Baseline window end (seconds) | 0 | +| `combine_data` | Combine multiple recordings? 
| True/False | + +## Architecture Notes for Refactoring + +### Current Coupling Issues + +1. **GUI Progress Tracking:** `writeToFile()` writes to `~/pbSteps.txt` for progress bar updates (lines 36-38, 1042, 1171, 1203, 1208, 1220) +2. **Interactive Plotting:** `visualize()` requires user interaction (matplotlib event handlers) +3. **File Path Assumptions:** Hard-coded path patterns (`*_output_*`, naming conventions) +4. **Mixed Responsibilities:** Single functions handle both computation and I/O + +### Recommended Separation Points + +**Backend Analysis Layer Should Include:** +- `filterSignal()` - pure signal processing +- `controlFit()` - pure regression +- `deltaFF()` - pure computation +- `z_score_computation()` - pure statistical computation +- `helper_create_control_channel()` - algorithmic control generation +- Core timestamp correction logic (separated from I/O) +- Core artifact removal logic (separated from I/O) + +**Data I/O Layer Should Include:** +- `read_hdf5()`, `write_hdf5()` - file operations +- Store list reading/writing +- Coordinate file handling +- HDF5 file discovery and path management + +**Frontend Visualization Layer Should Include:** +- `visualize()` - interactive artifact selection +- `visualizeControlAndSignal()` - QC plots +- `visualize_z_score()`, `visualize_dff()` - result visualization +- Progress tracking callbacks (replace `writeToFile()`) + +### Potential Refactoring Strategy + +1. **Extract pure computation functions** into a `signal_processing` module +2. **Create data models** (dataclasses) for: + - TimeCorrectionResult + - ProcessedSignal (with z_score, dff, control_fit) + - ArtifactRegions +3. **Separate I/O operations** into `io_utils` module with consistent interfaces +4. **Create processing pipelines** that accept data objects, return data objects +5. **Move visualization to separate module** with callbacks for progress/interaction +6. **Use dependency injection** for progress callbacks instead of hard-coded file writes From eadb22f62670ffd10301ae85eb08060c45f6a133 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 5 Dec 2025 13:26:52 -0800 Subject: [PATCH 062/125] Organized step 4 analysis functions into various conceptual sub-steps. 
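For orientation, the pure numerical helpers now live in `guppy/analysis/z_score.py`. Below is a minimal sketch of how they compose, using synthetic NumPy arrays in place of GuPPy's HDF5-backed channels; the sampling rate, filter window, and parameter values are illustrative assumptions, while the module path and function names are the ones introduced in this patch.

```python
import numpy as np

from guppy.analysis.z_score import execute_controlFit_dff, z_score_computation

# Illustrative stand-ins for the HDF5-backed control/signal channels.
fs = 100.0                                # assumed sampling rate (Hz)
timestamps = np.arange(0, 60, 1 / fs)     # 60 s of samples
control = 1.0 + 0.05 * np.random.randn(timestamps.size)
signal = 1.2 + 0.3 * np.exp(-timestamps / 30) + 0.05 * np.random.randn(timestamps.size)

# Smooth both channels, fit control to signal, and compute dF/F (in %).
dff, control_fit = execute_controlFit_dff(control, signal, isosbestic_control=True, filter_window=100)

# Standard z-score of dF/F; the baseline window keys must be present in the
# parameter dict but are only used by the "baseline z-score" method.
params = {"zscore_method": "standard z-score", "baselineWindowStart": 0, "baselineWindowEnd": 0}
zscore = z_score_computation(dff, timestamps, params)
```
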
--- src/guppy/analysis/analysis.py | 268 ---------- src/guppy/analysis/artifact_removal.py | 200 ++++++++ src/guppy/analysis/combine_data.py | 398 ++++++++++++++ src/guppy/analysis/control_channel.py | 42 ++ src/guppy/analysis/io_utils.py | 23 + src/guppy/analysis/timestamp_correction.py | 302 +++++++++++ src/guppy/analysis/z_score.py | 234 +++++++++ src/guppy/preprocess.py | 570 +-------------------- 8 files changed, 1213 insertions(+), 824 deletions(-) delete mode 100644 src/guppy/analysis/analysis.py create mode 100644 src/guppy/analysis/artifact_removal.py create mode 100644 src/guppy/analysis/combine_data.py create mode 100644 src/guppy/analysis/control_channel.py create mode 100644 src/guppy/analysis/timestamp_correction.py create mode 100644 src/guppy/analysis/z_score.py diff --git a/src/guppy/analysis/analysis.py b/src/guppy/analysis/analysis.py deleted file mode 100644 index 4ec8960..0000000 --- a/src/guppy/analysis/analysis.py +++ /dev/null @@ -1,268 +0,0 @@ -import logging - -import numpy as np -from scipy import signal as ss -from scipy.optimize import curve_fit - -from .io_utils import fetchCoords, read_hdf5 - -logger = logging.getLogger(__name__) - - -# Category: Analysis -# Reason: Pure mathematical function for exponential curve fitting - no dependencies, pure computation -# curve fit exponential function -def curveFitFn(x, a, b, c): - return a + (b * np.exp(-(1 / c) * x)) - - -# Category: Analysis -# Reason: Pure algorithmic function - applies Savitzky-Golay filter and curve fitting to generate synthetic control channel -# helper function to create control channel using signal channel -# by curve fitting signal channel to exponential function -# when there is no isosbestic control channel is present -def helper_create_control_channel(signal, timestamps, window): - # check if window is greater than signal shape - if window > signal.shape[0]: - window = ((signal.shape[0] + 1) / 2) + 1 - if window % 2 != 0: - window = window - else: - window = window + 1 - - filtered_signal = ss.savgol_filter(signal, window_length=window, polyorder=3) - - p0 = [5, 50, 60] - - try: - popt, pcov = curve_fit(curveFitFn, timestamps, filtered_signal, p0) - except Exception as e: - logger.error(str(e)) - - # logger.info('Curve Fit Parameters : ', popt) - control = curveFitFn(timestamps, *popt) - - return control - - -# Category: Analysis -# Reason: Data validation function - compares array lengths and returns indices for processing -# function to check control and signal channel has same length -# if not, take a smaller length and do pre-processing -def check_cntrl_sig_length(filepath, channels_arr, storenames, storesList): - - indices = [] - for i in range(channels_arr.shape[1]): - idx_c = np.where(storesList == channels_arr[0, i])[0] - idx_s = np.where(storesList == channels_arr[1, i])[0] - control = read_hdf5(storenames[idx_c[0]], filepath, "data") - signal = read_hdf5(storenames[idx_s[0]], filepath, "data") - if control.shape[0] < signal.shape[0]: - indices.append(storesList[idx_c[0]]) - elif control.shape[0] > signal.shape[0]: - indices.append(storesList[idx_s[0]]) - else: - indices.append(storesList[idx_s[0]]) - - return indices - - -# Category: Analysis -# Reason: Pure algorithmic function - concatenates data chunks based on coordinate boundaries, adjusts timestamps mathematically -# helper function to process control and signal timestamps -def eliminateData(filepath, timeForLightsTurnOn, event, sampling_rate, naming): - - ts = read_hdf5("timeCorrection_" + naming, filepath, "timestampNew") 
- data = read_hdf5(event, filepath, "data").reshape(-1) - coords = fetchCoords(filepath, naming, ts) - - if (data == 0).all() == True: - data = np.zeros(ts.shape[0]) - - arr = np.array([]) - ts_arr = np.array([]) - for i in range(coords.shape[0]): - - index = np.where((ts > coords[i, 0]) & (ts < coords[i, 1]))[0] - - if len(arr) == 0: - arr = np.concatenate((arr, data[index])) - sub = ts[index][0] - timeForLightsTurnOn - new_ts = ts[index] - sub - ts_arr = np.concatenate((ts_arr, new_ts)) - else: - temp = data[index] - # new = temp + (arr[-1]-temp[0]) - temp_ts = ts[index] - new_ts = temp_ts - (temp_ts[0] - ts_arr[-1]) - arr = np.concatenate((arr, temp)) - ts_arr = np.concatenate((ts_arr, new_ts + (1 / sampling_rate))) - - # logger.info(arr.shape, ts_arr.shape) - return arr, ts_arr - - -# Category: Analysis -# Reason: Pure algorithmic function - processes event timestamps based on coordinate boundaries, aligns with data timeline -# helper function to align event timestamps with the control and signal timestamps -def eliminateTs(filepath, timeForLightsTurnOn, event, sampling_rate, naming): - - tsNew = read_hdf5("timeCorrection_" + naming, filepath, "timestampNew") - ts = read_hdf5(event + "_" + naming, filepath, "ts").reshape(-1) - coords = fetchCoords(filepath, naming, tsNew) - - ts_arr = np.array([]) - tsNew_arr = np.array([]) - for i in range(coords.shape[0]): - tsNew_index = np.where((tsNew > coords[i, 0]) & (tsNew < coords[i, 1]))[0] - ts_index = np.where((ts > coords[i, 0]) & (ts < coords[i, 1]))[0] - - if len(tsNew_arr) == 0: - sub = tsNew[tsNew_index][0] - timeForLightsTurnOn - tsNew_arr = np.concatenate((tsNew_arr, tsNew[tsNew_index] - sub)) - ts_arr = np.concatenate((ts_arr, ts[ts_index] - sub)) - else: - temp_tsNew = tsNew[tsNew_index] - temp_ts = ts[ts_index] - new_ts = temp_ts - (temp_tsNew[0] - tsNew_arr[-1]) - new_tsNew = temp_tsNew - (temp_tsNew[0] - tsNew_arr[-1]) - tsNew_arr = np.concatenate((tsNew_arr, new_tsNew + (1 / sampling_rate))) - ts_arr = np.concatenate((ts_arr, new_ts + (1 / sampling_rate))) - - return ts_arr - - -# Category: Analysis -# Reason: Pure algorithmic function - replaces specified data chunks with NaN based on coordinate boundaries -# adding nan values to removed chunks -# when using artifacts removal method - replace with NaN -def addingNaNValues(filepath, event, naming): - - ts = read_hdf5("timeCorrection_" + naming, filepath, "timestampNew") - data = read_hdf5(event, filepath, "data").reshape(-1) - coords = fetchCoords(filepath, naming, ts) - - if (data == 0).all() == True: - data = np.zeros(ts.shape[0]) - - arr = np.array([]) - ts_index = np.arange(ts.shape[0]) - for i in range(coords.shape[0]): - - index = np.where((ts > coords[i, 0]) & (ts < coords[i, 1]))[0] - arr = np.concatenate((arr, index)) - - nan_indices = list(set(ts_index).symmetric_difference(arr)) - data[nan_indices] = np.nan - - return data - - -# Category: Analysis -# Reason: Pure algorithmic function - filters event timestamps to exclude artifact regions based on coordinates -# remove event TTLs which falls in the removed chunks -# when using artifacts removal method - replace with NaN -def removeTTLs(filepath, event, naming): - tsNew = read_hdf5("timeCorrection_" + naming, filepath, "timestampNew") - ts = read_hdf5(event + "_" + naming, filepath, "ts").reshape(-1) - coords = fetchCoords(filepath, naming, tsNew) - - ts_arr = np.array([]) - for i in range(coords.shape[0]): - ts_index = np.where((ts > coords[i, 0]) & (ts < coords[i, 1]))[0] - ts_arr = np.concatenate((ts_arr, 
ts[ts_index])) - - return ts_arr - - -# Category: Analysis -# Reason: Pure mathematical computation - calculates deltaF/F from signal and control arrays with simple formula -# function to compute deltaF/F using fitted control channel and filtered signal channel -def deltaFF(signal, control): - - res = np.subtract(signal, control) - normData = np.divide(res, control) - # deltaFF = normData - normData = normData * 100 - - return normData - - -# Category: Analysis -# Reason: Pure algorithmic function - performs polynomial linear regression to fit control to signal -# function to fit control channel to signal channel -def controlFit(control, signal): - - p = np.polyfit(control, signal, 1) - arr = (p[0] * control) + p[1] - return arr - - -# Category: Analysis -# Reason: Pure signal processing function - applies moving average filter using scipy filtfilt -def filterSignal(filter_window, signal): - if filter_window == 0: - return signal - elif filter_window > 1: - b = np.divide(np.ones((filter_window,)), filter_window) - a = 1 - filtered_signal = ss.filtfilt(b, a, signal) - return filtered_signal - else: - raise Exception("Moving average filter window value is not correct.") - - -# Category: Routing -# Reason: Orchestrates signal processing pipeline - sequences calls to filterSignal, controlFit, and deltaFF with conditional logic -# function to filter control and signal channel, also execute above two function : controlFit and deltaFF -# function will also take care if there is only signal channel and no control channel -# if there is only signal channel, z-score will be computed using just signal channel -def execute_controlFit_dff(control, signal, isosbestic_control, filter_window): - - if isosbestic_control == False: - signal_smooth = filterSignal(filter_window, signal) # ss.filtfilt(b, a, signal) - control_fit = controlFit(control, signal_smooth) - norm_data = deltaFF(signal_smooth, control_fit) - else: - control_smooth = filterSignal(filter_window, control) # ss.filtfilt(b, a, control) - signal_smooth = filterSignal(filter_window, signal) # ss.filtfilt(b, a, signal) - control_fit = controlFit(control_smooth, signal_smooth) - norm_data = deltaFF(signal_smooth, control_fit) - - return norm_data, control_fit - - -# Category: Analysis -# Reason: Pure statistical computation - calculates z-score from deltaF/F using different methods (standard, baseline, robust) -# function to compute z-score based on z-score computation method -def z_score_computation(dff, timestamps, inputParameters): - - zscore_method = inputParameters["zscore_method"] - baseline_start, baseline_end = inputParameters["baselineWindowStart"], inputParameters["baselineWindowEnd"] - - if zscore_method == "standard z-score": - numerator = np.subtract(dff, np.nanmean(dff)) - zscore = np.divide(numerator, np.nanstd(dff)) - elif zscore_method == "baseline z-score": - idx = np.where((timestamps > baseline_start) & (timestamps < baseline_end))[0] - if idx.shape[0] == 0: - logger.error( - "Baseline Window Parameters for baseline z-score computation zscore_method \ - are not correct." - ) - raise Exception( - "Baseline Window Parameters for baseline z-score computation zscore_method \ - are not correct." 
- ) - else: - baseline_mean = np.nanmean(dff[idx]) - baseline_std = np.nanstd(dff[idx]) - numerator = np.subtract(dff, baseline_mean) - zscore = np.divide(numerator, baseline_std) - else: - median = np.median(dff) - mad = np.median(np.abs(dff - median)) - numerator = 0.6745 * (dff - median) - zscore = np.divide(numerator, mad) - - return zscore diff --git a/src/guppy/analysis/artifact_removal.py b/src/guppy/analysis/artifact_removal.py new file mode 100644 index 0000000..3c51830 --- /dev/null +++ b/src/guppy/analysis/artifact_removal.py @@ -0,0 +1,200 @@ +import logging +import os + +import numpy as np + +from .io_utils import ( + decide_naming_convention, + fetchCoords, + read_hdf5, + write_hdf5, +) + +logger = logging.getLogger(__name__) + + +# Category: Routing +# Reason: Orchestrates NaN replacement for all stores - loops through channels and coordinates calls to addingNaNValues and removeTTLs +def addingNaNtoChunksWithArtifacts(filepath, events): + + logger.debug("Replacing chunks with artifacts by NaN values.") + storesList = events[1, :] + + path = decide_naming_convention(filepath) + + for j in range(path.shape[1]): + name_1 = ((os.path.basename(path[0, j])).split(".")[0]).split("_") + name_2 = ((os.path.basename(path[1, j])).split(".")[0]).split("_") + # dirname = os.path.dirname(path[i]) + if name_1[-1] == name_2[-1]: + name = name_1[-1] + sampling_rate = read_hdf5("timeCorrection_" + name, filepath, "sampling_rate")[0] + for i in range(len(storesList)): + if ( + "control_" + name.lower() in storesList[i].lower() + or "signal_" + name.lower() in storesList[i].lower() + ): # changes done + data = addingNaNValues(filepath, storesList[i], name) + write_hdf5(data, storesList[i], filepath, "data") + else: + if "control" in storesList[i].lower() or "signal" in storesList[i].lower(): + continue + else: + ts = removeTTLs(filepath, storesList[i], name) + write_hdf5(ts, storesList[i] + "_" + name, filepath, "ts") + + else: + logger.error("Error in naming convention of files or Error in storesList file") + raise Exception("Error in naming convention of files or Error in storesList file") + logger.info("Chunks with artifacts are replaced by NaN values.") + + +# Category: Routing +# Reason: Orchestrates timestamp concatenation for artifact removal - loops through stores, coordinates eliminateData/eliminateTs calls and writes results +# main function to align timestamps for control, signal and event timestamps for artifacts removal +def processTimestampsForArtifacts(filepath, timeForLightsTurnOn, events): + + logger.debug("Processing timestamps to get rid of artifacts using concatenate method...") + storesList = events[1, :] + + path = decide_naming_convention(filepath) + + timestamp_dict = dict() + for j in range(path.shape[1]): + name_1 = ((os.path.basename(path[0, j])).split(".")[0]).split("_") + name_2 = ((os.path.basename(path[1, j])).split(".")[0]).split("_") + # dirname = os.path.dirname(path[i]) + if name_1[-1] == name_2[-1]: + name = name_1[-1] + sampling_rate = read_hdf5("timeCorrection_" + name, filepath, "sampling_rate")[0] + + for i in range(len(storesList)): + if ( + "control_" + name.lower() in storesList[i].lower() + or "signal_" + name.lower() in storesList[i].lower() + ): # changes done + data, timestampNew = eliminateData( + filepath, timeForLightsTurnOn, storesList[i], sampling_rate, name + ) + write_hdf5(data, storesList[i], filepath, "data") + else: + if "control" in storesList[i].lower() or "signal" in storesList[i].lower(): + continue + else: + ts = 
eliminateTs(filepath, timeForLightsTurnOn, storesList[i], sampling_rate, name) + write_hdf5(ts, storesList[i] + "_" + name, filepath, "ts") + + # timestamp_dict[name] = timestampNew + write_hdf5(timestampNew, "timeCorrection_" + name, filepath, "timestampNew") + else: + logger.error("Error in naming convention of files or Error in storesList file") + raise Exception("Error in naming convention of files or Error in storesList file") + logger.info("Timestamps processed, artifacts are removed and good chunks are concatenated.") + + +# Category: Analysis +# Reason: Pure algorithmic function - concatenates data chunks based on coordinate boundaries, adjusts timestamps mathematically +# helper function to process control and signal timestamps +def eliminateData(filepath, timeForLightsTurnOn, event, sampling_rate, naming): + + ts = read_hdf5("timeCorrection_" + naming, filepath, "timestampNew") + data = read_hdf5(event, filepath, "data").reshape(-1) + coords = fetchCoords(filepath, naming, ts) + + if (data == 0).all() == True: + data = np.zeros(ts.shape[0]) + + arr = np.array([]) + ts_arr = np.array([]) + for i in range(coords.shape[0]): + + index = np.where((ts > coords[i, 0]) & (ts < coords[i, 1]))[0] + + if len(arr) == 0: + arr = np.concatenate((arr, data[index])) + sub = ts[index][0] - timeForLightsTurnOn + new_ts = ts[index] - sub + ts_arr = np.concatenate((ts_arr, new_ts)) + else: + temp = data[index] + # new = temp + (arr[-1]-temp[0]) + temp_ts = ts[index] + new_ts = temp_ts - (temp_ts[0] - ts_arr[-1]) + arr = np.concatenate((arr, temp)) + ts_arr = np.concatenate((ts_arr, new_ts + (1 / sampling_rate))) + + # logger.info(arr.shape, ts_arr.shape) + return arr, ts_arr + + +# Category: Analysis +# Reason: Pure algorithmic function - processes event timestamps based on coordinate boundaries, aligns with data timeline +# helper function to align event timestamps with the control and signal timestamps +def eliminateTs(filepath, timeForLightsTurnOn, event, sampling_rate, naming): + + tsNew = read_hdf5("timeCorrection_" + naming, filepath, "timestampNew") + ts = read_hdf5(event + "_" + naming, filepath, "ts").reshape(-1) + coords = fetchCoords(filepath, naming, tsNew) + + ts_arr = np.array([]) + tsNew_arr = np.array([]) + for i in range(coords.shape[0]): + tsNew_index = np.where((tsNew > coords[i, 0]) & (tsNew < coords[i, 1]))[0] + ts_index = np.where((ts > coords[i, 0]) & (ts < coords[i, 1]))[0] + + if len(tsNew_arr) == 0: + sub = tsNew[tsNew_index][0] - timeForLightsTurnOn + tsNew_arr = np.concatenate((tsNew_arr, tsNew[tsNew_index] - sub)) + ts_arr = np.concatenate((ts_arr, ts[ts_index] - sub)) + else: + temp_tsNew = tsNew[tsNew_index] + temp_ts = ts[ts_index] + new_ts = temp_ts - (temp_tsNew[0] - tsNew_arr[-1]) + new_tsNew = temp_tsNew - (temp_tsNew[0] - tsNew_arr[-1]) + tsNew_arr = np.concatenate((tsNew_arr, new_tsNew + (1 / sampling_rate))) + ts_arr = np.concatenate((ts_arr, new_ts + (1 / sampling_rate))) + + return ts_arr + + +# Category: Analysis +# Reason: Pure algorithmic function - replaces specified data chunks with NaN based on coordinate boundaries +# adding nan values to removed chunks +# when using artifacts removal method - replace with NaN +def addingNaNValues(filepath, event, naming): + + ts = read_hdf5("timeCorrection_" + naming, filepath, "timestampNew") + data = read_hdf5(event, filepath, "data").reshape(-1) + coords = fetchCoords(filepath, naming, ts) + + if (data == 0).all() == True: + data = np.zeros(ts.shape[0]) + + arr = np.array([]) + ts_index = np.arange(ts.shape[0]) + 
for i in range(coords.shape[0]): + + index = np.where((ts > coords[i, 0]) & (ts < coords[i, 1]))[0] + arr = np.concatenate((arr, index)) + + nan_indices = list(set(ts_index).symmetric_difference(arr)) + data[nan_indices] = np.nan + + return data + + +# Category: Analysis +# Reason: Pure algorithmic function - filters event timestamps to exclude artifact regions based on coordinates +# remove event TTLs which falls in the removed chunks +# when using artifacts removal method - replace with NaN +def removeTTLs(filepath, event, naming): + tsNew = read_hdf5("timeCorrection_" + naming, filepath, "timestampNew") + ts = read_hdf5(event + "_" + naming, filepath, "ts").reshape(-1) + coords = fetchCoords(filepath, naming, tsNew) + + ts_arr = np.array([]) + for i in range(coords.shape[0]): + ts_index = np.where((ts > coords[i, 0]) & (ts < coords[i, 1]))[0] + ts_arr = np.concatenate((ts_arr, ts[ts_index])) + + return ts_arr diff --git a/src/guppy/analysis/combine_data.py b/src/guppy/analysis/combine_data.py new file mode 100644 index 0000000..29e4b9d --- /dev/null +++ b/src/guppy/analysis/combine_data.py @@ -0,0 +1,398 @@ +# TODO: remove redundant function implementations such as eliminateData, eliminateTs, read_hdf5, et cetera. + +import fnmatch +import glob +import logging +import os +import re + +import numpy as np + +from .io_utils import ( + get_all_stores_for_combining_data, + read_hdf5, + takeOnlyDirs, +) + +logger = logging.getLogger(__name__) + + +# Category: Routing +# Reason: Orchestrates data combination workflow - validates sampling rates, coordinates processTimestampsForCombiningData, manages multi-session I/O +# function to combine data when there are two different data files for the same recording session +# it will combine the data, do timestamps processing and save the combined data in the first output folder. 
+def combineData(folderNames, inputParameters, storesList): + + logger.debug("Combining Data from different data files...") + timeForLightsTurnOn = inputParameters["timeForLightsTurnOn"] + op_folder = [] + for i in range(len(folderNames)): + filepath = folderNames[i] + op_folder.append(takeOnlyDirs(glob.glob(os.path.join(filepath, "*_output_*")))) + + op_folder = list(np.concatenate(op_folder).flatten()) + sampling_rate_fp = [] + for i in range(len(folderNames)): + filepath = folderNames[i] + storesListPath = takeOnlyDirs(glob.glob(os.path.join(filepath, "*_output_*"))) + for j in range(len(storesListPath)): + filepath = storesListPath[j] + storesList_new = np.genfromtxt( + os.path.join(filepath, "storesList.csv"), dtype="str", delimiter="," + ).reshape(2, -1) + sampling_rate_fp.append(glob.glob(os.path.join(filepath, "timeCorrection_*"))) + + # check if sampling rate is same for both data + sampling_rate_fp = np.concatenate(sampling_rate_fp) + sampling_rate = [] + for i in range(sampling_rate_fp.shape[0]): + sampling_rate.append(read_hdf5("", sampling_rate_fp[i], "sampling_rate")) + + res = all(i == sampling_rate[0] for i in sampling_rate) + if res == False: + logger.error("To combine the data, sampling rate for both the data should be same.") + raise Exception("To combine the data, sampling rate for both the data should be same.") + + # get the output folders informatinos + op = get_all_stores_for_combining_data(op_folder) + + # processing timestamps for combining the data + processTimestampsForCombiningData(op, timeForLightsTurnOn, storesList, sampling_rate[0]) + logger.info("Data is combined from different data files.") + + return op + + +def find_files(path, glob_path, ignore_case=False): + rule = ( + re.compile(fnmatch.translate(glob_path), re.IGNORECASE) + if ignore_case + else re.compile(fnmatch.translate(glob_path)) + ) + no_bytes_path = os.listdir(os.path.expanduser(path)) + str_path = [] + + # converting byte object to string + for x in no_bytes_path: + try: + str_path.append(x.decode("utf-8")) + except: + str_path.append(x) + + return [os.path.join(path, n) for n in str_path if rule.match(n)] + + +def read_hdf5(event, filepath, key): + if event: + op = os.path.join(filepath, event + ".hdf5") + else: + op = filepath + + if os.path.exists(op): + with h5py.File(op, "r") as f: + arr = np.asarray(f[key]) + else: + raise Exception("{}.hdf5 file does not exist".format(event)) + + return arr + + +def write_hdf5(data, event, filepath, key): + op = os.path.join(filepath, event + ".hdf5") + + if not os.path.exists(op): + with h5py.File(op, "w") as f: + if type(data) is np.ndarray: + f.create_dataset(key, data=data, maxshape=(None,), chunks=True) + else: + f.create_dataset(key, data=data) + else: + with h5py.File(op, "r+") as f: + if key in list(f.keys()): + if type(data) is np.ndarray: + f[key].resize(data.shape) + arr = f[key] + arr[:] = data + else: + arr = f[key] + arr = data + else: + f.create_dataset(key, data=data, maxshape=(None,), chunks=True) + + +def decide_naming_convention(filepath): + path_1 = find_files(filepath, "control*", ignore_case=True) # glob.glob(os.path.join(filepath, 'control*')) + + path_2 = find_files(filepath, "signal*", ignore_case=True) # glob.glob(os.path.join(filepath, 'signal*')) + + path = sorted(path_1 + path_2, key=str.casefold) + + if len(path) % 2 != 0: + raise Exception("There are not equal number of Control and Signal data") + + path = np.asarray(path).reshape(2, -1) + + return path + + +def eliminateData(filepath, timeForLightsTurnOn, event, 
sampling_rate, naming): + + arr = np.array([]) + ts_arr = np.array([]) + for i in range(len(filepath)): + ts = read_hdf5("timeCorrection_" + naming, filepath[i], "timestampNew") + data = read_hdf5(event, filepath[i], "data").reshape(-1) + + # index = np.where((ts>coords[i,0]) & (tscoords[i,0]) & (ts signal.shape[0]: + window = ((signal.shape[0] + 1) / 2) + 1 + if window % 2 != 0: + window = window + else: + window = window + 1 + + filtered_signal = ss.savgol_filter(signal, window_length=window, polyorder=3) + + p0 = [5, 50, 60] + + try: + popt, pcov = curve_fit(curveFitFn, timestamps, filtered_signal, p0) + except Exception as e: + logger.error(str(e)) + + # logger.info('Curve Fit Parameters : ', popt) + control = curveFitFn(timestamps, *popt) + + return control + + +# Category: Analysis +# Reason: Pure mathematical function for exponential curve fitting - no dependencies, pure computation +# curve fit exponential function +def curveFitFn(x, a, b, c): + return a + (b * np.exp(-(1 / c) * x)) diff --git a/src/guppy/analysis/io_utils.py b/src/guppy/analysis/io_utils.py index 33b6650..999c190 100644 --- a/src/guppy/analysis/io_utils.py +++ b/src/guppy/analysis/io_utils.py @@ -161,3 +161,26 @@ def get_all_stores_for_combining_data(folderNames): op.append(temp) return op + + +# Category: Routing +# Reason: Orchestrates reading and merging storeslist files from multiple sessions - loops through folders and consolidates results +# for combining data, reading storeslist file from both data and create a new storeslist array +def check_storeslistfile(folderNames): + storesList = np.array([[], []]) + for i in range(len(folderNames)): + filepath = folderNames[i] + storesListPath = takeOnlyDirs(glob.glob(os.path.join(filepath, "*_output_*"))) + for j in range(len(storesListPath)): + filepath = storesListPath[j] + storesList = np.concatenate( + ( + storesList, + np.genfromtxt(os.path.join(filepath, "storesList.csv"), dtype="str", delimiter=",").reshape(2, -1), + ), + axis=1, + ) + + storesList = np.unique(storesList, axis=1) + + return storesList diff --git a/src/guppy/analysis/timestamp_correction.py b/src/guppy/analysis/timestamp_correction.py new file mode 100644 index 0000000..350dd5d --- /dev/null +++ b/src/guppy/analysis/timestamp_correction.py @@ -0,0 +1,302 @@ +import logging +import os +import shutil + +import numpy as np +import pandas as pd + +from .control_channel import helper_create_control_channel +from .io_utils import ( + check_TDT, + read_hdf5, + write_hdf5, +) + +logger = logging.getLogger(__name__) + + +# Category: Routing +# Reason: Orchestrates validation logic, file copying, and storesList updates - coordinates multiple operations and file manipulations +# function to add control channel when there is no +# isosbestic control channel and update the storeslist file +def add_control_channel(filepath, arr): + + storenames = arr[0, :] + storesList = np.char.lower(arr[1, :]) + + keep_control = np.array([]) + # check a case if there is isosbestic control channel present + for i in range(storesList.shape[0]): + if "control" in storesList[i].lower(): + name = storesList[i].split("_")[-1] + new_str = "signal_" + str(name).lower() + find_signal = [True for i in storesList if i == new_str] + if len(find_signal) > 1: + logger.error("Error in naming convention of files or Error in storesList file") + raise Exception("Error in naming convention of files or Error in storesList file") + if len(find_signal) == 0: + logger.error( + "Isosbectic control channel parameter is set to False and still \ + 
storeslist file shows there is control channel present" + ) + raise Exception( + "Isosbectic control channel parameter is set to False and still \ + storeslist file shows there is control channel present" + ) + else: + continue + + for i in range(storesList.shape[0]): + if "signal" in storesList[i].lower(): + name = storesList[i].split("_")[-1] + new_str = "control_" + str(name).lower() + find_signal = [True for i in storesList if i == new_str] + if len(find_signal) == 0: + src, dst = os.path.join(filepath, arr[0, i] + ".hdf5"), os.path.join( + filepath, "cntrl" + str(i) + ".hdf5" + ) + shutil.copyfile(src, dst) + arr = np.concatenate((arr, [["cntrl" + str(i)], ["control_" + str(arr[1, i].split("_")[-1])]]), axis=1) + + np.savetxt(os.path.join(filepath, "storesList.csv"), arr, delimiter=",", fmt="%s") + + return arr + + +# Category: Routing +# Reason: Orchestrates timestamp correction workflow - loops through stores, coordinates reading/writing, calls validation and correction logic +# function to correct timestamps after eliminating first few seconds of the data (for csv data) +def timestampCorrection_csv(filepath, timeForLightsTurnOn, storesList): + + logger.debug( + f"Correcting timestamps by getting rid of the first {timeForLightsTurnOn} seconds and convert timestamps to seconds" + ) + storenames = storesList[0, :] + storesList = storesList[1, :] + + arr = [] + for i in range(storesList.shape[0]): + if "control" in storesList[i].lower() or "signal" in storesList[i].lower(): + arr.append(storesList[i]) + + arr = sorted(arr, key=str.casefold) + try: + arr = np.asarray(arr).reshape(2, -1) + except: + logger.error("Error in saving stores list file or spelling mistake for control or signal") + raise Exception("Error in saving stores list file or spelling mistake for control or signal") + + indices = check_cntrl_sig_length(filepath, arr, storenames, storesList) + + for i in range(arr.shape[1]): + name_1 = arr[0, i].split("_")[-1] + name_2 = arr[1, i].split("_")[-1] + # dirname = os.path.dirname(path[i]) + idx = np.where(storesList == indices[i])[0] + + if idx.shape[0] == 0: + logger.error(f"{arr[0,i]} does not exist in the stores list file.") + raise Exception("{} does not exist in the stores list file.".format(arr[0, i])) + + timestamp = read_hdf5(storenames[idx][0], filepath, "timestamps") + sampling_rate = read_hdf5(storenames[idx][0], filepath, "sampling_rate") + + if name_1 == name_2: + correctionIndex = np.where(timestamp >= timeForLightsTurnOn)[0] + timestampNew = timestamp[correctionIndex] + write_hdf5(timestampNew, "timeCorrection_" + name_1, filepath, "timestampNew") + write_hdf5(correctionIndex, "timeCorrection_" + name_1, filepath, "correctionIndex") + write_hdf5(np.asarray(sampling_rate), "timeCorrection_" + name_1, filepath, "sampling_rate") + + else: + logger.error("Error in naming convention of files or Error in storesList file") + raise Exception("Error in naming convention of files or Error in storesList file") + + logger.info("Timestamps corrected and converted to seconds.") + + +# Category: Routing +# Reason: Orchestrates timestamp correction workflow for TDT format - loops through stores, coordinates timestamp expansion algorithm with I/O +# function to correct timestamps after eliminating first few seconds of the data (for TDT data) +def timestampCorrection_tdt(filepath, timeForLightsTurnOn, storesList): + + logger.debug( + f"Correcting timestamps by getting rid of the first {timeForLightsTurnOn} seconds and convert timestamps to seconds" + ) + storenames = 
storesList[0, :] + storesList = storesList[1, :] + + arr = [] + for i in range(storesList.shape[0]): + if "control" in storesList[i].lower() or "signal" in storesList[i].lower(): + arr.append(storesList[i]) + + arr = sorted(arr, key=str.casefold) + + try: + arr = np.asarray(arr).reshape(2, -1) + except: + logger.error("Error in saving stores list file or spelling mistake for control or signal") + raise Exception("Error in saving stores list file or spelling mistake for control or signal") + + indices = check_cntrl_sig_length(filepath, arr, storenames, storesList) + + for i in range(arr.shape[1]): + name_1 = arr[0, i].split("_")[-1] + name_2 = arr[1, i].split("_")[-1] + # dirname = os.path.dirname(path[i]) + idx = np.where(storesList == indices[i])[0] + + if idx.shape[0] == 0: + logger.error(f"{arr[0,i]} does not exist in the stores list file.") + raise Exception("{} does not exist in the stores list file.".format(arr[0, i])) + + timestamp = read_hdf5(storenames[idx][0], filepath, "timestamps") + npoints = read_hdf5(storenames[idx][0], filepath, "npoints") + sampling_rate = read_hdf5(storenames[idx][0], filepath, "sampling_rate") + + if name_1 == name_2: + timeRecStart = timestamp[0] + timestamps = np.subtract(timestamp, timeRecStart) + adder = np.arange(npoints) / sampling_rate + lengthAdder = adder.shape[0] + timestampNew = np.zeros((len(timestamps), lengthAdder)) + for i in range(lengthAdder): + timestampNew[:, i] = np.add(timestamps, adder[i]) + timestampNew = (timestampNew.T).reshape(-1, order="F") + correctionIndex = np.where(timestampNew >= timeForLightsTurnOn)[0] + timestampNew = timestampNew[correctionIndex] + + write_hdf5(np.asarray([timeRecStart]), "timeCorrection_" + name_1, filepath, "timeRecStart") + write_hdf5(timestampNew, "timeCorrection_" + name_1, filepath, "timestampNew") + write_hdf5(correctionIndex, "timeCorrection_" + name_1, filepath, "correctionIndex") + write_hdf5(np.asarray([sampling_rate]), "timeCorrection_" + name_1, filepath, "sampling_rate") + else: + logger.error("Error in naming convention of files or Error in storesList file") + raise Exception("Error in naming convention of files or Error in storesList file") + + logger.info("Timestamps corrected and converted to seconds.") + # return timeRecStart, correctionIndex, timestampNew + + +# Category: Routing +# Reason: Orchestrates naming validation and correction application - loops through channel pairs and delegates to applyCorrection +# function to check if naming convention was followed while saving storeslist file +# and apply timestamps correction using the function applyCorrection +def decide_naming_convention_and_applyCorrection(filepath, timeForLightsTurnOn, event, displayName, storesList): + + logger.debug("Applying correction of timestamps to the data and event timestamps") + storesList = storesList[1, :] + + arr = [] + for i in range(storesList.shape[0]): + if "control" in storesList[i].lower() or "signal" in storesList[i].lower(): + arr.append(storesList[i]) + + arr = sorted(arr, key=str.casefold) + arr = np.asarray(arr).reshape(2, -1) + + for i in range(arr.shape[1]): + name_1 = arr[0, i].split("_")[-1] + name_2 = arr[1, i].split("_")[-1] + # dirname = os.path.dirname(path[i]) + if name_1 == name_2: + applyCorrection(filepath, timeForLightsTurnOn, event, displayName, name_1) + else: + logger.error("Error in naming convention of files or Error in storesList file") + raise Exception("Error in naming convention of files or Error in storesList file") + + logger.info("Timestamps corrections applied to 
the data and event timestamps.") + + +# Category: Routing +# Reason: Orchestrates applying timestamp corrections - reads correction indices, applies different logic based on data type, writes results +# function to apply correction to control, signal and event timestamps +def applyCorrection(filepath, timeForLightsTurnOn, event, displayName, naming): + + cond = check_TDT(os.path.dirname(filepath)) + + if cond == True: + timeRecStart = read_hdf5("timeCorrection_" + naming, filepath, "timeRecStart")[0] + + timestampNew = read_hdf5("timeCorrection_" + naming, filepath, "timestampNew") + correctionIndex = read_hdf5("timeCorrection_" + naming, filepath, "correctionIndex") + + if "control" in displayName.lower() or "signal" in displayName.lower(): + split_name = displayName.split("_")[-1] + if split_name == naming: + pass + else: + correctionIndex = read_hdf5("timeCorrection_" + split_name, filepath, "correctionIndex") + arr = read_hdf5(event, filepath, "data") + if (arr == 0).all() == True: + arr = arr + else: + arr = arr[correctionIndex] + write_hdf5(arr, displayName, filepath, "data") + else: + arr = read_hdf5(event, filepath, "timestamps") + if cond == True: + res = (arr >= timeRecStart).all() + if res == True: + arr = np.subtract(arr, timeRecStart) + arr = np.subtract(arr, timeForLightsTurnOn) + else: + arr = np.subtract(arr, timeForLightsTurnOn) + else: + arr = np.subtract(arr, timeForLightsTurnOn) + write_hdf5(arr, displayName + "_" + naming, filepath, "ts") + + # if isosbestic_control==False and 'control' in displayName.lower(): + # control = create_control_channel(filepath, displayName) + # write_hdf5(control, displayName, filepath, 'data') + + +# Category: Routing +# Reason: Orchestrates reading HDF5 files, calling helper_create_control_channel, and writing results - coordinates I/O with computation +# main function to create control channel using +# signal channel and save it to a file +def create_control_channel(filepath, arr, window=5001): + + storenames = arr[0, :] + storesList = arr[1, :] + + for i in range(storesList.shape[0]): + event_name, event = storesList[i], storenames[i] + if "control" in event_name.lower() and "cntrl" in event.lower(): + logger.debug("Creating control channel from signal channel using curve-fitting") + name = event_name.split("_")[-1] + signal = read_hdf5("signal_" + name, filepath, "data") + timestampNew = read_hdf5("timeCorrection_" + name, filepath, "timestampNew") + sampling_rate = np.full(timestampNew.shape, np.nan) + sampling_rate[0] = read_hdf5("timeCorrection_" + name, filepath, "sampling_rate")[0] + + control = helper_create_control_channel(signal, timestampNew, window) + + write_hdf5(control, event_name, filepath, "data") + d = {"timestamps": timestampNew, "data": control, "sampling_rate": sampling_rate} + df = pd.DataFrame(d) + df.to_csv(os.path.join(os.path.dirname(filepath), event.lower() + ".csv"), index=False) + logger.info("Control channel from signal channel created using curve-fitting") + + +# Category: Analysis +# Reason: Data validation function - compares array lengths and returns indices for processing +# function to check control and signal channel has same length +# if not, take a smaller length and do pre-processing +def check_cntrl_sig_length(filepath, channels_arr, storenames, storesList): + + indices = [] + for i in range(channels_arr.shape[1]): + idx_c = np.where(storesList == channels_arr[0, i])[0] + idx_s = np.where(storesList == channels_arr[1, i])[0] + control = read_hdf5(storenames[idx_c[0]], filepath, "data") + signal = 
read_hdf5(storenames[idx_s[0]], filepath, "data") + if control.shape[0] < signal.shape[0]: + indices.append(storesList[idx_c[0]]) + elif control.shape[0] > signal.shape[0]: + indices.append(storesList[idx_s[0]]) + else: + indices.append(storesList[idx_s[0]]) + + return indices diff --git a/src/guppy/analysis/z_score.py b/src/guppy/analysis/z_score.py new file mode 100644 index 0000000..d8cc1bc --- /dev/null +++ b/src/guppy/analysis/z_score.py @@ -0,0 +1,234 @@ +import logging +import os + +import numpy as np +from scipy import signal as ss + +from .control_channel import helper_create_control_channel +from .io_utils import ( + fetchCoords, + find_files, + read_hdf5, + write_hdf5, +) + +logger = logging.getLogger(__name__) + + +# Category: Routing +# Reason: Orchestrates z-score computation for all channels in a session - loops through control/signal pairs, calls helper_z_score, writes results +# compute z-score and deltaF/F and save it to hdf5 file +def compute_z_score(filepath, inputParameters): + + logger.debug(f"Computing z-score for each of the data in {filepath}") + remove_artifacts = inputParameters["removeArtifacts"] + + path_1 = find_files(filepath, "control_*", ignore_case=True) # glob.glob(os.path.join(filepath, 'control*')) + path_2 = find_files(filepath, "signal_*", ignore_case=True) # glob.glob(os.path.join(filepath, 'signal*')) + + path = sorted(path_1 + path_2, key=str.casefold) + + b = np.divide(np.ones((100,)), 100) + a = 1 + + if len(path) % 2 != 0: + logger.error("There are not equal number of Control and Signal data") + raise Exception("There are not equal number of Control and Signal data") + + path = np.asarray(path).reshape(2, -1) + + for i in range(path.shape[1]): + name_1 = ((os.path.basename(path[0, i])).split(".")[0]).split("_") + name_2 = ((os.path.basename(path[1, i])).split(".")[0]).split("_") + # dirname = os.path.dirname(path[i]) + + if name_1[-1] == name_2[-1]: + name = name_1[-1] + control = read_hdf5("", path[0, i], "data").reshape(-1) + signal = read_hdf5("", path[1, i], "data").reshape(-1) + # control_smooth = ss.filtfilt(b, a, control) + # signal_smooth = ss.filtfilt(b, a, signal) + # _score, dff = helper_z_score(control_smooth, signal_smooth) + z_score, dff, control_fit = helper_z_score(control, signal, filepath, name, inputParameters) + if remove_artifacts == True: + write_hdf5(z_score, "z_score_" + name, filepath, "data") + write_hdf5(dff, "dff_" + name, filepath, "data") + write_hdf5(control_fit, "cntrl_sig_fit_" + name, filepath, "data") + else: + write_hdf5(z_score, "z_score_" + name, filepath, "data") + write_hdf5(dff, "dff_" + name, filepath, "data") + write_hdf5(control_fit, "cntrl_sig_fit_" + name, filepath, "data") + else: + logger.error("Error in naming convention of files or Error in storesList file") + raise Exception("Error in naming convention of files or Error in storesList file") + + logger.info(f"z-score for the data in {filepath} computed.") + + +# Category: Routing +# Reason: Orchestrates z-score computation for one channel - handles artifact removal logic, coordinates calls to execute_controlFit_dff and z_score_computation +# helper function to compute z-score and deltaF/F +def helper_z_score(control, signal, filepath, name, inputParameters): # helper_z_score(control_smooth, signal_smooth): + + removeArtifacts = inputParameters["removeArtifacts"] + artifactsRemovalMethod = inputParameters["artifactsRemovalMethod"] + filter_window = inputParameters["filter_window"] + + isosbestic_control = inputParameters["isosbestic_control"] + 
tsNew = read_hdf5("timeCorrection_" + name, filepath, "timestampNew") + coords_path = os.path.join(filepath, "coordsForPreProcessing_" + name + ".npy") + + logger.info("Remove Artifacts : ", removeArtifacts) + + if (control == 0).all() == True: + control = np.zeros(tsNew.shape[0]) + + z_score_arr = np.array([]) + norm_data_arr = np.full(tsNew.shape[0], np.nan) + control_fit_arr = np.full(tsNew.shape[0], np.nan) + temp_control_arr = np.full(tsNew.shape[0], np.nan) + + if removeArtifacts == True: + coords = fetchCoords(filepath, name, tsNew) + + # for artifacts removal, each chunk which was selected by user is being processed individually and then + # z-score is calculated + for i in range(coords.shape[0]): + tsNew_index = np.where((tsNew > coords[i, 0]) & (tsNew < coords[i, 1]))[0] + if isosbestic_control == False: + control_arr = helper_create_control_channel(signal[tsNew_index], tsNew[tsNew_index], window=101) + signal_arr = signal[tsNew_index] + norm_data, control_fit = execute_controlFit_dff( + control_arr, signal_arr, isosbestic_control, filter_window + ) + temp_control_arr[tsNew_index] = control_arr + if i < coords.shape[0] - 1: + blank_index = np.where((tsNew > coords[i, 1]) & (tsNew < coords[i + 1, 0]))[0] + temp_control_arr[blank_index] = np.full(blank_index.shape[0], np.nan) + else: + control_arr = control[tsNew_index] + signal_arr = signal[tsNew_index] + norm_data, control_fit = execute_controlFit_dff( + control_arr, signal_arr, isosbestic_control, filter_window + ) + norm_data_arr[tsNew_index] = norm_data + control_fit_arr[tsNew_index] = control_fit + + if artifactsRemovalMethod == "concatenate": + norm_data_arr = norm_data_arr[~np.isnan(norm_data_arr)] + control_fit_arr = control_fit_arr[~np.isnan(control_fit_arr)] + z_score = z_score_computation(norm_data_arr, tsNew, inputParameters) + z_score_arr = np.concatenate((z_score_arr, z_score)) + else: + tsNew_index = np.arange(tsNew.shape[0]) + norm_data, control_fit = execute_controlFit_dff(control, signal, isosbestic_control, filter_window) + z_score = z_score_computation(norm_data, tsNew, inputParameters) + z_score_arr = np.concatenate((z_score_arr, z_score)) + norm_data_arr[tsNew_index] = norm_data # np.concatenate((norm_data_arr, norm_data)) + control_fit_arr[tsNew_index] = control_fit # np.concatenate((control_fit_arr, control_fit)) + + # handle the case if there are chunks being cut in the front and the end + if isosbestic_control == False and removeArtifacts == True: + coords = coords.flatten() + # front chunk + idx = np.where((tsNew >= tsNew[0]) & (tsNew < coords[0]))[0] + temp_control_arr[idx] = np.full(idx.shape[0], np.nan) + # end chunk + idx = np.where((tsNew > coords[-1]) & (tsNew <= tsNew[-1]))[0] + temp_control_arr[idx] = np.full(idx.shape[0], np.nan) + write_hdf5(temp_control_arr, "control_" + name, filepath, "data") + + return z_score_arr, norm_data_arr, control_fit_arr + + +# Category: Routing +# Reason: Orchestrates signal processing pipeline - sequences calls to filterSignal, controlFit, and deltaFF with conditional logic +# function to filter control and signal channel, also execute above two function : controlFit and deltaFF +# function will also take care if there is only signal channel and no control channel +# if there is only signal channel, z-score will be computed using just signal channel +def execute_controlFit_dff(control, signal, isosbestic_control, filter_window): + + if isosbestic_control == False: + signal_smooth = filterSignal(filter_window, signal) # ss.filtfilt(b, a, signal) + control_fit = 
controlFit(control, signal_smooth) + norm_data = deltaFF(signal_smooth, control_fit) + else: + control_smooth = filterSignal(filter_window, control) # ss.filtfilt(b, a, control) + signal_smooth = filterSignal(filter_window, signal) # ss.filtfilt(b, a, signal) + control_fit = controlFit(control_smooth, signal_smooth) + norm_data = deltaFF(signal_smooth, control_fit) + + return norm_data, control_fit + + +# Category: Analysis +# Reason: Pure mathematical computation - calculates deltaF/F from signal and control arrays with simple formula +# function to compute deltaF/F using fitted control channel and filtered signal channel +def deltaFF(signal, control): + + res = np.subtract(signal, control) + normData = np.divide(res, control) + # deltaFF = normData + normData = normData * 100 + + return normData + + +# Category: Analysis +# Reason: Pure algorithmic function - performs polynomial linear regression to fit control to signal +# function to fit control channel to signal channel +def controlFit(control, signal): + + p = np.polyfit(control, signal, 1) + arr = (p[0] * control) + p[1] + return arr + + +# Category: Analysis +# Reason: Pure signal processing function - applies moving average filter using scipy filtfilt +def filterSignal(filter_window, signal): + if filter_window == 0: + return signal + elif filter_window > 1: + b = np.divide(np.ones((filter_window,)), filter_window) + a = 1 + filtered_signal = ss.filtfilt(b, a, signal) + return filtered_signal + else: + raise Exception("Moving average filter window value is not correct.") + + +# Category: Analysis +# Reason: Pure statistical computation - calculates z-score from deltaF/F using different methods (standard, baseline, robust) +# function to compute z-score based on z-score computation method +def z_score_computation(dff, timestamps, inputParameters): + + zscore_method = inputParameters["zscore_method"] + baseline_start, baseline_end = inputParameters["baselineWindowStart"], inputParameters["baselineWindowEnd"] + + if zscore_method == "standard z-score": + numerator = np.subtract(dff, np.nanmean(dff)) + zscore = np.divide(numerator, np.nanstd(dff)) + elif zscore_method == "baseline z-score": + idx = np.where((timestamps > baseline_start) & (timestamps < baseline_end))[0] + if idx.shape[0] == 0: + logger.error( + "Baseline Window Parameters for baseline z-score computation zscore_method \ + are not correct." + ) + raise Exception( + "Baseline Window Parameters for baseline z-score computation zscore_method \ + are not correct." 
+ ) + else: + baseline_mean = np.nanmean(dff[idx]) + baseline_std = np.nanstd(dff[idx]) + numerator = np.subtract(dff, baseline_mean) + zscore = np.divide(numerator, baseline_std) + else: + median = np.median(dff) + mad = np.median(np.abs(dff - median)) + numerator = 0.6745 * (dff - median) + zscore = np.divide(numerator, mad) + + return zscore diff --git a/src/guppy/preprocess.py b/src/guppy/preprocess.py index 69616d9..78f046a 100755 --- a/src/guppy/preprocess.py +++ b/src/guppy/preprocess.py @@ -2,34 +2,31 @@ import json import logging import os -import shutil import sys import matplotlib.pyplot as plt import numpy as np -import pandas as pd - -from .analysis.analysis import ( - addingNaNValues, - check_cntrl_sig_length, - eliminateData, - eliminateTs, - execute_controlFit_dff, - helper_create_control_channel, - removeTTLs, - z_score_computation, + +from .analysis.artifact_removal import ( + addingNaNtoChunksWithArtifacts, + processTimestampsForArtifacts, ) +from .analysis.combine_data import combineData from .analysis.io_utils import ( + check_storeslistfile, check_TDT, - decide_naming_convention, - fetchCoords, find_files, - get_all_stores_for_combining_data, read_hdf5, takeOnlyDirs, - write_hdf5, +) # Necessary for other modules that depend on preprocess.py +from .analysis.timestamp_correction import ( + add_control_channel, + create_control_channel, + decide_naming_convention_and_applyCorrection, + timestampCorrection_csv, + timestampCorrection_tdt, ) -from .combineDataFn import processTimestampsForCombiningData +from .analysis.z_score import compute_z_score logger = logging.getLogger(__name__) @@ -45,271 +42,6 @@ def writeToFile(value: str): file.write(value) -# Category: Routing -# Reason: Orchestrates reading HDF5 files, calling helper_create_control_channel, and writing results - coordinates I/O with computation -# main function to create control channel using -# signal channel and save it to a file -def create_control_channel(filepath, arr, window=5001): - - storenames = arr[0, :] - storesList = arr[1, :] - - for i in range(storesList.shape[0]): - event_name, event = storesList[i], storenames[i] - if "control" in event_name.lower() and "cntrl" in event.lower(): - logger.debug("Creating control channel from signal channel using curve-fitting") - name = event_name.split("_")[-1] - signal = read_hdf5("signal_" + name, filepath, "data") - timestampNew = read_hdf5("timeCorrection_" + name, filepath, "timestampNew") - sampling_rate = np.full(timestampNew.shape, np.nan) - sampling_rate[0] = read_hdf5("timeCorrection_" + name, filepath, "sampling_rate")[0] - - control = helper_create_control_channel(signal, timestampNew, window) - - write_hdf5(control, event_name, filepath, "data") - d = {"timestamps": timestampNew, "data": control, "sampling_rate": sampling_rate} - df = pd.DataFrame(d) - df.to_csv(os.path.join(os.path.dirname(filepath), event.lower() + ".csv"), index=False) - logger.info("Control channel from signal channel created using curve-fitting") - - -# Category: Routing -# Reason: Orchestrates validation logic, file copying, and storesList updates - coordinates multiple operations and file manipulations -# function to add control channel when there is no -# isosbestic control channel and update the storeslist file -def add_control_channel(filepath, arr): - - storenames = arr[0, :] - storesList = np.char.lower(arr[1, :]) - - keep_control = np.array([]) - # check a case if there is isosbestic control channel present - for i in range(storesList.shape[0]): - if "control" in 
storesList[i].lower(): - name = storesList[i].split("_")[-1] - new_str = "signal_" + str(name).lower() - find_signal = [True for i in storesList if i == new_str] - if len(find_signal) > 1: - logger.error("Error in naming convention of files or Error in storesList file") - raise Exception("Error in naming convention of files or Error in storesList file") - if len(find_signal) == 0: - logger.error( - "Isosbectic control channel parameter is set to False and still \ - storeslist file shows there is control channel present" - ) - raise Exception( - "Isosbectic control channel parameter is set to False and still \ - storeslist file shows there is control channel present" - ) - else: - continue - - for i in range(storesList.shape[0]): - if "signal" in storesList[i].lower(): - name = storesList[i].split("_")[-1] - new_str = "control_" + str(name).lower() - find_signal = [True for i in storesList if i == new_str] - if len(find_signal) == 0: - src, dst = os.path.join(filepath, arr[0, i] + ".hdf5"), os.path.join( - filepath, "cntrl" + str(i) + ".hdf5" - ) - shutil.copyfile(src, dst) - arr = np.concatenate((arr, [["cntrl" + str(i)], ["control_" + str(arr[1, i].split("_")[-1])]]), axis=1) - - np.savetxt(os.path.join(filepath, "storesList.csv"), arr, delimiter=",", fmt="%s") - - return arr - - -# Category: Routing -# Reason: Orchestrates timestamp correction workflow - loops through stores, coordinates reading/writing, calls validation and correction logic -# function to correct timestamps after eliminating first few seconds of the data (for csv data) -def timestampCorrection_csv(filepath, timeForLightsTurnOn, storesList): - - logger.debug( - f"Correcting timestamps by getting rid of the first {timeForLightsTurnOn} seconds and convert timestamps to seconds" - ) - storenames = storesList[0, :] - storesList = storesList[1, :] - - arr = [] - for i in range(storesList.shape[0]): - if "control" in storesList[i].lower() or "signal" in storesList[i].lower(): - arr.append(storesList[i]) - - arr = sorted(arr, key=str.casefold) - try: - arr = np.asarray(arr).reshape(2, -1) - except: - logger.error("Error in saving stores list file or spelling mistake for control or signal") - raise Exception("Error in saving stores list file or spelling mistake for control or signal") - - indices = check_cntrl_sig_length(filepath, arr, storenames, storesList) - - for i in range(arr.shape[1]): - name_1 = arr[0, i].split("_")[-1] - name_2 = arr[1, i].split("_")[-1] - # dirname = os.path.dirname(path[i]) - idx = np.where(storesList == indices[i])[0] - - if idx.shape[0] == 0: - logger.error(f"{arr[0,i]} does not exist in the stores list file.") - raise Exception("{} does not exist in the stores list file.".format(arr[0, i])) - - timestamp = read_hdf5(storenames[idx][0], filepath, "timestamps") - sampling_rate = read_hdf5(storenames[idx][0], filepath, "sampling_rate") - - if name_1 == name_2: - correctionIndex = np.where(timestamp >= timeForLightsTurnOn)[0] - timestampNew = timestamp[correctionIndex] - write_hdf5(timestampNew, "timeCorrection_" + name_1, filepath, "timestampNew") - write_hdf5(correctionIndex, "timeCorrection_" + name_1, filepath, "correctionIndex") - write_hdf5(np.asarray(sampling_rate), "timeCorrection_" + name_1, filepath, "sampling_rate") - - else: - logger.error("Error in naming convention of files or Error in storesList file") - raise Exception("Error in naming convention of files or Error in storesList file") - - logger.info("Timestamps corrected and converted to seconds.") - - -# Category: Routing -# Reason: 
Orchestrates timestamp correction workflow for TDT format - loops through stores, coordinates timestamp expansion algorithm with I/O -# function to correct timestamps after eliminating first few seconds of the data (for TDT data) -def timestampCorrection_tdt(filepath, timeForLightsTurnOn, storesList): - - logger.debug( - f"Correcting timestamps by getting rid of the first {timeForLightsTurnOn} seconds and convert timestamps to seconds" - ) - storenames = storesList[0, :] - storesList = storesList[1, :] - - arr = [] - for i in range(storesList.shape[0]): - if "control" in storesList[i].lower() or "signal" in storesList[i].lower(): - arr.append(storesList[i]) - - arr = sorted(arr, key=str.casefold) - - try: - arr = np.asarray(arr).reshape(2, -1) - except: - logger.error("Error in saving stores list file or spelling mistake for control or signal") - raise Exception("Error in saving stores list file or spelling mistake for control or signal") - - indices = check_cntrl_sig_length(filepath, arr, storenames, storesList) - - for i in range(arr.shape[1]): - name_1 = arr[0, i].split("_")[-1] - name_2 = arr[1, i].split("_")[-1] - # dirname = os.path.dirname(path[i]) - idx = np.where(storesList == indices[i])[0] - - if idx.shape[0] == 0: - logger.error(f"{arr[0,i]} does not exist in the stores list file.") - raise Exception("{} does not exist in the stores list file.".format(arr[0, i])) - - timestamp = read_hdf5(storenames[idx][0], filepath, "timestamps") - npoints = read_hdf5(storenames[idx][0], filepath, "npoints") - sampling_rate = read_hdf5(storenames[idx][0], filepath, "sampling_rate") - - if name_1 == name_2: - timeRecStart = timestamp[0] - timestamps = np.subtract(timestamp, timeRecStart) - adder = np.arange(npoints) / sampling_rate - lengthAdder = adder.shape[0] - timestampNew = np.zeros((len(timestamps), lengthAdder)) - for i in range(lengthAdder): - timestampNew[:, i] = np.add(timestamps, adder[i]) - timestampNew = (timestampNew.T).reshape(-1, order="F") - correctionIndex = np.where(timestampNew >= timeForLightsTurnOn)[0] - timestampNew = timestampNew[correctionIndex] - - write_hdf5(np.asarray([timeRecStart]), "timeCorrection_" + name_1, filepath, "timeRecStart") - write_hdf5(timestampNew, "timeCorrection_" + name_1, filepath, "timestampNew") - write_hdf5(correctionIndex, "timeCorrection_" + name_1, filepath, "correctionIndex") - write_hdf5(np.asarray([sampling_rate]), "timeCorrection_" + name_1, filepath, "sampling_rate") - else: - logger.error("Error in naming convention of files or Error in storesList file") - raise Exception("Error in naming convention of files or Error in storesList file") - - logger.info("Timestamps corrected and converted to seconds.") - # return timeRecStart, correctionIndex, timestampNew - - -# Category: Routing -# Reason: Orchestrates applying timestamp corrections - reads correction indices, applies different logic based on data type, writes results -# function to apply correction to control, signal and event timestamps -def applyCorrection(filepath, timeForLightsTurnOn, event, displayName, naming): - - cond = check_TDT(os.path.dirname(filepath)) - - if cond == True: - timeRecStart = read_hdf5("timeCorrection_" + naming, filepath, "timeRecStart")[0] - - timestampNew = read_hdf5("timeCorrection_" + naming, filepath, "timestampNew") - correctionIndex = read_hdf5("timeCorrection_" + naming, filepath, "correctionIndex") - - if "control" in displayName.lower() or "signal" in displayName.lower(): - split_name = displayName.split("_")[-1] - if split_name == naming: - 
pass - else: - correctionIndex = read_hdf5("timeCorrection_" + split_name, filepath, "correctionIndex") - arr = read_hdf5(event, filepath, "data") - if (arr == 0).all() == True: - arr = arr - else: - arr = arr[correctionIndex] - write_hdf5(arr, displayName, filepath, "data") - else: - arr = read_hdf5(event, filepath, "timestamps") - if cond == True: - res = (arr >= timeRecStart).all() - if res == True: - arr = np.subtract(arr, timeRecStart) - arr = np.subtract(arr, timeForLightsTurnOn) - else: - arr = np.subtract(arr, timeForLightsTurnOn) - else: - arr = np.subtract(arr, timeForLightsTurnOn) - write_hdf5(arr, displayName + "_" + naming, filepath, "ts") - - # if isosbestic_control==False and 'control' in displayName.lower(): - # control = create_control_channel(filepath, displayName) - # write_hdf5(control, displayName, filepath, 'data') - - -# Category: Routing -# Reason: Orchestrates naming validation and correction application - loops through channel pairs and delegates to applyCorrection -# function to check if naming convention was followed while saving storeslist file -# and apply timestamps correction using the function applyCorrection -def decide_naming_convention_and_applyCorrection(filepath, timeForLightsTurnOn, event, displayName, storesList): - - logger.debug("Applying correction of timestamps to the data and event timestamps") - storesList = storesList[1, :] - - arr = [] - for i in range(storesList.shape[0]): - if "control" in storesList[i].lower() or "signal" in storesList[i].lower(): - arr.append(storesList[i]) - - arr = sorted(arr, key=str.casefold) - arr = np.asarray(arr).reshape(2, -1) - - for i in range(arr.shape[1]): - name_1 = arr[0, i].split("_")[-1] - name_2 = arr[1, i].split("_")[-1] - # dirname = os.path.dirname(path[i]) - if name_1 == name_2: - applyCorrection(filepath, timeForLightsTurnOn, event, displayName, name_1) - else: - logger.error("Error in naming convention of files or Error in storesList file") - raise Exception("Error in naming convention of files or Error in storesList file") - - logger.info("Timestamps corrections applied to the data and event timestamps.") - - # Category: Visualization/User Input # Reason: Creates matplotlib plots to display z-score results - pure visualization with no computation # function to plot z_score @@ -485,211 +217,6 @@ def visualizeControlAndSignal(filepath, removeArtifacts): visualize(filepath, ts, control, signal, cntrl_sig_fit, plot_name, removeArtifacts) -# Category: Routing -# Reason: Orchestrates NaN replacement for all stores - loops through channels and coordinates calls to addingNaNValues and removeTTLs -def addingNaNtoChunksWithArtifacts(filepath, events): - - logger.debug("Replacing chunks with artifacts by NaN values.") - storesList = events[1, :] - - path = decide_naming_convention(filepath) - - for j in range(path.shape[1]): - name_1 = ((os.path.basename(path[0, j])).split(".")[0]).split("_") - name_2 = ((os.path.basename(path[1, j])).split(".")[0]).split("_") - # dirname = os.path.dirname(path[i]) - if name_1[-1] == name_2[-1]: - name = name_1[-1] - sampling_rate = read_hdf5("timeCorrection_" + name, filepath, "sampling_rate")[0] - for i in range(len(storesList)): - if ( - "control_" + name.lower() in storesList[i].lower() - or "signal_" + name.lower() in storesList[i].lower() - ): # changes done - data = addingNaNValues(filepath, storesList[i], name) - write_hdf5(data, storesList[i], filepath, "data") - else: - if "control" in storesList[i].lower() or "signal" in storesList[i].lower(): - continue - else: - 
ts = removeTTLs(filepath, storesList[i], name) - write_hdf5(ts, storesList[i] + "_" + name, filepath, "ts") - - else: - logger.error("Error in naming convention of files or Error in storesList file") - raise Exception("Error in naming convention of files or Error in storesList file") - logger.info("Chunks with artifacts are replaced by NaN values.") - - -# Category: Routing -# Reason: Orchestrates timestamp concatenation for artifact removal - loops through stores, coordinates eliminateData/eliminateTs calls and writes results -# main function to align timestamps for control, signal and event timestamps for artifacts removal -def processTimestampsForArtifacts(filepath, timeForLightsTurnOn, events): - - logger.debug("Processing timestamps to get rid of artifacts using concatenate method...") - storesList = events[1, :] - - path = decide_naming_convention(filepath) - - timestamp_dict = dict() - for j in range(path.shape[1]): - name_1 = ((os.path.basename(path[0, j])).split(".")[0]).split("_") - name_2 = ((os.path.basename(path[1, j])).split(".")[0]).split("_") - # dirname = os.path.dirname(path[i]) - if name_1[-1] == name_2[-1]: - name = name_1[-1] - sampling_rate = read_hdf5("timeCorrection_" + name, filepath, "sampling_rate")[0] - - for i in range(len(storesList)): - if ( - "control_" + name.lower() in storesList[i].lower() - or "signal_" + name.lower() in storesList[i].lower() - ): # changes done - data, timestampNew = eliminateData( - filepath, timeForLightsTurnOn, storesList[i], sampling_rate, name - ) - write_hdf5(data, storesList[i], filepath, "data") - else: - if "control" in storesList[i].lower() or "signal" in storesList[i].lower(): - continue - else: - ts = eliminateTs(filepath, timeForLightsTurnOn, storesList[i], sampling_rate, name) - write_hdf5(ts, storesList[i] + "_" + name, filepath, "ts") - - # timestamp_dict[name] = timestampNew - write_hdf5(timestampNew, "timeCorrection_" + name, filepath, "timestampNew") - else: - logger.error("Error in naming convention of files or Error in storesList file") - raise Exception("Error in naming convention of files or Error in storesList file") - logger.info("Timestamps processed, artifacts are removed and good chunks are concatenated.") - - -# Category: Routing -# Reason: Orchestrates z-score computation for one channel - handles artifact removal logic, coordinates calls to execute_controlFit_dff and z_score_computation -# helper function to compute z-score and deltaF/F -def helper_z_score(control, signal, filepath, name, inputParameters): # helper_z_score(control_smooth, signal_smooth): - - removeArtifacts = inputParameters["removeArtifacts"] - artifactsRemovalMethod = inputParameters["artifactsRemovalMethod"] - filter_window = inputParameters["filter_window"] - - isosbestic_control = inputParameters["isosbestic_control"] - tsNew = read_hdf5("timeCorrection_" + name, filepath, "timestampNew") - coords_path = os.path.join(filepath, "coordsForPreProcessing_" + name + ".npy") - - logger.info("Remove Artifacts : ", removeArtifacts) - - if (control == 0).all() == True: - control = np.zeros(tsNew.shape[0]) - - z_score_arr = np.array([]) - norm_data_arr = np.full(tsNew.shape[0], np.nan) - control_fit_arr = np.full(tsNew.shape[0], np.nan) - temp_control_arr = np.full(tsNew.shape[0], np.nan) - - if removeArtifacts == True: - coords = fetchCoords(filepath, name, tsNew) - - # for artifacts removal, each chunk which was selected by user is being processed individually and then - # z-score is calculated - for i in range(coords.shape[0]): - tsNew_index 
= np.where((tsNew > coords[i, 0]) & (tsNew < coords[i, 1]))[0] - if isosbestic_control == False: - control_arr = helper_create_control_channel(signal[tsNew_index], tsNew[tsNew_index], window=101) - signal_arr = signal[tsNew_index] - norm_data, control_fit = execute_controlFit_dff( - control_arr, signal_arr, isosbestic_control, filter_window - ) - temp_control_arr[tsNew_index] = control_arr - if i < coords.shape[0] - 1: - blank_index = np.where((tsNew > coords[i, 1]) & (tsNew < coords[i + 1, 0]))[0] - temp_control_arr[blank_index] = np.full(blank_index.shape[0], np.nan) - else: - control_arr = control[tsNew_index] - signal_arr = signal[tsNew_index] - norm_data, control_fit = execute_controlFit_dff( - control_arr, signal_arr, isosbestic_control, filter_window - ) - norm_data_arr[tsNew_index] = norm_data - control_fit_arr[tsNew_index] = control_fit - - if artifactsRemovalMethod == "concatenate": - norm_data_arr = norm_data_arr[~np.isnan(norm_data_arr)] - control_fit_arr = control_fit_arr[~np.isnan(control_fit_arr)] - z_score = z_score_computation(norm_data_arr, tsNew, inputParameters) - z_score_arr = np.concatenate((z_score_arr, z_score)) - else: - tsNew_index = np.arange(tsNew.shape[0]) - norm_data, control_fit = execute_controlFit_dff(control, signal, isosbestic_control, filter_window) - z_score = z_score_computation(norm_data, tsNew, inputParameters) - z_score_arr = np.concatenate((z_score_arr, z_score)) - norm_data_arr[tsNew_index] = norm_data # np.concatenate((norm_data_arr, norm_data)) - control_fit_arr[tsNew_index] = control_fit # np.concatenate((control_fit_arr, control_fit)) - - # handle the case if there are chunks being cut in the front and the end - if isosbestic_control == False and removeArtifacts == True: - coords = coords.flatten() - # front chunk - idx = np.where((tsNew >= tsNew[0]) & (tsNew < coords[0]))[0] - temp_control_arr[idx] = np.full(idx.shape[0], np.nan) - # end chunk - idx = np.where((tsNew > coords[-1]) & (tsNew <= tsNew[-1]))[0] - temp_control_arr[idx] = np.full(idx.shape[0], np.nan) - write_hdf5(temp_control_arr, "control_" + name, filepath, "data") - - return z_score_arr, norm_data_arr, control_fit_arr - - -# Category: Routing -# Reason: Orchestrates z-score computation for all channels in a session - loops through control/signal pairs, calls helper_z_score, writes results -# compute z-score and deltaF/F and save it to hdf5 file -def compute_z_score(filepath, inputParameters): - - logger.debug(f"Computing z-score for each of the data in {filepath}") - remove_artifacts = inputParameters["removeArtifacts"] - - path_1 = find_files(filepath, "control_*", ignore_case=True) # glob.glob(os.path.join(filepath, 'control*')) - path_2 = find_files(filepath, "signal_*", ignore_case=True) # glob.glob(os.path.join(filepath, 'signal*')) - - path = sorted(path_1 + path_2, key=str.casefold) - - b = np.divide(np.ones((100,)), 100) - a = 1 - - if len(path) % 2 != 0: - logger.error("There are not equal number of Control and Signal data") - raise Exception("There are not equal number of Control and Signal data") - - path = np.asarray(path).reshape(2, -1) - - for i in range(path.shape[1]): - name_1 = ((os.path.basename(path[0, i])).split(".")[0]).split("_") - name_2 = ((os.path.basename(path[1, i])).split(".")[0]).split("_") - # dirname = os.path.dirname(path[i]) - - if name_1[-1] == name_2[-1]: - name = name_1[-1] - control = read_hdf5("", path[0, i], "data").reshape(-1) - signal = read_hdf5("", path[1, i], "data").reshape(-1) - # control_smooth = ss.filtfilt(b, a, control) - # 
signal_smooth = ss.filtfilt(b, a, signal) - # _score, dff = helper_z_score(control_smooth, signal_smooth) - z_score, dff, control_fit = helper_z_score(control, signal, filepath, name, inputParameters) - if remove_artifacts == True: - write_hdf5(z_score, "z_score_" + name, filepath, "data") - write_hdf5(dff, "dff_" + name, filepath, "data") - write_hdf5(control_fit, "cntrl_sig_fit_" + name, filepath, "data") - else: - write_hdf5(z_score, "z_score_" + name, filepath, "data") - write_hdf5(dff, "dff_" + name, filepath, "data") - write_hdf5(control_fit, "cntrl_sig_fit_" + name, filepath, "data") - else: - logger.error("Error in naming convention of files or Error in storesList file") - raise Exception("Error in naming convention of files or Error in storesList file") - - logger.info(f"z-score for the data in {filepath} computed.") - - # Category: Routing # Reason: Top-level orchestrator for timestamp correction across all sessions - loops through folders, coordinates timestamp correction workflow # function to execute timestamps corrections using functions timestampCorrection and decide_naming_convention_and_applyCorrection @@ -731,75 +258,6 @@ def execute_timestamp_correction(folderNames, inputParameters): logger.info(f"Timestamps corrections finished for {filepath}") -# Category: Routing -# Reason: Orchestrates reading and merging storeslist files from multiple sessions - loops through folders and consolidates results -# for combining data, reading storeslist file from both data and create a new storeslist array -def check_storeslistfile(folderNames): - storesList = np.array([[], []]) - for i in range(len(folderNames)): - filepath = folderNames[i] - storesListPath = takeOnlyDirs(glob.glob(os.path.join(filepath, "*_output_*"))) - for j in range(len(storesListPath)): - filepath = storesListPath[j] - storesList = np.concatenate( - ( - storesList, - np.genfromtxt(os.path.join(filepath, "storesList.csv"), dtype="str", delimiter=",").reshape(2, -1), - ), - axis=1, - ) - - storesList = np.unique(storesList, axis=1) - - return storesList - - -# Category: Routing -# Reason: Orchestrates data combination workflow - validates sampling rates, coordinates processTimestampsForCombiningData, manages multi-session I/O -# function to combine data when there are two different data files for the same recording session -# it will combine the data, do timestamps processing and save the combined data in the first output folder. 
-def combineData(folderNames, inputParameters, storesList): - - logger.debug("Combining Data from different data files...") - timeForLightsTurnOn = inputParameters["timeForLightsTurnOn"] - op_folder = [] - for i in range(len(folderNames)): - filepath = folderNames[i] - op_folder.append(takeOnlyDirs(glob.glob(os.path.join(filepath, "*_output_*")))) - - op_folder = list(np.concatenate(op_folder).flatten()) - sampling_rate_fp = [] - for i in range(len(folderNames)): - filepath = folderNames[i] - storesListPath = takeOnlyDirs(glob.glob(os.path.join(filepath, "*_output_*"))) - for j in range(len(storesListPath)): - filepath = storesListPath[j] - storesList_new = np.genfromtxt( - os.path.join(filepath, "storesList.csv"), dtype="str", delimiter="," - ).reshape(2, -1) - sampling_rate_fp.append(glob.glob(os.path.join(filepath, "timeCorrection_*"))) - - # check if sampling rate is same for both data - sampling_rate_fp = np.concatenate(sampling_rate_fp) - sampling_rate = [] - for i in range(sampling_rate_fp.shape[0]): - sampling_rate.append(read_hdf5("", sampling_rate_fp[i], "sampling_rate")) - - res = all(i == sampling_rate[0] for i in sampling_rate) - if res == False: - logger.error("To combine the data, sampling rate for both the data should be same.") - raise Exception("To combine the data, sampling rate for both the data should be same.") - - # get the output folders informatinos - op = get_all_stores_for_combining_data(op_folder) - - # processing timestamps for combining the data - processTimestampsForCombiningData(op, timeForLightsTurnOn, storesList, sampling_rate[0]) - logger.info("Data is combined from different data files.") - - return op - - # Category: Routing # Reason: Top-level orchestrator for z-score computation and artifact removal - coordinates compute_z_score, artifact processing, and visualization calls # function to compute z-score and deltaF/F using functions : compute_z_score and/or processTimestampsForArtifacts From 29d5f9ac7f700957e2c0171e835c5201edb53442 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 5 Dec 2025 13:36:21 -0800 Subject: [PATCH 063/125] Removed categorization comments. 
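The removed annotations tagged each function as Routing (orchestration and file I/O), Analysis (pure computation), or Visualization/User Input. For reference, the Analysis core now living in guppy/analysis/z_score.py (filterSignal, controlFit, deltaFF, and the "standard z-score" branch of z_score_computation) reduces to the sketch below; the synthetic traces, the 100-sample window, and the variable names are illustrative only and not taken from the module.

    import numpy as np
    from scipy import signal as ss

    # Synthetic control/signal traces; shapes and values are placeholders.
    rng = np.random.default_rng(0)
    n = 5_000
    raw_signal = 1.0 + 0.05 * np.sin(np.linspace(0, 20, n)) + 0.01 * rng.standard_normal(n)
    raw_control = 0.9 + 0.01 * rng.standard_normal(n)

    # filterSignal: moving-average filter applied forward and backward (filter_window > 1).
    filter_window = 100
    b = np.ones(filter_window) / filter_window
    signal_smooth = ss.filtfilt(b, 1, raw_signal)
    control_smooth = ss.filtfilt(b, 1, raw_control)

    # controlFit: least-squares linear fit of the control channel onto the signal channel.
    p = np.polyfit(control_smooth, signal_smooth, 1)
    control_fit = p[0] * control_smooth + p[1]

    # deltaFF: percent deltaF/F of the smoothed signal relative to the fitted control.
    dff = 100.0 * (signal_smooth - control_fit) / control_fit

    # z_score_computation, "standard z-score" branch: mean/std normalisation of dF/F.
    z_score = (dff - np.nanmean(dff)) / np.nanstd(dff)
    print(z_score[:5])

The baseline and robust branches differ only in the centering and scaling statistics: the baseline-window mean and standard deviation, or the median and MAD with the 0.6745 factor.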
--- src/guppy/analysis/artifact_removal.py | 12 ------------ src/guppy/analysis/combine_data.py | 2 -- src/guppy/analysis/control_channel.py | 2 -- src/guppy/analysis/io_utils.py | 18 ------------------ src/guppy/analysis/timestamp_correction.py | 14 -------------- src/guppy/analysis/z_score.py | 14 -------------- src/guppy/preprocess.py | 18 ------------------ 7 files changed, 80 deletions(-) diff --git a/src/guppy/analysis/artifact_removal.py b/src/guppy/analysis/artifact_removal.py index 3c51830..ac483bb 100644 --- a/src/guppy/analysis/artifact_removal.py +++ b/src/guppy/analysis/artifact_removal.py @@ -13,8 +13,6 @@ logger = logging.getLogger(__name__) -# Category: Routing -# Reason: Orchestrates NaN replacement for all stores - loops through channels and coordinates calls to addingNaNValues and removeTTLs def addingNaNtoChunksWithArtifacts(filepath, events): logger.debug("Replacing chunks with artifacts by NaN values.") @@ -49,8 +47,6 @@ def addingNaNtoChunksWithArtifacts(filepath, events): logger.info("Chunks with artifacts are replaced by NaN values.") -# Category: Routing -# Reason: Orchestrates timestamp concatenation for artifact removal - loops through stores, coordinates eliminateData/eliminateTs calls and writes results # main function to align timestamps for control, signal and event timestamps for artifacts removal def processTimestampsForArtifacts(filepath, timeForLightsTurnOn, events): @@ -92,8 +88,6 @@ def processTimestampsForArtifacts(filepath, timeForLightsTurnOn, events): logger.info("Timestamps processed, artifacts are removed and good chunks are concatenated.") -# Category: Analysis -# Reason: Pure algorithmic function - concatenates data chunks based on coordinate boundaries, adjusts timestamps mathematically # helper function to process control and signal timestamps def eliminateData(filepath, timeForLightsTurnOn, event, sampling_rate, naming): @@ -127,8 +121,6 @@ def eliminateData(filepath, timeForLightsTurnOn, event, sampling_rate, naming): return arr, ts_arr -# Category: Analysis -# Reason: Pure algorithmic function - processes event timestamps based on coordinate boundaries, aligns with data timeline # helper function to align event timestamps with the control and signal timestamps def eliminateTs(filepath, timeForLightsTurnOn, event, sampling_rate, naming): @@ -157,8 +149,6 @@ def eliminateTs(filepath, timeForLightsTurnOn, event, sampling_rate, naming): return ts_arr -# Category: Analysis -# Reason: Pure algorithmic function - replaces specified data chunks with NaN based on coordinate boundaries # adding nan values to removed chunks # when using artifacts removal method - replace with NaN def addingNaNValues(filepath, event, naming): @@ -183,8 +173,6 @@ def addingNaNValues(filepath, event, naming): return data -# Category: Analysis -# Reason: Pure algorithmic function - filters event timestamps to exclude artifact regions based on coordinates # remove event TTLs which falls in the removed chunks # when using artifacts removal method - replace with NaN def removeTTLs(filepath, event, naming): diff --git a/src/guppy/analysis/combine_data.py b/src/guppy/analysis/combine_data.py index 29e4b9d..d8f0ce6 100644 --- a/src/guppy/analysis/combine_data.py +++ b/src/guppy/analysis/combine_data.py @@ -17,8 +17,6 @@ logger = logging.getLogger(__name__) -# Category: Routing -# Reason: Orchestrates data combination workflow - validates sampling rates, coordinates processTimestampsForCombiningData, manages multi-session I/O # function to combine data when there are two 
different data files for the same recording session # it will combine the data, do timestamps processing and save the combined data in the first output folder. def combineData(folderNames, inputParameters, storesList): diff --git a/src/guppy/analysis/control_channel.py b/src/guppy/analysis/control_channel.py index 96665f2..2da82e2 100644 --- a/src/guppy/analysis/control_channel.py +++ b/src/guppy/analysis/control_channel.py @@ -35,8 +35,6 @@ def helper_create_control_channel(signal, timestamps, window): return control -# Category: Analysis -# Reason: Pure mathematical function for exponential curve fitting - no dependencies, pure computation # curve fit exponential function def curveFitFn(x, a, b, c): return a + (b * np.exp(-(1 / c) * x)) diff --git a/src/guppy/analysis/io_utils.py b/src/guppy/analysis/io_utils.py index 999c190..8b10127 100644 --- a/src/guppy/analysis/io_utils.py +++ b/src/guppy/analysis/io_utils.py @@ -10,8 +10,6 @@ logger = logging.getLogger(__name__) -# Category: Analysis -# Reason: Utility function for path filtering - pure data transformation with no GUI or orchestration def takeOnlyDirs(paths): removePaths = [] for p in paths: @@ -20,8 +18,6 @@ def takeOnlyDirs(paths): return list(set(paths) - set(removePaths)) -# Category: Analysis -# Reason: File system utility for case-insensitive file discovery - pure I/O helper with no orchestration # find files by ignoring the case sensitivity def find_files(path, glob_path, ignore_case=False): rule = ( @@ -42,8 +38,6 @@ def find_files(path, glob_path, ignore_case=False): return [os.path.join(path, n) for n in str_path if rule.match(n)] -# Category: Analysis -# Reason: Simple file type detection utility - pure file system check with no orchestration # check if dealing with TDT files or csv files def check_TDT(filepath): path = glob.glob(os.path.join(filepath, "*.tsq")) @@ -53,8 +47,6 @@ def check_TDT(filepath): return False -# Category: Analysis -# Reason: I/O utility function for reading HDF5 files - pure file access with no business logic or orchestration # function to read hdf5 file def read_hdf5(event, filepath, key): if event: @@ -74,8 +66,6 @@ def read_hdf5(event, filepath, key): return arr -# Category: Analysis -# Reason: I/O utility function for writing HDF5 files - pure file access with no business logic or orchestration # function to write hdf5 file def write_hdf5(data, event, filepath, key): event = event.replace("\\", "_") @@ -108,8 +98,6 @@ def write_hdf5(data, event, filepath, key): f.create_dataset(key, data=data) -# Category: Analysis -# Reason: Validation utility - checks file naming conventions and returns structured path array with no orchestration # function to check if the naming convention for saving storeslist file was followed or not def decide_naming_convention(filepath): path_1 = find_files(filepath, "control_*", ignore_case=True) # glob.glob(os.path.join(filepath, 'control*')) @@ -126,8 +114,6 @@ def decide_naming_convention(filepath): return path -# Category: Analysis -# Reason: I/O utility that loads artifact coordinates from .npy file or provides default - pure file loading with simple logic # function to read coordinates file which was saved by selecting chunks for artifacts removal def fetchCoords(filepath, naming, data): @@ -147,8 +133,6 @@ def fetchCoords(filepath, naming, data): return coords -# Category: Routing -# Reason: Organizes output folders for data combination - loops through numbered outputs and groups related folders def get_all_stores_for_combining_data(folderNames): op = [] for i 
in range(100): @@ -163,8 +147,6 @@ def get_all_stores_for_combining_data(folderNames): return op -# Category: Routing -# Reason: Orchestrates reading and merging storeslist files from multiple sessions - loops through folders and consolidates results # for combining data, reading storeslist file from both data and create a new storeslist array def check_storeslistfile(folderNames): storesList = np.array([[], []]) diff --git a/src/guppy/analysis/timestamp_correction.py b/src/guppy/analysis/timestamp_correction.py index 350dd5d..2e3185a 100644 --- a/src/guppy/analysis/timestamp_correction.py +++ b/src/guppy/analysis/timestamp_correction.py @@ -15,8 +15,6 @@ logger = logging.getLogger(__name__) -# Category: Routing -# Reason: Orchestrates validation logic, file copying, and storesList updates - coordinates multiple operations and file manipulations # function to add control channel when there is no # isosbestic control channel and update the storeslist file def add_control_channel(filepath, arr): @@ -63,8 +61,6 @@ def add_control_channel(filepath, arr): return arr -# Category: Routing -# Reason: Orchestrates timestamp correction workflow - loops through stores, coordinates reading/writing, calls validation and correction logic # function to correct timestamps after eliminating first few seconds of the data (for csv data) def timestampCorrection_csv(filepath, timeForLightsTurnOn, storesList): @@ -115,8 +111,6 @@ def timestampCorrection_csv(filepath, timeForLightsTurnOn, storesList): logger.info("Timestamps corrected and converted to seconds.") -# Category: Routing -# Reason: Orchestrates timestamp correction workflow for TDT format - loops through stores, coordinates timestamp expansion algorithm with I/O # function to correct timestamps after eliminating first few seconds of the data (for TDT data) def timestampCorrection_tdt(filepath, timeForLightsTurnOn, storesList): @@ -179,8 +173,6 @@ def timestampCorrection_tdt(filepath, timeForLightsTurnOn, storesList): # return timeRecStart, correctionIndex, timestampNew -# Category: Routing -# Reason: Orchestrates naming validation and correction application - loops through channel pairs and delegates to applyCorrection # function to check if naming convention was followed while saving storeslist file # and apply timestamps correction using the function applyCorrection def decide_naming_convention_and_applyCorrection(filepath, timeForLightsTurnOn, event, displayName, storesList): @@ -209,8 +201,6 @@ def decide_naming_convention_and_applyCorrection(filepath, timeForLightsTurnOn, logger.info("Timestamps corrections applied to the data and event timestamps.") -# Category: Routing -# Reason: Orchestrates applying timestamp corrections - reads correction indices, applies different logic based on data type, writes results # function to apply correction to control, signal and event timestamps def applyCorrection(filepath, timeForLightsTurnOn, event, displayName, naming): @@ -252,8 +242,6 @@ def applyCorrection(filepath, timeForLightsTurnOn, event, displayName, naming): # write_hdf5(control, displayName, filepath, 'data') -# Category: Routing -# Reason: Orchestrates reading HDF5 files, calling helper_create_control_channel, and writing results - coordinates I/O with computation # main function to create control channel using # signal channel and save it to a file def create_control_channel(filepath, arr, window=5001): @@ -280,8 +268,6 @@ def create_control_channel(filepath, arr, window=5001): logger.info("Control channel from signal channel created using 
curve-fitting") -# Category: Analysis -# Reason: Data validation function - compares array lengths and returns indices for processing # function to check control and signal channel has same length # if not, take a smaller length and do pre-processing def check_cntrl_sig_length(filepath, channels_arr, storenames, storesList): diff --git a/src/guppy/analysis/z_score.py b/src/guppy/analysis/z_score.py index d8cc1bc..b5032be 100644 --- a/src/guppy/analysis/z_score.py +++ b/src/guppy/analysis/z_score.py @@ -15,8 +15,6 @@ logger = logging.getLogger(__name__) -# Category: Routing -# Reason: Orchestrates z-score computation for all channels in a session - loops through control/signal pairs, calls helper_z_score, writes results # compute z-score and deltaF/F and save it to hdf5 file def compute_z_score(filepath, inputParameters): @@ -65,8 +63,6 @@ def compute_z_score(filepath, inputParameters): logger.info(f"z-score for the data in {filepath} computed.") -# Category: Routing -# Reason: Orchestrates z-score computation for one channel - handles artifact removal logic, coordinates calls to execute_controlFit_dff and z_score_computation # helper function to compute z-score and deltaF/F def helper_z_score(control, signal, filepath, name, inputParameters): # helper_z_score(control_smooth, signal_smooth): @@ -141,8 +137,6 @@ def helper_z_score(control, signal, filepath, name, inputParameters): # helper_ return z_score_arr, norm_data_arr, control_fit_arr -# Category: Routing -# Reason: Orchestrates signal processing pipeline - sequences calls to filterSignal, controlFit, and deltaFF with conditional logic # function to filter control and signal channel, also execute above two function : controlFit and deltaFF # function will also take care if there is only signal channel and no control channel # if there is only signal channel, z-score will be computed using just signal channel @@ -161,8 +155,6 @@ def execute_controlFit_dff(control, signal, isosbestic_control, filter_window): return norm_data, control_fit -# Category: Analysis -# Reason: Pure mathematical computation - calculates deltaF/F from signal and control arrays with simple formula # function to compute deltaF/F using fitted control channel and filtered signal channel def deltaFF(signal, control): @@ -174,8 +166,6 @@ def deltaFF(signal, control): return normData -# Category: Analysis -# Reason: Pure algorithmic function - performs polynomial linear regression to fit control to signal # function to fit control channel to signal channel def controlFit(control, signal): @@ -184,8 +174,6 @@ def controlFit(control, signal): return arr -# Category: Analysis -# Reason: Pure signal processing function - applies moving average filter using scipy filtfilt def filterSignal(filter_window, signal): if filter_window == 0: return signal @@ -198,8 +186,6 @@ def filterSignal(filter_window, signal): raise Exception("Moving average filter window value is not correct.") -# Category: Analysis -# Reason: Pure statistical computation - calculates z-score from deltaF/F using different methods (standard, baseline, robust) # function to compute z-score based on z-score computation method def z_score_computation(dff, timestamps, inputParameters): diff --git a/src/guppy/preprocess.py b/src/guppy/preprocess.py index 78f046a..5ff8de6 100755 --- a/src/guppy/preprocess.py +++ b/src/guppy/preprocess.py @@ -35,15 +35,11 @@ plt.switch_backend("TKAgg") -# Category: Visualization/User Input -# Reason: Writes progress updates to file for GUI progress bar - couples backend to GUI 
feedback mechanism def writeToFile(value: str): with open(os.path.join(os.path.expanduser("~"), "pbSteps.txt"), "a") as file: file.write(value) -# Category: Visualization/User Input -# Reason: Creates matplotlib plots to display z-score results - pure visualization with no computation # function to plot z_score def visualize_z_score(filepath): @@ -66,8 +62,6 @@ def visualize_z_score(filepath): # plt.show() -# Category: Visualization/User Input -# Reason: Creates matplotlib plots to display deltaF/F results - pure visualization with no computation # function to plot deltaF/F def visualize_dff(filepath): name = os.path.basename(filepath) @@ -89,8 +83,6 @@ def visualize_dff(filepath): # plt.show() -# Category: Visualization/User Input -# Reason: Interactive matplotlib GUI with keyboard event handlers for artifact selection - core user input mechanism that saves coordinates to disk def visualize(filepath, x, y1, y2, y3, plot_name, removeArtifacts): # plotting control and signal data @@ -180,8 +172,6 @@ def plt_close_event(event): # return fig -# Category: Visualization/User Input -# Reason: Orchestrates visualization of all control/signal pairs - reads data and delegates to visualize() for user interaction # function to plot control and signal, also provide a feature to select chunks for artifacts removal def visualizeControlAndSignal(filepath, removeArtifacts): path_1 = find_files(filepath, "control_*", ignore_case=True) # glob.glob(os.path.join(filepath, 'control*')) @@ -217,8 +207,6 @@ def visualizeControlAndSignal(filepath, removeArtifacts): visualize(filepath, ts, control, signal, cntrl_sig_fit, plot_name, removeArtifacts) -# Category: Routing -# Reason: Top-level orchestrator for timestamp correction across all sessions - loops through folders, coordinates timestamp correction workflow # function to execute timestamps corrections using functions timestampCorrection and decide_naming_convention_and_applyCorrection def execute_timestamp_correction(folderNames, inputParameters): @@ -258,8 +246,6 @@ def execute_timestamp_correction(folderNames, inputParameters): logger.info(f"Timestamps corrections finished for {filepath}") -# Category: Routing -# Reason: Top-level orchestrator for z-score computation and artifact removal - coordinates compute_z_score, artifact processing, and visualization calls # function to compute z-score and deltaF/F using functions : compute_z_score and/or processTimestampsForArtifacts def execute_zscore(folderNames, inputParameters): @@ -312,8 +298,6 @@ def execute_zscore(folderNames, inputParameters): logger.info("Signal data and event timestamps are extracted.") -# Category: Routing -# Reason: Main entry point for Step 4 - orchestrates entire preprocessing workflow including timestamp correction, data combination, and z-score computation def extractTsAndSignal(inputParameters): logger.debug("Extracting signal data and event timestamps...") @@ -351,8 +335,6 @@ def extractTsAndSignal(inputParameters): execute_zscore(op_folder, inputParameters) -# Category: Routing -# Reason: Top-level entry point wrapper - handles error catching and calls extractTsAndSignal def main(input_parameters): try: extractTsAndSignal(input_parameters) From a9a65abf0b31e1aca2bc874efd6c4187c0801634 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 5 Dec 2025 15:00:15 -0800 Subject: [PATCH 064/125] Removed redundant fns --- src/guppy/analysis/combine_data.py | 251 +---------------------------- src/guppy/preprocess.py | 3 +- 2 files changed, 4 insertions(+), 250 deletions(-) diff --git 
a/src/guppy/analysis/combine_data.py b/src/guppy/analysis/combine_data.py index d8f0ce6..aa5a1dd 100644 --- a/src/guppy/analysis/combine_data.py +++ b/src/guppy/analysis/combine_data.py @@ -1,17 +1,17 @@ # TODO: remove redundant function implementations such as eliminateData, eliminateTs, read_hdf5, et cetera. -import fnmatch import glob import logging import os -import re import numpy as np from .io_utils import ( + decide_naming_convention, get_all_stores_for_combining_data, read_hdf5, takeOnlyDirs, + write_hdf5, ) logger = logging.getLogger(__name__) @@ -61,78 +61,6 @@ def combineData(folderNames, inputParameters, storesList): return op -def find_files(path, glob_path, ignore_case=False): - rule = ( - re.compile(fnmatch.translate(glob_path), re.IGNORECASE) - if ignore_case - else re.compile(fnmatch.translate(glob_path)) - ) - no_bytes_path = os.listdir(os.path.expanduser(path)) - str_path = [] - - # converting byte object to string - for x in no_bytes_path: - try: - str_path.append(x.decode("utf-8")) - except: - str_path.append(x) - - return [os.path.join(path, n) for n in str_path if rule.match(n)] - - -def read_hdf5(event, filepath, key): - if event: - op = os.path.join(filepath, event + ".hdf5") - else: - op = filepath - - if os.path.exists(op): - with h5py.File(op, "r") as f: - arr = np.asarray(f[key]) - else: - raise Exception("{}.hdf5 file does not exist".format(event)) - - return arr - - -def write_hdf5(data, event, filepath, key): - op = os.path.join(filepath, event + ".hdf5") - - if not os.path.exists(op): - with h5py.File(op, "w") as f: - if type(data) is np.ndarray: - f.create_dataset(key, data=data, maxshape=(None,), chunks=True) - else: - f.create_dataset(key, data=data) - else: - with h5py.File(op, "r+") as f: - if key in list(f.keys()): - if type(data) is np.ndarray: - f[key].resize(data.shape) - arr = f[key] - arr[:] = data - else: - arr = f[key] - arr = data - else: - f.create_dataset(key, data=data, maxshape=(None,), chunks=True) - - -def decide_naming_convention(filepath): - path_1 = find_files(filepath, "control*", ignore_case=True) # glob.glob(os.path.join(filepath, 'control*')) - - path_2 = find_files(filepath, "signal*", ignore_case=True) # glob.glob(os.path.join(filepath, 'signal*')) - - path = sorted(path_1 + path_2, key=str.casefold) - - if len(path) % 2 != 0: - raise Exception("There are not equal number of Control and Signal data") - - path = np.asarray(path).reshape(2, -1) - - return path - - def eliminateData(filepath, timeForLightsTurnOn, event, sampling_rate, naming): arr = np.array([]) @@ -219,178 +147,3 @@ def processTimestampsForCombiningData(filepath, timeForLightsTurnOn, events, sam else: ts = eliminateTs(filepath[k], timeForLightsTurnOn, storesList[i], sampling_rate, name) write_hdf5(ts, storesList[i] + "_" + name, filepath[k][0], "ts") - - -import h5py -import numpy as np - -logger = logging.getLogger(__name__) - - -def find_files(path, glob_path, ignore_case=False): - rule = ( - re.compile(fnmatch.translate(glob_path), re.IGNORECASE) - if ignore_case - else re.compile(fnmatch.translate(glob_path)) - ) - no_bytes_path = os.listdir(os.path.expanduser(path)) - str_path = [] - - # converting byte object to string - for x in no_bytes_path: - try: - str_path.append(x.decode("utf-8")) - except: - str_path.append(x) - - return [os.path.join(path, n) for n in str_path if rule.match(n)] - - -def read_hdf5(event, filepath, key): - if event: - op = os.path.join(filepath, event + ".hdf5") - else: - op = filepath - - if os.path.exists(op): - with h5py.File(op, 
"r") as f: - arr = np.asarray(f[key]) - else: - raise Exception("{}.hdf5 file does not exist".format(event)) - - return arr - - -def write_hdf5(data, event, filepath, key): - op = os.path.join(filepath, event + ".hdf5") - - if not os.path.exists(op): - with h5py.File(op, "w") as f: - if type(data) is np.ndarray: - f.create_dataset(key, data=data, maxshape=(None,), chunks=True) - else: - f.create_dataset(key, data=data) - else: - with h5py.File(op, "r+") as f: - if key in list(f.keys()): - if type(data) is np.ndarray: - f[key].resize(data.shape) - arr = f[key] - arr[:] = data - else: - arr = f[key] - arr = data - else: - f.create_dataset(key, data=data, maxshape=(None,), chunks=True) - - -def decide_naming_convention(filepath): - path_1 = find_files(filepath, "control*", ignore_case=True) # glob.glob(os.path.join(filepath, 'control*')) - - path_2 = find_files(filepath, "signal*", ignore_case=True) # glob.glob(os.path.join(filepath, 'signal*')) - - path = sorted(path_1 + path_2, key=str.casefold) - - if len(path) % 2 != 0: - raise Exception("There are not equal number of Control and Signal data") - - path = np.asarray(path).reshape(2, -1) - - return path - - -def eliminateData(filepath, timeForLightsTurnOn, event, sampling_rate, naming): - - arr = np.array([]) - ts_arr = np.array([]) - for i in range(len(filepath)): - ts = read_hdf5("timeCorrection_" + naming, filepath[i], "timestampNew") - data = read_hdf5(event, filepath[i], "data").reshape(-1) - - # index = np.where((ts>coords[i,0]) & (ts Date: Fri, 5 Dec 2025 15:19:35 -0800 Subject: [PATCH 065/125] Removed redundant fns --- src/guppy/analysis/combine_data.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/guppy/analysis/combine_data.py b/src/guppy/analysis/combine_data.py index aa5a1dd..f89315f 100644 --- a/src/guppy/analysis/combine_data.py +++ b/src/guppy/analysis/combine_data.py @@ -1,5 +1,3 @@ -# TODO: remove redundant function implementations such as eliminateData, eliminateTs, read_hdf5, et cetera. - import glob import logging import os From 1bb8de4a2df3544f656bc4f52c48c75c0f0b338e Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 5 Dec 2025 17:54:37 -0800 Subject: [PATCH 066/125] Peeled off read operations from timestamp_correction CSV function. 
--- src/guppy/analysis/timestamp_correction.py | 146 ++--- src/guppy/preprocess.py | 58 +- timestamp_correction_analysis.md | 723 +++++++++++++++++++++ 3 files changed, 851 insertions(+), 76 deletions(-) create mode 100644 timestamp_correction_analysis.md diff --git a/src/guppy/analysis/timestamp_correction.py b/src/guppy/analysis/timestamp_correction.py index 2e3185a..e179d26 100644 --- a/src/guppy/analysis/timestamp_correction.py +++ b/src/guppy/analysis/timestamp_correction.py @@ -1,6 +1,5 @@ import logging import os -import shutil import numpy as np import pandas as pd @@ -15,91 +14,37 @@ logger = logging.getLogger(__name__) -# function to add control channel when there is no -# isosbestic control channel and update the storeslist file -def add_control_channel(filepath, arr): - - storenames = arr[0, :] - storesList = np.char.lower(arr[1, :]) - - keep_control = np.array([]) - # check a case if there is isosbestic control channel present - for i in range(storesList.shape[0]): - if "control" in storesList[i].lower(): - name = storesList[i].split("_")[-1] - new_str = "signal_" + str(name).lower() - find_signal = [True for i in storesList if i == new_str] - if len(find_signal) > 1: - logger.error("Error in naming convention of files or Error in storesList file") - raise Exception("Error in naming convention of files or Error in storesList file") - if len(find_signal) == 0: - logger.error( - "Isosbectic control channel parameter is set to False and still \ - storeslist file shows there is control channel present" - ) - raise Exception( - "Isosbectic control channel parameter is set to False and still \ - storeslist file shows there is control channel present" - ) - else: - continue - - for i in range(storesList.shape[0]): - if "signal" in storesList[i].lower(): - name = storesList[i].split("_")[-1] - new_str = "control_" + str(name).lower() - find_signal = [True for i in storesList if i == new_str] - if len(find_signal) == 0: - src, dst = os.path.join(filepath, arr[0, i] + ".hdf5"), os.path.join( - filepath, "cntrl" + str(i) + ".hdf5" - ) - shutil.copyfile(src, dst) - arr = np.concatenate((arr, [["cntrl" + str(i)], ["control_" + str(arr[1, i].split("_")[-1])]]), axis=1) - - np.savetxt(os.path.join(filepath, "storesList.csv"), arr, delimiter=",", fmt="%s") - - return arr - - # function to correct timestamps after eliminating first few seconds of the data (for csv data) -def timestampCorrection_csv(filepath, timeForLightsTurnOn, storesList): - +def timestampCorrection_csv( + filepath, timeForLightsTurnOn, storesList, name_to_data, name_to_timestamps, name_to_sampling_rate +): logger.debug( f"Correcting timestamps by getting rid of the first {timeForLightsTurnOn} seconds and convert timestamps to seconds" ) storenames = storesList[0, :] - storesList = storesList[1, :] + names_for_storenames = storesList[1, :] + arr = get_control_and_signal_channel_names(storesList) - arr = [] - for i in range(storesList.shape[0]): - if "control" in storesList[i].lower() or "signal" in storesList[i].lower(): - arr.append(storesList[i]) - - arr = sorted(arr, key=str.casefold) - try: - arr = np.asarray(arr).reshape(2, -1) - except: - logger.error("Error in saving stores list file or spelling mistake for control or signal") - raise Exception("Error in saving stores list file or spelling mistake for control or signal") - - indices = check_cntrl_sig_length(filepath, arr, storenames, storesList) + indices = check_cntrl_sig_length(arr, name_to_data) for i in range(arr.shape[1]): name_1 = arr[0, i].split("_")[-1] name_2 
= arr[1, i].split("_")[-1] # dirname = os.path.dirname(path[i]) - idx = np.where(storesList == indices[i])[0] + idx = np.where(names_for_storenames == indices[i])[0] if idx.shape[0] == 0: logger.error(f"{arr[0,i]} does not exist in the stores list file.") raise Exception("{} does not exist in the stores list file.".format(arr[0, i])) - timestamp = read_hdf5(storenames[idx][0], filepath, "timestamps") - sampling_rate = read_hdf5(storenames[idx][0], filepath, "sampling_rate") + name = names_for_storenames[idx][0] + timestamp = name_to_timestamps[name] + sampling_rate = name_to_sampling_rate[name] if name_1 == name_2: correctionIndex = np.where(timestamp >= timeForLightsTurnOn)[0] timestampNew = timestamp[correctionIndex] + # TODO: Pull out write operations into preprocess.py write_hdf5(timestampNew, "timeCorrection_" + name_1, filepath, "timestampNew") write_hdf5(correctionIndex, "timeCorrection_" + name_1, filepath, "correctionIndex") write_hdf5(np.asarray(sampling_rate), "timeCorrection_" + name_1, filepath, "sampling_rate") @@ -270,19 +215,72 @@ def create_control_channel(filepath, arr, window=5001): # function to check control and signal channel has same length # if not, take a smaller length and do pre-processing -def check_cntrl_sig_length(filepath, channels_arr, storenames, storesList): +def check_cntrl_sig_length(channels_arr, name_to_data): indices = [] for i in range(channels_arr.shape[1]): - idx_c = np.where(storesList == channels_arr[0, i])[0] - idx_s = np.where(storesList == channels_arr[1, i])[0] - control = read_hdf5(storenames[idx_c[0]], filepath, "data") - signal = read_hdf5(storenames[idx_s[0]], filepath, "data") + control_name = channels_arr[0, i] + signal_name = channels_arr[1, i] + control = name_to_data[control_name] + signal = name_to_data[signal_name] if control.shape[0] < signal.shape[0]: - indices.append(storesList[idx_c[0]]) + indices.append(control_name) elif control.shape[0] > signal.shape[0]: - indices.append(storesList[idx_s[0]]) + indices.append(signal_name) else: - indices.append(storesList[idx_s[0]]) + indices.append(signal_name) return indices + + +def get_control_and_signal_channel_names(storesList): + storenames = storesList[0, :] + names_for_storenames = storesList[1, :] + + channels_arr = [] + for i in range(names_for_storenames.shape[0]): + if "control" in names_for_storenames[i].lower() or "signal" in names_for_storenames[i].lower(): + channels_arr.append(names_for_storenames[i]) + + channels_arr = sorted(channels_arr, key=str.casefold) + try: + channels_arr = np.asarray(channels_arr).reshape(2, -1) + except: + logger.error("Error in saving stores list file or spelling mistake for control or signal") + raise Exception("Error in saving stores list file or spelling mistake for control or signal") + + return channels_arr + + +def read_control_and_signal(filepath, storesList): + channels_arr = get_control_and_signal_channel_names(storesList) + storenames = storesList[0, :] + names_for_storenames = storesList[1, :] + + name_to_data = {} + name_to_timestamps = {} + name_to_sampling_rate = {} + + for i in range(channels_arr.shape[1]): + control_name = channels_arr[0, i] + signal_name = channels_arr[1, i] + idx_c = np.where(storesList == control_name)[0] + idx_s = np.where(storesList == signal_name)[0] + control_storename = storenames[idx_c[0]] + signal_storename = storenames[idx_s[0]] + + control_data = read_hdf5(control_storename, filepath, "data") + signal_data = read_hdf5(signal_storename, filepath, "data") + control_timestamps = 
read_hdf5(control_storename, filepath, "timestamps") + signal_timestamps = read_hdf5(signal_storename, filepath, "timestamps") + control_sampling_rate = read_hdf5(control_storename, filepath, "sampling_rate") + signal_sampling_rate = read_hdf5(signal_storename, filepath, "sampling_rate") + + name_to_data[control_name] = control_data + name_to_data[signal_name] = signal_data + name_to_timestamps[control_name] = control_timestamps + name_to_timestamps[signal_name] = signal_timestamps + name_to_sampling_rate[control_name] = control_sampling_rate + name_to_sampling_rate[signal_name] = signal_sampling_rate + + return name_to_data, name_to_timestamps, name_to_sampling_rate diff --git a/src/guppy/preprocess.py b/src/guppy/preprocess.py index 15c547f..74033f8 100755 --- a/src/guppy/preprocess.py +++ b/src/guppy/preprocess.py @@ -2,6 +2,7 @@ import json import logging import os +import shutil import sys import matplotlib.pyplot as plt @@ -21,9 +22,9 @@ takeOnlyDirs, ) from .analysis.timestamp_correction import ( - add_control_channel, create_control_channel, decide_naming_convention_and_applyCorrection, + read_control_and_signal, timestampCorrection_csv, timestampCorrection_tdt, ) @@ -208,6 +209,54 @@ def visualizeControlAndSignal(filepath, removeArtifacts): visualize(filepath, ts, control, signal, cntrl_sig_fit, plot_name, removeArtifacts) +# This function just creates placeholder Control-HDF5 files that are then immediately overwritten later on in the pipeline. +# TODO: Refactor this function to avoid unnecessary file creation. +# function to add control channel when there is no +# isosbestic control channel and update the storeslist file +def add_control_channel(filepath, arr): + + storenames = arr[0, :] + storesList = np.char.lower(arr[1, :]) + + keep_control = np.array([]) + # check a case if there is isosbestic control channel present + for i in range(storesList.shape[0]): + if "control" in storesList[i].lower(): + name = storesList[i].split("_")[-1] + new_str = "signal_" + str(name).lower() + find_signal = [True for i in storesList if i == new_str] + if len(find_signal) > 1: + logger.error("Error in naming convention of files or Error in storesList file") + raise Exception("Error in naming convention of files or Error in storesList file") + if len(find_signal) == 0: + logger.error( + "Isosbectic control channel parameter is set to False and still \ + storeslist file shows there is control channel present" + ) + raise Exception( + "Isosbectic control channel parameter is set to False and still \ + storeslist file shows there is control channel present" + ) + else: + continue + + for i in range(storesList.shape[0]): + if "signal" in storesList[i].lower(): + name = storesList[i].split("_")[-1] + new_str = "control_" + str(name).lower() + find_signal = [True for i in storesList if i == new_str] + if len(find_signal) == 0: + src, dst = os.path.join(filepath, arr[0, i] + ".hdf5"), os.path.join( + filepath, "cntrl" + str(i) + ".hdf5" + ) + shutil.copyfile(src, dst) + arr = np.concatenate((arr, [["cntrl" + str(i)], ["control_" + str(arr[1, i].split("_")[-1])]]), axis=1) + + np.savetxt(os.path.join(filepath, "storesList.csv"), arr, delimiter=",", fmt="%s") + + return arr + + # function to execute timestamps corrections using functions timestampCorrection and decide_naming_convention_and_applyCorrection def execute_timestamp_correction(folderNames, inputParameters): @@ -231,7 +280,12 @@ def execute_timestamp_correction(folderNames, inputParameters): if cond == True: timestampCorrection_tdt(filepath, 
timeForLightsTurnOn, storesList) else: - timestampCorrection_csv(filepath, timeForLightsTurnOn, storesList) + + control_and_signal_dicts = read_control_and_signal(filepath, storesList) + name_to_data, name_to_timestamps, name_to_sampling_rate = control_and_signal_dicts + timestampCorrection_csv( + filepath, timeForLightsTurnOn, storesList, name_to_data, name_to_timestamps, name_to_sampling_rate + ) for k in range(storesList.shape[1]): decide_naming_convention_and_applyCorrection( diff --git a/timestamp_correction_analysis.md b/timestamp_correction_analysis.md new file mode 100644 index 0000000..121aa3f --- /dev/null +++ b/timestamp_correction_analysis.md @@ -0,0 +1,723 @@ +# Timestamp Correction Module Analysis + +## Overview + +The `timestamp_correction.py` module handles the correction of timestamps for photometry data, including: +- Eliminating the first N seconds of recording (light stabilization period) +- Expanding TDT block timestamps into continuous timestamps +- Creating synthetic control channels when no isosbestic control is present +- Applying corrections to both data channels and event markers + +## Module Structure + +### Entry Point from preprocess.py + +```python +execute_timestamp_correction(folderNames, inputParameters) # preprocess.py:212 +``` + +This orchestrator loops through all session folders and calls functions in this module. + +## Two-Phase Control Channel Creation Pattern + +### Understanding add_control_channel vs create_control_channel + +These two functions work together in a **two-phase process** to handle synthetic control channel generation. They are **not redundant** but serve distinct purposes: + +#### Phase 1: `add_control_channel` (Called BEFORE timestamp correction) + +**Execution:** Line 229 in `execute_timestamp_correction` + +**Purpose:** Create **PLACEHOLDER** control files to satisfy workflow requirements + +**What it does:** +1. Validates that if `isosbestic_control=False`, no real control channels exist +2. For each signal channel without a matching control: + - Copies the raw signal HDF5 file to `cntrl{i}.hdf5` (placeholder) + - Adds entry to storesList: `[["cntrl{i}"], ["control_{region}"]]` +3. Saves updated `storesList.csv` + +**Files created:** +- `cntrl0.hdf5`, `cntrl1.hdf5`, etc. (copies of **RAW** signal data) +- Updated `storesList.csv` with placeholder entries + +**Why it's needed:** +- Timestamp correction workflow expects **paired** control/signal channels in storesList +- Without placeholders, the pairing logic in `timestampCorrection_xxx` and `check_cntrl_sig_length` would fail +- The placeholder **data is never actually used** - it just satisfies structural requirements + +#### Phase 2: `create_control_channel` (Called AFTER timestamp correction) + +**Execution:** Line 243 in `execute_timestamp_correction` + +**Purpose:** Generate **ACTUAL** synthetic control via curve fitting and overwrite placeholders + +**What it does:** +1. Looks for placeholder files (checks: `"control" in event_name.lower() and "cntrl" in event.lower()`) +2. Reads the **CORRECTED** signal data: `signal_{region}.hdf5` (after timestamp correction) +3. Calls `helper_create_control_channel()` to: + - Apply Savitzky-Golay filter to cleaned signal + - Fit to exponential function: `f(x) = a + b * exp(-(1/c) * x)` +4. **OVERWRITES** the placeholder `control_{region}.hdf5` with real synthetic control +5. 
Also exports to CSV format (legacy) + +**Files written:** +- `control_{region}.hdf5` → `data` (replaces placeholder with curve-fitted control) +- `{raw_name}.csv` (timestamps, data, sampling_rate columns) + +**Why it's separate:** +- Requires **timestamp-corrected** signal data (doesn't exist until after lines 232-239) +- Curve fitting algorithm needs clean timestamps (first N seconds eliminated) +- Cannot be done before timestamp correction without re-correcting the synthetic control + +#### Execution Timeline + +```python +# When isosbestic_control == False: + +# ========== PHASE 1: BEFORE TIMESTAMP CORRECTION ========== +# Line 229: Create placeholders (just file copies) +storesList = add_control_channel(filepath, storesList) +# Result: storesList now has paired structure +# [["Dv1A", "cntrl0"], ["signal_dms", "control_dms"]] +# Files: cntrl0.hdf5 (copy of raw signal, never used) + +# ========== TIMESTAMP CORRECTION PHASE ========== +# Lines 232-234: Process both signal AND placeholder control +timestampCorrection_tdt(filepath, timeForLightsTurnOn, storesList) +# Result: Creates timeCorrection_dms.hdf5 with correctionIndex + +# Lines 236-239: Apply corrections to all channels +decide_naming_convention_and_applyCorrection(...) +# Result: signal_dms.hdf5 now contains corrected signal data +# control_dms.hdf5 still contains uncorrected placeholder copy + +# ========== PHASE 2: AFTER TIMESTAMP CORRECTION ========== +# Line 243: Generate REAL synthetic controls +create_control_channel(filepath, storesList, window=101) +# Result: control_dms.hdf5 OVERWRITTEN with curve-fitted synthetic control +# Now contains valid control data derived from corrected signal +``` + +#### Why This Design Exists + +This is a **chicken-and-egg problem solved with placeholders:** + +1. **Requirement:** Timestamp correction expects paired control/signal channels +2. **Constraint:** Synthetic control generation requires timestamp-corrected signal data +3. **Solution:** Create dummy placeholders → correct everything → replace placeholders with real data + +#### Visual Flow + +```mermaid +flowchart TD + A[isosbestic_control = False] --> B[add_control_channel] + B --> C[Copy signal.hdf5 to cntrl0.hdf5] + C --> D[Update storesList.csv] + + D --> E[timestampCorrection_xxx] + E --> F[Creates timeCorrection_dms.hdf5] + + F --> G[decide_naming_convention_and_applyCorrection] + G --> H[Corrects signal_dms.hdf5] + G --> I[Corrects control_dms.hdf5
still contains placeholder] + + I --> J[create_control_channel] + J --> K[Read corrected signal_dms.hdf5] + K --> L[helper_create_control_channel
curve fit] + L --> M[OVERWRITE control_dms.hdf5
with synthetic control] + + style C fill:#fff3cd + style I fill:#fff3cd + style M fill:#d4edda +``` + +#### Refactoring Opportunity + +This placeholder pattern is a **code smell** indicating potential design improvements: + +**Issues:** +1. **Unnecessary I/O:** Placeholder files are written and then overwritten +2. **Confusing flow:** Hard to understand that placeholders are temporary +3. **Tight coupling:** Timestamp correction assumes paired files exist +4. **Wasted computation:** Placeholder controls get timestamp-corrected unnecessarily + +**Potential Improvements:** + +**Option 1: Lazy Control Creation** +- Modify timestamp correction to handle missing controls gracefully +- Only create synthetic controls after all corrections complete +- Remove placeholder file creation entirely + +**Option 2: Data Structure Refactoring** +- Use a data structure that doesn't require physical paired files upfront +- Track "needs synthetic control" as metadata rather than file presence +- Generate and write controls only once at the end + +**Option 3: Two-Pass Workflow** +- First pass: Correct only signal channels +- Second pass: Generate synthetic controls from corrected signals +- Would require refactoring `check_cntrl_sig_length` and pairing logic + +## Function Catalog + +### 1. add_control_channel +**Location:** `timestamp_correction.py:20` +**Purpose:** Create placeholder control channel files when no isosbestic control exists + +```python +def add_control_channel(filepath, arr) -> arr +``` + +**Input:** +- `filepath`: Path to session output folder +- `arr`: 2D array `[[storenames], [storesList]]` from storesList.csv + +**Process:** +1. Validates that control/signal pairs match (raises error if mismatched) +2. For each signal channel without a matching control: + - Copies signal HDF5 file to `cntrl{i}.hdf5` (placeholder) + - Adds entry to storesList array: `[["cntrl{i}"], ["control_{region}"]]` +3. Writes updated storesList.csv + +**Output:** +- Updated `arr` with new control channel entries +- **Files Written:** Updated `storesList.csv`, copied `cntrl*.hdf5` files + +**I/O Summary:** +- **Reads:** Signal HDF5 files (via shutil.copyfile) +- **Writes:** `storesList.csv`, placeholder `cntrl*.hdf5` files + +--- + +### 2. timestampCorrection_csv +**Location:** `timestamp_correction.py:65` +**Purpose:** Correct timestamps for CSV-format data (Doric, NPM, custom CSV) + +```python +def timestampCorrection_csv(filepath, timeForLightsTurnOn, storesList) +``` + +**Input:** +- `filepath`: Path to session output folder +- `timeForLightsTurnOn`: Seconds to eliminate from start (default: 1) +- `storesList`: 2D array `[[storenames], [storesList]]` + +**Process:** +1. Filters storesList to control/signal channels only +2. Pairs control/signal channels, validates naming matches +3. Calls `check_cntrl_sig_length()` to determine which channel to use (shorter one) +4. For each control/signal pair: + - **Reads:** `timestamps` and `sampling_rate` from raw HDF5 + - **Computes:** `correctionIndex = np.where(timestamp >= timeForLightsTurnOn)` + - **Writes:** `timeCorrection_{region}.hdf5` with keys: + - `timestampNew`: Corrected timestamps + - `correctionIndex`: Indices to keep + - `sampling_rate`: Sampling rate + +**Output:** +- **Files Written:** `timeCorrection_{region}.hdf5` for each control/signal pair + +**I/O Summary:** +- **Reads:** `{storename}.hdf5` → `timestamps`, `sampling_rate` +- **Writes:** `timeCorrection_{region}.hdf5` → `timestampNew`, `correctionIndex`, `sampling_rate` + +--- + +### 3. 
timestampCorrection_tdt +**Location:** `timestamp_correction.py:115` +**Purpose:** Correct timestamps for TDT-format data (expands block timestamps) + +```python +def timestampCorrection_tdt(filepath, timeForLightsTurnOn, storesList) +``` + +**Input:** Same as `timestampCorrection_csv` + +**Process:** +1. Filters storesList to control/signal channels only +2. Pairs control/signal channels, validates naming matches +3. Calls `check_cntrl_sig_length()` to determine which channel to use +4. For each control/signal pair: + - **Reads:** `timestamps`, `npoints`, `sampling_rate` from raw HDF5 + - **TDT-specific expansion algorithm:** + ```python + timeRecStart = timestamp[0] + timestamps = np.subtract(timestamp, timeRecStart) # Zero-base + adder = np.arange(npoints) / sampling_rate # Within-block offsets + # Expand: for each block timestamp, add within-block offsets + timestampNew = np.zeros((len(timestamps), lengthAdder)) + for i in range(lengthAdder): + timestampNew[:, i] = np.add(timestamps, adder[i]) + timestampNew = (timestampNew.T).reshape(-1, order="F") # Flatten + correctionIndex = np.where(timestampNew >= timeForLightsTurnOn) + timestampNew = timestampNew[correctionIndex] + ``` + - **Writes:** `timeCorrection_{region}.hdf5` with keys: + - `timeRecStart`: Recording start time (TDT-specific) + - `timestampNew`: Expanded, corrected timestamps + - `correctionIndex`: Indices to keep + - `sampling_rate`: Sampling rate + +**Output:** +- **Files Written:** `timeCorrection_{region}.hdf5` with TDT-specific `timeRecStart` key + +**I/O Summary:** +- **Reads:** `{storename}.hdf5` → `timestamps`, `npoints`, `sampling_rate` +- **Writes:** `timeCorrection_{region}.hdf5` → `timeRecStart`, `timestampNew`, `correctionIndex`, `sampling_rate` + +--- + +### 4. check_cntrl_sig_length +**Location:** `timestamp_correction.py:273` +**Purpose:** Determine which channel (control or signal) to use as reference based on length + +```python +def check_cntrl_sig_length(filepath, channels_arr, storenames, storesList) -> indices +``` + +**Input:** +- `filepath`: Path to session output folder +- `channels_arr`: Paired control/signal array `[["control_A", "control_B"], ["signal_A", "signal_B"]]` +- `storenames`: Raw HDF5 filenames +- `storesList`: Semantic channel names + +**Process:** +1. For each control/signal pair: + - **Reads:** `data` from both control and signal HDF5 + - Compares lengths: `control.shape[0]` vs `signal.shape[0]` + - Returns the shorter one's storename (or signal if equal) + +**Output:** +- List of storenames to use for timestamp correction (one per pair) + +**I/O Summary:** +- **Reads:** `{control_storename}.hdf5` → `data`, `{signal_storename}.hdf5` → `data` + +**Note:** This is a pure analysis function but performs I/O to determine which data to use. + +--- + +### 5. decide_naming_convention_and_applyCorrection +**Location:** `timestamp_correction.py:178` +**Purpose:** Loop through all channels and apply timestamp corrections + +```python +def decide_naming_convention_and_applyCorrection(filepath, timeForLightsTurnOn, event, displayName, storesList) +``` + +**Input:** +- `filepath`: Path to session output folder +- `timeForLightsTurnOn`: Seconds eliminated from start +- `event`: Raw storename (e.g., "Dv1A") +- `displayName`: Semantic name (e.g., "control_DMS") +- `storesList`: Full storesList array + +**Process:** +1. Filters storesList to control/signal channels +2. Pairs channels and validates naming conventions +3. 
For each pair, calls `applyCorrection(filepath, timeForLightsTurnOn, event, displayName, region)` + +**Output:** +- Delegates to `applyCorrection()` (no direct I/O) + +--- + +### 6. applyCorrection +**Location:** `timestamp_correction.py:205` +**Purpose:** Apply timestamp corrections to data channels or event markers + +```python +def applyCorrection(filepath, timeForLightsTurnOn, event, displayName, naming) +``` + +**Input:** +- `filepath`: Path to session output folder +- `timeForLightsTurnOn`: Seconds eliminated from start +- `event`: Raw storename +- `displayName`: Semantic display name +- `naming`: Region identifier (e.g., "dms") + +**Process:** + +**For Control/Signal Channels:** +1. **Reads:** `timeCorrection_{naming}.hdf5` → `correctionIndex` +2. **Reads:** `{event}.hdf5` → `data` +3. **Applies:** `arr = arr[correctionIndex]` (crops data) +4. **Writes:** `{displayName}.hdf5` → `data` (overwrites with corrected data) + +**For Event Channels:** +1. Detects TDT format: `check_TDT(os.path.dirname(filepath))` +2. **Reads:** `timeCorrection_{naming}.hdf5` → `timeRecStart` (if TDT) +3. **Reads:** `{event}.hdf5` → `timestamps` +4. **Applies corrections:** + - If TDT and timestamps >= timeRecStart: subtract both `timeRecStart` and `timeForLightsTurnOn` + - Otherwise: subtract only `timeForLightsTurnOn` +5. **Writes:** `{event}_{naming}.hdf5` → `ts` (corrected event timestamps) + +**Output:** +- **Files Written:** + - `{displayName}.hdf5` → `data` (for control/signal) + - `{event}_{naming}.hdf5` → `ts` (for events) + +**I/O Summary:** +- **Reads:** `timeCorrection_{naming}.hdf5`, `{event}.hdf5` +- **Writes:** `{displayName}.hdf5` or `{event}_{naming}.hdf5` + +--- + +### 7. create_control_channel +**Location:** `timestamp_correction.py:247` +**Purpose:** Generate synthetic control channel using curve fitting (when no isosbestic control exists) + +```python +def create_control_channel(filepath, arr, window=5001) +``` + +**Input:** +- `filepath`: Path to session output folder +- `arr`: storesList array `[[storenames], [storesList]]` +- `window`: Savitzky-Golay filter window (default: 5001) + +**Process:** +1. Loops through storesList to find placeholder control channels (`cntrl` in storename) +2. 
For each placeholder: + - **Reads:** `signal_{region}.hdf5` → `data` (corrected signal) + - **Reads:** `timeCorrection_{region}.hdf5` → `timestampNew`, `sampling_rate` + - **Calls:** `helper_create_control_channel(signal, timestampNew, window)` from `control_channel.py` + - Applies Savitzky-Golay filter + - Fits to exponential: `f(x) = a + b * exp(-(1/c) * x)` + - **Writes:** `{control_name}.hdf5` → `data` (synthetic control) + - **Writes:** `{event_name}.csv` with columns: `timestamps`, `data`, `sampling_rate` + +**Output:** +- **Files Written:** + - `control_{region}.hdf5` → `data` (replaces placeholder) + - `{raw_name}.csv` (legacy format export) + +**I/O Summary:** +- **Reads:** `signal_{region}.hdf5` → `data`, `timeCorrection_{region}.hdf5` → `timestampNew`, `sampling_rate` +- **Writes:** `control_{region}.hdf5` → `data`, `{raw_name}.csv` + +--- + +## Data Flow Diagram + +### High-Level Flow (called from execute_timestamp_correction) + +```mermaid +flowchart TD + A[execute_timestamp_correction] --> B[Read storesList.csv] + B --> C{isosbestic_control?} + + C -->|False| D[add_control_channel] + C -->|True| E{Check format} + D --> E + + E -->|TDT| F[timestampCorrection_tdt] + E -->|CSV/Doric/NPM| G[timestampCorrection_csv] + + F --> H[Loop: decide_naming_convention_and_applyCorrection] + G --> H + + H --> I[For each store: applyCorrection] + + I --> J{isosbestic_control?} + J -->|False| K[create_control_channel] + J -->|True| L[Done] + K --> L + + style A fill:#e1f5ff + style L fill:#d4edda +``` + +### Detailed Flow: timestampCorrection Functions + +```mermaid +flowchart LR + A[Raw HDF5 files] --> B[check_cntrl_sig_length] + B --> C[Read control & signal data] + C --> D[Return shorter channel name] + + D --> E{Format?} + E -->|CSV| F[timestampCorrection_csv] + E -->|TDT| G[timestampCorrection_tdt] + + F --> H[Read timestamps from selected channel] + G --> I[Read timestamps, npoints, sampling_rate] + + H --> J[correctionIndex = where >= timeForLightsTurnOn] + I --> K[Expand block timestamps] + K --> J + + J --> L[Write timeCorrection_{region}.hdf5] + + style A fill:#e1f5ff + style L fill:#d4edda +``` + +### Detailed Flow: applyCorrection + +```mermaid +flowchart TD + A[applyCorrection called] --> B{Channel type?} + + B -->|control/signal| C[Read correctionIndex] + B -->|event| D[Read event timestamps] + + C --> E[Read raw data] + E --> F[data = data correctionIndex] + F --> G[Write displayName.hdf5] + + D --> H{TDT format?} + H -->|Yes| I[Read timeRecStart] + H -->|No| J[ts -= timeForLightsTurnOn] + + I --> K[ts -= timeRecStart] + K --> J + J --> L[Write event_region.hdf5] + + style A fill:#e1f5ff + style G fill:#d4edda + style L fill:#d4edda +``` + +### Detailed Flow: Control Channel Creation + +```mermaid +flowchart LR + A[add_control_channel] --> B[For each signal without control] + B --> C[Copy signal.hdf5 to cntrl_i.hdf5] + C --> D[Update storesList.csv] + + D --> E[... timestamp correction ...] 
+ + E --> F[create_control_channel] + F --> G[For each cntrl_i placeholder] + G --> H[Read signal_{region}.hdf5] + H --> I[helper_create_control_channel] + I --> J[Savitzky-Golay filter] + J --> K[Curve fit to exponential] + K --> L[Write control_{region}.hdf5] + L --> M[Export to CSV] + + style A fill:#fff3cd + style M fill:#d4edda +``` + +## Execution Order in execute_timestamp_correction + +```python +# preprocess.py:212-247 +for each session in folderNames: + for each output_folder in session: + # Step 1: Read metadata + storesList = np.genfromtxt("storesList.csv") + + # Step 2: Add placeholder controls if needed + if isosbestic_control == False: + storesList = add_control_channel(filepath, storesList) + + # Step 3: Compute correctionIndex and timestampNew + if check_TDT(folderName): + timestampCorrection_tdt(filepath, timeForLightsTurnOn, storesList) + else: + timestampCorrection_csv(filepath, timeForLightsTurnOn, storesList) + + # Step 4: Apply corrections to all channels/events + for each store in storesList: + decide_naming_convention_and_applyCorrection( + filepath, timeForLightsTurnOn, storename, displayName, storesList + ) + # ^ This calls applyCorrection for each channel + + # Step 5: Generate synthetic controls via curve fitting + if isosbestic_control == False: + create_control_channel(filepath, storesList, window=101) +``` + +## File I/O Summary + +### Files Read + +| Function | Files Read | Keys | +|----------|-----------|------| +| `add_control_channel` | `signal_*.hdf5` (for copying) | - | +| `timestampCorrection_csv` | `{storename}.hdf5` | `timestamps`, `sampling_rate` | +| `timestampCorrection_tdt` | `{storename}.hdf5` | `timestamps`, `npoints`, `sampling_rate` | +| `check_cntrl_sig_length` | `control_*.hdf5`, `signal_*.hdf5` | `data` | +| `applyCorrection` | `timeCorrection_{region}.hdf5`
`{event}.hdf5` | `correctionIndex`, `timeRecStart` (TDT)
`data` or `timestamps` | +| `create_control_channel` | `signal_{region}.hdf5`
`timeCorrection_{region}.hdf5` | `data`
`timestampNew`, `sampling_rate` | + +### Files Written + +| Function | Files Written | Keys | Notes | +|----------|--------------|------|-------| +| `add_control_channel` | `storesList.csv`
`cntrl{i}.hdf5` | -
(copy of signal) | Placeholder files | +| `timestampCorrection_csv` | `timeCorrection_{region}.hdf5` | `timestampNew`, `correctionIndex`, `sampling_rate` | One per region | +| `timestampCorrection_tdt` | `timeCorrection_{region}.hdf5` | `timeRecStart`, `timestampNew`, `correctionIndex`, `sampling_rate` | TDT-specific | +| `applyCorrection` | `{displayName}.hdf5`
`{event}_{region}.hdf5` | `data`
`ts` | Overwrites with corrected data | +| `create_control_channel` | `control_{region}.hdf5`
`{raw_name}.csv` | `data`
timestamps, data, sampling_rate | Replaces placeholder | + +## Key Transformations + +### 1. Timestamp Expansion (TDT only) + +**Input:** Block timestamps (one per acquisition block) +**Algorithm:** +```python +timeRecStart = timestamp[0] +timestamps = timestamp - timeRecStart # Zero-base +adder = np.arange(npoints) / sampling_rate # Within-block offsets [0, 1/fs, 2/fs, ...] +# Matrix multiplication to expand: +timestampNew = zeros((n_blocks, npoints)) +for i in range(npoints): + timestampNew[:, i] = timestamps + adder[i] +timestampNew = timestampNew.T.reshape(-1, order='F') # Column-major flatten +``` +**Output:** Continuous timestamps at full sampling rate + +### 2. Correction Index Computation + +**Input:** Timestamps array, `timeForLightsTurnOn` +**Algorithm:** +```python +correctionIndex = np.where(timestamp >= timeForLightsTurnOn)[0] +``` +**Output:** Indices of timestamps to keep (after eliminating first N seconds) + +### 3. Data Cropping + +**Applied to:** Control/signal data channels +**Algorithm:** +```python +data_corrected = data[correctionIndex] +``` + +### 4. Event Timestamp Adjustment + +**Applied to:** Event markers (TTL pulses) +**Algorithm:** +```python +# CSV format: +ts_corrected = ts - timeForLightsTurnOn + +# TDT format (if ts >= timeRecStart): +ts_corrected = ts - timeRecStart - timeForLightsTurnOn +``` + +### 5. Synthetic Control Generation + +**Input:** Signal channel (already corrected) +**Algorithm:** +1. Apply Savitzky-Golay filter: `filtered_signal = savgol_filter(signal, window, polyorder=3)` +2. Curve fit to exponential: `control = a + b * exp(-(1/c) * t)` +3. Return fitted curve as synthetic control + +## Analysis for I/O Separation + +### Pure Analysis Functions (Minimal I/O) +These could be extracted with I/O injected: +- ❌ None - all functions perform substantial I/O + +### Orchestration Functions (Heavy I/O, Light Analysis) +These coordinate reading/writing and delegate computation: +- `add_control_channel` - File copying and CSV writing +- `decide_naming_convention_and_applyCorrection` - Loops and delegates +- `create_control_channel` - Orchestrates read → process → write + +### Mixed Functions (I/O + Analysis) +These perform both I/O and computation inline: +- `timestampCorrection_csv` - Reads data, computes correctionIndex, writes results +- `timestampCorrection_tdt` - Reads data, expands timestamps, computes correctionIndex, writes +- `applyCorrection` - Reads multiple files, applies transformations, writes +- `check_cntrl_sig_length` - Reads data just to compare lengths + +## Refactoring Recommendations for I/O Separation + +### Option 1: Extract Pure Computation Functions + +Create new pure functions: +```python +# Pure analysis (no I/O) +def compute_correction_index(timestamps, timeForLightsTurnOn): + return np.where(timestamps >= timeForLightsTurnOn)[0] + +def expand_tdt_timestamps(block_timestamps, npoints, sampling_rate): + # TDT expansion algorithm + ... + return expanded_timestamps + +def crop_data_by_index(data, correctionIndex): + return data[correctionIndex] + +def adjust_event_timestamps(ts, timeRecStart, timeForLightsTurnOn, is_tdt): + # Event adjustment logic + ... + return adjusted_ts +``` + +Then modify existing functions to use these pure functions, keeping I/O separate. + +### Option 2: Reader/Writer Pattern + +Create dedicated I/O classes: +```python +class TimestampCorrectionReader: + def read_raw_timestamps(self, filepath, storename): + ... + + def read_correction_data(self, filepath, region): + ... 
+ +class TimestampCorrectionWriter: + def write_correction_file(self, filepath, region, data): + ... + + def write_corrected_data(self, filepath, displayName, data): + ... +``` + +### Option 3: Data Class Pattern + +Return data objects instead of writing directly: +```python +@dataclass +class TimestampCorrection: + timestampNew: np.ndarray + correctionIndex: np.ndarray + sampling_rate: float + timeRecStart: Optional[float] = None # TDT only + +def timestampCorrection_tdt(...) -> TimestampCorrection: + # Compute all values + return TimestampCorrection( + timestampNew=..., + correctionIndex=..., + sampling_rate=..., + timeRecStart=... + ) + +# Separate writer function +def write_timestamp_correction(filepath, region, correction: TimestampCorrection): + write_hdf5(correction.timestampNew, f"timeCorrection_{region}", filepath, "timestampNew") + # ... etc +``` + +## Current I/O Patterns to Refactor + +1. **Inline writes in computation functions:** + - `timestampCorrection_csv` and `timestampCorrection_tdt` compute AND write + - Should separate: compute → return data → write in caller + +2. **Reading for validation only:** + - `check_cntrl_sig_length` reads full data arrays just to compare shapes + - Could be optimized to read only array metadata/shapes + +3. **Side-effect file creation:** + - `add_control_channel` creates files as side effect + - `create_control_channel` both generates data AND writes multiple formats (HDF5 + CSV) + +4. **Mixed responsibilities in applyCorrection:** + - Handles both control/signal cropping AND event timestamp adjustment + - Could be split into two separate functions From aa36e330f790eaccea333f89703e8d49bfb31bfd Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Thu, 11 Dec 2025 15:07:26 -0800 Subject: [PATCH 067/125] Inverted name check --- src/guppy/analysis/timestamp_correction.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/src/guppy/analysis/timestamp_correction.py b/src/guppy/analysis/timestamp_correction.py index e179d26..71b4760 100644 --- a/src/guppy/analysis/timestamp_correction.py +++ b/src/guppy/analysis/timestamp_correction.py @@ -30,6 +30,10 @@ def timestampCorrection_csv( for i in range(arr.shape[1]): name_1 = arr[0, i].split("_")[-1] name_2 = arr[1, i].split("_")[-1] + if name_1 != name_2: + logger.error("Error in naming convention of files or Error in storesList file") + raise Exception("Error in naming convention of files or Error in storesList file") + # dirname = os.path.dirname(path[i]) idx = np.where(names_for_storenames == indices[i])[0] @@ -41,17 +45,12 @@ def timestampCorrection_csv( timestamp = name_to_timestamps[name] sampling_rate = name_to_sampling_rate[name] - if name_1 == name_2: - correctionIndex = np.where(timestamp >= timeForLightsTurnOn)[0] - timestampNew = timestamp[correctionIndex] - # TODO: Pull out write operations into preprocess.py - write_hdf5(timestampNew, "timeCorrection_" + name_1, filepath, "timestampNew") - write_hdf5(correctionIndex, "timeCorrection_" + name_1, filepath, "correctionIndex") - write_hdf5(np.asarray(sampling_rate), "timeCorrection_" + name_1, filepath, "sampling_rate") - - else: - logger.error("Error in naming convention of files or Error in storesList file") - raise Exception("Error in naming convention of files or Error in storesList file") + correctionIndex = np.where(timestamp >= timeForLightsTurnOn)[0] + timestampNew = timestamp[correctionIndex] + # TODO: Pull out write operations into preprocess.py + write_hdf5(timestampNew, "timeCorrection_" + 
name_1, filepath, "timestampNew") + write_hdf5(correctionIndex, "timeCorrection_" + name_1, filepath, "correctionIndex") + write_hdf5(np.asarray(sampling_rate), "timeCorrection_" + name_1, filepath, "sampling_rate") logger.info("Timestamps corrected and converted to seconds.") From 2049c4a5bd2337324a8b2bde0a7da88ba2922013 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Thu, 11 Dec 2025 15:40:52 -0800 Subject: [PATCH 068/125] Refactored out write --- src/guppy/analysis/timestamp_correction.py | 26 +++++++++++++++------- src/guppy/preprocess.py | 9 +++++--- 2 files changed, 24 insertions(+), 11 deletions(-) diff --git a/src/guppy/analysis/timestamp_correction.py b/src/guppy/analysis/timestamp_correction.py index 71b4760..8fbb8f9 100644 --- a/src/guppy/analysis/timestamp_correction.py +++ b/src/guppy/analysis/timestamp_correction.py @@ -15,12 +15,11 @@ # function to correct timestamps after eliminating first few seconds of the data (for csv data) -def timestampCorrection_csv( - filepath, timeForLightsTurnOn, storesList, name_to_data, name_to_timestamps, name_to_sampling_rate -): +def timestampCorrection_csv(timeForLightsTurnOn, storesList, name_to_data, name_to_timestamps): logger.debug( f"Correcting timestamps by getting rid of the first {timeForLightsTurnOn} seconds and convert timestamps to seconds" ) + name_to_timestamps = name_to_timestamps.copy() storenames = storesList[0, :] names_for_storenames = storesList[1, :] arr = get_control_and_signal_channel_names(storesList) @@ -43,16 +42,27 @@ def timestampCorrection_csv( name = names_for_storenames[idx][0] timestamp = name_to_timestamps[name] - sampling_rate = name_to_sampling_rate[name] correctionIndex = np.where(timestamp >= timeForLightsTurnOn)[0] timestampNew = timestamp[correctionIndex] - # TODO: Pull out write operations into preprocess.py - write_hdf5(timestampNew, "timeCorrection_" + name_1, filepath, "timestampNew") - write_hdf5(correctionIndex, "timeCorrection_" + name_1, filepath, "correctionIndex") - write_hdf5(np.asarray(sampling_rate), "timeCorrection_" + name_1, filepath, "sampling_rate") + name_to_timestamps[name] = timestampNew logger.info("Timestamps corrected and converted to seconds.") + return name_to_timestamps + + +def write_corrected_timestamps(filepath, corrected_name_to_timestamps, name_to_timestamps, name_to_sampling_rate): + for name, timestamps in name_to_timestamps.items(): + corrected_timestamps = corrected_name_to_timestamps[name] + correctionIndex = np.where(timestamps >= corrected_timestamps[0])[0] + sampling_rate = name_to_sampling_rate[name] + name_1 = name.split("_")[-1] + assert np.array_equal( + corrected_timestamps, timestamps[correctionIndex] + ), "Timestamps do not match after correction" + write_hdf5(corrected_timestamps, "timeCorrection_" + name_1, filepath, "timestampNew") + write_hdf5(correctionIndex, "timeCorrection_" + name_1, filepath, "correctionIndex") + write_hdf5(np.asarray(sampling_rate), "timeCorrection_" + name_1, filepath, "sampling_rate") # function to correct timestamps after eliminating first few seconds of the data (for TDT data) diff --git a/src/guppy/preprocess.py b/src/guppy/preprocess.py index 74033f8..413246d 100755 --- a/src/guppy/preprocess.py +++ b/src/guppy/preprocess.py @@ -27,6 +27,7 @@ read_control_and_signal, timestampCorrection_csv, timestampCorrection_tdt, + write_corrected_timestamps, ) from .analysis.z_score import compute_z_score @@ -280,11 +281,13 @@ def execute_timestamp_correction(folderNames, inputParameters): if cond == True: 
timestampCorrection_tdt(filepath, timeForLightsTurnOn, storesList) else: - control_and_signal_dicts = read_control_and_signal(filepath, storesList) name_to_data, name_to_timestamps, name_to_sampling_rate = control_and_signal_dicts - timestampCorrection_csv( - filepath, timeForLightsTurnOn, storesList, name_to_data, name_to_timestamps, name_to_sampling_rate + corrected_name_to_timestamps = timestampCorrection_csv( + timeForLightsTurnOn, storesList, name_to_data, name_to_timestamps + ) + write_corrected_timestamps( + filepath, corrected_name_to_timestamps, name_to_timestamps, name_to_sampling_rate ) for k in range(storesList.shape[1]): From 8b50fb70522732a3413c60262262845548c4e4da Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 12 Dec 2025 11:44:59 -0800 Subject: [PATCH 069/125] Refactored read and write out of timestampcorrection_tdt --- src/guppy/analysis/timestamp_correction.py | 103 +++++++++++---------- src/guppy/preprocess.py | 29 +++++- 2 files changed, 78 insertions(+), 54 deletions(-) diff --git a/src/guppy/analysis/timestamp_correction.py b/src/guppy/analysis/timestamp_correction.py index 8fbb8f9..4e37efe 100644 --- a/src/guppy/analysis/timestamp_correction.py +++ b/src/guppy/analysis/timestamp_correction.py @@ -20,6 +20,7 @@ def timestampCorrection_csv(timeForLightsTurnOn, storesList, name_to_data, name_ f"Correcting timestamps by getting rid of the first {timeForLightsTurnOn} seconds and convert timestamps to seconds" ) name_to_timestamps = name_to_timestamps.copy() + name_to_correctionIndex = {} storenames = storesList[0, :] names_for_storenames = storesList[1, :] arr = get_control_and_signal_channel_names(storesList) @@ -46,85 +47,78 @@ def timestampCorrection_csv(timeForLightsTurnOn, storesList, name_to_data, name_ correctionIndex = np.where(timestamp >= timeForLightsTurnOn)[0] timestampNew = timestamp[correctionIndex] name_to_timestamps[name] = timestampNew + name_to_correctionIndex[name] = correctionIndex logger.info("Timestamps corrected and converted to seconds.") - return name_to_timestamps + return name_to_timestamps, name_to_correctionIndex -def write_corrected_timestamps(filepath, corrected_name_to_timestamps, name_to_timestamps, name_to_sampling_rate): - for name, timestamps in name_to_timestamps.items(): +def write_corrected_timestamps( + filepath, corrected_name_to_timestamps, name_to_timestamps, name_to_sampling_rate, name_to_correctionIndex +): + for name, correctionIndex in name_to_correctionIndex.items(): + timestamps = name_to_timestamps[name] corrected_timestamps = corrected_name_to_timestamps[name] - correctionIndex = np.where(timestamps >= corrected_timestamps[0])[0] sampling_rate = name_to_sampling_rate[name] + if sampling_rate.shape == (): # numpy scalar + sampling_rate = np.asarray([sampling_rate]) name_1 = name.split("_")[-1] - assert np.array_equal( - corrected_timestamps, timestamps[correctionIndex] - ), "Timestamps do not match after correction" + write_hdf5(np.asarray([timestamps[0]]), "timeCorrection_" + name_1, filepath, "timeRecStart") write_hdf5(corrected_timestamps, "timeCorrection_" + name_1, filepath, "timestampNew") write_hdf5(correctionIndex, "timeCorrection_" + name_1, filepath, "correctionIndex") - write_hdf5(np.asarray(sampling_rate), "timeCorrection_" + name_1, filepath, "sampling_rate") + write_hdf5(sampling_rate, "timeCorrection_" + name_1, filepath, "sampling_rate") # function to correct timestamps after eliminating first few seconds of the data (for TDT data) -def timestampCorrection_tdt(filepath, timeForLightsTurnOn, 
storesList): - +def timestampCorrection_tdt( + filepath, timeForLightsTurnOn, storesList, name_to_timestamps, name_to_data, name_to_sampling_rate, name_to_npoints +): logger.debug( f"Correcting timestamps by getting rid of the first {timeForLightsTurnOn} seconds and convert timestamps to seconds" ) + name_to_timestamps = name_to_timestamps.copy() + name_to_correctionIndex = {} storenames = storesList[0, :] - storesList = storesList[1, :] - - arr = [] - for i in range(storesList.shape[0]): - if "control" in storesList[i].lower() or "signal" in storesList[i].lower(): - arr.append(storesList[i]) - - arr = sorted(arr, key=str.casefold) - - try: - arr = np.asarray(arr).reshape(2, -1) - except: - logger.error("Error in saving stores list file or spelling mistake for control or signal") - raise Exception("Error in saving stores list file or spelling mistake for control or signal") + names_for_storenames = storesList[1, :] + arr = get_control_and_signal_channel_names(storesList) - indices = check_cntrl_sig_length(filepath, arr, storenames, storesList) + indices = check_cntrl_sig_length(arr, name_to_data) for i in range(arr.shape[1]): name_1 = arr[0, i].split("_")[-1] name_2 = arr[1, i].split("_")[-1] + if name_1 != name_2: + logger.error("Error in naming convention of files or Error in storesList file") + raise Exception("Error in naming convention of files or Error in storesList file") + # dirname = os.path.dirname(path[i]) - idx = np.where(storesList == indices[i])[0] + idx = np.where(names_for_storenames == indices[i])[0] if idx.shape[0] == 0: logger.error(f"{arr[0,i]} does not exist in the stores list file.") raise Exception("{} does not exist in the stores list file.".format(arr[0, i])) - timestamp = read_hdf5(storenames[idx][0], filepath, "timestamps") - npoints = read_hdf5(storenames[idx][0], filepath, "npoints") - sampling_rate = read_hdf5(storenames[idx][0], filepath, "sampling_rate") + name = names_for_storenames[idx][0] + timestamp = name_to_timestamps[name] + sampling_rate = name_to_sampling_rate[name] + npoints = name_to_npoints[name] + + timeRecStart = timestamp[0] + timestamps = np.subtract(timestamp, timeRecStart) + adder = np.arange(npoints) / sampling_rate + lengthAdder = adder.shape[0] + timestampNew = np.zeros((len(timestamps), lengthAdder)) + for i in range(lengthAdder): + timestampNew[:, i] = np.add(timestamps, adder[i]) + timestampNew = (timestampNew.T).reshape(-1, order="F") + correctionIndex = np.where(timestampNew >= timeForLightsTurnOn)[0] + timestampNew = timestampNew[correctionIndex] - if name_1 == name_2: - timeRecStart = timestamp[0] - timestamps = np.subtract(timestamp, timeRecStart) - adder = np.arange(npoints) / sampling_rate - lengthAdder = adder.shape[0] - timestampNew = np.zeros((len(timestamps), lengthAdder)) - for i in range(lengthAdder): - timestampNew[:, i] = np.add(timestamps, adder[i]) - timestampNew = (timestampNew.T).reshape(-1, order="F") - correctionIndex = np.where(timestampNew >= timeForLightsTurnOn)[0] - timestampNew = timestampNew[correctionIndex] - - write_hdf5(np.asarray([timeRecStart]), "timeCorrection_" + name_1, filepath, "timeRecStart") - write_hdf5(timestampNew, "timeCorrection_" + name_1, filepath, "timestampNew") - write_hdf5(correctionIndex, "timeCorrection_" + name_1, filepath, "correctionIndex") - write_hdf5(np.asarray([sampling_rate]), "timeCorrection_" + name_1, filepath, "sampling_rate") - else: - logger.error("Error in naming convention of files or Error in storesList file") - raise Exception("Error in naming convention of files or 
Error in storesList file") + name_to_timestamps[name] = timestampNew + name_to_correctionIndex[name] = correctionIndex logger.info("Timestamps corrected and converted to seconds.") - # return timeRecStart, correctionIndex, timestampNew + return name_to_timestamps, name_to_correctionIndex # function to check if naming convention was followed while saving storeslist file @@ -269,6 +263,7 @@ def read_control_and_signal(filepath, storesList): name_to_data = {} name_to_timestamps = {} name_to_sampling_rate = {} + name_to_npoints = {} for i in range(channels_arr.shape[1]): control_name = channels_arr[0, i] @@ -284,6 +279,12 @@ def read_control_and_signal(filepath, storesList): signal_timestamps = read_hdf5(signal_storename, filepath, "timestamps") control_sampling_rate = read_hdf5(control_storename, filepath, "sampling_rate") signal_sampling_rate = read_hdf5(signal_storename, filepath, "sampling_rate") + try: # TODO: define npoints for csv datasets + control_npoints = read_hdf5(control_storename, filepath, "npoints") + signal_npoints = read_hdf5(signal_storename, filepath, "npoints") + except KeyError: # npoints is not defined for csv datasets + control_npoints = None + signal_npoints = None name_to_data[control_name] = control_data name_to_data[signal_name] = signal_data @@ -291,5 +292,7 @@ def read_control_and_signal(filepath, storesList): name_to_timestamps[signal_name] = signal_timestamps name_to_sampling_rate[control_name] = control_sampling_rate name_to_sampling_rate[signal_name] = signal_sampling_rate + name_to_npoints[control_name] = control_npoints + name_to_npoints[signal_name] = signal_npoints - return name_to_data, name_to_timestamps, name_to_sampling_rate + return name_to_data, name_to_timestamps, name_to_sampling_rate, name_to_npoints diff --git a/src/guppy/preprocess.py b/src/guppy/preprocess.py index 413246d..db9d8d0 100755 --- a/src/guppy/preprocess.py +++ b/src/guppy/preprocess.py @@ -279,15 +279,36 @@ def execute_timestamp_correction(folderNames, inputParameters): storesList = add_control_channel(filepath, storesList) if cond == True: - timestampCorrection_tdt(filepath, timeForLightsTurnOn, storesList) + control_and_signal_dicts = read_control_and_signal(filepath, storesList) + name_to_data, name_to_timestamps, name_to_sampling_rate, name_to_npoints = control_and_signal_dicts + corrected_name_to_timestamps, name_to_correctionIndex = timestampCorrection_tdt( + filepath, + timeForLightsTurnOn, + storesList, + name_to_timestamps, + name_to_data, + name_to_sampling_rate, + name_to_npoints, + ) + write_corrected_timestamps( + filepath, + corrected_name_to_timestamps, + name_to_timestamps, + name_to_sampling_rate, + name_to_correctionIndex, + ) else: control_and_signal_dicts = read_control_and_signal(filepath, storesList) - name_to_data, name_to_timestamps, name_to_sampling_rate = control_and_signal_dicts - corrected_name_to_timestamps = timestampCorrection_csv( + name_to_data, name_to_timestamps, name_to_sampling_rate, _ = control_and_signal_dicts + corrected_name_to_timestamps, name_to_correctionIndex = timestampCorrection_csv( timeForLightsTurnOn, storesList, name_to_data, name_to_timestamps ) write_corrected_timestamps( - filepath, corrected_name_to_timestamps, name_to_timestamps, name_to_sampling_rate + filepath, + corrected_name_to_timestamps, + name_to_timestamps, + name_to_sampling_rate, + name_to_correctionIndex, ) for k in range(storesList.shape[1]): From b73417063e15f8a1dafe9615bbc6abafcdcbcb23 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 12 Dec 2025 11:48:19 
-0800 Subject: [PATCH 070/125] Removed, now unused file path parameter. --- src/guppy/analysis/timestamp_correction.py | 2 +- src/guppy/preprocess.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/src/guppy/analysis/timestamp_correction.py b/src/guppy/analysis/timestamp_correction.py index 4e37efe..cd662bd 100644 --- a/src/guppy/analysis/timestamp_correction.py +++ b/src/guppy/analysis/timestamp_correction.py @@ -71,7 +71,7 @@ def write_corrected_timestamps( # function to correct timestamps after eliminating first few seconds of the data (for TDT data) def timestampCorrection_tdt( - filepath, timeForLightsTurnOn, storesList, name_to_timestamps, name_to_data, name_to_sampling_rate, name_to_npoints + timeForLightsTurnOn, storesList, name_to_timestamps, name_to_data, name_to_sampling_rate, name_to_npoints ): logger.debug( f"Correcting timestamps by getting rid of the first {timeForLightsTurnOn} seconds and convert timestamps to seconds" diff --git a/src/guppy/preprocess.py b/src/guppy/preprocess.py index db9d8d0..83659bf 100755 --- a/src/guppy/preprocess.py +++ b/src/guppy/preprocess.py @@ -282,7 +282,6 @@ def execute_timestamp_correction(folderNames, inputParameters): control_and_signal_dicts = read_control_and_signal(filepath, storesList) name_to_data, name_to_timestamps, name_to_sampling_rate, name_to_npoints = control_and_signal_dicts corrected_name_to_timestamps, name_to_correctionIndex = timestampCorrection_tdt( - filepath, timeForLightsTurnOn, storesList, name_to_timestamps, From 4402cbb20f02273c78020b2aa0d20f98236e1c9c Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 12 Dec 2025 12:10:39 -0800 Subject: [PATCH 071/125] Consolidated TDT and CSV timestamp correction functions into a single timestamp_correction function with a mode parameter. 
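For reviewers, a minimal sketch of the call pattern this consolidation sets up in `execute_timestamp_correction` (preprocess.py). The per-folder loop, the isosbestic-control handling, and error handling are elided, and `folderName`/`filepath` stand in for the loop variables; the dicts come from `read_control_and_signal` as introduced earlier in this series:

```python
# Sketch of the consolidated call site (loop and error handling elided).
mode = "tdt" if check_TDT(folderName) else "csv"

name_to_data, name_to_timestamps, name_to_sampling_rate, name_to_npoints = read_control_and_signal(
    filepath, storesList
)
corrected_name_to_timestamps, name_to_correctionIndex = timestampCorrection(
    timeForLightsTurnOn,
    storesList,
    name_to_timestamps,
    name_to_data,
    name_to_sampling_rate,
    name_to_npoints,  # entries are None for csv datasets and only used in "tdt" mode
    mode=mode,        # "tdt" expands block timestamps; "csv" only drops the first seconds
)
write_corrected_timestamps(
    filepath,
    corrected_name_to_timestamps,
    name_to_timestamps,
    name_to_sampling_rate,
    name_to_correctionIndex,
)
```

Keeping the mode switch inside the single `timestampCorrection` function means `write_corrected_timestamps` stays format-agnostic and always records `timeRecStart`, `timestampNew`, `correctionIndex`, and `sampling_rate` the same way.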
--- src/guppy/analysis/timestamp_correction.py | 72 ++++++---------------- src/guppy/preprocess.py | 54 ++++++---------- 2 files changed, 40 insertions(+), 86 deletions(-) diff --git a/src/guppy/analysis/timestamp_correction.py b/src/guppy/analysis/timestamp_correction.py index cd662bd..df72800 100644 --- a/src/guppy/analysis/timestamp_correction.py +++ b/src/guppy/analysis/timestamp_correction.py @@ -14,45 +14,6 @@ logger = logging.getLogger(__name__) -# function to correct timestamps after eliminating first few seconds of the data (for csv data) -def timestampCorrection_csv(timeForLightsTurnOn, storesList, name_to_data, name_to_timestamps): - logger.debug( - f"Correcting timestamps by getting rid of the first {timeForLightsTurnOn} seconds and convert timestamps to seconds" - ) - name_to_timestamps = name_to_timestamps.copy() - name_to_correctionIndex = {} - storenames = storesList[0, :] - names_for_storenames = storesList[1, :] - arr = get_control_and_signal_channel_names(storesList) - - indices = check_cntrl_sig_length(arr, name_to_data) - - for i in range(arr.shape[1]): - name_1 = arr[0, i].split("_")[-1] - name_2 = arr[1, i].split("_")[-1] - if name_1 != name_2: - logger.error("Error in naming convention of files or Error in storesList file") - raise Exception("Error in naming convention of files or Error in storesList file") - - # dirname = os.path.dirname(path[i]) - idx = np.where(names_for_storenames == indices[i])[0] - - if idx.shape[0] == 0: - logger.error(f"{arr[0,i]} does not exist in the stores list file.") - raise Exception("{} does not exist in the stores list file.".format(arr[0, i])) - - name = names_for_storenames[idx][0] - timestamp = name_to_timestamps[name] - - correctionIndex = np.where(timestamp >= timeForLightsTurnOn)[0] - timestampNew = timestamp[correctionIndex] - name_to_timestamps[name] = timestampNew - name_to_correctionIndex[name] = correctionIndex - - logger.info("Timestamps corrected and converted to seconds.") - return name_to_timestamps, name_to_correctionIndex - - def write_corrected_timestamps( filepath, corrected_name_to_timestamps, name_to_timestamps, name_to_sampling_rate, name_to_correctionIndex ): @@ -69,13 +30,16 @@ def write_corrected_timestamps( write_hdf5(sampling_rate, "timeCorrection_" + name_1, filepath, "sampling_rate") -# function to correct timestamps after eliminating first few seconds of the data (for TDT data) -def timestampCorrection_tdt( - timeForLightsTurnOn, storesList, name_to_timestamps, name_to_data, name_to_sampling_rate, name_to_npoints +# function to correct timestamps after eliminating first few seconds of the data (for csv or TDT data depending on mode) +def timestampCorrection( + timeForLightsTurnOn, storesList, name_to_timestamps, name_to_data, name_to_sampling_rate, name_to_npoints, mode ): logger.debug( f"Correcting timestamps by getting rid of the first {timeForLightsTurnOn} seconds and convert timestamps to seconds" ) + if mode not in ["tdt", "csv"]: + logger.error("Mode should be either 'tdt' or 'csv'") + raise ValueError("Mode should be either 'tdt' or 'csv'") name_to_timestamps = name_to_timestamps.copy() name_to_correctionIndex = {} storenames = storesList[0, :] @@ -103,16 +67,20 @@ def timestampCorrection_tdt( sampling_rate = name_to_sampling_rate[name] npoints = name_to_npoints[name] - timeRecStart = timestamp[0] - timestamps = np.subtract(timestamp, timeRecStart) - adder = np.arange(npoints) / sampling_rate - lengthAdder = adder.shape[0] - timestampNew = np.zeros((len(timestamps), lengthAdder)) - for i in 
range(lengthAdder): - timestampNew[:, i] = np.add(timestamps, adder[i]) - timestampNew = (timestampNew.T).reshape(-1, order="F") - correctionIndex = np.where(timestampNew >= timeForLightsTurnOn)[0] - timestampNew = timestampNew[correctionIndex] + if mode == "tdt": + timeRecStart = timestamp[0] + timestamps = np.subtract(timestamp, timeRecStart) + adder = np.arange(npoints) / sampling_rate + lengthAdder = adder.shape[0] + timestampNew = np.zeros((len(timestamps), lengthAdder)) + for i in range(lengthAdder): + timestampNew[:, i] = np.add(timestamps, adder[i]) + timestampNew = (timestampNew.T).reshape(-1, order="F") + correctionIndex = np.where(timestampNew >= timeForLightsTurnOn)[0] + timestampNew = timestampNew[correctionIndex] + elif mode == "csv": + correctionIndex = np.where(timestamp >= timeForLightsTurnOn)[0] + timestampNew = timestamp[correctionIndex] name_to_timestamps[name] = timestampNew name_to_correctionIndex[name] = correctionIndex diff --git a/src/guppy/preprocess.py b/src/guppy/preprocess.py index 83659bf..19626dd 100755 --- a/src/guppy/preprocess.py +++ b/src/guppy/preprocess.py @@ -25,8 +25,7 @@ create_control_channel, decide_naming_convention_and_applyCorrection, read_control_and_signal, - timestampCorrection_csv, - timestampCorrection_tdt, + timestampCorrection, write_corrected_timestamps, ) from .analysis.z_score import compute_z_score @@ -267,7 +266,7 @@ def execute_timestamp_correction(folderNames, inputParameters): for i in range(len(folderNames)): filepath = folderNames[i] storesListPath = takeOnlyDirs(glob.glob(os.path.join(filepath, "*_output_*"))) - cond = check_TDT(folderNames[i]) + mode = "tdt" if check_TDT(folderNames[i]) else "csv" logger.debug(f"Timestamps corrections started for {filepath}") for j in range(len(storesListPath)): filepath = storesListPath[j] @@ -278,37 +277,24 @@ def execute_timestamp_correction(folderNames, inputParameters): if isosbestic_control == False: storesList = add_control_channel(filepath, storesList) - if cond == True: - control_and_signal_dicts = read_control_and_signal(filepath, storesList) - name_to_data, name_to_timestamps, name_to_sampling_rate, name_to_npoints = control_and_signal_dicts - corrected_name_to_timestamps, name_to_correctionIndex = timestampCorrection_tdt( - timeForLightsTurnOn, - storesList, - name_to_timestamps, - name_to_data, - name_to_sampling_rate, - name_to_npoints, - ) - write_corrected_timestamps( - filepath, - corrected_name_to_timestamps, - name_to_timestamps, - name_to_sampling_rate, - name_to_correctionIndex, - ) - else: - control_and_signal_dicts = read_control_and_signal(filepath, storesList) - name_to_data, name_to_timestamps, name_to_sampling_rate, _ = control_and_signal_dicts - corrected_name_to_timestamps, name_to_correctionIndex = timestampCorrection_csv( - timeForLightsTurnOn, storesList, name_to_data, name_to_timestamps - ) - write_corrected_timestamps( - filepath, - corrected_name_to_timestamps, - name_to_timestamps, - name_to_sampling_rate, - name_to_correctionIndex, - ) + control_and_signal_dicts = read_control_and_signal(filepath, storesList) + name_to_data, name_to_timestamps, name_to_sampling_rate, name_to_npoints = control_and_signal_dicts + corrected_name_to_timestamps, name_to_correctionIndex = timestampCorrection( + timeForLightsTurnOn, + storesList, + name_to_timestamps, + name_to_data, + name_to_sampling_rate, + name_to_npoints, + mode=mode, + ) + write_corrected_timestamps( + filepath, + corrected_name_to_timestamps, + name_to_timestamps, + name_to_sampling_rate, + 
name_to_correctionIndex, + ) for k in range(storesList.shape[1]): decide_naming_convention_and_applyCorrection( From ca735ce723a870e972308131f7cb1cd020a6ab61 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 12 Dec 2025 12:21:50 -0800 Subject: [PATCH 072/125] Cleaned up some inefficient code --- src/guppy/analysis/timestamp_correction.py | 21 ++++----------------- 1 file changed, 4 insertions(+), 17 deletions(-) diff --git a/src/guppy/analysis/timestamp_correction.py b/src/guppy/analysis/timestamp_correction.py index df72800..efa4c52 100644 --- a/src/guppy/analysis/timestamp_correction.py +++ b/src/guppy/analysis/timestamp_correction.py @@ -94,25 +94,16 @@ def timestampCorrection( def decide_naming_convention_and_applyCorrection(filepath, timeForLightsTurnOn, event, displayName, storesList): logger.debug("Applying correction of timestamps to the data and event timestamps") - storesList = storesList[1, :] - - arr = [] - for i in range(storesList.shape[0]): - if "control" in storesList[i].lower() or "signal" in storesList[i].lower(): - arr.append(storesList[i]) - - arr = sorted(arr, key=str.casefold) - arr = np.asarray(arr).reshape(2, -1) + arr = get_control_and_signal_channel_names(storesList) for i in range(arr.shape[1]): name_1 = arr[0, i].split("_")[-1] name_2 = arr[1, i].split("_")[-1] - # dirname = os.path.dirname(path[i]) - if name_1 == name_2: - applyCorrection(filepath, timeForLightsTurnOn, event, displayName, name_1) - else: + if name_1 != name_2: logger.error("Error in naming convention of files or Error in storesList file") raise Exception("Error in naming convention of files or Error in storesList file") + else: + applyCorrection(filepath, timeForLightsTurnOn, event, displayName, name_1) logger.info("Timestamps corrections applied to the data and event timestamps.") @@ -153,10 +144,6 @@ def applyCorrection(filepath, timeForLightsTurnOn, event, displayName, naming): arr = np.subtract(arr, timeForLightsTurnOn) write_hdf5(arr, displayName + "_" + naming, filepath, "ts") - # if isosbestic_control==False and 'control' in displayName.lower(): - # control = create_control_channel(filepath, displayName) - # write_hdf5(control, displayName, filepath, 'data') - # main function to create control channel using # signal channel and save it to a file From 262681bcab890d51f73d65856ade3533a6b97842 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 12 Dec 2025 13:12:16 -0800 Subject: [PATCH 073/125] Pulled read operations out of the applyCorrection functions. 
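
After this change, decide_naming_convention_and_applyCorrection and applyCorrection no longer call read_hdf5 themselves; execute_timestamp_correction in preprocess.py reads the control/signal stores and the TTL stores up front and hands plain arrays and dicts down. A condensed sketch of the resulting call pattern, taken from the preprocess.py hunk below (construction of the name_1_to_* lookup dicts is elided, dict.get stands in for the explicit membership checks, and the local name display_name is introduced here only for readability):

    name_to_timestamps_ttl = read_ttl(filepath, storesList)
    for k in range(storesList.shape[1]):
        display_name = storesList[1, k]
        data = name_to_data.get(display_name)                      # None for TTL-only stores
        ttl_timestamps = name_to_timestamps_ttl.get(display_name)  # None for control/signal stores
        decide_naming_convention_and_applyCorrection(
            filepath, timeForLightsTurnOn, storesList[0, k], display_name, storesList,
            name_1_to_corrected_timestamps, name_1_to_timestamps,
            name_1_to_sampling_rate, name_1_to_correctionIndex,
            data, ttl_timestamps,
        )
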
--- src/guppy/analysis/timestamp_correction.py | 84 +++++++++++++++++----- src/guppy/preprocess.py | 26 ++++++- 2 files changed, 92 insertions(+), 18 deletions(-) diff --git a/src/guppy/analysis/timestamp_correction.py b/src/guppy/analysis/timestamp_correction.py index efa4c52..2da2020 100644 --- a/src/guppy/analysis/timestamp_correction.py +++ b/src/guppy/analysis/timestamp_correction.py @@ -91,7 +91,19 @@ def timestampCorrection( # function to check if naming convention was followed while saving storeslist file # and apply timestamps correction using the function applyCorrection -def decide_naming_convention_and_applyCorrection(filepath, timeForLightsTurnOn, event, displayName, storesList): +def decide_naming_convention_and_applyCorrection( + filepath, + timeForLightsTurnOn, + event, + displayName, + storesList, + name_1_to_corrected_timestamps, + name_1_to_timestamps, + name_1_to_sampling_rate, + name_1_to_correctionIndex, + data, + ttl_timestamps, +): logger.debug("Applying correction of timestamps to the data and event timestamps") arr = get_control_and_signal_channel_names(storesList) @@ -103,36 +115,61 @@ def decide_naming_convention_and_applyCorrection(filepath, timeForLightsTurnOn, logger.error("Error in naming convention of files or Error in storesList file") raise Exception("Error in naming convention of files or Error in storesList file") else: - applyCorrection(filepath, timeForLightsTurnOn, event, displayName, name_1) + corrected_timestamps = name_1_to_corrected_timestamps[name_1] + timestamps = name_1_to_timestamps[name_1] + timeRecStart = timestamps[0] + sampling_rate = name_1_to_sampling_rate[name_1] + correctionIndex = name_1_to_correctionIndex[name_1] + applyCorrection( + filepath, + timeForLightsTurnOn, + event, + displayName, + name_1, + corrected_timestamps, + sampling_rate, + correctionIndex, + timeRecStart, + data, + ttl_timestamps, + ) logger.info("Timestamps corrections applied to the data and event timestamps.") # function to apply correction to control, signal and event timestamps -def applyCorrection(filepath, timeForLightsTurnOn, event, displayName, naming): +def applyCorrection( + filepath, + timeForLightsTurnOn, + event, + displayName, + naming, + corrected_timestamps, + sampling_rate, + correctionIndex, + timeRecStart, + data, + ttl_timestamps, +): cond = check_TDT(os.path.dirname(filepath)) - if cond == True: - timeRecStart = read_hdf5("timeCorrection_" + naming, filepath, "timeRecStart")[0] - - timestampNew = read_hdf5("timeCorrection_" + naming, filepath, "timestampNew") - correctionIndex = read_hdf5("timeCorrection_" + naming, filepath, "correctionIndex") - + timestampNew = corrected_timestamps if "control" in displayName.lower() or "signal" in displayName.lower(): - split_name = displayName.split("_")[-1] - if split_name == naming: - pass - else: - correctionIndex = read_hdf5("timeCorrection_" + split_name, filepath, "correctionIndex") - arr = read_hdf5(event, filepath, "data") + # TODO: double-check that this code is not reachable + # split_name = displayName.split("_")[-1] + # if split_name == naming: + # pass + # else: + # correctionIndex = read_hdf5("timeCorrection_" + split_name, filepath, "correctionIndex") + arr = data if (arr == 0).all() == True: arr = arr else: arr = arr[correctionIndex] write_hdf5(arr, displayName, filepath, "data") else: - arr = read_hdf5(event, filepath, "timestamps") + arr = ttl_timestamps if cond == True: res = (arr >= timeRecStart).all() if res == True: @@ -251,3 +288,18 @@ def read_control_and_signal(filepath, 
storesList): name_to_npoints[signal_name] = signal_npoints return name_to_data, name_to_timestamps, name_to_sampling_rate, name_to_npoints + + +def read_ttl(filepath, storesList): + channels_arr = get_control_and_signal_channel_names(storesList) + storenames = storesList[0, :] + names_for_storenames = storesList[1, :] + + name_to_timestamps = {} + for storename, name in zip(storenames, names_for_storenames): + if storename in channels_arr: + continue + timestamps = read_hdf5(storename, filepath, "timestamps") + name_to_timestamps[name] = timestamps + + return name_to_timestamps diff --git a/src/guppy/preprocess.py b/src/guppy/preprocess.py index 19626dd..1715cfc 100755 --- a/src/guppy/preprocess.py +++ b/src/guppy/preprocess.py @@ -25,6 +25,7 @@ create_control_channel, decide_naming_convention_and_applyCorrection, read_control_and_signal, + read_ttl, timestampCorrection, write_corrected_timestamps, ) @@ -295,10 +296,31 @@ def execute_timestamp_correction(folderNames, inputParameters): name_to_sampling_rate, name_to_correctionIndex, ) - + name_1_to_corrected_timestamps = { + name.split("_")[-1]: ts for name, ts in corrected_name_to_timestamps.items() + } + name_1_to_timestamps = {name.split("_")[-1]: ts for name, ts in name_to_timestamps.items()} + name_1_to_sampling_rate = {name.split("_")[-1]: sr for name, sr in name_to_sampling_rate.items()} + name_1_to_correctionIndex = {name.split("_")[-1]: idx for name, idx in name_to_correctionIndex.items()} + + name_to_timestamps_ttl = read_ttl(filepath, storesList) for k in range(storesList.shape[1]): + data = name_to_data[storesList[1, k]] if storesList[1, k] in name_to_data else None + ttl_timestamps = ( + name_to_timestamps_ttl[storesList[1, k]] if storesList[1, k] in name_to_timestamps_ttl else None + ) decide_naming_convention_and_applyCorrection( - filepath, timeForLightsTurnOn, storesList[0, k], storesList[1, k], storesList + filepath, + timeForLightsTurnOn, + storesList[0, k], + storesList[1, k], + storesList, + name_1_to_corrected_timestamps, + name_1_to_timestamps, + name_1_to_sampling_rate, + name_1_to_correctionIndex, + data, + ttl_timestamps, ) # check if isosbestic control is false and also if new control channel is added From b6173dd889e892f65f7e2c2f096dd10c88acee17 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 12 Dec 2025 16:41:46 -0800 Subject: [PATCH 074/125] split up applyCorrection by ttl vs signal_and_control --- src/guppy/analysis/timestamp_correction.py | 112 ++++++++++++++++++++- src/guppy/preprocess.py | 70 ++++++++----- 2 files changed, 154 insertions(+), 28 deletions(-) diff --git a/src/guppy/analysis/timestamp_correction.py b/src/guppy/analysis/timestamp_correction.py index 2da2020..a1088c9 100644 --- a/src/guppy/analysis/timestamp_correction.py +++ b/src/guppy/analysis/timestamp_correction.py @@ -137,6 +137,116 @@ def decide_naming_convention_and_applyCorrection( logger.info("Timestamps corrections applied to the data and event timestamps.") +def decide_naming_and_applyCorrection_signal_and_control( + filepath, + storesList, + name_to_correctionIndex, + name_to_data, +): + logger.debug("Applying correction of timestamps to the data and event timestamps") + storenames = storesList[0, :] + names_for_storenames = storesList[1, :] + arr = get_control_and_signal_channel_names(storesList) + indices = check_cntrl_sig_length(arr, name_to_data) + + for i in range(arr.shape[1]): + name_1 = arr[0, i].split("_")[-1] + name_2 = arr[1, i].split("_")[-1] + if name_1 != name_2: + logger.error("Error in naming convention of 
files or Error in storesList file") + raise Exception("Error in naming convention of files or Error in storesList file") + + idx = np.where(names_for_storenames == indices[i])[0] + if idx.shape[0] == 0: + logger.error(f"{arr[0,i]} does not exist in the stores list file.") + raise Exception("{} does not exist in the stores list file.".format(arr[0, i])) + + name = names_for_storenames[idx][0] + correctionIndex = name_to_correctionIndex[name] + control_name = arr[0, i] + signal_name = arr[1, i] + control_data = name_to_data[control_name] + signal_data = name_to_data[signal_name] + applyCorrection_signal_and_control(filepath, control_name, correctionIndex, control_data) + applyCorrection_signal_and_control(filepath, signal_name, correctionIndex, signal_data) + + logger.info("Timestamps corrections applied to the data and event timestamps.") + + +def applyCorrection_signal_and_control(filepath, displayName, correctionIndex, data): + arr = data + if (arr == 0).all() == True: + arr = arr + else: + arr = arr[correctionIndex] + write_hdf5(arr, displayName, filepath, "data") + + +def decide_naming_and_applyCorrection_ttl( + filepath, + timeForLightsTurnOn, + storesList, + name_to_timestamps_ttl, + name_to_timestamps, + name_to_data, +): + logger.debug("Applying correction of timestamps to the data and event timestamps") + storenames = storesList[0, :] + names_for_storenames = storesList[1, :] + arr = get_control_and_signal_channel_names(storesList) + indices = check_cntrl_sig_length(arr, name_to_data) + + for ttl_name, ttl_timestamps in name_to_timestamps_ttl.items(): + displayName = ttl_name + for i in range(arr.shape[1]): + name_1 = arr[0, i].split("_")[-1] + name_2 = arr[1, i].split("_")[-1] + if name_1 != name_2: + logger.error("Error in naming convention of files or Error in storesList file") + raise Exception("Error in naming convention of files or Error in storesList file") + + idx = np.where(names_for_storenames == indices[i])[0] + if idx.shape[0] == 0: + logger.error(f"{arr[0,i]} does not exist in the stores list file.") + raise Exception("{} does not exist in the stores list file.".format(arr[0, i])) + + name = names_for_storenames[idx][0] + timestamps = name_to_timestamps[name] + timeRecStart = timestamps[0] + applyCorrection_ttl( + filepath, + timeForLightsTurnOn, + displayName, + name_1, + timeRecStart, + ttl_timestamps, + ) + + logger.info("Timestamps corrections applied to the data and event timestamps.") + + +def applyCorrection_ttl( + filepath, + timeForLightsTurnOn, + displayName, + naming, + timeRecStart, + ttl_timestamps, +): + cond = check_TDT(os.path.dirname(filepath)) + arr = ttl_timestamps + if cond == True: + res = (arr >= timeRecStart).all() + if res == True: + arr = np.subtract(arr, timeRecStart) + arr = np.subtract(arr, timeForLightsTurnOn) + else: + arr = np.subtract(arr, timeForLightsTurnOn) + else: + arr = np.subtract(arr, timeForLightsTurnOn) + write_hdf5(arr, displayName + "_" + naming, filepath, "ts") + + # function to apply correction to control, signal and event timestamps def applyCorrection( filepath, @@ -297,7 +407,7 @@ def read_ttl(filepath, storesList): name_to_timestamps = {} for storename, name in zip(storenames, names_for_storenames): - if storename in channels_arr: + if name in channels_arr: continue timestamps = read_hdf5(storename, filepath, "timestamps") name_to_timestamps[name] = timestamps diff --git a/src/guppy/preprocess.py b/src/guppy/preprocess.py index 1715cfc..acea813 100755 --- a/src/guppy/preprocess.py +++ b/src/guppy/preprocess.py @@ -23,7 
+23,8 @@ ) from .analysis.timestamp_correction import ( create_control_channel, - decide_naming_convention_and_applyCorrection, + decide_naming_and_applyCorrection_signal_and_control, + decide_naming_and_applyCorrection_ttl, read_control_and_signal, read_ttl, timestampCorrection, @@ -280,7 +281,7 @@ def execute_timestamp_correction(folderNames, inputParameters): control_and_signal_dicts = read_control_and_signal(filepath, storesList) name_to_data, name_to_timestamps, name_to_sampling_rate, name_to_npoints = control_and_signal_dicts - corrected_name_to_timestamps, name_to_correctionIndex = timestampCorrection( + name_to_corrected_timestamps, name_to_correctionIndex = timestampCorrection( timeForLightsTurnOn, storesList, name_to_timestamps, @@ -291,37 +292,52 @@ def execute_timestamp_correction(folderNames, inputParameters): ) write_corrected_timestamps( filepath, - corrected_name_to_timestamps, + name_to_corrected_timestamps, name_to_timestamps, name_to_sampling_rate, name_to_correctionIndex, ) - name_1_to_corrected_timestamps = { - name.split("_")[-1]: ts for name, ts in corrected_name_to_timestamps.items() - } - name_1_to_timestamps = {name.split("_")[-1]: ts for name, ts in name_to_timestamps.items()} - name_1_to_sampling_rate = {name.split("_")[-1]: sr for name, sr in name_to_sampling_rate.items()} - name_1_to_correctionIndex = {name.split("_")[-1]: idx for name, idx in name_to_correctionIndex.items()} name_to_timestamps_ttl = read_ttl(filepath, storesList) - for k in range(storesList.shape[1]): - data = name_to_data[storesList[1, k]] if storesList[1, k] in name_to_data else None - ttl_timestamps = ( - name_to_timestamps_ttl[storesList[1, k]] if storesList[1, k] in name_to_timestamps_ttl else None - ) - decide_naming_convention_and_applyCorrection( - filepath, - timeForLightsTurnOn, - storesList[0, k], - storesList[1, k], - storesList, - name_1_to_corrected_timestamps, - name_1_to_timestamps, - name_1_to_sampling_rate, - name_1_to_correctionIndex, - data, - ttl_timestamps, - ) + decide_naming_and_applyCorrection_signal_and_control( + filepath, + storesList, + name_to_correctionIndex, + name_to_data, + ) + decide_naming_and_applyCorrection_ttl( + filepath, + timeForLightsTurnOn, + storesList, + name_to_timestamps_ttl, + name_to_timestamps, + name_to_data, + ) + + # name_1_to_corrected_timestamps = { + # name.split("_")[-1]: ts for name, ts in name_to_corrected_timestamps.items() + # } + # name_1_to_timestamps = {name.split("_")[-1]: ts for name, ts in name_to_timestamps.items()} + # name_1_to_sampling_rate = {name.split("_")[-1]: sr for name, sr in name_to_sampling_rate.items()} + # name_1_to_correctionIndex = {name.split("_")[-1]: idx for name, idx in name_to_correctionIndex.items()} + # for k in range(storesList.shape[1]): # TODO: Refactor nested loops for clarity + # data = name_to_data[storesList[1, k]] if storesList[1, k] in name_to_data else None + # ttl_timestamps = ( + # name_to_timestamps_ttl[storesList[1, k]] if storesList[1, k] in name_to_timestamps_ttl else None + # ) + # decide_naming_convention_and_applyCorrection( + # filepath, + # timeForLightsTurnOn, + # storesList[0, k], + # storesList[1, k], + # storesList, + # name_1_to_corrected_timestamps, + # name_1_to_timestamps, + # name_1_to_sampling_rate, + # name_1_to_correctionIndex, + # data, + # ttl_timestamps, + # ) # check if isosbestic control is false and also if new control channel is added if isosbestic_control == False: From 4bfc1a7c41ca9ab792b4484f1fa68b5f06b8b23e Mon Sep 17 00:00:00 2001 From: pauladkisson Date: 
Fri, 12 Dec 2025 16:42:47 -0800 Subject: [PATCH 075/125] Removed commented section. --- src/guppy/preprocess.py | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/src/guppy/preprocess.py b/src/guppy/preprocess.py index acea813..543f565 100755 --- a/src/guppy/preprocess.py +++ b/src/guppy/preprocess.py @@ -314,31 +314,6 @@ def execute_timestamp_correction(folderNames, inputParameters): name_to_data, ) - # name_1_to_corrected_timestamps = { - # name.split("_")[-1]: ts for name, ts in name_to_corrected_timestamps.items() - # } - # name_1_to_timestamps = {name.split("_")[-1]: ts for name, ts in name_to_timestamps.items()} - # name_1_to_sampling_rate = {name.split("_")[-1]: sr for name, sr in name_to_sampling_rate.items()} - # name_1_to_correctionIndex = {name.split("_")[-1]: idx for name, idx in name_to_correctionIndex.items()} - # for k in range(storesList.shape[1]): # TODO: Refactor nested loops for clarity - # data = name_to_data[storesList[1, k]] if storesList[1, k] in name_to_data else None - # ttl_timestamps = ( - # name_to_timestamps_ttl[storesList[1, k]] if storesList[1, k] in name_to_timestamps_ttl else None - # ) - # decide_naming_convention_and_applyCorrection( - # filepath, - # timeForLightsTurnOn, - # storesList[0, k], - # storesList[1, k], - # storesList, - # name_1_to_corrected_timestamps, - # name_1_to_timestamps, - # name_1_to_sampling_rate, - # name_1_to_correctionIndex, - # data, - # ttl_timestamps, - # ) - # check if isosbestic control is false and also if new control channel is added if isosbestic_control == False: create_control_channel(filepath, storesList, window=101) From b01a58f525f20a9a0f29c06b01e30c4672fa3f57 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 12 Dec 2025 16:57:31 -0800 Subject: [PATCH 076/125] Refactored applyCorrection inside timestampCorrection for signal and control --- src/guppy/analysis/timestamp_correction.py | 25 +++++++++++++++++++++- src/guppy/preprocess.py | 8 +------ 2 files changed, 25 insertions(+), 8 deletions(-) diff --git a/src/guppy/analysis/timestamp_correction.py b/src/guppy/analysis/timestamp_correction.py index a1088c9..3d5c73c 100644 --- a/src/guppy/analysis/timestamp_correction.py +++ b/src/guppy/analysis/timestamp_correction.py @@ -32,7 +32,14 @@ def write_corrected_timestamps( # function to correct timestamps after eliminating first few seconds of the data (for csv or TDT data depending on mode) def timestampCorrection( - timeForLightsTurnOn, storesList, name_to_timestamps, name_to_data, name_to_sampling_rate, name_to_npoints, mode + filepath, + timeForLightsTurnOn, + storesList, + name_to_timestamps, + name_to_data, + name_to_sampling_rate, + name_to_npoints, + mode, ): logger.debug( f"Correcting timestamps by getting rid of the first {timeForLightsTurnOn} seconds and convert timestamps to seconds" @@ -49,6 +56,8 @@ def timestampCorrection( indices = check_cntrl_sig_length(arr, name_to_data) for i in range(arr.shape[1]): + control_name = arr[0, i] + signal_name = arr[1, i] name_1 = arr[0, i].split("_")[-1] name_2 = arr[1, i].split("_")[-1] if name_1 != name_2: @@ -85,6 +94,20 @@ def timestampCorrection( name_to_timestamps[name] = timestampNew name_to_correctionIndex[name] = correctionIndex + arr = name_to_data[control_name] + if (arr == 0).all() == True: + arr = arr + else: + arr = arr[correctionIndex] + write_hdf5(arr, control_name, filepath, "data") + + arr = name_to_data[signal_name] + if (arr == 0).all() == True: + arr = arr + else: + arr = arr[correctionIndex] + write_hdf5(arr, signal_name, 
filepath, "data") + logger.info("Timestamps corrected and converted to seconds.") return name_to_timestamps, name_to_correctionIndex diff --git a/src/guppy/preprocess.py b/src/guppy/preprocess.py index 543f565..df07c21 100755 --- a/src/guppy/preprocess.py +++ b/src/guppy/preprocess.py @@ -23,7 +23,6 @@ ) from .analysis.timestamp_correction import ( create_control_channel, - decide_naming_and_applyCorrection_signal_and_control, decide_naming_and_applyCorrection_ttl, read_control_and_signal, read_ttl, @@ -282,6 +281,7 @@ def execute_timestamp_correction(folderNames, inputParameters): control_and_signal_dicts = read_control_and_signal(filepath, storesList) name_to_data, name_to_timestamps, name_to_sampling_rate, name_to_npoints = control_and_signal_dicts name_to_corrected_timestamps, name_to_correctionIndex = timestampCorrection( + filepath, timeForLightsTurnOn, storesList, name_to_timestamps, @@ -299,12 +299,6 @@ def execute_timestamp_correction(folderNames, inputParameters): ) name_to_timestamps_ttl = read_ttl(filepath, storesList) - decide_naming_and_applyCorrection_signal_and_control( - filepath, - storesList, - name_to_correctionIndex, - name_to_data, - ) decide_naming_and_applyCorrection_ttl( filepath, timeForLightsTurnOn, From 62cb84f921fbb26c6a7b78e76ca037d68a12bb18 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 12 Dec 2025 17:19:28 -0800 Subject: [PATCH 077/125] Pulled write operations back out of timestamp_correction. --- src/guppy/analysis/timestamp_correction.py | 53 ++++++++++------------ src/guppy/preprocess.py | 5 +- 2 files changed, 28 insertions(+), 30 deletions(-) diff --git a/src/guppy/analysis/timestamp_correction.py b/src/guppy/analysis/timestamp_correction.py index 3d5c73c..e8144f3 100644 --- a/src/guppy/analysis/timestamp_correction.py +++ b/src/guppy/analysis/timestamp_correction.py @@ -30,9 +30,13 @@ def write_corrected_timestamps( write_hdf5(sampling_rate, "timeCorrection_" + name_1, filepath, "sampling_rate") +def write_corrected_data(filepath, name_to_corrected_data): + for name, data in name_to_corrected_data.items(): + write_hdf5(data, name, filepath, "data") + + # function to correct timestamps after eliminating first few seconds of the data (for csv or TDT data depending on mode) def timestampCorrection( - filepath, timeForLightsTurnOn, storesList, name_to_timestamps, @@ -47,19 +51,20 @@ def timestampCorrection( if mode not in ["tdt", "csv"]: logger.error("Mode should be either 'tdt' or 'csv'") raise ValueError("Mode should be either 'tdt' or 'csv'") - name_to_timestamps = name_to_timestamps.copy() + name_to_corrected_timestamps = {} name_to_correctionIndex = {} + name_to_corrected_data = {} storenames = storesList[0, :] names_for_storenames = storesList[1, :] - arr = get_control_and_signal_channel_names(storesList) + data = get_control_and_signal_channel_names(storesList) - indices = check_cntrl_sig_length(arr, name_to_data) + indices = check_cntrl_sig_length(data, name_to_data) - for i in range(arr.shape[1]): - control_name = arr[0, i] - signal_name = arr[1, i] - name_1 = arr[0, i].split("_")[-1] - name_2 = arr[1, i].split("_")[-1] + for i in range(data.shape[1]): + control_name = data[0, i] + signal_name = data[1, i] + name_1 = data[0, i].split("_")[-1] + name_2 = data[1, i].split("_")[-1] if name_1 != name_2: logger.error("Error in naming convention of files or Error in storesList file") raise Exception("Error in naming convention of files or Error in storesList file") @@ -68,8 +73,8 @@ def timestampCorrection( idx = np.where(names_for_storenames == 
indices[i])[0] if idx.shape[0] == 0: - logger.error(f"{arr[0,i]} does not exist in the stores list file.") - raise Exception("{} does not exist in the stores list file.".format(arr[0, i])) + logger.error(f"{data[0,i]} does not exist in the stores list file.") + raise Exception("{} does not exist in the stores list file.".format(data[0, i])) name = names_for_storenames[idx][0] timestamp = name_to_timestamps[name] @@ -91,25 +96,17 @@ def timestampCorrection( correctionIndex = np.where(timestamp >= timeForLightsTurnOn)[0] timestampNew = timestamp[correctionIndex] - name_to_timestamps[name] = timestampNew - name_to_correctionIndex[name] = correctionIndex - - arr = name_to_data[control_name] - if (arr == 0).all() == True: - arr = arr - else: - arr = arr[correctionIndex] - write_hdf5(arr, control_name, filepath, "data") - - arr = name_to_data[signal_name] - if (arr == 0).all() == True: - arr = arr - else: - arr = arr[correctionIndex] - write_hdf5(arr, signal_name, filepath, "data") + for displayName in [control_name, signal_name]: + name_to_corrected_timestamps[displayName] = timestampNew + name_to_correctionIndex[displayName] = correctionIndex + data = name_to_data[displayName] + if (data == 0).all() == True: + name_to_corrected_data[displayName] = data + else: + name_to_corrected_data[displayName] = data[correctionIndex] logger.info("Timestamps corrected and converted to seconds.") - return name_to_timestamps, name_to_correctionIndex + return name_to_corrected_timestamps, name_to_correctionIndex, name_to_corrected_data # function to check if naming convention was followed while saving storeslist file diff --git a/src/guppy/preprocess.py b/src/guppy/preprocess.py index df07c21..4653ce3 100755 --- a/src/guppy/preprocess.py +++ b/src/guppy/preprocess.py @@ -27,6 +27,7 @@ read_control_and_signal, read_ttl, timestampCorrection, + write_corrected_data, write_corrected_timestamps, ) from .analysis.z_score import compute_z_score @@ -280,8 +281,7 @@ def execute_timestamp_correction(folderNames, inputParameters): control_and_signal_dicts = read_control_and_signal(filepath, storesList) name_to_data, name_to_timestamps, name_to_sampling_rate, name_to_npoints = control_and_signal_dicts - name_to_corrected_timestamps, name_to_correctionIndex = timestampCorrection( - filepath, + name_to_corrected_timestamps, name_to_correctionIndex, name_to_corrected_data = timestampCorrection( timeForLightsTurnOn, storesList, name_to_timestamps, @@ -297,6 +297,7 @@ def execute_timestamp_correction(folderNames, inputParameters): name_to_sampling_rate, name_to_correctionIndex, ) + write_corrected_data(filepath, name_to_corrected_data) name_to_timestamps_ttl = read_ttl(filepath, storesList) decide_naming_and_applyCorrection_ttl( From 36ba6b848362e827a489d9700e6ae41d29f6f974 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Mon, 15 Dec 2025 09:13:33 -0800 Subject: [PATCH 078/125] Pulled write operations out of applyCorrection_ttl. 
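
With this patch applyCorrection_ttl becomes a pure function: it takes the raw TTL timestamps, the recording start time and the mode, and returns the shifted timestamps instead of writing them. decide_naming_and_applyCorrection_ttl collects the results into a dict keyed by ttl_name + "_" + name_1 compound names, and the new write_corrected_ttl_timestamps persists them. The resulting call pattern in execute_timestamp_correction, condensed from the preprocess.py hunk below (the shorter variable name corrected_ttl is used here only for readability):

    corrected_ttl = decide_naming_and_applyCorrection_ttl(
        timeForLightsTurnOn, storesList, name_to_timestamps_ttl,
        name_to_timestamps, name_to_data, mode=mode,
    )
    # values are timestamp arrays already shifted by timeRecStart
    # (TDT mode, when applicable) and by timeForLightsTurnOn
    write_corrected_ttl_timestamps(filepath, corrected_ttl)
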
--- src/guppy/analysis/timestamp_correction.py | 46 +++++++++++++--------- src/guppy/preprocess.py | 7 ++-- 2 files changed, 31 insertions(+), 22 deletions(-) diff --git a/src/guppy/analysis/timestamp_correction.py b/src/guppy/analysis/timestamp_correction.py index e8144f3..d9d873f 100644 --- a/src/guppy/analysis/timestamp_correction.py +++ b/src/guppy/analysis/timestamp_correction.py @@ -203,12 +203,12 @@ def applyCorrection_signal_and_control(filepath, displayName, correctionIndex, d def decide_naming_and_applyCorrection_ttl( - filepath, timeForLightsTurnOn, storesList, name_to_timestamps_ttl, name_to_timestamps, name_to_data, + mode, ): logger.debug("Applying correction of timestamps to the data and event timestamps") storenames = storesList[0, :] @@ -216,8 +216,8 @@ def decide_naming_and_applyCorrection_ttl( arr = get_control_and_signal_channel_names(storesList) indices = check_cntrl_sig_length(arr, name_to_data) + compound_name_to_corrected_ttl_timestamps = {} for ttl_name, ttl_timestamps in name_to_timestamps_ttl.items(): - displayName = ttl_name for i in range(arr.shape[1]): name_1 = arr[0, i].split("_")[-1] name_2 = arr[1, i].split("_")[-1] @@ -233,38 +233,46 @@ def decide_naming_and_applyCorrection_ttl( name = names_for_storenames[idx][0] timestamps = name_to_timestamps[name] timeRecStart = timestamps[0] - applyCorrection_ttl( - filepath, + corrected_ttl_timestamps = applyCorrection_ttl( timeForLightsTurnOn, - displayName, - name_1, timeRecStart, ttl_timestamps, + mode, ) + compound_name = ttl_name + "_" + name_1 + compound_name_to_corrected_ttl_timestamps[compound_name] = corrected_ttl_timestamps logger.info("Timestamps corrections applied to the data and event timestamps.") + return compound_name_to_corrected_ttl_timestamps def applyCorrection_ttl( - filepath, timeForLightsTurnOn, - displayName, - naming, timeRecStart, ttl_timestamps, + mode, ): - cond = check_TDT(os.path.dirname(filepath)) - arr = ttl_timestamps - if cond == True: - res = (arr >= timeRecStart).all() + corrected_ttl_timestamps = ttl_timestamps + if mode == "tdt": + res = (corrected_ttl_timestamps >= timeRecStart).all() if res == True: - arr = np.subtract(arr, timeRecStart) - arr = np.subtract(arr, timeForLightsTurnOn) + corrected_ttl_timestamps = np.subtract(corrected_ttl_timestamps, timeRecStart) + corrected_ttl_timestamps = np.subtract(corrected_ttl_timestamps, timeForLightsTurnOn) else: - arr = np.subtract(arr, timeForLightsTurnOn) - else: - arr = np.subtract(arr, timeForLightsTurnOn) - write_hdf5(arr, displayName + "_" + naming, filepath, "ts") + corrected_ttl_timestamps = np.subtract(corrected_ttl_timestamps, timeForLightsTurnOn) + elif mode == "csv": + corrected_ttl_timestamps = np.subtract(corrected_ttl_timestamps, timeForLightsTurnOn) + return corrected_ttl_timestamps + + +def write_corrected_ttl_timestamps( + filepath, + compound_name_to_corrected_ttl_timestamps, +): + logger.debug("Applying correction of timestamps to the data and event timestamps") + for compound_name, corrected_ttl_timestamps in compound_name_to_corrected_ttl_timestamps.items(): + write_hdf5(corrected_ttl_timestamps, compound_name, filepath, "ts") + logger.info("Timestamps corrections applied to the data and event timestamps.") # function to apply correction to control, signal and event timestamps diff --git a/src/guppy/preprocess.py b/src/guppy/preprocess.py index 4653ce3..127e929 100755 --- a/src/guppy/preprocess.py +++ b/src/guppy/preprocess.py @@ -29,6 +29,7 @@ timestampCorrection, write_corrected_data, 
write_corrected_timestamps, + write_corrected_ttl_timestamps, ) from .analysis.z_score import compute_z_score @@ -300,15 +301,15 @@ def execute_timestamp_correction(folderNames, inputParameters): write_corrected_data(filepath, name_to_corrected_data) name_to_timestamps_ttl = read_ttl(filepath, storesList) - decide_naming_and_applyCorrection_ttl( - filepath, + compound_name_to_corrected_ttl_timestamps = decide_naming_and_applyCorrection_ttl( timeForLightsTurnOn, storesList, name_to_timestamps_ttl, name_to_timestamps, name_to_data, + mode=mode, ) - + write_corrected_ttl_timestamps(filepath, compound_name_to_corrected_ttl_timestamps) # check if isosbestic control is false and also if new control channel is added if isosbestic_control == False: create_control_channel(filepath, storesList, window=101) From 05d855ec34dd29adde0d21c1f0685571000adf74 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Mon, 15 Dec 2025 09:34:51 -0800 Subject: [PATCH 079/125] Move add_control_channel and create_control_channel to the control_channel module --- src/guppy/analysis/control_channel.py | 81 ++++++++++++++++++++++ src/guppy/analysis/timestamp_correction.py | 28 -------- src/guppy/preprocess.py | 51 +------------- 3 files changed, 82 insertions(+), 78 deletions(-) diff --git a/src/guppy/analysis/control_channel.py b/src/guppy/analysis/control_channel.py index 2da82e2..d9f6ad8 100644 --- a/src/guppy/analysis/control_channel.py +++ b/src/guppy/analysis/control_channel.py @@ -1,12 +1,93 @@ import logging +import os import numpy as np +import pandas as pd from scipy import signal as ss from scipy.optimize import curve_fit +from .io_utils import ( + read_hdf5, + write_hdf5, +) + logger = logging.getLogger(__name__) +# This function just creates placeholder Control-HDF5 files that are then immediately overwritten later on in the pipeline. +# TODO: Refactor this function to avoid unnecessary file creation. 
+# function to add control channel when there is no +# isosbestic control channel and update the storeslist file +def add_control_channel(filepath, arr): + + storenames = arr[0, :] + storesList = np.char.lower(arr[1, :]) + + keep_control = np.array([]) + # check a case if there is isosbestic control channel present + for i in range(storesList.shape[0]): + if "control" in storesList[i].lower(): + name = storesList[i].split("_")[-1] + new_str = "signal_" + str(name).lower() + find_signal = [True for i in storesList if i == new_str] + if len(find_signal) > 1: + logger.error("Error in naming convention of files or Error in storesList file") + raise Exception("Error in naming convention of files or Error in storesList file") + if len(find_signal) == 0: + logger.error( + "Isosbectic control channel parameter is set to False and still \ + storeslist file shows there is control channel present" + ) + raise Exception( + "Isosbectic control channel parameter is set to False and still \ + storeslist file shows there is control channel present" + ) + else: + continue + + for i in range(storesList.shape[0]): + if "signal" in storesList[i].lower(): + name = storesList[i].split("_")[-1] + new_str = "control_" + str(name).lower() + find_signal = [True for i in storesList if i == new_str] + if len(find_signal) == 0: + src, dst = os.path.join(filepath, arr[0, i] + ".hdf5"), os.path.join( + filepath, "cntrl" + str(i) + ".hdf5" + ) + shutil.copyfile(src, dst) + arr = np.concatenate((arr, [["cntrl" + str(i)], ["control_" + str(arr[1, i].split("_")[-1])]]), axis=1) + + np.savetxt(os.path.join(filepath, "storesList.csv"), arr, delimiter=",", fmt="%s") + + return arr + + +# main function to create control channel using +# signal channel and save it to a file +def create_control_channel(filepath, arr, window=5001): + + storenames = arr[0, :] + storesList = arr[1, :] + + for i in range(storesList.shape[0]): + event_name, event = storesList[i], storenames[i] + if "control" in event_name.lower() and "cntrl" in event.lower(): + logger.debug("Creating control channel from signal channel using curve-fitting") + name = event_name.split("_")[-1] + signal = read_hdf5("signal_" + name, filepath, "data") + timestampNew = read_hdf5("timeCorrection_" + name, filepath, "timestampNew") + sampling_rate = np.full(timestampNew.shape, np.nan) + sampling_rate[0] = read_hdf5("timeCorrection_" + name, filepath, "sampling_rate")[0] + + control = helper_create_control_channel(signal, timestampNew, window) + + write_hdf5(control, event_name, filepath, "data") + d = {"timestamps": timestampNew, "data": control, "sampling_rate": sampling_rate} + df = pd.DataFrame(d) + df.to_csv(os.path.join(os.path.dirname(filepath), event.lower() + ".csv"), index=False) + logger.info("Control channel from signal channel created using curve-fitting") + + # TODO: figure out why a control channel is created for both timestamp correction and z-score steps. 
# helper function to create control channel using signal channel # by curve fitting signal channel to exponential function diff --git a/src/guppy/analysis/timestamp_correction.py b/src/guppy/analysis/timestamp_correction.py index d9d873f..709deca 100644 --- a/src/guppy/analysis/timestamp_correction.py +++ b/src/guppy/analysis/timestamp_correction.py @@ -2,9 +2,7 @@ import os import numpy as np -import pandas as pd -from .control_channel import helper_create_control_channel from .io_utils import ( check_TDT, read_hdf5, @@ -320,32 +318,6 @@ def applyCorrection( write_hdf5(arr, displayName + "_" + naming, filepath, "ts") -# main function to create control channel using -# signal channel and save it to a file -def create_control_channel(filepath, arr, window=5001): - - storenames = arr[0, :] - storesList = arr[1, :] - - for i in range(storesList.shape[0]): - event_name, event = storesList[i], storenames[i] - if "control" in event_name.lower() and "cntrl" in event.lower(): - logger.debug("Creating control channel from signal channel using curve-fitting") - name = event_name.split("_")[-1] - signal = read_hdf5("signal_" + name, filepath, "data") - timestampNew = read_hdf5("timeCorrection_" + name, filepath, "timestampNew") - sampling_rate = np.full(timestampNew.shape, np.nan) - sampling_rate[0] = read_hdf5("timeCorrection_" + name, filepath, "sampling_rate")[0] - - control = helper_create_control_channel(signal, timestampNew, window) - - write_hdf5(control, event_name, filepath, "data") - d = {"timestamps": timestampNew, "data": control, "sampling_rate": sampling_rate} - df = pd.DataFrame(d) - df.to_csv(os.path.join(os.path.dirname(filepath), event.lower() + ".csv"), index=False) - logger.info("Control channel from signal channel created using curve-fitting") - - # function to check control and signal channel has same length # if not, take a smaller length and do pre-processing def check_cntrl_sig_length(channels_arr, name_to_data): diff --git a/src/guppy/preprocess.py b/src/guppy/preprocess.py index 127e929..9f1f14e 100755 --- a/src/guppy/preprocess.py +++ b/src/guppy/preprocess.py @@ -2,7 +2,6 @@ import json import logging import os -import shutil import sys import matplotlib.pyplot as plt @@ -13,6 +12,7 @@ processTimestampsForArtifacts, ) from .analysis.combine_data import combineData +from .analysis.control_channel import add_control_channel, create_control_channel from .analysis.io_utils import ( check_storeslistfile, check_TDT, @@ -22,7 +22,6 @@ takeOnlyDirs, ) from .analysis.timestamp_correction import ( - create_control_channel, decide_naming_and_applyCorrection_ttl, read_control_and_signal, read_ttl, @@ -212,54 +211,6 @@ def visualizeControlAndSignal(filepath, removeArtifacts): visualize(filepath, ts, control, signal, cntrl_sig_fit, plot_name, removeArtifacts) -# This function just creates placeholder Control-HDF5 files that are then immediately overwritten later on in the pipeline. -# TODO: Refactor this function to avoid unnecessary file creation. 
-# function to add control channel when there is no -# isosbestic control channel and update the storeslist file -def add_control_channel(filepath, arr): - - storenames = arr[0, :] - storesList = np.char.lower(arr[1, :]) - - keep_control = np.array([]) - # check a case if there is isosbestic control channel present - for i in range(storesList.shape[0]): - if "control" in storesList[i].lower(): - name = storesList[i].split("_")[-1] - new_str = "signal_" + str(name).lower() - find_signal = [True for i in storesList if i == new_str] - if len(find_signal) > 1: - logger.error("Error in naming convention of files or Error in storesList file") - raise Exception("Error in naming convention of files or Error in storesList file") - if len(find_signal) == 0: - logger.error( - "Isosbectic control channel parameter is set to False and still \ - storeslist file shows there is control channel present" - ) - raise Exception( - "Isosbectic control channel parameter is set to False and still \ - storeslist file shows there is control channel present" - ) - else: - continue - - for i in range(storesList.shape[0]): - if "signal" in storesList[i].lower(): - name = storesList[i].split("_")[-1] - new_str = "control_" + str(name).lower() - find_signal = [True for i in storesList if i == new_str] - if len(find_signal) == 0: - src, dst = os.path.join(filepath, arr[0, i] + ".hdf5"), os.path.join( - filepath, "cntrl" + str(i) + ".hdf5" - ) - shutil.copyfile(src, dst) - arr = np.concatenate((arr, [["cntrl" + str(i)], ["control_" + str(arr[1, i].split("_")[-1])]]), axis=1) - - np.savetxt(os.path.join(filepath, "storesList.csv"), arr, delimiter=",", fmt="%s") - - return arr - - # function to execute timestamps corrections using functions timestampCorrection and decide_naming_convention_and_applyCorrection def execute_timestamp_correction(folderNames, inputParameters): From 1f65c14b838096c4625e5895a791fb5d0976a64e Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Mon, 15 Dec 2025 10:57:02 -0800 Subject: [PATCH 080/125] Moved read and write to standard_io.py. 
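
This leaves timestamp_correction.py as pure computation: the new correct_timestamps wrapper takes the dicts produced by the readers, runs timestampCorrection plus the TTL correction, and returns corrected dicts without touching HDF5. All file access in execute_timestamp_correction now goes through the readers and writers imported from analysis.standard_io (the module itself is added in the following patch with the same subject). A condensed sketch of the read / compute / write flow, taken from the preprocess.py hunk below:

    # read raw stores
    name_to_data, name_to_timestamps, name_to_sampling_rate, name_to_npoints = \
        read_control_and_signal(filepath, storesList)
    name_to_timestamps_ttl = read_ttl(filepath, storesList)

    # in-memory correction, no HDF5 access
    (name_to_corrected_timestamps, name_to_correctionIndex, name_to_corrected_data,
     compound_name_to_corrected_ttl_timestamps) = correct_timestamps(
        timeForLightsTurnOn, storesList, name_to_timestamps, name_to_data,
        name_to_sampling_rate, name_to_npoints, name_to_timestamps_ttl, mode=mode,
    )

    # persist the corrected results
    write_corrected_timestamps(filepath, name_to_corrected_timestamps, name_to_timestamps,
                               name_to_sampling_rate, name_to_correctionIndex)
    write_corrected_data(filepath, name_to_corrected_data)
    write_corrected_ttl_timestamps(filepath, compound_name_to_corrected_ttl_timestamps)
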
--- src/guppy/analysis/io_utils.py | 19 +++ src/guppy/analysis/timestamp_correction.py | 138 +++++---------------- src/guppy/preprocess.py | 28 ++--- 3 files changed, 66 insertions(+), 119 deletions(-) diff --git a/src/guppy/analysis/io_utils.py b/src/guppy/analysis/io_utils.py index 8b10127..c11edba 100644 --- a/src/guppy/analysis/io_utils.py +++ b/src/guppy/analysis/io_utils.py @@ -166,3 +166,22 @@ def check_storeslistfile(folderNames): storesList = np.unique(storesList, axis=1) return storesList + + +def get_control_and_signal_channel_names(storesList): + storenames = storesList[0, :] + names_for_storenames = storesList[1, :] + + channels_arr = [] + for i in range(names_for_storenames.shape[0]): + if "control" in names_for_storenames[i].lower() or "signal" in names_for_storenames[i].lower(): + channels_arr.append(names_for_storenames[i]) + + channels_arr = sorted(channels_arr, key=str.casefold) + try: + channels_arr = np.asarray(channels_arr).reshape(2, -1) + except: + logger.error("Error in saving stores list file or spelling mistake for control or signal") + raise Exception("Error in saving stores list file or spelling mistake for control or signal") + + return channels_arr diff --git a/src/guppy/analysis/timestamp_correction.py b/src/guppy/analysis/timestamp_correction.py index 709deca..f48a255 100644 --- a/src/guppy/analysis/timestamp_correction.py +++ b/src/guppy/analysis/timestamp_correction.py @@ -5,32 +5,47 @@ from .io_utils import ( check_TDT, - read_hdf5, + get_control_and_signal_channel_names, write_hdf5, ) logger = logging.getLogger(__name__) -def write_corrected_timestamps( - filepath, corrected_name_to_timestamps, name_to_timestamps, name_to_sampling_rate, name_to_correctionIndex +def correct_timestamps( + timeForLightsTurnOn, + storesList, + name_to_timestamps, + name_to_data, + name_to_sampling_rate, + name_to_npoints, + name_to_timestamps_ttl, + mode, ): - for name, correctionIndex in name_to_correctionIndex.items(): - timestamps = name_to_timestamps[name] - corrected_timestamps = corrected_name_to_timestamps[name] - sampling_rate = name_to_sampling_rate[name] - if sampling_rate.shape == (): # numpy scalar - sampling_rate = np.asarray([sampling_rate]) - name_1 = name.split("_")[-1] - write_hdf5(np.asarray([timestamps[0]]), "timeCorrection_" + name_1, filepath, "timeRecStart") - write_hdf5(corrected_timestamps, "timeCorrection_" + name_1, filepath, "timestampNew") - write_hdf5(correctionIndex, "timeCorrection_" + name_1, filepath, "correctionIndex") - write_hdf5(sampling_rate, "timeCorrection_" + name_1, filepath, "sampling_rate") - + name_to_corrected_timestamps, name_to_correctionIndex, name_to_corrected_data = timestampCorrection( + timeForLightsTurnOn, + storesList, + name_to_timestamps, + name_to_data, + name_to_sampling_rate, + name_to_npoints, + mode=mode, + ) + compound_name_to_corrected_ttl_timestamps = decide_naming_and_applyCorrection_ttl( + timeForLightsTurnOn, + storesList, + name_to_timestamps_ttl, + name_to_timestamps, + name_to_data, + mode=mode, + ) -def write_corrected_data(filepath, name_to_corrected_data): - for name, data in name_to_corrected_data.items(): - write_hdf5(data, name, filepath, "data") + return ( + name_to_corrected_timestamps, + name_to_correctionIndex, + name_to_corrected_data, + compound_name_to_corrected_ttl_timestamps, + ) # function to correct timestamps after eliminating first few seconds of the data (for csv or TDT data depending on mode) @@ -263,16 +278,6 @@ def applyCorrection_ttl( return corrected_ttl_timestamps -def 
write_corrected_ttl_timestamps( - filepath, - compound_name_to_corrected_ttl_timestamps, -): - logger.debug("Applying correction of timestamps to the data and event timestamps") - for compound_name, corrected_ttl_timestamps in compound_name_to_corrected_ttl_timestamps.items(): - write_hdf5(corrected_ttl_timestamps, compound_name, filepath, "ts") - logger.info("Timestamps corrections applied to the data and event timestamps.") - - # function to apply correction to control, signal and event timestamps def applyCorrection( filepath, @@ -336,80 +341,3 @@ def check_cntrl_sig_length(channels_arr, name_to_data): indices.append(signal_name) return indices - - -def get_control_and_signal_channel_names(storesList): - storenames = storesList[0, :] - names_for_storenames = storesList[1, :] - - channels_arr = [] - for i in range(names_for_storenames.shape[0]): - if "control" in names_for_storenames[i].lower() or "signal" in names_for_storenames[i].lower(): - channels_arr.append(names_for_storenames[i]) - - channels_arr = sorted(channels_arr, key=str.casefold) - try: - channels_arr = np.asarray(channels_arr).reshape(2, -1) - except: - logger.error("Error in saving stores list file or spelling mistake for control or signal") - raise Exception("Error in saving stores list file or spelling mistake for control or signal") - - return channels_arr - - -def read_control_and_signal(filepath, storesList): - channels_arr = get_control_and_signal_channel_names(storesList) - storenames = storesList[0, :] - names_for_storenames = storesList[1, :] - - name_to_data = {} - name_to_timestamps = {} - name_to_sampling_rate = {} - name_to_npoints = {} - - for i in range(channels_arr.shape[1]): - control_name = channels_arr[0, i] - signal_name = channels_arr[1, i] - idx_c = np.where(storesList == control_name)[0] - idx_s = np.where(storesList == signal_name)[0] - control_storename = storenames[idx_c[0]] - signal_storename = storenames[idx_s[0]] - - control_data = read_hdf5(control_storename, filepath, "data") - signal_data = read_hdf5(signal_storename, filepath, "data") - control_timestamps = read_hdf5(control_storename, filepath, "timestamps") - signal_timestamps = read_hdf5(signal_storename, filepath, "timestamps") - control_sampling_rate = read_hdf5(control_storename, filepath, "sampling_rate") - signal_sampling_rate = read_hdf5(signal_storename, filepath, "sampling_rate") - try: # TODO: define npoints for csv datasets - control_npoints = read_hdf5(control_storename, filepath, "npoints") - signal_npoints = read_hdf5(signal_storename, filepath, "npoints") - except KeyError: # npoints is not defined for csv datasets - control_npoints = None - signal_npoints = None - - name_to_data[control_name] = control_data - name_to_data[signal_name] = signal_data - name_to_timestamps[control_name] = control_timestamps - name_to_timestamps[signal_name] = signal_timestamps - name_to_sampling_rate[control_name] = control_sampling_rate - name_to_sampling_rate[signal_name] = signal_sampling_rate - name_to_npoints[control_name] = control_npoints - name_to_npoints[signal_name] = signal_npoints - - return name_to_data, name_to_timestamps, name_to_sampling_rate, name_to_npoints - - -def read_ttl(filepath, storesList): - channels_arr = get_control_and_signal_channel_names(storesList) - storenames = storesList[0, :] - names_for_storenames = storesList[1, :] - - name_to_timestamps = {} - for storename, name in zip(storenames, names_for_storenames): - if name in channels_arr: - continue - timestamps = read_hdf5(storename, filepath, "timestamps") - 
name_to_timestamps[name] = timestamps - - return name_to_timestamps diff --git a/src/guppy/preprocess.py b/src/guppy/preprocess.py index 9f1f14e..aa0c761 100755 --- a/src/guppy/preprocess.py +++ b/src/guppy/preprocess.py @@ -21,15 +21,14 @@ read_hdf5, takeOnlyDirs, ) -from .analysis.timestamp_correction import ( - decide_naming_and_applyCorrection_ttl, +from .analysis.standard_io import ( read_control_and_signal, read_ttl, - timestampCorrection, write_corrected_data, write_corrected_timestamps, write_corrected_ttl_timestamps, ) +from .analysis.timestamp_correction import correct_timestamps from .analysis.z_score import compute_z_score logger = logging.getLogger(__name__) @@ -233,15 +232,25 @@ def execute_timestamp_correction(folderNames, inputParameters): control_and_signal_dicts = read_control_and_signal(filepath, storesList) name_to_data, name_to_timestamps, name_to_sampling_rate, name_to_npoints = control_and_signal_dicts - name_to_corrected_timestamps, name_to_correctionIndex, name_to_corrected_data = timestampCorrection( + name_to_timestamps_ttl = read_ttl(filepath, storesList) + + timestamps_dicts = correct_timestamps( timeForLightsTurnOn, storesList, name_to_timestamps, name_to_data, name_to_sampling_rate, name_to_npoints, + name_to_timestamps_ttl, mode=mode, ) + ( + name_to_corrected_timestamps, + name_to_correctionIndex, + name_to_corrected_data, + compound_name_to_corrected_ttl_timestamps, + ) = timestamps_dicts + write_corrected_timestamps( filepath, name_to_corrected_timestamps, @@ -250,17 +259,8 @@ def execute_timestamp_correction(folderNames, inputParameters): name_to_correctionIndex, ) write_corrected_data(filepath, name_to_corrected_data) - - name_to_timestamps_ttl = read_ttl(filepath, storesList) - compound_name_to_corrected_ttl_timestamps = decide_naming_and_applyCorrection_ttl( - timeForLightsTurnOn, - storesList, - name_to_timestamps_ttl, - name_to_timestamps, - name_to_data, - mode=mode, - ) write_corrected_ttl_timestamps(filepath, compound_name_to_corrected_ttl_timestamps) + # check if isosbestic control is false and also if new control channel is added if isosbestic_control == False: create_control_channel(filepath, storesList, window=101) From b628232b16de5a59260e8caa09b75a3504a56c40 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Mon, 15 Dec 2025 10:57:18 -0800 Subject: [PATCH 081/125] Moved read and write to standard_io.py. 
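
The new module collects the plain HDF5 readers and writers used by the timestamp-correction step: read_control_and_signal, read_ttl, write_corrected_timestamps, write_corrected_data and write_corrected_ttl_timestamps. For each channel suffix name_1 (the part after the last underscore in the channel name), write_corrected_timestamps stores the datasets timeRecStart, timestampNew, correctionIndex and sampling_rate under "timeCorrection_" + name_1. A hedged read-back sketch using read_hdf5 from io_utils; "region1" is a made-up channel suffix used only for illustration:

    # for a channel pair named control_region1 / signal_region1
    timestamp_new    = read_hdf5("timeCorrection_region1", filepath, "timestampNew")
    correction_index = read_hdf5("timeCorrection_region1", filepath, "correctionIndex")
    sampling_rate    = read_hdf5("timeCorrection_region1", filepath, "sampling_rate")
    time_rec_start   = read_hdf5("timeCorrection_region1", filepath, "timeRecStart")
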
--- src/guppy/analysis/standard_io.py | 100 ++++++++++++++++++++++++++++++ 1 file changed, 100 insertions(+) create mode 100644 src/guppy/analysis/standard_io.py diff --git a/src/guppy/analysis/standard_io.py b/src/guppy/analysis/standard_io.py new file mode 100644 index 0000000..2ce8189 --- /dev/null +++ b/src/guppy/analysis/standard_io.py @@ -0,0 +1,100 @@ +import logging + +import numpy as np + +from .io_utils import ( + get_control_and_signal_channel_names, + read_hdf5, + write_hdf5, +) + +logger = logging.getLogger(__name__) + + +def read_control_and_signal(filepath, storesList): + channels_arr = get_control_and_signal_channel_names(storesList) + storenames = storesList[0, :] + names_for_storenames = storesList[1, :] + + name_to_data = {} + name_to_timestamps = {} + name_to_sampling_rate = {} + name_to_npoints = {} + + for i in range(channels_arr.shape[1]): + control_name = channels_arr[0, i] + signal_name = channels_arr[1, i] + idx_c = np.where(storesList == control_name)[0] + idx_s = np.where(storesList == signal_name)[0] + control_storename = storenames[idx_c[0]] + signal_storename = storenames[idx_s[0]] + + control_data = read_hdf5(control_storename, filepath, "data") + signal_data = read_hdf5(signal_storename, filepath, "data") + control_timestamps = read_hdf5(control_storename, filepath, "timestamps") + signal_timestamps = read_hdf5(signal_storename, filepath, "timestamps") + control_sampling_rate = read_hdf5(control_storename, filepath, "sampling_rate") + signal_sampling_rate = read_hdf5(signal_storename, filepath, "sampling_rate") + try: # TODO: define npoints for csv datasets + control_npoints = read_hdf5(control_storename, filepath, "npoints") + signal_npoints = read_hdf5(signal_storename, filepath, "npoints") + except KeyError: # npoints is not defined for csv datasets + control_npoints = None + signal_npoints = None + + name_to_data[control_name] = control_data + name_to_data[signal_name] = signal_data + name_to_timestamps[control_name] = control_timestamps + name_to_timestamps[signal_name] = signal_timestamps + name_to_sampling_rate[control_name] = control_sampling_rate + name_to_sampling_rate[signal_name] = signal_sampling_rate + name_to_npoints[control_name] = control_npoints + name_to_npoints[signal_name] = signal_npoints + + return name_to_data, name_to_timestamps, name_to_sampling_rate, name_to_npoints + + +def read_ttl(filepath, storesList): + channels_arr = get_control_and_signal_channel_names(storesList) + storenames = storesList[0, :] + names_for_storenames = storesList[1, :] + + name_to_timestamps = {} + for storename, name in zip(storenames, names_for_storenames): + if name in channels_arr: + continue + timestamps = read_hdf5(storename, filepath, "timestamps") + name_to_timestamps[name] = timestamps + + return name_to_timestamps + + +def write_corrected_timestamps( + filepath, corrected_name_to_timestamps, name_to_timestamps, name_to_sampling_rate, name_to_correctionIndex +): + for name, correctionIndex in name_to_correctionIndex.items(): + timestamps = name_to_timestamps[name] + corrected_timestamps = corrected_name_to_timestamps[name] + sampling_rate = name_to_sampling_rate[name] + if sampling_rate.shape == (): # numpy scalar + sampling_rate = np.asarray([sampling_rate]) + name_1 = name.split("_")[-1] + write_hdf5(np.asarray([timestamps[0]]), "timeCorrection_" + name_1, filepath, "timeRecStart") + write_hdf5(corrected_timestamps, "timeCorrection_" + name_1, filepath, "timestampNew") + write_hdf5(correctionIndex, "timeCorrection_" + name_1, filepath, 
"correctionIndex") + write_hdf5(sampling_rate, "timeCorrection_" + name_1, filepath, "sampling_rate") + + +def write_corrected_data(filepath, name_to_corrected_data): + for name, data in name_to_corrected_data.items(): + write_hdf5(data, name, filepath, "data") + + +def write_corrected_ttl_timestamps( + filepath, + compound_name_to_corrected_ttl_timestamps, +): + logger.debug("Applying correction of timestamps to the data and event timestamps") + for compound_name, corrected_ttl_timestamps in compound_name_to_corrected_ttl_timestamps.items(): + write_hdf5(corrected_ttl_timestamps, compound_name, filepath, "ts") + logger.info("Timestamps corrections applied to the data and event timestamps.") From 90e838bccde583051ddbf52e5d8902f4c4f01c00 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Mon, 15 Dec 2025 11:06:02 -0800 Subject: [PATCH 082/125] Removed unused functions after the refactor. --- src/guppy/analysis/timestamp_correction.py | 145 +-------------------- 1 file changed, 1 insertion(+), 144 deletions(-) diff --git a/src/guppy/analysis/timestamp_correction.py b/src/guppy/analysis/timestamp_correction.py index f48a255..60cf76a 100644 --- a/src/guppy/analysis/timestamp_correction.py +++ b/src/guppy/analysis/timestamp_correction.py @@ -1,13 +1,8 @@ import logging -import os import numpy as np -from .io_utils import ( - check_TDT, - get_control_and_signal_channel_names, - write_hdf5, -) +from .io_utils import get_control_and_signal_channel_names logger = logging.getLogger(__name__) @@ -122,99 +117,6 @@ def timestampCorrection( return name_to_corrected_timestamps, name_to_correctionIndex, name_to_corrected_data -# function to check if naming convention was followed while saving storeslist file -# and apply timestamps correction using the function applyCorrection -def decide_naming_convention_and_applyCorrection( - filepath, - timeForLightsTurnOn, - event, - displayName, - storesList, - name_1_to_corrected_timestamps, - name_1_to_timestamps, - name_1_to_sampling_rate, - name_1_to_correctionIndex, - data, - ttl_timestamps, -): - - logger.debug("Applying correction of timestamps to the data and event timestamps") - arr = get_control_and_signal_channel_names(storesList) - - for i in range(arr.shape[1]): - name_1 = arr[0, i].split("_")[-1] - name_2 = arr[1, i].split("_")[-1] - if name_1 != name_2: - logger.error("Error in naming convention of files or Error in storesList file") - raise Exception("Error in naming convention of files or Error in storesList file") - else: - corrected_timestamps = name_1_to_corrected_timestamps[name_1] - timestamps = name_1_to_timestamps[name_1] - timeRecStart = timestamps[0] - sampling_rate = name_1_to_sampling_rate[name_1] - correctionIndex = name_1_to_correctionIndex[name_1] - applyCorrection( - filepath, - timeForLightsTurnOn, - event, - displayName, - name_1, - corrected_timestamps, - sampling_rate, - correctionIndex, - timeRecStart, - data, - ttl_timestamps, - ) - - logger.info("Timestamps corrections applied to the data and event timestamps.") - - -def decide_naming_and_applyCorrection_signal_and_control( - filepath, - storesList, - name_to_correctionIndex, - name_to_data, -): - logger.debug("Applying correction of timestamps to the data and event timestamps") - storenames = storesList[0, :] - names_for_storenames = storesList[1, :] - arr = get_control_and_signal_channel_names(storesList) - indices = check_cntrl_sig_length(arr, name_to_data) - - for i in range(arr.shape[1]): - name_1 = arr[0, i].split("_")[-1] - name_2 = arr[1, i].split("_")[-1] - if name_1 != 
name_2: - logger.error("Error in naming convention of files or Error in storesList file") - raise Exception("Error in naming convention of files or Error in storesList file") - - idx = np.where(names_for_storenames == indices[i])[0] - if idx.shape[0] == 0: - logger.error(f"{arr[0,i]} does not exist in the stores list file.") - raise Exception("{} does not exist in the stores list file.".format(arr[0, i])) - - name = names_for_storenames[idx][0] - correctionIndex = name_to_correctionIndex[name] - control_name = arr[0, i] - signal_name = arr[1, i] - control_data = name_to_data[control_name] - signal_data = name_to_data[signal_name] - applyCorrection_signal_and_control(filepath, control_name, correctionIndex, control_data) - applyCorrection_signal_and_control(filepath, signal_name, correctionIndex, signal_data) - - logger.info("Timestamps corrections applied to the data and event timestamps.") - - -def applyCorrection_signal_and_control(filepath, displayName, correctionIndex, data): - arr = data - if (arr == 0).all() == True: - arr = arr - else: - arr = arr[correctionIndex] - write_hdf5(arr, displayName, filepath, "data") - - def decide_naming_and_applyCorrection_ttl( timeForLightsTurnOn, storesList, @@ -278,51 +180,6 @@ def applyCorrection_ttl( return corrected_ttl_timestamps -# function to apply correction to control, signal and event timestamps -def applyCorrection( - filepath, - timeForLightsTurnOn, - event, - displayName, - naming, - corrected_timestamps, - sampling_rate, - correctionIndex, - timeRecStart, - data, - ttl_timestamps, -): - - cond = check_TDT(os.path.dirname(filepath)) - - timestampNew = corrected_timestamps - if "control" in displayName.lower() or "signal" in displayName.lower(): - # TODO: double-check that this code is not reachable - # split_name = displayName.split("_")[-1] - # if split_name == naming: - # pass - # else: - # correctionIndex = read_hdf5("timeCorrection_" + split_name, filepath, "correctionIndex") - arr = data - if (arr == 0).all() == True: - arr = arr - else: - arr = arr[correctionIndex] - write_hdf5(arr, displayName, filepath, "data") - else: - arr = ttl_timestamps - if cond == True: - res = (arr >= timeRecStart).all() - if res == True: - arr = np.subtract(arr, timeRecStart) - arr = np.subtract(arr, timeForLightsTurnOn) - else: - arr = np.subtract(arr, timeForLightsTurnOn) - else: - arr = np.subtract(arr, timeForLightsTurnOn) - write_hdf5(arr, displayName + "_" + naming, filepath, "ts") - - # function to check control and signal channel has same length # if not, take a smaller length and do pre-processing def check_cntrl_sig_length(channels_arr, name_to_data): From bf57616f1671a0c5a0ca674cceb6c36cbdbc8fe5 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Mon, 15 Dec 2025 11:47:54 -0800 Subject: [PATCH 083/125] Refactored artifact removal separate from z score --- src/guppy/preprocess.py | 62 ++++++++++++++++++++++++++++------------- 1 file changed, 43 insertions(+), 19 deletions(-) diff --git a/src/guppy/preprocess.py b/src/guppy/preprocess.py index aa0c761..4f72929 100755 --- a/src/guppy/preprocess.py +++ b/src/guppy/preprocess.py @@ -270,15 +270,11 @@ def execute_timestamp_correction(folderNames, inputParameters): logger.info(f"Timestamps corrections finished for {filepath}") -# function to compute z-score and deltaF/F using functions : compute_z_score and/or processTimestampsForArtifacts +# function to compute z-score and deltaF/F def execute_zscore(folderNames, inputParameters): - timeForLightsTurnOn = inputParameters["timeForLightsTurnOn"] - 
remove_artifacts = inputParameters["removeArtifacts"] - artifactsRemovalMethod = inputParameters["artifactsRemovalMethod"] plot_zScore_dff = inputParameters["plot_zScore_dff"] combine_data = inputParameters["combine_data"] - isosbestic_control = inputParameters["isosbestic_control"] storesListPath = [] for i in range(len(folderNames)): @@ -292,20 +288,9 @@ def execute_zscore(folderNames, inputParameters): for j in range(len(storesListPath)): filepath = storesListPath[j] - storesList = np.genfromtxt(os.path.join(filepath, "storesList.csv"), dtype="str", delimiter=",").reshape(2, -1) - if remove_artifacts == True: - logger.debug("Removing Artifacts from the data and correcting timestamps...") - compute_z_score(filepath, inputParameters) - if artifactsRemovalMethod == "concatenate": - processTimestampsForArtifacts(filepath, timeForLightsTurnOn, storesList) - else: - addingNaNtoChunksWithArtifacts(filepath, storesList) - visualizeControlAndSignal(filepath, remove_artifacts) - logger.info("Artifacts from the data are removed and timestamps are corrected.") - else: - compute_z_score(filepath, inputParameters) - visualizeControlAndSignal(filepath, remove_artifacts) + compute_z_score(filepath, inputParameters) + visualizeControlAndSignal(filepath, removeArtifacts=False) if plot_zScore_dff == "z_score": visualize_z_score(filepath) @@ -319,7 +304,42 @@ def execute_zscore(folderNames, inputParameters): inputParameters["step"] += 1 plt.show() - logger.info("Signal data and event timestamps are extracted.") + logger.info("Z-score computation completed.") + + +# function to remove artifacts from z-score data +def execute_artifact_removal(folderNames, inputParameters): + + timeForLightsTurnOn = inputParameters["timeForLightsTurnOn"] + artifactsRemovalMethod = inputParameters["artifactsRemovalMethod"] + combine_data = inputParameters["combine_data"] + + storesListPath = [] + for i in range(len(folderNames)): + if combine_data == True: + storesListPath.append([folderNames[i][0]]) + else: + filepath = folderNames[i] + storesListPath.append(takeOnlyDirs(glob.glob(os.path.join(filepath, "*_output_*")))) + + storesListPath = np.concatenate(storesListPath) + + for j in range(len(storesListPath)): + filepath = storesListPath[j] + storesList = np.genfromtxt(os.path.join(filepath, "storesList.csv"), dtype="str", delimiter=",").reshape(2, -1) + + logger.debug("Removing artifacts from the data...") + if artifactsRemovalMethod == "concatenate": + processTimestampsForArtifacts(filepath, timeForLightsTurnOn, storesList) + else: + addingNaNtoChunksWithArtifacts(filepath, storesList) + visualizeControlAndSignal(filepath, removeArtifacts=True) + logger.info("Artifacts removed and timestamps corrected.") + + writeToFile(str(10 + ((inputParameters["step"] + 1) * 10)) + "\n") + inputParameters["step"] += 1 + + logger.info("Artifact removal completed.") def extractTsAndSignal(inputParameters): @@ -350,6 +370,8 @@ def extractTsAndSignal(inputParameters): writeToFile(str((pbMaxValue + 1) * 10) + "\n" + str(10) + "\n") execute_timestamp_correction(folderNames, inputParameters) execute_zscore(folderNames, inputParameters) + if remove_artifacts == True: + execute_artifact_removal(folderNames, inputParameters) else: pbMaxValue = 1 + len(folderNames) writeToFile(str((pbMaxValue) * 10) + "\n" + str(10) + "\n") @@ -357,6 +379,8 @@ def extractTsAndSignal(inputParameters): storesList = check_storeslistfile(folderNames) op_folder = combineData(folderNames, inputParameters, storesList) execute_zscore(op_folder, inputParameters) + if 
remove_artifacts == True: + execute_artifact_removal(op_folder, inputParameters) def main(input_parameters): From a03d018fb3ee1a5cf5558a8a8afc34f8019d665a Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Mon, 15 Dec 2025 14:27:38 -0800 Subject: [PATCH 084/125] Added artifact removal parameter back to execute_zscore. --- src/guppy/preprocess.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/guppy/preprocess.py b/src/guppy/preprocess.py index 4f72929..ad4507e 100755 --- a/src/guppy/preprocess.py +++ b/src/guppy/preprocess.py @@ -275,6 +275,7 @@ def execute_zscore(folderNames, inputParameters): plot_zScore_dff = inputParameters["plot_zScore_dff"] combine_data = inputParameters["combine_data"] + remove_artifacts = inputParameters["removeArtifacts"] storesListPath = [] for i in range(len(folderNames)): @@ -290,7 +291,8 @@ def execute_zscore(folderNames, inputParameters): filepath = storesListPath[j] compute_z_score(filepath, inputParameters) - visualizeControlAndSignal(filepath, removeArtifacts=False) + if not remove_artifacts: + visualizeControlAndSignal(filepath, removeArtifacts=remove_artifacts) if plot_zScore_dff == "z_score": visualize_z_score(filepath) @@ -334,11 +336,11 @@ def execute_artifact_removal(folderNames, inputParameters): else: addingNaNtoChunksWithArtifacts(filepath, storesList) visualizeControlAndSignal(filepath, removeArtifacts=True) - logger.info("Artifacts removed and timestamps corrected.") writeToFile(str(10 + ((inputParameters["step"] + 1) * 10)) + "\n") inputParameters["step"] += 1 + plt.show() logger.info("Artifact removal completed.") From e0a4ca80e6b470c6d9d53e2a8c3032e93246e5a9 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Mon, 15 Dec 2025 14:31:13 -0800 Subject: [PATCH 085/125] Removed idle removeArtifacts parameter from compute z-score function. 
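This collapses to a single write path because both arms of the removed if/else wrote the same three datasets. A minimal sketch of the equivalent writes (illustrative only, assuming the repo's write_hdf5(data, name, filepath, key) helper is in scope; the diff below keeps the three explicit calls rather than a loop):

    for prefix, arr in (("z_score_", z_score), ("dff_", dff), ("cntrl_sig_fit_", control_fit)):
        # identical to what both former branches did
        write_hdf5(arr, prefix + name, filepath, "data")
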
--- src/guppy/analysis/z_score.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/src/guppy/analysis/z_score.py b/src/guppy/analysis/z_score.py index b5032be..87bf184 100644 --- a/src/guppy/analysis/z_score.py +++ b/src/guppy/analysis/z_score.py @@ -19,7 +19,6 @@ def compute_z_score(filepath, inputParameters): logger.debug(f"Computing z-score for each of the data in {filepath}") - remove_artifacts = inputParameters["removeArtifacts"] path_1 = find_files(filepath, "control_*", ignore_case=True) # glob.glob(os.path.join(filepath, 'control*')) path_2 = find_files(filepath, "signal_*", ignore_case=True) # glob.glob(os.path.join(filepath, 'signal*')) @@ -48,14 +47,9 @@ def compute_z_score(filepath, inputParameters): # signal_smooth = ss.filtfilt(b, a, signal) # _score, dff = helper_z_score(control_smooth, signal_smooth) z_score, dff, control_fit = helper_z_score(control, signal, filepath, name, inputParameters) - if remove_artifacts == True: - write_hdf5(z_score, "z_score_" + name, filepath, "data") - write_hdf5(dff, "dff_" + name, filepath, "data") - write_hdf5(control_fit, "cntrl_sig_fit_" + name, filepath, "data") - else: - write_hdf5(z_score, "z_score_" + name, filepath, "data") - write_hdf5(dff, "dff_" + name, filepath, "data") - write_hdf5(control_fit, "cntrl_sig_fit_" + name, filepath, "data") + write_hdf5(z_score, "z_score_" + name, filepath, "data") + write_hdf5(dff, "dff_" + name, filepath, "data") + write_hdf5(control_fit, "cntrl_sig_fit_" + name, filepath, "data") else: logger.error("Error in naming convention of files or Error in storesList file") raise Exception("Error in naming convention of files or Error in storesList file") From 44292ae41c2e7cc7ff2a94c93040da30ddba739d Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Mon, 15 Dec 2025 15:55:25 -0800 Subject: [PATCH 086/125] Streamlined remove artifact branch of the helper_z_score function. 
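The streamlining below hinges on treating the no-artifact case as a single chunk spanning the entire recording, so one loop over coords covers both paths. A rough sketch of that idea, assuming numpy as np and the existing fetchCoords helper (names follow the diff):

    if removeArtifacts:
        coords = fetchCoords(filepath, name, tsNew)            # user-selected (onset, offset) chunks
    else:
        dt = tsNew[1] - tsNew[0]
        coords = np.array([[tsNew[0] - dt, tsNew[-1] + dt]])   # one chunk covering every sample

The dt padding keeps the strict (tsNew > onset) & (tsNew < offset) masks from dropping the first and last samples.
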
--- src/guppy/analysis/z_score.py | 62 +++++++++++++++-------------------- 1 file changed, 27 insertions(+), 35 deletions(-) diff --git a/src/guppy/analysis/z_score.py b/src/guppy/analysis/z_score.py index 87bf184..5f64d7f 100644 --- a/src/guppy/analysis/z_score.py +++ b/src/guppy/analysis/z_score.py @@ -80,42 +80,34 @@ def helper_z_score(control, signal, filepath, name, inputParameters): # helper_ if removeArtifacts == True: coords = fetchCoords(filepath, name, tsNew) - - # for artifacts removal, each chunk which was selected by user is being processed individually and then - # z-score is calculated - for i in range(coords.shape[0]): - tsNew_index = np.where((tsNew > coords[i, 0]) & (tsNew < coords[i, 1]))[0] - if isosbestic_control == False: - control_arr = helper_create_control_channel(signal[tsNew_index], tsNew[tsNew_index], window=101) - signal_arr = signal[tsNew_index] - norm_data, control_fit = execute_controlFit_dff( - control_arr, signal_arr, isosbestic_control, filter_window - ) - temp_control_arr[tsNew_index] = control_arr - if i < coords.shape[0] - 1: - blank_index = np.where((tsNew > coords[i, 1]) & (tsNew < coords[i + 1, 0]))[0] - temp_control_arr[blank_index] = np.full(blank_index.shape[0], np.nan) - else: - control_arr = control[tsNew_index] - signal_arr = signal[tsNew_index] - norm_data, control_fit = execute_controlFit_dff( - control_arr, signal_arr, isosbestic_control, filter_window - ) - norm_data_arr[tsNew_index] = norm_data - control_fit_arr[tsNew_index] = control_fit - - if artifactsRemovalMethod == "concatenate": - norm_data_arr = norm_data_arr[~np.isnan(norm_data_arr)] - control_fit_arr = control_fit_arr[~np.isnan(control_fit_arr)] - z_score = z_score_computation(norm_data_arr, tsNew, inputParameters) - z_score_arr = np.concatenate((z_score_arr, z_score)) else: - tsNew_index = np.arange(tsNew.shape[0]) - norm_data, control_fit = execute_controlFit_dff(control, signal, isosbestic_control, filter_window) - z_score = z_score_computation(norm_data, tsNew, inputParameters) - z_score_arr = np.concatenate((z_score_arr, z_score)) - norm_data_arr[tsNew_index] = norm_data # np.concatenate((norm_data_arr, norm_data)) - control_fit_arr[tsNew_index] = control_fit # np.concatenate((control_fit_arr, control_fit)) + dt = tsNew[1] - tsNew[0] + coords = np.array([[tsNew[0] - dt, tsNew[-1] + dt]]) + + # for artifacts removal, each chunk which was selected by user is being processed individually and then + # z-score is calculated + for i in range(coords.shape[0]): + tsNew_index = np.where((tsNew > coords[i, 0]) & (tsNew < coords[i, 1]))[0] + if isosbestic_control == False: + control_arr = helper_create_control_channel(signal[tsNew_index], tsNew[tsNew_index], window=101) + signal_arr = signal[tsNew_index] + norm_data, control_fit = execute_controlFit_dff(control_arr, signal_arr, isosbestic_control, filter_window) + temp_control_arr[tsNew_index] = control_arr + if i < coords.shape[0] - 1: + blank_index = np.where((tsNew > coords[i, 1]) & (tsNew < coords[i + 1, 0]))[0] + temp_control_arr[blank_index] = np.full(blank_index.shape[0], np.nan) + else: + control_arr = control[tsNew_index] + signal_arr = signal[tsNew_index] + norm_data, control_fit = execute_controlFit_dff(control_arr, signal_arr, isosbestic_control, filter_window) + norm_data_arr[tsNew_index] = norm_data + control_fit_arr[tsNew_index] = control_fit + + if artifactsRemovalMethod == "concatenate": + norm_data_arr = norm_data_arr[~np.isnan(norm_data_arr)] + control_fit_arr = control_fit_arr[~np.isnan(control_fit_arr)] + z_score 
= z_score_computation(norm_data_arr, tsNew, inputParameters) + z_score_arr = np.concatenate((z_score_arr, z_score)) # handle the case if there are chunks being cut in the front and the end if isosbestic_control == False and removeArtifacts == True: From 6da97c08ec9da0448b9a7ace28f31ebea463b62b Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Mon, 15 Dec 2025 16:02:49 -0800 Subject: [PATCH 087/125] Streamlined remove artifact branch of the helper_z_score function pt 2 --- src/guppy/analysis/control_channel.py | 1 + src/guppy/analysis/z_score.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/guppy/analysis/control_channel.py b/src/guppy/analysis/control_channel.py index d9f6ad8..605bd17 100644 --- a/src/guppy/analysis/control_channel.py +++ b/src/guppy/analysis/control_channel.py @@ -1,5 +1,6 @@ import logging import os +import shutil import numpy as np import pandas as pd diff --git a/src/guppy/analysis/z_score.py b/src/guppy/analysis/z_score.py index 5f64d7f..9472322 100644 --- a/src/guppy/analysis/z_score.py +++ b/src/guppy/analysis/z_score.py @@ -110,7 +110,7 @@ def helper_z_score(control, signal, filepath, name, inputParameters): # helper_ z_score_arr = np.concatenate((z_score_arr, z_score)) # handle the case if there are chunks being cut in the front and the end - if isosbestic_control == False and removeArtifacts == True: + if isosbestic_control == False: coords = coords.flatten() # front chunk idx = np.where((tsNew >= tsNew[0]) & (tsNew < coords[0]))[0] From d8bfcc0d8ba9c1e06b9c484613dd6e4c7fec3d05 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Mon, 15 Dec 2025 16:14:14 -0800 Subject: [PATCH 088/125] Pulled remove_artifact code out of helper_z_score --- src/guppy/analysis/z_score.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/guppy/analysis/z_score.py b/src/guppy/analysis/z_score.py index 9472322..60bb88a 100644 --- a/src/guppy/analysis/z_score.py +++ b/src/guppy/analysis/z_score.py @@ -46,7 +46,14 @@ def compute_z_score(filepath, inputParameters): # control_smooth = ss.filtfilt(b, a, control) # signal_smooth = ss.filtfilt(b, a, signal) # _score, dff = helper_z_score(control_smooth, signal_smooth) - z_score, dff, control_fit = helper_z_score(control, signal, filepath, name, inputParameters) + tsNew = read_hdf5("timeCorrection_" + name, filepath, "timestampNew") + removeArtifacts = inputParameters["removeArtifacts"] + if removeArtifacts == True: + coords = fetchCoords(filepath, name, tsNew) + else: + dt = tsNew[1] - tsNew[0] + coords = np.array([[tsNew[0] - dt, tsNew[-1] + dt]]) + z_score, dff, control_fit = helper_z_score(control, signal, filepath, name, inputParameters, coords) write_hdf5(z_score, "z_score_" + name, filepath, "data") write_hdf5(dff, "dff_" + name, filepath, "data") write_hdf5(control_fit, "cntrl_sig_fit_" + name, filepath, "data") @@ -58,9 +65,10 @@ def compute_z_score(filepath, inputParameters): # helper function to compute z-score and deltaF/F -def helper_z_score(control, signal, filepath, name, inputParameters): # helper_z_score(control_smooth, signal_smooth): +def helper_z_score( + control, signal, filepath, name, inputParameters, coords +): # helper_z_score(control_smooth, signal_smooth): - removeArtifacts = inputParameters["removeArtifacts"] artifactsRemovalMethod = inputParameters["artifactsRemovalMethod"] filter_window = inputParameters["filter_window"] @@ -68,8 +76,6 @@ def helper_z_score(control, signal, filepath, name, inputParameters): # helper_ tsNew = 
read_hdf5("timeCorrection_" + name, filepath, "timestampNew") coords_path = os.path.join(filepath, "coordsForPreProcessing_" + name + ".npy") - logger.info("Remove Artifacts : ", removeArtifacts) - if (control == 0).all() == True: control = np.zeros(tsNew.shape[0]) @@ -78,12 +84,6 @@ def helper_z_score(control, signal, filepath, name, inputParameters): # helper_ control_fit_arr = np.full(tsNew.shape[0], np.nan) temp_control_arr = np.full(tsNew.shape[0], np.nan) - if removeArtifacts == True: - coords = fetchCoords(filepath, name, tsNew) - else: - dt = tsNew[1] - tsNew[0] - coords = np.array([[tsNew[0] - dt, tsNew[-1] + dt]]) - # for artifacts removal, each chunk which was selected by user is being processed individually and then # z-score is calculated for i in range(coords.shape[0]): From b33c522ed317376f771794f166003d98bc815f4c Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Mon, 15 Dec 2025 16:28:20 -0800 Subject: [PATCH 089/125] Pulled remove_artifact code into dedicated fn --- src/guppy/analysis/z_score.py | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/src/guppy/analysis/z_score.py b/src/guppy/analysis/z_score.py index 60bb88a..7537f9d 100644 --- a/src/guppy/analysis/z_score.py +++ b/src/guppy/analysis/z_score.py @@ -25,35 +25,26 @@ def compute_z_score(filepath, inputParameters): path = sorted(path_1 + path_2, key=str.casefold) - b = np.divide(np.ones((100,)), 100) - a = 1 - if len(path) % 2 != 0: logger.error("There are not equal number of Control and Signal data") raise Exception("There are not equal number of Control and Signal data") path = np.asarray(path).reshape(2, -1) + removeArtifacts = inputParameters["removeArtifacts"] for i in range(path.shape[1]): name_1 = ((os.path.basename(path[0, i])).split(".")[0]).split("_") name_2 = ((os.path.basename(path[1, i])).split(".")[0]).split("_") - # dirname = os.path.dirname(path[i]) if name_1[-1] == name_2[-1]: name = name_1[-1] control = read_hdf5("", path[0, i], "data").reshape(-1) signal = read_hdf5("", path[1, i], "data").reshape(-1) - # control_smooth = ss.filtfilt(b, a, control) - # signal_smooth = ss.filtfilt(b, a, signal) - # _score, dff = helper_z_score(control_smooth, signal_smooth) tsNew = read_hdf5("timeCorrection_" + name, filepath, "timestampNew") - removeArtifacts = inputParameters["removeArtifacts"] - if removeArtifacts == True: - coords = fetchCoords(filepath, name, tsNew) - else: - dt = tsNew[1] - tsNew[0] - coords = np.array([[tsNew[0] - dt, tsNew[-1] + dt]]) - z_score, dff, control_fit = helper_z_score(control, signal, filepath, name, inputParameters, coords) + + coords = get_coords(filepath, name, tsNew, removeArtifacts) + z_score, dff, control_fit = helper_z_score(control, signal, tsNew, filepath, name, inputParameters, coords) + write_hdf5(z_score, "z_score_" + name, filepath, "data") write_hdf5(dff, "dff_" + name, filepath, "data") write_hdf5(control_fit, "cntrl_sig_fit_" + name, filepath, "data") @@ -64,17 +55,23 @@ def compute_z_score(filepath, inputParameters): logger.info(f"z-score for the data in {filepath} computed.") +def get_coords(filepath, name, tsNew, removeArtifacts): # TODO: Make less redundant with fetchCoords + if removeArtifacts == True: + coords = fetchCoords(filepath, name, tsNew) + else: + dt = tsNew[1] - tsNew[0] + coords = np.array([[tsNew[0] - dt, tsNew[-1] + dt]]) + return coords + + # helper function to compute z-score and deltaF/F def helper_z_score( - control, signal, filepath, name, inputParameters, coords + control, signal, tsNew, 
filepath, name, inputParameters, coords ): # helper_z_score(control_smooth, signal_smooth): artifactsRemovalMethod = inputParameters["artifactsRemovalMethod"] filter_window = inputParameters["filter_window"] - isosbestic_control = inputParameters["isosbestic_control"] - tsNew = read_hdf5("timeCorrection_" + name, filepath, "timestampNew") - coords_path = os.path.join(filepath, "coordsForPreProcessing_" + name + ".npy") if (control == 0).all() == True: control = np.zeros(tsNew.shape[0]) From e87c80963224de6e298fab3c50514598cf6a0009 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Mon, 15 Dec 2025 16:44:41 -0800 Subject: [PATCH 090/125] Pulled write code out of helper_z_score --- src/guppy/analysis/z_score.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/guppy/analysis/z_score.py b/src/guppy/analysis/z_score.py index 7537f9d..0dd4171 100644 --- a/src/guppy/analysis/z_score.py +++ b/src/guppy/analysis/z_score.py @@ -43,11 +43,15 @@ def compute_z_score(filepath, inputParameters): tsNew = read_hdf5("timeCorrection_" + name, filepath, "timestampNew") coords = get_coords(filepath, name, tsNew, removeArtifacts) - z_score, dff, control_fit = helper_z_score(control, signal, tsNew, filepath, name, inputParameters, coords) + z_score, dff, control_fit, temp_control_arr = helper_z_score( + control, signal, tsNew, filepath, name, inputParameters, coords + ) write_hdf5(z_score, "z_score_" + name, filepath, "data") write_hdf5(dff, "dff_" + name, filepath, "data") write_hdf5(control_fit, "cntrl_sig_fit_" + name, filepath, "data") + if temp_control_arr is not None: + write_hdf5(temp_control_arr, "control_" + name, filepath, "data") else: logger.error("Error in naming convention of files or Error in storesList file") raise Exception("Error in naming convention of files or Error in storesList file") @@ -115,9 +119,10 @@ def helper_z_score( # end chunk idx = np.where((tsNew > coords[-1]) & (tsNew <= tsNew[-1]))[0] temp_control_arr[idx] = np.full(idx.shape[0], np.nan) - write_hdf5(temp_control_arr, "control_" + name, filepath, "data") + else: + temp_control_arr = None - return z_score_arr, norm_data_arr, control_fit_arr + return z_score_arr, norm_data_arr, control_fit_arr, temp_control_arr # function to filter control and signal channel, also execute above two function : controlFit and deltaFF From cf7345888e6c42e330263ca596271348b36d57a7 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Mon, 15 Dec 2025 16:47:26 -0800 Subject: [PATCH 091/125] inverted input handling --- src/guppy/analysis/z_score.py | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/src/guppy/analysis/z_score.py b/src/guppy/analysis/z_score.py index 0dd4171..8fc598b 100644 --- a/src/guppy/analysis/z_score.py +++ b/src/guppy/analysis/z_score.py @@ -35,27 +35,26 @@ def compute_z_score(filepath, inputParameters): for i in range(path.shape[1]): name_1 = ((os.path.basename(path[0, i])).split(".")[0]).split("_") name_2 = ((os.path.basename(path[1, i])).split(".")[0]).split("_") - - if name_1[-1] == name_2[-1]: - name = name_1[-1] - control = read_hdf5("", path[0, i], "data").reshape(-1) - signal = read_hdf5("", path[1, i], "data").reshape(-1) - tsNew = read_hdf5("timeCorrection_" + name, filepath, "timestampNew") - - coords = get_coords(filepath, name, tsNew, removeArtifacts) - z_score, dff, control_fit, temp_control_arr = helper_z_score( - control, signal, tsNew, filepath, name, inputParameters, coords - ) - - write_hdf5(z_score, "z_score_" + name, filepath, 
"data") - write_hdf5(dff, "dff_" + name, filepath, "data") - write_hdf5(control_fit, "cntrl_sig_fit_" + name, filepath, "data") - if temp_control_arr is not None: - write_hdf5(temp_control_arr, "control_" + name, filepath, "data") - else: + if name_1[-1] != name_2[-1]: logger.error("Error in naming convention of files or Error in storesList file") raise Exception("Error in naming convention of files or Error in storesList file") + name = name_1[-1] + control = read_hdf5("", path[0, i], "data").reshape(-1) + signal = read_hdf5("", path[1, i], "data").reshape(-1) + tsNew = read_hdf5("timeCorrection_" + name, filepath, "timestampNew") + + coords = get_coords(filepath, name, tsNew, removeArtifacts) + z_score, dff, control_fit, temp_control_arr = helper_z_score( + control, signal, tsNew, filepath, name, inputParameters, coords + ) + + write_hdf5(z_score, "z_score_" + name, filepath, "data") + write_hdf5(dff, "dff_" + name, filepath, "data") + write_hdf5(control_fit, "cntrl_sig_fit_" + name, filepath, "data") + if temp_control_arr is not None: + write_hdf5(temp_control_arr, "control_" + name, filepath, "data") + logger.info(f"z-score for the data in {filepath} computed.") From 7304fae988fdf569532f6918acabf0b6b902b08e Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Mon, 15 Dec 2025 16:50:34 -0800 Subject: [PATCH 092/125] removed unnecessary parameters --- src/guppy/analysis/z_score.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/guppy/analysis/z_score.py b/src/guppy/analysis/z_score.py index 8fc598b..1afe9e5 100644 --- a/src/guppy/analysis/z_score.py +++ b/src/guppy/analysis/z_score.py @@ -45,9 +45,7 @@ def compute_z_score(filepath, inputParameters): tsNew = read_hdf5("timeCorrection_" + name, filepath, "timestampNew") coords = get_coords(filepath, name, tsNew, removeArtifacts) - z_score, dff, control_fit, temp_control_arr = helper_z_score( - control, signal, tsNew, filepath, name, inputParameters, coords - ) + z_score, dff, control_fit, temp_control_arr = helper_z_score(control, signal, tsNew, inputParameters, coords) write_hdf5(z_score, "z_score_" + name, filepath, "data") write_hdf5(dff, "dff_" + name, filepath, "data") @@ -68,9 +66,7 @@ def get_coords(filepath, name, tsNew, removeArtifacts): # TODO: Make less redun # helper function to compute z-score and deltaF/F -def helper_z_score( - control, signal, tsNew, filepath, name, inputParameters, coords -): # helper_z_score(control_smooth, signal_smooth): +def helper_z_score(control, signal, tsNew, inputParameters, coords): artifactsRemovalMethod = inputParameters["artifactsRemovalMethod"] filter_window = inputParameters["filter_window"] From 965f62b4edc3455c6414eea6432b6325caa69580 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Tue, 16 Dec 2025 11:40:26 -0800 Subject: [PATCH 093/125] purified helper_z_score --- src/guppy/analysis/z_score.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/guppy/analysis/z_score.py b/src/guppy/analysis/z_score.py index 1afe9e5..167863a 100644 --- a/src/guppy/analysis/z_score.py +++ b/src/guppy/analysis/z_score.py @@ -18,6 +18,10 @@ # compute z-score and deltaF/F and save it to hdf5 file def compute_z_score(filepath, inputParameters): + artifactsRemovalMethod = inputParameters["artifactsRemovalMethod"] + filter_window = inputParameters["filter_window"] + isosbestic_control = inputParameters["isosbestic_control"] + logger.debug(f"Computing z-score for each of the data in {filepath}") path_1 = find_files(filepath, "control_*", 
ignore_case=True) # glob.glob(os.path.join(filepath, 'control*')) @@ -45,7 +49,9 @@ def compute_z_score(filepath, inputParameters): tsNew = read_hdf5("timeCorrection_" + name, filepath, "timestampNew") coords = get_coords(filepath, name, tsNew, removeArtifacts) - z_score, dff, control_fit, temp_control_arr = helper_z_score(control, signal, tsNew, inputParameters, coords) + z_score, dff, control_fit, temp_control_arr = helper_z_score( + control, signal, tsNew, inputParameters, coords, artifactsRemovalMethod, filter_window, isosbestic_control + ) write_hdf5(z_score, "z_score_" + name, filepath, "data") write_hdf5(dff, "dff_" + name, filepath, "data") @@ -66,12 +72,9 @@ def get_coords(filepath, name, tsNew, removeArtifacts): # TODO: Make less redun # helper function to compute z-score and deltaF/F -def helper_z_score(control, signal, tsNew, inputParameters, coords): - - artifactsRemovalMethod = inputParameters["artifactsRemovalMethod"] - filter_window = inputParameters["filter_window"] - isosbestic_control = inputParameters["isosbestic_control"] - +def helper_z_score( + control, signal, tsNew, inputParameters, coords, artifactsRemovalMethod, filter_window, isosbestic_control +): if (control == 0).all() == True: control = np.zeros(tsNew.shape[0]) From c49d05f32bf2933abdc02bdeac73ed4ad2043607 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Tue, 16 Dec 2025 11:44:38 -0800 Subject: [PATCH 094/125] purified z_score_computation --- src/guppy/analysis/z_score.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/guppy/analysis/z_score.py b/src/guppy/analysis/z_score.py index 167863a..7dae540 100644 --- a/src/guppy/analysis/z_score.py +++ b/src/guppy/analysis/z_score.py @@ -75,6 +75,8 @@ def get_coords(filepath, name, tsNew, removeArtifacts): # TODO: Make less redun def helper_z_score( control, signal, tsNew, inputParameters, coords, artifactsRemovalMethod, filter_window, isosbestic_control ): + zscore_method = inputParameters["zscore_method"] + baseline_start, baseline_end = inputParameters["baselineWindowStart"], inputParameters["baselineWindowEnd"] if (control == 0).all() == True: control = np.zeros(tsNew.shape[0]) @@ -105,7 +107,7 @@ def helper_z_score( if artifactsRemovalMethod == "concatenate": norm_data_arr = norm_data_arr[~np.isnan(norm_data_arr)] control_fit_arr = control_fit_arr[~np.isnan(control_fit_arr)] - z_score = z_score_computation(norm_data_arr, tsNew, inputParameters) + z_score = z_score_computation(norm_data_arr, tsNew, zscore_method, baseline_start, baseline_end) z_score_arr = np.concatenate((z_score_arr, z_score)) # handle the case if there are chunks being cut in the front and the end @@ -173,11 +175,7 @@ def filterSignal(filter_window, signal): # function to compute z-score based on z-score computation method -def z_score_computation(dff, timestamps, inputParameters): - - zscore_method = inputParameters["zscore_method"] - baseline_start, baseline_end = inputParameters["baselineWindowStart"], inputParameters["baselineWindowEnd"] - +def z_score_computation(dff, timestamps, zscore_method, baseline_start, baseline_end): if zscore_method == "standard z-score": numerator = np.subtract(dff, np.nanmean(dff)) zscore = np.divide(numerator, np.nanstd(dff)) From a88c026aef77be33f3154caaed65b2d595be11d8 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Tue, 16 Dec 2025 11:46:53 -0800 Subject: [PATCH 095/125] purified helper_z_score --- src/guppy/analysis/z_score.py | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git 
a/src/guppy/analysis/z_score.py b/src/guppy/analysis/z_score.py index 7dae540..31645b5 100644 --- a/src/guppy/analysis/z_score.py +++ b/src/guppy/analysis/z_score.py @@ -21,6 +21,8 @@ def compute_z_score(filepath, inputParameters): artifactsRemovalMethod = inputParameters["artifactsRemovalMethod"] filter_window = inputParameters["filter_window"] isosbestic_control = inputParameters["isosbestic_control"] + zscore_method = inputParameters["zscore_method"] + baseline_start, baseline_end = inputParameters["baselineWindowStart"], inputParameters["baselineWindowEnd"] logger.debug(f"Computing z-score for each of the data in {filepath}") @@ -50,7 +52,16 @@ def compute_z_score(filepath, inputParameters): coords = get_coords(filepath, name, tsNew, removeArtifacts) z_score, dff, control_fit, temp_control_arr = helper_z_score( - control, signal, tsNew, inputParameters, coords, artifactsRemovalMethod, filter_window, isosbestic_control + control, + signal, + tsNew, + coords, + artifactsRemovalMethod, + filter_window, + isosbestic_control, + zscore_method, + baseline_start, + baseline_end, ) write_hdf5(z_score, "z_score_" + name, filepath, "data") @@ -73,10 +84,17 @@ def get_coords(filepath, name, tsNew, removeArtifacts): # TODO: Make less redun # helper function to compute z-score and deltaF/F def helper_z_score( - control, signal, tsNew, inputParameters, coords, artifactsRemovalMethod, filter_window, isosbestic_control + control, + signal, + tsNew, + coords, + artifactsRemovalMethod, + filter_window, + isosbestic_control, + zscore_method, + baseline_start, + baseline_end, ): - zscore_method = inputParameters["zscore_method"] - baseline_start, baseline_end = inputParameters["baselineWindowStart"], inputParameters["baselineWindowEnd"] if (control == 0).all() == True: control = np.zeros(tsNew.shape[0]) From bf268f81147a5b471d9506c63a26ab34080074f9 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Tue, 16 Dec 2025 12:12:07 -0800 Subject: [PATCH 096/125] Refactored zscore to use a single high-level compute_zscore function that is pure and moved all the impure code into execute_zscore in preprocess.py. 
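The resulting call shape, as a rough sketch (function names match those introduced or moved in the diff below; the control/signal paths and the unpacked parameters are abbreviated from the surrounding loop in execute_zscore):

    control, signal, tsNew = read_corrected_data(control_path, signal_path, filepath, name)
    coords = get_coords(filepath, name, tsNew, remove_artifacts)
    z_score, dff, control_fit, temp_control_arr = compute_z_score(
        control, signal, tsNew, coords,
        artifactsRemovalMethod, filter_window, isosbestic_control,
        zscore_method, baseline_start, baseline_end,
    )
    write_zscore(filepath, name, z_score, dff, control_fit, temp_control_arr)

Reads and writes stay in preprocess.py; compute_z_score itself only maps arrays to arrays.
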
--- src/guppy/analysis/io_utils.py | 9 ++++ src/guppy/analysis/standard_io.py | 16 +++++++ src/guppy/analysis/z_score.py | 78 +------------------------------ src/guppy/preprocess.py | 44 ++++++++++++++++- 4 files changed, 69 insertions(+), 78 deletions(-) diff --git a/src/guppy/analysis/io_utils.py b/src/guppy/analysis/io_utils.py index c11edba..b467c37 100644 --- a/src/guppy/analysis/io_utils.py +++ b/src/guppy/analysis/io_utils.py @@ -133,6 +133,15 @@ def fetchCoords(filepath, naming, data): return coords +def get_coords(filepath, name, tsNew, removeArtifacts): # TODO: Make less redundant with fetchCoords + if removeArtifacts == True: + coords = fetchCoords(filepath, name, tsNew) + else: + dt = tsNew[1] - tsNew[0] + coords = np.array([[tsNew[0] - dt, tsNew[-1] + dt]]) + return coords + + def get_all_stores_for_combining_data(folderNames): op = [] for i in range(100): diff --git a/src/guppy/analysis/standard_io.py b/src/guppy/analysis/standard_io.py index 2ce8189..b6fcd8a 100644 --- a/src/guppy/analysis/standard_io.py +++ b/src/guppy/analysis/standard_io.py @@ -98,3 +98,19 @@ def write_corrected_ttl_timestamps( for compound_name, corrected_ttl_timestamps in compound_name_to_corrected_ttl_timestamps.items(): write_hdf5(corrected_ttl_timestamps, compound_name, filepath, "ts") logger.info("Timestamps corrections applied to the data and event timestamps.") + + +def read_corrected_data(control_path, signal_path, filepath, name): + control = read_hdf5("", control_path, "data").reshape(-1) + signal = read_hdf5("", signal_path, "data").reshape(-1) + tsNew = read_hdf5("timeCorrection_" + name, filepath, "timestampNew") + + return control, signal, tsNew + + +def write_zscore(filepath, name, z_score, dff, control_fit, temp_control_arr): + write_hdf5(z_score, "z_score_" + name, filepath, "data") + write_hdf5(dff, "dff_" + name, filepath, "data") + write_hdf5(control_fit, "cntrl_sig_fit_" + name, filepath, "data") + if temp_control_arr is not None: + write_hdf5(temp_control_arr, "control_" + name, filepath, "data") diff --git a/src/guppy/analysis/z_score.py b/src/guppy/analysis/z_score.py index 31645b5..34b29ee 100644 --- a/src/guppy/analysis/z_score.py +++ b/src/guppy/analysis/z_score.py @@ -1,89 +1,15 @@ import logging -import os import numpy as np from scipy import signal as ss from .control_channel import helper_create_control_channel -from .io_utils import ( - fetchCoords, - find_files, - read_hdf5, - write_hdf5, -) logger = logging.getLogger(__name__) -# compute z-score and deltaF/F and save it to hdf5 file -def compute_z_score(filepath, inputParameters): - - artifactsRemovalMethod = inputParameters["artifactsRemovalMethod"] - filter_window = inputParameters["filter_window"] - isosbestic_control = inputParameters["isosbestic_control"] - zscore_method = inputParameters["zscore_method"] - baseline_start, baseline_end = inputParameters["baselineWindowStart"], inputParameters["baselineWindowEnd"] - - logger.debug(f"Computing z-score for each of the data in {filepath}") - - path_1 = find_files(filepath, "control_*", ignore_case=True) # glob.glob(os.path.join(filepath, 'control*')) - path_2 = find_files(filepath, "signal_*", ignore_case=True) # glob.glob(os.path.join(filepath, 'signal*')) - - path = sorted(path_1 + path_2, key=str.casefold) - - if len(path) % 2 != 0: - logger.error("There are not equal number of Control and Signal data") - raise Exception("There are not equal number of Control and Signal data") - - path = np.asarray(path).reshape(2, -1) - removeArtifacts = 
inputParameters["removeArtifacts"] - - for i in range(path.shape[1]): - name_1 = ((os.path.basename(path[0, i])).split(".")[0]).split("_") - name_2 = ((os.path.basename(path[1, i])).split(".")[0]).split("_") - if name_1[-1] != name_2[-1]: - logger.error("Error in naming convention of files or Error in storesList file") - raise Exception("Error in naming convention of files or Error in storesList file") - - name = name_1[-1] - control = read_hdf5("", path[0, i], "data").reshape(-1) - signal = read_hdf5("", path[1, i], "data").reshape(-1) - tsNew = read_hdf5("timeCorrection_" + name, filepath, "timestampNew") - - coords = get_coords(filepath, name, tsNew, removeArtifacts) - z_score, dff, control_fit, temp_control_arr = helper_z_score( - control, - signal, - tsNew, - coords, - artifactsRemovalMethod, - filter_window, - isosbestic_control, - zscore_method, - baseline_start, - baseline_end, - ) - - write_hdf5(z_score, "z_score_" + name, filepath, "data") - write_hdf5(dff, "dff_" + name, filepath, "data") - write_hdf5(control_fit, "cntrl_sig_fit_" + name, filepath, "data") - if temp_control_arr is not None: - write_hdf5(temp_control_arr, "control_" + name, filepath, "data") - - logger.info(f"z-score for the data in {filepath} computed.") - - -def get_coords(filepath, name, tsNew, removeArtifacts): # TODO: Make less redundant with fetchCoords - if removeArtifacts == True: - coords = fetchCoords(filepath, name, tsNew) - else: - dt = tsNew[1] - tsNew[0] - coords = np.array([[tsNew[0] - dt, tsNew[-1] + dt]]) - return coords - - -# helper function to compute z-score and deltaF/F -def helper_z_score( +# high-level function to compute z-score and deltaF/F +def compute_z_score( control, signal, tsNew, diff --git a/src/guppy/preprocess.py b/src/guppy/preprocess.py index ad4507e..5829a2d 100755 --- a/src/guppy/preprocess.py +++ b/src/guppy/preprocess.py @@ -18,15 +18,18 @@ check_TDT, find_files, get_all_stores_for_combining_data, # noqa: F401 -- Necessary for other modules that depend on preprocess.py + get_coords, read_hdf5, takeOnlyDirs, ) from .analysis.standard_io import ( read_control_and_signal, + read_corrected_data, read_ttl, write_corrected_data, write_corrected_timestamps, write_corrected_ttl_timestamps, + write_zscore, ) from .analysis.timestamp_correction import correct_timestamps from .analysis.z_score import compute_z_score @@ -276,6 +279,11 @@ def execute_zscore(folderNames, inputParameters): plot_zScore_dff = inputParameters["plot_zScore_dff"] combine_data = inputParameters["combine_data"] remove_artifacts = inputParameters["removeArtifacts"] + artifactsRemovalMethod = inputParameters["artifactsRemovalMethod"] + filter_window = inputParameters["filter_window"] + isosbestic_control = inputParameters["isosbestic_control"] + zscore_method = inputParameters["zscore_method"] + baseline_start, baseline_end = inputParameters["baselineWindowStart"], inputParameters["baselineWindowEnd"] storesListPath = [] for i in range(len(folderNames)): @@ -284,13 +292,45 @@ def execute_zscore(folderNames, inputParameters): else: filepath = folderNames[i] storesListPath.append(takeOnlyDirs(glob.glob(os.path.join(filepath, "*_output_*")))) - storesListPath = np.concatenate(storesListPath) for j in range(len(storesListPath)): filepath = storesListPath[j] + logger.debug(f"Computing z-score for each of the data in {filepath}") + path_1 = find_files(filepath, "control_*", ignore_case=True) # glob.glob(os.path.join(filepath, 'control*')) + path_2 = find_files(filepath, "signal_*", ignore_case=True) # 
glob.glob(os.path.join(filepath, 'signal*')) + path = sorted(path_1 + path_2, key=str.casefold) + if len(path) % 2 != 0: + logger.error("There are not equal number of Control and Signal data") + raise Exception("There are not equal number of Control and Signal data") + path = np.asarray(path).reshape(2, -1) + + for i in range(path.shape[1]): + name_1 = ((os.path.basename(path[0, i])).split(".")[0]).split("_") + name_2 = ((os.path.basename(path[1, i])).split(".")[0]).split("_") + if name_1[-1] != name_2[-1]: + logger.error("Error in naming convention of files or Error in storesList file") + raise Exception("Error in naming convention of files or Error in storesList file") + name = name_1[-1] + + control, signal, tsNew = read_corrected_data(path[0, i], path[1, i], filepath, name) + coords = get_coords(filepath, name, tsNew, remove_artifacts) + z_score, dff, control_fit, temp_control_arr = compute_z_score( + control, + signal, + tsNew, + coords, + artifactsRemovalMethod, + filter_window, + isosbestic_control, + zscore_method, + baseline_start, + baseline_end, + ) + write_zscore(filepath, name, z_score, dff, control_fit, temp_control_arr) + + logger.info(f"z-score for the data in {filepath} computed.") - compute_z_score(filepath, inputParameters) if not remove_artifacts: visualizeControlAndSignal(filepath, removeArtifacts=remove_artifacts) From 4d49fd973f34b31af1b24bd66086e056004ea076 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Tue, 16 Dec 2025 13:47:15 -0800 Subject: [PATCH 097/125] Refactored read-out of addingNaNValues --- src/guppy/analysis/artifact_removal.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/guppy/analysis/artifact_removal.py b/src/guppy/analysis/artifact_removal.py index ac483bb..0106ec6 100644 --- a/src/guppy/analysis/artifact_removal.py +++ b/src/guppy/analysis/artifact_removal.py @@ -27,12 +27,15 @@ def addingNaNtoChunksWithArtifacts(filepath, events): if name_1[-1] == name_2[-1]: name = name_1[-1] sampling_rate = read_hdf5("timeCorrection_" + name, filepath, "sampling_rate")[0] + ts = read_hdf5("timeCorrection_" + name, filepath, "timestampNew") + coords = fetchCoords(filepath, name, ts) for i in range(len(storesList)): if ( "control_" + name.lower() in storesList[i].lower() or "signal_" + name.lower() in storesList[i].lower() ): # changes done - data = addingNaNValues(filepath, storesList[i], name) + data = read_hdf5(storesList[i], filepath, "data").reshape(-1) + data = addingNaNValues(data=data, ts=ts, coords=coords) write_hdf5(data, storesList[i], filepath, "data") else: if "control" in storesList[i].lower() or "signal" in storesList[i].lower(): @@ -151,11 +154,7 @@ def eliminateTs(filepath, timeForLightsTurnOn, event, sampling_rate, naming): # adding nan values to removed chunks # when using artifacts removal method - replace with NaN -def addingNaNValues(filepath, event, naming): - - ts = read_hdf5("timeCorrection_" + naming, filepath, "timestampNew") - data = read_hdf5(event, filepath, "data").reshape(-1) - coords = fetchCoords(filepath, naming, ts) +def addingNaNValues(*, data, ts, coords): if (data == 0).all() == True: data = np.zeros(ts.shape[0]) From a80f080e3f0b9fc69c5b3b83f42020ab599f82f9 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Tue, 16 Dec 2025 13:54:44 -0800 Subject: [PATCH 098/125] Refactored read out of removeTTLs --- src/guppy/analysis/artifact_removal.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/guppy/analysis/artifact_removal.py 
b/src/guppy/analysis/artifact_removal.py index 0106ec6..599372e 100644 --- a/src/guppy/analysis/artifact_removal.py +++ b/src/guppy/analysis/artifact_removal.py @@ -41,7 +41,8 @@ def addingNaNtoChunksWithArtifacts(filepath, events): if "control" in storesList[i].lower() or "signal" in storesList[i].lower(): continue else: - ts = removeTTLs(filepath, storesList[i], name) + ts = read_hdf5(storesList[i] + "_" + name, filepath, "ts").reshape(-1) + ts = removeTTLs(ts=ts, coords=coords) write_hdf5(ts, storesList[i] + "_" + name, filepath, "ts") else: @@ -174,11 +175,7 @@ def addingNaNValues(*, data, ts, coords): # remove event TTLs which falls in the removed chunks # when using artifacts removal method - replace with NaN -def removeTTLs(filepath, event, naming): - tsNew = read_hdf5("timeCorrection_" + naming, filepath, "timestampNew") - ts = read_hdf5(event + "_" + naming, filepath, "ts").reshape(-1) - coords = fetchCoords(filepath, naming, tsNew) - +def removeTTLs(*, ts, coords): ts_arr = np.array([]) for i in range(coords.shape[0]): ts_index = np.where((ts > coords[i, 0]) & (ts < coords[i, 1]))[0] From 1b2066d5db7d2ad364c21e7f968a02fffd490f73 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Tue, 16 Dec 2025 14:08:48 -0800 Subject: [PATCH 099/125] Refactored read out of eliminateData and eliminateTs --- src/guppy/analysis/artifact_removal.py | 32 ++++++++++++++++---------- 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/src/guppy/analysis/artifact_removal.py b/src/guppy/analysis/artifact_removal.py index 599372e..f7e95a3 100644 --- a/src/guppy/analysis/artifact_removal.py +++ b/src/guppy/analysis/artifact_removal.py @@ -73,15 +73,31 @@ def processTimestampsForArtifacts(filepath, timeForLightsTurnOn, events): "control_" + name.lower() in storesList[i].lower() or "signal_" + name.lower() in storesList[i].lower() ): # changes done + ts = read_hdf5("timeCorrection_" + name, filepath, "timestampNew") + data = read_hdf5(storesList[i], filepath, "data").reshape(-1) + coords = fetchCoords(filepath, name, ts) data, timestampNew = eliminateData( - filepath, timeForLightsTurnOn, storesList[i], sampling_rate, name + data=data, + ts=ts, + coords=coords, + timeForLightsTurnOn=timeForLightsTurnOn, + sampling_rate=sampling_rate, ) write_hdf5(data, storesList[i], filepath, "data") else: if "control" in storesList[i].lower() or "signal" in storesList[i].lower(): continue else: - ts = eliminateTs(filepath, timeForLightsTurnOn, storesList[i], sampling_rate, name) + tsNew = read_hdf5("timeCorrection_" + name, filepath, "timestampNew") + ts = read_hdf5(storesList[i] + "_" + name, filepath, "ts").reshape(-1) + coords = fetchCoords(filepath, name, tsNew) + ts = eliminateTs( + ts=ts, + tsNew=tsNew, + coords=coords, + timeForLightsTurnOn=timeForLightsTurnOn, + sampling_rate=sampling_rate, + ) write_hdf5(ts, storesList[i] + "_" + name, filepath, "ts") # timestamp_dict[name] = timestampNew @@ -93,11 +109,7 @@ def processTimestampsForArtifacts(filepath, timeForLightsTurnOn, events): # helper function to process control and signal timestamps -def eliminateData(filepath, timeForLightsTurnOn, event, sampling_rate, naming): - - ts = read_hdf5("timeCorrection_" + naming, filepath, "timestampNew") - data = read_hdf5(event, filepath, "data").reshape(-1) - coords = fetchCoords(filepath, naming, ts) +def eliminateData(*, data, ts, coords, timeForLightsTurnOn, sampling_rate): if (data == 0).all() == True: data = np.zeros(ts.shape[0]) @@ -126,11 +138,7 @@ def eliminateData(filepath, timeForLightsTurnOn, event, 
sampling_rate, naming): # helper function to align event timestamps with the control and signal timestamps -def eliminateTs(filepath, timeForLightsTurnOn, event, sampling_rate, naming): - - tsNew = read_hdf5("timeCorrection_" + naming, filepath, "timestampNew") - ts = read_hdf5(event + "_" + naming, filepath, "ts").reshape(-1) - coords = fetchCoords(filepath, naming, tsNew) +def eliminateTs(*, ts, tsNew, coords, timeForLightsTurnOn, sampling_rate): ts_arr = np.array([]) tsNew_arr = np.array([]) From 7275b50342300cbb73146824d7e317663506b089 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Tue, 16 Dec 2025 14:31:34 -0800 Subject: [PATCH 100/125] cleaned up addingNaNtoChunksWithArtifacts --- src/guppy/analysis/artifact_removal.py | 47 ++++++++++++-------------- 1 file changed, 22 insertions(+), 25 deletions(-) diff --git a/src/guppy/analysis/artifact_removal.py b/src/guppy/analysis/artifact_removal.py index f7e95a3..a17eb0e 100644 --- a/src/guppy/analysis/artifact_removal.py +++ b/src/guppy/analysis/artifact_removal.py @@ -13,41 +13,38 @@ logger = logging.getLogger(__name__) -def addingNaNtoChunksWithArtifacts(filepath, events): +def addingNaNtoChunksWithArtifacts(filepath, storesList): logger.debug("Replacing chunks with artifacts by NaN values.") - storesList = events[1, :] + names_for_storenames = storesList[1, :] path = decide_naming_convention(filepath) for j in range(path.shape[1]): name_1 = ((os.path.basename(path[0, j])).split(".")[0]).split("_") name_2 = ((os.path.basename(path[1, j])).split(".")[0]).split("_") - # dirname = os.path.dirname(path[i]) - if name_1[-1] == name_2[-1]: - name = name_1[-1] - sampling_rate = read_hdf5("timeCorrection_" + name, filepath, "sampling_rate")[0] - ts = read_hdf5("timeCorrection_" + name, filepath, "timestampNew") - coords = fetchCoords(filepath, name, ts) - for i in range(len(storesList)): - if ( - "control_" + name.lower() in storesList[i].lower() - or "signal_" + name.lower() in storesList[i].lower() - ): # changes done - data = read_hdf5(storesList[i], filepath, "data").reshape(-1) - data = addingNaNValues(data=data, ts=ts, coords=coords) - write_hdf5(data, storesList[i], filepath, "data") - else: - if "control" in storesList[i].lower() or "signal" in storesList[i].lower(): - continue - else: - ts = read_hdf5(storesList[i] + "_" + name, filepath, "ts").reshape(-1) - ts = removeTTLs(ts=ts, coords=coords) - write_hdf5(ts, storesList[i] + "_" + name, filepath, "ts") - - else: + if name_1[-1] != name_2[-1]: logger.error("Error in naming convention of files or Error in storesList file") raise Exception("Error in naming convention of files or Error in storesList file") + name = name_1[-1] + + sampling_rate = read_hdf5("timeCorrection_" + name, filepath, "sampling_rate")[0] + ts = read_hdf5("timeCorrection_" + name, filepath, "timestampNew") + coords = fetchCoords(filepath, name, ts) + for i in range(len(names_for_storenames)): + if ( + "control_" + name.lower() in names_for_storenames[i].lower() + or "signal_" + name.lower() in names_for_storenames[i].lower() + ): # changes done + data = read_hdf5(names_for_storenames[i], filepath, "data").reshape(-1) + data = addingNaNValues(data=data, ts=ts, coords=coords) + write_hdf5(data, names_for_storenames[i], filepath, "data") + else: + if "control" in names_for_storenames[i].lower() or "signal" in names_for_storenames[i].lower(): + continue + ts = read_hdf5(names_for_storenames[i] + "_" + name, filepath, "ts").reshape(-1) + ts = removeTTLs(ts=ts, coords=coords) + write_hdf5(ts, names_for_storenames[i] + 
"_" + name, filepath, "ts") logger.info("Chunks with artifacts are replaced by NaN values.") From 07dcfa80ede5a9b5ba15ab7b27da5319aa2ec709 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Tue, 16 Dec 2025 15:08:14 -0800 Subject: [PATCH 101/125] moved read to the top of addingNaNtoChunksWithArtifacts --- src/guppy/analysis/artifact_removal.py | 32 +++++++++----- src/guppy/analysis/standard_io.py | 59 ++++++++++++++++++++++++++ 2 files changed, 81 insertions(+), 10 deletions(-) diff --git a/src/guppy/analysis/artifact_removal.py b/src/guppy/analysis/artifact_removal.py index a17eb0e..3af3001 100644 --- a/src/guppy/analysis/artifact_removal.py +++ b/src/guppy/analysis/artifact_removal.py @@ -9,11 +9,21 @@ read_hdf5, write_hdf5, ) +from .standard_io import ( + read_control_and_signal, + read_coords_pairwise, + read_corrected_timestamps_pairwise, + read_corrected_ttl_timestamps, +) logger = logging.getLogger(__name__) def addingNaNtoChunksWithArtifacts(filepath, storesList): + name_to_data, _, _, _ = read_control_and_signal(filepath, storesList) + pair_name_to_tsNew = read_corrected_timestamps_pairwise(filepath) + pair_name_to_coords = read_coords_pairwise(filepath, pair_name_to_tsNew) + compound_name_to_ttl_timestamps = read_corrected_ttl_timestamps(filepath, storesList, pair_name_to_tsNew) logger.debug("Replacing chunks with artifacts by NaN values.") names_for_storenames = storesList[1, :] @@ -26,25 +36,27 @@ def addingNaNtoChunksWithArtifacts(filepath, storesList): if name_1[-1] != name_2[-1]: logger.error("Error in naming convention of files or Error in storesList file") raise Exception("Error in naming convention of files or Error in storesList file") - name = name_1[-1] + pair_name = name_1[-1] - sampling_rate = read_hdf5("timeCorrection_" + name, filepath, "sampling_rate")[0] - ts = read_hdf5("timeCorrection_" + name, filepath, "timestampNew") - coords = fetchCoords(filepath, name, ts) + tsNew = pair_name_to_tsNew[pair_name] + coords = pair_name_to_coords[pair_name] for i in range(len(names_for_storenames)): if ( - "control_" + name.lower() in names_for_storenames[i].lower() - or "signal_" + name.lower() in names_for_storenames[i].lower() + "control_" + pair_name.lower() in names_for_storenames[i].lower() + or "signal_" + pair_name.lower() in names_for_storenames[i].lower() ): # changes done - data = read_hdf5(names_for_storenames[i], filepath, "data").reshape(-1) - data = addingNaNValues(data=data, ts=ts, coords=coords) + # data = read_hdf5(names_for_storenames[i], filepath, "data").reshape(-1) + data = name_to_data[names_for_storenames[i]].reshape(-1) + data = addingNaNValues(data=data, ts=tsNew, coords=coords) write_hdf5(data, names_for_storenames[i], filepath, "data") else: if "control" in names_for_storenames[i].lower() or "signal" in names_for_storenames[i].lower(): continue - ts = read_hdf5(names_for_storenames[i] + "_" + name, filepath, "ts").reshape(-1) + ttl_name = names_for_storenames[i] + compound_name = ttl_name + "_" + pair_name + ts = compound_name_to_ttl_timestamps[compound_name].reshape(-1) ts = removeTTLs(ts=ts, coords=coords) - write_hdf5(ts, names_for_storenames[i] + "_" + name, filepath, "ts") + write_hdf5(ts, names_for_storenames[i] + "_" + pair_name, filepath, "ts") logger.info("Chunks with artifacts are replaced by NaN values.") diff --git a/src/guppy/analysis/standard_io.py b/src/guppy/analysis/standard_io.py index b6fcd8a..9c2b7c5 100644 --- a/src/guppy/analysis/standard_io.py +++ b/src/guppy/analysis/standard_io.py @@ -1,8 +1,11 @@ import logging +import os 
import numpy as np from .io_utils import ( + decide_naming_convention, + fetchCoords, get_control_and_signal_channel_names, read_hdf5, write_hdf5, @@ -114,3 +117,59 @@ def write_zscore(filepath, name, z_score, dff, control_fit, temp_control_arr): write_hdf5(control_fit, "cntrl_sig_fit_" + name, filepath, "data") if temp_control_arr is not None: write_hdf5(temp_control_arr, "control_" + name, filepath, "data") + + +def read_corrected_timestamps_pairwise(filepath): + pair_name_to_tsNew = {} + path = decide_naming_convention(filepath) + for j in range(path.shape[1]): + name_1 = ((os.path.basename(path[0, j])).split(".")[0]).split("_") + name_2 = ((os.path.basename(path[1, j])).split(".")[0]).split("_") + if name_1[-1] != name_2[-1]: + logger.error("Error in naming convention of files or Error in storesList file") + raise Exception("Error in naming convention of files or Error in storesList file") + name = name_1[-1] + + tsNew = read_hdf5("timeCorrection_" + name, filepath, "timestampNew") + pair_name_to_tsNew[name] = tsNew + return pair_name_to_tsNew + + +def read_coords_pairwise(filepath, pair_name_to_tsNew): + pair_name_to_coords = {} + path = decide_naming_convention(filepath) + for j in range(path.shape[1]): + name_1 = ((os.path.basename(path[0, j])).split(".")[0]).split("_") + name_2 = ((os.path.basename(path[1, j])).split(".")[0]).split("_") + if name_1[-1] != name_2[-1]: + logger.error("Error in naming convention of files or Error in storesList file") + raise Exception("Error in naming convention of files or Error in storesList file") + pair_name = name_1[-1] + + tsNew = pair_name_to_tsNew[pair_name] + coords = fetchCoords(filepath, pair_name, tsNew) + pair_name_to_coords[pair_name] = coords + return pair_name_to_coords + + +def read_corrected_ttl_timestamps(filepath, storesList): + compound_name_to_ttl_timestamps = {} + storenames = storesList[0, :] + names_for_storenames = storesList[1, :] + arr = get_control_and_signal_channel_names(storesList) + + for storename, name in zip(storenames, names_for_storenames): + if name in arr: + continue + ttl_name = name + for i in range(arr.shape[1]): + name_1 = arr[0, i].split("_")[-1] + name_2 = arr[1, i].split("_")[-1] + if name_1 != name_2: + logger.error("Error in naming convention of files or Error in storesList file") + raise Exception("Error in naming convention of files or Error in storesList file") + compound_name = ttl_name + "_" + name_1 + ts = read_hdf5(compound_name, filepath, "ts") + compound_name_to_ttl_timestamps[compound_name] = ts + + return compound_name_to_ttl_timestamps From 8e037759ed2ac405ff6e615ec7ca572156b8723c Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Tue, 16 Dec 2025 15:18:43 -0800 Subject: [PATCH 102/125] moved read out of addingNaNtoChunksWithArtifacts --- src/guppy/analysis/artifact_removal.py | 16 +++------------- src/guppy/preprocess.py | 16 +++++++++++++++- 2 files changed, 18 insertions(+), 14 deletions(-) diff --git a/src/guppy/analysis/artifact_removal.py b/src/guppy/analysis/artifact_removal.py index 3af3001..97e24f3 100644 --- a/src/guppy/analysis/artifact_removal.py +++ b/src/guppy/analysis/artifact_removal.py @@ -9,22 +9,13 @@ read_hdf5, write_hdf5, ) -from .standard_io import ( - read_control_and_signal, - read_coords_pairwise, - read_corrected_timestamps_pairwise, - read_corrected_ttl_timestamps, -) logger = logging.getLogger(__name__) -def addingNaNtoChunksWithArtifacts(filepath, storesList): - name_to_data, _, _, _ = read_control_and_signal(filepath, storesList) - pair_name_to_tsNew = 
read_corrected_timestamps_pairwise(filepath) - pair_name_to_coords = read_coords_pairwise(filepath, pair_name_to_tsNew) - compound_name_to_ttl_timestamps = read_corrected_ttl_timestamps(filepath, storesList, pair_name_to_tsNew) - +def addingNaNtoChunksWithArtifacts( + filepath, storesList, pair_name_to_tsNew, pair_name_to_coords, name_to_data, compound_name_to_ttl_timestamps +): logger.debug("Replacing chunks with artifacts by NaN values.") names_for_storenames = storesList[1, :] @@ -45,7 +36,6 @@ def addingNaNtoChunksWithArtifacts(filepath, storesList): "control_" + pair_name.lower() in names_for_storenames[i].lower() or "signal_" + pair_name.lower() in names_for_storenames[i].lower() ): # changes done - # data = read_hdf5(names_for_storenames[i], filepath, "data").reshape(-1) data = name_to_data[names_for_storenames[i]].reshape(-1) data = addingNaNValues(data=data, ts=tsNew, coords=coords) write_hdf5(data, names_for_storenames[i], filepath, "data") diff --git a/src/guppy/preprocess.py b/src/guppy/preprocess.py index 5829a2d..184d9fa 100755 --- a/src/guppy/preprocess.py +++ b/src/guppy/preprocess.py @@ -24,7 +24,10 @@ ) from .analysis.standard_io import ( read_control_and_signal, + read_coords_pairwise, read_corrected_data, + read_corrected_timestamps_pairwise, + read_corrected_ttl_timestamps, read_ttl, write_corrected_data, write_corrected_timestamps, @@ -374,7 +377,18 @@ def execute_artifact_removal(folderNames, inputParameters): if artifactsRemovalMethod == "concatenate": processTimestampsForArtifacts(filepath, timeForLightsTurnOn, storesList) else: - addingNaNtoChunksWithArtifacts(filepath, storesList) + name_to_data, _, _, _ = read_control_and_signal(filepath, storesList) + pair_name_to_tsNew = read_corrected_timestamps_pairwise(filepath) + pair_name_to_coords = read_coords_pairwise(filepath, pair_name_to_tsNew) + compound_name_to_ttl_timestamps = read_corrected_ttl_timestamps(filepath, storesList) + addingNaNtoChunksWithArtifacts( + filepath, + storesList, + pair_name_to_tsNew, + pair_name_to_coords, + name_to_data, + compound_name_to_ttl_timestamps, + ) visualizeControlAndSignal(filepath, removeArtifacts=True) writeToFile(str(10 + ((inputParameters["step"] + 1) * 10)) + "\n") From a87c507144e8ebe5968a22d68718f716dee44d67 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Tue, 16 Dec 2025 17:15:18 -0800 Subject: [PATCH 103/125] fixed data read bug --- src/guppy/analysis/standard_io.py | 15 +++++++++++++++ src/guppy/preprocess.py | 3 ++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/src/guppy/analysis/standard_io.py b/src/guppy/analysis/standard_io.py index 9c2b7c5..f8d291b 100644 --- a/src/guppy/analysis/standard_io.py +++ b/src/guppy/analysis/standard_io.py @@ -152,6 +152,21 @@ def read_coords_pairwise(filepath, pair_name_to_tsNew): return pair_name_to_coords +def read_corrected_data_dict(filepath, storesList): # TODO: coordinate with read_corrected_data + name_to_corrected_data = {} + storenames = storesList[0, :] + names_for_storenames = storesList[1, :] + control_and_signal_names = get_control_and_signal_channel_names(storesList) + + for storename, name in zip(storenames, names_for_storenames): + if name not in control_and_signal_names: + continue + data = read_hdf5(name, filepath, "data").reshape(-1) + name_to_corrected_data[name] = data + + return name_to_corrected_data + + def read_corrected_ttl_timestamps(filepath, storesList): compound_name_to_ttl_timestamps = {} storenames = storesList[0, :] diff --git a/src/guppy/preprocess.py b/src/guppy/preprocess.py 
index 184d9fa..0c0e176 100755 --- a/src/guppy/preprocess.py +++ b/src/guppy/preprocess.py @@ -26,6 +26,7 @@ read_control_and_signal, read_coords_pairwise, read_corrected_data, + read_corrected_data_dict, read_corrected_timestamps_pairwise, read_corrected_ttl_timestamps, read_ttl, @@ -377,7 +378,7 @@ def execute_artifact_removal(folderNames, inputParameters): if artifactsRemovalMethod == "concatenate": processTimestampsForArtifacts(filepath, timeForLightsTurnOn, storesList) else: - name_to_data, _, _, _ = read_control_and_signal(filepath, storesList) + name_to_data = read_corrected_data_dict(filepath, storesList) pair_name_to_tsNew = read_corrected_timestamps_pairwise(filepath) pair_name_to_coords = read_coords_pairwise(filepath, pair_name_to_tsNew) compound_name_to_ttl_timestamps = read_corrected_ttl_timestamps(filepath, storesList) From b1cbc836971c2faee6c1b633a7ee3d7122e398c2 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Tue, 16 Dec 2025 17:29:08 -0800 Subject: [PATCH 104/125] Refactored write operations out of addingNaNtoChunksWithArtifacts --- src/guppy/analysis/artifact_removal.py | 8 ++++++-- src/guppy/analysis/standard_io.py | 13 +++++++++++++ src/guppy/preprocess.py | 7 ++++++- 3 files changed, 25 insertions(+), 3 deletions(-) diff --git a/src/guppy/analysis/artifact_removal.py b/src/guppy/analysis/artifact_removal.py index 97e24f3..db40e64 100644 --- a/src/guppy/analysis/artifact_removal.py +++ b/src/guppy/analysis/artifact_removal.py @@ -21,6 +21,8 @@ def addingNaNtoChunksWithArtifacts( path = decide_naming_convention(filepath) + name_to_corrected_data = {} + compound_name_to_corrected_ttl_timestamps = {} for j in range(path.shape[1]): name_1 = ((os.path.basename(path[0, j])).split(".")[0]).split("_") name_2 = ((os.path.basename(path[1, j])).split(".")[0]).split("_") @@ -38,7 +40,7 @@ def addingNaNtoChunksWithArtifacts( ): # changes done data = name_to_data[names_for_storenames[i]].reshape(-1) data = addingNaNValues(data=data, ts=tsNew, coords=coords) - write_hdf5(data, names_for_storenames[i], filepath, "data") + name_to_corrected_data[names_for_storenames[i]] = data else: if "control" in names_for_storenames[i].lower() or "signal" in names_for_storenames[i].lower(): continue @@ -46,9 +48,11 @@ def addingNaNtoChunksWithArtifacts( compound_name = ttl_name + "_" + pair_name ts = compound_name_to_ttl_timestamps[compound_name].reshape(-1) ts = removeTTLs(ts=ts, coords=coords) - write_hdf5(ts, names_for_storenames[i] + "_" + pair_name, filepath, "ts") + compound_name_to_corrected_ttl_timestamps[compound_name] = ts logger.info("Chunks with artifacts are replaced by NaN values.") + return name_to_corrected_data, compound_name_to_corrected_ttl_timestamps + # main function to align timestamps for control, signal and event timestamps for artifacts removal def processTimestampsForArtifacts(filepath, timeForLightsTurnOn, events): diff --git a/src/guppy/analysis/standard_io.py b/src/guppy/analysis/standard_io.py index f8d291b..ad7408e 100644 --- a/src/guppy/analysis/standard_io.py +++ b/src/guppy/analysis/standard_io.py @@ -188,3 +188,16 @@ def read_corrected_ttl_timestamps(filepath, storesList): compound_name_to_ttl_timestamps[compound_name] = ts return compound_name_to_ttl_timestamps + + +def write_nan_corrected_data(filepath, name_to_corrected_data): + for name, data in name_to_corrected_data.items(): + write_hdf5(data, name, filepath, "data") + + +def write_nan_corrected_ttl_timestamps( + filepath, + compound_name_to_corrected_ttl_timestamps, +): + for compound_name, 
corrected_ttl_timestamps in compound_name_to_corrected_ttl_timestamps.items(): + write_hdf5(corrected_ttl_timestamps, compound_name, filepath, "ts") diff --git a/src/guppy/preprocess.py b/src/guppy/preprocess.py index 0c0e176..a625bc9 100755 --- a/src/guppy/preprocess.py +++ b/src/guppy/preprocess.py @@ -33,6 +33,8 @@ write_corrected_data, write_corrected_timestamps, write_corrected_ttl_timestamps, + write_nan_corrected_data, + write_nan_corrected_ttl_timestamps, write_zscore, ) from .analysis.timestamp_correction import correct_timestamps @@ -382,7 +384,7 @@ def execute_artifact_removal(folderNames, inputParameters): pair_name_to_tsNew = read_corrected_timestamps_pairwise(filepath) pair_name_to_coords = read_coords_pairwise(filepath, pair_name_to_tsNew) compound_name_to_ttl_timestamps = read_corrected_ttl_timestamps(filepath, storesList) - addingNaNtoChunksWithArtifacts( + name_to_data, compound_name_to_ttl_timestamps = addingNaNtoChunksWithArtifacts( filepath, storesList, pair_name_to_tsNew, @@ -390,6 +392,9 @@ def execute_artifact_removal(folderNames, inputParameters): name_to_data, compound_name_to_ttl_timestamps, ) + write_nan_corrected_data(filepath, name_to_data) + write_nan_corrected_ttl_timestamps(filepath, compound_name_to_ttl_timestamps) + visualizeControlAndSignal(filepath, removeArtifacts=True) writeToFile(str(10 + ((inputParameters["step"] + 1) * 10)) + "\n") From 393d3aa79fbbccb9335d73612d2747ef131d1421 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 17 Dec 2025 08:15:06 -0800 Subject: [PATCH 105/125] Refactored filepath out of addingNaNtoChunksWithArtifacts --- src/guppy/analysis/artifact_removal.py | 14 +++----------- src/guppy/preprocess.py | 1 - 2 files changed, 3 insertions(+), 12 deletions(-) diff --git a/src/guppy/analysis/artifact_removal.py b/src/guppy/analysis/artifact_removal.py index db40e64..556a719 100644 --- a/src/guppy/analysis/artifact_removal.py +++ b/src/guppy/analysis/artifact_removal.py @@ -14,23 +14,15 @@ def addingNaNtoChunksWithArtifacts( - filepath, storesList, pair_name_to_tsNew, pair_name_to_coords, name_to_data, compound_name_to_ttl_timestamps + storesList, pair_name_to_tsNew, pair_name_to_coords, name_to_data, compound_name_to_ttl_timestamps ): logger.debug("Replacing chunks with artifacts by NaN values.") names_for_storenames = storesList[1, :] - - path = decide_naming_convention(filepath) + pair_names = pair_name_to_tsNew.keys() name_to_corrected_data = {} compound_name_to_corrected_ttl_timestamps = {} - for j in range(path.shape[1]): - name_1 = ((os.path.basename(path[0, j])).split(".")[0]).split("_") - name_2 = ((os.path.basename(path[1, j])).split(".")[0]).split("_") - if name_1[-1] != name_2[-1]: - logger.error("Error in naming convention of files or Error in storesList file") - raise Exception("Error in naming convention of files or Error in storesList file") - pair_name = name_1[-1] - + for pair_name in pair_names: tsNew = pair_name_to_tsNew[pair_name] coords = pair_name_to_coords[pair_name] for i in range(len(names_for_storenames)): diff --git a/src/guppy/preprocess.py b/src/guppy/preprocess.py index a625bc9..8555b55 100755 --- a/src/guppy/preprocess.py +++ b/src/guppy/preprocess.py @@ -385,7 +385,6 @@ def execute_artifact_removal(folderNames, inputParameters): pair_name_to_coords = read_coords_pairwise(filepath, pair_name_to_tsNew) compound_name_to_ttl_timestamps = read_corrected_ttl_timestamps(filepath, storesList) name_to_data, compound_name_to_ttl_timestamps = addingNaNtoChunksWithArtifacts( - filepath, storesList, 
pair_name_to_tsNew, pair_name_to_coords, From 22f4f182c24851a6aa2e9abb62f85dc71e96551d Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 17 Dec 2025 08:18:09 -0800 Subject: [PATCH 106/125] Renamed some variables in processTimestampsForArtifacts --- src/guppy/analysis/artifact_removal.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/guppy/analysis/artifact_removal.py b/src/guppy/analysis/artifact_removal.py index 556a719..51c2d19 100644 --- a/src/guppy/analysis/artifact_removal.py +++ b/src/guppy/analysis/artifact_removal.py @@ -47,10 +47,10 @@ def addingNaNtoChunksWithArtifacts( # main function to align timestamps for control, signal and event timestamps for artifacts removal -def processTimestampsForArtifacts(filepath, timeForLightsTurnOn, events): +def processTimestampsForArtifacts(filepath, timeForLightsTurnOn, storesList): logger.debug("Processing timestamps to get rid of artifacts using concatenate method...") - storesList = events[1, :] + names_for_storenames = storesList[1, :] path = decide_naming_convention(filepath) @@ -63,13 +63,13 @@ def processTimestampsForArtifacts(filepath, timeForLightsTurnOn, events): name = name_1[-1] sampling_rate = read_hdf5("timeCorrection_" + name, filepath, "sampling_rate")[0] - for i in range(len(storesList)): + for i in range(len(names_for_storenames)): if ( - "control_" + name.lower() in storesList[i].lower() - or "signal_" + name.lower() in storesList[i].lower() + "control_" + name.lower() in names_for_storenames[i].lower() + or "signal_" + name.lower() in names_for_storenames[i].lower() ): # changes done ts = read_hdf5("timeCorrection_" + name, filepath, "timestampNew") - data = read_hdf5(storesList[i], filepath, "data").reshape(-1) + data = read_hdf5(names_for_storenames[i], filepath, "data").reshape(-1) coords = fetchCoords(filepath, name, ts) data, timestampNew = eliminateData( data=data, @@ -78,13 +78,13 @@ def processTimestampsForArtifacts(filepath, timeForLightsTurnOn, events): timeForLightsTurnOn=timeForLightsTurnOn, sampling_rate=sampling_rate, ) - write_hdf5(data, storesList[i], filepath, "data") + write_hdf5(data, names_for_storenames[i], filepath, "data") else: - if "control" in storesList[i].lower() or "signal" in storesList[i].lower(): + if "control" in names_for_storenames[i].lower() or "signal" in names_for_storenames[i].lower(): continue else: tsNew = read_hdf5("timeCorrection_" + name, filepath, "timestampNew") - ts = read_hdf5(storesList[i] + "_" + name, filepath, "ts").reshape(-1) + ts = read_hdf5(names_for_storenames[i] + "_" + name, filepath, "ts").reshape(-1) coords = fetchCoords(filepath, name, tsNew) ts = eliminateTs( ts=ts, @@ -93,7 +93,7 @@ def processTimestampsForArtifacts(filepath, timeForLightsTurnOn, events): timeForLightsTurnOn=timeForLightsTurnOn, sampling_rate=sampling_rate, ) - write_hdf5(ts, storesList[i] + "_" + name, filepath, "ts") + write_hdf5(ts, names_for_storenames[i] + "_" + name, filepath, "ts") # timestamp_dict[name] = timestampNew write_hdf5(timestampNew, "timeCorrection_" + name, filepath, "timestampNew") From a4a162f2267a295ed89d3ee3aca7188f23e596fb Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 17 Dec 2025 08:23:21 -0800 Subject: [PATCH 107/125] Refactored read out of processTimestampsForArtifacts --- src/guppy/analysis/artifact_removal.py | 24 ++++++++++++++++-------- src/guppy/preprocess.py | 14 +++++++++++++- 2 files changed, 29 insertions(+), 9 deletions(-) diff --git a/src/guppy/analysis/artifact_removal.py 
b/src/guppy/analysis/artifact_removal.py index 51c2d19..8e78669 100644 --- a/src/guppy/analysis/artifact_removal.py +++ b/src/guppy/analysis/artifact_removal.py @@ -5,7 +5,6 @@ from .io_utils import ( decide_naming_convention, - fetchCoords, read_hdf5, write_hdf5, ) @@ -47,7 +46,15 @@ def addingNaNtoChunksWithArtifacts( # main function to align timestamps for control, signal and event timestamps for artifacts removal -def processTimestampsForArtifacts(filepath, timeForLightsTurnOn, storesList): +def processTimestampsForArtifacts( + filepath, + timeForLightsTurnOn, + storesList, + pair_name_to_tsNew, + pair_name_to_coords, + name_to_data, + compound_name_to_ttl_timestamps, +): logger.debug("Processing timestamps to get rid of artifacts using concatenate method...") names_for_storenames = storesList[1, :] @@ -68,9 +75,9 @@ def processTimestampsForArtifacts(filepath, timeForLightsTurnOn, storesList): "control_" + name.lower() in names_for_storenames[i].lower() or "signal_" + name.lower() in names_for_storenames[i].lower() ): # changes done - ts = read_hdf5("timeCorrection_" + name, filepath, "timestampNew") - data = read_hdf5(names_for_storenames[i], filepath, "data").reshape(-1) - coords = fetchCoords(filepath, name, ts) + ts = pair_name_to_tsNew[name] + data = name_to_data[names_for_storenames[i]] + coords = pair_name_to_coords[name] data, timestampNew = eliminateData( data=data, ts=ts, @@ -83,9 +90,10 @@ def processTimestampsForArtifacts(filepath, timeForLightsTurnOn, storesList): if "control" in names_for_storenames[i].lower() or "signal" in names_for_storenames[i].lower(): continue else: - tsNew = read_hdf5("timeCorrection_" + name, filepath, "timestampNew") - ts = read_hdf5(names_for_storenames[i] + "_" + name, filepath, "ts").reshape(-1) - coords = fetchCoords(filepath, name, tsNew) + compound_name = names_for_storenames[i] + "_" + name + tsNew = pair_name_to_tsNew[name] + ts = compound_name_to_ttl_timestamps[compound_name] + coords = pair_name_to_coords[name] ts = eliminateTs( ts=ts, tsNew=tsNew, diff --git a/src/guppy/preprocess.py b/src/guppy/preprocess.py index 8555b55..dd02bd0 100755 --- a/src/guppy/preprocess.py +++ b/src/guppy/preprocess.py @@ -378,7 +378,19 @@ def execute_artifact_removal(folderNames, inputParameters): logger.debug("Removing artifacts from the data...") if artifactsRemovalMethod == "concatenate": - processTimestampsForArtifacts(filepath, timeForLightsTurnOn, storesList) + name_to_data = read_corrected_data_dict(filepath, storesList) + pair_name_to_tsNew = read_corrected_timestamps_pairwise(filepath) + pair_name_to_coords = read_coords_pairwise(filepath, pair_name_to_tsNew) + compound_name_to_ttl_timestamps = read_corrected_ttl_timestamps(filepath, storesList) + processTimestampsForArtifacts( + filepath, + timeForLightsTurnOn, + storesList, + pair_name_to_tsNew, + pair_name_to_coords, + name_to_data, + compound_name_to_ttl_timestamps, + ) else: name_to_data = read_corrected_data_dict(filepath, storesList) pair_name_to_tsNew = read_corrected_timestamps_pairwise(filepath) From a25e7acaecf27e7ad1fd4667478af72509205e35 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 17 Dec 2025 08:31:16 -0800 Subject: [PATCH 108/125] Refactored read out of processTimestampsForArtifacts --- src/guppy/analysis/artifact_removal.py | 5 ++--- src/guppy/analysis/standard_io.py | 5 ++++- src/guppy/preprocess.py | 5 +++-- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/src/guppy/analysis/artifact_removal.py b/src/guppy/analysis/artifact_removal.py index 8e78669..852cfb8 
100644 --- a/src/guppy/analysis/artifact_removal.py +++ b/src/guppy/analysis/artifact_removal.py @@ -5,7 +5,6 @@ from .io_utils import ( decide_naming_convention, - read_hdf5, write_hdf5, ) @@ -51,6 +50,7 @@ def processTimestampsForArtifacts( timeForLightsTurnOn, storesList, pair_name_to_tsNew, + pair_name_to_sampling_rate, pair_name_to_coords, name_to_data, compound_name_to_ttl_timestamps, @@ -65,10 +65,9 @@ def processTimestampsForArtifacts( for j in range(path.shape[1]): name_1 = ((os.path.basename(path[0, j])).split(".")[0]).split("_") name_2 = ((os.path.basename(path[1, j])).split(".")[0]).split("_") - # dirname = os.path.dirname(path[i]) if name_1[-1] == name_2[-1]: name = name_1[-1] - sampling_rate = read_hdf5("timeCorrection_" + name, filepath, "sampling_rate")[0] + sampling_rate = pair_name_to_sampling_rate[name] for i in range(len(names_for_storenames)): if ( diff --git a/src/guppy/analysis/standard_io.py b/src/guppy/analysis/standard_io.py index ad7408e..bba3d20 100644 --- a/src/guppy/analysis/standard_io.py +++ b/src/guppy/analysis/standard_io.py @@ -121,6 +121,7 @@ def write_zscore(filepath, name, z_score, dff, control_fit, temp_control_arr): def read_corrected_timestamps_pairwise(filepath): pair_name_to_tsNew = {} + pair_name_to_sampling_rate = {} path = decide_naming_convention(filepath) for j in range(path.shape[1]): name_1 = ((os.path.basename(path[0, j])).split(".")[0]).split("_") @@ -131,8 +132,10 @@ def read_corrected_timestamps_pairwise(filepath): name = name_1[-1] tsNew = read_hdf5("timeCorrection_" + name, filepath, "timestampNew") + sampling_rate = read_hdf5("timeCorrection_" + name, filepath, "sampling_rate")[0] pair_name_to_tsNew[name] = tsNew - return pair_name_to_tsNew + pair_name_to_sampling_rate[name] = sampling_rate + return pair_name_to_tsNew, pair_name_to_sampling_rate def read_coords_pairwise(filepath, pair_name_to_tsNew): diff --git a/src/guppy/preprocess.py b/src/guppy/preprocess.py index dd02bd0..b618deb 100755 --- a/src/guppy/preprocess.py +++ b/src/guppy/preprocess.py @@ -379,7 +379,7 @@ def execute_artifact_removal(folderNames, inputParameters): logger.debug("Removing artifacts from the data...") if artifactsRemovalMethod == "concatenate": name_to_data = read_corrected_data_dict(filepath, storesList) - pair_name_to_tsNew = read_corrected_timestamps_pairwise(filepath) + pair_name_to_tsNew, pair_name_to_sampling_rate = read_corrected_timestamps_pairwise(filepath) pair_name_to_coords = read_coords_pairwise(filepath, pair_name_to_tsNew) compound_name_to_ttl_timestamps = read_corrected_ttl_timestamps(filepath, storesList) processTimestampsForArtifacts( @@ -387,13 +387,14 @@ def execute_artifact_removal(folderNames, inputParameters): timeForLightsTurnOn, storesList, pair_name_to_tsNew, + pair_name_to_sampling_rate, pair_name_to_coords, name_to_data, compound_name_to_ttl_timestamps, ) else: name_to_data = read_corrected_data_dict(filepath, storesList) - pair_name_to_tsNew = read_corrected_timestamps_pairwise(filepath) + pair_name_to_tsNew, _ = read_corrected_timestamps_pairwise(filepath) pair_name_to_coords = read_coords_pairwise(filepath, pair_name_to_tsNew) compound_name_to_ttl_timestamps = read_corrected_ttl_timestamps(filepath, storesList) name_to_data, compound_name_to_ttl_timestamps = addingNaNtoChunksWithArtifacts( From 3c7057916867bf451f809156735288c329984b4a Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 17 Dec 2025 08:36:35 -0800 Subject: [PATCH 109/125] Reorganized processTimestampsForArtifacts --- src/guppy/analysis/artifact_removal.py | 
77 ++++++++++++-------------- 1 file changed, 36 insertions(+), 41 deletions(-) diff --git a/src/guppy/analysis/artifact_removal.py b/src/guppy/analysis/artifact_removal.py index 852cfb8..ebc1df9 100644 --- a/src/guppy/analysis/artifact_removal.py +++ b/src/guppy/analysis/artifact_removal.py @@ -61,52 +61,47 @@ def processTimestampsForArtifacts( path = decide_naming_convention(filepath) - timestamp_dict = dict() for j in range(path.shape[1]): name_1 = ((os.path.basename(path[0, j])).split(".")[0]).split("_") name_2 = ((os.path.basename(path[1, j])).split(".")[0]).split("_") - if name_1[-1] == name_2[-1]: - name = name_1[-1] - sampling_rate = pair_name_to_sampling_rate[name] - - for i in range(len(names_for_storenames)): - if ( - "control_" + name.lower() in names_for_storenames[i].lower() - or "signal_" + name.lower() in names_for_storenames[i].lower() - ): # changes done - ts = pair_name_to_tsNew[name] - data = name_to_data[names_for_storenames[i]] - coords = pair_name_to_coords[name] - data, timestampNew = eliminateData( - data=data, - ts=ts, - coords=coords, - timeForLightsTurnOn=timeForLightsTurnOn, - sampling_rate=sampling_rate, - ) - write_hdf5(data, names_for_storenames[i], filepath, "data") - else: - if "control" in names_for_storenames[i].lower() or "signal" in names_for_storenames[i].lower(): - continue - else: - compound_name = names_for_storenames[i] + "_" + name - tsNew = pair_name_to_tsNew[name] - ts = compound_name_to_ttl_timestamps[compound_name] - coords = pair_name_to_coords[name] - ts = eliminateTs( - ts=ts, - tsNew=tsNew, - coords=coords, - timeForLightsTurnOn=timeForLightsTurnOn, - sampling_rate=sampling_rate, - ) - write_hdf5(ts, names_for_storenames[i] + "_" + name, filepath, "ts") - - # timestamp_dict[name] = timestampNew - write_hdf5(timestampNew, "timeCorrection_" + name, filepath, "timestampNew") - else: + if name_1[-1] != name_2[-1]: logger.error("Error in naming convention of files or Error in storesList file") raise Exception("Error in naming convention of files or Error in storesList file") + name = name_1[-1] + + sampling_rate = pair_name_to_sampling_rate[name] + tsNew = pair_name_to_tsNew[name] + coords = pair_name_to_coords[name] + + for i in range(len(names_for_storenames)): + if ( + "control_" + name.lower() in names_for_storenames[i].lower() + or "signal_" + name.lower() in names_for_storenames[i].lower() + ): # changes done + data = name_to_data[names_for_storenames[i]] + data, timestampNew = eliminateData( + data=data, + ts=tsNew, + coords=coords, + timeForLightsTurnOn=timeForLightsTurnOn, + sampling_rate=sampling_rate, + ) + write_hdf5(data, names_for_storenames[i], filepath, "data") + else: + if "control" in names_for_storenames[i].lower() or "signal" in names_for_storenames[i].lower(): + continue + compound_name = names_for_storenames[i] + "_" + name + ts = compound_name_to_ttl_timestamps[compound_name] + ts = eliminateTs( + ts=ts, + tsNew=tsNew, + coords=coords, + timeForLightsTurnOn=timeForLightsTurnOn, + sampling_rate=sampling_rate, + ) + write_hdf5(ts, names_for_storenames[i] + "_" + name, filepath, "ts") + + write_hdf5(timestampNew, "timeCorrection_" + name, filepath, "timestampNew") logger.info("Timestamps processed, artifacts are removed and good chunks are concatenated.") From b7d054967b992e113a404139306bf53fbe5baab8 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 17 Dec 2025 08:56:43 -0800 Subject: [PATCH 110/125] Removed write from processTimestampsForArtifacts --- src/guppy/analysis/artifact_removal.py | 33 
+++++++++++++++++--------- src/guppy/analysis/standard_io.py | 5 ++++ src/guppy/preprocess.py | 6 ++++- 3 files changed, 32 insertions(+), 12 deletions(-) diff --git a/src/guppy/analysis/artifact_removal.py b/src/guppy/analysis/artifact_removal.py index ebc1df9..08ffc98 100644 --- a/src/guppy/analysis/artifact_removal.py +++ b/src/guppy/analysis/artifact_removal.py @@ -5,7 +5,6 @@ from .io_utils import ( decide_naming_convention, - write_hdf5, ) logger = logging.getLogger(__name__) @@ -61,22 +60,25 @@ def processTimestampsForArtifacts( path = decide_naming_convention(filepath) + name_to_corrected_data = {} + pair_name_to_corrected_timestamps = {} + compound_name_to_corrected_ttl_timestamps = {} for j in range(path.shape[1]): name_1 = ((os.path.basename(path[0, j])).split(".")[0]).split("_") name_2 = ((os.path.basename(path[1, j])).split(".")[0]).split("_") if name_1[-1] != name_2[-1]: logger.error("Error in naming convention of files or Error in storesList file") raise Exception("Error in naming convention of files or Error in storesList file") - name = name_1[-1] + pair_name = name_1[-1] - sampling_rate = pair_name_to_sampling_rate[name] - tsNew = pair_name_to_tsNew[name] - coords = pair_name_to_coords[name] + sampling_rate = pair_name_to_sampling_rate[pair_name] + tsNew = pair_name_to_tsNew[pair_name] + coords = pair_name_to_coords[pair_name] for i in range(len(names_for_storenames)): if ( - "control_" + name.lower() in names_for_storenames[i].lower() - or "signal_" + name.lower() in names_for_storenames[i].lower() + "control_" + pair_name.lower() in names_for_storenames[i].lower() + or "signal_" + pair_name.lower() in names_for_storenames[i].lower() ): # changes done data = name_to_data[names_for_storenames[i]] data, timestampNew = eliminateData( @@ -86,11 +88,13 @@ def processTimestampsForArtifacts( timeForLightsTurnOn=timeForLightsTurnOn, sampling_rate=sampling_rate, ) - write_hdf5(data, names_for_storenames[i], filepath, "data") + name_to_corrected_data[names_for_storenames[i]] = data + pair_name_to_corrected_timestamps[pair_name] = timestampNew + # write_hdf5(data, names_for_storenames[i], filepath, "data") else: if "control" in names_for_storenames[i].lower() or "signal" in names_for_storenames[i].lower(): continue - compound_name = names_for_storenames[i] + "_" + name + compound_name = names_for_storenames[i] + "_" + pair_name ts = compound_name_to_ttl_timestamps[compound_name] ts = eliminateTs( ts=ts, @@ -99,11 +103,18 @@ def processTimestampsForArtifacts( timeForLightsTurnOn=timeForLightsTurnOn, sampling_rate=sampling_rate, ) - write_hdf5(ts, names_for_storenames[i] + "_" + name, filepath, "ts") + compound_name_to_corrected_ttl_timestamps[compound_name] = ts + # write_hdf5(ts, names_for_storenames[i] + "_" + pair_name, filepath, "ts") - write_hdf5(timestampNew, "timeCorrection_" + name, filepath, "timestampNew") + # write_hdf5(timestampNew, "timeCorrection_" + pair_name, filepath, "timestampNew") logger.info("Timestamps processed, artifacts are removed and good chunks are concatenated.") + return ( + name_to_corrected_data, + pair_name_to_corrected_timestamps, + compound_name_to_corrected_ttl_timestamps, + ) + # helper function to process control and signal timestamps def eliminateData(*, data, ts, coords, timeForLightsTurnOn, sampling_rate): diff --git a/src/guppy/analysis/standard_io.py b/src/guppy/analysis/standard_io.py index bba3d20..3131da5 100644 --- a/src/guppy/analysis/standard_io.py +++ b/src/guppy/analysis/standard_io.py @@ -204,3 +204,8 @@ def 
write_nan_corrected_ttl_timestamps( ): for compound_name, corrected_ttl_timestamps in compound_name_to_corrected_ttl_timestamps.items(): write_hdf5(corrected_ttl_timestamps, compound_name, filepath, "ts") + + +def write_concat_corrected_timestamps(filepath, pair_name_to_corrected_timestamps): + for pair_name, timestamps in pair_name_to_corrected_timestamps.items(): + write_hdf5(timestamps, "timeCorrection_" + pair_name, filepath, "timestampNew") diff --git a/src/guppy/preprocess.py b/src/guppy/preprocess.py index b618deb..3f899a9 100755 --- a/src/guppy/preprocess.py +++ b/src/guppy/preprocess.py @@ -30,6 +30,7 @@ read_corrected_timestamps_pairwise, read_corrected_ttl_timestamps, read_ttl, + write_concat_corrected_timestamps, write_corrected_data, write_corrected_timestamps, write_corrected_ttl_timestamps, @@ -382,7 +383,7 @@ def execute_artifact_removal(folderNames, inputParameters): pair_name_to_tsNew, pair_name_to_sampling_rate = read_corrected_timestamps_pairwise(filepath) pair_name_to_coords = read_coords_pairwise(filepath, pair_name_to_tsNew) compound_name_to_ttl_timestamps = read_corrected_ttl_timestamps(filepath, storesList) - processTimestampsForArtifacts( + name_to_data, pair_name_to_timestamps, compound_name_to_ttl_timestamps = processTimestampsForArtifacts( filepath, timeForLightsTurnOn, storesList, @@ -392,6 +393,9 @@ def execute_artifact_removal(folderNames, inputParameters): name_to_data, compound_name_to_ttl_timestamps, ) + write_nan_corrected_data(filepath, name_to_data) + write_concat_corrected_timestamps(filepath, pair_name_to_timestamps) + write_nan_corrected_ttl_timestamps(filepath, compound_name_to_ttl_timestamps) else: name_to_data = read_corrected_data_dict(filepath, storesList) pair_name_to_tsNew, _ = read_corrected_timestamps_pairwise(filepath) From 61b2712d1aceb8bf894ad1d5868c66760b2b75f5 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 17 Dec 2025 08:57:15 -0800 Subject: [PATCH 111/125] Removed write from processTimestampsForArtifacts --- src/guppy/analysis/artifact_removal.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/guppy/analysis/artifact_removal.py b/src/guppy/analysis/artifact_removal.py index 08ffc98..4ac22c9 100644 --- a/src/guppy/analysis/artifact_removal.py +++ b/src/guppy/analysis/artifact_removal.py @@ -90,7 +90,6 @@ def processTimestampsForArtifacts( ) name_to_corrected_data[names_for_storenames[i]] = data pair_name_to_corrected_timestamps[pair_name] = timestampNew - # write_hdf5(data, names_for_storenames[i], filepath, "data") else: if "control" in names_for_storenames[i].lower() or "signal" in names_for_storenames[i].lower(): continue @@ -104,9 +103,7 @@ def processTimestampsForArtifacts( sampling_rate=sampling_rate, ) compound_name_to_corrected_ttl_timestamps[compound_name] = ts - # write_hdf5(ts, names_for_storenames[i] + "_" + pair_name, filepath, "ts") - # write_hdf5(timestampNew, "timeCorrection_" + pair_name, filepath, "timestampNew") logger.info("Timestamps processed, artifacts are removed and good chunks are concatenated.") return ( From 2dc18cc51a47a8efb03e6f093df4355a6c473a7f Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 17 Dec 2025 08:59:30 -0800 Subject: [PATCH 112/125] Refactored filepath out of processTimestampsForArtifacts --- src/guppy/analysis/artifact_removal.py | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/src/guppy/analysis/artifact_removal.py b/src/guppy/analysis/artifact_removal.py index 4ac22c9..c661e49 100644 --- a/src/guppy/analysis/artifact_removal.py +++ 
b/src/guppy/analysis/artifact_removal.py @@ -1,12 +1,7 @@ import logging -import os import numpy as np -from .io_utils import ( - decide_naming_convention, -) - logger = logging.getLogger(__name__) @@ -45,7 +40,6 @@ def addingNaNtoChunksWithArtifacts( # main function to align timestamps for control, signal and event timestamps for artifacts removal def processTimestampsForArtifacts( - filepath, timeForLightsTurnOn, storesList, pair_name_to_tsNew, @@ -54,23 +48,14 @@ def processTimestampsForArtifacts( name_to_data, compound_name_to_ttl_timestamps, ): - logger.debug("Processing timestamps to get rid of artifacts using concatenate method...") names_for_storenames = storesList[1, :] - - path = decide_naming_convention(filepath) + pair_names = pair_name_to_tsNew.keys() name_to_corrected_data = {} pair_name_to_corrected_timestamps = {} compound_name_to_corrected_ttl_timestamps = {} - for j in range(path.shape[1]): - name_1 = ((os.path.basename(path[0, j])).split(".")[0]).split("_") - name_2 = ((os.path.basename(path[1, j])).split(".")[0]).split("_") - if name_1[-1] != name_2[-1]: - logger.error("Error in naming convention of files or Error in storesList file") - raise Exception("Error in naming convention of files or Error in storesList file") - pair_name = name_1[-1] - + for pair_name in pair_names: sampling_rate = pair_name_to_sampling_rate[pair_name] tsNew = pair_name_to_tsNew[pair_name] coords = pair_name_to_coords[pair_name] From bfb18e058f3cae8abffde64412895d913a5a2c46 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 17 Dec 2025 09:10:11 -0800 Subject: [PATCH 113/125] Consolidated write operations --- src/guppy/analysis/standard_io.py | 23 +++++++++++------------ src/guppy/preprocess.py | 25 ++++++++----------------- 2 files changed, 19 insertions(+), 29 deletions(-) diff --git a/src/guppy/analysis/standard_io.py b/src/guppy/analysis/standard_io.py index 3131da5..89f1b40 100644 --- a/src/guppy/analysis/standard_io.py +++ b/src/guppy/analysis/standard_io.py @@ -193,19 +193,18 @@ def read_corrected_ttl_timestamps(filepath, storesList): return compound_name_to_ttl_timestamps -def write_nan_corrected_data(filepath, name_to_corrected_data): - for name, data in name_to_corrected_data.items(): - write_hdf5(data, name, filepath, "data") +def write_artifact_corrected_timestamps(filepath, pair_name_to_corrected_timestamps): + for pair_name, timestamps in pair_name_to_corrected_timestamps.items(): + write_hdf5(timestamps, "timeCorrection_" + pair_name, filepath, "timestampNew") -def write_nan_corrected_ttl_timestamps( +def write_artifact_removal( filepath, - compound_name_to_corrected_ttl_timestamps, + name_to_corrected_data, + pair_name_to_corrected_timestamps, + compound_name_to_corrected_ttl_timestamps=None, ): - for compound_name, corrected_ttl_timestamps in compound_name_to_corrected_ttl_timestamps.items(): - write_hdf5(corrected_ttl_timestamps, compound_name, filepath, "ts") - - -def write_concat_corrected_timestamps(filepath, pair_name_to_corrected_timestamps): - for pair_name, timestamps in pair_name_to_corrected_timestamps.items(): - write_hdf5(timestamps, "timeCorrection_" + pair_name, filepath, "timestampNew") + write_corrected_data(filepath, name_to_corrected_data) + write_corrected_ttl_timestamps(filepath, compound_name_to_corrected_ttl_timestamps) + if pair_name_to_corrected_timestamps is not None: + write_artifact_corrected_timestamps(filepath, pair_name_to_corrected_timestamps) diff --git a/src/guppy/preprocess.py b/src/guppy/preprocess.py index 3f899a9..fc90b77 100755 --- 
a/src/guppy/preprocess.py +++ b/src/guppy/preprocess.py @@ -30,12 +30,10 @@ read_corrected_timestamps_pairwise, read_corrected_ttl_timestamps, read_ttl, - write_concat_corrected_timestamps, + write_artifact_removal, write_corrected_data, write_corrected_timestamps, write_corrected_ttl_timestamps, - write_nan_corrected_data, - write_nan_corrected_ttl_timestamps, write_zscore, ) from .analysis.timestamp_correction import correct_timestamps @@ -377,14 +375,14 @@ def execute_artifact_removal(folderNames, inputParameters): filepath = storesListPath[j] storesList = np.genfromtxt(os.path.join(filepath, "storesList.csv"), dtype="str", delimiter=",").reshape(2, -1) + name_to_data = read_corrected_data_dict(filepath, storesList) + pair_name_to_tsNew, pair_name_to_sampling_rate = read_corrected_timestamps_pairwise(filepath) + pair_name_to_coords = read_coords_pairwise(filepath, pair_name_to_tsNew) + compound_name_to_ttl_timestamps = read_corrected_ttl_timestamps(filepath, storesList) + logger.debug("Removing artifacts from the data...") if artifactsRemovalMethod == "concatenate": - name_to_data = read_corrected_data_dict(filepath, storesList) - pair_name_to_tsNew, pair_name_to_sampling_rate = read_corrected_timestamps_pairwise(filepath) - pair_name_to_coords = read_coords_pairwise(filepath, pair_name_to_tsNew) - compound_name_to_ttl_timestamps = read_corrected_ttl_timestamps(filepath, storesList) name_to_data, pair_name_to_timestamps, compound_name_to_ttl_timestamps = processTimestampsForArtifacts( - filepath, timeForLightsTurnOn, storesList, pair_name_to_tsNew, @@ -393,14 +391,7 @@ def execute_artifact_removal(folderNames, inputParameters): name_to_data, compound_name_to_ttl_timestamps, ) - write_nan_corrected_data(filepath, name_to_data) - write_concat_corrected_timestamps(filepath, pair_name_to_timestamps) - write_nan_corrected_ttl_timestamps(filepath, compound_name_to_ttl_timestamps) else: - name_to_data = read_corrected_data_dict(filepath, storesList) - pair_name_to_tsNew, _ = read_corrected_timestamps_pairwise(filepath) - pair_name_to_coords = read_coords_pairwise(filepath, pair_name_to_tsNew) - compound_name_to_ttl_timestamps = read_corrected_ttl_timestamps(filepath, storesList) name_to_data, compound_name_to_ttl_timestamps = addingNaNtoChunksWithArtifacts( storesList, pair_name_to_tsNew, @@ -408,9 +399,9 @@ def execute_artifact_removal(folderNames, inputParameters): name_to_data, compound_name_to_ttl_timestamps, ) - write_nan_corrected_data(filepath, name_to_data) - write_nan_corrected_ttl_timestamps(filepath, compound_name_to_ttl_timestamps) + pair_name_to_timestamps = None + write_artifact_removal(filepath, name_to_data, pair_name_to_timestamps, compound_name_to_ttl_timestamps) visualizeControlAndSignal(filepath, removeArtifacts=True) writeToFile(str(10 + ((inputParameters["step"] + 1) * 10)) + "\n") From d4f3de43f207f84d3b7ff6ad67021f59e9263cc1 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 17 Dec 2025 09:30:02 -0800 Subject: [PATCH 114/125] Consolidated into single remove_artifacts fn --- src/guppy/analysis/artifact_removal.py | 40 ++++++++++++++++++++++++++ src/guppy/preprocess.py | 34 +++++++--------------- 2 files changed, 51 insertions(+), 23 deletions(-) diff --git a/src/guppy/analysis/artifact_removal.py b/src/guppy/analysis/artifact_removal.py index c661e49..d3da042 100644 --- a/src/guppy/analysis/artifact_removal.py +++ b/src/guppy/analysis/artifact_removal.py @@ -5,6 +5,46 @@ logger = logging.getLogger(__name__) +def remove_artifacts( + timeForLightsTurnOn, + storesList, 
+ pair_name_to_tsNew, + pair_name_to_sampling_rate, + pair_name_to_coords, + name_to_data, + compound_name_to_ttl_timestamps, + method, +): + if method == "concatenate": + name_to_corrected_data, pair_name_to_corrected_timestamps, compound_name_to_corrected_ttl_timestamps = ( + processTimestampsForArtifacts( + timeForLightsTurnOn, + storesList, + pair_name_to_tsNew, + pair_name_to_sampling_rate, + pair_name_to_coords, + name_to_data, + compound_name_to_ttl_timestamps, + ) + ) + logger.info("Artifacts removed using concatenate method.") + elif method == "replace with NaN": + name_to_corrected_data, compound_name_to_corrected_ttl_timestamps = addingNaNtoChunksWithArtifacts( + storesList, + pair_name_to_tsNew, + pair_name_to_coords, + name_to_data, + compound_name_to_ttl_timestamps, + ) + pair_name_to_corrected_timestamps = None + logger.info("Artifacts removed using NaN replacement method.") + else: + logger.error("Invalid artifact removal method specified.") + raise ValueError("Invalid artifact removal method specified.") + + return name_to_corrected_data, pair_name_to_corrected_timestamps, compound_name_to_corrected_ttl_timestamps + + def addingNaNtoChunksWithArtifacts( storesList, pair_name_to_tsNew, pair_name_to_coords, name_to_data, compound_name_to_ttl_timestamps ): diff --git a/src/guppy/preprocess.py b/src/guppy/preprocess.py index fc90b77..46fc7c7 100755 --- a/src/guppy/preprocess.py +++ b/src/guppy/preprocess.py @@ -7,10 +7,7 @@ import matplotlib.pyplot as plt import numpy as np -from .analysis.artifact_removal import ( - addingNaNtoChunksWithArtifacts, - processTimestampsForArtifacts, -) +from .analysis.artifact_removal import remove_artifacts from .analysis.combine_data import combineData from .analysis.control_channel import add_control_channel, create_control_channel from .analysis.io_utils import ( @@ -381,25 +378,16 @@ def execute_artifact_removal(folderNames, inputParameters): compound_name_to_ttl_timestamps = read_corrected_ttl_timestamps(filepath, storesList) logger.debug("Removing artifacts from the data...") - if artifactsRemovalMethod == "concatenate": - name_to_data, pair_name_to_timestamps, compound_name_to_ttl_timestamps = processTimestampsForArtifacts( - timeForLightsTurnOn, - storesList, - pair_name_to_tsNew, - pair_name_to_sampling_rate, - pair_name_to_coords, - name_to_data, - compound_name_to_ttl_timestamps, - ) - else: - name_to_data, compound_name_to_ttl_timestamps = addingNaNtoChunksWithArtifacts( - storesList, - pair_name_to_tsNew, - pair_name_to_coords, - name_to_data, - compound_name_to_ttl_timestamps, - ) - pair_name_to_timestamps = None + name_to_data, pair_name_to_timestamps, compound_name_to_ttl_timestamps = remove_artifacts( + timeForLightsTurnOn, + storesList, + pair_name_to_tsNew, + pair_name_to_sampling_rate, + pair_name_to_coords, + name_to_data, + compound_name_to_ttl_timestamps, + method=artifactsRemovalMethod, + ) write_artifact_removal(filepath, name_to_data, pair_name_to_timestamps, compound_name_to_ttl_timestamps) visualizeControlAndSignal(filepath, removeArtifacts=True) From c23aa1ddf12ba4c2525574f64e5952586db03113 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 17 Dec 2025 13:17:21 -0800 Subject: [PATCH 115/125] fixed bug with read_control_and_signal --- src/guppy/analysis/standard_io.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/guppy/analysis/standard_io.py b/src/guppy/analysis/standard_io.py index 89f1b40..e7fe8e0 100644 --- a/src/guppy/analysis/standard_io.py +++ b/src/guppy/analysis/standard_io.py 
@@ -27,8 +27,8 @@ def read_control_and_signal(filepath, storesList): for i in range(channels_arr.shape[1]): control_name = channels_arr[0, i] signal_name = channels_arr[1, i] - idx_c = np.where(storesList == control_name)[0] - idx_s = np.where(storesList == signal_name)[0] + idx_c = np.where(names_for_storenames == control_name)[0] + idx_s = np.where(names_for_storenames == signal_name)[0] control_storename = storenames[idx_c[0]] signal_storename = storenames[idx_s[0]] From 1cda972960addc599f378d699d6d8eaa2da9e12e Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 17 Dec 2025 13:18:32 -0800 Subject: [PATCH 116/125] fixed naming bug in timestampCorrection --- src/guppy/analysis/timestamp_correction.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/guppy/analysis/timestamp_correction.py b/src/guppy/analysis/timestamp_correction.py index 60cf76a..0806fb8 100644 --- a/src/guppy/analysis/timestamp_correction.py +++ b/src/guppy/analysis/timestamp_correction.py @@ -64,15 +64,15 @@ def timestampCorrection( name_to_corrected_data = {} storenames = storesList[0, :] names_for_storenames = storesList[1, :] - data = get_control_and_signal_channel_names(storesList) + channels_arr = get_control_and_signal_channel_names(storesList) - indices = check_cntrl_sig_length(data, name_to_data) + indices = check_cntrl_sig_length(channels_arr, name_to_data) - for i in range(data.shape[1]): - control_name = data[0, i] - signal_name = data[1, i] - name_1 = data[0, i].split("_")[-1] - name_2 = data[1, i].split("_")[-1] + for i in range(channels_arr.shape[1]): + control_name = channels_arr[0, i] + signal_name = channels_arr[1, i] + name_1 = channels_arr[0, i].split("_")[-1] + name_2 = channels_arr[1, i].split("_")[-1] if name_1 != name_2: logger.error("Error in naming convention of files or Error in storesList file") raise Exception("Error in naming convention of files or Error in storesList file") @@ -81,8 +81,8 @@ def timestampCorrection( idx = np.where(names_for_storenames == indices[i])[0] if idx.shape[0] == 0: - logger.error(f"{data[0,i]} does not exist in the stores list file.") - raise Exception("{} does not exist in the stores list file.".format(data[0, i])) + logger.error(f"{channels_arr[0,i]} does not exist in the stores list file.") + raise Exception("{} does not exist in the stores list file.".format(channels_arr[0, i])) name = names_for_storenames[idx][0] timestamp = name_to_timestamps[name] From 19986c81c974138d2007982badd8ae2a8dcc679a Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 19 Dec 2025 16:36:18 -0800 Subject: [PATCH 117/125] Fixed combinedata bug --- src/guppy/analysis/combine_data.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/guppy/analysis/combine_data.py b/src/guppy/analysis/combine_data.py index f89315f..cf96835 100644 --- a/src/guppy/analysis/combine_data.py +++ b/src/guppy/analysis/combine_data.py @@ -66,6 +66,8 @@ def eliminateData(filepath, timeForLightsTurnOn, event, sampling_rate, naming): for i in range(len(filepath)): ts = read_hdf5("timeCorrection_" + naming, filepath[i], "timestampNew") data = read_hdf5(event, filepath[i], "data").reshape(-1) + print(f"{ts.shape = }") + print(f"{data.shape = }") # index = np.where((ts>coords[i,0]) & (ts Date: Fri, 19 Dec 2025 16:42:28 -0800 Subject: [PATCH 118/125] Fixed combinedata bug --- src/guppy/analysis/combine_data.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/guppy/analysis/combine_data.py b/src/guppy/analysis/combine_data.py index cf96835..3da338d 100644 --- 
a/src/guppy/analysis/combine_data.py +++ b/src/guppy/analysis/combine_data.py @@ -66,8 +66,6 @@ def eliminateData(filepath, timeForLightsTurnOn, event, sampling_rate, naming): for i in range(len(filepath)): ts = read_hdf5("timeCorrection_" + naming, filepath[i], "timestampNew") data = read_hdf5(event, filepath[i], "data").reshape(-1) - print(f"{ts.shape = }") - print(f"{data.shape = }") # index = np.where((ts>coords[i,0]) & (ts Date: Fri, 19 Dec 2025 16:51:22 -0800 Subject: [PATCH 119/125] Reorganized into execute_combined_data and combine_data. --- src/guppy/analysis/combine_data.py | 49 +----------------------------- src/guppy/preprocess.py | 48 +++++++++++++++++++++++++++-- 2 files changed, 47 insertions(+), 50 deletions(-) diff --git a/src/guppy/analysis/combine_data.py b/src/guppy/analysis/combine_data.py index 3da338d..3ab73d3 100644 --- a/src/guppy/analysis/combine_data.py +++ b/src/guppy/analysis/combine_data.py @@ -1,4 +1,3 @@ -import glob import logging import os @@ -6,59 +5,13 @@ from .io_utils import ( decide_naming_convention, - get_all_stores_for_combining_data, read_hdf5, - takeOnlyDirs, write_hdf5, ) logger = logging.getLogger(__name__) -# function to combine data when there are two different data files for the same recording session -# it will combine the data, do timestamps processing and save the combined data in the first output folder. -def combineData(folderNames, inputParameters, storesList): - - logger.debug("Combining Data from different data files...") - timeForLightsTurnOn = inputParameters["timeForLightsTurnOn"] - op_folder = [] - for i in range(len(folderNames)): - filepath = folderNames[i] - op_folder.append(takeOnlyDirs(glob.glob(os.path.join(filepath, "*_output_*")))) - - op_folder = list(np.concatenate(op_folder).flatten()) - sampling_rate_fp = [] - for i in range(len(folderNames)): - filepath = folderNames[i] - storesListPath = takeOnlyDirs(glob.glob(os.path.join(filepath, "*_output_*"))) - for j in range(len(storesListPath)): - filepath = storesListPath[j] - storesList_new = np.genfromtxt( - os.path.join(filepath, "storesList.csv"), dtype="str", delimiter="," - ).reshape(2, -1) - sampling_rate_fp.append(glob.glob(os.path.join(filepath, "timeCorrection_*"))) - - # check if sampling rate is same for both data - sampling_rate_fp = np.concatenate(sampling_rate_fp) - sampling_rate = [] - for i in range(sampling_rate_fp.shape[0]): - sampling_rate.append(read_hdf5("", sampling_rate_fp[i], "sampling_rate")) - - res = all(i == sampling_rate[0] for i in sampling_rate) - if res == False: - logger.error("To combine the data, sampling rate for both the data should be same.") - raise Exception("To combine the data, sampling rate for both the data should be same.") - - # get the output folders informatinos - op = get_all_stores_for_combining_data(op_folder) - - # processing timestamps for combining the data - processTimestampsForCombiningData(op, timeForLightsTurnOn, storesList, sampling_rate[0]) - logger.info("Data is combined from different data files.") - - return op - - def eliminateData(filepath, timeForLightsTurnOn, event, sampling_rate, naming): arr = np.array([]) @@ -113,7 +66,7 @@ def eliminateTs(filepath, timeForLightsTurnOn, event, sampling_rate, naming): return ts_arr -def processTimestampsForCombiningData(filepath, timeForLightsTurnOn, events, sampling_rate): +def combine_data(filepath, timeForLightsTurnOn, events, sampling_rate): logger.debug("Processing timestamps for combining data...") diff --git a/src/guppy/preprocess.py b/src/guppy/preprocess.py index 
46fc7c7..17d1fb5 100755 --- a/src/guppy/preprocess.py +++ b/src/guppy/preprocess.py @@ -8,7 +8,7 @@ import numpy as np from .analysis.artifact_removal import remove_artifacts -from .analysis.combine_data import combineData +from .analysis.combine_data import combine_data from .analysis.control_channel import add_control_channel, create_control_channel from .analysis.io_utils import ( check_storeslistfile, @@ -399,6 +399,50 @@ def execute_artifact_removal(folderNames, inputParameters): logger.info("Artifact removal completed.") +# function to combine data when there are two different data files for the same recording session +# it will combine the data, do timestamps processing and save the combined data in the first output folder. +def execute_combine_data(folderNames, inputParameters, storesList): + + logger.debug("Combining Data from different data files...") + timeForLightsTurnOn = inputParameters["timeForLightsTurnOn"] + op_folder = [] + for i in range(len(folderNames)): + filepath = folderNames[i] + op_folder.append(takeOnlyDirs(glob.glob(os.path.join(filepath, "*_output_*")))) + + op_folder = list(np.concatenate(op_folder).flatten()) + sampling_rate_fp = [] + for i in range(len(folderNames)): + filepath = folderNames[i] + storesListPath = takeOnlyDirs(glob.glob(os.path.join(filepath, "*_output_*"))) + for j in range(len(storesListPath)): + filepath = storesListPath[j] + storesList_new = np.genfromtxt( + os.path.join(filepath, "storesList.csv"), dtype="str", delimiter="," + ).reshape(2, -1) + sampling_rate_fp.append(glob.glob(os.path.join(filepath, "timeCorrection_*"))) + + # check if sampling rate is same for both data + sampling_rate_fp = np.concatenate(sampling_rate_fp) + sampling_rate = [] + for i in range(sampling_rate_fp.shape[0]): + sampling_rate.append(read_hdf5("", sampling_rate_fp[i], "sampling_rate")) + + res = all(i == sampling_rate[0] for i in sampling_rate) + if res == False: + logger.error("To combine the data, sampling rate for both the data should be same.") + raise Exception("To combine the data, sampling rate for both the data should be same.") + + # get the output folders informatinos + op = get_all_stores_for_combining_data(op_folder) + + # processing timestamps for combining the data + combine_data(op, timeForLightsTurnOn, storesList, sampling_rate[0]) + logger.info("Data is combined from different data files.") + + return op + + def extractTsAndSignal(inputParameters): logger.debug("Extracting signal data and event timestamps...") @@ -434,7 +478,7 @@ def extractTsAndSignal(inputParameters): writeToFile(str((pbMaxValue) * 10) + "\n" + str(10) + "\n") execute_timestamp_correction(folderNames, inputParameters) storesList = check_storeslistfile(folderNames) - op_folder = combineData(folderNames, inputParameters, storesList) + op_folder = execute_combine_data(folderNames, inputParameters, storesList) execute_zscore(op_folder, inputParameters) if remove_artifacts == True: execute_artifact_removal(op_folder, inputParameters) From 042fb33c26327376cb6fe67497667e61341089c4 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 19 Dec 2025 17:14:38 -0800 Subject: [PATCH 120/125] Renamed some variables for clarity. 
--- src/guppy/analysis/combine_data.py | 28 ++++++++++++++++------------ src/guppy/preprocess.py | 1 - 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/src/guppy/analysis/combine_data.py b/src/guppy/analysis/combine_data.py index 3ab73d3..b89f9e1 100644 --- a/src/guppy/analysis/combine_data.py +++ b/src/guppy/analysis/combine_data.py @@ -12,13 +12,13 @@ logger = logging.getLogger(__name__) -def eliminateData(filepath, timeForLightsTurnOn, event, sampling_rate, naming): +def eliminateData(filepaths, timeForLightsTurnOn, event, sampling_rate, naming): arr = np.array([]) ts_arr = np.array([]) - for i in range(len(filepath)): - ts = read_hdf5("timeCorrection_" + naming, filepath[i], "timestampNew") - data = read_hdf5(event, filepath[i], "data").reshape(-1) + for i in range(len(filepaths)): + ts = read_hdf5("timeCorrection_" + naming, filepaths[i], "timestampNew") + data = read_hdf5(event, filepaths[i], "data").reshape(-1) # index = np.where((ts>coords[i,0]) & (ts Date: Fri, 19 Dec 2025 17:27:49 -0800 Subject: [PATCH 121/125] Refactored read operations out of eliminateData. --- src/guppy/analysis/combine_data.py | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/src/guppy/analysis/combine_data.py b/src/guppy/analysis/combine_data.py index b89f9e1..a63be7e 100644 --- a/src/guppy/analysis/combine_data.py +++ b/src/guppy/analysis/combine_data.py @@ -12,15 +12,16 @@ logger = logging.getLogger(__name__) -def eliminateData(filepaths, timeForLightsTurnOn, event, sampling_rate, naming): +def eliminateData(filepath_to_timestamps, filepath_to_data, timeForLightsTurnOn, event, sampling_rate, naming): arr = np.array([]) ts_arr = np.array([]) - for i in range(len(filepaths)): - ts = read_hdf5("timeCorrection_" + naming, filepaths[i], "timestampNew") - data = read_hdf5(event, filepaths[i], "data").reshape(-1) - - # index = np.where((ts>coords[i,0]) & (ts Date: Fri, 19 Dec 2025 17:36:57 -0800 Subject: [PATCH 122/125] Cleaned up some indentation in combine_data. --- src/guppy/analysis/combine_data.py | 77 +++++++++++++++--------------- 1 file changed, 39 insertions(+), 38 deletions(-) diff --git a/src/guppy/analysis/combine_data.py b/src/guppy/analysis/combine_data.py index a63be7e..e2fb719 100644 --- a/src/guppy/analysis/combine_data.py +++ b/src/guppy/analysis/combine_data.py @@ -67,12 +67,12 @@ def eliminateTs(filepath, timeForLightsTurnOn, event, sampling_rate, naming): return ts_arr -def combine_data(filepath: list[list[str]], timeForLightsTurnOn, events, sampling_rate): +def combine_data(filepath: list[list[str]], timeForLightsTurnOn, names_for_storenames, sampling_rate): # filepath = [[folder1_output_0, folder2_output_0], [folder1_output_1, folder2_output_1], ...] logger.debug("Processing timestamps for combining data...") - storesList = events[1, :] + names_for_storenames = names_for_storenames[1, :] for single_output_filepaths in filepath: # single_output_filepaths = [folder1_output_i, folder2_output_i, ...] 
@@ -81,41 +81,42 @@ def combine_data(filepath: list[list[str]], timeForLightsTurnOn, events, samplin pair_name_to_tsNew = {} for j in range(path.shape[1]): - name_1 = ((os.path.basename(path[0, j])).split(".")[0]).split("_") - name_2 = ((os.path.basename(path[1, j])).split(".")[0]).split("_") - if name_1[-1] == name_2[-1]: - name = name_1[-1] - - for i in range(len(storesList)): - if ( - "control_" + name.lower() in storesList[i].lower() - or "signal_" + name.lower() in storesList[i].lower() - ): - filepath_to_timestamps = {} - filepath_to_data = {} - for filepath in single_output_filepaths: - ts = read_hdf5("timeCorrection_" + name, filepath, "timestampNew") - data = read_hdf5(storesList[i], filepath, "data").reshape(-1) - filepath_to_timestamps[filepath] = ts - filepath_to_data[filepath] = data - - data, timestampNew = eliminateData( - filepath_to_timestamps, - filepath_to_data, - timeForLightsTurnOn, - storesList[i], - sampling_rate, - name, - ) - write_hdf5(data, storesList[i], single_output_filepaths[0], "data") - pair_name_to_tsNew[name] = timestampNew - else: - if "control" in storesList[i].lower() or "signal" in storesList[i].lower(): - continue - else: - ts = eliminateTs( - single_output_filepaths, timeForLightsTurnOn, storesList[i], sampling_rate, name - ) - write_hdf5(ts, storesList[i] + "_" + name, single_output_filepaths[0], "ts") + name_1 = ((os.path.basename(path[0, j])).split(".")[0]).split("_")[-1] + name_2 = ((os.path.basename(path[1, j])).split(".")[0]).split("_")[-1] + if name_1 != name_2: + logger.error("Error in naming convention of files or Error in storesList file") + raise Exception("Error in naming convention of files or Error in storesList file") + pair_name = name_1 + + for i in range(len(names_for_storenames)): + if ( + "control_" + pair_name.lower() in names_for_storenames[i].lower() + or "signal_" + pair_name.lower() in names_for_storenames[i].lower() + ): + filepath_to_timestamps = {} + filepath_to_data = {} + for filepath in single_output_filepaths: + ts = read_hdf5("timeCorrection_" + pair_name, filepath, "timestampNew") + data = read_hdf5(names_for_storenames[i], filepath, "data").reshape(-1) + filepath_to_timestamps[filepath] = ts + filepath_to_data[filepath] = data + + data, timestampNew = eliminateData( + filepath_to_timestamps, + filepath_to_data, + timeForLightsTurnOn, + names_for_storenames[i], + sampling_rate, + pair_name, + ) + write_hdf5(data, names_for_storenames[i], single_output_filepaths[0], "data") + pair_name_to_tsNew[pair_name] = timestampNew + else: + if "control" in names_for_storenames[i].lower() or "signal" in names_for_storenames[i].lower(): + continue + ts = eliminateTs( + single_output_filepaths, timeForLightsTurnOn, names_for_storenames[i], sampling_rate, pair_name + ) + write_hdf5(ts, names_for_storenames[i] + "_" + pair_name, single_output_filepaths[0], "ts") for pair_name, tsNew in pair_name_to_tsNew.items(): write_hdf5(tsNew, "timeCorrection_" + pair_name, single_output_filepaths[0], "timestampNew") From d3a8fbc5c302867296f8f4b2a4bb97428e56781d Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 19 Dec 2025 17:41:22 -0800 Subject: [PATCH 123/125] Refactored read operations out of eliminateTs. 
--- src/guppy/analysis/combine_data.py | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/src/guppy/analysis/combine_data.py b/src/guppy/analysis/combine_data.py index e2fb719..6c00be6 100644 --- a/src/guppy/analysis/combine_data.py +++ b/src/guppy/analysis/combine_data.py @@ -43,11 +43,11 @@ def eliminateTs(filepath, timeForLightsTurnOn, event, sampling_rate, naming): ts_arr = np.array([]) tsNew_arr = np.array([]) for i in range(len(filepath)): - tsNew = read_hdf5("timeCorrection_" + naming, filepath[i], "timestampNew") - if os.path.exists(os.path.join(filepath[i], event + "_" + naming + ".hdf5")): - ts = read_hdf5(event + "_" + naming, filepath[i], "ts").reshape(-1) - else: - ts = np.array([]) + # tsNew = read_hdf5("timeCorrection_" + naming, filepath[i], "timestampNew") + # if os.path.exists(os.path.join(filepath[i], event + "_" + naming + ".hdf5")): + # ts = read_hdf5(event + "_" + naming, filepath[i], "ts").reshape(-1) + # else: + # ts = np.array([]) # logger.info("total time : ", tsNew[-1]) if len(tsNew_arr) == 0: @@ -114,8 +114,24 @@ def combine_data(filepath: list[list[str]], timeForLightsTurnOn, names_for_store else: if "control" in names_for_storenames[i].lower() or "signal" in names_for_storenames[i].lower(): continue + filepath_to_timestamps = {} + filepath_to_ttl_timestamps = {} + for filepath in single_output_filepaths: + tsNew = read_hdf5("timeCorrection_" + pair_name, filepath, "timestampNew") + if os.path.exists(os.path.join(filepath, names_for_storenames[i] + "_" + pair_name + ".hdf5")): + ts = read_hdf5(names_for_storenames[i] + "_" + pair_name, filepath, "ts").reshape(-1) + else: + ts = np.array([]) + filepath_to_timestamps[filepath] = tsNew + filepath_to_ttl_timestamps[filepath] = ts + ts = eliminateTs( - single_output_filepaths, timeForLightsTurnOn, names_for_storenames[i], sampling_rate, pair_name + filepath_to_timestamps, + filepath_to_ttl_timestamps, + timeForLightsTurnOn, + names_for_storenames[i], + sampling_rate, + pair_name, ) write_hdf5(ts, names_for_storenames[i] + "_" + pair_name, single_output_filepaths[0], "ts") for pair_name, tsNew in pair_name_to_tsNew.items(): From c481d953aafac5919e1afb676a019a35aa338f89 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 19 Dec 2025 17:43:50 -0800 Subject: [PATCH 124/125] Refactored read operations out of eliminateTs. 
--- src/guppy/analysis/combine_data.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/guppy/analysis/combine_data.py b/src/guppy/analysis/combine_data.py
index 6c00be6..4cddb32 100644
--- a/src/guppy/analysis/combine_data.py
+++ b/src/guppy/analysis/combine_data.py
@@ -38,11 +38,14 @@ def eliminateData(filepath_to_timestamps, filepath_to_data, timeForLightsTurnOn,
     return arr, ts_arr


-def eliminateTs(filepath, timeForLightsTurnOn, event, sampling_rate, naming):
+def eliminateTs(filepath_to_timestamps, filepath_to_ttl_timestamps, timeForLightsTurnOn, event, sampling_rate, naming):
     ts_arr = np.array([])
     tsNew_arr = np.array([])

-    for i in range(len(filepath)):
+    filepaths = list(filepath_to_timestamps.keys())
+    for filepath in filepaths:
+        tsNew = filepath_to_timestamps[filepath]
+        ts = filepath_to_ttl_timestamps[filepath]
         # tsNew = read_hdf5("timeCorrection_" + naming, filepath[i], "timestampNew")
         # if os.path.exists(os.path.join(filepath[i], event + "_" + naming + ".hdf5")):
         #     ts = read_hdf5(event + "_" + naming, filepath[i], "ts").reshape(-1)

From ebe24b64799cc603ea719d8fcbc970edd43950ec Mon Sep 17 00:00:00 2001
From: pauladkisson
Date: Fri, 19 Dec 2025 17:47:11 -0800
Subject: [PATCH 125/125] Refactored read operations out of eliminateTs.

--- src/guppy/analysis/combine_data.py | 17 ++---------------
 1 file changed, 2 insertions(+), 15 deletions(-)

diff --git a/src/guppy/analysis/combine_data.py b/src/guppy/analysis/combine_data.py
index 4cddb32..6ccddc0 100644
--- a/src/guppy/analysis/combine_data.py
+++ b/src/guppy/analysis/combine_data.py
@@ -12,7 +12,7 @@
 logger = logging.getLogger(__name__)


-def eliminateData(filepath_to_timestamps, filepath_to_data, timeForLightsTurnOn, event, sampling_rate, naming):
+def eliminateData(filepath_to_timestamps, filepath_to_data, timeForLightsTurnOn, sampling_rate):
     arr = np.array([])
     ts_arr = np.array([])

@@ -20,8 +20,6 @@ def eliminateData(filepath_to_timestamps, filepath_to_data, timeForLightsTurnOn,
     for filepath in filepaths:
         ts = filepath_to_timestamps[filepath]
         data = filepath_to_data[filepath]
-        # ts = read_hdf5("timeCorrection_" + naming, filepaths[i], "timestampNew")
-        # data = read_hdf5(event, filepaths[i], "data").reshape(-1)

         if len(arr) == 0:
             arr = np.concatenate((arr, data))
@@ -38,7 +36,7 @@ def eliminateData(filepath_to_timestamps, filepath_to_data, timeForLightsTurnOn,
     return arr, ts_arr


-def eliminateTs(filepath_to_timestamps, filepath_to_ttl_timestamps, timeForLightsTurnOn, event, sampling_rate, naming):
+def eliminateTs(filepath_to_timestamps, filepath_to_ttl_timestamps, timeForLightsTurnOn, sampling_rate):
     ts_arr = np.array([])
     tsNew_arr = np.array([])

@@ -46,13 +44,6 @@ def eliminateTs(filepath_to_timestamps, filepath_to_ttl_timestamps, timeForLight
     for filepath in filepaths:
         tsNew = filepath_to_timestamps[filepath]
         ts = filepath_to_ttl_timestamps[filepath]
-        # tsNew = read_hdf5("timeCorrection_" + naming, filepath[i], "timestampNew")
-        # if os.path.exists(os.path.join(filepath[i], event + "_" + naming + ".hdf5")):
-        #     ts = read_hdf5(event + "_" + naming, filepath[i], "ts").reshape(-1)
-        # else:
-        #     ts = np.array([])
-
-        # logger.info("total time : ", tsNew[-1])

         if len(tsNew_arr) == 0:
             sub = tsNew[0] - timeForLightsTurnOn
             tsNew_arr = np.concatenate((tsNew_arr, tsNew - sub))
@@ -108,9 +99,7 @@ def combine_data(filepath: list[list[str]], timeForLightsTurnOn, names_for_store
                     filepath_to_timestamps,
                     filepath_to_data,
                     timeForLightsTurnOn,
-                    names_for_storenames[i],
                     sampling_rate,
-                    pair_name,
                 )
                 write_hdf5(data, names_for_storenames[i], single_output_filepaths[0], "data")
                 pair_name_to_tsNew[pair_name] = timestampNew
@@ -132,9 +121,7 @@ def combine_data(filepath: list[list[str]], timeForLightsTurnOn, names_for_store
                     filepath_to_timestamps,
                     filepath_to_ttl_timestamps,
                     timeForLightsTurnOn,
-                    names_for_storenames[i],
                     sampling_rate,
-                    pair_name,
                 )
                 write_hdf5(ts, names_for_storenames[i] + "_" + pair_name, single_output_filepaths[0], "ts")
     for pair_name, tsNew in pair_name_to_tsNew.items():
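
After PATCH 125, eliminateData and eliminateTs perform no file I/O of their own: combine_data reads every timestamp array from the HDF5 output folders up front and hands the helpers plain dictionaries keyed by output folder, plus the two scalars they still need. The sketch below is a minimal illustration of that call pattern on synthetic in-memory arrays; the import path is assumed from the file paths in the diffs, and the folder names, arrays, and scalar values are made-up placeholders rather than values taken from the patches.

# Sketch only (not part of the patch series): exercising the refactored
# eliminateTs on in-memory arrays. The import path is assumed from the diff
# headers; folder names, arrays, and scalars below are illustrative placeholders.
import numpy as np

from guppy.analysis.combine_data import eliminateTs

# combine_data() now performs the read_hdf5 calls itself and passes plain dicts
# keyed by output folder: recording timestamps ("timeCorrection_<pair>") in one
# dict, TTL event timestamps in the other.
filepath_to_timestamps = {
    "output_1": np.arange(0.0, 10.0, 0.1),  # recording timestamps, session 1
    "output_2": np.arange(0.0, 12.0, 0.1),  # recording timestamps, session 2
}
filepath_to_ttl_timestamps = {
    "output_1": np.array([1.5, 4.2, 8.0]),  # TTL event times, session 1
    "output_2": np.array([2.1, 6.3]),       # TTL event times, session 2
}

# The store name and pair name are no longer arguments; only the data and the
# two scalars the function actually uses are passed.
combined_ts = eliminateTs(
    filepath_to_timestamps,
    filepath_to_ttl_timestamps,
    timeForLightsTurnOn=1.0,
    sampling_rate=100.0,
)

With the reads hoisted into combine_data, the helpers can be exercised on in-memory dictionaries like the ones above, without any HDF5 files on disk.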