diff --git a/.github/workflows/testing-code.yml b/.github/workflows/testing-code.yml
new file mode 100644
index 00000000..1bfc49fb
--- /dev/null
+++ b/.github/workflows/testing-code.yml
@@ -0,0 +1,31 @@
+name: Run Unit Tests via Pytest
+
+on: [push]
+
+jobs:
+  build:
+    permissions:
+      contents: read
+      pull-requests: write
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.9", "3.10", "3.11"]
+
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
+      - name: Test with pytest
+        run: |
+          coverage run -m pytest tests
+        continue-on-error: true
+      - name: Generate Coverage Report
+        run: |
+          coverage report -m
\ No newline at end of file
diff --git a/app/__init__.py b/app/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/app/constants.py b/app/constants.py
new file mode 100644
index 00000000..3ad5ab5a
--- /dev/null
+++ b/app/constants.py
@@ -0,0 +1,80 @@
+# Test file names
+EXAMPLE_POS_FILENAME = "1a_MZmine3_pos.csv"
+EXAMPLE_NEG_FILENAME = "1b_MZmine3_neg.csv"
+EXAMPLE_TRACER_FILENAME = "WW2DW_Tracers_Amenable.csv"
+EXAMPLE_RUN_SEQUENCE_POS_FILENAME = "WW2DW_sequence_cal.csv"
+EXAMPLE_RUN_SEQUENCE_NEG_FILENAME = "WW2DW_sequence_cal.csv"
+EXAMPLE_SURROGATE_FILENAME = "qNTA_Surrogate_Input_File_WW2DW.csv"
+
+# Define pos/neg/neutral adduct lists
+# Proton added - we observe Mass-(H+) and Mass+(Adduct)
+NEG_ADDUCT_LI = [
+    ("Cl", 35.976678),
+    ("Br", 79.926161),
+    ("HCO2", 46.005477),
+    ("CH3CO2", 60.021127),
+    ("CF3CO2", 113.992862),
+]
+
+# Proton subtracted - we observe Mass+(H+) and Mass+(Adduct)
+POS_ADDUCT_LI = [
+    ("Na", 21.981942),
+    ("K", 37.955882),
+    ("NH4", 17.026547),
+]
+
+NEUTRAL_LOSSES_LI = [
+    ("H2O", -18.010565),
+    ("2H2O", -36.02113),
+    ("3H2O", -54.031695),
+    ("4H2O", -72.04226),
+    ("5H2O", -90.052825),
+    ("NH3", -17.0265),
+    ("O", -15.99490),
+    ("CO", -29.00220),
+    ("CO2", -43.989829),
+    ("C2H4", -28.03130),
+    ("CH2O2", 46.00550),  # note here and below - not losses? but still neutral?
+    ("CH3COOH", 60.02110),
+    ("CH3OH", 32.02620),
+    ("CH3CN", 41.02650),
+    ("(CH3)2CHOH", 60.05810),
+]
+
+# Set to tested memory capacity of WebApp for number of features in 'adduct_matrix'
+MAX_NUM_ADDUCT_FEATURES = 12000
+
+# Column names accessed throughout app
+FEATURE_ID_COL = "Feature ID"
+DASHBOARD_SEARCH_COL = "For_Dashboard_Search"
+FORMULA_COL = "Formula"
+MASS_COL = "Mass"
+RETENTION_COL = "Retention_Time"
+IONIZATION_COL = "Ionization_Mode"
+MOLECULAR_FORMULA_COL = "MOLECULAR_FORMULA"
+
+# Format lists to test values against
+ALLOWED_BLANK_FORMATS_LIST = ["Blank", "blank", "BLANK", "MB", "Mb", "mb", "mB"]
+ACTIVE_COLUMNS_LIST = [
+    "Retention_Time",
+    "Mass",
+    "Ionization_Mode",
+    "Compound",
+]
+
+# Establish ordering of all possible front matter (tracer/no tracer, flags/no flags, etc.)
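+# Consumed by MPP_Ready() in app/ms1/task_functions.py to select and order the output's front-matter columns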
+FRONT_MATTER_ORDERING = [
+    "Ionization_Mode",
+    "Mass",
+    "Retention_Time",
+    "Compound",
+    "Tracer Chemical Match?",
+    "Duplicate Feature?",
+    "Is Adduct or Loss?",
+    "Has Adduct or Loss?",
+    "Adduct or Loss Info",
+    "Final Occurrence Count",
+    "Final Occurrence Percentage",
+    "Final Occurrence Count (with flags)",
+    "Final Occurrence Percentage (with flags)",
+]
\ No newline at end of file
diff --git a/app/feature/tests/test_feature.py b/app/feature/tests/test_feature.py
index 2557a591..178cc035 100644
--- a/app/feature/tests/test_feature.py
+++ b/app/feature/tests/test_feature.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 import unittest
-from Feature import Feature_MS2 as ms2
+from feature import Feature_MS2 as ms2
 from test_data import parsedMGF as mgfData
 
 #Note about test mgfData
diff --git a/app/ms1/__init__.py b/app/ms1/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/app/ms1/nta_task.py b/app/ms1/nta_task.py
index 5c869880..592eae94 100644
--- a/app/ms1/nta_task.py
+++ b/app/ms1/nta_task.py
@@ -6,6 +6,7 @@
 import traceback
 import shutil
 import json
+from typing import Union
 from datetime import datetime
 from dask.distributed import Client, LocalCluster, fire_and_forget
 from zipfile import ZipFile, ZIP_DEFLATED
@@ -35,13 +36,13 @@ def run_nta_dask(
     parameters,
-    input_dfs,
-    tracer_df=None,
-    run_sequence_pos_df=None,
-    run_sequence_neg_df=None,
-    qnta_df=None,
-    jobid="00000000",
-    verbose=True,
+    input_dfs: list[Union[pd.DataFrame, None]],
+    tracer_df: Union[pd.DataFrame, None] = None,
+    run_sequence_pos_df: Union[pd.DataFrame, None] = None,
+    run_sequence_neg_df: Union[pd.DataFrame, None] = None,
+    qnta_df: Union[pd.DataFrame, None] = None,
+    jobid: str = "00000000",
+    verbose: bool = True,
 ):
     in_docker = os.environ.get("IN_DOCKER") != "False"
     mongo_address = os.environ.get("MONGO_SERVER")
@@ -96,14 +97,14 @@ def run_nta(
     parameters,
-    input_dfs,
-    tracer_df=None,
-    run_sequence_pos_df=None,
-    run_sequence_neg_df=None,
-    qnta_df=None,
-    mongo_address=None,
-    jobid="00000000",
-    verbose=True,
+    input_dfs: list[Union[pd.DataFrame, None]],
+    tracer_df: Union[pd.DataFrame, None] = None,
+    run_sequence_pos_df: Union[pd.DataFrame, None] = None,
+    run_sequence_neg_df: Union[pd.DataFrame, None] = None,
+    qnta_df: Union[pd.DataFrame, None] = None,
+    jobid: str = "00000000",
+    verbose: bool = True,
+    mongo_address: Union[str, None] = None,
     in_docker=True,
 ):
     nta_run = NtaRun(
@@ -140,7 +141,7 @@ def __init__(
         run_sequence_pos_df=None,
         run_sequence_neg_df=None,
         qnta_df=None,
-        mongo_address=None,
+        mongo_address: Union[str, None] = None,
         jobid="00000000",
         verbose=True,
         in_docker=True,
@@ -406,7 +407,7 @@ def check_existence_of_mass_column(self, input_dfs):
 
         return
 
-    def check_retention_time_column(self, input_dfs):
+    def check_retention_time_column(self, input_dfs: list[Union[pd.DataFrame, None]]):
         """
         Check for the existence of alternate spellings of 'Retention_Time' column in input dataframes and rename to "Retention_Time".
@@ -609,10 +610,10 @@ def pass_through_cols(self): self.pass_through = [ task_fun.passthrucol(df, self.all_headers)[0] if df is not None else None for df in self.dfs ] - self.dfs = [task_fun.passthrucol(df, self.all_headers)[1] if df is not None else None for df in self.dfs] + self.dfs: list[Union[pd.DataFrame, None]] = [task_fun.passthrucol(df, self.all_headers)[1] if df is not None else None for df in self.dfs] return - def filter_void_volume(self, min_rt): + def filter_void_volume(self, min_rt: float): """ Accesses self.dfs (list of dataframes) and self.parameters["minimum_rt"][1] then removes all rows with a value below "minimum_rt" in the "Retention_Time" @@ -624,7 +625,7 @@ def filter_void_volume(self, min_rt): None """ # Iterate through dfs, removing rows where "Retention_Time" is below min_rt threshold - self.dfs = [df.loc[df["Retention_Time"] > min_rt].copy() if df is not None else None for df in self.dfs] + self.dfs: list[Union[pd.DataFrame, None]] = [df.loc[df["Retention_Time"] > min_rt].copy() if df is not None else None for df in self.dfs] return def filter_duplicates(self): @@ -641,11 +642,11 @@ def filter_duplicates(self): None """ # Get ppm, mass_accuracy, and rt_accuracy parameters - ppm = self.parameters["mass_accuracy_units"][1] == "ppm" + ppm: bool = self.parameters["mass_accuracy_units"][1] == "ppm" mass_accuracy = float(self.parameters["mass_accuracy"][1]) rt_accuracy = float(self.parameters["rt_accuracy"][1]) # Perform duplicate flagging functions - self.dfs = [ + self.dfs: list[Union[pd.DataFrame, None]] = [ task_fun.duplicates(df, mass_accuracy, rt_accuracy, ppm, self.blank_headers, self.sample_headers) if df is not None else None @@ -677,7 +678,7 @@ def calc_statistics(self): # Iterate through dfs, calling chunk_stats() function # NTAW-49: Raises custom ValueError if blank columns are improperly named in the input dataframes try: - self.dfs = [ + self.dfs: list[Union[pd.DataFrame, None]] = [ task_fun.chunk_stats( df, min_blank_detection_percentage, @@ -831,7 +832,7 @@ def check_tracers(self): ) for df in self.dfs ] - self.dfs = [ + self.dfs: list[Union[pd.DataFrame, None]] = [ ( task_fun.check_feature_tracers( df, @@ -1007,12 +1008,12 @@ def clean_features(self): task_fun.clean_features(df, controls, tracer_df=tracer_df_bool)[2] if df is not None else None for index, df in enumerate(self.dfs) ] - self.dfs = [ + self.dfs: list[Union[pd.DataFrame, None]] = [ task_fun.clean_features(df, controls, tracer_df=tracer_df_bool)[0] if df is not None else None for index, df in enumerate(self.dfs) ] # subtract blanks from means - self.dfs = [task_fun.Blank_Subtract_Mean(df) if df is not None else None for index, df in enumerate(self.dfs)] + self.dfs: list[Union[pd.DataFrame, None]] = [task_fun.Blank_Subtract_Mean(df) if df is not None else None for index, df in enumerate(self.dfs)] # subtract blanks from means self.dfs_flagged = [ task_fun.Blank_Subtract_Mean(df) if df is not None else None for index, df in enumerate(self.dfs_flagged) diff --git a/app/ms1/task_functions.py b/app/ms1/task_functions.py index fe5d38f5..fe60f2e8 100644 --- a/app/ms1/task_functions.py +++ b/app/ms1/task_functions.py @@ -1,3 +1,4 @@ +from typing import Literal import pandas as pd import numpy as np from operator import itemgetter @@ -6,8 +7,10 @@ import os import re import logging +from typing import Union from openpyxl.utils import get_column_letter import io +from ..constants import * logger = logging.getLogger("nta_app.ms1") @@ -39,7 +42,7 @@ def assign_feature_id(df_in, start=1): # Adjust list 
based on start
     to_assign = [x + start for x in row_nums]
     # Insert column at the front of df
-    df.insert(0, "Feature ID", to_assign.copy())
+    df.insert(0, FEATURE_ID_COL, to_assign.copy())
     # Return df
     return df
@@ -54,7 +57,7 @@ def differences(s1, s2):
     Outputs:
         count (int, # of characters different between s1 and s2)
     """
-    # Replace special characters in s1 and s1 (not underscores or dashes)
+    # Replace special characters in s1 and s2 (not underscores or dashes)
     s1 = re.sub(re.compile(r"\([^)]*\)"), "", s1)
     s2 = re.sub(re.compile(r"\([^)]*\)"), "", s2)
     # Count up different characters between s1 and s2, plus difference in string length
@@ -62,7 +65,7 @@
     # count = sum(1 for a, b in zip(s1, s2) if a != b) + abs(len(s1) - len(s2))
     mytup = tuple(zip(s1, s2))
     count = abs(len(s1) - len(s2))
-    diff_index = None  # This value is only important if the final count ==1
+    diff_index = None  # This value is only important if the final count == 1
     for i in range(len(mytup)):
         if mytup[i][0] != mytup[i][1]:
             count += 1
@@ -79,7 +82,7 @@
     return count
 
 
-def formulas(df):
+def formulas(df: pd.DataFrame):
     """
     Return list of formulas tagged 'For_Dashboard_Search'
@@ -89,16 +92,16 @@
         formulas_list (list)
     """
     # Remove Formula duplicates, keeping the first
-    df.drop_duplicates(subset="Formula", keep="first", inplace=True)
+    df.drop_duplicates(subset=FORMULA_COL, keep="first", inplace=True)
     # Subset df by items selected for Dashboard search
-    formulas = df.loc[df["For_Dashboard_Search"] == "1", "Formula"].values
+    formulas = df.loc[df[DASHBOARD_SEARCH_COL] == "1", FORMULA_COL].values
     # Get formulas in list
     formulas_list = [str(i) for i in formulas]
     # Return list
     return formulas_list
 
 
-def masses(df):
+def masses(df: pd.DataFrame):
     """
     Return list of masses tagged 'For_Dashboard_Search'
@@ -108,7 +111,7 @@
         masses_list (list)
     """
     # Subset df by items selected for Dashboard search
-    masses = df.loc[df["For_Dashboard_Search"] == "1", "Mass"].values
+    masses = df.loc[df[DASHBOARD_SEARCH_COL] == "1", MASS_COL].values
     # Update logger
     logger.info("# of masses for dashboard search: {} out of {}".format(len(masses), len(df)))
     # Get masses in list
@@ -117,7 +120,7 @@
     return masses_list
 
 
-def parse_headers(df_in):
+def parse_headers(df_in: pd.DataFrame):
     """
     A function to group the dataframe's column headers into sets of similar names
     which represent replicates
@@ -138,7 +141,7 @@
     # Iterate through list of columns, calling differences() function
     # When differences() return is greater than some value, increase countD (group assigner)
     for s in range(0, len(headers) - 1):
-        if differences(str(headers[s]), str(headers[s + 1])) < 2:  # 2 is more common
+        if differences(str(headers[s]), str(headers[s + 1])) < 2:  # 2 is more common. TODO: this threshold may be a weak point of differences()
            countS += 1
        if differences(str(headers[s]), str(headers[s + 1])) >= 2:
            countD += 1
@@ -152,7 +155,7 @@
     # Group lists of columns by group assigner (countD)
     groups = groupby(new_headers, itemgetter(1))
     # Extract column names from group tuples
-    new_headers_list = [[item[0] for item in data] for (key, data) in groups]
+    new_headers_list: list[list[str]] = [[item[0] for item in data] for (key, data) in groups]
     # Check that replicate samples are present. Raise IndexError if no replicate samples are found.
     max_group_size = 0
     for item in new_headers_list:
@@ -167,17 +170,15 @@
 # NTAW-594
-def get_sample_and_blank_headers(dfs):
+def get_sample_and_blank_headers(dfs: tuple[Union[pd.DataFrame, None], Union[pd.DataFrame, None]]):
     if dfs[0] is not None:
         all_headers = parse_headers(dfs[0])
     else:
         all_headers = parse_headers(dfs[1])
     # get all header groups
     header_groups = [item for item in all_headers if (len(item) > 1)]
-    # get blank headers
-    allowed_blank_formats = ["Blank", "blank", "BLANK", "MB", "Mb", "mb", "mB"]
     # Should be more than one blank in group, so blank_headers uses header_groups
-    blank_headers = [item for item in header_groups if any(x in head for head in item for x in allowed_blank_formats)]
+    blank_headers = [item for item in header_groups if any(x in head for head in item for x in ALLOWED_BLANK_FORMATS_LIST)]
     # get sample headers
     sample_headers = [item for item in header_groups if not any(item == x for x in blank_headers)]
@@ -187,7 +188,7 @@
 """PASS-THROUGH COLUMNS FUNCTION"""
 
 
-def passthrucol(df_in, all_headers):
+def passthrucol(df_in: pd.DataFrame, all_headers: list[list[str]]):
     """
     Find all columns in dfs that aren't necessary (i.e., not Mass and RT) and
     store these columns to be later appended to the output -- TMF 11/20/23
@@ -200,21 +201,14 @@
     """
     # Make a copy of the input df
     df = df_in.copy()
-    # Define active_cols: Keep 'Feature ID' in pt_headers to merge later
-    active_cols = [
-        "Retention_Time",
-        "Mass",
-        "Ionization_Mode",
-        "Compound",
-    ]
     # Create list of pass through headers that are not in the active columns
-    pt_headers = ["Feature ID"] + [
+    pt_headers = [FEATURE_ID_COL] + [
         item
         for sublist in all_headers
         for item in sublist
-        if len(sublist) == 1 and not any(x in sublist for x in active_cols)
+        if len(sublist) == 1 and not any(x in sublist for x in ACTIVE_COLUMNS_LIST)
     ]
-    headers = ["Feature ID"] + [
+    headers = [FEATURE_ID_COL] + [
         item for sublist in all_headers for item in sublist if not any(x in item for x in pt_headers)
     ]
     # Save pass through columns in df
@@ -227,7 +221,7 @@
 """ADDUCT IDENTIFICATION FUNCTIONS"""
 
 
-def adduct_matrix(df, a_name, delta, Mass_Difference, Retention_Difference, ppm):
+def adduct_matrix(df: pd.DataFrame, a_name: str, delta: float, Mass_Difference: float, Retention_Difference: float, ppm: bool):
     """
     Modified version of Jeff's 'adduct_identifier' function.
     This function executes the matrix portion of the old function -- TMF 10/27/23
@@ -243,9 +237,9 @@
         df (dataframe, with adduct information added to columns)
     """
     # 'Mass' to matrix, 'Retention Time' to matrix, 'Feature ID' to matrix
-    mass = df["Mass"].to_numpy()
-    rts = df["Retention_Time"].to_numpy()
-    ids = df["Feature ID"].to_numpy()
+    mass = df[MASS_COL].to_numpy()
+    rts = df[RETENTION_COL].to_numpy()
+    ids = df[FEATURE_ID_COL].to_numpy()
     # Reshape 'masses', 'rts', and 'ids'
     masses_vector = np.reshape(mass, (len(mass), 1))
     rts_vector = np.reshape(rts, (len(rts), 1))
@@ -320,7 +314,7 @@
     return df
 
 
-def collapse_adduct_id_array(the_array, delta_name):
+def collapse_adduct_id_array(the_array: np.ndarray, delta_name: str):
     """
     Helper function that collapses each row of the adduct ID matrix into
     a string containing all matches
@@ -342,7 +336,7 @@
     return adduct_info_str
 
 
-def window_size(df_in, mass_diff_mass=112.985586):
+def window_size(df_in: pd.DataFrame, mass_diff_mass=112.985586):
     """
     # Estimate a sliding window size from the input data by finding the maximum
     distance between indices differing by 'mass_diff_mass' -- TMF 10/27/23
@@ -374,7 +368,7 @@
     return val
 
 
-def chunk_adducts(df_in, n, step, a_name, delta, Mass_Difference, Retention_Difference, ppm):
+def chunk_adducts(df_in: pd.DataFrame, n: int, step: int, a_name: str, delta: float, Mass_Difference: float, Retention_Difference: float, ppm: bool):
     """
     Function that takes the input data, chunks it based on window size, then loops
     through chunks and sends them to 'adduct_matrix' for calculation -- TMF 10/27/23
@@ -397,7 +391,7 @@
     to_test_list = [df[i : i + n] for i in range(0, df.shape[0], step)]
     to_test_list = [i for i in to_test_list if (i.shape[0] > n / 2)]
     # Create list, iterate through df chunks and append results to list
-    li = []
+    li: list[pd.DataFrame] = []
     for x in to_test_list:
         dum = adduct_matrix(x, a_name, delta, Mass_Difference, Retention_Difference, ppm)
         li.append(dum)
@@ -407,12 +401,14 @@
     return output
 
 
-def adduct_identifier(df_in, adduct_selections, Mass_Difference, Retention_Difference, ppm, ionization):
+def adduct_identifier(df_in: pd.DataFrame, adduct_selections: list[tuple[str, float]], Mass_Difference: float, Retention_Difference: float, ppm: bool, ionization: str):
     """
     Function that does the front-end of the old 'adduct_identifier'; we trim the input data
     by identifying features that are near to adduct distance from another feature. This
     shortened dataframe is used to calculate a window size, then loop through possible
     adducts, passing to 'chunk_adducts' -- TMF 10/27/23
+    TODO: Add 'ionization' to the Inputs list in this docstring.
+ Inputs: df_in (dataframe) adduct_selections (list of tuples, contains adduct names and masses selected by user) @@ -429,50 +425,17 @@ def adduct_identifier(df_in, adduct_selections, Mass_Difference, Retention_Diffe df["Rounded RT"] = df["Retention_Time"].round(1) # Create tuple of 'Rounded RT' and 'Rounded Mass' df["Rounded_RT_Mass_Pair"] = list(zip(df["Rounded RT"], df["Rounded Mass"])) - # Define pos/neg/neutral adduct lists - # Proton subtracted - we observe Mass+(H+) and Mass+(Adduct) - pos_adduct_li = [ - ("Na", 21.981942), - ("K", 37.955882), - ("NH4", 17.026547), - ] - # Proton added - we observe Mass-(H+) and Mass+(Adduct) - neg_adduct_li = [ - ("Cl", 35.976678), - ("Br", 79.926161), - ("HCO2", 46.005477), - ("CH3CO2", 60.021127), - ("CF3CO2", 113.992862), - ] - # no change to neutral losses - neutral_losses_li = [ - ("H2O", -18.010565), - ("2H2O", -36.02113), - ("3H2O", -54.031695), - ("4H2O", -72.04226), - ("5H2O", -90.052825), - ("NH3", -17.0265), - ("O", -15.99490), - ("CO", -29.00220), - ("CO2", -43.989829), - ("C2H4", -28.03130), - ("CH2O2", 46.00550), # note here and below - not losses? but still neutral? - ("CH3COOH", 60.02110), - ("CH3OH", 32.02620), - ("CH3CN", 41.02650), - ("(CH3)2CHOH", 60.05810), - ] # Determine possible adduct dictionary according to ionization if ionization == "positive": - possible_adduct_deltas = [item for item in pos_adduct_li if item[0] in adduct_selections[0]] + possible_adduct_deltas = [item for item in POS_ADDUCT_LI if item[0] in adduct_selections[0]] possible_adduct_deltas = possible_adduct_deltas + [ - item for item in neutral_losses_li if item[0] in adduct_selections[2] + item for item in NEUTRAL_LOSSES_LI if item[0] in adduct_selections[2] ] possible_adduct_deltas = dict(possible_adduct_deltas) else: - possible_adduct_deltas = [item for item in neg_adduct_li if item[0] in adduct_selections[1]] + possible_adduct_deltas = [item for item in NEG_ADDUCT_LI if item[0] in adduct_selections[1]] possible_adduct_deltas = possible_adduct_deltas + [ - item for item in neutral_losses_li if item[0] in adduct_selections[2] + item for item in NEUTRAL_LOSSES_LI if item[0] in adduct_selections[2] ] possible_adduct_deltas = dict(possible_adduct_deltas) # Create empty list to hold mass shift/RT tuples @@ -500,18 +463,16 @@ def adduct_identifier(df_in, adduct_selections, Mass_Difference, Retention_Diffe to_test["Has Adduct or Loss?"] = 0 to_test["Is Adduct or Loss?"] = 0 to_test["Adduct or Loss Info"] = "" - # Set 'n' to tested memory capacity of WebApp for number of features in 'adduct_matrix' - n = 12000 - # If 'to_test' is less than n, send it straight to 'adduct_matrix' - if to_test.shape[0] <= n: + # If 'to_test' is less than MAX_NUM_ADDUCT_FEATURES, send it straight to 'adduct_matrix' + if to_test.shape[0] <= MAX_NUM_ADDUCT_FEATURES: for a_name, delta in possible_adduct_deltas.items(): to_test = adduct_matrix(to_test, a_name, delta, Mass_Difference, Retention_Difference, ppm) # Else, calculate the moving window size and send 'to_test' to 'chunk_adducts' else: - step = n - window_size(to_test) + step = MAX_NUM_ADDUCT_FEATURES - window_size(to_test) # Loop through possible adducts, perform 'adduct_matrix' for a_name, delta in possible_adduct_deltas.items(): - to_test = chunk_adducts(to_test, n, step, a_name, delta, Mass_Difference, Retention_Difference, ppm) + to_test = chunk_adducts(to_test, MAX_NUM_ADDUCT_FEATURES, step, a_name, delta, Mass_Difference, Retention_Difference, ppm) # Concatenate 'Has Adduct or Loss?', 'Is Adduct or Loss?', 'Adduct or Loss 
Info' to df
     df_in = pd.merge(
         df_in,
@@ -618,7 +579,7 @@
     return output
 
 
-def duplicates(df_in, mass_cutoff, rt_cutoff, ppm, blank_headers, sample_headers):
+def duplicates(df_in: pd.DataFrame, mass_cutoff: float, rt_cutoff: float, ppm: bool, blank_headers: list[list[str]], sample_headers: list[list[str]]):
     """
     Drop duplicates from input dataframe, based on mass_cutoff and rt_cutoff.
     Includes logic statement for determining if the dataframe is too large to
@@ -632,7 +593,7 @@
         df_in (dataframe)
         mass_cutoff (float, value for determining if masses are close enough)
         rt_cutoff (float, value for determining if rts are close enough)
-        ppm (int, binary yes/no for using ppm as units)
+        ppm (bool, whether ppm is used as the mass accuracy units)
     Outputs:
         output (dataframe, dataframe with duplicate flag column added)
     """
@@ -667,7 +628,7 @@
 """CALCULATE STATISTICS FUNCTIONS"""
 
 
-def statistics(df_in, blank_headers, sample_headers):
+def statistics(df_in: pd.DataFrame, blank_headers: list[list[str]], sample_headers: list[list[str]]):
     """
     Calculates statistics (mean, median, std, CV, N_Abun, & Percent Abun) on
     the dataframe. Includes logic statement for determining if the dataframe is
@@ -1486,7 +1447,8 @@
     # Return df (data), df_flagged (data + flagged data)
     return df, df_flagged
-
+# TODO: tracer_df as defined in nta_task should be a DataFrame or None. If we want this to be a boolean, we should phrase the parameter name as
+# a yes-no question, e.g. has_tracer_df, which would be False if tracer_df was None.
 def clean_features(df_in, controls, tracer_df=False):
     """
     Function that removes (blanks out) observations at feature and occurrence level
@@ -1577,7 +1539,7 @@
     return df, docs, df_flagged
 
 
-def Blank_Subtract_Mean(df_in):
+def Blank_Subtract_Mean(df_in: pd.DataFrame):
     """
     Calculate the mean blank intensity for each feature and subtract that
     value from each sample's mean value for that feature.
@@ -1607,7 +1569,7 @@
 """FUNCTIONS FOR COMBINING DATAFRAMES / FILE PREPARATION"""
 
 
-def combine(df1, df2):
+def combine(df1: Union[pd.DataFrame, None], df2: Union[pd.DataFrame, None]):
     """
     Function to combine positive and negative mode dataframes into df_combined
@@ -1628,7 +1590,7 @@
     # Get column names
     columns = dfc.columns.values.tolist()
     # Drop duplicates (should not be any)
-    dfc = dfc.drop_duplicates(subset=["Mass", "Retention_Time"])
+    dfc = dfc.drop_duplicates(subset=[MASS_COL, RETENTION_COL])
     # Get sample Means
     Mean_list = dfc.columns[
         (dfc.columns.str.contains(pat="Mean ") == True)
@@ -1638,12 +1600,12 @@
     dfc["N_Abun_Samples"] = dfc[Mean_list].count(axis=1, numeric_only=True)
     dfc["Mean_Abun_Samples"] = dfc[Mean_list].median(axis=1, skipna=True).round(0)
     # Sort by 'Mass' and 'Retention_Time'
-    dfc = dfc[columns].sort_values(["Mass", "Retention_Time"], ascending=[True, True])
+    dfc: pd.DataFrame = dfc[columns].sort_values([MASS_COL, RETENTION_COL], ascending=[True, True])
     # Return combined dataframe
     return dfc
 
-
-def combine_doc(doc1, doc2, tracer_df=False):
+# TODO: Is tracer_df necessary if it is unused?
+def combine_doc(doc1: Union[pd.DataFrame, None], doc2: Union[pd.DataFrame, None], tracer_df=False): """ Function to combine positive and negative mode docs for filter_documentation sheet @@ -1654,27 +1616,25 @@ def combine_doc(doc1, doc2, tracer_df=False): Outputs: dfc (dataframe, doc1 and doc2 combined) """ - # Define blank sub-strings - blanks = ["MB", "mb", "mB", "Mb", "blank", "Blank", "BLANK"] # Recombine doc and dupe if doc1 is not None and doc2 is not None: # Get Mean columns for blanks and samples Mean = doc1.columns[doc1.columns.str.contains(pat="Mean ")].tolist() - Mean_Samples = [md for md in Mean if not any(x in md for x in blanks)] - Mean_MB = [md for md in Mean if any(x in md for x in blanks)] + Mean_Samples = [md for md in Mean if not any(x in md for x in ALLOWED_BLANK_FORMATS_LIST)] + Mean_MB = [md for md in Mean if any(x in md for x in ALLOWED_BLANK_FORMATS_LIST)] dfc = pd.concat([doc1, doc2], sort=True) # fixing pandas FutureWarning dfc = dfc.reindex(columns=doc1.columns) elif doc1 is not None: # Get Mean columns for blanks and samples Mean = doc1.columns[doc1.columns.str.contains(pat="Mean ")].tolist() - Mean_Samples = [md for md in Mean if not any(x in md for x in blanks)] - Mean_MB = [md for md in Mean if any(x in md for x in blanks)] + Mean_Samples = [md for md in Mean if not any(x in md for x in ALLOWED_BLANK_FORMATS_LIST)] + Mean_MB = [md for md in Mean if any(x in md for x in ALLOWED_BLANK_FORMATS_LIST)] dfc = doc1.copy() else: # Get Mean columns for blanks and samples Mean = doc2.columns[doc2.columns.str.contains(pat="Mean ")].tolist() - Mean_Samples = [md for md in Mean if not any(x in md for x in blanks)] - Mean_MB = [md for md in Mean if any(x in md for x in blanks)] + Mean_Samples = [md for md in Mean if not any(x in md for x in ALLOWED_BLANK_FORMATS_LIST)] + Mean_MB = [md for md in Mean if any(x in md for x in ALLOWED_BLANK_FORMATS_LIST)] dfc = doc2.copy() # Select columns for keeping, with tracer conditional @@ -1699,12 +1659,12 @@ def combine_doc(doc1, doc2, tracer_df=False): dfc = dfc[cols] dfc.rename({"BlkStd_cutoff": "Selected MRL"}, axis=1, inplace=True) # Sort by 'Mass' and 'Retention_Time' - dfc = dfc.sort_values(["Feature ID"], ascending=[True]) + dfc = dfc.sort_values([FEATURE_ID_COL], ascending=[True]) # Return filter_documentation dataframe with removed duplicates appended return dfc -def MPP_Ready(dfc, pts, blank_headers, sample_headers): +def MPP_Ready(dfc: pd.DataFrame, pts: list[Union[pd.DataFrame, None]], blank_headers: list[list[str]], sample_headers: list[list[str]]): """ Function that re-combines the pass-through columns with the processed dataframe plus some final column sorting. 
@@ -1719,13 +1679,13 @@
     # Assign pass through columns to pt_cols for re_org
     if pts[0] is not None and pts[1] is not None:
         pt_com = pd.concat([pts[0], pts[1]], axis=0)
-        dfc = pd.merge(dfc, pt_com, how="left", on=["Feature ID"])
+        dfc = pd.merge(dfc, pt_com, how="left", on=[FEATURE_ID_COL])
         pt_cols = pts[0].columns.tolist()
     elif pts[0] is not None:
-        dfc = pd.merge(dfc, pts[0], how="left", on=["Feature ID"])
+        dfc = pd.merge(dfc, pts[0], how="left", on=[FEATURE_ID_COL])
         pt_cols = pts[0].columns.tolist()
     else:
-        dfc = pd.merge(dfc, pts[1], how="left", on=["Feature ID"])
+        dfc = pd.merge(dfc, pts[1], how="left", on=[FEATURE_ID_COL])
         pt_cols = pts[1].columns.tolist()
 
     # Get raw sample headers
@@ -1733,35 +1693,19 @@
     raw_samples = [item for sublist in sample_groups for item in sublist] + ["MRL (3x)", "MRL (5x)", "MRL (10x)"]
     # Get blank subtracted means
     blank_subtracted_means = dfc.columns[dfc.columns.str.contains(pat="BlankSub")].tolist()
-    # Establish ordering of all possible front matter (tracer/no tracer, flags/no flags, etc.)
-    ordering = [
-        "Ionization_Mode",
-        "Mass",
-        "Retention_Time",
-        "Compound",
-        "Tracer Chemical Match?",
-        "Duplicate Feature?",
-        "Is Adduct or Loss?",
-        "Has Adduct or Loss?",
-        "Adduct or Loss Info",
-        "Final Occurrence Count",
-        "Final Occurrence Percentage",
-        "Final Occurrence Count (with flags)",
-        "Final Occurrence Percentage (with flags)",
-    ]
     # Get dfc columns in list
     all_cols = dfc.columns.tolist()
     # Front matter list comp
-    front_matter = [item for item in ordering if item in all_cols]
+    front_matter = [item for item in FRONT_MATTER_ORDERING if item in all_cols]
     # Generate full column list
     cols = pt_cols + front_matter + raw_samples + blank_subtracted_means
-    # Subset dft with correct columns / ordering
+    # Subset dfc with correct column ordering
     dfc = dfc[cols]
     # Rename columns
-    dfc["Ionization_Mode"] = dfc["Ionization_Mode"].replace("Esi+", "ESI+")
-    dfc["Ionization_Mode"] = dfc["Ionization_Mode"].replace("Esi-", "ESI-")
+    dfc[IONIZATION_COL] = dfc[IONIZATION_COL].replace("Esi+", "ESI+")
+    dfc[IONIZATION_COL] = dfc[IONIZATION_COL].replace("Esi-", "ESI-")
     dfc.rename(
-        {"Ionization_Mode": "Ionization Mode", "Retention_Time": "Retention Time"},
+        {IONIZATION_COL: "Ionization Mode", RETENTION_COL: "Retention Time"},
         axis=1,
         inplace=True,
     )
@@ -1769,7 +1713,7 @@
     return dfc
 
 
-def calc_toxcast_percent_active(df):
+def calc_toxcast_percent_active(df: pd.DataFrame):
     """
     Function that calculates toxcast percent active values.
@@ -1801,7 +1745,7 @@
     return dft
 
 
-def determine_string_width(input_string):
+def determine_string_width(input_string: str):
     """
     The following function calculates a "width" of a string based on the characters within, as some
     characters are large, medium or skinny. These widths are used to determine the spacing of the group
@@ -1885,6 +1829,7 @@
     for j in range(len(input_string)):
         if input_string[j] in big_letters:
             temp_increment = temp_increment + big_increment
+            # TODO: are these print statements still necessary?
print("big") elif input_string[j] in medium_letters: temp_increment = temp_increment + medium_increment @@ -1896,7 +1841,7 @@ def determine_string_width(input_string): return temp_increment -def chunk_dataframe(df, chunk_size): +def chunk_dataframe(df: pd.DataFrame, chunk_size: int): """ Function for splitting a dataframe into chunks for printing into separate sheets of an excel workbook. @@ -1914,7 +1859,7 @@ def chunk_dataframe(df, chunk_size): yield df[i * chunk_size : (i + 1) * chunk_size] -def create_excel_book(d, chem_res=False): +def create_excel_book(d: dict[str, pd.DataFrame], chem_res=False): """ Function for creating excel book from python dictionary, where dict keys are sheet names and dict items (dfs) are sheet contents. @@ -1961,7 +1906,7 @@ def create_excel_book(d, chem_res=False): return excel_data -def DSSTox_atom_filtering(df_in, atom_ranges): +def DSSTox_atom_filtering(df_in: pd.DataFrame, atom_ranges: list[dict]): """ Function that takes a dataframe of returned candidates from searching DSSTox and user submitted ranges for atoms (CHONPS, Halogens, and other potential elements). @@ -1977,7 +1922,7 @@ def DSSTox_atom_filtering(df_in, atom_ranges): # Copy input dataframe df = df_in.copy() # Drop candidates with no formula information - df = df.loc[~df["MOLECULAR_FORMULA"].isna(), :] + df = df.loc[~df[MOLECULAR_FORMULA_COL].isna(), :] # Create separate 'atom_ranges' into 'to_search' and 'to_exclude' to_search = [item for item in atom_ranges if (item["max"] - item["min"]) > 0] to_exclude = [item["element"] for item in atom_ranges if (item["max"] - item["min"]) <= 0] @@ -1986,11 +1931,11 @@ def DSSTox_atom_filtering(df_in, atom_ranges): # is stored in a new "{element} filter check" column (1 - pass, 0 - fail) for item in to_search: col = item["element"] + " filter check" - df[col] = df["MOLECULAR_FORMULA"].apply( + df[col] = df[MOLECULAR_FORMULA_COL].apply( lambda x: formula_atom_count(x, item["element"], item["min"], item["max"]) ) # Flag candidates with elements from 'to_exclude' - df["Pass excluded elements filter?"] = df["MOLECULAR_FORMULA"].apply(lambda x: formula_exclude(x, to_exclude)) + df["Pass excluded elements filter?"] = df[MOLECULAR_FORMULA_COL].apply(lambda x: formula_exclude(x, to_exclude)) # Get '{element} filter check' columns in list cols = [col for col in df.columns if " filter check" in col] + ["Pass excluded elements filter?"] # Keep rows that pass for all '{element} filter check' columns and excluded elements @@ -2002,11 +1947,11 @@ def DSSTox_atom_filtering(df_in, atom_ranges): def formula_atom_count( - formula, - element, - minimum, - maximum, -): + formula: str, + element: str, + minimum: int, + maximum: int, +) -> Literal[0, 1]: """ Function that takes in a chemical formula string, an element string, a minimum integer, and a maximum integer. The function finds the element string @@ -2046,9 +1991,9 @@ def formula_atom_count( def formula_exclude( - formula, - element_li, -): + formula: str, + element_li: list[str], +) -> Literal[0, 1]: """ Function that takes in a chemical formula string and an element string. 
The function searches for the element string in the chemical formula string, diff --git a/input/ms1/1a_MZmine3_pos.csv b/input/ms1/1a_MZmine3_pos.csv index 66d94d24..fff5d797 100644 --- a/input/ms1/1a_MZmine3_pos.csv +++ b/input/ms1/1a_MZmine3_pos.csv @@ -1,4 +1,4 @@ -MZmine_ID,MB1,MB2,MB3,MB4,MB5,D1S1_1,D1S1_2,D1S1_3,D1S2_1,D1S2_2,D1S2_3,D1S3_1,D1S3_2,D1S3_3,D1S4_1,D1S4_2,D1S4_3,D2S1_1,D2S1_2,D2S1_3,D2S2_1,D2S2_2,D2S2_3,D2S3_1,D2S3_2,D2S3_3,D2S4_1,D2S4_2,D2S4_3,D3S1_1,D3S1_2,D3S1_3,D3S2_1,D3S2_2,D3S2_3,D3S3_1,D3S3_2,D3S3_3,D3S4_1,D3S4_2,D3S4_3,Pooled_1,Pooled_2,Pooled_3,Pooled_4,10ppb_1,10ppb_2,100ppb_1,100ppb_2,250ppb_1,250ppb_2,250ppb_3,250ppb_4,500ppb_1,500ppb_2,1000ppb_1,1000ppb_2,m/z,Retention Time,Mass,Ionization mode +MZmine_ID,MB1,MB2,MB3,MB4,MB5,D1S1_1,D1S1_2,D1S1_3,D1S2_1,D1S2_2,D1S2_3,D1S3_1,D1S3_2,D1S3_3,D1S4_1,D1S4_2,D1S4_3,D2S1_1,D2S1_2,D2S1_3,D2S2_1,D2S2_2,D2S2_3,D2S3_1,D2S3_2,D2S3_3,D2S4_1,D2S4_2,D2S4_3,D3S1_1,D3S1_2,D3S1_3,D3S2_1,D3S2_2,D3S2_3,D3S3_1,D3S3_2,D3S3_3,D3S4_1,D3S4_2,D3S4_3,Pooled_1,Pooled_2,Pooled_3,Pooled_4,10ppb_1,10ppb_2,100ppb_1,100ppb_2,250ppb_1,250ppb_2,250ppb_3,250ppb_4,500ppb_1,500ppb_2,1000ppb_1,1000ppb_2,m/z,Retention Time,Mass,Ionization Mode 3268,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,195.77501,,,179.43219,,,,,,,,,,,168.7952,168.7952,58.06545151,9.240864,57.05817551,Esi+ 174,,,,,,,,,,,,,,,,,,,,,1091.0004,1212.4932,960.021,855.3134,,,,,,,,,,,,903.57697,,,,,,615.77826,666.35693,638.09827,522.0698,,,,,,,,,,,,,61.00808652,0.7318255,60.00081052,Esi+ 1656,301.16437,238.65865,,172.99602,124.241974,185.96703,,126.823105,172.96233,142.18542,108.34602,151.76581,,124.848045,158.36009,,154.1734,,152.28131,116.609276,,128.06255,,137.65839,,104.11684,,168.83298,114.57462,,,,,131.19362,,,124.75299,,,186.35593,116.801285,242.66692,,149.98123,110.991875,647.5779,647.5779,494.74332,494.74332,406.00397,171.5949,146.74963,114.577896,385.40845,385.40845,314.1869,314.1869,69.03398487,6.4335365,68.02670887,Esi+ diff --git a/input/ms1/1b_MZmine3_neg.csv b/input/ms1/1b_MZmine3_neg.csv index ae53889d..1bc7e737 100644 --- a/input/ms1/1b_MZmine3_neg.csv +++ b/input/ms1/1b_MZmine3_neg.csv @@ -1,4 +1,4 @@ -MZmine_ID,MB1,MB2,MB3,MB4,MB5,D1S1_1,D1S1_2,D1S1_3,D1S2_1,D1S2_2,D1S2_3,D1S3_1,D1S3_2,D1S3_3,D1S4_1,D1S4_2,D1S4_3,D2S1_1,D2S1_2,D2S1_3,D2S2_1,D2S2_2,D2S2_3,D2S3_1,D2S3_2,D2S3_3,D2S4_1,D2S4_2,D2S4_3,D3S1_1,D3S1_2,D3S1_3,D3S2_1,D3S2_2,D3S2_3,D3S3_1,D3S3_2,D3S3_3,D3S4_1,D3S4_2,D3S4_3,Pooled_1,Pooled_2,Pooled_3,Pooled_4,10ppb_1,10ppb_2,100ppb_1,100ppb_2,250ppb_1,250ppb_2,250ppb_3,250ppb_4,500ppb_1,500ppb_2,1000ppb_1,1000ppb_2,m/z,Retention Time,Mass,Ionization mode +MZmine_ID,MB1,MB2,MB3,MB4,MB5,D1S1_1,D1S1_2,D1S1_3,D1S2_1,D1S2_2,D1S2_3,D1S3_1,D1S3_2,D1S3_3,D1S4_1,D1S4_2,D1S4_3,D2S1_1,D2S1_2,D2S1_3,D2S2_1,D2S2_2,D2S2_3,D2S3_1,D2S3_2,D2S3_3,D2S4_1,D2S4_2,D2S4_3,D3S1_1,D3S1_2,D3S1_3,D3S2_1,D3S2_2,D3S2_3,D3S3_1,D3S3_2,D3S3_3,D3S4_1,D3S4_2,D3S4_3,Pooled_1,Pooled_2,Pooled_3,Pooled_4,10ppb_1,10ppb_2,100ppb_1,100ppb_2,250ppb_1,250ppb_2,250ppb_3,250ppb_4,500ppb_1,500ppb_2,1000ppb_1,1000ppb_2,m/z,Retention Time,Mass,Ionization Mode 16,,,,,,212.73552,219.34872,281.90924,221.2685,248.72163,214.2259,242.11006,318.33566,342.4138,,,,,253.82516,,287.05722,412.3387,347.5727,278.27313,353.55814,339.57358,,,,269.50406,305.74353,265.1876,238.98248,295.77087,280.3127,277.0126,316.07498,312.39313,,,,,,,,,,,,,,,,,,,,86.99327011,0.820615,88.00054611,Esi- 
 97,1060.3883,,1468.8896,2660.218,1198.4237,900.1573,1856.4471,,1629.8142,,3759.2708,2697.5913,1550.9102,2276.3447,3398.5586,2640.9841,2326.5679,1775.7096,1986.4924,,,1446.644,3586.179,,5742.782,1210.672,2446.036,2605.1956,1961.6882,1491.55,2004.7794,,1380.3123,1090.6395,1459.1161,1379.0665,,2324.6565,1118.1552,,2835.0273,1791.8285,3270.8782,3327.1106,3122.3787,1884.4099,1884.4099,3270.5608,3270.5608,2535.286,4900.734,1849.4922,6822.686,2279.2485,2279.2485,5719.1035,5719.1035,91.00316049,0.91303396,92.01043649,Esi-
 2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,209.75368,212.25703,180.59227,,,,,,,,,,,303.348,303.348,94.98040041,0.7934577,95.98767641,Esi-
diff --git a/requirements.txt b/requirements.txt
index a9e69a74..2427b872 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,7 @@
 aiohttp==3.9.2
 bokeh
-blosc==1.10.2
+blosc==1.11.2
+coverage==7.8.0
 dask==2023.4.0
 distributed==2023.4.0
 django==4.1.13
@@ -12,6 +13,7 @@
 matplotlib==3.7.1
 numpy==1.24.2
 pandas==2.0.0
 pymongo==3.12.0
+pytest==8.3.5
 psycopg2-binary==2.9.2
 requests==2.31.0
 toolz==0.11.2
diff --git a/tests/README.md b/tests/README.md
new file mode 100644
index 00000000..30b7a7dd
--- /dev/null
+++ b/tests/README.md
@@ -0,0 +1,7 @@
+# Pytest
+To run the tests, run the following from the top-level directory:
+
+```
+pytest nta_app/tests/
+```
+
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/app_ms1_test_helpers.py b/tests/app_ms1_test_helpers.py
new file mode 100644
index 00000000..464ab931
--- /dev/null
+++ b/tests/app_ms1_test_helpers.py
@@ -0,0 +1,42 @@
+import datetime
+from nta_app.app.constants import EXAMPLE_POS_FILENAME, EXAMPLE_NEG_FILENAME, EXAMPLE_RUN_SEQUENCE_NEG_FILENAME, EXAMPLE_RUN_SEQUENCE_POS_FILENAME, EXAMPLE_SURROGATE_FILENAME, EXAMPLE_TRACER_FILENAME
+
+inputParameters = {
+    "project_name": ["Project name", "Example nta"],
+    "datetime": ["Date & time", str(datetime.datetime.now())],
+    "test_files": ["Run test files only (debugging)", "yes"],
+    "pos_input": ["Positive mode file", EXAMPLE_POS_FILENAME],
+    "neg_input": ["Negative mode file", EXAMPLE_NEG_FILENAME],
+    "pos_adducts": ["Positive mode adducts", ["Na", "K", "NH4"]],
+    "neg_adducts": ["Negative mode adducts", ["Cl", "HCO2", "CH3CO2", "FA"]],
+    "neutral_losses": ["Neutral losses (both modes)", ["H2O", "CO2"]],
+    "mass_accuracy_units": ["Adduct / duplicate mass accuracy units", "ppm"],
+    "mass_accuracy": ["Adduct / duplicate mass accuracy", 10],
+    "rt_accuracy": ["Adduct / duplicate retention time accuracy (mins)", 0.05],
+    "run_sequence_pos_file": [
+        "Run sequence positive mode file",
+        EXAMPLE_RUN_SEQUENCE_POS_FILENAME,
+    ],
+    "run_sequence_neg_file": [
+        "Run sequence negative mode file",
+        EXAMPLE_RUN_SEQUENCE_NEG_FILENAME,
+    ],
+    "tracer_input": ["Tracer file", EXAMPLE_TRACER_FILENAME],
+    "mass_accuracy_units_tr": ["Tracer mass accuracy units", "ppm"],
+    "mass_accuracy_tr": ["Tracer mass accuracy", 5],
+    "rt_accuracy_tr": ["Tracer retention time accuracy (mins)", 0.1],
+    "tracer_plot_yaxis_format": ["Tracer plot y-axis scaling", "log"],
+    "tracer_plot_trendline": ["Tracer plot trendlines shown", "yes"],
+    "min_replicate_hits": ["Min replicate hits (%)", 66],
+    "min_replicate_hits_blanks": ["Min replicate hits in blanks (%)", 66],
+    "max_replicate_cv": ["Max replicate CV", 0.8],
+    "mrl_std_multiplier": ["MRL standard deviation multiplier", "3"],
+    "parent_ion_mass_accuracy": ["Parent ion mass accuracy (ppm)", 5],
+    "minimum_rt": ["Discard features below this retention time (mins)", 0.00],
"search_dsstox": ["Search DSSTox for possible structures", "no"], + "search_hcd": ["Search Cheminformatics Hazard Module for toxicity data", "no"], + "search_mode": ["Search dashboard by", "mass"], + "do_qnta": ["Perform qNTA?", "no"], + "qnta_input": ["qNTA Surrogate input file", EXAMPLE_SURROGATE_FILENAME], + "atom_ranges": ["Atom filtering ranges", None], +} \ No newline at end of file diff --git a/tests/test_app_ms1_test_helpers.py b/tests/test_app_ms1_test_helpers.py new file mode 100644 index 00000000..456187e0 --- /dev/null +++ b/tests/test_app_ms1_test_helpers.py @@ -0,0 +1,6 @@ +from nta_app.tests.app_ms1_test_helpers import inputParameters +from nta_app.app.constants import EXAMPLE_POS_FILENAME + +def test__ensure_parameters_are_complete_for_tests(): + assert inputParameters["pos_input"][1] == EXAMPLE_POS_FILENAME + assert inputParameters["test_files"][1] == "yes" \ No newline at end of file diff --git a/tests/test_nta_task.py b/tests/test_nta_task.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/test_task_functions.py b/tests/test_task_functions.py new file mode 100644 index 00000000..7d070982 --- /dev/null +++ b/tests/test_task_functions.py @@ -0,0 +1,67 @@ +import pandas as pd +import os +import pytest +from nta_app.tests.app_ms1_test_helpers import inputParameters +from nta_app.app.constants import EXAMPLE_NEG_FILENAME, EXAMPLE_POS_FILENAME +from nta_app.app.ms1.task_functions import duplicates, assign_feature_id, differences as count_string_differences, parse_headers, get_sample_and_blank_headers, passthrucol, window_size + +data_dir = "input/ms1" +my_pos_df = pd.read_csv(os.path.join(data_dir, EXAMPLE_POS_FILENAME)) +my_neg_df = pd.read_csv(os.path.join(data_dir, EXAMPLE_NEG_FILENAME)) + +def test__added_feature_id__new_column_for_feature_id(): + data = { + "firstCol": [420, 380, 390], + "secondCol": [50, 40, 45] + } + old_df = pd.DataFrame(data) + assert "Feature ID" not in old_df.columns + + new_df = assign_feature_id(df_in=old_df, start=1) + + assert "Feature ID" in new_df.columns + +def test__count_string_differences__for_basic_strings(): + assert count_string_differences(s1="ones", s2="one") == 2 + +def test__count_string_differences__for_strings_with_special_characters(): + assert count_string_differences(s1="o_nes", s2="o^nes") == 2 + +def test__parse_headers__returns_list_of_list_of_string(df=my_pos_df): + result = parse_headers(df) + assert len(result) == 23 + +def test__parse_headers__lists_contain_expected_items(df=my_pos_df): + result = parse_headers(df) + assert result[0][0] == "MB1" + assert result[22][0] == "Ionization Mode" + +def test__get_sample_and_blank_headers__returns_all_headers(pos_df=my_pos_df, neg_df=my_neg_df): + assert len(get_sample_and_blank_headers((pos_df, neg_df))) == 3 + +def test__get_sample_and_blank_headers__fails_when_both_dfs_are_none(pos_df=my_pos_df, neg_df=my_neg_df): + with pytest.raises(AttributeError): + get_sample_and_blank_headers((None, None)) + +def test__get_sample_and_blank_headers__returns_correct_content(pos_df=my_pos_df, neg_df=my_neg_df): + all_headers, blank_headers, sample_headers = get_sample_and_blank_headers((pos_df, neg_df)) + assert len(blank_headers[0]) == 5 + for sample_types in sample_headers: + for sample in sample_types: + assert not sample.startswith("MB") + assert all_headers[-1][0] == "Ionization Mode" + +def test__passthrucol__returns_passthrough_and_trimmed_df(pos_df=my_pos_df, neg_df=my_neg_df): + pos_df = assign_feature_id(pos_df) + all_headers = 
get_sample_and_blank_headers((pos_df, neg_df))[0] + df_pt, df_trim = passthrucol(pos_df, all_headers) + assert "m/z" in df_pt.columns.values + assert "m/z" not in df_trim.columns.values + +def test__window_size__default_mass_diff(df_in=my_pos_df): + val = window_size(df_in) + assert val == 1801 + +def test__window_size__supplied_mass_diff(df_in=my_pos_df): + val = window_size(df_in, 100.00) + assert val == 1597 \ No newline at end of file diff --git a/views/ms1/ms1_input_api.py b/views/ms1/ms1_input_api.py index 02d690a0..c19902a3 100644 --- a/views/ms1/ms1_input_api.py +++ b/views/ms1/ms1_input_api.py @@ -8,6 +8,7 @@ import string, random import datetime import logging +from nta_app.app.constants import EXAMPLE_NEG_FILENAME, EXAMPLE_POS_FILENAME, EXAMPLE_RUN_SEQUENCE_NEG_FILENAME, EXAMPLE_RUN_SEQUENCE_POS_FILENAME, EXAMPLE_SURROGATE_FILENAME, EXAMPLE_TRACER_FILENAME from ...app.ms1.nta_task import run_nta_dask from ...tools.ms1 import file_manager from ..views_dectorators import api_key_required @@ -17,15 +18,6 @@ if os.getenv("DEPLOY_ENV", "kube-dev") == "kube-prod": logger.setLevel(logging.WARNING) -# hard-coded example file names for testing found in nta_app/input/ms1/ -example_pos_filename = "1a_MZmine3_pos.csv" -example_neg_filename = "1b_MZmine3_neg.csv" -example_tracer_filename = "WW2DW_Tracers_Amenable.csv" -example_run_sequence_pos_filename = "WW2DW_sequence_cal.csv" -example_run_sequence_neg_filename = "WW2DW_sequence_cal.csv" -example_surrogate_filename = "qNTA_Surrogate_Input_File_WW2DW.csv" - - @api_key_required @csrf_exempt def ms1_run_api(request): @@ -205,13 +197,13 @@ def ms1_run_api(request): # handle case 1: the user has selected to run the test files # get the path and filename of the test files example_data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "..", "input/ms1") - pos_input = os.path.join(example_data_dir, example_pos_filename) - neg_input = os.path.join(example_data_dir, example_neg_filename) - tracer_file = os.path.join(example_data_dir, example_tracer_filename) - run_sequence_pos_file = os.path.join(example_data_dir, example_run_sequence_pos_filename) - run_sequence_neg_file = os.path.join(example_data_dir, example_run_sequence_neg_filename) + pos_input = os.path.join(example_data_dir, EXAMPLE_POS_FILENAME) + neg_input = os.path.join(example_data_dir, EXAMPLE_NEG_FILENAME) + tracer_file = os.path.join(example_data_dir, EXAMPLE_TRACER_FILENAME) + run_sequence_pos_file = os.path.join(example_data_dir, EXAMPLE_RUN_SEQUENCE_POS_FILENAME) + run_sequence_neg_file = os.path.join(example_data_dir, EXAMPLE_RUN_SEQUENCE_NEG_FILENAME) if parameters["do_qnta"] == "yes": - qnta_file = os.path.join(example_data_dir, example_surrogate_filename) + qnta_file = os.path.join(example_data_dir, EXAMPLE_SURROGATE_FILENAME) inputParameters["qnta_input"][1] = qnta_file qnta_df = file_manager.tracer_handler(qnta_file) else: