Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions .github/workflows/testing-code.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# CI: run the pytest suite with coverage on every push, across all
# Python versions the app supports.
name: Run Unit Test via Pytest

on: [push]

jobs:
  build:
    permissions:
      contents: read
      pull-requests: write
    runs-on: ubuntu-latest
    strategy:
      matrix:
        # Keep in sync with the interpreter versions the app supports.
        python-version: ["3.9", "3.10", "3.11"]

    steps:
      - uses: actions/checkout@v3
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          # pytest and coverage are invoked below, so install them
          # explicitly instead of relying on requirements.txt to list them.
          pip install pytest coverage
          if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
      - name: Test with pytest
        # No continue-on-error here: a failing test suite must fail the job,
        # otherwise CI silently passes on broken code.
        run: |
          coverage run -m pytest tests
      - name: Generate Coverage Report
        # Run even when the test step fails so partial coverage is reported.
        if: always()
        run: |
          coverage report -m
Empty file added app/__init_.py (NOTE(review): filename looks like a typo — a package marker should be app/__init__.py with a double trailing underscore; as written the directory is not a regular importable package)
Empty file.
80 changes: 80 additions & 0 deletions app/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
# Test file names
# Input fixtures used by the test suite / example runs.
EXAMPLE_POS_FILENAME = "1a_MZmine3_pos.csv"
EXAMPLE_NEG_FILENAME = "1b_MZmine3_neg.csv"
EXAMPLE_TRACER_FILENAME = "WW2DW_Tracers_Amenable.csv"
EXAMPLE_RUN_SEQUENCE_POS_FILENAME = "WW2DW_sequence_cal.csv"
EXAMPLE_RUN_SEQUENCE_NEG_FILENAME = "WW2DW_sequence_cal.csv"
EXAMPLE_SURROGATE_FILENAME = "qNTA_Surrogate_Input_File_WW2DW.csv"

# Define pos/neg/neutral adduct lists
# Each entry is (adduct label, mass delta in Da).
# Proton added - we observe Mass-(H+) and Mass+(Adduct)
# NOTE(review): values appear to be monoisotopic adduct masses shifted by one
# proton (e.g. Cl 34.968853 + H 1.007825 = 35.976678) — confirm against the
# matching logic that consumes these deltas.
NEG_ADDUCT_LI: list[tuple[str, float]] = [
    ("Cl", 35.976678),
    ("Br", 79.926161),
    ("HCO2", 46.005477),
    ("CH3CO2", 60.021127),
    ("CF3CO2", 113.992862),
]

# Proton subtracted - we observe Mass+(H+) and Mass+(Adduct)
# (e.g. Na 22.989770 - H 1.007825 = 21.981945)
POS_ADDUCT_LI: list[tuple[str, float]] = [
    ("Na", 21.981942),
    ("K", 37.955882),
    ("NH4", 17.026547),
]

# Neutral losses (negative deltas) plus a few neutral additions.
NEUTRAL_LOSSES_LI: list[tuple[str, float]] = [
    ("H2O", -18.010565),
    ("2H2O", -36.02113),
    ("3H2O", -54.031695),
    ("4H2O", -72.04226),
    ("5H2O", -90.052825),
    ("NH3", -17.0265),
    ("O", -15.99490),
    ("CO", -29.00220),
    ("CO2", -43.989829),
    ("C2H4", -28.03130),
    # NOTE(review): the entries below have positive deltas, so they are neutral
    # additions rather than losses — confirm that downstream code treats the
    # sign correctly (original author flagged the same question).
    ("CH2O2", 46.00550),
    ("CH3COOH", 60.02110),
    ("CH3OH", 32.02620),
    ("CH3CN", 41.02650),
    ("(CH3)2CHOH", 60.05810),
]

# Set to tested memory capacity of WebApp for number of features in 'adduct_matrix'
MAX_NUM_ADDUCT_FEATURES = 12000

# Column names accessed throughout app
FEATURE_ID_COL = "Feature ID"
DASHBOARD_SEARCH_COL = "For_Dashboard_Search"
FORMULA_COL = "Formula"
MASS_COL = "Mass"
RETENTION_COL = "Retention_Time"
IONIZATION_COL = "Ionization_Mode"
MOLECULAR_FORMULA_COL = "MOLECULAR_FORMULA"

# Format lists to test values against
# Accepted spellings that mark a sample column as a method blank.
ALLOWED_BLANK_FORMATS_LIST: list[str] = ["Blank", "blank", "BLANK", "MB", "Mb", "mb", "mB"]
# Columns a feature row must carry to be considered active/complete.
ACTIVE_COLUMNS_LIST: list[str] = [
    "Retention_Time",
    "Mass",
    "Ionization_Mode",
    "Compound",
]

# Establish ordering of all possible front matter (tracer/no tracer, flags/no flags, etc.)
# Output columns are emitted in this order when present.
FRONT_MATTER_ORDERING: list[str] = [
    "Ionization_Mode",
    "Mass",
    "Retention_Time",
    "Compound",
    "Tracer Chemical Match?",
    "Duplicate Feature?",
    "Is Adduct or Loss?",
    "Has Adduct or Loss?",
    "Adduct or Loss Info",
    "Final Occurrence Count",
    "Final Occurrence Percentage",
    "Final Occurrence Count (with flags)",
    "Final Occurrence Percentage (with flags)",
]
2 changes: 1 addition & 1 deletion app/feature/tests/test_feature.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
import unittest
from Feature import Feature_MS2 as ms2
from feature import Feature_MS2 as ms2
from test_data import parsedMGF as mgfData

#Note about test mgfData
Expand Down
Empty file added app/ms1/__init__.py
Empty file.
53 changes: 27 additions & 26 deletions app/ms1/nta_task.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import traceback
import shutil
import json
from typing import Union
from datetime import datetime
from dask.distributed import Client, LocalCluster, fire_and_forget
from zipfile import ZipFile, ZIP_DEFLATED
Expand Down Expand Up @@ -35,13 +36,13 @@

def run_nta_dask(
parameters,
input_dfs,
tracer_df=None,
run_sequence_pos_df=None,
run_sequence_neg_df=None,
qnta_df=None,
jobid="00000000",
verbose=True,
input_dfs: list[Union[pd.DataFrame, None]],
tracer_df: Union[pd.DataFrame, None] = None,
run_sequence_pos_df: Union[pd.DataFrame, None] = None,
run_sequence_neg_df: Union[pd.DataFrame, None] = None,
qnta_df: Union[pd.DataFrame, None] = None,
jobid = "00000000",
verbose = True,
):
in_docker = os.environ.get("IN_DOCKER") != "False"
mongo_address = os.environ.get("MONGO_SERVER")
Expand Down Expand Up @@ -96,14 +97,14 @@ def run_nta_dask(

def run_nta(
parameters,
input_dfs,
tracer_df=None,
run_sequence_pos_df=None,
run_sequence_neg_df=None,
qnta_df=None,
mongo_address=None,
jobid="00000000",
verbose=True,
input_dfs: list[Union[pd.DataFrame, None]],
tracer_df: Union[pd.DataFrame, None] = None,
run_sequence_pos_df: Union[pd.DataFrame, None] = None,
run_sequence_neg_df: Union[pd.DataFrame, None] = None,
qnta_df: Union[pd.DataFrame, None] = None,
jobid = "00000000",
verbose = True,
mongo_address: Union[str, None] = None,
in_docker=True,
):
nta_run = NtaRun(
Expand Down Expand Up @@ -140,7 +141,7 @@ def __init__(
run_sequence_pos_df=None,
run_sequence_neg_df=None,
qnta_df=None,
mongo_address=None,
mongo_address: Union[str, None] = None,
jobid="00000000",
verbose=True,
in_docker=True,
Expand Down Expand Up @@ -406,7 +407,7 @@ def check_existence_of_mass_column(self, input_dfs):

return

def check_retention_time_column(self, input_dfs):
def check_retention_time_column(self, input_dfs: list[Union[pd.DataFrame, None]]):
"""
Check for the existence of alternate spellings of 'Retention_Time' column in input dataframes and rename to "Retention_Time".

Expand Down Expand Up @@ -609,10 +610,10 @@ def pass_through_cols(self):
self.pass_through = [
task_fun.passthrucol(df, self.all_headers)[0] if df is not None else None for df in self.dfs
]
self.dfs = [task_fun.passthrucol(df, self.all_headers)[1] if df is not None else None for df in self.dfs]
self.dfs: list[Union[pd.DataFrame, None]] = [task_fun.passthrucol(df, self.all_headers)[1] if df is not None else None for df in self.dfs]
return

def filter_void_volume(self, min_rt):
def filter_void_volume(self, min_rt: float):
"""
Accesses self.dfs (list of dataframes) and self.parameters["minimum_rt"][1]
then removes all rows with a value below "minimum_rt" in the "Retention_Time"
Expand All @@ -624,7 +625,7 @@ def filter_void_volume(self, min_rt):
None
"""
# Iterate through dfs, removing rows where "Retention_Time" is below min_rt threshold
self.dfs = [df.loc[df["Retention_Time"] > min_rt].copy() if df is not None else None for df in self.dfs]
self.dfs: list[Union[pd.DataFrame, None]] = [df.loc[df["Retention_Time"] > min_rt].copy() if df is not None else None for df in self.dfs]
return

def filter_duplicates(self):
Expand All @@ -641,11 +642,11 @@ def filter_duplicates(self):
None
"""
# Get ppm, mass_accuracy, and rt_accuracy parameters
ppm = self.parameters["mass_accuracy_units"][1] == "ppm"
ppm: bool = self.parameters["mass_accuracy_units"][1] == "ppm"
mass_accuracy = float(self.parameters["mass_accuracy"][1])
rt_accuracy = float(self.parameters["rt_accuracy"][1])
# Perform duplicate flagging functions
self.dfs = [
self.dfs: list[Union[pd.DataFrame, None]] = [
task_fun.duplicates(df, mass_accuracy, rt_accuracy, ppm, self.blank_headers, self.sample_headers)
if df is not None
else None
Expand Down Expand Up @@ -677,7 +678,7 @@ def calc_statistics(self):
# Iterate through dfs, calling chunk_stats() function
# NTAW-49: Raises custom ValueError if blank columns are improperly named in the input dataframes
try:
self.dfs = [
self.dfs: list[Union[pd.DataFrame, None]] = [
task_fun.chunk_stats(
df,
min_blank_detection_percentage,
Expand Down Expand Up @@ -831,7 +832,7 @@ def check_tracers(self):
)
for df in self.dfs
]
self.dfs = [
self.dfs: list[Union[pd.DataFrame, None]] = [
(
task_fun.check_feature_tracers(
df,
Expand Down Expand Up @@ -1007,12 +1008,12 @@ def clean_features(self):
task_fun.clean_features(df, controls, tracer_df=tracer_df_bool)[2] if df is not None else None
for index, df in enumerate(self.dfs)
]
self.dfs = [
self.dfs: list[Union[pd.DataFrame, None]] = [
task_fun.clean_features(df, controls, tracer_df=tracer_df_bool)[0] if df is not None else None
for index, df in enumerate(self.dfs)
]
# subtract blanks from means
self.dfs = [task_fun.Blank_Subtract_Mean(df) if df is not None else None for index, df in enumerate(self.dfs)]
self.dfs: list[Union[pd.DataFrame, None]] = [task_fun.Blank_Subtract_Mean(df) if df is not None else None for index, df in enumerate(self.dfs)]
# subtract blanks from means
self.dfs_flagged = [
task_fun.Blank_Subtract_Mean(df) if df is not None else None for index, df in enumerate(self.dfs_flagged)
Expand Down
Loading
Loading