petab1->2: create experiment df

dweindl · dweindl · commit 8806a8ec6ed6 · 2024-12-18T18:12:57.000+01:00
diff --git a/petab/v1/calculate.py b/petab/v1/calculate.py
@@ -97,6 +97,9 @@ def calculate_residuals_for_table(
     Calculate residuals for a single measurement table.
     For the arguments, see `calculate_residuals`.
     """
+    # below, we rely on a unique index
+    measurement_df = measurement_df.reset_index(drop=True)
+
     # create residual df as copy of measurement df, change column
     residual_df = measurement_df.copy(deep=True).rename(
         columns={MEASUREMENT: RESIDUAL}
@@ -120,6 +123,10 @@ def calculate_residuals_for_table(
             for col in compared_cols
         ]
         mask = reduce(lambda x, y: x & y, masks)
+        if mask.sum() == 0:
+            raise ValueError(
+                f"Could not find simulation for measurement {row}."
+            )
         simulation = simulation_df.loc[mask][SIMULATION].iloc[0]
         if scale:
             # apply scaling
diff --git a/petab/v2/C.py b/petab/v2/C.py
@@ -13,14 +13,6 @@
 #: Experiment ID column in the measurement table
 EXPERIMENT_ID = "experimentId"
 
-# TODO: remove
-#: Preequilibration condition ID column in the measurement table
-PREEQUILIBRATION_CONDITION_ID = "preequilibrationConditionId"
-
-# TODO: remove
-#: Simulation condition ID column in the measurement table
-SIMULATION_CONDITION_ID = "simulationConditionId"
-
 #: Measurement value column in the measurement table
 MEASUREMENT = "measurement"
 
@@ -45,17 +37,13 @@
 #: Mandatory columns of measurement table
 MEASUREMENT_DF_REQUIRED_COLS = [
     OBSERVABLE_ID,
-    # TODO: add
-    # EXPERIMENT_ID,
-    SIMULATION_CONDITION_ID,
+    EXPERIMENT_ID,
     MEASUREMENT,
     TIME,
 ]
 
 #: Optional columns of measurement table
 MEASUREMENT_DF_OPTIONAL_COLS = [
-    # TODO: remove
-    PREEQUILIBRATION_CONDITION_ID,
     OBSERVABLE_PARAMETERS,
     NOISE_PARAMETERS,
     DATASET_ID,
diff --git a/petab/v2/__init__.py b/petab/v2/__init__.py
@@ -27,7 +27,10 @@
 
 # import after v1
 from ..version import __version__  # noqa: F401, E402
-from . import models  # noqa: F401, E402
+from . import (  # noqa: F401, E402
+    C,  # noqa: F401, E402
+    models,  # noqa: F401, E402
+)
 from .conditions import *  # noqa: F403, F401, E402
 from .experiments import (  # noqa: F401, E402
     get_experiment_df,
diff --git a/petab/v2/lint.py b/petab/v2/lint.py
@@ -15,6 +15,9 @@
 from .. import v2
 from ..v1.lint import (
     _check_df,
+    assert_measured_observables_defined,
+    assert_measurements_not_null,
+    assert_measurements_numeric,
     assert_model_parameters_in_condition_or_parameter_table,
     assert_no_leading_trailing_whitespace,
     assert_parameter_bounds_are_numeric,
@@ -23,13 +26,16 @@
     assert_parameter_prior_parameters_are_valid,
     assert_parameter_prior_type_is_valid,
     assert_parameter_scale_is_valid,
+    assert_unique_observable_ids,
     assert_unique_parameter_ids,
     check_ids,
-    check_measurement_df,
     check_observable_df,
     check_parameter_bounds,
 )
-from ..v1.measurements import split_parameter_replacement_list
+from ..v1.measurements import (
+    assert_overrides_match_parameter_count,
+    split_parameter_replacement_list,
+)
 from ..v1.observables import get_output_parameters, get_placeholders
 from ..v1.visualize.lint import validate_visualization_df
 from ..v2.C import *
@@ -237,8 +243,51 @@ def run(self, problem: Problem) -> ValidationIssue | None:
         if problem.measurement_df is None:
             return
 
+        df = problem.measurement_df
         try:
-            check_measurement_df(problem.measurement_df, problem.observable_df)
+            _check_df(df, MEASUREMENT_DF_REQUIRED_COLS, "measurement")
+
+            for column_name in MEASUREMENT_DF_REQUIRED_COLS:
+                if not np.issubdtype(df[column_name].dtype, np.number):
+                    assert_no_leading_trailing_whitespace(
+                        df[column_name].values, column_name
+                    )
+
+            for column_name in MEASUREMENT_DF_OPTIONAL_COLS:
+                if column_name in df and not np.issubdtype(
+                    df[column_name].dtype, np.number
+                ):
+                    assert_no_leading_trailing_whitespace(
+                        df[column_name].values, column_name
+                    )
+
+            if problem.observable_df is not None:
+                assert_measured_observables_defined(df, problem.observable_df)
+                assert_overrides_match_parameter_count(
+                    df, problem.observable_df
+                )
+
+                if OBSERVABLE_TRANSFORMATION in problem.observable_df:
+                    # Check for positivity of measurements in case of
+                    #  log-transformation
+                    assert_unique_observable_ids(problem.observable_df)
+                    # If the above is not checked, in the following loop
+                    # trafo may become a pandas Series
+                    for measurement, obs_id in zip(
+                        df[MEASUREMENT], df[OBSERVABLE_ID], strict=True
+                    ):
+                        trafo = problem.observable_df.loc[
+                            obs_id, OBSERVABLE_TRANSFORMATION
+                        ]
+                        if measurement <= 0.0 and trafo in [LOG, LOG10]:
+                            raise ValueError(
+                                "Measurements with observable "
+                                f"transformation {trafo} must be "
+                                f"positive, but {measurement} <= 0."
+                            )
+
+            assert_measurements_not_null(df)
+            assert_measurements_numeric(df)
         except AssertionError as e:
             return ValidationError(str(e))
 
@@ -247,46 +296,20 @@ def run(self, problem: Problem) -> ValidationIssue | None:
         #  condition table should be an error if the measurement table refers
         #  to conditions
 
-        # check that measured experiments/conditions exist
-        # TODO: fully switch to experiment table and remove this:
-        if SIMULATION_CONDITION_ID in problem.measurement_df:
-            if problem.condition_df is None:
-                return
-            used_conditions = set(
-                problem.measurement_df[SIMULATION_CONDITION_ID].dropna().values
-            )
-            if PREEQUILIBRATION_CONDITION_ID in problem.measurement_df:
-                used_conditions |= set(
-                    problem.measurement_df[PREEQUILIBRATION_CONDITION_ID]
-                    .dropna()
-                    .values
-                )
-            available_conditions = set(
-                problem.condition_df[CONDITION_ID].unique()
-            )
-            if missing_conditions := (used_conditions - available_conditions):
-                return ValidationError(
-                    "Measurement table references conditions that "
-                    "are not specified in the condition table: "
-                    + str(missing_conditions)
-                )
-        elif EXPERIMENT_ID in problem.measurement_df:
-            if problem.experiment_df is None:
-                return
-            used_experiments = set(
-                problem.measurement_df[EXPERIMENT_ID].values
-            )
-            available_experiments = set(
-                problem.condition_df[CONDITION_ID].unique()
+        # check that measured experiments exist in the experiment table
+        if problem.experiment_df is None:
+            return
+
+        used_experiments = set(problem.measurement_df[EXPERIMENT_ID].values)
+        available_experiments = set(
+            problem.condition_df[CONDITION_ID].unique()
+        )
+        if missing_experiments := (used_experiments - available_experiments):
+            raise AssertionError(
+                "Measurement table references experiments that "
+                "are not specified in the experiments table: "
+                + str(missing_experiments)
             )
-            if missing_experiments := (
-                used_experiments - available_experiments
-            ):
-                raise AssertionError(
-                    "Measurement table references experiments that "
-                    "are not specified in the experiments table: "
-                    + str(missing_experiments)
-                )
 
 
 class CheckConditionTable(ValidationTask):
@@ -771,7 +794,8 @@ def append_overrides(overrides):
     )
 
     # parameters that are overridden via the condition table are not allowed
-    parameter_ids -= set(problem.condition_df[TARGET_ID].unique())
+    if problem.condition_df is not None:
+        parameter_ids -= set(problem.condition_df[TARGET_ID].unique())
 
     return parameter_ids
 
diff --git a/petab/v2/petab1to2.py b/petab/v2/petab1to2.py
@@ -4,8 +4,8 @@
 from itertools import chain
 from pathlib import Path
 from urllib.parse import urlparse
+from uuid import uuid4
 
-import numpy as np
 import pandas as pd
 from pandas.io.common import get_handle, is_url
 
@@ -98,10 +98,81 @@ def petab1to2(yaml_config: Path | str, output_dir: Path | str = None):
             condition_df = v1v2_condition_df(condition_df, petab_problem.model)
             v2.write_condition_df(condition_df, get_dest_path(condition_file))
 
+        # records for the experiment table to be created
+        experiments = []
+
+        def create_experiment_id(sim_cond_id: str, preeq_cond_id: str) -> str:
+            if not sim_cond_id and not preeq_cond_id:
+                return ""
+            if preeq_cond_id:
+                preeq_cond_id = f"{preeq_cond_id}_"
+            exp_id = f"experiment_{preeq_cond_id}{sim_cond_id}"
+            if exp_id in experiments:  # noqa: B023
+                i = 1
+                while f"{exp_id}_{i}" in experiments:  # noqa: B023
+                    i += 1
+                exp_id = f"{exp_id}_{i}"
+            return exp_id
+
+        measured_experiments = (
+            petab_problem.get_simulation_conditions_from_measurement_df()
+        )
+        for (
+            _,
+            row,
+        ) in measured_experiments.iterrows():
+            # generate a new experiment for each simulation / pre-eq condition
+            #  combination
+            sim_cond_id = row[v1.C.SIMULATION_CONDITION_ID]
+            preeq_cond_id = row.get(v1.C.PREEQUILIBRATION_CONDITION_ID, "")
+            exp_id = create_experiment_id(sim_cond_id, preeq_cond_id)
+            if preeq_cond_id:
+                experiments.append(
+                    {
+                        v2.C.EXPERIMENT_ID: exp_id,
+                        v2.C.CONDITION_ID: preeq_cond_id,
+                        v2.C.TIME: float("-inf"),
+                    }
+                )
+            experiments.append(
+                {
+                    v2.C.EXPERIMENT_ID: exp_id,
+                    v2.C.CONDITION_ID: sim_cond_id,
+                    v2.C.TIME: 0,
+                }
+            )
+        if experiments:
+            exp_table_path = output_dir / "experiments.tsv"
+            if exp_table_path.exists():
+                raise ValueError(
+                    f"Experiment table file {exp_table_path} already exists."
+                )
+            problem_config[v2.C.EXPERIMENT_FILES] = [exp_table_path.name]
+            v2.write_experiment_df(
+                v2.get_experiment_df(pd.DataFrame(experiments)), exp_table_path
+            )
+
         for measurement_file in problem_config.get(v2.C.MEASUREMENT_FILES, []):
             measurement_df = v1.get_measurement_df(
                 get_src_path(measurement_file)
             )
+            # if there is already an experiment ID column, we rename it
+            if v2.C.EXPERIMENT_ID in measurement_df.columns:
+                measurement_df.rename(
+                    columns={v2.C.EXPERIMENT_ID: f"experiment_id_{uuid4()}"},
+                    inplace=True,
+                )
+            # add pre-eq condition id if not present or convert to string
+            #  for simplicity
+            if v1.C.PREEQUILIBRATION_CONDITION_ID in measurement_df.columns:
+                measurement_df[
+                    v1.C.PREEQUILIBRATION_CONDITION_ID
+                ] = measurement_df[v1.C.PREEQUILIBRATION_CONDITION_ID].astype(
+                    str
+                )
+            else:
+                measurement_df[v1.C.PREEQUILIBRATION_CONDITION_ID] = ""
+
             if (
                 petab_problem.condition_df is not None
                 and len(
@@ -110,20 +181,33 @@ def petab1to2(yaml_config: Path | str, output_dir: Path | str = None):
                 )
                 == 0
             ):
-                # can't have "empty" conditions with no overrides in v2
-                # TODO: this needs to be done condition wise
-                measurement_df[v2.C.SIMULATION_CONDITION_ID] = np.nan
+                # we can't have "empty" conditions with no overrides in v2,
+                #  therefore, we drop the respective condition ID completely
+                #   TODO: or can we?
+                # TODO: this needs to be checked condition-wise, not globally
+                measurement_df[v1.C.SIMULATION_CONDITION_ID] = ""
                 if (
                     v1.C.PREEQUILIBRATION_CONDITION_ID
                     in measurement_df.columns
                 ):
-                    measurement_df[v2.C.PREEQUILIBRATION_CONDITION_ID] = np.nan
+                    measurement_df[v1.C.PREEQUILIBRATION_CONDITION_ID] = ""
+            # condition IDs to experiment IDs
+            measurement_df.insert(
+                0,
+                v2.C.EXPERIMENT_ID,
+                measurement_df.apply(
+                    lambda row: create_experiment_id(
+                        row[v1.C.SIMULATION_CONDITION_ID],
+                        row.get(v1.C.PREEQUILIBRATION_CONDITION_ID, ""),
+                    ),
+                    axis=1,
+                ),
+            )
+            del measurement_df[v1.C.SIMULATION_CONDITION_ID]
+            del measurement_df[v1.C.PREEQUILIBRATION_CONDITION_ID]
             v2.write_measurement_df(
                 measurement_df, get_dest_path(measurement_file)
             )
-    # TODO: Measurements: preequilibration to experiments/timecourses once
-    #  finalized
-    ...
 
     # validate updated Problem
     validation_issues = v2.lint_problem(new_yaml_file)
@@ -189,7 +273,7 @@ def v1v2_condition_df(
     """Convert condition table from petab v1 to v2."""
     condition_df = condition_df.copy().reset_index()
     with suppress(KeyError):
-        # TODO: are condition names still supported in v2?
+        # conditionName was dropped in PEtab v2
         condition_df.drop(columns=[v2.C.CONDITION_NAME], inplace=True)
 
     condition_df = condition_df.melt(
diff --git a/petab/v2/problem.py b/petab/v2/problem.py
diff --git a/tests/v2/test_problem.py b/tests/v2/test_problem.py