From a65cc2405177a1bb40ba5981064d763dc9f1ebd0 Mon Sep 17 00:00:00 2001
From: Luca Marconato
Date: Thu, 15 Jan 2026 16:00:38 +0100
Subject: [PATCH] tests and warnings for issue 399 (nan in table columns)

---
 src/spatialdata/models/models.py | 15 ++++++++++++
 tests/io/test_readwrite.py       | 42 ++++++++++++++++++++++++++++++++
 2 files changed, 57 insertions(+)

diff --git a/src/spatialdata/models/models.py b/src/spatialdata/models/models.py
index ddda2a61..e834ad78 100644
--- a/src/spatialdata/models/models.py
+++ b/src/spatialdata/models/models.py
@@ -1061,6 +1061,21 @@ def _validate_table_annotation_metadata(self, data: AnnData) -> None:
         if len(set(expected_regions).symmetric_difference(set(found_regions))) > 0:
             raise ValueError(f"Regions in the AnnData object and `{attr[self.REGION_KEY_KEY]}` do not match.")
 
+        # Warning for object/string columns with NaN in region_key or instance_key
+        instance_key = attr[self.INSTANCE_KEY]
+        region_key = attr[self.REGION_KEY_KEY]
+        for key_name, key_value in [("region_key", region_key), ("instance_key", instance_key)]:
+            if key_value in data.obs:
+                col = data.obs[key_value]
+                col_dtype = col.dtype
+                if (col_dtype == "object" or pd.api.types.is_string_dtype(col_dtype)) and col.isna().any():
+                    logger.warning(
+                        f"The {key_name} column '{key_value}' is of {col_dtype} type and contains NaN values. "
+                        "After writing and reading with AnnData, NaN values may (depending on the AnnData version) "
+                        "be converted to strings. This may cause issues when matching instances across read/write "
+                        "cycles."
+                    )
+
     def validate(
         self,
         data: AnnData,
diff --git a/tests/io/test_readwrite.py b/tests/io/test_readwrite.py
index 7ecd7420..af028d29 100644
--- a/tests/io/test_readwrite.py
+++ b/tests/io/test_readwrite.py
@@ -1065,3 +1065,45 @@ def test_read_sdata(tmp_path: Path, points: SpatialData) -> None:
     assert_spatial_data_objects_are_identical(sdata_from_path, sdata_from_str)
     assert_spatial_data_objects_are_identical(sdata_from_path, sdata_from_upath)
     assert_spatial_data_objects_are_identical(sdata_from_path, sdata_from_zarr_group)
+
+
+def test_sdata_with_nan_in_obs(tmp_path: Path) -> None:
+    """Test that a table with mixed string/NaN values in obs can be written and read back.
+
+    Regression test for https://github.com/scverse/spatialdata/issues/399
+    Previously this raised TypeError: expected unicode string, found nan.
+    Now the write succeeds; depending on the AnnData version, NaN values in
+    object-dtype columns are either preserved or converted to the string "nan".
+    """
+    from spatialdata.models import TableModel
+
+    table = TableModel.parse(
+        AnnData(
+            obs=pd.DataFrame(
+                {
+                    "region": ["region1", "region2"],
+                    "instance": [0, 0],
+                    "column_only_region1": ["string", np.nan],
+                    "column_only_region2": [np.nan, 3],
+                }
+            )
+        ),
+        region_key="region",
+        instance_key="instance",
+        region=["region1", "region2"],
+    )
+    sdata = SpatialData(tables={"table": table})
+    assert pd.isna(sdata["table"].obs["column_only_region1"].iloc[1])
+    assert np.isnan(sdata["table"].obs["column_only_region2"].iloc[0])
+
+    path = tmp_path / "data.zarr"
+    sdata.write(path)
+
+    sdata2 = SpatialData.read(path)
+    assert "column_only_region1" in sdata2["table"].obs.columns
+    assert sdata2["table"].obs["column_only_region1"].iloc[0] == "string"
+    assert sdata2["table"].obs["column_only_region2"].iloc[1] == 3
+    # Depending on the AnnData version, the NaN in the object-dtype column is preserved or becomes "nan"
+    roundtripped = sdata2["table"].obs["column_only_region1"].iloc[1]
+    assert pd.isna(roundtripped) or roundtripped == "nan"
+    assert np.isnan(sdata2["table"].obs["column_only_region2"].iloc[0])