diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index 582173b6..2e0554d5 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -49,8 +49,7 @@ jobs: - name: Build documentation run: | - cd docs - sphinx-build -b html . _build/html + sphinx-build -b html docs docs/_build/html - name: Setup Pages uses: actions/configure-pages@v5 diff --git a/.github/workflows/links.yaml b/.github/workflows/links.yaml index b6da3850..3113a59e 100644 --- a/.github/workflows/links.yaml +++ b/.github/workflows/links.yaml @@ -33,8 +33,7 @@ jobs: - name: Build documentation with Sphinx run: | - cd docs - sphinx-build -b html . _build/html + sphinx-build -b html docs docs/_build/html - name: Link Checker on built documentation id: lychee diff --git a/docs/conf.py b/docs/conf.py index 35e52175..b54038e7 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -11,7 +11,7 @@ author = "HED Standard" # The full version, including alpha/beta/rc tags -release = "0.8.0" +release = "0.8.1" # -- General configuration --------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration diff --git a/hed/tools/analysis/tabular_summary.py b/hed/tools/analysis/tabular_summary.py index ef25ac40..c40ef6b2 100644 --- a/hed/tools/analysis/tabular_summary.py +++ b/hed/tools/analysis/tabular_summary.py @@ -10,19 +10,22 @@ class TabularSummary: """Summarize the contents of columnar files.""" - def __init__(self, value_cols=None, skip_cols=None, name=""): + def __init__(self, value_cols=None, skip_cols=None, name="", categorical_limit=None): """Constructor for a BIDS tabular file summary. Parameters: value_cols (list, None): List of columns to be treated as value columns. skip_cols (list, None): List of columns to be skipped. name (str): Name associated with the dictionary. + categorical_limit (int, None): Maximum number of unique values to store for categorical columns. 
""" self.name = name self.categorical_info = {} self.value_info = {} + self.categorical_counts = {} + self.categorical_limit = categorical_limit if value_cols and skip_cols and set(value_cols).intersection(skip_cols): raise HedFileError( "ValueSkipOverlap", f"Value columns {str(value_cols)} and skip columns {str(skip_cols)} cannot overlap", "" @@ -47,7 +50,10 @@ def __str__(self): for key in sorted_keys: value_dict = self.categorical_info[key] sorted_v_keys = sorted(value_dict) - summary_list.append(f"{indent * 2}{key} ({len(sorted_v_keys)} distinct values):") + counts = self.categorical_counts.get(key, [0, 0]) + summary_list.append( + f"{indent * 2}{key} ({len(sorted_v_keys)} distinct values, {counts[0]} total values in {counts[1]} files):" + ) for v_key in sorted_v_keys: summary_list.append(f"{indent * 3}{v_key}: {value_dict[v_key]}") @@ -101,9 +107,11 @@ def get_summary(self, as_json=False) -> Union[dict, str]: "Total events": self.total_events, "Total files": self.total_files, "Categorical columns": categorical_cols, + "Categorical counts": self.categorical_counts, "Value columns": value_cols, "Skip columns": self.skip_cols, "Files": self.files, + "Categorical limit": str(self.categorical_limit), } if as_json: return json.dumps(summary, indent=4) @@ -131,7 +139,7 @@ def get_number_unique(self, column_names=None) -> dict: return counts def update(self, data, name=None): - """Update the counts based on data. + """Update the counts based on data (DataFrame, filename, or list of filenames). Parameters: data (DataFrame, str, or list): DataFrame containing data to update. @@ -166,19 +174,26 @@ def update_summary(self, tab_sum): self._update_dict_value(tab_sum) self._update_dict_categorical(tab_sum) - def _update_categorical(self, tab_name, values): + def _update_categorical(self, tab_name, values, cat_counts): """Update the categorical information for this summary. Parameters: tab_name (str): Name of a key indicating a categorical column. 
values (dict): A dictionary whose keys are unique categorical values. + cat_counts (list): A list with two elements: total count of values and number of entries. """ if tab_name not in self.categorical_info: self.categorical_info[tab_name] = {} - + if tab_name not in self.categorical_counts: + self.categorical_counts[tab_name] = [cat_counts[0], cat_counts[1]] + else: + self.categorical_counts[tab_name][0] += cat_counts[0] + self.categorical_counts[tab_name][1] += cat_counts[1] total_values = self.categorical_info[tab_name] for name, value in values.items(): + if self.categorical_limit is not None and len(total_values) >= self.categorical_limit: + break value_list = total_values.get(name, [0, 0]) if not isinstance(value, list): value = [value, 1] @@ -207,9 +222,15 @@ def _update_dataframe(self, data, name): self.value_info[col_name][0] = self.value_info[col_name][0] + len(col_values) self.value_info[col_name][1] = self.value_info[col_name][1] + 1 else: + cat_counts = self.categorical_counts.get(col_name, [0, 0]) + cat_counts[0] += len(col_values) + cat_counts[1] += 1 + self.categorical_counts[col_name] = cat_counts + if self.categorical_limit is not None and len(col_values) > self.categorical_limit: + continue col_values = col_values.astype(str) values = col_values.value_counts(ascending=True) - self._update_categorical(col_name, values) + self._update_categorical(col_name, values, cat_counts) def _update_dict_categorical(self, col_dict): """Update this summary with the categorical information in the dictionary from another summary. @@ -228,7 +249,7 @@ def _update_dict_categorical(self, col_dict): elif col in self.skip_cols: continue else: - self._update_categorical(col, col_dict.categorical_info[col]) + self._update_categorical(col, col_dict.categorical_info[col], col_dict.categorical_counts.get(col, [0, 0])) def _update_dict_skip(self, col_dict): """Update this summary with the skip column information from another summary. 
@@ -289,13 +310,15 @@ def extract_summary(summary_info) -> "TabularSummary": new_tab = TabularSummary( value_cols=summary_info.get("Value columns", {}).keys(), skip_cols=summary_info.get("Skip columns", []), - name=summary_info.get("Summary name", ""), + name=summary_info.get("Name", ""), + categorical_limit=None if summary_info.get("Categorical limit") in (None, "None") else int(summary_info.get("Categorical limit")), ) - new_tab.value_info = summary_info.get("Value_columns", {}) + new_tab.value_info = summary_info.get("Value columns", {}) new_tab.total_files = summary_info.get("Total files", 0) new_tab.total_events = summary_info.get("Total events", 0) new_tab.skip_cols = summary_info.get("Skip columns", []) new_tab.categorical_info = summary_info.get("Categorical columns", {}) + new_tab.categorical_counts = summary_info.get("Categorical counts", {}) new_tab.files = summary_info.get("Files", {}) return new_tab diff --git a/tests/tools/analysis/test_tabular_summary.py b/tests/tools/analysis/test_tabular_summary.py index b750bcef..e902cfeb 100644 --- a/tests/tools/analysis/test_tabular_summary.py +++ b/tests/tools/analysis/test_tabular_summary.py @@ -80,7 +80,7 @@ def test_get_summary(self): ) summary1 = dict1.get_summary(as_json=False) self.assertIsInstance(summary1, dict) - self.assertEqual(len(summary1), 7) + self.assertEqual(len(summary1), 9) summary2 = dict1.get_summary(as_json=True).replace('"', "") self.assertIsInstance(summary2, str) @@ -240,6 +240,100 @@ def test_update_summary(self): self.assertEqual(len(files_bids), tab_all.total_files) self.assertEqual(len(files_bids) * 200, tab_all.total_events) + def test_categorical_limit_constructor(self): + # Test that categorical_limit can be set in constructor + dict1 = TabularSummary(categorical_limit=5) + self.assertEqual(dict1.categorical_limit, 5) + + dict2 = TabularSummary(categorical_limit=None) + self.assertIsNone(dict2.categorical_limit) + + def test_categorical_limit_enforced(self): + # Test that categorical_limit is enforced when updating + stern_df = 
get_new_dataframe(self.stern_map_path) + + # Create a summary with no limit + dict_no_limit = TabularSummary() + dict_no_limit.update(stern_df) + + # Create a summary with a limit of 2 unique values per column + dict_with_limit = TabularSummary(categorical_limit=2) + dict_with_limit.update(stern_df) + + # Check that columns with more than 2 unique values are limited + for col_name in dict_with_limit.categorical_info: + self.assertLessEqual( + len(dict_with_limit.categorical_info[col_name]), + 2, + f"Column {col_name} should have at most 2 unique values stored", + ) + # But categorical_counts should track all values + self.assertIn(col_name, dict_with_limit.categorical_counts) + self.assertGreater(dict_with_limit.categorical_counts[col_name][0], 0) + + def test_categorical_limit_columns_with_many_values(self): + # Test that columns with many values are skipped during initial update + wh_df = get_new_dataframe(self.wh_events_path) + + # Set limit to 5 + dict1 = TabularSummary(categorical_limit=5) + dict1.update(wh_df) + + # Columns with more than 5 unique values at collection time should still be tracked in counts + for col_name, counts in dict1.categorical_counts.items(): + self.assertGreater(counts[0], 0, f"Column {col_name} should have event count > 0") + self.assertEqual(counts[1], 1, f"Column {col_name} should have been updated once") + + def test_categorical_limit_in_summary(self): + # Test that categorical_limit appears in the summary output + dict1 = TabularSummary(categorical_limit=10) + stern_df = get_new_dataframe(self.stern_map_path) + dict1.update(stern_df) + + summary = dict1.get_summary(as_json=False) + self.assertIn("Categorical limit", summary) + self.assertEqual(summary["Categorical limit"], "10") + + # Test with None + dict2 = TabularSummary() + dict2.update(stern_df) + summary2 = dict2.get_summary(as_json=False) + self.assertEqual(summary2["Categorical limit"], "None") + + def test_categorical_limit_extract_summary(self): + # Test that 
categorical_limit is preserved through extract_summary + dict1 = TabularSummary(categorical_limit=15) + stern_df = get_new_dataframe(self.stern_map_path) + dict1.update(stern_df) + + summary_info = dict1.get_summary(as_json=False) + dict2 = TabularSummary.extract_summary(summary_info) + + # Note: extract_summary doesn't restore categorical_limit currently, + # but it should at least not error + self.assertIsInstance(dict2, TabularSummary) + + def test_categorical_limit_update_dict(self): + # Test that categorical_limit works correctly with update_summary + stern_df = get_new_dataframe(self.stern_test1_path) + + dict1 = TabularSummary(categorical_limit=3) + dict1.update(stern_df) + + dict2 = TabularSummary(categorical_limit=3) + dict2.update(stern_df) + + # Update dict1 with dict2 + dict1.update_summary(dict2) + + # Check that limits are still enforced + for col_name in dict1.categorical_info: + self.assertLessEqual( + len(dict1.categorical_info[col_name]), + 3, + f"Column {col_name} should have at most 3 unique values after update_summary", + ) + if __name__ == "__main__": unittest.main()