From b038bdecf5d943ed493eb90d6b528c85c000146c Mon Sep 17 00:00:00 2001 From: Brian Arnold Date: Tue, 6 Jan 2026 16:11:58 +0000 Subject: [PATCH 1/7] Test upath simple --- .../semantic_struct_converters.py | 105 ++++++++++++++++++ 1 file changed, 105 insertions(+) diff --git a/src/orcapod/semantic_types/semantic_struct_converters.py b/src/orcapod/semantic_types/semantic_struct_converters.py index 3ba45f5..9ff0e80 100644 --- a/src/orcapod/semantic_types/semantic_struct_converters.py +++ b/src/orcapod/semantic_types/semantic_struct_converters.py @@ -172,3 +172,108 @@ def hash_struct_dict( raise ValueError(f"Path is a directory, not a file: {path}") except OSError as e: raise OSError(f"Error reading file {path}: {e}") + + +from upath import UPath + +class UPathStructConverter(SemanticStructConverterBase): + """Converter for universal_pathlib.UPath objects to/from semantic structs.""" + + def __init__(self): + super().__init__("upath") + self._python_type = UPath + + # Define the Arrow struct type for upaths + self._arrow_struct_type = pa.struct( + [ + pa.field("path", pa.large_string()), + ] + ) + + @property + def python_type(self) -> type: + return self._python_type + + @property + def arrow_struct_type(self) -> pa.StructType: + return self._arrow_struct_type + + def python_to_struct_dict(self, value: UPath) -> dict[str, Any]: + """Convert UPath to struct dictionary.""" + if not isinstance(value, UPath): + raise TypeError(f"Expected UPath, got {type(value)}") + + return { + "path": str(value), + } + + def struct_dict_to_python(self, struct_dict: dict[str, Any]) -> UPath: + """Convert struct dictionary back to UPath.""" + path_str = struct_dict.get("path") + if path_str is None: + raise ValueError("Missing 'path' field in struct") + + return UPath(path_str) + + def can_handle_python_type(self, python_type: type) -> bool: + """Check if this converter can handle the given Python type.""" + return issubclass(python_type, UPath) + + def can_handle_struct_type(self, struct_type: pa.StructType) -> bool: + """Check if this converter can handle the given struct type.""" + # Check if struct has the expected fields + field_names = [field.name for field in struct_type] + expected_fields = {"path"} + + if set(field_names) != expected_fields: + return False + + # Check field types + field_types = {field.name: field.type for field in struct_type} + + return field_types["path"] == pa.large_string() + + def is_semantic_struct(self, struct_dict: dict[str, Any]) -> bool: + """Check if a struct dictionary represents this semantic type.""" + return set(struct_dict.keys()) == {"path"} and isinstance( + struct_dict["path"], str + ) + + def hash_struct_dict( + self, struct_dict: dict[str, Any], add_prefix: bool = False + ) -> str: + """ + Compute hash of the file content pointed to by the path. + + Args: + struct_dict: Arrow struct dictionary with 'path' field + add_prefix: If True, prefix with semantic type and algorithm info + + Returns: + Hash string of the file content, optionally prefixed + + Raises: + FileNotFoundError: If the file doesn't exist + PermissionError: If the file can't be read + OSError: For other file system errors + """ + path_str = struct_dict.get("path") + if path_str is None: + raise ValueError("Missing 'path' field in struct") + + path = UPath(path_str) + + try: + # Read file content and compute hash + content = path.read_bytes() + hash_bytes = self._compute_content_hash(content) + return self._format_hash_string(hash_bytes, add_prefix) + + except FileNotFoundError: + raise FileNotFoundError(f"File not found: {path}") + except PermissionError: + raise PermissionError(f"Permission denied reading file: {path}") + except IsADirectoryError: + raise ValueError(f"Path is a directory, not a file: {path}") + except OSError as e: + raise OSError(f"Error reading file {path}: {e}") \ No newline at end of file From 9c8edaa8d1b418aecdaf8fd5a0ee71f0267b743c Mon Sep 17 00:00:00 2001 From: Brian Arnold Date: Tue, 6 Jan 2026 16:23:36 +0000 Subject: [PATCH 2/7] Add UPath to semantic registry JSON --- src/orcapod/contexts/data/v0.1.json | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/orcapod/contexts/data/v0.1.json b/src/orcapod/contexts/data/v0.1.json index 9f1708e..250f317 100644 --- a/src/orcapod/contexts/data/v0.1.json +++ b/src/orcapod/contexts/data/v0.1.json @@ -9,6 +9,10 @@ "path": { "_class": "orcapod.semantic_types.semantic_struct_converters.PathStructConverter", "_config": {} + }, + "upath": { + "_class": "orcapod.semantic_types.semantic_struct_converters.UPathStructConverter", + "_config": {} } } } From 850bbc70518782e2dec80f92eee9f2e460c8c16d Mon Sep 17 00:00:00 2001 From: Brian Arnold Date: Tue, 6 Jan 2026 16:49:53 +0000 Subject: [PATCH 3/7] Give UPath unique struct type diff from Path --- .../semantic_struct_converters.py | 59 +++++++++---------- 1 file changed, 27 insertions(+), 32 deletions(-) diff --git a/src/orcapod/semantic_types/semantic_struct_converters.py b/src/orcapod/semantic_types/semantic_struct_converters.py index 9ff0e80..c046e4a 100644 --- a/src/orcapod/semantic_types/semantic_struct_converters.py +++ b/src/orcapod/semantic_types/semantic_struct_converters.py @@ -178,97 +178,92 @@ def hash_struct_dict( class UPathStructConverter(SemanticStructConverterBase): """Converter for universal_pathlib.UPath objects to/from semantic structs.""" - + def __init__(self): super().__init__("upath") self._python_type = UPath - # Define the Arrow struct type for upaths self._arrow_struct_type = pa.struct( [ - pa.field("path", pa.large_string()), + pa.field("upath", pa.large_string()), ] ) - + @property def python_type(self) -> type: return self._python_type - + @property def arrow_struct_type(self) -> pa.StructType: return self._arrow_struct_type - + def python_to_struct_dict(self, value: UPath) -> dict[str, Any]: """Convert UPath to struct dictionary.""" if not isinstance(value, UPath): raise TypeError(f"Expected UPath, got {type(value)}") - return { - "path": str(value), + "upath": str(value), } - + def struct_dict_to_python(self, struct_dict: dict[str, Any]) -> UPath: """Convert struct dictionary back to UPath.""" - path_str = struct_dict.get("path") + path_str = struct_dict.get("upath") if path_str is None: - raise ValueError("Missing 'path' field in struct") - + raise ValueError("Missing 'upath' field in struct") return UPath(path_str) - + def can_handle_python_type(self, python_type: type) -> bool: """Check if this converter can handle the given Python type.""" return issubclass(python_type, UPath) - + def can_handle_struct_type(self, struct_type: pa.StructType) -> bool: """Check if this converter can handle the given struct type.""" # Check if struct has the expected fields field_names = [field.name for field in struct_type] - expected_fields = {"path"} - + expected_fields = {"upath"} + if set(field_names) != expected_fields: return False - + # Check field types field_types = {field.name: field.type for field in struct_type} - - return field_types["path"] == pa.large_string() - + return field_types["upath"] == pa.large_string() + def is_semantic_struct(self, struct_dict: dict[str, Any]) -> bool: """Check if a struct dictionary represents this semantic type.""" - return set(struct_dict.keys()) == {"path"} and isinstance( - struct_dict["path"], str + return set(struct_dict.keys()) == {"upath"} and isinstance( + struct_dict["upath"], str ) - + def hash_struct_dict( self, struct_dict: dict[str, Any], add_prefix: bool = False ) -> str: """ Compute hash of the file content pointed to by the path. - + Args: - struct_dict: Arrow struct dictionary with 'path' field + struct_dict: Arrow struct dictionary with 'upath' field add_prefix: If True, prefix with semantic type and algorithm info - + Returns: Hash string of the file content, optionally prefixed - + Raises: FileNotFoundError: If the file doesn't exist PermissionError: If the file can't be read OSError: For other file system errors """ - path_str = struct_dict.get("path") + path_str = struct_dict.get("upath") if path_str is None: - raise ValueError("Missing 'path' field in struct") - + raise ValueError("Missing 'upath' field in struct") + path = UPath(path_str) - + try: # Read file content and compute hash content = path.read_bytes() hash_bytes = self._compute_content_hash(content) return self._format_hash_string(hash_bytes, add_prefix) - except FileNotFoundError: raise FileNotFoundError(f"File not found: {path}") except PermissionError: From 1deb53d8fe6d89fea513c55418aba51001999a35 Mon Sep 17 00:00:00 2001 From: Brian Arnold Date: Tue, 6 Jan 2026 21:32:10 +0000 Subject: [PATCH 4/7] Change order of converters in JSON --- src/orcapod/contexts/data/v0.1.json | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/orcapod/contexts/data/v0.1.json b/src/orcapod/contexts/data/v0.1.json index 250f317..0b021fe 100644 --- a/src/orcapod/contexts/data/v0.1.json +++ b/src/orcapod/contexts/data/v0.1.json @@ -6,13 +6,13 @@ "_class": "orcapod.semantic_types.semantic_registry.SemanticTypeRegistry", "_config": { "converters": { - "path": { - "_class": "orcapod.semantic_types.semantic_struct_converters.PathStructConverter", - "_config": {} - }, "upath": { "_class": "orcapod.semantic_types.semantic_struct_converters.UPathStructConverter", "_config": {} + }, + "path": { + "_class": "orcapod.semantic_types.semantic_struct_converters.PathStructConverter", + "_config": {} } } } From da4c080c0230a0b5610b4225f205ba6df35c0bad Mon Sep 17 00:00:00 2001 From: Brian Arnold Date: Wed, 7 Jan 2026 02:20:32 +0000 Subject: [PATCH 5/7] Refactor converters to avoid duplicate code --- .../semantic_struct_converters.py | 202 +++++------------- 1 file changed, 48 insertions(+), 154 deletions(-) diff --git a/src/orcapod/semantic_types/semantic_struct_converters.py b/src/orcapod/semantic_types/semantic_struct_converters.py index c046e4a..8122eb0 100644 --- a/src/orcapod/semantic_types/semantic_struct_converters.py +++ b/src/orcapod/semantic_types/semantic_struct_converters.py @@ -5,8 +5,10 @@ making semantic types visible in schemas and preserved through operations. """ +from abc import ABC from typing import Any, TYPE_CHECKING from pathlib import Path +from upath import UPath from orcapod.utils.lazy_module import LazyModule if TYPE_CHECKING: @@ -68,171 +70,55 @@ def _compute_content_hash(self, content: bytes) -> bytes: return hashlib.sha256(content).digest() - -# Path-specific implementation -class PathStructConverter(SemanticStructConverterBase): - """Converter for pathlib.Path objects to/from semantic structs.""" - - def __init__(self): - super().__init__("path") - self._python_type = Path - - # Define the Arrow struct type for paths - self._arrow_struct_type = pa.struct( - [ - pa.field("path", pa.large_string()), - ] - ) - - @property - def python_type(self) -> type: - return self._python_type - - @property - def arrow_struct_type(self) -> pa.StructType: - return self._arrow_struct_type - - def python_to_struct_dict(self, value: Path) -> dict[str, Any]: - """Convert Path to struct dictionary.""" - if not isinstance(value, Path): - raise TypeError(f"Expected Path, got {type(value)}") - - return { - "path": str(value), - } - - def struct_dict_to_python(self, struct_dict: dict[str, Any]) -> Path: - """Convert struct dictionary back to Path.""" - path_str = struct_dict.get("path") - if path_str is None: - raise ValueError("Missing 'path' field in struct") - - return Path(path_str) - - def can_handle_python_type(self, python_type: type) -> bool: - """Check if this converter can handle the given Python type.""" - return issubclass(python_type, Path) - - def can_handle_struct_type(self, struct_type: pa.StructType) -> bool: - """Check if this converter can handle the given struct type.""" - # Check if struct has the expected fields - field_names = [field.name for field in struct_type] - expected_fields = {"path"} - - if set(field_names) != expected_fields: - return False - - # Check field types - field_types = {field.name: field.type for field in struct_type} - - return field_types["path"] == pa.large_string() - - def is_semantic_struct(self, struct_dict: dict[str, Any]) -> bool: - """Check if a struct dictionary represents this semantic type.""" - return set(struct_dict.keys()) == {"path"} and isinstance( - struct_dict["path"], str - ) - - def hash_struct_dict( - self, struct_dict: dict[str, Any], add_prefix: bool = False - ) -> str: - """ - Compute hash of the file content pointed to by the path. - - Args: - struct_dict: Arrow struct dictionary with 'path' field - add_prefix: If True, prefix with semantic type and algorithm info - - Returns: - Hash string of the file content, optionally prefixed - - Raises: - FileNotFoundError: If the file doesn't exist - PermissionError: If the file can't be read - OSError: For other file system errors - """ - path_str = struct_dict.get("path") - if path_str is None: - raise ValueError("Missing 'path' field in struct") - - path = Path(path_str) - - try: - # TODO: replace with FileHasher implementation - # Read file content and compute hash - content = path.read_bytes() - hash_bytes = self._compute_content_hash(content) - return self._format_hash_string(hash_bytes, add_prefix) - - except FileNotFoundError: - raise FileNotFoundError(f"File not found: {path}") - except PermissionError: - raise PermissionError(f"Permission denied reading file: {path}") - except IsADirectoryError: - raise ValueError(f"Path is a directory, not a file: {path}") - except OSError as e: - raise OSError(f"Error reading file {path}: {e}") - - -from upath import UPath - -class UPathStructConverter(SemanticStructConverterBase): - """Converter for universal_pathlib.UPath objects to/from semantic structs.""" +class FilePathStructConverterBase(SemanticStructConverterBase, ABC): + """Base converter for file path types Path and UPath, since they have similar APIs.""" - def __init__(self): - super().__init__("upath") - self._python_type = UPath - # Define the Arrow struct type for upaths - self._arrow_struct_type = pa.struct( - [ - pa.field("upath", pa.large_string()), - ] - ) + def __init__(self, name: str, path_type: type): + super().__init__(name) + self._python_type = path_type + self._field_name = name + self._arrow_struct_type = pa.struct([ + pa.field(name, pa.large_string()), + ]) @property def python_type(self) -> type: return self._python_type @property - def arrow_struct_type(self) -> pa.StructType: + def arrow_struct_type(self) -> "pa.StructType": return self._arrow_struct_type - def python_to_struct_dict(self, value: UPath) -> dict[str, Any]: - """Convert UPath to struct dictionary.""" - if not isinstance(value, UPath): - raise TypeError(f"Expected UPath, got {type(value)}") - return { - "upath": str(value), - } + def python_to_struct_dict(self, value: Any) -> dict[str, Any]: + """Convert path object to struct dictionary.""" + if not isinstance(value, self._python_type): + raise TypeError(f"Expected {self._python_type.__name__}, got {type(value)}") + return {self._field_name: str(value)} - def struct_dict_to_python(self, struct_dict: dict[str, Any]) -> UPath: - """Convert struct dictionary back to UPath.""" - path_str = struct_dict.get("upath") + def struct_dict_to_python(self, struct_dict: dict[str, Any]) -> Any: + """Convert struct dictionary back to path object.""" + path_str = struct_dict.get(self._field_name) if path_str is None: - raise ValueError("Missing 'upath' field in struct") - return UPath(path_str) + raise ValueError(f"Missing '{self._field_name}' field in struct") + return self._python_type(path_str) def can_handle_python_type(self, python_type: type) -> bool: """Check if this converter can handle the given Python type.""" - return issubclass(python_type, UPath) + return issubclass(python_type, self._python_type) - def can_handle_struct_type(self, struct_type: pa.StructType) -> bool: + def can_handle_struct_type(self, struct_type: "pa.StructType") -> bool: """Check if this converter can handle the given struct type.""" - # Check if struct has the expected fields field_names = [field.name for field in struct_type] - expected_fields = {"upath"} - - if set(field_names) != expected_fields: + if set(field_names) != {self._field_name}: return False - - # Check field types field_types = {field.name: field.type for field in struct_type} - return field_types["upath"] == pa.large_string() + return field_types[self._field_name] == pa.large_string() def is_semantic_struct(self, struct_dict: dict[str, Any]) -> bool: """Check if a struct dictionary represents this semantic type.""" - return set(struct_dict.keys()) == {"upath"} and isinstance( - struct_dict["upath"], str + return ( + set(struct_dict.keys()) == {self._field_name} + and isinstance(struct_dict[self._field_name], str) ) def hash_struct_dict( @@ -242,25 +128,19 @@ def hash_struct_dict( Compute hash of the file content pointed to by the path. Args: - struct_dict: Arrow struct dictionary with 'upath' field + struct_dict: Arrow struct dictionary with path field add_prefix: If True, prefix with semantic type and algorithm info Returns: Hash string of the file content, optionally prefixed - - Raises: - FileNotFoundError: If the file doesn't exist - PermissionError: If the file can't be read - OSError: For other file system errors """ - path_str = struct_dict.get("upath") + path_str = struct_dict.get(self._field_name) if path_str is None: - raise ValueError("Missing 'upath' field in struct") + raise ValueError(f"Missing '{self._field_name}' field in struct") - path = UPath(path_str) + path = self._python_type(path_str) try: - # Read file content and compute hash content = path.read_bytes() hash_bytes = self._compute_content_hash(content) return self._format_hash_string(hash_bytes, add_prefix) @@ -271,4 +151,18 @@ def hash_struct_dict( except IsADirectoryError: raise ValueError(f"Path is a directory, not a file: {path}") except OSError as e: - raise OSError(f"Error reading file {path}: {e}") \ No newline at end of file + raise OSError(f"Error reading file {path}: {e}") + + +class PathStructConverter(FilePathStructConverterBase): + """Converter for pathlib.Path objects to/from semantic structs.""" + + def __init__(self): + super().__init__("path", Path) + + +class UPathStructConverter(FilePathStructConverterBase): + """Converter for universal_pathlib.UPath objects to/from semantic structs.""" + + def __init__(self): + super().__init__("upath", UPath) \ No newline at end of file From 115e5a280f9d96f0cf59406635db333c2a74963d Mon Sep 17 00:00:00 2001 From: Brian Arnold Date: Tue, 13 Jan 2026 22:43:30 +0000 Subject: [PATCH 6/7] Handle Optional[T] unions --- src/orcapod/semantic_types/universal_converter.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/orcapod/semantic_types/universal_converter.py b/src/orcapod/semantic_types/universal_converter.py index c3ba97e..be8ac62 100644 --- a/src/orcapod/semantic_types/universal_converter.py +++ b/src/orcapod/semantic_types/universal_converter.py @@ -662,6 +662,19 @@ def _create_python_to_arrow_converter( f"f{i}": converters[i](item) for i, item in enumerate(value) } + # Handle Optional[T] unions; complex unions (e.g., A | B) are not currently supported + elif origin is typing.Union or origin is types.UnionType: + non_none_types = [t for t in args if t is not type(None)] + if len(non_none_types) == 1: + # Optional[T] - use converter for T, pass through None + inner_converter = self.get_python_to_arrow_converter(non_none_types[0]) + return lambda value: inner_converter(value) if value is not None else None + else: + raise ValueError( + f"Complex unions with multiple non-None types are not supported: {python_type}. " + f"Only Optional[T] (i.e., T | None) is allowed." + ) + else: # Default passthrough return lambda value: value From 35408534587cd0af0f41d3f68f6ca050701dbf25 Mon Sep 17 00:00:00 2001 From: Brian Arnold Date: Tue, 13 Jan 2026 22:44:29 +0000 Subject: [PATCH 7/7] Add tests for upath type --- pyproject.toml | 1 + .../test_universal_converter.py | 32 +++++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index eb38aba..e4197e7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,6 +19,7 @@ dependencies = [ "deltalake>=1.0.2", "graphviz>=0.21", "gitpython>=3.1.45", + "universal-pathlib>=0.3.8", ] readme = "README.md" requires-python = ">=3.11.0" diff --git a/tests/test_semantic_types/test_universal_converter.py b/tests/test_semantic_types/test_universal_converter.py index 375a119..62c1a29 100644 --- a/tests/test_semantic_types/test_universal_converter.py +++ b/tests/test_semantic_types/test_universal_converter.py @@ -30,6 +30,38 @@ def test_python_type_to_arrow_type_custom(): assert field.name == "path" assert field.type == pa.large_string() +def test_python_type_to_arrow_type_upath(): + from upath import UPath + arrow_type = universal_converter.python_type_to_arrow_type(UPath) + # Should be a StructType with field 'upath' of type large_string + assert isinstance(arrow_type, pa.StructType) + assert len(arrow_type) == 1 + field = arrow_type[0] + assert field.name == "upath" + assert field.type == pa.large_string() + +def test_optional_upath_converter(): + """Test that Optional[UPath] correctly converts UPath values.""" + from upath import UPath + + to_arrow, to_python = universal_converter.get_conversion_functions(UPath | None) + + # Test with UPath value + path = UPath("/tmp/test.txt") + result = to_arrow(path) + assert result == {"upath": "/tmp/test.txt"} + + # Test with None + assert to_arrow(None) is None + + +def test_complex_union_raises_error(): + """Test that complex unions (multiple non-None types) raise ValueError.""" + from upath import UPath + from pathlib import Path + + with pytest.raises(ValueError, match="Complex unions"): + universal_converter.get_conversion_functions(UPath | Path) def test_python_type_to_arrow_type_context(): ctx = get_default_context()