diff --git a/openms_python/py_aasequence.py b/openms_python/py_aasequence.py index 34fcf9d..a6cc4b2 100644 --- a/openms_python/py_aasequence.py +++ b/openms_python/py_aasequence.py @@ -2,13 +2,14 @@ from __future__ import annotations -from typing import Optional +from typing import Optional, Literal import pyopenms as oms +import warnings class Py_AASequence: """ - A Pythonic wrapper around pyOpenMS AASequence. + A Pythonic, immutable wrapper around pyOpenMS AASequence. This class provides intuitive properties and methods for working with amino acid sequences, including common operations like reversing and @@ -40,7 +41,7 @@ def __init__(self, native_sequence: Optional[oms.AASequence] = None): @classmethod def from_string(cls, sequence_str: str) -> Py_AASequence: """ - Create AASequence from string representation. + Create Py_AASequence from string representation. Args: sequence_str: String representation of the amino acid sequence. @@ -57,6 +58,20 @@ def from_string(cls, sequence_str: str) -> Py_AASequence: # ==================== Pythonic Properties ==================== + @classmethod + def from_native(cls, native_sequence: oms.AASequence) -> Py_AASequence: + """ + Creates Py_AASequence from native pyOpenMS AASequence. + + Args: + native_sequence (oms.AASequence): + + Returns: + Py_AASequence: New wrapped object + + """ + return cls(native_sequence) + @property def native(self) -> oms.AASequence: """Return the underlying pyOpenMS AASequence.""" @@ -204,26 +219,126 @@ def __eq__(self, other: object) -> bool: return False return self.sequence == other.sequence - def __getitem__(self, index: int) -> str: + def __getitem__(self, index): """ - Get residue at position. + Get residue(s) at position(s). + + Supports both single indexing and slicing, returning Py_AASequence objects. Args: - index: Position in the sequence (0-based). + index: Integer for single residue, or slice object for subsequence. Returns: - str: Single letter amino acid code. 
+ Py_AASequence: Wrapped residue or subsequence. + + Example: + >>> seq = Py_AASequence.from_string("PEPTIDE") + >>> seq[1] # Returns Py_AASequence("E") + >>> seq[1:4] # Returns Py_AASequence("EPT") + >>> seq[-1] # Returns Py_AASequence("E") """ - if index < 0 or index >= len(self): - raise IndexError(f"Index {index} out of range for sequence of length {len(self)}") - residue = self._sequence.getResidue(index) - return residue.getOneLetterCode() + if isinstance(index, slice): + start, stop, step = index.indices(len(self)) + if step != 1: + raise ValueError("Step slicing is not supported for amino acid sequences") + return Py_AASequence.from_native(self._sequence.getSubsequence(start, stop - start)) + else: + # Handle negative indices + if index < 0: + index = len(self) + index + if index >= len(self): + raise IndexError(f"Index {index} out of range for sequence of length {len(self)}") + residue = self._sequence.getSubsequence(index, 1) + return Py_AASequence.from_native(residue) def __iter__(self): """Iterate over residues.""" for i in range(len(self)): yield self[i] + def __add__(self, other: Py_AASequence | str) -> Py_AASequence: + """ + Concatenate sequences. + + Args: + other: Py_AASequence or string to append. + + Returns: + Py_AASequence: New concatenated sequence. + + Example: + >>> seq1 = Py_AASequence.from_string("PEP") + >>> seq2 = Py_AASequence.from_string("TIDE") + >>> combined = seq1 + seq2 + >>> print(combined.sequence) + PEPTIDE + >>> combined2 = seq1 + "TIDE" + >>> print(combined2.sequence) + PEPTIDE + """ + if isinstance(other, Py_AASequence): + combined_str = self.sequence + other.sequence + elif isinstance(other, str): + combined_str = self.sequence + other + else: + return NotImplemented + return Py_AASequence.from_string(combined_str) + + def __radd__(self, other: str) -> Py_AASequence: + """ + Support string + Py_AASequence. 
+ Example: + >>> seq = Py_AASequence.from_string("TIDE") + >>> combined = "PEP" + seq + >>> print(combined.sequence) + PEPTIDE + """ + if isinstance(other, str): + combined_str = other + self.sequence + return Py_AASequence.from_string(combined_str) + return NotImplemented + + def __mul__(self, times: int) -> Py_AASequence: + """ + Repeat sequence. + + Args: + times: Number of times to repeat (must be >= 0). + + Returns: + Py_AASequence: New repeated sequence. + + Example: + >>> seq = Py_AASequence.from_string("PEP") + >>> repeated = seq * 3 + >>> print(repeated.sequence) + PEPPEPPEP + """ + if not isinstance(times, int) or times < 0: + return NotImplemented + return Py_AASequence.from_string(self.sequence * times) + + def __rmul__(self, times: int) -> Py_AASequence: + """Support int * Py_AASequence.""" + return self.__mul__(times) + def __contains__(self, substring: str) -> bool: + """Check if substring is in sequence.""" + return self.has_substring(substring) + + def __hash__(self) -> int: + """Make sequences hashable for use in sets/dicts.""" + return hash(self.sequence) + + def __lt__(self, other: Py_AASequence) -> bool: + """Lexicographic comparison by sequence.""" + if not isinstance(other, Py_AASequence): + return NotImplemented + return self.sequence < other.sequence + def count(self, residue: str) -> int: + """Count occurrences of a residue, consistent with str.count(). Note: modifications are currently not taken into account.""" + warnings.warn("count method does not account for modifications") + return self.unmodified_sequence.count(residue) + # ==================== Additional Utilities ==================== def get_mz(self, charge: int) -> float: @@ -277,4 +392,37 @@ def has_suffix(self, suffix: str) -> bool: bool: True if sequence ends with suffix. 
""" return self._sequence.hasSuffix(oms.AASequence.fromString(suffix)) + + + # ===================== Exporting ======================= + def to_string(self, modified=True, mod_format: Literal['default', 'unimod', 'bracket'] = 'default') -> str: + """ + Get string representation of the sequence. + + Args: + modified (bool): Whether to include modifications in the string. + mod_format (Optional[Literal['default', 'unimod', 'bracket']]): Format for modifications. + 'default' for OpenMS format, + 'unimod' for UniMod format, + 'bracket' for bracket notation. + Default is 'default'. + Returns: + str: Amino acid sequence as string. + + Example: + >>> seq = Py_AASequence.from_string("PEPTIDE") + >>> seq_str = seq.to_string() + """ + if not modified: + return self.unmodified_sequence + + else: + if mod_format == 'default': + return self._sequence.toString() + elif mod_format == 'unimod': + return self._sequence.toUniModString() + elif mod_format == 'bracket': + return self._sequence.toBracketString() + else: + raise ValueError(f"Unsupported mod_format: {mod_format}, supported are 'default', 'unimod' and 'bracket'") diff --git a/tests/test_py_aasequence.py b/tests/test_py_aasequence.py index 4f56f01..93ee370 100644 --- a/tests/test_py_aasequence.py +++ b/tests/test_py_aasequence.py @@ -118,7 +118,7 @@ def test_py_aasequence_iteration(): seq = Py_AASequence.from_string("PEPTIDE") residues = list(seq) - assert residues == ["P", "E", "P", "T", "I", "D", "E"] + assert [res.sequence for res in residues] == ["P", "E", "P", "T", "I", "D", "E"] assert len(residues) == 7 @@ -126,17 +126,14 @@ def test_py_aasequence_indexing(): """Test indexing into sequence.""" seq = Py_AASequence.from_string("PEPTIDE") - assert seq[0] == "P" - assert seq[1] == "E" - assert seq[6] == "E" + assert seq[0].sequence == "P" + assert seq[1].sequence == "E" + assert seq[6].sequence == "E" # Test out of bounds with pytest.raises(IndexError): _ = seq[7] - with pytest.raises(IndexError): - _ = seq[-1] - def 
test_py_aasequence_string_representation(): """Test string representations.""" @@ -258,3 +255,43 @@ def test_py_aasequence_with_native_aasequence(): assert seq.sequence == "PEPTIDE" assert seq.native is native + + +def test_py_aasequence_to_string(): + """Test to_string method with different options.""" + seq = Py_AASequence.from_string("PEPTIDEM(Oxidation)") + + # Default should return modified string in default format + mod_str = seq.to_string() + assert mod_str == "PEPTIDEM(Oxidation)" + + # Unmodified should return unmodified sequence + unmod_str = seq.to_string(modified=False) + assert unmod_str == "PEPTIDEM" + + # Bracket format + bracket_str = seq.to_string(modified=True, mod_format='bracket') + assert bracket_str == "PEPTIDEM[147]" + + # unimod format + unimod_str = seq.to_string(modified=True, mod_format='unimod') + assert unimod_str == "PEPTIDEM(UniMod:35)" + + # Invalid format should raise error + with pytest.raises(ValueError): + _ = seq.to_string(modified=True, mod_format='invalid_format') + + +def test_slicing(): + aa_seq = Py_AASequence.from_string('PEPTIDEM(Oxidation)R') + assert aa_seq[0].sequence == 'P' + assert aa_seq[-1].sequence == 'R' + assert aa_seq[1:4].sequence == 'EPT' + assert aa_seq[-2:].sequence == 'M(Oxidation)R' + +def test_count(): + aa_seq = Py_AASequence.from_string('PEPTIDEM(Oxidation)R') + assert aa_seq.count('E') == 2 + assert aa_seq.count('P') == 2 + assert aa_seq.count('K') == 0 +