From 1e899de01b195d4ce7765a002dd6f51c71cff812 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 19 Nov 2025 14:08:43 +0000 Subject: [PATCH 1/4] Initial plan From b65fb48f2307d4e9c8629b1d0ffbed88446d6e7d Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 19 Nov 2025 14:15:47 +0000 Subject: [PATCH 2/4] Add Py_AASequence wrapper class with reverse and shuffle operations Co-authored-by: timosachsenberg <5803621+timosachsenberg@users.noreply.github.com> --- openms_python/__init__.py | 2 + openms_python/py_aasequence.py | 279 +++++++++++++++++++++++++++++++++ tests/test_py_aasequence.py | 260 ++++++++++++++++++++++++++++++ 3 files changed, 541 insertions(+) create mode 100644 openms_python/py_aasequence.py create mode 100644 tests/test_py_aasequence.py diff --git a/openms_python/__init__.py b/openms_python/__init__.py index 9c49e04..28f5a15 100644 --- a/openms_python/__init__.py +++ b/openms_python/__init__.py @@ -29,6 +29,7 @@ from .py_featuremap import Py_FeatureMap from .py_consensusmap import Py_ConsensusMap from .py_experimentaldesign import Py_ExperimentalDesign +from .py_aasequence import Py_AASequence from .py_identifications import ( ProteinIdentifications, PeptideIdentifications, @@ -107,6 +108,7 @@ def get_example(name: str, *, load: bool = False, target_dir: Union[str, Path, N "Py_FeatureMap", "Py_ConsensusMap", "Py_ExperimentalDesign", + "Py_AASequence", "ProteinIdentifications", "PeptideIdentifications", "Identifications", diff --git a/openms_python/py_aasequence.py b/openms_python/py_aasequence.py new file mode 100644 index 0000000..df290d5 --- /dev/null +++ b/openms_python/py_aasequence.py @@ -0,0 +1,279 @@ +"""Pythonic wrapper for pyOpenMS AASequence class.""" + +from __future__ import annotations + +from typing import Optional +import pyopenms as oms + + +class Py_AASequence: + """ + A Pythonic wrapper around pyOpenMS AASequence. + + This class provides intuitive properties and methods for working with + amino acid sequences, including common operations like reversing and + shuffling sequences with optional enzyme constraints. + + Example: + >>> seq = Py_AASequence.from_string("PEPTIDE") + >>> print(seq.sequence) + PEPTIDE + >>> print(seq.mono_weight) + 799.36... + >>> reversed_seq = seq.reverse() + >>> print(reversed_seq.sequence) + EDITPEP + >>> shuffled_seq = seq.shuffle(enzyme="Trypsin") + >>> print(shuffled_seq.sequence) # Shuffled while preserving cleavage sites + """ + + def __init__(self, native_sequence: Optional[oms.AASequence] = None): + """ + Initialize Py_AASequence wrapper. + + Args: + native_sequence: pyOpenMS AASequence object. If None, creates empty sequence. + """ + self._sequence = native_sequence if native_sequence is not None else oms.AASequence() + self._decoy_generator = None + + @classmethod + def from_string(cls, sequence_str: str) -> Py_AASequence: + """ + Create AASequence from string representation. + + Args: + sequence_str: String representation of the amino acid sequence. + Can include modifications in OpenMS format. + + Returns: + Py_AASequence: New wrapped sequence. + + Example: + >>> seq = Py_AASequence.from_string("PEPTIDE") + >>> seq = Py_AASequence.from_string("PEPTIDEM(Oxidation)") + """ + return cls(oms.AASequence.fromString(sequence_str)) + + # ==================== Pythonic Properties ==================== + + @property + def native(self) -> oms.AASequence: + """Return the underlying pyOpenMS AASequence.""" + return self._sequence + + @property + def sequence(self) -> str: + """Get the sequence as a string.""" + return self._sequence.toString() + + @property + def unmodified_sequence(self) -> str: + """Get the sequence without modifications.""" + return self._sequence.toUnmodifiedString() + + @property + def mono_weight(self) -> float: + """Get monoisotopic weight.""" + return self._sequence.getMonoWeight() + + @property + def average_weight(self) -> float: + """Get average weight.""" + return self._sequence.getAverageWeight() + + @property + def formula(self) -> str: + """Get molecular formula.""" + return self._sequence.getFormula().toString() + + @property + def is_modified(self) -> bool: + """Check if sequence has any modifications.""" + return self._sequence.isModified() + + @property + def has_n_terminal_modification(self) -> bool: + """Check if sequence has N-terminal modification.""" + return self._sequence.hasNTerminalModification() + + @property + def has_c_terminal_modification(self) -> bool: + """Check if sequence has C-terminal modification.""" + return self._sequence.hasCTerminalModification() + + # ==================== Decoy Generation ==================== + + def _get_decoy_generator(self) -> oms.DecoyGenerator: + """Get or create DecoyGenerator instance (lazy initialization).""" + if self._decoy_generator is None: + self._decoy_generator = oms.DecoyGenerator() + return self._decoy_generator + + def reverse(self) -> Py_AASequence: + """ + Reverse the entire amino acid sequence. + + Returns: + Py_AASequence: New sequence with reversed amino acids. + + Example: + >>> seq = Py_AASequence.from_string("PEPTIDE") + >>> reversed_seq = seq.reverse() + >>> print(reversed_seq.sequence) + EDITPEP + """ + dg = self._get_decoy_generator() + reversed_native = dg.reverseProtein(self._sequence) + return Py_AASequence(reversed_native) + + def reverse_with_enzyme(self, enzyme: str = "Trypsin") -> Py_AASequence: + """ + Reverse peptide sequences between enzymatic cleavage sites. + + This is useful for creating decoy sequences that maintain the + same enzymatic cleavage pattern as the target. + + Args: + enzyme: Name of the enzyme (e.g., "Trypsin", "Lys-C", "Asp-N"). + Default is "Trypsin". + + Returns: + Py_AASequence: New sequence with reversed peptides between cleavage sites. + + Example: + >>> seq = Py_AASequence.from_string("PEPTIDERK") + >>> reversed_seq = seq.reverse_with_enzyme("Trypsin") + >>> # K and R are cleavage sites, so segments are reversed separately + """ + dg = self._get_decoy_generator() + reversed_native = dg.reversePeptides(self._sequence, enzyme) + return Py_AASequence(reversed_native) + + def shuffle( + self, enzyme: str = "Trypsin", max_attempts: int = 100, seed: Optional[int] = None + ) -> Py_AASequence: + """ + Shuffle peptide sequences between enzymatic cleavage sites. + + This creates a decoy sequence by shuffling amino acids within + peptide segments defined by enzyme cleavage sites, attempting + to minimize sequence identity with the original. + + Args: + enzyme: Name of the enzyme (e.g., "Trypsin", "Lys-C", "Asp-N"). + Default is "Trypsin". + max_attempts: Maximum number of shuffle attempts to minimize + sequence identity. Default is 100. + seed: Optional random seed for reproducible shuffling. + + Returns: + Py_AASequence: New shuffled sequence. + + Example: + >>> seq = Py_AASequence.from_string("PEPTIDERK") + >>> shuffled_seq = seq.shuffle(enzyme="Trypsin", seed=42) + >>> # Amino acids are shuffled within enzyme-defined segments + """ + dg = self._get_decoy_generator() + if seed is not None: + dg.setSeed(seed) + shuffled_native = dg.shufflePeptides(self._sequence, enzyme, max_attempts) + return Py_AASequence(shuffled_native) + + # ==================== Sequence Operations ==================== + + def __len__(self) -> int: + """Get sequence length.""" + return self._sequence.size() + + def __str__(self) -> str: + """String representation.""" + return self.sequence + + def __repr__(self) -> str: + """Developer-friendly representation.""" + seq_str = self.sequence + if len(seq_str) > 20: + seq_str = seq_str[:17] + "..." + return f"Py_AASequence('{seq_str}')" + + def __eq__(self, other: object) -> bool: + """Check equality based on sequence string.""" + if not isinstance(other, Py_AASequence): + return False + return self.sequence == other.sequence + + def __getitem__(self, index: int) -> str: + """ + Get residue at position. + + Args: + index: Position in the sequence (0-based). + + Returns: + str: Single letter amino acid code. + """ + if index < 0 or index >= len(self): + raise IndexError(f"Index {index} out of range for sequence of length {len(self)}") + residue = self._sequence.getResidue(index) + return residue.getOneLetterCode() + + def __iter__(self): + """Iterate over residues.""" + for i in range(len(self)): + yield self[i] + + # ==================== Additional Utilities ==================== + + def get_mz(self, charge: int) -> float: + """ + Get m/z value for given charge state. + + Args: + charge: Charge state (must be > 0). + + Returns: + float: m/z value. + + Example: + >>> seq = Py_AASequence.from_string("PEPTIDE") + >>> mz = seq.get_mz(2) # doubly charged + """ + return self._sequence.getMZ(charge) + + def has_substring(self, substring: str) -> bool: + """ + Check if sequence contains a substring. + + Args: + substring: Amino acid sequence to search for. + + Returns: + bool: True if substring is present. + """ + return self._sequence.hasSubsequence(oms.AASequence.fromString(substring)) + + def has_prefix(self, prefix: str) -> bool: + """ + Check if sequence starts with a prefix. + + Args: + prefix: Amino acid sequence to check. + + Returns: + bool: True if sequence starts with prefix. + """ + return self._sequence.hasPrefix(oms.AASequence.fromString(prefix)) + + def has_suffix(self, suffix: str) -> bool: + """ + Check if sequence ends with a suffix. + + Args: + suffix: Amino acid sequence to check. + + Returns: + bool: True if sequence ends with suffix. + """ + return self._sequence.hasSuffix(oms.AASequence.fromString(suffix)) diff --git a/tests/test_py_aasequence.py b/tests/test_py_aasequence.py new file mode 100644 index 0000000..4f56f01 --- /dev/null +++ b/tests/test_py_aasequence.py @@ -0,0 +1,260 @@ +"""Tests for Py_AASequence wrapper.""" + +from __future__ import annotations + +import pytest +import pyopenms as oms + +from openms_python.py_aasequence import Py_AASequence + + +def test_py_aasequence_from_string(): + """Test creating sequence from string.""" + seq = Py_AASequence.from_string("PEPTIDE") + assert seq.sequence == "PEPTIDE" + assert len(seq) == 7 + + +def test_py_aasequence_empty(): + """Test creating empty sequence.""" + seq = Py_AASequence() + assert len(seq) == 0 + assert seq.sequence == "" + + +def test_py_aasequence_properties(): + """Test basic properties.""" + seq = Py_AASequence.from_string("PEPTIDE") + + # Basic properties + assert seq.sequence == "PEPTIDE" + assert seq.unmodified_sequence == "PEPTIDE" + assert len(seq) == 7 + + # Weight properties + assert seq.mono_weight > 0 + assert seq.average_weight > 0 + assert seq.mono_weight != seq.average_weight + + # Formula + assert "C" in seq.formula + assert "H" in seq.formula + assert "N" in seq.formula + assert "O" in seq.formula + + # Modification status + assert not seq.is_modified + assert not seq.has_n_terminal_modification + assert not seq.has_c_terminal_modification + + +def test_py_aasequence_modified_sequence(): + """Test sequence with modifications.""" + # Create a modified sequence + native_seq = oms.AASequence.fromString("PEPTIDEM(Oxidation)") + seq = Py_AASequence(native_seq) + + assert seq.is_modified + assert "M(Oxidation)" in seq.sequence + assert seq.unmodified_sequence == "PEPTIDEM" + + +def test_py_aasequence_reverse(): + """Test reverse operation.""" + seq = Py_AASequence.from_string("PEPTIDE") + reversed_seq = seq.reverse() + + assert reversed_seq.sequence == "EDITPEP" + assert len(reversed_seq) == len(seq) + # Original should be unchanged + assert seq.sequence == "PEPTIDE" + + +def test_py_aasequence_reverse_with_enzyme(): + """Test reverse with enzyme constraint.""" + # Trypsin cleaves after K and R + seq = Py_AASequence.from_string("PEPTIDERK") + reversed_seq = seq.reverse_with_enzyme("Trypsin") + + # The sequence should be reversed in segments + assert len(reversed_seq) == len(seq) + # Original should be unchanged + assert seq.sequence == "PEPTIDERK" + # Reversed sequence should be different + assert reversed_seq.sequence != seq.sequence + + +def test_py_aasequence_shuffle(): + """Test shuffle operation.""" + seq = Py_AASequence.from_string("PEPTIDERK") + + # Shuffle with a seed for reproducibility + shuffled1 = seq.shuffle(enzyme="Trypsin", seed=42) + shuffled2 = seq.shuffle(enzyme="Trypsin", seed=42) + + # Same seed should give same result + assert shuffled1.sequence == shuffled2.sequence + + # Different seed should (usually) give different result + shuffled3 = seq.shuffle(enzyme="Trypsin", seed=123) + # Can't guarantee they're different due to randomness, but length should match + assert len(shuffled3) == len(seq) + + # Original should be unchanged + assert seq.sequence == "PEPTIDERK" + + +def test_py_aasequence_shuffle_without_seed(): + """Test shuffle without explicit seed.""" + seq = Py_AASequence.from_string("PEPTIDERK") + shuffled = seq.shuffle(enzyme="Trypsin") + + # Should create a valid sequence of same length + assert len(shuffled) == len(seq) + + +def test_py_aasequence_iteration(): + """Test iterating over residues.""" + seq = Py_AASequence.from_string("PEPTIDE") + residues = list(seq) + + assert residues == ["P", "E", "P", "T", "I", "D", "E"] + assert len(residues) == 7 + + +def test_py_aasequence_indexing(): + """Test indexing into sequence.""" + seq = Py_AASequence.from_string("PEPTIDE") + + assert seq[0] == "P" + assert seq[1] == "E" + assert seq[6] == "E" + + # Test out of bounds + with pytest.raises(IndexError): + _ = seq[7] + + with pytest.raises(IndexError): + _ = seq[-1] + + +def test_py_aasequence_string_representation(): + """Test string representations.""" + seq = Py_AASequence.from_string("PEPTIDE") + + assert str(seq) == "PEPTIDE" + assert repr(seq) == "Py_AASequence('PEPTIDE')" + + # Test long sequence truncation in repr + long_seq = Py_AASequence.from_string("PEPTIDEPEPTIDEPEPTIDEPEPTIDE") + assert "..." in repr(long_seq) + + +def test_py_aasequence_equality(): + """Test equality comparison.""" + seq1 = Py_AASequence.from_string("PEPTIDE") + seq2 = Py_AASequence.from_string("PEPTIDE") + seq3 = Py_AASequence.from_string("DIFFERENT") + + assert seq1 == seq2 + assert seq1 != seq3 + assert seq1 != "PEPTIDE" # Different type + + +def test_py_aasequence_get_mz(): + """Test m/z calculation.""" + seq = Py_AASequence.from_string("PEPTIDE") + + # Get m/z for different charge states + mz1 = seq.get_mz(1) + mz2 = seq.get_mz(2) + mz3 = seq.get_mz(3) + + # Higher charge should give lower m/z + assert mz1 > mz2 > mz3 + assert mz1 > 0 + assert mz2 > 0 + assert mz3 > 0 + + +def test_py_aasequence_substring_operations(): + """Test substring checking.""" + seq = Py_AASequence.from_string("PEPTIDERK") + + # Test substring + assert seq.has_substring("TIDE") + assert seq.has_substring("PEPT") + assert not seq.has_substring("XXX") + + # Test prefix + assert seq.has_prefix("PEP") + assert seq.has_prefix("PEPTIDE") + assert not seq.has_prefix("TIDE") + + # Test suffix + assert seq.has_suffix("RK") + assert seq.has_suffix("DERK") + assert not seq.has_suffix("PEP") + + +def test_py_aasequence_native_access(): + """Test access to native pyOpenMS object.""" + seq = Py_AASequence.from_string("PEPTIDE") + native = seq.native + + assert isinstance(native, oms.AASequence) + assert native.toString() == "PEPTIDE" + + +def test_py_aasequence_reverse_removes_modifications(): + """Test that reverse operation removes modifications (expected behavior).""" + # Create a sequence with modification + native_seq = oms.AASequence.fromString("PEPTIDEM(Oxidation)") + seq = Py_AASequence(native_seq) + + assert seq.is_modified + + reversed_seq = seq.reverse() + + # DecoyGenerator removes modifications (this is expected behavior in OpenMS) + assert not reversed_seq.is_modified + # Original should be unchanged + assert seq.is_modified + # Reversed should contain the same amino acids (unmodified) + assert "M" in reversed_seq.sequence + assert len(reversed_seq) == len(seq) + + +def test_py_aasequence_different_enzymes(): + """Test different enzyme options.""" + seq = Py_AASequence.from_string("PEPTIDERK") + + # Test with Trypsin + trypsin_rev = seq.reverse_with_enzyme("Trypsin") + assert len(trypsin_rev) == len(seq) + + # Test with different enzyme (no cleavage) + no_cleavage_rev = seq.reverse_with_enzyme("no cleavage") + assert len(no_cleavage_rev) == len(seq) + + +def test_py_aasequence_shuffle_max_attempts(): + """Test shuffle with different max_attempts.""" + seq = Py_AASequence.from_string("PEPTIDERK") + + # Different max_attempts should still work + shuffled1 = seq.shuffle(enzyme="Trypsin", max_attempts=10, seed=42) + shuffled2 = seq.shuffle(enzyme="Trypsin", max_attempts=1000, seed=42) + + # Both should produce valid sequences + assert len(shuffled1) == len(seq) + assert len(shuffled2) == len(seq) + + +def test_py_aasequence_with_native_aasequence(): + """Test wrapping an existing pyOpenMS AASequence.""" + native = oms.AASequence.fromString("PEPTIDE") + seq = Py_AASequence(native) + + assert seq.sequence == "PEPTIDE" + assert seq.native is native From c081b92b751769c1b203158055ec611fae771f38 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 19 Nov 2025 14:18:21 +0000 Subject: [PATCH 3/4] Complete Py_AASequence implementation with tests and security checks Co-authored-by: timosachsenberg <5803621+timosachsenberg@users.noreply.github.com> --- openms_python/py_aasequence.py | 1 + 1 file changed, 1 insertion(+) diff --git a/openms_python/py_aasequence.py b/openms_python/py_aasequence.py index df290d5..34fcf9d 100644 --- a/openms_python/py_aasequence.py +++ b/openms_python/py_aasequence.py @@ -277,3 +277,4 @@ def has_suffix(self, suffix: str) -> bool: bool: True if sequence ends with suffix. """ return self._sequence.hasSuffix(oms.AASequence.fromString(suffix)) + From ff18081ae94ed04e3bbe5110acabf5b1e135763d Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 19 Nov 2025 14:24:42 +0000 Subject: [PATCH 4/4] Add Py_AASequence documentation to README.md Co-authored-by: timosachsenberg <5803621+timosachsenberg@users.noreply.github.com> --- README.md | 99 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 99 insertions(+) diff --git a/README.md b/README.md index eef1aa2..7a0ff5c 100644 --- a/README.md +++ b/README.md @@ -267,6 +267,78 @@ for key in feature: print(key, feature[key]) ``` +### Working with Amino Acid Sequences + +The `Py_AASequence` wrapper provides a Pythonic interface to amino acid sequences with support for common operations like sequence reversal and shuffling for decoy generation. All operations delegate to pyOpenMS functionality to minimize reimplementation. + +```python +from openms_python import Py_AASequence + +# Create a sequence from string +seq = Py_AASequence.from_string("PEPTIDERK") + +# Access properties +print(f"Sequence: {seq.sequence}") # PEPTIDERK +print(f"Length: {len(seq)}") # 9 +print(f"Mono weight: {seq.mono_weight:.2f} Da") # 1083.56 Da +print(f"Formula: {seq.formula}") # C46H77N13O17 + +# Iterate over amino acids +for aa in seq: + print(aa) # P, E, P, T, I, D, E, R, K + +# Generate decoy sequences +reversed_seq = seq.reverse() +print(reversed_seq.sequence) # KREDITPEP + +# Reverse with enzyme constraint (preserves cleavage sites) +reversed_enzyme = seq.reverse_with_enzyme("Trypsin") +print(reversed_enzyme.sequence) # EDITPEPRK + +# Shuffle with reproducible seed +shuffled = seq.shuffle(enzyme="Trypsin", seed=42) +print(shuffled.sequence) # IPEDTEPRK (same with seed=42) + +# Calculate m/z for different charge states +mz1 = seq.get_mz(1) # 1084.56 +mz2 = seq.get_mz(2) # 542.79 +mz3 = seq.get_mz(3) # 362.19 + +# Query sequence content +has_tide = seq.has_substring("TIDE") # True +starts_pep = seq.has_prefix("PEP") # True +ends_rk = seq.has_suffix("RK") # True + +# Access individual residues +first_aa = seq[0] # "P" + +# Work with modified sequences +mod_seq = Py_AASequence.from_string("PEPTIDEM(Oxidation)K") +print(f"Is modified: {mod_seq.is_modified}") # True +print(f"Unmodified: {mod_seq.unmodified_sequence}") # PEPTIDEMK +``` + +**Properties:** +- `sequence`: Full sequence string with modifications +- `unmodified_sequence`: Sequence without modifications +- `mono_weight`: Monoisotopic weight in Da +- `average_weight`: Average weight in Da +- `formula`: Molecular formula +- `is_modified`: Whether sequence has modifications +- `has_n_terminal_modification`: N-terminal modification status +- `has_c_terminal_modification`: C-terminal modification status +- `native`: Access to underlying pyOpenMS AASequence + +**Methods:** +- `from_string(sequence_str)`: Create from string (class method) +- `reverse()`: Reverse entire sequence +- `reverse_with_enzyme(enzyme)`: Reverse peptides between cleavage sites +- `shuffle(enzyme, max_attempts, seed)`: Shuffle with enzyme constraints +- `get_mz(charge)`: Calculate m/z for charge state +- `has_substring(substring)`: Check for substring +- `has_prefix(prefix)`: Check for prefix +- `has_suffix(suffix)`: Check for suffix + ### Working with Spectra ```python @@ -866,6 +938,29 @@ plt.show() - `normalize_intensity(max_value)`: Normalize intensities to max value - `normalize_to_tic()`: Normalize so intensities sum to 1.0 +### Py_AASequence + +**Properties:** +- `sequence`: Full sequence string with modifications +- `unmodified_sequence`: Sequence without modifications +- `mono_weight`: Monoisotopic weight in Da +- `average_weight`: Average weight in Da +- `formula`: Molecular formula +- `is_modified`: Whether sequence has modifications +- `has_n_terminal_modification`: N-terminal modification status +- `has_c_terminal_modification`: C-terminal modification status +- `native`: Access to underlying pyOpenMS AASequence + +**Methods:** +- `from_string(sequence_str)`: Create from string (class method) +- `reverse()`: Reverse entire sequence +- `reverse_with_enzyme(enzyme)`: Reverse peptides between enzymatic cleavage sites +- `shuffle(enzyme, max_attempts, seed)`: Shuffle peptides with enzyme constraints +- `get_mz(charge)`: Calculate m/z for given charge state +- `has_substring(substring)`: Check if sequence contains substring +- `has_prefix(prefix)`: Check if sequence starts with prefix +- `has_suffix(suffix)`: Check if sequence ends with suffix + ### Py_MSSpectrum **Properties:** @@ -954,6 +1049,10 @@ pip install -e ".[dev]" | Iterate chromatograms | Manual loop + range check | `for chrom in exp.chromatograms():` | | Peak data | `peaks = spec.get_peaks(); mz = peaks[0]` | `mz, intensity = spec.peaks` | | DataFrame | Not available | `df = exp.to_dataframe()` | +| Create sequence | `oms.AASequence.fromString("PEP")` | `Py_AASequence.from_string("PEP")` | +| Get sequence weight | `seq.getMonoWeight()` | `seq.mono_weight` | +| Reverse sequence | `DecoyGenerator().reverseProtein(seq)` | `seq.reverse()` | +| Iterate residues | Manual loop with `getResidue(i)` | `for aa in seq:` | ## Contributing