Source code for neurosnap.sequence.protein

import re
from collections import Counter
from typing import Dict

from neurosnap.constants.sequence import (
  AA_ALIASES,
  AA_MASS_PROTEIN_AVG,
  AA_RECORDS,
  DEFAULT_PKA,
  AARecord,
  STANDARD_AAs,
)


[docs] def getAA(query: str, *, non_standard: str = "reject") -> AARecord: """Resolve an amino acid identifier to a canonical record. This function accepts either a **1-letter code**, **3-letter abbreviation**, or **full name** (case-insensitive) and returns the corresponding `AARecord`. Parameters ---------- query : str Amino acid identifier (1-letter code, 3-letter CCD abbreviation, or full name). non_standard : {"reject", "convert", "allow"}, optional Policy for handling non-standard amino acids (default: "reject"): - "reject": Raise an error if the amino acid is non-standard. - "convert": Map non-standard amino acids to their closest standard equivalent (e.g., MSE → MET). - "allow": Return the non-standard amino acid unchanged. Returns ------- AARecord A record containing: - `code`: 1-letter code (may be "?" if unavailable for non-standard AAs). - `abr`: 3-letter abbreviation. - `name`: Full amino acid name. - `is_standard`: Whether the residue is one of the 20 canonical amino acids. - `standard_equiv_abr`: 3-letter abbreviation of the standard equivalent (if applicable). Raises ------ ValueError If `query` does not match any supported amino acid identifier. If `non_standard="reject"` and the amino acid is non-standard. If `non_standard="convert"` but no standard equivalent is defined. """ query = query.upper() try: abr = AA_ALIASES[query] rec = AA_RECORDS[abr] except KeyError: raise ValueError(f"Unknown amino acid identifier: '{query}'. Expected a 1-letter code, 3-letter code, or full name.") if not rec.is_standard: if non_standard == "reject": raise ValueError( f"Encountered non-standard amino acid '{rec.abr}' ({rec.name}). " "To handle these, set `non_standard='allow'` to keep them " "or `non_standard='convert'` to map them to a standard equivalent." ) elif non_standard == "convert": if not rec.standard_equiv_abr: raise ValueError(f"Non-standard amino acid '{rec.abr}' ({rec.name}) does not have a standard equivalent and cannot be converted.") rec = AA_RECORDS[rec.standard_equiv_abr] return rec
[docs] def sanitize_aa_seq(seq: str, *, non_standard: str = "reject", trim_term: bool = True, uppercase=True, clean_whitespace: bool = True) -> str: """ Validates and sanitizes an amino acid sequence string. Parameters: seq: The input amino acid sequence. non_standard: How to handle non-standard amino acids. Must be one of: - "reject": Raise an error if any non-standard residue is found (default). - "convert": Replace non-standard residues with standard equivalents, if possible. - "allow": Keep non-standard residues unchanged. trim_term: If True, trims terminal stop codons ("*") from the end of the sequence. Default is True. uppercase: If True, converts the sequence to uppercase before processing. Default is True. clean_whitespace: If True, removes all whitespace characters from the sequence. Default is True. Returns: The sanitized amino acid sequence. Raises: ValueError: If an invalid residue is found and `non_standard` is set to "reject", or if a residue cannot be converted when `non_standard` is "convert". AssertionError: If `non_standard` is not one of "allow", "convert", or "reject". """ assert non_standard in ("allow", "convert", "reject"), f'Unknown value of "{non_standard}" supplied for non_standard parameter.' if uppercase: seq = seq.upper() if clean_whitespace: seq = re.sub(r"\s", "", seq) if trim_term: seq = seq.rstrip("*") new_seq = "" for i, x in enumerate(seq, start=1): if x not in STANDARD_AAs: if non_standard == "allow": pass elif non_standard == "convert": x = getAA(x, non_standard="convert").code else: raise ValueError(f'Invalid amino acid "{x}" specified at position {i}.') new_seq += x return new_seq
[docs] def molecular_weight(sequence: str, aa_mws: Dict[str, float] = AA_MASS_PROTEIN_AVG) -> float: """ Calculate the molecular weight of a protein or peptide sequence. This function computes the molecular weight by summing the residue masses for each amino acid in the input sequence. By default, it uses average amino acid residue masses (`AA_MASS_PROTEIN_AVG`), but you can provide a custom mass dictionary (e.g., monoisotopic or free amino acid masses). The calculation accounts for the loss of one water molecule (H₂O, 18.015 Da) for each peptide bond formed. For a sequence of length n, (n - 1) * 18.015 Da is subtracted from the total. Args: sequence: Amino acid sequence (one-letter codes). aa_mws: Dictionary mapping amino acid one-letter codes to molecular weights. Defaults to `AA_MASS_PROTEIN_AVG`. Returns: Estimated molecular weight of the protein or peptide in Daltons (Da). Raises: ValueError: If the sequence contains an invalid or unsupported amino acid code. Notes: - Use `AA_MASS_PROTEIN_MONO` for monoisotopic mass calculations, typically used in mass spectrometry. - Use `AA_MASS_PROTEIN_AVG` (default) for average residue masses, appropriate for bulk molecular weight estimation. - For free amino acids (not incorporated in peptides), use `AA_MASS_FREE`. - Weight dictionaries are defined in `constants.py`. """ # Remove whitespace and convert to uppercase sequence = sequence.strip().upper() # Sum molecular weights weight = 0.0 for aa in sequence: if aa not in aa_mws: raise ValueError(f"Invalid amino acid: {aa}") weight += aa_mws[aa] # Adjust for water loss during peptide bond formation # Each peptide bond loses one H2O (18.015 Da) if len(sequence) > 1: weight -= (len(sequence) - 1) * 18.015 return weight
def _fraction_protonated_basic(pH: float, pKa: float) -> float: """For BH+ <-> B + H+, returns fraction in the protonated (+1) form.""" return 1.0 / (1.0 + 10.0 ** (pH - pKa)) def _fraction_deprotonated_acidic(pH: float, pKa: float) -> float: """For HA <-> A- + H+, returns fraction in the deprotonated (-1) form.""" return 1.0 / (1.0 + 10.0 ** (pKa - pH))
[docs] def net_charge(sequence: str, pH: float, pKa: Dict[str, float] = DEFAULT_PKA) -> float: """ Calculate the net charge of a protein or peptide sequence at a given pH. This function applies the Henderson–Hasselbalch equation to estimate the protonation state of titratable groups (N-terminus, C-terminus, and ionizable side chains) and computes the overall net charge. Args: sequence: Amino acid sequence in one-letter codes. Supports the 20 canonical residues and optionally 'U' (selenocysteine). Non-ionizable residues are ignored. pH: The solution pH at which to evaluate the net charge. pKa: Dictionary of pKa values for titratable groups. Must include keys "N_TERMINUS", "C_TERMINUS", "D", "E", "C", "Y", "H", "K", and "R". If 'U' is present in the sequence, it should also include "U". Returns: Estimated net charge of the sequence at the given pH. Notes: Positive charges come from protonated groups: - N-terminus - Lysine (K) - Arginine (R) - Histidine (H) Negative charges come from deprotonated groups: - C-terminus - Aspartic acid (D) - Glutamic acid (E) - Cysteine (C) - Tyrosine (Y) - Selenocysteine (U), if included The calculation assumes independent ionization equilibria and does not account for local environment or structural effects. It is best interpreted as an approximate charge profile. """ seq = sequence.strip().upper() if not seq: return 0.0 counts = Counter(seq) # N-terminus (+1 when protonated) nterm = _fraction_protonated_basic(pH, pKa["N_TERMINUS"]) # C-terminus (-1 when deprotonated) cterm = _fraction_deprotonated_acidic(pH, pKa["C_TERMINUS"]) # Side chains pos = ( counts.get("K", 0) * _fraction_protonated_basic(pH, pKa["K"]) + counts.get("R", 0) * _fraction_protonated_basic(pH, pKa["R"]) + counts.get("H", 0) * _fraction_protonated_basic(pH, pKa["H"]) ) neg = ( counts.get("D", 0) * _fraction_deprotonated_acidic(pH, pKa["D"]) + counts.get("E", 0) * _fraction_deprotonated_acidic(pH, pKa["E"]) + counts.get("C", 0) * _fraction_deprotonated_acidic(pH, pKa["C"]) + counts.get("Y", 0) * _fraction_deprotonated_acidic(pH, pKa["Y"]) + counts.get("U", 0) * _fraction_deprotonated_acidic(pH, pKa["U"]) # optional ) return (nterm + pos) - (cterm + neg)
[docs] def isoelectric_point( sequence: str, pKa: Dict[str, float] = DEFAULT_PKA, *, pH_low: float = 0.0, pH_high: float = 14.0, tol: float = 1e-4, max_iter: int = 100 ) -> float: """ Estimate the isoelectric point (pI) of a protein or peptide. The pI is the pH at which the net charge of the molecule is zero. This function computes the net charge across pH and uses a bisection search to find the root. Args: sequence: Amino acid sequence (one-letter codes). Supports the 20 canonical residues and optionally 'U' (selenocysteine). Non-titratable residues contribute no charge. pKa: Dictionary of pKa values for titratable groups. Must include keys "N_TERMINUS", "C_TERMINUS", and for side chains "D", "E", "C", "Y", "H", "K", "R". If 'U' appears in the sequence, include "U" (default ~5.2, approximate). pH_low: Lower bound of the bracketing interval for the bisection search (default 0.0). pH_high: Upper bound of the bracketing interval for the bisection search (default 14.0). tol: Target absolute net charge tolerance at the solution (default 1e-4). max_iter: Maximum iterations for the bisection search (default 100). Returns: Estimated pI. Notes: - Results depend on the chosen pKa set. For consistency with common tools, you may substitute a different pKa dictionary (e.g., Bjellqvist or IPC sets). - Pyrrolysine ('O') is not included by default due to scarce consensus pKa data; it is treated as non-titratable here. You can add an entry if you have a value. - This model ignores sequence-context and microenvironment effects (local shifts in pKa due to neighbors or structure). It’s a good heuristic, not a guarantee. """ seq = sequence.strip().upper() if not seq: return 0.0 # Validate sequence characters valid = STANDARD_AAs | {"U", "O"} for aa in seq: if aa not in valid: raise ValueError(f"Invalid amino acid: {aa}") # Bisection search lo, hi = pH_low, pH_high q_lo = net_charge(seq, lo, pKa) q_hi = net_charge(seq, hi, pKa) # If the bracket doesn't change sign, still proceed but clamp toward the side # where |charge| is smaller to avoid errors on unusual sequences. if q_lo == 0: return lo if q_hi == 0: return hi if q_lo * q_hi > 0: # No sign change; do a guarded search by nudging bounds inward. # This keeps the function robust for edge cases (e.g., extremely acidic/basic sequences). for _ in range(20): mid = (lo + hi) / 2.0 q_mid = net_charge(seq, mid, pKa) if abs(q_mid) < abs(q_lo) and abs(q_mid) < abs(q_hi): # Use the best we have if we can't bracket best = mid lo += 0.1 hi -= 0.1 return best if "best" in locals() else (lo + hi) / 2.0 for _ in range(max_iter): mid = (lo + hi) / 2.0 q_mid = net_charge(seq, mid, pKa) if abs(q_mid) <= tol: return mid # Decide which subinterval keeps the sign change if q_lo * q_mid < 0: hi, q_hi = mid, q_mid else: lo, q_lo = mid, q_mid # Fallback if not converged within max_iter return (lo + hi) / 2.0