Source code for neurosnap.sequence.protein

import re
from collections import Counter
from typing import Dict

from neurosnap.constants.sequence import (
  AA_ALIASES,
  AA_MASS_PROTEIN_AVG,
  AA_RECORDS,
  DEFAULT_PKA,
  AARecord,
  STANDARD_AAs,
)



[docs]
def getAA(query: str, *, non_standard: str = "reject") -> AARecord:
  """Resolve an amino acid identifier to a canonical record.

  This function accepts either a **1-letter code**, **3-letter abbreviation**,
  or **full name** (case-insensitive) and returns the corresponding `AARecord`.

  Parameters
  ----------
  query : str
      Amino acid identifier (1-letter code, 3-letter CCD abbreviation,
      or full name).
  non_standard : {"reject", "convert", "allow"}, optional
      Policy for handling non-standard amino acids (default: "reject"):

      - "reject": Raise an error if the amino acid is non-standard.
      - "convert": Map non-standard amino acids to their closest
        standard equivalent (e.g., MSE → MET).
      - "allow": Return the non-standard amino acid unchanged.

  Returns
  -------
  AARecord
      A record containing:
      - `code`: 1-letter code (may be "?" if unavailable for non-standard AAs).
      - `abr`: 3-letter abbreviation.
      - `name`: Full amino acid name.
      - `is_standard`: Whether the residue is one of the 20 canonical amino acids.
      - `standard_equiv_abr`: 3-letter abbreviation of the standard equivalent
        (if applicable).

  Raises
  ------
  ValueError
      If `query` does not match any supported amino acid identifier.
      If `non_standard="reject"` and the amino acid is non-standard.
      If `non_standard="convert"` but no standard equivalent is defined.
  """
  query = query.upper()
  try:
    abr = AA_ALIASES[query]
    rec = AA_RECORDS[abr]
  except KeyError:
    raise ValueError(f"Unknown amino acid identifier: '{query}'. Expected a 1-letter code, 3-letter code, or full name.")

  if not rec.is_standard:
    if non_standard == "reject":
      raise ValueError(
        f"Encountered non-standard amino acid '{rec.abr}' ({rec.name}). "
        "To handle these, set `non_standard='allow'` to keep them "
        "or `non_standard='convert'` to map them to a standard equivalent."
      )
    elif non_standard == "convert":
      if not rec.standard_equiv_abr:
        raise ValueError(f"Non-standard amino acid '{rec.abr}' ({rec.name}) does not have a standard equivalent and cannot be converted.")
      rec = AA_RECORDS[rec.standard_equiv_abr]
  return rec




[docs]
def sanitize_aa_seq(seq: str, *, non_standard: str = "reject", trim_term: bool = True, uppercase=True, clean_whitespace: bool = True) -> str:
  """
  Validates and sanitizes an amino acid sequence string.

  Parameters:
      seq: The input amino acid sequence.
      non_standard: How to handle non-standard amino acids.
          Must be one of:
          - "reject": Raise an error if any non-standard residue is found (default).
          - "convert": Replace non-standard residues with standard equivalents, if possible.
          - "allow": Keep non-standard residues unchanged.
      trim_term: If True, trims terminal stop codons ("*") from the end of the sequence. Default is True.
      uppercase: If True, converts the sequence to uppercase before processing. Default is True.
      clean_whitespace: If True, removes all whitespace characters from the sequence. Default is True.

  Returns:
      The sanitized amino acid sequence.

  Raises:
      ValueError: If an invalid residue is found and `non_standard` is set to "reject",
                  or if a residue cannot be converted when `non_standard` is "convert".
      AssertionError: If `non_standard` is not one of "allow", "convert", or "reject".
  """
  assert non_standard in ("allow", "convert", "reject"), f'Unknown value of "{non_standard}" supplied for non_standard parameter.'

  if uppercase:
    seq = seq.upper()

  if clean_whitespace:
    seq = re.sub(r"\s", "", seq)

  if trim_term:
    seq = seq.rstrip("*")

  new_seq = ""
  for i, x in enumerate(seq, start=1):
    if x not in STANDARD_AAs:
      if non_standard == "allow":
        pass
      elif non_standard == "convert":
        x = getAA(x, non_standard="convert").code
      else:
        raise ValueError(f'Invalid amino acid "{x}" specified at position {i}.')
    new_seq += x
  return new_seq




[docs]
def molecular_weight(sequence: str, aa_mws: Dict[str, float] = AA_MASS_PROTEIN_AVG) -> float:
  """
  Calculate the molecular weight of a protein or peptide sequence.

  This function computes the molecular weight by summing the residue
  masses for each amino acid in the input sequence. By default, it uses
  average amino acid residue masses (`AA_MASS_PROTEIN_AVG`), but you
  can provide a custom mass dictionary (e.g., monoisotopic or free amino
  acid masses).

  The calculation accounts for the loss of one water molecule (H₂O,
  18.015 Da) for each peptide bond formed. For a sequence of length n,
  (n - 1) * 18.015 Da is subtracted from the total.

  Args:
      sequence: Amino acid sequence (one-letter codes).
      aa_mws: Dictionary mapping amino acid one-letter codes to molecular
          weights. Defaults to `AA_MASS_PROTEIN_AVG`.

  Returns:
      Estimated molecular weight of the protein or peptide in Daltons (Da).

  Raises:
      ValueError: If the sequence contains an invalid or unsupported
      amino acid code.

  Notes:
      - Use `AA_MASS_PROTEIN_MONO` for monoisotopic mass calculations,
        typically used in mass spectrometry.
      - Use `AA_MASS_PROTEIN_AVG` (default) for average residue masses,
        appropriate for bulk molecular weight estimation.
      - For free amino acids (not incorporated in peptides), use
        `AA_MASS_FREE`.
      - Weight dictionaries are defined in `constants.py`.
  """
  # Remove whitespace and convert to uppercase
  sequence = sequence.strip().upper()

  # Sum molecular weights
  weight = 0.0
  for aa in sequence:
    if aa not in aa_mws:
      raise ValueError(f"Invalid amino acid: {aa}")
    weight += aa_mws[aa]

  # Adjust for water loss during peptide bond formation
  # Each peptide bond loses one H2O (18.015 Da)
  if len(sequence) > 1:
    weight -= (len(sequence) - 1) * 18.015

  return weight



def _fraction_protonated_basic(pH: float, pKa: float) -> float:
  """For BH+ <-> B + H+, returns fraction in the protonated (+1) form."""
  return 1.0 / (1.0 + 10.0 ** (pH - pKa))


def _fraction_deprotonated_acidic(pH: float, pKa: float) -> float:
  """For HA <-> A- + H+, returns fraction in the deprotonated (-1) form."""
  return 1.0 / (1.0 + 10.0 ** (pKa - pH))



[docs]
def net_charge(sequence: str, pH: float, pKa: Dict[str, float] = DEFAULT_PKA) -> float:
  """
  Calculate the net charge of a protein or peptide sequence at a given pH.

  This function applies the Henderson–Hasselbalch equation to estimate
  the protonation state of titratable groups (N-terminus, C-terminus,
  and ionizable side chains) and computes the overall net charge.

  Args:
      sequence: Amino acid sequence in one-letter codes. Supports the 20
          canonical residues and optionally 'U' (selenocysteine).
          Non-ionizable residues are ignored.
      pH: The solution pH at which to evaluate the net charge.
      pKa: Dictionary of pKa values for titratable groups. Must include
          keys "N_TERMINUS", "C_TERMINUS", "D", "E", "C", "Y",
          "H", "K", and "R". If 'U' is present in the sequence, it
          should also include "U".

  Returns:
      Estimated net charge of the sequence at the given pH.

  Notes:
      Positive charges come from protonated groups:
      - N-terminus
      - Lysine (K)
      - Arginine (R)
      - Histidine (H)

      Negative charges come from deprotonated groups:
      - C-terminus
      - Aspartic acid (D)
      - Glutamic acid (E)
      - Cysteine (C)
      - Tyrosine (Y)
      - Selenocysteine (U), if included

      The calculation assumes independent ionization equilibria and does
      not account for local environment or structural effects. It is best
      interpreted as an approximate charge profile.
  """
  seq = sequence.strip().upper()
  if not seq:
    return 0.0

  counts = Counter(seq)

  # N-terminus (+1 when protonated)
  nterm = _fraction_protonated_basic(pH, pKa["N_TERMINUS"])

  # C-terminus (-1 when deprotonated)
  cterm = _fraction_deprotonated_acidic(pH, pKa["C_TERMINUS"])

  # Side chains
  pos = (
    counts.get("K", 0) * _fraction_protonated_basic(pH, pKa["K"])
    + counts.get("R", 0) * _fraction_protonated_basic(pH, pKa["R"])
    + counts.get("H", 0) * _fraction_protonated_basic(pH, pKa["H"])
  )

  neg = (
    counts.get("D", 0) * _fraction_deprotonated_acidic(pH, pKa["D"])
    + counts.get("E", 0) * _fraction_deprotonated_acidic(pH, pKa["E"])
    + counts.get("C", 0) * _fraction_deprotonated_acidic(pH, pKa["C"])
    + counts.get("Y", 0) * _fraction_deprotonated_acidic(pH, pKa["Y"])
    + counts.get("U", 0) * _fraction_deprotonated_acidic(pH, pKa["U"])  # optional
  )
  return (nterm + pos) - (cterm + neg)




[docs]
def isoelectric_point(
  sequence: str, pKa: Dict[str, float] = DEFAULT_PKA, *, pH_low: float = 0.0, pH_high: float = 14.0, tol: float = 1e-4, max_iter: int = 100
) -> float:
  """
  Estimate the isoelectric point (pI) of a protein or peptide.

  The pI is the pH at which the net charge of the molecule is zero.
  This function computes the net charge across pH and uses a bisection
  search to find the root.

  Args:
      sequence: Amino acid sequence (one-letter codes). Supports the 20
          canonical residues and optionally 'U' (selenocysteine).
          Non-titratable residues contribute no charge.
      pKa: Dictionary of pKa values for titratable groups. Must include
          keys "N_TERMINUS", "C_TERMINUS", and for side chains
          "D", "E", "C", "Y", "H", "K", "R". If 'U' appears in the
          sequence, include "U" (default ~5.2, approximate).
      pH_low: Lower bound of the bracketing interval for the bisection
          search (default 0.0).
      pH_high: Upper bound of the bracketing interval for the bisection
          search (default 14.0).
      tol: Target absolute net charge tolerance at the solution
          (default 1e-4).
      max_iter: Maximum iterations for the bisection search (default 100).

  Returns:
      Estimated pI.

  Notes:
      - Results depend on the chosen pKa set. For consistency with common
        tools, you may substitute a different pKa dictionary
        (e.g., Bjellqvist or IPC sets).
      - Pyrrolysine ('O') is not included by default due to scarce
        consensus pKa data; it is treated as non-titratable here.
        You can add an entry if you have a value.
      - This model ignores sequence-context and microenvironment effects
        (local shifts in pKa due to neighbors or structure). It’s a good
        heuristic, not a guarantee.
  """
  seq = sequence.strip().upper()
  if not seq:
    return 0.0

  # Validate sequence characters
  valid = STANDARD_AAs | {"U", "O"}
  for aa in seq:
    if aa not in valid:
      raise ValueError(f"Invalid amino acid: {aa}")

  # Bisection search
  lo, hi = pH_low, pH_high
  q_lo = net_charge(seq, lo, pKa)
  q_hi = net_charge(seq, hi, pKa)

  # If the bracket doesn't change sign, still proceed but clamp toward the side
  # where |charge| is smaller to avoid errors on unusual sequences.
  if q_lo == 0:
    return lo
  if q_hi == 0:
    return hi
  if q_lo * q_hi > 0:
    # No sign change; do a guarded search by nudging bounds inward.
    # This keeps the function robust for edge cases (e.g., extremely acidic/basic sequences).
    for _ in range(20):
      mid = (lo + hi) / 2.0
      q_mid = net_charge(seq, mid, pKa)
      if abs(q_mid) < abs(q_lo) and abs(q_mid) < abs(q_hi):
        # Use the best we have if we can't bracket
        best = mid
      lo += 0.1
      hi -= 0.1
    return best if "best" in locals() else (lo + hi) / 2.0

  for _ in range(max_iter):
    mid = (lo + hi) / 2.0
    q_mid = net_charge(seq, mid, pKa)

    if abs(q_mid) <= tol:
      return mid
    # Decide which subinterval keeps the sign change
    if q_lo * q_mid < 0:
      hi, q_hi = mid, q_mid
    else:
      lo, q_lo = mid, q_mid

  # Fallback if not converged within max_iter
  return (lo + hi) / 2.0