"""
This file contains constants.
"""
from dataclasses import dataclass
from typing import Dict, Optional
## Backbone Atoms
# Atom names that belong to a protein's backbone
BACKBONE_ATOMS_AA = {"C", "CA", "N"}
# Atom names that belong to a DNA backbone
BACKBONE_ATOMS_DNA = {
    # phosphorus and the phosphate oxygens; the oxygens appear under both
    # naming conventions (O1P/O2P and OP1/OP2)
    "P", "O1P", "O2P", "OP1", "OP2",
    # oxygens bridging the sugar and the phosphate
    "O5'", "O3'",
    # sugar atoms
    "C5'", "C4'", "O4'", "C3'", "C2'", "C1'",
}
# Atom names that belong to an RNA backbone: the DNA set plus the oxygen of
# the 2'-OH group
BACKBONE_ATOMS_RNA = BACKBONE_ATOMS_DNA | {"O2'"}
## van der Waals radii (Å) for common elements
# Bondi van der Waals radii, keyed by element symbol.
# Laid out one periodic-table period per row; elements without an entry are
# simply absent from the table.
VDW_RADII_BONDI: Dict[str, float] = {
    # period 1
    "H": 1.2, "He": 1.4,
    # period 2
    "Li": 1.81, "C": 1.7, "N": 1.55, "O": 1.52, "F": 1.47, "Ne": 1.54,
    # period 3
    "Na": 2.27, "Mg": 1.73, "Si": 2.1, "P": 1.8, "S": 1.8, "Cl": 1.75, "Ar": 1.88,
    # period 4
    "K": 2.75, "Ga": 1.87, "As": 1.85, "Se": 1.9, "Br": 1.83, "Kr": 2.02,
    # period 5
    "In": 1.93, "Sn": 2.17, "Te": 2.06, "I": 1.98, "Xe": 2.16,
    # period 6
    "Tl": 1.96, "Pb": 2.02,
}
# UFF values, keyed by element symbol (H through Lr), one period per row.
# NOTE(review): the magnitudes (e.g. C = 3.851 here vs 1.7 in VDW_RADII_BONDI)
# suggest these are UFF nonbonded distances rather than radii - confirm before
# using them interchangeably with the Bondi radii.
# NOTE(review): W is 3.096 here; UFF tabulations appear to list 3.069 - confirm
# against the original parameter table before changing it.
VDW_RADII_UFF: Dict[str, float] = {
    # period 1
    "H": 2.886, "He": 2.362,
    # period 2
    "Li": 2.451, "Be": 2.745, "B": 4.083, "C": 3.851,
    "N": 3.66, "O": 3.5, "F": 3.364, "Ne": 3.243,
    # period 3
    "Na": 2.983, "Mg": 3.021, "Al": 4.499, "Si": 4.295,
    "P": 4.147, "S": 4.035, "Cl": 3.947, "Ar": 3.868,
    # period 4
    "K": 3.812, "Ca": 3.399, "Sc": 3.295, "Ti": 3.175, "V": 3.144, "Cr": 3.023,
    "Mn": 2.961, "Fe": 2.912, "Co": 2.872, "Ni": 2.834, "Cu": 3.495, "Zn": 2.763,
    "Ga": 4.383, "Ge": 4.28, "As": 4.23, "Se": 4.205, "Br": 4.189, "Kr": 4.141,
    # period 5
    "Rb": 4.114, "Sr": 3.641, "Y": 3.345, "Zr": 3.124, "Nb": 3.165, "Mo": 3.052,
    "Tc": 2.998, "Ru": 2.963, "Rh": 2.929, "Pd": 2.899, "Ag": 3.148, "Cd": 2.848,
    "In": 4.463, "Sn": 4.392, "Sb": 4.42, "Te": 4.47, "I": 4.5, "Xe": 4.404,
    # period 6 (lanthanides included in sequence)
    "Cs": 4.517, "Ba": 3.703, "La": 3.522, "Ce": 3.556, "Pr": 3.606, "Nd": 3.575,
    "Pm": 3.547, "Sm": 3.52, "Eu": 3.493, "Gd": 3.368, "Tb": 3.451, "Dy": 3.428,
    "Ho": 3.409, "Er": 3.391, "Tm": 3.374, "Yb": 3.355, "Lu": 3.64, "Hf": 3.141,
    "Ta": 3.17, "W": 3.096, "Re": 2.954, "Os": 3.12, "Ir": 2.84, "Pt": 2.754,
    "Au": 3.293, "Hg": 2.705, "Tl": 4.347, "Pb": 4.297, "Bi": 4.37, "Po": 4.709,
    "At": 4.75, "Rn": 4.765,
    # period 7 (actinides included in sequence)
    "Fr": 4.9, "Ra": 3.677, "Ac": 3.478, "Th": 3.396, "Pa": 3.424, "U": 3.395,
    "Np": 3.424, "Pu": 3.424, "Am": 3.381, "Cm": 3.326, "Bk": 3.339, "Cf": 3.313,
    "Es": 3.299, "Fm": 3.286, "Md": 3.274, "No": 3.248, "Lr": 3.236,
}
## Nucleotide Codes
# Two-letter PDB residue codes for the standard DNA residues
NUC_DNA_CODES = {"DA", "DC", "DG", "DT"}
# Single-letter PDB residue codes for the standard RNA residues
NUC_RNA_CODES = {"A", "C", "G", "U"}
# All standard nucleotide codes: the DNA and RNA sets combined into a new set
STANDARD_NUCLEOTIDES = NUC_DNA_CODES | NUC_RNA_CODES
## Amino Acid Codes and Properties
# One-letter codes of the standard amino acids
STANDARD_AAs = {
    "A", "C", "D", "E", "F", "G", "H", "I", "K", "L",
    "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y",
}
# Set of hydrophobic residues (three-letter codes, unlike STANDARD_AAs)
HYDROPHOBIC_RESIDUES = {
    "ALA", "VAL", "LEU", "ILE",
    "MET", "PHE", "TRP", "PRO",
}
# Amino acid record class
@dataclass(frozen=True)
class AARecord:
    """Immutable record describing one amino acid (or amino-acid-like code).

    Instances are frozen: assigning to a field after construction raises an
    error. Fields are positional, in the order declared below.
    """

    code: Optional[str]  # 1-letter code; None if unavailable
    abr: str  # 3-letter abbreviation or CCD code
    name: str  # full name (upper-cased)
    is_standard: bool  # True for the 20 canonical AAs
    standard_equiv_abr: Optional[str]  # e.g., "LYS" for KCX; None if standard or unknown
## Amino acid records, keyed by each record's own ``abr``
# The key is taken from the record itself so the abbreviation is written once
# per entry; iteration order is the order of the rows below.
# NOTE(review): several names in the CCD section do not obviously match the
# CCD entry for the same code (e.g. LLP, CSO, CSD, CME, MLY) - verify against
# the Chemical Component Dictionary before relying on them. They are kept
# verbatim here because they are also used as lookup aliases.
AA_RECORDS: Dict[str, AARecord] = {
    record.abr: record
    for record in (
        # standard amino acids
        AARecord("A", "ALA", "ALANINE", True, None),
        AARecord("R", "ARG", "ARGININE", True, None),
        AARecord("N", "ASN", "ASPARAGINE", True, None),
        AARecord("D", "ASP", "ASPARTIC ACID", True, None),
        AARecord("C", "CYS", "CYSTEINE", True, None),
        AARecord("Q", "GLN", "GLUTAMINE", True, None),
        AARecord("E", "GLU", "GLUTAMIC ACID", True, None),
        AARecord("G", "GLY", "GLYCINE", True, None),
        AARecord("H", "HIS", "HISTIDINE", True, None),
        AARecord("I", "ILE", "ISOLEUCINE", True, None),
        AARecord("L", "LEU", "LEUCINE", True, None),
        AARecord("K", "LYS", "LYSINE", True, None),
        AARecord("M", "MET", "METHIONINE", True, None),
        AARecord("F", "PHE", "PHENYLALANINE", True, None),
        AARecord("P", "PRO", "PROLINE", True, None),
        AARecord("S", "SER", "SERINE", True, None),
        AARecord("T", "THR", "THREONINE", True, None),
        AARecord("W", "TRP", "TRYPTOPHAN", True, None),
        AARecord("Y", "TYR", "TYROSINE", True, None),
        AARecord("V", "VAL", "VALINE", True, None),
        # non-standard / special codes used at the sequence level
        AARecord("O", "PYL", "PYRROLYSINE", False, "LYS"),
        AARecord("U", "SEC", "SELENOCYSTEINE", False, "CYS"),
        AARecord("B", "ASX", "ASPARAGINE/ASPARTIC ACID", False, "ASP"),
        AARecord("Z", "GLX", "GLUTAMINE/GLUTAMIC ACID", False, "GLU"),
        AARecord("J", "XLE", "LEUCINE/ISOLEUCINE", False, "LEU"),
        AARecord("X", "UNK", "UNKNOWN", False, None),
        AARecord("*", "TRM", "TERMINATION", False, None),
        # non-standard / modified residues (CCD codes; no 1-letter code)
        AARecord(None, "LLP", "Nε-LIPOYL-LYSINE", False, "LYS"),
        AARecord(None, "TPO", "O-PHOSPHOTHREONINE", False, "THR"),
        AARecord(None, "CSS", "SULFONATED CYSTEINE", False, "CYS"),
        AARecord(None, "OCS", "CYSTEINE-S-SULFONIC ACID", False, "CYS"),
        AARecord(None, "CSO", "S-HYDROXYCYSTEINE (CYSTEINE SULFINIC ACID)", False, "CYS"),
        AARecord(None, "PCA", "PYROGLUTAMIC ACID", False, "GLU"),
        AARecord(None, "KCX", "CARBOXYLYSINE", False, "LYS"),
        AARecord(None, "CME", "S-METHYLCYSTEINE", False, "CYS"),
        AARecord(None, "MLY", "Nε-METHYLLYSINE", False, "LYS"),
        AARecord(None, "SEP", "O-PHOSPHOSERINE", False, "SER"),
        AARecord(None, "CSX", "CYSTEINE OXIDATION PRODUCT (UNSPECIFIED)", False, "CYS"),
        AARecord(None, "CSD", "CYSTEINE DISULFIDE", False, "CYS"),
        AARecord(None, "MSE", "SELENOMETHIONINE", False, "MET"),
        AARecord(None, "MHO", "METHIONINE SULFOXIDE", False, "MET"),
    )
}
# Alias map: every searchable token -> ABR
# (1-letter codes, 3-letter codes, and names)
# For each record the aliases are inserted in the order code, abbreviation,
# name; a record whose 1-letter code is None contributes no code alias.
# Built with a comprehension so that no loop variables are left behind in the
# module namespace (and re-exported by star imports).
AA_ALIASES: Dict[str, str] = {
    alias: abr
    for abr, rec in AA_RECORDS.items()
    for alias in (rec.code, abr, rec.name)
    if alias is not None
}
## Amino acid molecular weights
# Average residue masses in Daltons, keyed by 1-letter code, for amino acids
# *as incorporated into peptides/proteins*: the loss of one H2O molecule during
# peptide bond formation is already accounted for, so each value is the
# contribution of one *residue* in a chain.
# Source: https://proteomicsresource.washington.edu/protocols06/masses.php (Average masses)
AA_WEIGHTS_PROTEIN_AVG: Dict[str, float] = {
    "A": 71.0779,  # Ala
    "R": 156.18568,  # Arg
    "N": 114.10264,  # Asn
    "D": 115.0874,  # Asp
    "C": 103.1429,  # Cys
    "E": 129.11398,  # Glu
    "Q": 128.12922,  # Gln
    "G": 57.05132,  # Gly
    "H": 137.13928,  # His
    "I": 113.15764,  # Ile
    "L": 113.15764,  # Leu
    "K": 128.17228,  # Lys
    "M": 131.19606,  # Met
    "F": 147.17386,  # Phe
    "P": 97.11518,  # Pro
    "S": 87.0773,  # Ser
    "T": 101.10388,  # Thr
    "W": 186.2099,  # Trp
    "Y": 163.17326,  # Tyr
    "V": 99.13106,  # Val
    "O": 237.29816,  # Pyl (pyrrolysine)
    "U": 150.0379,  # Sec (selenocysteine)
}
# Monoisotopic residue masses in Daltons, keyed by 1-letter code, for amino
# acids *as incorporated into peptides/proteins*. Each value uses the exact
# mass of the most abundant isotope of every element (e.g., 12C, 1H, 16O, 14N)
# and, like the average residue masses, already has H2O removed.
# Source: https://proteomicsresource.washington.edu/protocols06/masses.php (Monoisotopic masses)
AA_WEIGHTS_PROTEIN_MONO: Dict[str, float] = {
    "A": 71.037113805,  # Ala
    "R": 156.10111105,  # Arg
    "N": 114.04292747,  # Asn
    "D": 115.026943065,  # Asp
    "C": 103.009184505,  # Cys
    "E": 129.042593135,  # Glu
    "Q": 128.05857754,  # Gln
    "G": 57.021463735,  # Gly
    "H": 137.058911875,  # His
    "I": 113.084064015,  # Ile
    "L": 113.084064015,  # Leu
    "K": 128.09496305,  # Lys
    "M": 131.040484645,  # Met
    "F": 147.068413945,  # Phe
    "P": 97.052763875,  # Pro
    "S": 87.032028435,  # Ser
    "T": 101.047678505,  # Thr
    "W": 186.07931298,  # Trp
    "Y": 163.063328575,  # Tyr
    "V": 99.068413945,  # Val
    "O": 237.147726925,  # Pyl (pyrrolysine)
    "U": 150.953633405,  # Sec (selenocysteine)
}
# Average molecular weights in Daltons, keyed by 1-letter code, of *free amino
# acids* (not incorporated into a chain): the full molecule including the
# terminal H and OH groups, i.e. before peptide bond formation.
# Suited to small-molecule calculations or teaching; not meant for intact
# peptides/proteins.
AA_WEIGHTS_FREE: Dict[str, float] = {
    "A": 89.09,  # Ala
    "R": 174.2,  # Arg
    "N": 132.12,  # Asn
    "D": 133.1,  # Asp
    "C": 121.15,  # Cys
    "E": 147.13,  # Glu
    "Q": 146.15,  # Gln
    "G": 75.07,  # Gly
    "H": 155.16,  # His
    "I": 131.17,  # Ile
    "L": 131.17,  # Leu
    "K": 146.19,  # Lys
    "M": 149.21,  # Met
    "F": 165.19,  # Phe
    "P": 115.13,  # Pro
    "S": 105.09,  # Ser
    "T": 119.12,  # Thr
    "W": 204.23,  # Trp
    "Y": 181.19,  # Tyr
    "V": 117.15,  # Val
    "O": 255.31,  # Pyl (pyrrolysine, free)
    "U": 168.06,  # Sec (selenocysteine, free)
}
## pKa Values
# Default pKa set (EMBOSS-like); the values are typical textbook approximations.
# Another set (e.g., Bjellqvist, IPC) can be substituted if desired.
# Keys are the two termini plus 1-letter codes of the ionizable residues.
DEFAULT_PKA: Dict[str, float] = {
    # chain termini
    "N_TERMINUS": 8.6,
    "C_TERMINUS": 3.6,
    # acidic side chains (deprotonate to -1)
    "C": 8.5,  # cysteine
    "D": 3.9,  # aspartic acid
    "E": 4.1,  # glutamic acid
    "Y": 10.1,  # tyrosine
    # basic side chains (protonate to +1)
    "H": 6.5,  # histidine
    "K": 10.8,  # lysine
    "R": 12.5,  # arginine
    # optional uncommon residue
    "U": 5.2,  # selenocysteine (approx.; behaves like acidic thiol/selenol)
}