Source code for neurosnap.algos.pdockq

from typing import Tuple, Union

import numpy as np

from neurosnap.protein import Protein


def _chain_cb_or_gly_ca(prot: Protein, chain_id: str) -> Tuple[np.ndarray, np.ndarray]:
  """
  Extract per-residue coordinates and plDDT for a chain using CB atoms,
  or CA for GLY residues (matching the original pDockQ convention).

  Returns:
      coords: (N, 3) array
      plddt: (N,) array aligned with coords
  """
  df = prot.df
  sel = df[(df["chain"] == chain_id) & ((df["atom_name"] == "CB") | ((df["atom_name"] == "CA") & (df["res_name"] == "GLY")))].copy()

  # Ensure stable ordering along the sequence
  # (sort by residue number, then by atom name to keep deterministic order)
  sel.sort_values(["res_id", "atom_name"], inplace=True)

  if sel.empty:
    # Fall back to CA-only if nothing found (e.g., all-GLY chains without CB records)
    sel = df[(df["chain"] == chain_id) & (df["atom_name"] == "CA")].copy()
    sel.sort_values(["res_id", "atom_name"], inplace=True)

  coords = sel[["x", "y", "z"]].to_numpy(dtype=float)
  plddt = sel["bfactor"].to_numpy(dtype=float)  # AlphaFold stores pLDDT in B-factor
  return coords, plddt


def _calc_pdockq_from_arrays(chain_coords: dict, chain_plddt: dict, dist_thresh: float) -> Tuple[float, float]:
  """
  Reproduce the original pDockQ mapping given pre-extracted arrays.
  Returns (pDockQ, PPV).
  """
  # Unpack the two chains
  ch1, ch2 = list(chain_coords.keys())
  coords1, coords2 = chain_coords[ch1], chain_coords[ch2]
  plddt1, plddt2 = chain_plddt[ch1], chain_plddt[ch2]

  if coords1.size == 0 or coords2.size == 0:
    return 0.0, 0.0

  # Pairwise distances between residues (CB/CA(GLY) representatives)
  mat = np.vstack([coords1, coords2])
  diff = mat[:, None, :] - mat[None, :, :]
  dists = np.sqrt(np.sum(diff**2, axis=-1))

  l1 = len(coords1)
  contact_dists = dists[:l1, l1:]  # distances between chain1 (rows) and chain2 (cols)
  contacts = np.argwhere(contact_dists <= dist_thresh)

  if contacts.shape[0] < 1:
    return 0.0, 0.0

  # Average interface pLDDT (unique residues on either side that form a contact)
  if_idx1 = np.unique(contacts[:, 0])
  if_idx2 = np.unique(contacts[:, 1])
  avg_if_plddt = np.average(np.concatenate([plddt1[if_idx1], plddt2[if_idx2]]))

  # Number of interface contacts
  n_if_contacts = contacts.shape[0]

  # Logistic mapping (original constants)
  x = avg_if_plddt * np.log10(max(n_if_contacts, 1))
  pdockq = 0.724 / (1 + np.exp(-0.052 * (x - 152.611))) + 0.018

  # PPV calibration (original look-up)
  PPV = np.array(
    [
      0.98128027,
      0.96322524,
      0.95333044,
      0.9400192,
      0.93172991,
      0.92420274,
      0.91629946,
      0.90952562,
      0.90043139,
      0.8919553,
      0.88570037,
      0.87822061,
      0.87116417,
      0.86040801,
      0.85453785,
      0.84294946,
      0.83367787,
      0.82238224,
      0.81190228,
      0.80223507,
      0.78549007,
      0.77766077,
      0.75941223,
      0.74006263,
      0.73044282,
      0.71391784,
      0.70615739,
      0.68635536,
      0.66728511,
      0.63555449,
      0.55890174,
    ]
  )
  pdockq_thresholds = np.array(
    [
      0.67333079,
      0.65666073,
      0.63254566,
      0.62604391,
      0.60150931,
      0.58313803,
      0.5647381,
      0.54122438,
      0.52314392,
      0.49659878,
      0.4774676,
      0.44661346,
      0.42628389,
      0.39990988,
      0.38479715,
      0.3649393,
      0.34526004,
      0.3262589,
      0.31475668,
      0.29750023,
      0.26673725,
      0.24561247,
      0.21882689,
      0.19651314,
      0.17606258,
      0.15398168,
      0.13927677,
      0.12024131,
      0.09996019,
      0.06968505,
      0.02946438,
    ]
  )
  inds = np.argwhere(pdockq_thresholds >= pdockq)
  if len(inds) > 0:
    ppv = float(PPV[inds[-1]][0])
  else:
    ppv = float(PPV[0])

  return float(pdockq), float(ppv)


[docs] def calculate_pDockQ(structure: Union[str, Protein], chain1: str = None, chain2: str = None, dist_thresh: float = 8.0) -> Tuple[float, float]: """ Calculate the predicted DockQ (pDockQ) score and corresponding PPV for a two-chain complex. Args: structure: Protein object or a path/ID string that can be parsed by neurosnap.protein.Protein chain1: Chain ID for the first chain (e.g., "A"). If None, auto-detected if only 2 chains exist. chain2: Chain ID for the second chain (e.g., "B"). If None, auto-detected if only 2 chains exist. dist_thresh: Distance threshold (Å) for interface contacts, default 8.0 Returns: (pdockq, ppv): - pdockq (float): Predicted DockQ score between 0 and 1. Higher values suggest greater confidence that the predicted interface is correct. - ppv (float): Estimated positive predictive value (precision) corresponding to the pDockQ score. Represents the minimum expected probability that the predicted interface is correct (bounded roughly 0.55-0.98 with higher being better). """ # Normalize/obtain a Protein object prot = structure if isinstance(structure, Protein) else Protein(structure, format="auto") chains = prot.chains(prot.models()[0]) # Auto-detect if chains are not specified if chain1 is None or chain2 is None: if len(chains) == 2: chain1, chain2 = chains else: raise ValueError( f"Ambiguous chain selection: structure has {len(chains)} chains " f"({', '.join(chains)}). Please specify chain_id1 and chain_id2 explicitly." ) # Basic chain validation if chain1 not in chains: raise ValueError(f'Chain "{chain1}" not found. Available chains: {chains}') if chain2 not in chains: raise ValueError(f'Chain "{chain2}" not found. Available chains: {chains}') if chain1 == chain2: raise ValueError("chain_id1 and chain_id2 must be different.") # Extract chain representatives and plDDT coords1, plddt1 = _chain_cb_or_gly_ca(prot, chain1) coords2, plddt2 = _chain_cb_or_gly_ca(prot, chain2) chain_coords = {chain1: coords1, chain2: coords2} chain_plddt = {chain1: plddt1, chain2: plddt2} # Compute and return return _calc_pdockq_from_arrays(chain_coords, chain_plddt, dist_thresh)