# Source code for neurosnap.database.uniprot
"""UniProt and UniParc sequence retrieval helpers."""
import io
from typing import Dict, Iterable, Union
import pandas as pd
import requests
from tqdm import tqdm
from neurosnap.log import logger
from neurosnap.sequence.align import read_msa
# [docs]
def fetch_accessions(accessions: Iterable[str], batch_size: int = 150) -> Dict[str, Union[str, None]]:
  """Fetch sequences corresponding to a list of UniProt accession numbers.

  This function queries UniParc first and then UniProtKB for any missing
  accessions. Accessions are deduplicated and processed in batches to
  handle large lists efficiently.

  Args:
    accessions: A list of UniProt accession numbers. Duplicate accessions
      are removed automatically.
    batch_size: Number of accessions to query per request.

  Returns:
    Dictionary mapping accession numbers to protein sequences. Missing
    accessions are assigned ``None``.

  Raises:
    requests.exceptions.HTTPError: If an API request fails.
  """
  accessions = list({str(x).strip() for x in accessions})
  batches = [accessions[i : i + batch_size] for i in range(0, len(accessions), batch_size)]
  output: Dict[str, Union[str, None]] = {}

  # Pass 1: UniParc. Isoform accessions (contain "-") must use the isoform field.
  for batch in tqdm(batches, desc="Fetching sequences from uniprot.org", total=len(batches)):
    batch_set = set(batch)  # O(1) membership tests inside the row loop
    query = " OR ".join(f"isoform:{accession}" if "-" in accession else f"accession:{accession}" for accession in batch)
    response = requests.get(f"https://rest.uniprot.org/uniparc/search?fields=accession,sequence&format=tsv&query=({query})&size=500")
    if response.status_code != 200:
      logger.error(f"[{response.status_code}] {response.text}")
      response.raise_for_status()
    dataframe = pd.read_csv(io.StringIO(response.text), sep="\t")
    for _, row in dataframe.iterrows():
      # A UniParc entry may lack UniProtKB cross-references entirely, in
      # which case pandas yields NaN (a float) for the cell — skip those
      # rows instead of crashing on .split().
      if not isinstance(row.UniProtKB, str):
        continue
      # One UniParc sequence can cover several requested accessions (e.g.
      # isoforms of the same entry), so record every match rather than
      # stopping at the first one.
      for accession in row.UniProtKB.split("; "):
        if accession in batch_set and accession not in output:
          output[accession] = row.Sequence

  # Pass 2: UniProtKB for anything UniParc did not resolve.
  accessions_missing = [accession for accession in accessions if accession not in output]
  batches = [accessions_missing[i : i + batch_size] for i in range(0, len(accessions_missing), batch_size)]
  for batch in tqdm(batches, desc="Fetching sequences from uniprot.org", total=len(batches)):
    batch_set = set(batch)
    query = " OR ".join(f"accession:{accession}" for accession in batch)
    response = requests.get(f"https://rest.uniprot.org/uniprotkb/search?fields=accession,sequence&format=tsv&query=({query})&size=500")
    if response.status_code != 200:
      logger.error(f"[{response.status_code}] {response.text}")
      response.raise_for_status()
    dataframe = pd.read_csv(io.StringIO(response.text), sep="\t")
    for _, row in dataframe.iterrows():
      if row.Entry in batch_set and row.Entry not in output:
        output[row.Entry] = row.Sequence

  # Anything still unresolved is explicitly mapped to None so callers can
  # distinguish "looked up and missing" from "never requested".
  for accession in accessions:
    if accession not in output:
      output[accession] = None
      logger.warning(f"Could not find a sequence for accession: {accession}")
  return output
# [docs]
def fetch_uniprot(uniprot_id: str, head: bool = False) -> Union[str, bool]:
  """Fetch a UniProtKB or UniParc FASTA entry by identifier.

  Tries UniProtKB first and falls back to UniParc when the accession is
  not found there.

  Args:
    uniprot_id: UniProtKB or UniParc accession ID.
    head: If ``True``, perform a HEAD request and return whether the entry
      exists.

  Returns:
    ``True`` when ``head`` is enabled and the accession exists, otherwise the
    fetched protein sequence.

  Raises:
    Exception: If the accession is not found in UniProtKB or UniParc.
    ValueError: If the returned FASTA does not contain exactly one sequence.
  """
  method = requests.head if head else requests.get
  logger.debug(f"Fetching uniprot entry with ID {uniprot_id}")
  response = method(f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.fasta")
  if response.status_code != 200:
    # The accession may be UniParc-only; retry against the UniParc endpoint.
    response = method(f"https://rest.uniprot.org/uniparc/{uniprot_id}.fasta")
    if response.status_code != 200:
      raise Exception(
        f'Could not find UniProt accession "{uniprot_id}" in either UniProtKB or UniParc. Please ensure that IDs are correct and refer to actual proteins.'
      )
  if head:
    return True
  sequences = [sequence for _, sequence in read_msa(response.text)]
  # Log the offending payload through the module logger (not print) so the
  # diagnostics end up in the same place as the rest of this module's output.
  if len(sequences) > 1:
    logger.error(response.text)
    raise ValueError("Too many sequences returned")
  if len(sequences) < 1:
    logger.error(response.text)
    raise ValueError("No sequence returned")
  return sequences[0]