# Source code for neurosnap.database.uniprot

"""UniProt and UniParc sequence retrieval helpers."""

import io
from typing import Dict, Iterable, Union

import pandas as pd
import requests
from tqdm import tqdm

from neurosnap.log import logger
from neurosnap.sequence.align import read_msa


def fetch_accessions(accessions: Iterable[str], batch_size: int = 150) -> Dict[str, Union[str, None]]:
  """Fetch sequences corresponding to a list of UniProt accession numbers.

  This function queries UniParc first and then UniProtKB for any missing
  accessions. Accessions are processed in batches to handle large lists
  efficiently.

  Args:
    accessions: A list of UniProt accession numbers. Duplicate accessions
      are removed automatically.
    batch_size: Number of accessions to query per request.

  Returns:
    Dictionary mapping accession numbers to protein sequences. Missing
    accessions are assigned ``None``.

  Raises:
    requests.exceptions.HTTPError: If an API request fails.
  """
  accessions = list(set(str(x).strip() for x in accessions))
  batches = [accessions[i : i + batch_size] for i in range(0, len(accessions), batch_size)]
  output: Dict[str, Union[str, None]] = {}

  # Pass 1: UniParc. Isoform accessions (those containing "-") must be queried
  # with the "isoform" field rather than "accession".
  for batch in tqdm(batches, desc="Fetching sequences from uniprot.org", total=len(batches)):
    batch_set = set(batch)  # O(1) membership tests instead of O(n) list scans per row
    query = " OR ".join([f"isoform:{accession}" if "-" in accession else f"accession:{accession}" for accession in batch])
    response = requests.get(f"https://rest.uniprot.org/uniparc/search?fields=accession,sequence&format=tsv&query=({query})&size=500")
    if response.status_code == 200:
      dataframe = pd.read_csv(io.StringIO(response.text), sep="\t")
      for _, row in dataframe.iterrows():
        # A single UniParc entry can map to several UniProtKB accessions;
        # record the sequence for the first one that belongs to this batch.
        for accession in row.UniProtKB.split("; "):
          if accession in batch_set and accession not in output:
            output[accession] = row.Sequence
            break
    else:
      logger.error(f"[{response.status_code}] {response.text}")
      response.raise_for_status()

  # Pass 2: UniProtKB, only for the accessions UniParc could not resolve.
  accessions_missing = [accession for accession in accessions if accession not in output]
  batches = [accessions_missing[i : i + batch_size] for i in range(0, len(accessions_missing), batch_size)]
  for batch in tqdm(batches, desc="Fetching sequences from uniprot.org", total=len(batches)):
    batch_set = set(batch)
    query = " OR ".join([f"accession:{accession}" for accession in batch])
    response = requests.get(f"https://rest.uniprot.org/uniprotkb/search?fields=accession,sequence&format=tsv&query=({query})&size=500")
    if response.status_code == 200:
      dataframe = pd.read_csv(io.StringIO(response.text), sep="\t")
      for _, row in dataframe.iterrows():
        if row.Entry in batch_set and row.Entry not in output:
          output[row.Entry] = row.Sequence
    else:
      logger.error(f"[{response.status_code}] {response.text}")
      response.raise_for_status()

  # Anything still unresolved gets None so the caller sees an entry for every
  # requested accession.
  for accession in accessions:
    if accession not in output:
      output[accession] = None
      logger.warning(f"Could not find a sequence for accession: {accession}")
  return output
def fetch_uniprot(uniprot_id: str, head: bool = False) -> Union[str, bool]:
  """Fetch a UniProtKB or UniParc FASTA entry by identifier.

  Args:
    uniprot_id: UniProtKB or UniParc accession ID.
    head: If ``True``, perform a HEAD request and return whether the entry exists.

  Returns:
    ``True`` when ``head`` is enabled and the accession exists, otherwise the
    fetched protein sequence.

  Raises:
    Exception: If the accession is not found in UniProtKB or UniParc.
    ValueError: If the returned FASTA does not contain exactly one sequence.
  """
  method = requests.head if head else requests.get
  logger.debug(f"Fetching uniprot entry with ID {uniprot_id}")
  # Try UniProtKB first; fall back to UniParc for IDs not in the knowledgebase.
  response = method(f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.fasta")
  if response.status_code != 200:
    response = method(f"https://rest.uniprot.org/uniparc/{uniprot_id}.fasta")
    if response.status_code != 200:
      raise Exception(
        f'Could not find UniProt accession "{uniprot_id}" in either UniProtKB or UniParc. Please ensure that IDs are correct and refer to actual proteins.'
      )
  if head:
    return True
  sequences = [sequence for _, sequence in read_msa(response.text)]
  # Log (rather than print) the raw FASTA payload so the diagnostic ends up in
  # the module's log stream alongside the raised error.
  if len(sequences) > 1:
    logger.error(response.text)
    raise ValueError("Too many sequences returned")
  if len(sequences) < 1:
    logger.error(response.text)
    raise ValueError("No sequence returned")
  return sequences[0]