Source code for neurosnap.nucleotide

"""
Provides functions and classes related to processing nucleotide data.
"""

import gzip
import re
from pathlib import Path
from typing import Tuple, Union

from neurosnap.log import logger


[docs] def get_reverse_complement(seq: str) -> str: """ Generate the complementary strand of a DNA or RNA sequence in reverse order. Args: seq (str): A string representing the nucleotide sequence. Valid characters are 'A', 'T', 'C', 'G' for DNA and 'A', 'U', 'C', 'G' for RNA. Returns: str: A string representing the reverse complementary strand of the input sequence. Raises: KeyError: If the input sequence contains invalid nucleotide characters. """ complement = {"A": "T", "C": "G", "G": "C", "T": "A", "U": "A"} return "".join([complement[base] for base in seq[::-1]])
[docs] def split_interleaved_fastq( fn_in: Union[str, Path], output_dir: Union[str, Path], *, preserve_identifier_names: bool = False, ) -> Tuple[Path, Path]: """ Split an interleaved FASTQ into left/right FASTQ files. Assumes pairs are adjacent (left read followed by right read) and rewrites headers as "@<index>/1" and "@<index>/2". Supports gzip-compressed inputs with filenames ending in ".fastq.gz" or ".fq.gz". Compression is detected by filename and streamed during read. Parameters: fn_in: Path to the interleaved FASTQ file. output_dir: Directory to write outputs into. preserve_identifier_names: If True, preserve the input read identifiers (normalizing mate suffix to "/1" or "/2"). If False, rewrite identifiers as "@<index>/1" and "@<index>/2". Returns: Tuple[Path, Path]: Paths to the left and right FASTQ output files. """ fn_in_path = Path(fn_in) output_dir_path = Path(output_dir) output_dir_path.mkdir(parents=True, exist_ok=True) left_path = output_dir_path / "split_left.fq" right_path = output_dir_path / "split_right.fq" is_gz = fn_in_path.name.endswith((".fastq.gz", ".fq.gz")) open_fn = gzip.open if is_gz else open open_mode = "rt" if is_gz else "r" # Status corresponds to the expected type of line to read next # - "@" = Header for NT sequence # - "BP" = NT sequence # - "+" = Header for quality assurance sequence # - "QA" = Assurance sequence status = "@" current_len = None read_direction = 1 # 1 corresponds to left read (first), 2 corresponds to right read. index = 1 # Actual index with open_fn(fn_in_path, open_mode) as fin: with open(left_path, "w") as fout_l: with open(right_path, "w") as fout_r: for i, line in enumerate(fin, start=1): line = line.strip() if status == "@" and line.startswith("@"): if preserve_identifier_names: prefix, suffix = (line.split(" ", 1) + [""])[:2] base = re.sub(r"[/.][12]$", "", prefix) # stripe existing mate suffixes output = f"{base}/{read_direction}" + (f" {suffix}" if suffix else "") else: output = f"@{index}/{read_direction}" status = "BP" elif status == "BP" and re.search(r"^([GUATNC]*)$", line): output = line status = "+" current_len = len(line) elif status == "+" and line.startswith("+"): if preserve_identifier_names: prefix, suffix = (line.split(" ", 1) + [""])[:2] base = re.sub(r"[/.][12]$", "", prefix) # stripe existing mate suffixes output = f"{base}/{read_direction}" + (f" {suffix}" if suffix else "") else: output = "+" status = "QA" elif status == "QA" and re.search( r"^([ !\"#$%&'()*+,-.\/0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~])*$", line ): output = line status = "@" if current_len != len(line): raise ValueError(f"Sequence length does not match for line {i + 1}:\n{line}") else: print(status) raise ValueError(f"Unknown parsing error for line {i + 1}:\n{line}") # write to corresponding output file if read_direction == 1: fout_l.write(f"{output}\n") else: fout_r.write(f"{output}\n") # change read_direction if status == "@": if read_direction == 1: read_direction = 2 else: read_direction = 1 index += 1 assert read_direction == 1, "Uneven number of reads in both files" logger.info(f"Found total of {index:,} syntactically valid reads") return left_path, right_path