Source code for neurosnap.nucleotide
"""
Provides functions and classes related to processing nucleotide data.
"""
import gzip
import re
from pathlib import Path
from typing import Tuple, Union
from neurosnap.log import logger
[docs]
def get_reverse_complement(seq: str) -> str:
"""
Generate the complementary strand of a DNA or RNA sequence in reverse order.
Args:
seq (str): A string representing the nucleotide sequence.
Valid characters are 'A', 'T', 'C', 'G' for DNA and 'A', 'U', 'C', 'G' for RNA.
Returns:
str: A string representing the reverse complementary strand of the input sequence.
Raises:
KeyError: If the input sequence contains invalid nucleotide characters.
"""
complement = {"A": "T", "C": "G", "G": "C", "T": "A", "U": "A"}
return "".join([complement[base] for base in seq[::-1]])
[docs]
def split_interleaved_fastq(
fn_in: Union[str, Path],
output_dir: Union[str, Path],
*,
preserve_identifier_names: bool = False,
) -> Tuple[Path, Path]:
"""
Split an interleaved FASTQ into left/right FASTQ files.
Assumes pairs are adjacent (left read followed by right read) and rewrites
headers as "@<index>/1" and "@<index>/2".
Supports gzip-compressed inputs with filenames ending in ".fastq.gz" or ".fq.gz".
Compression is detected by filename and streamed during read.
Parameters:
fn_in: Path to the interleaved FASTQ file.
output_dir: Directory to write outputs into.
preserve_identifier_names: If True, preserve the input read identifiers
(normalizing mate suffix to "/1" or "/2"). If False, rewrite
identifiers as "@<index>/1" and "@<index>/2".
Returns:
Tuple[Path, Path]: Paths to the left and right FASTQ output files.
"""
fn_in_path = Path(fn_in)
output_dir_path = Path(output_dir)
output_dir_path.mkdir(parents=True, exist_ok=True)
left_path = output_dir_path / "split_left.fq"
right_path = output_dir_path / "split_right.fq"
is_gz = fn_in_path.name.endswith((".fastq.gz", ".fq.gz"))
open_fn = gzip.open if is_gz else open
open_mode = "rt" if is_gz else "r"
# Status corresponds to the expected type of line to read next
# - "@" = Header for NT sequence
# - "BP" = NT sequence
# - "+" = Header for quality assurance sequence
# - "QA" = Assurance sequence
status = "@"
current_len = None
read_direction = 1 # 1 corresponds to left read (first), 2 corresponds to right read.
index = 1 # Actual index
with open_fn(fn_in_path, open_mode) as fin:
with open(left_path, "w") as fout_l:
with open(right_path, "w") as fout_r:
for i, line in enumerate(fin, start=1):
line = line.strip()
if status == "@" and line.startswith("@"):
if preserve_identifier_names:
prefix, suffix = (line.split(" ", 1) + [""])[:2]
base = re.sub(r"[/.][12]$", "", prefix) # stripe existing mate suffixes
output = f"{base}/{read_direction}" + (f" {suffix}" if suffix else "")
else:
output = f"@{index}/{read_direction}"
status = "BP"
elif status == "BP" and re.search(r"^([GUATNC]*)$", line):
output = line
status = "+"
current_len = len(line)
elif status == "+" and line.startswith("+"):
if preserve_identifier_names:
prefix, suffix = (line.split(" ", 1) + [""])[:2]
base = re.sub(r"[/.][12]$", "", prefix) # stripe existing mate suffixes
output = f"{base}/{read_direction}" + (f" {suffix}" if suffix else "")
else:
output = "+"
status = "QA"
elif status == "QA" and re.search(
r"^([ !\"#$%&'()*+,-.\/0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~])*$", line
):
output = line
status = "@"
if current_len != len(line):
raise ValueError(f"Sequence length does not match for line {i + 1}:\n{line}")
else:
print(status)
raise ValueError(f"Unknown parsing error for line {i + 1}:\n{line}")
# write to corresponding output file
if read_direction == 1:
fout_l.write(f"{output}\n")
else:
fout_r.write(f"{output}\n")
# change read_direction
if status == "@":
if read_direction == 1:
read_direction = 2
else:
read_direction = 1
index += 1
assert read_direction == 1, "Uneven number of reads in both files"
logger.info(f"Found total of {index:,} syntactically valid reads")
return left_path, right_path