From 1d8f92dba2d82f3f929287e950fa029f7f662703 Mon Sep 17 00:00:00 2001 From: Yossi Farjoun Date: Fri, 18 Oct 2024 16:57:44 -0400 Subject: [PATCH] responding to review comments. --- fgpyo/sam/__init__.py | 71 ++++++++--------------------------- tests/fgpyo/sam/test_cigar.py | 64 +++++++++++++++++-------------- 2 files changed, 50 insertions(+), 85 deletions(-) diff --git a/fgpyo/sam/__init__.py b/fgpyo/sam/__init__.py index fd8e48a..669ada8 100644 --- a/fgpyo/sam/__init__.py +++ b/fgpyo/sam/__init__.py @@ -158,7 +158,6 @@ import enum import io import sys -from dataclasses import dataclass from pathlib import Path from typing import IO from typing import Any @@ -361,46 +360,6 @@ class _CigarOpUtil: } -@dataclass(frozen=True) -class RangeOfBases: - """A simple data class for holding offsets into a range of bases in a read. - - Attributes: - start (int): The starting offset (0-based) into the range. - stop (int): The ending (excluded) offset into the range. - - Properties: - slice: A slice of the range. can be used rto extract the aligned bases from a string. - range: The range of bases represented by this object. Can be used to obtain the indexes - into the aligned bases in the read. - __len__: The length of the range. - __iter__: Enables unpacking of start and stop into a tuple by return the iterator of - (start, stop) - - """ - - start: int - stop: int - - @property - def slice(self) -> slice: - """A slice of the range""" - return slice(self.start, self.stop) - - @property - def range(self) -> range: - """The range of bases represented by this object""" - return range(self.start, self.stop) - - def __len__(self) -> int: - """The length of the range""" - return self.stop - self.start - - def __iter__(self) -> Iterable[int]: - """enables unpacking of start and stop into a tuple""" - return (self.start, self.stop).__iter__() - - @enum.unique class CigarOp(enum.Enum): """Enumeration of operators that can appear in a Cigar string. @@ -588,28 +547,28 @@ def length_on_target(self) -> int: """Returns the length of the alignment on the target sequence.""" return sum([elem.length_on_target for elem in self.elements]) - def query_alignment_offsets(self, reverse: bool = False) -> Optional[RangeOfBases]: + def query_alignment_offsets(self) -> tuple: """Gets the 0-based, end-exclusive positions of the first and last aligned base in the - query. The resulting range will contain the range of positions in the SEQ string for - the bases that are aligned. If no bases are aligned, the return value will be None. + query. - Args: - reverse: If True, count from the end of the query. i.e. find the offsets - using the reversed elements of the cigar. + The resulting range will contain the range of positions in the SEQ string for + the bases that are aligned. If no bases are aligned, the return value will be None. + If counting from the end of the query is desired, use + `cigar.reversed().query_alignment_offsets()` Returns: - A RangeOfBases object containing the start and stop positions (0-based, end-exclusive) + A tuple (start, stop) containing the start and stop positions of the aligned part of the query. These offsets are 0-based and open-ended, with - respect to the beginning of the query. (If 'reverse' is True, the offsets are with - respect to the reversed query.) If no bases are aligned, the return value will be - None. + respect to the beginning of the query. If no bases are aligned. + + Throws: + An """ start_offset: int = 0 end_offset: int = 0 element: CigarElement alignment_began = False - elements = self.elements if not reverse else reversed(self.elements) - for element in elements: + for element in self.elements: if element.operator.is_clipping and not alignment_began: # We are in the clipping operators preceding the alignment # Note: hardclips have length-on-query=0 @@ -623,9 +582,9 @@ def query_alignment_offsets(self, reverse: bool = False) -> Optional[RangeOfBase # We have exited the alignment and are in the clipping operators after the alignment break - ret = RangeOfBases(start_offset, end_offset) - if not alignment_began or len(ret) == 0: - return None + ret = (start_offset, end_offset) + if start_offset == end_offset: + raise ValueError("Cigar has no aligned bases") return ret def __getitem__( diff --git a/tests/fgpyo/sam/test_cigar.py b/tests/fgpyo/sam/test_cigar.py index d744ca6..f93deed 100644 --- a/tests/fgpyo/sam/test_cigar.py +++ b/tests/fgpyo/sam/test_cigar.py @@ -4,7 +4,6 @@ import pytest from fgpyo.sam import Cigar -from fgpyo.sam import RangeOfBases cigar = Cigar.from_cigarstring("1M4D45N37X23I11=") @@ -40,54 +39,61 @@ def test_bad_index_raises_type_error(index: int) -> None: @pytest.mark.parametrize( ("cigar_string", "maybe_range"), { - ("10M", RangeOfBases(0, 10)), - ("10M10I", RangeOfBases(0, 20)), - ("10X10I", RangeOfBases(0, 20)), - ("10X10D", RangeOfBases(0, 10)), - ("10=10D", RangeOfBases(0, 10)), - ("10S10M", RangeOfBases(10, 20)), - ("10H10M", RangeOfBases(0, 10)), - ("10H10S10M", RangeOfBases(10, 20)), - ("10H10S10M5S", RangeOfBases(10, 20)), - ("10H10S10M5S10H", RangeOfBases(10, 20)), + ("10M", (0, 10)), + ("10M10I", (0, 20)), + ("10X10I", (0, 20)), + ("10X10D", (0, 10)), + ("10=10D", (0, 10)), + ("10S10M", (10, 20)), + ("10H10M", (0, 10)), + ("10H10S10M", (10, 20)), + ("10H10S10M5S", (10, 20)), + ("10H10S10M5S10H", (10, 20)), ("10H", None), ("10S", None), ("10S10H", None), ("5H10S10H", None), ("76D", None), - ("76I", RangeOfBases(0, 76)), + ("76I", (0, 76)), ("10P76S", None), ("50S1000N50S", None), }, ) -def test_get_alignments(cigar_string: str, maybe_range: Optional[RangeOfBases]) -> None: +def test_get_alignments(cigar_string: str, maybe_range: Optional[tuple]) -> None: cig = Cigar.from_cigarstring(cigar_string) - - assert Cigar.query_alignment_offsets(cig, reverse=False) == maybe_range + if not maybe_range: + with pytest.raises(ValueError): + cig.query_alignment_offsets() + else: + ret = cig.query_alignment_offsets() + assert ret == maybe_range @pytest.mark.parametrize( ("cigar_string", "maybe_range"), { - ("10M", RangeOfBases(0, 10)), - ("10M10I", RangeOfBases(0, 20)), - ("10X10I", RangeOfBases(0, 20)), - ("10X10D", RangeOfBases(0, 10)), - ("10=10D", RangeOfBases(0, 10)), - ("10S10M", RangeOfBases(0, 10)), - ("10H10M", RangeOfBases(0, 10)), - ("10H10S10M", RangeOfBases(0, 10)), - ("10H10S10M5S", RangeOfBases(5, 15)), - ("10H10S10M5S10H", RangeOfBases(5, 15)), + ("10M", (0, 10)), + ("10M10I", (0, 20)), + ("10X10I", (0, 20)), + ("10X10D", (0, 10)), + ("10=10D", (0, 10)), + ("10S10M", (0, 10)), + ("10H10M", (0, 10)), + ("10H10S10M", (0, 10)), + ("10H10S10M5S", (5, 15)), + ("10H10S10M5S10H", (5, 15)), ("10H", None), ("10S", None), ("10S10H", None), ("5H10S10H", None), }, ) -def test_get_alignments_reversed(cigar_string: str, maybe_range: Optional[Tuple[int, int]]) -> None: +def test_get_alignments_reversed(cigar_string: str, maybe_range: Optional[Tuple]) -> None: cig = Cigar.from_cigarstring(cigar_string) - assert Cigar.query_alignment_offsets(cig, reverse=True) == maybe_range - if maybe_range is not None: - start, stop = maybe_range + if not maybe_range: + with pytest.raises(ValueError): + cig.reversed().query_alignment_offsets() + else: + ret = cig.reversed().query_alignment_offsets() + assert ret == maybe_range