Skip to content

Commit

Permalink
feat: add methods for computing run lengths of repeats in a sequence
Browse files Browse the repository at this point in the history
The `longest_dinucleotide_run_length` method is optimized for dinucleotides,
whereas the `longest_multinucleotide_run_length` method is general for
any repeat of length N.
  • Loading branch information
nh13 committed Jul 17, 2024
1 parent dfcb7e2 commit 0a40798
Show file tree
Hide file tree
Showing 2 changed files with 99 additions and 8 deletions.
71 changes: 64 additions & 7 deletions fgpyo/sequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
"""

from typing import Dict
from typing import Optional

_COMPLEMENTS: Dict[str, str] = {
# Discrete bases
Expand Down Expand Up @@ -70,9 +71,19 @@ def reverse_complement(bases: str) -> str:
return "".join([_COMPLEMENTS[b] for b in bases[::-1]])


def longest_hp_length(bases: str) -> int:
def gc_content(bases: str) -> float:
"""Calculates the fraction of G and C bases in a sequence."""
if len(bases) == 0:
return 0
gc_count = sum(1 for base in bases if base == "C" or base == "G" or base == "c" or base == "g")
return gc_count / len(bases)


def longest_hp_length(bases: str, _max_hp: Optional[int] = None) -> int:
"""Calculates the length of the longest homopolymer in the input sequence."""
max_hp = 0
# NB: _max_hp is a private parameter used by longest_multinucleotide_run_length when we want
# to search for homopolymer run lengths of at least _max_hp
max_hp = 0 if _max_hp is None else _max_hp
i = 0
# NB: if we have found a homopolymer of length `max_hp`, then we do not need
# to examine the last `max_hp` bases since we'll never find a longer one.
Expand All @@ -88,9 +99,55 @@ def longest_hp_length(bases: str) -> int:
return max_hp


def gc_content(bases: str) -> float:
"""Calculates the fraction of G and C bases in a sequence."""
if len(bases) == 0:
def longest_dinucleotide_run_length(bases: str) -> int:
"""Number of bases in the longest dinucleotide run in a primer.
A dinucleotide run is when two nucleotides are repeated in tandem. For example,
TTGG (length = 4) or AACCAACCAA (length = 10). If there are no such runs, returns 0."""

best_length: int = 0

start = 0 # the start index of the current dinucleotide run
while start < len(bases) - 1:
# get the dinuc bases
first_base = bases[start]
second_base = bases[start + 1]
# keep going while there are more di-nucs
end = start + 2
while end < len(bases) - 1 and bases[end] == first_base and bases[end + 1] == second_base:
end += 2
# update the longest total run length
best_length = max(best_length, end - start)
# skip to the last base of the current run
start += end - start - 1

return best_length


def longest_multinucleotide_run_length(bases: str, n: int) -> int:
"""Number of bases in the longest multi-nucleotide run.
A multi-nucleotide run is when N nucleotides are repeated in tandem. For example,
TTGG (length = 4, N=2) or TAGTAGTAG (length = 9, N = 3). If there are no such runs,
returns 0."""

if len(bases) < n:
return 0
gc_count = sum(1 for base in bases if base == "C" or base == "G" or base == "c" or base == "g")
return gc_count / len(bases)
elif len(bases) == n:
return len(bases)
elif n == 1:
return longest_hp_length(bases=bases)

Check warning on line 139 in fgpyo/sequence.py

View check run for this annotation

Codecov / codecov/patch

fgpyo/sequence.py#L139

Added line #L139 was not covered by tests
elif n == 2:
return longest_dinucleotide_run_length(bases=bases)

best_length: int = 0
for k in range(n):
# split the bases into groups of <n> bases, such that there are ~`len(bases) / n` items
cur = [bases[i : i + n] for i in range(k, len(bases) - k, n)]
# find the longest run of identical items
cur_length = longest_hp_length(cur, _max_hp=best_length) # type: ignore
# update the current longest run of identical items
best_length = max(best_length, cur_length)
# since each "item" contains "n" bases, we must multiply by "n"
best_length *= n
return best_length
36 changes: 35 additions & 1 deletion tests/fgpyo/test_sequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import pytest

from fgpyo.sequence import gc_content
from fgpyo.sequence import gc_content, longest_multinucleotide_run_length
from fgpyo.sequence import longest_hp_length
from fgpyo.sequence import reverse_complement

Expand Down Expand Up @@ -54,3 +54,37 @@ def test_homopolymer(bases: str, expected_hp_len: int) -> None:
)
def test_gc_content(bases: str, expected_gc_content: float) -> None:
assert gc_content(bases) == expected_gc_content


@pytest.mark.parametrize(
"bases, n, expected_length",
[
("", 2, 0),
("", 1, 0),
("A", 1, 1),
("A", 2, 0),
("ACGT", 2, 2),
("AACCGGTT",2, 2),
("TTTTCCCGGA",2, 4),
("ACCGGGTTTT",2, 4),
("GTGTGTAAAA", 2, 6),
("GTGTGTGAAAA",2, 6),
("GTGTGTACACACAC",2, 8),
("GTGTGTGACACACAC",2, 8),
("AAACTCTCTCGG", 2, 6),
("AACTCTCTCGGG", 2, 6),
("TGTATATATA", 2, 8),
("TGTATATATAC",2, 8),
("TGATATATATC",2, 8),
("TGATATATAC", 2, 6),
("TAGTAGTAG", 3, 9),
("TATAGTAGTAGTA", 3, 9),
("TACGTACGTACTG", 4, 8),
("TAGTAGTAGTAG", 3, 12),
("TAGTAGTAGTAG", 6, 12),
("TAGTAGTAGTAG", 12, 12),
],
)
def test_longest_multinucleotide_run_length(bases: str, n: int, expected_length: int) -> None:
assert expected_length == 0 or expected_length % n == 0
assert longest_multinucleotide_run_length(bases, n=n) == expected_length

0 comments on commit 0a40798

Please sign in to comment.