feat: add methods for computing run lengths of repeats in a sequence

The `longest_dinucleotide_run_length` method is optimized for dinucleotides, whereas the `longest_multinucleotide_run_length` method is general for any repeat of length N.
fulcrumgenomics · Jul 17, 2024 · 0a40798 · 0a40798
1 parent dfcb7e2
commit 0a40798
Show file tree

Hide file tree

Showing 2 changed files with 99 additions and 8 deletions.
diff --git a/fgpyo/sequence.py b/fgpyo/sequence.py
@@ -5,6 +5,7 @@
 """
 
 from typing import Dict
+from typing import Optional
 
 _COMPLEMENTS: Dict[str, str] = {
  # Discrete bases
@@ -70,9 +71,19 @@ def reverse_complement(bases: str) -> str:
  return "".join([_COMPLEMENTS[b] for b in bases[::-1]])
 
 
-def longest_hp_length(bases: str) -> int:
+def gc_content(bases: str) -> float:
+ """Calculates the fraction of G and C bases in a sequence."""
+ if len(bases) == 0:
+ return 0
+ gc_count = sum(1 for base in bases if base == "C" or base == "G" or base == "c" or base == "g")
+ return gc_count / len(bases)
+
+
+def longest_hp_length(bases: str, _max_hp: Optional[int] = None) -> int:
  """Calculates the length of the longest homopolymer in the input sequence."""
- max_hp = 0
+ # NB: _max_hp is a private parameter used by longest_multinucleotide_run_length when we want
+ # to search for homopolymer run lengths of at least _max_hp
+ max_hp = 0 if _max_hp is None else _max_hp
  i = 0
  # NB: if we have found a homopolymer of length `max_hp`, then we do not need
  # to examine the last `max_hp` bases since we'll never find a longer one.
@@ -88,9 +99,55 @@ def longest_hp_length(bases: str) -> int:
  return max_hp
 
 
-def gc_content(bases: str) -> float:
- """Calculates the fraction of G and C bases in a sequence."""
- if len(bases) == 0:
+def longest_dinucleotide_run_length(bases: str) -> int:
+ """Number of bases in the longest dinucleotide run in a primer.
+
+ A dinucleotide run is when two nucleotides are repeated in tandem. For example,
+ TTGG (length = 4) or AACCAACCAA (length = 10). If there are no such runs, returns 0."""
+
+ best_length: int = 0
+
+ start = 0 # the start index of the current dinucleotide run
+ while start < len(bases) - 1:
+ # get the dinuc bases
+ first_base = bases[start]
+ second_base = bases[start + 1]
+ # keep going while there are more di-nucs
+ end = start + 2
+ while end < len(bases) - 1 and bases[end] == first_base and bases[end + 1] == second_base:
+ end += 2
+ # update the longest total run length
+ best_length = max(best_length, end - start)
+ # skip to the last base of the current run
+ start += end - start - 1
+
+ return best_length
+
+
+def longest_multinucleotide_run_length(bases: str, n: int) -> int:
+ """Number of bases in the longest multi-nucleotide run.
+
+ A multi-nucleotide run is when N nucleotides are repeated in tandem. For example,
+ TTGG (length = 4, N=2) or TAGTAGTAG (length = 9, N = 3). If there are no such runs,
+ returns 0."""
+
+ if len(bases) < n:
  return 0
- gc_count = sum(1 for base in bases if base == "C" or base == "G" or base == "c" or base == "g")
- return gc_count / len(bases)
+ elif len(bases) == n:
+ return len(bases)
+ elif n == 1:
+ return longest_hp_length(bases=bases)
+ elif n == 2:
+ return longest_dinucleotide_run_length(bases=bases)
+
+ best_length: int = 0
+ for k in range(n):
+ # split the bases into groups of <n> bases, such that there are ~`len(bases) / n` items
+ cur = [bases[i : i + n] for i in range(k, len(bases) - k, n)]
+ # find the longest run of identical items
+ cur_length = longest_hp_length(cur, _max_hp=best_length) # type: ignore
+ # update the current longest run of identical items
+ best_length = max(best_length, cur_length)
+ # since each "item" contains "n" bases, we must multiply by "n"
+ best_length *= n
+ return best_length
diff --git a/tests/fgpyo/test_sequence.py b/tests/fgpyo/test_sequence.py
@@ -2,7 +2,7 @@
 
 import pytest
 
-from fgpyo.sequence import gc_content
+from fgpyo.sequence import gc_content, longest_multinucleotide_run_length
 from fgpyo.sequence import longest_hp_length
 from fgpyo.sequence import reverse_complement
 
@@ -54,3 +54,37 @@ def test_homopolymer(bases: str, expected_hp_len: int) -> None:
 )
 def test_gc_content(bases: str, expected_gc_content: float) -> None:
  assert gc_content(bases) == expected_gc_content
+
+
+@pytest.mark.parametrize(
+ "bases, n, expected_length",
+ [
+ ("", 2, 0),
+ ("", 1, 0),
+ ("A", 1, 1),
+ ("A", 2, 0),
+ ("ACGT", 2, 2),
+ ("AACCGGTT",2, 2),
+ ("TTTTCCCGGA",2, 4),
+ ("ACCGGGTTTT",2, 4),
+ ("GTGTGTAAAA", 2, 6),
+ ("GTGTGTGAAAA",2, 6),
+ ("GTGTGTACACACAC",2, 8),
+ ("GTGTGTGACACACAC",2, 8),
+ ("AAACTCTCTCGG", 2, 6),
+ ("AACTCTCTCGGG", 2, 6),
+ ("TGTATATATA", 2, 8),
+ ("TGTATATATAC",2, 8),
+ ("TGATATATATC",2, 8),
+ ("TGATATATAC", 2, 6),
+ ("TAGTAGTAG", 3, 9),
+ ("TATAGTAGTAGTA", 3, 9),
+ ("TACGTACGTACTG", 4, 8),
+ ("TAGTAGTAGTAG", 3, 12),
+ ("TAGTAGTAGTAG", 6, 12),
+ ("TAGTAGTAGTAG", 12, 12),
+ ],
+)
+def test_longest_multinucleotide_run_length(bases: str, n: int, expected_length: int) -> None:
+ assert expected_length == 0 or expected_length % n == 0
+ assert longest_multinucleotide_run_length(bases, n=n) == expected_length