Test bonanza for v3 (#203)

Add more tests and fix various small bugs
BioJulia · Oct 31, 2021 · cd6caac · cd6caac
1 parent 832fb7c
commit cd6caac
Show file tree

Hide file tree

Showing 19 changed files with 199 additions and 74 deletions.
diff --git a/src/BioSequences.jl b/src/BioSequences.jl
@@ -126,6 +126,7 @@ export
  reverse_complement!,
  ungap,
  ungap!,
+ join!,
 
  ###
  ### LongSequence

diff --git a/src/biosequence/biosequence.jl b/src/biosequence/biosequence.jl
@@ -100,7 +100,7 @@ end
 
 # Fast path for iterables we know are stateless
 function join!(seq::BioSequence, it::Union{Vector, Tuple, Set})
- _join(resize!(seq, sum(length, it)), it, Val(true))
+ _join!(resize!(seq, sum(length, it)), it, Val(true))
 end
 
 join!(seq::BioSequence, it) = _join!(seq, it, Val(false))

diff --git a/src/biosequence/copying.jl b/src/biosequence/copying.jl
@@ -32,7 +32,7 @@ julia> copyto!(seq, 2, rna"UUUU", 1, 4)
 TTTTTM
 ```
 """
-function _copyto!(dst::BioSequence{A}, doff::Integer,
+function Base.copyto!(dst::BioSequence{A}, doff::Integer,
  src::BioSequence, soff::Integer,
  N::Integer) where {A <: Alphabet}
 

diff --git a/src/biosequence/indexing.jl b/src/biosequence/indexing.jl
@@ -101,9 +101,6 @@ Base.@propagate_inbounds function Base.setindex!(seq::BioSequence, x, locs::Abst
  return seq
 end
 
-# For backwards compatibility
-unsafe_setindex!(seq::BioSequence, x, i) = @inbounds seq[i] = x
-
 function Base.setindex!(seq::BioSequence, x, ::Colon)
  return setindex!(seq, x, 1:lastindex(seq))
 end
diff --git a/src/biosequence/transformations.jl b/src/biosequence/transformations.jl
@@ -235,7 +235,7 @@ end
 Create the canonical sequence of `seq`.
 
 """
-canonical(seq::NucleotideSeq) = is_canonical(seq) ? copy(seq) : reverse_complement(seq)
+canonical(seq::NucleotideSeq) = iscanonical(seq) ? copy(seq) : reverse_complement(seq)
 
 "Create a copy of a sequence with gap characters removed."
 ungap(seq::BioSequence) = filter(!isgap, seq)

diff --git a/src/bit-manipulation/bitindex.jl b/src/bit-manipulation/bitindex.jl
@@ -48,7 +48,6 @@ Base.:-(i::BitIndex{N,W}, n::Integer) where {N,W} = BitIndex{N,W}(i.val - n)
 Base.:-(i1::BitIndex, i2::BitIndex) = i1.val - i2.val
 Base.:(==)(i1::BitIndex, i2::BitIndex) = i1.val == i2.val
 Base.isless(i1::BitIndex, i2::BitIndex) = isless(i1.val, i2.val)
-Base.cmp(i1::BitIndex, i2::BitIndex) = cmp(i1.val, i2.val)
 
 @inline function nextposition(i::BitIndex{N,W}) where {N,W}
  return i + N
@@ -76,13 +75,6 @@ Base.show(io::IO, i::BitIndex) = print(io, '(', index(i), ", ", offset(i), ')')
  return chunk & bitmask(bidx)
 end
 
-"Extract the element stored in a packed bitarray referred to by bidx."
-@inline function extract_encoded_element(bidx::BitIndex{N,W}, data::NTuple{n,W}) where {N,n,W}
- @inbounds chunk = data[index(bidx)]
- offchunk = chunk >> (bitwidth(bidx) - N - offset(bidx))
- return offchunk & bitmask(bidx)
-end
-
 # Create a bit mask that fills least significant `n` bits (`n` must be a
 # non-negative integer).
 "Create a bit mask covering the least significant `n` bits."
@@ -95,4 +87,3 @@ end
 # This is used in the extract_encoded_element function.
 bitmask(::BitIndex{N,W}) where {N, W} = bitmask(W, N)
 bitmask(n::Integer) = bitmask(UInt64, n)
-bitmask(::Type{T}, ::Val{N}) where {T, N} = (one(T) << N) - one(T)
diff --git a/src/longsequences/constructors.jl b/src/longsequences/constructors.jl
@@ -22,7 +22,6 @@ function LongSequence{A}(::UndefInitializer, len::Integer) where {A<:Alphabet}
 end
 
 # Generic constructor
-LongSequence(it) = LongSequence{eltype(it)}(it)
 function LongSequence{A}(it) where {A <: Alphabet}
  len = length(it)
  data = Vector{UInt64}(undef, seq_data_len(A, len))
@@ -63,16 +62,16 @@ function LongSequence{A}(s::Union{String, SubString{String}}, ::AsciiAlphabet) w
 end
 
 function LongSequence{A}(
-  src::Union{AbstractString,AbstractVector{UInt8}},
-  startpos::Integer=1,
- stoppos::Integer=length(src)) where {A<:Alphabet}
- len = stoppos - startpos + 1
+ src::Union{AbstractString,AbstractVector{UInt8}},
+ part::AbstractUnitRange{<:Integer}=1:length(src)
+) where {A<:Alphabet}
+ len = length(part)
  seq = LongSequence{A}(undef, len)
- return copyto!(seq, 1, src, startpos, len)
+ return copyto!(seq, 1, src, first(part), len)
 end
 
 # create a subsequence
-function LongSequence(other::LongSequence, part::UnitRange{<:Integer})
+function LongSequence(other::LongSequence, part::AbstractUnitRange{<:Integer})
  checkbounds(other, part)
  subseq = typeof(other)(undef, length(part))
  copyto!(subseq, 1, other, first(part), length(part))

diff --git a/src/longsequences/indexing.jl b/src/longsequences/indexing.jl
@@ -10,11 +10,6 @@
  bitindex(N, encoded_data_eltype(typeof(x)), i)
 end
 
-@inline function bitindex(x::LongSubSeq, i::Integer)
- N = BitsPerSymbol(Alphabet(typeof(x)))
- bitindex(N, encoded_data_eltype(typeof(x)), i % UInt + first(x.part) - 1)
-end
-
 firstbitindex(s::SeqOrView) = bitindex(s, firstindex(s))
 lastbitindex(s::SeqOrView) = bitindex(s, lastindex(s))
 
@@ -23,10 +18,6 @@ lastbitindex(s::SeqOrView) = bitindex(s, lastindex(s))
  extract_encoded_element(bi, x.data)
 end
 
-@inline function encoded_setindex!(seq::SeqOrView, bin::Unsigned, i::Integer)
- encoded_setindex!(seq, UInt64(bin), bitindex(seq, i))
-end
-
 @inline function encoded_setindex!(s::SeqOrView, v::UInt64, i::BitIndex)
  vi, off = i
  data = s.data
@@ -36,25 +27,30 @@ end
 end
 
 # More efficient due to copyto!
-function Base.getindex(seq::LongSequence, part::UnitRange{<:Integer})
+function Base.getindex(seq::LongSequence, part::AbstractUnitRange{<:Integer})
  @boundscheck checkbounds(seq, part)
  newseq = typeof(seq)(undef, length(part))
  return copyto!(newseq, 1, seq, first(part), length(part))
 end
 
 # More efficient due to copyto!
-function Base.setindex!(seq::SeqOrView{A},
- other::SeqOrView{A},
- locs::UnitRange{<:Integer}) where {A <: Alphabet}
+function Base.setindex!(
+ seq::SeqOrView{A},
+ other::SeqOrView{A},
+ locs::AbstractUnitRange{<:Integer}
+) where {A <: Alphabet}
  @boundscheck checkbounds(seq, locs)
  @boundscheck if length(other) != length(locs)
- throw(DimensionMismatch("Attempt to assign $(length(locs)) values to $(length(seq)) destinations"))
+ throw(DimensionMismatch("Attempt to assign $(length(other)) values to $(length(locs)) destinations"))
  end
- return copyto!(seq, locs.start, other, 1, length(locs))
+ return copyto!(seq, first(locs), other, 1, length(locs)) # first(): not every AbstractUnitRange has a `.start` field (e.g. Base.OneTo)
 end
 
-@inline function encoded_setindex!(seq::SeqOrView{A},
- bin::Unsigned, i::Integer) where {A <: Alphabet}
+@inline function encoded_setindex!(
+ seq::SeqOrView{A},
+ bin::Unsigned, 
+ i::Integer
+) where {A <: Alphabet}
  return encoded_setindex!(seq, bin, bitindex(seq, i))
 end
 

diff --git a/src/longsequences/seqview.jl b/src/longsequences/seqview.jl
@@ -34,9 +34,11 @@ Base.length(v::LongSubSeq) = last(v.part) - first(v.part) + 1
 Base.copy(v::LongSubSeq{A}) where A = LongSequence{A}(v)
 
 encoded_data_eltype(::Type{<:LongSubSeq}) = encoded_data_eltype(LongSequence)
+symbols_per_data_element(x::LongSubSeq) = div(64, bits_per_symbol(Alphabet(x)))
 
-@inline function bitindex(x::LongSubSeq{A}, i::Integer) where A
- bitindex(BitsPerSymbol(A), encoded_data_eltype(typeof(x)), i - first(x.part) + 1)
+@inline function bitindex(x::LongSubSeq, i::Integer)
+ N = BitsPerSymbol(Alphabet(typeof(x)))
+ bitindex(N, encoded_data_eltype(typeof(x)), i % UInt + first(x.part) - 1)
 end
 
 # Constructors
@@ -48,12 +50,12 @@ function LongSubSeq{A}(seq::LongSubSeq{A}) where A
  return LongSubSeq{A}(seq.data, seq.part)
 end
 
-function LongSubSeq{A}(seq::LongSequence{A}, part::UnitRange{<:Integer}) where A
+function LongSubSeq{A}(seq::LongSequence{A}, part::AbstractUnitRange{<:Integer}) where A
  @boundscheck checkbounds(seq, part)
- return LongSubSeq{A}(seq.data, part)
+ return LongSubSeq{A}(seq.data, UnitRange{Int}(part))
 end
 
-function LongSubSeq{A}(seq::LongSubSeq{A}, part::UnitRange{<:Integer}) where A
+function LongSubSeq{A}(seq::LongSubSeq{A}, part::AbstractUnitRange{<:Integer}) where A
  @boundscheck checkbounds(seq, part)
  newpart = first(part) + first(seq.part) - 1 : last(part) + first(seq.part) - 1
  return LongSubSeq{A}(seq.data, newpart)
@@ -66,7 +68,7 @@ end
 LongSubSeq(seq::SeqOrView, ::Colon) = LongSubSeq(seq, 1:lastindex(seq))
 LongSubSeq(seq::BioSequence{A}) where A = LongSubSeq{A}(seq)
 
-Base.view(seq::SeqOrView, part::UnitRange) = LongSubSeq(seq, part)
+Base.view(seq::SeqOrView, part::AbstractUnitRange) = LongSubSeq(seq, part)
 
 # Conversion
 function LongSequence(s::LongSubSeq{A}) where A
@@ -95,6 +97,6 @@ function Base.convert(::Type{T1}, seq::T2) where
 end
 
 # Indexing
-function Base.getindex(seq::LongSubSeq, part::UnitRange{<:Integer})
+function Base.getindex(seq::LongSubSeq, part::AbstractUnitRange{<:Integer})
  return LongSubSeq(seq, part)
 end
diff --git a/test/alphabet.jl b/test/alphabet.jl
@@ -36,6 +36,14 @@
  end
 end
 
+@testset "Basic" begin
+ @test length(DNAAlphabet{4}()) == 16
+ @test length(RNAAlphabet{2}()) == 4
+ @test length(AminoAcidAlphabet()) == 28
+
+ @test BioSequences.symbols(RNAAlphabet{2}()) == (RNA_A, RNA_C, RNA_G, RNA_U)
+end
+
 encode = BioSequences.encode
 EncodeError = BioSequences.EncodeError
 decode = BioSequences.decode
@@ -83,6 +91,10 @@ end
  @test encode(ReducedAAAlphabet(), aa) === data
  @test decode(ReducedAAAlphabet(), data) === aa
  end
+
+ str = "NSTPHML"
+ @test String(LongSequence{ReducedAAAlphabet}(str)) == str
+
  @test_throws EncodeError encode(ReducedAAAlphabet(), AA_V)
  @test_throws EncodeError encode(ReducedAAAlphabet(), AA_I)
  @test_throws EncodeError encode(ReducedAAAlphabet(), AA_R)

diff --git a/test/biosequences/biosequence.jl b/test/biosequences/biosequence.jl
@@ -47,6 +47,19 @@ random_simple(len::Integer) = SimpleSeq(rand([RNA_A, RNA_C, RNA_G, RNA_U], len))
  @test prevind(seq, lastindex(seq)) == 2
  @test prevind(seq, 2) == 1
 
+ seq2 = SimpleSeq([RNA_U, RNA_C, RNA_U])
+ gen = (i for i in [seq, seq2])
+ @test join!(SimpleSeq([]), [seq, seq2]) == SimpleSeq([RNA(i) for i in "CGUUCU"])
+ @test join!(SimpleSeq([]), gen) == SimpleSeq([RNA(i) for i in "CGUUCU"])
+ @test join(SimpleSeq, [seq, seq2]) == join!(SimpleSeq([]), [seq, seq2])
+ @test join(SimpleSeq, gen) == join!(SimpleSeq([]), gen)
+
+ @test copy!(SimpleSeq([]), seq) == seq
+ seq3 = copy(seq2)
+ @test copyto!(seq3, seq) == seq
+ seq3 = copy(seq2)
+ @test copyto!(seq3, 2, seq, 3, 1) == SimpleSeq([RNA(i) for i in "UUU"])
+
  @test_throws EncodeError SimpleSeq([RNA_C, RNA_G, RNA_M])
  @test_throws EncodeError SimpleSeq([RNA_Gap])
  @test_throws MethodError SimpleSeq(1:3)

diff --git a/test/biosequences/indexing.jl b/test/biosequences/indexing.jl
@@ -172,4 +172,13 @@ end
  seq[1:2:17] = "ACG"^3
  @test seq == SimpleSeq(map(RNA, collect("AACCGCAUCAGGAUCUG")))
  end
+end
+
+@testset "BitIndex" begin
+ ind = BioSequences.BitIndex{4, UInt64}(16) # 4 bits per symbol, UInt64 chunks, bit position 16
+ @test BioSequences.BitsPerSymbol(ind) == BioSequences.BitsPerSymbol{4}()
+ @test BioSequences.bitwidth(UInt64) == 64
+ @test BioSequences.bitwidth(UInt16) == 16
+ @test BioSequences.prevposition(ind) == BioSequences.BitIndex{4, UInt64}(12) # one symbol (4 bits) back
+ @test BioSequences.nextposition(ind) == BioSequences.BitIndex{4, UInt64}(20) # one symbol (4 bits) forward
 end
diff --git a/test/biosequences/misc.jl b/test/biosequences/misc.jl
@@ -120,6 +120,12 @@ end
  @test iscanonical(SimpleSeq("AAUU"))
  @test !iscanonical(SimpleSeq("UGGA"))
  @test !iscanonical(SimpleSeq("CGAU"))
+
+ @test canonical(SimpleSeq("UGGA")) == SimpleSeq("UCCA")
+ @test canonical(SimpleSeq("GCAC")) == SimpleSeq("GCAC")
+ seq = SimpleSeq("CGAU")
+ canonical!(seq)
+ @test seq == SimpleSeq("AUCG")
 end
 
 @testset "Ispalindromic" begin
@@ -153,3 +159,35 @@ end
  @test !hasambiguity(SimpleSeq("A"))
  @test !hasambiguity(SimpleSeq("ACGU"))
 end
+
+@testset "Shuffle" begin
+ function test_same(a, b)
+ @test all(symbols(Alphabet(a))) do i
+ count(isequal(i), a) == count(isequal(i), b)
+ end
+ end
+ seq = SimpleSeq([RNA(i) for i in "AGCGUUAUGCUGAUUAGGAC"])
+ seq2 = Random.shuffle(seq)
+ test_same(seq, seq2)
+ Random.shuffle!(seq)
+ test_same(seq, seq2)
+end
+
+@testset "Reverse-complement" begin
+ seq = SimpleSeq([RNA(i) for i in "UAGUUC"])
+ @test reverse(seq) == SimpleSeq([RNA(i) for i in "CUUGAU"])
+ @test complement(seq) == SimpleSeq([RNA(i) for i in "AUCAAG"])
+ @test reverse_complement(seq) == reverse(complement(seq))
+
+ reverse!(seq)
+ @test seq == SimpleSeq([RNA(i) for i in "CUUGAU"])
+ complement!(seq)
+ @test seq == SimpleSeq([RNA(i) for i in "GAACUA"])
+end
+
+@testset "Ungap" begin
+ seq = SimpleSeq([RNA(i) for i in "UAGUUC"])
+ @test ungap(seq) == seq
+ cp = copy(seq)
+ @test ungap!(seq) == cp
+end