Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Tool to consensus call overlapping bases in paired end reads. Adds direct support in the consensus calling tools too. #805

Merged
merged 25 commits into from
Apr 4, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,190 @@
/*
* The MIT License
*
* Copyright (c) 2022 Fulcrum Genomics
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*/

package com.fulcrumgenomics.bam

import com.fulcrumgenomics.FgBioDef.{FgBioEnum, FilePath, PathToBam, PathToFasta, SafelyClosable}
import com.fulcrumgenomics.bam.api.{SamOrder, SamSource, SamWriter}
import com.fulcrumgenomics.cmdline.{ClpGroups, FgBioTool}
import com.fulcrumgenomics.commons.collection.ParIterator
import com.fulcrumgenomics.commons.util.LazyLogging
import com.fulcrumgenomics.commons.util.Threads.IterableThreadLocal
import com.fulcrumgenomics.sopt.{arg, clp}
import com.fulcrumgenomics.util.{Io, Metric, ProgressLogger}
import enumeratum.EnumEntry

import scala.collection.immutable


@clp(group = ClpGroups.SamOrBam, description=
"""
|Consensus calls overlapping bases in read pairs.
|
|## Inputs and Outputs
|
|In order to correctly correct reads by template, the input BAM must be either `queryname` sorted or `query` grouped.
|The sort can be done in streaming fashion with:
|
|```
|samtools sort -n -u in.bam | fgbio CallOverlappingConsensusBases -i /dev/stdin ...
nh13 marked this conversation as resolved.
Show resolved Hide resolved
|```
|
|The output sort order may be specified with `--sort-order`. If not given, then the output will be in the same
|order as input.
|
|The reference FASTA must be given so that any existing `NM`, `UQ` and `MD` tags can be repaired.
|
|## Correction
|
|Only mapped read pairs with overlapping bases will be eligible for correction.
|
|Each read base from the read and its mate that map to same position in the reference will be used to create
|a consensus base as follows:
|
|1. If the base agree, then the chosen agreement strategy (`--agreement-strategy`) will be used.
|2. If the base disagree, then the chosen disagreement strategy (`--disagreement-strategy`) will be used.
|
|The agreement strategies are as follows:
|
|* Consensus: Call the consensus base and return a new base quality that is the sum of the two base qualities.
|* MaxQual: Call the consensus base and return a new base quality that is the maximum of the two base qualities.
|* PassThrough: Leave the bases and base qualities unchanged.
|
|In the context of disagreement strategies, masking a base will make the base an "N" with base quality phred-value "2".
|The disagreement strategies are as follows:
|
|* MaskBoth: Mask both bases.
|* MaskLowerQual: Mask the base with the lowest base quality, with the other base unchanged. If the base qualities
| are the same, mask both bases.
|* Consensus: Consensus call the base. If the base qualities are the same, mask both bases. Otherwise, call the
| base with the highest base quality and return a new base quality that is the difference between the
| highest and lowest base quality.
| """)
class CallOverlappingConsensusBases
(@arg(flag='i', doc="Input SAM or BAM file of aligned reads.") val input: PathToBam,
@arg(flag='o', doc="Output SAM or BAM file.") val output: PathToBam,
@arg(flag='m', doc="Output metrics file.") val metrics: FilePath,
@arg(flag='r', doc="Reference sequence fasta file.") val ref: PathToFasta,
@arg(doc="The number of threads to use while consensus calling.") val threads: Int = 1,
@arg(flag='S', doc="The sort order of the output. If not given, output will be in the same order as input if the input.")
val sortOrder: Option[SamOrder] = None,
@arg(doc="The strategy to consensus call when both bases agree. See the usage for more details")
val agreementStrategy: AgreementStrategy = AgreementStrategy.Consensus,
@arg(doc="The strategy to consensus call when both bases disagree. See the usage for more details")
val disagreementStrategy: DisagreementStrategy = DisagreementStrategy.Consensus
) extends FgBioTool with LazyLogging {
Io.assertReadable(input)
Io.assertReadable(ref)
Io.assertCanWriteFile(output)

private case class ThreadData
(caller: OverlappingBasesConsensusCaller = new OverlappingBasesConsensusCaller(agreementStrategy=agreementStrategy, disagreementStrategy=disagreementStrategy),
templateMetric: CallOverlappingConsensusBasesMetric = CallOverlappingConsensusBasesMetric(kind=CountKind.Templates),
basesMetric: CallOverlappingConsensusBasesMetric = CallOverlappingConsensusBasesMetric(kind=CountKind.Bases)
)

override def execute(): Unit = {
val source = SamSource(input)
val outSort = sortOrder.flatMap { order => if (SamOrder(source.header).contains(order)) None else Some(order) }
val writer = Bams.nmUqMdTagRegeneratingWriter(writer=SamWriter(output, source.header.clone(), sort=outSort), ref=ref)
val progress = new ProgressLogger(logger)
val templateIterator = Bams.templateIterator(source)
val threadData = new IterableThreadLocal(() => ThreadData())

// Require queryname sorted or query grouped
Bams.requireQueryGrouped(header=source.header, toolName="CallOverlappingConsensusBases")

ParIterator(templateIterator, threads=threads)
.map { template =>
val threadDatum = threadData.get()
threadDatum.synchronized {
nh13 marked this conversation as resolved.
Show resolved Hide resolved
// update metrics
threadDatum.templateMetric.total += 1
threadDatum.basesMetric.total += template.primaryReads.map(_.length).sum
nh13 marked this conversation as resolved.
Show resolved Hide resolved
// corrects
val stats = threadDatum.caller.call(template)
val correctedBases = stats.r1CorrectedBases + stats.r2CorrectedBases
if (stats.overlappingBases > 0) {
threadDatum.templateMetric.overlapping += 1
threadDatum.basesMetric.overlapping += stats.overlappingBases
if (correctedBases > 0) {
threadDatum.templateMetric.corrected += 1
threadDatum.basesMetric.corrected += correctedBases
}
}
}
template.allReads.foreach(progress.record)
template
}.toAsync(ParIterator.DefaultChunkSize * 8).foreach { template =>
writer ++= template.allReads
nh13 marked this conversation as resolved.
Show resolved Hide resolved
}
progress.logLast()
source.safelyClose()
writer.close()

val templatesMetric = CallOverlappingConsensusBasesMetric(kind=CountKind.Templates)
val basesMetric = CallOverlappingConsensusBasesMetric(kind=CountKind.Bases)
threadData.foreach { datum =>
templatesMetric += datum.templateMetric
basesMetric += datum.basesMetric
}

Metric.write(metrics, templatesMetric, basesMetric)
}
}

/** Collects the the number of reads or bases that were examined, had overlap, and were corrected as part of
* the [[CallOverlappingConsensusBases]] tool.
*
* @param kind template if the counts are per template, bases if counts are in units of bases.
* @param total the total number of templates (bases) examined
* @param overlapping the total number of templates (bases) that were overlapping
* @param corrected the total number of templates (bases) that were corrected.
*/
case class CallOverlappingConsensusBasesMetric
(
kind: CountKind,
var total: Long = 0,
var overlapping: Long = 0,
var corrected: Long = 0,
) extends Metric {
def +=(other: CallOverlappingConsensusBasesMetric): CallOverlappingConsensusBasesMetric = {
require(this.kind == other.kind)
this.total += other.total
this.overlapping += other.overlapping
this.corrected += other.corrected
this
}
}

sealed trait CountKind extends EnumEntry

/** Enumeration for the type of counts in [[CallOverlappingConsensusBasesMetric]]. */
object CountKind extends FgBioEnum[CountKind] {
case object Templates extends CountKind
case object Bases extends CountKind

override def values: immutable.IndexedSeq[CountKind] = findValues
}
Loading