Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added handling for indels within CollectHsMetrics and CollectTargetedPcrMetrics. #1273

Merged
merged 6 commits into from
Feb 1, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,6 @@ protected HsMetricCollector makeCollector(final Set<MetricAccumulationLevel> acc
final String probeSetName,
final int nearProbeDistance) {
return new HsMetricCollector(accumulationLevels, samRgRecords, refFile, perTargetCoverage, perBaseCoverage, targetIntervals, probeIntervals, probeSetName, nearProbeDistance,
MINIMUM_MAPPING_QUALITY, MINIMUM_BASE_QUALITY, CLIP_OVERLAPPING_READS, true, COVERAGE_CAP, SAMPLE_SIZE);
MINIMUM_MAPPING_QUALITY, MINIMUM_BASE_QUALITY, CLIP_OVERLAPPING_READS, true, INCLUDE_INDELS, COVERAGE_CAP, SAMPLE_SIZE);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,9 @@ protected abstract COLLECTOR makeCollector(final Set<MetricAccumulationLevel> ac
@Argument(doc = "True if we are to clip overlapping reads, false otherwise.", optional=true)
public boolean CLIP_OVERLAPPING_READS = false;

@Argument(doc= "If true count inserted bases as on target and deleted bases as covered by a read.")
public boolean INCLUDE_INDELS = false;

@Argument(shortName = "covMax", doc = "Parameter to set a max coverage limit for Theoretical Sensitivity calculations. Default is 200.", optional = true)
public int COVERAGE_CAP = 200;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,6 @@ protected TargetedPcrMetricsCollector makeCollector(final Set<MetricAccumulation
final String probeSetName,
final int nearProbeDistance) {
return new TargetedPcrMetricsCollector(accumulationLevels, samRgRecords, refFile, perTargetCoverage, perBaseCoverage, targetIntervals, probeIntervals, probeSetName, nearProbeDistance,
MINIMUM_MAPPING_QUALITY, MINIMUM_BASE_QUALITY, CLIP_OVERLAPPING_READS, true, COVERAGE_CAP, SAMPLE_SIZE);
MINIMUM_MAPPING_QUALITY, MINIMUM_BASE_QUALITY, CLIP_OVERLAPPING_READS, true, INCLUDE_INDELS, COVERAGE_CAP, SAMPLE_SIZE);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -73,9 +73,10 @@ public HsMetricCollector(final Set<MetricAccumulationLevel> accumulationLevels,
final int minimumBaseQuality,
final boolean clipOverlappingReads,
final boolean noSideEffects,
final boolean includeIndels,
final int coverageCap,
final int sampleSize) {
super(accumulationLevels, samRgRecords, refFile, perTargetCoverage, perBaseCoverage, targetIntervals, probeIntervals, probeSetName, nearProbeDistance, minimumMappingQuality, minimumBaseQuality, clipOverlappingReads, noSideEffects, coverageCap, sampleSize);
super(accumulationLevels, samRgRecords, refFile, perTargetCoverage, perBaseCoverage, targetIntervals, probeIntervals, probeSetName, nearProbeDistance, minimumMappingQuality, minimumBaseQuality, clipOverlappingReads, noSideEffects, includeIndels, coverageCap, sampleSize);
}

@Override
Expand Down
135 changes: 79 additions & 56 deletions src/main/java/picard/analysis/directed/TargetMetricsCollector.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,7 @@

package picard.analysis.directed;

import htsjdk.samtools.AlignmentBlock;
import htsjdk.samtools.SAMReadGroupRecord;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.samtools.SAMUtils;
import htsjdk.samtools.*;
import htsjdk.samtools.metrics.MetricBase;
import htsjdk.samtools.metrics.MetricsFile;
import htsjdk.samtools.reference.ReferenceSequence;
Expand Down Expand Up @@ -130,6 +126,7 @@ public abstract class TargetMetricsCollector<METRIC_TYPE extends MultilevelMetri
private final int minimumBaseQuality;
private final boolean clipOverlappingReads;
private boolean noSideEffects;
private final boolean includeIndels;

//A map of coverage by target in which coverage is reset every read, this is done
//so that we can calculate overlap for a read once and the resulting coverage is
Expand Down Expand Up @@ -254,7 +251,7 @@ public TargetMetricsCollector(final Set<MetricAccumulationLevel> accumulationLev
final boolean clipOverlappingReads,
final int coverageCap,
final int sampleSize) {
this(accumulationLevels, samRgRecords, refFile, perTargetCoverage, perBaseCoverage, targetIntervals, probeIntervals, probeSetName, nearProbeDistance, minimumMappingQuality, minimumBaseQuality, clipOverlappingReads, false, coverageCap, sampleSize);
this(accumulationLevels, samRgRecords, refFile, perTargetCoverage, perBaseCoverage, targetIntervals, probeIntervals, probeSetName, nearProbeDistance, minimumMappingQuality, minimumBaseQuality, clipOverlappingReads, false, false, coverageCap, sampleSize);
}

public TargetMetricsCollector(final Set<MetricAccumulationLevel> accumulationLevels,
Expand All @@ -270,6 +267,7 @@ public TargetMetricsCollector(final Set<MetricAccumulationLevel> accumulationLev
final int minimumBaseQuality,
final boolean clipOverlappingReads,
final boolean noSideEffects,
final boolean includeIndels,
final int coverageCap,
final int sampleSize) {
this.perTargetCoverage = perTargetCoverage;
Expand Down Expand Up @@ -321,6 +319,7 @@ public TargetMetricsCollector(final Set<MetricAccumulationLevel> accumulationLev
this.minimumBaseQuality = minimumBaseQuality;
this.clipOverlappingReads = clipOverlappingReads;
this.noSideEffects = noSideEffects;
this.includeIndels = includeIndels;

setup(accumulationLevels, samRgRecords);
}
Expand All @@ -329,7 +328,7 @@ public TargetMetricsCollector(final Set<MetricAccumulationLevel> accumulationLev
protected PerUnitMetricCollector<METRIC_TYPE, Integer, SAMRecord> makeChildCollector(final String sample, final String library, final String readGroup) {
final PerUnitTargetMetricCollector collector = new PerUnitTargetMetricCollector(probeSetName, coverageByTargetForRead.keySet(),
sample, library, readGroup, probeTerritory, targetTerritory, genomeSize,
intervalToGc, minimumMappingQuality, minimumBaseQuality, clipOverlappingReads);
intervalToGc, minimumMappingQuality, minimumBaseQuality, clipOverlappingReads, includeIndels);
if (this.probeSetName != null) {
collector.setBaitSetName(probeSetName);
}
Expand Down Expand Up @@ -366,6 +365,7 @@ public class PerUnitTargetMetricCollector implements PerUnitMetricCollector<METR
private final int minimumBaseQuality;
private final CountingMapQFilter mapQFilter;
private final boolean clipOverlappingReads;
private final boolean includeIndels;

/**
* Constructor that parses the squashed reference to genome reference file and stores the
Expand All @@ -377,7 +377,8 @@ public PerUnitTargetMetricCollector(final String probeSetName, final Set<Interva
final Map<Interval, Double> intervalToGc,
final int minimumMappingQuality,
final int minimumBaseQuality,
final boolean clipOverlappingReads) {
final boolean clipOverlappingReads,
final boolean includeIndels) {
this.metrics.SAMPLE = sample;
this.metrics.LIBRARY = library;
this.metrics.READ_GROUP = readGroup;
Expand All @@ -399,6 +400,7 @@ public PerUnitTargetMetricCollector(final String probeSetName, final Set<Interva
this.minimumBaseQuality = minimumBaseQuality;
this.intervalToGc = intervalToGc;
this.clipOverlappingReads = clipOverlappingReads;
this.includeIndels = includeIndels;
}

/** Sets the (optional) File to write per-target coverage information to. If null (the default), no file is produced. */
Expand Down Expand Up @@ -428,7 +430,7 @@ public Map<Interval, Coverage> getCoverageByTarget() {
/** Adds information about an individual SAMRecord to the statistics. */
public void acceptRecord(final SAMRecord record) {
// Just ignore secondary alignments altogether
if (record.getNotPrimaryAlignmentFlag()) return;
if (record.isSecondaryAlignment()) return;

// Cache some things, and compute the total number of bases aligned in the record.
final boolean mappedInPair = record.getReadPairedFlag() && !record.getReadUnmappedFlag() && !record.getMateUnmappedFlag() && !record.getSupplementaryAlignmentFlag();
Expand Down Expand Up @@ -548,63 +550,84 @@ public void acceptRecord(final SAMRecord record) {
rec = record;
}

// Find the target overlaps
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since you are here, in line 331 above, can you replace the use of getNotPrimaryAlignmentFlag() with the non-deprecated isSecondaryAlignment ?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, do you think that the clipping for overlapping bases should be modified in any way if we take indels into account?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the clipping is probably fine. The clipping is based on reference positions IIRC, which should be ok.

final Set<Interval> coveredTargets = new HashSet<>();
for (final AlignmentBlock block : rec.getAlignmentBlocks()) {
final int length = block.getLength();
final int refStart = block.getReferenceStart();
final int readStart = block.getReadStart();

for (int offset = 0; offset < length; ++offset) {
final int refPos = refStart + offset;
final int readPos = readStart + offset;
final int qual = baseQualities[readPos - 1];

if (qual <= 2) {
metrics.PCT_EXC_BASEQ++;
continue;
}
// Calculate all the things that require examining individual bases in the read. This includes:
// 1. Per-base coverage
// 2. The number of reads contributing to per-base coverage per target
// 3. Unfiltered coverage information for het sensitivity
// 4. The count of bases rejected for being low baseq or off-target
// 5. The count of overall on-target bases, and on-target bases from paired reads
final Set<Interval> coveredTargets = new HashSet<>(); // Each target is added to this the first time the read covers it
int readOffset = 0;
int refOffset = rec.getAlignmentStart() - 1;

for (final CigarElement cig : rec.getCigar()) {
final CigarOperator op = cig.getOperator();
final int len = cig.getLength();

for (int i=0; i<len; ++i) {
if (op.isAlignment() || (this.includeIndels && op.isIndel())) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can this line be swapped with the previous line as it seems to encompass all of the for-loop and doesn't depend on i.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The whole if is encompassed, but not the statements on lines 612 and 613 that update readOffset and refOffset. I did at one point try inverting and adding an else that updated the offsets using the length of the element and saw no performance increase, so went back to how it is now.

final int refPos = refOffset + 1;
final int qual = baseQualities[readOffset];
final boolean highQual = qual >= this.minimumBaseQuality;
final boolean onTarget = overlapsAny(refPos, targets);
final boolean incrementPerTargetCoverage = op != CigarOperator.INSERTION; // Inserted bases don't have a target position

// Firstly handle all the summary metrics
if (!highQual) {
metrics.PCT_EXC_BASEQ++;
} else if (!onTarget) {
metrics.PCT_EXC_OFF_TARGET++;
} else {
metrics.ON_TARGET_BASES++;
if (mappedInPair) metrics.ON_TARGET_FROM_PAIR_BASES++;
}

boolean isOnTarget = false;
for (final Interval target : targets) {
if (refPos >= target.getStart() && refPos <= target.getEnd()) {
final int targetOffset = refPos - target.getStart();

// if the base quality exceeds the minimum threshold, then we update various metrics
if (qual >= minimumBaseQuality) {
++metrics.ON_TARGET_BASES;
if (mappedInPair) ++metrics.ON_TARGET_FROM_PAIR_BASES;
final Coverage highQualityCoverage = highQualityCoverageByTarget.get(target);
highQualityCoverage.addBase(targetOffset);
if (!coveredTargets.contains(target)) {
highQualityCoverage.incrementReadCount();
coveredTargets.add(target);
isOnTarget = true;
// Then go through the per-target/per-base hq and unfiltered coverage
// The cutoff of > 2 is because even the unfilteredCoverage doesn't want those bases
if (qual > 2 && incrementPerTargetCoverage && onTarget) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
if (qual > 2 && incrementPerTargetCoverage && onTarget) {
if (qual >= 2 && incrementPerTargetCoverage && onTarget) {

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll fix the comment, the code is actually correct here - the het sensitivity code only wants bases > q2, not >= q2.

for (final Interval target : targets) {
if (overlapsInterval(refPos, target)) {
final int targetOffset = refPos - target.getStart();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

refactor into overlapsTarget and use below in 619 as well?

 if (refPos >= target.getStart() && refPos <= target.getEnd()) {


// Unfiltered first (for theoretical het sensitivity)
final Coverage ufCoverage = unfilteredCoverageByTarget.get(target);
ufCoverage.addBase(targetOffset);
if (ufCoverage.getDepths()[targetOffset] <= coverageCap) baseQHistogramArray[qual]++;

// Then filtered
if (highQual) {
final Coverage hqCoverage = highQualityCoverageByTarget.get(target);
hqCoverage.addBase(targetOffset);

if (coveredTargets.add(target)) {
hqCoverage.incrementReadCount();
}
}
}

} else {
// the base quality is in the range (2, minimumBaseQuality). we exclude them from the high-quality coverage histogram
this.metrics.PCT_EXC_BASEQ++;
}

// even when the base quality is below minimumBaseQuality (but higher than 2), update the base quality and unfiltered coverage histogram for theoretical het sensitivity
// we don't bother with the read count for unfiltered coverage histogram because we don't use it
unfilteredCoverageByTarget.get(target).addBase(targetOffset);

// we do not want to increment the base quality histogram for bases that will eventually get thrown out by the coverage cap
if (unfilteredCoverageByTarget.get(target).getDepths()[targetOffset] <= coverageCap){
baseQHistogramArray[qual]++;
}
}
}

// a base must not be on target and its base quality must exceed minimumBaseQuality for us to increment PCT_EXC_OFF_TARGET
if (!isOnTarget) this.metrics.PCT_EXC_OFF_TARGET++;

// Finally update the offsets!
if (op.consumesReadBases()) readOffset += 1;
if (op.consumesReferenceBases()) refOffset += 1;
}
}
}

/* Returns true if the `pos` is between the `start` and `end` of at least one interval. */
private boolean overlapsAny(final int pos, final Collection<Interval> intervals) {
for (final Interval interval : intervals) {
if (overlapsInterval(pos, interval)) return true;
}
return false;
}

/** Returns true if the position is within the start-end range inclusive of the given interval. */
private boolean overlapsInterval(final int pos, final Interval interval) {
return pos >= interval.getStart() && pos <= interval.getEnd();
}

@Override
public void finish() {
metrics.PCT_PF_READS = metrics.PF_READS / (double) metrics.TOTAL_READS;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -73,9 +73,10 @@ public TargetedPcrMetricsCollector(final Set<MetricAccumulationLevel> accumulati
final int minimumBaseQuality,
final boolean clipOverlappingReads,
final boolean noSideEffects,
final boolean includeIndels,
final int coverageCap,
final int sampleSize) {
super(accumulationLevels, samRgRecords, refFile, perTargetCoverage, perBaseCoverage, targetIntervals, probeIntervals, probeSetName, nearProbeDistance, minimumMappingQuality, minimumBaseQuality, clipOverlappingReads, noSideEffects, coverageCap, sampleSize);
super(accumulationLevels, samRgRecords, refFile, perTargetCoverage, perBaseCoverage, targetIntervals, probeIntervals, probeSetName, nearProbeDistance, minimumMappingQuality, minimumBaseQuality, clipOverlappingReads, noSideEffects, includeIndels, coverageCap, sampleSize);
}
@Override
public TargetedPcrMetrics convertMetric(final TargetMetrics targetMetrics) {
Expand Down
Loading