Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Yf stratify chimeric #1269

Merged
merged 7 commits into from
Mar 8, 2019
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/main/java/picard/sam/AbstractAlignmentMerger.java
Original file line number Diff line number Diff line change
Expand Up @@ -699,7 +699,7 @@ private void transferAlignmentInfoToFragment(final SAMRecord unaligned, final SA
* @param rec SAMRecord whose alignment information will be encoded
* @return String encoding rec's alignment information according to SA tag in the SAM spec
*/
static private String encodeMappingInformation(SAMRecord rec) {
static public String encodeMappingInformation(SAMRecord rec) {
yfarjoun marked this conversation as resolved.
Show resolved Hide resolved
return String.join(",",
rec.getContig(),
((Integer) rec.getAlignmentStart()).toString(),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@
"<p>" +
"The resulting metric file will be named according to a provided prefix and a suffix which is generated " +
" automatically according to the error metric. " +
"The tool cal collect multiple metrics in a single pass and there should be hardly any " +
"The tool can collect multiple metrics in a single pass and there should be hardly any " +
"performance loss when specifying multiple metrics at the same time; the default includes a " +
"large collection of metrics. \n" +
"<p>" +
Expand Down Expand Up @@ -230,7 +230,7 @@ protected int doWork() {
final Collection<BaseErrorAggregation> aggregatorList = getAggregatorList();
// Open up the input resources:
try (
final SamReader sam = SamReaderFactory.makeDefault().open(INPUT);
final SamReader sam = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(INPUT);
final ReferenceSequenceFileWalker referenceSequenceFileWalker = new ReferenceSequenceFileWalker(REFERENCE_SEQUENCE);
final PeekableIterator<VariantContext> vcfIterator = new PeekableIterator<>(
VCF == null ? Collections.emptyIterator() : new VCFFileReader(VCF, true).iterator())
Expand Down
144 changes: 104 additions & 40 deletions src/main/java/picard/sam/SamErrorMetric/ReadBaseStratification.java
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,10 @@

import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMTag;
import htsjdk.samtools.*;
import htsjdk.samtools.reference.SamLocusAndReferenceIterator.SAMLocusAndReference;
import htsjdk.samtools.util.Lazy;
import htsjdk.samtools.util.SamLocusIterator.*;
import htsjdk.samtools.util.SamLocusIterator.RecordAndOffset;
import htsjdk.samtools.util.SequenceUtil;
import org.broadinstitute.barclay.argparser.CommandLineParser;
import picard.sam.markduplicates.util.OpticalDuplicateFinder;
Expand All @@ -47,7 +46,6 @@

/**
* Classes, methods, and enums that deal with the stratification of read bases and reference information.
*
*/
public class ReadBaseStratification {
// This variable has to be set using setLongHomopolymer _before_ the first call to binnedHomopolymerStratifier.get()
Expand Down Expand Up @@ -76,8 +74,6 @@ public static void setLongHomopolymer(int longHomopolymer) {
* The main interface for a stratifier.
* The Stratifier needs to be able to ake a {@link htsjdk.samtools.util.SamLocusIterator.RecordAndOffset RecordAndOffset}
* and return a value of type T. It also needs to have a suffix for automatically generating metrics file suffixes.
*
* @param <T>
*/
public interface RecordAndOffsetStratifier<T extends Comparable<T>> {
// The method that stratifies a base in a read (provided by RecordAndOffset) into a type T.
Expand Down Expand Up @@ -195,7 +191,7 @@ public String getSuffix() {
public static class CollectionStratifier implements RecordAndOffsetStratifier {
public CollectionStratifier(final Collection<RecordAndOffsetStratifier<?>> stratifiers) {

if (stratifiers.isEmpty()){
if (stratifiers.isEmpty()) {
throw new IllegalArgumentException("Must construct with a non-empty collection of stratifiers.");
}

Expand All @@ -210,12 +206,13 @@ public CollectionStratifier(final Collection<RecordAndOffsetStratifier<?>> strat

stratifier = linkedListStratifiers.remove();
}

final RecordAndOffsetStratifier stratifier;

@Override
public Comparable stratify(final RecordAndOffset recordAndOffset, final SAMLocusAndReference locusInfo) {

return stratifier.stratify(recordAndOffset,locusInfo);
return stratifier.stratify(recordAndOffset, locusInfo);
}

@Override
Expand All @@ -224,16 +221,14 @@ public String getSuffix() {
}
}


/**
* A factory for generating "pair" stratifier instances from two stratifiers and a string
*
* @param <T> the type of the left Stratifier
* @param <S> the type of the right Stratifier
* @param leftStratifier a {@link RecordAndOffsetStratifier<T>} to use
* @param <T> the type of the left Stratifier
* @param <S> the type of the right Stratifier
* @param leftStratifier a {@link RecordAndOffsetStratifier<T>} to use
* @param rightStratifier a {@link RecordAndOffsetStratifier<S>} to use
* @param suffix the suffix to use for the new stratifier
*
* @param suffix the suffix to use for the new stratifier
* @return an instance of {@link PairStratifier>} that will stratify according to both <code>leftStratifier</code>
* and <code>rightStratifier</code>
*/
Expand Down Expand Up @@ -293,7 +288,9 @@ public LongShortHomopolymer stratify(final RecordAndOffset recordAndOffset,
final SAMLocusAndReference locusInfo) {

final Integer hpLength = homoPolymerLengthStratifier.stratify(recordAndOffset, locusInfo);
if (hpLength == null) return null;
if (hpLength == null) {
return null;
}

return hpLength < longHomopolymer ? LongShortHomopolymer.SHORT_HOMOPOLYMER : LongShortHomopolymer.LONG_HOMOPOLYMER;

Expand Down Expand Up @@ -537,7 +534,7 @@ public String getSuffix() {
* <p>
* This is done with a Lazy wrapper since it requires access to {@link #LONG_HOMOPOLYMER} which can be set with {@link #setLongHomopolymer(int)}.
*/
public static final Lazy<PairStratifier<LongShortHomopolymer, Pair<Character, Character>>> binnedHomopolymerStratifier = new Lazy<>(() -> PairStratifierFactory( new LongShortHomopolymerStratifier(LONG_HOMOPOLYMER), preDiNucleotideStratifier,
public static final Lazy<PairStratifier<LongShortHomopolymer, Pair<Character, Character>>> binnedHomopolymerStratifier = new Lazy<>(() -> PairStratifierFactory(new LongShortHomopolymerStratifier(LONG_HOMOPOLYMER), preDiNucleotideStratifier,
"binned_length_homopolymer_and_following_ref_base"));

/**
Expand Down Expand Up @@ -581,6 +578,11 @@ public String getSuffix() {
*/
public static final RecordAndOffsetStratifier<ReadOrdinality> readOrdinalityStratifier = wrapStaticReadFunction(ReadOrdinality::of, "read_ordinality");

/**
* Stratifies bases into their read's Proper-pairedness
*/
public static final RecordAndOffsetStratifier<ProperPaired> readPairednessStratifier = wrapStaticReadFunction(ProperPaired::of, "pair_proper");

/**
* Stratifies bases into their read's Direction (i.e. forward or reverse)
*/
Expand All @@ -606,6 +608,11 @@ public String getSuffix() {
*/
public static final RecordAndOffsetStratifier<Integer> insertLengthStratifier = wrapStaticReadFunction(ReadBaseStratification::stratifyInsertLength, "insert_length");

/**
* Stratifies into the number of soft-clipped bases that the read has in its alignment, or {@value NOT_ALIGNED_ERROR} if not aligned.
*/
public static final RecordAndOffsetStratifier<Integer> softClipsLengthStratifier = wrapStaticReadFunction(ReadBaseStratification::stratifySoftClippedBases, "softclipped_bases");

/**
* Stratifies into the base-quality of the base under consideration
*/
Expand Down Expand Up @@ -646,30 +653,32 @@ public String getSuffix() {
*/
enum Stratifier implements CommandLineParser.ClpEnum {
ALL(() -> nonStratifier, "Puts all bases in the same stratum."),
GC_CONTENT(() -> gcContentStratifier, "Stratifies bases according to the gc content of their read."),
READ_ORDINALITY(() -> readOrdinalityStratifier, "Stratifies bases according to their read ordinality (i.e. first or second)."),
READ_BASE(() -> currentReadBaseStratifier, "Stratifies bases by the read in the original reading direction."),
READ_DIRECTION(() -> readDirectionStratifier, "Stratifies bases to +/- based on the alignment direction of the read."),
PAIR_ORIENTATION(() -> readOrientationStratifier, "Stratifies bases into F1R2/F2R1 according to their reads orientation and ordinality. Assumes reads are \"innies\"."),
REFERENCE_BASE(() -> referenceBaseStratifier, "Stratifies bases by the read-directed reference base."),
PRE_DINUC(() -> preDiNucleotideStratifier, "Stratifies bases by the read base at the previous cycle, and the current reference base."),
POST_DINUC(() -> postDiNucleotideStratifier, "Stratifies bases by the read base at the previous cycle, and the current reference base."),
HOMOPOLYMER_LENGTH(() -> homoPolymerLengthStratifier, "Stratifies bases based on the length of homopolymer they are part of (only accounts for bases that were read prior to the current base)."),
HOMOPOLYMER(() -> homopolymerStratifier, "Stratifies bases based on the length of homopolymer, the base that the homopolymer is comprised of, and the reference base."),
GC_CONTENT(() -> gcContentStratifier, "The GC-content of the read."),
READ_ORDINALITY(() -> readOrdinalityStratifier, "The read ordinality (i.e. first or second)."),
READ_BASE(() -> currentReadBaseStratifier, "the base in the original reading direction."),
READ_DIRECTION(() -> readDirectionStratifier, "The alignment direction of the read (encoded as + or -)."),
PAIR_ORIENTATION(() -> readOrientationStratifier, "The read-pair's orientation and ordinality (encoded as F1R2 or F2R1). Assumes reads are \"innies\"."),
yfarjoun marked this conversation as resolved.
Show resolved Hide resolved
PAIR_PROPERNESS(() -> readPairednessStratifier, "The properness of the reads alignment. Looks for indications of chimerism."),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"reads" should be possessive or singular. I vote for "read".

REFERENCE_BASE(() -> referenceBaseStratifier, "The reference base in the read's direction."),
PRE_DINUC(() -> preDiNucleotideStratifier, "The read base at the previous cycle, and the current reference base."),
POST_DINUC(() -> postDiNucleotideStratifier, "The read base at the subsequent cycle, and the current reference base."),
HOMOPOLYMER_LENGTH(() -> homoPolymerLengthStratifier, "The length of homopolymer the base is part of (only accounts for bases that were read prior to the current base)."),
HOMOPOLYMER(() -> homopolymerStratifier, "The length of homopolymer, the base that the homopolymer is comprised of, and the reference base."),
//using a lazy initializer to enable the value of LONG_HOMOPOLYMER to be used;
BINNED_HOMOPOLYMER(binnedHomopolymerStratifier::get, "Stratifies bases based on the scale of homopolymer (long or short), the base that the homopolymer is comprised of, and the reference base."),
FLOWCELL_TILE(() -> flowCellTileStratifier, "Stratifies according to the flowcell-tile where the base was read (taken from the read name)."),
READ_GROUP(() -> readgroupStratifier, "Stratifies bases according to their read-group id."),
CYCLE(() -> baseCycleStratifier, "Stratifies bases to the machine cycle during which they were read."),
BINNED_CYCLE(() -> binnedReadCycleStratifier, "Stratifies bases according to the binned machine cycle in the read similar to CYCLE, but binned into 5 evenly spaced ranges across the size of the read. This stratifier may produce confusing results when used on datasets with variable sized reads."),
INSERT_LENGTH(() -> insertLengthStratifier, "Stratifies bases according to the insert-size they came from (taken from the TLEN field.)"),
BASE_QUALITY(() -> baseQualityStratifier, "Stratifies bases according to their base quality."),
MAPPING_QUALITY(() -> mappingQualityStratifier, "Stratifies bases according to their read's mapping quality."),
MISMATCHES_IN_READ(() -> mismatchesInReadStratifier, "Stratifies bases according to the number of bases in the read that mismatch the reference excluding the current base. This stratifier requires the NM tag."),
ONE_BASE_PADDED_CONTEXT(() -> oneBasePaddedContextStratifier, "Stratifies bases according the current reference base and a one base padded region from the read resulting in a 3-base context."),
TWO_BASE_PADDED_CONTEXT(() -> twoBasePaddedContextStratifier, "Stratifies bases according the current reference base and a two base padded region from the read resulting in a 5-base context."),
CONSENSUS(() -> consensusStratifier, "Stratifies bases according to whether or not duplicate reads were used to form a consensus read. This stratifier makes use of the aD, bD, and cD tags for duplex consensus reads. If the reads are single index consensus, only the cD tags are used."),
NS_IN_READ(() -> nsInReadStratifier, "Stratifies bases according to the number of Ns in the read.");
BINNED_HOMOPOLYMER(binnedHomopolymerStratifier::get, "The scale of homopolymer (long or short), the base that the homopolymer is comprised of, and the reference base."),
FLOWCELL_TILE(() -> flowCellTileStratifier, "The flowcell and tile where the base was read (taken from the read name)."),
READ_GROUP(() -> readgroupStratifier, "The read-group id of the read."),
CYCLE(() -> baseCycleStratifier, "The machine cycle during which the base was read."),
BINNED_CYCLE(() -> binnedReadCycleStratifier, "The binned machine cycle. Similar to CYCLE, but binned into 5 evenly spaced ranges across the size of the read. This stratifier may produce confusing results when used on datasets with variable sized reads."),
SOFT_CLIPS(() -> softClipsLengthStratifier, "The number of softclipped bases the read has."),
INSERT_LENGTH(() -> insertLengthStratifier, "The insert-size they came from (taken from the TLEN field.)"),
BASE_QUALITY(() -> baseQualityStratifier, "The base quality."),
MAPPING_QUALITY(() -> mappingQualityStratifier, "The read's mapping quality."),
MISMATCHES_IN_READ(() -> mismatchesInReadStratifier, "The number of bases in the read that mismatch the reference, excluding the current base. This stratifier requires the NM tag."),
ONE_BASE_PADDED_CONTEXT(() -> oneBasePaddedContextStratifier, "The current reference base and a one base padded region from the read resulting in a 3-base context."),
TWO_BASE_PADDED_CONTEXT(() -> twoBasePaddedContextStratifier, "The current reference base and a two base padded region from the read resulting in a 5-base context."),
CONSENSUS(() -> consensusStratifier, "Whether or not duplicate reads were used to form a consensus read. This stratifier makes use of the aD, bD, and cD tags for duplex consensus reads. If the reads are single index consensus, only the cD tags are used."),
NS_IN_READ(() -> nsInReadStratifier, "The number of Ns in the read.");

private final String docString;

Expand Down Expand Up @@ -705,6 +714,48 @@ public static ReadOrdinality of(final SAMRecord sam) {
}
}

/**
* An enum to hold information about the "properness" of a read pair
*/
public enum ProperPaired {
yfarjoun marked this conversation as resolved.
Show resolved Hide resolved
// Read has no supplementary alignments, is aligned to the same reference as its mate,
// and has the "properly aligned" flag 0x2 set.
PROPER,
// Read has no supplementary alignments, is aligned to the same reference as its mate,
// and has the "properly aligned" flag 0x2 UN-set.
IMPROPER,
// Read is softclipped, is aligned to the same reference as its mate, and
// has at least one supplementary alignment
CHIMERIC,
// Read is aligned to a different reference than its mate
DISCORDANT,
// Read or Mate are not aligned, or read has no mate and cannot be declared to be CHIMERIC.
UNKNOWN;

public static ProperPaired of(final SAMRecord sam) {

if (sam.getReadPairedFlag() &&
!sam.getMateUnmappedFlag() && !sam.getReadUnmappedFlag() &&
!sam.getMateReferenceIndex().equals(sam.getReferenceIndex())) {
return DISCORDANT;
}

if (!sam.getReadUnmappedFlag() && sam.getCigar().isClipped() && sam.hasAttribute(SAMTag.SA.toString())) {
return CHIMERIC;
}

if (sam.getReadUnmappedFlag() || sam.getReadPairedFlag() && sam.getMateUnmappedFlag()) {
return UNKNOWN;
}

if (!sam.getProperPairFlag()) {
return IMPROPER;
}

return PROPER;
}
}

/**
* An enum designed to hold a binned version of any probability-like number (between 0 and 1)
* in quintiles
Expand Down Expand Up @@ -785,7 +836,9 @@ public static PairOrientation of(final SAMRecord sam) {
// if read is unmapped, read isn't mapped, or mate is unmapped, return null
if (direction == null ||
sam.getReadUnmappedFlag() ||
sam.getMateUnmappedFlag()) return null;
sam.getMateUnmappedFlag()) {
return null;
}

final boolean matePositiveStrand = !sam.getMateNegativeStrandFlag();

Expand Down Expand Up @@ -939,6 +992,17 @@ private static Integer stratifyInsertLength(final SAMRecord sam) {
Math.abs(sam.getInferredInsertSize()));
}

public static final int NOT_ALIGNED_ERROR = -1;
private static Integer stratifySoftClippedBases(final SAMRecord sam) {
final Cigar cigar = sam.getCigar();
if (cigar == null) {
return NOT_ALIGNED_ERROR;
}
return cigar.getCigarElements().stream()
.filter(e -> e.getOperator() == CigarOperator.S)
.mapToInt(CigarElement::getLength).sum();
}

private static Byte stratifyBaseQuality(final RecordAndOffset recordAndOffset) {
return recordAndOffset.getBaseQuality();
}
Expand Down
Loading