diff --git a/src/main/java/picard/fingerprint/CrosscheckFingerprints.java b/src/main/java/picard/fingerprint/CrosscheckFingerprints.java
index 915418a6bc..141a2a2f23 100644
--- a/src/main/java/picard/fingerprint/CrosscheckFingerprints.java
+++ b/src/main/java/picard/fingerprint/CrosscheckFingerprints.java
@@ -67,45 +67,48 @@
import static picard.fingerprint.Fingerprint.CrosscheckMode.CHECK_SAME_SAMPLE;
/**
- * Checks that all data in the set of input files appear to come from the same
- * individual. Can be used to compare according to readgroups, libraries, samples, or files.
- * Operates on bams/sams and vcfs (including gvcfs).
+ * Checks that all data in the set of input files appear to come from the same
+ * individual. Can be used to compare according to readgroups, libraries, samples, or files.
+ * Operates on bams/sams and vcfs (including gvcfs).
*
*
Summary
- * Checks if all the genetic data within a set of files appear to come from the same individual.
- * It quickly determines whether a "group's" genotype matches that of an input SAM/BAM/VCF by selective sampling,
- * and has been designed to work well even for low-depth SAM/BAMs.
+ * Checks if all the genetic data within a set of files appear to come from the same individual.
+ * It quickly determines whether a "group's" genotype matches that of an input SAM/BAM/VCF by selective sampling,
+ * and has been designed to work well even for low-depth SAM/BAMs.
*
- * The tool collects "fingerprints" (essentially genotype information from different parts of the genome)
- * at the finest level available in the data (readgroup for SAM files
- * and sample for VCF files) and then optionally aggregates it by library, sample or file, to increase power and provide
- * results at the desired resolution. Output is in a "Moltenized" format, one row per comparison. The results will
- * be emitted into a metric file for the class {@link CrosscheckMetric}.
- * In this format the output will include the LOD score and also tumor-aware LOD score which can
- * help assess identity even in the presence of a severe loss of heterozygosity with high purity (which could
- * otherwise fail to notice that samples are from the same individual.)
- * A matrix output is also available to facilitate visual inspection of crosscheck results.
+ * The tool collects "fingerprints" (essentially genotype information from different parts of the genome)
+ * at the finest level available in the data (readgroup for SAM files
+ * and sample for VCF files) and then optionally aggregates it by library, sample or file, to increase power and provide
+ * results at the desired resolution. Output is in a "Moltenized" format, one row per comparison. The results will
+ * be emitted into a metric file for the class {@link CrosscheckMetric}.
+ * In this format, the output will include both the LOD score and the tumor-aware LOD score; the latter can
+ * help assess identity even in the presence of a severe loss of heterozygosity with high purity (the tool could
+ * otherwise fail to notice that samples are from the same individual.)
+ * A matrix output is also available to facilitate visual inspection of crosscheck results.
*
- * Since there can be many rows of output in the metric file, we recommend the use of {@link ClusterCrosscheckMetrics}
- * as a follow-up step to running CrosscheckFingerprints.
+ * Since there can be many rows of output in the metric file, we recommend the use of {@link ClusterCrosscheckMetrics}
+ * as a follow-up step to running CrosscheckFingerprints.
*
- * There are cases where one would like to identify a few groups out of a collection of many possible groups (say
- * to link a bam to it's correct sample in a multi-sample vcf. In this case one would not case for the cross-checking
- * of the various samples in the VCF against each other, but only in checking the identity of the bam against the various
- * samples in the vcf. The {@link #SECOND_INPUT} is provided for this use-case. With {@link #SECOND_INPUT} provided, CrosscheckFingerprints
- * does the following:
+ * There are cases where one would like to identify a few groups out of a collection of many possible groups (say
+ * to link a bam to its correct sample in a multi-sample vcf). In this case, one would not care about cross-checking
+ * the various samples in the VCF against each other, but only about checking the identity of the bam against the various
+ * samples in the vcf. The {@link #SECOND_INPUT} is provided for this use-case. With {@link #SECOND_INPUT} provided,
+ * CrosscheckFingerprints does the following:
*
* aggregation of data happens independently for the input files in {@link #INPUT} and {@link #SECOND_INPUT}.
* aggregation of data happens at the SAMPLE level.
- * each samples from {@link #INPUT} will only be compared to that same sample in {@link #INPUT}.
+ * each sample from {@link #INPUT} will only be compared to that same sample in {@link #SECOND_INPUT}.
* {@link #MATRIX_OUTPUT} is disabled.
*
*
- * In some cases, the groups collected may not have any observations (calls for a vcf, reads for a bam) at fingerprinting sites, or a sample in INPUT may be missing from the SECOND_INPUT.
- * These cases are handled as follows: If running in CHECK_SAME_SAMPLES mode with INPUT and SECOND_INPUT, and either INPUT or SECOND_INPUT includes a sample
- * not found in the other, or contains a sample with no observations at any fingerprinting sites, an error will be logged and the tool will return EXIT_CODE_WHEN_MISMATCH.
- * In all other running modes, when any group which is being crosschecked does not have any observations at fingerprinting sites, a warning is logged. As long as there is at least
- * one comparison where both sides have observations at fingerprinting sites, the tool will return zero. However, if all comparisons have at least one side with no observations
+ * In some cases, the groups collected may not have any observations (calls for a vcf, reads for a bam) at fingerprinting
+ * sites, or a sample in INPUT may be missing from the SECOND_INPUT. These cases are handled as follows: If running in
+ * CHECK_SAME_SAMPLE mode with INPUT and SECOND_INPUT, and either INPUT or SECOND_INPUT includes a sample not found in the
+ * other, or contains a sample with no observations at any fingerprinting sites, an error will be logged and the tool will
+ * return EXIT_CODE_WHEN_MISMATCH.
+ * In all other running modes, when any group which is being crosschecked does not have any observations at fingerprinting
+ * sites, a warning is logged. As long as there is at least one comparison where both sides have observations at
+ * fingerprinting sites, the tool will return zero. However, if all comparisons have at least one side with no observations
* at fingerprinting sites, an error will be logged and the tool will return EXIT_CODE_WHEN_NO_VALID_CHECKS.
* Examples
* Check that all the readgroups from a sample match each other:
@@ -134,13 +137,13 @@
*
* This tool calculates the LOD score for identity check between "groups" of data in the INPUT files as defined by
* the CROSSCHECK_BY argument. A positive value indicates that the data seems to have come from the same individual
- * or, in other words the identity checks out. The scale is logarithmic (base 10), so a LOD of 6 indicates
- * that it is 1,000,000 more likely that the data matches the genotypes than not. A negative value indicates
+ * or, in other words, the identity checks out. The scale is logarithmic (base 10), so a LOD of 6 indicates
+ * that it is 1,000,000 times more likely that the data matches the genotypes than not. A negative value indicates
* that the data do not match. A score that is near zero is inconclusive and can result from low coverage
* or non-informative genotypes. Each group is assigned a sample identifier (for SAM this is taken from the SM tag in
* the appropriate readgroup header line, for VCF this is taken from the column label in the file-header.
* After combining all the data from the same "group" together, an all-against-all comparison is performed. Results are
- * categorized a {@link FingerprintResult} enum: EXPECTED_MATCH, EXPECTED_MISMATCH, UNEXPECTED_MATCH, UNEXPECTED_MISMATCH,
+ * categorized as a {@link FingerprintResult} enum: EXPECTED_MATCH, EXPECTED_MISMATCH, UNEXPECTED_MATCH, UNEXPECTED_MISMATCH,
* or AMBIGUOUS depending on the LOD score and on whether the sample identifiers of the groups agree: LOD scores that are
* less than LOD_THRESHOLD are considered mismatches, and those greater than -LOD_THRESHOLD are matches (between is ambiguous).
* If the sample identifiers are equal, the groups are expected to match. They are expected to mismatch otherwise.
@@ -159,55 +162,64 @@
@CommandLineProgramProperties(
summary =
- "Checks that all data in the set of input files appear to come from the same " +
+ "Checks the odds that all data in the set of input files come from the same " +
"individual. Can be used to cross-check readgroups, libraries, samples, or files. " +
- "Operates on SAM/BAM/CRAM and VCF (including gVCF and gzipped-VCF). " +
+ "Acceptable inputs include BAM/SAM/CRAM and VCF/GVCF files. Output delivers LOD " +
+ "scores in the form of a CrosscheckMetric file. \n" +
"\n" +
"Summary
\n" +
- "Checks if all the genetic data within a set of files appear to come from the same individual. " +
- "It quickly determines whether a group's genotype matches that of an input file by selective sampling, " +
- "and has been designed to work well for low-depth SAM (as well as high depth ones and VCFs.) " +
- "The tool collects fingerprints (essentially, genotype information from different parts of the genome) " +
- "at the finest level available in the data (readgroup for read-data files " +
- "and sample for variant-data files) and then optionally aggregates it by library, sample or file, to increase power and provide " +
- "results at the desired resolution. Output is in a \"Moltenized\" format, one row per comparison. The results are " +
- "emitted into a CrosscheckMetric metric file. " +
- "In this format the output will include the LOD score and also tumor-aware LOD score which can " +
- "help assess identity even in the presence of a severe loss of heterozygosity with high purity (which could cause it to " +
- "otherwise fail to notice that samples are from the same individual.) " +
- "A matrix output is also available to facilitate visual inspection of crosscheck results." +
+
+ "CrosscheckFingerprints rapidly checks the odds that all of the genetic data within " +
+ "a set of files come from the same individual. This is accomplished by selectively " +
+ "sampling from the input files, and determining whether the genotypes of the " +
+ "specified Groups match to each other. (Groups are defined by the input and the argument " +
+ "CROSSCHECK_BY; they can be READ_GROUP, LIBRARY, SAMPLE, or FILE.)" +
"
" +
-
- "Since there can be many rows of output in the metric file, we recommend the use of ClusterCrosscheckMetrics " +
- "as a follow-up step to running CrosscheckFingerprints.\n " +
- "
" +
-
- "There are cases where one would like to identify a few groups out of a collection of many possible groups (say " +
- "to link a SAM to its correct sample in a multi-sample VCF. In this case one would not case for the cross-checking " +
- "of the various samples in the VCF against each other, but only in checking the identity of the SAM against the various " +
- "samples in the VCF. The SECOND_INPUT is provided for this use-case. With SECOND_INPUT provided, CrosscheckFingerprints " +
- "does the following:" +
- "
" +
-
+ "Output is generated in the form of a “molten” (one row per comparison) CrosscheckMetric " +
+ "file that includes the Logarithm of the Odds (LOD) score, as well as the tumor-aware LOD " +
+ "score. Tumor-aware LOD scores can be used to assess genotypic identity in the presence of " +
+ "a severe Loss of Heterozygosity (LOH) with high purity—this could otherwise lead to a " +
+ "failure of the tool to identify samples are from the same individual. Output is also available " +
+ "as a matrix, to facilitate visual inspection of crosscheck results." +
+ "
" +
+ "Metric files can contain many rows of output. We therefore recommend following up CrosscheckFingerprints " +
+ "with a step using [ClusterCrosscheckMetrics (Picard)](https://gatk.broadinstitute.org/hc/en-us/articles/360045798972--Tool-Documentation-Index); this tool will cluster groups together that pass a designated LOD threshold, " +
+ "ensuring that groups within the cluster are related to each other. " +
+ "
" +
+ "There may be cases where several groups out of a collection of possible groups must be identified---for example, " +
+ "to link a BAM to its correct sample in a multi-sample VCF. In this case, it would not be necessary to cross-check " +
+ "the various samples in the VCF against each other, but only to check the identity of the BAM against the various " +
+ "samples in the VCF. For this application, the SECOND_INPUT argument is provided. With SECOND_INPUT, " +
+ "CrosscheckFingerprints can do the following: " +
+ "
" +
+
"" +
- "- aggregation of data happens independently for the input files in INPUT and SECOND_INPUT.
" +
- "- aggregation of data happens at the SAMPLE level
" +
- "- each samples from INPUT will only be compared to that same sample in SECOND_INPUT.
" +
- "- MATRIX_OUTPUT is disabled.
" +
+ "- Independently aggregate data for the input files in INPUT and SECOND_INPUT.
" +
+ "- Aggregate data at the SAMPLE level.
" +
+ "- Compare samples from INPUT to the same sample in SECOND_INPUT.
" +
+ "- Disables MATRIX_OUTPUT.
" +
"
" +
"
" +
- "In some cases, the groups collected may not have any observations (calls for a VCF, reads for a SAM) at fingerprinting sites, or " +
- "a sample in INPUT may be missing from the SECOND_INPUT. These cases are handled as follows:" +
+ "In some cases, the groups collected may not have any observations (‘reads’ for BAM files, or ‘calls’ for VCF files) " +
+ "at fingerprinting sites. Alternatively, a sample in INPUT may be missing from SECOND_INPUT. These cases are handled " +
+ "as follows: " +
+ "
" +
+
"" +
- "- If running in CHECK_SAME_SAMPLES mode with INPUT and SECOND_INPUT, and either INPUT or SECOND_INPUT " +
- "includes a sample not found in the other, or contains a sample with " +
- "no observations at any fingerprinting sites, an error will be logged and the tool will return EXIT_CODE_WHEN_MISMATCH.
" +
- "- In all other running modes, when any group which is being crosschecked does not have any observations at fingerprinting sites, a warning is " +
- "logged. As long as there is at least one comparison where both sides have observations at fingerprinting sites, the tool will " +
- "return zero. However, if all comparisons have at least one side with no observations at fingerprinting sites, an error will be " +
- "logged and the tool will return EXIT_CODE_WHEN_NO_VALID_CHECKS.
" +
+ "- If running in CHECK_SAME_SAMPLES mode with the INPUT and SECOND_INPUT sets of input files: when either set of inputs " +
+ "(1) includes a sample not found in the other, or (2) contains a sample with no observations at any fingerprinting sites, " +
+ "then an error will be logged and the tool will return EXIT_CODE_WHEN_MISMATCH.
" +
+ "- If running in any other running mode: when a group which is being crosschecked does not have any observations at " +
+ "fingerprinting sites, a warning will be logged.
" +
+ "
" +
+
"
" +
+ "Note that, as long as there is at least one comparison in which both files have observations at fingerprinting sites, " +
+ "the tool will return a ‘zero’. However, an error will be logged and the tool will return EXIT_CODE_WHEN_NO_VALID_CHECKS " +
+ "if all comparisons have at least one side without observations at a fingerprinting site (ie. all LOD scores are zero). " +
+ "
" +
+
"
" +
"Examples
" +
@@ -315,14 +327,16 @@ public class CrosscheckFingerprints extends CommandLineProgram {
"Should only be used with SECOND_INPUT. ", optional = true)
public File SECOND_INPUT_SAMPLE_MAP;
- @Argument(doc = "A tsv with two columns representing the individual with which each sample is associated. The first column is the sample id, and the second " +
- "column is the associated individual id. Values in the first column must be unique. If INPUT_SAMPLE_MAP or SECOND_INPUT_SAMPLE_MAP is also specified, " +
- "then the values in the first column of this file should be the sample aliases specified in the second columns of INPUT_SAMPLE_MAP and SECOND_INPUT_SAMPLE_MAP, " +
- "respectively. When this input is specified, expectations for matches will be based on the equality or inequality of the individual ids associated with two " +
- "samples, as opposed to the sample ids themselves. Samples which are not listed in this file will have their sample id used as their individual id, for the " +
- "purposes of match expectations. This means that one sample id could be used as the individual id for another sample, but not included in the map itself, and " +
- "these two samples would be considered to have come from the same individual. Note that use of this parameter only affects labelling of matches and mismatches as " +
- "EXPECTED or UNEXPECTED. It has no affect on how data is grouped for crosschecking.", optional = true)
+ @Argument(doc = "A tsv with two columns representing the individual with which each sample is associated. The first column " +
+ "is the sample id, and the second column is the associated individual id. Values in the first column must be unique. " +
+ "If INPUT_SAMPLE_MAP or SECOND_INPUT_SAMPLE_MAP is also specified, then the values in the first column of this file " +
+ "should be the sample aliases specified in the second columns of INPUT_SAMPLE_MAP and SECOND_INPUT_SAMPLE_MAP, " +
+ "respectively. When this input is specified, expectations for matches will be based on the equality or inequality of " +
+ "the individual ids associated with two samples, as opposed to the sample ids themselves. Samples which are not listed " +
+ "in this file will have their sample id used as their individual id, for the purposes of match expectations. This means " +
+ "that one sample id could be used as the individual id for another sample, but not included in the map itself, and these " +
+ "two samples would be considered to have come from the same individual. Note that use of this parameter only affects " +
+ "labelling of matches and mismatches as EXPECTED or UNEXPECTED. It has no affect on how data is grouped for crosschecking.", optional = true)
public File SAMPLE_INDIVIDUAL_MAP;
@Argument(doc = "An argument that controls how crosschecking with both INPUT and SECOND_INPUT should occur. ")
@@ -354,7 +368,7 @@ public class CrosscheckFingerprints extends CommandLineProgram {
"the groups are from the same individual. ")
public double LOD_THRESHOLD = 0;
- @Argument(doc = "Specificies which data-type should be used as the basic comparison unit. Fingerprints from readgroups can " +
+ @Argument(doc = "Specifies which data-type should be used as the basic comparison unit. Fingerprints from readgroups can " +
"be \"rolled-up\" to the LIBRARY, SAMPLE, or FILE level before being compared." +
" Fingerprints from VCF can be be compared by SAMPLE or FILE.")
public CrosscheckMetric.DataType CROSSCHECK_BY = CrosscheckMetric.DataType.READGROUP;
@@ -362,7 +376,7 @@ public class CrosscheckFingerprints extends CommandLineProgram {
@Argument(doc = "The number of threads to use to process files and generate fingerprints.")
public int NUM_THREADS = 1;
- @Argument(doc = "specifies whether the Tumor-aware result should be calculated. These are time consuming and can roughly double the " +
+ @Argument(doc = "Specifies whether the Tumor-aware result should be calculated. These are time consuming and can roughly double the " +
"runtime of the tool. When crosschecking many groups not calculating the tumor-aware results can result in a significant speedup.")
public boolean CALCULATE_TUMOR_AWARE_RESULTS = true;
@@ -374,7 +388,7 @@ public class CrosscheckFingerprints extends CommandLineProgram {
"the expected sample. Must be greater than zero. ")
public double GENOTYPING_ERROR_RATE = 0.01;
- @Argument(doc = "If true then only groups that do not relate to each other as expected will have their LODs reported.")
+ @Argument(doc = "If true, then only groups that do not relate to each other as expected will have their LODs reported.")
public boolean OUTPUT_ERRORS_ONLY = false;
@Argument(doc = "The rate at which a heterozygous genotype in a normal sample turns into a homozygous (via loss of heterozygosity) " +
@@ -389,10 +403,11 @@ public class CrosscheckFingerprints extends CommandLineProgram {
@Argument(doc = "When one or more mismatches between groups is detected, exit with this value instead of 0.")
public int EXIT_CODE_WHEN_MISMATCH = 1;
- @Argument(doc = "When all LOD score are zero, exit with this value.")
+ @Argument(doc = "When all LOD scores are zero, exit with this value.")
public int EXIT_CODE_WHEN_NO_VALID_CHECKS = 1;
- @Argument(doc = "Maximal effect of any single haplotype block on outcome (-log10 of maximal likelihood difference between the different values for the three possible genotypes).", minValue = 0)
+ @Argument(doc = "Maximal effect of any single haplotype block on outcome (-log10 of maximal likelihood difference between the different " +
+ "values for the three possible genotypes).", minValue = 0)
public double MAX_EFFECT_OF_EACH_HAPLOTYPE_BLOCK = 3.0;
@Hidden
diff --git a/src/main/java/picard/fingerprint/CrosscheckMetric.java b/src/main/java/picard/fingerprint/CrosscheckMetric.java
index f94a1b0050..2f86d04fa3 100644
--- a/src/main/java/picard/fingerprint/CrosscheckMetric.java
+++ b/src/main/java/picard/fingerprint/CrosscheckMetric.java
@@ -59,6 +59,7 @@ public Boolean isMatch() {
}
}
+ /** The data type. */
public enum DataType {
FILE,
SAMPLE,
@@ -66,44 +67,46 @@ public enum DataType {
READGROUP
}
+ /** The LEFT group value. */
public String LEFT_GROUP_VALUE;
+ /** The RIGHT group value. */
public String RIGHT_GROUP_VALUE;
- // The overall result of the match
+ /** The overall result of the match. */
public FingerprintResult RESULT;
- // The data type that was being compared
+ /** The data type that was being compared. */
public DataType DATA_TYPE;
- // The resulting LOD score comparing LEFT and RIGHT data
+ /** The resulting LOD score comparing LEFT and RIGHT data. */
public Double LOD_SCORE;
- // The resulting LOD score comparing LEFT as tumor and RIGHT as normal
+ /** The resulting LOD score comparing LEFT as tumor and RIGHT as normal. */
public Double LOD_SCORE_TUMOR_NORMAL;
- // The resulting LOD score comparing LEFT as normal and RIGHT as tumor
+ /** The resulting LOD score comparing LEFT as normal and RIGHT as tumor. */
public Double LOD_SCORE_NORMAL_TUMOR;
- // The LEFT run barcode (PU field) expected to look like : D047KACXX110901.1.ACCAACTG
+ /** The LEFT run barcode (PU field), expected to look like: D047KACXX110901.1.ACCAACTG */
public String LEFT_RUN_BARCODE;
- // The LEFT lane
+ /** The LEFT lane. */
public Integer LEFT_LANE;
- // The LEFT molecular (sample) barcode
+ /** The LEFT molecular (sample) barcode. */
public String LEFT_MOLECULAR_BARCODE_SEQUENCE;
- // The LEFT library identifier
+ /** The LEFT library identifier. */
public String LEFT_LIBRARY;
- // The LEFT sample identifier
+ /** The LEFT sample identifier. */
public String LEFT_SAMPLE;
- // The LEFT file from which the fingerprint was obtained
+ /** The LEFT file from which the fingerprint was obtained. */
public String LEFT_FILE;
- // The RIGHT run barcode (PU field) expected to look like : D047KACXX110901.1.ACCAACTG
+ /** The RIGHT run barcode (PU field), expected to look like: D047KACXX110901.1.ACCAACTG */
public String RIGHT_RUN_BARCODE;
- // The LEFT lane
+ /** The RIGHT lane. */
public Integer RIGHT_LANE;
- // The LEFT molecular (sample) barcode
+ /** The RIGHT molecular (sample) barcode. */
public String RIGHT_MOLECULAR_BARCODE_SEQUENCE;
- // The LEFT library identifier
+ /** The RIGHT library identifier. */
public String RIGHT_LIBRARY;
- // The LEFT sample identifier
+ /** The RIGHT sample identifier. */
public String RIGHT_SAMPLE;
- // The LEFT file from which the fingerprint was obtained
+ /** The RIGHT file from which the fingerprint was obtained. */
public String RIGHT_FILE;
}