Skip to content

Commit

Permalink
Fix a regression where VcfWriter would not write a regular file index (
Browse files Browse the repository at this point in the history
  • Loading branch information
clintval authored Mar 15, 2022
1 parent 1707517 commit 560f089
Show file tree
Hide file tree
Showing 2 changed files with 77 additions and 19 deletions.
29 changes: 19 additions & 10 deletions src/main/scala/com/fulcrumgenomics/vcf/api/VcfWriter.scala
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ import htsjdk.samtools.Defaults
import htsjdk.variant.variantcontext.writer.{Options, VariantContextWriter, VariantContextWriterBuilder}

import java.nio.file.Files
import java.nio.file.LinkOption.NOFOLLOW_LINKS
import java.nio.file.attribute.BasicFileAttributes

/**
* Writes [[Variant]]s to a file or other storage mechanism.
Expand All @@ -49,33 +49,42 @@ object VcfWriter {
var DefaultUseAsyncIo: Boolean = Defaults.USE_ASYNC_IO_WRITE_FOR_TRIBBLE

/**
* Creates a [[VcfWriter]] that will write to the give path. The path must end in either
* - `.vcf` to create an uncompressed VCF file
* - `.vcf.gz` to create a block-gzipped VCF file
* - `.bcf` to create a binary BCF file
* Creates a [[VcfWriter]] that will write to the given path. If the path is meant to point to a regular file, then
* the path must end in either:
*
* - `.vcf`: to create an uncompressed VCF file
* - `.vcf.gz`: to create a block-gzipped VCF file
* - `.bcf`: to create a binary BCF file
*
* If the path is meant to point to a regular file, then indexing will occur automatically. However, if the path
* already exists and the path is not a file or symbolic link, then this function will assume the path is a named
* pipe or device (such as `/dev/null`) and indexing will not occur.
*
* @param path the path to write to
* @param header the header of the VCF
* @return a VariantWriter to write to the given path
* @return a VCF writer to write to the given path
*/
def apply(path: PathToVcf, header: VcfHeader, async: Boolean = DefaultUseAsyncIo): VcfWriter = {
import com.fulcrumgenomics.fasta.Converters.ToSAMSequenceDictionary
val javaHeader = VcfConversions.toJavaHeader(header)
require(!Files.isDirectory(path), s"Path cannot be a directory! Found $path")

val builder = new VariantContextWriterBuilder()
.setOutputPath(path)
.setReferenceDictionary(header.dict.asSam)
.setOption(Options.ALLOW_MISSING_FIELDS_IN_HEADER)
.setBuffer(Io.bufferSize)

if (Files.isRegularFile(path, NOFOLLOW_LINKS)) {
builder.setOption(Options.INDEX_ON_THE_FLY)
} else {
if (async) builder.setOption(Options.USE_ASYNC_IO) else builder.unsetOption(Options.USE_ASYNC_IO)

// If the path exists and is not a file or symbolic link, then assume it is a named pipe and do not index.
if (Files.exists(path) && Files.readAttributes(path, classOf[BasicFileAttributes]).isOther) {
builder.unsetOption(Options.INDEX_ON_THE_FLY)
builder.setIndexCreator(null)
} else {
builder.setOption(Options.INDEX_ON_THE_FLY)
}

if (async) builder.setOption(Options.USE_ASYNC_IO) else builder.unsetOption(Options.USE_ASYNC_IO)
val writer = builder.build()
writer.writeHeader(javaHeader)
new VcfWriter(writer, header)
Expand Down
67 changes: 58 additions & 9 deletions src/test/scala/com/fulcrumgenomics/vcf/api/VcfIoTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,15 @@

package com.fulcrumgenomics.vcf.api

import com.fulcrumgenomics.commons.io.Io
import com.fulcrumgenomics.testing.{UnitSpec, VcfBuilder}
import com.fulcrumgenomics.commons.CommonsDef.SafelyClosable
import com.fulcrumgenomics.commons.io.{Io, PathUtil}
import com.fulcrumgenomics.testing.VcfBuilder.Gt
import com.fulcrumgenomics.testing.{UnitSpec, VcfBuilder}
import com.fulcrumgenomics.vcf.api.Allele.NoCallAllele
import htsjdk.samtools.util.FileExtensions.{BCF => BcfExtension, COMPRESSED_VCF => VcfGzExtension, CSI => CsiExtensions, TABIX_INDEX => TbiExtension, TRIBBLE_INDEX => IdxExtension, VCF => VcfExtension}
import org.scalatest.OptionValues

import java.nio.file.Files
import scala.collection.compat._
import scala.collection.immutable.ListMap

Expand Down Expand Up @@ -67,7 +70,7 @@ class VcfIoTest extends UnitSpec with OptionValues {

/** Writes out the variants to a file and reads them back, returning the header and the variants. */
@inline private def roundtrip(variants: IterableOnce[Variant], header: VcfHeader = VcfIoTest.Header): Result = {
val vcf = makeTempFile("test.", ".vcf")
val vcf = makeTempFile(getClass.getSimpleName, VcfExtension)
val out = VcfWriter(vcf, header)
out ++= variants
out.close()
Expand Down Expand Up @@ -167,7 +170,7 @@ class VcfIoTest extends UnitSpec with OptionValues {
val variants = Range.inclusive(1000, 2000, step=10).map { s =>
Variant(chrom="chr1", pos=s, alleles=alleles, genotypes=Map("s1" -> Genotype(alleles, "s1", alleles.alts)))
}
val vcf = makeTempFile("queryable.", ".vcf.gz")
val vcf = makeTempFile("queryable.", VcfGzExtension)
val out = VcfWriter(vcf, VcfIoTest.Header)
out ++= variants
out.close()
Expand Down Expand Up @@ -227,12 +230,58 @@ class VcfIoTest extends UnitSpec with OptionValues {
v.gt("0").callIndices.mkString("/") shouldBe "2/3"
}

it should "not attempt to index a VCF when streaming to a file handle or other kind of non-regular file" in {
it should "not allow a VCF writer to write to a directory path" in {
val samples = Seq("sample")
val builder = VcfBuilder(samples=samples)
val writer = VcfWriter(Io.DevNull, header = builder.header)
builder.add(chrom = "chr1", pos = 100, alleles = Seq("A", "C"), gts = Seq(Gt(sample="sample", gt="0/1")))
noException shouldBe thrownBy { writer.write(builder.toSeq) }
val builder = VcfBuilder(samples = samples)
val output = Io.makeTempDir(getClass.getSimpleName)
output.toFile.deleteOnExit()
an[IllegalArgumentException] shouldBe thrownBy { VcfWriter(output, header = builder.header) }
}

it should "write a sibling index when writing to a plaintext VCF file" in {
val samples = Seq("sample")
val builder = VcfBuilder(samples = samples)
val output = makeTempFile(getClass.getSimpleName, VcfExtension)
val writer = VcfWriter(output, header = builder.header)
builder.add(chrom = "chr1", pos = 100, alleles = Seq("A", "C"), gts = Seq(Gt(sample = "sample", gt = "0/1")))
writer.close()
val source = VcfSource(output)
source.isQueryable shouldBe true
source.safelyClose()
}

it should "write a sibling index when writing to a compressed VCF file" in {
val samples = Seq("sample")
val builder = VcfBuilder(samples = samples)
val output = makeTempFile(getClass.getSimpleName, VcfGzExtension)
val writer = VcfWriter(output, header = builder.header)
builder.add(chrom = "chr1", pos = 100, alleles = Seq("A", "C"), gts = Seq(Gt(sample = "sample", gt = "0/1")))
writer.close()
val source = VcfSource(output)
source.isQueryable shouldBe true
source.safelyClose()
}

it should "write a sibling index when writing to a BCF file" in {
val samples = Seq("sample")
val builder = VcfBuilder(samples = samples)
val output = makeTempFile(getClass.getSimpleName, BcfExtension)
val writer = VcfWriter(output, header = builder.header)
builder.add(chrom = "chr1", pos = 100, alleles = Seq("A", "C"), gts = Seq(Gt(sample = "sample", gt = "0/1")))
writer.close()
val source = VcfSource(output)
source.isQueryable shouldBe true
source.safelyClose()
}

it should "not attempt to write a sibling index when streaming to a named pipe like '/dev/null'" in {
val samples = Seq("sample")
val builder = VcfBuilder(samples = samples)
val writer = VcfWriter(Io.DevNull, header = builder.header)
builder.add(chrom = "chr1", pos = 100, alleles = Seq("A", "C"), gts = Seq(Gt(sample = "sample", gt = "0/1")))
noException shouldBe thrownBy { writer.write(builder.toSeq); writer.close() }
Files.exists(PathUtil.pathTo(Io.DevNull.getFileName.toString + CsiExtensions)) shouldBe false
Files.exists(PathUtil.pathTo(Io.DevNull.getFileName.toString + IdxExtension)) shouldBe false
Files.exists(PathUtil.pathTo(Io.DevNull.getFileName.toString + TbiExtension)) shouldBe false
}
}

0 comments on commit 560f089

Please sign in to comment.