Column iterators should check the number rows (#432)

* add another empty file for testing * add empty file tests with varied ordering * check that idx has rows before creating col iterator * update debug file * add test for all empty multi file * check rows exist before changing iterator * modify variable names * don't need this test file anymore * use inline test examples * restyle test file * add NEWS bullet
tidyverse · May 3, 2022 · 829ba96 · 829ba96
1 parent 728d88e
commit 829ba96
Show file tree

Hide file tree

Showing 4 changed files with 83 additions and 41 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,7 @@
 # vroom (development version)
 
+* Fixed segfault when reading in multiple files and the first file is header-only but subsequent files have at least one row (#430).
+
 * Fixed segfault when `vroom_format()` is given an empty data frame (#425)
 
 * Fixed a segfault that could occur when the final field of the final line is missing and the file also does not end in a newline (#429).

diff --git a/src/index_collection.cc b/src/index_collection.cc
@@ -20,6 +20,18 @@ index_collection::full_iterator::full_iterator(
  idx_(std::move(idx)),
  column_(column),
  end_(idx_->indexes_.size() - 1) {
+
+ auto n_inputs = idx_->indexes_.size();
+ auto n_rows_total = idx_->rows_;
+ auto n_rows_current_input = idx_->indexes_[i_]->num_rows();
+
+ if (n_rows_current_input == 0 && n_inputs > 1 && n_rows_total > 0) {
+ while(n_rows_current_input == 0) {
+ ++i_;
+ n_rows_current_input = idx_->indexes_[i_]->num_rows();
+ }
+ }
+
  auto col = idx_->indexes_[i_]->get_column(column_);
  it_ = col->begin();
  it_end_ = col->end();

diff --git a/tests/testthat/test-multi-file.R b/tests/testthat/test-multi-file.R
@@ -56,7 +56,8 @@ test_that("vroom works with many files", {
  y = rnorm(10),
  ),
  file.path(dir, paste0(i, ".csv")),
- delim = ",")
+ delim = ","
+ )
  }
 
  files <- list.files(dir, pattern = ".*[.]csv", full.names = TRUE)
@@ -81,7 +82,8 @@ test_that("vroom works with many connections", {
  y = rnorm(10),
  ),
  file.path(dir, paste0(i, ".csv.gz")),
- delim = ",")
+ delim = ","
+ )
  }
 
  files <- list.files(dir, pattern = ".*[.]csv[.]gz", full.names = TRUE)
@@ -93,24 +95,65 @@ test_that("vroom works with many connections", {
 })
 
 test_that("vroom errors if numbers of columns are inconsistent", {
-
  files <- test_path("multi-file", c("foo", "baz"))
  expect_error(vroom::vroom(files, col_types = list()), "must all have")
 })
 
 test_that("vroom errors if column names are inconsistent", {
-
  files <- test_path("multi-file", c("foo", "bar"))
  expect_error(vroom::vroom(files, col_types = list()), "consistent column names")
 })
 
 test_that("vroom works if a file contains no data", {
-
  files <- test_path("multi-file", c("foo", "qux"))
  res <- vroom(files, col_types = list())
  expect_equal(res, tibble::tibble(A = 1, B = 2))
 })
 
+test_that("vroom works if some files contain no data, regardless of order (#430)", {
+ destdir <- withr::local_tempdir("testing-multiple-files")
+
+ vroom_write_lines(c("A,B"), file.path(destdir, "header_only.csv"))
+ vroom_write_lines(c("A,B"), file.path(destdir, "another_header_only.csv"))
+ vroom_write_lines(c("A,B", "1,2"), file.path(destdir, "header_and_one_row.csv"))
+
+ files <- file.path(destdir, c("header_only.csv", "header_and_one_row.csv"))
+ res <- vroom(files, show_col_types = FALSE)
+ expect_equal(res, tibble::tibble(A = 1, B = 2))
+
+ files <- file.path(destdir, c(
+ "header_only.csv",
+ "another_header_only.csv",
+ "header_and_one_row.csv"
+ ))
+ res <- vroom(files, show_col_types = FALSE)
+ expect_equal(res, tibble::tibble(A = 1, B = 2))
+
+ files <- file.path(destdir, c(
+ "header_only.csv",
+ "header_and_one_row.csv",
+ "another_header_only.csv"
+ ))
+ res <- vroom(files, show_col_types = FALSE)
+ expect_equal(res, tibble::tibble(A = 1, B = 2))
+
+ files <- file.path(destdir, c(
+ "header_and_one_row.csv",
+ "header_only.csv",
+ "another_header_only.csv"
+ ))
+ res <- vroom(files, show_col_types = FALSE)
+ expect_equal(res, tibble::tibble(A = 1, B = 2))
+
+ files <- file.path(destdir, c(
+ "header_only.csv",
+ "another_header_only.csv"
+ ))
+ res <- vroom(files, show_col_types = FALSE)
+ x <- tibble::tibble(A = "", B = "", .rows = 0)
+ expect_equal(res, x)
+})
+
 test_that("vroom works for indxes that span file boundries (#383)", {
  x <- vroom(c(vroom_example("mtcars.csv"), vroom_example("mtcars.csv")), col_types = list())
  y <- rbind(mtcars, mtcars)

diff --git a/tools/debug.R b/tools/debug.R
@@ -1,43 +1,28 @@
-devtools::clean_dll()
-devtools::load_all()
-
-# https:/tidyverse/vroom/issues/429
-x <- vroom::vroom("investigations/my_file.CSV") # often segfaults here
-tail(x[,12:15])
+# Setup
 
-# here's where the problem is, when it occurs
-# delimited_index.cc line 416
-# has_quote = *begin == quote_;
-# EXC_BAD_ACCESS
-# in the bad situation, begin == end and both are `\0`
-# happens when file is missing a final newline AND is missing the final field
-# the pointers to this field's beginning and end are the same and are
-# essentially past (at) the end of the file
+# create the files however you'd prefer
+# but they can't be temp files
+destdir <- "../think_tank/testFiles/"
 
-# the segfault always seems to happen in RStudio
+#brio::write_lines("a", file.path(destdir, "no-rows.tsv"))
+#brio::write_lines(c("a","x"), file.path(destdir, "one-row.tsv"))
 
-# once I'm in lldb inside VS Code, the bug usually goes away, i.e. dereferencing
-# *begin is OK
-# but ultimately it is clear which line is the problem (see above)
+# Debugging
 
-# if I specify the last col_type, reading (appears to) work
-# x <- vroom::vroom(
-# "investigations/my_file.CSV",
-# col_types = list(Average_SOC_Required = "n")
-# )
-# tail(x[,12:15]) # but now it crashes here
+devtools::clean_dll()
+devtools::load_all()
 
-# can I make this happen with a smaller file if I change
-# guess_max or the buffer size? nope, I never succeeded to do this
 
-# small example to better understand the index
-# writeChar("X,Y\r\na,bbb\r\ne,", "investigations/mini.csv", eos = NULL)
-#
-# | | | idx_[0]
-# | | | | | | idx_[1]
-# 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
-# X , Y \r \n a , b b b \r \n e ,
+# also, when I'm in debug mode in VSCode
+# if I have altrep = TRUE (the default) it doesn't segfault
+# but this is only true when I'm using the debugger in VSCode
+# I've also been setting num_threads to 1 to make it easier to debug
+vroom::vroom(
+ file.path(destdir, c("no-rows.tsv", "one-row.tsv")
+ ), altrep = FALSE, num_threads = 1, delim = "\t")
 
-# withr::with_envvar(c("VROOM_CONNECTION_SIZE" = 10), {
-# x <- vroom(file("investigations/mini.csv"))
-# })
+#vroom::vroom(c(
+#"../think_tank/testFiles/also-one-row.tsv",
+#"../think_tank/testFiles/no-rows.tsv",
+#"../think_tank/testFiles/another-one-row.tsv"), 
+# altrep = FALSE, num_threads = 1)