Skip to content

Commit

Permalink
Column iterators should check the number rows (#432)
Browse files Browse the repository at this point in the history
* add another empty file for testing

* add empty file tests with varied ordering

* check that idx has rows before creating col iterator

* update debug file

* add test for all empty multi file

* check rows exist before changing iterator

* modify variable names

* don't need this test file anymore

* use inline test examples

* restyle test file

* add NEWS bullet
  • Loading branch information
sbearrows authored May 3, 2022
1 parent 728d88e commit 829ba96
Show file tree
Hide file tree
Showing 4 changed files with 83 additions and 41 deletions.
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# vroom (development version)

* Fixed segfault when reading in multiple files and the first file is header-only but subsequent files have at least one row (#430).

* Fixed segfault when `vroom_format()` is given an empty data frame (#425)

* Fixed a segfault that could occur when the final field of the final line is missing and the file also does not end in a newline (#429).
Expand Down
12 changes: 12 additions & 0 deletions src/index_collection.cc
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,18 @@ index_collection::full_iterator::full_iterator(
idx_(std::move(idx)),
column_(column),
end_(idx_->indexes_.size() - 1) {

auto n_inputs = idx_->indexes_.size();
auto n_rows_total = idx_->rows_;
auto n_rows_current_input = idx_->indexes_[i_]->num_rows();

if (n_rows_current_input == 0 && n_inputs > 1 && n_rows_total > 0) {
while(n_rows_current_input == 0) {
++i_;
n_rows_current_input = idx_->indexes_[i_]->num_rows();
}
}

auto col = idx_->indexes_[i_]->get_column(column_);
it_ = col->begin();
it_end_ = col->end();
Expand Down
53 changes: 48 additions & 5 deletions tests/testthat/test-multi-file.R
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,8 @@ test_that("vroom works with many files", {
y = rnorm(10),
),
file.path(dir, paste0(i, ".csv")),
delim = ",")
delim = ","
)
}

files <- list.files(dir, pattern = ".*[.]csv", full.names = TRUE)
Expand All @@ -81,7 +82,8 @@ test_that("vroom works with many connections", {
y = rnorm(10),
),
file.path(dir, paste0(i, ".csv.gz")),
delim = ",")
delim = ","
)
}

files <- list.files(dir, pattern = ".*[.]csv[.]gz", full.names = TRUE)
Expand All @@ -93,24 +95,65 @@ test_that("vroom works with many connections", {
})

test_that("vroom errors if numbers of columns are inconsistent", {

files <- test_path("multi-file", c("foo", "baz"))
expect_error(vroom::vroom(files, col_types = list()), "must all have")
})

test_that("vroom errors if column names are inconsistent", {

files <- test_path("multi-file", c("foo", "bar"))
expect_error(vroom::vroom(files, col_types = list()), "consistent column names")
})

test_that("vroom works if a file contains no data", {

files <- test_path("multi-file", c("foo", "qux"))
res <- vroom(files, col_types = list())
expect_equal(res, tibble::tibble(A = 1, B = 2))
})

test_that("vroom works if some files contain no data, regardless of order (#430)", {
destdir <- withr::local_tempdir("testing-multiple-files")

vroom_write_lines(c("A,B"), file.path(destdir, "header_only.csv"))
vroom_write_lines(c("A,B"), file.path(destdir, "another_header_only.csv"))
vroom_write_lines(c("A,B", "1,2"), file.path(destdir, "header_and_one_row.csv"))

files <- file.path(destdir, c("header_only.csv", "header_and_one_row.csv"))
res <- vroom(files, show_col_types = FALSE)
expect_equal(res, tibble::tibble(A = 1, B = 2))

files <- file.path(destdir, c(
"header_only.csv",
"another_header_only.csv",
"header_and_one_row.csv"
))
res <- vroom(files, show_col_types = FALSE)
expect_equal(res, tibble::tibble(A = 1, B = 2))

files <- file.path(destdir, c(
"header_only.csv",
"header_and_one_row.csv",
"another_header_only.csv"
))
res <- vroom(files, show_col_types = FALSE)
expect_equal(res, tibble::tibble(A = 1, B = 2))

files <- file.path(destdir, c(
"header_and_one_row.csv",
"header_only.csv",
"another_header_only.csv"
))
res <- vroom(files, show_col_types = FALSE)
expect_equal(res, tibble::tibble(A = 1, B = 2))

files <- file.path(destdir, c(
"header_only.csv",
"another_header_only.csv"
))
res <- vroom(files, show_col_types = FALSE)
x <- tibble::tibble(A = "", B = "", .rows = 0)
expect_equal(res, x)
})

test_that("vroom works for indxes that span file boundries (#383)", {
x <- vroom(c(vroom_example("mtcars.csv"), vroom_example("mtcars.csv")), col_types = list())
y <- rbind(mtcars, mtcars)
Expand Down
57 changes: 21 additions & 36 deletions tools/debug.R
Original file line number Diff line number Diff line change
@@ -1,43 +1,28 @@
devtools::clean_dll()
devtools::load_all()

# https:/tidyverse/vroom/issues/429
x <- vroom::vroom("investigations/my_file.CSV") # often segfaults here
tail(x[,12:15])
# Setup

# here's where the problem is, when it occurs
# delimited_index.cc line 416
# has_quote = *begin == quote_;
# EXC_BAD_ACCESS
# in the bad situation, begin == end and both are `\0`
# happens when file is missing a final newline AND is missing the final field
# the pointers to this field's beginning and end are the same and are
# essentially past (at) the end of the file
# create the files however you'd prefer
# but they can't be temp files
destdir <- "../think_tank/testFiles/"

# the segfault always seems to happen in RStudio
#brio::write_lines("a", file.path(destdir, "no-rows.tsv"))
#brio::write_lines(c("a","x"), file.path(destdir, "one-row.tsv"))

# once I'm in lldb inside VS Code, the bug usually goes away, i.e. dereferencing
# *begin is OK
# but ultimately it is clear which line is the problem (see above)
# Debugging

# if I specify the last col_type, reading (appears to) work
# x <- vroom::vroom(
# "investigations/my_file.CSV",
# col_types = list(Average_SOC_Required = "n")
# )
# tail(x[,12:15]) # but now it crashes here
devtools::clean_dll()
devtools::load_all()

# can I make this happen with a smaller file if I change
# guess_max or the buffer size? nope, I never succeeded to do this

# small example to better understand the index
# writeChar("X,Y\r\na,bbb\r\ne,", "investigations/mini.csv", eos = NULL)
#
# | | | idx_[0]
# | | | | | | idx_[1]
# 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
# X , Y \r \n a , b b b \r \n e ,
# also, when I'm in debug mode in VSCode
# if I have altrep = TRUE (the default) it doesn't segfault
# but this is only true when I'm using the debugger in VSCode
# I've also been setting num_threads to 1 to make it easier to debug
vroom::vroom(
file.path(destdir, c("no-rows.tsv", "one-row.tsv")
), altrep = FALSE, num_threads = 1, delim = "\t")

# withr::with_envvar(c("VROOM_CONNECTION_SIZE" = 10), {
# x <- vroom(file("investigations/mini.csv"))
# })
#vroom::vroom(c(
#"../think_tank/testFiles/also-one-row.tsv",
#"../think_tank/testFiles/no-rows.tsv",
#"../think_tank/testFiles/another-one-row.tsv"),
# altrep = FALSE, num_threads = 1)

0 comments on commit 829ba96

Please sign in to comment.