Skip to content

Commit

Permalink
Ignore quotes when skipping in read_lines
Browse files Browse the repository at this point in the history
Fixes #991
  • Loading branch information
jimhester committed Sep 14, 2020
1 parent f7158cd commit e1f86e5
Show file tree
Hide file tree
Showing 9 changed files with 63 additions and 36 deletions.
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@

## Additional features and fixes

* `read_lines()` now ignores quotations when skipping lines (#991).

* `write_csv2()` now formats decimal numbers more consistently with `utils::write.csv2()` (#1087)

* `fwf_positions(end)` no longer has a default argument and must be specified (#996)
Expand Down
4 changes: 2 additions & 2 deletions R/lines.R
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ read_lines <- function(file, skip = 0, skip_empty_rows = FALSE, n_max = -1L,
if (empty_file(file)) {
return(character())
}
ds <- datasource(file, skip = skip, skip_empty_rows = skip_empty_rows)
ds <- datasource(file, skip = skip, skip_empty_rows = skip_empty_rows, skip_quote = FALSE)
read_lines_(ds, skip_empty_rows = skip_empty_rows, locale_ = locale, na = na, n_max = n_max, progress = progress)
}

Expand All @@ -47,7 +47,7 @@ read_lines_raw <- function(file, skip = 0,
if (empty_file(file)) {
return(list())
}
ds <- datasource(file, skip = skip, skip_empty_rows = FALSE)
ds <- datasource(file, skip = skip, skip_empty_rows = FALSE, skip_quote = FALSE)
read_lines_raw_(ds, n_max = n_max, progress = progress)
}

Expand Down
34 changes: 17 additions & 17 deletions R/source.R
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
#' con <- rawConnection(charToRaw("abc\n123"))
#' datasource(con)
#' close(con)
datasource <- function(file, skip = 0, skip_empty_rows = FALSE, comment = "") {
datasource <- function(file, skip = 0, skip_empty_rows = FALSE, comment = "", skip_quote = TRUE) {
if (inherits(file, "source")) {

# If `skip` and `comment` arguments are expliictly passed, we want to use
Expand All @@ -50,20 +50,20 @@ datasource <- function(file, skip = 0, skip_empty_rows = FALSE, comment = "") {

file
} else if (is.connection(file)) {
datasource_connection(file, skip, skip_empty_rows, comment)
datasource_connection(file, skip, skip_empty_rows, comment, skip_quote)
} else if (is.raw(file)) {
datasource_raw(file, skip, skip_empty_rows, comment)
datasource_raw(file, skip, skip_empty_rows, comment, skip_quote)
} else if (is.character(file)) {
if (length(file) > 1) {
datasource_string(paste(file, collapse = "\n"), skip, skip_empty_rows, comment)
datasource_string(paste(file, collapse = "\n"), skip, skip_empty_rows, comment, skip_quote)
} else if (grepl("\n", file)) {
datasource_string(file, skip, skip_empty_rows, comment)
datasource_string(file, skip, skip_empty_rows, comment, skip_quote)
} else {
file <- standardise_path(file)
if (is.connection(file)) {
datasource_connection(file, skip, skip_empty_rows, comment)
datasource_connection(file, skip, skip_empty_rows, comment, skip_quote)
} else {
datasource_file(file, skip, skip_empty_rows, comment)
datasource_file(file, skip, skip_empty_rows, comment, skip_quote)
}
}
} else {
Expand All @@ -73,32 +73,32 @@ datasource <- function(file, skip = 0, skip_empty_rows = FALSE, comment = "") {

# Constructors -----------------------------------------------------------------

new_datasource <- function(type, x, skip, skip_empty_rows = TRUE, comment = "", ...) {
structure(list(x, skip = skip, skip_empty_rows = skip_empty_rows, comment = comment, ...),
new_datasource <- function(type, x, skip, skip_empty_rows = TRUE, comment = "", skip_quote = TRUE, ...) {
structure(list(x, skip = skip, skip_empty_rows = skip_empty_rows, comment = comment, skip_quote = skip_quote, ...),
class = c(paste0("source_", type), "source"))
}

datasource_string <- function(text, skip, skip_empty_rows = TRUE, comment = "") {
new_datasource("string", text, skip = skip, skip_empty_rows = skip_empty_rows, comment = comment)
datasource_string <- function(text, skip, skip_empty_rows = TRUE, comment = "", skip_quote = TRUE) {
new_datasource("string", text, skip = skip, skip_empty_rows = skip_empty_rows, comment = comment, skip_quote = skip_quote)
}

datasource_file <- function(path, skip, skip_empty_rows = TRUE, comment = "", ...) {
datasource_file <- function(path, skip, skip_empty_rows = TRUE, comment = "", skip_quote = TRUE, ...) {
path <- check_path(path)
new_datasource("file", path, skip = skip, skip_empty_rows = skip_empty_rows, comment = comment, ...)
new_datasource("file", path, skip = skip, skip_empty_rows = skip_empty_rows, comment = comment, skip_quote = skip_quote, ...)
}

datasource_connection <- function(path, skip, skip_empty_rows = TRUE, comment = "") {
datasource_connection <- function(path, skip, skip_empty_rows = TRUE, comment = "", skip_quote = TRUE) {
# We read the connection to a temporary file, then register a finalizer to
# cleanup the temp file after the datasource object is removed.

file <- read_connection(path)
env <- new.env(parent = emptyenv())
reg.finalizer(env, function(env) unlink(file))
datasource_file(file, skip, skip_empty_rows = skip_empty_rows, comment = comment, env = env)
datasource_file(file, skip, skip_empty_rows = skip_empty_rows, comment = comment, env = env, skip_quote = skip_quote)
}

datasource_raw <- function(text, skip, skip_empty_rows, comment) {
new_datasource("raw", text, skip = skip, skip_empty_rows = skip_empty_rows, comment = comment)
datasource_raw <- function(text, skip, skip_empty_rows, comment, skip_quote = TRUE) {
new_datasource("raw", text, skip = skip, skip_empty_rows = skip_empty_rows, comment = comment, skip_quote = skip_quote)
}

# Helpers ----------------------------------------------------------------------
Expand Down
23 changes: 14 additions & 9 deletions src/Source.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,18 @@ SourcePtr Source::create(cpp11::list spec) {
int skip = cpp11::as_cpp<int>(spec["skip"]);
bool skipEmptyRows = cpp11::as_cpp<bool>(spec["skip_empty_rows"]);
std::string comment = cpp11::as_cpp<std::string>(spec["comment"]);
bool skipQuote = cpp11::as_cpp<bool>(spec["skip_quote"]);

if (subclass == "source_raw") {
return SourcePtr(new SourceRaw(spec[0], skip, skipEmptyRows, comment));
return SourcePtr(
new SourceRaw(spec[0], skip, skipEmptyRows, comment, skipQuote));
} else if (subclass == "source_string") {
return SourcePtr(new SourceString(spec[0], skip, skipEmptyRows, comment));
return SourcePtr(
new SourceString(spec[0], skip, skipEmptyRows, comment, skipQuote));
} else if (subclass == "source_file") {
cpp11::strings path(spec[0]);
return SourcePtr(new SourceFile(
Rf_translateChar(path[0]), skip, skipEmptyRows, comment));
Rf_translateChar(path[0]), skip, skipEmptyRows, comment, skipQuote));
}

cpp11::stop("Unknown source type");
Expand All @@ -32,14 +35,16 @@ const char* Source::skipLines(
const char* end,
int n,
bool skipEmptyRows,
const std::string& comment) {
const std::string& comment,
bool skipQuote) {
bool hasComment = comment != "";
bool isComment;

const char* cur = begin;

while (cur < end && n > 0) {
cur = skipLine(cur, end, hasComment && inComment(cur, end, comment));
cur = skipLine(
cur, end, hasComment && inComment(cur, end, comment), skipQuote);
--n;
++skippedRows_;
}
Expand All @@ -48,19 +53,19 @@ const char* Source::skipLines(
while (cur < end &&
((skipEmptyRows && (*cur == '\n' || *cur == '\r')) ||
(isComment = hasComment && inComment(cur, end, comment)))) {
cur = skipLine(cur, end, isComment);
cur = skipLine(cur, end, isComment, skipQuote);
++skippedRows_;
}

return cur;
}

const char*
Source::skipLine(const char* begin, const char* end, bool isComment) {
const char* Source::skipLine(
const char* begin, const char* end, bool isComment, bool skipQuote) {
const char* cur = begin;
// skip the rest of the line until the newline
while (cur < end && !(*cur == '\n' || *cur == '\r')) {
if (!isComment && *cur == '"') {
if (!isComment && skipQuote && *cur == '"') {
cur = skipDoubleQuoted(cur, end);
} else {
advanceForLF(&cur, end);
Expand Down
6 changes: 4 additions & 2 deletions src/Source.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,11 @@ class Source {
const char* end,
int n,
bool skipEmptyRows = true,
const std::string& comment = "");
const std::string& comment = "",
bool skipQuote = true);

const char* skipLine(const char* begin, const char* end, bool isComment);
const char*
skipLine(const char* begin, const char* end, bool isComment, bool skipQuote);

const char* skipDoubleQuoted(const char* begin, const char* end);

Expand Down
5 changes: 3 additions & 2 deletions src/SourceFile.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@ class SourceFile : public Source {
const std::string& path,
int skip = 0,
bool skipEmptyRows = true,
const std::string& comment = "") {
const std::string& comment = "",
bool skipQuotes = true) {
try {
fm_ = boost::interprocess::file_mapping(
path.c_str(), boost::interprocess::read_only);
Expand All @@ -35,7 +36,7 @@ class SourceFile : public Source {
begin_ = skipBom(begin_, end_);

// Skip lines, if needed
begin_ = skipLines(begin_, end_, skip, skipEmptyRows, comment);
begin_ = skipLines(begin_, end_, skip, skipEmptyRows, comment, skipQuotes);
}

const char* begin() { return begin_; }
Expand Down
5 changes: 3 additions & 2 deletions src/SourceRaw.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@ class SourceRaw : public Source {
cpp11::raws x,
int skip = 0,
bool skipEmptyRows = true,
const std::string& comment = "")
const std::string& comment = "",
bool skipQuotes = true)
: x_(x) {
begin_ = (const char*)RAW(x);
end_ = (const char*)RAW(x) + Rf_xlength(x);
Expand All @@ -23,7 +24,7 @@ class SourceRaw : public Source {
begin_ = skipBom(begin_, end_);

// Skip lines, if needed
begin_ = skipLines(begin_, end_, skip, skipEmptyRows, comment);
begin_ = skipLines(begin_, end_, skip, skipEmptyRows, comment, skipQuotes);
}

const char* begin() { return begin_; }
Expand Down
5 changes: 3 additions & 2 deletions src/SourceString.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@ class SourceString : public Source {
cpp11::strings x,
int skip = 0,
bool skipEmptyRows = true,
const std::string& comment = "")
const std::string& comment = "",
bool skipQuotes = true)
: string_(static_cast<SEXP>(x[0])) {

begin_ = CHAR(string_);
Expand All @@ -26,7 +27,7 @@ class SourceString : public Source {
begin_ = skipBom(begin_, end_);

// Skip lines, if needed
begin_ = skipLines(begin_, end_, skip, skipEmptyRows, comment);
begin_ = skipLines(begin_, end_, skip, skipEmptyRows, comment, skipQuotes);
}

const char* begin() { return begin_; }
Expand Down
15 changes: 15 additions & 0 deletions tests/testthat/test-read-lines.R
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,21 @@ test_that("read_lines(skip_empty_rows) works when blank lines are at the end of
expect_equal(read_lines(tmp, skip_empty_rows = TRUE), "test")
})

test_that("read_lines(skip_empty_rows) works if there are double quotes in the lines (#991)", {
data <-
"a\"b
cde
f\"g
hij"

expect_equal(
read_lines(data, skip = 1),
c("cde",
"f\"g",
"hij")
)
})

# These tests are slow so are commented out
#test_that("long vectors are supported", {
#tmp <- tempfile(fileext = ".gz")
Expand Down

0 comments on commit e1f86e5

Please sign in to comment.