Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Column Select Helper #4248

Closed
wants to merge 10 commits into from
50 changes: 3 additions & 47 deletions R/data.table.R
Original file line number Diff line number Diff line change
Expand Up @@ -922,54 +922,10 @@ replace_dot_alias = function(e) {
# all duplicate columns must be matched, because nothing is provided
ansvals = chmatchdup(ansvars, names_x)
} else {
# FR #355 - negative numeric and character indices for SDcols
colsub = substitute(.SDcols)
# fix for R-Forge #5190. colsub[[1L]] gave error when it's a symbol.
if (colsub %iscall% c("!", "-")) {
negate_sdcols = TRUE
colsub = colsub[[2L]]
} else negate_sdcols = FALSE
# fix for #1216, make sure the parentheses are peeled from expr of the form (((1:4)))
while(colsub %iscall% "(") colsub = as.list(colsub)[[-1L]]
if (colsub %iscall% ':' && length(colsub)==3L) {
# .SDcols is of the format a:b
.SDcols = eval(colsub, setattr(as.list(seq_along(x)), 'names', names_x), parent.frame())
} else {
if (colsub %iscall% 'patterns') {
# each pattern gives a new filter condition, intersect the end result
.SDcols = Reduce(intersect, do_patterns(colsub, names_x))
} else {
.SDcols = eval(colsub, parent.frame(), parent.frame())
# allow filtering via function in .SDcols, #3950
if (is.function(.SDcols)) {
.SDcols = lapply(x, .SDcols)
if (any(idx <- vapply_1i(.SDcols, length) > 1L | vapply_1c(.SDcols, typeof) != 'logical' | vapply_1b(.SDcols, anyNA)))
stop("When .SDcols is a function, it is applied to each column; the output of this function must be a non-missing boolean scalar signalling inclusion/exclusion of the column. However, these conditions were not met for: ", brackify(names(x)[idx]))
.SDcols = unlist(.SDcols, use.names = FALSE)
}
}
}
if (anyNA(.SDcols))
stop(".SDcols missing at the following indices: ", brackify(which(is.na(.SDcols))))
if (is.logical(.SDcols)) {
ansvals = which_(rep(.SDcols, length.out=length(x)), !negate_sdcols)
ansvars = sdvars = names_x[ansvals]
} else if (is.numeric(.SDcols)) {
.SDcols = as.integer(.SDcols)
# if .SDcols is numeric, use 'dupdiff' instead of 'setdiff'
if (length(unique(sign(.SDcols))) > 1L) stop(".SDcols is numeric but has both +ve and -ve indices")
if (any(idx <- abs(.SDcols)>ncol(x) | abs(.SDcols)<1L))
stop(".SDcols is numeric but out of bounds [1, ", ncol(x), "] at: ", brackify(which(idx)))
ansvars = sdvars = if (negate_sdcols) dupdiff(names_x[-.SDcols], bynames) else names_x[.SDcols]
ansvals = if (negate_sdcols) setdiff(seq_along(names(x)), c(.SDcols, which(names(x) %chin% bynames))) else .SDcols
} else {
if (!is.character(.SDcols)) stop(".SDcols should be column numbers or names")
if (!all(idx <- .SDcols %chin% names_x))
stop("Some items of .SDcols are not column names: ", brackify(.SDcols[!idx]))
ansvars = sdvars = if (negate_sdcols) setdiff(names_x, c(.SDcols, bynames)) else .SDcols
# dups = FALSE here. DT[, .SD, .SDcols=c("x", "x")] again doesn't really help with which 'x' to keep (and if '-' which x to remove)
ansvals = chmatch(ansvars, names_x)
}
ansvals = col_helper(x, colsub, ".SDcols")
if (attr(ansvals, "negate")) ansvals = setdiff(ansvals, which(names_x %chin% bynames))
ansvars = sdvars = names_x[ansvals]
}
# fix for long standing FR/bug, #495 and #484
allcols = c(names_x, xdotprefix, names_i, idotprefix)
Expand Down
94 changes: 94 additions & 0 deletions R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -140,3 +140,97 @@ edit.data.table = function(name, ...) {
setDT(NextMethod('edit', name))[]
}
# nocov end

col_helper = function(x, colsub, mode = NA_character_) {
## helper that could replace some common code used for by, .SDcols, j, duplicated, unique, dcast,
## or anything that looks for columns

#' @param x required; a data.frame, data.table, tibble, or list.
#' @param colsub takes the same arguments as .SDcols such as
#' `patterns()`, `var1:var2`, a variable, or a vector of characters.
#' When TRUE, will return seq_len(length(x))
#' @param mode takes character vector of length 1. In the short-term
#' there will be different modes based on the call. In the long-term, it's just
#' to help tell the end-user which argument caused the error
#' @return Integer vector with a boolean attribute, `"negate"` which is TRUE when
#' `colsub[[1L]]` is `-` or `!`.

# see @jangorecki #4174 for how to get the vast majority of this into C

x_len = length(x)
origsub = colsub

# FR #4979 - negative numeric and character indices for SDcols
# fix for #5190. colsub[[1L]] gave error when it's a symbol.
negate_cols = is.call(colsub) && (colsub[[1L]] == "!" || (colsub[[1L]] == "-" && length(colsub) == 2L))
if (negate_cols) colsub = colsub[[2L]]
ColeMiller1 marked this conversation as resolved.
Show resolved Hide resolved

if (is.call(colsub)){
# fix for #1216, make sure the parentheses are peeled from expr of the form (((1:4)))
if (colsub[[1L]] == "(") {
colsub = as.list(colsub)[[-1L]]
while(length(colsub) > 1L && colsub[[1L]] == "(") colsub = colsub[[-1L]]
# give users a second chance with negation for (-1)
if (length(colsub) == 2L && colsub[[1L]] == "-" && is.numeric(colsub[[2L]])) {
negate_cols = TRUE ##what about -(-1)???
colsub = colsub[[2L]]
}
}
if (length(colsub) == 3L && colsub[[1L]] == ":") {
ColeMiller1 marked this conversation as resolved.
Show resolved Hide resolved
if (is.name(colsub[[2L]]) && is.name(colsub[[3L]])){
# cols is of the format a:c
rnge = chmatch(c(as.character(colsub[[2L]]), as.character(colsub[[3L]])), names(x))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what about !is.name(colsub[[3L]])?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

exactly, some recent commit in my branch was addressing cases like var1:5 or 1:var1

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Now both ends of the range need to be names. If either is a character, a new error is raised. Otherwise, it evaluates in the parent frame. I largely followed Jan's work so now V2 - V1 errors as well.

if (anyNA(rnge)) stop(gettextf("Not all columns were found in the %s column select statement: [%s]", mode, deparse(origsub), domain = "R-data.table"))
cols = rnge[1L]:rnge[2L]
} else if (is.character(colsub[[2L]]) || is.character(colsub[[3L]])){
stop(gettextf("When selecting a range of columns in the var_1:var_2 format, quotation marks should not be used. Instead of %s try %s:%s", deparse(origsub), deparse(as.name(colsub[[2L]])), deparse(as.name(colsub[[3L]])), domain = "R-data.table"))
} else {
# cols is of the format 1:3 or would also allow min(var):max(var)
cols = eval(colsub, parent.frame(2L), parent.frame(2L))
}
} else if (length(colsub) > 1 && colsub[[1L]] == "patterns") {
# each pattern gives a new filter condition, intersect the end result
x_names = names(x)
cols = Reduce(intersect, lapply(as.list(colsub)[-1L], function(col) grep(eval(col, parent.frame(2L)), x_names)))
} else {
cols = eval(colsub, parent.frame(2L), parent.frame(2L))
}
} else {
cols = eval(colsub, parent.frame(2L), parent.frame(2L))
}

# allow filtering via function in .SDcols, #3950
if (is.function(cols)) {
cols = lapply(x, cols)
if (any(idx <- vapply_1i(cols, length) > 1L | vapply_1c(cols, typeof) != 'logical' | vapply_1b(cols, anyNA)))
stop(gettextf("When %s is a function, it is applied to each column; the output of this function must be a non-missing boolean scalar signalling inclusion/exclusion of the column. However, these conditions were not met for: %s", mode, brackify(names(x)[idx]), domain = "R-data.table"))
cols = unlist(cols, use.names = FALSE)
}

if (anyNA(cols))
stop(gettextf("%s missing at the following indices: %s", mode, brackify(which(is.na(cols)))))

if (is.logical(cols)) {
if ((col_len <- length(cols)) != x_len) {
## TODO change to error in 2022
warning(gettextf("When %s is a logical vector, each column should have a TRUE or FALSE entry. The current logical vector of length %d will be repeated to length of data.table. Warning will change to error in the next version.", mode, col_len, domain = "R-data.table"))
cols = rep(cols, length.out = x_len)
}
ansvals = which_(cols, !negate_cols)
} else if (is.numeric(cols)) {
cols = as.integer(cols)
if (any(idx <- abs(cols)>x_len | cols == 0L))
stop(gettextf("%s is numeric but out of bounds [1, %d] at: %s", mode, x_len, brackify(which(idx)), domain = "R-data.table"))
if (length(unique(sign(cols))) > 1L) stop(gettextf("%s is numeric but has both +ve and -ve indices", mode, domain = "R-data.table"))
ansvals = if (negate_cols) setdiff(seq_len(x_len), cols) else cols
} else {
if (!is.character(cols)) stop(gettextf("%s should be column numbers or names", mode, domain = "R-data.table"))
x_names = names(x)
if (!all(idx <- cols %chin% x_names))
stop(gettextf("Some items of %s are not column names: %s", mode, brackify(cols[!idx]), domain = "R-data.table"))
ansvars = if (negate_cols) setdiff(x_names, cols) else cols
ansvals = chmatch(ansvars, x_names)
}
attr(ansvals, "negate") = negate_cols
return(ansvals)
}
17 changes: 15 additions & 2 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -6968,8 +6968,8 @@ test(1497, DT[, .SD, .SDcols = !c("a", "c")], DT[, !c("a", "c"), with=FALSE])

# Fix for #1060
DT = data.table(x=1, y=2, z=3, a=4, b=5, c=6)
test(1498.1, DT[, .SD, .SDcols=c(TRUE,FALSE)], DT[, c("x", "z", "b"), with=FALSE])
test(1498.2, DT[, .SD, .SDcols=!c(TRUE,FALSE)], DT[, !c("x", "z", "b"), with=FALSE])
test(1498.1, DT[, .SD, .SDcols=c(TRUE,FALSE)], DT[, c("x", "z", "b"), with=FALSE], warning = "When .SDcols is a logical vector, each column should have a TRUE or FALSE entry.")
test(1498.2, DT[, .SD, .SDcols=!c(TRUE,FALSE)], DT[, !c("x", "z", "b"), with=FALSE], warning = "When .SDcols is a logical vector, each column should have a TRUE or FALSE entry.")

# Fix for #1072
dt <- data.table(group1 = "a", group2 = "z", value = 1)
Expand Down Expand Up @@ -16846,3 +16846,16 @@ A = data.table(A=c(complex(real = 1:3, imaginary=c(0, -1, 1)), NaN))
test(2138.3, rbind(A,B), data.table(A=c(as.character(A$A), B$A)))
A = data.table(A=as.complex(rep(NA, 5)))
test(2138.4, rbind(A,B), data.table(A=c(as.character(A$A), B$A)))

# Test additional .SDcols functionality
dt = as.data.table(lapply(1:5, c))
test(2139.1, dt[, .SD, .SDcols = V2 - V1], error = "object 'V2' not found")
test(2139.2, dt[, .SD, .SDcols = V1], error = "object 'V1' not found")
test(2139.3, dt[, .SD, .SDcols = (-1L)], dt[, .(V2, V3, V4, V5)])
V1 = "V2"
test(2139.4, dt[, .SD, .SDcols = V1], dt[, .(V2)])
test(2139.5, dt[, .SD, .SDcols = (V1)], dt[, .(V2)])
test(2139.6, dt[, .SD, .SDcols = "V1":V1], error = "When selecting a range of columns in the var_1:var_2 format, quotation marks should not be used. Instead of \"V1\":V1 try V1:V1")
test(2139.7, dt[, .SD, .SDcols = V1:V6], error = "Not all columns were found in the .SDcols column select statement: [V1:V6]")
test(2139.8, dt[, .SD, .SDcols = c(TRUE, FALSE)], dt[, .(V1, V3, V5)], warning = "When .SDcols is a logical vector, each column should have a TRUE or FALSE entry.")
rm(V1)