Rdatatable · ColeMiller1 · Feb 18, 2020 · Feb 18, 2020 · Feb 18, 2020 · Feb 18, 2020
@@ -922,54 +922,10 @@ replace_dot_alias = function(e) {
  # all duplicate columns must be matched, because nothing is provided
  ansvals = chmatchdup(ansvars, names_x)
  } else {
- # FR #355 - negative numeric and character indices for SDcols
  colsub = substitute(.SDcols)
- # fix for R-Forge #5190. colsub[[1L]] gave error when it's a symbol.
- if (colsub %iscall% c("!", "-")) {
- negate_sdcols = TRUE
- colsub = colsub[[2L]]
- } else negate_sdcols = FALSE
- # fix for #1216, make sure the parentheses are peeled from expr of the form (((1:4)))
- while(colsub %iscall% "(") colsub = as.list(colsub)[[-1L]]
- if (colsub %iscall% ':' && length(colsub)==3L) {
- # .SDcols is of the format a:b
- .SDcols = eval(colsub, setattr(as.list(seq_along(x)), 'names', names_x), parent.frame())
- } else {
- if (colsub %iscall% 'patterns') {
- # each pattern gives a new filter condition, intersect the end result
- .SDcols = Reduce(intersect, do_patterns(colsub, names_x))
- } else {
- .SDcols = eval(colsub, parent.frame(), parent.frame())
- # allow filtering via function in .SDcols, #3950
- if (is.function(.SDcols)) {
- .SDcols = lapply(x, .SDcols)
- if (any(idx <- vapply_1i(.SDcols, length) > 1L | vapply_1c(.SDcols, typeof) != 'logical' | vapply_1b(.SDcols, anyNA)))
- stop("When .SDcols is a function, it is applied to each column; the output of this function must be a non-missing boolean scalar signalling inclusion/exclusion of the column. However, these conditions were not met for: ", brackify(names(x)[idx]))
- .SDcols = unlist(.SDcols, use.names = FALSE)
- }
- }
- }
- if (anyNA(.SDcols))
- stop(".SDcols missing at the following indices: ", brackify(which(is.na(.SDcols))))
- if (is.logical(.SDcols)) {
- ansvals = which_(rep(.SDcols, length.out=length(x)), !negate_sdcols)
- ansvars = sdvars = names_x[ansvals]
- } else if (is.numeric(.SDcols)) {
- .SDcols = as.integer(.SDcols)
- # if .SDcols is numeric, use 'dupdiff' instead of 'setdiff'
- if (length(unique(sign(.SDcols))) > 1L) stop(".SDcols is numeric but has both +ve and -ve indices")
- if (any(idx <- abs(.SDcols)>ncol(x) | abs(.SDcols)<1L))
- stop(".SDcols is numeric but out of bounds [1, ", ncol(x), "] at: ", brackify(which(idx)))
- ansvars = sdvars = if (negate_sdcols) dupdiff(names_x[-.SDcols], bynames) else names_x[.SDcols]
- ansvals = if (negate_sdcols) setdiff(seq_along(names(x)), c(.SDcols, which(names(x) %chin% bynames))) else .SDcols
- } else {
- if (!is.character(.SDcols)) stop(".SDcols should be column numbers or names")
- if (!all(idx <- .SDcols %chin% names_x))
- stop("Some items of .SDcols are not column names: ", brackify(.SDcols[!idx]))
- ansvars = sdvars = if (negate_sdcols) setdiff(names_x, c(.SDcols, bynames)) else .SDcols
- # dups = FALSE here. DT[, .SD, .SDcols=c("x", "x")] again doesn't really help with which 'x' to keep (and if '-' which x to remove)
- ansvals = chmatch(ansvars, names_x)
- }
+ ansvals = col_helper(x, colsub, ".SDcols")
+ if (attr(ansvals, "negate")) ansvals = setdiff(ansvals, which(names_x %chin% bynames))
+ ansvars = sdvars = names_x[ansvals]
  }
  # fix for long standing FR/bug, #495 and #484
  allcols = c(names_x, xdotprefix, names_i, idotprefix)

@@ -140,3 +140,97 @@ edit.data.table = function(name, ...) {
  setDT(NextMethod('edit', name))[]
 }
 # nocov end
+
+col_helper = function(x, colsub, mode = NA_character_) {
+  ## Resolve an unevaluated column-selection expression into column positions of x.
+  ## helper that could replace some common code used for by, .SDcols, j, duplicated, unique, dcast,
+  ## or anything that looks for columns
+
+  #' @param x required; a data.frame, data.table, tibble, or list.
+  #' @param colsub unevaluated expression (e.g. the result of substitute(.SDcols)). Takes the
+  #'   same arguments as .SDcols such as `patterns()`, `var1:var2`, a variable, a vector of
+  #'   characters, numbers or logicals, or a function that is applied to each column.
+  #'   A leading `!` or unary `-` negates the selection. A logical vector whose length differs
+  #'   from length(x) is recycled to length(x) with a warning.
+  #' @param mode takes character vector of length 1. It is interpolated into the error and
+  #'   warning messages to help the end-user see which argument caused the problem.
+  #' @return Vector of column positions with a boolean attribute, `"negate"`, which is TRUE
+  #'   when the selection was negated with `-` or `!`.
+  #' @note colsub is evaluated in parent.frame(2L), so col_helper has to be called directly by
+  #'   the function whose caller owns the variables referenced in colsub.
+
+  # see @jangorecki #4174 for how to get the vast majority of this into C
+
+  x_len = length(x)
+  origsub = colsub
+  # Frame in which user-supplied expressions are evaluated. It is captured once here because
+  # parent.frame(2L) resolves to a different frame when called from inside a nested closure,
+  # such as the lapply() callback used for patterns() below.
+  caller_env = parent.frame(2L)
+
+  # FR #4979 - negative numeric and character indices for SDcols
+  # fix for #5190. colsub[[1L]] gave error when it's a symbol.
+  # Only a unary minus (call of length 2) is a negation; a binary `a - b` is evaluated as usual.
+  negate_cols = is.call(colsub) && (colsub[[1L]] == "!" || (colsub[[1L]] == "-" && length(colsub) == 2L))
+  if (negate_cols) colsub = colsub[[2L]]
+
+  if (is.call(colsub)) {
+    # fix for #1216, make sure the parentheses are peeled from expr of the form (((1:4)))
+    if (colsub[[1L]] == "(") {
+      colsub = as.list(colsub)[[-1L]]
+      while (length(colsub) > 1L && colsub[[1L]] == "(") colsub = colsub[[-1L]]
+      # give users a second chance with negation for (-1)
+      if (length(colsub) == 2L && colsub[[1L]] == "-" && is.numeric(colsub[[2L]])) {
+        negate_cols = TRUE ##what about -(-1)???
+        colsub = colsub[[2L]]
+      }
+    }
+    if (length(colsub) == 3L && colsub[[1L]] == ":") {
+      if (is.name(colsub[[2L]]) && is.name(colsub[[3L]])) {
+        # cols is of the format a:c; both endpoints are looked up as column names of x
+        rnge = chmatch(c(as.character(colsub[[2L]]), as.character(colsub[[3L]])), names(x))
+        if (anyNA(rnge)) stop(gettextf("Not all columns were found in the %s column select statement: [%s]", mode, deparse(origsub), domain = "R-data.table"))
+        cols = rnge[1L]:rnge[2L]
+      } else if (is.character(colsub[[2L]]) || is.character(colsub[[3L]])) {
+        stop(gettextf("When selecting a range of columns in the var_1:var_2 format, quotation marks should not be used. Instead of %s try %s:%s", deparse(origsub), deparse(as.name(colsub[[2L]])), deparse(as.name(colsub[[3L]])), domain = "R-data.table"))
+      } else {
+        # cols is of the format 1:3 or would also allow min(var):max(var)
+        cols = eval(colsub, caller_env, caller_env)
+      }
+    } else if (length(colsub) > 1L && colsub[[1L]] == "patterns") {
+      # each pattern gives a new filter condition, intersect the end result
+      x_names = names(x)
+      # every pattern argument is evaluated in the caller's frame, not in this function's frame
+      cols = Reduce(intersect, lapply(as.list(colsub)[-1L], function(col) grep(eval(col, caller_env), x_names)))
+    } else {
+      cols = eval(colsub, caller_env, caller_env)
+    }
+  } else {
+    cols = eval(colsub, caller_env, caller_env)
+  }
+
+  # allow filtering via function in .SDcols, #3950
+  if (is.function(cols)) {
+    cols = lapply(x, cols)
+    if (any(idx <- vapply_1i(cols, length) > 1L | vapply_1c(cols, typeof) != 'logical' | vapply_1b(cols, anyNA)))
+      stop(gettextf("When %s is a function, it is applied to each column; the output of this function must be a non-missing boolean scalar signalling inclusion/exclusion of the column. However, these conditions were not met for: %s", mode, brackify(names(x)[idx]), domain = "R-data.table"))
+    cols = unlist(cols, use.names = FALSE)
+  }
+
+  if (anyNA(cols))
+    stop(gettextf("%s missing at the following indices: %s", mode, brackify(which(is.na(cols))), domain = "R-data.table"))
+
+  if (is.logical(cols)) {
+    if ((col_len <- length(cols)) != x_len) {
+      ## TODO change to error in 2022
+      warning(gettextf("When %s is a logical vector, each column should have a TRUE or FALSE entry. The current logical vector of length %d will be repeated to length of data.table. Warning will change to error in the next version.", mode, col_len, domain = "R-data.table"))
+      cols = rep(cols, length.out = x_len)
+    }
+    ansvals = which_(cols, !negate_cols)
+  } else if (is.numeric(cols)) {
+    cols = as.integer(cols)
+    if (any(idx <- abs(cols)>x_len | cols == 0L))
+      stop(gettextf("%s is numeric but out of bounds [1, %d] at: %s", mode, x_len, brackify(which(idx)), domain = "R-data.table"))
+    if (length(unique(sign(cols))) > 1L) stop(gettextf("%s is numeric but has both +ve and -ve indices", mode, domain = "R-data.table"))
+    ansvals = if (negate_cols) setdiff(seq_len(x_len), cols) else cols
+  } else {
+    if (!is.character(cols)) stop(gettextf("%s should be column numbers or names", mode, domain = "R-data.table"))
+    x_names = names(x)
+    if (!all(idx <- cols %chin% x_names))
+      stop(gettextf("Some items of %s are not column names: %s", mode, brackify(cols[!idx]), domain = "R-data.table"))
+    ansvars = if (negate_cols) setdiff(x_names, cols) else cols
+    ansvals = chmatch(ansvars, x_names)
+  }
+  # callers read this attribute to decide on further exclusions for negated selections
+  attr(ansvals, "negate") = negate_cols
+  return(ansvals)
+}
@@ -6968,8 +6968,8 @@ test(1497, DT[, .SD, .SDcols = !c("a", "c")], DT[, !c("a", "c"), with=FALSE])
 
 # Fix for #1060
 DT = data.table(x=1, y=2, z=3, a=4, b=5, c=6)
-test(1498.1, DT[, .SD, .SDcols=c(TRUE,FALSE)], DT[, c("x", "z", "b"), with=FALSE])
-test(1498.2, DT[, .SD, .SDcols=!c(TRUE,FALSE)], DT[, !c("x", "z", "b"), with=FALSE])
+# the length-2 logical is shorter than the 6 columns of DT: it is still recycled, but a warning is now expected
+test(1498.1, DT[, .SD, .SDcols=c(TRUE,FALSE)], DT[, c("x", "z", "b"), with=FALSE], warning = "When .SDcols is a logical vector, each column should have a TRUE or FALSE entry.")
+test(1498.2, DT[, .SD, .SDcols=!c(TRUE,FALSE)], DT[, !c("x", "z", "b"), with=FALSE], warning = "When .SDcols is a logical vector, each column should have a TRUE or FALSE entry.")
 
 # Fix for #1072
 dt <- data.table(group1 = "a", group2 = "z", value = 1)
@@ -16846,3 +16846,16 @@ A = data.table(A=c(complex(real = 1:3, imaginary=c(0, -1, 1)), NaN))
 test(2138.3, rbind(A,B), data.table(A=c(as.character(A$A), B$A)))
 A = data.table(A=as.complex(rep(NA, 5)))
 test(2138.4, rbind(A,B), data.table(A=c(as.character(A$A), B$A)))
+
+# Test additional .SDcols functionality
+dt = as.data.table(lapply(1:5, c))
+# a binary minus and a bare symbol are evaluated as ordinary expressions; both are expected to fail with "object not found"
+test(2139.1, dt[, .SD, .SDcols = V2 - V1], error = "object 'V2' not found")
+test(2139.2, dt[, .SD, .SDcols = V1], error = "object 'V1' not found")
+# a negative index wrapped in parentheses still negates the selection
+test(2139.3, dt[, .SD, .SDcols = (-1L)], dt[, .(V2, V3, V4, V5)])
+# once V1 exists as a variable, its value ("V2") is used as the column name, with or without parentheses
+V1 = "V2"
+test(2139.4, dt[, .SD, .SDcols = V1], dt[, .(V2)])
+test(2139.5, dt[, .SD, .SDcols = (V1)], dt[, .(V2)])
+# var_1:var_2 ranges: a quoted endpoint and an unknown column name are both rejected with an error
+test(2139.6, dt[, .SD, .SDcols = "V1":V1], error = "When selecting a range of columns in the var_1:var_2 format, quotation marks should not be used. Instead of \"V1\":V1 try V1:V1")
+test(2139.7, dt[, .SD, .SDcols = V1:V6], error = "Not all columns were found in the .SDcols column select statement: [V1:V6]")
+# a logical vector shorter than the number of columns is recycled, with a warning
+test(2139.8, dt[, .SD, .SDcols = c(TRUE, FALSE)], dt[, .(V1, V3, V5)], warning = "When .SDcols is a logical vector, each column should have a TRUE or FALSE entry.")
+# remove the helper variable defined for 2139.4 and 2139.5
+rm(V1)