allow cross join in `[.data.table` #1717

jangorecki · 2016-05-20T12:34:22Z

There is not straight way to do cross join currently. User need to add a column with a constant value in both datasets and join on that column. This could be nicely addressed in a similar way as it is in SQL where you can use SELECT ... FROM t1 JOIN t2 ON 1=1, where 1=1 is used to evaluate to TRUE for every row.
Eventually allow.cartesian could be set to TRUE when on=TRUE detected so the use case would look like:

X[Y, on=TRUE]

And corresponding SQLite

library(RSQLite)
library(data.table)
X=data.table(a=1:2)
Y=data.table(b=letters[1:2])
conn=dbConnect(SQLite())
dbWriteTable(conn, "t1", X)
dbWriteTable(conn, "t2", Y)
dbGetQuery(conn, "SELECT * FROM t1 JOIN t2 ON 1=1;")
#  a b
#1 1 a
#2 1 b
#3 2 a
#4 2 b

Update https://stackoverflow.com/questions/25888706/r-data-table-cross-join-not-working when solved.

The text was updated successfully, but these errors were encountered:

ToeKneeFan · 2019-07-31T19:42:22Z

@MichaelChirico That's a nice, intuitive solution. It's worth cautioning, though, that it will result in duplicate column names if any are shared between the data.tables. Based on some (very nonexhaustive, probably memory-dependent) benchmarking, your solution (CJ1) is actually slightly faster than jangorecki's solution in the StackOverflow response (CJ2, modified to be more comparable) but has similar performance if the temporary joining column is assigned with :=. Of course, this is without manually taking into account keying and indexing behavior.

library(data.table)
DT1 <- data.table(V1 = rnorm(8E3))
DT2 <- data.table(V1 = rnorm(8E3))

# joining by grouping
CJ1 <- function(DT1, DT2){
	DT1[ , c(.SD, DT2), by = seq_len(nrow(DT1))]
}

# joining by allow.cartesian with temporary column
CJ2 <- function(DT1, DT2){
	DT1[, c(.SD, .tempCol = 1)][DT2[, c(.SD, .tempCol = 1)], on = ".tempCol", , allow.cartesian = T]
}

# joining by allow.cartesian with assigned column
CJ3 <- function(DT1, DT2){
	DT1[, .tempCol := 1]
	DT2[, .tempCol := 1]
	DT1[DT2, on = ".tempCol", , allow.cartesian = T]
}

join1 <- CJ1(DT1, DT2)
print(join1)
#           seq_len         V1          V1
#        1:       1 -0.6752308 -1.19897585
#        2:       1 -0.6752308 -2.14232417
#        3:       1 -0.6752308 -0.01960594
#        4:       1 -0.6752308  0.92697292
#        5:       1 -0.6752308  1.26429973
#       ---                               
# 63999996:    8000  1.1786346 -0.44214904
# 63999997:    8000  1.1786346  0.52729396
# 63999998:    8000  1.1786346  0.60022166
# 63999999:    8000  1.1786346  0.30667155
# 64000000:    8000  1.1786346 -0.77470274

join2 <- CJ2(DT1, DT2)
print(join2)
#                    V1 .tempCol       i.V1
#        1: -0.67523085        1 -1.1989758
#        2: -0.92701043        1 -1.1989758
#        3: -0.99014335        1 -1.1989758
#        4:  0.53156215        1 -1.1989758
#        5: -0.15562568        1 -1.1989758
#       ---                                
# 63999996:  0.39169500        1 -0.7747027
# 63999997:  1.94071323        1 -0.7747027
# 63999998: -0.08114057        1 -0.7747027
# 63999999: -0.31676249        1 -0.7747027
# 64000000:  1.17863463        1 -0.7747027

join3 <- CJ3(DT1, DT2)
print(join3)
#                    V1 .tempCol       i.V1
#        1: -0.67523085        1 -1.1989758
#        2: -0.92701043        1 -1.1989758
#        3: -0.99014335        1 -1.1989758
#        4:  0.53156215        1 -1.1989758
#        5: -0.15562568        1 -1.1989758
#       ---                                
# 63999996:  0.39169500        1 -0.7747027
# 63999997:  1.94071323        1 -0.7747027
# 63999998: -0.08114057        1 -0.7747027
# 63999999: -0.31676249        1 -0.7747027
# 64000000:  1.17863463        1 -0.7747027

# check results are the same
setnames(join1, which(names(join1) == "V1"), c("V1", "i.V1"))
join1 <- join1[, "seq_len" := NULL]
join2 <- join2[, ".tempCol" := NULL]
join3 <- join3[, ".tempCol" := NULL]
setorderv(join1, c("V1", "i.V1"))
setorderv(join2, c("V1", "i.V1"))
setorderv(join3, c("V1", "i.V1"))
identical(join1, join2)
# [1] TRUE
identical(join2, join3)
# [1] TRUE

# benchmark
microbenchmark::microbenchmark(
	CJ1(DT1, DT2),
	CJ2(DT1, DT2),
	CJ3(DT1, DT2),
	times = 5
)

# Unit: seconds
#           expr      min       lq     mean   median       uq      max neval cld
#  CJ1(DT1, DT2) 2.486950 2.638036 2.767990 2.822850 2.842497 3.049616     5   a
#  CJ2(DT1, DT2) 1.743497 3.531119 3.751018 3.943120 4.090093 5.447262     5   a
#  CJ3(DT1, DT2) 1.848768 2.728135 2.737173 2.822515 3.094846 3.191602     5   a

chinsoon12 · 2019-10-24T05:58:23Z

Another option:

CJ.dt_2 <- function(...) {
    DTls <- list(...)
    rows <- do.call(CJ, lapply(DTls, function(x) x[, seq_len(.N)]))
    res <- DTls[[1L]][rows[[1L]]]
    for (n in seq_along(DTls)[-1L])
        res <- res[, c(.SD, DTls[[n]][rows[[n]]])]
    res
}

timing code:

library(data.table) #data.table_1.12.4 Win-4 R-3.6.1 x64 4 threads
nr <- 3e2
DT1 <- data.table(A1=1:nr, A1=1:nr)
DT2 <- data.table(B1=1:nr, B2=1:nr)
DT3 <- data.table(C1=1:nr, C2=1:nr)

#https:/Rdatatable/data.table/pull/814#issuecomment-55807497
CJ.dt = function(...) {
    rows = do.call(CJ, lapply(list(...), function(x) if(is.data.frame(x)) seq_len(nrow(x)) else seq_along(x)));
    do.call(data.table, Map(function(x, y) x[y], list(...), rows))
}

#https:/Rdatatable/data.table/issues/1717#issuecomment-355499952
CJ.dt_1 <- function(...) {
    Reduce(f=function(x, y) cbind(x[rep(1:nrow(x), times=nrow(y)),], y[rep(1:nrow(y), each=nrow(x)),]),
        x=list(...))
} #CJ.dft

CJ.dt_2 <- function(...) {
    DTls <- list(...)
    rows <- do.call(CJ, lapply(DTls, function(x) x[, seq_len(.N)]))
    res <- DTls[[1L]][rows[[1L]]]
    for (n in seq_along(DTls)[-1L])
        res <- res[, c(.SD, DTls[[n]][rows[[n]]])]
    res
}

#https:/Rdatatable/data.table/issues/2343#issuecomment-328156867
CJDT <- function(...)
    Reduce(function(DT1, DT2) cbind(DT1, DT2[rep(1:.N, each=nrow(DT1))]), list(...))

a1 <- CJ.dt(DT1, DT2, DT3)
setorderv(a1, names(a1))
a2 <- CJ.dt_1(DT1, DT2, DT3)
setorderv(a2, names(a2))
a3 <- CJ.dt_2(DT1, DT2, DT3)
setorderv(a3, names(a3))
a4 <- CJDT(DT1, DT2, DT3)
setorderv(a4, names(a4))
identical(a1, a2)
identical(a1, a3)
identical(a1, a4)

bench::mark(CJ.dt(DT1, DT2, DT3),
    CJ.dt_1(DT1, DT2, DT3),
    CJ.dt_2(DT1, DT2, DT3),
    CJDT(DT1, DT2, DT3),
    check=FALSE)

timings:


# A tibble: 4 x 13
  expression                  min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result                    memory                time     gc              
  <bch:expr>             <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm> <list>                    <list>                <list>   <list>          
1 CJ.dt(DT1, DT2, DT3)      1.52m    1.52m    0.0109    1.61GB   0.0328     1     3      1.52m <df[,6] [27,000,000 x 6]> <df[,3] [341 x 3]>    <bch:tm> <tibble [1 x 3]>
2 CJ.dt_1(DT1, DT2, DT3)    1.53s    1.53s    0.653     1.41GB   1.31       1     2      1.53s <df[,6] [27,000,000 x 6]> <df[,3] [1,467 x 3]>  <bch:tm> <tibble [1 x 3]>
3 CJ.dt_2(DT1, DT2, DT3)    1.27s    1.27s    0.787     2.35GB   0.787      1     1      1.27s <df[,6] [27,000,000 x 6]> <df[,3] [23,236 x 3]> <bch:tm> <tibble [1 x 3]>
4 CJDT(DT1, DT2, DT3)       1.07s    1.07s    0.930   929.44MB   0.930      1     1      1.07s <df[,6] [27,000,000 x 6]> <df[,3] [28 x 3]>     <bch:tm> <tibble [1 x 3]>

Another test:

DT_A <- data.table(A=1:8e3)
DT_B <- data.table(B=1:8e3)
bench::mark(CJ.dt(DT_A, DT_B), 
    CJ.dt_1(DT_A, DT_B), CJ.dt_2(DT_A, DT_B), CJDT(DT_A, DT_B), check=FALSE)

# A tibble: 4 x 13
  expression               min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result                    memory             time     gc              
  <bch:expr>          <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm> <list>                    <list>             <list>   <list>          
1 CJ.dt(DT_A, DT_B)      1.32m    1.32m    0.0126    1.53GB   0.0126     1     1      1.32m <df[,2] [64,000,000 x 2]> <df[,3] [217 x 3]> <bch:tm> <tibble [1 x 3]>
2 CJ.dt_1(DT_A, DT_B)    2.77s    2.77s    0.362     1.43GB   0.362      1     1      2.77s <df[,2] [64,000,000 x 2]> <df[,3] [19 x 3]>  <bch:tm> <tibble [1 x 3]>
3 CJ.dt_2(DT_A, DT_B) 919.28ms 919.28ms    1.09      1.67GB   0          1     0   919.28ms <df[,2] [64,000,000 x 2]> <df[,3] [38 x 3]>  <bch:tm> <tibble [1 x 3]>
4 CJDT(DT_A, DT_B)       2.24s    2.24s    0.446   976.63MB   0.446      1     1      2.24s <df[,2] [64,000,000 x 2]> <df[,3] [19 x 3]>  <bch:tm> <tibble [1 x 3]>

jangorecki · 2020-05-10T01:04:13Z

This is a low-overhead version built on top of #4370, not really tested

crossjoin = function(x, i) {
  stopifnot(is.data.table(x), is.data.table(i))
  ni = nrow(i)
  nx = nrow(x)
  ## bmerge ans for cross join
  ans = list(starts = rep.int(1L, ni), lens = rep.int(nx, ni))
  ## dtmerge ans for cross join
  ans = list(xrows = vecseq(ans$starts, ans$lens, NULL), irows = seqexp(ans$lens))
  out.i = .Call(CsubsetDT, i, ans$irows, colnamesInt(i, NULL))
  out.x = .Call(CsubsetDT, x, ans$xrows, colnamesInt(x, NULL))
  out = .Call(Ccbindlist, list(out.i, out.x), FALSE)
  setDT(out)
  out
}

jangorecki · 2020-05-10T09:32:56Z

@chinsoon12 benchmark you presented suffers badly in terms of readability

  expression                  min
  <bch:expr>             <bch:tm>       
1 CJ.dt(DT1, DT2, DT3)      1.52m
2 CJ.dt_1(DT1, DT2, DT3)    1.53s
3 CJ.dt_2(DT1, DT2, DT3)    1.27s
4 CJDT(DT1, DT2, DT3)       1.07s

how many people will notice that first expression has timing in minutes while all remaning ones in seconds?

jangorecki · 2020-05-10T09:55:31Z

Are there practical uses cases for a cross join on more than 2 tables?
Above crossjoin could be tweaked to apply Reduce when producing irows and xrows, then call CsubsetDT just single time for i and x, and not for every pair of tables.

I re-run benchmark

library(data.table)
nr <- 3e2
DT1 <- data.table(A1=1:nr, A1=1:nr)
DT2 <- data.table(B1=1:nr, B2=1:nr)
DT3 <- data.table(C1=1:nr, C2=1:nr)

#https:/Rdatatable/data.table/pull/814#issuecomment-55807497
CJ.dt = function(...) {
    rows = do.call(CJ, lapply(list(...), function(x) if(is.data.frame(x)) seq_len(nrow(x)) else seq_along(x)));
    do.call(data.table, Map(function(x, y) x[y], list(...), rows))
}

#https:/Rdatatable/data.table/issues/1717#issuecomment-355499952
CJ.dt_1 <- function(...) {
    Reduce(f=function(x, y) cbind(x[rep(1:nrow(x), times=nrow(y)),], y[rep(1:nrow(y), each=nrow(x)),]),
        x=list(...))
} #CJ.dft

CJ.dt_2 <- function(...) {
    DTls <- list(...)
    rows <- do.call(CJ, lapply(DTls, function(x) x[, seq_len(.N)]))
    res <- DTls[[1L]][rows[[1L]]]
    for (n in seq_along(DTls)[-1L])
        res <- res[, c(.SD, DTls[[n]][rows[[n]]])]
    res
}

#https:/Rdatatable/data.table/issues/2343#issuecomment-328156867
CJDT <- function(...)
    Reduce(function(DT1, DT2) cbind(DT1, DT2[rep(1:.N, each=nrow(DT1))]), list(...))

a1 <- CJ.dt(DT1, DT2, DT3)
setorderv(a1, names(a1))
a2 <- CJ.dt_1(DT1, DT2, DT3)
setorderv(a2, names(a2))
a3 <- CJ.dt_2(DT1, DT2, DT3)
setorderv(a3, names(a3))
a4 <- CJDT(DT1, DT2, DT3)
setorderv(a4, names(a4))
identical(a1, a2)
identical(a1, a3)
identical(a1, a4)

crossjoin = data.table:::crossjoin
a5 = crossjoin(crossjoin(DT1, DT2), DT3)
setcolorder(a5, c(5:6,3:4,1:2))
setorderv(a5, names(a5))
identical(a1, a5)

DT_A <- data.table(A=1:8e3)
DT_B <- data.table(B=1:8e3)

mark1 = function(..., check=NULL) {
  l = as.list(substitute(list(...)))[-1L]
  txt = vapply(l, deparse, "")
  env = parent.frame()
  sec = vapply(l, function(expr) system.time(eval(expr, env=env))[[3L]], 0.0)
  data.frame(expr = txt, sec = sec)
}
cat("# run 1\n")
mark1(CJ.dt(DT_A, DT_B), CJ.dt_1(DT_A, DT_B), CJ.dt_2(DT_A, DT_B), CJDT(DT_A, DT_B), crossjoin(DT_A, DT_B), check=FALSE)
cat("\n# run 2\n")
mark1(CJ.dt(DT_A, DT_B), CJ.dt_1(DT_A, DT_B), CJ.dt_2(DT_A, DT_B), CJDT(DT_A, DT_B), crossjoin(DT_A, DT_B), check=FALSE)

timings

# run 1
                   expr    sec
1     CJ.dt(DT_A, DT_B) 65.542
2   CJ.dt_1(DT_A, DT_B)  2.668
3   CJ.dt_2(DT_A, DT_B)  0.743
4      CJDT(DT_A, DT_B)  2.121
5 crossjoin(DT_A, DT_B)  0.459

# run 2
                   expr    sec
1     CJ.dt(DT_A, DT_B) 65.424
2   CJ.dt_1(DT_A, DT_B)  2.664
3   CJ.dt_2(DT_A, DT_B)  0.723
4      CJDT(DT_A, DT_B)  2.121
5 crossjoin(DT_A, DT_B)  0.458

Kamgang-B · 2023-05-24T09:38:37Z

I have just missed this feature.

My opinion on this:

1- I think that this feature nicely fits into CJ function in the sense that CJ could be expanded to support data.frame (or list) too. I find something like CJ(DT1, DT2, etc.) very nice and natural. I don't really see the benefit of implementing this feature in a new function.

2- cross join could also be supported in [.data.table function. X[Y, on=TRUE] is nice but I suggest to also consider using the keyword .CROSSJOIN in on=, like X[Y, on=.CROSSJOIN]. The latter one is likely more intuitive (specially for users not used to SQL) and I feel that it would be somehow consistent with the way the natural join looks like in data.table (X[Y, on=.NATURAL])

An important remark: The first point (implementation in CJ allows to cross join more than two datasets at once while the second point is limited to only two.

jangorecki · 2023-05-24T11:01:08Z

You are welcome to review cross join functionality proposed in #4370

jangorecki added the feature request label May 20, 2016

This comment has been minimized.

Sign in to view

jangorecki added this to the v1.9.10 milestone Nov 23, 2016

franknarf1 mentioned this issue Sep 8, 2017

FR: if CJ is passed data.table arguments, do a blocked cross-join #2343

Closed

This comment has been minimized.

Sign in to view

mattdowle removed this from the Candidate milestone May 10, 2018

MichaelChirico mentioned this issue Jul 22, 2018

single symbol in j behaves strangely #2983

Closed

MichaelChirico mentioned this issue Oct 11, 2018

Created CJ.dt - for cross joining two data.tables #814

Closed

jangorecki mentioned this issue May 25, 2019

CJ can handle list() input if sorted=FALSE but fails confusingly #3597

Closed

This comment has been minimized.

Sign in to view

jangorecki added this to the 1.13.0 milestone Jul 31, 2019

mattdowle modified the milestones: 1.12.7, 1.12.9 Dec 8, 2019

This comment has been minimized.

Sign in to view

jangorecki mentioned this issue May 10, 2020

cbindlist, mergelist #4370

Open

9 tasks

dcooley mentioned this issue May 10, 2020

1.6.2. - microbenchmark's multcomp reference missing csgillespie/efficientR#264

Open

engineerchange mentioned this issue May 10, 2020

Consistent time units (by default) r-lib/bench#80

Closed

jangorecki self-assigned this May 21, 2020

jangorecki added a commit that referenced this issue Jun 14, 2020

cross join, #1717

18c53d2

jangorecki linked a pull request Jun 14, 2020 that will close this issue

cross join, #1717 #4544

Draft

mattdowle modified the milestones: 1.13.1, 1.13.3 Oct 17, 2020

MichaelChirico mentioned this issue May 14, 2021

Master list of most-requested issues #3189

Open

76 tasks

mattdowle removed this from the 1.14.1 milestone Aug 28, 2021

ben-schwen mentioned this issue Jan 7, 2022

expand grid of two data.tables #5305

Closed

Darxor mentioned this issue Oct 17, 2022

Support data.frame cross-join markfairbanks/tidytable#655

Closed

MLopez-Ibanez mentioned this issue Feb 10, 2024

Conditional Cartesian join (Cross join) #5933

Open

MichaelChirico added the top request One of our most-requested issues label Apr 14, 2024

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

allow cross join in `[.data.table` #1717

allow cross join in `[.data.table` #1717

jangorecki commented May 20, 2016 •

edited

Loading

This comment has been minimized.

This comment has been minimized.

This comment has been minimized.

This comment has been minimized.

This comment has been minimized.

This comment has been minimized.

This comment has been minimized.

ToeKneeFan commented Jul 31, 2019

chinsoon12 commented Oct 24, 2019

jangorecki commented May 10, 2020 •

edited

Loading

jangorecki commented May 10, 2020

jangorecki commented May 10, 2020 •

edited

Loading

This comment has been minimized.

This comment has been minimized.

This comment has been minimized.

Kamgang-B commented May 24, 2023

jangorecki commented May 24, 2023

allow cross join in [.data.table #1717

allow cross join in [.data.table #1717

Comments

jangorecki commented May 20, 2016 • edited Loading

This comment has been minimized.

This comment has been minimized.

This comment has been minimized.

This comment has been minimized.

This comment has been minimized.

This comment has been minimized.

This comment has been minimized.

ToeKneeFan commented Jul 31, 2019

chinsoon12 commented Oct 24, 2019

jangorecki commented May 10, 2020 • edited Loading

jangorecki commented May 10, 2020

jangorecki commented May 10, 2020 • edited Loading

This comment has been minimized.

This comment has been minimized.

This comment has been minimized.

Kamgang-B commented May 24, 2023

jangorecki commented May 24, 2023

allow cross join in `[.data.table` #1717

allow cross join in `[.data.table` #1717

jangorecki commented May 20, 2016 •

edited

Loading

jangorecki commented May 10, 2020 •

edited

Loading

jangorecki commented May 10, 2020 •

edited

Loading