mlr-org · mb706 · Oct 1, 2021 · Oct 1, 2021 · Oct 1, 2021 · Oct 1, 2021
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -98,6 +98,8 @@ Roxygen: list(markdown = TRUE, r6 = FALSE)
 RoxygenNote: 7.3.2
 VignetteBuilder: knitr
 Collate:
+ 'DataBackendJoin.R'
+ 'DataBackendMultiCbind.R'
  'Graph.R'
  'GraphLearner.R'
  'mlr_pipeops.R'

diff --git a/NAMESPACE b/NAMESPACE
@@ -35,6 +35,8 @@ S3method(unmarshal_model,pipeop_impute_learner_state_marshaled)
 S3method(unmarshal_model,pipeop_learner_cv_state_marshaled)
 export("%>>!%")
 export("%>>%")
+export(DataBackendJoin)
+export(DataBackendMultiCbind)
 export(Graph)
 export(GraphLearner)
 export(LearnerClassifAvg)

diff --git a/R/DataBackendJoin.R b/R/DataBackendJoin.R
@@ -0,0 +1,220 @@
+#' @title DataBackend Representing the Join of Two DataBackends
+#'
+#' @description
+#' Exposes the join of two backends `b1` and `b2` as a single `DataBackend`.
+#' The pairing of rows is computed once during construction and stored as an index table;
+#' column data is fetched from `b1` and `b2` whenever it is requested.
+#'
+#' The joined backend gets a new primary key column named `"..row_id"` (with a numeric suffix
+#' if that name is already taken) that enumerates the rows of the join.
+#' Columns that exist in both backends are read from `b2`.
+#'
+#' @section Construction:
+#' `DataBackendJoin$new(b1, b2, type, by_b1 = NULL, by_b2 = NULL, b1_index_colname = NULL, b2_index_colname = NULL)`
+#'
+#' * `b1`, `b2`: backends to join; both must support the `"data.table"` data format.
+#' * `type`: one of `"left"`, `"right"`, `"outer"`, `"inner"`. Unmatched rows of `b1` are kept for
+#'   `"left"` and `"outer"`, unmatched rows of `b2` are kept for `"right"` and `"outer"`.
+#' * `by_b1`, `by_b2`: name of the column of the respective backend to join by;
+#'   `NULL` joins by the row names of that backend.
+#' * `b1_index_colname`, `b2_index_colname`: if not `NULL`, name of an extra column that holds, for
+#'   each joined row, the row name in `b1` / `b2` it stems from (`NA` if there is none).
+#'
+#' @export
+DataBackendJoin = R6Class("DataBackendJoin", inherit = DataBackend, cloneable = FALSE,
+  public = list(
+    # Validates the arguments and precomputes the index table that pairs rows of b1 with rows of b2.
+    initialize = function(b1, b2, type, by_b1 = NULL, by_b2 = NULL, b1_index_colname = NULL, b2_index_colname = NULL) {
+      assert_backend(b1)
+      assert_backend(b2)
+
+      if ("data.table" %nin% intersect(b1$data_formats, b2$data_formats)) {
+        stop("DataBackendJoin currently only supports DataBackends that support 'data.table' format.")
+      }
+
+      assert_choice(type, c("left", "right", "outer", "inner"))
+
+      colnames_b1 = b1$colnames
+      colnames_b2 = b2$colnames
+      allcolnames = union(colnames_b1, colnames_b2)
+
+      assert_choice(by_b1, colnames_b1, null.ok = TRUE)
+      assert_choice(by_b2, colnames_b2, null.ok = TRUE)
+
+      assert_string(b1_index_colname, null.ok = TRUE)
+      assert_string(b2_index_colname, null.ok = TRUE)
+
+      # An index column may reuse the name of its own backend's primary key, but no other column name.
+      if (!is.null(b1_index_colname) && b1_index_colname %in% setdiff(allcolnames, b1$primary_key)) {
+        stopf("b1_index_colname '%s' already a non-primary-key column in b1 or b2.", b1_index_colname)
+      }
+      if (!is.null(b2_index_colname) && b2_index_colname %in% setdiff(allcolnames, b2$primary_key)) {
+        stopf("b2_index_colname '%s' already a non-primary-key column in b1 or b2.", b2_index_colname)
+      }
+      if (!is.null(b1_index_colname) && !is.null(b2_index_colname) && b1_index_colname == b2_index_colname) {
+        stopf("b1_index_colname and b2_index_colname must be different, but are both '%s'.", b1_index_colname)
+      }
+
+      colnames = unique(c(allcolnames, b1_index_colname, b2_index_colname))
+
+      rownames_b1 = b1$rownames
+      rownames_b2 = b2$rownames
+
+      joinby_b1 = if (is.null(by_b1)) rownames_b1 else b1$data(rownames_b1, by_b1, data_format = "data.table")[[1]]
+      joinby_b2 = if (is.null(by_b2)) rownames_b2 else b2$data(rownames_b2, by_b2, data_format = "data.table")[[1]]
+
+      # One row per joined row, holding the matching row names of b1 and b2 (NA where there is no partner).
+      index_table = merge(data.table(rownames_b1, joinby_b1), data.table(rownames_b2, joinby_b2),
+        by.x = "joinby_b1", by.y = "joinby_b2",
+        all.x = type %in% c("left", "outer"), all.y = type %in% c("right", "outer"), sort = FALSE, allow.cartesian = TRUE)
+
+      set(index_table, , "joinby_b1", NULL)
+
+      # The primary key must not collide with any column, including the index columns.
+      pk = "..row_id"
+      index = 0
+      while (pk %in% colnames) {
+        index = index + 1
+        pk = paste0("..row_id.", index)
+      }
+
+      super$initialize(list(
+        b1 = b1, b2 = b2,
+        colnames_b1 = setdiff(colnames_b1, colnames_b2),
+        allcolnames = c(colnames, pk),
+        index_table = index_table,
+        b1_index_colname = b1_index_colname,
+        b2_index_colname = b2_index_colname,
+        pk = pk,
+        aux_hash = calculate_hash(by_b1, by_b2, type, b1_index_colname, b2_index_colname)
+      ), primary_key = pk, data_formats = "data.table")
+    },
+
+    # Returns the requested `cols` of the join rows `rows` as a data.table; rows outside the join are dropped.
+    data = function(rows, cols, data_format = "data.table") {
+      d = private$.data
+      rows = rows[inrange(rows, 1, nrow(d$index_table))]
+      indices = d$index_table[rows]
+
+      has_b1 = !is.na(indices$rownames_b1)
+      has_b2 = !is.na(indices$rownames_b2)
+      b1_rows = indices$rownames_b1[has_b1]
+      b2_rows = indices$rownames_b2[has_b2]
+
+      # Position of each join row within the data fetched from b1 / b2; NA yields an all-NA row.
+      # NOTE(review): relies on the sub-backends returning one row per requested row, in request order.
+      b1_index = rep(NA_integer_, length(has_b1))
+      b1_index[has_b1] = seq_len(length(b1_rows))
+      b2_index = rep(NA_integer_, length(has_b2))
+      b2_index[has_b2] = seq_len(length(b2_rows))
+
+      # Start from the primary key column so the result has the right number of rows even if neither
+      # backend contributes a column.
+      result = data.table(rows)
+      setnames(result, d$pk)
+
+      b2_data = d$b2$data(b2_rows, cols, data_format = "data.table")
+      if (ncol(b2_data)) {
+        result = cbind(result, b2_data[b2_index])
+      }
+      # Only columns that b2 does not have are taken from b1.
+      remainingcols = intersect(cols, d$colnames_b1)
+      if (length(remainingcols)) {
+        b1_data = d$b1$data(b1_rows, remainingcols, data_format = "data.table")
+        if (ncol(b1_data)) {
+          result = cbind(result, b1_data[b1_index])
+        }
+      }
+      setkeyv(result, NULL)
+      if (!is.null(d$b2_index_colname) && d$b2_index_colname %in% cols) {
+        set(result, , d$b2_index_colname, indices$rownames_b2)
+      }
+      if (!is.null(d$b1_index_colname) && d$b1_index_colname %in% cols) {
+        set(result, , d$b1_index_colname, indices$rownames_b1)
+      }
+      result[, intersect(cols, names(result)), with = FALSE]
+    },
+
+    # Returns the first `n` rows of the join with all columns.
+    head = function(n = 6L) {
+      rows = head(self$rownames, n)
+      self$data(rows = rows, cols = self$colnames)
+    },
+
+    # Returns a named list with the distinct values of each of `cols` within join rows `rows`.
+    # `rows = NULL` considers all rows of the join. With `na_rm = FALSE`, `NA` is reported for columns of
+    # a backend that has no partner for some of the rows.
+    distinct = function(rows, cols, na_rm = TRUE) {
+      d = private$.data
+      n_join = nrow(d$index_table)
+      # Rows outside the join are ignored, as in `$data()`.
+      rows = if (is.null(rows)) seq_len(n_join) else rows[inrange(rows, 1, n_join)]
+      indices = d$index_table[rows]
+      b1_rows = indices$rownames_b1[!is.na(indices$rownames_b1)]
+      b2_rows = indices$rownames_b2[!is.na(indices$rownames_b2)]
+
+      d2 = d$b2$distinct(rows = b2_rows, cols = cols, na_rm = na_rm)
+      if (!is.null(d$b2_index_colname) && d$b2_index_colname %in% cols) {
+        d2[[d$b2_index_colname]] = if (na_rm) unique(b2_rows) else unique(indices$rownames_b2)
+      }
+      d1 = d$b1$distinct(rows = b1_rows, cols = setdiff(cols, names(d2)), na_rm = na_rm)
+      if (!is.null(d$b1_index_colname) && d$b1_index_colname %in% cols) {
+        d1[[d$b1_index_colname]] = if (na_rm) unique(b1_rows) else unique(indices$rownames_b1)
+      }
+
+      # Rows without a partner contribute NA to every column of the respective backend.
+      add_na = function(x) if (anyNA(x)) x else c(x, NA)
+      if (!na_rm && length(b1_rows) < length(rows)) {
+        d1 = map(d1, add_na)
+      }
+      if (!na_rm && length(b2_rows) < length(rows)) {
+        d2 = map(d2, add_na)
+      }
+      res = c(d1, d2)
+      if (d$pk %in% cols) {
+        res[[d$pk]] = unique(rows)
+      }
+
+      res[match(cols, names(res), nomatch = 0)]
+    },
+
+    # Returns a named vector with the number of missing values of each of `cols` within join rows
+    # `rows`; rows without a partner count as missing for every column of the respective backend.
+    missings = function(rows, cols) {
+      d = private$.data
+      # Rows outside the join are ignored, as in `$data()`.
+      rows = rows[inrange(rows, 1, nrow(d$index_table))]
+      indices = d$index_table[rows]
+      b1_rows = indices$rownames_b1[!is.na(indices$rownames_b1)]
+      b2_rows = indices$rownames_b2[!is.na(indices$rownames_b2)]
+      m2 = d$b2$missings(b2_rows, cols)
+      if (!is.null(d$b2_index_colname) && d$b2_index_colname %in% cols) {
+        m2[d$b2_index_colname] = 0L
+      }
+      m1 = d$b1$missings(b1_rows, setdiff(cols, names(m2)))
+      if (!is.null(d$b1_index_colname) && d$b1_index_colname %in% cols) {
+        m1[d$b1_index_colname] = 0L
+      }
+      m1 = m1 + length(rows) - length(b1_rows)
+      m2 = m2 + length(rows) - length(b2_rows)
+      res = c(m1, m2)
+      if (d$pk %in% cols) {
+        res[d$pk] = 0L
+      }
+      res[match(cols, names(res), nomatch = 0)]
+    }
+  ),
+  active = list(
+    rownames = function() seq_len(nrow(private$.data$index_table)),
+    colnames = function() private$.data$allcolnames,
+    nrow = function() nrow(private$.data$index_table),
+    ncol = function() length(private$.data$allcolnames)
+  ),
+  private = list(
+    # Combines the hashes of both backends with the hash of the join parameters.
+    .calculate_hash = function() {
+      d = private$.data
+      calculate_hash(d$b1$hash, d$b2$hash, d$aux_hash)
+    }
+  )
+)
diff --git a/R/DataBackendMultiCbind.R b/R/DataBackendMultiCbind.R
@@ -0,0 +1,181 @@
+#' @title DataBackend Representing the Column-Wise Combination of Several DataBackends
+#'
+#' @description
+#' Combines a list of backends column-wise into a single `DataBackend`.
+#' Rows are matched by primary key value; the result contains every row that occurs in at least
+#' one of the backends, with `NA` in the columns of backends that do not contain the row.
+#' If a column name occurs in more than one backend, the column of the backend that comes last in
+#' `bs` is used.
+#'
+#' If all backends have the same primary key column name, that name is also the primary key of the
+#' combined backend. Otherwise a new primary key column `"..row_id"` (with a numeric suffix if that
+#' name is already taken) is introduced.
+#'
+#' @section Construction:
+#' `DataBackendMultiCbind$new(bs)`
+#'
+#' * `bs`: non-empty `list` of `DataBackend`s.
+#'
+#' @export
+DataBackendMultiCbind = R6Class("DataBackendMultiCbind", inherit = DataBackend, cloneable = FALSE,
+  public = list(
+    initialize = function(bs) {
+      assert_list(bs, min.len = 1)
+      lapply(bs, assert_backend)
+
+      formats = Reduce(intersect, map(bs, "data_formats"))
+
+      private$.colnames = unique(unlist(map(bs, "colnames")))
+
+      # primary key: if all backends have the same pk, just use that one.
+      otherpk = unique(unlist(map(bs, "primary_key")))
+      if (length(otherpk) == 1) {
+        pk = otherpk
+      } else {
+        # otherwise: introduce a new primary key that is completely different from the previous ones.
+        pk = "..row_id"
+        index = 0
+        while (pk %in% private$.colnames) {
+          index = index + 1
+          pk = paste0("..row_id.", index)
+        }
+        private$.colnames = c(private$.colnames, pk)
+      }
+
+      # Stored in reverse order: the methods take each column from the first stored backend that has it,
+      # which gives later entries of `bs` precedence.
+      super$initialize(list(bs = rev(bs)), pk, formats)
+    },
+
+    # Returns the requested `cols` for the rows with primary key values `rows`; rows that occur in none
+    # of the backends are dropped.
+    data = function(rows, cols, data_format = "data.table") {
+      bs = private$.data$bs
+      sbk = self$primary_key
+      urows = unique(rows)
+
+      datas = vector("list", length(bs))
+      pks = character(length(bs))
+      include_pk = logical(length(bs))
+      cols_remaining = cols
+      for (i in seq_along(bs)) {
+        ## Not doing 'if (length(cols_remaining)) break' because there could still be tables remaining that add rows
+        pk = bs[[i]]$primary_key
+        pks[[i]] = pk
+        include_pk[[i]] = pk %in% cols_remaining
+        # The primary key is always fetched, since it is needed to align the rows below.
+        query = if (include_pk[[i]]) cols_remaining else c(pk, cols_remaining)
+        datas[[i]] = bs[[i]]$data(urows, query, data_format = data_format)
+        cols_remaining = setdiff(cols_remaining, colnames(datas[[i]]))
+      }
+      presentrows = unique(unlist(Map(function(data, pk) data[[pk]], datas, pks)))
+      join = list(presentrows)
+      parts = pmap(list(datas, pks, include_pk), function(data, pk, include) {
+        part = data[join, on = pk, nomatch = NA]
+        if (include && pk != sbk) {
+          # Requested key column of a single backend: blank it for rows this backend does not contain.
+          # `set()` only accepts integer row indices, hence `which()`.
+          absent = which(part[[pk]] %nin% data[[pk]])
+          if (length(absent)) set(part, absent, pk, NA)
+        } else {
+          # The key column is either not requested or supplied by the key skeleton below.
+          set(part, , pk, NULL)
+        }
+        part
+      })
+      # Drop backends that contribute no column; start from the key so the row count is always defined.
+      parts = parts[vapply(parts, ncol, integer(1L)) > 0L]
+      skeleton = data.table(presentrows)
+      setnames(skeleton, sbk)
+      result = do.call(cbind, c(list(skeleton), unname(parts)))
+
+      join = list(rows)
+      result[join, intersect(cols, colnames(result)), with = FALSE, on = sbk, nomatch = NULL]
+    },
+
+    # Returns the first `n` rows with all columns.
+    head = function(n = 6L) {
+      rows = head(self$rownames, n)
+      self$data(rows = rows, cols = self$colnames)
+    },
+
+    # Returns a named list with the distinct values of each of `cols` within `rows`.
+    # `rows = NULL` is passed on to the backends unchanged and refers to all rows.
+    distinct = function(rows, cols, na_rm = TRUE) {
+      bs = private$.data$bs
+      getpk = self$primary_key %in% cols
+      # `considered` are the rows the result refers to; `rows` is what gets passed on to the backends.
+      if (is.null(rows)) {
+        considered = self$rownames
+      } else {
+        if (!na_rm || getpk) {
+          rows = intersect(rows, self$rownames)
+        }
+        considered = rows
+      }
+      add_na = function(x) if (anyNA(x)) x else c(x, NA)
+      reslist = list()
+      remaining_cols = cols
+      for (i in seq_along(bs)) {
+        if (!length(remaining_cols)) break
+        # Only ask for columns that no earlier backend has provided.
+        values = bs[[i]]$distinct(rows = rows, cols = remaining_cols, na_rm = na_rm)
+        remaining_cols = setdiff(remaining_cols, names(values))
+        if (!na_rm && !all(considered %in% bs[[i]]$rownames)) {
+          values = map(values, add_na)
+        }
+        reslist[[i]] = values
+      }
+      result = as.list(unlist(reslist, recursive = FALSE))
+      if (getpk) {
+        result[[self$primary_key]] = considered
+      }
+      result[match(cols, names(result), nomatch = 0)]
+    },
+
+    # Returns a named vector with the number of missing values of each of `cols` within `rows`.
+    # Rows that occur in none of the backends are ignored.
+    missings = function(rows, cols) {
+      rows = rows[rows %in% self$rownames]
+      bs = private$.data$bs
+      reslist = list()
+      remaining_cols = cols
+      for (i in seq_along(bs)) {
+        if (!length(remaining_cols)) break
+        # Rows that this backend does not contain are missing in each of its columns.
+        missingrows = sum(rows %nin% bs[[i]]$rownames)
+        reslist[[i]] = bs[[i]]$missings(rows, remaining_cols) + missingrows
+        remaining_cols = setdiff(remaining_cols, names(reslist[[i]]))
+      }
+      result = unlist(reslist)
+      if (self$primary_key %in% cols) {
+        result[[self$primary_key]] = 0L
+      }
+      result[match(cols, names(result), nomatch = 0)]
+    }
+  ),
+  active = list(
+    rownames = function() {
+      # Union of the row names of all backends, in the order given at construction; computed once and cached.
+      if (is.null(private$.rownames_cache)) {
+        private$.rownames_cache = unique(unlist(rev(map(private$.data$bs, "rownames"))))
+      }
+      private$.rownames_cache
+    },
+    colnames = function() {
+      private$.colnames
+    },
+    nrow = function() length(self$rownames),
+    ncol = function() length(self$colnames)
+  ),
+  private = list(
+    .rownames_cache = NULL,
+    .colnames = NULL,
+    .calculate_hash = function() {
+      # Hash the backends' own hashes (as DataBackendJoin does) instead of feeding the entire backend
+      # objects, including whatever state they hold internally, into the hash.
+      do.call(calculate_hash, map(private$.data$bs, "hash"))
+    }
+  )
+)