#' Construct xgb.DMatrix object #' #' Construct xgb.DMatrix object from either a dense matrix, a sparse matrix, or a local file. #' Supported input file formats are either a LIBSVM text file or a binary file that was created previously by #' \code{\link{xgb.DMatrix.save}}). #' #' @param data a \code{matrix} object (either numeric or integer), a \code{dgCMatrix} object, #' a \code{dgRMatrix} object (only when making predictions from a fitted model), #' a \code{dsparseVector} object (only when making predictions from a fitted model, will be #' interpreted as a row vector), or a character string representing a filename. #' @param info a named list of additional information to store in the \code{xgb.DMatrix} object. #' See \code{\link{setinfo}} for the specific allowed kinds of #' @param missing a float value to represents missing values in data (used only when input is a dense matrix). #' It is useful when a 0 or some other extreme value represents missing values in data. #' @param silent whether to suppress printing an informational message after loading from a file. #' @param nthread Number of threads used for creating DMatrix. #' @param ... the \code{info} data could be passed directly as parameters, without creating an \code{info} list. #' #' @examples #' data(agaricus.train, package='xgboost') #' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label)) #' xgb.DMatrix.save(dtrain, 'xgb.DMatrix.data') #' dtrain <- xgb.DMatrix('xgb.DMatrix.data') #' if (file.exists('xgb.DMatrix.data')) file.remove('xgb.DMatrix.data') #' @export xgb.DMatrix <- function(data, info = list(), missing = NA, silent = FALSE, nthread = NULL, ...) { cnames <- NULL if (typeof(data) == "character") { if (length(data) > 1) stop("'data' has class 'character' and length ", length(data), ".\n 'data' accepts either a numeric matrix or a single filename.") data <- path.expand(data) handle <- .Call(XGDMatrixCreateFromFile_R, data, as.integer(silent)) } else if (is.matrix(data)) { handle <- .Call(XGDMatrixCreateFromMat_R, data, missing, as.integer(NVL(nthread, -1))) cnames <- colnames(data) } else if (inherits(data, "dgCMatrix")) { handle <- .Call( XGDMatrixCreateFromCSC_R, data@p, data@i, data@x, nrow(data), as.integer(NVL(nthread, -1)) ) cnames <- colnames(data) } else if (inherits(data, "dgRMatrix")) { handle <- .Call( XGDMatrixCreateFromCSR_R, data@p, data@j, data@x, ncol(data), as.integer(NVL(nthread, -1)) ) cnames <- colnames(data) } else if (inherits(data, "dsparseVector")) { indptr <- c(0L, as.integer(length(data@i))) ind <- as.integer(data@i) - 1L handle <- .Call( XGDMatrixCreateFromCSR_R, indptr, ind, data@x, length(data), as.integer(NVL(nthread, -1)) ) } else { stop("xgb.DMatrix does not support construction from ", typeof(data)) } dmat <- handle attributes(dmat) <- list(.Dimnames = list(NULL, cnames), class = "xgb.DMatrix") info <- append(info, list(...)) for (i in seq_along(info)) { p <- info[i] setinfo(dmat, names(p), p[[1]]) } return(dmat) } # get dmatrix from data, label # internal helper method xgb.get.DMatrix <- function(data, label = NULL, missing = NA, weight = NULL, nthread = NULL) { if (inherits(data, "dgCMatrix") || is.matrix(data)) { if (is.null(label)) { stop("label must be provided when data is a matrix") } dtrain <- xgb.DMatrix(data, label = label, missing = missing, nthread = nthread) if (!is.null(weight)){ setinfo(dtrain, "weight", weight) } } else { if (!is.null(label)) { warning("xgboost: label will be ignored.") } if (is.character(data)) { data <- path.expand(data) dtrain <- xgb.DMatrix(data[1]) } else if (inherits(data, "xgb.DMatrix")) { dtrain <- data } else if (inherits(data, "data.frame")) { stop("xgboost doesn't support data.frame as input. Convert it to matrix first.") } else { stop("xgboost: invalid input data") } } return (dtrain) } #' Dimensions of xgb.DMatrix #' #' Returns a vector of numbers of rows and of columns in an \code{xgb.DMatrix}. #' @param x Object of class \code{xgb.DMatrix} #' #' @details #' Note: since \code{nrow} and \code{ncol} internally use \code{dim}, they can also #' be directly used with an \code{xgb.DMatrix} object. #' #' @examples #' data(agaricus.train, package='xgboost') #' train <- agaricus.train #' dtrain <- xgb.DMatrix(train$data, label=train$label) #' #' stopifnot(nrow(dtrain) == nrow(train$data)) #' stopifnot(ncol(dtrain) == ncol(train$data)) #' stopifnot(all(dim(dtrain) == dim(train$data))) #' #' @export dim.xgb.DMatrix <- function(x) { c(.Call(XGDMatrixNumRow_R, x), .Call(XGDMatrixNumCol_R, x)) } #' Handling of column names of \code{xgb.DMatrix} #' #' Only column names are supported for \code{xgb.DMatrix}, thus setting of #' row names would have no effect and returned row names would be NULL. #' #' @param x object of class \code{xgb.DMatrix} #' @param value a list of two elements: the first one is ignored #' and the second one is column names #' #' @details #' Generic \code{dimnames} methods are used by \code{colnames}. #' Since row names are irrelevant, it is recommended to use \code{colnames} directly. #' #' @examples #' data(agaricus.train, package='xgboost') #' train <- agaricus.train #' dtrain <- xgb.DMatrix(train$data, label=train$label) #' dimnames(dtrain) #' colnames(dtrain) #' colnames(dtrain) <- make.names(1:ncol(train$data)) #' print(dtrain, verbose=TRUE) #' #' @rdname dimnames.xgb.DMatrix #' @export dimnames.xgb.DMatrix <- function(x) { attr(x, '.Dimnames') } #' @rdname dimnames.xgb.DMatrix #' @export `dimnames<-.xgb.DMatrix` <- function(x, value) { if (!is.list(value) || length(value) != 2L) stop("invalid 'dimnames' given: must be a list of two elements") if (!is.null(value[[1L]])) stop("xgb.DMatrix does not have rownames") if (is.null(value[[2]])) { attr(x, '.Dimnames') <- NULL return(x) } if (ncol(x) != length(value[[2]])) stop("can't assign ", length(value[[2]]), " colnames to a ", ncol(x), " column xgb.DMatrix") attr(x, '.Dimnames') <- value x } #' Get information of an xgb.DMatrix object #' #' Get information of an xgb.DMatrix object #' @param object Object of class \code{xgb.DMatrix} #' @param name the name of the information field to get (see details) #' @param ... other parameters #' #' @details #' The \code{name} field can be one of the following: #' #' \itemize{ #' \item \code{label}: label XGBoost learn from ; #' \item \code{weight}: to do a weight rescale ; #' \item \code{base_margin}: base margin is the base prediction XGBoost will boost from ; #' \item \code{nrow}: number of rows of the \code{xgb.DMatrix}. #' #' } #' #' \code{group} can be setup by \code{setinfo} but can't be retrieved by \code{getinfo}. #' #' @examples #' data(agaricus.train, package='xgboost') #' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label)) #' #' labels <- getinfo(dtrain, 'label') #' setinfo(dtrain, 'label', 1-labels) #' #' labels2 <- getinfo(dtrain, 'label') #' stopifnot(all(labels2 == 1-labels)) #' @rdname getinfo #' @export getinfo <- function(object, ...) UseMethod("getinfo") #' @rdname getinfo #' @export getinfo.xgb.DMatrix <- function(object, name, ...) { if (typeof(name) != "character" || length(name) != 1 || !name %in% c('label', 'weight', 'base_margin', 'nrow', 'label_lower_bound', 'label_upper_bound')) { stop("getinfo: name must be one of the following\n", " 'label', 'weight', 'base_margin', 'nrow', 'label_lower_bound', 'label_upper_bound'") } if (name != "nrow"){ ret <- .Call(XGDMatrixGetInfo_R, object, name) } else { ret <- nrow(object) } if (length(ret) == 0) return(NULL) return(ret) } #' Set information of an xgb.DMatrix object #' #' Set information of an xgb.DMatrix object #' #' @param object Object of class "xgb.DMatrix" #' @param name the name of the field to get #' @param info the specific field of information to set #' @param ... other parameters #' #' @details #' The \code{name} field can be one of the following: #' #' \itemize{ #' \item \code{label}: label XGBoost learn from ; #' \item \code{weight}: to do a weight rescale ; #' \item \code{base_margin}: base margin is the base prediction XGBoost will boost from ; #' \item \code{group}: number of rows in each group (to use with \code{rank:pairwise} objective). #' } #' #' @examples #' data(agaricus.train, package='xgboost') #' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label)) #' #' labels <- getinfo(dtrain, 'label') #' setinfo(dtrain, 'label', 1-labels) #' labels2 <- getinfo(dtrain, 'label') #' stopifnot(all.equal(labels2, 1-labels)) #' @rdname setinfo #' @export setinfo <- function(object, ...) UseMethod("setinfo") #' @rdname setinfo #' @export setinfo.xgb.DMatrix <- function(object, name, info, ...) { if (name == "label") { if (length(info) != nrow(object)) stop("The length of labels must equal to the number of rows in the input data") .Call(XGDMatrixSetInfo_R, object, name, as.numeric(info)) return(TRUE) } if (name == "label_lower_bound") { if (length(info) != nrow(object)) stop("The length of lower-bound labels must equal to the number of rows in the input data") .Call(XGDMatrixSetInfo_R, object, name, as.numeric(info)) return(TRUE) } if (name == "label_upper_bound") { if (length(info) != nrow(object)) stop("The length of upper-bound labels must equal to the number of rows in the input data") .Call(XGDMatrixSetInfo_R, object, name, as.numeric(info)) return(TRUE) } if (name == "weight") { .Call(XGDMatrixSetInfo_R, object, name, as.numeric(info)) return(TRUE) } if (name == "base_margin") { # if (length(info)!=nrow(object)) # stop("The length of base margin must equal to the number of rows in the input data") .Call(XGDMatrixSetInfo_R, object, name, as.numeric(info)) return(TRUE) } if (name == "group") { if (sum(info) != nrow(object)) stop("The sum of groups must equal to the number of rows in the input data") .Call(XGDMatrixSetInfo_R, object, name, as.integer(info)) return(TRUE) } if (name == "feature_weights") { if (length(info) != ncol(object)) { stop("The number of feature weights must equal to the number of columns in the input data") } .Call(XGDMatrixSetInfo_R, object, name, as.numeric(info)) return(TRUE) } stop("setinfo: unknown info name ", name) return(FALSE) } #' Get a new DMatrix containing the specified rows of #' original xgb.DMatrix object #' #' Get a new DMatrix containing the specified rows of #' original xgb.DMatrix object #' #' @param object Object of class "xgb.DMatrix" #' @param idxset a integer vector of indices of rows needed #' @param colset currently not used (columns subsetting is not available) #' @param ... other parameters (currently not used) #' #' @examples #' data(agaricus.train, package='xgboost') #' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label)) #' #' dsub <- slice(dtrain, 1:42) #' labels1 <- getinfo(dsub, 'label') #' dsub <- dtrain[1:42, ] #' labels2 <- getinfo(dsub, 'label') #' all.equal(labels1, labels2) #' #' @rdname slice.xgb.DMatrix #' @export slice <- function(object, ...) UseMethod("slice") #' @rdname slice.xgb.DMatrix #' @export slice.xgb.DMatrix <- function(object, idxset, ...) { if (!inherits(object, "xgb.DMatrix")) { stop("object must be xgb.DMatrix") } ret <- .Call(XGDMatrixSliceDMatrix_R, object, idxset) attr_list <- attributes(object) nr <- nrow(object) len <- sapply(attr_list, NROW) ind <- which(len == nr) if (length(ind) > 0) { nms <- names(attr_list)[ind] for (i in seq_along(ind)) { obj_attr <- attr(object, nms[i]) if (NCOL(obj_attr) > 1) { attr(ret, nms[i]) <- obj_attr[idxset, ] } else { attr(ret, nms[i]) <- obj_attr[idxset] } } } return(structure(ret, class = "xgb.DMatrix")) } #' @rdname slice.xgb.DMatrix #' @export `[.xgb.DMatrix` <- function(object, idxset, colset = NULL) { slice(object, idxset) } #' Print xgb.DMatrix #' #' Print information about xgb.DMatrix. #' Currently it displays dimensions and presence of info-fields and colnames. #' #' @param x an xgb.DMatrix object #' @param verbose whether to print colnames (when present) #' @param ... not currently used #' #' @examples #' data(agaricus.train, package='xgboost') #' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label)) #' #' dtrain #' print(dtrain, verbose=TRUE) #' #' @method print xgb.DMatrix #' @export print.xgb.DMatrix <- function(x, verbose = FALSE, ...) { cat('xgb.DMatrix dim:', nrow(x), 'x', ncol(x), ' info: ') infos <- character(0) if (length(getinfo(x, 'label')) > 0) infos <- 'label' if (length(getinfo(x, 'weight')) > 0) infos <- c(infos, 'weight') if (length(getinfo(x, 'base_margin')) > 0) infos <- c(infos, 'base_margin') if (length(infos) == 0) infos <- 'NA' cat(infos) cnames <- colnames(x) cat(' colnames:') if (verbose & !is.null(cnames)) { cat("\n'") cat(cnames, sep = "','") cat("'") } else { if (is.null(cnames)) cat(' no') else cat(' yes') } cat("\n") invisible(x) }