# Demo: train a LightGBM model on a dataset that has categorical features,
# re-using the conversion rules learned on the training split for the
# validation split.

# Load libraries
library(data.table)
library(lightgbm)

# Load the bundled "bank" dataset and inspect its structure.
# Expected output of str(bank):
#
# Classes 'data.table' and 'data.frame': 4521 obs. of 17 variables:
# $ age      : int 30 33 35 30 59 35 36 39 41 43 ...
# $ job      : chr "unemployed" "services" "management" "management" ...
# $ marital  : chr "married" "married" "single" "married" ...
# $ education: chr "primary" "secondary" "tertiary" "tertiary" ...
# $ default  : chr "no" "no" "no" "no" ...
# $ balance  : int 1787 4789 1350 1476 0 747 307 147 221 -88 ...
# $ housing  : chr "no" "yes" "yes" "yes" ...
# $ loan     : chr "no" "yes" "no" "yes" ...
# $ contact  : chr "cellular" "cellular" "cellular" "unknown" ...
# $ day      : int 19 11 16 3 5 23 14 6 14 17 ...
# $ month    : chr "oct" "may" "apr" "jun" ...
# $ duration : int 79 220 185 199 226 141 341 151 57 313 ...
# $ campaign : int 1 1 1 4 1 2 1 2 2 1 ...
# $ pdays    : int -1 339 330 -1 -1 176 330 -1 -1 147 ...
# $ previous : int 0 4 1 0 0 3 2 0 0 2 ...
# $ poutcome : chr "unknown" "failure" "failure" "unknown" ...
# $ y        : chr "no" "no" "no" "no" ...
data(bank, package = "lightgbm")
str(bank)

# Split the rows in two: the first 4000 for training, the remaining 521 for
# validation.
train_rows <- 1L:4000L
test_rows <- 4001L:4521L
bank_train <- bank[train_rows, ]
bank_test <- bank[test_rows, ]

# LightGBM needs numeric input, so the character columns are converted with
# lgb.convert_with_rules. The rules are learned on the training split and
# then passed back in for the validation split, so both splits are converted
# with the same rule set.
# Expected output of str(bank_test) after conversion:
#
# Classes 'data.table' and 'data.frame': 521 obs. of 17 variables:
# $ age      : int 53 36 58 26 34 55 55 34 41 38 ...
# $ job      : num 1 10 10 9 10 2 2 3 3 4 ...
# $ marital  : num 1 2 1 3 3 2 2 2 1 1 ...
# $ education: num 2 2 2 2 2 1 2 3 2 2 ...
# $ default  : num 1 1 1 1 1 1 1 1 1 1 ...
# $ balance  : int 26 191 -123 -147 179 1086 471 105 1588 70 ...
# $ housing  : num 2 1 1 1 1 2 2 2 2 1 ...
# $ loan     : num 1 1 1 1 1 1 1 1 2 1 ...
# $ contact  : num 1 1 1 3 1 1 3 3 3 1 ...
# $ day      : int 7 31 5 4 19 6 30 28 20 27 ...
# $ month    : num 9 2 2 7 2 9 9 9 7 11 ...
# $ duration : int 56 69 131 95 294 146 58 249 10 255 ...
# $ campaign : int 1 1 2 2 3 1 2 2 8 3 ...
# $ pdays    : int 359 -1 -1 -1 -1 272 -1 -1 -1 148 ...
# $ previous : int 1 0 0 0 0 2 0 0 0 1 ...
# $ poutcome : num 1 4 4 4 4 1 4 4 4 3 ...
# $ y        : num 1 1 1 1 1 1 1 1 1 2 ...
bank_rules <- lgb.convert_with_rules(data = bank_train)
bank_train <- bank_rules$data
bank_test_converted <- lgb.convert_with_rules(
  data = bank_test,
  rules = bank_rules$rules
)
bank_test <- bank_test_converted$data
str(bank_test)

# After conversion the label y takes the values 1 and 2; subtract 1 so that
# it lies between 0 and 1, as required for the label.
bank_train$y <- bank_train$y - 1L
bank_test$y <- bank_test$y - 1L

# LightGBM takes its input as a matrix that does not contain the label.
# Columns 1 to 16 are the features; column 17 (y) is the label.
feature_idx <- 1L:16L
my_data_train <- as.matrix(bank_train[, feature_idx, with = FALSE])
my_data_test <- as.matrix(bank_test[, feature_idx, with = FALSE])

# Build the LightGBM datasets. The categorical columns are declared once on
# the training dataset; the validation dataset is created from it.
# Positions refer to: job, marital, education, default, housing, loan,
# contact, month, poutcome.
categorical_idx <- c(2L, 3L, 4L, 5L, 7L, 8L, 9L, 11L, 16L)
dtrain <- lgb.Dataset(
  data = my_data_train,
  label = bank_train$y,
  categorical_feature = categorical_idx
)
dtest <- lgb.Dataset.create.valid(
  dtrain,
  data = my_data_test,
  label = bank_test$y
)

# Train the model, reporting the metric on both the training and the
# validation dataset.
params <- list(
  objective = "binary",
  metric = "l2",
  min_data = 1L,
  learning_rate = 0.1,
  min_hessian = 1.0,
  max_depth = 2L
)
model <- lgb.train(
  params = params,
  data = dtrain,
  nrounds = 100L,
  valids = list(train = dtrain, valid = dtest)
)

# Dump the first iteration and try to find split_feature: 11
# If you find it, it means it used a categorical feature in the first tree.
# NOTE(review): confirm whether the dumped split_feature index is 0- or
# 1-based relative to the 16 feature columns before relying on this.
lgb.dump(model, num_iteration = 1L)