# -*- coding: utf-8 -*-
"""Tests for XGBoost integration"""
# pylint: disable=R0201, R0915
import math
import os

import numpy as np
import pytest
import treelite
import treelite_runtime
from treelite.util import has_sklearn
from treelite.contrib import _libext
from .util import os_compatible_toolchains, check_predictor
from .metadata import dataset_db

try:
    import xgboost
except ImportError:
    # skip this test suite if XGBoost is not installed
    pytest.skip('XGBoost not installed; skipping', allow_module_level=True)


def _boston_train_test_split():
    """Return ``(X_train, X_test, y_train, y_test)`` for the Boston housing data.

    The last 20% of the rows are held out and the data is not shuffled, so the
    split is identical on every call.
    """
    # scikit-learn is imported lazily so that this module can still be collected
    # when scikit-learn is missing (callers are guarded by ``has_sklearn()``).
    # NOTE(review): ``load_boston`` was removed in scikit-learn 1.2 -- confirm
    # that the scikit-learn version pinned for this test suite still ships it.
    from sklearn.datasets import load_boston
    from sklearn.model_selection import train_test_split

    X, y = load_boston(return_X_y=True)
    return train_test_split(X, y, test_size=0.2, shuffle=False)


@pytest.mark.skipif(not has_sklearn(), reason='Needs scikit-learn')
@pytest.mark.parametrize('model_format', ['binary', 'json'])
@pytest.mark.parametrize('objective', ['reg:linear', 'reg:squarederror',
                                       'reg:squaredlogerror', 'reg:pseudohubererror'])
@pytest.mark.parametrize('toolchain', os_compatible_toolchains())
def test_xgb_boston(tmpdir, toolchain, objective, model_format):
    # pylint: disable=too-many-locals
    """Test Boston data (regression)

    Trains an XGBoost regressor with the given ``objective``, imports it into
    Treelite (either directly from the in-memory booster or through a JSON
    file, depending on ``model_format``), compiles it with ``toolchain`` and
    checks that the compiled predictor reproduces XGBoost's predictions.
    """
    X_train, X_test, y_train, y_test = _boston_train_test_split()
    dtrain = xgboost.DMatrix(X_train, label=y_train)
    dtest = xgboost.DMatrix(X_test, label=y_test)
    # 'verbosity' replaces the deprecated 'silent' flag (0 == silent), matching
    # the parameter already used by test_xgb_iris.
    param = {'max_depth': 8, 'eta': 1, 'verbosity': 0, 'objective': objective}
    num_round = 10
    bst = xgboost.train(param, dtrain, num_boost_round=num_round,
                        evals=[(dtrain, 'train'), (dtest, 'test')])

    if model_format == 'json':
        model_name = 'boston.json'
        model_path = os.path.join(tmpdir, model_name)
        bst.save_model(model_path)
        model = treelite.Model.load(filename=model_path, model_format='xgboost_json')
    else:
        model = treelite.Model.from_xgboost(bst)

    assert model.num_feature == dtrain.num_col()
    assert model.num_class == 1
    assert model.num_tree == num_round

    libpath = os.path.join(tmpdir, 'boston' + _libext())
    model.export_lib(toolchain=toolchain, libpath=libpath,
                     params={'parallel_comp': model.num_tree}, verbose=True)

    predictor = treelite_runtime.Predictor(libpath=libpath, verbose=True)
    assert predictor.num_feature == dtrain.num_col()
    assert predictor.num_class == 1
    assert predictor.pred_transform == 'identity'
    # Floating-point metadata is compared with a tolerance, consistent with
    # test_xgb_deserializers.
    assert predictor.global_bias == pytest.approx(0.5)
    assert predictor.sigmoid_alpha == pytest.approx(1.0)

    dmat = treelite_runtime.DMatrix(X_test, dtype='float32')
    out_pred = predictor.predict(dmat)
    expected_pred = bst.predict(dtest)
    np.testing.assert_almost_equal(out_pred, expected_pred, decimal=5)


@pytest.mark.skipif(not has_sklearn(), reason='Needs scikit-learn')
@pytest.mark.parametrize('model_format', ['binary', 'json'])
@pytest.mark.parametrize('objective,expected_pred_transform',
                         [('multi:softmax', 'max_index'), ('multi:softprob', 'softmax')],
                         ids=['multi:softmax', 'multi:softprob'])
@pytest.mark.parametrize('toolchain', os_compatible_toolchains())
def test_xgb_iris(tmpdir, toolchain, objective, model_format, expected_pred_transform):
    # pylint: disable=too-many-locals
    """Test Iris data (multi-class classification)

    Trains a 3-class XGBoost model with the given ``objective``, imports it into
    Treelite, compiles it with ``toolchain`` and checks both the predictor
    metadata (including ``expected_pred_transform``) and the predictions.
    """
    from sklearn.datasets import load_iris
    from sklearn.model_selection import train_test_split

    X, y = load_iris(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
    dtrain = xgboost.DMatrix(X_train, label=y_train)
    dtest = xgboost.DMatrix(X_test, label=y_test)
    num_class = 3
    num_round = 10
    # XGBoost's key for the evaluation metric is 'eval_metric'; the previous
    # key 'metric' is not an XGBoost parameter.
    param = {'max_depth': 6, 'eta': 0.05, 'num_class': num_class, 'verbosity': 0,
             'objective': objective, 'eval_metric': 'mlogloss'}
    bst = xgboost.train(param, dtrain, num_boost_round=num_round,
                        evals=[(dtrain, 'train'), (dtest, 'test')])

    if model_format == 'json':
        model_name = 'iris.json'
        model_path = os.path.join(tmpdir, model_name)
        bst.save_model(model_path)
        model = treelite.Model.load(filename=model_path, model_format='xgboost_json')
    else:
        model = treelite.Model.from_xgboost(bst)

    assert model.num_feature == dtrain.num_col()
    assert model.num_class == num_class
    # One tree per class is expected for every boosting round.
    assert model.num_tree == num_round * num_class

    libpath = os.path.join(tmpdir, 'iris' + _libext())
    model.export_lib(toolchain=toolchain, libpath=libpath, params={}, verbose=True)

    predictor = treelite_runtime.Predictor(libpath=libpath, verbose=True)
    assert predictor.num_feature == dtrain.num_col()
    assert predictor.num_class == num_class
    assert predictor.pred_transform == expected_pred_transform
    assert predictor.global_bias == pytest.approx(0.5)
    assert predictor.sigmoid_alpha == pytest.approx(1.0)

    dmat = treelite_runtime.DMatrix(X_test, dtype='float32')
    out_pred = predictor.predict(dmat)
    expected_pred = bst.predict(dtest)
    np.testing.assert_almost_equal(out_pred, expected_pred, decimal=5)


@pytest.mark.parametrize('model_format', ['binary', 'json'])
@pytest.mark.parametrize('objective,max_label,expected_global_bias',
                         [('binary:logistic', 2, 0),
                          ('binary:hinge', 2, 0.5),
                          ('binary:logitraw', 2, 0.5),
                          ('count:poisson', 4, math.log(0.5)),
                          ('rank:pairwise', 5, 0.5),
                          ('rank:ndcg', 5, 0.5),
                          ('rank:map', 5, 0.5)],
                         ids=['binary:logistic', 'binary:hinge', 'binary:logitraw',
                              'count:poisson', 'rank:pairwise', 'rank:ndcg', 'rank:map'])
@pytest.mark.parametrize('toolchain', os_compatible_toolchains())
def test_nonlinear_objective(tmpdir, objective, max_label, expected_global_bias, toolchain,
                             model_format):
    # pylint: disable=too-many-locals,too-many-arguments
    """Test non-linear objectives with dummy data

    A small random dataset (seeded, 16 rows x 8 columns) is used to train a
    model for ``objective``; labels are drawn from ``[0, max_label)``. The model
    is saved to disk in the requested ``model_format``, loaded into Treelite,
    compiled with ``toolchain`` and compared against XGBoost.
    """
    np.random.seed(0)
    nrow = 16
    ncol = 8
    X = np.random.randn(nrow, ncol)
    y = np.random.randint(0, max_label, size=nrow)
    # Make sure the random labels actually cover the full label range.
    assert np.min(y) == 0
    assert np.max(y) == max_label - 1

    num_round = 4
    dtrain = xgboost.DMatrix(X, label=y)
    if objective.startswith('rank:'):
        # Ranking objectives need group information; use one group with all rows.
        dtrain.set_group([nrow])
    bst = xgboost.train({'objective': objective, 'base_score': 0.5, 'seed': 0},
                        dtrain=dtrain, num_boost_round=num_round)

    objective_tag = objective.replace(':', '_')
    if model_format == 'json':
        model_name = f'nonlinear_{objective_tag}.json'
        treelite_format = 'xgboost_json'
    else:
        model_name = f'nonlinear_{objective_tag}.bin'
        treelite_format = 'xgboost'
    model_path = os.path.join(tmpdir, model_name)
    bst.save_model(model_path)

    model = treelite.Model.load(filename=model_path, model_format=treelite_format)
    assert model.num_feature == dtrain.num_col()
    assert model.num_class == 1
    assert model.num_tree == num_round

    libpath = os.path.join(tmpdir, objective_tag + _libext())
    model.export_lib(toolchain=toolchain, libpath=libpath, params={}, verbose=True)

    expected_pred_transform = {'binary:logistic': 'sigmoid',
                               'binary:hinge': 'hinge',
                               'binary:logitraw': 'identity',
                               'count:poisson': 'exponential',
                               'rank:pairwise': 'identity',
                               'rank:ndcg': 'identity',
                               'rank:map': 'identity'}

    predictor = treelite_runtime.Predictor(libpath=libpath, verbose=True)
    assert predictor.num_feature == dtrain.num_col()
    assert predictor.num_class == 1
    assert predictor.pred_transform == expected_pred_transform[objective]
    np.testing.assert_almost_equal(predictor.global_bias, expected_global_bias, decimal=5)
    assert predictor.sigmoid_alpha == pytest.approx(1.0)

    dmat = treelite_runtime.DMatrix(X, dtype='float32')
    out_pred = predictor.predict(dmat)
    expected_pred = bst.predict(dtrain)
    np.testing.assert_almost_equal(out_pred, expected_pred, decimal=5)


@pytest.mark.skipif(not has_sklearn(), reason='Needs scikit-learn')
@pytest.mark.parametrize('toolchain', os_compatible_toolchains())
def test_xgb_deserializers(tmpdir, toolchain):
    # pylint: disable=too-many-locals
    """Test that every XGBoost deserializer yields an equivalent model

    One regression model is trained on the Boston data and then loaded into
    Treelite in three ways: from the binary file, from the JSON file, and from
    the JSON content passed as a string. Each resulting model is compiled with
    ``toolchain`` and must reproduce XGBoost's predictions.
    """
    # Train xgboost model
    X_train, X_test, y_train, y_test = _boston_train_test_split()
    dtrain = xgboost.DMatrix(X_train, label=y_train)
    dtest = xgboost.DMatrix(X_test, label=y_test)
    # 'verbosity' replaces the deprecated 'silent' flag (0 == silent).
    param = {'max_depth': 8, 'eta': 1, 'verbosity': 0, 'objective': 'reg:linear'}
    num_round = 10
    bst = xgboost.train(param, dtrain, num_boost_round=num_round,
                        evals=[(dtrain, 'train'), (dtest, 'test')])

    # Serialize xgboost model
    model_bin_path = os.path.join(tmpdir, 'serialized.model')
    bst.save_model(model_bin_path)
    model_json_path = os.path.join(tmpdir, 'serialized.json')
    bst.save_model(model_json_path)
    with open(model_json_path, encoding='utf-8') as file_:
        json_str = file_.read()

    # Construct Treelite models from xgboost serializations. The tag doubles as
    # the base name of the compiled library and as the label in failure output.
    models = [
        ('bin', treelite.Model.load(model_bin_path, model_format='xgboost')),
        ('json', treelite.Model.load(model_json_path, model_format='xgboost_json')),
        ('json_str', treelite.Model.from_xgboost_json(json_str)),
    ]

    dmat = treelite_runtime.DMatrix(X_test, dtype='float32')
    expected_pred = bst.predict(dtest)

    for tag, model in models:
        # Compile model to a library
        libpath = os.path.join(tmpdir, '{}{}'.format(tag, _libext()))
        model.export_lib(toolchain=toolchain, libpath=libpath,
                         params={'parallel_comp': model.num_tree})

        # Generate predictor from the compiled library
        predictor = treelite_runtime.Predictor(libpath)
        assert predictor.num_feature == dtrain.num_col(), tag
        assert predictor.num_class == 1, tag
        assert predictor.pred_transform == 'identity', tag
        assert predictor.global_bias == pytest.approx(0.5), tag
        assert predictor.sigmoid_alpha == pytest.approx(1.0), tag

        # Run inference and compare against XGBoost
        np.testing.assert_almost_equal(predictor.predict(dmat), expected_pred, decimal=5,
                                       err_msg='deserializer: {}'.format(tag))


@pytest.mark.parametrize('parallel_comp', [None, 5])
@pytest.mark.parametrize('quantize', [True, False])
@pytest.mark.parametrize('toolchain', os_compatible_toolchains())
def test_xgb_categorical_split(tmpdir, toolchain, quantize, parallel_comp):
    """Test toy XGBoost model with categorical splits

    Loads the pre-built ``xgb_toy_categorical`` JSON model from ``dataset_db``,
    compiles it with the given ``quantize`` / ``parallel_comp`` settings and
    validates the predictor with ``check_predictor``.
    """
    dataset = 'xgb_toy_categorical'
    model = treelite.Model.load(dataset_db[dataset].model, model_format='xgboost_json')
    libpath = os.path.join(tmpdir, dataset_db[dataset].libname + _libext())

    params = {
        'quantize': int(quantize),
        # ``None`` means "no parallel compilation", which is passed on as 0.
        'parallel_comp': parallel_comp or 0,
    }
    model.export_lib(toolchain=toolchain, libpath=libpath, params=params, verbose=True)

    predictor = treelite_runtime.Predictor(libpath)
    check_predictor(predictor, dataset)