# -*- coding: utf-8 -*-
"""Tests for LightGBM integration"""
import os

import numpy as np
import scipy.sparse
import pytest

import treelite
import treelite_runtime
from treelite.contrib import _libext
from treelite.util import has_sklearn

from .metadata import dataset_db, _qualify_path
from .util import os_compatible_toolchains, os_platform, check_predictor

try:
    import lightgbm
except ImportError:
    # skip this test suite if LightGBM is not installed
    pytest.skip('LightGBM not installed; skipping', allow_module_level=True)


@pytest.mark.skipif(not has_sklearn(), reason='Needs scikit-learn')
@pytest.mark.parametrize('toolchain', os_compatible_toolchains())
@pytest.mark.parametrize('objective', ['regression', 'regression_l1', 'huber'])
@pytest.mark.parametrize('reg_sqrt', [True, False])
def test_lightgbm_regression(tmpdir, objective, reg_sqrt, toolchain):
    # pylint: disable=too-many-locals
    """Test a regressor"""
    model_path = os.path.join(tmpdir, 'boston_lightgbm.txt')

    from sklearn.datasets import load_boston
    from sklearn.model_selection import train_test_split
    X, y = load_boston(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
    dtrain = lightgbm.Dataset(X_train, y_train, free_raw_data=False)
    dtest = lightgbm.Dataset(X_test, y_test, reference=dtrain, free_raw_data=False)
    param = {'task': 'train', 'boosting_type': 'gbdt', 'objective': objective,
             'reg_sqrt': reg_sqrt, 'metric': 'rmse', 'num_leaves': 31, 'learning_rate': 0.05}
    bst = lightgbm.train(param, dtrain, num_boost_round=10, valid_sets=[dtrain, dtest],
                         valid_names=['train', 'test'])
    bst.save_model(model_path)

    model = treelite.Model.load(model_path, model_format='lightgbm')
    libpath = os.path.join(tmpdir, f'boston_{objective}' + _libext())
    model.export_lib(toolchain=toolchain, libpath=libpath, params={'quantize': 1}, verbose=True)

    predictor = treelite_runtime.Predictor(libpath=libpath, verbose=True)
    dmat = treelite_runtime.DMatrix(X_test, dtype='float64')
    out_pred = predictor.predict(dmat)
    expected_pred = bst.predict(X_test)
    np.testing.assert_almost_equal(out_pred, expected_pred, decimal=5)


@pytest.mark.skipif(not has_sklearn(), reason='Needs scikit-learn')
@pytest.mark.parametrize('toolchain', os_compatible_toolchains())
@pytest.mark.parametrize('boosting_type', ['gbdt', 'rf'])
@pytest.mark.parametrize('objective', ['multiclass', 'multiclassova'])
def test_lightgbm_multiclass_classification(tmpdir, objective, boosting_type, toolchain):
    # pylint: disable=too-many-locals
    """Test a multi-class classifier"""
    model_path = os.path.join(tmpdir, 'iris_lightgbm.txt')

    from sklearn.datasets import load_iris
    from sklearn.model_selection import train_test_split
    X, y = load_iris(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
    dtrain = lightgbm.Dataset(X_train, y_train, free_raw_data=False)
    dtest = lightgbm.Dataset(X_test, y_test, reference=dtrain, free_raw_data=False)
    param = {'task': 'train', 'boosting': boosting_type, 'objective': objective,
             'metric': 'multi_logloss', 'num_class': 3, 'num_leaves': 31, 'learning_rate': 0.05}
    if boosting_type == 'rf':
        param.update({'bagging_fraction': 0.8, 'bagging_freq': 1})
    bst = lightgbm.train(param, dtrain, num_boost_round=10, valid_sets=[dtrain, dtest],
                         valid_names=['train', 'test'])
    bst.save_model(model_path)

    model = treelite.Model.load(model_path, model_format='lightgbm')
    libpath = os.path.join(tmpdir, f'iris_{objective}' + _libext())
    model.export_lib(toolchain=toolchain, libpath=libpath, params={'quantize': 1}, verbose=True)

    predictor = treelite_runtime.Predictor(libpath=libpath, verbose=True)
    dmat = treelite_runtime.DMatrix(X_test, dtype='float64')
    out_pred = predictor.predict(dmat)
    expected_pred = bst.predict(X_test)
    np.testing.assert_almost_equal(out_pred, expected_pred, decimal=5)
@pytest.mark.parametrize('toolchain', os_compatible_toolchains())
@pytest.mark.parametrize('objective', ['binary', 'xentlambda', 'xentropy'])
def test_lightgbm_binary_classification(tmpdir, objective, toolchain):
    # pylint: disable=too-many-locals
    """Test a binary classifier"""
    dataset = 'mushroom'
    model_path = os.path.join(tmpdir, 'mushroom_lightgbm.txt')
    dtest_path = dataset_db[dataset].dtest

    dtrain = lightgbm.Dataset(dataset_db[dataset].dtrain)
    dtest = lightgbm.Dataset(dtest_path, reference=dtrain)
    param = {'task': 'train', 'boosting_type': 'gbdt', 'objective': objective,
             'metric': 'auc', 'num_leaves': 7, 'learning_rate': 0.1}
    bst = lightgbm.train(param, dtrain, num_boost_round=10, valid_sets=[dtrain, dtest],
                         valid_names=['train', 'test'])
    bst.save_model(model_path)
    expected_prob = bst.predict(dtest_path)
    expected_margin = bst.predict(dtest_path, raw_score=True)

    model = treelite.Model.load(model_path, model_format='lightgbm')
    libpath = os.path.join(tmpdir, f'agaricus_{objective}' + _libext())
    dmat = treelite_runtime.DMatrix(dtest_path, dtype='float64')
    model.export_lib(toolchain=toolchain, libpath=libpath, params={}, verbose=True)
    predictor = treelite_runtime.Predictor(libpath, verbose=True)
    out_prob = predictor.predict(dmat)
    np.testing.assert_almost_equal(out_prob, expected_prob, decimal=5)
    out_margin = predictor.predict(dmat, pred_margin=True)
    np.testing.assert_almost_equal(out_margin, expected_margin, decimal=5)


@pytest.mark.parametrize('toolchain', os_compatible_toolchains())
@pytest.mark.parametrize('parallel_comp', [None, 2])
@pytest.mark.parametrize('quantize', [True, False])
def test_categorical_data(tmpdir, quantize, parallel_comp, toolchain):
    """
    LightGBM is able to produce categorical splits directly, so that categorical data
    don't have to be one-hot encoded. Test if Treelite is able to handle categorical splits.

    This toy example contains two features, both of which are categorical.
    The first has cardinality 3 and the second 5. The label was generated using the formula

        y = f(x0) + g(x1) + [noise with std=0.1]

    where f and g are given by the tables

        x0  f(x0)        x1  g(x1)
         0    -20         0     -2
         1    -10         1     -1
         2      0         2      0
                          3      1
                          4      2
    """
    dataset = 'toy_categorical'
    libpath = os.path.join(tmpdir, dataset_db[dataset].libname + _libext())
    model = treelite.Model.load(dataset_db[dataset].model, model_format=dataset_db[dataset].format)
    params = {
        'quantize': (1 if quantize else 0),
        'parallel_comp': (parallel_comp if parallel_comp else 0)
    }
    model.export_lib(toolchain=toolchain, libpath=libpath, params=params, verbose=True)

    predictor = treelite_runtime.Predictor(libpath=libpath, verbose=True)
    check_predictor(predictor, dataset)
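# A minimal, hypothetical sketch (not the actual script that generated the 'toy_categorical'
# dataset referenced in dataset_db) showing how data matching the docstring of
# test_categorical_data above could be produced: two categorical features with cardinalities
# 3 and 5, and a label y = f(x0) + g(x1) + Gaussian noise with std=0.1.
def _make_toy_categorical_sketch(n_rows=1000, seed=0):
    """Hypothetical helper, for illustration only; not used by any test in this module."""
    rng = np.random.default_rng(seed)
    f_table = np.array([-20.0, -10.0, 0.0])          # f(x0) for x0 in {0, 1, 2}
    g_table = np.array([-2.0, -1.0, 0.0, 1.0, 2.0])  # g(x1) for x1 in {0, 1, 2, 3, 4}
    x0 = rng.integers(low=0, high=3, size=n_rows)
    x1 = rng.integers(low=0, high=5, size=n_rows)
    X = np.column_stack((x0, x1)).astype(np.float64)
    y = f_table[x0] + g_table[x1] + rng.normal(scale=0.1, size=n_rows)
    return X, y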
@pytest.mark.parametrize('toolchain', os_compatible_toolchains())
@pytest.mark.parametrize('quantize', [True, False])
def test_sparse_ranking_model(tmpdir, quantize, toolchain):
    # pylint: disable=too-many-locals
    """Generate a LightGBM ranking model with highly sparse data.

    This example is inspired by https://github.com/dmlc/treelite/issues/222. It verifies that
    Treelite is able to accommodate the unique behavior of LightGBM when it comes to handling
    missing values. LightGBM offers two modes of handling missing values:

    1. Assign a default direction to each test node. This is similar to how XGBoost handles
       missing values.
    2. Replace missing values with zeroes (0.0). This behavior is unique to LightGBM.

    The mode is controlled by the missing_value_to_zero_ field of each test node. This example
    is crafted so as to invoke the second mode of missing value handling; a standalone sketch of
    the resulting sparse-versus-dense equivalence appears at the bottom of this module.
    """
    rng = np.random.default_rng(seed=2020)
    X = scipy.sparse.random(m=10, n=206947, format='csr', dtype=np.float64, random_state=0,
                            density=0.0001)
    X.data = rng.standard_normal(size=X.data.shape[0], dtype=np.float64)
    y = rng.integers(low=0, high=5, size=X.shape[0])

    params = {
        'objective': 'lambdarank',
        'num_leaves': 32,
        'lambda_l1': 0.0,
        'lambda_l2': 0.0,
        'min_gain_to_split': 0.0,
        'learning_rate': 1.0,
        'min_data_in_leaf': 1
    }
    model_path = os.path.join(tmpdir, 'sparse_ranking_lightgbm.txt')
    libpath = os.path.join(tmpdir, 'sparse_ranking_lgb' + _libext())

    dtrain = lightgbm.Dataset(X, label=y, group=[X.shape[0]])
    bst = lightgbm.train(params, dtrain, num_boost_round=1)
    lgb_out = bst.predict(X)
    bst.save_model(model_path)

    model = treelite.Model.load(model_path, model_format='lightgbm')
    params = {'quantize': (1 if quantize else 0)}
    model.export_lib(toolchain=toolchain, libpath=libpath, params=params, verbose=True)

    predictor = treelite_runtime.Predictor(libpath, verbose=True)
    dmat = treelite_runtime.DMatrix(X)
    out = predictor.predict(dmat)
    np.testing.assert_almost_equal(out, lgb_out)


@pytest.mark.parametrize('toolchain', os_compatible_toolchains())
@pytest.mark.parametrize('quantize', [True, False])
def test_sparse_categorical_model(tmpdir, quantize, toolchain):
    """
    LightGBM is able to produce categorical splits directly, so that categorical data
    don't have to be one-hot encoded. Test if Treelite is able to handle categorical splits.

    This example produces a model with high-cardinality categorical variables. The training data
    contains many missing values, so we need to match LightGBM's behavior for handling missing
    values.
    """
    if toolchain == 'clang':
        pytest.xfail(reason='Clang cannot handle long if conditional')
    if os_platform() == 'windows':
        pytest.xfail(reason='MSVC cannot handle long if conditional')
    if os_platform() == 'osx':
        pytest.xfail(reason='Apple Clang cannot handle long if conditional')
    dataset = 'sparse_categorical'
    libpath = os.path.join(tmpdir, dataset_db[dataset].libname + _libext())
    model = treelite.Model.load(dataset_db[dataset].model, model_format=dataset_db[dataset].format)
    params = {'quantize': (1 if quantize else 0)}
    model.export_lib(toolchain=toolchain, libpath=libpath, params=params, verbose=True,
                     options=['-O0'])

    predictor = treelite_runtime.Predictor(libpath=libpath, verbose=True)
    check_predictor(predictor, dataset)


def test_constant_tree():
    """Test whether Treelite can handle a LightGBM model that contains a constant tree
    (a tree with a single node)"""
    model_path = _qualify_path('lightgbm_constant_tree', 'model_with_constant_tree.txt')
    model = treelite.Model.load(model_path, model_format='lightgbm')
    assert model.num_tree == 2
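# A minimal, hypothetical sketch (not invoked by the tests above) of the equivalence described
# in the docstring of test_sparse_ranking_model: when LightGBM's "replace missing values with
# zeroes" mode applies, predicting on a sparse matrix (unstored entries treated as missing)
# should agree with predicting on its densified copy (unstored entries materialized as 0.0).
def _compare_sparse_vs_dense_sketch(predictor, X_sparse):
    """Hypothetical helper; `predictor` is a treelite_runtime.Predictor and `X_sparse` is a
    scipy.sparse CSR matrix accepted by treelite_runtime.DMatrix."""
    dmat_sparse = treelite_runtime.DMatrix(X_sparse)
    dmat_dense = treelite_runtime.DMatrix(X_sparse.toarray())  # unstored entries become 0.0
    np.testing.assert_almost_equal(predictor.predict(dmat_sparse),
                                   predictor.predict(dmat_dense))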