From 38dd91f49137305e5822f459ac338a1497f2e400 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Fri, 5 Jan 2024 17:53:36 +0800 Subject: [PATCH] Save model in ubj as the default. (#9947) --- .../spark/ml/util/XGBoostReadWrite.scala | 3 - .../scala/spark/XGBoostClassifierSuite.scala | 17 +- .../scala/spark/XGBoostRegressorSuite.scala | 27 +- .../java/ml/dmlc/xgboost4j/java/Booster.java | 5 +- .../ml/dmlc/xgboost4j/scala/Booster.scala | 3 +- python-package/xgboost/core.py | 2 +- python-package/xgboost/testing/__init__.py | 5 +- src/c_api/c_api.cc | 16 +- tests/ci_build/lint_python.py | 2 + tests/python/test_basic.py | 182 ++++---- tests/python/test_basic_models.py | 194 +-------- tests/python/test_callback.py | 32 +- tests/python/test_config.py | 10 +- tests/python/test_dmatrix.py | 39 +- tests/python/test_early_stopping.py | 6 +- tests/python/test_eval_metrics.py | 30 +- tests/python/test_linear.py | 10 +- tests/python/test_model_io.py | 406 ++++++++++++++++++ tests/python/test_pickling.py | 27 -- tests/python/test_shap.py | 2 +- tests/python/test_updaters.py | 4 +- tests/python/test_with_pandas.py | 7 - tests/python/test_with_sklearn.py | 123 ------ 23 files changed, 600 insertions(+), 552 deletions(-) create mode 100644 tests/python/test_model_io.py diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/org/apache/spark/ml/util/XGBoostReadWrite.scala b/jvm-packages/xgboost4j-spark/src/main/scala/org/apache/spark/ml/util/XGBoostReadWrite.scala index 672241be1a01..ff732b78c08d 100644 --- a/jvm-packages/xgboost4j-spark/src/main/scala/org/apache/spark/ml/util/XGBoostReadWrite.scala +++ b/jvm-packages/xgboost4j-spark/src/main/scala/org/apache/spark/ml/util/XGBoostReadWrite.scala @@ -30,9 +30,6 @@ import org.apache.spark.ml.param.Params import org.apache.spark.ml.util.DefaultParamsReader.Metadata abstract class XGBoostWriter extends MLWriter { - - /** Currently it's using the "deprecated" format as - * default, which will be changed into `ubj` in future releases. */ def getModelFormat(): String = { optionMap.getOrElse("format", JBooster.DEFAULT_FORMAT) } diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifierSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifierSuite.scala index 9b53c764273d..48e7dae52b2e 100644 --- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifierSuite.scala +++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifierSuite.scala @@ -1,5 +1,5 @@ /* - Copyright (c) 2014-2022 by Contributors + Copyright (c) 2014-2024 by Contributors Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -432,6 +432,7 @@ class XGBoostClassifierSuite extends AnyFunSuite with PerTest with TmpFolderPerS val xgb = new XGBoostClassifier(paramMap) val model = xgb.fit(trainingDF) + // test json val modelPath = new File(tempDir.toFile, "xgbc").getPath model.write.option("format", "json").save(modelPath) val nativeJsonModelPath = new File(tempDir.toFile, "nativeModel.json").getPath @@ -439,21 +440,21 @@ class XGBoostClassifierSuite extends AnyFunSuite with PerTest with TmpFolderPerS assert(compareTwoFiles(new File(modelPath, "data/XGBoostClassificationModel").getPath, nativeJsonModelPath)) - // test default "deprecated" + // test ubj val modelUbjPath = new File(tempDir.toFile, "xgbcUbj").getPath model.write.save(modelUbjPath) - val nativeDeprecatedModelPath = new File(tempDir.toFile, "nativeModel").getPath - model.nativeBooster.saveModel(nativeDeprecatedModelPath) + val nativeUbjModelPath = new File(tempDir.toFile, "nativeModel.ubj").getPath + model.nativeBooster.saveModel(nativeUbjModelPath) assert(compareTwoFiles(new File(modelUbjPath, "data/XGBoostClassificationModel").getPath, - nativeDeprecatedModelPath)) + nativeUbjModelPath)) // json file should be indifferent with ubj file val modelJsonPath = new File(tempDir.toFile, "xgbcJson").getPath model.write.option("format", "json").save(modelJsonPath) - val nativeUbjModelPath = new File(tempDir.toFile, "nativeModel1.ubj").getPath - model.nativeBooster.saveModel(nativeUbjModelPath) + val nativeUbjModelPath1 = new File(tempDir.toFile, "nativeModel1.ubj").getPath + model.nativeBooster.saveModel(nativeUbjModelPath1) assert(!compareTwoFiles(new File(modelJsonPath, "data/XGBoostClassificationModel").getPath, - nativeUbjModelPath)) + nativeUbjModelPath1)) } test("native json model file should store feature_name and feature_type") { diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRegressorSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRegressorSuite.scala index 1bdea7a827bd..0698541c7e89 100644 --- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRegressorSuite.scala +++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRegressorSuite.scala @@ -1,5 +1,5 @@ /* - Copyright (c) 2014-2022 by Contributors + Copyright (c) 2014-2024 by Contributors Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -333,21 +333,24 @@ class XGBoostRegressorSuite extends AnyFunSuite with PerTest with TmpFolderPerSu assert(compareTwoFiles(new File(modelPath, "data/XGBoostRegressionModel").getPath, nativeJsonModelPath)) - // test default "deprecated" + // test default "ubj" val modelUbjPath = new File(tempDir.toFile, "xgbcUbj").getPath model.write.save(modelUbjPath) - val nativeDeprecatedModelPath = new File(tempDir.toFile, "nativeModel").getPath - model.nativeBooster.saveModel(nativeDeprecatedModelPath) - assert(compareTwoFiles(new File(modelUbjPath, "data/XGBoostRegressionModel").getPath, - nativeDeprecatedModelPath)) - // json file should be indifferent with ubj file - val modelJsonPath = new File(tempDir.toFile, "xgbcJson").getPath - model.write.option("format", "json").save(modelJsonPath) - val nativeUbjModelPath = new File(tempDir.toFile, "nativeModel1.ubj").getPath + val nativeUbjModelPath = new File(tempDir.toFile, "nativeModel.ubj").getPath model.nativeBooster.saveModel(nativeUbjModelPath) - assert(!compareTwoFiles(new File(modelJsonPath, "data/XGBoostRegressionModel").getPath, + + assert(compareTwoFiles(new File(modelUbjPath, "data/XGBoostRegressionModel").getPath, nativeUbjModelPath)) - } + // test the deprecated format + val modelDeprecatedPath = new File(tempDir.toFile, "modelDeprecated").getPath + model.write.option("format", "deprecated").save(modelDeprecatedPath) + + val nativeDeprecatedModelPath = new File(tempDir.toFile, "nativeModel.deprecated").getPath + model.nativeBooster.saveModel(nativeDeprecatedModelPath) + + assert(compareTwoFiles(new File(modelDeprecatedPath, "data/XGBoostRegressionModel").getPath, + nativeDeprecatedModelPath)) + } } diff --git a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/Booster.java b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/Booster.java index 51959ce0cfb1..22ed6dc82166 100644 --- a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/Booster.java +++ b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/Booster.java @@ -34,7 +34,7 @@ * Booster for xgboost, this is a model API that support interactive build of a XGBoost Model */ public class Booster implements Serializable, KryoSerializable { - public static final String DEFAULT_FORMAT = "deprecated"; + public static final String DEFAULT_FORMAT = "ubj"; private static final Log logger = LogFactory.getLog(Booster.class); // handle to the booster. private long handle = 0; @@ -788,8 +788,7 @@ private Map getFeatureImportanceFromModel( } /** - * Save model into raw byte array. Currently it's using the deprecated format as - * default, which will be changed into `ubj` in future releases. + * Save model into raw byte array in the UBJSON ("ubj") format. * * @return the saved byte array * @throws XGBoostError native error diff --git a/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/Booster.scala b/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/Booster.scala index c288bfab19fb..57c3b9a5d91d 100644 --- a/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/Booster.scala +++ b/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/Booster.scala @@ -337,8 +337,7 @@ class Booster private[xgboost4j](private[xgboost4j] var booster: JBooster) } /** - * Save model into a raw byte array. Currently it's using the deprecated format as - * default, which will be changed into `ubj` in future releases. + * Save model into a raw byte array in the UBJSON ("ubj") format. */ @throws(classOf[XGBoostError]) def toByteArray: Array[Byte] = { diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index 097fb0935f5b..d554a8d3afe9 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -2613,7 +2613,7 @@ def save_model(self, fname: Union[str, os.PathLike]) -> None: else: raise TypeError("fname must be a string or os PathLike") - def save_raw(self, raw_format: str = "deprecated") -> bytearray: + def save_raw(self, raw_format: str = "ubj") -> bytearray: """Save the model to a in memory buffer representation instead of file. Parameters diff --git a/python-package/xgboost/testing/__init__.py b/python-package/xgboost/testing/__init__.py index 373ad1c58613..46bbf880027e 100644 --- a/python-package/xgboost/testing/__init__.py +++ b/python-package/xgboost/testing/__init__.py @@ -630,7 +630,7 @@ def random_csc(t_id: int) -> sparse.csc_matrix: def make_datasets_with_margin( unweighted_strategy: strategies.SearchStrategy, -) -> Callable: +) -> Callable[[], strategies.SearchStrategy[TestDataset]]: """Factory function for creating strategies that generates datasets with weight and base margin. @@ -668,8 +668,7 @@ def weight_margin(draw: Callable) -> TestDataset: # A strategy for drawing from a set of example datasets. May add random weights to the # dataset -@memory.cache -def make_dataset_strategy() -> Callable: +def make_dataset_strategy() -> strategies.SearchStrategy[TestDataset]: _unweighted_datasets_strategy = strategies.sampled_from( [ TestDataset( diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 9aedcef2e855..d4cc217d1ee8 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -1313,10 +1313,8 @@ XGB_DLL int XGBoosterLoadModel(BoosterHandle handle, const char* fname) { namespace { void WarnOldModel() { - if (XGBOOST_VER_MAJOR >= 2) { - LOG(WARNING) << "Saving into deprecated binary model format, please consider using `json` or " - "`ubj`. Model format will default to JSON in XGBoost 2.2 if not specified."; - } + LOG(WARNING) << "Saving into deprecated binary model format, please consider using `json` or " + "`ubj`. Model format is default to UBJSON in XGBoost 2.1 if not specified."; } } // anonymous namespace @@ -1339,14 +1337,14 @@ XGB_DLL int XGBoosterSaveModel(BoosterHandle handle, const char *fname) { save_json(std::ios::out); } else if (common::FileExtension(fname) == "ubj") { save_json(std::ios::binary); - } else if (XGBOOST_VER_MAJOR == 2 && XGBOOST_VER_MINOR >= 2) { - LOG(WARNING) << "Saving model to JSON as default. You can use file extension `json`, `ubj` or " - "`deprecated` to choose between formats."; - save_json(std::ios::out); - } else { + } else if (common::FileExtension(fname) == "deprecated") { WarnOldModel(); auto *bst = static_cast(handle); bst->SaveModel(fo.get()); + } else { + LOG(WARNING) << "Saving model in the UBJSON format as default. You can use file extension:" + " `json`, `ubj` or `deprecated` to choose between formats."; + save_json(std::ios::binary); } API_END(); } diff --git a/tests/ci_build/lint_python.py b/tests/ci_build/lint_python.py index ed33a96e51d5..1e414f3b531b 100644 --- a/tests/ci_build/lint_python.py +++ b/tests/ci_build/lint_python.py @@ -27,6 +27,7 @@ class LintersPaths: "tests/python/test_quantile_dmatrix.py", "tests/python/test_tree_regularization.py", "tests/python/test_shap.py", + "tests/python/test_model_io.py", "tests/python/test_with_pandas.py", "tests/python-gpu/", "tests/python-sycl/", @@ -83,6 +84,7 @@ class LintersPaths: "tests/python/test_multi_target.py", "tests/python-gpu/test_gpu_data_iterator.py", "tests/python-gpu/load_pickle.py", + "tests/python/test_model_io.py", "tests/test_distributed/test_with_spark/test_data.py", "tests/test_distributed/test_gpu_with_spark/test_data.py", "tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py", diff --git a/tests/python/test_basic.py b/tests/python/test_basic.py index b99351c7f47b..cdc571a916df 100644 --- a/tests/python/test_basic.py +++ b/tests/python/test_basic.py @@ -10,46 +10,48 @@ import xgboost as xgb from xgboost import testing as tm -dpath = 'demo/data/' +dpath = "demo/data/" rng = np.random.RandomState(1994) class TestBasic: def test_compat(self): from xgboost.compat import lazy_isinstance + a = np.array([1, 2, 3]) - assert lazy_isinstance(a, 'numpy', 'ndarray') - assert not lazy_isinstance(a, 'numpy', 'dataframe') + assert lazy_isinstance(a, "numpy", "ndarray") + assert not lazy_isinstance(a, "numpy", "dataframe") def test_basic(self): dtrain, dtest = tm.load_agaricus(__file__) - param = {'max_depth': 2, 'eta': 1, - 'objective': 'binary:logistic'} + param = {"max_depth": 2, "eta": 1, "objective": "binary:logistic"} # specify validations set to watch performance - watchlist = [(dtrain, 'train')] + watchlist = [(dtrain, "train")] num_round = 2 - bst = xgb.train(param, dtrain, num_round, watchlist, verbose_eval=True) + bst = xgb.train(param, dtrain, num_round, evals=watchlist, verbose_eval=True) preds = bst.predict(dtrain) labels = dtrain.get_label() - err = sum(1 for i in range(len(preds)) - if int(preds[i] > 0.5) != labels[i]) / float(len(preds)) + err = sum( + 1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i] + ) / float(len(preds)) # error must be smaller than 10% assert err < 0.1 preds = bst.predict(dtest) labels = dtest.get_label() - err = sum(1 for i in range(len(preds)) - if int(preds[i] > 0.5) != labels[i]) / float(len(preds)) + err = sum( + 1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i] + ) / float(len(preds)) # error must be smaller than 10% assert err < 0.1 with tempfile.TemporaryDirectory() as tmpdir: - dtest_path = os.path.join(tmpdir, 'dtest.dmatrix') + dtest_path = os.path.join(tmpdir, "dtest.dmatrix") # save dmatrix into binary buffer dtest.save_binary(dtest_path) # save model - model_path = os.path.join(tmpdir, 'model.booster') + model_path = os.path.join(tmpdir, "model.ubj") bst.save_model(model_path) # load model and data in bst2 = xgb.Booster(model_file=model_path) @@ -59,17 +61,21 @@ def test_basic(self): assert np.sum(np.abs(preds2 - preds)) == 0 def test_metric_config(self): - # Make sure that the metric configuration happens in booster so the - # string `['error', 'auc']` doesn't get passed down to core. + # Make sure that the metric configuration happens in booster so the string + # `['error', 'auc']` doesn't get passed down to core. dtrain, dtest = tm.load_agaricus(__file__) - param = {'max_depth': 2, 'eta': 1, 'verbosity': 0, - 'objective': 'binary:logistic', 'eval_metric': ['error', 'auc']} - watchlist = [(dtest, 'eval'), (dtrain, 'train')] + param = { + "max_depth": 2, + "eta": 1, + "objective": "binary:logistic", + "eval_metric": ["error", "auc"], + } + watchlist = [(dtest, "eval"), (dtrain, "train")] num_round = 2 - booster = xgb.train(param, dtrain, num_round, watchlist) + booster = xgb.train(param, dtrain, num_round, evals=watchlist) predt_0 = booster.predict(dtrain) with tempfile.TemporaryDirectory() as tmpdir: - path = os.path.join(tmpdir, 'model.json') + path = os.path.join(tmpdir, "model.json") booster.save_model(path) booster = xgb.Booster(params=param, model_file=path) @@ -78,22 +84,23 @@ def test_metric_config(self): def test_multiclass(self): dtrain, dtest = tm.load_agaricus(__file__) - param = {'max_depth': 2, 'eta': 1, 'verbosity': 0, 'num_class': 2} + param = {"max_depth": 2, "eta": 1, "num_class": 2} # specify validations set to watch performance - watchlist = [(dtest, 'eval'), (dtrain, 'train')] + watchlist = [(dtest, "eval"), (dtrain, "train")] num_round = 2 - bst = xgb.train(param, dtrain, num_round, watchlist) + bst = xgb.train(param, dtrain, num_round, evals=watchlist) # this is prediction preds = bst.predict(dtest) labels = dtest.get_label() - err = sum(1 for i in range(len(preds)) - if preds[i] != labels[i]) / float(len(preds)) + err = sum(1 for i in range(len(preds)) if preds[i] != labels[i]) / float( + len(preds) + ) # error must be smaller than 10% assert err < 0.1 with tempfile.TemporaryDirectory() as tmpdir: - dtest_path = os.path.join(tmpdir, 'dtest.buffer') - model_path = os.path.join(tmpdir, 'xgb.model') + dtest_path = os.path.join(tmpdir, "dtest.buffer") + model_path = os.path.join(tmpdir, "model.ubj") # save dmatrix into binary buffer dtest.save_binary(dtest_path) # save model @@ -108,33 +115,39 @@ def test_multiclass(self): def test_dump(self): data = np.random.randn(100, 2) target = np.array([0, 1] * 50) - features = ['Feature1', 'Feature2'] + features = ["Feature1", "Feature2"] dm = xgb.DMatrix(data, label=target, feature_names=features) - params = {'objective': 'binary:logistic', - 'eval_metric': 'logloss', - 'eta': 0.3, - 'max_depth': 1} + params = { + "objective": "binary:logistic", + "eval_metric": "logloss", + "eta": 0.3, + "max_depth": 1, + } bst = xgb.train(params, dm, num_boost_round=1) # number of feature importances should == number of features dump1 = bst.get_dump() - assert len(dump1) == 1, 'Expected only 1 tree to be dumped.' - len(dump1[0].splitlines()) == 3, 'Expected 1 root and 2 leaves - 3 lines in dump.' + assert len(dump1) == 1, "Expected only 1 tree to be dumped." + len( + dump1[0].splitlines() + ) == 3, "Expected 1 root and 2 leaves - 3 lines in dump." dump2 = bst.get_dump(with_stats=True) - assert dump2[0].count('\n') == 3, 'Expected 1 root and 2 leaves - 3 lines in dump.' - msg = 'Expected more info when with_stats=True is given.' - assert dump2[0].find('\n') > dump1[0].find('\n'), msg + assert ( + dump2[0].count("\n") == 3 + ), "Expected 1 root and 2 leaves - 3 lines in dump." + msg = "Expected more info when with_stats=True is given." + assert dump2[0].find("\n") > dump1[0].find("\n"), msg dump3 = bst.get_dump(dump_format="json") dump3j = json.loads(dump3[0]) - assert dump3j['nodeid'] == 0, 'Expected the root node on top.' + assert dump3j["nodeid"] == 0, "Expected the root node on top." dump4 = bst.get_dump(dump_format="json", with_stats=True) dump4j = json.loads(dump4[0]) - assert 'gain' in dump4j, "Expected 'gain' to be dumped in JSON." + assert "gain" in dump4j, "Expected 'gain' to be dumped in JSON." with pytest.raises(ValueError): bst.get_dump(fmap="foo") @@ -163,12 +176,14 @@ def test_feature_score(self): def test_load_file_invalid(self): with pytest.raises(xgb.core.XGBoostError): - xgb.Booster(model_file='incorrect_path') + xgb.Booster(model_file="incorrect_path") with pytest.raises(xgb.core.XGBoostError): - xgb.Booster(model_file=u'不正なパス') + xgb.Booster(model_file="不正なパス") - @pytest.mark.parametrize("path", ["모델.ubj", "がうる・ぐら.json"], ids=["path-0", "path-1"]) + @pytest.mark.parametrize( + "path", ["모델.ubj", "がうる・ぐら.json"], ids=["path-0", "path-1"] + ) def test_unicode_path(self, tmpdir, path): model_path = pathlib.Path(tmpdir) / path dtrain, _ = tm.load_agaricus(__file__) @@ -180,12 +195,11 @@ def test_unicode_path(self, tmpdir, path): assert bst.get_dump(dump_format="text") == bst2.get_dump(dump_format="text") def test_dmatrix_numpy_init_omp(self): - rows = [1000, 11326, 15000] cols = 50 for row in rows: X = np.random.randn(row, cols) - y = np.random.randn(row).astype('f') + y = np.random.randn(row).astype("f") dm = xgb.DMatrix(X, y, nthread=0) np.testing.assert_array_equal(dm.get_label(), y) assert dm.num_row() == row @@ -198,8 +212,7 @@ def test_dmatrix_numpy_init_omp(self): def test_cv(self): dm, _ = tm.load_agaricus(__file__) - params = {'max_depth': 2, 'eta': 1, 'verbosity': 0, - 'objective': 'binary:logistic'} + params = {"max_depth": 2, "eta": 1, "objective": "binary:logistic"} # return np.ndarray cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, as_pandas=False) @@ -208,19 +221,18 @@ def test_cv(self): def test_cv_no_shuffle(self): dm, _ = tm.load_agaricus(__file__) - params = {'max_depth': 2, 'eta': 1, 'verbosity': 0, - 'objective': 'binary:logistic'} + params = {"max_depth": 2, "eta": 1, "objective": "binary:logistic"} # return np.ndarray - cv = xgb.cv(params, dm, num_boost_round=10, shuffle=False, nfold=10, - as_pandas=False) + cv = xgb.cv( + params, dm, num_boost_round=10, shuffle=False, nfold=10, as_pandas=False + ) assert isinstance(cv, dict) assert len(cv) == (4) def test_cv_explicit_fold_indices(self): dm, _ = tm.load_agaricus(__file__) - params = {'max_depth': 2, 'eta': 1, 'verbosity': 0, 'objective': - 'binary:logistic'} + params = {"max_depth": 2, "eta": 1, "objective": "binary:logistic"} folds = [ # Train Test ([1, 3], [5, 8]), @@ -228,15 +240,13 @@ def test_cv_explicit_fold_indices(self): ] # return np.ndarray - cv = xgb.cv(params, dm, num_boost_round=10, folds=folds, - as_pandas=False) + cv = xgb.cv(params, dm, num_boost_round=10, folds=folds, as_pandas=False) assert isinstance(cv, dict) assert len(cv) == (4) @pytest.mark.skipif(**tm.skip_s390x()) def test_cv_explicit_fold_indices_labels(self): - params = {'max_depth': 2, 'eta': 1, 'verbosity': 0, 'objective': - 'reg:squarederror'} + params = {"max_depth": 2, "eta": 1, "objective": "reg:squarederror"} N = 100 F = 3 dm = xgb.DMatrix(data=np.random.randn(N, F), label=np.arange(N)) @@ -252,9 +262,10 @@ def __init__(self) -> None: super().__init__() def after_iteration( - self, model, + self, + model, epoch: int, - evals_log: xgb.callback.TrainingCallback.EvalsLog + evals_log: xgb.callback.TrainingCallback.EvalsLog, ): print([fold.dtest.get_label() for fold in model.cvfolds]) @@ -263,12 +274,18 @@ def after_iteration( # Run cross validation and capture standard out to test callback result with tm.captured_output() as (out, err): xgb.cv( - params, dm, num_boost_round=1, folds=folds, callbacks=[cb], - as_pandas=False + params, + dm, + num_boost_round=1, + folds=folds, + callbacks=[cb], + as_pandas=False, ) output = out.getvalue().strip() - solution = ('[array([5., 8.], dtype=float32), array([23., 43., 11.],' + - ' dtype=float32)]') + solution = ( + "[array([5., 8.], dtype=float32), array([23., 43., 11.]," + + " dtype=float32)]" + ) assert output == solution @@ -285,7 +302,7 @@ def test_DMatrix_save_to_path(self): """Saving to a binary file using pathlib from a DMatrix.""" data = np.random.randn(100, 2) target = np.array([0, 1] * 50) - features = ['Feature1', 'Feature2'] + features = ["Feature1", "Feature2"] dm = xgb.DMatrix(data, label=target, feature_names=features) @@ -299,42 +316,3 @@ def test_Booster_init_invalid_path(self): """An invalid model_file path should raise XGBoostError.""" with pytest.raises(xgb.core.XGBoostError): xgb.Booster(model_file=Path("invalidpath")) - - def test_Booster_save_and_load(self): - """Saving and loading model files from paths.""" - save_path = Path("saveload.model") - - data = np.random.randn(100, 2) - target = np.array([0, 1] * 50) - features = ['Feature1', 'Feature2'] - - dm = xgb.DMatrix(data, label=target, feature_names=features) - params = {'objective': 'binary:logistic', - 'eval_metric': 'logloss', - 'eta': 0.3, - 'max_depth': 1} - - bst = xgb.train(params, dm, num_boost_round=1) - - # save, assert exists - bst.save_model(save_path) - assert save_path.exists() - - def dump_assertions(dump): - """Assertions for the expected dump from Booster""" - assert len(dump) == 1, 'Exepcted only 1 tree to be dumped.' - assert len(dump[0].splitlines()) == 3, 'Expected 1 root and 2 leaves - 3 lines.' - - # load the model again using Path - bst2 = xgb.Booster(model_file=save_path) - dump2 = bst2.get_dump() - dump_assertions(dump2) - - # load again using load_model - bst3 = xgb.Booster() - bst3.load_model(save_path) - dump3 = bst3.get_dump() - dump_assertions(dump3) - - # remove file - Path.unlink(save_path) diff --git a/tests/python/test_basic_models.py b/tests/python/test_basic_models.py index 8f83e1fcc16a..ca35c4e9105d 100644 --- a/tests/python/test_basic_models.py +++ b/tests/python/test_basic_models.py @@ -15,33 +15,9 @@ rng = np.random.RandomState(1994) -def json_model(model_path: str, parameters: dict) -> dict: - datasets = pytest.importorskip("sklearn.datasets") - - X, y = datasets.make_classification(64, n_features=8, n_classes=3, n_informative=6) - if parameters.get("objective", None) == "multi:softmax": - parameters["num_class"] = 3 - - dm1 = xgb.DMatrix(X, y) - - bst = xgb.train(parameters, dm1) - bst.save_model(model_path) - - if model_path.endswith("ubj"): - import ubjson - - with open(model_path, "rb") as ubjfd: - model = ubjson.load(ubjfd) - else: - with open(model_path, "r") as fd: - model = json.load(fd) - - return model - - class TestModels: def test_glm(self): - param = {'verbosity': 0, 'objective': 'binary:logistic', + param = {'objective': 'binary:logistic', 'booster': 'gblinear', 'alpha': 0.0001, 'lambda': 1, 'nthread': 1} dtrain, dtest = tm.load_agaricus(__file__) @@ -73,7 +49,7 @@ def test_dart(self): with tempfile.TemporaryDirectory() as tmpdir: dtest_path = os.path.join(tmpdir, 'dtest.dmatrix') - model_path = os.path.join(tmpdir, 'xgboost.model.dart') + model_path = os.path.join(tmpdir, "xgboost.model.dart.ubj") # save dmatrix into binary buffer dtest.save_binary(dtest_path) model_path = model_path @@ -101,7 +77,6 @@ def my_logloss(preds, dtrain): # check whether sample_type and normalize_type work num_round = 50 - param['verbosity'] = 0 param['learning_rate'] = 0.1 param['rate_drop'] = 0.1 preds_list = [] @@ -214,8 +189,7 @@ def test_multi_eval_metric(self): assert set(evals_result['eval'].keys()) == {'auc', 'error', 'logloss'} def test_fpreproc(self): - param = {'max_depth': 2, 'eta': 1, 'verbosity': 0, - 'objective': 'binary:logistic'} + param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'} num_round = 2 def fpreproc(dtrain, dtest, param): @@ -229,8 +203,7 @@ def fpreproc(dtrain, dtest, param): metrics={'auc'}, seed=0, fpreproc=fpreproc) def test_show_stdv(self): - param = {'max_depth': 2, 'eta': 1, 'verbosity': 0, - 'objective': 'binary:logistic'} + param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'} num_round = 2 dtrain, _ = tm.load_agaricus(__file__) xgb.cv(param, dtrain, num_round, nfold=5, @@ -273,142 +246,6 @@ def test_feature_names_validation(self): bst = xgb.train([], dm2) bst.predict(dm2) # success - def test_model_binary_io(self): - model_path = 'test_model_binary_io.bin' - parameters = {'tree_method': 'hist', 'booster': 'gbtree', - 'scale_pos_weight': '0.5'} - X = np.random.random((10, 3)) - y = np.random.random((10,)) - dtrain = xgb.DMatrix(X, y) - bst = xgb.train(parameters, dtrain, num_boost_round=2) - bst.save_model(model_path) - bst = xgb.Booster(model_file=model_path) - os.remove(model_path) - config = json.loads(bst.save_config()) - assert float(config['learner']['objective'][ - 'reg_loss_param']['scale_pos_weight']) == 0.5 - - buf = bst.save_raw() - from_raw = xgb.Booster() - from_raw.load_model(buf) - - buf_from_raw = from_raw.save_raw() - assert buf == buf_from_raw - - def run_model_json_io(self, parameters: dict, ext: str) -> None: - if ext == "ubj" and tm.no_ubjson()["condition"]: - pytest.skip(tm.no_ubjson()["reason"]) - - loc = locale.getpreferredencoding(False) - model_path = 'test_model_json_io.' + ext - j_model = json_model(model_path, parameters) - assert isinstance(j_model['learner'], dict) - - bst = xgb.Booster(model_file=model_path) - - bst.save_model(fname=model_path) - if ext == "ubj": - import ubjson - with open(model_path, "rb") as ubjfd: - j_model = ubjson.load(ubjfd) - else: - with open(model_path, 'r') as fd: - j_model = json.load(fd) - - assert isinstance(j_model['learner'], dict) - - os.remove(model_path) - assert locale.getpreferredencoding(False) == loc - - json_raw = bst.save_raw(raw_format="json") - from_jraw = xgb.Booster() - from_jraw.load_model(json_raw) - - ubj_raw = bst.save_raw(raw_format="ubj") - from_ubjraw = xgb.Booster() - from_ubjraw.load_model(ubj_raw) - - if parameters.get("multi_strategy", None) != "multi_output_tree": - # old binary model is not supported. - old_from_json = from_jraw.save_raw(raw_format="deprecated") - old_from_ubj = from_ubjraw.save_raw(raw_format="deprecated") - - assert old_from_json == old_from_ubj - - raw_json = bst.save_raw(raw_format="json") - pretty = json.dumps(json.loads(raw_json), indent=2) + "\n\n" - bst.load_model(bytearray(pretty, encoding="ascii")) - - if parameters.get("multi_strategy", None) != "multi_output_tree": - # old binary model is not supported. - old_from_json = from_jraw.save_raw(raw_format="deprecated") - old_from_ubj = from_ubjraw.save_raw(raw_format="deprecated") - - assert old_from_json == old_from_ubj - - rng = np.random.default_rng() - X = rng.random(size=from_jraw.num_features() * 10).reshape( - (10, from_jraw.num_features()) - ) - predt_from_jraw = from_jraw.predict(xgb.DMatrix(X)) - predt_from_bst = bst.predict(xgb.DMatrix(X)) - np.testing.assert_allclose(predt_from_jraw, predt_from_bst) - - @pytest.mark.parametrize("ext", ["json", "ubj"]) - def test_model_json_io(self, ext: str) -> None: - parameters = {"booster": "gbtree", "tree_method": "hist"} - self.run_model_json_io(parameters, ext) - parameters = { - "booster": "gbtree", - "tree_method": "hist", - "multi_strategy": "multi_output_tree", - "objective": "multi:softmax", - } - self.run_model_json_io(parameters, ext) - parameters = {"booster": "gblinear"} - self.run_model_json_io(parameters, ext) - parameters = {"booster": "dart", "tree_method": "hist"} - self.run_model_json_io(parameters, ext) - - @pytest.mark.skipif(**tm.no_json_schema()) - def test_json_io_schema(self): - import jsonschema - model_path = 'test_json_schema.json' - path = os.path.dirname( - os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - doc = os.path.join(path, 'doc', 'model.schema') - with open(doc, 'r') as fd: - schema = json.load(fd) - parameters = {'tree_method': 'hist', 'booster': 'gbtree'} - jsonschema.validate(instance=json_model(model_path, parameters), - schema=schema) - os.remove(model_path) - - parameters = {'tree_method': 'hist', 'booster': 'dart'} - jsonschema.validate(instance=json_model(model_path, parameters), - schema=schema) - os.remove(model_path) - - try: - dtrain, _ = tm.load_agaricus(__file__) - xgb.train({'objective': 'foo'}, dtrain, num_boost_round=1) - except ValueError as e: - e_str = str(e) - beg = e_str.find('Objective candidate') - end = e_str.find('Stack trace') - e_str = e_str[beg: end] - e_str = e_str.strip() - splited = e_str.splitlines() - objectives = [s.split(': ')[1] for s in splited] - j_objectives = schema['properties']['learner']['properties'][ - 'objective']['oneOf'] - objectives_from_schema = set() - for j_obj in j_objectives: - objectives_from_schema.add( - j_obj['properties']['name']['const']) - objectives = set(objectives) - assert objectives == objectives_from_schema - @pytest.mark.skipif(**tm.no_json_schema()) def test_json_dump_schema(self): import jsonschema @@ -470,29 +307,6 @@ def validate_json(obj: dict) -> None: for d in text_dump: assert d.find(r"feature \"2\"") != -1 - def test_categorical_model_io(self): - X, y = tm.make_categorical(256, 16, 71, False) - Xy = xgb.DMatrix(X, y, enable_categorical=True) - booster = xgb.train({"tree_method": "approx"}, Xy, num_boost_round=16) - predt_0 = booster.predict(Xy) - - with tempfile.TemporaryDirectory() as tempdir: - path = os.path.join(tempdir, "model.binary") - with pytest.raises(ValueError, match=r".*JSON/UBJSON.*"): - booster.save_model(path) - - path = os.path.join(tempdir, "model.json") - booster.save_model(path) - booster = xgb.Booster(model_file=path) - predt_1 = booster.predict(Xy) - np.testing.assert_allclose(predt_0, predt_1) - - path = os.path.join(tempdir, "model.ubj") - booster.save_model(path) - booster = xgb.Booster(model_file=path) - predt_1 = booster.predict(Xy) - np.testing.assert_allclose(predt_0, predt_1) - @pytest.mark.skipif(**tm.no_sklearn()) def test_attributes(self): from sklearn.datasets import load_iris diff --git a/tests/python/test_callback.py b/tests/python/test_callback.py index 262c09c99503..4893ad0749be 100644 --- a/tests/python/test_callback.py +++ b/tests/python/test_callback.py @@ -278,14 +278,18 @@ def run_eta_decay(self, tree_method): dtrain, dtest = tm.load_agaricus(__file__) - watchlist = [(dtest, 'eval'), (dtrain, 'train')] + watchlist = [(dtest, "eval"), (dtrain, "train")] num_round = 4 # learning_rates as a list # init eta with 0 to check whether learning_rates work - param = {'max_depth': 2, 'eta': 0, 'verbosity': 0, - 'objective': 'binary:logistic', 'eval_metric': 'error', - 'tree_method': tree_method} + param = { + "max_depth": 2, + "eta": 0, + "objective": "binary:logistic", + "eval_metric": "error", + "tree_method": tree_method, + } evals_result = {} bst = xgb.train( param, @@ -295,15 +299,19 @@ def run_eta_decay(self, tree_method): callbacks=[scheduler([0.8, 0.7, 0.6, 0.5])], evals_result=evals_result, ) - eval_errors_0 = list(map(float, evals_result['eval']['error'])) + eval_errors_0 = list(map(float, evals_result["eval"]["error"])) assert isinstance(bst, xgb.core.Booster) # validation error should decrease, if eta > 0 assert eval_errors_0[0] > eval_errors_0[-1] # init learning_rate with 0 to check whether learning_rates work - param = {'max_depth': 2, 'learning_rate': 0, 'verbosity': 0, - 'objective': 'binary:logistic', 'eval_metric': 'error', - 'tree_method': tree_method} + param = { + "max_depth": 2, + "learning_rate": 0, + "objective": "binary:logistic", + "eval_metric": "error", + "tree_method": tree_method, + } evals_result = {} bst = xgb.train( @@ -314,15 +322,17 @@ def run_eta_decay(self, tree_method): callbacks=[scheduler([0.8, 0.7, 0.6, 0.5])], evals_result=evals_result, ) - eval_errors_1 = list(map(float, evals_result['eval']['error'])) + eval_errors_1 = list(map(float, evals_result["eval"]["error"])) assert isinstance(bst, xgb.core.Booster) # validation error should decrease, if learning_rate > 0 assert eval_errors_1[0] > eval_errors_1[-1] # check if learning_rates override default value of eta/learning_rate param = { - 'max_depth': 2, 'verbosity': 0, 'objective': 'binary:logistic', - 'eval_metric': 'error', 'tree_method': tree_method + "max_depth": 2, + "objective": "binary:logistic", + "eval_metric": "error", + "tree_method": tree_method, } evals_result = {} bst = xgb.train( diff --git a/tests/python/test_config.py b/tests/python/test_config.py index 01b5c2d99c1a..3f741c25d527 100644 --- a/tests/python/test_config.py +++ b/tests/python/test_config.py @@ -12,6 +12,7 @@ def get_current_verbosity(): return xgb.get_config()["verbosity"] old_verbosity = get_current_verbosity() + assert old_verbosity == 1 with xgb.config_context(verbosity=verbosity_level): new_verbosity = get_current_verbosity() assert new_verbosity == verbosity_level @@ -30,7 +31,10 @@ def get_current_use_rmm_flag(): assert old_use_rmm_flag == get_current_use_rmm_flag() -def test_nested_config(): +def test_nested_config() -> None: + verbosity = xgb.get_config()["verbosity"] + assert verbosity == 1 + with xgb.config_context(verbosity=3): assert xgb.get_config()["verbosity"] == 3 with xgb.config_context(verbosity=2): @@ -45,13 +49,15 @@ def test_nested_config(): with xgb.config_context(verbosity=None): assert xgb.get_config()["verbosity"] == 3 # None has no effect - verbosity = xgb.get_config()["verbosity"] xgb.set_config(verbosity=2) assert xgb.get_config()["verbosity"] == 2 with xgb.config_context(verbosity=3): assert xgb.get_config()["verbosity"] == 3 xgb.set_config(verbosity=verbosity) # reset + verbosity = xgb.get_config()["verbosity"] + assert verbosity == 1 + def test_thread_safty(): n_threads = multiprocessing.cpu_count() diff --git a/tests/python/test_dmatrix.py b/tests/python/test_dmatrix.py index c718378c514e..9d123ddb9dbc 100644 --- a/tests/python/test_dmatrix.py +++ b/tests/python/test_dmatrix.py @@ -1,6 +1,7 @@ import csv import os import tempfile +import warnings import numpy as np import pytest @@ -24,20 +25,18 @@ def test_warn_missing(self): with pytest.warns(UserWarning): data._warn_unused_missing("uri", 4) - with pytest.warns(None) as record: + with warnings.catch_warnings(): + warnings.simplefilter("error") data._warn_unused_missing("uri", None) data._warn_unused_missing("uri", np.nan) - assert len(record) == 0 - - with pytest.warns(None) as record: + with warnings.catch_warnings(): + warnings.simplefilter("error") x = rng.randn(10, 10) y = rng.randn(10) xgb.DMatrix(x, y, missing=4) - assert len(record) == 0 - def test_dmatrix_numpy_init(self): data = np.random.randn(5, 5) dm = xgb.DMatrix(data) @@ -264,7 +263,7 @@ def test_sparse_dmatrix_csr(self): dtrain = xgb.DMatrix(x, label=rng.binomial(1, 0.3, nrow)) assert (dtrain.num_row(), dtrain.num_col()) == (nrow, ncol) watchlist = [(dtrain, "train")] - param = {"max_depth": 3, "objective": "binary:logistic", "verbosity": 0} + param = {"max_depth": 3, "objective": "binary:logistic"} bst = xgb.train(param, dtrain, 5, watchlist) bst.predict(dtrain) @@ -302,7 +301,7 @@ def test_sparse_dmatrix_csc(self): dtrain = xgb.DMatrix(x, label=rng.binomial(1, 0.3, nrow)) assert (dtrain.num_row(), dtrain.num_col()) == (nrow, ncol) watchlist = [(dtrain, "train")] - param = {"max_depth": 3, "objective": "binary:logistic", "verbosity": 0} + param = {"max_depth": 3, "objective": "binary:logistic"} bst = xgb.train(param, dtrain, 5, watchlist) bst.predict(dtrain) @@ -475,17 +474,19 @@ def verify_coo(): def test_uri(self): def verify_uri(): rank = xgb.collective.get_rank() - data = np.random.rand(5, 5) - filename = f"test_data_{rank}.csv" - with open(filename, mode="w", newline="") as file: - writer = csv.writer(file) - for row in data: - writer.writerow(row) - dtrain = xgb.DMatrix( - f"{filename}?format=csv", data_split_mode=DataSplitMode.COL - ) - assert dtrain.num_row() == 5 - assert dtrain.num_col() == 5 * xgb.collective.get_world_size() + with tempfile.TemporaryDirectory() as tmpdir: + filename = os.path.join(tmpdir, f"test_data_{rank}.csv") + + data = np.random.rand(5, 5) + with open(filename, mode="w", newline="") as file: + writer = csv.writer(file) + for row in data: + writer.writerow(row) + dtrain = xgb.DMatrix( + f"{filename}?format=csv", data_split_mode=DataSplitMode.COL + ) + assert dtrain.num_row() == 5 + assert dtrain.num_col() == 5 * xgb.collective.get_world_size() tm.run_with_rabit(world_size=3, test_fn=verify_uri) diff --git a/tests/python/test_early_stopping.py b/tests/python/test_early_stopping.py index 47f58cbd69c6..7695c6861c94 100644 --- a/tests/python/test_early_stopping.py +++ b/tests/python/test_early_stopping.py @@ -67,8 +67,10 @@ def test_cv_early_stopping(self): X = digits['data'] y = digits['target'] dm = xgb.DMatrix(X, label=y) - params = {'max_depth': 2, 'eta': 1, 'verbosity': 0, - 'objective': 'binary:logistic', 'eval_metric': 'error'} + params = { + 'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic', + 'eval_metric': 'error' + } cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, early_stopping_rounds=10) diff --git a/tests/python/test_eval_metrics.py b/tests/python/test_eval_metrics.py index 147c87a27922..92726014b7dc 100644 --- a/tests/python/test_eval_metrics.py +++ b/tests/python/test_eval_metrics.py @@ -9,29 +9,13 @@ class TestEvalMetrics: - xgb_params_01 = { - 'verbosity': 0, - 'nthread': 1, - 'eval_metric': 'error' - } - - xgb_params_02 = { - 'verbosity': 0, - 'nthread': 1, - 'eval_metric': ['error'] - } - - xgb_params_03 = { - 'verbosity': 0, - 'nthread': 1, - 'eval_metric': ['rmse', 'error'] - } - - xgb_params_04 = { - 'verbosity': 0, - 'nthread': 1, - 'eval_metric': ['error', 'rmse'] - } + xgb_params_01 = {'nthread': 1, 'eval_metric': 'error'} + + xgb_params_02 = {'nthread': 1, 'eval_metric': ['error']} + + xgb_params_03 = {'nthread': 1, 'eval_metric': ['rmse', 'error']} + + xgb_params_04 = {'nthread': 1, 'eval_metric': ['error', 'rmse']} def evalerror_01(self, preds, dtrain): labels = dtrain.get_label() diff --git a/tests/python/test_linear.py b/tests/python/test_linear.py index 0a198a036d93..5d281d4152f1 100644 --- a/tests/python/test_linear.py +++ b/tests/python/test_linear.py @@ -22,8 +22,14 @@ def train_result(param, dmat, num_rounds): result = {} - xgb.train(param, dmat, num_rounds, [(dmat, 'train')], verbose_eval=False, - evals_result=result) + xgb.train( + param, + dmat, + num_rounds, + evals=[(dmat, "train")], + verbose_eval=False, + evals_result=result, + ) return result diff --git a/tests/python/test_model_io.py b/tests/python/test_model_io.py new file mode 100644 index 000000000000..dc843a7f432b --- /dev/null +++ b/tests/python/test_model_io.py @@ -0,0 +1,406 @@ +import json +import locale +import os +import pickle +import tempfile +from pathlib import Path +from typing import List + +import numpy as np +import pytest + +import xgboost as xgb +from xgboost import testing as tm + + +def json_model(model_path: str, parameters: dict) -> dict: + datasets = pytest.importorskip("sklearn.datasets") + + X, y = datasets.make_classification(64, n_features=8, n_classes=3, n_informative=6) + if parameters.get("objective", None) == "multi:softmax": + parameters["num_class"] = 3 + + dm1 = xgb.DMatrix(X, y) + + bst = xgb.train(parameters, dm1) + bst.save_model(model_path) + + if model_path.endswith("ubj"): + import ubjson + + with open(model_path, "rb") as ubjfd: + model = ubjson.load(ubjfd) + else: + with open(model_path, "r") as fd: + model = json.load(fd) + + return model + + +class TestBoosterIO: + def run_model_json_io(self, parameters: dict, ext: str) -> None: + config = xgb.config.get_config() + assert config["verbosity"] == 1 + + if ext == "ubj" and tm.no_ubjson()["condition"]: + pytest.skip(tm.no_ubjson()["reason"]) + + loc = locale.getpreferredencoding(False) + model_path = "test_model_json_io." + ext + j_model = json_model(model_path, parameters) + assert isinstance(j_model["learner"], dict) + + bst = xgb.Booster(model_file=model_path) + + bst.save_model(fname=model_path) + if ext == "ubj": + import ubjson + + with open(model_path, "rb") as ubjfd: + j_model = ubjson.load(ubjfd) + else: + with open(model_path, "r") as fd: + j_model = json.load(fd) + + assert isinstance(j_model["learner"], dict) + + os.remove(model_path) + assert locale.getpreferredencoding(False) == loc + + json_raw = bst.save_raw(raw_format="json") + from_jraw = xgb.Booster() + from_jraw.load_model(json_raw) + + ubj_raw = bst.save_raw(raw_format="ubj") + from_ubjraw = xgb.Booster() + from_ubjraw.load_model(ubj_raw) + + if parameters.get("multi_strategy", None) != "multi_output_tree": + # Old binary model is not supported for vector leaf. + with pytest.warns(Warning, match="Model format is default to UBJSON"): + old_from_json = from_jraw.save_raw(raw_format="deprecated") + old_from_ubj = from_ubjraw.save_raw(raw_format="deprecated") + + assert old_from_json == old_from_ubj + + raw_json = bst.save_raw(raw_format="json") + pretty = json.dumps(json.loads(raw_json), indent=2) + "\n\n" + bst.load_model(bytearray(pretty, encoding="ascii")) + + if parameters.get("multi_strategy", None) != "multi_output_tree": + # old binary model is not supported. + with pytest.warns(Warning, match="Model format is default to UBJSON"): + old_from_json = from_jraw.save_raw(raw_format="deprecated") + old_from_ubj = from_ubjraw.save_raw(raw_format="deprecated") + + assert old_from_json == old_from_ubj + + rng = np.random.default_rng() + X = rng.random(size=from_jraw.num_features() * 10).reshape( + (10, from_jraw.num_features()) + ) + predt_from_jraw = from_jraw.predict(xgb.DMatrix(X)) + predt_from_bst = bst.predict(xgb.DMatrix(X)) + np.testing.assert_allclose(predt_from_jraw, predt_from_bst) + + @pytest.mark.parametrize("ext", ["json", "ubj"]) + def test_model_json_io(self, ext: str) -> None: + parameters = {"booster": "gbtree", "tree_method": "hist"} + self.run_model_json_io(parameters, ext) + parameters = { + "booster": "gbtree", + "tree_method": "hist", + "multi_strategy": "multi_output_tree", + "objective": "multi:softmax", + } + self.run_model_json_io(parameters, ext) + parameters = {"booster": "gblinear"} + self.run_model_json_io(parameters, ext) + parameters = {"booster": "dart", "tree_method": "hist"} + self.run_model_json_io(parameters, ext) + + def test_categorical_model_io(self) -> None: + X, y = tm.make_categorical(256, 16, 71, False) + Xy = xgb.DMatrix(X, y, enable_categorical=True) + booster = xgb.train({"tree_method": "approx"}, Xy, num_boost_round=16) + predt_0 = booster.predict(Xy) + + with tempfile.TemporaryDirectory() as tempdir: + path = os.path.join(tempdir, "model.deprecated") + with pytest.raises(ValueError, match=r".*JSON/UBJSON.*"): + with pytest.warns(Warning, match="Model format is default to UBJSON"): + booster.save_model(path) + + path = os.path.join(tempdir, "model.json") + booster.save_model(path) + booster = xgb.Booster(model_file=path) + predt_1 = booster.predict(Xy) + np.testing.assert_allclose(predt_0, predt_1) + + path = os.path.join(tempdir, "model.ubj") + booster.save_model(path) + booster = xgb.Booster(model_file=path) + predt_1 = booster.predict(Xy) + np.testing.assert_allclose(predt_0, predt_1) + + @pytest.mark.skipif(**tm.no_json_schema()) + def test_json_io_schema(self) -> None: + import jsonschema + + model_path = "test_json_schema.json" + path = os.path.dirname( + os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + ) + doc = os.path.join(path, "doc", "model.schema") + with open(doc, "r") as fd: + schema = json.load(fd) + parameters = {"tree_method": "hist", "booster": "gbtree"} + jsonschema.validate(instance=json_model(model_path, parameters), schema=schema) + os.remove(model_path) + + parameters = {"tree_method": "hist", "booster": "dart"} + jsonschema.validate(instance=json_model(model_path, parameters), schema=schema) + os.remove(model_path) + + try: + dtrain, _ = tm.load_agaricus(__file__) + xgb.train({"objective": "foo"}, dtrain, num_boost_round=1) + except ValueError as e: + e_str = str(e) + beg = e_str.find("Objective candidate") + end = e_str.find("Stack trace") + e_str = e_str[beg:end] + e_str = e_str.strip() + splited = e_str.splitlines() + objectives = [s.split(": ")[1] for s in splited] + j_objectives = schema["properties"]["learner"]["properties"]["objective"][ + "oneOf" + ] + objectives_from_schema = set() + for j_obj in j_objectives: + objectives_from_schema.add(j_obj["properties"]["name"]["const"]) + assert set(objectives) == objectives_from_schema + + def test_model_binary_io(self) -> None: + model_path = "test_model_binary_io.deprecated" + parameters = { + "tree_method": "hist", + "booster": "gbtree", + "scale_pos_weight": "0.5", + } + X = np.random.random((10, 3)) + y = np.random.random((10,)) + dtrain = xgb.DMatrix(X, y) + bst = xgb.train(parameters, dtrain, num_boost_round=2) + with pytest.warns(Warning, match="Model format is default to UBJSON"): + bst.save_model(model_path) + bst = xgb.Booster(model_file=model_path) + os.remove(model_path) + config = json.loads(bst.save_config()) + assert ( + float(config["learner"]["objective"]["reg_loss_param"]["scale_pos_weight"]) + == 0.5 + ) + + buf = bst.save_raw() + from_raw = xgb.Booster() + from_raw.load_model(buf) + + buf_from_raw = from_raw.save_raw() + assert buf == buf_from_raw + + def test_with_pathlib(self) -> None: + """Saving and loading model files from paths.""" + save_path = Path("model.ubj") + + rng = np.random.default_rng(1994) + + data = rng.normal(size=(100, 2)) + target = np.array([0, 1] * 50) + features = ["Feature1", "Feature2"] + + dm = xgb.DMatrix(data, label=target, feature_names=features) + params = { + "objective": "binary:logistic", + "eval_metric": "logloss", + "eta": 0.3, + "max_depth": 1, + } + + bst = xgb.train(params, dm, num_boost_round=1) + + # save, assert exists + bst.save_model(save_path) + assert save_path.exists() + + def dump_assertions(dump: List[str]) -> None: + """Assertions for the expected dump from Booster""" + assert len(dump) == 1, "Exepcted only 1 tree to be dumped." + assert ( + len(dump[0].splitlines()) == 3 + ), "Expected 1 root and 2 leaves - 3 lines." + + # load the model again using Path + bst2 = xgb.Booster(model_file=save_path) + dump2 = bst2.get_dump() + dump_assertions(dump2) + + # load again using load_model + bst3 = xgb.Booster() + bst3.load_model(save_path) + dump3 = bst3.get_dump() + dump_assertions(dump3) + + # remove file + Path.unlink(save_path) + + +def save_load_model(model_path: str) -> None: + from sklearn.datasets import load_digits + from sklearn.model_selection import KFold + + rng = np.random.RandomState(1994) + + digits = load_digits(n_class=2) + y = digits["target"] + X = digits["data"] + kf = KFold(n_splits=2, shuffle=True, random_state=rng) + for train_index, test_index in kf.split(X, y): + xgb_model = xgb.XGBClassifier().fit(X[train_index], y[train_index]) + xgb_model.save_model(model_path) + + xgb_model = xgb.XGBClassifier() + xgb_model.load_model(model_path) + + assert isinstance(xgb_model.classes_, np.ndarray) + np.testing.assert_equal(xgb_model.classes_, np.array([0, 1])) + assert isinstance(xgb_model._Booster, xgb.Booster) + + preds = xgb_model.predict(X[test_index]) + labels = y[test_index] + err = sum( + 1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i] + ) / float(len(preds)) + assert err < 0.1 + assert xgb_model.get_booster().attr("scikit_learn") is None + + # test native booster + preds = xgb_model.predict(X[test_index], output_margin=True) + booster = xgb.Booster(model_file=model_path) + predt_1 = booster.predict(xgb.DMatrix(X[test_index]), output_margin=True) + assert np.allclose(preds, predt_1) + + with pytest.raises(TypeError): + xgb_model = xgb.XGBModel() + xgb_model.load_model(model_path) + + clf = xgb.XGBClassifier(booster="gblinear", early_stopping_rounds=1) + clf.fit(X, y, eval_set=[(X, y)]) + best_iteration = clf.best_iteration + best_score = clf.best_score + predt_0 = clf.predict(X) + clf.save_model(model_path) + clf.load_model(model_path) + assert clf.booster == "gblinear" + predt_1 = clf.predict(X) + np.testing.assert_allclose(predt_0, predt_1) + assert clf.best_iteration == best_iteration + assert clf.best_score == best_score + + clfpkl = pickle.dumps(clf) + clf = pickle.loads(clfpkl) + predt_2 = clf.predict(X) + np.testing.assert_allclose(predt_0, predt_2) + assert clf.best_iteration == best_iteration + assert clf.best_score == best_score + + +@pytest.mark.skipif(**tm.no_sklearn()) +def test_sklearn_model() -> None: + from sklearn.datasets import load_digits + from sklearn.model_selection import train_test_split + + with tempfile.TemporaryDirectory() as tempdir: + model_path = os.path.join(tempdir, "digits.deprecated") + with pytest.warns(Warning, match="Model format is default to UBJSON"): + save_load_model(model_path) + + with tempfile.TemporaryDirectory() as tempdir: + model_path = os.path.join(tempdir, "digits.model.json") + save_load_model(model_path) + + with tempfile.TemporaryDirectory() as tempdir: + model_path = os.path.join(tempdir, "digits.model.ubj") + digits = load_digits(n_class=2) + y = digits["target"] + X = digits["data"] + booster = xgb.train( + {"tree_method": "hist", "objective": "binary:logistic"}, + dtrain=xgb.DMatrix(X, y), + num_boost_round=4, + ) + predt_0 = booster.predict(xgb.DMatrix(X)) + booster.save_model(model_path) + cls = xgb.XGBClassifier() + cls.load_model(model_path) + + proba = cls.predict_proba(X) + assert proba.shape[0] == X.shape[0] + assert proba.shape[1] == 2 # binary + + predt_1 = cls.predict_proba(X)[:, 1] + assert np.allclose(predt_0, predt_1) + + cls = xgb.XGBModel() + cls.load_model(model_path) + predt_1 = cls.predict(X) + assert np.allclose(predt_0, predt_1) + + # mclass + X, y = load_digits(n_class=10, return_X_y=True) + # small test_size to force early stop + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.01, random_state=1 + ) + clf = xgb.XGBClassifier( + n_estimators=64, tree_method="hist", early_stopping_rounds=2 + ) + clf.fit(X_train, y_train, eval_set=[(X_test, y_test)]) + score = clf.best_score + clf.save_model(model_path) + + clf = xgb.XGBClassifier() + clf.load_model(model_path) + assert clf.classes_.size == 10 + assert clf.objective == "multi:softprob" + + np.testing.assert_equal(clf.classes_, np.arange(10)) + assert clf.n_classes_ == 10 + + assert clf.best_iteration == 27 + assert clf.best_score == score + + +@pytest.mark.skipif(**tm.no_sklearn()) +def test_with_sklearn_obj_metric() -> None: + from sklearn.metrics import mean_squared_error + + X, y = tm.datasets.make_regression() + reg = xgb.XGBRegressor(objective=tm.ls_obj, eval_metric=mean_squared_error) + reg.fit(X, y) + + pkl = pickle.dumps(reg) + reg_1 = pickle.loads(pkl) + assert callable(reg_1.objective) + assert callable(reg_1.eval_metric) + + with tempfile.TemporaryDirectory() as tmpdir: + path = os.path.join(tmpdir, "model.json") + reg.save_model(path) + + reg_2 = xgb.XGBRegressor() + reg_2.load_model(path) + + assert not callable(reg_2.objective) + assert not callable(reg_2.eval_metric) + assert reg_2.eval_metric is None diff --git a/tests/python/test_pickling.py b/tests/python/test_pickling.py index 083a2a7fddf8..2f4d77bf0901 100644 --- a/tests/python/test_pickling.py +++ b/tests/python/test_pickling.py @@ -1,13 +1,10 @@ import json import os import pickle -import tempfile import numpy as np -import pytest import xgboost as xgb -from xgboost import testing as tm kRows = 100 kCols = 10 @@ -64,27 +61,3 @@ def check(config): params = {"nthread": 8, "tree_method": "exact", "subsample": 0.5} config = self.run_model_pickling(params) check(config) - - @pytest.mark.skipif(**tm.no_sklearn()) - def test_with_sklearn_obj_metric(self) -> None: - from sklearn.metrics import mean_squared_error - - X, y = tm.datasets.make_regression() - reg = xgb.XGBRegressor(objective=tm.ls_obj, eval_metric=mean_squared_error) - reg.fit(X, y) - - pkl = pickle.dumps(reg) - reg_1 = pickle.loads(pkl) - assert callable(reg_1.objective) - assert callable(reg_1.eval_metric) - - with tempfile.TemporaryDirectory() as tmpdir: - path = os.path.join(tmpdir, "model.json") - reg.save_model(path) - - reg_2 = xgb.XGBRegressor() - reg_2.load_model(path) - - assert not callable(reg_2.objective) - assert not callable(reg_2.eval_metric) - assert reg_2.eval_metric is None diff --git a/tests/python/test_shap.py b/tests/python/test_shap.py index bbbdcedc0895..88149c05462d 100644 --- a/tests/python/test_shap.py +++ b/tests/python/test_shap.py @@ -49,7 +49,7 @@ def test_feature_importances(self) -> None: def fn(max_depth: int, num_rounds: int) -> None: # train - params = {"max_depth": max_depth, "eta": 1, "verbosity": 0} + params = {"max_depth": max_depth, "eta": 1} bst = xgb.train(params, dtrain, num_boost_round=num_rounds) # predict diff --git a/tests/python/test_updaters.py b/tests/python/test_updaters.py index 9a0cdecd20fd..e7641348d98e 100644 --- a/tests/python/test_updaters.py +++ b/tests/python/test_updaters.py @@ -117,7 +117,6 @@ def test_hist_categorical(self): ag_param = {'max_depth': 2, 'tree_method': 'hist', 'eta': 1, - 'verbosity': 0, 'objective': 'binary:logistic', 'eval_metric': 'auc'} hist_res = {} @@ -340,7 +339,8 @@ def get_score(config: Dict) -> float: assert get_score(config_0) == get_score(config_1) - raw_booster = booster_1.save_raw(raw_format="deprecated") + with pytest.warns(Warning, match="Model format is default to UBJSON"): + raw_booster = booster_1.save_raw(raw_format="deprecated") booster_2 = xgb.Booster(model_file=raw_booster) config_2 = json.loads(booster_2.save_config()) assert get_score(config_1) == get_score(config_2) diff --git a/tests/python/test_with_pandas.py b/tests/python/test_with_pandas.py index 4dd0c640dba3..e53e7adccc1f 100644 --- a/tests/python/test_with_pandas.py +++ b/tests/python/test_with_pandas.py @@ -341,7 +341,6 @@ def test_cv_as_pandas(self): params = { "max_depth": 2, "eta": 1, - "verbosity": 0, "objective": "binary:logistic", "eval_metric": "error", } @@ -372,7 +371,6 @@ def test_cv_as_pandas(self): params = { "max_depth": 2, "eta": 1, - "verbosity": 0, "objective": "binary:logistic", "eval_metric": "auc", } @@ -383,7 +381,6 @@ def test_cv_as_pandas(self): params = { "max_depth": 2, "eta": 1, - "verbosity": 0, "objective": "binary:logistic", "eval_metric": ["auc"], } @@ -394,7 +391,6 @@ def test_cv_as_pandas(self): params = { "max_depth": 2, "eta": 1, - "verbosity": 0, "objective": "binary:logistic", "eval_metric": ["auc"], } @@ -413,7 +409,6 @@ def test_cv_as_pandas(self): params = { "max_depth": 2, "eta": 1, - "verbosity": 0, "objective": "binary:logistic", } cv = xgb.cv( @@ -424,7 +419,6 @@ def test_cv_as_pandas(self): params = { "max_depth": 2, "eta": 1, - "verbosity": 0, "objective": "binary:logistic", } cv = xgb.cv( @@ -435,7 +429,6 @@ def test_cv_as_pandas(self): params = { "max_depth": 2, "eta": 1, - "verbosity": 0, "objective": "binary:logistic", "eval_metric": ["auc"], } diff --git a/tests/python/test_with_sklearn.py b/tests/python/test_with_sklearn.py index ee0085d51a29..47f1778d64e7 100644 --- a/tests/python/test_with_sklearn.py +++ b/tests/python/test_with_sklearn.py @@ -678,7 +678,6 @@ def test_split_value_histograms(): params = { "max_depth": 6, "eta": 0.01, - "verbosity": 0, "objective": "binary:logistic", "base_score": 0.5, } @@ -897,128 +896,6 @@ def test_validation_weights(): run_validation_weights(xgb.XGBClassifier) -def save_load_model(model_path): - from sklearn.datasets import load_digits - from sklearn.model_selection import KFold - - digits = load_digits(n_class=2) - y = digits['target'] - X = digits['data'] - kf = KFold(n_splits=2, shuffle=True, random_state=rng) - for train_index, test_index in kf.split(X, y): - xgb_model = xgb.XGBClassifier().fit(X[train_index], y[train_index]) - xgb_model.save_model(model_path) - - xgb_model = xgb.XGBClassifier() - xgb_model.load_model(model_path) - - assert isinstance(xgb_model.classes_, np.ndarray) - np.testing.assert_equal(xgb_model.classes_, np.array([0, 1])) - assert isinstance(xgb_model._Booster, xgb.Booster) - - preds = xgb_model.predict(X[test_index]) - labels = y[test_index] - err = sum(1 for i in range(len(preds)) - if int(preds[i] > 0.5) != labels[i]) / float(len(preds)) - assert err < 0.1 - assert xgb_model.get_booster().attr('scikit_learn') is None - - # test native booster - preds = xgb_model.predict(X[test_index], output_margin=True) - booster = xgb.Booster(model_file=model_path) - predt_1 = booster.predict(xgb.DMatrix(X[test_index]), - output_margin=True) - assert np.allclose(preds, predt_1) - - with pytest.raises(TypeError): - xgb_model = xgb.XGBModel() - xgb_model.load_model(model_path) - - clf = xgb.XGBClassifier(booster="gblinear", early_stopping_rounds=1) - clf.fit(X, y, eval_set=[(X, y)]) - best_iteration = clf.best_iteration - best_score = clf.best_score - predt_0 = clf.predict(X) - clf.save_model(model_path) - clf.load_model(model_path) - assert clf.booster == "gblinear" - predt_1 = clf.predict(X) - np.testing.assert_allclose(predt_0, predt_1) - assert clf.best_iteration == best_iteration - assert clf.best_score == best_score - - clfpkl = pickle.dumps(clf) - clf = pickle.loads(clfpkl) - predt_2 = clf.predict(X) - np.testing.assert_allclose(predt_0, predt_2) - assert clf.best_iteration == best_iteration - assert clf.best_score == best_score - - -def test_save_load_model(): - with tempfile.TemporaryDirectory() as tempdir: - model_path = os.path.join(tempdir, "digits.model") - save_load_model(model_path) - - with tempfile.TemporaryDirectory() as tempdir: - model_path = os.path.join(tempdir, "digits.model.json") - save_load_model(model_path) - - from sklearn.datasets import load_digits - from sklearn.model_selection import train_test_split - - with tempfile.TemporaryDirectory() as tempdir: - model_path = os.path.join(tempdir, "digits.model.ubj") - digits = load_digits(n_class=2) - y = digits["target"] - X = digits["data"] - booster = xgb.train( - {"tree_method": "hist", "objective": "binary:logistic"}, - dtrain=xgb.DMatrix(X, y), - num_boost_round=4, - ) - predt_0 = booster.predict(xgb.DMatrix(X)) - booster.save_model(model_path) - cls = xgb.XGBClassifier() - cls.load_model(model_path) - - proba = cls.predict_proba(X) - assert proba.shape[0] == X.shape[0] - assert proba.shape[1] == 2 # binary - - predt_1 = cls.predict_proba(X)[:, 1] - assert np.allclose(predt_0, predt_1) - - cls = xgb.XGBModel() - cls.load_model(model_path) - predt_1 = cls.predict(X) - assert np.allclose(predt_0, predt_1) - - # mclass - X, y = load_digits(n_class=10, return_X_y=True) - # small test_size to force early stop - X_train, X_test, y_train, y_test = train_test_split( - X, y, test_size=0.01, random_state=1 - ) - clf = xgb.XGBClassifier( - n_estimators=64, tree_method="hist", early_stopping_rounds=2 - ) - clf.fit(X_train, y_train, eval_set=[(X_test, y_test)]) - score = clf.best_score - clf.save_model(model_path) - - clf = xgb.XGBClassifier() - clf.load_model(model_path) - assert clf.classes_.size == 10 - assert clf.objective == "multi:softprob" - - np.testing.assert_equal(clf.classes_, np.arange(10)) - assert clf.n_classes_ == 10 - - assert clf.best_iteration == 27 - assert clf.best_score == score - - def test_RFECV(): from sklearn.datasets import load_breast_cancer, load_diabetes, load_iris from sklearn.feature_selection import RFECV