From 9fed3c29fa576045aead176cc6cc8b1fb13298c1 Mon Sep 17 00:00:00 2001 From: Thomas Civeit Date: Thu, 11 May 2023 16:04:09 +0000 Subject: [PATCH] Initial implementation --- .gitignore | 109 +----------------------------------- LICENSE | 2 +- README.md | 103 +++++++++++++++++++++++++++++++++- example.py | 73 ++++++++++++++++++++++++ gsbfs/__about__.py | 4 ++ gsbfs/__init__.py | 3 + gsbfs/gsbfs.py | 134 +++++++++++++++++++++++++++++++++++++++++++++ pyproject.toml | 33 +++++++++++ 8 files changed, 351 insertions(+), 110 deletions(-) create mode 100755 example.py create mode 100644 gsbfs/__about__.py create mode 100644 gsbfs/__init__.py create mode 100755 gsbfs/gsbfs.py create mode 100644 pyproject.toml diff --git a/.gitignore b/.gitignore index 68bc17f..4e721d9 100644 --- a/.gitignore +++ b/.gitignore @@ -51,110 +51,5 @@ coverage.xml .pytest_cache/ cover/ -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 -db.sqlite3-journal - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -.pybuilder/ -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IPython -profile_default/ -ipython_config.py - -# pyenv -# For a library or package, you might want to ignore these files since the code is -# intended to run in multiple environments; otherwise, check them in: -# .python-version - -# pipenv -# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. -# However, in case of collaboration, if having platform-specific dependencies or dependencies -# having no cross-platform support, pipenv may install dependencies that don't work, or not -# install all needed dependencies. -#Pipfile.lock - -# poetry -# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. -# This is especially recommended for binary packages to ensure reproducibility, and is more -# commonly ignored for libraries. -# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control -#poetry.lock - -# pdm -# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. -#pdm.lock -# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it -# in version control. -# https://pdm.fming.dev/#use-with-ide -.pdm.toml - -# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm -__pypackages__/ - -# Celery stuff -celerybeat-schedule -celerybeat.pid - -# SageMath parsed files -*.sage.py - -# Environments -.env -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ -.dmypy.json -dmypy.json - -# Pyre type checker -.pyre/ - -# pytype static type analyzer -.pytype/ - -# Cython debug symbols -cython_debug/ - -# PyCharm -# JetBrains specific template is maintained in a separate JetBrains.gitignore that can -# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore -# and can be added to the global gitignore or merged into this file. For a more nuclear -# option (not recommended) you can uncomment the following to ignore the entire idea folder. -#.idea/ +# Development +.idea/ diff --git a/LICENSE b/LICENSE index ee904e8..85ff210 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2023 Thomas Civeit +Copyright (c) 2023 Thomas Civeit Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index 1aea254..8fdcde6 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,101 @@ -# gsbfs -Gram-Schmidt Boruta Feature Selection +# Gram-Schmidt Boruta Feature Selection + +The `gsbfs` package provides 2 simple functions: + +- `gso_rank(X_input, y_output)` that ranks candidate features. +- `gso_boruta_select(X_input, y_output)` that selects features more relevant than random probes. + +Where `X_input` is the matrix of input features and `y_ouput` is the vector of output targets. +The use of the Gram-Schmidt Orthogonalization (GSO) procedure for ranking the variables of a model +that is linear with respect to its parameters was first described by +[Chen et al. (1989)](https://www.tandfonline.com/doi/abs/10.1080/00207178908953472). +The features are automatically selected using the Boruta algorithm described by +[Kursa & Rudnicki (2010)](https://www.jstatsoft.org/article/view/v036i11). + +[![PyPI - Version](https://img.shields.io/pypi/v/gsbfs.svg)](https://pypi.org/project/gsbfs) +[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/gsbfs.svg)](https://pypi.org/project/gsbfs) + +## Installation + +The easiest way to get `gsbfs` is to use pip: +```console +$ pip install gsbfs +``` +That will install the `gsbfs` package along with all the required dependencies. + +## GSO Feature Ranking + +We consider a model with P candidate features. The input matrix X has P columns (i.e. Xp input vectors) +corresponding to the P features, and N rows corresponding to the N observations. +The vector y is an N-vector that contains the values of the output for each observation. +All Xp vectors and the y vector must be centered before proceeding with the GSO algorithm. + +The first iteration of the procedure consists in finding the feature vector that "best explains" the output target, +i.e. which has the smallest angle with the output vector in the N-dimensional space of observations. +To this end, we calculate cos(Xp, y)**2, the square of cosine of the angles between the feature vector and +the target vector. The feature vector for which this quantity is largest is selected. + +In order to discard what is already explained by the first selected vector, all P-1 remaining input vectors, +and the output vector, are projected onto the subspace (of dimension N-1) of the selected feature. +Then, in that subspace, the projected input vector that best explains the projected output is selected, +and the P-2 remaining feature vectors are projected onto the subspace of the first two ranked vectors. +The procedure is repeated until all P input vectors are ranked. + +## Boruta Feature Selection + +The Boruta algorithm selects features that are more relevant than random probes. The latter are created by +randomly shuffling each feature of the *existing* observations. Each Xp input vector generates a new "shadow" +input vector (i.e. random probe) that is statistically irrelevant since the values of the feature for each +observation have been randomly permuted. The resulting input matrix X has 2*P columns (and still N rows), +combining the original vectors and the shadow vectors. + +All the candidate features are ranked following the GSO procedure described above. +The threshold to discard features is then defined as the highest rank recorded among the shadow features. +When the rank of an original feature is higher than this threshold (i.e. better than a random probe), +it is called a "hit". + +However, let us consider 2 arbitrary vectors v1 and v2, and a random vector vr. +It is important to note that there is still a probability of 50% that cos(v1, vr)**2 > cos(v1, v2)**2, +so a "lucky" random probe could randomly obtain a better rank than an original feature. +Therefore, the selection method consists in repeating the experiment (random shadow vectors + ranking) n times, +and counting the number of "hits" for each feature. Since each independent experiment can give a +binary outcome (hit or no hit), a series of n trials follows a binomial distribution. + +The statistical criteria for feature selection is then the maximum value of the probability mass function +that will be considered as predictive (right tail), or as non-predictive (left tail) since the binomial +distribution PMF is symmetrical. The fraction of the PMF that is neither predictive nor non-predictive, +is considered as indecisive. For instance, considering 20 trials and a maximum probability of 0.5%, +features having 16-20 hits will be selected, and features having 0-4 hits will be rejected. + +## Usage Example + +Let us create a data set consisting of 50 features, including only 10 informative features, +and 5000 observations. The features will be ranked by running: + +```python +from gsbfs.gsbfs import gso_rank, gso_boruta_select +from sklearn.datasets import make_classification +X, y = make_classification(n_samples=5000, n_features=50, n_informative=10, shuffle=False) +ranked_indexes, cos_sq_max = gso_rank(X, y) +``` + +which will return an array containing the ranked features indexes, see `gso_rank()` function documentation. + +Or the features will be selected by running: + +```python +rejected_indexes, selected_indexes, indecisive_indexes = gso_boruta_select(X, y) +``` +which will return an array containing the selected features indexes, see `gso_boruta_select()` function +documentation. Since the process can take several minutes or hours to complete when X is very large, +the function provides a `verbose` option to report its completion progress. + +Other usage examples are provided in [example.py](example.py). + +Please note that the selection process relies on random probes, which means that running the procedure multiple +times may yield different results. Moreover, when the number of observations is significantly greater than the +number of features (N >> P), there is a higher likelihood of selecting the informative features. + +## License + +`gsbfs` is distributed under the terms of the [MIT](https://spdx.org/licenses/MIT.html) license. diff --git a/example.py b/example.py new file mode 100755 index 0000000..5874777 --- /dev/null +++ b/example.py @@ -0,0 +1,73 @@ +# SPDX-FileCopyrightText: 2023-present Thomas Civeit +# +# SPDX-License-Identifier: MIT +"""Usage examples of the gsbfs package.""" + +from gsbfs.gsbfs import gso_rank, gso_boruta_select, get_expected_hits +import numpy as np +from sklearn.datasets import make_classification + + +def example_rank(): + """Usage example of the gso_rank() function.""" + # create instances + n_features = 50 + n_informative = 10 + X, y = make_classification( + n_samples=5000, + n_features=n_features, + n_informative=n_informative, + n_redundant=0, + n_repeated=0, + n_classes=3, + shuffle=False, # preserve ordering. first columns = informative features + ) + # shuffle instances + p = np.random.permutation(y.size) + X, y = X[p, :], y[p] + # rank features + ranked_indexes, cos_sq_max = gso_rank(X, y) + print(f"Ranked Features (total={n_features}, informative=[0,{n_informative-1}]):") + print(ranked_indexes) + + +def example_hits(): + """Usage example of the get_expected_hits() function.""" + n_trials = 20 + proba = 0.5 + pmf_max = 0.005 + rejected_hits, selected_hits = get_expected_hits(n_trials, proba, pmf_max) + print(f"Hits to be selected (n_trials={n_trials}, proba={proba}, pmf_max={pmf_max}):") + print(selected_hits) + + +def example_select(): + """Usage example of the gso_boruta_select() function.""" + # create instances + n_features = 50 + n_informative = 10 + X, y = make_classification( + n_samples=5000, + n_features=n_features, + n_informative=n_informative, + n_redundant=0, + n_repeated=0, + n_classes=3, + shuffle=False, # preserve ordering. first columns = informative features + ) + # shuffle instances + p = np.random.permutation(y.size) + X, y = X[p, :], y[p] + # select features + rejected_indexes, selected_indexes, indecisive_indexes = gso_boruta_select(X, y) + print(f"Selected Features (total={n_features}, informative=[0,{n_informative-1}]):") + print(selected_indexes) + + +if __name__ == '__main__': + print("--- example_rank() ---") + example_rank() + print("--- example_hits() ---") + example_hits() + print("--- example_select() ---") + example_select() diff --git a/gsbfs/__about__.py b/gsbfs/__about__.py new file mode 100644 index 0000000..8a4f63e --- /dev/null +++ b/gsbfs/__about__.py @@ -0,0 +1,4 @@ +# SPDX-FileCopyrightText: 2023-present Thomas Civeit +# +# SPDX-License-Identifier: MIT +__version__ = '0.0.1' diff --git a/gsbfs/__init__.py b/gsbfs/__init__.py new file mode 100644 index 0000000..95a139e --- /dev/null +++ b/gsbfs/__init__.py @@ -0,0 +1,3 @@ +# SPDX-FileCopyrightText: 2023-present Thomas Civeit +# +# SPDX-License-Identifier: MIT diff --git a/gsbfs/gsbfs.py b/gsbfs/gsbfs.py new file mode 100755 index 0000000..3a595f0 --- /dev/null +++ b/gsbfs/gsbfs.py @@ -0,0 +1,134 @@ +# SPDX-FileCopyrightText: 2023-present Thomas Civeit +# +# SPDX-License-Identifier: MIT +"""Gram-Schmidt Boruta Feature Selection.""" + +import numpy as np +from scipy import stats +import datetime as dt + + +def gso_rank(X_input, y_output, n_features=None, verbose=False): + """Rank features using Gram-Schmidt Orthogonalization procedure. + + Args: + X_input (numpy.ndarray): 2D array of input features, 1 raw per instance + y_output (numpy.ndarray): 1D array of output targets + n_features (int): If not None, stop after n_features features are ranked + verbose (bool): Report about calculation progress + + Returns: + ranked_indexes (numpy.ndarray): Array of ranked features indexes + cos_sq_max (numpy.ndarray): Array of calculated cosine squared + """ + X, y = X_input.copy(), y_output.copy() # array content will be modified + original_indexes = np.arange(X.shape[1]) + n_features = n_features if n_features else X.shape[1] + ranked_indexes = [] + cos_sq_max = [] + # X and y must be centered + X = X - np.mean(X, axis=0) + y = y - np.mean(y) + if verbose: + begin = dt.datetime.now() + print(f" projecting {n_features} features...") + while len(ranked_indexes) != n_features: + # normalize all vectors, ranking based on (xk, y) angle, norm does not matter + X = X / np.linalg.norm(X, axis=0) + y = y / np.linalg.norm(y) + # find xk that maximizes cos2(xk, y) i.e. best explains y + cos_sq = np.matmul(X.transpose(), y)**2 + if np.isnan(cos_sq).any(): + print(f" WARNING: cos_sq has NaN values") + k = np.argmax(cos_sq) + xk = X[:, k] + ranked_indexes.append(original_indexes[k]) + cos_sq_max.append(cos_sq[k]) + # remove xk from X + X = np.delete(X, k, axis=1) + original_indexes = np.delete(original_indexes, k) + # project all vectors onto xk: v = v_para + v_ortho + # replace vectors by v_ortho = v - v_para + for i in range(X.shape[1]): + X[:, i] = X[:, i] - (np.vdot(X[:, i], xk) * xk) + y = y - (np.vdot(y, xk) * xk) + if verbose: + end = dt.datetime.now() + delta = (end - begin).total_seconds() + print(f" projected {n_features} features in {delta:.3f} sec") + return np.array(ranked_indexes), np.array(cos_sq_max) + + +def get_expected_hits(n_trials, proba, pmf_max): + """Calculate expected number of hits based on binomial distribution parameters. + + Args: + n_trials (int): Number of independent experiments + proba (float): Probability of success + pmf_max (float): Maximum value of the probability mass function + + Returns: + rejected_hits (numpy.ndarray): Array of hits that are rejected + selected_hits (numpy.ndarray): Array of hits that are selected + """ + pmf = np.array([stats.binom.pmf(x, n_trials, proba) for x in range(n_trials + 1)]) + rejected_hits = np.where((pmf < pmf_max) & (np.arange(pmf.size) < (0.5 * pmf.size)))[0] + selected_hits = np.where((pmf < pmf_max) & (np.arange(pmf.size) > (0.5 * pmf.size)))[0] + return rejected_hits, selected_hits + + +def gso_boruta_select(X_input, y_output, n_trials=20, proba=0.5, pmf_max=0.005, verbose=False): + """Select features using Boruta procedure. + + Args: + X_input (numpy.ndarray): 2D array of input features, 1 raw per instance + y_output (numpy.ndarray): 1D array of output targets + n_trials (int): Number of independent experiments + proba (float): Probability of success + pmf_max (float): Maximum value of the probability mass function + verbose (bool): Report about calculation progress + + Returns: + rejected_indexes (numpy.ndarray): Array of rejected features indexes + selected_indexes (numpy.ndarray): Array of selected features indexes + indecisive_indexes (numpy.ndarray): Array of indecisive features indexes + """ + n_features = X_input.shape[1] + rejected_hits, selected_hits = get_expected_hits(n_trials, proba, pmf_max) + all_ranked_indexes = [] + if verbose: + begin = dt.datetime.now() + print(f" processing {n_features} features...") + for repeat in range(n_trials): + if verbose: + print(f" trial #{repeat+1}") + # make X_shadow random probes by randomly permuting each column of X + X_shadow = X_input.copy() + for i in range(n_features): + p = np.random.permutation(X_shadow.shape[0]) + X_shadow[:, i] = X_shadow[p, i] + # combine X and X_shadow + X_combined = np.hstack([X_input, X_shadow]) + # rank combined original+shadow features + ranked_indexes, cos_sq_max = gso_rank(X_combined, y_output, verbose=verbose) + all_ranked_indexes.append(ranked_indexes) + # count which features have better ranks than the shadow features + hits = np.zeros(n_features).astype(int) + for ranked_indexes in all_ranked_indexes: + best_shadow_index = np.min(np.where(ranked_indexes >= n_features)[0]) + hits_indexes = ranked_indexes[0:best_shadow_index] + hits[hits_indexes] += 1 + # select/reject features based on number of hits + rejected_indexes, selected_indexes, indecisive_indexes = [], [], [] + for feature_index in range(hits.size): + if hits[feature_index] in rejected_hits: + rejected_indexes.append(feature_index) + elif hits[feature_index] in selected_hits: + selected_indexes.append(feature_index) + else: + indecisive_indexes.append(feature_index) + if verbose: + end = dt.datetime.now() + delta = (end - begin).total_seconds() + print(f" processed {n_features} features in {delta:.3f} sec") + return np.array(rejected_indexes), np.array(selected_indexes), np.array(indecisive_indexes) diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..1d8d45d --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,33 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "gsbfs" +description = "Gram-Schmidt Boruta Feature Selection." +readme = "README.md" +requires-python = ">=3.9" +license = { file = "LICENSE" } +keywords = ["machine-learning", "feature-selection", "feature-extraction", "feature-engineering"] +authors = [ + { name = "Thomas Civeit", email = "thomas@civeit.com" }, +] +classifiers = [ + "License :: OSI Approved :: MIT License", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Operating System :: OS Independent", +] +dependencies = [ + "numpy", + "scipy" +] +dynamic = ["version"] + +[project.urls] +Source = "https://github.com/tomcv/gsbfs" +Issues = "https://github.com/tomcv/gsbfs/issues" + +[tool.hatch.version] +path = "gsbfs/__about__.py"