From 9fed3c29fa576045aead176cc6cc8b1fb13298c1 Mon Sep 17 00:00:00 2001
From: Thomas Civeit <thomas@civeit.com>
Date: Thu, 11 May 2023 16:04:09 +0000
Subject: [PATCH] Initial implementation

---
 .gitignore         | 109 +-----------------------------------
 LICENSE            |   2 +-
 README.md          | 103 +++++++++++++++++++++++++++++++++-
 example.py         |  73 ++++++++++++++++++++++++
 gsbfs/__about__.py |   4 ++
 gsbfs/__init__.py  |   3 +
 gsbfs/gsbfs.py     | 134 +++++++++++++++++++++++++++++++++++++++++++++
 pyproject.toml     |  33 +++++++++++
 8 files changed, 351 insertions(+), 110 deletions(-)
 create mode 100755 example.py
 create mode 100644 gsbfs/__about__.py
 create mode 100644 gsbfs/__init__.py
 create mode 100755 gsbfs/gsbfs.py
 create mode 100644 pyproject.toml

diff --git a/.gitignore b/.gitignore
index 68bc17f..4e721d9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -51,110 +51,5 @@ coverage.xml
 .pytest_cache/
 cover/
 
-# Translations
-*.mo
-*.pot
-
-# Django stuff:
-*.log
-local_settings.py
-db.sqlite3
-db.sqlite3-journal
-
-# Flask stuff:
-instance/
-.webassets-cache
-
-# Scrapy stuff:
-.scrapy
-
-# Sphinx documentation
-docs/_build/
-
-# PyBuilder
-.pybuilder/
-target/
-
-# Jupyter Notebook
-.ipynb_checkpoints
-
-# IPython
-profile_default/
-ipython_config.py
-
-# pyenv
-#   For a library or package, you might want to ignore these files since the code is
-#   intended to run in multiple environments; otherwise, check them in:
-# .python-version
-
-# pipenv
-#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
-#   However, in case of collaboration, if having platform-specific dependencies or dependencies
-#   having no cross-platform support, pipenv may install dependencies that don't work, or not
-#   install all needed dependencies.
-#Pipfile.lock
-
-# poetry
-#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
-#   This is especially recommended for binary packages to ensure reproducibility, and is more
-#   commonly ignored for libraries.
-#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
-#poetry.lock
-
-# pdm
-#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
-#pdm.lock
-#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
-#   in version control.
-#   https://pdm.fming.dev/#use-with-ide
-.pdm.toml
-
-# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
-__pypackages__/
-
-# Celery stuff
-celerybeat-schedule
-celerybeat.pid
-
-# SageMath parsed files
-*.sage.py
-
-# Environments
-.env
-.venv
-env/
-venv/
-ENV/
-env.bak/
-venv.bak/
-
-# Spyder project settings
-.spyderproject
-.spyproject
-
-# Rope project settings
-.ropeproject
-
-# mkdocs documentation
-/site
-
-# mypy
-.mypy_cache/
-.dmypy.json
-dmypy.json
-
-# Pyre type checker
-.pyre/
-
-# pytype static type analyzer
-.pytype/
-
-# Cython debug symbols
-cython_debug/
-
-# PyCharm
-#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
-#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
-#  and can be added to the global gitignore or merged into this file.  For a more nuclear
-#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
-#.idea/
+# Development
+.idea/
diff --git a/LICENSE b/LICENSE
index ee904e8..85ff210 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2023 Thomas Civeit
+Copyright (c) 2023 Thomas Civeit <thomas@civeit.com>
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/README.md b/README.md
index 1aea254..8fdcde6 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,101 @@
-# gsbfs
-Gram-Schmidt Boruta Feature Selection
+# Gram-Schmidt Boruta Feature Selection
+
+The `gsbfs` package provides 2 simple functions:
+
+- `gso_rank(X_input, y_output)` that ranks candidate features.
+- `gso_boruta_select(X_input, y_output)` that selects features more relevant than random probes.
+
+Where `X_input` is the matrix of input features and `y_ouput` is the vector of output targets.
+The use of the Gram-Schmidt Orthogonalization (GSO) procedure for ranking the variables of a model
+that is linear with respect to its parameters was first described by
+[Chen et al. (1989)](https://www.tandfonline.com/doi/abs/10.1080/00207178908953472).
+The features are automatically selected using the Boruta algorithm described by
+[Kursa & Rudnicki (2010)](https://www.jstatsoft.org/article/view/v036i11).
+
+[![PyPI - Version](https://img.shields.io/pypi/v/gsbfs.svg)](https://pypi.org/project/gsbfs)
+[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/gsbfs.svg)](https://pypi.org/project/gsbfs)
+
+## Installation
+
+The easiest way to get `gsbfs` is to use pip:
+```console
+$ pip install gsbfs
+```
+That will install the `gsbfs` package along with all the required dependencies.
+
+## GSO Feature Ranking
+
+We consider a model with P candidate features. The input matrix X has P columns (i.e. Xp input vectors)
+corresponding to the P features, and N rows corresponding to the N observations. 
+The vector y is an N-vector that contains the values of the output for each observation.
+All Xp vectors and the y vector must be centered before proceeding with the GSO algorithm.
+
+The first iteration of the procedure consists in finding the feature vector that "best explains" the output target,
+i.e. which has the smallest angle with the output vector in the N-dimensional space of observations.
+To this end, we calculate cos(Xp, y)**2, the square of cosine of the angles between the feature vector and
+the target vector. The feature vector for which this quantity is largest is selected.
+
+In order to discard what is already explained by the first selected vector, all P-1 remaining input vectors,
+and the output vector, are projected onto the subspace (of dimension N-1) of the selected feature.
+Then, in that subspace, the projected input vector that best explains the projected  output is selected,
+and the P-2 remaining feature vectors are projected onto the subspace  of the first two ranked vectors.
+The procedure is repeated until all P input vectors are ranked.
+
+## Boruta Feature Selection
+
+The Boruta algorithm selects features that are more relevant than random probes. The latter are created by
+randomly shuffling each feature of the *existing* observations. Each Xp input vector generates a new "shadow"
+input vector (i.e. random probe) that is statistically irrelevant since the values of the feature for each
+observation have been randomly permuted. The resulting input matrix X has 2*P columns (and still N rows),
+combining the original vectors and the shadow vectors.
+
+All the candidate features are ranked following the GSO procedure described above.
+The threshold to discard features is then defined as the highest rank recorded among the shadow features.
+When the rank of an original feature is higher than this threshold (i.e. better than a random probe),
+it is called a "hit".
+
+However, let us consider 2 arbitrary vectors v1 and v2, and a random vector vr.
+It is important to note that there is still a probability of 50% that cos(v1, vr)**2 > cos(v1, v2)**2,
+so a "lucky" random probe could randomly obtain a better rank than an original feature.
+Therefore, the selection method consists in repeating the experiment (random shadow vectors + ranking) n times,
+and counting the number of "hits" for each feature. Since each independent experiment can give a
+binary outcome (hit or no hit), a series of n trials follows a binomial distribution.
+
+The statistical criteria for feature selection is then the maximum value of the probability mass function
+that will be considered as predictive (right tail), or as non-predictive (left tail) since the binomial
+distribution PMF is symmetrical. The fraction of the PMF that is neither predictive nor non-predictive,
+is considered as indecisive. For instance, considering 20 trials and a maximum probability of 0.5%,
+features having 16-20 hits will be selected, and features having 0-4 hits will be rejected.
+
+## Usage Example
+
+Let us create a data set consisting of 50 features, including only 10 informative features,
+and 5000 observations. The features will be ranked by running:
+
+```python
+from gsbfs.gsbfs import gso_rank, gso_boruta_select
+from sklearn.datasets import make_classification
+X, y = make_classification(n_samples=5000, n_features=50, n_informative=10, shuffle=False)
+ranked_indexes, cos_sq_max = gso_rank(X, y)
+```
+
+which will return an array containing the ranked features indexes, see `gso_rank()` function documentation.
+
+Or the features will be selected by running:
+
+```python
+rejected_indexes, selected_indexes, indecisive_indexes = gso_boruta_select(X, y)
+```
+which will return an array containing the selected features indexes, see `gso_boruta_select()` function
+documentation. Since the process can take several minutes or hours to complete when X is very large,
+the function provides a `verbose` option to report its completion progress.
+
+Other usage examples are provided in [example.py](example.py).
+
+Please note that the selection process relies on random probes, which means that running the procedure multiple
+times may yield different results. Moreover, when the number of observations is significantly  greater than the
+number of features (N >> P), there is a higher likelihood of selecting the informative features.
+
+## License
+
+`gsbfs` is distributed under the terms of the [MIT](https://spdx.org/licenses/MIT.html) license.
diff --git a/example.py b/example.py
new file mode 100755
index 0000000..5874777
--- /dev/null
+++ b/example.py
@@ -0,0 +1,73 @@
+# SPDX-FileCopyrightText: 2023-present Thomas Civeit <thomas@civeit.com>
+#
+# SPDX-License-Identifier: MIT
+"""Usage examples of the gsbfs package."""
+
+from gsbfs.gsbfs import gso_rank, gso_boruta_select, get_expected_hits
+import numpy as np
+from sklearn.datasets import make_classification
+
+
+def example_rank():
+    """Usage example of the gso_rank() function."""
+    # create instances
+    n_features = 50
+    n_informative = 10
+    X, y = make_classification(
+        n_samples=5000,
+        n_features=n_features,
+        n_informative=n_informative,
+        n_redundant=0,
+        n_repeated=0,
+        n_classes=3,
+        shuffle=False,  # preserve ordering. first columns = informative features
+    )
+    # shuffle instances
+    p = np.random.permutation(y.size)
+    X, y = X[p, :], y[p]
+    # rank features
+    ranked_indexes, cos_sq_max = gso_rank(X, y)
+    print(f"Ranked Features (total={n_features}, informative=[0,{n_informative-1}]):")
+    print(ranked_indexes)
+
+
+def example_hits():
+    """Usage example of the get_expected_hits() function."""
+    n_trials = 20
+    proba = 0.5
+    pmf_max = 0.005
+    rejected_hits, selected_hits = get_expected_hits(n_trials, proba, pmf_max)
+    print(f"Hits to be selected (n_trials={n_trials}, proba={proba}, pmf_max={pmf_max}):")
+    print(selected_hits)
+
+
+def example_select():
+    """Usage example of the gso_boruta_select() function."""
+    # create instances
+    n_features = 50
+    n_informative = 10
+    X, y = make_classification(
+        n_samples=5000,
+        n_features=n_features,
+        n_informative=n_informative,
+        n_redundant=0,
+        n_repeated=0,
+        n_classes=3,
+        shuffle=False,  # preserve ordering. first columns = informative features
+    )
+    # shuffle instances
+    p = np.random.permutation(y.size)
+    X, y = X[p, :], y[p]
+    # select features
+    rejected_indexes, selected_indexes, indecisive_indexes = gso_boruta_select(X, y)
+    print(f"Selected Features (total={n_features}, informative=[0,{n_informative-1}]):")
+    print(selected_indexes)
+
+
+if __name__ == '__main__':
+    print("--- example_rank() ---")
+    example_rank()
+    print("--- example_hits() ---")
+    example_hits()
+    print("--- example_select() ---")
+    example_select()
diff --git a/gsbfs/__about__.py b/gsbfs/__about__.py
new file mode 100644
index 0000000..8a4f63e
--- /dev/null
+++ b/gsbfs/__about__.py
@@ -0,0 +1,4 @@
+# SPDX-FileCopyrightText: 2023-present Thomas Civeit <thomas@civeit.com>
+#
+# SPDX-License-Identifier: MIT
+__version__ = '0.0.1'
diff --git a/gsbfs/__init__.py b/gsbfs/__init__.py
new file mode 100644
index 0000000..95a139e
--- /dev/null
+++ b/gsbfs/__init__.py
@@ -0,0 +1,3 @@
+# SPDX-FileCopyrightText: 2023-present Thomas Civeit <thomas@civeit.com>
+#
+# SPDX-License-Identifier: MIT
diff --git a/gsbfs/gsbfs.py b/gsbfs/gsbfs.py
new file mode 100755
index 0000000..3a595f0
--- /dev/null
+++ b/gsbfs/gsbfs.py
@@ -0,0 +1,134 @@
+# SPDX-FileCopyrightText: 2023-present Thomas Civeit <thomas@civeit.com>
+#
+# SPDX-License-Identifier: MIT
+"""Gram-Schmidt Boruta Feature Selection."""
+
+import numpy as np
+from scipy import stats
+import datetime as dt
+
+
+def gso_rank(X_input, y_output, n_features=None, verbose=False):
+    """Rank features using Gram-Schmidt Orthogonalization procedure.
+
+    Args:
+        X_input (numpy.ndarray): 2D array of input features, 1 raw per instance
+        y_output (numpy.ndarray): 1D array of output targets
+        n_features (int): If not None, stop after n_features features are ranked
+        verbose (bool): Report about calculation progress
+
+    Returns:
+        ranked_indexes (numpy.ndarray): Array of ranked features indexes
+        cos_sq_max (numpy.ndarray): Array of calculated cosine squared
+    """
+    X, y = X_input.copy(), y_output.copy()  # array content will be modified
+    original_indexes = np.arange(X.shape[1])
+    n_features = n_features if n_features else X.shape[1]
+    ranked_indexes = []
+    cos_sq_max = []
+    # X and y must be centered
+    X = X - np.mean(X, axis=0)
+    y = y - np.mean(y)
+    if verbose:
+        begin = dt.datetime.now()
+        print(f"<gso_rank> projecting {n_features} features...")
+    while len(ranked_indexes) != n_features:
+        # normalize all vectors, ranking based on (xk, y) angle, norm does not matter
+        X = X / np.linalg.norm(X, axis=0)
+        y = y / np.linalg.norm(y)
+        # find xk that maximizes cos2(xk, y) i.e. best explains y
+        cos_sq = np.matmul(X.transpose(), y)**2
+        if np.isnan(cos_sq).any():
+            print(f"<gso_rank> WARNING: cos_sq has NaN values")
+        k = np.argmax(cos_sq)
+        xk = X[:, k]
+        ranked_indexes.append(original_indexes[k])
+        cos_sq_max.append(cos_sq[k])
+        # remove xk from X
+        X = np.delete(X, k, axis=1)
+        original_indexes = np.delete(original_indexes, k)
+        # project all vectors onto xk: v = v_para + v_ortho
+        # replace vectors by v_ortho = v - v_para
+        for i in range(X.shape[1]):
+            X[:, i] = X[:, i] - (np.vdot(X[:, i], xk) * xk)
+        y = y - (np.vdot(y, xk) * xk)
+    if verbose:
+        end = dt.datetime.now()
+        delta = (end - begin).total_seconds()
+        print(f"<gso_rank> projected {n_features} features in {delta:.3f} sec")
+    return np.array(ranked_indexes), np.array(cos_sq_max)
+
+
+def get_expected_hits(n_trials, proba, pmf_max):
+    """Calculate expected number of hits based on binomial distribution parameters.
+
+    Args:
+        n_trials (int): Number of independent experiments
+        proba (float): Probability of success
+        pmf_max (float): Maximum value of the probability mass function
+
+    Returns:
+        rejected_hits (numpy.ndarray): Array of hits that are rejected
+        selected_hits (numpy.ndarray): Array of hits that are selected
+    """
+    pmf = np.array([stats.binom.pmf(x, n_trials, proba) for x in range(n_trials + 1)])
+    rejected_hits = np.where((pmf < pmf_max) & (np.arange(pmf.size) < (0.5 * pmf.size)))[0]
+    selected_hits = np.where((pmf < pmf_max) & (np.arange(pmf.size) > (0.5 * pmf.size)))[0]
+    return rejected_hits, selected_hits
+
+
+def gso_boruta_select(X_input, y_output, n_trials=20, proba=0.5, pmf_max=0.005, verbose=False):
+    """Select features using Boruta procedure.
+
+    Args:
+        X_input (numpy.ndarray): 2D array of input features, 1 raw per instance
+        y_output (numpy.ndarray): 1D array of output targets
+        n_trials (int): Number of independent experiments
+        proba (float): Probability of success
+        pmf_max (float): Maximum value of the probability mass function
+        verbose (bool): Report about calculation progress
+
+    Returns:
+        rejected_indexes (numpy.ndarray): Array of rejected features indexes
+        selected_indexes (numpy.ndarray): Array of selected features indexes
+        indecisive_indexes (numpy.ndarray): Array of indecisive features indexes
+    """
+    n_features = X_input.shape[1]
+    rejected_hits, selected_hits = get_expected_hits(n_trials, proba, pmf_max)
+    all_ranked_indexes = []
+    if verbose:
+        begin = dt.datetime.now()
+        print(f"<gso_boruta_select> processing {n_features} features...")
+    for repeat in range(n_trials):
+        if verbose:
+            print(f"<gso_boruta_select> trial #{repeat+1}")
+        # make X_shadow random probes by randomly permuting each column of X
+        X_shadow = X_input.copy()
+        for i in range(n_features):
+            p = np.random.permutation(X_shadow.shape[0])
+            X_shadow[:, i] = X_shadow[p, i]
+        # combine X and X_shadow
+        X_combined = np.hstack([X_input, X_shadow])
+        # rank combined original+shadow features
+        ranked_indexes, cos_sq_max = gso_rank(X_combined, y_output, verbose=verbose)
+        all_ranked_indexes.append(ranked_indexes)
+    # count which features have better ranks than the shadow features
+    hits = np.zeros(n_features).astype(int)
+    for ranked_indexes in all_ranked_indexes:
+        best_shadow_index = np.min(np.where(ranked_indexes >= n_features)[0])
+        hits_indexes = ranked_indexes[0:best_shadow_index]
+        hits[hits_indexes] += 1
+    # select/reject features based on number of hits
+    rejected_indexes, selected_indexes, indecisive_indexes = [], [], []
+    for feature_index in range(hits.size):
+        if hits[feature_index] in rejected_hits:
+            rejected_indexes.append(feature_index)
+        elif hits[feature_index] in selected_hits:
+            selected_indexes.append(feature_index)
+        else:
+            indecisive_indexes.append(feature_index)
+    if verbose:
+        end = dt.datetime.now()
+        delta = (end - begin).total_seconds()
+        print(f"<gso_boruta_select> processed {n_features} features in {delta:.3f} sec")
+    return np.array(rejected_indexes), np.array(selected_indexes), np.array(indecisive_indexes)
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..1d8d45d
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,33 @@
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[project]
+name = "gsbfs"
+description = "Gram-Schmidt Boruta Feature Selection."
+readme = "README.md"
+requires-python = ">=3.9"
+license = { file = "LICENSE" }
+keywords = ["machine-learning", "feature-selection", "feature-extraction", "feature-engineering"]
+authors = [
+  { name = "Thomas Civeit", email = "thomas@civeit.com" },
+]
+classifiers = [
+    "License :: OSI Approved :: MIT License",
+    "Programming Language :: Python :: 3.9",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Operating System :: OS Independent",
+]
+dependencies = [
+    "numpy",
+    "scipy"
+]
+dynamic = ["version"]
+
+[project.urls]
+Source = "https://github.com/tomcv/gsbfs"
+Issues = "https://github.com/tomcv/gsbfs/issues"
+
+[tool.hatch.version]
+path = "gsbfs/__about__.py"