Initial implementation

tomcv · May 11, 2023 · 9fed3c2 · 9fed3c2
1 parent fa24835
commit 9fed3c2
Show file tree

Hide file tree

Showing 8 changed files with 351 additions and 110 deletions.
diff --git a/.gitignore b/.gitignore
@@ -51,110 +51,5 @@ coverage.xml
 .pytest_cache/
 cover/
 
-# Translations
-*.mo
-*.pot
-
-# Django stuff:
-*.log
-local_settings.py
-db.sqlite3
-db.sqlite3-journal
-
-# Flask stuff:
-instance/
-.webassets-cache
-
-# Scrapy stuff:
-.scrapy
-
-# Sphinx documentation
-docs/_build/
-
-# PyBuilder
-.pybuilder/
-target/
-
-# Jupyter Notebook
-.ipynb_checkpoints
-
-# IPython
-profile_default/
-ipython_config.py
-
-# pyenv
-#   For a library or package, you might want to ignore these files since the code is
-#   intended to run in multiple environments; otherwise, check them in:
-# .python-version
-
-# pipenv
-#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
-#   However, in case of collaboration, if having platform-specific dependencies or dependencies
-#   having no cross-platform support, pipenv may install dependencies that don't work, or not
-#   install all needed dependencies.
-#Pipfile.lock
-
-# poetry
-#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
-#   This is especially recommended for binary packages to ensure reproducibility, and is more
-#   commonly ignored for libraries.
-#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
-#poetry.lock
-
-# pdm
-#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
-#pdm.lock
-#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
-#   in version control.
-#   https://pdm.fming.dev/#use-with-ide
-.pdm.toml
-
-# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
-__pypackages__/
-
-# Celery stuff
-celerybeat-schedule
-celerybeat.pid
-
-# SageMath parsed files
-*.sage.py
-
-# Environments
-.env
-.venv
-env/
-venv/
-ENV/
-env.bak/
-venv.bak/
-
-# Spyder project settings
-.spyderproject
-.spyproject
-
-# Rope project settings
-.ropeproject
-
-# mkdocs documentation
-/site
-
-# mypy
-.mypy_cache/
-.dmypy.json
-dmypy.json
-
-# Pyre type checker
-.pyre/
-
-# pytype static type analyzer
-.pytype/
-
-# Cython debug symbols
-cython_debug/
-
-# PyCharm
-#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
-#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
-#  and can be added to the global gitignore or merged into this file.  For a more nuclear
-#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
-#.idea/
+# Development
+.idea/
diff --git a/LICENSE b/LICENSE
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2023 Thomas Civeit
+Copyright (c) 2023 Thomas Civeit <thomas@civeit.com>
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal

diff --git a/README.md b/README.md
@@ -1,2 +1,101 @@
-# gsbfs
-Gram-Schmidt Boruta Feature Selection
+# Gram-Schmidt Boruta Feature Selection
+
+The `gsbfs` package provides 2 simple functions:
+
+- `gso_rank(X_input, y_output)` that ranks candidate features.
+- `gso_boruta_select(X_input, y_output)` that selects features more relevant than random probes.
+
+Where `X_input` is the matrix of input features and `y_ouput` is the vector of output targets.
+The use of the Gram-Schmidt Orthogonalization (GSO) procedure for ranking the variables of a model
+that is linear with respect to its parameters was first described by
+[Chen et al. (1989)](https://www.tandfonline.com/doi/abs/10.1080/00207178908953472).
+The features are automatically selected using the Boruta algorithm described by
+[Kursa & Rudnicki (2010)](https://www.jstatsoft.org/article/view/v036i11).
+
+[![PyPI - Version](https://img.shields.io/pypi/v/gsbfs.svg)](https://pypi.org/project/gsbfs)
+[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/gsbfs.svg)](https://pypi.org/project/gsbfs)
+
+## Installation
+
+The easiest way to get `gsbfs` is to use pip:
+```console
+$ pip install gsbfs
+```
+That will install the `gsbfs` package along with all the required dependencies.
+
+## GSO Feature Ranking
+
+We consider a model with P candidate features. The input matrix X has P columns (i.e. Xp input vectors)
+corresponding to the P features, and N rows corresponding to the N observations. 
+The vector y is an N-vector that contains the values of the output for each observation.
+All Xp vectors and the y vector must be centered before proceeding with the GSO algorithm.
+
+The first iteration of the procedure consists in finding the feature vector that "best explains" the output target,
+i.e. which has the smallest angle with the output vector in the N-dimensional space of observations.
+To this end, we calculate cos(Xp, y)**2, the square of cosine of the angles between the feature vector and
+the target vector. The feature vector for which this quantity is largest is selected.
+
+In order to discard what is already explained by the first selected vector, all P-1 remaining input vectors,
+and the output vector, are projected onto the subspace (of dimension N-1) of the selected feature.
+Then, in that subspace, the projected input vector that best explains the projected  output is selected,
+and the P-2 remaining feature vectors are projected onto the subspace  of the first two ranked vectors.
+The procedure is repeated until all P input vectors are ranked.
+
+## Boruta Feature Selection
+
+The Boruta algorithm selects features that are more relevant than random probes. The latter are created by
+randomly shuffling each feature of the *existing* observations. Each Xp input vector generates a new "shadow"
+input vector (i.e. random probe) that is statistically irrelevant since the values of the feature for each
+observation have been randomly permuted. The resulting input matrix X has 2*P columns (and still N rows),
+combining the original vectors and the shadow vectors.
+
+All the candidate features are ranked following the GSO procedure described above.
+The threshold to discard features is then defined as the highest rank recorded among the shadow features.
+When the rank of an original feature is higher than this threshold (i.e. better than a random probe),
+it is called a "hit".
+
+However, let us consider 2 arbitrary vectors v1 and v2, and a random vector vr.
+It is important to note that there is still a probability of 50% that cos(v1, vr)**2 > cos(v1, v2)**2,
+so a "lucky" random probe could randomly obtain a better rank than an original feature.
+Therefore, the selection method consists in repeating the experiment (random shadow vectors + ranking) n times,
+and counting the number of "hits" for each feature. Since each independent experiment can give a
+binary outcome (hit or no hit), a series of n trials follows a binomial distribution.
+
+The statistical criteria for feature selection is then the maximum value of the probability mass function
+that will be considered as predictive (right tail), or as non-predictive (left tail) since the binomial
+distribution PMF is symmetrical. The fraction of the PMF that is neither predictive nor non-predictive,
+is considered as indecisive. For instance, considering 20 trials and a maximum probability of 0.5%,
+features having 16-20 hits will be selected, and features having 0-4 hits will be rejected.
+
+## Usage Example
+
+Let us create a data set consisting of 50 features, including only 10 informative features,
+and 5000 observations. The features will be ranked by running:
+
+```python
+from gsbfs.gsbfs import gso_rank, gso_boruta_select
+from sklearn.datasets import make_classification
+X, y = make_classification(n_samples=5000, n_features=50, n_informative=10, shuffle=False)
+ranked_indexes, cos_sq_max = gso_rank(X, y)
+```
+
+which will return an array containing the ranked features indexes, see `gso_rank()` function documentation.
+
+Or the features will be selected by running:
+
+```python
+rejected_indexes, selected_indexes, indecisive_indexes = gso_boruta_select(X, y)
+```
+which will return an array containing the selected features indexes, see `gso_boruta_select()` function
+documentation. Since the process can take several minutes or hours to complete when X is very large,
+the function provides a `verbose` option to report its completion progress.
+
+Other usage examples are provided in [example.py](example.py).
+
+Please note that the selection process relies on random probes, which means that running the procedure multiple
+times may yield different results. Moreover, when the number of observations is significantly  greater than the
+number of features (N >> P), there is a higher likelihood of selecting the informative features.
+
+## License
+
+`gsbfs` is distributed under the terms of the [MIT](https://spdx.org/licenses/MIT.html) license.
diff --git a/example.py b/example.py
@@ -0,0 +1,73 @@
+# SPDX-FileCopyrightText: 2023-present Thomas Civeit <thomas@civeit.com>
+#
+# SPDX-License-Identifier: MIT
+"""Usage examples of the gsbfs package."""
+
+from gsbfs.gsbfs import gso_rank, gso_boruta_select, get_expected_hits
+import numpy as np
+from sklearn.datasets import make_classification
+
+
+def example_rank():
+    """Usage example of the gso_rank() function."""
+    # create instances
+    n_features = 50
+    n_informative = 10
+    X, y = make_classification(
+        n_samples=5000,
+        n_features=n_features,
+        n_informative=n_informative,
+        n_redundant=0,
+        n_repeated=0,
+        n_classes=3,
+        shuffle=False,  # preserve ordering. first columns = informative features
+    )
+    # shuffle instances
+    p = np.random.permutation(y.size)
+    X, y = X[p, :], y[p]
+    # rank features
+    ranked_indexes, cos_sq_max = gso_rank(X, y)
+    print(f"Ranked Features (total={n_features}, informative=[0,{n_informative-1}]):")
+    print(ranked_indexes)
+
+
+def example_hits():
+    """Usage example of the get_expected_hits() function."""
+    n_trials = 20
+    proba = 0.5
+    pmf_max = 0.005
+    rejected_hits, selected_hits = get_expected_hits(n_trials, proba, pmf_max)
+    print(f"Hits to be selected (n_trials={n_trials}, proba={proba}, pmf_max={pmf_max}):")
+    print(selected_hits)
+
+
+def example_select():
+    """Usage example of the gso_boruta_select() function."""
+    # create instances
+    n_features = 50
+    n_informative = 10
+    X, y = make_classification(
+        n_samples=5000,
+        n_features=n_features,
+        n_informative=n_informative,
+        n_redundant=0,
+        n_repeated=0,
+        n_classes=3,
+        shuffle=False,  # preserve ordering. first columns = informative features
+    )
+    # shuffle instances
+    p = np.random.permutation(y.size)
+    X, y = X[p, :], y[p]
+    # select features
+    rejected_indexes, selected_indexes, indecisive_indexes = gso_boruta_select(X, y)
+    print(f"Selected Features (total={n_features}, informative=[0,{n_informative-1}]):")
+    print(selected_indexes)
+
+
+if __name__ == '__main__':
+    print("--- example_rank() ---")
+    example_rank()
+    print("--- example_hits() ---")
+    example_hits()
+    print("--- example_select() ---")
+    example_select()
diff --git a/gsbfs/__about__.py b/gsbfs/__about__.py
@@ -0,0 +1,4 @@
+# SPDX-FileCopyrightText: 2023-present Thomas Civeit <thomas@civeit.com>
+#
+# SPDX-License-Identifier: MIT
+__version__ = '0.0.1'
diff --git a/gsbfs/__init__.py b/gsbfs/__init__.py
@@ -0,0 +1,3 @@
+# SPDX-FileCopyrightText: 2023-present Thomas Civeit <thomas@civeit.com>
+#
+# SPDX-License-Identifier: MIT