Merge pull request #218 from credo-ai/release/1.0.1
Release/1.0.1
IanAtCredo authored Oct 27, 2022
2 parents f73d4bf + 27e077c commit 65528aa
Showing 48 changed files with 2,132 additions and 262 deletions.
2 changes: 1 addition & 1 deletion credoai/__init__.py
@@ -2,4 +2,4 @@
Primary interface for Credo AI Lens package
"""

__version__ = "1.0.0"
__version__ = "1.0.1"
5 changes: 3 additions & 2 deletions credoai/artifacts/data/tabular_data.py
@@ -25,8 +25,9 @@ class TabularData(Data):
Outcome
sensitive_features : pd.Series, pd.DataFrame, optional
Sensitive Features, which will be used for disaggregating performance
metrics. This can be the columns you want to perform segmentation analysis on, or
a feature related to fairness like 'race' or 'gender'
metrics. This can be the feature you want to perform segmentation analysis on, or
a feature related to fairness like 'race' or 'gender'. Sensitive Features *must*
be categorical features.
sensitive_intersections : bool, list
Whether to add intersections of sensitive features. If True, add all possible
intersections. If list, only create intersections from specified sensitive features.
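The updated docstring requires sensitive features to be categorical. Below is a minimal sketch of preparing such a feature before constructing a TabularData artifact; the example values and the name= keyword are illustrative assumptions, not taken from this diff.

import pandas as pd

from credoai.artifacts import TabularData

# Illustrative data; sensitive features must be categorical, so cast explicitly
X = pd.DataFrame({"income": [40_000, 55_000, 38_000], "age": [29, 41, 36]})
y = pd.Series([0, 1, 0], name="approved")
gender = pd.Series(["male", "female", "female"], name="gender").astype("category")

data = TabularData(name="loan_data", X=X, y=y, sensitive_features=gender)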
5 changes: 4 additions & 1 deletion credoai/evaluators/__init__.py
@@ -4,11 +4,14 @@

from .evaluator import Evaluator
from .data_fairness import DataFairness
from .data_profiling import DataProfiling
from .data_profiler import DataProfiler
from .privacy import Privacy
from .security import Security
from .equity import DataEquity, ModelEquity
from .performance import Performance
from .fairness import ModelFairness
from .ranking_fairness import RankingFairness
from .survival_fairness import SurvivalFairness
from .shap import ShapExplainer
from .model_profiler import ModelProfiler
from .feature_drift import FeatureDrift
1 change: 0 additions & 1 deletion credoai/evaluators/data_fairness.py
@@ -67,7 +67,6 @@ def __init__(
self.categorical_threshold = categorical_threshold
super().__init__()

name = "DataFairness"
required_artifacts = {"data", "sensitive_feature"}

def _setup(self):
credoai/evaluators/data_profiler.py (DataProfiling renamed to DataProfiler)
@@ -15,7 +15,7 @@
matplotlib.use(backend)


class DataProfiling(Evaluator):
class DataProfiler(Evaluator):
"""Data profiling module for Credo AI.
This evaluator runs the pandas profiler on a data. Pandas profiler calculates a number
@@ -31,7 +31,6 @@ class DataProfiling(Evaluator):
Passed to pandas_profiling.ProfileReport
"""

name = "DataProfiler"
required_artifacts = {"data"}

def __init__(self, dataset_name=None, **profile_kwargs):
@@ -40,14 +39,11 @@ def __init__(self, dataset_name=None, **profile_kwargs):
super().__init__()

def _setup(self):
self.data_to_eval = self.data

self.data = pd.concat([self.data_to_eval.X, self.data_to_eval.y], axis=1)
self.data_to_profile = pd.concat([self.data.X, self.data.y], axis=1)
return self

def _validate_arguments(self):
check_data_instance(self.data, TabularData)

return self

def get_html_report(self):
@@ -67,4 +63,4 @@ def evaluate(self):
def _create_reporter(self):
default_kwargs = {"title": "Dataset", "minimal": True}
default_kwargs.update(self.profile_kwargs)
return ProfileReport(self.data, **default_kwargs)
return ProfileReport(self.data_to_profile, **default_kwargs)
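For context, the renamed DataProfiler now profiles the concatenation of X and y. A rough standalone sketch of that flow using pandas_profiling directly; the example frames are made up, and the HTML step only approximates what get_html_report() returns.

import pandas as pd
from pandas_profiling import ProfileReport

# Hypothetical frames standing in for TabularData.X and TabularData.y
X = pd.DataFrame({"age": [29, 41, 36], "income": [40_000, 55_000, 38_000]})
y = pd.Series([0, 1, 0], name="approved")

# _setup now concatenates features and target into a single frame...
data_to_profile = pd.concat([X, y], axis=1)

# ...and _create_reporter builds the report with minimal defaults
report = ProfileReport(data_to_profile, title="Dataset", minimal=True)
html = report.to_html()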
2 changes: 0 additions & 2 deletions credoai/evaluators/equity.py
@@ -41,7 +41,6 @@ class DataEquity(Evaluator):
The significance value to evaluate statistical tests
"""

name = "DataEquity"
required_artifacts = {"data", "sensitive_feature"}

def __init__(self, p_value=0.01):
@@ -324,7 +323,6 @@ def __init__(self, use_predict_proba=False, p_value=0.01):
self.use_predict_proba = use_predict_proba
super().__init__(p_value)

name = "ModelEquity"
required_artifacts = {"model", "assessment_data", "sensitive_feature"}

def _setup(self):
24 changes: 16 additions & 8 deletions credoai/evaluators/evaluator.py
@@ -17,6 +17,11 @@ def __init__(self):
self._results = None
self.artifact_keys = []
self.logger = global_logger
self.metadata = {}

@property
def name(self):
return self.__class__.__name__

@property
def results(self):
@@ -36,12 +41,6 @@ def results(self, results):
raise ValidationError("All results must be EvidenceContainers")
self._results = results

@property
@abstractmethod
def name(self):
"""Used to define a unique identifier for the specific evaluator"""
pass

@property
@abstractmethod
def required_artifacts(self):
@@ -106,11 +105,20 @@ def get_container_info(self, labels: dict = None, metadata: dict = None):
return info

def _base_container_info(self):
return {"labels": {"evaluator": self.name}, "metadata": self._get_artifacts()}
meta = {**self.metadata, **self._get_artifacts()}
labels = {"evaluator": self.name}
if "dataset_type" in meta:
labels["dataset_type"] = meta["dataset_type"]
return {"labels": labels, "metadata": meta}

def _get_artifacts(self):
artifacts = {}
save_keys = {"model": "model_name"}
save_keys = {
"model": "model_name",
"data": "data_name",
"assessment_data": "assessment_data_name",
"training_data": "training_data_name",
}
for k in self.artifact_keys:
save_key = save_keys.get(k, k)
try:
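A stripped-down illustration of the naming change above: name is now derived from the class itself, which is why the name = "..." class attributes disappear from the evaluators in this commit.

class Evaluator:
    @property
    def name(self):
        # Identifier now comes from the class name rather than a
        # hand-maintained attribute on every subclass.
        return self.__class__.__name__


class ModelFairness(Evaluator):
    pass


assert ModelFairness().name == "ModelFairness"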
16 changes: 8 additions & 8 deletions credoai/evaluators/fairness.py
@@ -62,7 +62,6 @@ def __init__(
self.fairness_prob_metrics = None
super().__init__()

name = "ModelFairness"
required_artifacts = {"model", "data", "sensitive_feature"}

def _setup(self):
@@ -105,11 +104,9 @@ def evaluate(self):

if disaggregated_thresh_results is not None:
for key, df in disaggregated_thresh_results.items():
df.name = key
labels = {**sens_feat_label, **{"metric_type": key}}
self._results.append(
TableContainer(
df, **self.get_container_info(labels=sens_feat_label)
)
TableContainer(df, **self.get_container_info(labels=labels))
)

return self
@@ -198,12 +195,15 @@ def get_disaggregated_threshold_performance(self):
var_name="type",
)

to_return = defaultdict(pd.DataFrame)
to_return = defaultdict(list)
for i, row in df.iterrows():
label = f'{row["type"]}_disaggregated_performance'
tmp_df = row["value"]
tmp_df = tmp_df.assign(**row.drop("value"))
to_return[label] = pd.concat([to_return[label], tmp_df])
to_return[row["type"]].append(tmp_df)
for key in to_return.keys():
df = pd.concat(to_return[key])
df.name = "threshold_dependent_disaggregated_performance"
to_return[key] = df
return to_return

def get_fairness_results(self):
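The reworked get_disaggregated_threshold_performance collects frames per metric type and concatenates each group once. A small sketch of that grouping pattern with made-up rows; the column names and "roc_curve" label are illustrative only.

from collections import defaultdict

import pandas as pd

# Made-up rows mimicking the melted results: each carries a "type" label
# and a "value" DataFrame that should be regrouped per type.
rows = [
    {"type": "roc_curve", "value": pd.DataFrame({"fpr": [0.0, 0.5], "tpr": [0.0, 0.8]})},
    {"type": "roc_curve", "value": pd.DataFrame({"fpr": [0.0, 0.4], "tpr": [0.0, 0.7]})},
]

to_return = defaultdict(list)
for row in rows:
    to_return[row["type"]].append(row["value"].assign(type=row["type"]))

for key in list(to_return):
    df = pd.concat(to_return[key])
    # name attribute mirrors what the evaluators set before wrapping in TableContainer
    df.name = "threshold_dependent_disaggregated_performance"
    to_return[key] = df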
154 changes: 154 additions & 0 deletions credoai/evaluators/feature_drift.py
@@ -0,0 +1,154 @@
"""Feature Drift evaluator"""
from credoai.artifacts import ClassificationModel
from credoai.evaluators import Evaluator
from credoai.evaluators.utils.validation import check_requirements_existence
from credoai.evidence import MetricContainer
from credoai.evidence.containers import TableContainer
from credoai.modules.credoai_metrics import population_stability_index
from pandas import DataFrame, Series


class FeatureDrift(Evaluator):
"""
Measure Feature Drift using population stability index.
This evaluator measures feature drift in:
1. Model prediction: the prediction for the assessment dataset is compared
to the prediction for the training dataset.
In the case of classifiers, the prediction is performed with predict proba if available.
If it is not available, the prediction is treated like a categorical variable, see the
processing of categorical variables in the item below.
2. Dataset features: 1 to 1 comparison across all features for the datasets. This is also
referred to as "characteristic stability index" (CSI).
- Numerical features are directly fed into the population_stability_index metric, and
binned according to the parameters specified at init time.
- Categorical features percentage distribution is manually calculated. The % amount of
samples per each class is calculated and then fed into the population_stability_index metric.
The percentage flag in the metric is set to True, to bypass the internal binning process.
Parameters
----------
buckets : int, optional
Number of buckets to consider to bin the predictions, by default 10
buckettype : Literal["bins", "quantiles"]
Type of strategy for creating buckets, bins splits into even splits,
quantiles splits into quantiles buckets, by default "bins"
csi_calculation : bool, optional
Calculate characteristic stability index, i.e., PSI for all features in the datasets,
by default False
"""

def __init__(self, buckets: int = 10, buckettype="bins", csi_calculation=False):

self.bucket_number = buckets
self.buckettype = buckettype
self.csi_calculation = csi_calculation
self.percentage = False
super().__init__()

required_artifacts = {"model", "assessment_data", "training_data"}

def _validate_arguments(self):
check_requirements_existence(self)

def _setup(self):
# Default prediction to predict method
prediction_method = self.model.predict
if isinstance(self.model, ClassificationModel):
if hasattr(self.model, "predict_proba"):
prediction_method = self.model.predict_proba
else:
self.percentage = True

self.expected_prediction = prediction_method(self.training_data.X)
self.actual_prediction = prediction_method(self.assessment_data.X)

# Create the bins manually for categorical prediction if predict_proba
# is not available.
if self.percentage:
(
self.expected_prediction,
self.actual_prediction,
) = self._create_bin_percentage(
self.expected_prediction, self.actual_prediction
)

def evaluate(self):
prediction_psi = self._calculate_psi_on_prediction()
self.results = [MetricContainer(prediction_psi, **self.get_container_info())]
if self.csi_calculation:
csi = self._calculate_csi()
self.results.append(TableContainer(csi, **self.get_container_info()))
return self

def _calculate_psi_on_prediction(self) -> DataFrame:
"""
Calculate the psi index on the model prediction.
Returns
-------
DataFrame
Formatted for metric container.
"""
psi = population_stability_index(
self.expected_prediction,
self.actual_prediction,
percentage=self.percentage,
buckets=self.bucket_number,
buckettype=self.buckettype,
)
res = DataFrame({"value": psi, "type": "population_stability_index"}, index=[0])
return res

def _calculate_csi(self) -> DataFrame:
"""
Calculate psi for all the columns in the dataframes.
Returns
-------
DataFrame
Formatted for the table container.
"""
columns_names = list(self.assessment_data.X.columns)
psis = {}
for col_name in columns_names:
train_data = self.training_data.X[col_name]
assess_data = self.assessment_data.X[col_name]
if self.assessment_data.X[col_name].dtype == "category":
train, assess = self._create_bin_percentage(train_data, assess_data)
psis[col_name] = population_stability_index(train, assess, True)
else:
psis[col_name] = population_stability_index(train_data, assess_data)
psis = DataFrame.from_dict(psis, orient="index")
psis = psis.reset_index()
psis.columns = ["feature_names", "value"]
psis.name = "Characteristic Stability Index"
return psis

@staticmethod
def _create_bin_percentage(train: Series, assess: Series) -> tuple:
"""
In case of categorical values proceed to count the instances
of each class and divide by the total amount of samples to get
the ratios.
Parameters
----------
train : Series
Array of values, dtype == category
assess : Series
Array of values, dtype == category
Returns
-------
tuple
Class percentages for both arrays
"""
len_training = len(train)
len_assessment = len(assess)
train_bin_perc = train.value_counts() / len_training
assess_bin_perc = assess.value_counts() / len_assessment
return train_bin_perc, assess_bin_perc
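For reference, PSI compares two percentage distributions as sum((actual - expected) * ln(actual / expected)). Below is a simplified stand-in for the population_stability_index metric imported above, combined with the categorical binning performed by _create_bin_percentage; the epsilon clipping is an assumption of this sketch, not necessarily how the real metric guards against empty buckets.

import numpy as np
import pandas as pd


def psi_from_percentages(expected_perc: pd.Series, actual_perc: pd.Series, eps: float = 1e-4) -> float:
    """Simplified PSI over pre-computed bucket percentages (the percentage=True path)."""
    # Align buckets so categories missing from one sample count as zero
    expected_perc, actual_perc = expected_perc.align(actual_perc, fill_value=0)
    expected = np.clip(expected_perc.to_numpy(dtype=float), eps, None)
    actual = np.clip(actual_perc.to_numpy(dtype=float), eps, None)
    return float(np.sum((actual - expected) * np.log(actual / expected)))


# Categorical handling mirrors _create_bin_percentage: class counts divided
# by the number of samples give the bucket percentages.
train = pd.Series(["a", "a", "b", "c"], dtype="category")
assess = pd.Series(["a", "b", "b", "b"], dtype="category")

train_perc = train.value_counts() / len(train)
assess_perc = assess.value_counts() / len(assess)

print(psi_from_percentages(train_perc, assess_perc))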