Merge remote-tracking branch 'upstream/main'
dchourasia committed May 14, 2024
2 parents d0bd35b + 40fd75c commit 861a5d4
Showing 7 changed files with 149 additions and 1 deletion.
3 changes: 2 additions & 1 deletion build/Dockerfile
@@ -123,7 +123,8 @@ RUN --mount=type=cache,target=/root/.cache/pip \
python -m pip install --upgrade pip && \
python -m pip install wheel && \
python -m pip install "$(head bdist_name)" && \
python -m pip install "$(head bdist_name)[aim]" && \
# Due to FIPS tolerance issues, removing aim at this time
#python -m pip install "$(head bdist_name)[aim]" && \
python -m pip install "$(head bdist_name)[flash-attn]" && \
# Clean up the wheel module. It's only needed by flash-attn install
python -m pip uninstall wheel -y && \
4 changes: 4 additions & 0 deletions tests/data/trainercontroller/__init__.py
@@ -25,6 +25,10 @@
TRAINER_CONFIG_TEST_LOSS_ON_THRESHOLD_WITH_TRAINER_STATE_YAML = os.path.join(
_DATA_DIR, "loss_on_threshold_with_trainer_state.yaml"
)
TRAINER_CONFIG_EXPOSED_METRICS_YAML = os.path.join(_DATA_DIR, "exposed_metrics.yaml")
TRAINER_CONFIG_INCORRECT_SOURCE_EVENT_EXPOSED_METRICS_YAML = os.path.join(
_DATA_DIR, "incorrect_source_event_exposed_metrics.yaml"
)
TRAINER_CONFIG_TEST_MALICIOUS_OS_RULE_YAML = os.path.join(
_DATA_DIR, "loss_with_malicious_os_rule.yaml"
)
12 changes: 12 additions & 0 deletions tests/data/trainercontroller/exposed_metrics.yaml
@@ -0,0 +1,12 @@
controller-metrics:
- name: evalmetric
class: EvalMetrics
arguments:
source-event: on_evaluate
controllers:
- name: loss-controller
triggers:
- on_evaluate
rule: evalmetric['eval_loss'] < 2.5
operations:
- hfcontrols.should_training_stop
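For context on what this new config exercises: the handler's name (evalmetric) becomes the variable that the rule string references once the on_evaluate event fires. A minimal standalone sketch of that name-to-metrics binding, assuming the controller evaluates the rule as a restricted Python expression (the repo may well use a dedicated rule engine instead):

# Illustrative sketch only: expose the handler's computed metrics under
# its configured name, then evaluate the rule string against them.
def evaluate_rule(rule: str, exposed: dict) -> bool:
    return bool(eval(rule, {"__builtins__": {}}, {"evalmetric": exposed}))

assert evaluate_rule("evalmetric['eval_loss'] < 2.5", {"eval_loss": 2.2})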
12 changes: 12 additions & 0 deletions tests/data/trainercontroller/incorrect_source_event_exposed_metrics.yaml
@@ -0,0 +1,12 @@
controller-metrics:
- name: evalmetric
class: EvalMetrics
arguments:
source-event: on_incorrect_event
controllers:
- name: loss-controller
triggers:
- on_evaluate
rule: evalmetric['eval_loss'] < 2.5
operations:
- hfcontrols.should_training_stop
42 changes: 42 additions & 0 deletions tests/trainercontroller/test_tuning_trainercontroller.py
@@ -102,9 +102,51 @@ def test_loss_on_threshold_with_trainer_state():
tc_callback.on_init_end(args=test_data.args, state=test_data.state, control=control)
# Trigger rule and test the condition
tc_callback.on_log(args=test_data.args, state=test_data.state, control=control)


def test_exposed_metrics():
"""Tests the expose metric scenario example in
`examples/trainer-controller-configs/exposed_metrics.yaml`
"""
test_data = _setup_data()
tc_callback = tc.TrainerControllerCallback(td.TRAINER_CONFIG_EXPOSED_METRICS_YAML)
control = TrainerControl(should_training_stop=False)
metrics = {"eval_loss": 2.2}
# Trigger on_init_end to perform registration of handlers to events
tc_callback.on_init_end(args=test_data.args, state=test_data.state, control=control)
# Trigger rule and test the condition
tc_callback.on_evaluate(
args=test_data.args, state=test_data.state, control=control, metrics=metrics
)
    assert control.should_training_stop


def test_incorrect_source_event_exposed_metrics():
"""Tests the expose metric scenario example in
`examples/trainer-controller-configs/incorrect_source_event_exposed_metrics.yaml`
"""
with pytest.raises(ValueError) as exception_handler:
test_data = _setup_data()
tc_callback = tc.TrainerControllerCallback(
td.TRAINER_CONFIG_INCORRECT_SOURCE_EVENT_EXPOSED_METRICS_YAML
)
control = TrainerControl(should_training_stop=False)
metrics = {"eval_loss": 2.2}
# Trigger on_init_end to perform registration of handlers to events
tc_callback.on_init_end(
args=test_data.args, state=test_data.state, control=control
)
# Trigger rule and test the condition
tc_callback.on_evaluate(
args=test_data.args, state=test_data.state, control=control, metrics=metrics
)
assert (
str(exception_handler.value).strip("'")
== "Specified source event [on_incorrect_event] is invalid for EvalMetrics"
)
    assert control.should_training_stop


def test_custom_metric_handler():
"""Tests the custom metric registration
`examples/trainer-controller-configs/loss_custom_metric.yaml`
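For readers unfamiliar with the flag these new tests assert on: TrainerControl is the standard Hugging Face transformers object that a callback mutates to steer the Trainer, and setting should_training_stop is what actually halts training. A minimal illustration:

from transformers import TrainerControl

control = TrainerControl(should_training_stop=False)
# What the controller does when a rule such as
# evalmetric['eval_loss'] < 2.5 evaluates to True:
control.should_training_stop = True
assert control.should_training_stop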
2 changes: 2 additions & 0 deletions tuning/trainercontroller/controllermetrics/__init__.py
@@ -19,6 +19,7 @@
from typing import Type

# Local
from .eval_metrics import EvalMetrics
from .loss import Loss
from .trainingstate import TrainingState

@@ -37,4 +38,5 @@ def register(cl: Type):

# Register the default metric handlers in this package here
register(TrainingState)
register(EvalMetrics)
register(Loss)
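The register() helper shown above is also the extension point for project-specific metrics. A hedged sketch of a custom handler mirroring the EvalMetrics shape added in this commit; CustomMetric, its event choice, and its payload are illustrative assumptions, and MetricHandler's full constructor contract is not shown in this diff:

# Local
from tuning.trainercontroller.controllermetrics import register
from tuning.trainercontroller.controllermetrics.metricshandler import MetricHandler


class CustomMetric(MetricHandler):
    """Hypothetical handler exposing a constant value on every log event."""

    def __init__(self, **kwargs):
        super().__init__(events=["on_log"], **kwargs)

    def validate(self) -> bool:
        return True

    def compute(self, **kwargs):
        return {"custom_value": 1.0}


register(CustomMetric)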
75 changes: 75 additions & 0 deletions tuning/trainercontroller/controllermetrics/eval_metrics.py
@@ -0,0 +1,75 @@
# Copyright The IBM Tuning Team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# SPDX-License-Identifier: Apache-2.0
# https://spdx.dev/learn/handling-license-info/

# Standard
from typing import Any

# Third Party
from transformers.utils import logging

# Local
from tuning.trainercontroller.controllermetrics.metricshandler import MetricHandler

logger = logging.get_logger(__name__)


class EvalMetrics(MetricHandler):
"""Implements the controller metric which exposes the evaluation metrics"""

def __init__(self, **kwargs):
"""Initializes the metric handler, by registering the event \
list and arguments with base handler.
Args:
kwargs: List of arguments (key, value)-pairs
"""
        source_events_to_check = {"on_evaluate", "on_predict"}
        source_event = kwargs.get("source-event")
        if source_event is None:
            source_event = "on_evaluate"
        if source_event in source_events_to_check:
            # Register the validated (or defaulted) source event with the base handler
            super().__init__(
                events=[
                    source_event,
                ],
                **kwargs,
            )
        else:
            raise ValueError(
                "Specified source event [%s] is invalid for EvalMetrics"
                % (source_event)
            )

def validate(self) -> bool:
"""Validate the training arguments (e.g logging_steps) are \
compatible with the computation of this metric.
Returns:
bool
"""
return True

def compute(self, **kwargs) -> Any:
"""Exposes the trainer state.
Args:
kwargs: Remaining event arguments
Returns:
dict. Trainer state as a dictionary
"""
return kwargs["metrics"]
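To make the source-event handling above easy to verify at a glance, here is a standalone restatement of just that branch logic (a sketch, not an import of the class): omitting source-event defaults to on_evaluate, on_predict is the only other accepted value, and anything else raises:

def resolve_source_event(source_event):
    # Mirrors EvalMetrics.__init__: apply the default, then validate.
    allowed = {"on_evaluate", "on_predict"}
    source_event = source_event or "on_evaluate"
    if source_event not in allowed:
        raise ValueError(
            "Specified source event [%s] is invalid for EvalMetrics" % source_event
        )
    return source_event

assert resolve_source_event(None) == "on_evaluate"
assert resolve_source_event("on_predict") == "on_predict"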
