From 4a552dc13ef5c433b3e77416592a4b67e4398684 Mon Sep 17 00:00:00 2001 From: Massimo Gengarelli Date: Wed, 3 Jul 2024 17:59:43 +0200 Subject: [PATCH] feat(monitoring): instrument code using Prometheus --- README.md | 34 +++++++ cmd/chaosmonkey/main.go | 20 +++++ go.mod | 8 +- go.sum | 16 +++- internal/watcher/crd.go | 158 +++++++++++++++++++++++++++++++++ internal/watcher/deployment.go | 57 +++++++++++- internal/watcher/namespace.go | 109 ++++++++++++++++++++++- internal/watcher/pod.go | 61 ++++++++++++- main.tf | 42 +++++++-- tests/kubetest.sh | 77 +++++++++++++++- 10 files changed, 569 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 96a81c5..9a3d372 100644 --- a/README.md +++ b/README.md @@ -183,6 +183,40 @@ The value is not case-sensitive. Invalid or empty values will make ChaosMonkey default to the `info` level. +## Observability +The Chaos Monkey exposes some metrics using the [Prometheus](https://prometheus.io/) library and format. + +This is an _evolving_ list of the metrics currently exposed; for more details, please take a look +at the code of the corresponding service (all the services in the [watcher folder](./internal/watcher/) expose +some sort of metrics). + +All the metrics use the prefix `chaos_monkey` which, for readability reasons, is not repeated in the +table below. 
+ +| Name | Description | Type | +|----------------------------------------|-----------------------------------------|-----------| +| nswatcher_events | events handled by the nswatcher | Counter | +| nswatcher_event_duration | duration of each event in microseconds | Histogram | +| nswatcher_cmc_spawned | crd services spawned | Counter | +| nswatcher_cmc_active | currently active crd | Gauge | +| nswatcher_restarts | timeouts received from the K8S APIs | Counter | +| crdwatcher_events | events handled by the crd watcher | Counter | +| crdwatcher_pw_spawned | PodWatchers spawned | Counter | +| crdwatcher_pw_active | PodWatchers currently active | Gauge | +| crdwatcher_dw_spawned | DeploymentWatchers spawned | Counter | +| crdwatcher_dw_active | DeploymentWatchers active | Gauge | +| crdwatcher_restarts | timeouts received from the K8S APIs | Counter | +| podwatcher_pods_added | Pods added to the list | Counter | +| podwatcher_pods_removed | Pods removed from the list | Counter | +| podwatcher_pods_killed | Pods killed | Counter | +| podwatcher_pods_active | Pods currently being targeted | Gauge | +| podwatcher_restarts | timeouts received from the K8S APIs | Counter | +| deploymentwatcher_deployments_rescaled | deployments rescaled | Counter | +| deploymentwatcher_random_distribution | distribution of the number of replicas used | Histogram | +| deploymentwatcher_last_scale | last value used to scale the deployment | Gauge | + + + ## Development All contributions are welcome, of course. Feel free to open an issue or submit a pull request. 
If you want to develop and test locally, you need to install: diff --git a/cmd/chaosmonkey/main.go b/cmd/chaosmonkey/main.go index 3d91700..d490a20 100644 --- a/cmd/chaosmonkey/main.go +++ b/cmd/chaosmonkey/main.go @@ -2,6 +2,7 @@ package main import ( "context" + "net/http" "os" "os/signal" "sync" @@ -10,6 +11,7 @@ import ( "github.com/massix/chaos-monkey/internal/apis/clientset/versioned" "github.com/massix/chaos-monkey/internal/configuration" "github.com/massix/chaos-monkey/internal/watcher" + "github.com/prometheus/client_golang/prometheus/promhttp" "github.com/sirupsen/logrus" "k8s.io/client-go/kubernetes" "k8s.io/client-go/rest" @@ -63,9 +65,27 @@ func main() { } }() + // Spawn the HTTP Server for Prometheus in background + srv := &http.Server{ + Handler: promhttp.Handler(), + Addr: "0.0.0.0:9000", + } + + wg.Add(1) + go func() { + defer wg.Done() + if err := srv.ListenAndServe(); err != nil { + log.Warnf("Could not spawn Prometheus handler: %s", err) + } + }() + // Wait for a signal to arrive <-s + if err := srv.Shutdown(context.Background()); err != nil { + log.Warnf("Could not shutdown Prometheus handler: %s", err) + } + log.Info("Shutting down...") cancel() diff --git a/go.mod b/go.mod index 96854c7..d33b567 100644 --- a/go.mod +++ b/go.mod @@ -3,6 +3,7 @@ module github.com/massix/chaos-monkey go 1.22.3 require ( + github.com/prometheus/client_golang v1.19.1 github.com/sirupsen/logrus v1.9.3 k8s.io/api v0.30.2 k8s.io/apimachinery v0.30.2 @@ -11,6 +12,8 @@ require ( ) require ( + github.com/beorn7/perks v1.0.1 // indirect + github.com/cespare/xxhash/v2 v2.2.0 // indirect github.com/davecgh/go-spew v1.1.1 // indirect github.com/emicklei/go-restful/v3 v3.11.0 // indirect github.com/evanphx/json-patch v4.12.0+incompatible // indirect @@ -32,10 +35,13 @@ require ( github.com/modern-go/reflect2 v1.0.2 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/pkg/errors v0.9.1 // indirect + 
github.com/prometheus/client_model v0.5.0 // indirect + github.com/prometheus/common v0.48.0 // indirect + github.com/prometheus/procfs v0.12.0 // indirect github.com/spf13/pflag v1.0.5 // indirect golang.org/x/mod v0.15.0 // indirect golang.org/x/net v0.23.0 // indirect - golang.org/x/oauth2 v0.10.0 // indirect + golang.org/x/oauth2 v0.16.0 // indirect golang.org/x/sys v0.18.0 // indirect golang.org/x/term v0.18.0 // indirect golang.org/x/text v0.14.0 // indirect diff --git a/go.sum b/go.sum index 02c1eb3..4196537 100644 --- a/go.sum +++ b/go.sum @@ -1,3 +1,7 @@ +github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= +github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= +github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44= +github.com/cespare/xxhash/v2 v2.2.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= @@ -65,6 +69,14 @@ github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/prometheus/client_golang v1.19.1 h1:wZWJDwK+NameRJuPGDhlnFgx8e8HN3XHQeLaYJFJBOE= +github.com/prometheus/client_golang v1.19.1/go.mod h1:mP78NwGzrVks5S2H6ab8+ZZGJLZUq1hoULYBAYBw1Ho= +github.com/prometheus/client_model v0.5.0 h1:VQw1hfvPvk3Uv6Qf29VrPF32JB6rtbgI6cYPYQjL0Qw= +github.com/prometheus/client_model v0.5.0/go.mod h1:dTiFglRmd66nLR9Pv9f0mZi7B7fk5Pm3gvsjB5tr+kI= +github.com/prometheus/common v0.48.0 h1:QO8U2CdOzSn1BBsmXJXduaaW+dY/5QLjfB8svtSzKKE= 
+github.com/prometheus/common v0.48.0/go.mod h1:0/KsvlIEfPQCQ5I2iNSAWKPZziNCvRs5EC6ILDTlAPc= +github.com/prometheus/procfs v0.12.0 h1:jluTpSng7V9hY0O2R9DzzJHYb2xULk9VTR1V1R/k6Bo= +github.com/prometheus/procfs v0.12.0/go.mod h1:pcuDEFsWDnvcgNzo4EEweacyhjeA9Zk3cnaOZAZEfOo= github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ= github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog= github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= @@ -97,8 +109,8 @@ golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLL golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= golang.org/x/net v0.23.0 h1:7EYJ93RZ9vYSZAIb2x3lnuvqO5zneoD6IvWjuhfxjTs= golang.org/x/net v0.23.0/go.mod h1:JKghWKKOSdJwpW2GEx0Ja7fmaKnMsbu+MWVZTokSYmg= -golang.org/x/oauth2 v0.10.0 h1:zHCpF2Khkwy4mMB4bv0U37YtJdTGW8jI0glAApi0Kh8= -golang.org/x/oauth2 v0.10.0/go.mod h1:kTpgurOux7LqtuxjuyZa4Gj2gdezIt/jQtGnNFfypQI= +golang.org/x/oauth2 v0.16.0 h1:aDkGMBSYxElaoP81NpoUoz2oo2R2wHdZpGToUxfyQrQ= +golang.org/x/oauth2 v0.16.0/go.mod h1:hqZ+0LWXsiVoZpeld6jVt06P3adbS2Uu911W1SsJv2o= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= diff --git a/internal/watcher/crd.go b/internal/watcher/crd.go index 23db238..36c728b 100644 --- a/internal/watcher/crd.go +++ b/internal/watcher/crd.go @@ -12,6 +12,8 @@ import ( "github.com/massix/chaos-monkey/internal/apis/clientset/versioned/scheme" cmv1alpha1 "github.com/massix/chaos-monkey/internal/apis/clientset/versioned/typed/apis/v1alpha1" "github.com/massix/chaos-monkey/internal/apis/v1alpha1" + 
"github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" "github.com/sirupsen/logrus" apiappsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" @@ -35,6 +37,7 @@ type CrdWatcher struct { Logrus logrus.FieldLogger Client kubernetes.Interface + metrics *crdMetrics Mutex *sync.Mutex DeploymentWatchers map[string]*WatcherConfiguration ForceStopChan chan interface{} @@ -44,8 +47,114 @@ type CrdWatcher struct { Running bool } +type crdMetrics struct { + // Total number of events handled + addedEvents prometheus.Counter + modifiedEvents prometheus.Counter + deletedEvents prometheus.Counter + + // Total number of restarts + restarts prometheus.Counter + + // Metrics for PodWatchers + pwSpawned prometheus.Counter + pwActive prometheus.Gauge + + // Metrics for DeploymentWatchers + dwSpawned prometheus.Counter + dwActive prometheus.Gauge + + // How long it takes to handle an event + eventDuration prometheus.Histogram +} + +func (crd *crdMetrics) unregister() { + prometheus.Unregister(crd.addedEvents) + prometheus.Unregister(crd.modifiedEvents) + prometheus.Unregister(crd.deletedEvents) + prometheus.Unregister(crd.restarts) + prometheus.Unregister(crd.pwSpawned) + prometheus.Unregister(crd.pwActive) + prometheus.Unregister(crd.dwSpawned) + prometheus.Unregister(crd.dwActive) + prometheus.Unregister(crd.eventDuration) +} + var _ = (Watcher)((*CrdWatcher)(nil)) +func newCrdMetrics(namespace string) *crdMetrics { + return &crdMetrics{ + addedEvents: promauto.NewCounter(prometheus.CounterOpts{ + Namespace: "chaos_monkey", + Name: "events", + Subsystem: "crdwatcher", + Help: "Total number of events handled", + ConstLabels: map[string]string{"namespace": namespace, "event_type": "add"}, + }), + modifiedEvents: promauto.NewCounter(prometheus.CounterOpts{ + Namespace: "chaos_monkey", + Name: "events", + Subsystem: "crdwatcher", + Help: "Total number of events handled", + ConstLabels: map[string]string{"namespace": namespace, 
"event_type": "modify"}, + }), + deletedEvents: promauto.NewCounter(prometheus.CounterOpts{ + Namespace: "chaos_monkey", + Name: "events", + Subsystem: "crdwatcher", + Help: "Total number of events handled", + ConstLabels: map[string]string{"namespace": namespace, "event_type": "delete"}, + }), + + restarts: promauto.NewCounter(prometheus.CounterOpts{ + Namespace: "chaos_monkey", + Name: "restarts", + Subsystem: "crdwatcher", + Help: "Total number of restarts", + ConstLabels: map[string]string{"namespace": namespace}, + }), + + pwSpawned: promauto.NewCounter(prometheus.CounterOpts{ + Namespace: "chaos_monkey", + Name: "pw_spawned", + Subsystem: "crdwatcher", + Help: "Total number of PodWatchers spawned", + ConstLabels: map[string]string{"namespace": namespace}, + }), + pwActive: promauto.NewGauge(prometheus.GaugeOpts{ + Namespace: "chaos_monkey", + Name: "pw_active", + Subsystem: "crdwatcher", + Help: "Total number of PodWatchers active", + ConstLabels: map[string]string{"namespace": namespace}, + }), + + dwSpawned: promauto.NewCounter(prometheus.CounterOpts{ + Namespace: "chaos_monkey", + Name: "dw_spawned", + Subsystem: "crdwatcher", + Help: "Total number of DeploymentWatchers spawned", + ConstLabels: map[string]string{"namespace": namespace}, + }), + dwActive: promauto.NewGauge(prometheus.GaugeOpts{ + Namespace: "chaos_monkey", + Name: "dw_active", + Subsystem: "crdwatcher", + Help: "Total number of DeploymentWatchers active", + ConstLabels: map[string]string{"namespace": namespace}, + }), + + eventDuration: promauto.NewHistogram(prometheus.HistogramOpts{ + Namespace: "chaos_monkey", + Name: "event_duration", + Subsystem: "crdwatcher", + Help: "How long it took to handle an event (calculated in microseconds)", + ConstLabels: map[string]string{"namespace": namespace}, + Buckets: []float64{0, 5, 10, 20, 50, 100, 500, 1000, 1500, 2000}, + }), + } +} + func NewCrdWatcher(clientset kubernetes.Interface, cmcClientset typedcmc.Interface, recorder 
record.EventRecorderLogger, namespace string) Watcher { // Build my own recorder here if recorder == nil { @@ -69,6 +178,7 @@ func NewCrdWatcher(clientset kubernetes.Interface, cmcClientset typedcmc.Interfa CleanupTimeout: 15 * time.Minute, WatcherTimeout: 24 * time.Hour, Running: false, + metrics: newCrdMetrics(namespace), } } @@ -110,9 +220,11 @@ func (c *CrdWatcher) Start(ctx context.Context) error { c.setRunning(false) } + c.metrics.restarts.Inc() break } + startTime := time.Now().UnixMicro() cmc := evt.Object.(*v1alpha1.ChaosMonkeyConfiguration) c.Logrus.Debugf("Received %s event for %+v", evt.Type, cmc) @@ -147,6 +259,7 @@ func (c *CrdWatcher) Start(ctx context.Context) error { c.Logrus.Debug("All is good! Publishing event.") c.EventRecorderLogger.Eventf(cmc, "Normal", "Started", "Watcher started for deployment %s", dep.Name) + c.metrics.addedEvents.Inc() case watch.Modified: c.Logrus.Infof("Received MODIFIED event for %s, for deployment %s", cmc.Name, cmc.Spec.DeploymentName) @@ -157,6 +270,7 @@ func (c *CrdWatcher) Start(ctx context.Context) error { c.Logrus.Debug("All is good! Publishing event.") c.EventRecorderLogger.Eventf(cmc, "Normal", "Modified", "Watcher modified for deployment %s", cmc.Spec.DeploymentName) + c.metrics.modifiedEvents.Inc() case watch.Deleted: c.Logrus.Infof("Received DELETED event for %s, for deployment %s", cmc.Name, cmc.Spec.DeploymentName) @@ -167,8 +281,12 @@ func (c *CrdWatcher) Start(ctx context.Context) error { c.Logrus.Debug("All is good! 
Publishing event.") c.EventRecorderLogger.Eventf(cmc, "Normal", "Deleted", "Watcher deleted for deployment %s", cmc.Spec.DeploymentName) + c.metrics.deletedEvents.Inc() } + endTime := time.Now().UnixMicro() + c.metrics.eventDuration.Observe(float64(endTime - startTime)) + case <-ctx.Done(): c.Logrus.Info("Watcher context done") c.setRunning(false) @@ -198,6 +316,9 @@ func (c *CrdWatcher) Start(ctx context.Context) error { c.Mutex.Unlock() wg.Wait() + + c.Logrus.Debug("Unregistering Prometheus metrics") + c.metrics.unregister() return err } @@ -259,9 +380,11 @@ func (c *CrdWatcher) addWatcher(cmc *v1alpha1.ChaosMonkeyConfiguration, dep *api c.Logrus.Debugf("Configuring watcher with %+v", cmc.Spec) newWatcher = DefaultPodFactory(c.Client, nil, dep.Namespace, strings.Join(combinedLabelSelector, ",")) + c.metrics.pwSpawned.Inc() } else { c.Logrus.Debug("Creating new deployment watcher") newWatcher = DefaultDeploymentFactory(c.Client, nil, dep) + c.metrics.dwSpawned.Inc() } // Configure it @@ -289,6 +412,19 @@ func (c *CrdWatcher) startWatcher(ctx context.Context, forDeployment string, wg return fmt.Errorf("Watcher for deployment %s does not exist", forDeployment) } + var activeMetric prometheus.Gauge + + switch wc.Watcher.(type) { + case *PodWatcher: + activeMetric = c.metrics.pwActive + case *DeploymentWatcher: + activeMetric = c.metrics.dwActive + } + + if activeMetric != nil { + activeMetric.Inc() + } + wg.Add(1) go func() { defer wg.Done() @@ -296,6 +432,10 @@ func (c *CrdWatcher) startWatcher(ctx context.Context, forDeployment string, wg if err := wc.Watcher.Start(ctx); err != nil { c.Logrus.Errorf("Error while starting watcher: %s", err) } + + if activeMetric != nil { + activeMetric.Dec() + } }() return nil @@ -345,9 +485,11 @@ func (c *CrdWatcher) modifyWatcher(ctx context.Context, cmc *v1alpha1.ChaosMonke } newWatcher = DefaultPodFactory(c.Client, nil, dep.Namespace, allLabels...) 
+ c.metrics.pwSpawned.Inc() } else { c.Logrus.Debug("Creating new Deployment watcher") newWatcher = DefaultDeploymentFactory(c.Client, nil, dep) + c.metrics.dwSpawned.Inc() } // Configure the watcher @@ -359,6 +501,18 @@ func (c *CrdWatcher) modifyWatcher(ctx context.Context, cmc *v1alpha1.ChaosMonke // Start the watcher c.Logrus.Info("Starting the newly created watcher") + var activeMetric prometheus.Gauge + switch newWatcher.(type) { + case *PodWatcher: + activeMetric = c.metrics.pwActive + case *DeploymentWatcher: + activeMetric = c.metrics.dwActive + } + + if activeMetric != nil { + activeMetric.Inc() + } + wg.Add(1) go func() { defer wg.Done() @@ -366,6 +520,10 @@ func (c *CrdWatcher) modifyWatcher(ctx context.Context, cmc *v1alpha1.ChaosMonke if err := newWatcher.Start(ctx); err != nil { c.Logrus.Errorf("Error while starting watcher: %s", err) } + + if activeMetric != nil { + activeMetric.Dec() + } }() // Put it into the map diff --git a/internal/watcher/deployment.go b/internal/watcher/deployment.go index 1325741..9490ba3 100644 --- a/internal/watcher/deployment.go +++ b/internal/watcher/deployment.go @@ -7,6 +7,8 @@ import ( "sync" "time" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" "github.com/sirupsen/logrus" appsv1 "k8s.io/api/apps/v1" scalev1 "k8s.io/api/autoscaling/v1" @@ -26,14 +28,32 @@ type DeploymentWatcher struct { Logrus logrus.FieldLogger OriginalDeployment *appsv1.Deployment Mutex *sync.Mutex + metrics *dwMetrics + ForceStopChan chan interface{} MinReplicas int MaxReplicas int Timeout time.Duration - ForceStopChan chan interface{} Running bool Enabled bool } +type dwMetrics struct { + // Total number of deployments rescaled + deploymentsRescaled prometheus.Counter + + // Distribution of the number of replicas used + randomDistribution prometheus.Histogram + + // Last used scale + lastScale prometheus.Gauge +} + +func (dw *dwMetrics) unregister() { + 
prometheus.Unregister(dw.deploymentsRescaled) + prometheus.Unregister(dw.randomDistribution) + prometheus.Unregister(dw.lastScale) +} + func NewDeploymentWatcher(clientset kubernetes.Interface, recorder record.EventRecorderLogger, deployment *appsv1.Deployment) ConfigurableWatcher { logrus.Infof("Creating new Deployment watcher for %s/%s", deployment.Namespace, deployment.Name) @@ -52,6 +72,7 @@ func NewDeploymentWatcher(clientset kubernetes.Interface, recorder record.EventR Logrus: logrus.WithFields(logrus.Fields{"component": "DeploymentWatcher", "namespace": deployment.Namespace, "deploymentName": deployment.Name}), Mutex: &sync.Mutex{}, + metrics: newDwMetrics(deployment.Name, deployment.Namespace), MinReplicas: 0, MaxReplicas: 0, Timeout: 0, @@ -61,6 +82,34 @@ func NewDeploymentWatcher(clientset kubernetes.Interface, recorder record.EventR } } +func newDwMetrics(deploymentName, namespace string) *dwMetrics { + constLabels := map[string]string{"namespace": namespace, "deployment": deploymentName} + return &dwMetrics{ + deploymentsRescaled: promauto.NewCounter(prometheus.CounterOpts{ + Namespace: "chaos_monkey", + Subsystem: "deploymentwatcher", + Name: "deployments_rescaled", + Help: "Total number of deployments rescaled", + ConstLabels: constLabels, + }), + randomDistribution: promauto.NewHistogram(prometheus.HistogramOpts{ + Namespace: "chaos_monkey", + Subsystem: "deploymentwatcher", + Name: "random_distribution", + Help: "Distribution of the number of replicas used", + ConstLabels: constLabels, + Buckets: []float64{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, + }), + lastScale: promauto.NewGauge(prometheus.GaugeOpts{ + Namespace: "chaos_monkey", + Subsystem: "deploymentwatcher", + Name: "last_scale", + Help: "Last value used for replicas of deployment", + ConstLabels: constLabels, + }), + } +} + // IsRunning implements DeploymentWatcherI. 
func (d *DeploymentWatcher) IsRunning() bool { d.Mutex.Lock() @@ -147,6 +196,9 @@ func (d *DeploymentWatcher) Start(ctx context.Context) error { } d.Logrus.Info("Chaos Monkey stopped") + d.Logrus.Debug("Unregistering Prometheus metrics") + d.metrics.unregister() + return nil } @@ -166,6 +218,9 @@ func (d *DeploymentWatcher) scaleDeployment(newReplicas int) error { if err == nil { d.Logrus.Debugf("Successfully scaled to %d replicas, publishing event", res.Spec.Replicas) d.Eventf(d.getOriginalDeployment(), corev1.EventTypeNormal, "ChaosMonkey", "Converted to %d replicas", res.Spec.Replicas) + d.metrics.deploymentsRescaled.Inc() + d.metrics.randomDistribution.Observe(float64(newReplicas)) + d.metrics.lastScale.Set(float64(newReplicas)) } return err diff --git a/internal/watcher/namespace.go b/internal/watcher/namespace.go index 8443bdf..3e360d4 100644 --- a/internal/watcher/namespace.go +++ b/internal/watcher/namespace.go @@ -8,6 +8,8 @@ import ( "time" mc "github.com/massix/chaos-monkey/internal/apis/clientset/versioned" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" "github.com/sirupsen/logrus" corev1 "k8s.io/api/core/v1" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -21,20 +23,109 @@ import ( type NamespaceWatcher struct { typedcorev1.NamespaceInterface record.EventRecorderLogger - Logrus logrus.FieldLogger Client kubernetes.Interface CmcClient mc.Interface CrdWatchers map[string]Watcher Mutex *sync.Mutex + metrics *nwMetrics RootNamespace string CleanupTimeout time.Duration WatcherTimeout time.Duration Running bool } +// Metrics for the NamespaceWatcher component +type nwMetrics struct { + // Total number of events handled + addedEvents prometheus.Counter + modifiedEvents prometheus.Counter + deletedEvents prometheus.Counter + + // Total number of restarts handled + restarts prometheus.Counter + + // Total number of CMCs spawned + cmcSpawned prometheus.Counter + + // Total number of *active* CMCs + 
cmcActive prometheus.Gauge + + // How long it took to handle an event + eventDuration prometheus.Histogram +} + +func (nw *nwMetrics) unregister() { + prometheus.Unregister(nw.addedEvents) + prometheus.Unregister(nw.modifiedEvents) + prometheus.Unregister(nw.deletedEvents) + prometheus.Unregister(nw.restarts) + prometheus.Unregister(nw.cmcSpawned) + prometheus.Unregister(nw.cmcActive) + prometheus.Unregister(nw.eventDuration) +} + var _ = (Watcher)((*NamespaceWatcher)(nil)) +func newNwMetrics(rootNamespace string) *nwMetrics { + return &nwMetrics{ + addedEvents: promauto.NewCounter(prometheus.CounterOpts{ + Namespace: "chaos_monkey", + Name: "events", + Subsystem: "nswatcher", + Help: "Total number of events handled", + ConstLabels: map[string]string{"event_type": "add", "root_namespace": rootNamespace}, + }), + modifiedEvents: promauto.NewCounter(prometheus.CounterOpts{ + Namespace: "chaos_monkey", + Name: "events", + Subsystem: "nswatcher", + Help: "Total number of events handled", + ConstLabels: map[string]string{"event_type": "modify", "root_namespace": rootNamespace}, + }), + deletedEvents: promauto.NewCounter(prometheus.CounterOpts{ + Namespace: "chaos_monkey", + Name: "events", + Subsystem: "nswatcher", + Help: "Total number of events handled", + ConstLabels: map[string]string{"event_type": "delete", "root_namespace": rootNamespace}, + }), + + restarts: promauto.NewCounter(prometheus.CounterOpts{ + Namespace: "chaos_monkey", + Name: "restarts", + Subsystem: "nswatcher", + Help: "Total number of restarts handled", + ConstLabels: map[string]string{"root_namespace": rootNamespace}, + }), + + cmcSpawned: promauto.NewCounter(prometheus.CounterOpts{ + Namespace: "chaos_monkey", + Name: "cmc_spawned", + Subsystem: "nswatcher", + Help: "Total number of CMCs spawned", + ConstLabels: map[string]string{"root_namespace": rootNamespace}, + }), + + cmcActive: promauto.NewGauge(prometheus.GaugeOpts{ + Namespace: "chaos_monkey", + Name: "cmc_active", + Subsystem: 
"nswatcher", + Help: "Current active CMC Watchers", + ConstLabels: map[string]string{"root_namespace": rootNamespace}, + }), + + eventDuration: promauto.NewHistogram(prometheus.HistogramOpts{ + Namespace: "chaos_monkey", + Name: "event_duration", + Subsystem: "nswatcher", + Help: "How long it took to handle an event (calculated in microseconds)", + ConstLabels: map[string]string{"root_namespace": rootNamespace}, + Buckets: []float64{0, 5, 10, 20, 50, 100, 500, 1000, 1500, 2000}, + }), + } +} + func NewNamespaceWatcher(clientset kubernetes.Interface, cmcClientset mc.Interface, recorder record.EventRecorderLogger, rootNamespace string) Watcher { logrus.Infof("Creating new namespace watcher for namespace %s", rootNamespace) @@ -56,6 +147,7 @@ func NewNamespaceWatcher(clientset kubernetes.Interface, cmcClientset mc.Interfa Logrus: logrus.WithFields(logrus.Fields{"component": "NamespaceWatcher", "rootNamespace": rootNamespace}), CrdWatchers: map[string]Watcher{}, Mutex: &sync.Mutex{}, + metrics: newNwMetrics(rootNamespace), CleanupTimeout: 1 * time.Minute, RootNamespace: rootNamespace, Running: false, @@ -104,9 +196,12 @@ func (n *NamespaceWatcher) Start(ctx context.Context) error { _ = n.Stop() } + n.metrics.restarts.Inc() break } + requestStart := time.Now().UnixMicro() + ns := evt.Object.(*corev1.Namespace) switch evt.Type { @@ -126,6 +221,7 @@ func (n *NamespaceWatcher) Start(ctx context.Context) error { n.Logrus.Debug("All is good! Sending event.") n.startCrdWatcher(ctx, ns.Name, &wg) n.Eventf(ns, "Normal", "Added", "CRD Watcher added for %s", ns.Name) + n.metrics.addedEvents.Inc() case watch.Deleted: n.Logrus.Infof("Deleting watcher for namespace %s", ns.Name) @@ -134,9 +230,14 @@ func (n *NamespaceWatcher) Start(ctx context.Context) error { } n.Logrus.Debug("All is good! 
Sending event.") + n.metrics.deletedEvents.Inc() n.Eventf(ns, "Normal", "Deleted", "CRD Watcher deleted for %s", ns.Name) } + requestEnd := time.Now().UnixMicro() + + n.metrics.eventDuration.Observe(float64(requestEnd - requestStart)) + case <-ctx.Done(): n.Logrus.Info("Context cancelled") _ = n.Stop() @@ -163,6 +264,8 @@ func (n *NamespaceWatcher) Start(ctx context.Context) error { n.Logrus.Info("Waiting for all CRD Watchers to finish") wg.Wait() + n.Logrus.Debug("Unregistering Prometheus metrics") + n.metrics.unregister() return err } @@ -204,6 +307,7 @@ func (n *NamespaceWatcher) removeWatcher(namespace string) error { err = fmt.Errorf("Watcher for namespace %s does not exist", namespace) } + n.metrics.cmcActive.Dec() return err } @@ -251,6 +355,9 @@ func (n *NamespaceWatcher) startCrdWatcher(ctx context.Context, namespace string n.Logrus.Errorf("Error while starting CRD Watcher for namespace %s", namespace) } }() + + n.metrics.cmcActive.Inc() + n.metrics.cmcSpawned.Inc() } func (n *NamespaceWatcher) restartWatch(ctx context.Context, wg *sync.WaitGroup) (watch.Interface, error) { diff --git a/internal/watcher/pod.go b/internal/watcher/pod.go index 6484c84..924087e 100644 --- a/internal/watcher/pod.go +++ b/internal/watcher/pod.go @@ -9,6 +9,8 @@ import ( "sync" "time" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" "github.com/sirupsen/logrus" apicorev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -25,16 +27,62 @@ type PodWatcher struct { Logrus logrus.FieldLogger Mutex *sync.Mutex - Namespace string + ForceStopChan chan interface{} + metrics *pwMetrics LabelSelector string + Namespace string PodList []*apicorev1.Pod Timeout time.Duration WatchTimeout time.Duration - ForceStopChan chan interface{} Enabled bool Running bool } +type pwMetrics struct { + // General statistics about the pods + podsAdded prometheus.Counter + podsRemoved prometheus.Counter + podsActive 
prometheus.Gauge + podsKilled prometheus.Counter + + // Number of restarts + restarts prometheus.Counter +} + +func (pw *pwMetrics) unregister() { + prometheus.Unregister(pw.podsAdded) + prometheus.Unregister(pw.podsRemoved) + prometheus.Unregister(pw.podsActive) + prometheus.Unregister(pw.podsKilled) + prometheus.Unregister(pw.restarts) +} + +func newPwMetrics(namespace, combinedLabelSelector string) *pwMetrics { + mkCounterOpts := func(name, help string) prometheus.CounterOpts { + return prometheus.CounterOpts{ + Namespace: "chaos_monkey", + Subsystem: "podwatcher", + Name: name, + Help: help, + ConstLabels: map[string]string{"namespace": namespace, "label_selector": combinedLabelSelector}, + } + } + + return &pwMetrics{ + podsAdded: promauto.NewCounter(mkCounterOpts("pods_added", "Total number of pods added")), + podsRemoved: promauto.NewCounter(mkCounterOpts("pods_removed", "Total number of pods removed")), + podsKilled: promauto.NewCounter(mkCounterOpts("pods_killed", "Total number of pods killed")), + restarts: promauto.NewCounter(mkCounterOpts("restarts", "Total number of restarts")), + podsActive: promauto.NewGauge(prometheus.GaugeOpts{ + Namespace: "chaos_monkey", + Subsystem: "podwatcher", + Name: "pods_active", + Help: "Current number of pods being targeted", + ConstLabels: map[string]string{"namespace": namespace, "label_selector": combinedLabelSelector}, + }), + } +} + var _ = (ConfigurableWatcher)((*PodWatcher)(nil)) func NewPodWatcher(clientset kubernetes.Interface, recorder record.EventRecorderLogger, namespace string, labelSelector ...string) ConfigurableWatcher { @@ -63,6 +111,7 @@ func NewPodWatcher(clientset kubernetes.Interface, recorder record.EventRecorder Timeout: 30 * time.Second, WatchTimeout: 15 * time.Minute, ForceStopChan: make(chan interface{}), + metrics: newPwMetrics(namespace, combinedSelector), Enabled: true, Running: false, } @@ -135,6 +184,7 @@ func (p *PodWatcher) Start(ctx context.Context) error { p.setRunning(false) } + 
p.metrics.restarts.Inc() break } @@ -183,6 +233,7 @@ func (p *PodWatcher) Start(ctx context.Context) error { "Pod got killed by Chaos Monkey", ) p.Logrus.Debug("Pod disrupted, event sent!") + p.metrics.podsKilled.Inc() } } @@ -218,6 +269,8 @@ func (p *PodWatcher) Stop() error { p.Logrus.Warn("Could not write in channel") } + p.metrics.unregister() + return nil } @@ -249,6 +302,8 @@ func (p *PodWatcher) addPodToList(pod *apicorev1.Pod) { p.Logrus.Debugf("Current pod list size: %d", len(p.PodList)) p.PodList = append(p.PodList, pod) p.Logrus.Debugf("Final pod list size: %d", len(p.PodList)) + p.metrics.podsAdded.Inc() + p.metrics.podsActive.Set(float64(len(p.PodList))) } func (p *PodWatcher) removePodFromList(pod *apicorev1.Pod) { @@ -261,6 +316,8 @@ func (p *PodWatcher) removePodFromList(pod *apicorev1.Pod) { return v.Name == pod.Name }) p.Logrus.Debugf("Final pod list size: %d", len(p.PodList)) + p.metrics.podsRemoved.Inc() + p.metrics.podsActive.Set(float64(len(p.PodList))) } func (p *PodWatcher) getRandomPod() (*apicorev1.Pod, error) { diff --git a/main.tf b/main.tf index ce93eb6..db16d95 100644 --- a/main.tf +++ b/main.tf @@ -206,29 +206,35 @@ resource "kubernetes_cluster_role_binding" "chaos-monkey-bind" { } resource "kubernetes_deployment" "chaos-monkey-deployment" { + timeouts { + create = "30s" + delete = "30s" + update = "30s" + } + metadata { name = "chaos-monkey" namespace = kubernetes_namespace.chaosmonkey.id labels = { - "fr.arnal.app/name" = "chaos-monkey" + "apps.massix.github.io/name" = "chaos-monkey" } annotations = { - "fr.arnal.app/image-id" = docker_image.chaos-monkey-image.id - "fr.arnal.app/dockerfile-sha" = sha256(file("${path.module}/Dockerfile")) + "apps.massix.github.io/image-id" = docker_image.chaos-monkey-image.id + "apps.massix.github.io/dockerfile-sha" = sha256(file("${path.module}/Dockerfile")) } } spec { selector { match_labels = { - "fr.arnal.app/name" = "chaos-monkey" + "apps.massix.github.io/name" = "chaos-monkey" } } template { 
metadata { labels = { - "fr.arnal.app/name" = "chaos-monkey" + "apps.massix.github.io/name" = "chaos-monkey" } } spec { @@ -241,6 +247,11 @@ resource "kubernetes_deployment" "chaos-monkey-deployment" { name = "CHAOSMONKEY_LOGLEVEL" value = "debug" } + port { + container_port = 9000 + name = "metrics" + protocol = "TCP" + } } } } @@ -261,6 +272,27 @@ resource "kubernetes_deployment" "chaos-monkey-deployment" { } } +resource "kubernetes_service" "chaos-monkey-service" { + metadata { + name = "chaos-monkey" + namespace = kubernetes_namespace.chaosmonkey.id + annotations = { + "prometheus.io/scrape" = "true" + } + } + spec { + type = "ClusterIP" + selector = { + "apps.massix.github.io/name" = "chaos-monkey" + } + port { + target_port = "metrics" + port = 80 + name = "metrics" + } + } +} + // We are going to disrupt the SCALE of this deployment resource "kubernetes_deployment" "nginx-disrupt-scale" { metadata { diff --git a/tests/kubetest.sh b/tests/kubetest.sh index 74efb52..73d5f66 100755 --- a/tests/kubetest.sh +++ b/tests/kubetest.sh @@ -2,6 +2,7 @@ KUBECTL=$(which kubectl) CLUSTER_NAME="${TERRAFORM_CLUSTER_NAME:-chaosmonkey-cluster}" +CURL=$(which curl) set -eo pipefail @@ -47,6 +48,12 @@ debug() { err() { log error "$*" + + if [[ "${PF_PID}" != "" ]]; then + info "Force stopping port-forward after failure" + kill -15 "${PF_PID}" + fi + exit 1 } @@ -157,12 +164,28 @@ checkNumberPods() { done } +checkMetric() { + local hostname="$1" + local metricName="$2" + + info "Checking presence of metric $metricName @ $hostname" + if ! 
${CURL} -s "${hostname}/metrics" | grep "${metricName}" 2>/dev/null >/dev/null; then + err "Metric ${metricName} not found on ${hostname}" + fi +} + debug "Checking kubectl @ ${KUBECTL}" if [[ -z "${KUBECTL}" ]]; then err "Please install kubectl: https://kubernetes.io/docs/tasks/tools/install-kubectl/" fi info "Kubectl found at ${KUBECTL}" +debug "Checking curl @ ${CURL}" +if [[ -z "${CURL}" ]]; then + err "Please install curl: https://curl.se/download.html" +fi +info "Curl found at ${CURL}" + # Check if the cluster has been started debug "Check that ${CLUSTER_NAME} exists" if ! ${KUBECTL} config get-contexts | grep "kind-${CLUSTER_NAME}" &>/dev/null; then @@ -188,7 +211,7 @@ info "Checking pods" for ns in target chaosmonkey; do debug "Checking if pods in namespace ${ns} are ready" if ! ${KUBECTL} get pods --namespace=${ns} | grep Running &>/dev/null; then - err "Pods in namespace ${ns} target are not ready" + err "Pods in namespace ${ns} are not ready" fi done @@ -199,6 +222,13 @@ if [[ ${deploymentCount} != 1 ]]; then err "chaosmonkey namespace should contain 1 deployment" fi +info "Checking service" +serviceCount=$(${KUBECTL} get services --namespace=chaosmonkey --no-headers | wc -l) +debug "chaosmonkey namespace contains ${serviceCount} services" +if [[ ${serviceCount} != 1 ]]; then + err "chaosmonkey namespace should contain 1 service" +fi + deploymentCount=$(${KUBECTL} get deployments --namespace=target --no-headers | wc -l) debug "target namespace contains ${deploymentCount} deployment(s)" if [[ ${deploymentCount} != 2 ]]; then @@ -295,4 +325,49 @@ checkPods "app=${disruptScale}" info "Checking that we still have 2 pods" checkNumberPods "app=${disruptScale}" 2 +info "Checking that chaosmonkey did not crash even once" +restartCount=$(${KUBECTL} -n chaosmonkey get pods -o jsonpath='{.items[0].status.containerStatuses[0].restartCount}') +debug "Restart count: ${restartCount}" +if [ "${restartCount}" -ne 0 ]; then + err "Chaosmonkey crashed :(" +fi + +info 
"Checking exposed metrics by ChaosMonkey" +debug "Opening port-forward" +${KUBECTL} port-forward -n chaosmonkey svc/chaos-monkey 9090:80 >/dev/null & + +PF_PID="$!" +HOSTNAME="http://localhost:9090" +debug "port-forward pid is ${PF_PID}" +sleep 2 + +ALLMETRICS=( + "chaos_monkey_nswatcher_events" + "chaos_monkey_nswatcher_event_duration_bucket" + "chaos_monkey_nswatcher_cmc_spawned" + "chaos_monkey_nswatcher_cmc_active" + "chaos_monkey_nswatcher_restarts" + "chaos_monkey_crdwatcher_events" + "chaos_monkey_crdwatcher_pw_spawned" + "chaos_monkey_crdwatcher_pw_active" + "chaos_monkey_crdwatcher_dw_spawned" + "chaos_monkey_crdwatcher_dw_active" + "chaos_monkey_crdwatcher_event_duration_bucket" + "chaos_monkey_crdwatcher_restarts" + "chaos_monkey_podwatcher_pods_added" + "chaos_monkey_podwatcher_pods_removed" + "chaos_monkey_podwatcher_pods_killed" + "chaos_monkey_podwatcher_pods_active" + "chaos_monkey_podwatcher_restarts" + "chaos_monkey_deploymentwatcher_deployments_rescaled" + "chaos_monkey_deploymentwatcher_random_distribution" + "chaos_monkey_deploymentwatcher_last_scale" +) +for m in "${ALLMETRICS[@]}"; do + checkMetric $HOSTNAME "$m" +done + +debug "Stopping port-forward" +kill -15 ${PF_PID} + info "All tests passed!"