feat(health): implement health endpoint (wip)
massix committed Jul 6, 2024
1 parent 25905aa commit 463726b
Showing 4 changed files with 289 additions and 26 deletions.
12 changes: 8 additions & 4 deletions cmd/chaosmonkey/main.go
@@ -10,6 +10,7 @@ import (

"github.com/massix/chaos-monkey/internal/apis/clientset/versioned"
"github.com/massix/chaos-monkey/internal/configuration"
"github.com/massix/chaos-monkey/internal/endpoints"
"github.com/massix/chaos-monkey/internal/watcher"
"github.com/prometheus/client_golang/prometheus/promhttp"
"github.com/sirupsen/logrus"
@@ -76,23 +77,26 @@ func main() {

// Spawn the HTTP Server for Prometheus in background
srv := &http.Server{
- Handler: promhttp.Handler(),
- Addr:    "0.0.0.0:9000",
Addr: "0.0.0.0:9000",
}

// Register methods
http.Handle("GET /metrics", promhttp.Handler())
http.Handle("GET /health", endpoints.NewHealthEndpoint(nsWatcher.(*watcher.NamespaceWatcher)))

wg.Add(1)
go func() {
defer wg.Done()
if err := srv.ListenAndServe(); err != nil {
- log.Warnf("Could not spawn Prometheus handler: %s", err)
log.Warnf("Could not spawn http server: %s", err)
}
}()

// Wait for a signal to arrive
<-s

if err := srv.Shutdown(context.Background()); err != nil {
- log.Warnf("Could not shutdown Prometheus handler: %s", err)
log.Warnf("Could not shutdown http server: %s", err)
}

log.Info("Shutting down...")
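A note on the wiring above: the http.Server no longer sets its own Handler, so ListenAndServe falls back to http.DefaultServeMux, which is where the method-prefixed patterns are registered (this routing syntax requires Go 1.22+). Below is a minimal standalone sketch of the same setup, with a hypothetical stubHealth standing in for endpoints.NewHealthEndpoint:

package main

import (
	"log"
	"net/http"

	"github.com/prometheus/client_golang/prometheus/promhttp"
)

// stubHealth is a placeholder for the real HealthEndpoint handler.
func stubHealth(w http.ResponseWriter, r *http.Request) {
	w.WriteHeader(http.StatusOK)
	_, _ = w.Write([]byte(`{"status":"up"}`))
}

func main() {
	// Handler is left nil, so the server serves http.DefaultServeMux.
	srv := &http.Server{Addr: "0.0.0.0:9000"}

	// Method-prefixed patterns are registered on the default mux (Go 1.22+).
	http.Handle("GET /metrics", promhttp.Handler())
	http.Handle("GET /health", http.HandlerFunc(stubHealth))

	if err := srv.ListenAndServe(); err != nil {
		log.Printf("server stopped: %s", err)
	}
}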
139 changes: 139 additions & 0 deletions internal/endpoints/health.go
@@ -0,0 +1,139 @@
package endpoints

import (
"encoding/json"
"errors"
"net/http"

"github.com/massix/chaos-monkey/internal/configuration"
"github.com/massix/chaos-monkey/internal/watcher"
"github.com/sirupsen/logrus"
)

type HealthEndpoint struct {
mainWatcher *watcher.NamespaceWatcher
logrus logrus.FieldLogger
}

type response struct {
Status string `json:"status"`
RootNamespace string `json:"root_namespace"`
Behavior configuration.Behavior `json:"behavior"`
Loglevel string `json:"loglevel"`
TotalWatchers int `json:"total_watchers"`
}

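// stack holds the watchers that still have to be visited. pop takes elements
// from the front of the slice, so CountWatchers below effectively walks the
// tree in breadth-first order.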
type stack[T any] struct {
elems []T
}

func (s *stack[T]) push(elem T) {
s.elems = append(s.elems, elem)
}

func (s *stack[T]) pop() (T, error) {
if s.isEmpty() {
return *new(T), errors.New("Empty stack")
}
ret := s.elems[0]
s.elems = s.elems[1:]
return ret, nil
}

func (s *stack[T]) isEmpty() bool {
return len(s.elems) == 0
}

func NewHealthEndpoint(w *watcher.NamespaceWatcher) *HealthEndpoint {
return &HealthEndpoint{
mainWatcher: w,
logrus: logrus.WithFields(logrus.Fields{"component": "HealthEndpoint"}),
}
}

func (e *HealthEndpoint) CountWatchers() int {
stack := &stack[watcher.Watcher]{}
count := 0
stack.push(e.mainWatcher)
e.logrus.Debug("Counting watchers")

for !stack.isEmpty() {
current, err := stack.pop()
if err != nil {
e.logrus.Warn(err)
continue
}

switch w := current.(type) {
case (*watcher.NamespaceWatcher):
if w == nil {
e.logrus.Warn("Nil pointer in loop")
continue
}
e.logrus.Debugf("Found NamespaceWatcher with %d children", len(w.CrdWatchers))
for _, child := range w.CrdWatchers {
stack.push(child)
}
case (*watcher.CrdWatcher):
if w == nil {
e.logrus.Warn("Nil pointer in loop")
continue
}
e.logrus.Debugf("Found CrdWatcher with %d children", len(w.DeploymentWatchers))
for _, child := range w.DeploymentWatchers {
stack.push(child.Watcher)
}
default:
if w == nil {
e.logrus.Warn("Nil pointer in loop")
continue
}
e.logrus.Debug("Pod or DeploymentWatcher found")
}

count++
}

e.logrus.Debugf("Total count: %d", count)
return count
}

// ServeHTTP implements http.Handler.
func (e *HealthEndpoint) ServeHTTP(w http.ResponseWriter, r *http.Request) {
e.logrus.Debugf(
"Handling request '%s %s' from %s",
r.Method,
r.URL.Path,
r.RemoteAddr,
)

if e.mainWatcher == nil {
e.logrus.Error("Nil pointer in main watcher")
w.WriteHeader(http.StatusInternalServerError)
w.Write([]byte{})
return
}

response := &response{
Status: "up",
TotalWatchers: e.CountWatchers(),
RootNamespace: e.mainWatcher.RootNamespace,
Behavior: e.mainWatcher.Behavior,
Loglevel: logrus.GetLevel().String(),
}

if b, err := json.Marshal(response); err == nil {
w.WriteHeader(http.StatusOK)
if _, err := w.Write(b); err != nil {
e.logrus.Errorf("Could not write response: %s", err)
}
} else {
e.logrus.Errorf("Could not marshal response: %s", err)
w.WriteHeader(http.StatusInternalServerError)
if _, err := w.Write([]byte{}); err != nil {
e.logrus.Errorf("Could not write response: %s", err)
}
}
}

var _ = (http.Handler)((*HealthEndpoint)(nil))
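For reference, a healthy instance answers GET /health with status 200 and a JSON body shaped like the sketch below; all values are illustrative, and the exact behavior string depends on how configuration.Behavior is encoded. Note that the handler does not currently set a Content-Type header on the response.

{
  "status": "up",
  "root_namespace": "chaosmonkey",
  "behavior": "DenyAll",
  "loglevel": "debug",
  "total_watchers": 42
}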
127 changes: 127 additions & 0 deletions internal/endpoints/health_test.go
@@ -0,0 +1,127 @@
package endpoints

import (
"encoding/json"
"fmt"
"io"
"math/rand"
"net/http"
"net/http/httptest"
"strings"
"testing"

"github.com/massix/chaos-monkey/internal/configuration"
"github.com/massix/chaos-monkey/internal/watcher"
"github.com/sirupsen/logrus"
)

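// setup builds a watcher tree with a single NamespaceWatcher at the root,
// crdWatchers CrdWatchers below it and deployWatchers Pod- or
// DeploymentWatchers below each of those. It returns the root together with
// the total number of nodes created: 1 + crdWatchers + crdWatchers*deployWatchers.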
func setup(crdWatchers, deployWatchers int) (*watcher.NamespaceWatcher, int) {
mainWatcher := &watcher.NamespaceWatcher{
CrdWatchers: map[string]watcher.Watcher{},
RootNamespace: "root",
Behavior: configuration.DenyAll,
}
created := 1

for i := 0; i < crdWatchers; i++ {
crdWatcher := &watcher.CrdWatcher{DeploymentWatchers: map[string]*watcher.WatcherConfiguration{}}
mainWatcher.CrdWatchers[fmt.Sprintf("%d", i)] = crdWatcher
created++

for j := 0; j < deployWatchers; j++ {
var target watcher.ConfigurableWatcher
if rand.Intn(2) == 0 {
target = &watcher.PodWatcher{}
} else {
target = &watcher.DeploymentWatcher{}
}
crdWatcher.DeploymentWatchers[fmt.Sprintf("%d", j)] = &watcher.WatcherConfiguration{
Watcher: target,
}
created++
}
}

return mainWatcher, created
}

func TestHealthEndpoint_CountWatchers(t *testing.T) {
logrus.SetLevel(logrus.DebugLevel)
t.Run("Nominal case", func(t *testing.T) {
mw, created := setup(10, 15)
ep := NewHealthEndpoint(mw)
if cnt := ep.CountWatchers(); cnt != (10*15)+10+1 {
t.Fatalf("wrong count: %d (expecting: %d)", cnt, created)
}
})
t.Run("Nil pointer", func(t *testing.T) {
ep := NewHealthEndpoint(nil)
if cnt := ep.CountWatchers(); cnt != 0 {
t.Fatalf("wrong count: %d (expecting: %d)", cnt, 0)
}
})
}

func TestHealthEndpoint_ServeHTTP(t *testing.T) {
newTest := func() (*httptest.ResponseRecorder, *http.Request) {
rec := httptest.NewRecorder()
req := httptest.NewRequest("GET", "/health", strings.NewReader(""))

return rec, req
}

t.Run("Nominal case", func(t *testing.T) {
rec, req := newTest()
w, tot := setup(10, 20)
ep := NewHealthEndpoint(w)
ep.ServeHTTP(rec, req)

if rec.Code != http.StatusOK {
t.Fatalf("Wrong status code: %d", rec.Code)
}

var response response
res := rec.Result()
defer res.Body.Close()
bResponse, err := io.ReadAll(res.Body)

if err != nil {
t.Fatal(err)
}

if err := json.Unmarshal(bResponse, &response); err != nil {
t.Fatal(err)
}

if response.TotalWatchers != tot {
t.Errorf("Wrong count of watchers %d", response.TotalWatchers)
}

if response.RootNamespace != "root" {
t.Errorf("Wrong namespace %q", response.RootNamespace)
}

if response.Behavior != configuration.DenyAll {
t.Errorf("Wrong behavior %q", response.Behavior)
}
})

t.Run("Nil pointer", func(t *testing.T) {
rec, req := newTest()
ep := NewHealthEndpoint(nil)
ep.ServeHTTP(rec, req)

if rec.Code != http.StatusInternalServerError {
t.Fatalf("Wrong status code: %d", rec.Code)
}
})
}

func BenchmarkHealthEndpoint_CountWatchers(b *testing.B) {
logrus.SetLevel(logrus.InfoLevel)
mw, _ := setup(1000, 500)
ep := NewHealthEndpoint(mw)

for i := 0; i < b.N; i++ {
ep.CountWatchers()
}
}
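With 1000 CrdWatchers of 500 watchers each, the benchmark tree holds 1 + 1000 + 1000×500 = 501,001 nodes, so every go test -bench iteration walks roughly half a million watchers.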
37 changes: 15 additions & 22 deletions main.tf
@@ -220,6 +220,8 @@ resource "kubernetes_deployment" "chaos-monkey-deployment" {
annotations = {
"apps.massix.github.io/image-id" = docker_image.chaos-monkey-image.id
"apps.massix.github.io/dockerfile-sha" = sha256(file("${path.module}/Dockerfile"))
"prometheus.io/scrape" = "true"
"prometheus.io/path" = "/metrics"
}
}

@@ -242,6 +244,18 @@ resource "kubernetes_deployment" "chaos-monkey-deployment" {
name = "chaos-monkey"
image = docker_image.chaos-monkey-image.name
image_pull_policy = "Never"
liveness_probe {
http_get {
port = "http"
path = "/health"
}
}
readiness_probe {
http_get {
port = "http"
path = "/health"
}
}
env {
name = "CHAOSMONKEY_LOGLEVEL"
value = "debug"
@@ -252,7 +266,7 @@ resource "kubernetes_deployment" "chaos-monkey-deployment" {
}
port {
container_port = 9000
- name = "metrics"
name = "http"
protocol = "TCP"
}
}
@@ -275,27 +289,6 @@ resource "kubernetes_deployment" "chaos-monkey-deployment" {
}
}

- resource "kubernetes_service" "chaos-monkey-service" {
- metadata {
- name = "chaos-monkey"
- namespace = kubernetes_namespace.chaosmonkey.id
- annotations = {
- "prometheus.io/scrape" = "true"
- }
- }
- spec {
- type = "ClusterIP"
- selector = {
- "apps.massix.github.io/name" = "chaos-monkey"
- }
- port {
- target_port = "metrics"
- port = 80
- name = "metrics"
- }
- }
- }

// We are going to disrupt the SCALE of this deployment
resource "kubernetes_deployment" "nginx-disrupt-scale" {
metadata {
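On the Terraform side, both probes target GET /health on the container port now named "http" (9000), so Kubernetes restarts the pod or takes it out of rotation once the endpoint starts answering with 500 (for instance when the main watcher is nil). The dedicated ClusterIP Service for metrics is removed, and the prometheus.io/scrape and prometheus.io/path annotations move onto the pod template, so scraping presumably happens against the pods directly rather than through a Service.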
