Merge pull request #88 from atlassian-labs/vportella/add-max-failed-cnrs-threshold

Add a max failed CNRs threshold to Nodegroups
vincentportella authored Sep 2, 2024
2 parents dc71b47 + 4eedd00 commit dc687f3
Showing 13 changed files with 741 additions and 63 deletions.
5 changes: 5 additions & 0 deletions deploy/crds/atlassian.com_nodegroups_crd.yaml
@@ -149,6 +149,11 @@ spec:
- waitPeriod
type: object
type: array
maxFailedCycleNodeRequests:
description: MaxFailedCycleNodeRequests defines the maximum number
of allowed failed CNRs for a nodegroup before the observer stops
generating them.
type: integer
nodeGroupName:
description: NodeGroupName is the name of the node group in the cloud
provider that corresponds to this NodeGroup resource.
20 changes: 20 additions & 0 deletions pkg/apis/atlassian/v1/common.go
@@ -31,3 +31,23 @@ func buildNodeGroupNames(nodeGroupsList []string, nodeGroupName string) []string

return nodeGroups
}

// sameNodeGroups compares two lists of nodegroup names and checks that they
// are the same. Ordering does not affect equality.
func sameNodeGroups(groupA, groupB []string) bool {
if len(groupA) != len(groupB) {
return false
}

groupMap := make(map[string]struct{})
for _, group := range groupA {
groupMap[group] = struct{}{}
}

for _, group := range groupB {
if _, ok := groupMap[group]; !ok {
return false
}
}
return true
}
41 changes: 41 additions & 0 deletions pkg/apis/atlassian/v1/common_test.go
@@ -87,3 +87,44 @@ func TestBuildNodeGroupNames(t *testing.T) {
})
}
}

func Test_sameNodeGroups(t *testing.T) {
tests := []struct {
name string
groupA []string
groupB []string
expect bool
}{
{
"pass case with same order",
[]string{"ingress-us-west-2a", "ingress-us-west-2b", "ingress-us-west-2c"},
[]string{"ingress-us-west-2a", "ingress-us-west-2b", "ingress-us-west-2c"},
true,
},
{
"pass case with different order",
[]string{"ingress-us-west-2a", "ingress-us-west-2b", "ingress-us-west-2c"},
[]string{"ingress-us-west-2b", "ingress-us-west-2c", "ingress-us-west-2a"},
true,
},
{
"failure case with different length",
[]string{"ingress-us-west-2a", "ingress-us-west-2b", "ingress-us-west-2c"},
[]string{"ingress-us-west-2b", "ingress-us-west-2c"},
false,
},
{
"failure case with different items",
[]string{"ingress-us-west-2a", "ingress-us-west-2b", "ingress-us-west-2c"},
[]string{"ingress-us-west-2b", "ingress-us-west-2c", "system"},
false,
},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got := sameNodeGroups(tt.groupA, tt.groupB)
assert.Equal(t, tt.expect, got)
})
}
}
21 changes: 21 additions & 0 deletions pkg/apis/atlassian/v1/cyclenoderequest.go
@@ -15,3 +15,24 @@ func (in *CycleNodeRequest) NodeLabelSelector() (labels.Selector, error) {
func (in *CycleNodeRequest) GetNodeGroupNames() []string {
return buildNodeGroupNames(in.Spec.NodeGroupsList, in.Spec.NodeGroupName)
}

// IsFromNodeGroup returns whether the CycleNodeRequest is part of the
// provided NodeGroup by comparing the list of named cloud provider nodegroups
// defined in each one. Ordering does not affect equality.
func (in *CycleNodeRequest) IsFromNodeGroup(nodegroup NodeGroup) bool {
return sameNodeGroups(
buildNodeGroupNames(in.Spec.NodeGroupsList, in.Spec.NodeGroupName),
nodegroup.GetNodeGroupNames(),
)
}

// IsFromSameNodeGroup returns whether the CycleNodeRequest is part of the
// same nodegroup as the provided CycleNodeRequest by comparing the list of
// named cloud provider nodegroups defined in each one. Ordering does not
// affect equality.
func (in *CycleNodeRequest) IsFromSameNodeGroup(cnr CycleNodeRequest) bool {
return sameNodeGroups(
buildNodeGroupNames(in.Spec.NodeGroupsList, in.Spec.NodeGroupName),
cnr.GetNodeGroupNames(),
)
}
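These helpers give the observer a way to tie CycleNodeRequests back to the nodegroup that generated them. A minimal sketch of how a failure-threshold check could be built on top of IsFromNodeGroup and the new MaxFailedCycleNodeRequests field follows; the function name, the pre-fetched CNR list, and the zero-means-no-limit behaviour are illustrative assumptions, not the observer code from this PR.

// Hypothetical sketch only: count failed CNRs that belong to a nodegroup and
// compare against the configured threshold. Assumes the CycleNodeRequestFailed
// phase constant defined in this package.
func tooManyFailedCNRs(nodegroup NodeGroup, cnrs []CycleNodeRequest) bool {
    // Treat a zero threshold as "no limit" in this sketch.
    if nodegroup.Spec.MaxFailedCycleNodeRequests == 0 {
        return false
    }

    var failed uint

    for _, cnr := range cnrs {
        // Only count CNRs generated for this nodegroup.
        if !cnr.IsFromNodeGroup(nodegroup) {
            continue
        }

        if cnr.Status.Phase == CycleNodeRequestFailed {
            failed++
        }
    }

    return failed >= nodegroup.Spec.MaxFailedCycleNodeRequests
}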
4 changes: 4 additions & 0 deletions pkg/apis/atlassian/v1/nodegroup_types.go
@@ -19,6 +19,10 @@ type NodeGroupSpec struct {
// CycleSettings stores the settings to use for cycling the nodes.
CycleSettings CycleSettings `json:"cycleSettings"`

// MaxFailedCycleNodeRequests defines the maximum number of allowed failed CNRs for a nodegroup before the observer
// stops generating them.
MaxFailedCycleNodeRequests uint `json:"maxFailedCycleNodeRequests,omitempty"`

// ValidationOptions stores the settings to use for validating state of nodegroups
// in kube and the cloud provider for cycling the nodes.
ValidationOptions ValidationOptions `json:"validationOptions,omitempty"`
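For illustration only, a NodeGroupSpec using the new field could look like the following; the values are placeholders and only fields visible in this diff are shown.

// Illustrative placeholder values; other NodeGroupSpec fields are omitted.
var exampleNodeGroupSpec = NodeGroupSpec{
    NodeGroupName: "ingress-us-west-2a",

    CycleSettings: CycleSettings{
        Concurrency: 1,
        Method:      CycleNodeRequestMethodDrain,
    },

    // With the threshold set, the observer stops generating new CNRs for
    // this nodegroup once too many of its CNRs have failed.
    MaxFailedCycleNodeRequests: 3,
}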
26 changes: 24 additions & 2 deletions pkg/controller/cyclenoderequest/transitioner/test_helpers.go
@@ -6,6 +6,8 @@ import (
v1 "github.com/atlassian-labs/cyclops/pkg/apis/atlassian/v1"
"github.com/atlassian-labs/cyclops/pkg/controller"
"github.com/atlassian-labs/cyclops/pkg/mock"

"sigs.k8s.io/controller-runtime/pkg/client"
)

type Option func(t *Transitioner)
@@ -22,6 +24,18 @@ func WithKubeNodes(nodes []*mock.Node) Option {
}
}

func WithExtraKubeObject(extraKubeObject client.Object) Option {
return func(t *Transitioner) {
t.extraKubeObjects = append(t.extraKubeObjects, extraKubeObject)
}
}

func WithTransitionerOptions(options Options) Option {
return func(t *Transitioner) {
t.transitionerOptions = options
}
}

// ************************************************************************** //

type Transitioner struct {
@@ -30,6 +44,10 @@ type Transitioner struct {

CloudProviderInstances []*mock.Node
KubeNodes []*mock.Node

extraKubeObjects []client.Object

transitionerOptions Options
}

func NewFakeTransitioner(cnr *v1.CycleNodeRequest, opts ...Option) *Transitioner {
@@ -38,13 +56,17 @@ func NewFakeTransitioner(cnr *v1.CycleNodeRequest, opts ...Option) *Transitioner
// override these as needed
CloudProviderInstances: make([]*mock.Node, 0),
KubeNodes: make([]*mock.Node, 0),
extraKubeObjects: []client.Object{cnr},
transitionerOptions: Options{},
}

for _, opt := range opts {
opt(t)
}

t.Client = mock.NewClient(t.KubeNodes, t.CloudProviderInstances, cnr)
t.Client = mock.NewClient(
t.KubeNodes, t.CloudProviderInstances, t.extraKubeObjects...,
)

rm := &controller.ResourceManager{
Client: t.K8sClient,
@@ -54,7 +76,7 @@ }
}

t.CycleNodeRequestTransitioner = NewCycleNodeRequestTransitioner(
cnr, rm, Options{},
cnr, rm, t.transitionerOptions,
)

return t
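The tests shown later in this diff do not exercise the two new options; a hypothetical invocation, reusing the cnr and nodegroup setup from those tests, might look like this (the failed sibling CNR exists only for illustration).

// Hypothetical usage sketch of the new functional options.
failedSibling := &v1.CycleNodeRequest{
    ObjectMeta: metav1.ObjectMeta{
        Name:      "cnr-0",
        Namespace: "kube-system",
    },
    Status: v1.CycleNodeRequestStatus{
        Phase: v1.CycleNodeRequestFailed,
    },
}

fakeTransitioner := NewFakeTransitioner(cnr,
    WithKubeNodes(nodegroup),
    WithCloudProviderInstances(nodegroup),
    // Seed the fake kube client with the failed sibling CNR.
    WithExtraKubeObject(failedSibling),
    // Run the transitioner with CNR deletion enabled.
    WithTransitionerOptions(Options{DeleteCNR: true}),
)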
13 changes: 11 additions & 2 deletions pkg/controller/cyclenoderequest/transitioner/transitions.go
@@ -582,10 +582,18 @@ func (t *CycleNodeRequestTransitioner) transitionSuccessful() (reconcile.Result,
if err != nil {
return t.transitionToHealing(err)
}

if shouldRequeue {
return reconcile.Result{Requeue: true, RequeueAfter: transitionDuration}, nil
}

// Delete failed sibling CNRs regardless of whether the CNR for the
// transitioner should be deleted. If failed CNRs pile up, they will prevent
// the Cyclops observer from auto-generating new CNRs for the nodegroup.
if err := t.deleteFailedSiblingCNRs(); err != nil {
return t.transitionToHealing(err)
}

// If deleting CycleNodeRequests is not enabled, stop here
if !t.options.DeleteCNR {
return reconcile.Result{}, nil
@@ -595,10 +603,11 @@ func (t *CycleNodeRequestTransitioner) transitionSuccessful() (reconcile.Result,
// than the time configured to keep them for.
if t.cycleNodeRequest.CreationTimestamp.Add(t.options.DeleteCNRExpiry).Before(time.Now()) {
t.rm.Logger.Info("Deleting CycleNodeRequest")
err := t.rm.Client.Delete(context.TODO(), t.cycleNodeRequest)
if err != nil {

if err := t.rm.Client.Delete(context.TODO(), t.cycleNodeRequest); err != nil {
t.rm.Logger.Error(err, "Unable to delete expired CycleNodeRequest")
}

return reconcile.Result{}, nil
}

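The deleteFailedSiblingCNRs helper called above is defined elsewhere in this PR and is not part of this excerpt. A minimal sketch of such a cleanup, assuming sibling CNRs live in the same namespace and using the controller-runtime client already held by the transitioner, could look like:

// Hypothetical sketch of the failed-sibling cleanup; the real
// deleteFailedSiblingCNRs in this PR is not shown in this excerpt.
func (t *CycleNodeRequestTransitioner) deleteFailedSiblingCNRsSketch() error {
    var cnrList v1.CycleNodeRequestList

    // List CNRs in the same namespace as the one being transitioned.
    if err := t.rm.Client.List(context.TODO(), &cnrList,
        client.InNamespace(t.cycleNodeRequest.Namespace)); err != nil {
        return err
    }

    for i := range cnrList.Items {
        cnr := &cnrList.Items[i]

        // Skip CNRs for other nodegroups and CNRs that have not failed.
        if !cnr.IsFromSameNodeGroup(*t.cycleNodeRequest) ||
            cnr.Status.Phase != v1.CycleNodeRequestFailed {
            continue
        }

        if err := t.rm.Client.Delete(context.TODO(), cnr); err != nil {
            return err
        }
    }

    return nil
}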
@@ -546,6 +546,74 @@ func TestPendingTimeoutReached(t *testing.T) {
assert.Equal(t, v1.CycleNodeRequestHealing, cnr.Status.Phase)
}

// Test to ensure that Cyclops will not proceed if there is a node detached
// from the nodegroup on the cloud provider. It should wait for the issue to
// resolve and transition to Initialised once it does, before reaching the
// timeout period.
func TestPendingReattachedCloudProviderNode(t *testing.T) {
nodegroup, err := mock.NewNodegroup("ng-1", 2)
if err != nil {
assert.NoError(t, err)
}

// "detach" one instance from the asg
nodegroup[0].Nodegroup = ""

cnr := &v1.CycleNodeRequest{
ObjectMeta: metav1.ObjectMeta{
Name: "cnr-1",
Namespace: "kube-system",
},
Spec: v1.CycleNodeRequestSpec{
NodeGroupsList: []string{"ng-1"},
CycleSettings: v1.CycleSettings{
Concurrency: 1,
Method: v1.CycleNodeRequestMethodDrain,
},
Selector: metav1.LabelSelector{
MatchLabels: map[string]string{
"customer": "kitt",
},
},
},
Status: v1.CycleNodeRequestStatus{
Phase: v1.CycleNodeRequestPending,
},
}

fakeTransitioner := NewFakeTransitioner(cnr,
WithKubeNodes(nodegroup),
WithCloudProviderInstances(nodegroup),
)

// Should requeue while it tries to wait
_, err = fakeTransitioner.Run()
assert.NoError(t, err)
assert.Equal(t, v1.CycleNodeRequestPending, cnr.Status.Phase)

// Simulate waiting for 1s less than the wait limit
cnr.Status.EquilibriumWaitStarted = &metav1.Time{
Time: time.Now().Add(-nodeEquilibriumWaitLimit + time.Second),
}

_, err = fakeTransitioner.Autoscaling.AttachInstances(&autoscaling.AttachInstancesInput{
AutoScalingGroupName: aws.String("ng-1"),
InstanceIds: aws.StringSlice([]string{nodegroup[0].InstanceID}),
})

assert.NoError(t, err)

// "re-attach" the instance to the asg
fakeTransitioner.CloudProviderInstances[0].Nodegroup = "ng-1"

// The CNR should transition to the Initialised phase because the state of
// the nodes is now correct and this happened within the timeout period.
_, err = fakeTransitioner.Run()
assert.NoError(t, err)
assert.Equal(t, v1.CycleNodeRequestInitialised, cnr.Status.Phase)
assert.Len(t, cnr.Status.NodesToTerminate, 2)
}

// Test to ensure that Cyclops will not proceed if there is a node detached
// from the nodegroup on the cloud provider. It should wait and, crucially,
// should not succeed if the instance is re-attached by the final requeuing of the Pending
@@ -596,6 +664,16 @@ func TestPendingReattachedCloudProviderNodeTooLate(t *testing.T) {
Time: time.Now().Add(-nodeEquilibriumWaitLimit - time.Second),
}

_, err = fakeTransitioner.Autoscaling.AttachInstances(&autoscaling.AttachInstancesInput{
AutoScalingGroupName: aws.String("ng-1"),
InstanceIds: aws.StringSlice([]string{nodegroup[0].InstanceID}),
})

assert.NoError(t, err)

// "re-attach" the instance to the asg
fakeTransitioner.CloudProviderInstances[0].Nodegroup = "ng-1"

// This time the CNR should transition to the Healing phase even though the
// state is correct, because the timeout check happens first
_, err = fakeTransitioner.Run()