From 1ccf4dda897a5b069c6c1f1a4ea2e5564d4044cb Mon Sep 17 00:00:00 2001
From: Abhishek Dasgupta
Date: Mon, 25 Jul 2022 13:15:26 +0530
Subject: [PATCH] Add logic for the quorum loss scenario.

---
 config/rbac/role.yaml                     | 116 ++----
 controllers/config/custodian.go           |   2 +
 controllers/controllers_suite_test.go     |  10 +-
 controllers/etcd_controller.go            | 213 ++++++++++++++++++++--
 controllers/etcd_controller_test.go       | 213 ++++++++++++++++++++++
 controllers/etcd_custodian_controller.go  |  45 +++++
 main.go                                   |   8 +-
 pkg/health/etcdmember/check_ready.go      |   2 +-
 pkg/health/etcdmember/check_ready_test.go |   7 +-
 pkg/health/status/check.go                |  13 ++
 pkg/predicate/predicate.go                |  31 ++++
 pkg/predicate/predicate_test.go           |  66 ++++++-
 pkg/utils/miscellaneous.go                |   1 +
 13 files changed, 601 insertions(+), 126 deletions(-)

diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml
index 4fb453b04..425eb3e38 100644
--- a/config/rbac/role.yaml
+++ b/config/rbac/role.yaml
@@ -7,151 +7,69 @@ metadata:
   name: manager-role
 rules:
 - apiGroups:
-  - ""
+  - batch
   resources:
-  - pods
+  - jobs
   verbs:
-  - list
-  - watch
+  - create
   - delete
-- apiGroups:
-  - ""
-  resources:
-  - secrets
-  - endpoints
-  verbs:
   - get
   - list
   - patch
   - update
   - watch
 - apiGroups:
-  - ""
+  - coordination.k8s.io
   resources:
-  - events
+  - leases
   verbs:
   - create
+  - delete
+  - deletecollection
   - get
   - list
-  - watch
   - patch
   - update
-- apiGroups:
-  - ""
-  resources:
-  - serviceaccounts
-  verbs:
-  - get
-  - list
   - watch
-  - create
-  - update
-  - patch
-  - delete
 - apiGroups:
-  - rbac.authorization.k8s.io
+  - druid.gardener.cloud
   resources:
-  - roles
-  - rolebindings
+  - etcds
   verbs:
-  - get
-  - list
-  - watch
   - create
-  - update
-  - patch
   - delete
-- apiGroups:
-  - ""
-  - apps
-  resources:
-  - services
-  - configmaps
-  - statefulsets
-  verbs:
   - get
   - list
   - patch
   - update
   - watch
-  - create
-  - delete
-- apiGroups:
-  - batch
-  resources:
-  - jobs
-  verbs:
-  - get
-  - list
-  - watch
-  - create
-  - update
-  - patch
-  - delete
-- apiGroups:
-  - batch
-  resources:
-  - cronjobs
-  verbs:
-  - get
-  - list
-  - watch
-  - delete
-- apiGroups:
-  - druid.gardener.cloud
-  resources:
-  - etcds
-  - etcdcopybackupstasks
-  verbs:
-  - get
-  - list
-  - watch
-  - create
-  - update
-  - patch
-  - delete
 - apiGroups:
   - druid.gardener.cloud
   resources:
   - etcds/status
-  - etcds/finalizers
-  - etcdcopybackupstasks/status
-  - etcdcopybackupstasks/finalizers
   verbs:
   - get
-  - update
   - patch
-  - create
-- apiGroups:
-  - coordination.k8s.io
-  resources:
-  - leases
-  verbs:
-  - get
-  - list
-  - watch
-  - create
   - update
-  - patch
-  - delete
-  - deletecollection
 - apiGroups:
-  - ""
+  - druid.gardener.cloud
   resources:
-  - persistentvolumeclaims
+  - secrets
   verbs:
   - get
   - list
+  - patch
+  - update
   - watch
 - apiGroups:
   - policy
   resources:
   - poddisruptionbudgets
   verbs:
+  - create
+  - delete
   - get
   - list
-  - watch
-  - create
-  - update
   - patch
-  - delete
+  - update
+  - watch
diff --git a/controllers/config/custodian.go b/controllers/config/custodian.go
index f964fdd41..68778a699 100644
--- a/controllers/config/custodian.go
+++ b/controllers/config/custodian.go
@@ -22,6 +22,8 @@ type EtcdCustodianController struct {
 	EtcdMember EtcdMemberConfig
 	// SyncPeriod is the duration after which re-enqueuing happens.
 	SyncPeriod time.Duration
+	// EnableAutomaticQuorumLossHandling is the flag to enable automatic handling of quorum loss. It is false by default.
+	EnableAutomaticQuorumLossHandling bool
 }
 
 type EtcdMemberConfig struct {
diff --git a/controllers/controllers_suite_test.go b/controllers/controllers_suite_test.go
index 147c871a1..5ad5f8bec 100644
--- a/controllers/controllers_suite_test.go
+++ b/controllers/controllers_suite_test.go
@@ -51,6 +51,7 @@ var (
 	mgrStopped *sync.WaitGroup
 
 	activeDeadlineDuration   time.Duration
+	waitDuration             time.Duration
 	backupCompactionSchedule = "15 */24 * * *"
 
 	revertFns []func()
@@ -102,7 +103,9 @@ var _ = BeforeSuite(func(done Done) {
 	})
 	Expect(err).NotTo(HaveOccurred())
 
-	er, err := NewEtcdReconcilerWithImageVector(mgr, false)
+	waitDuration, err = time.ParseDuration("10s")
+	Expect(err).NotTo(HaveOccurred())
+	er, err := NewEtcdReconcilerWithImageVector(mgr, false, waitDuration)
 	Expect(err).NotTo(HaveOccurred())
 
 	err = er.SetupWithManager(mgr, 5, true)
@@ -114,8 +117,9 @@ var _ = BeforeSuite(func(done Done) {
 	Expect(err).NotTo(HaveOccurred())
 
 	custodian := NewEtcdCustodian(mgr, controllersconfig.EtcdCustodianController{
+		SyncPeriod: 10 * time.Second,
 		EtcdMember: controllersconfig.EtcdMemberConfig{
-			EtcdMemberNotReadyThreshold: 1 * time.Minute,
+			EtcdMemberNotReadyThreshold: 20 * time.Second,
 		},
 	})
@@ -138,7 +142,7 @@
 	})
 	Expect(err).NotTo(HaveOccurred())
 
-	err = lc.SetupWithManager(mgr, 1)
+	err = lc.SetupWithManager(mgr, 5)
 	Expect(err).NotTo(HaveOccurred())
 
 	mgrStopped = startTestManager(mgrCtx, mgr)
diff --git a/controllers/etcd_controller.go b/controllers/etcd_controller.go
index 382840e9a..425d47553 100644
--- a/controllers/etcd_controller.go
+++ b/controllers/etcd_controller.go
@@ -28,10 +28,10 @@ import (
 	componentconfigmap "github.com/gardener/etcd-druid/pkg/component/etcd/configmap"
 	componentlease "github.com/gardener/etcd-druid/pkg/component/etcd/lease"
 	componentservice "github.com/gardener/etcd-druid/pkg/component/etcd/service"
-	"github.com/gardener/etcd-druid/pkg/component/etcd/statefulset"
 	componentsts "github.com/gardener/etcd-druid/pkg/component/etcd/statefulset"
 	druidpredicates "github.com/gardener/etcd-druid/pkg/predicate"
 	"github.com/gardener/etcd-druid/pkg/utils"
+	coordinationv1 "k8s.io/api/coordination/v1"
 
 	extensionspredicate "github.com/gardener/gardener/extensions/pkg/predicate"
 	v1beta1constants "github.com/gardener/gardener/pkg/apis/core/v1beta1/constants"
@@ -41,6 +41,7 @@ import (
 	gardenercomponent "github.com/gardener/gardener/pkg/operation/botanist/component"
 	"github.com/gardener/gardener/pkg/utils/imagevector"
 	kutil "github.com/gardener/gardener/pkg/utils/kubernetes"
+	gardenerretry "github.com/gardener/gardener/pkg/utils/retry"
 	"github.com/go-logr/logr"
 	appsv1 "k8s.io/api/apps/v1"
 	batchv1beta1 "k8s.io/api/batch/v1beta1"
@@ -105,31 +106,25 @@ type EtcdReconciler struct {
 	ImageVector                        imagevector.ImageVector
 	logger                             logr.Logger
 	disableEtcdServiceAccountAutomount bool
-}
-
-// NewReconcilerWithImageVector creates a new EtcdReconciler object with an image vector
-func NewReconcilerWithImageVector(mgr manager.Manager, disableEtcdServiceAccountAutomount bool) (*EtcdReconciler, error) {
-	etcdReconciler, err := NewEtcdReconciler(mgr, disableEtcdServiceAccountAutomount)
-	if err != nil {
-		return nil, err
-	}
-	return etcdReconciler.InitializeControllerWithImageVector()
+	// waitTimeForTests allows some waiting time between certain operations; it helps some unit tests.
+	waitTimeForTests time.Duration
 }
 
 // NewEtcdReconciler creates a new EtcdReconciler object
-func NewEtcdReconciler(mgr manager.Manager, disableEtcdServiceAccountAutomount bool) (*EtcdReconciler, error) {
+func NewEtcdReconciler(mgr manager.Manager, disableEtcdServiceAccountAutomount bool, waitTimeForTests time.Duration) (*EtcdReconciler, error) {
 	return (&EtcdReconciler{
 		Client:                             mgr.GetClient(),
 		Config:                             mgr.GetConfig(),
 		Scheme:                             mgr.GetScheme(),
 		logger:                             log.Log.WithName("etcd-controller"),
 		disableEtcdServiceAccountAutomount: disableEtcdServiceAccountAutomount,
+		waitTimeForTests:                   waitTimeForTests,
 	}).InitializeControllerWithChartApplier()
 }
 
 // NewEtcdReconcilerWithImageVector creates a new EtcdReconciler object
-func NewEtcdReconcilerWithImageVector(mgr manager.Manager, disableEtcdServiceAccountAutomount bool) (*EtcdReconciler, error) {
-	ec, err := NewEtcdReconciler(mgr, disableEtcdServiceAccountAutomount)
+func NewEtcdReconcilerWithImageVector(mgr manager.Manager, disableEtcdServiceAccountAutomount bool, waitTimeForTests time.Duration) (*EtcdReconciler, error) {
+	ec, err := NewEtcdReconciler(mgr, disableEtcdServiceAccountAutomount, waitTimeForTests)
 	if err != nil {
 		return nil, err
 	}
@@ -206,11 +201,15 @@ func (r *EtcdReconciler) SetupWithManager(mgr ctrl.Manager, workers int, ignoreO
 func buildPredicate(ignoreOperationAnnotation bool) predicate.Predicate {
 	if ignoreOperationAnnotation {
-		return predicate.GenerationChangedPredicate{}
+		return predicate.Or(
+			predicate.GenerationChangedPredicate{},
+			druidpredicates.HasQuorumLossAnnotation(),
+		)
 	}
 
 	return predicate.Or(
 		druidpredicates.HasOperationAnnotation(),
+		druidpredicates.HasQuorumLossAnnotation(),
 		druidpredicates.LastOperationNotSuccessful(),
 		extensionspredicate.IsDeleting(),
 	)
@@ -238,6 +237,102 @@ func (r *EtcdReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.
 	if !etcd.DeletionTimestamp.IsZero() {
 		return r.delete(ctx, etcd)
 	}
+
+	// Check if the annotation for quorum loss is present in the ETCD annotations.
+	// If yes, take the necessary actions.
+	if val, ok := etcd.Annotations[druidpredicates.QuorumLossAnnotation]; ok {
+		if val == "true" {
+			// scale down the statefulset to 0
+			sts := &appsv1.StatefulSet{}
+			err := r.Get(ctx, req.NamespacedName, sts)
+			if err != nil {
+				return ctrl.Result{
+					RequeueAfter: 10 * time.Second,
+				}, fmt.Errorf("could not fetch statefulset though the annotation druid.gardener.cloud/quorum-loss is set in the ETCD CR: %v", err)
+			}
+
+			r.logger.Info("Scaling down the statefulset to 0 while tackling quorum loss scenario in ETCD multi node cluster")
+			if _, err := controllerutils.GetAndCreateOrStrategicMergePatch(ctx, r.Client, sts, func() error {
+				sts.Spec.Replicas = pointer.Int32(0)
+				return nil
+			}); err != nil {
+				return ctrl.Result{
+					RequeueAfter: 10 * time.Second,
+				}, fmt.Errorf("could not scale down statefulset to 0 while tackling quorum loss scenario in ETCD multi node cluster: %v", err)
+			}
+			time.Sleep(r.waitTimeForTests)
+
+			r.logger.Info("Deleting PVCs while tackling quorum loss scenario in ETCD multi node cluster")
+			// delete the PVCs
+			if err := r.DeleteAllOf(ctx, &corev1.PersistentVolumeClaim{},
+				client.InNamespace(sts.GetNamespace()),
+				client.MatchingLabels(getMatchingLabels(sts))); client.IgnoreNotFound(err) != nil {
+				return ctrl.Result{
+					RequeueAfter: 10 * time.Second,
+				}, fmt.Errorf("could not delete PVCs while tackling quorum loss scenario in ETCD multi node cluster: %v", err)
+			}
+
+			r.logger.Info("Setting the renew time of all member leases to nil")
+			leases := &coordinationv1.LeaseList{}
+			if err := r.List(ctx, leases, client.InNamespace(etcd.Namespace), client.MatchingLabels{
+				common.GardenerOwnedBy: etcd.Name, v1beta1constants.GardenerPurpose: componentlease.PurposeMemberLease}); err != nil {
+				return ctrl.Result{RequeueAfter: 10 * time.Second}, fmt.Errorf("could not list member leases while tackling quorum loss scenario in ETCD multi node cluster: %v", err)
+			}
+
+			for _, lease := range leases.Items {
+				withNilRenewTime := lease.DeepCopy()
+				base := withNilRenewTime.DeepCopy()
+				withNilRenewTime.Spec.RenewTime = nil
+				err := r.Patch(ctx, withNilRenewTime, client.MergeFrom(base))
+
+				if err != nil {
+					return ctrl.Result{
+						RequeueAfter: 10 * time.Second,
+					}, fmt.Errorf("could not set the renew time of all member leases to nil: %v", err)
+				}
+			}
+
+			r.logger.Info("Scaling up the statefulset to 1 while tackling quorum loss scenario in ETCD multi node cluster")
+			// scale up the statefulset to 1
+			if _, err := controllerutils.GetAndCreateOrStrategicMergePatch(ctx, r.Client, sts, func() error {
+				sts.Spec.Replicas = pointer.Int32(1)
+				return nil
+			}); err != nil {
+				return ctrl.Result{
+					RequeueAfter: 10 * time.Second,
+				}, fmt.Errorf("could not scale up statefulset to 1 while tackling quorum loss scenario in ETCD multi node cluster: %v", err)
+			}
+
+			if err := r.waitUntilStatefulSetReady(ctx, r.logger, req.NamespacedName, 1); err != nil {
+				return ctrl.Result{
+					RequeueAfter: 10 * time.Second,
+				}, fmt.Errorf("statefulset with 1 replica is not ready yet while tackling quorum loss scenario in ETCD multi node cluster: %v", err)
+			}
+
+			// scale up the statefulset to the ETCD replicas
+			r.logger.Info("Scaling up the statefulset to the number of replicas mentioned in the ETCD spec")
+			if _, err := controllerutils.GetAndCreateOrStrategicMergePatch(ctx, r.Client, sts, func() error {
+				sts.Spec.Replicas = &etcd.Spec.Replicas
+				return nil
+			}); err != nil {
+				return ctrl.Result{
+					RequeueAfter: 10 * time.Second,
+				}, fmt.Errorf("could not scale up statefulset to the desired replicas while tackling quorum loss scenario in ETCD multi node cluster: %v", err)
+			}
+			if err = r.removeQuorumLossAnnotation(ctx, r.logger, etcd); err != nil {
+				if apierrors.IsNotFound(err) {
+					return ctrl.Result{}, nil
+				}
+				return ctrl.Result{
+					Requeue: true,
+				}, err
+			}
+			return ctrl.Result{
+				Requeue: false,
+			}, nil
+		}
+	}
+
 	return r.reconcile(ctx, etcd)
 }
@@ -676,7 +771,7 @@ func (r *EtcdReconciler) reconcileEtcd(ctx context.Context, logger logr.Logger,
 		return nil, nil, err
 	}
 
-	statefulSetValues := statefulset.GenerateValues(etcd,
+	statefulSetValues := componentsts.GenerateValues(etcd,
 		&serviceValues.ClientPort,
 		&serviceValues.ServerPort,
 		&serviceValues.BackupPort,
@@ -692,10 +787,35 @@ func (r *EtcdReconciler) reconcileEtcd(ctx context.Context, logger logr.Logger,
 		deployWaiter = gardenercomponent.OpWaiter(stsDeployer)
 	)
 
+	if _, err := stsDeployer.Get(ctx); apierrors.IsNotFound(err) {
+
+		logger.Info("Statefulset is not created yet, so this is the bootstrap case. Adding the bootstrap annotation to the ETCD CR")
+		withoutBootstrapAnnotation := etcd.DeepCopy()
+		annotations := make(map[string]string)
+		if etcd.Annotations != nil {
+			for key, value := range etcd.Annotations {
+				annotations[key] = value
+			}
+		}
+		// Set the annotation on the ETCD CR to mark the bootstrap phase
+		annotations[utils.BootstrapAnnotation] = "true"
+		etcd.Annotations = annotations
+		logger.Info(fmt.Sprintf("Updating ETCD with the annotation %v", utils.BootstrapAnnotation))
+		if err := r.Patch(ctx, etcd, client.MergeFrom(withoutBootstrapAnnotation)); err != nil {
+			return nil, nil, err
+		}
+	}
+
 	if err := deployWaiter.Deploy(ctx); err != nil {
 		return nil, nil, err
 	}
 
+	if err = r.removeBootstrapAnnotation(ctx, r.logger, etcd); err != nil {
+		if !apierrors.IsNotFound(err) {
+			return nil, nil, err
+		}
+	}
+
 	sts, err := stsDeployer.Get(ctx)
 
 	return &serviceValues.ClientServiceName, sts, err
@@ -795,6 +915,15 @@ func clusterInBootstrap(etcd *druidv1alpha1.Etcd) bool {
 		(etcd.Spec.Replicas > 1 && etcd.Status.Replicas == 1)
 }
 
+func getMatchingLabels(sts *appsv1.StatefulSet) map[string]string {
+	labels := make(map[string]string)
+
+	labels["name"] = sts.Labels["name"]
+	labels["instance"] = sts.Labels["instance"]
+
+	return labels
+}
+
 func (r *EtcdReconciler) updateEtcdErrorStatus(ctx context.Context, etcd *druidv1alpha1.Etcd, sts *appsv1.StatefulSet, lastError error) error {
 	return controllerutils.TryUpdateStatus(ctx, retry.DefaultBackoff, r.Client, etcd, func() error {
 		lastErrStr := fmt.Sprintf("%v", lastError)
@@ -832,6 +961,40 @@ func (r *EtcdReconciler) updateEtcdStatus(ctx context.Context, etcd *druidv1alph
 	})
 }
 
+func (r *EtcdReconciler) waitUntilStatefulSetReady(ctx context.Context, logger logr.Logger, ns types.NamespacedName, replicas int32) error {
+	sts := &appsv1.StatefulSet{}
+	err := gardenerretry.UntilTimeout(ctx, DefaultInterval, DefaultTimeout, func(ctx context.Context) (bool, error) {
+		if err := r.Get(ctx, ns, sts); err != nil {
+			if apierrors.IsNotFound(err) {
+				return gardenerretry.MinorError(err)
+			}
+			return gardenerretry.SevereError(err)
+		}
+		if err := checkStatefulSet(sts, replicas); err != nil {
+			return gardenerretry.MinorError(err)
+		}
+		return gardenerretry.Ok()
+	})
+
+	return err
+}
+
+// checkStatefulSet checks whether the given StatefulSet is healthy.
+// A StatefulSet is considered healthy if its controller has observed
+// its most recent generation and if the number of ready replicas is
+// at least the desired number of replicas passed to this function.
+func checkStatefulSet(statefulSet *appsv1.StatefulSet, replicas int32) error {
+	if statefulSet.Status.ObservedGeneration < statefulSet.Generation {
+		return fmt.Errorf("observed generation outdated (%d/%d)", statefulSet.Status.ObservedGeneration, statefulSet.Generation)
+	}
+
+	if statefulSet.Status.ReadyReplicas < replicas {
+		return fmt.Errorf("not enough ready replicas (%d/%d)", statefulSet.Status.ReadyReplicas, replicas)
+	}
+
+	return nil
+}
+
 func (r *EtcdReconciler) removeOperationAnnotation(ctx context.Context, logger logr.Logger, etcd *druidv1alpha1.Etcd) error {
 	if _, ok := etcd.Annotations[v1beta1constants.GardenerOperation]; ok {
 		logger.Info("Removing operation annotation")
@@ -842,6 +1005,26 @@ func (r *EtcdReconciler) removeOperationAnnotation(ctx context.Context, logger l
 	return nil
 }
 
+func (r *EtcdReconciler) removeQuorumLossAnnotation(ctx context.Context, logger logr.Logger, etcd *druidv1alpha1.Etcd) error {
+	if _, ok := etcd.Annotations[druidpredicates.QuorumLossAnnotation]; ok {
+		logger.Info("Removing quorum loss annotation")
+		withQlAnnotation := etcd.DeepCopy()
+		delete(etcd.Annotations, druidpredicates.QuorumLossAnnotation)
+		return r.Patch(ctx, etcd, client.MergeFrom(withQlAnnotation))
+	}
+	return nil
+}
+
+func (r *EtcdReconciler) removeBootstrapAnnotation(ctx context.Context, logger logr.Logger, etcd *druidv1alpha1.Etcd) error {
+	if _, ok := etcd.Annotations[utils.BootstrapAnnotation]; ok {
+		logger.Info("Removing bootstrap annotation")
+		withBsAnnotation := etcd.DeepCopy()
+		delete(etcd.Annotations, utils.BootstrapAnnotation)
+		return r.Patch(ctx, etcd, client.MergeFrom(withBsAnnotation))
+	}
+	return nil
+}
+
 func (r *EtcdReconciler) updateEtcdStatusAsNotReady(ctx context.Context, etcd *druidv1alpha1.Etcd) (*druidv1alpha1.Etcd, error) {
 	err := controllerutils.TryUpdateStatus(ctx, retry.DefaultBackoff, r.Client, etcd, func() error {
 		etcd.Status.Ready = nil
diff --git a/controllers/etcd_controller_test.go b/controllers/etcd_controller_test.go
index 26a63c2dd..f0dc804ff 100644
--- a/controllers/etcd_controller_test.go
+++ b/controllers/etcd_controller_test.go
@@ -461,9 +461,18 @@ var _ = Describe("Druid", func() {
 			rb = &rbac.RoleBinding{}
 			Eventually(func() error { return roleBindingIsCorrectlyReconciled(c, instance, rb) }, timeout, pollingInterval).Should(BeNil())
 
+			Eventually(func() error { return setRenewTimeForMemberLeases(c, instance) }, timeout, pollingInterval).Should(BeNil())
+
 			validate(instance, s, cm, clSvc, prSvc)
 			validateRole(instance, role)
 
+			req := types.NamespacedName{
+				Name:      instance.Name,
+				Namespace: instance.Namespace,
+			}
+
+			err = c.Get(context.TODO(), req, s)
+			Expect(err).NotTo(HaveOccurred())
 			setStatefulSetReady(s)
 			err = c.Status().Update(context.TODO(), s)
 			Expect(err).NotTo(HaveOccurred())
@@ -709,7 +718,10 @@ var _ = Describe("Multinode ETCD", func() {
 			svc = &corev1.Service{}
 			Eventually(func() error { return clientServiceIsCorrectlyReconciled(c, instance, svc) }, timeout, pollingInterval).Should(BeNil())
 
+			Eventually(func() error { return setRenewTimeForMemberLeases(c, instance) }, timeout, pollingInterval).Should(BeNil())
+
 			// Validate statefulset
+			Expect(sts.Spec.Replicas).ShouldNot(BeNil())
 			Expect(*sts.Spec.Replicas).To(Equal(int32(instance.Spec.Replicas)))
 
 			if instance.Spec.Replicas == 1 {
@@ -727,6 +739,177 @@ var _ = Describe("Multinode ETCD", func() {
 	)
 })
 
+var (
+	unknownThreshold  = 300 * time.Second
+	notReadyThreshold = 60 * time.Second
+	expire            = time.Minute * 3
+)
+
+var _ = Describe("Quorum Loss Scenario", func() {
+	Context("when quorum is lost for multinode ETCD cluster", func() {
+		var (
+			err      error
+			instance *druidv1alpha1.Etcd
+			c        client.Client
+			s        *appsv1.StatefulSet
+			cm       *corev1.ConfigMap
+			svc      *corev1.Service
+		)
+		BeforeEach(func() {
+			instance = getMultinodeEtcdDefault("foo85", "default")
+			c = mgr.GetClient()
+			ns := corev1.Namespace{
+				ObjectMeta: metav1.ObjectMeta{
+					Name: instance.Namespace,
+				},
+			}
+
+			_, err = controllerutil.CreateOrUpdate(context.TODO(), c, &ns, func() error { return nil })
+			Expect(err).To(Not(HaveOccurred()))
+
+			err = c.Create(context.TODO(), instance)
+			Expect(err).NotTo(HaveOccurred())
+			s = &appsv1.StatefulSet{}
+			Eventually(func() error { return statefulsetIsCorrectlyReconciled(c, instance, s) }, timeout, pollingInterval).Should(BeNil())
+			setStatefulSetReady(s)
+			err = c.Status().Update(context.TODO(), s)
+			Expect(err).NotTo(HaveOccurred())
+
+			cm = &corev1.ConfigMap{}
+			Eventually(func() error { return configMapIsCorrectlyReconciled(c, instance, cm) }, timeout, pollingInterval).Should(BeNil())
+			svc = &corev1.Service{}
+			Eventually(func() error { return clientServiceIsCorrectlyReconciled(c, instance, svc) }, timeout, pollingInterval).Should(BeNil())
+		})
+		It("when the renew time of some of the member leases has expired", func() {
+			now := time.Now()
+			// Deliberately update the first member lease with the current time
+			memberLease := &coordinationv1.Lease{}
+			currentTime := metav1.NewMicroTime(time.Now())
+			Eventually(func() error { return fetchMemberLease(c, instance, memberLease, 0) }, timeout, pollingInterval).Should(BeNil())
+			err = controllerutils.TryUpdate(context.TODO(), retry.DefaultBackoff, c, memberLease, func() error {
+				memberLease.Spec.RenewTime = &currentTime
+				return nil
+			})
+			Expect(err).To(Not(HaveOccurred()))
+
+			// Deliberately update the second member lease with an expired time
+			memberLease = &coordinationv1.Lease{}
+			longExpirationTime := metav1.NewMicroTime(now.Add(-1 * unknownThreshold).Add(-1 * time.Second).Add(-1 * notReadyThreshold))
+			Eventually(func() error { return fetchMemberLease(c, instance, memberLease, 1) }, timeout, pollingInterval).Should(BeNil())
+			err = controllerutils.TryUpdate(context.TODO(), retry.DefaultBackoff, c, memberLease, func() error {
+				memberLease.Spec.RenewTime = &longExpirationTime
+				return nil
+			})
+			Expect(err).To(Not(HaveOccurred()))
+
+			// Deliberately update the third member lease with an expired time
+			memberLease = &coordinationv1.Lease{}
+			longExpirationTime = metav1.NewMicroTime(now.Add(-1 * unknownThreshold).Add(-1 * time.Second).Add(-1 * notReadyThreshold))
+			Eventually(func() error { return fetchMemberLease(c, instance, memberLease, 2) }, timeout, pollingInterval).Should(BeNil())
+			err = controllerutils.TryUpdate(context.TODO(), retry.DefaultBackoff, c, memberLease, func() error {
+				memberLease.Spec.RenewTime = &longExpirationTime
+				return nil
+			})
+			Expect(err).To(Not(HaveOccurred()))
+
+			// Check if the statefulset is scaled down to 0 replicas
+			s = &appsv1.StatefulSet{}
+			Eventually(func() error { return statefulsetIsScaled(c, instance, s, 0) }, timeout, pollingInterval).Should(BeNil())
+
+			// Check if the statefulset is scaled up to 1 replica
+			s = &appsv1.StatefulSet{}
+			Eventually(func() error { return statefulsetIsScaled(c, instance, s, 1) }, timeout, pollingInterval).Should(BeNil())
+			setStatefulSetReady(s)
+			err = c.Status().Update(context.TODO(), s)
+			Expect(err).NotTo(HaveOccurred())
+
+			// Check if the statefulset is scaled up to the etcd replicas
+			s = &appsv1.StatefulSet{}
+			Eventually(func() error { return statefulsetIsScaled(c, instance, s, instance.Spec.Replicas) }, timeout, pollingInterval).Should(BeNil())
+
+		})
+
+		AfterEach(func() {
+			Expect(c.Delete(context.TODO(), instance)).To(Succeed())
+			Eventually(func() error { return statefulSetRemoved(c, s) }, timeout, pollingInterval).Should(BeNil())
+			Eventually(func() error { return etcdRemoved(c, instance) }, timeout, pollingInterval).Should(BeNil())
+		})
+	})
+})
+
+func getMultinodeEtcdDefault(name, namespace string) *druidv1alpha1.Etcd {
+	instance := &druidv1alpha1.Etcd{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      name,
+			Namespace: namespace,
+		},
+		Spec: druidv1alpha1.EtcdSpec{
+			Annotations: map[string]string{
+				"app":      "etcd-statefulset",
+				"role":     "test",
+				"instance": name,
+			},
+			Labels: map[string]string{
+				"name":     "etcd",
+				"instance": name,
+			},
+			Selector: &metav1.LabelSelector{
+				MatchLabels: map[string]string{
+					"name":     "etcd",
+					"instance": name,
+				},
+			},
+			Replicas: 3,
+			Backup:   druidv1alpha1.BackupSpec{},
+			Etcd:     druidv1alpha1.EtcdConfig{},
+			Common:   druidv1alpha1.SharedConfig{},
+		},
+	}
+	return instance
+}
+
+func fetchMemberLease(c client.Client, instance *druidv1alpha1.Etcd, lease *coordinationv1.Lease, replica int) error {
+	ctx, cancel := context.WithTimeout(context.TODO(), timeout)
+	defer cancel()
+	req := types.NamespacedName{
+		Name:      memberLeaseName(instance.Name, replica),
+		Namespace: instance.Namespace,
+	}
+
+	if err := c.Get(ctx, req, lease); err != nil {
+		return err
+	}
+
+	if !checkEtcdOwnerReference(lease.GetOwnerReferences(), instance) {
+		return fmt.Errorf("ownerReference does not exist for lease")
+	}
+	return nil
+}
+
+func memberLeaseName(etcdName string, replica int) string {
+	return fmt.Sprintf("%s-%d", etcdName, replica)
+}
+
+func statefulsetIsScaled(c client.Client, instance *druidv1alpha1.Etcd, ss *appsv1.StatefulSet, replicas int32) error {
+	ctx, cancel := context.WithTimeout(context.TODO(), expire)
+	defer cancel()
+	req := types.NamespacedName{
+		Name:      instance.Name,
+		Namespace: instance.Namespace,
+	}
+
+	if err := c.Get(ctx, req, ss); err != nil {
+		return err
+	}
+
+	stsReplicas := *ss.Spec.Replicas
+	if stsReplicas != replicas {
+		return fmt.Errorf("statefulset replicas are still %d instead of %d", stsReplicas, replicas)
+	}
+
+	return nil
+}
+
 func validateRole(instance *druidv1alpha1.Etcd, role *rbac.Role) {
 	Expect(*role).To(MatchFields(IgnoreExtras, Fields{
 		"ObjectMeta": MatchFields(IgnoreExtras, Fields{
@@ -1973,6 +2156,36 @@ func statefulsetIsCorrectlyReconciled(c client.Client, instance *druidv1alpha1.E
 	return nil
 }
 
+func setRenewTimeForMemberLeases(c client.Client, instance *druidv1alpha1.Etcd) error {
+	ctx, cancel := context.WithTimeout(context.TODO(), timeout)
+	defer cancel()
+
+	for i := 0; i < int(instance.Spec.Replicas); i++ {
+		leaseName := memberLeaseName(instance.Name, i)
+
+		req := types.NamespacedName{
+			Name:      leaseName,
+			Namespace: instance.Namespace,
+		}
+
+		ls := &coordinationv1.Lease{}
+		if err := c.Get(ctx, req, ls); err != nil {
+			return err
+		}
+
+		setTime := metav1.NewMicroTime(time.Now())
+		if err := controllerutils.TryUpdate(context.TODO(), retry.DefaultBackoff, c, ls, func() error {
+			ls.Spec.RenewTime = &setTime
+			return nil
+		}); err != nil {
+			return err
+		}
+
+	}
+
+	return nil
+}
+
 func configMapIsCorrectlyReconciled(c client.Client, instance *druidv1alpha1.Etcd, cm *corev1.ConfigMap) error {
 	ctx, cancel := context.WithTimeout(context.TODO(), timeout)
 	defer cancel()
diff --git a/controllers/etcd_custodian_controller.go b/controllers/etcd_custodian_controller.go
index 035d7b9ee..20fbb09dc 100644
--- a/controllers/etcd_custodian_controller.go
+++ b/controllers/etcd_custodian_controller.go
@@ -48,6 +48,7 @@ import (
 	controllersconfig "github.com/gardener/etcd-druid/controllers/config"
 	"github.com/gardener/etcd-druid/pkg/health/status"
 	druidmapper "github.com/gardener/etcd-druid/pkg/mapper"
+	"github.com/gardener/etcd-druid/pkg/predicate"
 	druidpredicates "github.com/gardener/etcd-druid/pkg/predicate"
 	"github.com/gardener/etcd-druid/pkg/utils"
 )
@@ -87,8 +88,21 @@ func (ec *EtcdCustodian) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.
 		return ctrl.Result{}, err
 	}
 
+	if !etcd.DeletionTimestamp.IsZero() {
+		return ctrl.Result{}, nil
+	}
+
 	logger := ec.logger.WithValues("etcd", kutil.Key(etcd.Namespace, etcd.Name).String())
 
+	if val, ok := etcd.Annotations[predicate.QuorumLossAnnotation]; ok {
+		if val == "true" {
+			logger.Info("Requeue item after 30 seconds because the annotation druid.gardener.cloud/quorum-loss is set in the ETCD CR, which means a corrective measure for quorum loss in the multi node scenario is being taken")
+			return ctrl.Result{
+				RequeueAfter: 30 * time.Second,
+			}, nil
+		}
+	}
+
 	if etcd.Status.LastError != nil && *etcd.Status.LastError != "" {
 		logger.Info(fmt.Sprintf("Requeue item because of last error: %v", *etcd.Status.LastError))
 		return ctrl.Result{
@@ -108,6 +122,37 @@ func (ec *EtcdCustodian) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.
 		return ctrl.Result{}, err
 	}
 
+	conLength := len(etcd.Status.Conditions)
+	if conLength > 0 && etcd.Status.Conditions[conLength-1].Reason == "QuorumLost" && etcd.Spec.Replicas > 1 {
+		logger.Info("Quorum loss detected. Taking measures to fix it.")
+		if !ec.config.EnableAutomaticQuorumLossHandling {
+			logger.Info("As automatic quorum loss handling is turned off in Druid, quorum loss needs to be handled by an operator. Please follow the operator playbook.")
+			// If the flag to automatically handle quorum loss is not turned on, allow some time before requeuing the custodian controller
+			return ctrl.Result{
+				RequeueAfter: 2 * time.Minute,
+			}, nil
+		}
+
+		withoutQlAnnotation := etcd.DeepCopy()
+		annotations := make(map[string]string)
+		if etcd.Annotations != nil {
+			for key, value := range etcd.Annotations {
+				annotations[key] = value
+			}
+		}
+		// Set the annotation on the ETCD CR to trigger the corrective measure
+		annotations[predicate.QuorumLossAnnotation] = "true"
+		etcd.Annotations = annotations
+		logger.Info(fmt.Sprintf("Updating ETCD with the annotation %v", predicate.QuorumLossAnnotation))
+		if err := ec.Patch(ctx, etcd, client.MergeFrom(withoutQlAnnotation)); err != nil {
+			return ctrl.Result{}, err
+		}
+		// Allow some time for the ETCD controller to fix the quorum loss
+		return ctrl.Result{
+			RequeueAfter: 2 * time.Minute,
+		}, nil
+	}
+
 	refMgr := NewEtcdDruidRefManager(ec.Client, ec.Scheme, etcd, selector, etcdGVK, nil)
 
 	stsList, err := refMgr.FetchStatefulSet(ctx, etcd)
diff --git a/main.go b/main.go
index 4936bfc90..2276a875a 100644
--- a/main.go
+++ b/main.go
@@ -53,6 +53,7 @@ func main() {
 		activeDeadlineDuration             time.Duration
 		ignoreOperationAnnotation          bool
 		disableEtcdServiceAccountAutomount bool
+		enableAutomaticQuorumLossHandling  bool
 
 		etcdMemberNotReadyThreshold time.Duration
 		etcdMemberUnknownThreshold  time.Duration
@@ -82,6 +83,7 @@ func main() {
 	flag.BoolVar(&ignoreOperationAnnotation, "ignore-operation-annotation", true, "Ignore the operation annotation or not.")
 	flag.DurationVar(&etcdMemberNotReadyThreshold, "etcd-member-notready-threshold", 5*time.Minute, "Threshold after which an etcd member is considered not ready if the status was unknown before.")
 	flag.BoolVar(&disableEtcdServiceAccountAutomount, "disable-etcd-serviceaccount-automount", false, "If true then .automountServiceAccountToken will be set to false for the ServiceAccount created for etcd statefulsets.")
+	flag.BoolVar(&enableAutomaticQuorumLossHandling, "enable-automatic-quorum-loss-handling", false, "If true, the quorum loss case will be handled automatically. Defaults to false.")
 	flag.DurationVar(&etcdMemberUnknownThreshold, "etcd-member-unknown-threshold", 1*time.Minute, "Threshold after which an etcd member is considered unknown.")
 
 	flag.Parse()
@@ -109,7 +111,8 @@ func main() {
 		os.Exit(1)
 	}
 
-	etcd, err := controllers.NewEtcdReconcilerWithImageVector(mgr, disableEtcdServiceAccountAutomount)
+	waitDuration := 0 * time.Second // no artificial waiting between operations outside of tests
+	etcd, err := controllers.NewEtcdReconcilerWithImageVector(mgr, disableEtcdServiceAccountAutomount, waitDuration)
 	if err != nil {
 		setupLog.Error(err, "Unable to initialize etcd controller with image vector")
 		os.Exit(1)
@@ -132,7 +135,8 @@ func main() {
 			EtcdMemberNotReadyThreshold: etcdMemberNotReadyThreshold,
 			EtcdMemberUnknownThreshold:  etcdMemberUnknownThreshold,
 		},
-		SyncPeriod: custodianSyncPeriod,
+		SyncPeriod:                        custodianSyncPeriod,
+		EnableAutomaticQuorumLossHandling: enableAutomaticQuorumLossHandling,
 	})
 
 	if err := custodian.SetupWithManager(ctx, mgr, custodianWorkers, ignoreOperationAnnotation); err != nil {
diff --git a/pkg/health/etcdmember/check_ready.go b/pkg/health/etcdmember/check_ready.go
index b917f7ed8..93bbf534d 100644
--- a/pkg/health/etcdmember/check_ready.go
+++ b/pkg/health/etcdmember/check_ready.go
@@ -73,7 +73,7 @@ func (r *readyCheck) Check(ctx context.Context, etcd druidv1alpha1.Etcd) []Resul
 		renew := lease.Spec.RenewTime
 		if renew == nil {
 			r.logger.Info("Member hasn't acquired lease yet, still in bootstrapping phase", "name", lease.Name)
-			continue
+			return []Result{}
 		}
 
 		// Check if member state must be considered as not ready
diff --git a/pkg/health/etcdmember/check_ready_test.go b/pkg/health/etcdmember/check_ready_test.go
index 9b4b1a23e..8ecd6cbd7 100644
--- a/pkg/health/etcdmember/check_ready_test.go
+++ b/pkg/health/etcdmember/check_ready_test.go
@@ -389,17 +389,14 @@ var _ = Describe("ReadyCheck", func() {
 				}
 			})
 
-			It("should only contain members which acquired lease once", func() {
+			It("should not contain any members even if a lease was acquired once", func() {
 				defer test.WithVar(&TimeNow, func() time.Time {
 					return now
 				})()
 
 				results := check.Check(ctx, etcd)
 
-				Expect(results).To(HaveLen(1))
-				Expect(results[0].Status()).To(Equal(druidv1alpha1.EtcdMemberStatusReady))
-				Expect(results[0].ID()).To(Equal(member1ID))
-				Expect(results[0].Role()).To(gstruct.PointTo(Equal(druidv1alpha1.EtcdRoleLeader)))
+				Expect(results).To(HaveLen(0))
 			})
 		})
 	})
diff --git a/pkg/health/status/check.go b/pkg/health/status/check.go
index b167608f1..1824b11d9 100644
--- a/pkg/health/status/check.go
+++ b/pkg/health/status/check.go
@@ -28,6 +28,7 @@ import (
 	controllersconfig "github.com/gardener/etcd-druid/controllers/config"
 	"github.com/gardener/etcd-druid/pkg/health/condition"
 	"github.com/gardener/etcd-druid/pkg/health/etcdmember"
+	"github.com/gardener/etcd-druid/pkg/utils"
 )
 
 // ConditionCheckFn is a type alias for a function which returns an implementation of `Check`.
@@ -72,10 +73,15 @@ func (c *checker) Check(ctx context.Context, logger logr.Logger, etcd *druidv1al
 		return err
 	}
 
+	logger.Info("ETCD member checks are done")
+
 	// Execute condition checks after the etcd member checks because we need their result here.
 	if err := c.executeConditionChecks(ctx, etcd); err != nil {
 		return err
 	}
+
+	logger.Info("ETCD condition check is done")
+
 	return nil
 }
 
@@ -120,6 +126,13 @@ func (c *checker) executeConditionChecks(ctx context.Context, etcd *druidv1alpha
 
 // executeEtcdMemberChecks runs all registered etcd member checks **sequentially**.
 // The result of a check is passed via the `status` sub-resources to the next check.
 func (c *checker) executeEtcdMemberChecks(ctx context.Context, logger logr.Logger, etcd *druidv1alpha1.Etcd) error {
+	// Don't execute member checks while bootstrapping is going on
+	if val, ok := etcd.Annotations[utils.BootstrapAnnotation]; ok {
+		if val == "true" {
+			return nil
+		}
+	}
+
 	// Run etcd member checks sequentially as most of them act on multiple elements.
 	for _, newCheck := range c.etcdMemberCheckFns {
 		results := newCheck(c.cl, logger, c.config).Check(ctx, *etcd)
diff --git a/pkg/predicate/predicate.go b/pkg/predicate/predicate.go
index 145d22723..9065195a4 100644
--- a/pkg/predicate/predicate.go
+++ b/pkg/predicate/predicate.go
@@ -31,6 +31,10 @@ import (
 	druidv1alpha1 "github.com/gardener/etcd-druid/api/v1alpha1"
 )
 
+const (
+	QuorumLossAnnotation = "druid.gardener.cloud/quorum-loss" // set to "true" on an Etcd resource to trigger the quorum loss recovery flow
+)
+
 func hasOperationAnnotation(obj client.Object) bool {
 	return obj.GetAnnotations()[v1beta1constants.GardenerOperation] == v1beta1constants.GardenerOperationReconcile
 }
@@ -53,6 +57,33 @@ func HasOperationAnnotation() predicate.Predicate {
 	}
 }
 
+func hasQuorumLossAnnotation(obj client.Object) bool {
+	etcd, ok := obj.(*druidv1alpha1.Etcd)
+	if !ok {
+		return false
+	}
+	_, ok = etcd.Annotations[QuorumLossAnnotation]
+	return ok
+}
+
+// HasQuorumLossAnnotation is a predicate for the quorum loss annotation.
+func HasQuorumLossAnnotation() predicate.Predicate {
+	return predicate.Funcs{
+		CreateFunc: func(event event.CreateEvent) bool {
+			return hasQuorumLossAnnotation(event.Object)
+		},
+		UpdateFunc: func(event event.UpdateEvent) bool {
+			return hasQuorumLossAnnotation(event.ObjectNew)
+		},
+		GenericFunc: func(event event.GenericEvent) bool {
+			return hasQuorumLossAnnotation(event.Object)
+		},
+		DeleteFunc: func(event event.DeleteEvent) bool {
+			return true
+		},
+	}
+}
+
 // LastOperationNotSuccessful is a predicate for unsuccessful last operations for creation events.
 func LastOperationNotSuccessful() predicate.Predicate {
 	operationNotSucceeded := func(obj runtime.Object) bool {
diff --git a/pkg/predicate/predicate_test.go b/pkg/predicate/predicate_test.go
index f428871b4..909514058 100644
--- a/pkg/predicate/predicate_test.go
+++ b/pkg/predicate/predicate_test.go
@@ -261,17 +261,62 @@ var _ = Describe("Druid Predicate", func() {
 		})
 	})
 
+	Describe("#HasQuorumLossAnnotation", func() {
+		var pred predicate.Predicate
+
+		JustBeforeEach(func() {
+			pred = HasQuorumLossAnnotation()
+		})
+
+		Context("when has no quorum loss annotation", func() {
+			BeforeEach(func() {
+				obj = &druidv1alpha1.Etcd{
+					ObjectMeta: metav1.ObjectMeta{
+						Annotations: make(map[string]string),
+					},
+				}
+			})
+
+			It("should return false except for delete events", func() {
+				gomega.Expect(pred.Create(createEvent)).To(gomega.BeFalse())
+				gomega.Expect(pred.Update(updateEvent)).To(gomega.BeFalse())
+				gomega.Expect(pred.Delete(deleteEvent)).To(gomega.BeTrue())
+				gomega.Expect(pred.Generic(genericEvent)).To(gomega.BeFalse())
+			})
+		})
+
+		Context("when has quorum loss annotation", func() {
+			BeforeEach(func() {
+				obj = &druidv1alpha1.Etcd{
+					ObjectMeta: metav1.ObjectMeta{
+						Annotations: map[string]string{
+							QuorumLossAnnotation: "true",
+						},
+					},
+				}
+			})
+
+			It("should return true", func() {
+				gomega.Expect(pred.Create(createEvent)).To(gomega.BeTrue())
+				gomega.Expect(pred.Update(updateEvent)).To(gomega.BeTrue())
+				gomega.Expect(pred.Delete(deleteEvent)).To(gomega.BeTrue())
+				gomega.Expect(pred.Generic(genericEvent)).To(gomega.BeTrue())
+			})
+		})
+	})
+
 	Describe("#OR", func() {
 		var pred predicate.Predicate
 
 		JustBeforeEach(func() {
 			pred = predicate.Or(
 				HasOperationAnnotation(),
+				HasQuorumLossAnnotation(),
 				LastOperationNotSuccessful(),
 			)
 		})
 
-		Context("when has neither operation annotation nor last error", func() {
+		Context("when has neither operation annotation nor quorum loss annotation nor last error", func() {
 			BeforeEach(func() {
 				obj = &druidv1alpha1.Etcd{
 					ObjectMeta: metav1.ObjectMeta{
@@ -307,6 +352,25 @@ var _ = Describe("Druid Predicate", func() {
 			})
 		})
 
+		Context("when has quorum loss annotation", func() {
+			BeforeEach(func() {
+				obj = &druidv1alpha1.Etcd{
+					ObjectMeta: metav1.ObjectMeta{
+						Annotations: map[string]string{
+							QuorumLossAnnotation: "true",
+						},
+					},
+				}
+			})
+
+			It("should return true", func() {
+				gomega.Expect(pred.Create(createEvent)).To(gomega.BeTrue())
+				gomega.Expect(pred.Update(updateEvent)).To(gomega.BeTrue())
+				gomega.Expect(pred.Delete(deleteEvent)).To(gomega.BeTrue())
+				gomega.Expect(pred.Generic(genericEvent)).To(gomega.BeTrue())
+			})
+		})
+
 		Context("when has last error", func() {
 			BeforeEach(func() {
 				obj = &druidv1alpha1.Etcd{
diff --git a/pkg/utils/miscellaneous.go b/pkg/utils/miscellaneous.go
index 7028136c6..374d60d93 100644
--- a/pkg/utils/miscellaneous.go
+++ b/pkg/utils/miscellaneous.go
@@ -31,6 +31,7 @@ const (
 	// LocalProviderDefaultMountPath is the default path where the buckets directory is mounted.
 	LocalProviderDefaultMountPath = "/etc/gardener/local-backupbuckets"
+	BootstrapAnnotation = "druid.gardener.cloud/bootstrap" // set on the Etcd resource while the cluster is bootstrapping
 	// EtcdBackupSecretHostPath is the hostPath field in the etcd-backup secret.
 	EtcdBackupSecretHostPath = "hostPath"
 )
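
Usage sketch for the behaviour introduced above. The flag and annotation names are taken from this patch; the namespace and Etcd resource name below are illustrative placeholders, not values from the patch.

    # Opt in to automatic recovery when druid detects quorum loss:
    etcd-druid --enable-automatic-quorum-loss-handling=true

    # In principle the recovery flow can also be triggered manually, since the
    # etcd controller reconciles on the quorum-loss annotation:
    kubectl -n example-namespace annotate etcd example-etcd druid.gardener.cloud/quorum-loss="true"

Once the annotation is set, druid scales the StatefulSet down to 0, deletes the PVCs, clears the member lease renew times, scales back up to 1 and then to .spec.replicas, and finally removes the annotation.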