forked from ankilosaurus/kube_remediator
-
Notifications
You must be signed in to change notification settings - Fork 2
/
crashloopbackoffrescheduler.go
133 lines (114 loc) · 3.93 KB
/
crashloopbackoffrescheduler.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
package remediator
import (
"context"
"github.com/aksgithub/kube_remediator/pkg/k8s"
"github.com/aksgithub/kube_remediator/pkg/metrics"
"github.com/spf13/viper"
"go.uber.org/zap"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/informers"
"k8s.io/client-go/tools/cache"
"sync"
)
// CONFIG_FILE is the path of the JSON config file that Setup hands to viper.
//
// TODO: this should not be a package-level global, because this package contains
// multiple remediators. The config folder could be made configurable, but the
// file path itself should still not live here.
var CONFIG_FILE = "config/crash_loop_back_off_rescheduler.json"
// PodFilter holds the configured criteria that decide which Pods the
// rescheduler looks at and when it acts on them.
type PodFilter struct {
	// annotation is the Pod annotation key used for opting out: a Pod whose
	// value for this key is "false" is never rescheduled. An empty key turns
	// the opt-out check off.
	annotation string

	// failureThreshold is the minimum container restart count a container
	// must have reached before its Pod is considered unhealthy.
	failureThreshold int32

	// namespace is passed to the informer factory and to the Pod listing call
	// to scope the Pods that are inspected.
	namespace string
}
// CrashLoopBackOffRescheduler deletes Pods that are stuck in CrashLoopBackOff
// and match the configured filter.
type CrashLoopBackOffRescheduler struct {
	// Base is declared outside this file; the methods here use logger, client,
	// logStartAndStop and deletePod, which presumably come from it (confirm).
	Base

	// filter is the Pod selection criteria built by Setup from the config.
	filter PodFilter

	// informerFactory is created in Setup and supplies the Pod informer that
	// Run subscribes to.
	informerFactory informers.SharedInformerFactory

	// metrics is registered in Setup and unregistered at the end of Run.
	metrics *metrics.CrashLoopBackOff_Metrics
}
// Setup reads the rescheduler configuration, builds the Pod filter and the
// shared informer factory, registers the metrics and stores everything on the
// receiver.
//
// The configuration is read as JSON from CONFIG_FILE through the global viper
// instance. Defaults applied before reading:
//
//	annotation:       "kube-remediator/CrashLoopBackOffRemediator"
//	failureThreshold: 5
//	namespace:        ""
//
// It returns the error from reading the config or from creating the informer
// factory. On either error the receiver is left unmodified and no metrics are
// registered.
func (p *CrashLoopBackOffRescheduler) Setup(logger *zap.Logger, client k8s.ClientInterface) error {
	logger.Info("Reading config", zap.String("file", CONFIG_FILE))

	viper.SetConfigFile(CONFIG_FILE)
	viper.SetConfigType("json")
	viper.SetDefault("annotation", "kube-remediator/CrashLoopBackOffRemediator")
	viper.SetDefault("failureThreshold", 5)
	viper.SetDefault("namespace", "")

	if err := viper.ReadInConfig(); err != nil {
		return err // untested section
	}

	logger.Sugar().Infof("Config %v", viper.AllSettings()) // TODO: prefer using zap.Map or something like that

	filter := PodFilter{
		annotation:       viper.GetString("annotation"),
		failureThreshold: viper.GetInt32("failureThreshold"),
		namespace:        viper.GetString("namespace"),
	}

	// Do every step that can fail before registering metrics: the only
	// UnRegister call visible in this file is at the end of Run, so metrics
	// registered by a Setup that then fails would never be unregistered.
	informerFactory, err := client.NewSharedInformerFactory(filter.namespace)
	if err != nil {
		return err // untested section
	}

	// Named crashLoopMetrics so the local does not shadow the metrics package.
	crashLoopMetrics := metrics.NewCrashLoopBackOffMetrics(logger)
	crashLoopMetrics.Register()

	p.informerFactory = informerFactory
	p.filter = filter
	p.metrics = crashLoopMetrics
	p.logger = logger
	p.client = client
	return nil
}
// Run does one initial sweep over the existing Pods, then subscribes to Pod
// update events and runs the Pod informer with ctx.Done() as its stop channel.
// Once ctx is done the metrics are unregistered. wg.Done is called when Run
// returns.
//
// The whole body is executed through p.logStartAndStop, which is defined
// outside this file (presumably it logs start and stop; confirm).
func (p *CrashLoopBackOffRescheduler) Run(ctx context.Context, wg *sync.WaitGroup) {
	defer wg.Done()

	p.logStartAndStop(func() {
		// Handle Pods that are already in CrashLoopBackOff before watching
		// for further updates.
		p.reschedulePods()

		podInformer := p.informerFactory.Core().V1().Pods().Informer()
		handlers := cache.ResourceEventHandlerFuncs{
			UpdateFunc: p.rescheduleIfNecessary,
		}
		podInformer.AddEventHandler(handlers)

		stop := ctx.Done()
		podInformer.Run(stop)

		// Wait for cancellation even if the informer returned early.
		<-stop
		p.metrics.UnRegister()
	})
}
// reschedulePods lists the Pods currently matching the reschedule criteria and
// passes each of them to rescheduleIfNecessary.
func (p *CrashLoopBackOffRescheduler) reschedulePods() {
	p.logger.Info("Running")

	candidates := *p.getCrashLoopBackOffPods()
	for i := range candidates {
		// There is no previous object for this one-off sweep, hence nil.
		p.rescheduleIfNecessary(nil, &candidates[i])
	}
}
// rescheduleIfNecessary deletes the Pod carried in newObj when it matches the
// reschedule criteria. It has the shape of an informer UpdateFunc; oldObj is
// ignored. newObj must be a *v1.Pod, otherwise the type assertion panics.
func (p *CrashLoopBackOffRescheduler) rescheduleIfNecessary(oldObj, newObj interface{}) {
	candidate := newObj.(*v1.Pod)
	if !p.shouldReschedule(candidate) {
		return
	}
	p.deletePod(*candidate)
}
// getCrashLoopBackOffPods lists the Pods of the configured namespace and
// returns those for which shouldReschedule reports true.
//
// When listing fails the error is logged and a pointer to an empty slice is
// returned, so callers can always dereference the result.
func (p *CrashLoopBackOffRescheduler) getCrashLoopBackOffPods() *[]v1.Pod {
	podList, err := p.client.GetPods(p.filter.namespace, metav1.ListOptions{})
	if err != nil {
		p.logger.Error("Error getting pod list: ", zap.Error(err))
		return &[]v1.Pod{}
	}

	var selected []v1.Pod
	for i := range podList.Items {
		if !p.shouldReschedule(&podList.Items[i]) {
			continue
		}
		selected = append(selected, podList.Items[i])
	}
	return &selected
}
// shouldReschedule reports whether pod should be deleted. All of the following
// must hold:
//   - the Pod has not opted out (no annotation key is configured, or the Pod's
//     value for that key is anything other than "false"),
//   - the Pod has at least one owner reference (assumed to be a controller),
//   - isPodUnhealthy reports true.
func (p *CrashLoopBackOffRescheduler) shouldReschedule(pod *v1.Pod) bool {
	// Opted out explicitly through the configured annotation.
	if key := p.filter.annotation; key != "" && pod.ObjectMeta.Annotations[key] == "false" {
		return false
	}

	// Assuming Pod has owner reference of kind Controller
	if len(pod.ObjectMeta.OwnerReferences) == 0 {
		return false
	}

	return p.isPodUnhealthy(pod)
}
// isPodUnhealthy reports whether any init container or regular container of
// pod has restarted at least failureThreshold times and is currently waiting
// with reason "CrashLoopBackOff".
//
// This is not 100% reliable because a Pod can toggle between Terminated with
// Error and Waiting with CrashLoopBackOff.
func (p *CrashLoopBackOffRescheduler) isPodUnhealthy(pod *v1.Pod) bool {
	// Walk the two status lists one after the other instead of joining them
	// with append: appending to pod.Status.InitContainerStatuses can write
	// into spare capacity of its backing array, i.e. mutate a Pod object that
	// may be shared with other readers, and it allocates on every call.
	statusLists := [...][]v1.ContainerStatus{
		pod.Status.InitContainerStatuses,
		pod.Status.ContainerStatuses,
	}
	for _, statuses := range statusLists {
		for i := range statuses {
			status := &statuses[i]
			if status.RestartCount < p.filter.failureThreshold {
				continue
			}
			if waiting := status.State.Waiting; waiting != nil && waiting.Reason == "CrashLoopBackOff" {
				return true
			}
		}
	}
	return false
}