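# alerterator-prod.yml: Prometheus alerting rules (PrometheusRule) for {{app}} in the prod environment.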
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{app}}-alerts
  labels:
    team: {{team}}
  namespace: {{namespace}}
spec:
  groups:
    - name: {{app}}-alerts
      rules:
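        # Fires when the deployment has had no available replicas for 5 minutes.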
        - alert: Application down
          expr: kube_deployment_status_replicas_available{deployment="{{app}}"} == 0
          for: 5m
          annotations:
            consequence: No applications (søknader) are being temporarily stored.
            action: "`kubectl describe pod <podname>` -> `kubectl logs <podname>`"
            summary: "App \{{ $labels.deployment }} is down in namespace \{{ $labels.namespace }}"
          labels:
            namespace: {{namespace}}
            severity: critical
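        # Fires when error-level log events (logback_events_total) have been emitted over the last 3 minutes.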
        - alert: High rate of errors in logs
          expr: sum by (app, container, pod, namespace) (floor(increase(logback_events_total{app="{{app}}", level="error"} [3m]))) > 0
          for: 5m
          annotations:
            action: "`kubectl logs \{{ $labels.pod }} -c \{{ $labels.container }} -n \{{ $labels.namespace }}`"
            summary: "High rate of errors in logs for app \{{ $labels.app }}, failing with \{{ $labels.exception }} in namespace \{{ $labels.namespace }}"
          labels:
            namespace: {{namespace}}
            severity: critical
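        # Fires when warn-level log events have been emitted over the last 3 minutes.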
        - alert: High rate of warnings in logs
          expr: sum by (app, container, pod, namespace) (floor(increase(logback_events_total{app="{{app}}", level="warn"} [3m]))) > 0
          for: 5m
          annotations:
            action: "`kubectl logs \{{ $labels.pod }} -c \{{ $labels.container }} -n \{{ $labels.namespace }}`"
            summary: "High rate of warnings in logs for app \{{ $labels.app }}, failing with \{{ $labels.exception }} in namespace \{{ $labels.namespace }}"
          labels:
            namespace: {{namespace}}
            severity: warning
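        # Fires when more than one 5xx response has been returned over the last 3 minutes.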
        - alert: High rate of HTTP server errors (5xx responses)
          expr: floor(increase(http_server_requests_seconds_count{status=~"5.*", app="{{app}}"}[3m])) > 1
          for: 1m
          annotations:
            summary: "The following request failed: `Status \{{ $labels.status }} - \{{ $labels.method }} \{{ $labels.route }}`.\n
              Reason:\n ```\{{ $labels.problem_details }}```\n
              Check the logs to see why this is failing."
            action: "`kubectl logs \{{ $labels.pod }} -n \{{ $labels.namespace }} -c \{{ $labels.app }}`"
          labels:
            namespace: {{namespace}}
            severity: critical
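        # Fires on 4xx responses other than 401, 403 and 404 over the last 3 minutes.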
        - alert: High rate of HTTP client errors (4xx responses)
          expr: floor(increase(http_server_requests_seconds_count{status=~"4.*", status!~"404|401|403", app="{{app}}"}[3m])) > 0
          for: 1m
          annotations:
            summary: "The following request failed: `Status \{{ $labels.status }} - \{{ $labels.method }} \{{ $labels.route }}`.\n
              Reason:\n ```\{{ $labels.problem_details }}```\n
              Check the logs to see why this is failing."
            action: "`kubectl logs \{{ $labels.pod }} -n \{{ $labels.namespace }} -c \{{ $labels.app }}`"
          labels:
            namespace: {{namespace}}
            severity: critical
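        # Fires when the health endpoint /actuator/health has returned a non-200 response over the last 3 minutes.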
        - alert: Health check failing
          expr: floor(increase(http_server_requests_seconds_count{status!~"200", uri="/actuator/health", app="{{app}}"}[3m])) > 0
          for: 2m
          annotations:
            summary: "Check the logs to see why the health check is failing."
            action: "`kubectl logs \{{ $labels.pod }} -n \{{ $labels.namespace }} -c \{{ $labels.app }}`"
          labels:
            namespace: {{namespace}}
            severity: critical
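        # Fires when Kafka publish attempts (spring_kafka_template metrics) report result="failure" over the last 3 minutes.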
        - alert: Message publishing failing
          expr: ceil(increase(spring_kafka_template_seconds_count{result="failure", app="{{app}}"}[3m])) > 0
          for: 1m
          annotations:
            summary: "Publishing of message with \{{ $labels.name }} is failing with \{{ $labels.exception }}. Check the logs to find out why this is failing."
            action: "`kubectl logs \{{ $labels.pod }} -n \{{ $labels.namespace }} -c \{{ $labels.app }}`"
          labels:
            namespace: {{namespace}}
            severity: critical