From fbac2e3465a43d12b8050e45b79fab993dfacacf Mon Sep 17 00:00:00 2001 From: Robert Gildein Date: Wed, 28 Aug 2024 09:25:08 +0200 Subject: [PATCH] Add istio control plane dashboard and alert rule (#478) * Add istio control plane dashboard The source of this dashboard is [1]. --- [1]: https://grafana.com/grafana/dashboards/7645-istio-control-plane-dashboard/ * Add alert rule from source [1] based on metrics [2] Found some interesting alert rules in [1], however based on istio doc [2], only the last one will work, so I added it here. --- [1]: https://samber.github.io/awesome-prometheus-alerts/rules#istio [2]: https://istio.io/latest/docs/reference/commands/pilot-discovery/#metrics --- .../src/prometheus_alert_rules/basic.rules | 2 +- .../istio_control_plane.json.tmpl | 1990 +++++++++++++++++ .../IstioPilotDuplicateEntry.rule | 8 + tests/test_cos_integration.py | 12 + 4 files changed, 2011 insertions(+), 1 deletion(-) create mode 100644 charms/istio-pilot/src/grafana_dashboards/istio_control_plane.json.tmpl create mode 100644 charms/istio-pilot/src/prometheus_alert_rules/IstioPilotDuplicateEntry.rule diff --git a/charms/istio-gateway/src/prometheus_alert_rules/basic.rules b/charms/istio-gateway/src/prometheus_alert_rules/basic.rules index 6d64f95c..13c1ba00 100644 --- a/charms/istio-gateway/src/prometheus_alert_rules/basic.rules +++ b/charms/istio-gateway/src/prometheus_alert_rules/basic.rules @@ -7,7 +7,7 @@ groups: summary: 'ingress gateway traffic missing' description: '[Critical]: ingress gateway traffic missing, likely other monitors are misleading, check client logs' expr: > - absent(reporter="source", source_workload=~"istio-(ingress|egress)gateway-workload"})==1 + absent(istio_requests_total{reporter="source", source_workload=~"istio-(ingress|egress)gateway-workload"})==1 for: 5m - alert: IstioMetricsMissing annotations: diff --git a/charms/istio-pilot/src/grafana_dashboards/istio_control_plane.json.tmpl 
b/charms/istio-pilot/src/grafana_dashboards/istio_control_plane.json.tmpl new file mode 100644 index 00000000..a6c64abc --- /dev/null +++ b/charms/istio-pilot/src/grafana_dashboards/istio_control_plane.json.tmpl @@ -0,0 +1,1990 @@ +{ + "__inputs": [ + { + "name": "prometheusds", + "label": "prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "6.4.3" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "5.0.0" + }, + { + "type": "panel", + "id": "table", + "name": "Table", + "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "${prometheusds}" + }, + "enable": true, + "hide": true, + "iconColor": "#6ed0e0", + "limit": 100, + "name": "Annotations & Alerts", + "showIn": 0, + "tags": [], + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "description": "Istio Control Plane Dashboard version 1.22.3", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "links": [], + "liveNow": false, + "panels": [ + { + "collapsed": false, + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 60, + "panels": [], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "refId": "A" + } + ], + "title": "Deployed Versions", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + 
"barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 24, + "x": 0, + "y": 1 + }, + "id": 56, + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.1.5", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "expr": "sum(istio_build{component=\"pilot\"}) by (tag, juju_model)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{ juju_model }} {{ tag }}", + "refId": "A" + } + ], + "title": "Pilot Versions", + "type": "timeseries" + }, + { + "collapsed": true, + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 6 + }, + "id": 62, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + 
"lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 0, + "y": 7 + }, + "id": 5, + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.1.5", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "expr": "process_virtual_memory_bytes{juju_charm=\"istio-pilot\"}", + "format": "time_series", + "instant": false, + "intervalFactor": 2, + "legendFormat": "Virtual Memory", + "refId": "I", + "step": 2 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "expr": "process_resident_memory_bytes{juju_charm=\"istio-pilot\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Resident Memory", + "refId": "H", + "step": 2 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "expr": "go_memstats_heap_sys_bytes{juju_charm=\"istio-pilot\"}", + "format": "time_series", + "hide": true, + "intervalFactor": 2, + "legendFormat": "heap sys", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "expr": "go_memstats_heap_alloc_bytes{juju_charm=\"istio-pilot\"}", + "format": "time_series", + "hide": true, + "intervalFactor": 2, + "legendFormat": "heap alloc", + "refId": "D" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "expr": "go_memstats_alloc_bytes{juju_charm=\"istio-pilot\"}", + "format": 
"time_series", + "intervalFactor": 2, + "legendFormat": "Alloc", + "refId": "F", + "step": 2 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "expr": "go_memstats_heap_inuse_bytes{juju_charm=\"istio-pilot\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "Heap in-use", + "refId": "E", + "step": 2 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "expr": "go_memstats_stack_inuse_bytes{juju_charm=\"istio-pilot\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Stack in-use", + "refId": "G", + "step": 2 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "expr": "container_memory_working_set_bytes{container=~\"discovery\", pod=~\"istiod-.*|istio-pilot-.*\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "Discovery (container)", + "refId": "B", + "step": 2 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "expr": "container_memory_working_set_bytes{container=~\"istio-proxy\", pod=~\"istiod-.*|istio-pilot-.*\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Sidecar (container)", + "refId": "C" + } + ], + "title": "Memory", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": 
"none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 8, + "y": 7 + }, + "id": 6, + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.1.5", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "expr": "irate(process_cpu_seconds_total{juju_charm=\"istio-pilot\"}[5m])", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "Discovery (process)", + "refId": "C", + "step": 2 + } + ], + "title": "CPU", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 16, + "y": 7 + }, + "id": 4, + "links": [], + "options": { + "legend": { + "calcs": [], 
+ "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.1.5", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "expr": "go_goroutines{juju_charm=\"istio-pilot\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Number of Goroutines", + "refId": "A", + "step": 2 + } + ], + "title": "Goroutines", + "type": "timeseries" + } + ], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "refId": "A" + } + ], + "title": "Resource Usage", + "type": "row" + }, + { + "collapsed": true, + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 7 + }, + "id": 58, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "description": "Shows the rate of pilot pushes", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "bars", + "fillOpacity": 100, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 15 + }, + "id": 622, + "links": [], + "options": { + "legend": { + "calcs": [], + 
"displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "10.1.5", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "expr": "sum(irate(pilot_xds_pushes{type=\"cds\"}[5m]))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Cluster", + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "expr": "sum(irate(pilot_xds_pushes{type=\"eds\"}[5m]))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Endpoints", + "refId": "D" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "expr": "sum(irate(pilot_xds_pushes{type=\"lds\"}[5m]))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Listeners", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "expr": "sum(irate(pilot_xds_pushes{type=\"rds\"}[5m]))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Routes", + "refId": "E" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "expr": "sum(irate(pilot_xds_pushes{type=\"sds\"}[5m]))", + "interval": "", + "legendFormat": "Secrets", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "expr": "sum(irate(pilot_xds_pushes{type=\"nds\"}[5m]))", + "interval": "", + "legendFormat": "Nametables", + "refId": "F" + } + ], + "title": "Pilot Pushes", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "description": "Captures a variety of pilot errors", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + 
"gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byValue", + "options": { + "op": "gte", + "reducer": "allIsZero", + "value": 0 + } + }, + "properties": [ + { + "id": "custom.hideFrom", + "value": { + "legend": true, + "tooltip": true, + "viz": false + } + } + ] + }, + { + "matcher": { + "id": "byValue", + "options": { + "op": "gte", + "reducer": "allIsNull", + "value": 0 + } + }, + "properties": [ + { + "id": "custom.hideFrom", + "value": { + "legend": true, + "tooltip": true, + "viz": false + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 15 + }, + "id": 67, + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.1.5", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "expr": "sum(pilot_xds_cds_reject{juju_charm=\"istio-pilot\"}) or (absent(pilot_xds_cds_reject{juju_charm=\"istio-pilot\"}) - 1)", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Rejected CDS Configs", + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "expr": "sum(pilot_xds_eds_reject{juju_charm=\"istio-pilot\"}) or (absent(pilot_xds_eds_reject{juju_charm=\"istio-pilot\"}) - 1)", + "format": "time_series", + "hide": 
false, + "intervalFactor": 1, + "legendFormat": "Rejected EDS Configs", + "refId": "D" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "expr": "sum(pilot_xds_rds_reject{juju_charm=\"istio-pilot\"}) or (absent(pilot_xds_rds_reject{juju_charm=\"istio-pilot\"}) - 1)", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Rejected RDS Configs", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "expr": "sum(pilot_xds_lds_reject{juju_charm=\"istio-pilot\"}) or (absent(pilot_xds_lds_reject{juju_charm=\"istio-pilot\"}) - 1)", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Rejected LDS Configs", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "expr": "sum(rate(pilot_xds_write_timeout{juju_charm=\"istio-pilot\"}[5m]))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Write Timeouts", + "refId": "F" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "expr": "sum(rate(pilot_total_xds_internal_errors{juju_charm=\"istio-pilot\"}[5m]))", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Internal Errors", + "refId": "H" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "expr": "sum(rate(pilot_total_xds_rejects{juju_charm=\"istio-pilot\"}[5m]))", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Config Rejection Rate", + "refId": "E" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "expr": "sum(rate(pilot_xds_push_context_errors{juju_charm=\"istio-pilot\"}[5m]))", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Push Context Errors", + "refId": "K" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "expr": 
"sum(rate(pilot_xds_write_timeout{juju_charm=\"istio-pilot\"}[5m]))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Push Timeouts", + "refId": "G" + } + ], + "title": "Pilot Errors", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "description": "Shows the total time it takes to push a config update to a proxy", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 15 + }, + "id": 624, + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.1.5", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "expr": "histogram_quantile(0.5, sum(rate(pilot_proxy_convergence_time_bucket[5m])) by (le))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "p50 ", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "expr": "histogram_quantile(0.9, 
sum(rate(pilot_proxy_convergence_time_bucket[5m])) by (le))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "p90", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "expr": "histogram_quantile(0.99, sum(rate(pilot_proxy_convergence_time_bucket[5m])) by (le))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "p99", + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "expr": "histogram_quantile(0.999, sum(rate(pilot_proxy_convergence_time_bucket[5m])) by (le))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "p99.9", + "refId": "D" + } + ], + "title": "Proxy Push Time", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byValue", + "options": { + "op": "gte", + "reducer": "allIsZero", + "value": 0 + } + }, + "properties": [ + { + "id": "custom.hideFrom", + "value": { + "legend": true, + "tooltip": true, + "viz": false + } + } + ] + }, + { + "matcher": { + "id": "byValue", + "options": 
{ + "op": "gte", + "reducer": "allIsNull", + "value": 0 + } + }, + "properties": [ + { + "id": "custom.hideFrom", + "value": { + "legend": true, + "tooltip": true, + "viz": false + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 23 + }, + "id": 45, + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.1.5", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "expr": "pilot_conflict_inbound_listener{juju_charm=\"istio-pilot\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Inbound Listeners", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "expr": "pilot_conflict_outbound_listener_tcp_over_current_tcp{juju_charm=\"istio-pilot\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Outbound Listeners (tcp over current tcp)", + "refId": "C" + } + ], + "title": "Conflicts", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", 
+ "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 23 + }, + "id": 47, + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.1.5", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "expr": "avg(pilot_virt_services{juju_charm=\"istio-pilot\"})", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Virtual Services", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "expr": "avg(pilot_services{juju_charm=\"istio-pilot\"})", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Services", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "expr": "sum(pilot_xds{juju_charm=\"istio-pilot\"}) by (instance)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Connected Endpoints {{instance}}", + "refId": "E" + } + ], + "title": "ADS Monitoring", + "type": "timeseries" + } + ], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "refId": "A" + } + ], + "title": "Pilot Push Information", + "type": "row" + }, + { + "collapsed": true, + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 8 + }, + "id": 64, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "description": "Shows details about Envoy proxies in the mesh", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": 
"line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 32 + }, + "id": 40, + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.1.5", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "expr": "sum(irate(envoy_cluster_upstream_cx_total{cluster_name=\"xds-grpc\"}[5m]))", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "XDS Connections", + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "expr": "sum(irate(envoy_cluster_upstream_cx_connect_fail{cluster_name=\"xds-grpc\"}[5m]))", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "XDS Connection Failures", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "expr": "sum(increase(envoy_server_hot_restart_epoch[5m]))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Envoy Restarts", + "refId": "B" + } + ], + "title": "Envoy Details", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + 
}, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 32 + }, + "id": 41, + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.1.5", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "expr": "sum(envoy_cluster_upstream_cx_active{cluster_name=\"xds-grpc\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "XDS Active Connections", + "refId": "C", + "step": 2 + } + ], + "title": "XDS Active Connections", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "description": "Shows the size of XDS requests and responses", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + 
"lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 32 + }, + "id": 42, + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.1.5", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "expr": "max(rate(envoy_cluster_upstream_cx_rx_bytes_total{cluster_name=\"xds-grpc\"}[5m]))", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "XDS Response Bytes Max", + "refId": "D" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "expr": "quantile(0.5, rate(envoy_cluster_upstream_cx_rx_bytes_total{cluster_name=\"xds-grpc\"}[5m]))", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "XDS Response Bytes Average", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "expr": "max(rate(envoy_cluster_upstream_cx_tx_bytes_total{cluster_name=\"xds-grpc\"}[5m]))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "XDS Request Bytes Max", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "expr": "quantile(.5, rate(envoy_cluster_upstream_cx_tx_bytes_total{cluster_name=\"xds-grpc\"}[5m]))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "XDS Request Bytes Average", + "refId": "C" + } + 
], + "title": "XDS Requests Size", + "type": "timeseries" + } + ], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "refId": "A" + } + ], + "title": "Envoy Information", + "type": "row" + }, + { + "collapsed": true, + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 9 + }, + "id": 626, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 41 + }, + "id": 629, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.1.5", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "expr": "sum(irate(galley_validation_passed[5m]))", + "interval": "", + "legendFormat": "Validations (Success)", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "expr": 
"sum(rate(galley_validation_failed[5m]))", + "interval": "", + "legendFormat": "Validation (Failure)", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "expr": "sum(rate(galley_validation_config_updates[5m]))", + "interval": "", + "legendFormat": "Validation updates", + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "expr": "sum(rate(galley_validation_config_update_error[5m]))", + "interval": "", + "legendFormat": "Validation updates (Failure)", + "refId": "D" + } + ], + "title": "Configuration Validation", + "type": "timeseries" + } + ], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "refId": "A" + } + ], + "title": "Webhooks", + "type": "row" + } + ], + "refresh": "", + "schemaVersion": 38, + "style": "dark", + "tags": [ + "CKF", + "istio" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-30m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Istio Control Plane Dashboard", + "version": 1, + "weekStart": "", + "gnetId": 7645 +} diff --git a/charms/istio-pilot/src/prometheus_alert_rules/IstioPilotDuplicateEntry.rule b/charms/istio-pilot/src/prometheus_alert_rules/IstioPilotDuplicateEntry.rule new file mode 100644 index 00000000..39301de5 --- /dev/null +++ b/charms/istio-pilot/src/prometheus_alert_rules/IstioPilotDuplicateEntry.rule @@ -0,0 +1,8 @@ +alert: IstioPilotDuplicateEntry +expr: sum by (instance) (rate(pilot_duplicate_envoy_clusters{}[5m])) > 0 +for: 0m +labels: + severity: critical +annotations: + summary: Istio Pilot Duplicate Entry (instance {{ $labels.instance }}) + description: "Istio pilot duplicate entry error.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" diff --git
a/tests/test_cos_integration.py b/tests/test_cos_integration.py index 62a30f61..4892813b 100644 --- a/tests/test_cos_integration.py +++ b/tests/test_cos_integration.py @@ -10,9 +10,11 @@ GRAFANA_AGENT_APP, GRAFANA_AGENT_METRICS_ENDPOINT, assert_alert_rules, + assert_grafana_dashboards, assert_metrics_endpoint, deploy_and_assert_grafana_agent, get_alert_rules, + get_grafana_dashboards, ) from pytest_operator.plugin import OpsTest @@ -20,6 +22,7 @@ ISTIO_PILOT = "istio-pilot" ISTIO_PILOT_ALER_RULES = Path("./charms/istio-pilot/src/prometheus_alert_rules") +ISTIO_PILOT_DASHBOARDS = Path("./charms/istio-pilot/src/grafana_dashboards") ISTIO_GATEWAY_APP_NAME = "istio-ingressgateway" ISTIO_GATEWAY_ALER_RULES = Path("./charms/istio-gateway/src/prometheus_alert_rules") @@ -96,3 +99,12 @@ async def test_alert_rules(charm, path_to_alert_rules, ops_test): alert_rules = get_alert_rules(path_to_alert_rules) log.info("found alert_rules: %s", alert_rules) await assert_alert_rules(app, alert_rules) + + +@pytest.mark.parametrize("charm, path_to_dashboards", [(ISTIO_PILOT, ISTIO_PILOT_DASHBOARDS)]) +async def test_grafana_dashboards(charm, path_to_dashboards, ops_test): + """Test Grafana dashboards are defined in relation data bag.""" + app = ops_test.model.applications[charm] + dashboards = get_grafana_dashboards(path_to_dashboards) + log.info("found dashboards: %s", dashboards) + await assert_grafana_dashboards(app, dashboards)