From a725040df05daef99f1297a13d97950feb160dc7 Mon Sep 17 00:00:00 2001 From: Venkata Mutyala Date: Thu, 9 Nov 2023 09:18:28 +0000 Subject: [PATCH] fixes --- Dockerfile | 25 ++--------- README.md | 41 ++++++++++++------ monitoring_script.py | 91 +++++++++++++++++++-------------------- serviceconfig.py | 34 +++++++++++++++ test_monitoring_script.py | 83 ----------------------------------- 5 files changed, 109 insertions(+), 165 deletions(-) create mode 100644 serviceconfig.py delete mode 100644 test_monitoring_script.py diff --git a/Dockerfile b/Dockerfile index ac9143b..57b7ca2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,29 +1,10 @@ -# --- Stage 1: Testing --- -# Use the same base image for consistency -FROM python:3.11.6-alpine3.18 as tester - -# Set working directory -WORKDIR /app - -# Install dependencies -COPY requirements.txt /app/ -RUN pip install --upgrade pip && pip install -r requirements.txt && pip install pytest - -# Copy your application code and test files -COPY monitoring_script.py /app/ -COPY test_monitoring_script.py /app/ - -# Run tests -RUN python -u -m pytest -v test_monitoring_script.py - -# --- Stage 2: Final Image --- FROM python:3.11.6-alpine3.18 as final WORKDIR /app -COPY --from=tester /app/requirements.txt /app/ +COPY requirements.txt /app/ RUN pip install --upgrade pip && pip install -r requirements.txt -# Copy only the necessary files from the tester stage -COPY --from=tester /app/monitoring_script.py /app/ +COPY monitoring_script.py /app/ +COPY serviceconfig.py /app/ CMD [ "python", "-u", "monitoring_script.py" ] diff --git a/README.md b/README.md index 0ec9e0d..8585a9e 100644 --- a/README.md +++ b/README.md @@ -1,19 +1,32 @@ -# Cluster Monitoring +# GLUEOPS CLUSTER MONITORING -This repo contains a script that sends a ping to Opsgenie's Heartbeat every 5 minutes. If the cluster is down, Opsgenie will not get a ping and will send a mail to inform team members of the cluster's failed state. +This application is designed for monitoring a Kubernetes cluster with the Prometheus and Alertmanager components from the Kubernetes Prometheus Stack (KPS). -The script is deployed into the ArgoCD cluster under monitoring. Once this cluster is down, pings will not be sent to Opsgenie, triggering an alert which is sent to concerned team members. +## Configuration -## Running the script +Before running the application, make sure to configure the following environment variables: -- Create a ```.env``` file, with the following contents -```bash -OPSGENIE_API_KEY= -HEARTBEAT_NAME= -PING_SLEEP= -``` +- `OPSGENIE_API_KEY`: Your Opsgenie API key for sending heartbeat notifications. +- `OPSGENIE_HEARTBEAT_NAME`: The name of the Opsgenie heartbeat to ping. +- `OPSGENIE_PING_INTERVAL_MINUTES`: The interval (in minutes) between pinging the Opsgenie heartbeat (default: 2 minutes). -- Runing the script -```bash -$ docker run --env-file .env ghcr.io/glueops/cluster-monitoring:main -``` +## Running in a Kubernetes Cluster + +To run this application within a Kubernetes cluster, follow these steps: + +1. Ensure your Kubernetes cluster is up and running. +2. Deploy the application with the configured environment variables. +3. The application will automatically detect its environment and use in-cluster URLs for Prometheus and Alertmanager. + +## Running Locally for Debugging + +To run this application locally for debugging purposes and access Prometheus and Alertmanager, you can set up port forwarding to your cluster. Follow these steps: + +1. 
Ensure you have `kubectl` installed and configured to communicate with your Kubernetes cluster.
+2. Port-forward the Prometheus and Alertmanager services from your cluster:
+
+   ```bash
+   # For Prometheus
+   kubectl port-forward svc/kps-prometheus 9090:9090 -n glueops-core-kube-prometheus-stack
+   # For Alertmanager
+   kubectl port-forward svc/kps-alertmanager 9093:9093 -n glueops-core-kube-prometheus-stack
+   ```
+
+3. Run the application locally. It will detect that it is not running inside a cluster and fall back to the `localhost` URLs for Prometheus and Alertmanager.
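+
+   Once the port-forwards are active, a quick one-off check can be run from a local Python shell. This is a minimal sketch using the modules in this repository, and it assumes the port-forwards above are still running:
+
+   ```python
+   from serviceconfig import ServiceConfig
+   from monitoring_script import is_cluster_healthy
+
+   config = ServiceConfig()              # falls back to localhost URLs outside a cluster
+   print(config.PROMETHEUS_URL_HEALTH)   # http://localhost:9090/-/healthy
+   print(is_cluster_healthy(config))     # True when all health/readiness endpoints respond
+   ```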
diff --git a/monitoring_script.py b/monitoring_script.py
index f0c2129..b8b53ca 100644
--- a/monitoring_script.py
+++ b/monitoring_script.py
@@ -2,43 +2,22 @@
 import requests
 from glueops.setup_logging import configure as go_configure_logging
 import time
+from serviceconfig import ServiceConfig
 
 #=== configure logging
 logger = go_configure_logging(
     name='GLUEOPS_CLUSTER_MONITORING',
     level=os.getenv('PYTHON_LOG_LEVEL', 'INFO')
 )
+
 
-OPSGENIE_API_KEY = os.getenv('OPSGENIE_API_KEY')
-OPSGENIE_HEARTBEAT_NAME = os.getenv('OPSGENIE_HEARTBEAT_NAME')
-OPSGENIE_PING_INTERVAL_MINUTES = os.getenv('OPSGENIE_PING_INTERVAL_MINUTES',2)
-
-CLUSTER_DEFAULT_DOMAIN_NAME = os.getenv(
-    'CLUSTER_DEFAULT_DOMAIN_NAME',
-    'cluster.local'
-)
-KUBE_PROMETHEUS_STACK_NAMESPACE = os.getenv(
-    'KUBE_PROMETHEUS_STACK_NAMESPACE',
-    'glueops-core-kube-prometheus-stack'
-)
-
-GLUEOPS_CORE_NAMESPACE = os.getenv(
-    'GLUEOPS_CORE_NAMESPACE',
-    'glueops-core'
-)
-
-PROMETHEUS_URL_HEALTH = f"kps-prometheus.{KUBE_PROMETHEUS_STACK_NAMESPACE}.svc.{CLUSTER_DEFAULT_DOMAIN_NAME}:9090/-/healthy"
-PROMETHEUS_URL_READY = f"kps-prometheus.{KUBE_PROMETHEUS_STACK_NAMESPACE}.svc.{CLUSTER_DEFAULT_DOMAIN_NAME}:9090/-/ready"
-ALERTMANAGER_URL_HEALTH = f"kps-alertmanager.{KUBE_PROMETHEUS_STACK_NAMESPACE}.svc.{CLUSTER_DEFAULT_DOMAIN_NAME}:9093/-/healthy"
-ALERTMANAGER_URL_READY = f"kps-alertmanager.{KUBE_PROMETHEUS_STACK_NAMESPACE}.svc.{CLUSTER_DEFAULT_DOMAIN_NAME}:9093/-/ready"
-
-def is_cluster_healthy():
+def is_cluster_healthy(config):
     return (
-        get_alertmanager_notifification_health_for_opsgenie() and
-        check_for_200_response(PROMETHEUS_URL_HEALTH) and
-        check_for_200_response(PROMETHEUS_URL_READY) and
-        check_for_200_response(ALERTMANAGER_URL_HEALTH) and
-        check_for_200_response(ALERTMANAGER_URL_READY)
+        get_alertmanager_notifification_health_for_opsgenie(config.PROMETHEUS_QUERY_URL) and
+        check_for_200_response(config.PROMETHEUS_URL_HEALTH) and
+        check_for_200_response(config.PROMETHEUS_URL_READY) and
+        check_for_200_response(config.ALERTMANAGER_URL_HEALTH) and
+        check_for_200_response(config.ALERTMANAGER_URL_READY)
     )
 
 def check_for_200_response(url):
@@ -56,11 +35,9 @@ def check_for_200_response(url):
         raise
 
 
-def get_alertmanager_notifification_health_for_opsgenie():
+def get_alertmanager_notifification_health_for_opsgenie(prometheus_query_url):
     # Prometheus query
     query = 'sum(increase(alertmanager_notifications_failed_total{integration="opsgenie"}[10m]))'
-    # URL for the Prometheus HTTP API
-    url = 'http://'+ KUBE_PROMETHEUS_STACK_NAMESPACE + ':9090/api/v1/query'
 
     # Parameters for the request
     params = {
@@ -69,7 +46,7 @@
 
     try:
         # Send the request to Prometheus
-        response = requests.get(url, params=params)
+        response = requests.get(prometheus_query_url, params=params)
         response.raise_for_status()
 
         # Parse the JSON response
@@ -95,30 +72,52 @@
         logger.exception(f"Error querying Prometheus: {e}")
         raise
 
-def send_opsgenie_heartbeat(heartbeat_name):
-    heart_eat_url = f"https://api.opsgenie.com/v2/heartbeats/{heartbeat_name}/ping"
+def send_opsgenie_heartbeat(config):
+    url = f"https://api.opsgenie.com/v2/heartbeats/{config.OPSGENIE_HEARTBEAT_NAME}/ping"
     headers = {
-        "Authorization": f"GenieKey {OPSGENIE_API_KEY}"
+        "Authorization": f"GenieKey {config.OPSGENIE_API_KEY}"
     }
 
     try:
-        response = requests.get(heart_eat_url, headers=headers)
+        response = requests.get(url, headers=headers)
         response.raise_for_status()
-        logger.info("Pinged Opsgenie heartbeat successfully!")
+        logger.debug("Pinged Opsgenie heartbeat successfully!")
     except requests.RequestException as e:
         logger.exception(f"Failed to send Opsgenie heartbeat. Error: {e}")
         raise
 
 
 if __name__ == '__main__':
-    interval_in_seconds = OPSGENIE_PING_INTERVAL_MINUTES * 60
-    frequency = max(interval_in_seconds / 2, 1)
+    config = ServiceConfig()
+    interval_in_seconds = config.OPSGENIE_PING_INTERVAL_MINUTES * 60
+
+    # Fail fast if the interval is less than 1 minute
+    if interval_in_seconds < 60:
+        logger.error("OPSGENIE_PING_INTERVAL_MINUTES must be 1 minute or greater.")
+        raise ValueError("OPSGENIE_PING_INTERVAL_MINUTES must be 1 minute or greater.")
+
+    # The frequency is half the interval but not less than 1 minute
+    frequency = max(interval_in_seconds / 2, 60)
+    execution_count = 0
+
     while True:
-        time.sleep(frequency)
-        if is_cluster_healthy():
-            logger.info("Checks: STARTED")
-            #send_opsgenie_heartbeat(OPSGENIE_HEARTBEAT_NAME)
-            logger.info("Checks: PASSED")
+        if execution_count < 2:
+            if is_cluster_healthy(config):
+                logger.info("Checks: STARTED")
+                #send_opsgenie_heartbeat(config)
+                logger.info("Checks: PASSED")
+            else:
+                logger.error(f"One or more health checks failed. Heartbeat for {config.OPSGENIE_HEARTBEAT_NAME} was not sent")
+                logger.info("Waiting 5 minutes before checking again")
+                time.sleep(60*5)
+            execution_count += 1
         else:
-            logger.error(f"One or more health checks failed. Heartbeat for {OPSGENIE_HEARTBEAT_NAME} was not sent")
+            # Reset the count and sleep out the remainder of the interval before checking again
+            execution_count = 0
+            time.sleep(max(interval_in_seconds - 2 * frequency, 0))
+        time.sleep(frequency)
+
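
For context, the Opsgenie notification check above queries Prometheus over its HTTP API and inspects the returned vector; the removed tests further down illustrate the response shape it expects. A rough, illustrative sketch of that contract (not part of the patch):

```python
# Illustrative shape of a Prometheus /api/v1/query response for
# sum(increase(alertmanager_notifications_failed_total{integration="opsgenie"}[10m]))
sample = {
    "status": "success",
    "data": {
        "resultType": "vector",
        "result": [{"value": [1234567890, "0"]}],  # value[1] is the count as a string
    },
}
# A failure count of "0" over the last 10 minutes is treated as healthy.
failures = float(sample["data"]["result"][0]["value"][1])
print(failures == 0)  # True
```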
diff --git a/serviceconfig.py b/serviceconfig.py
new file mode 100644
index 0000000..8d7d84f
--- /dev/null
+++ b/serviceconfig.py
@@ -0,0 +1,34 @@
+import os
+
+class ServiceConfig:
+    def __init__(self):
+        if os.getenv('KUBERNETES_SERVICE_HOST'):
+            print("Setting up for Kubernetes environment.")
+            self._setup_kubernetes_config()
+        else:
+            print("Setting up for local environment.")
+            self._setup_local_config()
+
+        # Opsgenie settings
+        self.OPSGENIE_API_KEY = os.getenv('OPSGENIE_API_KEY')
+        self.OPSGENIE_HEARTBEAT_NAME = os.getenv('OPSGENIE_HEARTBEAT_NAME')
+        self.OPSGENIE_PING_INTERVAL_MINUTES = int(os.getenv('OPSGENIE_PING_INTERVAL_MINUTES', 2))
+
+
+    def _setup_kubernetes_config(self):
+        suffix = "glueops-core-kube-prometheus-stack.svc.cluster.local"
+        self.prometheus = f"kps-prometheus.{suffix}:9090"
+        self.alertmanager = f"kps-alertmanager.{suffix}:9093"
+        self._set_urls()
+
+    def _setup_local_config(self):
+        self.prometheus = "localhost:9090"
+        self.alertmanager = "localhost:9093"
+        self._set_urls()
+
+    def _set_urls(self):
+        self.PROMETHEUS_URL_HEALTH = f"http://{self.prometheus}/-/healthy"
+        self.ALERTMANAGER_URL_HEALTH = f"http://{self.alertmanager}/-/healthy"
+        self.PROMETHEUS_URL_READY = f"http://{self.prometheus}/-/ready"
+        self.ALERTMANAGER_URL_READY = f"http://{self.alertmanager}/-/ready"
+        self.PROMETHEUS_QUERY_URL = f"http://{self.prometheus}/api/v1/query"
diff --git a/test_monitoring_script.py b/test_monitoring_script.py
deleted file mode 100644
index ad04616..0000000
--- a/test_monitoring_script.py
+++ /dev/null
@@ -1,83 +0,0 @@
-import unittest
-from unittest.mock import patch, Mock
-import requests
-import monitoring_script  # Replace with the actual name of your Python script
-
-class TestMonitoringScript(unittest.TestCase):
-
-    @patch('monitoring_script.requests.get')
-    def test_check_for_200_response_success(self, mock_get):
-        mock_get.return_value = Mock(status_code=200)
-        result = monitoring_script.check_for_200_response("http://example.com")
-        self.assertTrue(result)
-
-    @patch('monitoring_script.requests.get')
-    def test_check_for_200_response_failure(self, mock_get):
-        mock_get.return_value = Mock(status_code=404)
-        result = monitoring_script.check_for_200_response("http://example.com")
-        self.assertFalse(result)
-
-    @patch('monitoring_script.requests.get')
-    def test_check_for_200_response_exception(self, mock_get):
-        mock_get.side_effect = requests.exceptions.ConnectionError
-        with self.assertRaises(requests.exceptions.ConnectionError):
-            monitoring_script.check_for_200_response("http://example.com")
-
-    @patch('monitoring_script.requests.get')
-    def test_get_alertmanager_notification_health_for_opsgenie_success(self, mock_get):
-        mock_response = Mock()
-        mock_response.json.return_value = {
-            "status": "success",
-            "data": {
-                "resultType": "vector",
-                "result": [{"value": [1234567890, "0"]}]
-            }
-        }
-        mock_get.return_value = mock_response
-        result = monitoring_script.get_alertmanager_notifification_health_for_opsgenie()
-        self.assertTrue(result)
-
-    @patch('monitoring_script.requests.get')
-    def test_get_alertmanager_notification_health_for_opsgenie_failure(self, mock_get):
-        mock_response = Mock()
-        mock_response.json.return_value = {
-            "status": "success",
-            "data": {
-                "resultType": "vector",
-                "result": [{"value": 
[1234567890, "1"]}] - } - } - mock_get.return_value = mock_response - result = monitoring_script.get_alertmanager_notifification_health_for_opsgenie() - self.assertFalse(result) - - @patch('monitoring_script.requests.get') - def test_send_opsgenie_heartbeat_success(self, mock_get): - mock_get.return_value = Mock(status_code=202) - result = monitoring_script.send_opsgenie_heartbeat("heartbeat_name") - self.assertIsNone(result) # Assuming your function doesn't return anything - - @patch('monitoring_script.requests.get') - def test_send_opsgenie_heartbeat_failure(self, mock_get): - mock_get.side_effect = requests.exceptions.HTTPError - with self.assertRaises(requests.exceptions.HTTPError): - monitoring_script.send_opsgenie_heartbeat("heartbeat_name") - - @patch('monitoring_script.get_alertmanager_notifification_health_for_opsgenie') - @patch('monitoring_script.check_for_200_response') - def test_is_cluster_healthy_all_checks_pass(self, mock_check_for_200, mock_get_alertmanager): - mock_check_for_200.return_value = True - mock_get_alertmanager.return_value = True - result = monitoring_script.is_cluster_healthy() - self.assertTrue(result) - - @patch('monitoring_script.get_alertmanager_notifification_health_for_opsgenie') - @patch('monitoring_script.check_for_200_response') - def test_is_cluster_healthy_some_checks_fail(self, mock_check_for_200, mock_get_alertmanager): - mock_check_for_200.side_effect = [True, False, True, False] # Simulating some checks passing and some failing - mock_get_alertmanager.return_value = True - result = monitoring_script.is_cluster_healthy() - self.assertFalse(result) - -if __name__ == '__main__': - unittest.main()
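
Note: the patch above removes `test_monitoring_script.py` without adding a replacement. If the tests are reintroduced later, a rough sketch of how the old cases could be adapted to the new config-based signatures might look like the following (hypothetical, not part of the patch):

```python
import unittest
from unittest.mock import patch, Mock

import monitoring_script
from serviceconfig import ServiceConfig


class TestMonitoringScriptWithConfig(unittest.TestCase):
    def setUp(self):
        # Outside a cluster, ServiceConfig falls back to the localhost URLs.
        self.config = ServiceConfig()

    @patch('monitoring_script.check_for_200_response', return_value=True)
    @patch('monitoring_script.get_alertmanager_notifification_health_for_opsgenie', return_value=True)
    def test_is_cluster_healthy_all_checks_pass(self, mock_notifications, mock_responses):
        self.assertTrue(monitoring_script.is_cluster_healthy(self.config))

    @patch('monitoring_script.requests.get')
    def test_send_opsgenie_heartbeat_success(self, mock_get):
        mock_get.return_value = Mock(status_code=202)
        # The function still returns None on success and raises on failure.
        self.assertIsNone(monitoring_script.send_opsgenie_heartbeat(self.config))


if __name__ == '__main__':
    unittest.main()
```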