Commit: fixes

venkatamutyala authored Nov 9, 2023
1 parent b5c9125 commit a725040

Showing 5 changed files with 109 additions and 165 deletions.
25 changes: 3 additions & 22 deletions Dockerfile
@@ -1,29 +1,10 @@
# --- Stage 1: Testing ---
# Use the same base image for consistency
FROM python:3.11.6-alpine3.18 as tester

# Set working directory
WORKDIR /app

# Install dependencies
COPY requirements.txt /app/
RUN pip install --upgrade pip && pip install -r requirements.txt && pip install pytest

# Copy your application code and test files
COPY monitoring_script.py /app/
COPY test_monitoring_script.py /app/

# Run tests
RUN python -u -m pytest -v test_monitoring_script.py

# --- Stage 2: Final Image ---
FROM python:3.11.6-alpine3.18 as final

WORKDIR /app
COPY --from=tester /app/requirements.txt /app/
COPY requirements.txt /app/
RUN pip install --upgrade pip && pip install -r requirements.txt

# Copy only the necessary files from the tester stage
COPY --from=tester /app/monitoring_script.py /app/
COPY monitoring_script.py /app/
COPY serviceconfig.py /app/

CMD [ "python", "-u", "monitoring_script.py" ]
41 changes: 27 additions & 14 deletions README.md
@@ -1,19 +1,32 @@
# Cluster Monitoring
# GLUEOPS CLUSTER MONITORING

This repo contains a script that sends a ping to Opsgenie's Heartbeat every 5 minutes. If the cluster is down, Opsgenie will not get a ping and will send a mail to inform team members of the cluster's failed state.
This application is designed for monitoring a Kubernetes cluster with the Prometheus and Alertmanager components from the Kubernetes Prometheus Stack (KPS).

The script is deployed into the ArgoCD cluster under monitoring. Once this cluster is down, pings will not be sent to Opsgenie, triggering an alert which is sent to concerned team members.
## Configuration

## Running the script
Before running the application, make sure to configure the following environment variables:

- Create a ```.env``` file, with the following contents
```bash
OPSGENIE_API_KEY=<some-value>
HEARTBEAT_NAME=<some-value>
PING_SLEEP=<some-value>
```
- `OPSGENIE_API_KEY`: Your Opsgenie API key for sending heartbeat notifications.
- `OPSGENIE_HEARTBEAT_NAME`: The name of the Opsgenie heartbeat to ping.
- `OPSGENIE_PING_INTERVAL_MINUTES`: The interval (in minutes) between pings of the Opsgenie heartbeat (default: 3 minutes; must be at least 1 minute).

- Running the script
```bash
$ docker run --env-file .env ghcr.io/glueops/cluster-monitoring:main
```
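
The variables above are read straight from the process environment. As a minimal sketch of how they might be loaded and validated — the `load_config` helper here is illustrative only; in this commit the equivalent logic lives in `ServiceConfig` and the startup check in `monitoring_script.py`:

```python
import os

def load_config():
    # Illustrative helper, not part of the repo: read the documented variables.
    # The interval check mirrors the startup validation in monitoring_script.py;
    # the presence checks on the Opsgenie values are an extra safeguard.
    api_key = os.getenv("OPSGENIE_API_KEY")
    heartbeat_name = os.getenv("OPSGENIE_HEARTBEAT_NAME")
    interval_minutes = int(os.getenv("OPSGENIE_PING_INTERVAL_MINUTES", 3))

    if not api_key or not heartbeat_name:
        raise ValueError("OPSGENIE_API_KEY and OPSGENIE_HEARTBEAT_NAME must be set.")
    if interval_minutes < 1:
        raise ValueError("OPSGENIE_PING_INTERVAL_MINUTES must be 1 minute or greater.")

    return api_key, heartbeat_name, interval_minutes
```
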
## Running in a Kubernetes Cluster

To run this application within a Kubernetes cluster, follow these steps:

1. Ensure your Kubernetes cluster is up and running.
2. Deploy the application with the configured environment variables.
3. The application will automatically detect its environment and use in-cluster URLs for Prometheus and Alertmanager, as sketched below.
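
The in-cluster detection keys off the `KUBERNETES_SERVICE_HOST` environment variable that Kubernetes injects into every pod. A minimal sketch of that logic, mirroring the `ServiceConfig` class introduced in this commit:

```python
import os

# Sketch of ServiceConfig's environment detection: Kubernetes injects
# KUBERNETES_SERVICE_HOST into every pod, so its presence signals that the
# in-cluster service DNS names should be used instead of localhost.
if os.getenv("KUBERNETES_SERVICE_HOST"):
    suffix = "glueops-core-kube-prometheus-stack.svc.cluster.local"
    prometheus = f"kps-prometheus.{suffix}:9090"
    alertmanager = f"kps-alertmanager.{suffix}:9093"
else:
    prometheus = "localhost:9090"
    alertmanager = "localhost:9093"

print(f"Prometheus: http://{prometheus}  Alertmanager: http://{alertmanager}")
```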

## Running Locally for Debugging

To run this application locally for debugging purposes and access Prometheus and Alertmanager, you can set up port forwarding to your cluster. Follow these steps:

1. Ensure you have `kubectl` installed and configured to communicate with your Kubernetes cluster.
2. Port-forward the Prometheus and Alertmanager services in your cluster:

```bash
# For Prometheus
kubectl port-forward svc/kps-prometheus 9090:9090 -n glueops-core-kube-prometheus-stack
# For Alertmanager
kubectl port-forward svc/kps-alertmanager 9093:9093 -n glueops-core-kube-prometheus-stack
```
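
With the port-forwards running, the same health endpoints the monitor checks in-cluster can be probed against `localhost`. A quick smoke-test sketch (the endpoints match the `/-/healthy` and `/-/ready` checks in `monitoring_script.py`; the loop itself is illustrative):

```python
import requests

# Probe the port-forwarded services locally; these are the same endpoints
# the monitoring script checks inside the cluster.
for url in (
    "http://localhost:9090/-/healthy",
    "http://localhost:9090/-/ready",
    "http://localhost:9093/-/healthy",
    "http://localhost:9093/-/ready",
):
    response = requests.get(url, timeout=5)
    print(url, response.status_code)
```
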
91 changes: 45 additions & 46 deletions monitoring_script.py
@@ -2,43 +2,22 @@
import requests
from glueops.setup_logging import configure as go_configure_logging
import time
from serviceconfig import ServiceConfig

#=== configure logging
logger = go_configure_logging(
name='GLUEOPS_CLUSTER_MONITORING',
level=os.getenv('PYTHON_LOG_LEVEL', 'INFO')
)


OPSGENIE_API_KEY = os.getenv('OPSGENIE_API_KEY')
OPSGENIE_HEARTBEAT_NAME = os.getenv('OPSGENIE_HEARTBEAT_NAME')
OPSGENIE_PING_INTERVAL_MINUTES = os.getenv('OPSGENIE_PING_INTERVAL_MINUTES',2)

CLUSTER_DEFAULT_DOMAIN_NAME = os.getenv(
'CLUSTER_DEFAULT_DOMAIN_NAME',
'cluster.local'
)
KUBE_PROMETHEUS_STACK_NAMESPACE = os.getenv(
'KUBE_PROMETHEUS_STACK_NAMESPACE',
'glueops-core-kube-prometheus-stack'
)

GLUEOPS_CORE_NAMESPACE = os.getenv(
'GLUEOPS_CORE_NAMESPACE',
'glueops-core'
)

PROMETHEUS_URL_HEALTH = f"kps-prometheus.{KUBE_PROMETHEUS_STACK_NAMESPACE}.svc.{CLUSTER_DEFAULT_DOMAIN_NAME}:9090/-/healthy"
PROMETHEUS_URL_READY = f"kps-prometheus.{KUBE_PROMETHEUS_STACK_NAMESPACE}.svc.{CLUSTER_DEFAULT_DOMAIN_NAME}:9090/-/ready"
ALERTMANAGER_URL_HEALTH = f"kps-alertmanager.{KUBE_PROMETHEUS_STACK_NAMESPACE}.svc.{CLUSTER_DEFAULT_DOMAIN_NAME}:9093/-/healthy"
ALERTMANAGER_URL_READY = f"kps-alertmanager.{KUBE_PROMETHEUS_STACK_NAMESPACE}.svc.{CLUSTER_DEFAULT_DOMAIN_NAME}:9093/-/ready"

def is_cluster_healthy():
def is_cluster_healthy(config):
return (
get_alertmanager_notifification_health_for_opsgenie() and
check_for_200_response(PROMETHEUS_URL_HEALTH) and
check_for_200_response(PROMETHEUS_URL_READY) and
check_for_200_response(ALERTMANAGER_URL_HEALTH) and
check_for_200_response(ALERTMANAGER_URL_READY)
get_alertmanager_notifification_health_for_opsgenie(config.PROMETHEUS_QUERY_URL) and
check_for_200_response(config.PROMETHEUS_URL_HEALTH) and
check_for_200_response(config.PROMETHEUS_URL_READY) and
check_for_200_response(config.ALERTMANAGER_URL_HEALTH) and
check_for_200_response(config.ALERTMANAGER_URL_READY)
)

def check_for_200_response(url):
@@ -56,11 +35,9 @@ def check_for_200_response(url):
raise


def get_alertmanager_notifification_health_for_opsgenie():
def get_alertmanager_notifification_health_for_opsgenie(prometheus_query_url):
# Prometheus query
query = 'sum(increase(alertmanager_notifications_failed_total{integration="opsgenie"}[10m]))'
# URL for the Prometheus HTTP API
url = 'http://'+ KUBE_PROMETHEUS_STACK_NAMESPACE + ':9090/api/v1/query'

# Parameters for the request
params = {
@@ -69,7 +46,7 @@ def get_alertmanager_notifification_health_for_opsgenie():

try:
# Send the request to Prometheus
response = requests.get(url, params=params)
response = requests.get(prometheus_query_url, params=params)
response.raise_for_status()

# Parse the JSON response
@@ -95,30 +72,52 @@ def get_alertmanager_notifification_health_for_opsgenie():
logger.exception(f"Error querying Prometheus: {e}")
raise

def send_opsgenie_heartbeat(heartbeat_name):
heart_eat_url = f"https://api.opsgenie.com/v2/heartbeats/{heartbeat_name}/ping"
def send_opsgenie_heartbeat(config):
url = f"https://api.opsgenie.com/v2/heartbeats/{config.OPSGENIE_HEARTBEAT_NAME}/ping"
headers = {
"Authorization": f"GenieKey {OPSGENIE_API_KEY}"
"Authorization": f"GenieKey {config.OPSGENIE_API_KEY}"
}

try:
response = requests.get(heart_eat_url, headers=headers)
response = requests.get(url, headers=headers)
response.raise_for_status()
logger.info("Pinged Opsgenie heartbeat successfully!")
logger.debug("Pinged Opsgenie heartbeat successfully!")

except requests.RequestException as e:
logger.exception(f"Failed to send Opsgenie heartbeat. Error: {e}")
raise

if __name__ == '__main__':
interval_in_seconds = OPSGENIE_PING_INTERVAL_MINUTES * 60
frequency = max(interval_in_seconds / 2, 1)
config = ServiceConfig()
interval_in_seconds = config.OPSGENIE_PING_INTERVAL_MINUTES * 60

# Check if the interval is less than 1 minute
if interval_in_seconds < 60:
try:
raise ValueError("OPSGENIE_PING_INTERVAL_MINUTES must be 1 minute or greater.")
except Exception as e:
logger.exception(str(e))
raise


# The frequency is half the interval but not less than 1 minute
frequency = max(interval_in_seconds / 2, 60)
execution_count = 0

while True:
time.sleep(frequency)
if is_cluster_healthy():
logger.info("Checks: STARTED")
#send_opsgenie_heartbeat(OPSGENIE_HEARTBEAT_NAME)
logger.info("Checks: PASSED")
if execution_count < 2:
if is_cluster_healthy(config):
logger.info("Checks: STARTED")
#send_opsgenie_heartbeat(config.OPSGENIE_HEARTBEAT_NAME)
logger.info("Checks: PASSED")
else:
logger.error(f"One or more health checks failed. Heartbeat for {config.OPSGENIE_HEARTBEAT_NAME} was not sent")
logger.info("Waiting 5mins before checking again")
time.sleep(60*5)
execution_count += 1
else:
logger.error(f"One or more health checks failed. Heartbeat for {OPSGENIE_HEARTBEAT_NAME} was not sent")
# Reset the count and sleep for the full interval before checking again
execution_count = 0
time.sleep(interval_in_seconds - 2 * frequency)
time.sleep(frequency)
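
The hunks collapsed above hide how the Prometheus response is actually interpreted. As an editorial sketch only (the real parsing lives in the collapsed lines and may differ), a check on the `alertmanager_notifications_failed_total` query could look like this:

```python
import requests

def opsgenie_notifications_healthy(prometheus_query_url):
    # Sketch: ask Prometheus how much the Opsgenie failure counter increased
    # over the last 10 minutes and treat any non-zero value as unhealthy.
    query = 'sum(increase(alertmanager_notifications_failed_total{integration="opsgenie"}[10m]))'
    response = requests.get(prometheus_query_url, params={"query": query}, timeout=10)
    response.raise_for_status()
    result = response.json()["data"]["result"]
    # An empty result means the counter never moved in the window.
    if not result:
        return True
    return float(result[0]["value"][1]) == 0
```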

34 changes: 34 additions & 0 deletions serviceconfig.py
@@ -0,0 +1,34 @@
import os

class ServiceConfig:
def __init__(self):
if os.getenv('KUBERNETES_SERVICE_HOST'):
print("Setting up for Kubernetes environment.")
self._setup_kubernetes_config()
else:
print("Setting up for local environment.")
self._setup_local_config()

# New environment variable settings
self.OPSGENIE_API_KEY = os.getenv('OPSGENIE_API_KEY')
self.OPSGENIE_HEARTBEAT_NAME = os.getenv('OPSGENIE_HEARTBEAT_NAME')
self.OPSGENIE_PING_INTERVAL_MINUTES = int(os.getenv('OPSGENIE_PING_INTERVAL_MINUTES', 3))


def _setup_kubernetes_config(self):
suffix = "glueops-core-kube-prometheus-stack.svc.cluster.local"
self.prometheus = f"kps-prometheus.{suffix}:9090"
self.alertmanager = f"kps-alertmanager.{suffix}:9093"
self._set_urls()

def _setup_local_config(self):
self.prometheus = "localhost:9090"
self.alertmanager = "localhost:9093"
self._set_urls()

def _set_urls(self):
self.PROMETHEUS_URL_HEALTH = f"http://{self.prometheus}/-/healthy"
self.ALERTMANAGER_URL_HEALTH = f"http://{self.alertmanager}/-/healthy"
self.PROMETHEUS_URL_READY = f"http://{self.prometheus}/-/ready"
self.ALERTMANAGER_URL_READY = f"http://{self.alertmanager}/-/ready"
self.PROMETHEUS_QUERY_URL = f"http://{self.prometheus}/api/v1/query"
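
For completeness, a short usage sketch of the class above. Run outside a cluster (no `KUBERNETES_SERVICE_HOST`), the local branch is taken and the resolved URLs look like this; inside a pod the same attributes carry the in-cluster service DNS names instead:

```python
from serviceconfig import ServiceConfig

config = ServiceConfig()  # prints "Setting up for local environment." when run locally
print(config.PROMETHEUS_URL_HEALTH)   # http://localhost:9090/-/healthy
print(config.ALERTMANAGER_URL_READY)  # http://localhost:9093/-/ready
print(config.PROMETHEUS_QUERY_URL)    # http://localhost:9090/api/v1/query
```
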
83 changes: 0 additions & 83 deletions test_monitoring_script.py

This file was deleted.
