Commit: fixes

venkatamutyala authored Nov 9, 2023
1 parent b5c9125 commit a725040

Showing 5 changed files with 109 additions and 165 deletions.
25 changes: 3 additions & 22 deletions Dockerfile
@@ -1,29 +1,10 @@
# --- Stage 1: Testing ---
# Use the same base image for consistency
FROM python:3.11.6-alpine3.18 as tester

# Set working directory
WORKDIR /app

# Install dependencies
COPY requirements.txt /app/
RUN pip install --upgrade pip && pip install -r requirements.txt && pip install pytest

# Copy your application code and test files
COPY monitoring_script.py /app/
COPY test_monitoring_script.py /app/

# Run tests
RUN python -u -m pytest -v test_monitoring_script.py

# --- Stage 2: Final Image ---
FROM python:3.11.6-alpine3.18 as final

WORKDIR /app
COPY --from=tester /app/requirements.txt /app/
COPY requirements.txt /app/
RUN pip install --upgrade pip && pip install -r requirements.txt

# Copy only the necessary files from the tester stage
COPY --from=tester /app/monitoring_script.py /app/
COPY monitoring_script.py /app/
COPY serviceconfig.py /app/

CMD [ "python", "-u", "monitoring_script.py" ]
41 changes: 27 additions & 14 deletions README.md
@@ -1,19 +1,32 @@
# Cluster Monitoring
# GLUEOPS CLUSTER MONITORING

This repo contains a script that sends a ping to Opsgenie's Heartbeat every 5 minutes. If the cluster is down, Opsgenie will not get a ping and will send a mail to inform team members of the cluster's failed state.
This application is designed for monitoring a Kubernetes cluster with the Prometheus and Alertmanager components from the Kubernetes Prometheus Stack (KPS).

The script is deployed into the ArgoCD cluster under monitoring. Once this cluster is down, pings will not be sent to Opsgenie, triggering an alert which is sent to concerned team members.
## Configuration

## Running the script
Before running the application, make sure to configure the following environment variables:

- Create a ```.env``` file, with the following contents
```bash
OPSGENIE_API_KEY=<some-value>
HEARTBEAT_NAME=<some-value>
PING_SLEEP=<some-value>
```
- `OPSGENIE_API_KEY`: Your Opsgenie API key for sending heartbeat notifications.
- `OPSGENIE_HEARTBEAT_NAME`: The name of the Opsgenie heartbeat to ping.
- `OPSGENIE_PING_INTERVAL_MINUTES`: The interval (in minutes) between pings of the Opsgenie heartbeat (default: 3 minutes; must be at least 1 minute).

- Running the script
```bash
$ docker run --env-file .env ghcr.io/glueops/cluster-monitoring:main
```
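
The variables above are read straight from the process environment. As a minimal sketch of how they might be loaded and validated — the `load_config` helper here is illustrative only; in this commit the equivalent logic lives in `ServiceConfig` and the startup check in `monitoring_script.py`:

```python
import os

def load_config():
    # Illustrative helper, not part of the repo: read the documented variables.
    # The interval check mirrors the startup validation in monitoring_script.py;
    # the presence checks on the Opsgenie values are an extra safeguard.
    api_key = os.getenv("OPSGENIE_API_KEY")
    heartbeat_name = os.getenv("OPSGENIE_HEARTBEAT_NAME")
    interval_minutes = int(os.getenv("OPSGENIE_PING_INTERVAL_MINUTES", 3))

    if not api_key or not heartbeat_name:
        raise ValueError("OPSGENIE_API_KEY and OPSGENIE_HEARTBEAT_NAME must be set.")
    if interval_minutes < 1:
        raise ValueError("OPSGENIE_PING_INTERVAL_MINUTES must be 1 minute or greater.")

    return api_key, heartbeat_name, interval_minutes
```
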
## Running in a Kubernetes Cluster

To run this application within a Kubernetes cluster, follow these steps:

1. Ensure your Kubernetes cluster is up and running.
2. Deploy the application with the configured environment variables.
3. The application will automatically detect its environment and use in-cluster URLs for Prometheus and Alertmanager, as sketched below.
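
The in-cluster detection keys off the `KUBERNETES_SERVICE_HOST` environment variable that Kubernetes injects into every pod. A minimal sketch of that logic, mirroring the `ServiceConfig` class introduced in this commit:

```python
import os

# Sketch of ServiceConfig's environment detection: Kubernetes injects
# KUBERNETES_SERVICE_HOST into every pod, so its presence signals that the
# in-cluster service DNS names should be used instead of localhost.
if os.getenv("KUBERNETES_SERVICE_HOST"):
    suffix = "glueops-core-kube-prometheus-stack.svc.cluster.local"
    prometheus = f"kps-prometheus.{suffix}:9090"
    alertmanager = f"kps-alertmanager.{suffix}:9093"
else:
    prometheus = "localhost:9090"
    alertmanager = "localhost:9093"

print(f"Prometheus: http://{prometheus}  Alertmanager: http://{alertmanager}")
```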

## Running Locally for Debugging

To run this application locally for debugging purposes and access Prometheus and Alertmanager, you can set up port forwarding to your cluster. Follow these steps:

1. Ensure you have `kubectl` installed and configured to communicate with your Kubernetes cluster.
2. Port-forward the Prometheus and Alertmanager services in your cluster:

```bash
# For Prometheus
kubectl port-forward svc/kps-prometheus 9090:9090 -n glueops-core-kube-prometheus-stack
# For Alertmanager
kubectl port-forward svc/kps-alertmanager 9093:9093 -n glueops-core-kube-prometheus-stack
```
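
With the port-forwards running, the same health endpoints the monitor checks in-cluster can be probed against `localhost`. A quick smoke-test sketch (the endpoints match the `/-/healthy` and `/-/ready` checks in `monitoring_script.py`; the loop itself is illustrative):

```python
import requests

# Probe the port-forwarded services locally; these are the same endpoints
# the monitoring script checks inside the cluster.
for url in (
    "http://localhost:9090/-/healthy",
    "http://localhost:9090/-/ready",
    "http://localhost:9093/-/healthy",
    "http://localhost:9093/-/ready",
):
    response = requests.get(url, timeout=5)
    print(url, response.status_code)
```
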
91 changes: 45 additions & 46 deletions monitoring_script.py
@@ -2,43 +2,22 @@
import requests
from glueops.setup_logging import configure as go_configure_logging
import time
from serviceconfig import ServiceConfig

#=== configure logging
logger = go_configure_logging(
name='GLUEOPS_CLUSTER_MONITORING',
level=os.getenv('PYTHON_LOG_LEVEL', 'INFO')
)


OPSGENIE_API_KEY = os.getenv('OPSGENIE_API_KEY')
OPSGENIE_HEARTBEAT_NAME = os.getenv('OPSGENIE_HEARTBEAT_NAME')
OPSGENIE_PING_INTERVAL_MINUTES = os.getenv('OPSGENIE_PING_INTERVAL_MINUTES',2)

CLUSTER_DEFAULT_DOMAIN_NAME = os.getenv(
'CLUSTER_DEFAULT_DOMAIN_NAME',
'cluster.local'
)
KUBE_PROMETHEUS_STACK_NAMESPACE = os.getenv(
'KUBE_PROMETHEUS_STACK_NAMESPACE',
'glueops-core-kube-prometheus-stack'
)

GLUEOPS_CORE_NAMESPACE = os.getenv(
'GLUEOPS_CORE_NAMESPACE',
'glueops-core'
)

PROMETHEUS_URL_HEALTH = f"kps-prometheus.{KUBE_PROMETHEUS_STACK_NAMESPACE}.svc.{CLUSTER_DEFAULT_DOMAIN_NAME}:9090/-/healthy"
PROMETHEUS_URL_READY = f"kps-prometheus.{KUBE_PROMETHEUS_STACK_NAMESPACE}.svc.{CLUSTER_DEFAULT_DOMAIN_NAME}:9090/-/ready"
ALERTMANAGER_URL_HEALTH = f"kps-alertmanager.{KUBE_PROMETHEUS_STACK_NAMESPACE}.svc.{CLUSTER_DEFAULT_DOMAIN_NAME}:9093/-/healthy"
ALERTMANAGER_URL_READY = f"kps-alertmanager.{KUBE_PROMETHEUS_STACK_NAMESPACE}.svc.{CLUSTER_DEFAULT_DOMAIN_NAME}:9093/-/ready"

def is_cluster_healthy():
def is_cluster_healthy(config):
return (
get_alertmanager_notifification_health_for_opsgenie() and
check_for_200_response(PROMETHEUS_URL_HEALTH) and
check_for_200_response(PROMETHEUS_URL_READY) and
check_for_200_response(ALERTMANAGER_URL_HEALTH) and
check_for_200_response(ALERTMANAGER_URL_READY)
get_alertmanager_notifification_health_for_opsgenie(config.PROMETHEUS_QUERY_URL) and
check_for_200_response(config.PROMETHEUS_URL_HEALTH) and
check_for_200_response(config.PROMETHEUS_URL_READY) and
check_for_200_response(config.ALERTMANAGER_URL_HEALTH) and
check_for_200_response(config.ALERTMANAGER_URL_READY)
)

def check_for_200_response(url):
@@ -56,11 +35,9 @@ def check_for_200_response(url):
raise


def get_alertmanager_notifification_health_for_opsgenie():
def get_alertmanager_notifification_health_for_opsgenie(prometheus_query_url):
# Prometheus query
query = 'sum(increase(alertmanager_notifications_failed_total{integration="opsgenie"}[10m]))'
# URL for the Prometheus HTTP API
url = 'http://'+ KUBE_PROMETHEUS_STACK_NAMESPACE + ':9090/api/v1/query'

# Parameters for the request
params = {
@@ -69,7 +46,7 @@ def get_alertmanager_notifification_health_for_opsgenie():

try:
# Send the request to Prometheus
response = requests.get(url, params=params)
response = requests.get(prometheus_query_url, params=params)
response.raise_for_status()

# Parse the JSON response
@@ -95,30 +72,52 @@ def get_alertmanager_notifification_health_for_opsgenie():
logger.exception(f"Error querying Prometheus: {e}")
raise

def send_opsgenie_heartbeat(heartbeat_name):
heart_eat_url = f"https://api.opsgenie.com/v2/heartbeats/{heartbeat_name}/ping"
def send_opsgenie_heartbeat(config):
url = f"https://api.opsgenie.com/v2/heartbeats/{config.OPSGENIE_HEARTBEAT_NAME}/ping"
headers = {
"Authorization": f"GenieKey {OPSGENIE_API_KEY}"
"Authorization": f"GenieKey {config.OPSGENIE_API_KEY}"
}

try:
response = requests.get(heart_eat_url, headers=headers)
response = requests.get(url, headers=headers)
response.raise_for_status()
logger.info("Pinged Opsgenie heartbeat successfully!")
logger.debug("Pinged Opsgenie heartbeat successfully!")

except requests.RequestException as e:
logger.exception(f"Failed to send Opsgenie heartbeat. Error: {e}")
raise

if __name__ == '__main__':
interval_in_seconds = OPSGENIE_PING_INTERVAL_MINUTES * 60
frequency = max(interval_in_seconds / 2, 1)
config = ServiceConfig()
interval_in_seconds = config.OPSGENIE_PING_INTERVAL_MINUTES * 60

# Check if the interval is less than 1 minute
if interval_in_seconds < 60:
try:
raise ValueError("OPSGENIE_PING_INTERVAL_MINUTES must be 1 minute or greater.")
except Exception as e:
logger.exception(str(e))
raise


# The frequency is half the interval but not less than 1 minute
frequency = max(interval_in_seconds / 2, 60)
execution_count = 0

while True:
time.sleep(frequency)
if is_cluster_healthy():
logger.info("Checks: STARTED")
#send_opsgenie_heartbeat(OPSGENIE_HEARTBEAT_NAME)
logger.info("Checks: PASSED")
if execution_count < 2:
if is_cluster_healthy(config):
logger.info("Checks: STARTED")
#send_opsgenie_heartbeat(config.OPSGENIE_HEARTBEAT_NAME)
logger.info("Checks: PASSED")
else:
logger.error(f"One or more health checks failed. Heartbeat for {config.OPSGENIE_HEARTBEAT_NAME} was not sent")
logger.info("Waiting 5mins before checking again")
time.sleep(60*5)
execution_count += 1
else:
logger.error(f"One or more health checks failed. Heartbeat for {OPSGENIE_HEARTBEAT_NAME} was not sent")
# Reset the count and sleep for the full interval before checking again
execution_count = 0
time.sleep(interval_in_seconds - 2 * frequency)
time.sleep(frequency)
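
The hunks collapsed above hide how the Prometheus response is actually interpreted. As an editorial sketch only (the real parsing lives in the collapsed lines and may differ), a check on the `alertmanager_notifications_failed_total` query could look like this:

```python
import requests

def opsgenie_notifications_healthy(prometheus_query_url):
    # Sketch: ask Prometheus how much the Opsgenie failure counter increased
    # over the last 10 minutes and treat any non-zero value as unhealthy.
    query = 'sum(increase(alertmanager_notifications_failed_total{integration="opsgenie"}[10m]))'
    response = requests.get(prometheus_query_url, params={"query": query}, timeout=10)
    response.raise_for_status()
    result = response.json()["data"]["result"]
    # An empty result means the counter never moved in the window.
    if not result:
        return True
    return float(result[0]["value"][1]) == 0
```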

34 changes: 34 additions & 0 deletions serviceconfig.py
@@ -0,0 +1,34 @@
import os

class ServiceConfig:
def __init__(self):
if os.getenv('KUBERNETES_SERVICE_HOST'):
print("Setting up for Kubernetes environment.")
self._setup_kubernetes_config()
else:
print("Setting up for local environment.")
self._setup_local_config()

# New environment variable settings
self.OPSGENIE_API_KEY = os.getenv('OPSGENIE_API_KEY')
self.OPSGENIE_HEARTBEAT_NAME = os.getenv('OPSGENIE_HEARTBEAT_NAME')
self.OPSGENIE_PING_INTERVAL_MINUTES = int(os.getenv('OPSGENIE_PING_INTERVAL_MINUTES', 3))


def _setup_kubernetes_config(self):
suffix = "glueops-core-kube-prometheus-stack.svc.cluster.local"
self.prometheus = f"kps-prometheus.{suffix}:9090"
self.alertmanager = f"kps-alertmanager.{suffix}:9093"
self._set_urls()

def _setup_local_config(self):
self.prometheus = "localhost:9090"
self.alertmanager = "localhost:9093"
self._set_urls()

def _set_urls(self):
self.PROMETHEUS_URL_HEALTH = f"http://{self.prometheus}/-/healthy"
self.ALERTMANAGER_URL_HEALTH = f"http://{self.alertmanager}/-/healthy"
self.PROMETHEUS_URL_READY = f"http://{self.prometheus}/-/ready"
self.ALERTMANAGER_URL_READY = f"http://{self.alertmanager}/-/ready"
self.PROMETHEUS_QUERY_URL = f"http://{self.prometheus}/api/v1/query"
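
For completeness, a short usage sketch of the class above. Run outside a cluster (no `KUBERNETES_SERVICE_HOST`), the local branch is taken and the resolved URLs look like this; inside a pod the same attributes carry the in-cluster service DNS names instead:

```python
from serviceconfig import ServiceConfig

config = ServiceConfig()  # prints "Setting up for local environment." when run locally
print(config.PROMETHEUS_URL_HEALTH)   # http://localhost:9090/-/healthy
print(config.ALERTMANAGER_URL_READY)  # http://localhost:9093/-/ready
print(config.PROMETHEUS_QUERY_URL)    # http://localhost:9090/api/v1/query
```
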
83 changes: 0 additions & 83 deletions test_monitoring_script.py

This file was deleted.
