bug-1906108: document statsd metrics
This adds a statsd_metrics.yaml file that registers all the statsd
metrics that Antenna emits.

This adds a document_metrics.py Sphinx extension for autogenerating
metrics documentation.

While doing this, I removed the statsd bits from HealthState since we don't
use them anymore.
willkg committed Aug 10, 2024
1 parent e04b56e commit f29fa97
Showing 15 changed files with 312 additions and 50 deletions.
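The document_metrics.py Sphinx extension mentioned in the commit message is not among the files shown below. As a rough, hypothetical sketch of what such a directive could look like — the directive name, argument handling, and rendering are assumptions for illustration; only the key/type/description layout comes from statsd_metrics.yaml in this commit:

# Hypothetical sketch only -- not the document_metrics.py added by this
# commit. Assumes a "document-metrics" directive that takes the path to the
# metrics YAML file and renders each entry as a definition list item.
from pathlib import Path

import yaml
from docutils import nodes
from sphinx.util.docutils import SphinxDirective


class DocumentMetricsDirective(SphinxDirective):
    """Render registered statsd metrics as a definition list."""

    required_arguments = 1  # path to the YAML file, relative to the Sphinx srcdir

    def run(self):
        path = Path(self.env.srcdir) / self.arguments[0]
        metrics = yaml.safe_load(path.read_text())

        dlist = nodes.definition_list()
        for key, info in sorted(metrics.items()):
            item = nodes.definition_list_item()
            item += nodes.term(text=f"{key} ({info['type']})")
            definition = nodes.definition()
            definition += nodes.paragraph(text=info["description"].strip())
            item += definition
            dlist += item
        return [dlist]


def setup(app):
    app.add_directive("document-metrics", DocumentMetricsDirective)
    return {"parallel_read_safe": True}

Such a directive would then be invoked from a docs page as ".. document-metrics:: ../antenna/statsd_metrics.yaml"; again, the real extension's name and options may differ.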
2 changes: 1 addition & 1 deletion .editorconfig
@@ -10,7 +10,7 @@ insert_final_newline = true
charset = utf-8
end_of_line = lf

-[*.yml]
+[*.{yml,yaml}]
indent_size = 2

[LICENSE]
8 changes: 4 additions & 4 deletions antenna/app.py
@@ -37,8 +37,8 @@
VersionResource,
)
from antenna.libdockerflow import get_release_name
-from antenna.liblogging import setup_logging, log_config
-from antenna.libmarkus import setup_metrics, METRICS
+from antenna.liblogging import set_up_logging, log_config
+from antenna.libmarkus import set_up_metrics, METRICS


LOGGER = logging.getLogger(__name__)
@@ -207,7 +207,7 @@ def setup(self):
log_config(LOGGER, self.config_manager, self)

# Set up metrics
-setup_metrics(
+set_up_metrics(
statsd_host=self.config("statsd_host"),
statsd_port=self.config("statsd_port"),
hostname=self.config("hostname"),
@@ -278,7 +278,7 @@ def get_app(config_manager=None):

# Set up logging and sentry first, so we have something to log to. Then
# build and log everything else.
-setup_logging(
+set_up_logging(
logging_level=app_config("logging_level"),
debug=app_config("local_dev_env"),
host_id=app_config("hostname"),
2 changes: 1 addition & 1 deletion antenna/crashmover.py
@@ -17,7 +17,7 @@
def _incr_wait_generator(counter, attempts, sleep_seconds):
def _generator_generator():
for _ in range(attempts - 1):
-METRICS.incr(f"collector.crashmover.{counter}")
+METRICS.incr("collector.crashmover.retry_count", tags=[f"count:{counter}"])
yield sleep_seconds

return _generator_generator
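The crashmover hunk above folds the per-counter retry metrics into a single collector.crashmover.retry_count counter tagged with the counter name. For readers unfamiliar with the wait-generator pattern used by _incr_wait_generator, here is a hypothetical consumer; retry() is a made-up helper for illustration, not Antenna's actual retry API.

# Hypothetical consumer of the wait generator built above; "retry" is a
# made-up helper for illustration, not Antenna's API.
import time


def retry(wait_time_generator, fun):
    """Call fun(), sleeping between failed attempts."""
    for sleep_seconds in wait_time_generator():
        try:
            return fun()
        except Exception:
            time.sleep(sleep_seconds)
    # The generator yields attempts - 1 sleep values, so this is the final,
    # un-slept attempt.
    return fun()

Because the metric is incremented inside the generator, retry_count only grows when a retry actually happens.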
13 changes: 1 addition & 12 deletions antenna/health_resource.py
@@ -52,13 +52,6 @@ class HealthState:

def __init__(self):
self.errors = []
-self.statsd = {}

-def add_statsd(self, name, key, value):
-"""Add a key -> value gauge."""
-if not isinstance(name, str):
-name = name.__class__.__name__
-self.statsd["%s.%s" % (name, key)] = value

def add_error(self, name, msg):
"""Add an error."""
@@ -73,7 +66,7 @@ def is_healthy(self):

def to_dict(self):
"""Convert state to a dict."""
-return OrderedDict([("errors", self.errors), ("info", self.statsd)])
+return OrderedDict([("errors", self.errors)])


class HeartbeatResource:
@@ -95,10 +88,6 @@ def on_get(self, req, resp):
if hasattr(resource, "check_health"):
resource.check_health(state)

-# Go through and call gauge for each statsd item.
-for key, value in state.statsd.items():
-METRICS.gauge(f"collector.health.{key}", value=value)

if state.is_healthy():
resp.status = falcon.HTTP_200
else:
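With the statsd bits gone, HealthState only carries errors, and resources participate in the heartbeat check solely through check_health() and add_error(). A hypothetical resource implementing that protocol might look like this; QueueResource and queue_is_reachable() are made-up names, and only the hook names come from the code above.

# Hypothetical resource implementing the check_health() protocol used by
# HeartbeatResource above; the class and helper names are made up.
class QueueResource:
    def queue_is_reachable(self):
        ...  # placeholder for a real connectivity check

    def check_health(self, state):
        # "state" is a HealthState; add_error() is the only hook left now
        # that add_statsd() has been removed.
        if not self.queue_is_reachable():
            state.add_error("QueueResource", "queue is unreachable")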
10 changes: 5 additions & 5 deletions antenna/liblogging.py
@@ -17,12 +17,12 @@
)


-_IS_LOGGING_SETUP = False
+_IS_LOGGING_SET_UP = False

LOGGER = logging.getLogger(__name__)


-def setup_logging(logging_level, debug=False, host_id=None, processname=None):
+def set_up_logging(logging_level, debug=False, host_id=None, processname=None):
"""Initialize Python logging configuration.
Note: This only sets up logging once per process. Additional calls will get ignored.
@@ -33,8 +33,8 @@ def setup_logging(logging_level, debug=False, host_id=None, processname=None):
:arg processname: the process name to log
"""
-global _IS_LOGGING_SETUP
-if _IS_LOGGING_SETUP:
+global _IS_LOGGING_SET_UP
+if _IS_LOGGING_SET_UP:
return

host_id = host_id or socket.gethostname()
@@ -100,7 +100,7 @@ def filter(self, record):
f"set up logging logging_level={logging_level} debug={debug} "
+ f"host_id={host_id} processname={processname}"
)
-_IS_LOGGING_SETUP = True
+_IS_LOGGING_SET_UP = True


def traverse_tree(instance, namespace=None):
33 changes: 27 additions & 6 deletions antenna/libmarkus.py
@@ -5,18 +5,33 @@
"""Holds Everett-configurable shims for Markus metrics backends."""

import logging
+from pathlib import Path

import markus
-from markus.filters import AddTagFilter
+from markus.filters import AddTagFilter, RegisteredMetricsFilter
+import yaml


-_IS_MARKUS_SETUP = False
+_IS_MARKUS_SET_UP = False

LOGGER = logging.getLogger(__name__)
METRICS = markus.get_metrics("socorro")


-def setup_metrics(statsd_host, statsd_port, hostname, debug=False):
+# Complete index of all metrics. This is used in documentation and to filter outgoing
+# metrics.
+def _load_registered_metrics():
+# Load the metrics yaml file in this directory
+path = Path(__file__).parent / "statsd_metrics.yaml"
+with open(path) as fp:
+data = yaml.safe_load(fp)
+return data
+
+
+STATSD_METRICS = _load_registered_metrics()
+
+
+def set_up_metrics(statsd_host, statsd_port, hostname, debug=False):
"""Initialize and configures the metrics system.
:arg statsd_host: the statsd host to send metrics to
@@ -25,8 +40,8 @@ def setup_metrics(statsd_host, statsd_port, hostname, debug=False):
:arg debug: whether or not to additionally log metrics to the logger
"""
-global _IS_MARKUS_SETUP, METRICS
-if _IS_MARKUS_SETUP:
+global _IS_MARKUS_SET_UP, METRICS
+if _IS_MARKUS_SET_UP:
return

markus_backends = [
@@ -48,10 +63,16 @@
},
}
)
+# In local dev and test environments, we want the RegisteredMetricsFilter to
+# raise exceptions when metrics are emitted but not documented.
+metrics_filter = RegisteredMetricsFilter(
+registered_metrics=STATSD_METRICS, raise_error=True
+)
+METRICS.filters.append(metrics_filter)

if hostname:
METRICS.filters.append(AddTagFilter(f"host:{hostname}"))

markus.configure(markus_backends)

-_IS_MARKUS_SETUP = True
+_IS_MARKUS_SET_UP = True
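Because METRICS comes from markus.get_metrics("socorro"), every key emitted by the code is prefixed with "socorro." and must then match an entry in statsd_metrics.yaml to get past the RegisteredMetricsFilter. A rough illustration of the intended behavior with debug enabled (the exact exception markus raises for an unregistered key is not shown in this diff):

# Rough illustration only; the exception type and exact filter behavior are
# not shown in this diff.
from antenna.libmarkus import METRICS, set_up_metrics

set_up_metrics(statsd_host="localhost", statsd_port=8125, hostname="", debug=True)

# Registered as "socorro.collector.breakpad_resource.incoming_crash" in
# statsd_metrics.yaml, so this passes the filter.
METRICS.incr("collector.breakpad_resource.incoming_crash")

# Not registered, so with raise_error=True this is expected to raise,
# forcing the metric to be documented in statsd_metrics.yaml first.
METRICS.incr("collector.breakpad_resource.not_documented_yet")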
135 changes: 135 additions & 0 deletions antenna/statsd_metrics.yaml
@@ -0,0 +1,135 @@
# statsd metrics emitted using Markus.
#
# When adding a new metric, make sure to add it here first.
---

socorro.collector.sentry_scrub_error:
type: "incr"
description: |
Emitted every time there was an error in the Sentry event scrubbing code.
socorro.collector.breakpad_resource.gzipped_crash:
type: "incr"
description: |
Counter for crash report payloads submitted that were compressed.
socorro.collector.breakpad_resource.gzipped_crash_decompress:
type: "histogram"
description: |
Timer for how long it takes to decompress a compressed crash report
payload.
Tags:
* ``result``: ``success`` or ``fail`` depending on whether there
was an error when decompressing
socorro.collector.breakpad_resource.crash_size:
type: "histogram"
description: |
Histogram for crash report payload size.
Tags:
* ``payload``: ``compressed`` or ``uncompressed``
socorro.collector.breakpad_resource.on_post.time:
type: "timing"
description: |
Timer for how long it takes to handle a crash report HTTP POST request.
socorro.collector.breakpad_resource.malformed:
type: "incr"
description: |
Counter for how many malformed crash report payloads have been submitted.
Tags:
* ``reason``: a short string specifying how the crash report payload was
malformed.
socorro.collector.breakpad_resource.incoming_crash:
type: "incr"
description: |
Counter for number of well-formed crash reports submitted.
socorro.collector.breakpad_resource.throttle_rule:
type: "incr"
description: |
Counter for which throttle rule dictated how the crash report was directed.
Tags:
* ``rule``: a short string indicating the rule used
socorro.collector.breakpad_resource.throttle:
type: "incr"
description: |
Counter for the throttle result.
Tags:
* ``result``: ``accept``, ``defer``, ``reject``, ``fakeaccept``, or
``continue``
socorro.collector.crashmover.retry_count:
type: "incr"
description: |
Counter for retry attempts for the crashmover operations.
Tags:
* ``count``: the retry count
socorro.collector.crashmover.crash_handling.time:
type: "timing"
description: |
Timer for how long it takes to store the crash report data and publish for
processing.
socorro.collector.crashmover.save_crash_dropped.count:
type: "incr"
description: |
Counter for how many crash reports couldn't be saved to storage because
of errors.
socorro.collector.crashmover.save_crash.count:
type: "incr"
description: |
Counter for how many crash reports were saved and published for processing.
socorro.collector.crashmover.publish_crash_dropped.count:
type: "incr"
description: |
Counter for how many crash reports were saved, but were not published for
processing because of errors.
socorro.collector.crashmover.crash_save.time:
type: "timing"
description: |
Timer for how long it takes to save a crash report to storage.
socorro.collector.crashmover.crash_publish.time:
type: "timing"
description: |
Timer for how long it takes to publish a crash report for processing.
socorro.collector.health.broken.count:
type: "incr"
description: |
Counter for ``/__broken__`` view.
socorro.collector.health.version.count:
type: "incr"
description: |
Counter for ``/__version__`` view.
socorro.collector.health.lbheartbeat.count:
type: "incr"
description: |
Counter for ``/__lbheartbeat__`` view.
socorro.collector.health.heartbeat.count:
type: "incr"
description: |
Counter for ``/__heartbeat__`` view.
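Each key in this file is the full statsd name as it leaves the collector: the "socorro." prefix comes from markus.get_metrics("socorro") in antenna/libmarkus.py, and the emitting code supplies the rest of the key plus any tags. A hypothetical call site matching the crash_size entry above (the real emitting code in the breakpad resource is not part of this diff):

# Hypothetical call site for "socorro.collector.breakpad_resource.crash_size";
# the real emitting code is not part of this diff.
from antenna.libmarkus import METRICS

crash_payload = b"..."  # made-up placeholder for a raw crash report POST body
METRICS.histogram(
    "collector.breakpad_resource.crash_size",
    value=len(crash_payload),
    tags=["payload:uncompressed"],
)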
2 changes: 1 addition & 1 deletion bin/run_lint.sh
@@ -12,7 +12,7 @@

set -euo pipefail

FILES="antenna bin testlib tests systemtest"
FILES="antenna bin docs testlib tests systemtest"
PYTHON_VERSION=$(python --version)


