bug-1906108: upgrade markus to 5.0.0 and document metrics #1052

Merged 5 commits on Aug 21, 2024
2 changes: 1 addition & 1 deletion .editorconfig
@@ -10,7 +10,7 @@ insert_final_newline = true
charset = utf-8
end_of_line = lf

[*.yml]
[*.{yml,yaml}]
indent_size = 2

[LICENSE]
10 changes: 5 additions & 5 deletions antenna/app.py
@@ -37,8 +37,8 @@
    VersionResource,
)
from antenna.libdockerflow import get_release_name
from antenna.liblogging import setup_logging, log_config
from antenna.libmarkus import setup_metrics, METRICS
from antenna.liblogging import set_up_logging, log_config
from antenna.libmarkus import set_up_metrics, METRICS


LOGGER = logging.getLogger(__name__)
@@ -57,7 +57,7 @@


def count_sentry_scrub_error(msg):
    METRICS.incr("collector.sentry_scrub_error", value=1)
    METRICS.incr("sentry_scrub_error", value=1, tags=["service:collector"])


def configure_sentry(app_config):
@@ -207,7 +207,7 @@ def setup(self):
        log_config(LOGGER, self.config_manager, self)

        # Set up metrics
        setup_metrics(
        set_up_metrics(
            statsd_host=self.config("statsd_host"),
            statsd_port=self.config("statsd_port"),
            hostname=self.config("hostname"),
@@ -278,7 +278,7 @@ def get_app(config_manager=None):

    # Set up logging and sentry first, so we have something to log to. Then
    # build and log everything else.
    setup_logging(
    set_up_logging(
        logging_level=app_config("logging_level"),
        debug=app_config("local_dev_env"),
        host_id=app_config("hostname"),
2 changes: 1 addition & 1 deletion antenna/crashmover.py
@@ -17,7 +17,7 @@
def _incr_wait_generator(counter, attempts, sleep_seconds):
    def _generator_generator():
        for _ in range(attempts - 1):
            METRICS.incr(f"collector.crashmover.{counter}")
            METRICS.incr("collector.crashmover.retry_count", tags=[f"count:{counter}"])
            yield sleep_seconds

    return _generator_generator
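The change above collapses the per-operation retry counters into a single `collector.crashmover.retry_count` metric distinguished by a `count` tag. A minimal, self-contained sketch of how the factory behaves; the operation name, attempt count, sleep value, and the standalone usage loop are illustrative and not taken from the PR:

```python
import time

import markus

# Log metrics locally instead of sending them to statsd.
markus.configure([{"class": "markus.backends.logging.LoggingMetrics"}])
METRICS = markus.get_metrics("socorro")


def _incr_wait_generator(counter, attempts, sleep_seconds):
    def _generator_generator():
        for _ in range(attempts - 1):
            # One shared counter name; the wrapped operation becomes a tag.
            METRICS.incr("collector.crashmover.retry_count", tags=[f"count:{counter}"])
            yield sleep_seconds

    return _generator_generator


# Hypothetical usage: each retry of a "crash_save" operation emits
# collector.crashmover.retry_count with tags=["count:crash_save"].
wait_times = _incr_wait_generator("crash_save", attempts=3, sleep_seconds=1)()
for pause in wait_times:
    time.sleep(pause)
```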
13 changes: 1 addition & 12 deletions antenna/health_resource.py
@@ -52,13 +52,6 @@ class HealthState:

    def __init__(self):
        self.errors = []
        self.statsd = {}

    def add_statsd(self, name, key, value):
        """Add a key -> value gauge."""
        if not isinstance(name, str):
            name = name.__class__.__name__
        self.statsd["%s.%s" % (name, key)] = value

    def add_error(self, name, msg):
        """Add an error."""
@@ -73,7 +66,7 @@ def is_healthy(self):

    def to_dict(self):
        """Convert state to a dict."""
        return OrderedDict([("errors", self.errors), ("info", self.statsd)])
        return OrderedDict([("errors", self.errors)])


class HeartbeatResource:
@@ -95,10 +88,6 @@ def on_get(self, req, resp):
            if hasattr(resource, "check_health"):
                resource.check_health(state)

        # Go through and call gauge for each statsd item.
        for key, value in state.statsd.items():
            METRICS.gauge(f"collector.health.{key}", value=value)

        if state.is_healthy():
            resp.status = falcon.HTTP_200
        else:
10 changes: 5 additions & 5 deletions antenna/liblogging.py
@@ -17,12 +17,12 @@
)


_IS_LOGGING_SETUP = False
_IS_LOGGING_SET_UP = False

LOGGER = logging.getLogger(__name__)


def setup_logging(logging_level, debug=False, host_id=None, processname=None):
def set_up_logging(logging_level, debug=False, host_id=None, processname=None):
    """Initialize Python logging configuration.

    Note: This only sets up logging once per process. Additional calls will get ignored.
@@ -33,8 +33,8 @@ def setup_logging(logging_level, debug=False, host_id=None, processname=None):
    :arg processname: the process name to log

    """
    global _IS_LOGGING_SETUP
    if _IS_LOGGING_SETUP:
    global _IS_LOGGING_SET_UP
    if _IS_LOGGING_SET_UP:
        return

    host_id = host_id or socket.gethostname()
@@ -100,7 +100,7 @@ def filter(self, record):
        f"set up logging logging_level={logging_level} debug={debug} "
        + f"host_id={host_id} processname={processname}"
    )
    _IS_LOGGING_SETUP = True
    _IS_LOGGING_SET_UP = True


def traverse_tree(instance, namespace=None):
33 changes: 27 additions & 6 deletions antenna/libmarkus.py
@@ -5,18 +5,33 @@
"""Holds Everett-configurable shims for Markus metrics backends."""

import logging
from pathlib import Path

import markus
from markus.filters import AddTagFilter
from markus.filters import AddTagFilter, RegisteredMetricsFilter
import yaml


_IS_MARKUS_SETUP = False
_IS_MARKUS_SET_UP = False

LOGGER = logging.getLogger(__name__)
METRICS = markus.get_metrics("socorro")


def setup_metrics(statsd_host, statsd_port, hostname, debug=False):
# Complete index of all metrics. This is used in documentation and to filter outgoing
# metrics.
def _load_registered_metrics():
    # Load the metrics yaml file in this directory
    path = Path(__file__).parent / "statsd_metrics.yaml"
    with open(path) as fp:
        data = yaml.safe_load(fp)
    return data


STATSD_METRICS = _load_registered_metrics()


def set_up_metrics(statsd_host, statsd_port, hostname, debug=False):
"""Initialize and configures the metrics system.

    :arg statsd_host: the statsd host to send metrics to
@@ -25,8 +40,8 @@ def setup_metrics(statsd_host, statsd_port, hostname, debug=False):
    :arg debug: whether or not to additionally log metrics to the logger

    """
    global _IS_MARKUS_SETUP, METRICS
    if _IS_MARKUS_SETUP:
    global _IS_MARKUS_SET_UP, METRICS
    if _IS_MARKUS_SET_UP:
        return

    markus_backends = [
@@ -48,10 +63,16 @@
                },
            }
        )
    # In local dev and test environments, we want the RegisteredMetricsFilter to
    # raise exceptions when metrics are emitted but not documented.
    metrics_filter = RegisteredMetricsFilter(
        registered_metrics=STATSD_METRICS, raise_error=True
    )
    METRICS.filters.append(metrics_filter)

    if hostname:
        METRICS.filters.append(AddTagFilter(f"host:{hostname}"))

    markus.configure(markus_backends)

    _IS_MARKUS_SETUP = True
    _IS_MARKUS_SET_UP = True
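For context on the new filter: `RegisteredMetricsFilter` checks every emitted metric against the registry loaded from `statsd_metrics.yaml` and, with `raise_error=True`, rejects anything that is not documented there. A minimal sketch of that behavior, using a hand-built registry and the logging backend rather than the app's real configuration; the registry contents, backend choice, and exception handling here are illustrative assumptions, not code from the PR:

```python
import markus
from markus.filters import RegisteredMetricsFilter

# Hand-built registry in the same shape as statsd_metrics.yaml (illustrative).
registry = {
    "socorro.sentry_scrub_error": {
        "type": "incr",
        "description": "Emitted when Sentry event scrubbing fails.",
    },
}

# Log metrics locally instead of sending them to statsd.
markus.configure([{"class": "markus.backends.logging.LoggingMetrics"}])

metrics = markus.get_metrics("socorro")
metrics.filters.append(
    RegisteredMetricsFilter(registered_metrics=registry, raise_error=True)
)

# Documented metric: passes the filter and is emitted.
metrics.incr("sentry_scrub_error", value=1, tags=["service:collector"])

# Undocumented metric: the filter raises, pointing at the missing registry entry.
try:
    metrics.incr("not_documented")
except Exception as exc:
    print(f"rejected: {exc}")
```

Per the comment in the diff, raising is intended for local dev and test environments; the sketch hard-codes `raise_error=True` for brevity.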
139 changes: 139 additions & 0 deletions antenna/statsd_metrics.yaml
@@ -0,0 +1,139 @@
# statsd metrics emitted using Markus.
#
# When adding a new metric, make sure to add it here first.
---

socorro.sentry_scrub_error:
  type: "incr"
  description: |
    Emitted every time there was an error in the Sentry event scrubbing code.

    Tags:

    * ``service``: ``collector``

socorro.collector.breakpad_resource.gzipped_crash:
  type: "incr"
  description: |
    Counter for crash report payloads submitted that were compressed.

socorro.collector.breakpad_resource.gzipped_crash_decompress:
  type: "histogram"
  description: |
    Timer for how long it takes to decompress a compressed crash report
    payload.

    Tags:

    * ``result``: ``success`` or ``fail`` depending on whether there
      was an error when decompressing

socorro.collector.breakpad_resource.crash_size:
  type: "histogram"
  description: |
    Histogram for crash report payload size.

    Tags:

    * ``payload``: ``compressed`` or ``uncompressed``

socorro.collector.breakpad_resource.on_post.time:
  type: "timing"
  description: |
    Timer for how long it takes to handle a crash report HTTP POST request.

socorro.collector.breakpad_resource.malformed:
  type: "incr"
  description: |
    Counter for how many malformed crash report payloads have been submitted.

    Tags:

    * ``reason``: a short string specifying how the crash report payload was
      malformed.

socorro.collector.breakpad_resource.incoming_crash:
  type: "incr"
  description: |
    Counter for number of well-formed crash reports submitted.

socorro.collector.breakpad_resource.throttle_rule:
  type: "incr"
  description: |
    Counter for which throttle rule dictated how the crash report was directed.

    Tags:

    * ``rule``: a short string indicating the rule used

socorro.collector.breakpad_resource.throttle:
  type: "incr"
  description: |
    Counter for the throttle result.

    Tags:

    * ``result``: ``accept``, ``defer``, ``reject``, ``fakeaccept``, or
      ``continue``

socorro.collector.crashmover.retry_count:
  type: "incr"
  description: |
    Counter for retry attempts for the crashmover operations.

    Tags:

    * ``count``: the retry count

socorro.collector.crashmover.crash_handling.time:
  type: "timing"
  description: |
    Timer for how long it takes to store the crash report data and publish for
    processing.

socorro.collector.crashmover.save_crash_dropped.count:
  type: "incr"
  description: |
    Counter for how many crash reports couldn't be saved to storage because
    of errors.

socorro.collector.crashmover.save_crash.count:
  type: "incr"
  description: |
    Counter for how many crash reports were saved and published for processing.

socorro.collector.crashmover.publish_crash_dropped.count:
  type: "incr"
  description: |
    Counter for how many crash reports were saved, but were not published for
    processing because of errors.

socorro.collector.crashmover.crash_save.time:
  type: "timing"
  description: |
    Timer for how long it takes to save a crash report to storage.

socorro.collector.crashmover.crash_publish.time:
  type: "timing"
  description: |
    Timer for how long it takes to publish a crash report for processing.

socorro.collector.health.broken.count:
  type: "incr"
  description: |
    Counter for ``/__broken__`` view.

socorro.collector.health.version.count:
  type: "incr"
  description: |
    Counter for ``/__version__`` view.

socorro.collector.health.lbheartbeat.count:
  type: "incr"
  description: |
    Counter for ``/__lbheartbeat__`` view.

socorro.collector.health.heartbeat.count:
  type: "incr"
  description: |
    Counter for ``/__heartbeat__`` view.
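The registry above is now the single source of truth for emitted metrics: it feeds both the documentation and the `RegisteredMetricsFilter` set up in `libmarkus.py`. A quick way to inspect it is to load the file and list each metric with its type and first description line. This snippet is illustrative only, assuming it is run from the repository root with PyYAML installed; it is not part of the PR:

```python
from pathlib import Path

import yaml

# Path assumed relative to the repository root.
path = Path("antenna/statsd_metrics.yaml")
with open(path) as fp:
    registry = yaml.safe_load(fp)

for name, info in sorted(registry.items()):
    summary = info["description"].strip().splitlines()[0]
    print(f"{name} ({info['type']}): {summary}")
```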
2 changes: 1 addition & 1 deletion bin/run_lint.sh
@@ -12,7 +12,7 @@

set -euo pipefail

FILES="antenna bin testlib tests systemtest"
FILES="antenna bin docs testlib tests systemtest"
PYTHON_VERSION=$(python --version)