From f29fa9732eea96e9f560f4f202762cecadeb74cc Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Sat, 10 Aug 2024 14:01:19 -0400 Subject: [PATCH] bug-1906108: document statsd metrics This adds a statsd_metrics.yaml file that registers all the statsd metrics that Antenna emits. This adds a document_metrics.py Sphinx extension for autogenerating metrics documentation. While doing this, I removed statsd bits from health state since we don't use that anymore. --- .editorconfig | 2 +- antenna/app.py | 8 +- antenna/crashmover.py | 2 +- antenna/health_resource.py | 13 +--- antenna/liblogging.py | 10 +-- antenna/libmarkus.py | 33 +++++++-- antenna/statsd_metrics.yaml | 135 ++++++++++++++++++++++++++++++++++ bin/run_lint.sh | 2 +- docs/conf.py | 36 +++++---- docs/exts/__init__.py | 3 + docs/exts/document_metrics.py | 109 +++++++++++++++++++++++++++ docs/index.rst | 1 + docs/overview.rst | 2 +- tests/conftest.py | 4 +- tests/test_health_resource.py | 2 +- 15 files changed, 312 insertions(+), 50 deletions(-) create mode 100644 antenna/statsd_metrics.yaml create mode 100644 docs/exts/__init__.py create mode 100644 docs/exts/document_metrics.py diff --git a/.editorconfig b/.editorconfig index eec5edb0..ef5ce19e 100644 --- a/.editorconfig +++ b/.editorconfig @@ -10,7 +10,7 @@ insert_final_newline = true charset = utf-8 end_of_line = lf -[*.yml] +[*.{yml,yaml}] indent_size = 2 [LICENSE] diff --git a/antenna/app.py b/antenna/app.py index 739b5561..2bb99a3e 100644 --- a/antenna/app.py +++ b/antenna/app.py @@ -37,8 +37,8 @@ VersionResource, ) from antenna.libdockerflow import get_release_name -from antenna.liblogging import setup_logging, log_config -from antenna.libmarkus import setup_metrics, METRICS +from antenna.liblogging import set_up_logging, log_config +from antenna.libmarkus import set_up_metrics, METRICS LOGGER = logging.getLogger(__name__) @@ -207,7 +207,7 @@ def setup(self): log_config(LOGGER, self.config_manager, self) # Set up metrics - setup_metrics( + set_up_metrics( statsd_host=self.config("statsd_host"), statsd_port=self.config("statsd_port"), hostname=self.config("hostname"), @@ -278,7 +278,7 @@ def get_app(config_manager=None): # Set up logging and sentry first, so we have something to log to. Then # build and log everything else. - setup_logging( + set_up_logging( logging_level=app_config("logging_level"), debug=app_config("local_dev_env"), host_id=app_config("hostname"), diff --git a/antenna/crashmover.py b/antenna/crashmover.py index afa481ae..930c46ee 100644 --- a/antenna/crashmover.py +++ b/antenna/crashmover.py @@ -17,7 +17,7 @@ def _incr_wait_generator(counter, attempts, sleep_seconds): def _generator_generator(): for _ in range(attempts - 1): - METRICS.incr(f"collector.crashmover.{counter}") + METRICS.incr("collector.crashmover.retry_count", tags=[f"count:{counter}"]) yield sleep_seconds return _generator_generator diff --git a/antenna/health_resource.py b/antenna/health_resource.py index c2b7d4d4..680cacca 100644 --- a/antenna/health_resource.py +++ b/antenna/health_resource.py @@ -52,13 +52,6 @@ class HealthState: def __init__(self): self.errors = [] - self.statsd = {} - - def add_statsd(self, name, key, value): - """Add a key -> value gauge.""" - if not isinstance(name, str): - name = name.__class__.__name__ - self.statsd["%s.%s" % (name, key)] = value def add_error(self, name, msg): """Add an error.""" @@ -73,7 +66,7 @@ def is_healthy(self): def to_dict(self): """Convert state to a dict.""" - return OrderedDict([("errors", self.errors), ("info", self.statsd)]) + return OrderedDict([("errors", self.errors)]) class HeartbeatResource: @@ -95,10 +88,6 @@ def on_get(self, req, resp): if hasattr(resource, "check_health"): resource.check_health(state) - # Go through and call gauge for each statsd item. - for key, value in state.statsd.items(): - METRICS.gauge(f"collector.health.{key}", value=value) - if state.is_healthy(): resp.status = falcon.HTTP_200 else: diff --git a/antenna/liblogging.py b/antenna/liblogging.py index cf79039e..28380ba5 100644 --- a/antenna/liblogging.py +++ b/antenna/liblogging.py @@ -17,12 +17,12 @@ ) -_IS_LOGGING_SETUP = False +_IS_LOGGING_SET_UP = False LOGGER = logging.getLogger(__name__) -def setup_logging(logging_level, debug=False, host_id=None, processname=None): +def set_up_logging(logging_level, debug=False, host_id=None, processname=None): """Initialize Python logging configuration. Note: This only sets up logging once per process. Additional calls will get ignored. @@ -33,8 +33,8 @@ def setup_logging(logging_level, debug=False, host_id=None, processname=None): :arg processname: the process name to log """ - global _IS_LOGGING_SETUP - if _IS_LOGGING_SETUP: + global _IS_LOGGING_SET_UP + if _IS_LOGGING_SET_UP: return host_id = host_id or socket.gethostname() @@ -100,7 +100,7 @@ def filter(self, record): f"set up logging logging_level={logging_level} debug={debug} " + f"host_id={host_id} processname={processname}" ) - _IS_LOGGING_SETUP = True + _IS_LOGGING_SET_UP = True def traverse_tree(instance, namespace=None): diff --git a/antenna/libmarkus.py b/antenna/libmarkus.py index dff4d684..703b6215 100644 --- a/antenna/libmarkus.py +++ b/antenna/libmarkus.py @@ -5,18 +5,33 @@ """Holds Everett-configurable shims for Markus metrics backends.""" import logging +from pathlib import Path import markus -from markus.filters import AddTagFilter +from markus.filters import AddTagFilter, RegisteredMetricsFilter +import yaml -_IS_MARKUS_SETUP = False +_IS_MARKUS_SET_UP = False LOGGER = logging.getLogger(__name__) METRICS = markus.get_metrics("socorro") -def setup_metrics(statsd_host, statsd_port, hostname, debug=False): +# Complete index of all metrics. This is used in documentation and to filter outgoing +# metrics. +def _load_registered_metrics(): + # Load the metrics yaml file in this directory + path = Path(__file__).parent / "statsd_metrics.yaml" + with open(path) as fp: + data = yaml.safe_load(fp) + return data + + +STATSD_METRICS = _load_registered_metrics() + + +def set_up_metrics(statsd_host, statsd_port, hostname, debug=False): """Initialize and configures the metrics system. :arg statsd_host: the statsd host to send metrics to @@ -25,8 +40,8 @@ def setup_metrics(statsd_host, statsd_port, hostname, debug=False): :arg debug: whether or not to additionally log metrics to the logger """ - global _IS_MARKUS_SETUP, METRICS - if _IS_MARKUS_SETUP: + global _IS_MARKUS_SET_UP, METRICS + if _IS_MARKUS_SET_UP: return markus_backends = [ @@ -48,10 +63,16 @@ def setup_metrics(statsd_host, statsd_port, hostname, debug=False): }, } ) + # In local dev and test environments, we want the RegisteredMetricsFilter to + # raise exceptions when metrics are emitted but not documented. + metrics_filter = RegisteredMetricsFilter( + registered_metrics=STATSD_METRICS, raise_error=True + ) + METRICS.filters.append(metrics_filter) if hostname: METRICS.filters.append(AddTagFilter(f"host:{hostname}")) markus.configure(markus_backends) - _IS_MARKUS_SETUP = True + _IS_MARKUS_SET_UP = True diff --git a/antenna/statsd_metrics.yaml b/antenna/statsd_metrics.yaml new file mode 100644 index 00000000..6aae2405 --- /dev/null +++ b/antenna/statsd_metrics.yaml @@ -0,0 +1,135 @@ +# statsd metrics emitted using Markus. +# +# When adding a new metric, make sure to add it here first. +--- + +socorro.collector.sentry_scrub_error: + type: "incr" + description: | + Emitted every time there was an error in the Sentry event scrubbing code. + +socorro.collector.breakpad_resource.gzipped_crash: + type: "incr" + description: | + Counter for crash report payloads submitted that were compressed. + +socorro.collector.breakpad_resource.gzipped_crash_decompress: + type: "histogram" + description: | + Timer for how long it takes to decompress a compressed crash report + payload. + + Tags: + + * ``result``: ``success`` or ``fail`` depending on whether there + was an error when decompressing + +socorro.collector.breakpad_resource.crash_size: + type: "histogram" + description: | + Histogram for crash report payload size. + + Tags: + + * ``payload``: ``compressed`` or ``uncompressed`` + +socorro.collector.breakpad_resource.on_post.time: + type: "timing" + description: | + Timer for how long it takes to handle a crash report HTTP POST request. + +socorro.collector.breakpad_resource.malformed: + type: "incr" + description: | + Counter for how many malformed crash report payloads have been submitted. + + Tags: + + * ``reason``: a short string specifying how the crash report payload was + malformed. + +socorro.collector.breakpad_resource.incoming_crash: + type: "incr" + description: | + Counter for number of well-formed crash reports submitted. + +socorro.collector.breakpad_resource.throttle_rule: + type: "incr" + description: | + Counter for which throttle rule dictated how the crash report was directed. + + Tags: + + * ``rule``: a short string indicating the rule used + +socorro.collector.breakpad_resource.throttle: + type: "incr" + description: | + Counter for the throttle result. + + Tags: + + * ``result``: ``accept``, ``defer``, ``reject``, ``fakeaccept``, or + ``continue`` + +socorro.collector.crashmover.retry_count: + type: "incr" + description: | + Counter for retry attempts for the crashmover operations. + + Tags: + + * ``count``: the retry count + +socorro.collector.crashmover.crash_handling.time: + type: "timing" + description: | + Timer for how long it takes to store the crash report data and publish for + processing. + +socorro.collector.crashmover.save_crash_dropped.count: + type: "incr" + description: | + Counter for how many crash reports couldn't be saved to storage because + of errors. + +socorro.collector.crashmover.save_crash.count: + type: "incr" + description: | + Counter for how many crash reports were saved and published for processing. + +socorro.collector.crashmover.publish_crash_dropped.count: + type: "incr" + description: | + Counter for how many crash reports were saved, but were not published for + processing because of errors. + +socorro.collector.crashmover.crash_save.time: + type: "timing" + description: | + Timer for how long it takes to save a crash report to storage. + +socorro.collector.crashmover.crash_publish.time: + type: "timing" + description: | + Timer for how long it takes to publish a crash report for processing. + +socorro.collector.health.broken.count: + type: "incr" + description: | + Counter for ``/__broken__`` view. + +socorro.collector.health.version.count: + type: "incr" + description: | + Counter for ``/__version__`` view. + +socorro.collector.health.lbheartbeat.count: + type: "incr" + description: | + Counter for ``/__lbheartbeat__`` view. + +socorro.collector.health.heartbeat.count: + type: "incr" + description: | + Counter for ``/__heartbeat__`` view. diff --git a/bin/run_lint.sh b/bin/run_lint.sh index 2e9a8bd7..9ef8eaea 100755 --- a/bin/run_lint.sh +++ b/bin/run_lint.sh @@ -12,7 +12,7 @@ set -euo pipefail -FILES="antenna bin testlib tests systemtest" +FILES="antenna bin docs testlib tests systemtest" PYTHON_VERSION=$(python --version) diff --git a/docs/conf.py b/docs/conf.py index 37e8c303..0f1a5d59 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -17,17 +17,20 @@ BASEDIR = Path(__file__).parent.parent +# Insert repo base dir which will pick up Antenna things sys.path.insert(0, str(BASEDIR)) +# Insert this directory to pick up extensions +sys.path.insert(0, str(Path(__file__).parent)) # -- Project information ----------------------------------------------------- -project = 'Antenna' -copyright = '2016-2022, Mozilla Foundation' -author = 'Socorro Team' +project = "Antenna" +copyright = "2016-2024, Mozilla Foundation" +author = "Observability Team" # The full version, including alpha/beta/rc tags. -release = '1.0' +release = "1.0" # -- General configuration ------------------------------------------------ @@ -36,20 +39,21 @@ # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ - 'sphinx.ext.autodoc', - 'everett.sphinxext', + "sphinx.ext.autodoc", + "everett.sphinxext", + "exts.document_metrics", ] # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. -exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] # The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' +pygments_style = "sphinx" # If true, `todo` and `todoList` produce output, else they produce nothing. todo_include_todos = False @@ -60,7 +64,7 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # -html_theme = 'sphinx_rtd_theme' +html_theme = "sphinx_rtd_theme" # Add any paths that contain custom themes here, relative to this directory. html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] @@ -68,14 +72,14 @@ # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] +html_static_path = ["_static"] # Custom sidebar templates, maps document names to template names. html_sidebars = { - '**': [ - 'globaltoc.html', - 'relations.html', - 'searchbox.html', - 'genindex.html', + "**": [ + "globaltoc.html", + "relations.html", + "searchbox.html", + "genindex.html", ] } diff --git a/docs/exts/__init__.py b/docs/exts/__init__.py new file mode 100644 index 00000000..448bb865 --- /dev/null +++ b/docs/exts/__init__.py @@ -0,0 +1,3 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. diff --git a/docs/exts/document_metrics.py b/docs/exts/document_metrics.py new file mode 100644 index 00000000..83544aed --- /dev/null +++ b/docs/exts/document_metrics.py @@ -0,0 +1,109 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. + +"""Generates the documentation for metrics.""" + +import importlib +import sys +import textwrap + +from docutils import nodes +from docutils.parsers.rst import Directive +from docutils.statemachine import ViewList + + +def build_table(table): + col_size = [0] * len(table[0]) + for row in table: + for i, col in enumerate(row): + col_size[i] = max(col_size[i], len(col)) + + col_size = [width + 2 for width in col_size] + + yield " ".join("=" * width for width in col_size) + yield " ".join( + header + (" " * (width - len(header))) + for header, width in zip(table[0], col_size, strict=True) + ) + yield " ".join("=" * width for width in col_size) + for row in table[1:]: + yield " ".join( + col + (" " * (width - len(col))) + for col, width in zip(row, col_size, strict=True) + ) + yield " ".join("=" * width for width in col_size) + + +class AutoMetricsDirective(Directive): + has_content = False + required_arguments = 1 + optional_arguments = 0 + final_argument_whitespace = False + + option_spec = {} + + def add_line(self, line, source, *lineno): + """Add a line to the result""" + self.result.append(line, source, *lineno) + + def generate_docs(self, clspath): + modpath, name = clspath.rsplit(".", 1) + importlib.import_module(modpath) + module = sys.modules[modpath] + metrics = getattr(module, name) + + sourcename = f"metrics of {clspath}" + + # First build a table of metric items + self.add_line("Table of metrics:", sourcename) + self.add_line("", sourcename) + + table = [] + table.append(("Key", "Type")) + for key, metric in metrics.items(): + table.append((f":py:data:`{key}`", metric["type"])) + + for line in build_table(table): + self.add_line(line, sourcename) + + self.add_line("", sourcename) + self.add_line("Metrics details:", sourcename) + self.add_line("", sourcename) + + for key, metric in metrics.items(): + self.add_line(f".. py:data:: {key}", sourcename) + self.add_line("", sourcename) + self.add_line("", sourcename) + self.add_line(f" **Type**: ``{metric['type']}``", sourcename) + self.add_line("", sourcename) + self.add_line("", sourcename) + for line in textwrap.dedent(metric["description"]).splitlines(): + self.add_line(f" {line}", sourcename) + self.add_line("", sourcename) + self.add_line("", sourcename) + + def run(self): + self.reporter = self.state.document.reporter + self.result = ViewList() + + self.generate_docs(self.arguments[0]) + + if not self.result: + return [] + + node = nodes.paragraph() + node.document = self.state.document + self.state.nested_parse(self.result, 0, node) + return node.children + + +def setup(app): + """Register directive in Sphinx.""" + app.add_directive("autometrics", AutoMetricsDirective) + + return { + "version": 1.0, + "parallel_read_safe": True, + "parallel_write_safe": True, + } diff --git a/docs/index.rst b/docs/index.rst index 46a9d5f0..14cacdcf 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -9,6 +9,7 @@ dev contributing configuration + metrics deploy .. toctree:: diff --git a/docs/overview.rst b/docs/overview.rst index b13306b3..4807cd0c 100644 --- a/docs/overview.rst +++ b/docs/overview.rst @@ -201,7 +201,7 @@ instance--either will work fine. Cloud storage file hierarchy ---------------------- +---------------------------- If you use the Google Cloud Storage crashstorage component, then crashes get saved in this hierarchy in the bucket: diff --git a/tests/conftest.py b/tests/conftest.py index 400bc2fa..7a86b7bf 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -25,7 +25,7 @@ REPO_ROOT = Path(__file__).parent.parent.resolve() sys.path.insert(0, str(REPO_ROOT)) -from antenna.app import get_app, setup_logging # noqa +from antenna.app import get_app, set_up_logging # noqa from antenna.app import reset_verify_funs # noqa @@ -43,7 +43,7 @@ def emit(self, record): def pytest_runtest_setup(): # Make sure we set up logging and metrics to sane default values. - setup_logging(logging_level="DEBUG", debug=True, host_id="", processname="antenna") + set_up_logging(logging_level="DEBUG", debug=True, host_id="", processname="antenna") markus.configure( [ {"class": "markus.backends.logging.LoggingMetrics"}, diff --git a/tests/test_health_resource.py b/tests/test_health_resource.py index ae425f94..0f24b97d 100644 --- a/tests/test_health_resource.py +++ b/tests/test_health_resource.py @@ -36,7 +36,7 @@ def test_heartbeat(self, client): assert resp.status_code == 200 # NOTE(willkg): This isn't mocked out, so it's entirely likely that # this expected result will change over time. - assert resp.json == {"errors": [], "info": {}} + assert resp.json == {"errors": []} def test_broken(self, client): resp = client.simulate_get("/__broken__")