diff --git a/.editorconfig b/.editorconfig index eec5edb0..ef5ce19e 100644 --- a/.editorconfig +++ b/.editorconfig @@ -10,7 +10,7 @@ insert_final_newline = true charset = utf-8 end_of_line = lf -[*.yml] +[*.{yml,yaml}] indent_size = 2 [LICENSE] diff --git a/antenna/app.py b/antenna/app.py index 739b5561..8a524dcb 100644 --- a/antenna/app.py +++ b/antenna/app.py @@ -37,8 +37,8 @@ VersionResource, ) from antenna.libdockerflow import get_release_name -from antenna.liblogging import setup_logging, log_config -from antenna.libmarkus import setup_metrics, METRICS +from antenna.liblogging import set_up_logging, log_config +from antenna.libmarkus import set_up_metrics, METRICS LOGGER = logging.getLogger(__name__) @@ -57,7 +57,7 @@ def count_sentry_scrub_error(msg): - METRICS.incr("collector.sentry_scrub_error", value=1) + METRICS.incr("sentry_scrub_error", value=1, tags=["service:collector"]) def configure_sentry(app_config): @@ -207,7 +207,7 @@ def setup(self): log_config(LOGGER, self.config_manager, self) # Set up metrics - setup_metrics( + set_up_metrics( statsd_host=self.config("statsd_host"), statsd_port=self.config("statsd_port"), hostname=self.config("hostname"), @@ -278,7 +278,7 @@ def get_app(config_manager=None): # Set up logging and sentry first, so we have something to log to. Then # build and log everything else. - setup_logging( + set_up_logging( logging_level=app_config("logging_level"), debug=app_config("local_dev_env"), host_id=app_config("hostname"), diff --git a/antenna/crashmover.py b/antenna/crashmover.py index afa481ae..930c46ee 100644 --- a/antenna/crashmover.py +++ b/antenna/crashmover.py @@ -17,7 +17,7 @@ def _incr_wait_generator(counter, attempts, sleep_seconds): def _generator_generator(): for _ in range(attempts - 1): - METRICS.incr(f"collector.crashmover.{counter}") + METRICS.incr("collector.crashmover.retry_count", tags=[f"count:{counter}"]) yield sleep_seconds return _generator_generator diff --git a/antenna/health_resource.py b/antenna/health_resource.py index c2b7d4d4..680cacca 100644 --- a/antenna/health_resource.py +++ b/antenna/health_resource.py @@ -52,13 +52,6 @@ class HealthState: def __init__(self): self.errors = [] - self.statsd = {} - - def add_statsd(self, name, key, value): - """Add a key -> value gauge.""" - if not isinstance(name, str): - name = name.__class__.__name__ - self.statsd["%s.%s" % (name, key)] = value def add_error(self, name, msg): """Add an error.""" @@ -73,7 +66,7 @@ def is_healthy(self): def to_dict(self): """Convert state to a dict.""" - return OrderedDict([("errors", self.errors), ("info", self.statsd)]) + return OrderedDict([("errors", self.errors)]) class HeartbeatResource: @@ -95,10 +88,6 @@ def on_get(self, req, resp): if hasattr(resource, "check_health"): resource.check_health(state) - # Go through and call gauge for each statsd item. - for key, value in state.statsd.items(): - METRICS.gauge(f"collector.health.{key}", value=value) - if state.is_healthy(): resp.status = falcon.HTTP_200 else: diff --git a/antenna/liblogging.py b/antenna/liblogging.py index cf79039e..28380ba5 100644 --- a/antenna/liblogging.py +++ b/antenna/liblogging.py @@ -17,12 +17,12 @@ ) -_IS_LOGGING_SETUP = False +_IS_LOGGING_SET_UP = False LOGGER = logging.getLogger(__name__) -def setup_logging(logging_level, debug=False, host_id=None, processname=None): +def set_up_logging(logging_level, debug=False, host_id=None, processname=None): """Initialize Python logging configuration. Note: This only sets up logging once per process. Additional calls will get ignored. @@ -33,8 +33,8 @@ def setup_logging(logging_level, debug=False, host_id=None, processname=None): :arg processname: the process name to log """ - global _IS_LOGGING_SETUP - if _IS_LOGGING_SETUP: + global _IS_LOGGING_SET_UP + if _IS_LOGGING_SET_UP: return host_id = host_id or socket.gethostname() @@ -100,7 +100,7 @@ def filter(self, record): f"set up logging logging_level={logging_level} debug={debug} " + f"host_id={host_id} processname={processname}" ) - _IS_LOGGING_SETUP = True + _IS_LOGGING_SET_UP = True def traverse_tree(instance, namespace=None): diff --git a/antenna/libmarkus.py b/antenna/libmarkus.py index dff4d684..703b6215 100644 --- a/antenna/libmarkus.py +++ b/antenna/libmarkus.py @@ -5,18 +5,33 @@ """Holds Everett-configurable shims for Markus metrics backends.""" import logging +from pathlib import Path import markus -from markus.filters import AddTagFilter +from markus.filters import AddTagFilter, RegisteredMetricsFilter +import yaml -_IS_MARKUS_SETUP = False +_IS_MARKUS_SET_UP = False LOGGER = logging.getLogger(__name__) METRICS = markus.get_metrics("socorro") -def setup_metrics(statsd_host, statsd_port, hostname, debug=False): +# Complete index of all metrics. This is used in documentation and to filter outgoing +# metrics. +def _load_registered_metrics(): + # Load the metrics yaml file in this directory + path = Path(__file__).parent / "statsd_metrics.yaml" + with open(path) as fp: + data = yaml.safe_load(fp) + return data + + +STATSD_METRICS = _load_registered_metrics() + + +def set_up_metrics(statsd_host, statsd_port, hostname, debug=False): """Initialize and configures the metrics system. :arg statsd_host: the statsd host to send metrics to @@ -25,8 +40,8 @@ def setup_metrics(statsd_host, statsd_port, hostname, debug=False): :arg debug: whether or not to additionally log metrics to the logger """ - global _IS_MARKUS_SETUP, METRICS - if _IS_MARKUS_SETUP: + global _IS_MARKUS_SET_UP, METRICS + if _IS_MARKUS_SET_UP: return markus_backends = [ @@ -48,10 +63,16 @@ def setup_metrics(statsd_host, statsd_port, hostname, debug=False): }, } ) + # In local dev and test environments, we want the RegisteredMetricsFilter to + # raise exceptions when metrics are emitted but not documented. + metrics_filter = RegisteredMetricsFilter( + registered_metrics=STATSD_METRICS, raise_error=True + ) + METRICS.filters.append(metrics_filter) if hostname: METRICS.filters.append(AddTagFilter(f"host:{hostname}")) markus.configure(markus_backends) - _IS_MARKUS_SETUP = True + _IS_MARKUS_SET_UP = True diff --git a/antenna/statsd_metrics.yaml b/antenna/statsd_metrics.yaml new file mode 100644 index 00000000..1ae132b1 --- /dev/null +++ b/antenna/statsd_metrics.yaml @@ -0,0 +1,139 @@ +# statsd metrics emitted using Markus. +# +# When adding a new metric, make sure to add it here first. +--- + +socorro.sentry_scrub_error: + type: "incr" + description: | + Emitted every time there was an error in the Sentry event scrubbing code. + + Tags: + + * ``service``: ``collector`` + +socorro.collector.breakpad_resource.gzipped_crash: + type: "incr" + description: | + Counter for crash report payloads submitted that were compressed. + +socorro.collector.breakpad_resource.gzipped_crash_decompress: + type: "histogram" + description: | + Timer for how long it takes to decompress a compressed crash report + payload. + + Tags: + + * ``result``: ``success`` or ``fail`` depending on whether there + was an error when decompressing + +socorro.collector.breakpad_resource.crash_size: + type: "histogram" + description: | + Histogram for crash report payload size. + + Tags: + + * ``payload``: ``compressed`` or ``uncompressed`` + +socorro.collector.breakpad_resource.on_post.time: + type: "timing" + description: | + Timer for how long it takes to handle a crash report HTTP POST request. + +socorro.collector.breakpad_resource.malformed: + type: "incr" + description: | + Counter for how many malformed crash report payloads have been submitted. + + Tags: + + * ``reason``: a short string specifying how the crash report payload was + malformed. + +socorro.collector.breakpad_resource.incoming_crash: + type: "incr" + description: | + Counter for number of well-formed crash reports submitted. + +socorro.collector.breakpad_resource.throttle_rule: + type: "incr" + description: | + Counter for which throttle rule dictated how the crash report was directed. + + Tags: + + * ``rule``: a short string indicating the rule used + +socorro.collector.breakpad_resource.throttle: + type: "incr" + description: | + Counter for the throttle result. + + Tags: + + * ``result``: ``accept``, ``defer``, ``reject``, ``fakeaccept``, or + ``continue`` + +socorro.collector.crashmover.retry_count: + type: "incr" + description: | + Counter for retry attempts for the crashmover operations. + + Tags: + + * ``count``: the retry count + +socorro.collector.crashmover.crash_handling.time: + type: "timing" + description: | + Timer for how long it takes to store the crash report data and publish for + processing. + +socorro.collector.crashmover.save_crash_dropped.count: + type: "incr" + description: | + Counter for how many crash reports couldn't be saved to storage because + of errors. + +socorro.collector.crashmover.save_crash.count: + type: "incr" + description: | + Counter for how many crash reports were saved and published for processing. + +socorro.collector.crashmover.publish_crash_dropped.count: + type: "incr" + description: | + Counter for how many crash reports were saved, but were not published for + processing because of errors. + +socorro.collector.crashmover.crash_save.time: + type: "timing" + description: | + Timer for how long it takes to save a crash report to storage. + +socorro.collector.crashmover.crash_publish.time: + type: "timing" + description: | + Timer for how long it takes to publish a crash report for processing. + +socorro.collector.health.broken.count: + type: "incr" + description: | + Counter for ``/__broken__`` view. + +socorro.collector.health.version.count: + type: "incr" + description: | + Counter for ``/__version__`` view. + +socorro.collector.health.lbheartbeat.count: + type: "incr" + description: | + Counter for ``/__lbheartbeat__`` view. + +socorro.collector.health.heartbeat.count: + type: "incr" + description: | + Counter for ``/__heartbeat__`` view. diff --git a/bin/run_lint.sh b/bin/run_lint.sh index 2e9a8bd7..9ef8eaea 100755 --- a/bin/run_lint.sh +++ b/bin/run_lint.sh @@ -12,7 +12,7 @@ set -euo pipefail -FILES="antenna bin testlib tests systemtest" +FILES="antenna bin docs testlib tests systemtest" PYTHON_VERSION=$(python --version) diff --git a/docs/conf.py b/docs/conf.py index 37e8c303..0f1a5d59 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -17,17 +17,20 @@ BASEDIR = Path(__file__).parent.parent +# Insert repo base dir which will pick up Antenna things sys.path.insert(0, str(BASEDIR)) +# Insert this directory to pick up extensions +sys.path.insert(0, str(Path(__file__).parent)) # -- Project information ----------------------------------------------------- -project = 'Antenna' -copyright = '2016-2022, Mozilla Foundation' -author = 'Socorro Team' +project = "Antenna" +copyright = "2016-2024, Mozilla Foundation" +author = "Observability Team" # The full version, including alpha/beta/rc tags. -release = '1.0' +release = "1.0" # -- General configuration ------------------------------------------------ @@ -36,20 +39,21 @@ # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ - 'sphinx.ext.autodoc', - 'everett.sphinxext', + "sphinx.ext.autodoc", + "everett.sphinxext", + "exts.document_metrics", ] # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. -exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] # The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' +pygments_style = "sphinx" # If true, `todo` and `todoList` produce output, else they produce nothing. todo_include_todos = False @@ -60,7 +64,7 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # -html_theme = 'sphinx_rtd_theme' +html_theme = "sphinx_rtd_theme" # Add any paths that contain custom themes here, relative to this directory. html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] @@ -68,14 +72,14 @@ # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] +html_static_path = ["_static"] # Custom sidebar templates, maps document names to template names. html_sidebars = { - '**': [ - 'globaltoc.html', - 'relations.html', - 'searchbox.html', - 'genindex.html', + "**": [ + "globaltoc.html", + "relations.html", + "searchbox.html", + "genindex.html", ] } diff --git a/docs/exts/__init__.py b/docs/exts/__init__.py new file mode 100644 index 00000000..448bb865 --- /dev/null +++ b/docs/exts/__init__.py @@ -0,0 +1,3 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. diff --git a/docs/exts/document_metrics.py b/docs/exts/document_metrics.py new file mode 100644 index 00000000..83544aed --- /dev/null +++ b/docs/exts/document_metrics.py @@ -0,0 +1,109 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. + +"""Generates the documentation for metrics.""" + +import importlib +import sys +import textwrap + +from docutils import nodes +from docutils.parsers.rst import Directive +from docutils.statemachine import ViewList + + +def build_table(table): + col_size = [0] * len(table[0]) + for row in table: + for i, col in enumerate(row): + col_size[i] = max(col_size[i], len(col)) + + col_size = [width + 2 for width in col_size] + + yield " ".join("=" * width for width in col_size) + yield " ".join( + header + (" " * (width - len(header))) + for header, width in zip(table[0], col_size, strict=True) + ) + yield " ".join("=" * width for width in col_size) + for row in table[1:]: + yield " ".join( + col + (" " * (width - len(col))) + for col, width in zip(row, col_size, strict=True) + ) + yield " ".join("=" * width for width in col_size) + + +class AutoMetricsDirective(Directive): + has_content = False + required_arguments = 1 + optional_arguments = 0 + final_argument_whitespace = False + + option_spec = {} + + def add_line(self, line, source, *lineno): + """Add a line to the result""" + self.result.append(line, source, *lineno) + + def generate_docs(self, clspath): + modpath, name = clspath.rsplit(".", 1) + importlib.import_module(modpath) + module = sys.modules[modpath] + metrics = getattr(module, name) + + sourcename = f"metrics of {clspath}" + + # First build a table of metric items + self.add_line("Table of metrics:", sourcename) + self.add_line("", sourcename) + + table = [] + table.append(("Key", "Type")) + for key, metric in metrics.items(): + table.append((f":py:data:`{key}`", metric["type"])) + + for line in build_table(table): + self.add_line(line, sourcename) + + self.add_line("", sourcename) + self.add_line("Metrics details:", sourcename) + self.add_line("", sourcename) + + for key, metric in metrics.items(): + self.add_line(f".. py:data:: {key}", sourcename) + self.add_line("", sourcename) + self.add_line("", sourcename) + self.add_line(f" **Type**: ``{metric['type']}``", sourcename) + self.add_line("", sourcename) + self.add_line("", sourcename) + for line in textwrap.dedent(metric["description"]).splitlines(): + self.add_line(f" {line}", sourcename) + self.add_line("", sourcename) + self.add_line("", sourcename) + + def run(self): + self.reporter = self.state.document.reporter + self.result = ViewList() + + self.generate_docs(self.arguments[0]) + + if not self.result: + return [] + + node = nodes.paragraph() + node.document = self.state.document + self.state.nested_parse(self.result, 0, node) + return node.children + + +def setup(app): + """Register directive in Sphinx.""" + app.add_directive("autometrics", AutoMetricsDirective) + + return { + "version": 1.0, + "parallel_read_safe": True, + "parallel_write_safe": True, + } diff --git a/docs/index.rst b/docs/index.rst index 46a9d5f0..14cacdcf 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -9,6 +9,7 @@ dev contributing configuration + metrics deploy .. toctree:: diff --git a/docs/overview.rst b/docs/overview.rst index b13306b3..4807cd0c 100644 --- a/docs/overview.rst +++ b/docs/overview.rst @@ -201,7 +201,7 @@ instance--either will work fine. Cloud storage file hierarchy ---------------------- +---------------------------- If you use the Google Cloud Storage crashstorage component, then crashes get saved in this hierarchy in the bucket: diff --git a/requirements.in b/requirements.in index 88f33e86..74b221bb 100644 --- a/requirements.in +++ b/requirements.in @@ -1,6 +1,5 @@ # Production requirements attrs==23.2.0 -datadog==0.49.1 dockerflow==2024.4.2 everett==3.3.0 falcon==3.1.3 @@ -8,7 +7,7 @@ fillmore==2.0.0 google-cloud-storage==2.18.0 gunicorn==22.0.0 isodate==0.6.1 -markus==4.2.0 +markus[datadog]==5.0.0 sentry-sdk==2.8.0 google-cloud-pubsub==2.23.0 diff --git a/requirements.txt b/requirements.txt index 9be53a3f..df505a53 100644 --- a/requirements.txt +++ b/requirements.txt @@ -137,7 +137,7 @@ click==8.1.7 \ datadog==0.49.1 \ --hash=sha256:4a56d57490ea699a0dfd9253547485a57b4120e93489defadcf95c66272374d6 \ --hash=sha256:4cb7a7991af6cadb868fe450cd456473e65f11fc678b7d7cf61044ff1c6074d8 - # via -r requirements.in + # via markus dockerflow==2024.4.2 \ --hash=sha256:b9f92455449ba46555f57db34cccefc4c49d3533c67793624ab7e80a1625caa7 \ --hash=sha256:f4216a3a809093860d7b2db84ba0a25c894cb8eb98b74f4f6a04badbc4f6b0a4 @@ -468,9 +468,9 @@ markupsafe==2.1.3 \ # via # jinja2 # werkzeug -markus==4.2.0 \ - --hash=sha256:156398b7de56db4e8ef420a80fcce1c49b6d3d41405874a3e128cb209cf4bfd8 \ - --hash=sha256:9dd41ce53b25a3e806b0d065808fec00c4ec945e80cf5a72431bf9b61b44d7fb +markus==5.0.0 \ + --hash=sha256:14fe47ebe3d3447cc5eebcd4691f71e7f38dee6338e4eb5d72d64d67f289c6ff \ + --hash=sha256:fd0f0de0914a3ae645cc1eac760dca2fa28943fbbb3671692e09916f7f2e2237 # via -r requirements.in mdurl==0.1.2 \ --hash=sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8 \ diff --git a/tests/conftest.py b/tests/conftest.py index 400bc2fa..74094b18 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -15,8 +15,6 @@ from google.auth.credentials import AnonymousCredentials from google.cloud import storage as gcs_storage from google.cloud.exceptions import NotFound as GCSNotFound -import markus -from markus.backends import BackendBase from markus.testing import MetricsMock import pytest @@ -25,32 +23,34 @@ REPO_ROOT = Path(__file__).parent.parent.resolve() sys.path.insert(0, str(REPO_ROOT)) -from antenna.app import get_app, setup_logging # noqa + +from antenna.app import AntennaApp, build_config_manager, get_app, set_up_logging # noqa from antenna.app import reset_verify_funs # noqa +from antenna.libmarkus import set_up_metrics # noqa -class CaptureMetricsUsed(BackendBase): - """Markus backend for capturing all the metrics that were emitted during tests""" +def pytest_sessionstart(): + config = build_config_manager().with_options(AntennaApp) - def __init__(self, options=None, filters=None): - self.options = options - self.filters = filters + # Make sure we set up logging and metrics + set_up_logging( + logging_level="DEBUG", + debug=True, + host_id=config("hostname"), + processname="antenna", + ) - def emit(self, record): - with open("metrics_emitted.txt", "a") as fp: - fp.write(f"{record.stat_type}\t{record.key}\t{record.tags!r}\n") + # FIXME(willkg): when we fix settings so they're not tied to the app, we can + # simplify this + set_up_metrics( + statsd_host=config("statsd_host"), + statsd_port=config("statsd_port"), + hostname=config("hostname"), + debug=config("local_dev_env"), + ) def pytest_runtest_setup(): - # Make sure we set up logging and metrics to sane default values. - setup_logging(logging_level="DEBUG", debug=True, host_id="", processname="antenna") - markus.configure( - [ - {"class": "markus.backends.logging.LoggingMetrics"}, - {"class": CaptureMetricsUsed}, - ] - ) - # Wipe any registered verify functions reset_verify_funs() diff --git a/tests/test_breakpad_resource.py b/tests/test_breakpad_resource.py index 77c54ee5..3cab1838 100644 --- a/tests/test_breakpad_resource.py +++ b/tests/test_breakpad_resource.py @@ -8,6 +8,7 @@ from unittest.mock import ANY from everett.manager import ConfigManager +from markus.testing import AnyTagValue import pytest from antenna.breakpad_resource import ( @@ -19,34 +20,6 @@ from testlib.mini_poster import compress, multipart_encode -class AnyTagValue: - """Matches a markus metrics tag with any value""" - - def __init__(self, key): - self.key = key - - def __repr__(self): - return f"" - - def get_other_key(self, other): - # This is comparing against a tag string - if ":" in other: - other_key, _ = other.split(":") - else: - other_key = other - return other_key - - def __eq__(self, other): - if isinstance(other, AnyTagValue): - return self.key == other.key - return self.key == self.get_other_key(other) - - def __lt__(self, other): - if isinstance(other, AnyTagValue): - return self.key < other.key - return self.key < self.get_other_key(other) - - class FakeCrashMover: """Fake crash mover that raises an error when used""" diff --git a/tests/test_health_resource.py b/tests/test_health_resource.py index ae425f94..0f24b97d 100644 --- a/tests/test_health_resource.py +++ b/tests/test_health_resource.py @@ -36,7 +36,7 @@ def test_heartbeat(self, client): assert resp.status_code == 200 # NOTE(willkg): This isn't mocked out, so it's entirely likely that # this expected result will change over time. - assert resp.json == {"errors": [], "info": {}} + assert resp.json == {"errors": []} def test_broken(self, client): resp = client.simulate_get("/__broken__") diff --git a/tests/test_sentry.py b/tests/test_sentry.py index 9a367c7e..9392fad6 100644 --- a/tests/test_sentry.py +++ b/tests/test_sentry.py @@ -5,7 +5,7 @@ from unittest.mock import ANY from fillmore.test import diff_structure -from markus.testing import MetricsMock +from markus.testing import AnyTagValue, MetricsMock from werkzeug.test import Client from antenna.app import get_app, count_sentry_scrub_error @@ -145,4 +145,8 @@ def test_count_sentry_scrub_error(): with MetricsMock() as metricsmock: metricsmock.clear_records() count_sentry_scrub_error("foo") - metricsmock.assert_incr("socorro.collector.sentry_scrub_error", value=1) + metricsmock.assert_incr( + "socorro.sentry_scrub_error", + value=1, + tags=["service:collector", AnyTagValue("host")], + )