Skip to content

Commit

Permalink
bug-1898341: switch to a single METRICS client
Browse files: browse the repository at this point in the history
  • Loading branch information
willkg committed May 23, 2024
1 parent 5f030dd commit 8a3da3f
Show file tree
Hide file tree
Showing 5 changed files with 41 additions and 38 deletions.
6 changes: 2 additions & 4 deletions antenna/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
from falcon.errors import HTTPInternalServerError
from fillmore.libsentry import set_up_sentry
from fillmore.scrubber import Scrubber, Rule, SCRUB_RULES_DEFAULT
import markus
import sentry_sdk
from sentry_sdk.hub import Hub
from sentry_sdk.integrations.atexit import AtexitIntegration
Expand All @@ -40,11 +39,10 @@
)
from antenna.libdockerflow import get_release_name
from antenna.liblogging import setup_logging, log_config
from antenna.libmarkus import setup_metrics
from antenna.libmarkus import setup_metrics, METRICS


LOGGER = logging.getLogger(__name__)
METRICS = markus.get_metrics("app")


# Set up Sentry to scrub user ip addresses, exclude frame-local vars, exclude the
Expand All @@ -60,7 +58,7 @@


def count_sentry_scrub_error(msg):
METRICS.incr("sentry_scrub_error", value=1)
METRICS.incr("app.sentry_scrub_error", value=1)


def configure_sentry(app_config):
Expand Down
38 changes: 21 additions & 17 deletions antenna/breakpad_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@
MultipartParseError,
MultipartParseOptions,
)
import markus

from antenna.libmarkus import METRICS
from antenna.throttler import REJECT, FAKEACCEPT, RESULT_TO_TEXT, Throttler
from antenna.util import (
create_crash_id,
Expand All @@ -30,7 +30,6 @@


logger = logging.getLogger(__name__)
mymetrics = markus.get_metrics("breakpad_resource")


#: Bad fields we should never save, so remove them from the payload before
Expand Down Expand Up @@ -144,7 +143,7 @@ def extract_payload(self, req):

# Decompress payload if it's compressed
if req.env.get("HTTP_CONTENT_ENCODING") == "gzip":
mymetrics.incr("gzipped_crash")
METRICS.incr("breakpad_resource.gzipped_crash")
crash_report.payload_compressed = "1"

# If the content is gzipped, we pull it out and decompress it. We
Expand All @@ -154,14 +153,14 @@ def extract_payload(self, req):
start_time = time.perf_counter()
try:
data = zlib.decompress(req.stream.read(content_length), gzip_header)
mymetrics.histogram(
"gzipped_crash_decompress",
METRICS.histogram(
"breakpad_resource.gzipped_crash_decompress",
value=(time.perf_counter() - start_time) * 1000.0,
tags=["result:success"],
)
except zlib.error as exc:
mymetrics.histogram(
"gzipped_crash_decompress",
METRICS.histogram(
"breakpad_resource.gzipped_crash_decompress",
value=(time.perf_counter() - start_time) * 1000.0,
tags=["result:fail"],
)
Expand All @@ -178,14 +177,18 @@ def extract_payload(self, req):
req.env["CONTENT_LENGTH"] = str(content_length)

data = io.BytesIO(data)
mymetrics.histogram(
"crash_size", value=content_length, tags=["payload:compressed"]
METRICS.histogram(
"breakpad_resource.crash_size",
value=content_length,
tags=["payload:compressed"],
)

else:
data = req.bounded_stream
mymetrics.histogram(
"crash_size", value=content_length, tags=["payload:uncompressed"]
METRICS.histogram(
"breakpad_resource.crash_size",
value=content_length,
tags=["payload:uncompressed"],
)

has_json = False
Expand Down Expand Up @@ -299,7 +302,7 @@ def cleanup_crash_report(self, raw_crash):
del raw_crash[bad_field]
notes.append("Removed %s from raw crash." % bad_field)

@mymetrics.timer_decorator("on_post.time")
@METRICS.timer_decorator("breakpad_resource.on_post.time")
def on_post(self, req, resp):
"""Handle incoming HTTP POSTs.
Expand All @@ -321,12 +324,12 @@ def on_post(self, req, resp):
except MalformedCrashReport as exc:
# If this is malformed, then reject it with malformed error code.
msg = str(exc)
mymetrics.incr("malformed", tags=["reason:%s" % msg])
METRICS.incr("breakpad_resource.malformed", tags=["reason:%s" % msg])
resp.status = falcon.HTTP_400
resp.text = "Discarded=malformed_%s" % msg
return

mymetrics.incr("incoming_crash")
METRICS.incr("breakpad_resource.incoming_crash")

raw_crash = crash_report.annotations

Expand Down Expand Up @@ -377,9 +380,10 @@ def on_post(self, req, resp):
rule_name,
RESULT_TO_TEXT[throttle_result],
)
mymetrics.incr("throttle_rule", tags=["rule:%s" % rule_name])
mymetrics.incr(
"throttle", tags=["result:%s" % RESULT_TO_TEXT[throttle_result].lower()]
METRICS.incr("breakpad_resource.throttle_rule", tags=["rule:%s" % rule_name])
METRICS.incr(
"breakpad_resource.throttle",
tags=["result:%s" % RESULT_TO_TEXT[throttle_result].lower()],
)
raw_crash["metadata"]["throttle_rule"] = rule_name

Expand Down
17 changes: 8 additions & 9 deletions antenna/crashmover.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,19 +6,18 @@
import logging

from everett.manager import Option, parse_class
import markus

from antenna.libmarkus import METRICS
from antenna.util import MaxAttemptsError, retry


LOGGER = logging.getLogger(__name__)
MYMETRICS = markus.get_metrics("crashmover")


def _incr_wait_generator(counter, attempts, sleep_seconds):
def _generator_generator():
for _ in range(attempts - 1):
MYMETRICS.incr(counter)
METRICS.incr(f"crashmover.{counter}")
yield sleep_seconds

return _generator_generator
Expand Down Expand Up @@ -125,7 +124,7 @@ def check_health(self, state):
if hasattr(self.crashpublish, "check_health"):
self.crashpublish.check_health(state)

@MYMETRICS.timer("crash_handling.time")
@METRICS.timer("crashmover.crash_handling.time")
def handle_crashreport(self, raw_crash, dumps, crash_id):
"""Handle a new crash report synchronously and return whether that succeeded.
Expand All @@ -144,27 +143,27 @@ def handle_crashreport(self, raw_crash, dumps, crash_id):
except MaxAttemptsError:
# After max attempts, we give up on this crash
LOGGER.error("%s: too many errors trying to save; dropped", crash_id)
MYMETRICS.incr("save_crash_dropped.count")
METRICS.incr("crashmover.save_crash_dropped.count")
return False

try:
self.crashmover_publish_with_retry(crash_report)
MYMETRICS.incr("save_crash.count")
METRICS.incr("crashmover.save_crash.count")
except MaxAttemptsError:
LOGGER.error("%s: too many errors trying to publish; dropped", crash_id)
MYMETRICS.incr("publish_crash_dropped.count")
METRICS.incr("crashmover.publish_crash_dropped.count")
# return True even when publish fails because it will be automatically
# published later via self-healing mechanisms

return True

@MYMETRICS.timer("crash_save.time")
@METRICS.timer("crashmover.crash_save.time")
def crashmover_save(self, crash_report):
"""Save crash report to storage."""
self.crashstorage.save_crash(crash_report)
LOGGER.info("%s saved", crash_report.crash_id)

@MYMETRICS.timer("crash_publish.time")
@METRICS.timer("crashmover.crash_publish.time")
def crashmover_publish(self, crash_report):
"""Publish crash_id in publish queue."""
self.crashpublish.publish_crash(crash_report)
Expand Down
15 changes: 7 additions & 8 deletions antenna/health_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,21 +8,20 @@
import os

import falcon
import markus

from antenna.libdockerflow import get_version_info
from antenna.libmarkus import METRICS


logger = logging.getLogger(__name__)
mymetrics = markus.get_metrics("health")


class BrokenResource:
"""Handle ``/__broken__`` endpoint."""

def on_get(self, req, resp):
"""Implement GET HTTP request."""
mymetrics.incr("broken.count")
METRICS.incr("health.broken.count")
# This is intentional breakage
raise Exception("intentional exception")

Expand All @@ -35,7 +34,7 @@ def __init__(self, basedir):

def on_get(self, req, resp):
"""Implement GET HTTP request."""
mymetrics.incr("version.count")
METRICS.incr("health.version.count")
version_info = get_version_info(self.basedir)
# FIXME(willkg): there's no cloud provider environment variable to use, so
# we'll cheat and look at whether there's a "gcs" in
Expand All @@ -57,7 +56,7 @@ class LBHeartbeatResource:

def on_get(self, req, resp):
"""Implement GET HTTP request."""
mymetrics.incr("lbheartbeat.count")
METRICS.incr("health.lbheartbeat.count")
resp.content_type = "application/json; charset=utf-8"
resp.status = falcon.HTTP_200

Expand Down Expand Up @@ -99,7 +98,7 @@ def __init__(self, app):

def on_get(self, req, resp):
"""Implement GET HTTP request."""
mymetrics.incr("heartbeat.count")
METRICS.incr("health.heartbeat.count")
state = HealthState()

# So we're going to think of Antenna like a big object graph and
Expand All @@ -111,8 +110,8 @@ def on_get(self, req, resp):
resource.check_health(state)

# Go through and call gauge for each statsd item.
for k, v in state.statsd.items():
mymetrics.gauge(k, value=v)
for key, value in state.statsd.items():
METRICS.gauge(f"health.{key}", value=value)

if state.is_healthy():
resp.status = falcon.HTTP_200
Expand Down
3 changes: 3 additions & 0 deletions antenna/libmarkus.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,3 +48,6 @@ def setup_metrics(statsd_host, statsd_port, debug=False):
markus.configure(markus_backends)

_IS_MARKUS_SETUP = True


METRICS = markus.get_metrics()

0 comments on commit 8a3da3f

Please sign in to comment.