From 09e37663f46dd786d0165089e725023aeddf039d Mon Sep 17 00:00:00 2001
From: Yakko Majuri <38760734+yakkomajuri@users.noreply.github.com>
Date: Mon, 31 Jan 2022 18:10:10 +0000
Subject: [PATCH] Add dead letter queue metrics to system status (#8294)

* add dead letter queue metrics to system status

* today -> last 24h

* fix test
---
 ee/clickhouse/system_status.py                | 38 ++++++++++++++++++++++++++++++++++++-
 ee/clickhouse/test/test_system_status.py      |  7 +++-
 .../src/layout/navigation/navigationLogic.ts  |  2 +-
 3 files changed, 42 insertions(+), 5 deletions(-)

diff --git a/ee/clickhouse/system_status.py b/ee/clickhouse/system_status.py
index 1c6dc874929..b83296837d7 100644
--- a/ee/clickhouse/system_status.py
+++ b/ee/clickhouse/system_status.py
@@ -82,12 +82,46 @@ def system_status() -> Generator[SystemStatusRow, None, None]:
         "subrows": {"columns": ["Metric", "Value", "Description"], "rows": list(sorted(system_metrics))},
     }
 
-    last_event_ingested_timestamp = sync_execute("SELECT max(_timestamp) FROM events")
+    last_event_ingested_timestamp = sync_execute("SELECT max(_timestamp) FROM events")[0][0]
 
     yield {
         "key": "last_event_ingested_timestamp",
         "metric": "Last event ingested",
-        "value": last_event_ingested_timestamp[0],
+        "value": last_event_ingested_timestamp,
+    }
+
+    dead_letter_queue_size = sync_execute("SELECT count(*) FROM events_dead_letter_queue")[0][0]
+
+    yield {
+        "key": "dead_letter_queue_size",
+        "metric": "Dead letter queue size",
+        "value": dead_letter_queue_size,
+    }
+
+    dead_letter_queue_events_last_day = sync_execute(
+        "SELECT count(*) FROM events_dead_letter_queue WHERE _timestamp >= (NOW() - INTERVAL 1 DAY)"
+    )[0][0]
+
+    yield {
+        "key": "dead_letter_queue_events_last_day",
+        "metric": "Events sent to dead letter queue in the last 24h",
+        "value": dead_letter_queue_events_last_day,
+    }
+
+    total_events_ingested_last_day = sync_execute(
+        "SELECT count(*) as b from events WHERE _timestamp >= (NOW() - INTERVAL 1 DAY)"
+    )[0][0]
+    # alert if the DLQ got over 10000 events in the last 24h and at least half as many as were ingested
+    # (multiply instead of dividing so that zero ingested events cannot raise ZeroDivisionError)
+    dead_letter_queue_events_high = (
+        dead_letter_queue_events_last_day > 10000
+        and dead_letter_queue_events_last_day * 2 >= total_events_ingested_last_day
+    )
+
+    yield {
+        "key": "dead_letter_queue_ratio_ok",
+        "metric": "Dead letter queue ratio healthy",
+        "value": not dead_letter_queue_events_high,
     }
 
 
diff --git a/ee/clickhouse/test/test_system_status.py b/ee/clickhouse/test/test_system_status.py
index 2dd5d5c2876..936b90b6abb 100644
--- a/ee/clickhouse/test/test_system_status.py
+++ b/ee/clickhouse/test/test_system_status.py
@@ -13,6 +13,9 @@ def test_system_status(db):
         "clickhouse_table_sizes",
         "clickhouse_system_metrics",
         "last_event_ingested_timestamp",
+        "dead_letter_queue_size",
+        "dead_letter_queue_events_last_day",
+        "dead_letter_queue_ratio_ok",
     ]
-    assert len(results[-3]["subrows"]["rows"]) > 0
-    assert len(results[-2]["subrows"]["rows"]) > 0
+    assert len(results[6]["subrows"]["rows"]) > 0
+    assert len(results[7]["subrows"]["rows"]) > 0
diff --git a/frontend/src/layout/navigation/navigationLogic.ts b/frontend/src/layout/navigation/navigationLogic.ts
index 5c1545ec804..bba0eee5e6f 100644
--- a/frontend/src/layout/navigation/navigationLogic.ts
+++ b/frontend/src/layout/navigation/navigationLogic.ts
@@ -130,7 +130,7 @@ export const navigationLogic = kea>({
                 }
 
                 // if you have status metrics these three must have `value: true`
-                const aliveMetrics = ['redis_alive', 'db_alive', 'plugin_sever_alive']
+                const aliveMetrics = ['redis_alive', 'db_alive', 'plugin_sever_alive', 'dead_letter_queue_ratio_ok']
                 const aliveSignals = statusMetrics
                     .filter((sm) => sm.key && aliveMetrics.includes(sm.key))
                     .filter((sm) => sm.value).length