mirror of
https://github.com/PostHog/posthog.git
synced 2024-12-01 04:12:23 +01:00
5d2ad6c7bb
* chore(deps): Update `black` to `22.8.0` * Format
244 lines
7.9 KiB
Python
244 lines
7.9 KiB
Python
#!/usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
|
|
import logging
|
|
import os
|
|
import socket
|
|
import struct
|
|
import sys
|
|
import threading
|
|
import time
|
|
|
|
import structlog
|
|
from prometheus_client import CollectorRegistry, Gauge, multiprocess, start_http_server
|
|
|
|
# Log level for Gunicorn's own (error) log output.
loglevel = "error"

# Seconds to wait for further requests on a keep-alive connection.
keepalive = 120

# Set the timeout to something lower than any downstreams, such that if the
# timeout is hit, then the worker will be killed and respawned, which will then
# be able to pick up any connections that were previously pending on the socket
# and serve the requests before the downstream timeout.
timeout = 15

# Time workers are given to finish in-flight requests after a restart signal.
# NOTE: Gunicorn's setting is spelled `graceful_timeout`. This value used to be
# assigned to `grateful_timeout`, which is not a Gunicorn setting, so the
# intended 120 seconds never took effect.
graceful_timeout = 120


# How often (in seconds) the monitor threads defined in this file refresh their
# Prometheus gauges. Overridable via the GUNICORN_METRICS_UPDATE_SECONDS
# environment variable.
METRICS_UPDATE_INTERVAL_SECONDS = int(os.getenv("GUNICORN_METRICS_UPDATE_SECONDS", 5))
|
|
|
|
|
|
def when_ready(server):
    """
    Start the Prometheus metrics exporter and the listener backlog monitor.

    Metrics are served on a dedicated port (``PROMETHEUS_METRICS_EXPORT_PORT``,
    default 8001) instead of through the application, which makes it easy to
    keep the /metrics endpoint hidden in production. The registry is fed by the
    prometheus_client.multiprocess collector so that data recorded by the
    worker processes is included.
    """
    metrics_registry = CollectorRegistry()
    multiprocess.MultiProcessCollector(metrics_registry)
    start_http_server(
        port=int(os.environ.get("PROMETHEUS_METRICS_EXPORT_PORT", 8001)),
        registry=metrics_registry,
    )

    # Watch the backlog of the sockets Gunicorn is listening on from a
    # background thread living in the Arbiter process.
    SocketMonitor(server=server, registry=metrics_registry).start()
|
|
|
|
|
|
def post_fork(server, worker):
    """
    Start a monitoring thread inside the worker process.

    The thread reports the state of the worker's thread and connection pool;
    see WorkerMonitor for the metrics it publishes.
    """
    WorkerMonitor(worker=worker).start()
|
|
|
|
|
|
def worker_exit(server, worker):
    """
    Tell prometheus_client that this worker's process is gone, so that any
    cleanup for that pid can happen.
    """
    exited_pid = worker.pid
    multiprocess.mark_process_dead(exited_pid)
|
|
|
|
|
|
class SocketMonitor(threading.Thread):
    """
    We have enabled the statsd collector for Gunicorn, but this doesn't include
    the backlog due to concerns over portability, see
    https://github.com/benoitc/gunicorn/pull/2407

    Instead, we expose to Prometheus a gauge that will report the backlog size.

    We can then:

     1. use this to monitor how well the Gunicorn instances are keeping up with
        requests.
     2. use this metric to handle HPA scaling e.g. in Kubernetes

    The thread is a daemon thread, so it never blocks process shutdown.
    """

    # Leading part of `struct tcp_info` (include/uapi/linux/tcp.h): eight
    # one-byte fields followed by 32-bit counters. Precompiled once; its size
    # (104 bytes) is also the buffer length requested from getsockopt.
    _TCP_INFO = struct.Struct("B" * 8 + "I" * 24)
    # Position of `tcpi_unacked` within the unpacked tuple.
    _TCPI_UNACKED = 12

    def __init__(self, server, registry):
        """
        :param server: object exposing the listening sockets as ``LISTENERS``.
        :param registry: Prometheus registry the backlog gauge is registered in.
        """
        super().__init__()
        self.daemon = True
        self.server = server
        self.registry = registry

    def run(self):
        """
        Every METRICS_UPDATE_INTERVAL_SECONDS seconds, check to see how many
        connections are pending for each server socket.

        We label each individually, as limits such as `--backlog` will apply to
        each individually.
        """
        if sys.platform != "linux":
            # We use the assumption that we are on Linux to be able to get the
            # socket backlog, so if we're not on Linux, we return immediately.
            return

        backlog_gauge = Gauge(
            "gunicorn_pending_connections",
            "The number of pending connections on all sockets. Linux only.",
            registry=self.registry,
            labelnames=["listener"],
        )

        while True:
            for listener, backlog in self._collect_backlogs().items():
                backlog_gauge.labels(listener=listener).set(backlog)

            time.sleep(METRICS_UPDATE_INTERVAL_SECONDS)

    def _collect_backlogs(self):
        """
        Return ``{str(sock): backlog}`` for every listener that can be read.

        A listener whose TCP info cannot be fetched or decoded is skipped
        rather than allowed to raise: an exception escaping `run` would end the
        monitoring thread for good and freeze the metric for all listeners.
        """
        backlogs = {}
        for sock in self.server.LISTENERS:
            try:
                backlogs[str(sock)] = self.get_backlog(sock=sock)
            except (OSError, struct.error):
                # OSError: the socket does not support TCP_INFO (for example a
                # non-TCP listener). struct.error: fewer bytes than expected
                # were returned. Deliberately best-effort; logged at debug
                # level to avoid repeating the message every interval.
                logging.getLogger(__name__).debug(
                    "Could not read backlog for listener %s", sock, exc_info=True
                )
        return backlogs

    def get_backlog(self, sock):
        """
        Return the `tcpi_unacked` field of the socket's TCP_INFO, which is
        what this monitor reports as the backlog of a listening socket.

        :raises OSError: if the socket option cannot be read.
        :raises struct.error: if the returned buffer has an unexpected size.
        """
        tcp_info_struct = sock.getsockopt(socket.IPPROTO_TCP, socket.TCP_INFO, self._TCP_INFO.size)
        return self._TCP_INFO.unpack(tcp_info_struct)[self._TCPI_UNACKED]
|
|
|
|
|
|
class WorkerMonitor(threading.Thread):
    """
    Gunicorn ships statsd logger support that gives us metrics such as the
    number of workers, requests and request duration. See
    https://docs.gunicorn.org/en/stable/instrumentation.html for details.

    That does not tell us how busy each worker's pool is, so this thread, run
    inside a worker process, reports connection and thread utilization as
    Prometheus gauges labelled with the worker's pid.

    NOTE(review): reads `nr_conns` and `futures` from the worker, so it
    presumably expects a threaded worker class - confirm against the Gunicorn
    worker class in use.
    """

    def __init__(self, worker):
        """
        :param worker: the Gunicorn worker whose pools are reported.
        """
        super().__init__()
        self.worker = worker
        self.daemon = True

    def run(self):
        """
        Publish the configured limits once, then every
        METRICS_UPDATE_INTERVAL_SECONDS seconds refresh the gauges describing
        current connection and thread-pool usage.
        """

        def per_pid_gauge(name, documentation):
            # Every gauge in this monitor carries the same single label.
            return Gauge(name, documentation, labelnames=["pid"])

        active_worker_connections = per_pid_gauge(
            "gunicorn_active_worker_connections", "Number of active connections."
        )
        max_worker_connections = per_pid_gauge(
            "gunicorn_max_worker_connections", "Maximum worker connections."
        )
        total_threads = per_pid_gauge(
            "gunicorn_max_worker_threads", "Size of the thread pool per worker."
        )
        active_threads = per_pid_gauge(
            "gunicorn_active_worker_threads", "Number of threads actively processing requests."
        )
        pending_requests = per_pid_gauge(
            "gunicorn_pending_requests",
            "Number of requests that have been read from a connection but have not completed yet",
        )

        worker = self.worker

        # Static limits taken from the worker configuration; set only once.
        max_worker_connections.labels(pid=worker.pid).set(worker.cfg.worker_connections)
        total_threads.labels(pid=worker.pid).set(worker.cfg.threads)

        while True:
            active_worker_connections.labels(pid=worker.pid).set(worker.nr_conns)
            # There can be more outstanding futures than threads, so the number
            # of busy threads is capped at the configured pool size.
            active_threads.labels(pid=worker.pid).set(min(worker.cfg.threads, len(worker.futures)))
            pending_requests.labels(pid=worker.pid).set(len(worker.futures))

            time.sleep(METRICS_UPDATE_INTERVAL_SECONDS)
|
|
|
|
|
|
# Name of the formatter the log handlers use; taken from the
# LOGGING_FORMATTER_NAME environment variable, falling back to "default".
LOGGING_FORMATTER_NAME = os.environ.get("LOGGING_FORMATTER_NAME", "default")
|
|
|
|
# Setup stdlib logging to be handled by Structlog
|
|
def add_pid_and_tid(
    logger: logging.Logger, method_name: str, event_dict: structlog.types.EventDict
) -> structlog.types.EventDict:
    """
    Structlog processor that stamps each event with the current process id
    ("pid") and thread identifier ("tid").

    The event dict is modified in place and the same object is returned;
    `logger` and `method_name` are accepted for the processor signature but
    not used.
    """
    event_dict.update(pid=os.getpid(), tid=threading.get_ident())
    return event_dict
|
|
|
|
|
|
# Processors run on log entries that do not come from structlog, before they
# reach the renderer: they add the log level, the logger name, the pid/tid and
# an ISO timestamp to the event dict.
pre_chain = [
    structlog.stdlib.add_log_level,
    structlog.stdlib.add_logger_name,
    add_pid_and_tid,
    structlog.processors.TimeStamper(fmt="iso"),
]


# This is a copy of the default logging config for gunicorn, with two changes:
#
#  1. the gunicorn loggers do not propagate to the root handlers (otherwise we
#     would get duplicate log lines)
#  2. log records are processed by structlog
#
# See
# https://github.com/benoitc/gunicorn/blob/0b953b803786997d633d66c0f7c7b290df75e07c/gunicorn/glogging.py#L48
# for the default log settings.
logconfig_dict = {
    "version": 1,
    "disable_existing_loggers": True,
    # Both formatters share the same structlog plumbing and differ only in the
    # final renderer: coloured console output or JSON.
    "formatters": {
        formatter_name: {
            "()": structlog.stdlib.ProcessorFormatter,
            "processor": renderer,
            "foreign_pre_chain": pre_chain,
        }
        for formatter_name, renderer in (
            ("default", structlog.dev.ConsoleRenderer(colors=True)),
            ("json", structlog.processors.JSONRenderer()),
        )
    },
    "root": {"level": "INFO", "handlers": ["console"]},
    # gunicorn.error is sent to stderr and gunicorn.access to stdout; neither
    # propagates to the root logger.
    "loggers": {
        logger_name: {
            "level": "INFO",
            "handlers": [handler_name],
            "propagate": False,
            "qualname": logger_name,
        }
        for logger_name, handler_name in (
            ("gunicorn.error", "error_console"),
            ("gunicorn.access", "console"),
        )
    },
    # Plain stream handlers; the formatter is picked via LOGGING_FORMATTER_NAME.
    "handlers": {
        handler_name: {
            "class": "logging.StreamHandler",
            "formatter": LOGGING_FORMATTER_NAME,
            "stream": stream,
        }
        for handler_name, stream in (
            ("error_console", "ext://sys.stderr"),
            ("console", "ext://sys.stdout"),
        )
    },
}
|