posthog/ee/clickhouse/client.py
Karl-Aksel Puulmann f0c9d68166
Make PRINT_SQL output readable (#6245)
Our queries now contain a lot of comment noise that's not relevant when
developing. This change makes the PRINT_SQL env variable usable again by
stripping these comments.
2021-10-05 11:27:17 +03:00


import asyncio
import hashlib
import json
import types
from time import perf_counter
from typing import Any, Dict, List, Optional, Tuple, Union
import sqlparse
from aioch import Client
from asgiref.sync import async_to_sync
from clickhouse_driver import Client as SyncClient
from clickhouse_driver.util.escape import escape_params
from clickhouse_pool import ChPool
from django.conf import settings as app_settings
from django.core.cache import cache
from django.utils.timezone import now
from sentry_sdk.api import capture_exception
from ee.clickhouse.errors import wrap_query_error
from ee.clickhouse.timer import get_timer_thread
from posthog import redis
from posthog.constants import AnalyticsDBMS
from posthog.internal_metrics import incr, timing
from posthog.settings import (
CLICKHOUSE_ASYNC,
CLICKHOUSE_CA,
CLICKHOUSE_CONN_POOL_MAX,
CLICKHOUSE_CONN_POOL_MIN,
CLICKHOUSE_DATABASE,
CLICKHOUSE_HOST,
CLICKHOUSE_PASSWORD,
CLICKHOUSE_SECURE,
CLICKHOUSE_USER,
CLICKHOUSE_VERIFY,
PRIMARY_DB,
TEST,
)
from posthog.utils import get_safe_cache

InsertParams = Union[list, tuple, types.GeneratorType]
NonInsertParams = Dict[str, Any]
QueryArgs = Optional[Union[InsertParams, NonInsertParams]]
CACHE_TTL = 60 # seconds
SLOW_QUERY_THRESHOLD_MS = 15000
QUERY_TIMEOUT_THREAD = get_timer_thread("ee.clickhouse.client", SLOW_QUERY_THRESHOLD_MS)
_request_information: Optional[Dict] = None
def make_ch_pool(**overrides) -> ChPool:
kwargs = {
"host": CLICKHOUSE_HOST,
"database": CLICKHOUSE_DATABASE,
"secure": CLICKHOUSE_SECURE,
"user": CLICKHOUSE_USER,
"password": CLICKHOUSE_PASSWORD,
"ca_certs": CLICKHOUSE_CA,
"verify": CLICKHOUSE_VERIFY,
"connections_min": CLICKHOUSE_CONN_POOL_MIN,
"connections_max": CLICKHOUSE_CONN_POOL_MAX,
"settings": {"mutations_sync": "1"} if TEST else {},
**overrides,
}
return ChPool(**kwargs)
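
# A minimal usage sketch (illustrative only, not part of the original module):
# `make_ch_pool` returns a clickhouse_pool.ChPool whose `get_client()` context
# manager checks a SyncClient out of the pool and returns it on exit — the
# same pattern `sync_execute` relies on below.
#
#     pool = make_ch_pool(connections_min=1, connections_max=2)
#     with pool.get_client() as client:
#         client.execute("SELECT 1")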
if PRIMARY_DB != AnalyticsDBMS.CLICKHOUSE:
ch_client = None # type: Client
class ClickHouseNotConfigured(NotImplementedError):
        def __init__(self, msg="This function only works if PRIMARY_DB is set to indicate ClickHouse!", *args):
super().__init__(msg, *args)
def async_execute(query, args=None, settings=None, with_column_types=False):
raise ClickHouseNotConfigured()
def sync_execute(query, args=None, settings=None, with_column_types=False):
raise ClickHouseNotConfigured()
def cache_sync_execute(query, args=None, redis_client=None, ttl=None, settings=None, with_column_types=False):
raise ClickHouseNotConfigured()
else:
if not TEST and CLICKHOUSE_ASYNC:
ch_client = Client(
host=CLICKHOUSE_HOST,
database=CLICKHOUSE_DATABASE,
secure=CLICKHOUSE_SECURE,
user=CLICKHOUSE_USER,
password=CLICKHOUSE_PASSWORD,
ca_certs=CLICKHOUSE_CA,
verify=CLICKHOUSE_VERIFY,
)
ch_pool = make_ch_pool()
@async_to_sync
async def async_execute(query, args=None, settings=None, with_column_types=False):
loop = asyncio.get_event_loop()
task = loop.create_task(
ch_client.execute(query, args, settings=settings, with_column_types=with_column_types)
)
return task
else:
        # In tests (or when CLICKHOUSE_ASYNC is off) use the sync client
ch_client = SyncClient(
host=CLICKHOUSE_HOST,
database=CLICKHOUSE_DATABASE,
secure=CLICKHOUSE_SECURE,
user=CLICKHOUSE_USER,
password=CLICKHOUSE_PASSWORD,
ca_certs=CLICKHOUSE_CA,
verify=CLICKHOUSE_VERIFY,
settings={"mutations_sync": "1"} if TEST else {},
)
ch_pool = make_ch_pool()
def async_execute(query, args=None, settings=None, with_column_types=False):
return sync_execute(query, args, settings=settings, with_column_types=with_column_types)
def cache_sync_execute(query, args=None, redis_client=None, ttl=CACHE_TTL, settings=None, with_column_types=False):
if not redis_client:
redis_client = redis.get_client()
key = _key_hash(query, args)
if redis_client.exists(key):
result = _deserialize(redis_client.get(key))
return result
else:
result = sync_execute(query, args, settings=settings, with_column_types=with_column_types)
redis_client.set(key, _serialize(result), ex=ttl)
return result
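
    # Illustrative sketch (query and ttl are hypothetical): identical
    # (query, args) pairs hash to the same Redis key via `_key_hash`, so the
    # second call below is served from the cache for up to `ttl` seconds
    # instead of hitting ClickHouse.
    #
    #     rows = cache_sync_execute("SELECT count() FROM events", ttl=30)
    #     rows = cache_sync_execute("SELECT count() FROM events", ttl=30)  # cache hit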
def sync_execute(query, args=None, settings=None, with_column_types=False):
with ch_pool.get_client() as client:
start_time = perf_counter()
prepared_sql, prepared_args, tags = _prepare_query(client=client, query=query, args=args)
timeout_task = QUERY_TIMEOUT_THREAD.schedule(_notify_of_slow_query_failure, tags)
try:
result = client.execute(
prepared_sql, params=prepared_args, settings=settings, with_column_types=with_column_types
)
except Exception as err:
err = wrap_query_error(err)
tags["failed"] = True
tags["reason"] = type(err).__name__
incr("clickhouse_sync_execution_failure", tags=tags)
raise err
finally:
execution_time = perf_counter() - start_time
QUERY_TIMEOUT_THREAD.cancel(timeout_task)
timing("clickhouse_sync_execution_time", execution_time * 1000.0, tags=tags)
if app_settings.SHELL_PLUS_PRINT_SQL:
print("Execution time: %.6fs" % (execution_time,))
if _request_information is not None and _request_information.get("save", False):
save_query(prepared_sql, execution_time)
return result
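
    # Illustrative sketch (table and values are hypothetical): non-insert
    # queries use clickhouse_driver's %(name)s placeholders, substituted from
    # a dict by `_prepare_query`, while insert rows pass through as a list.
    #
    #     sync_execute(
    #         "SELECT count() FROM events WHERE team_id = %(team_id)s",
    #         {"team_id": 42},
    #     )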
def _prepare_query(client: SyncClient, query: str, args: QueryArgs):
"""
Given a string query with placeholders we do one of two things:
1. for a insert query we just format, and remove comments
2. for non-insert queries, we return the sql with placeholders
evaluated with the contents of `args`
We also return `tags` which contains some detail around the context
within which the query was executed e.g. the django view name
NOTE: `client.execute` would normally handle substitution, but
because we want to strip the comments to make it easier to copy
and past queries from the `system.query_log` easily with metabase
(metabase doesn't show new lines, so with comments, you can't get
a working query without exporting to csv or similar), we need to
do it manually.
We only want to try to substitue for SELECT queries, which
clickhouse_driver at this moment in time decides based on the
below predicate.
"""
if isinstance(args, (list, tuple, types.GeneratorType)):
rendered_sql = query
else:
rendered_sql = client.substitute_params(query, args or {})
args = None
formatted_sql = sqlparse.format(rendered_sql, strip_comments=True)
annotated_sql, tags = _annotate_tagged_query(formatted_sql, args)
if app_settings.SHELL_PLUS_PRINT_SQL:
print()
print(format_sql(formatted_sql))
return annotated_sql, args, tags
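
# Sketch of the transformation for a SELECT (input is hypothetical): params
# are substituted, comments are stripped, and the request tag is prepended:
#
#     "/* noise */ SELECT 1 WHERE x = %(x)s"  with args {"x": 1}
#     -> "/* request:some_view */ SELECT 1 WHERE x = 1"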
def _deserialize(result_bytes: bytes) -> List[Tuple]:
    results = []
    # JSON round-trips tuples as lists, so rebuild the row tuples here
    for x in json.loads(result_bytes):
        results.append(tuple(x))
    return results
def _serialize(result: Any) -> bytes:
return json.dumps(result).encode("utf-8")
def _key_hash(query: str, args: Any) -> bytes:
    # md5 is used only to derive a deterministic cache key, not for security
    key = hashlib.md5(query.encode("utf-8") + json.dumps(args).encode("utf-8")).digest()
    return key
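
# Round-trip sketch (hypothetical values): JSON turns tuples into lists, which
# is why `_deserialize` rebuilds them, and `_key_hash` is deterministic for
# identical inputs.
#
#     _deserialize(_serialize([(1, "a")]))                         # -> [(1, "a")]
#     _key_hash("SELECT 1", None) == _key_hash("SELECT 1", None)   # True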
def _annotate_tagged_query(query, args):
"""
Adds in a /* */ so we can look in clickhouses `system.query_log`
to easily marry up to the generating code.
"""
tags = {"kind": (_request_information or {}).get("kind"), "id": (_request_information or {}).get("id")}
if isinstance(args, dict) and "team_id" in args:
tags["team_id"] = args["team_id"]
# Annotate the query with information on the request/task
if _request_information is not None:
query = f"/* {_request_information['kind']}:{_request_information['id'].replace('/', '_')} */ {query}"
return query, tags
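
# Sketch of the annotation (request information is hypothetical): with
# `_request_information = {"kind": "request", "id": "app/insights"}` the query
# becomes
#
#     /* request:app_insights */ SELECT ...
#
# so rows in `system.query_log` can be traced back to the generating view or
# task.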
def _notify_of_slow_query_failure(tags: Dict[str, Any]):
tags["failed"] = True
tags["reason"] = "timeout"
incr("clickhouse_sync_execution_failure", tags=tags)
def format_sql(rendered_sql, colorize=True):
formatted_sql = sqlparse.format(rendered_sql, reindent_aligned=True)
if colorize:
try:
import pygments.formatters
import pygments.lexers
return pygments.highlight(
formatted_sql, pygments.lexers.get_lexer_by_name("sql"), pygments.formatters.TerminalFormatter()
)
        except Exception:
            # pygments is an optional nicety; fall back to plain SQL
            pass
return formatted_sql
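
# Usage sketch: `format_sql` is cosmetic only; it reindents the SQL with
# sqlparse and, when pygments is importable, adds terminal colors.
#
#     print(format_sql("SELECT a, b FROM t WHERE a = 1", colorize=False))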
def save_query(sql: str, execution_time: float) -> None:
"""
Save query for debugging purposes
"""
if _request_information is None:
return
try:
key = "save_query_{}".format(_request_information["user_id"])
queries = json.loads(get_safe_cache(key) or "[]")
queries.insert(
0,
{
"timestamp": now().isoformat(),
"query": format_sql(sql, colorize=False),
"execution_time": execution_time,
},
)
cache.set(key, json.dumps(queries), timeout=120)
except Exception as e:
capture_exception(e)
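
# Sketch of reading the saved queries back (user id is hypothetical; the key
# shape and the 120s timeout come from `save_query` above):
#
#     raw = get_safe_cache("save_query_1")
#     queries = json.loads(raw or "[]")  # newest entry first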