import asyncio
import hashlib
import json
import types
from time import perf_counter
from typing import Any, Dict, List, Optional, Tuple, Union, cast

import sqlparse
from aioch import Client
from asgiref.sync import async_to_sync
from clickhouse_driver import Client as SyncClient
from clickhouse_pool import ChPool
from django.conf import settings as app_settings
from django.core.cache import cache
from django.utils.timezone import now
from sentry_sdk.api import capture_exception

from ee.clickhouse.errors import wrap_query_error
from ee.clickhouse.timer import get_timer_thread
from posthog import redis
from posthog.constants import AnalyticsDBMS
from posthog.internal_metrics import incr, timing
from posthog.settings import (
    CLICKHOUSE_ASYNC,
    CLICKHOUSE_CA,
    CLICKHOUSE_CONN_POOL_MAX,
    CLICKHOUSE_CONN_POOL_MIN,
    CLICKHOUSE_DATABASE,
    CLICKHOUSE_HOST,
    CLICKHOUSE_PASSWORD,
    CLICKHOUSE_SECURE,
    CLICKHOUSE_USER,
    CLICKHOUSE_VERIFY,
    PRIMARY_DB,
    TEST,
)
from posthog.utils import get_safe_cache

InsertParams = Union[list, tuple, types.GeneratorType]
NonInsertParams = Dict[str, Any]
QueryArgs = Optional[Union[InsertParams, NonInsertParams]]

CACHE_TTL = 60  # seconds

SLOW_QUERY_THRESHOLD_MS = 15000
QUERY_TIMEOUT_THREAD = get_timer_thread("ee.clickhouse.client", SLOW_QUERY_THRESHOLD_MS)

_request_information: Optional[Dict] = None


def default_client():
    """
    Return a bare-bones client for use in places where we are only interested
    in general ClickHouse state.

    DO NOT USE THIS FOR QUERYING DATA.
    """
    return SyncClient(
        host=CLICKHOUSE_HOST,
        secure=CLICKHOUSE_SECURE,
        user=CLICKHOUSE_USER,
        password=CLICKHOUSE_PASSWORD,
        ca_certs=CLICKHOUSE_CA,
        verify=CLICKHOUSE_VERIFY,
    )


def make_ch_pool(**overrides) -> ChPool:
    kwargs = {
        "host": CLICKHOUSE_HOST,
        "database": CLICKHOUSE_DATABASE,
        "secure": CLICKHOUSE_SECURE,
        "user": CLICKHOUSE_USER,
        "password": CLICKHOUSE_PASSWORD,
        "ca_certs": CLICKHOUSE_CA,
        "verify": CLICKHOUSE_VERIFY,
        "connections_min": CLICKHOUSE_CONN_POOL_MIN,
        "connections_max": CLICKHOUSE_CONN_POOL_MAX,
        "settings": {"mutations_sync": "1"} if TEST else {},
        **overrides,
    }

    return ChPool(**kwargs)


if PRIMARY_DB != AnalyticsDBMS.CLICKHOUSE:
    ch_client = None  # type: Client

    class ClickHouseNotConfigured(NotImplementedError):
        def __init__(self, msg="This function only works if PRIMARY_DB is set to indicate ClickHouse!", *args):
            super().__init__(msg, *args)

    def async_execute(query, args=None, settings=None, with_column_types=False):
        raise ClickHouseNotConfigured()

    def sync_execute(query, args=None, settings=None, with_column_types=False):
        raise ClickHouseNotConfigured()

    def cache_sync_execute(query, args=None, redis_client=None, ttl=None, settings=None, with_column_types=False):
        raise ClickHouseNotConfigured()


else:
    if not TEST and CLICKHOUSE_ASYNC:
        ch_client = Client(
            host=CLICKHOUSE_HOST,
            database=CLICKHOUSE_DATABASE,
            secure=CLICKHOUSE_SECURE,
            user=CLICKHOUSE_USER,
            password=CLICKHOUSE_PASSWORD,
            ca_certs=CLICKHOUSE_CA,
            verify=CLICKHOUSE_VERIFY,
        )

        ch_pool = make_ch_pool()

        @async_to_sync
        async def async_execute(query, args=None, settings=None, with_column_types=False):
            loop = asyncio.get_event_loop()
            task = loop.create_task(
                ch_client.execute(query, args, settings=settings, with_column_types=with_column_types)
            )
            return task

    else:
        # If this is a test, use the sync client
        ch_client = SyncClient(
            host=CLICKHOUSE_HOST,
            database=CLICKHOUSE_DATABASE,
            secure=CLICKHOUSE_SECURE,
            user=CLICKHOUSE_USER,
            password=CLICKHOUSE_PASSWORD,
            ca_certs=CLICKHOUSE_CA,
            verify=CLICKHOUSE_VERIFY,
            settings={"mutations_sync": "1"} if TEST else {},
        )
        ch_pool = make_ch_pool()

        def async_execute(query, args=None, settings=None, with_column_types=False):
            return sync_execute(query, args, settings=settings, with_column_types=with_column_types)

    def cache_sync_execute(query, args=None, redis_client=None, ttl=CACHE_TTL, settings=None, with_column_types=False):
        if not redis_client:
            redis_client = redis.get_client()
        key = _key_hash(query, args)
        if redis_client.exists(key):
            result = _deserialize(redis_client.get(key))
            return result
        else:
            result = sync_execute(query, args, settings=settings, with_column_types=with_column_types)
            redis_client.set(key, _serialize(result), ex=ttl)
            return result

    def sync_execute(query, args=None, settings=None, with_column_types=False):
        with ch_pool.get_client() as client:
            start_time = perf_counter()

            prepared_sql, prepared_args, tags = _prepare_query(client=client, query=query, args=args)

            timeout_task = QUERY_TIMEOUT_THREAD.schedule(_notify_of_slow_query_failure, tags)

            try:
                result = client.execute(
                    prepared_sql, params=prepared_args, settings=settings, with_column_types=with_column_types
                )
            except Exception as err:
                err = wrap_query_error(err)
                tags["failed"] = True
                tags["reason"] = type(err).__name__
                incr("clickhouse_sync_execution_failure", tags=tags)
                raise err
            finally:
                execution_time = perf_counter() - start_time

                QUERY_TIMEOUT_THREAD.cancel(timeout_task)
                timing("clickhouse_sync_execution_time", execution_time * 1000.0, tags=tags)

                if app_settings.SHELL_PLUS_PRINT_SQL:
                    print("Execution time: %.6fs" % (execution_time,))

                if _request_information is not None and _request_information.get("save", False):
                    save_query(prepared_sql, execution_time)

        return result

    def substitute_params(query, params):
        """
        Helper method to ease progressive rendering of ClickHouse SQL queries.

        There are many places where we construct queries to be used as subqueries of
        others. Each time we generate a subquery we would otherwise also have to pass
        up its "bound" parameters, which are only resolved at the point of calling
        ClickHouse via the clickhouse_driver `Client`. This results in sometimes large
        lists of parameters with no relevance to the containing query being passed up.

        Rather than do that, we can "render" the subqueries before using them as
        subqueries, so the containing code is only responsible for its own parameters
        and we avoid potential parameter-name collisions.
        """
        return cast(SyncClient, ch_client).substitute_params(query, params)


def _prepare_query(client: SyncClient, query: str, args: QueryArgs):
    """
    Given a string query with placeholders, we do one of two things:

        1. for an insert query we just format and remove comments
        2. for non-insert queries, we return the SQL with placeholders evaluated with
           the contents of `args`

    We also return `tags`, which contains some detail on the context within which the
    query was executed, e.g. the Django view name.

    NOTE: `client.execute` would normally handle substitution, but because we want to
    strip the comments to make it easier to copy and paste queries from
    `system.query_log` into Metabase (Metabase doesn't show newlines, so with comments
    you can't get a working query without exporting to CSV or similar), we need to do
    it manually.

    We only want to attempt substitution for SELECT queries, which clickhouse_driver
    at this moment in time decides based on the predicate below.
    """
    prepared_args: Any = None
    if isinstance(args, (list, tuple, types.GeneratorType)):
        # If we get one of these it means we have an insert; let the clickhouse
        # client handle substitution here.
        rendered_sql = query
        prepared_args = args
    elif not args:
        # If `args` is not truthy then make prepared_args `None`, which the
        # clickhouse client uses to signal that no substitution is desired.
        # Expected `args` values here are `None` or `{}`, for instance.
        rendered_sql = query
        prepared_args = None
    else:
        # Else perform the substitution so we can perform operations on the raw
        # non-templated SQL
        rendered_sql = client.substitute_params(query, args)
        prepared_args = None

    formatted_sql = sqlparse.format(rendered_sql, strip_comments=True)
    annotated_sql, tags = _annotate_tagged_query(formatted_sql, args)

    if app_settings.SHELL_PLUS_PRINT_SQL:
        print()
        print(format_sql(formatted_sql))

    return annotated_sql, prepared_args, tags


def _deserialize(result_bytes: bytes) -> List[Tuple]:
    results = []
    for x in json.loads(result_bytes):
        results.append(tuple(x))
    return results


def _serialize(result: Any) -> bytes:
    return json.dumps(result).encode("utf-8")


def _key_hash(query: str, args: Any) -> bytes:
    key = hashlib.md5(query.encode("utf-8") + json.dumps(args).encode("utf-8")).digest()
    return key


def _annotate_tagged_query(query, args):
    """
    Adds a /* ... */ comment so we can look in ClickHouse's `system.query_log` and
    easily marry queries up to the code that generated them.
    """
    tags = {"kind": (_request_information or {}).get("kind"), "id": (_request_information or {}).get("id")}
    if isinstance(args, dict) and "team_id" in args:
        tags["team_id"] = args["team_id"]

    # Annotate the query with information on the request/task
    if _request_information is not None:
        query = f"/* {_request_information['kind']}:{_request_information['id'].replace('/', '_')} */ {query}"
    return query, tags


def _notify_of_slow_query_failure(tags: Dict[str, Any]):
    tags["failed"] = True
    tags["reason"] = "timeout"
    incr("clickhouse_sync_execution_failure", tags=tags)


def format_sql(rendered_sql, colorize=True):
    formatted_sql = sqlparse.format(rendered_sql, reindent_aligned=True)
    if colorize:
        try:
            import pygments.formatters
            import pygments.lexers

            return pygments.highlight(
                formatted_sql, pygments.lexers.get_lexer_by_name("sql"), pygments.formatters.TerminalFormatter()
            )
        except:
            pass

    return formatted_sql


def save_query(sql: str, execution_time: float) -> None:
    """
    Save query for debugging purposes
    """
    if _request_information is None:
        return

    try:
        key = "save_query_{}".format(_request_information["user_id"])
        queries = json.loads(get_safe_cache(key) or "[]")

        queries.insert(
            0,
            {
                "timestamp": now().isoformat(),
                "query": format_sql(sql, colorize=False),
                "execution_time": execution_time,
            },
        )
        cache.set(key, json.dumps(queries), timeout=120)
    except Exception as e:
        capture_exception(e)
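

# --- Illustrative usage sketch (editor's addition, not part of the original module) ---
# A minimal example of how `substitute_params`, `sync_execute` and `cache_sync_execute`
# are meant to be combined: render a parameterised subquery up front so only the outer
# query's own parameters travel with it. The table/column names and parameter values
# below are hypothetical, and this function is never called by the module itself.
def _example_usage() -> None:
    # Render the subquery eagerly so its params don't leak into the containing query.
    subquery = substitute_params(
        "SELECT distinct_id FROM events WHERE team_id = %(team_id)s", {"team_id": 1}
    )

    # Plain execution: placeholders in the outer query are substituted from `args`,
    # comments are stripped, and the query is tagged for `system.query_log`.
    rows = sync_execute(
        f"SELECT count() FROM ({subquery}) WHERE 1 = %(flag)s", {"flag": 1}
    )

    # Cached execution: the result is serialized to Redis for CACHE_TTL seconds under
    # an MD5 hash of the query text plus its JSON-encoded args.
    cached_rows = cache_sync_execute(
        "SELECT count() FROM events WHERE team_id = %(team_id)s", {"team_id": 1}
    )

    print(rows, cached_rows)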