import asyncio
import hashlib
import json
import types
from time import perf_counter
from typing import Any, Dict, List, Optional, Tuple, Union, cast

import sqlparse
from aioch import Client
from asgiref.sync import async_to_sync
from clickhouse_driver import Client as SyncClient
from clickhouse_pool import ChPool
from django.conf import settings as app_settings
from django.core.cache import cache
from django.utils.timezone import now
from sentry_sdk.api import capture_exception

from ee.clickhouse.errors import wrap_query_error
from ee.clickhouse.timer import get_timer_thread
from posthog import redis
from posthog.constants import AnalyticsDBMS
from posthog.internal_metrics import incr, timing
from posthog.settings import (
    CLICKHOUSE_ASYNC,
    CLICKHOUSE_CA,
    CLICKHOUSE_CONN_POOL_MAX,
    CLICKHOUSE_CONN_POOL_MIN,
    CLICKHOUSE_DATABASE,
    CLICKHOUSE_HOST,
    CLICKHOUSE_PASSWORD,
    CLICKHOUSE_SECURE,
    CLICKHOUSE_USER,
    CLICKHOUSE_VERIFY,
    PRIMARY_DB,
    TEST,
)
from posthog.utils import get_safe_cache

InsertParams = Union[list, tuple, types.GeneratorType]
NonInsertParams = Dict[str, Any]
QueryArgs = Optional[Union[InsertParams, NonInsertParams]]

CACHE_TTL = 60  # seconds
SLOW_QUERY_THRESHOLD_MS = 15000
QUERY_TIMEOUT_THREAD = get_timer_thread("ee.clickhouse.client", SLOW_QUERY_THRESHOLD_MS)

_request_information: Optional[Dict] = None


def default_client():
    """
    Return a bare bones client for use in places where we are only interested in general ClickHouse state
    DO NOT USE THIS FOR QUERYING DATA
    """
    return SyncClient(
        host=CLICKHOUSE_HOST,
        secure=CLICKHOUSE_SECURE,
        user=CLICKHOUSE_USER,
        password=CLICKHOUSE_PASSWORD,
        ca_certs=CLICKHOUSE_CA,
        verify=CLICKHOUSE_VERIFY,
    )
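
# Illustrative sketch only: `default_client` is intended for general server-state checks,
# e.g. probing connectivity or the server version, never for product data queries.
#
#     client = default_client()
#     client.execute("SELECT version()")  # e.g. [("21.8.5.7",)]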


def make_ch_pool(**overrides) -> ChPool:
    kwargs = {
        "host": CLICKHOUSE_HOST,
        "database": CLICKHOUSE_DATABASE,
        "secure": CLICKHOUSE_SECURE,
        "user": CLICKHOUSE_USER,
        "password": CLICKHOUSE_PASSWORD,
        "ca_certs": CLICKHOUSE_CA,
        "verify": CLICKHOUSE_VERIFY,
        "connections_min": CLICKHOUSE_CONN_POOL_MIN,
        "connections_max": CLICKHOUSE_CONN_POOL_MAX,
        "settings": {"mutations_sync": "1"} if TEST else {},
        **overrides,
    }

    return ChPool(**kwargs)


if PRIMARY_DB != AnalyticsDBMS.CLICKHOUSE:
    ch_client = None  # type: Client

    class ClickHouseNotConfigured(NotImplementedError):
        def __init__(self, msg="This function only works if PRIMARY_DB is set to indicate ClickHouse!", *args):
            super().__init__(msg, *args)

    def async_execute(query, args=None, settings=None, with_column_types=False):
        raise ClickHouseNotConfigured()

    def sync_execute(query, args=None, settings=None, with_column_types=False):
        raise ClickHouseNotConfigured()

    def cache_sync_execute(query, args=None, redis_client=None, ttl=None, settings=None, with_column_types=False):
        raise ClickHouseNotConfigured()


else:
    if not TEST and CLICKHOUSE_ASYNC:
        ch_client = Client(
            host=CLICKHOUSE_HOST,
            database=CLICKHOUSE_DATABASE,
            secure=CLICKHOUSE_SECURE,
            user=CLICKHOUSE_USER,
            password=CLICKHOUSE_PASSWORD,
            ca_certs=CLICKHOUSE_CA,
            verify=CLICKHOUSE_VERIFY,
        )

        ch_pool = make_ch_pool()

        @async_to_sync
        async def async_execute(query, args=None, settings=None, with_column_types=False):
            loop = asyncio.get_event_loop()
            task = loop.create_task(
                ch_client.execute(query, args, settings=settings, with_column_types=with_column_types)
            )
            return task

    else:
        # if this is a test use the sync client
        ch_client = SyncClient(
            host=CLICKHOUSE_HOST,
            database=CLICKHOUSE_DATABASE,
            secure=CLICKHOUSE_SECURE,
            user=CLICKHOUSE_USER,
            password=CLICKHOUSE_PASSWORD,
            ca_certs=CLICKHOUSE_CA,
            verify=CLICKHOUSE_VERIFY,
            settings={"mutations_sync": "1"} if TEST else {},
        )

        ch_pool = make_ch_pool()

        def async_execute(query, args=None, settings=None, with_column_types=False):
            return sync_execute(query, args, settings=settings, with_column_types=with_column_types)

    def cache_sync_execute(query, args=None, redis_client=None, ttl=CACHE_TTL, settings=None, with_column_types=False):
        if not redis_client:
            redis_client = redis.get_client()
        key = _key_hash(query, args)
        if redis_client.exists(key):
            result = _deserialize(redis_client.get(key))
            return result
        else:
            result = sync_execute(query, args, settings=settings, with_column_types=with_column_types)
            redis_client.set(key, _serialize(result), ex=ttl)
            return result
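
    # A minimal usage sketch (hypothetical query and team id): results are cached in Redis
    # under a hash of the query plus args for `ttl` seconds (CACHE_TTL by default), so
    # repeated calls within the TTL skip ClickHouse entirely.
    #
    #     cache_sync_execute(
    #         "SELECT count() FROM events WHERE team_id = %(team_id)s",
    #         {"team_id": 42},
    #         ttl=300,
    #     )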

    def sync_execute(query, args=None, settings=None, with_column_types=False):
        with ch_pool.get_client() as client:
            start_time = perf_counter()

            prepared_sql, prepared_args, tags = _prepare_query(client=client, query=query, args=args)

            timeout_task = QUERY_TIMEOUT_THREAD.schedule(_notify_of_slow_query_failure, tags)

            try:
                result = client.execute(
                    prepared_sql, params=prepared_args, settings=settings, with_column_types=with_column_types
                )
            except Exception as err:
                err = wrap_query_error(err)
                tags["failed"] = True
                tags["reason"] = type(err).__name__
                incr("clickhouse_sync_execution_failure", tags=tags)

                raise err
            finally:
                execution_time = perf_counter() - start_time

                QUERY_TIMEOUT_THREAD.cancel(timeout_task)
                timing("clickhouse_sync_execution_time", execution_time * 1000.0, tags=tags)

                if app_settings.SHELL_PLUS_PRINT_SQL:
                    print("Execution time: %.6fs" % (execution_time,))
                if _request_information is not None and _request_information.get("save", False):
                    save_query(prepared_sql, execution_time)
        return result
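
    # Illustrative call (hypothetical query): `%(...)s` placeholders are rendered by
    # `_prepare_query` before execution, and a `team_id` arg is also picked up as a
    # metrics tag for the timing/failure counters above.
    #
    #     rows = sync_execute(
    #         "SELECT uuid FROM events WHERE team_id = %(team_id)s LIMIT 10",
    #         {"team_id": 42},
    #     )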

    def substitute_params(query, params):
        """
        Helper method to ease rendering of SQL ClickHouse queries progressively.
        For example, there are many places where we construct queries to be used
        as subqueries of others. Each time we generate a subquery we also pass
        up the "bound" parameters to be used to render the query, which
        otherwise only happens at the point of calling ClickHouse via the
        clickhouse_driver `Client`.

        This results in sometimes large lists of parameters with no relevance to
        the containing query being passed up. Rather than do this, we can
        instead "render" the subqueries before using them as subqueries, so our
        containing code is only responsible for its own parameters, and we can
        avoid any potential param collisions.
        """
        return cast(SyncClient, ch_client).substitute_params(query, params)
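
    # A sketch of the progressive rendering described above (table, column and parameter
    # names are hypothetical): render the subquery's params immediately, then embed the
    # already-rendered SQL so only the outer query's params travel upwards.
    #
    #     inner_sql = substitute_params(
    #         "SELECT distinct_id FROM events WHERE team_id = %(team_id)s", {"team_id": 42}
    #     )
    #     outer_sql = f"SELECT count() FROM ({inner_sql}) AS sub"
    #     sync_execute(outer_sql)  # no subquery params need to be passed here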


def _prepare_query(client: SyncClient, query: str, args: QueryArgs):
    """
    Given a string query with placeholders we do one of two things:

    1. for an insert query we just format and remove comments
    2. for non-insert queries, we return the sql with placeholders
       evaluated with the contents of `args`

    We also return `tags`, which contains some detail of the context
    within which the query was executed e.g. the django view name

    NOTE: `client.execute` would normally handle substitution, but
    because we want to strip the comments to make it easier to copy
    and paste queries from the `system.query_log` easily with metabase
    (metabase doesn't show new lines, so with comments you can't get
    a working query without exporting to csv or similar), we need to
    do it manually.

    We only want to try to substitute for SELECT queries, which
    clickhouse_driver at this moment in time decides based on the
    below predicate.
    """
    prepared_args: Any = None
    if isinstance(args, (list, tuple, types.GeneratorType)):
        # If we get one of these it means we have an insert; let the clickhouse
        # client handle substitution here.
        rendered_sql = query
        prepared_args = args
    elif not args:
        # If `args` is not truthy then make prepared_args `None`, which the
        # clickhouse client uses to signal no substitution is desired. Expected
        # args values are `None` or `{}` for instance.
        rendered_sql = query
        prepared_args = None
    else:
        # Else perform the substitution so we can perform operations on the raw
        # non-templated SQL
        rendered_sql = client.substitute_params(query, args)
        prepared_args = None

    formatted_sql = sqlparse.format(rendered_sql, strip_comments=True)
    annotated_sql, tags = _annotate_tagged_query(formatted_sql, args)

    if app_settings.SHELL_PLUS_PRINT_SQL:
        print()
        print(format_sql(formatted_sql))

    return annotated_sql, prepared_args, tags
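
# How the three `args` shapes above behave, sketched with hypothetical queries and a
# `client` checked out from `ch_pool` (as `sync_execute` does):
#
#     # 1. list/tuple/generator args -> treated as an insert; substitution is left
#     #    to the clickhouse driver
#     _prepare_query(client, "INSERT INTO events (uuid) VALUES", [("some-uuid",)])
#
#     # 2. falsy args -> no substitution at all
#     _prepare_query(client, "SELECT 1", None)
#
#     # 3. dict args -> rendered up front so comments can be stripped from the final SQL
#     _prepare_query(client, "SELECT count() FROM events WHERE team_id = %(team_id)s", {"team_id": 42})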


def _deserialize(result_bytes: bytes) -> List[Tuple]:
    results = []
    for x in json.loads(result_bytes):
        results.append(tuple(x))
    return results


def _serialize(result: Any) -> bytes:
    return json.dumps(result).encode("utf-8")


def _key_hash(query: str, args: Any) -> bytes:
    key = hashlib.md5(query.encode("utf-8") + json.dumps(args).encode("utf-8")).digest()
    return key


def _annotate_tagged_query(query, args):
    """
    Adds in a /* */ comment so we can look in ClickHouse's `system.query_log`
    and easily marry the query up to the generating code.
    """
    tags = {"kind": (_request_information or {}).get("kind"), "id": (_request_information or {}).get("id")}
    if isinstance(args, dict) and "team_id" in args:
        tags["team_id"] = args["team_id"]
    # Annotate the query with information on the request/task
    if _request_information is not None:
        query = f"/* {_request_information['kind']}:{_request_information['id'].replace('/', '_')} */ {query}"

    return query, tags
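
# For example, with `_request_information = {"kind": "request", "id": "api_person_list"}`
# (hypothetical values), a query comes back annotated roughly as:
#
#     /* request:api_person_list */ SELECT count() FROM events
#
# which is what appears in `system.query_log` and lets the log entry be traced back to
# the originating view or task.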


def _notify_of_slow_query_failure(tags: Dict[str, Any]):
    tags["failed"] = True
    tags["reason"] = "timeout"
    incr("clickhouse_sync_execution_failure", tags=tags)


def format_sql(rendered_sql, colorize=True):
    formatted_sql = sqlparse.format(rendered_sql, reindent_aligned=True)
    if colorize:
        try:
            import pygments.formatters
            import pygments.lexers

            return pygments.highlight(
                formatted_sql, pygments.lexers.get_lexer_by_name("sql"), pygments.formatters.TerminalFormatter()
            )
        except:
            pass

    return formatted_sql


def save_query(sql: str, execution_time: float) -> None:
    """
    Save query for debugging purposes
    """
    if _request_information is None:
        return

    try:
        key = "save_query_{}".format(_request_information["user_id"])
        queries = json.loads(get_safe_cache(key) or "[]")

        queries.insert(
            0,
            {
                "timestamp": now().isoformat(),
                "query": format_sql(sql, colorize=False),
                "execution_time": execution_time,
            },
        )
        cache.set(key, json.dumps(queries), timeout=120)
    except Exception as e:
        capture_exception(e)
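
# The saved entries land in the Django cache under a per-user key, so the most recent
# queries for a user can be read back while debugging (the user id below is hypothetical):
#
#     json.loads(get_safe_cache("save_query_1") or "[]")
#     # -> [{"timestamp": ..., "query": "SELECT ...", "execution_time": 0.123}, ...]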