posthog/ee/clickhouse/client.py
Harry Waye 83386a5512
fix(retention): add regression test for not_icontains filter (#7748)
* fix(retention): add regression test for not_icontains filter

Previously if using the not_icontains filter, we were trying to double
substitute a query when fetching persons for retention. This adds a
test (and in the next commit a fix) such that we instead apply the
clickhouse substitute only once to each part of the query.

Resolves https://github.com/PostHog/posthog/issues/7747

* avoid double substitution

* Rework solution to not change ActorBaseQuery interface

* Remove unused import

* fix typing
2021-12-17 10:13:38 +00:00

import asyncio
import hashlib
import json
import types
from time import perf_counter
from typing import Any, Dict, List, Optional, Tuple, Union, cast
import sqlparse
from aioch import Client
from asgiref.sync import async_to_sync
from clickhouse_driver import Client as SyncClient
from clickhouse_pool import ChPool
from django.conf import settings as app_settings
from django.core.cache import cache
from django.utils.timezone import now
from sentry_sdk.api import capture_exception
from ee.clickhouse.errors import wrap_query_error
from ee.clickhouse.timer import get_timer_thread
from posthog import redis
from posthog.constants import AnalyticsDBMS
from posthog.internal_metrics import incr, timing
from posthog.settings import (
CLICKHOUSE_ASYNC,
CLICKHOUSE_CA,
CLICKHOUSE_CONN_POOL_MAX,
CLICKHOUSE_CONN_POOL_MIN,
CLICKHOUSE_DATABASE,
CLICKHOUSE_HOST,
CLICKHOUSE_PASSWORD,
CLICKHOUSE_SECURE,
CLICKHOUSE_USER,
CLICKHOUSE_VERIFY,
PRIMARY_DB,
TEST,
)
from posthog.utils import get_safe_cache
InsertParams = Union[list, tuple, types.GeneratorType]
NonInsertParams = Dict[str, Any]
QueryArgs = Optional[Union[InsertParams, NonInsertParams]]
CACHE_TTL = 60 # seconds
SLOW_QUERY_THRESHOLD_MS = 15000
QUERY_TIMEOUT_THREAD = get_timer_thread("ee.clickhouse.client", SLOW_QUERY_THRESHOLD_MS)
_request_information: Optional[Dict] = None
def default_client():
"""
Return a bare-bones client for use in places where we are only interested in general ClickHouse state.
DO NOT USE THIS FOR QUERYING DATA.
"""
return SyncClient(
host=CLICKHOUSE_HOST,
secure=CLICKHOUSE_SECURE,
user=CLICKHOUSE_USER,
password=CLICKHOUSE_PASSWORD,
ca_certs=CLICKHOUSE_CA,
verify=CLICKHOUSE_VERIFY,
)
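# Illustrative sketch (not part of the original module): the intended use of
# `default_client` is limited to general ClickHouse state checks, never
# product data queries, which should go through `sync_execute` instead.
def _example_default_client_liveness_check():  # pragma: no cover - illustrative only
    client = default_client()
    try:
        # A cheap "is ClickHouse up?" probe; no application tables involved.
        return client.execute("SELECT 1")
    finally:
        client.disconnect()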
def make_ch_pool(**overrides) -> ChPool:
kwargs = {
"host": CLICKHOUSE_HOST,
"database": CLICKHOUSE_DATABASE,
"secure": CLICKHOUSE_SECURE,
"user": CLICKHOUSE_USER,
"password": CLICKHOUSE_PASSWORD,
"ca_certs": CLICKHOUSE_CA,
"verify": CLICKHOUSE_VERIFY,
"connections_min": CLICKHOUSE_CONN_POOL_MIN,
"connections_max": CLICKHOUSE_CONN_POOL_MAX,
"settings": {"mutations_sync": "1"} if TEST else {},
**overrides,
}
return ChPool(**kwargs)
if PRIMARY_DB != AnalyticsDBMS.CLICKHOUSE:
ch_client = None # type: Client
class ClickHouseNotConfigured(NotImplementedError):
def __init__(self, msg='This function only works if PRIMARY_DB is set to indicate ClickHouse!', *args):
super().__init__(msg, *args)
def async_execute(query, args=None, settings=None, with_column_types=False):
raise ClickHouseNotConfigured()
def sync_execute(query, args=None, settings=None, with_column_types=False):
raise ClickHouseNotConfigured()
def cache_sync_execute(query, args=None, redis_client=None, ttl=None, settings=None, with_column_types=False):
raise ClickHouseNotConfigured()
else:
if not TEST and CLICKHOUSE_ASYNC:
ch_client = Client(
host=CLICKHOUSE_HOST,
database=CLICKHOUSE_DATABASE,
secure=CLICKHOUSE_SECURE,
user=CLICKHOUSE_USER,
password=CLICKHOUSE_PASSWORD,
ca_certs=CLICKHOUSE_CA,
verify=CLICKHOUSE_VERIFY,
)
ch_pool = make_ch_pool()
@async_to_sync
async def async_execute(query, args=None, settings=None, with_column_types=False):
loop = asyncio.get_event_loop()
task = loop.create_task(
ch_client.execute(query, args, settings=settings, with_column_types=with_column_types)
)
return task
else:
# if this is a test, or async mode is disabled, use the sync client
ch_client = SyncClient(
host=CLICKHOUSE_HOST,
database=CLICKHOUSE_DATABASE,
secure=CLICKHOUSE_SECURE,
user=CLICKHOUSE_USER,
password=CLICKHOUSE_PASSWORD,
ca_certs=CLICKHOUSE_CA,
verify=CLICKHOUSE_VERIFY,
settings={"mutations_sync": "1"} if TEST else {},
)
ch_pool = make_ch_pool()
def async_execute(query, args=None, settings=None, with_column_types=False):
return sync_execute(query, args, settings=settings, with_column_types=with_column_types)
def cache_sync_execute(query, args=None, redis_client=None, ttl=CACHE_TTL, settings=None, with_column_types=False):
if not redis_client:
redis_client = redis.get_client()
key = _key_hash(query, args)
if redis_client.exists(key):
result = _deserialize(redis_client.get(key))
return result
else:
result = sync_execute(query, args, settings=settings, with_column_types=with_column_types)
redis_client.set(key, _serialize(result), ex=ttl)
return result
def sync_execute(query, args=None, settings=None, with_column_types=False):
with ch_pool.get_client() as client:
start_time = perf_counter()
prepared_sql, prepared_args, tags = _prepare_query(client=client, query=query, args=args)
timeout_task = QUERY_TIMEOUT_THREAD.schedule(_notify_of_slow_query_failure, tags)
try:
result = client.execute(
prepared_sql, params=prepared_args, settings=settings, with_column_types=with_column_types
)
except Exception as err:
err = wrap_query_error(err)
tags["failed"] = True
tags["reason"] = type(err).__name__
incr("clickhouse_sync_execution_failure", tags=tags)
raise err
finally:
execution_time = perf_counter() - start_time
QUERY_TIMEOUT_THREAD.cancel(timeout_task)
timing("clickhouse_sync_execution_time", execution_time * 1000.0, tags=tags)
if app_settings.SHELL_PLUS_PRINT_SQL:
print("Execution time: %.6fs" % (execution_time,))
if _request_information is not None and _request_information.get("save", False):
save_query(prepared_sql, execution_time)
return result
def substitute_params(query, params):
"""
Helper method to ease progressive rendering of ClickHouse SQL queries.
For example, there are many places where we construct queries to be used
as subqueries of others. Each time we generate a subquery we also pass
up the "bound" parameters to be used to render the query, which
otherwise only happens at the point of calling ClickHouse via the
clickhouse_driver `Client`.
This results in sometimes large lists of parameters, with no relevance to
the containing query, being passed up. Rather than do this, we can
instead "render" the subqueries before using them as subqueries, so the
containing code is only responsible for its own parameters, and we can
avoid any potential parameter collisions.
"""
return cast(SyncClient, ch_client).substitute_params(query, params)
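# Illustrative sketch (not part of the original module): rendering a subquery
# with `substitute_params` before embedding it, so the containing query only
# carries its own parameters. Table and parameter names are hypothetical.
def _example_substitute_params_usage():  # pragma: no cover - illustrative only
    subquery = substitute_params(
        "SELECT distinct_id FROM events WHERE team_id = %(team_id)s", {"team_id": 1}
    )
    # The outer query is built from already-rendered SQL, so there is no risk
    # of the inner placeholders being substituted a second time.
    return sync_execute(f"SELECT count(DISTINCT distinct_id) FROM ({subquery})")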
def _prepare_query(client: SyncClient, query: str, args: QueryArgs):
"""
Given a string query with placeholders we do one of two things:
1. for an insert query we just format and remove comments
2. for non-insert queries, we return the SQL with placeholders
evaluated with the contents of `args`
We also return `tags`, which contains some detail on the context
within which the query was executed, e.g. the Django view name.
NOTE: `client.execute` would normally handle substitution, but
because we want to strip comments to make it easier to copy and
paste queries from `system.query_log` into Metabase (Metabase
doesn't show newlines, so with comments you can't get a working
query without exporting to CSV or similar), we need to do it
manually here.
We only want to attempt substitution for SELECT queries, which
clickhouse_driver at this moment in time decides based on the
predicate below.
"""
prepared_args: Any = None
if isinstance(args, (list, tuple, types.GeneratorType)):
# If we get one of these it means we have an insert, so let the
# clickhouse client handle substitution here.
rendered_sql = query
prepared_args = args
elif not args:
# If `args` is not truthy then make prepared_args `None`, which the
# clickhouse client uses to signal that no substitution is desired.
# Expected `args` values here are, for instance, `None` or `{}`.
rendered_sql = query
prepared_args = None
else:
# Otherwise perform the substitution here so we can operate on the raw,
# non-templated SQL
rendered_sql = client.substitute_params(query, args)
prepared_args = None
formatted_sql = sqlparse.format(rendered_sql, strip_comments=True)
annotated_sql, tags = _annotate_tagged_query(formatted_sql, args)
if app_settings.SHELL_PLUS_PRINT_SQL:
print()
print(format_sql(formatted_sql))
return annotated_sql, prepared_args, tags
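# Illustrative sketch (not part of the original module) of the two paths
# `_prepare_query` distinguishes; table names and values are hypothetical.
def _example_prepare_query_paths():  # pragma: no cover - illustrative only
    with ch_pool.get_client() as client:
        # Insert-style args (list/tuple/generator) are passed through untouched
        # and substituted later by clickhouse_driver itself.
        insert_sql, insert_args, _ = _prepare_query(
            client=client, query="INSERT INTO some_table (a, b) VALUES", args=[(1, 2)]
        )
        # Dict args are rendered here, so the comment-stripped, annotated SQL
        # that reaches `system.query_log` is already fully self-contained and
        # the returned args come back as `None`.
        select_sql, select_args, _ = _prepare_query(
            client=client, query="SELECT count() FROM events WHERE team_id = %(team_id)s", args={"team_id": 1}
        )
        return insert_sql, insert_args, select_sql, select_args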
def _deserialize(result_bytes: bytes) -> List[Tuple]:
results = []
for x in json.loads(result_bytes):
results.append(tuple(x))
return results
def _serialize(result: Any) -> bytes:
return json.dumps(result).encode("utf-8")
def _key_hash(query: str, args: Any) -> bytes:
key = hashlib.md5(query.encode("utf-8") + json.dumps(args).encode("utf-8")).digest()
return key
def _annotate_tagged_query(query, args):
"""
Adds a /* */ comment so we can look in ClickHouse's `system.query_log`
and easily marry queries up to the generating code.
"""
tags = {"kind": (_request_information or {}).get("kind"), "id": (_request_information or {}).get("id")}
if isinstance(args, dict) and "team_id" in args:
tags["team_id"] = args["team_id"]
# Annotate the query with information on the request/task
if _request_information is not None:
query = f"/* {_request_information['kind']}:{_request_information['id'].replace('/', '_')} */ {query}"
return query, tags
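# Illustrative only: with request information such as kind="request" and
# id="api_person_list" (hypothetical values), a query would be annotated as
#   /* request:api_person_list */ SELECT ...
# which makes the generating code easy to find from `system.query_log`.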
def _notify_of_slow_query_failure(tags: Dict[str, Any]):
tags["failed"] = True
tags["reason"] = "timeout"
incr("clickhouse_sync_execution_failure", tags=tags)
def format_sql(rendered_sql, colorize=True):
formatted_sql = sqlparse.format(rendered_sql, reindent_aligned=True)
if colorize:
try:
import pygments.formatters
import pygments.lexers
return pygments.highlight(
formatted_sql, pygments.lexers.get_lexer_by_name("sql"), pygments.formatters.TerminalFormatter()
)
except:
pass
return formatted_sql
def save_query(sql: str, execution_time: float) -> None:
"""
Save query for debugging purposes
"""
if _request_information is None:
return
try:
key = "save_query_{}".format(_request_information["user_id"])
queries = json.loads(get_safe_cache(key) or "[]")
queries.insert(
0,
{
"timestamp": now().isoformat(),
"query": format_sql(sql, colorize=False),
"execution_time": execution_time,
},
)
cache.set(key, json.dumps(queries), timeout=120)
except Exception as e:
capture_exception(e)
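# Illustrative sketch (not part of the original module): reading the debug
# queries saved above back out of the cache; the user id is hypothetical.
def _example_read_saved_queries(user_id: int) -> List[Dict[str, Any]]:  # pragma: no cover - illustrative only
    key = "save_query_{}".format(user_id)
    # Entries expire after the 120 second cache timeout used in `save_query`.
    return json.loads(get_safe_cache(key) or "[]")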