# posthog/ee/clickhouse/sql/events.py

from ee.kafka_client.topics import KAFKA_EVENTS
from posthog.settings import CLICKHOUSE_CLUSTER, CLICKHOUSE_DATABASE
from .clickhouse import KAFKA_COLUMNS, REPLACING_MERGE_TREE, STORAGE_POLICY, kafka_engine, table_engine
from .person import GET_TEAM_PERSON_DISTINCT_IDS
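
# SQL strings for the ClickHouse events table: schema definitions, Kafka
# ingestion plumbing, and the read queries used elsewhere in the app.
# DROP ... ON CLUSTER removes the table from every node in the cluster.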
EVENTS_TABLE = "events"
DROP_EVENTS_TABLE_SQL = f"DROP TABLE {EVENTS_TABLE} ON CLUSTER {CLICKHOUSE_CLUSTER}"
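
# Base CREATE TABLE shared by the storage table and its Kafka counterpart;
# {materialized_columns}, {extra_fields} and {engine} are filled in
# differently for each variant below.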
EVENTS_TABLE_BASE_SQL = """
CREATE TABLE {table_name} ON CLUSTER {cluster}
(
uuid UUID,
event VARCHAR,
properties VARCHAR,
timestamp DateTime64(6, 'UTC'),
team_id Int64,
distinct_id VARCHAR,
elements_chain VARCHAR,
created_at DateTime64(6, 'UTC')
{materialized_columns}
{extra_fields}
) ENGINE = {engine}
"""
EVENTS_TABLE_MATERIALIZED_COLUMNS = """
, properties_issampledevent VARCHAR MATERIALIZED trim(BOTH '\"' FROM JSONExtractRaw(properties, 'isSampledEvent'))
, properties_currentscreen VARCHAR MATERIALIZED trim(BOTH '\"' FROM JSONExtractRaw(properties, 'currentScreen'))
, properties_objectname VARCHAR MATERIALIZED trim(BOTH '\"' FROM JSONExtractRaw(properties, 'objectName'))
, properties_test_prop VARCHAR MATERIALIZED trim(BOTH '\"' FROM JSONExtractRaw(properties, 'test_prop'))
"""
EVENTS_TABLE_SQL = (
EVENTS_TABLE_BASE_SQL
+ """PARTITION BY toYYYYMM(timestamp)
ORDER BY (team_id, toDate(timestamp), distinct_id, uuid)
SAMPLE BY uuid
{storage_policy}
"""
).format(
table_name=EVENTS_TABLE,
cluster=CLICKHOUSE_CLUSTER,
engine=table_engine(EVENTS_TABLE, "_timestamp", REPLACING_MERGE_TREE),
extra_fields=KAFKA_COLUMNS,
materialized_columns=EVENTS_TABLE_MATERIALIZED_COLUMNS,
storage_policy=STORAGE_POLICY,
)
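
# Kafka-engine table that consumes Protobuf-encoded events. It needs no
# extra fields of its own: the _timestamp and _offset values selected by
# the materialized view below are virtual columns provided by the Kafka
# engine.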
KAFKA_EVENTS_TABLE_SQL = EVENTS_TABLE_BASE_SQL.format(
table_name="kafka_" + EVENTS_TABLE,
cluster=CLICKHOUSE_CLUSTER,
engine=kafka_engine(topic=KAFKA_EVENTS, serialization="Protobuf", proto_schema="events:Event"),
extra_fields="",
materialized_columns="",
)
# The target table must be database-qualified here to work around a ClickHouse
# bug related to https://github.com/ClickHouse/ClickHouse/issues/10471
EVENTS_TABLE_MV_SQL = """
CREATE MATERIALIZED VIEW {table_name}_mv ON CLUSTER {cluster}
TO {database}.{table_name}
AS SELECT
uuid,
event,
properties,
timestamp,
team_id,
distinct_id,
elements_chain,
created_at,
_timestamp,
_offset
FROM {database}.kafka_{table_name}
""".format(
table_name=EVENTS_TABLE, cluster=CLICKHOUSE_CLUSTER, database=CLICKHOUSE_DATABASE,
)
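
# Direct (non-Kafka) insert path: fills the Kafka bookkeeping columns with
# now() and offset 0 so manually written rows satisfy the same schema.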
INSERT_EVENT_SQL = """
INSERT INTO events (uuid, event, properties, timestamp, team_id, distinct_id, elements_chain, created_at, _timestamp, _offset)
SELECT %(uuid)s, %(event)s, %(properties)s, %(timestamp)s, %(team_id)s, %(distinct_id)s, %(elements_chain)s, %(created_at)s, now(), 0
"""
GET_EVENTS_SQL = """
SELECT
uuid,
event,
properties,
timestamp,
team_id,
distinct_id,
elements_chain,
created_at
FROM events
"""
GET_EVENTS_BY_TEAM_SQL = """
SELECT
uuid,
event,
properties,
timestamp,
team_id,
distinct_id,
elements_chain,
created_at
FROM events WHERE team_id = %(team_id)s
"""
SELECT_PROP_VALUES_SQL = """
SELECT DISTINCT trim(BOTH '\"' FROM JSONExtractRaw(properties, %(key)s)) FROM events WHERE JSONHas(properties, %(key)s) AND team_id = %(team_id)s {parsed_date_from} {parsed_date_to} LIMIT 10
"""
SELECT_PROP_VALUES_SQL_WITH_FILTER = """
SELECT DISTINCT trim(BOTH '\"' FROM JSONExtractRaw(properties, %(key)s)) FROM events WHERE team_id = %(team_id)s AND trim(BOTH '\"' FROM JSONExtractRaw(properties, %(key)s)) LIKE %(value)s {parsed_date_from} {parsed_date_to} LIMIT 10
"""
SELECT_EVENT_BY_TEAM_AND_CONDITIONS_SQL = """
SELECT
uuid,
event,
properties,
timestamp,
team_id,
distinct_id,
elements_chain,
created_at
FROM
events
WHERE team_id = %(team_id)s
{conditions}
ORDER BY toDate(timestamp) DESC, timestamp DESC {limit}
"""
SELECT_EVENT_BY_TEAM_AND_CONDITIONS_FILTERS_SQL = """
SELECT
uuid,
event,
properties,
timestamp,
team_id,
distinct_id,
elements_chain,
created_at
FROM events
WHERE
team_id = %(team_id)s
{conditions}
{filters}
ORDER BY toDate(timestamp) DESC, timestamp DESC {limit}
"""
SELECT_ONE_EVENT_SQL = """
SELECT
uuid,
event,
properties,
timestamp,
team_id,
distinct_id,
elements_chain,
created_at
FROM events WHERE uuid = %(event_id)s AND team_id = %(team_id)s
"""
GET_EARLIEST_TIMESTAMP_SQL = """
SELECT timestamp FROM events WHERE team_id = %(team_id)s ORDER BY toDate(timestamp), timestamp LIMIT 1
"""
NULL_SQL = """
SELECT toUInt16(0) AS total, {trunc_func}(toDateTime(%(date_to)s) - {interval_func}(number)) AS day_start
FROM numbers(dateDiff(%(interval)s, toDateTime(%(date_from)s), toDateTime(%(date_to)s)) + 1)
"""
EVENT_JOIN_PERSON_SQL = f"""
INNER JOIN ({GET_TEAM_PERSON_DISTINCT_IDS}) AS pdi ON events.distinct_id = pdi.distinct_id
"""
GET_EVENTS_WITH_PROPERTIES = """
SELECT * FROM events WHERE
team_id = %(team_id)s
{filters}
{order_by}
"""
EXTRACT_TAG_REGEX = "extract(elements_chain, '^(.*?)[.|:]')"
EXTRACT_TEXT_REGEX = "extract(elements_chain, 'text=\"(.*?)\"')"
ELEMENT_TAG_COUNT = """
SELECT concat('<', {tag_regex}, '> ', {text_regex}) AS tag_name,
events.elements_chain,
count(*) AS tag_count
FROM events
WHERE events.team_id = %(team_id)s AND event = '$autocapture'
GROUP BY tag_name, elements_chain
ORDER BY tag_count DESC, tag_name
LIMIT %(limit)s
""".format(
tag_regex=EXTRACT_TAG_REGEX, text_regex=EXTRACT_TEXT_REGEX
)
GET_CUSTOM_EVENTS = """
SELECT DISTINCT event FROM events WHERE team_id = %(team_id)s AND event NOT IN ['$autocapture', '$pageview', '$identify', '$pageleave', '$screen']
"""
GET_EVENTS_VOLUME = "SELECT event, count(1) AS count FROM events WHERE team_id = %(team_id)s AND timestamp > %(timestamp)s GROUP BY event ORDER BY count DESC"