mirror of
https://github.com/PostHog/posthog.git
synced 2024-11-25 11:17:50 +01:00
320 lines
12 KiB
Python
320 lines
12 KiB
Python
from __future__ import annotations
|
|
|
|
import re
|
|
from collections.abc import Iterator
|
|
from dataclasses import dataclass, replace
|
|
from datetime import timedelta
|
|
from typing import Literal, NamedTuple, cast
|
|
|
|
from clickhouse_driver.errors import ServerException
|
|
from django.utils.timezone import now
|
|
|
|
from posthog.clickhouse.kafka_engine import trim_quotes_expr
|
|
from posthog.clickhouse.materialized_columns import ColumnName, TablesWithMaterializedColumns
|
|
from posthog.client import sync_execute
|
|
from posthog.models.instance_setting import get_instance_setting
|
|
from posthog.models.property import PropertyName, TableColumn, TableWithProperties
|
|
from posthog.models.utils import generate_random_short_suffix
|
|
from posthog.settings import CLICKHOUSE_CLUSTER, CLICKHOUSE_DATABASE, TEST
|
|
|
|
# Source column used whenever a caller does not name one explicitly.
DEFAULT_TABLE_COLUMN: Literal["properties"] = "properties"

# SQL expression template for reading a single property out of a JSON column, wrapped by `trim_quotes_expr`.
# `{table_column}` is filled in via `str.format`; `%(property)s` is left in place for the query parameters.
TRIM_AND_EXTRACT_PROPERTY = trim_quotes_expr("JSONExtractRaw({table_column}, %(property)s)")

# Abbreviation for each source column that may be materialized. Used both to validate `table_column` arguments and
# to build the prefix of generated column names.
SHORT_TABLE_COLUMN_NAME = {
    "properties": "p",
    "group_properties": "gp",
    "person_properties": "pp",
    "group0_properties": "gp0",
    "group1_properties": "gp1",
    "group2_properties": "gp2",
    "group3_properties": "gp3",
    "group4_properties": "gp4",
}
|
|
|
|
|
|
class MaterializedColumn(NamedTuple):
    """A materialized column name paired with the details parsed from its column comment."""

    name: ColumnName
    details: MaterializedColumnDetails

    @staticmethod
    def get_all(table: TablesWithMaterializedColumns) -> Iterator[MaterializedColumn]:
        """
        Yield every materialized column registered on ``table``.

        Columns are discovered through their comment in ``system.columns``: the comment must contain the
        ``column_materializer::`` marker, and ``elements_chain`` columns are skipped. This is a generator, so the
        query only runs once iteration starts.
        """
        query = """
            SELECT name, comment
            FROM system.columns
            WHERE database = %(database)s
                AND table = %(table)s
                AND comment LIKE '%%column_materializer::%%'
                AND comment not LIKE '%%column_materializer::elements_chain::%%'
        """
        params = {"database": CLICKHOUSE_DATABASE, "table": table}

        for column_name, column_comment in sync_execute(query, params):
            yield MaterializedColumn(column_name, MaterializedColumnDetails.from_column_comment(column_comment))

    @staticmethod
    def get(table: TablesWithMaterializedColumns, column_name: ColumnName) -> MaterializedColumn:
        """
        Return the materialized column called ``column_name`` on ``table``.

        Raises ``ValueError`` when no such column exists, or when more than one column matches.
        """
        # TODO: Pushing this filter down into the `get_all` query would be more efficient, but that needs a more
        # sophisticated way of constructing queries than we have right now. The data set should be small enough that
        # filtering in Python doesn't really matter (at least as of writing.)
        matches = [candidate for candidate in MaterializedColumn.get_all(table) if candidate.name == column_name]

        if not matches:
            raise ValueError("column does not exist")

        if len(matches) > 1:
            # this should never happen (column names are unique within a table) and suggests an error in the query
            raise ValueError(f"got {len(matches)} columns, expected 0 or 1")

        return matches[0]
|
|
|
|
|
|
@dataclass(frozen=True)
|
|
class MaterializedColumnDetails:
|
|
table_column: TableColumn
|
|
property_name: PropertyName
|
|
is_disabled: bool
|
|
|
|
COMMENT_PREFIX = "column_materializer"
|
|
COMMENT_SEPARATOR = "::"
|
|
COMMENT_DISABLED_MARKER = "disabled"
|
|
|
|
def as_column_comment(self) -> str:
|
|
bits = [self.COMMENT_PREFIX, self.table_column, self.property_name]
|
|
if self.is_disabled:
|
|
bits.append(self.COMMENT_DISABLED_MARKER)
|
|
return self.COMMENT_SEPARATOR.join(bits)
|
|
|
|
@classmethod
|
|
def from_column_comment(cls, comment: str) -> MaterializedColumnDetails:
|
|
match comment.split(cls.COMMENT_SEPARATOR, 3):
|
|
# Old style comments have the format "column_materializer::property", dealing with the default table column.
|
|
case [cls.COMMENT_PREFIX, property_name]:
|
|
return MaterializedColumnDetails(DEFAULT_TABLE_COLUMN, property_name, is_disabled=False)
|
|
# Otherwise, it's "column_materializer::table_column::property" for columns that are active.
|
|
case [cls.COMMENT_PREFIX, table_column, property_name]:
|
|
return MaterializedColumnDetails(cast(TableColumn, table_column), property_name, is_disabled=False)
|
|
# Columns that are marked as disabled have an extra trailer indicating their status.
|
|
case [cls.COMMENT_PREFIX, table_column, property_name, cls.COMMENT_DISABLED_MARKER]:
|
|
return MaterializedColumnDetails(cast(TableColumn, table_column), property_name, is_disabled=True)
|
|
case _:
|
|
raise ValueError(f"unexpected comment format: {comment!r}")
|
|
|
|
|
|
def get_materialized_columns(
    table: TablesWithMaterializedColumns,
    exclude_disabled_columns: bool = False,
) -> dict[tuple[PropertyName, TableColumn], ColumnName]:
    """
    Map ``(property name, table column)`` to the name of the column materializing it on ``table``.

    Returns an empty mapping when the ``MATERIALIZED_COLUMNS_ENABLED`` instance setting is falsy. Disabled columns
    are left out only when ``exclude_disabled_columns`` is true.
    """
    if not get_instance_setting("MATERIALIZED_COLUMNS_ENABLED"):
        return {}

    columns_by_property: dict[tuple[PropertyName, TableColumn], ColumnName] = {}
    for column in MaterializedColumn.get_all(table):
        if exclude_disabled_columns and column.details.is_disabled:
            continue
        key = (column.details.property_name, column.details.table_column)
        columns_by_property[key] = column.name

    return columns_by_property
|
|
|
|
|
|
def get_on_cluster_clause_for_table(table: TableWithProperties) -> str:
    """Return the ``ON CLUSTER`` clause to use in DDL for ``table``; empty for anything other than ``events``."""
    if table != "events":
        return ""
    return f"ON CLUSTER '{CLICKHOUSE_CLUSTER}'"
|
|
|
|
|
|
def materialize(
    table: TableWithProperties,
    property: PropertyName,
    column_name: ColumnName | None = None,
    table_column: TableColumn = DEFAULT_TABLE_COLUMN,
    create_minmax_index=not TEST,
) -> ColumnName | None:
    """
    Create a materialized column for ``property`` (read from ``table_column``) on ``table``.

    Returns the name of the created column. If the property is already materialized, returns ``None`` when running
    under ``TEST`` and raises ``ValueError`` otherwise. Also raises ``ValueError`` when ``table_column`` is not one
    of the keys of ``SHORT_TABLE_COLUMN_NAME``.

    When ``column_name`` is not given, a name is generated. A minmax index is added for the new column when
    ``create_minmax_index`` is true.
    """
    if (property, table_column) in get_materialized_columns(table):
        if TEST:
            return None

        raise ValueError(f"Property already materialized. table={table}, property={property}, column={table_column}")

    if table_column not in SHORT_TABLE_COLUMN_NAME:
        raise ValueError(f"Invalid table_column={table_column} for materialisation")

    column_name = column_name or _materialized_column_name(table, property, table_column)
    on_cluster = get_on_cluster_clause_for_table(table)

    alter_sync = 2 if TEST else 1
    is_events_table = table == "events"
    # For `events`, the MATERIALIZED expression lives on the sharded table; every other table carries it directly.
    storage_table = f"sharded_{table}" if is_events_table else table
    materialized_expr = TRIM_AND_EXTRACT_PROPERTY.format(table_column=table_column)

    sync_execute(
        f"""
        ALTER TABLE {storage_table} {on_cluster}
        ADD COLUMN IF NOT EXISTS
        {column_name} VARCHAR MATERIALIZED {materialized_expr}
        """,
        {"property": property},
        settings={"alter_sync": alter_sync},
    )

    if is_events_table:
        # The `events` table itself only gets a plain column of the same name.
        sync_execute(
            f"""
            ALTER TABLE {table} {on_cluster}
            ADD COLUMN IF NOT EXISTS
            {column_name} VARCHAR
            """,
            settings={"alter_sync": alter_sync},
        )

    # Record what this column materializes in its comment, which is how `MaterializedColumn.get_all` finds it.
    column_comment = MaterializedColumnDetails(table_column, property, is_disabled=False).as_column_comment()
    sync_execute(
        f"ALTER TABLE {table} {on_cluster} COMMENT COLUMN {column_name} %(comment)s",
        {"comment": column_comment},
        settings={"alter_sync": alter_sync},
    )

    if create_minmax_index:
        add_minmax_index(table, column_name)

    return column_name
|
|
|
|
|
|
def update_column_is_disabled(table: TablesWithMaterializedColumns, column_name: str, is_disabled: bool) -> None:
    """
    Set the disabled flag of an existing materialized column by rewriting its column comment.

    Propagates the ``ValueError`` raised by ``MaterializedColumn.get`` when the column cannot be found.
    """
    current_details = MaterializedColumn.get(table, column_name).details
    updated_details = replace(current_details, is_disabled=is_disabled)

    on_cluster = get_on_cluster_clause_for_table(table)
    statement = f"ALTER TABLE {table} {on_cluster} COMMENT COLUMN {column_name} %(comment)s"
    sync_execute(
        statement,
        {"comment": updated_details.as_column_comment()},
        settings={"alter_sync": 2 if TEST else 1},
    )
|
|
|
|
|
|
def drop_column(table: TablesWithMaterializedColumns, column_name: str) -> None:
    """
    Drop the materialized column ``column_name`` from ``table``, along with its minmax index.

    The index is dropped first, then the column. For ``events`` the column is also dropped from
    ``sharded_events``. Both drops use ``IF EXISTS``, so a column that is already gone is not an error here.
    """
    drop_minmax_index(table, column_name)

    on_cluster = get_on_cluster_clause_for_table(table)
    sync_execute(
        f"ALTER TABLE {table} {on_cluster} DROP COLUMN IF EXISTS {column_name}",
        settings={"alter_sync": 2 if TEST else 1},
    )

    if table == "events":
        # No query parameters here: the statement has no placeholders. The earlier `{"property": property}` argument
        # referred to the Python builtin `property`, since this function has no such local.
        sync_execute(
            f"ALTER TABLE sharded_{table} {on_cluster} DROP COLUMN IF EXISTS {column_name}",
            settings={"alter_sync": 2 if TEST else 1},
        )
|
|
|
|
|
|
def add_minmax_index(table: TablesWithMaterializedColumns, column_name: ColumnName):
    """
    Add a minmax index named ``minmax_<column_name>`` for a materialized column and return the index name.

    For ``events`` the index is created on ``sharded_events``. A server error saying the index already exists is
    swallowed; any other ``ServerException`` propagates.
    """
    # Note: This will be populated on backfill
    index_name = f"minmax_{column_name}"
    target_table = "sharded_events" if table == "events" else table
    on_cluster = get_on_cluster_clause_for_table(table)

    statement = f"""
        ALTER TABLE {target_table} {on_cluster}
        ADD INDEX {index_name} {column_name}
        TYPE minmax GRANULARITY 1
        """

    try:
        sync_execute(statement, settings={"alter_sync": 2 if TEST else 1})
    except ServerException as err:
        index_already_exists = "index with this name already exists" in str(err)
        if not index_already_exists:
            raise

    return index_name
|
|
|
|
|
|
def drop_minmax_index(table: TablesWithMaterializedColumns, column_name: ColumnName) -> None:
    """
    Drop the ``minmax_<column_name>`` index if it exists.

    For ``events`` the index is dropped from ``sharded_events``.
    """
    # XXX: table and index naming is duplicated from `add_minmax_index`
    index_name = f"minmax_{column_name}"
    target_table = "sharded_events" if table == "events" else table
    on_cluster = get_on_cluster_clause_for_table(table)

    statement = f"ALTER TABLE {target_table} {on_cluster} DROP INDEX IF EXISTS {index_name}"
    sync_execute(statement, settings={"alter_sync": 2 if TEST else 1})
|
|
|
|
|
|
def backfill_materialized_columns(
    table: TableWithProperties,
    properties: list[tuple[PropertyName, TableColumn]],
    backfill_period: timedelta,
    test_settings=None,
) -> None:
    """
    Backfill previously created materialized columns with values for existing rows.

    Each column is first switched to a DEFAULT expression, then a single ``ALTER TABLE ... UPDATE`` assigns every
    column to itself. For ``events`` only rows with a timestamp after ``now() - backfill_period`` are updated; other
    tables are updated in full. Does nothing when ``properties`` is empty.

    A ``(property, table_column)`` pair with no materialized column raises ``KeyError``.

    This will require reading and writing a lot of data on clickhouse disk.
    """
    if not properties:
        return

    target_table = "sharded_events" if table == "events" else table
    on_cluster = get_on_cluster_clause_for_table(table)
    columns_by_property = get_materialized_columns(table)

    # Hack from https://github.com/ClickHouse/ClickHouse/issues/19785
    # Note that for this to work all inserts should list columns explicitly
    # Improve this if https://github.com/ClickHouse/ClickHouse/issues/27730 ever gets resolved
    for property_name, table_column in properties:
        # Looked up per iteration on purpose: a missing entry fails here, after earlier columns were altered.
        column = columns_by_property[(property_name, table_column)]
        default_expr = TRIM_AND_EXTRACT_PROPERTY.format(table_column=table_column)
        sync_execute(
            f"""
            ALTER TABLE {target_table} {on_cluster}
            MODIFY COLUMN
            {column} VARCHAR DEFAULT {default_expr}
            """,
            {"property": property_name},
            settings=test_settings,
        )

    # Kick off mutations which will update clickhouse partitions in the background. This will return immediately
    backfilled_columns = [columns_by_property[property_and_column] for property_and_column in properties]
    assignments = ", ".join(f"{column} = {column}" for column in backfilled_columns)
    row_filter = "timestamp > %(cutoff)s" if table == "events" else "1 = 1"

    sync_execute(
        f"""
        ALTER TABLE {target_table} {on_cluster}
        UPDATE {assignments}
        WHERE {row_filter}
        """,
        {"cutoff": (now() - backfill_period).strftime("%Y-%m-%d")},
        settings=test_settings,
    )
|
|
|
|
|
|
def _materialized_column_name(
    table: TableWithProperties,
    property: PropertyName,
    table_column: TableColumn = DEFAULT_TABLE_COLUMN,
) -> ColumnName:
    """
    Build a sanitized column name for materializing ``property`` that is not already in use on ``table``.

    The name is ``pmat_`` (for ``person``) or ``mat_`` (otherwise), then the short table column name when it is not
    the default, then the property with every character outside ``[0-9a-zA-Z$]`` replaced by ``_``. A random short
    suffix is appended for as long as the name collides with an existing materialized column.
    """
    prefix = "pmat_" if table == "person" else "mat_"
    if table_column != DEFAULT_TABLE_COLUMN:
        prefix += f"{SHORT_TABLE_COLUMN_NAME[table_column]}_"

    sanitized_property = re.sub("[^0-9a-zA-Z$]", "_", property)
    base_name = f"{prefix}{sanitized_property}"

    names_in_use = set(get_materialized_columns(table).values())

    candidate = base_name
    while candidate in names_in_use:
        candidate = f"{base_name}_{generate_random_short_suffix()}"

    return candidate
|