0
0
mirror of https://github.com/PostHog/posthog.git synced 2024-12-01 12:21:02 +01:00
posthog/ee/clickhouse/materialized_columns/test/test_columns.py
Rick Marron fd150761ec
Materialize has_full_snapshot in session_recording_events (#7281)
* materialize has_full_snapshot

* add basic backfill command

* some clean up

* bug fix

* use comment function

* remove unneeded column check

* add to test for get_materialized_columns

* update test

* ClickhouseDestroyTablesMixin resets session_recording_events

* remove backfill command
2021-11-29 16:18:08 -08:00

188 lines
6.8 KiB
Python

import random
from datetime import timedelta
from time import sleep
from uuid import uuid4
from freezegun import freeze_time
from ee.clickhouse.client import sync_execute
from ee.clickhouse.materialized_columns.columns import (
backfill_materialized_columns,
get_materialized_columns,
materialize,
)
from ee.clickhouse.models.event import create_event
from ee.clickhouse.util import ClickhouseDestroyTablesMixin, ClickhouseTestMixin
from ee.tasks.materialized_columns import mark_all_materialized
from posthog.constants import GROUP_TYPES_LIMIT
from posthog.settings import CLICKHOUSE_DATABASE
from posthog.test.base import BaseTest
# Property names "$group_0", "$group_1", ... - one per allowed group type.
# test_get_columns_default expects exactly these on the events table when nothing
# else has been materialized.
GROUPS_COLUMNS = [f"$group_{group_type_index}" for group_type_index in range(GROUP_TYPES_LIMIT)]
def _create_event(**kwargs):
    """Call ``create_event`` with a freshly generated ``event_uuid``.

    All keyword arguments are forwarded unchanged; any ``event_uuid`` supplied
    by the caller is overwritten. Returns the generated UUID.
    """
    event_uuid = uuid4()
    kwargs["event_uuid"] = event_uuid
    create_event(**kwargs)
    return event_uuid
class TestMaterializedColumns(ClickhouseTestMixin, ClickhouseDestroyTablesMixin, BaseTest):
    """Tests for listing, creating and backfilling ClickHouse materialized columns."""

    def test_get_columns_default(self):
        """Check the columns reported before any ``materialize`` call is made."""
        self.assertCountEqual(get_materialized_columns("events"), GROUPS_COLUMNS)
        self.assertCountEqual(get_materialized_columns("person"), [])
        self.assertEqual(
            get_materialized_columns("session_recording_events"), {"has_full_snapshot": "has_full_snapshot"}
        )

    def test_caching_and_materializing(self):
        """Check that ``use_cache=True`` serves a stale column list until time moves on."""
        with freeze_time("2020-01-04T13:01:01Z"):
            materialize("events", "$foo")
            materialize("events", "$bar")
            materialize("person", "$zeta")

            self.assertCountEqual(
                get_materialized_columns("events", use_cache=True).keys(), ["$foo", "$bar", *GROUPS_COLUMNS]
            )
            self.assertCountEqual(get_materialized_columns("person", use_cache=True).keys(), ["$zeta"])

            materialize("events", "abc")

            # Time is still frozen, so the cached lookup is expected not to see "abc" yet.
            self.assertCountEqual(
                get_materialized_columns("events", use_cache=True).keys(), ["$foo", "$bar", *GROUPS_COLUMNS]
            )

        # Roughly an hour later the cached lookup is expected to include "abc".
        with freeze_time("2020-01-04T14:00:01Z"):
            self.assertCountEqual(
                get_materialized_columns("events", use_cache=True).keys(), ["$foo", "$bar", "abc", *GROUPS_COLUMNS]
            )

    def test_materialized_column_naming(self):
        """Check the column names generated for unsafe and colliding property names."""
        # Presumably the collision suffixes ("_yWAc", "_qGFz") are drawn from `random`;
        # seeding keeps them reproducible.
        random.seed(0)

        materialize("events", "$foO();--sqlinject")
        materialize("events", "$foO();ääsqlinject")
        materialize("events", "$foO_____sqlinject")
        materialize("person", "SoMePrOp")

        expected_events_columns = {
            "$foO();--sqlinject": "mat_$foO_____sqlinject",
            "$foO();ääsqlinject": "mat_$foO_____sqlinject_yWAc",
            "$foO_____sqlinject": "mat_$foO_____sqlinject_qGFz",
        }
        events_columns = get_materialized_columns("events")
        # assertDictContainsSubset is deprecated and was removed in Python 3.12, so compare
        # only the expected keys explicitly. Other columns (e.g. the group columns) are
        # ignored; a missing key shrinks the left-hand dict and fails with a readable diff.
        self.assertEqual(
            {prop: events_columns[prop] for prop in expected_events_columns if prop in events_columns},
            expected_events_columns,
        )

        self.assertEqual(get_materialized_columns("person"), {"SoMePrOp": "pmat_SoMePrOp"})

    def test_backfilling_data(self):
        """Check that backfilling populates materialized columns for existing events."""
        sync_execute("ALTER TABLE events DROP COLUMN IF EXISTS mat_prop")
        sync_execute("ALTER TABLE events DROP COLUMN IF EXISTS mat_another")

        _create_event(
            event="some_event", distinct_id="1", team=self.team, timestamp="2020-01-01 00:00:00", properties={"prop": 1}
        )
        _create_event(
            event="some_event",
            distinct_id="1",
            team=self.team,
            timestamp="2021-05-02 00:00:00",
            properties={"prop": 2, "another": 5},
        )
        _create_event(
            event="some_event", distinct_id="1", team=self.team, timestamp="2021-05-03 00:00:00", properties={"prop": 3}
        )
        _create_event(event="another_event", distinct_id="1", team=self.team, timestamp="2021-05-04 00:00:00")
        _create_event(
            event="third_event",
            distinct_id="1",
            team=self.team,
            timestamp="2021-05-05 00:00:00",
            properties={"prop": 4},
        )
        _create_event(
            event="fourth_event",
            distinct_id="1",
            team=self.team,
            timestamp="2021-05-06 00:00:00",
            properties={"another": 6},
        )

        materialize("events", "prop")
        materialize("events", "another")

        # Freshly added columns should have no rows stored in any existing part.
        self.assertEqual(self._count_materialized_rows("mat_prop"), 0)
        self.assertEqual(self._count_materialized_rows("mat_another"), 0)

        with freeze_time("2021-05-10T14:00:01Z"):
            # mutations_sync is set to "0" here, and the polling loop below waits for
            # the resulting mutations to finish.
            backfill_materialized_columns(
                "events", ["prop", "another"], timedelta(days=50), test_settings={"mutations_sync": "0"}
            )

        # An event created after the backfill was started.
        _create_event(
            event="fifth_event",
            distinct_id="1",
            team=self.team,
            timestamp="2021-05-07 00:00:00",
            properties={"another": 7},
        )

        # Poll until no mutation is running, giving up after 100 sleeps of 0.1s.
        iterations = 0
        while self._get_count_of_mutations_running() > 0 and iterations < 100:
            sleep(0.1)
            iterations += 1

        self.assertGreaterEqual(self._count_materialized_rows("mat_prop"), 4)
        self.assertGreaterEqual(self._count_materialized_rows("mat_another"), 4)

        self.assertEqual(
            sync_execute("SELECT mat_prop, mat_another FROM events ORDER BY timestamp"),
            [("1", ""), ("2", "5"), ("3", ""), ("", ""), ("4", ""), ("", "6"), ("", "7")],
        )

    def test_column_types(self):
        """Check the column's default kind after materialize, backfill and mark_all_materialized."""
        materialize("events", "myprop")

        # :KLUDGE: ClickHouse replaces our trim(BOTH '"' FROM properties) with this
        expr = "replaceRegexpAll(JSONExtractRaw(properties, 'myprop'), concat('^[', regexpQuoteMeta('\"'), ']*|[', regexpQuoteMeta('\"'), ']*$'), '')"
        self.assertEqual(("MATERIALIZED", expr), self._get_column_types("events", "mat_myprop"))

        # Backfilling is expected to leave the column as DEFAULT ...
        backfill_materialized_columns("events", ["myprop"], timedelta(days=50))
        self.assertEqual(("DEFAULT", expr), self._get_column_types("events", "mat_myprop"))

        # ... and mark_all_materialized is expected to switch it back to MATERIALIZED.
        mark_all_materialized()
        self.assertEqual(("MATERIALIZED", expr), self._get_column_types("events", "mat_myprop"))

    def _count_materialized_rows(self, column):
        """Return ``sum(rows)`` from ``system.parts_columns`` for ``column`` of the events table."""
        return sync_execute(
            """
            SELECT sum(rows)
            FROM system.parts_columns
            WHERE table = 'events'
              AND database = %(database)s
              AND column = %(column)s
            """,
            {"database": CLICKHOUSE_DATABASE, "column": column},
        )[0][0]

    def _get_count_of_mutations_running(self) -> int:
        """Return the number of rows in ``system.mutations`` with ``is_done = 0``."""
        return sync_execute(
            """
            SELECT count(*)
            FROM system.mutations
            WHERE is_done = 0
            """
        )[0][0]

    def _get_column_types(self, table: str, column: str):
        """Return the ``(default_kind, default_expression)`` row of ``column`` from ``system.columns``."""
        return sync_execute(
            """
            SELECT default_kind, default_expression
            FROM system.columns
            WHERE database = %(database)s AND table = %(table)s AND name = %(column)s
            """,
            {"table": table, "database": CLICKHOUSE_DATABASE, "column": column},
        )[0]