0
0
mirror of https://github.com/PostHog/posthog.git synced 2024-11-28 09:16:49 +01:00
posthog/ee/clickhouse/queries/event_query.py
Karl-Aksel Puulmann d9bc06b7dd
Speed up lifecycle query (#8021)
* refactor(lifecycle): simplify clickhouse sql logic

This updates the SQL to be comprised of two queries, one for getting
new, returning, and resurrecting periods of activity, one for getting
dormant periods right after periods of activity.

Refers to https://github.com/PostHog/posthog/issues/7382

* refactor(lifecycle): use `ClickhouseEventQuery` to build event query

* format

* Use bounded_person_activity_by_period for both sides of dormant join

* refactor(lifecycle): reduce pdi2 join by one

This means we're now under the current query memory limit for orgs with
around 20m distinct_ids. It does remove some readability though :(

* update snapshot

* Add further comments to query

* Add further comments to query

* Add further comments to query

* Remove dead variables

* Refactor person_query overriding

* Lifecycle refactoring continued

* Update lifecycle tests (except people ones)

* Make lifecycle people endpoint happy

* Remove django lifecycle tests

* Add some edge case tests

* Add missing type

Co-authored-by: Harry Waye <harry@posthog.com>
2022-01-13 16:31:09 +02:00

196 lines
7.6 KiB
Python

from abc import ABCMeta, abstractmethod
from typing import Any, Dict, List, Optional, Tuple, Union

from ee.clickhouse.materialized_columns.columns import ColumnName
from ee.clickhouse.models.cohort import format_person_query, format_precalculated_cohort_query, is_precalculated_query
from ee.clickhouse.models.property import parse_prop_clauses
from ee.clickhouse.models.util import PersonPropertiesMode
from ee.clickhouse.queries.column_optimizer import ColumnOptimizer
from ee.clickhouse.queries.groups_join_query import GroupsJoinQuery
from ee.clickhouse.queries.person_distinct_id_query import get_team_distinct_ids_query
from ee.clickhouse.queries.person_query import ClickhousePersonQuery
from ee.clickhouse.queries.util import parse_timestamps
from posthog.models import Cohort, Filter, Property
from posthog.models.filters.mixins.utils import cached_property
from posthog.models.filters.path_filter import PathFilter
from posthog.models.filters.retention_filter import RetentionFilter
from posthog.models.filters.session_recordings_filter import SessionRecordingsFilter
from posthog.models.filters.stickiness_filter import StickinessFilter
from posthog.models.property import PropertyName
class ClickhouseEventQuery(metaclass=ABCMeta):
    """Abstract base class for building ClickHouse event queries.

    Subclasses must implement :meth:`get_query` and
    :meth:`_determine_should_join_distinct_ids`. This class provides helpers
    that render the optional person-distinct-id join, the person join, the
    groups join, the date-range clauses and the property filter clauses.
    """

    # Aliases used for joined tables/subqueries in the SQL fragments built below.
    DISTINCT_ID_TABLE_ALIAS = "pdi"
    PERSON_TABLE_ALIAS = "person"
    EVENT_TABLE_ALIAS = "e"

    _filter: Union[Filter, PathFilter, RetentionFilter, StickinessFilter, SessionRecordingsFilter]
    _team_id: int
    _column_optimizer: ColumnOptimizer
    _should_join_distinct_ids = False
    _should_join_persons = False
    _should_round_interval = False
    _extra_fields: List[ColumnName]
    _extra_event_properties: List[PropertyName]
    _extra_person_fields: List[ColumnName]

    def __init__(
        self,
        filter: Union[Filter, PathFilter, RetentionFilter, StickinessFilter, SessionRecordingsFilter],
        team_id: int,
        round_interval=False,
        should_join_distinct_ids=False,
        should_join_persons=False,
        # Extra events/person table columns to fetch since parent query needs them
        extra_fields: Optional[List[ColumnName]] = None,
        extra_event_properties: Optional[List[PropertyName]] = None,
        extra_person_fields: Optional[List[ColumnName]] = None,
        **kwargs,
    ) -> None:
        """Store the query inputs and work out which joins are required.

        Args:
            filter: Filter object the query is built from.
            team_id: Team the query is scoped to; also seeded into ``self.params``.
            round_interval: Stored as ``_should_round_interval``.
            should_join_distinct_ids: Force the distinct-id join. When falsy,
                ``_determine_should_join_distinct_ids`` decides.
            should_join_persons: Force the person join. When falsy,
                ``_determine_should_join_persons`` decides.
            extra_fields: Extra columns requested by the parent query.
                ``None`` means an empty list.
            extra_event_properties: Extra event properties requested by the
                parent query. ``None`` means an empty list.
            extra_person_fields: Extra person columns, forwarded to the person
                query. ``None`` means an empty list.
            **kwargs: Accepted and ignored.
        """
        self._filter = filter
        self._team_id = team_id
        self._column_optimizer = ColumnOptimizer(self._filter, self._team_id)
        self.params: Dict[str, Any] = {
            "team_id": self._team_id,
        }

        # ``None`` defaults avoid sharing one list object between all instances.
        self._extra_fields = [] if extra_fields is None else extra_fields
        self._extra_event_properties = [] if extra_event_properties is None else extra_event_properties
        self._extra_person_fields = [] if extra_person_fields is None else extra_person_fields

        self._should_join_distinct_ids = should_join_distinct_ids
        self._should_join_persons = should_join_persons

        if not self._should_join_distinct_ids:
            self._determine_should_join_distinct_ids()
        if not self._should_join_persons:
            self._determine_should_join_persons()

        # Deliberately assigned after the hooks above run, preserving the
        # original ordering: hook overrides still see the class-level default.
        self._should_round_interval = round_interval

    @abstractmethod
    def get_query(self) -> Tuple[str, Dict[str, Any]]:
        """Return the SQL string and its parameter dict."""
        pass

    @abstractmethod
    def _determine_should_join_distinct_ids(self) -> None:
        """Set ``_should_join_distinct_ids`` when the query needs that join."""
        pass

    def _get_distinct_id_query(self) -> str:
        """Return the distinct-id ``INNER JOIN`` fragment, or ``""`` when not needed."""
        if not self._should_join_distinct_ids:
            return ""
        return (
            "\n"
            f"INNER JOIN ({get_team_distinct_ids_query(self._team_id)}) AS {self.DISTINCT_ID_TABLE_ALIAS}\n"
            f"ON {self.EVENT_TABLE_ALIAS}.distinct_id = {self.DISTINCT_ID_TABLE_ALIAS}.distinct_id\n"
        )

    def _determine_should_join_persons(self) -> None:
        """Enable the person and distinct-id joins when anything requires persons.

        The joins are enabled when the person query reports it is used, or when
        a cohort property on the filter or on any of its entities needs persons.
        The checks short-circuit in that order.
        """
        # :KLUDGE: The property checks are mostly making sure if cohorts are included as well.
        # Can be simplified significantly after https://github.com/PostHog/posthog/issues/5854
        needs_persons = (
            self._person_query.is_used
            or any(self._should_property_join_persons(prop) for prop in self._filter.properties)
            or any(
                self._should_property_join_persons(prop)
                for entity in self._filter.entities
                for prop in entity.properties
            )
        )
        if needs_persons:
            self._should_join_distinct_ids = True
            self._should_join_persons = True

    def _should_property_join_persons(self, prop: Property) -> bool:
        """Return whether ``prop`` is a cohort property that needs the person join."""
        return prop.type == "cohort" and self._does_cohort_need_persons(prop)

    def _does_cohort_need_persons(self, prop: Property) -> bool:
        """Return whether the cohort referenced by ``prop`` needs the person join.

        A cohort that does not exist for this team never needs it. Otherwise it
        is needed for precalculated cohorts, static cohorts, and cohorts with at
        least one group that has ``properties``.
        """
        try:
            cohort: Cohort = Cohort.objects.get(pk=prop.value, team_id=self._team_id)
        except Cohort.DoesNotExist:
            return False
        if is_precalculated_query(cohort) or cohort.is_static:
            return True
        return any(group.get("properties") for group in cohort.groups)

    @cached_property
    def _person_query(self):
        """Person query helper built from this query's filter, team and column optimizer."""
        return ClickhousePersonQuery(
            self._filter, self._team_id, self._column_optimizer, extra_fields=self._extra_person_fields
        )

    def _get_person_query(self) -> Tuple[str, Dict]:
        """Return the person ``INNER JOIN`` fragment and params, or ``("", {})``.

        The join condition references ``DISTINCT_ID_TABLE_ALIAS``.
        NOTE(review): presumably the distinct-id join must also be present in
        the final query for this fragment to be valid; confirm against callers.
        """
        if not self._should_join_persons:
            return "", {}
        person_query, params = self._person_query.get_query()
        join_sql = (
            "\n"
            f"INNER JOIN ({person_query}) {self.PERSON_TABLE_ALIAS}\n"
            f"ON {self.PERSON_TABLE_ALIAS}.id = {self.DISTINCT_ID_TABLE_ALIAS}.person_id\n"
        )
        return join_sql, params

    def _get_groups_query(self) -> Tuple[str, Dict]:
        """Return the groups join fragment and params from ``GroupsJoinQuery``."""
        return GroupsJoinQuery(self._filter, self._team_id, self._column_optimizer).get_join_query()

    def _get_date_filter(self) -> Tuple[str, Dict]:
        """Return the date-range clauses (from, then to) and their params."""
        parsed_date_from, parsed_date_to, date_params = parse_timestamps(filter=self._filter, team_id=self._team_id)
        query = f"\n{parsed_date_from}\n{parsed_date_to}\n"
        return query, date_params

    def _get_props(self, filters: List[Property]) -> Tuple[str, Dict]:
        """Render property filters into space-joined SQL clauses and merged params.

        Cohort properties are rendered through ``_get_cohort_subquery`` and
        prefixed with ``AND``. Every other property goes through
        ``parse_prop_clauses`` with person properties excluded, using a
        ``global_<index>`` prepend.
        """
        clauses: List[str] = []
        params: Dict[str, Any] = {}

        for idx, prop in enumerate(filters):
            if prop.type == "cohort":
                cohort_clause, clause_params = self._get_cohort_subquery(prop)
                clause = f"AND {cohort_clause}"
            else:
                clause, clause_params = parse_prop_clauses(
                    [prop],
                    prepend=f"global_{idx}",
                    allow_denormalized_props=True,
                    person_properties_mode=PersonPropertiesMode.EXCLUDE,
                )
            clauses.append(clause)
            params.update(clause_params)

        return " ".join(clauses), params

    def _get_cohort_subquery(self, prop: Property) -> Tuple[str, Dict[str, Any]]:
        """Return the SQL condition and params matching persons in ``prop``'s cohort.

        The condition matches on ``<DISTINCT_ID_TABLE_ALIAS>.person_id``. A
        cohort that does not exist for this team yields an always-false
        predicate and no params.
        """
        try:
            cohort: Cohort = Cohort.objects.get(pk=prop.value, team_id=self._team_id)
        except Cohort.DoesNotExist:
            return "0 = 11", {}  # If cohort doesn't exist, nothing can match

        match_field = f"{self.DISTINCT_ID_TABLE_ALIAS}.person_id"
        if is_precalculated_query(cohort):
            person_id_query, cohort_filter_params = format_precalculated_cohort_query(
                cohort.pk, 0, custom_match_field=match_field
            )
        else:
            person_id_query, cohort_filter_params = format_person_query(cohort, 0, custom_match_field=match_field)
        return person_id_query, cohort_filter_params