0
0
mirror of https://github.com/PostHog/posthog.git synced 2024-11-28 09:16:49 +01:00
posthog/ee/clickhouse/queries/column_optimizer.py
Harry Waye b94d02bc10
test(retention): add test for retention breakdown with materialized property (#7505)
* test(retention): add test for retention breakdown with materialized property

* remove unused imports

* Join tables based on breakdowns properties
2021-12-07 15:05:03 +00:00

149 lines
7.0 KiB
Python

from typing import Counter, List, Set, Union, cast
from ee.clickhouse.materialized_columns.columns import ColumnName, get_materialized_columns
from ee.clickhouse.models.action import get_action_tables_and_properties, uses_elements_chain
from ee.clickhouse.models.property import box_value, extract_tables_and_properties
from posthog.constants import TREND_FILTER_TYPE_ACTIONS, FunnelCorrelationType
from posthog.models.entity import Entity
from posthog.models.filters import Filter
from posthog.models.filters.mixins.utils import cached_property
from posthog.models.filters.path_filter import PathFilter
from posthog.models.filters.retention_filter import RetentionFilter
from posthog.models.filters.stickiness_filter import StickinessFilter
from posthog.models.filters.utils import GroupTypeIndex
from posthog.models.property import PropertyIdentifier, PropertyType, TableWithProperties
class ColumnOptimizer:
"""
This class is responsible for figuring out what columns can and should be materialized based on the query filter.
This speeds up queries since clickhouse ends up selecting less data.
"""
def __init__(self, filter: Union[Filter, PathFilter, RetentionFilter, StickinessFilter], team_id: int):
self.filter = filter
self.team_id = team_id
@cached_property
def event_columns_to_query(self) -> Set[ColumnName]:
"Returns a list of event table columns containing materialized properties that this query needs"
return self.columns_to_query("events", set(self._used_properties_with_type("event")))
@cached_property
def person_columns_to_query(self) -> Set[ColumnName]:
"Returns a list of person table columns containing materialized properties that this query needs"
return self.columns_to_query("person", set(self._used_properties_with_type("person")))
def columns_to_query(self, table: TableWithProperties, used_properties: Set[PropertyIdentifier]) -> Set[ColumnName]:
"Transforms a list of property names to what columns are needed for that query"
materialized_columns = get_materialized_columns(table)
return set(materialized_columns.get(property_name, "properties") for property_name, _, _ in used_properties)
@cached_property
def is_using_person_properties(self) -> bool:
return len(self._used_properties_with_type("person")) > 0
@cached_property
def group_types_to_query(self) -> Set[GroupTypeIndex]:
used_properties = self._used_properties_with_type("group")
return set(cast(GroupTypeIndex, group_type_index) for _, _, group_type_index in used_properties)
@cached_property
def should_query_elements_chain_column(self) -> bool:
"Returns whether this query uses elements_chain"
has_element_type_property = lambda properties: any(prop.type == "element" for prop in properties)
if has_element_type_property(self.filter.properties):
return True
# Both entities and funnel exclusions can contain nested elements_chain inclusions
for entity in self.filter.entities + cast(List[Entity], self.filter.exclusions):
if has_element_type_property(entity.properties):
return True
# :TRICKY: Action definition may contain elements_chain usage
#
# See ee/clickhouse/models/action.py#format_action_filter for an example
if entity.type == TREND_FILTER_TYPE_ACTIONS:
if uses_elements_chain(entity.get_action()):
return True
return False
@cached_property
def properties_used_in_filter(self) -> Counter[PropertyIdentifier]:
"Returns collection of properties + types that this query would use"
counter: Counter[PropertyIdentifier] = extract_tables_and_properties(self.filter.properties)
if not isinstance(self.filter, StickinessFilter):
# Some breakdown types read properties
#
# See ee/clickhouse/queries/trends/breakdown.py#get_query or
# ee/clickhouse/queries/breakdown_props.py#get_breakdown_prop_values
if self.filter.breakdown_type in ["event", "person"]:
boxed_breakdown = box_value(self.filter.breakdown)
for b in boxed_breakdown:
if isinstance(b, str):
counter[(b, self.filter.breakdown_type, self.filter.breakdown_group_type_index)] += 1
elif self.filter.breakdown_type == "group":
# :TRICKY: We only support string breakdown for group properties
assert isinstance(self.filter.breakdown, str)
counter[
(self.filter.breakdown, self.filter.breakdown_type, self.filter.breakdown_group_type_index)
] += 1
# If we have a breakdowns attribute then make sure we pull in everything we
# need to calculate it
for breakdown in self.filter.breakdowns or []:
counter[(breakdown["property"], breakdown["type"], self.filter.breakdown_group_type_index)] += 1
# Both entities and funnel exclusions can contain nested property filters
for entity in self.filter.entities + cast(List[Entity], self.filter.exclusions):
counter += extract_tables_and_properties(entity.properties)
# Math properties are also implicitly used.
#
# See ee/clickhouse/queries/trends/util.py#process_math
if entity.math_property:
counter[(entity.math_property, "event", None)] += 1
# If groups are involved, they're also used
#
# See ee/clickhouse/queries/trends/util.py#process_math
if entity.math == "unique_group":
counter[(f"$group_{entity.math_group_type_index}", "event", None)] += 1
# :TRICKY: If action contains property filters, these need to be included
#
# See ee/clickhouse/models/action.py#format_action_filter for an example
if entity.type == TREND_FILTER_TYPE_ACTIONS:
counter += get_action_tables_and_properties(entity.get_action())
if (
not isinstance(self.filter, StickinessFilter)
and self.filter.correlation_type == FunnelCorrelationType.PROPERTIES
and self.filter.correlation_property_names
):
if self.filter.aggregation_group_type_index is not None:
for prop_value in self.filter.correlation_property_names:
counter[(prop_value, "group", self.filter.aggregation_group_type_index)] += 1
else:
for prop_value in self.filter.correlation_property_names:
counter[(prop_value, "person", None)] += 1
return counter
def _used_properties_with_type(self, property_type: PropertyType) -> Counter[PropertyIdentifier]:
return Counter(
{
(name, type, group_type_index): count
for (name, type, group_type_index), count in self.properties_used_in_filter.items()
if type == property_type
}
)