0
0
mirror of https://github.com/PostHog/posthog.git synced 2024-12-01 04:12:23 +01:00
posthog/ee/clickhouse/queries/paths/path_event_query.py
Rick Marron 2b9917a915
Recordings in paths (#8015)
* add recordings to path query

* uncomment cache

* add clarifying comment

* works for start/end paths

* move to extra fields/properties

* add tests

* cleanup

* update ff name

* fix flaky test

* test and handle path_dropoff_key case
2022-01-18 15:29:52 -08:00

206 lines
8.7 KiB
Python

from typing import Any, Dict, List, Tuple
from ee.clickhouse.models.property import get_property_string_expr
from ee.clickhouse.queries.event_query import ClickhouseEventQuery
from posthog.constants import (
FUNNEL_PATH_AFTER_STEP,
FUNNEL_PATH_BEFORE_STEP,
FUNNEL_PATH_BETWEEN_STEPS,
PAGEVIEW_EVENT,
SCREEN_EVENT,
)
from posthog.models.filters.path_filter import PathFilter
from posthog.models.team import Team
class PathEventQuery(ClickhouseEventQuery):
    """Event-level SELECT that feeds the Paths queries.

    The generated SQL reads from the events table and exposes, per event, its
    timestamp, the owning person, an ungrouped path item (screen name, current
    URL or raw event name) and the cleaned/grouped ``path_item`` derived from it.
    When the filter targets funnel paths, the query is additionally joined
    against the ``funnel_actors`` alias and bounded by that funnel's timestamps.
    """

    FUNNEL_PERSONS_ALIAS = "funnel_actors"
    _filter: PathFilter

    def get_query(self) -> Tuple[str, Dict[str, Any]]:
        """Assemble the events query.

        Returns:
            A ``(sql, params)`` tuple. ``params`` is ``self.params``, which is
            updated in place with the parameters of every sub-clause.
        """
        events = self.EVENT_TABLE_ALIAS
        distinct_ids = self.DISTINCT_ID_TABLE_ALIAS
        funnel_column, funnel_join, funnel_condition = self._get_funnel_paths_fragments()

        # ColumnOptimizer is deliberately not consulted here: the Paths query
        # does not surface any filter properties, so columns are listed by hand.
        columns: List[str] = [f"{events}.timestamp AS timestamp"]
        if self._should_join_distinct_ids:
            columns.append(f"{distinct_ids}.person_id as person_id")
        if funnel_column:
            columns.append(funnel_column)

        columns.extend(f"{events}.{field} AS {field}" for field in self._extra_fields)
        for prop in self._extra_event_properties:
            prop_expr = get_property_string_expr("events", prop, f"'{prop}'", "properties", table_alias=events)[0]
            columns.append(prop_expr + f" as {prop}")

        # Nested if(): screen events map to their screen name, pageviews to
        # their URL, everything else to the raw event name. A disabled branch
        # is replaced by a constant-false if() so the nesting stays balanced.
        if self._should_query_screen():
            screen_branch = f"if({events}.event = '{SCREEN_EVENT}', {self._get_screen_name_parsing()}, "
        else:
            screen_branch = "if(0, '', "
        if self._should_query_url():
            url_branch = f"if({events}.event = '{PAGEVIEW_EVENT}', {self._get_current_url_parsing()}, "
        else:
            url_branch = "if(0, '', "
        columns.append(f"{screen_branch}{url_branch}{events}.event)) AS path_item_ungrouped")

        grouping_columns, grouping_params = self._get_grouping_fields()
        columns.extend(grouping_columns)
        self.params.update(grouping_params)

        # Drop any empty entries before joining the SELECT list.
        columns = [column for column in columns if column]

        date_clause, date_params = self._get_date_filter()
        self.params.update(date_params)

        prop_clause, prop_params = self._get_props(self._filter.properties)
        self.params.update(prop_params)

        event_clause, event_params = self._get_event_query()
        self.params.update(event_params)

        person_join, person_params = self._get_person_query()
        self.params.update(person_params)

        groups_join, groups_params = self._get_groups_query()
        self.params.update(groups_params)

        sql = f"""
            SELECT {','.join(columns)} FROM events {events}
            {self._get_distinct_id_query()}
            {person_join}
            {groups_join}
            {funnel_join}
            WHERE team_id = %(team_id)s
            {event_clause}
            {date_clause}
            {prop_clause}
            {funnel_condition}
            ORDER BY {distinct_ids}.person_id, {events}.timestamp
        """
        return sql, self.params

    def _get_funnel_paths_fragments(self) -> Tuple[str, str, str]:
        """Return the funnel-related ``(select column, JOIN, WHERE condition)``.

        All three are empty strings when the filter's ``funnel_paths`` is not
        one of the after-step / before-step / between-steps modes.
        """
        actors = self.FUNNEL_PERSONS_ALIAS
        events = self.EVENT_TABLE_ALIAS
        funnel_paths = self._filter.funnel_paths
        join_clause = f"JOIN {actors} ON {actors}.actor_id = {self.DISTINCT_ID_TABLE_ALIAS}.person_id"

        if funnel_paths in (FUNNEL_PATH_AFTER_STEP, FUNNEL_PATH_BEFORE_STEP):
            # When looking at paths leading up to a drop-off, the funnel window
            # is added so that events between the latest event and the moment
            # the person is deemed dropped off are still included.
            funnel_window = (
                f"+ INTERVAL {self._filter.funnel_window_interval} {self._filter.funnel_window_interval_unit_ch()}"
            )
            comparison = ">=" if funnel_paths == FUNNEL_PATH_AFTER_STEP else "<="

            window_suffix = ""
            if funnel_paths == FUNNEL_PATH_BEFORE_STEP:
                step = self._filter.funnel_step
                if step and step < 0:
                    window_suffix = funnel_window

            return (
                f"{actors}.timestamp AS target_timestamp",
                join_clause,
                f"AND {events}.timestamp {comparison} target_timestamp {window_suffix}",
            )

        if funnel_paths == FUNNEL_PATH_BETWEEN_STEPS:
            return (
                f"{actors}.min_timestamp as min_timestamp, {actors}.max_timestamp as max_timestamp",
                join_clause,
                f"AND {events}.timestamp >= min_timestamp AND {events}.timestamp <= max_timestamp",
            )

        return "", "", ""

    def _determine_should_join_distinct_ids(self) -> None:
        """Always join distinct ids; the query selects and orders by ``person_id``."""
        self._should_join_distinct_ids = True

    def _get_grouping_fields(self) -> Tuple[List[str], Dict[str, Any]]:
        """Build the columns that turn ``path_item_ungrouped`` into ``path_item``.

        With path-cleaning replacements (team-level ones when the filter enables
        ``path_replacements``, followed by the filter's local ones) the result
        is a chain of ``replaceRegexpAll`` columns, each reading the previous
        one, the last of which is aliased ``path_item``. Without replacements
        the columns use the ``regex_groupings`` / ``groupings`` parameters,
        which are not set here and must be supplied by the caller.

        Returns:
            ``(columns, params)`` where ``params`` holds the regex and alias of
            every replacement (empty when there are none).
        """
        team: Team = Team.objects.get(pk=self._team_id)

        cleaning_rules: List[Any] = []
        if self._filter.path_replacements and team.path_cleaning_filters:
            cleaning_rules.extend(team.path_cleaning_filters)
        if self._filter.local_path_cleaning_filters:
            cleaning_rules.extend(self._filter.local_path_cleaning_filters)

        if not cleaning_rules:
            return (
                [
                    "multiMatchAnyIndex(path_item_ungrouped, %(regex_groupings)s) AS group_index",
                    "if(group_index > 0, %(groupings)s[group_index], path_item_ungrouped) AS path_item",
                ],
                {},
            )

        columns: List[str] = []
        params: Dict[str, Any] = {}
        last_idx = len(cleaning_rules) - 1
        for idx, rule in enumerate(cleaning_rules):
            alias = rule["alias"]
            regex = rule["regex"]

            # The first step reads the raw item, later steps read the previous
            # step's output; only the final step carries the public alias.
            source = "path_item_ungrouped" if idx == 0 else f"path_item_{idx - 1}"
            target = "path_item" if idx == last_idx else f"path_item_{idx}"
            columns.append(
                f"replaceRegexpAll({source}, %(regex_replacement_{idx})s, %(alias_{idx})s) as {target}"
            )
            params[f"regex_replacement_{idx}"] = regex
            params[f"alias_{idx}"] = alias

        return columns, params

    def _get_current_url_parsing(self) -> str:
        """Return the ``$current_url`` expression with trailing ``/`` trimmed.

        Values of length 1 or less are passed through untrimmed.
        """
        url_expr, _ = get_property_string_expr("events", "$current_url", "'$current_url'", "properties")
        return f"if(length({url_expr}) > 1, trim( TRAILING '/' FROM {url_expr}), {url_expr})"

    def _get_screen_name_parsing(self) -> str:
        """Return the SQL expression that reads the ``$screen_name`` property."""
        screen_expr, _ = get_property_string_expr("events", "$screen_name", "'$screen_name'", "properties")
        return screen_expr

    def _get_event_query(self) -> Tuple[str, Dict[str, Any]]:
        """Build the ``AND ...`` clause restricting which events are considered.

        Inclusion rules (pageviews, screens, all custom events, specific custom
        events) are OR-ed together; the exclusion list is AND-ed on top.

        Returns:
            ``(clause, params)``; ``("", {})`` when the filter sets none of them.
        """
        params: Dict[str, Any] = {}

        included = []
        if self._filter.include_pageviews:
            included.append(f"event = '{PAGEVIEW_EVENT}'")
        if self._filter.include_screenviews:
            included.append(f"event = '{SCREEN_EVENT}'")
        if self._filter.include_all_custom_events:
            # '%%' is kept doubled so it survives %-style parameter substitution.
            included.append("NOT event LIKE '$%%'")
        if self._filter.custom_events:
            included.append("event IN %(custom_events)s")
            params["custom_events"] = self._filter.custom_events

        clauses = []
        if included:
            clauses.append(f"({' OR '.join(included)})")
        if self._filter.exclude_events:
            clauses.append("NOT path_item IN %(exclude_events)s")
            params["exclude_events"] = self._filter.exclude_events

        if not clauses:
            return "", {}
        return f" AND {' AND '.join(clauses)}", params

    def _should_query_url(self) -> bool:
        """Whether pageview events should be mapped to their current URL.

        True when no target or custom events are set and pageviews are not
        excluded, or when the filter explicitly includes pageviews.
        """
        nothing_selected = self._filter.target_events == [] and self._filter.custom_events == []
        if nothing_selected and PAGEVIEW_EVENT not in self._filter.exclude_events:
            return True
        return bool(self._filter.include_pageviews)

    def _should_query_screen(self) -> bool:
        """Whether screen events should be mapped to their screen name.

        True when no target or custom events are set and screen events are not
        excluded, or when the filter explicitly includes screen views.
        """
        nothing_selected = self._filter.target_events == [] and self._filter.custom_events == []
        if nothing_selected and SCREEN_EVENT not in self._filter.exclude_events:
            return True
        return bool(self._filter.include_screenviews)