0
0
mirror of https://github.com/PostHog/posthog.git synced 2024-11-28 09:16:49 +01:00
posthog/ee/clickhouse/queries/funnels/funnel_unordered.py
Neil Kakkar d2fe491bfc
Funnel Groups: Rename person_id to aggregation_target (#6939)
* smallest change to make aggregation work

* address comments

* add snapshot

* move function to groups model

* update funnel snapshot

* rename person_id to aggregation_target

* update snapshots as well

* don't support persons query mods for now

* update snapshot

* make array orders deterministic
2021-11-08 14:45:13 +02:00

161 lines
6.7 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from typing import List, cast
from rest_framework.exceptions import ValidationError
from ee.clickhouse.queries.funnels.base import ClickhouseFunnelBase
class ClickhouseFunnelUnordered(ClickhouseFunnelBase):
    """
    Unordered Funnel is a funnel where the order of steps doesn't matter.

    ## Query Intuition

    Imagine a funnel with three events: A, B, and C.
    This query splits the problem into two parts:

    1. Given the first event is A, find the furthest everyone went starting from A.
       This finds any B's and C's that happen after A (without ordering them).
    2. Repeat the above, assuming the first event to be B, and then C.

    Then, the outer query unions the results of these per-start-event queries
    and takes the maximum of them.

    ## Results

    The result format is the same as the basic funnel, i.e. [step, count].
    Here, `step_i` (0 indexed) signifies the number of people that did at least `i+1` steps.

    ## Exclusion Semantics

    For unordered funnels, exclusion is a bit weird. It means, given all orderings of the steps,
    how far can you go without seeing an exclusion event.
    If you see an exclusion event => you're discarded.
    See test_advanced_funnel_multiple_exclusions_between_steps for details.

    ## Implementation note

    The SQL fragments below are assembled from adjacent string literals, one
    per SQL line, each ending in an explicit newline, so the Python source can
    be indented without adding any whitespace to the generated query text.
    """

    def get_query(self):
        """
        Build the outermost funnel query.

        Selects the count columns, step-time averages and step-time medians
        (plus the breakdown column, if any) over `get_step_counts_query()`.

        Raises:
            ValidationError: if any exclusion does not span the whole funnel,
                i.e. does not run from step 0 to the last step.
        """
        max_steps = len(self._filter.entities)
        last_step = max_steps - 1

        for exclusion in self._filter.exclusions:
            if exclusion.funnel_from_step != 0 or exclusion.funnel_to_step != last_step:
                raise ValidationError("Partial Exclusions not allowed in unordered funnels")

        breakdown_clause = self._get_breakdown_prop()
        count_columns = self._get_count_columns(max_steps)
        step_time_avgs = self._get_step_time_avgs(max_steps)
        step_time_medians = self._get_step_time_median(max_steps)
        step_counts_query = self.get_step_counts_query()
        # Only group when a breakdown column is actually being selected.
        group_by = "GROUP BY prop" if breakdown_clause != "" else ""

        return (
            "\n"
            f"SELECT {count_columns} {step_time_avgs} {step_time_medians} {breakdown_clause} FROM (\n"
            f"{step_counts_query}\n"
            f") {group_by} SETTINGS allow_experimental_window_functions = 1\n"
        )

    def get_step_counts_query(self):
        """
        Build the query that keeps, per aggregation target, the furthest step reached.

        The inner select computes `max(steps)` over each aggregation target
        (and breakdown value, if any) across the unioned per-start-event
        queries; the outer select keeps only rows where `steps = max_steps`.
        """
        max_steps = len(self._filter.entities)
        union_query = self.get_step_counts_without_aggregation_query()
        breakdown_clause = self._get_breakdown_prop()
        inner_timestamps, outer_timestamps = self._get_timestamp_selects()
        step_time_avgs = self._get_step_time_avgs(max_steps, inner_query=True)
        step_time_medians = self._get_step_time_median(max_steps, inner_query=True)
        step_time_names = self._get_step_time_names(max_steps)

        return (
            "\n"
            "SELECT aggregation_target, steps "
            f"{step_time_avgs} {step_time_medians} {breakdown_clause} {outer_timestamps} FROM (\n"
            "SELECT aggregation_target, steps, max(steps) over "
            f"(PARTITION BY aggregation_target {breakdown_clause}) as max_steps "
            f"{step_time_names} {breakdown_clause} {inner_timestamps} FROM (\n"
            f"{union_query}\n"
            ")\n"
            f") GROUP BY aggregation_target, steps {breakdown_clause}\n"
            "HAVING steps = max_steps\n"
        )

    def get_step_counts_without_aggregation_query(self):
        """
        Build one sub-query per possible first step and join them with UNION ALL.

        For the i-th sub-query the entity list is rotated left by `i`, so each
        entity in turn plays the role of `step_0`. Each sub-query keeps only
        rows where `step_0 = 1` and, when exclusions are configured, where the
        computed `exclusion` column is 0.
        """
        entities = list(self._filter.entities)
        max_steps = len(entities)

        # Everything below is identical for every rotation, so compute it once.
        partition_select = self._get_partition_cols(1, max_steps)
        sorting_condition = self.get_sorting_condition(max_steps)
        breakdown_clause = self._get_breakdown_prop(group_remaining=True)
        exclusion_clause = self._get_exclusion_condition()
        step_times = self._get_step_times(max_steps)
        exclusion_filter = "AND exclusion = 0" if exclusion_clause else ""

        union_queries = []
        for i in range(max_steps):
            # Rotate entities left by i to get a new first event; a fresh list
            # is built each time rather than mutating one shared list in place.
            rotated_entities = entities[i:] + entities[:i]
            inner_events = self._get_inner_event_query(rotated_entities, f"events_{i}")

            inner_query = (
                "\n"
                "SELECT\n"
                "aggregation_target,\n"
                "timestamp,\n"
                f"{partition_select}\n"
                f"{breakdown_clause}\n"
                f"FROM ({inner_events})\n"
            )
            union_queries.append(
                "\n"
                f"SELECT *, {sorting_condition} AS steps {exclusion_clause} {step_times} FROM (\n"
                f"{inner_query}\n"
                ") WHERE step_0 = 1\n"
                f"{exclusion_filter}\n"
            )

        return " UNION ALL ".join(union_queries)

    def _get_step_times(self, max_steps: int):
        """
        Return the select fragment (with a leading ", ") for conversion times.

        The fragment first defines `conversion_times` as the sorted array of
        all `latest_i` columns, then one `step_i_conversion_time` column per
        step i >= 1: the number of seconds between consecutive sorted
        timestamps, or NULL when the later one is missing or falls outside
        the funnel window.
        """
        window = f"INTERVAL {self._filter.funnel_window_interval} {self._filter.funnel_window_interval_unit_ch()}"
        latest_columns = ",".join(f"latest_{i}" for i in range(max_steps))

        conditions: List[str] = [f"arraySort([{latest_columns}]) as conversion_times"]
        # Array indices in ClickHouse are 1-based, so step i (0-based) lives at
        # conversion_times[i + 1] and its predecessor at conversion_times[i].
        conditions.extend(
            f"if(isNotNull(conversion_times[{i + 1}]) AND conversion_times[{i + 1}] <= conversion_times[{i}] + {window}, "
            f"dateDiff('second', conversion_times[{i}], conversion_times[{i + 1}]), NULL) step_{i}_conversion_time"
            for i in range(1, max_steps)
        )

        # `conditions` always contains at least the arraySort entry, so the
        # fragment is never empty.
        return f", {', '.join(conditions)}"

    def get_sorting_condition(self, max_steps: int):
        """
        Return the select expression that evaluates to the number of steps reached.

        For funnels with more than one step this defines `event_times` (the
        sorted array of all `latest_i` columns, used by the exclusion
        condition) and sums one flag per later step that happened after
        `latest_0` and within the funnel window, plus 1 for the first step
        itself. With one step or fewer the expression is simply the literal "1".
        """
        if max_steps <= 1:
            # No later steps to compare against: only the first step counts.
            return "1"

        window = f"INTERVAL {self._filter.funnel_window_interval} {self._filter.funnel_window_interval_unit_ch()}"
        latest_columns = ",".join(f"latest_{i}" for i in range(max_steps))
        step_flags = ",".join(
            f"if(latest_0 < latest_{i} AND latest_{i} <= latest_0 + {window}, 1, 0)" for i in range(1, max_steps)
        )

        # event_times is a replacement of latest_i for whatever query part
        # requires it, just like conversion_times.
        return f"arraySort([{latest_columns}]) as event_times,arraySum([{step_flags}, 1])"

    def _get_exclusion_condition(self):
        """
        Return the select fragment (with a leading ", ") defining `exclusion`.

        `exclusion` is the number of configured exclusions whose event falls
        strictly between the exclusion's from-step timestamp and either the
        to-step timestamp taken from `event_times` or, when that is NULL, the
        end of the funnel window. Returns "" when no exclusions are configured.
        """
        exclusions = self._filter.exclusions
        if not exclusions:
            return ""

        window = f"INTERVAL {self._filter.funnel_window_interval} {self._filter.funnel_window_interval_unit_ch()}"

        conditions = []
        for exclusion_id, exclusion in enumerate(exclusions):
            from_time = f"latest_{exclusion.funnel_from_step}"
            # +1 because ClickHouse array indices are 1-based.
            to_time = f"event_times[{cast(int, exclusion.funnel_to_step) + 1}]"
            exclusion_time = f"exclusion_{exclusion_id}_latest_{exclusion.funnel_from_step}"
            conditions.append(
                f"if( {exclusion_time} > {from_time} AND {exclusion_time} < "
                f"if(isNull({to_time}), {from_time} + {window}, {to_time}), 1, 0)"
            )

        return f", arraySum([{','.join(conditions)}]) as exclusion"