0
0
mirror of https://github.com/PostHog/posthog.git synced 2024-12-01 12:21:02 +01:00
posthog/ee/clickhouse/queries/funnels/funnel_time_to_convert.py
Alex Gyujin Kim 662c2be31a
Remove steps parameter in time conversion result (#6046)
* remove steps parameter in time conversion result

* fix tests

Co-authored-by: Marius Andra <marius.andra@gmail.com>
2021-09-21 15:23:32 +02:00

122 lines
6.1 KiB
Python

from typing import Type
from rest_framework.exceptions import ValidationError
from ee.clickhouse.queries.funnels.base import ClickhouseFunnelBase
from ee.clickhouse.queries.funnels.funnel import ClickhouseFunnel
from posthog.constants import FUNNEL_TO_STEP
from posthog.models.filters.filter import Filter
from posthog.models.team import Team
class ClickhouseFunnelTimeToConvert(ClickhouseFunnelBase):
def __init__(
self, filter: Filter, team: Team, funnel_order_class: Type[ClickhouseFunnelBase] = ClickhouseFunnel
) -> None:
super().__init__(filter, team)
self.funnel_order = funnel_order_class(filter, team)
def _format_results(self, results: list) -> dict:
return {
"bins": [(bin_from_seconds, person_count) for bin_from_seconds, person_count, _ in results],
"average_conversion_time": results[0][2],
}
def get_query(self) -> str:
steps_per_person_query = self.funnel_order.get_step_counts_query()
self.params.update(self.funnel_order.params)
# expects 1 person per row, whatever their max step is, and the step conversion times for this person
# Conversion from which step should be calculated
from_step = self._filter.funnel_from_step or 0
# Conversion to which step should be calculated
to_step = self._filter.funnel_to_step or len(self._filter.entities) - 1
# Use custom bin_count if provided by user, otherwise infer an automatic one based on the number of samples
bin_count = self._filter.bin_count
if bin_count is not None:
# Custom count is clamped between 1 and 90
if bin_count < 1:
bin_count = 1
elif bin_count > 90:
bin_count = 90
bin_count_identifier = str(bin_count)
bin_count_expression = None
else:
# Auto count is clamped between 3 and 60
bin_count_identifier = "bin_count"
bin_count_expression = f"""
count() AS sample_count,
least(60, greatest(3, ceil(cbrt(sample_count)))) AS {bin_count_identifier},
"""
if not (0 < to_step < len(self._filter.entities)):
raise ValidationError(
f'Filter parameter {FUNNEL_TO_STEP} can only be one of {", ".join(map(str, range(1, len(self._filter.entities))))} for time to convert!'
)
steps_average_conversion_time_identifiers = [
f"step_{step+1}_average_conversion_time_inner" for step in range(from_step, to_step)
]
steps_average_conversion_time_expression_sum = " + ".join(steps_average_conversion_time_identifiers)
steps_average_conditional_for_invalid_values = [
f"{identifier} >= 0" for identifier in steps_average_conversion_time_identifiers
]
# :HACK: Protect against CH bug https://github.com/ClickHouse/ClickHouse/issues/26580
# once the issue is resolved, stop skipping the test: test_auto_bin_count_single_step_duplicate_events
# and remove this comment
query = f"""
WITH
step_runs AS (
SELECT * FROM (
{steps_per_person_query}
) WHERE {" AND ".join(steps_average_conditional_for_invalid_values)}
),
histogram_params AS (
/* Binning ensures that each sample belongs to a bin in results */
/* If bin_count is not a custom number, it's calculated in bin_count_expression */
SELECT
floor(min({steps_average_conversion_time_expression_sum})) AS from_seconds,
ceil(max({steps_average_conversion_time_expression_sum})) AS to_seconds,
round(avg({steps_average_conversion_time_expression_sum}), 2) AS average_conversion_time,
{bin_count_expression or ""}
ceil((to_seconds - from_seconds) / {bin_count_identifier}) AS bin_width_seconds_raw,
/* Use 60 seconds as fallback bin width in case of only one sample */
if(bin_width_seconds_raw > 0, bin_width_seconds_raw, 60) AS bin_width_seconds
FROM step_runs
),
/* Below CTEs make histogram_params columns available to the query below as straightforward identifiers */
( SELECT bin_width_seconds FROM histogram_params ) AS bin_width_seconds,
/* bin_count is only made available as an identifier if it had to be calculated */
{
f"( SELECT {bin_count_identifier} FROM histogram_params ) AS {bin_count_identifier},"
if bin_count_expression else ""
}
( SELECT from_seconds FROM histogram_params ) AS histogram_from_seconds,
( SELECT to_seconds FROM histogram_params ) AS histogram_to_seconds,
( SELECT average_conversion_time FROM histogram_params ) AS histogram_average_conversion_time
SELECT
bin_from_seconds,
person_count,
histogram_average_conversion_time AS average_conversion_time
FROM (
/* Calculating bins from step runs */
SELECT
histogram_from_seconds + floor(({steps_average_conversion_time_expression_sum} - histogram_from_seconds) / bin_width_seconds) * bin_width_seconds AS bin_from_seconds,
count() AS person_count
FROM step_runs
GROUP BY bin_from_seconds
) results
RIGHT OUTER JOIN (
/* Making sure bin_count bins are returned */
/* Those not present in the results query due to lack of data simply get person_count 0 */
SELECT histogram_from_seconds + number * bin_width_seconds AS bin_from_seconds FROM system.numbers LIMIT {bin_count_identifier} + 1
) fill
USING (bin_from_seconds)
ORDER BY bin_from_seconds
SETTINGS allow_experimental_window_functions = 1"""
return query