2021-06-29 00:48:35 +02:00
|
|
|
|
from datetime import date, datetime, timedelta
|
2021-07-27 18:17:14 +02:00
|
|
|
|
from itertools import groupby
|
2021-07-13 11:57:19 +02:00
|
|
|
|
from typing import Optional, Tuple, Type, Union
|
2021-06-03 23:06:08 +02:00
|
|
|
|
|
2021-07-27 19:09:29 +02:00
|
|
|
|
from dateutil.relativedelta import relativedelta
|
|
|
|
|
|
2021-07-01 18:11:54 +02:00
|
|
|
|
from ee.clickhouse.queries.funnels.base import ClickhouseFunnelBase
|
2021-07-09 02:03:56 +02:00
|
|
|
|
from ee.clickhouse.queries.funnels.funnel import ClickhouseFunnel
|
2021-06-29 00:48:35 +02:00
|
|
|
|
from ee.clickhouse.queries.util import get_time_diff, get_trunc_func_ch
|
2021-06-30 17:36:51 +02:00
|
|
|
|
from posthog.constants import BREAKDOWN
|
2021-07-27 18:17:14 +02:00
|
|
|
|
from posthog.models.cohort import Cohort
|
2021-06-30 17:36:51 +02:00
|
|
|
|
from posthog.models.filters.filter import Filter
|
|
|
|
|
from posthog.models.team import Team
|
2021-06-03 23:06:08 +02:00
|
|
|
|
|
|
|
|
|
# strftime pattern used in this module to render datetimes that are handed to
# ClickHouse (as the `entrance_period_start` query parameter and inside toDateTime('...')).
TIMESTAMP_FORMAT = "%Y-%m-%d %H:%M:%S"
|
2021-07-27 03:46:20 +02:00
|
|
|
|
# strftime pattern used in this module for the human-facing "labels" series, e.g. "3-Jun-2021".
# NOTE(review): "%-d" (day of month without zero padding) is a platform-specific
# strftime extension and is not supported on every platform (e.g. Windows) - confirm deployment targets.
HUMAN_READABLE_TIMESTAMP_FORMAT = "%-d-%b-%Y"
|
2021-06-03 23:06:08 +02:00
|
|
|
|
|
|
|
|
|
|
2021-07-01 18:11:54 +02:00
|
|
|
|
class ClickhouseFunnelTrends(ClickhouseFunnelBase):
    """
    ## Funnel trends assumptions

    Funnel trends are a graph of conversion over time – meaning a Y ({conversion_rate}) for each X ({entrance_period}).

    ### What is {entrance_period}?

    A funnel is considered entered by a user when they have performed its first step.
    When that happens, we consider that an entrance of funnel.

    Now, our time series is based on a sequence of {entrance_period}s, each starting at {entrance_period_start}
    and ending _right before the next_ {entrance_period_start}. A person is then counted at most once in each
    {entrance_period}.

    ### What is {conversion_rate}?

    Each time a funnel is entered by a person, they have exactly {funnel_window_interval} {funnel_window_interval_unit} to go
    through the funnel's steps. Later events are just not taken into account.

    For {conversion_rate}, we need to know reference steps: {from_step} and {to_step}.
    By default they are respectively the first and the last steps of the funnel.

    Then for each {entrance_period} we calculate {reached_from_step_count} – the number of persons
    who entered the funnel and reached step {from_step} (along with all the steps leading up to it, if there are any).
    Similarly we calculate {reached_to_step_count}, which is the number of persons from {reached_from_step_count}
    who also reached step {to_step} (along with all the steps leading up to it, including of course step {from_step}).

    {conversion_rate} is simply {reached_to_step_count} divided by {reached_from_step_count},
    multiplied by 100 to be a percentage.

    If no people have reached step {from_step} in the period, {conversion_rate} is zero.
    """

    def __init__(
        self, filter: Filter, team: Team, funnel_order_class: Type[ClickhouseFunnelBase] = ClickhouseFunnel
    ) -> None:
        """Initialise the base funnel and build the delegate funnel used for per-person step counts.

        `funnel_order_class` is instantiated with the same `filter` and `team` and kept as
        `self.funnel_order`; its step-count query and params are reused by this class.
        """
        super().__init__(filter, team)

        self.funnel_order = funnel_order_class(filter, team)

    def _exec_query(self):
        """Run the base class query and convert the raw rows into per-period summary dicts."""
        return self._summarize_data(super()._exec_query())

    def get_step_counts_without_aggregation_query(
        self, *, specific_entrance_period_start: Optional[datetime] = None
    ) -> str:
        """Build the SQL giving, per person and entrance period, the maximum number of steps completed.

        The delegate funnel's step-count query is wrapped and grouped by person and truncated
        timestamp. When `specific_entrance_period_start` is given, it is stored in
        `self.params["entrance_period_start"]` and the query is restricted to that single period.
        """
        steps_per_person_query = self.funnel_order.get_step_counts_without_aggregation_query()

        interval_method = get_trunc_func_ch(self._filter.interval)

        # This is used by funnel trends when we only need data for one period, e.g. person per data point
        entrance_period_filter = ""
        if specific_entrance_period_start:
            self.params["entrance_period_start"] = specific_entrance_period_start.strftime(TIMESTAMP_FORMAT)
            entrance_period_filter = "WHERE toDateTime(entrance_period_start) = %(entrance_period_start)s"

        breakdown_clause = self._get_breakdown_prop()
        return f"""
            SELECT
                person_id,
                {interval_method}(timestamp) AS entrance_period_start,
                max(steps) AS steps_completed
                {breakdown_clause}
            FROM (
                {steps_per_person_query}
            )
            {entrance_period_filter}
            GROUP BY person_id, entrance_period_start {breakdown_clause}"""

    def get_query(self) -> str:
        """Build the full funnel trends SQL.

        The inner query counts, per entrance period (and breakdown value, if any), how many persons
        satisfied the from-step and to-step conditions. It is FULL OUTER JOINed with a generated
        series of period starts (one per interval, derived from `date_from`) so that periods with
        no data still produce a row. The outer SELECT adds the conversion rate as a percentage
        rounded to two decimals, or 0 when nobody reached the from-step.
        """
        step_counts = self.get_step_counts_without_aggregation_query()
        # Expects multiple rows for same person, first event time, steps taken.
        self.params.update(self.funnel_order.params)

        reached_from_step_count_condition, reached_to_step_count_condition, _ = self.get_steps_reached_conditions()
        interval_method = get_trunc_func_ch(self._filter.interval)
        num_intervals, seconds_in_interval, _ = get_time_diff(
            self._filter.interval or "day", self._filter.date_from, self._filter.date_to, team_id=self._team.pk
        )

        breakdown_clause = self._get_breakdown_prop()

        query = f"""
            SELECT
                entrance_period_start,
                reached_from_step_count,
                reached_to_step_count,
                if(reached_from_step_count > 0, round(reached_to_step_count / reached_from_step_count * 100, 2), 0) AS conversion_rate
                {breakdown_clause}
            FROM (
                SELECT
                    entrance_period_start,
                    countIf({reached_from_step_count_condition}) AS reached_from_step_count,
                    countIf({reached_to_step_count_condition}) AS reached_to_step_count
                    {breakdown_clause}
                FROM (
                    {step_counts}
                ) GROUP BY entrance_period_start {breakdown_clause}
            ) data
            FULL OUTER JOIN (
                SELECT
                    {interval_method}(toDateTime('{self._filter.date_from.strftime(TIMESTAMP_FORMAT)}') + number * {seconds_in_interval}) AS entrance_period_start
                    {', breakdown_value as prop' if breakdown_clause else ''}
                FROM numbers({num_intervals}) AS period_offsets
                {'ARRAY JOIN (%(breakdown_values)s) AS breakdown_value' if breakdown_clause else ''}
            ) fill
            USING (entrance_period_start {breakdown_clause})
            ORDER BY entrance_period_start ASC
            SETTINGS allow_experimental_window_functions = 1"""

        return query

    def get_steps_reached_conditions(self) -> Tuple[str, str, str]:
        """Return SQL conditions on `steps_completed` as a 3-tuple of strings.

        The tuple is (reached from-step, reached to-step, reached from-step but not to-step).
        Step indices from the filter are zero-based, hence the `+ 1` when comparing against the
        count of completed steps.
        """
        # NOTE(review): `or` treats an explicit step index of 0 the same as "not set".
        # That is harmless for from_step, but an explicit funnel_to_step of 0 falls back to the last step.

        # How many steps must have been done to count for the denominator of a funnel trends data point
        from_step = self._filter.funnel_from_step or 0
        # How many steps must have been done to count for the numerator of a funnel trends data point
        to_step = self._filter.funnel_to_step or len(self._filter.entities) - 1

        # Those who converted OR dropped off
        reached_from_step_count_condition = f"steps_completed >= {from_step+1}"
        # Those who converted
        reached_to_step_count_condition = f"steps_completed >= {to_step+1}"
        # Those who dropped off
        did_not_reach_to_step_count_condition = f"{reached_from_step_count_condition} AND steps_completed < {to_step+1}"
        return reached_from_step_count_condition, reached_to_step_count_condition, did_not_reach_to_step_count_condition

    def _summarize_data(self, results):
        """Turn raw result rows into a list of dicts, one per (period, breakdown value).

        Columns are read positionally: timestamp, reached_from_step_count, reached_to_step_count,
        conversion_rate. When a breakdown is active, the last column is the breakdown value:
        strings are used as-is, anything else is treated as a Cohort primary key and replaced by
        that cohort's name.
        """
        breakdown_clause = self._get_breakdown_prop()

        # The same cohort appears once per period, so remember names instead of
        # hitting the database for every row.
        cohort_names: dict = {}

        summary = []
        for period_row in results:
            serialized_result = {
                "timestamp": period_row[0],
                "reached_from_step_count": period_row[1],
                "reached_to_step_count": period_row[2],
                "conversion_rate": period_row[3],
                "is_period_final": self._is_period_final(period_row[0]),
            }

            if breakdown_clause:
                raw_breakdown = period_row[-1]
                if isinstance(raw_breakdown, str):
                    breakdown_value = raw_breakdown
                else:
                    if raw_breakdown not in cohort_names:
                        cohort_names[raw_breakdown] = Cohort.objects.get(pk=raw_breakdown).name
                    breakdown_value = cohort_names[raw_breakdown]
                serialized_result.update({"breakdown_value": breakdown_value})

            summary.append(serialized_result)
        return summary

    def _format_results(self, summary):
        """Format summary rows into chart series.

        With a breakdown on the filter, rows are sorted and grouped by `breakdown_value` and one
        series dict (tagged with its `breakdown_value`) is returned per group; otherwise a
        single-element list with one series for the whole summary is returned.
        """
        if not self._filter.breakdown:
            return [self._format_single_summary(summary)]

        def grouper(row):
            return row["breakdown_value"]

        # groupby only merges adjacent rows, so the data must be sorted by the same key first.
        sorted_data = sorted(summary, key=grouper)
        final_res = []
        for key, value in groupby(sorted_data, grouper):
            breakdown_res = self._format_single_summary(list(value))
            final_res.append({**breakdown_res, "breakdown_value": key})
        return final_res

    def _format_single_summary(self, summary):
        """Build one series dict with `count`, `data`, `days` and `labels` from summary rows.

        `data` holds conversion rates, `days` machine-readable timestamps (with a time part for
        hour/minute intervals) and `labels` human-readable dates.
        """
        # The day format depends only on the filter interval, so compute it once.
        hour_min_sec = " %H:%M:%S" if self._filter.interval in ("hour", "minute") else ""
        day_format = f"%Y-%m-%d{hour_min_sec}"

        data = []
        days = []
        labels = []
        for row in summary:
            data.append(row["conversion_rate"])
            days.append(row["timestamp"].strftime(day_format))
            labels.append(row["timestamp"].strftime(HUMAN_READABLE_TIMESTAMP_FORMAT))
        return {
            "count": len(summary),
            "data": data,
            "days": days,
            "labels": labels,
        }

    def _is_period_final(self, timestamp: Union[datetime, date]):
        """Return True when `timestamp` is at least one funnel window before the current UTC time.

        The comparison is made at date granularity. The window defaults to the "day" unit when
        `funnel_window_interval_unit` is None.
        """
        # difference between current date and timestamp greater than window
        now = datetime.utcnow()
        intervals_to_subtract = self._filter.funnel_window_interval * -1
        interval_unit = (
            "day" if self._filter.funnel_window_interval_unit is None else self._filter.funnel_window_interval_unit
        )
        delta = relativedelta(**{f"{interval_unit}s": intervals_to_subtract})
        # Subtract from a datetime and only then reduce to a date: adding a relativedelta with
        # hour/minute components to a plain date yields a datetime, and ordering a date against
        # a datetime raises TypeError.
        completed_end = (now + delta).date()
        compare_timestamp = timestamp.date() if isinstance(timestamp, datetime) else timestamp
        return compare_timestamp <= completed_end
|