0
0
mirror of https://github.com/PostHog/posthog.git synced 2024-12-01 04:12:23 +01:00
posthog/ee/clickhouse/views/test/test_clickhouse_retention.py
Harry Waye c595976779
fix(retention): fix breakdown people urls (#7642)
* fix(retention): fix breakdown people urls

This change returns people_url for each breakdown cohort in the
response. We also merge the initial and returning queries together,
as this makes it easier to align the people query also.

Note that I'm talking about person_id as opposed to actor_type etc.
but perhaps that can be a followup.

* clean up clickhouse params

* tidy up a little

* remove import

* remove non-breakdown specific code

* make cohort by initial event date a special breakdown case

* keep date for backwards compat

* Remove unused sql

* make test stable

* wip

* Get most of the tests working

* test(retention): remove graph retention test

We no longer need this, we have all the information we need from the
table response for retention, and can construct this on the frontend.

* revert any changes to posthog/queries/retention.py

* revert any changes to ee/clickhouse/models/person.py

* Revert posthog/queries/retention.py to merge-base

* Ensure actor id is a str

* Add type for actor serialiser for type narrowing

* run black

* sort imports

* Remove retention_actors.py

* fix typings

* format

* reverse str type

* sort imports

* rename

* split out functions

* remove duplicate logic

* working

* fix type

* don't stringify

* fix test

* ordering doesn't matter

* trigger ci

Co-authored-by: eric <eeoneric@gmail.com>
2021-12-15 18:20:56 +00:00

449 lines
18 KiB
Python

from dataclasses import asdict, dataclass
from typing import List, Literal, Optional, TypedDict, Union
from django.test import TestCase
from django.test.client import Client
from ee.clickhouse.test.test_journeys import _create_all_events, update_or_create_person
from ee.clickhouse.util import ClickhouseTestMixin
from ee.clickhouse.views.test.funnel.util import EventPattern
from posthog.api.test.test_organization import create_organization
from posthog.api.test.test_team import create_team
from posthog.api.test.test_user import create_user
from posthog.test.base import test_with_materialized_columns
from posthog.utils import encode_get_request_params
class RetentionBreakdownTests(TestCase, ClickhouseTestMixin):
    """
    Tests of the retention insight endpoint that check which people end up in
    each cohort/period, both for the default per-period cohorts and for
    explicit person/event property breakdowns.
    """

    def test_can_get_retention_cohort_breakdown(self):
        """
        With `retention_first_time` and no explicit breakdown, the response is
        expected to contain one cohort per day ("Day 0", "Day 1"), with each
        person only listed under the cohort of the day they first performed
        the target event.
        """
        organization = create_organization(name="test")
        team = create_team(organization=organization)
        user = create_user(email="test@posthog.com", password="1234", organization=organization)

        self.client.force_login(user)

        update_or_create_person(distinct_ids=["person 1"], team_id=team.pk)
        update_or_create_person(distinct_ids=["person 2"], team_id=team.pk)
        update_or_create_person(distinct_ids=["person 3"], team_id=team.pk)

        setup_user_activity_by_day(
            daily_activity={
                "2020-01-01": {"person 1": [{"event": "target event"}], "person 2": [{"event": "target event"}]},
                "2020-01-02": {"person 1": [{"event": "target event"}], "person 3": [{"event": "target event"}]},
                "2020-01-03": {"person 1": [{"event": "target event"}], "person 3": [{"event": "target event"}]},
            },
            team=team,
        )

        retention = get_retention_ok(
            client=self.client,
            team_id=team.pk,
            request=RetentionRequest(
                target_entity={"id": "target event", "type": "events"},
                returning_entity={"id": "target event", "type": "events"},
                date_from="2020-01-01",
                total_intervals=2,
                date_to="2020-01-02",
                period="Day",
                retention_type="retention_first_time",
            ),
        )

        retention_by_cohort_by_period = get_by_cohort_by_period_for_response(client=self.client, response=retention)

        assert retention_by_cohort_by_period == {
            "Day 0": {"1": ["person 1", "person 2"], "2": ["person 1"]},
            "Day 1": {"1": ["person 3"]},
        }

    def test_can_get_retention_cohort_breakdown_with_retention_type_target(self):
        """
        Same data as the `retention_first_time` case, but with
        `retention_type="retention"`. Here a person is expected to be listed
        in the cohort of every day they performed the target event, so
        "person 1" shows up in both "Day 0" and "Day 1".
        """
        organization = create_organization(name="test")
        team = create_team(organization=organization)
        user = create_user(email="test@posthog.com", password="1234", organization=organization)

        self.client.force_login(user)

        update_or_create_person(distinct_ids=["person 1"], team_id=team.pk)
        update_or_create_person(distinct_ids=["person 2"], team_id=team.pk)
        update_or_create_person(distinct_ids=["person 3"], team_id=team.pk)

        setup_user_activity_by_day(
            daily_activity={
                "2020-01-01": {"person 1": [{"event": "target event"}], "person 2": [{"event": "target event"}]},
                "2020-01-02": {"person 1": [{"event": "target event"}], "person 3": [{"event": "target event"}]},
                "2020-01-03": {"person 1": [{"event": "target event"}], "person 3": [{"event": "target event"}]},
            },
            team=team,
        )

        retention = get_retention_ok(
            client=self.client,
            team_id=team.pk,
            request=RetentionRequest(
                target_entity={"id": "target event", "type": "events"},
                returning_entity={"id": "target event", "type": "events"},
                date_from="2020-01-01",
                total_intervals=2,
                date_to="2020-01-02",
                period="Day",
                retention_type="retention",
            ),
        )

        retention_by_cohort_by_period = get_by_cohort_by_period_for_response(client=self.client, response=retention)

        assert retention_by_cohort_by_period == {
            "Day 0": {"1": ["person 1", "person 2"], "2": ["person 1"]},
            "Day 1": {"1": ["person 3", "person 1"]},
        }

    @test_with_materialized_columns(person_properties=["os"])
    def test_can_specify_breakdown_person_property(self):
        """
        By default, we group users together by the first time they perform the
        `target_event`. However, we should also be able to specify, e.g. the
        user's OS to be able to compare retention between the OSs.
        """
        organization = create_organization(name="test")
        team = create_team(organization=organization)
        user = create_user(email="test@posthog.com", password="1234", organization=organization)

        self.client.force_login(user)

        update_or_create_person(distinct_ids=["person 1"], team_id=team.pk, properties={"os": "Chrome"})
        update_or_create_person(distinct_ids=["person 2"], team_id=team.pk, properties={"os": "Safari"})

        setup_user_activity_by_day(
            daily_activity={
                "2020-01-01": {"person 1": [{"event": "target event"}]},
                "2020-01-02": {"person 1": [{"event": "target event"}], "person 2": [{"event": "target event"}]},
                # IMPORTANT: we include data past the end of the requested
                # window, as we want to ensure that we pick up all retention
                # periods for a user. e.g. for "person 2" we do not want to miss
                # the count from 2020-01-03 i.e. the second period, otherwise we
                # will skew results for users that didn't perform their target
                # event right at the beginning of the requested range.
                "2020-01-03": {"person 1": [{"event": "target event"}], "person 2": [{"event": "target event"}]},
            },
            team=team,
        )

        retention = get_retention_ok(
            client=self.client,
            team_id=team.pk,
            request=RetentionRequest(
                target_entity={"id": "target event", "type": "events"},
                returning_entity={"id": "target event", "type": "events"},
                date_from="2020-01-01",
                total_intervals=2,
                date_to="2020-01-02",
                period="Day",
                retention_type="retention_first_time",
                breakdowns=[Breakdown(type="person", property="os")],
                # NOTE: we need to specify breakdown_type as well, as the
                # breakdown logic currently does not support multiple differing
                # types
                breakdown_type="person",
            ),
        )

        retention_by_cohort_by_period = get_by_cohort_by_period_for_response(client=self.client, response=retention)

        # NOTE: this has to be an equality comparison. `assert value, {...}`
        # would only check that `value` is truthy and use the dict as the
        # failure message, so the expected cohorts would never be verified.
        assert retention_by_cohort_by_period == {
            "Chrome": {"1": ["person 1"], "2": ["person 1"]},
            "Safari": {
                "1": ["person 2"],
                "2": ["person 2"],
            },  # IMPORTANT: the "2" value is from past the requested `date_to`
        }

    @test_with_materialized_columns(event_properties=["os"])
    def test_can_specify_breakdown_event_property(self):
        """
        By default, we group users together by the first time they perform the
        `target_event`. However, we should also be able to break down by an
        event property, e.g. the `os` set on the event, to be able to compare
        retention between the OSs.
        """
        organization = create_organization(name="test")
        team = create_team(organization=organization)
        user = create_user(email="test@posthog.com", password="1234", organization=organization)

        self.client.force_login(user)

        update_or_create_person(distinct_ids=["person 1"], team_id=team.pk)
        update_or_create_person(distinct_ids=["person 2"], team_id=team.pk)

        setup_user_activity_by_day(
            daily_activity={
                "2020-01-01": {"person 1": [{"event": "target event", "properties": {"os": "Chrome"}}]},
                "2020-01-02": {
                    "person 1": [{"event": "target event"}],
                    "person 2": [{"event": "target event", "properties": {"os": "Safari"}}],
                },
                # IMPORTANT: we include data past the end of the requested
                # window, as we want to ensure that we pick up all retention
                # periods for a user. e.g. for "person 2" we do not want to miss
                # the count from 2020-01-03 i.e. the second period, otherwise we
                # will skew results for users that didn't perform their target
                # event right at the beginning of the requested range.
                "2020-01-03": {"person 1": [{"event": "target event"}], "person 2": [{"event": "target event"}]},
            },
            team=team,
        )

        retention = get_retention_ok(
            client=self.client,
            team_id=team.pk,
            request=RetentionRequest(
                target_entity={"id": "target event", "type": "events"},
                returning_entity={"id": "target event", "type": "events"},
                date_from="2020-01-01",
                total_intervals=2,
                date_to="2020-01-02",
                period="Day",
                retention_type="retention_first_time",
                breakdowns=[Breakdown(type="event", property="os")],
                # NOTE: we need to specify breakdown_type as well, as the
                # breakdown logic currently does not support multiple differing
                # types
                breakdown_type="event",
            ),
        )

        retention_by_cohort_by_period = get_by_cohort_by_period_for_response(client=self.client, response=retention)

        assert retention_by_cohort_by_period == {
            "Chrome": {"1": ["person 1"], "2": ["person 1"]},
            "Safari": {
                "1": ["person 2"],
                "2": ["person 2"],
            },  # IMPORTANT: the "2" value is from past the requested `date_to`
        }

    @test_with_materialized_columns(event_properties=["os"])
    def test_can_specify_breakdown_event_property_and_retrieve_people(self):
        """
        This test is slightly different from the
        get_by_cohort_by_period_for_response based tests in that here we are
        checking a cohort/period specific people url that does not include the
        "appearances" detail.

        This is used, e.g. for the frontend retentions trend graph
        """
        organization = create_organization(name="test")
        team = create_team(organization=organization)
        user = create_user(email="test@posthog.com", password="1234", organization=organization)

        self.client.force_login(user)

        update_or_create_person(distinct_ids=["person 1"], team_id=team.pk)
        update_or_create_person(distinct_ids=["person 2"], team_id=team.pk)

        setup_user_activity_by_day(
            daily_activity={
                "2020-01-01": {
                    "person 1": [{"event": "target event", "properties": {"os": "Chrome"}}],
                    "person 2": [{"event": "target event", "properties": {"os": "Safari"}}],
                },
                "2020-01-02": {"person 1": [{"event": "target event"}], "person 2": [{"event": "target event"}]},
            },
            team=team,
        )

        retention = get_retention_ok(
            client=self.client,
            team_id=team.pk,
            request=RetentionRequest(
                target_entity={"id": "target event", "type": "events"},
                returning_entity={"id": "target event", "type": "events"},
                date_from="2020-01-01",
                total_intervals=2,
                date_to="2020-01-02",
                period="Day",
                retention_type="retention_first_time",
                breakdowns=[Breakdown(type="event", property="os")],
                # NOTE: we need to specify breakdown_type as well, as the
                # breakdown logic currently does not support multiple differing
                # types
                breakdown_type="event",
            ),
        )

        # Follow the people url of the first period of the "Chrome" cohort
        chrome_cohort = [cohort for cohort in retention["result"] if cohort["label"] == "Chrome"][0]
        people_url = chrome_cohort["values"][0]["people_url"]
        people_response = self.client.get(people_url)
        assert people_response.status_code == 200

        people = people_response.json()["result"]

        assert [distinct_id for person in people for distinct_id in person["distinct_ids"]] == ["person 1"]
class RetentionIntervalTests(TestCase, ClickhouseTestMixin):
    """Tests of the retention insight endpoint for a non-daily `period`."""

    def test_can_get_retention_week_interval(self):
        """
        Two people perform the target event exactly one week apart; with
        `period="Week"` they are expected to land in separate weekly cohorts.
        """
        org = create_organization(name="test")
        project = create_team(organization=org)
        self.client.force_login(create_user(email="test@posthog.com", password="1234", organization=org))

        for distinct_id in ("person 1", "person 2"):
            update_or_create_person(distinct_ids=[distinct_id], team_id=project.pk)

        activity = {
            "2020-01-01": {"person 1": [{"event": "target event"}]},
            "2020-01-08": {"person 2": [{"event": "target event"}]},
        }
        setup_user_activity_by_day(daily_activity=activity, team=project)

        weekly_request = RetentionRequest(
            target_entity={"id": "target event", "type": "events"},
            returning_entity={"id": "target event", "type": "events"},
            date_from="2020-01-01",
            total_intervals=2,
            date_to="2020-01-08",
            period="Week",
            retention_type="retention_first_time",
        )
        response = get_retention_ok(client=self.client, team_id=project.pk, request=weekly_request)
        people_by_cohort_by_period = get_by_cohort_by_period_for_response(client=self.client, response=response)

        expected = {
            "Week 0": {"1": ["person 1"], "2": []},
            "Week 1": {"1": ["person 2"]},
        }
        assert people_by_cohort_by_period == expected
def setup_user_activity_by_day(daily_activity, team):
    """
    Create events from a nested `{timestamp: {distinct_id: [event, ...]}}`
    mapping.

    Each event dict is flattened into a single dict carrying `distinct_id`,
    `team` and `timestamp` alongside its own keys (which take precedence on a
    clash), and the full list is handed to `_create_all_events` in one call.
    """
    flattened_events = []
    for day, activity_by_person in daily_activity.items():
        for distinct_id, person_events in activity_by_person.items():
            for person_event in person_events:
                flattened_events.append(
                    {"distinct_id": distinct_id, "team": team, "timestamp": day, **person_event}
                )

    _create_all_events(flattened_events)
@dataclass
class Breakdown:
    """A single breakdown entry of a retention request."""

    # What the property belongs to; the tests in this file use "person" and "event"
    type: str
    # Name of the property to break down by, e.g. "os"
    property: str
@dataclass
class RetentionRequest:
    """
    Query parameters for the retention insight endpoint.

    Instances are converted with `asdict` and encoded as GET parameters by
    `get_retention`.
    """

    date_from: str  # From what I can tell, this doesn't do anything, rather `total_intervals` is used
    total_intervals: int
    date_to: str
    target_entity: EventPattern
    returning_entity: EventPattern
    # A single multi-value Literal is equivalent to a Union of single-value
    # Literals and matches how the other fields here are declared.
    period: Literal["Hour", "Day", "Week", "Month"]
    retention_type: Literal["retention_first_time", "retention"]  # probably not an exhaustive list

    breakdowns: Optional[List[Breakdown]] = None
    breakdown_type: Optional[Literal["person", "event"]] = None
class Value(TypedDict):
    """One entry of a cohort's `values` list in a retention response."""

    # Number reported for this entry; the helpers in this file compare it to
    # the number of distinct ids fetched for the same period
    count: int
    # URL that can be fetched to list the people behind this entry
    people_url: str
class Cohort(TypedDict):
    """One cohort entry of a retention response's `result` list."""

    # One entry per retention period; the list index is the period number
    values: List[Value]
    date: str
    # Cohort name, e.g. "Day 0", or a breakdown value such as "Chrome"
    label: str
    # URL for fetching the people of this cohort together with their
    # appearances; read by `get_by_cohort_by_period_for_response`
    people_url: str
class RetentionResponse(TypedDict):
    """JSON body returned by the retention insight endpoint."""

    # One entry per cohort
    result: List[Cohort]
class Person(TypedDict):
    """The part of a person payload that the helpers in this file read."""

    # All distinct ids associated with the person
    distinct_ids: List[str]
class RetentionTableAppearance(TypedDict):
    """One entry of the `result` list returned by a cohort's people url."""

    person: Person
    # Indexed by period; a truthy entry is treated by the helpers in this
    # file as the person having appeared in that period
    appearances: List[int]
class RetentionTablePeopleResponse(TypedDict):
    """JSON body returned when fetching a cohort's people url."""

    # One entry per person
    result: List[RetentionTableAppearance]
def get_retention_ok(client: Client, team_id: int, request: RetentionRequest) -> RetentionResponse:
    """
    Request retention for `team_id` and return the decoded JSON body.

    Fails with the raw response content as the assertion message if the
    endpoint does not answer with a 200.
    """
    http_response = get_retention(client=client, team_id=team_id, request=request)
    assert http_response.status_code == 200, http_response.content
    return http_response.json()
def get_retention(client: Client, team_id: int, request: RetentionRequest):
    """
    Issue a GET against the retention insight endpoint of `team_id` and return
    the raw response, without checking its status.
    """
    endpoint = f"/api/projects/{team_id}/insights/retention/"
    # NOTE: for get requests we need to JSON encode non-scalars
    query_params = encode_get_request_params(asdict(request))
    return client.get(endpoint, data=query_params)
def get_retention_table_people_from_url_ok(client: Client, people_url: str) -> RetentionTablePeopleResponse:
    """
    Fetch `people_url` and return the decoded JSON body.

    Fails with the raw response content as the assertion message if the
    request does not answer with a 200, mirroring `get_retention_ok`.
    """
    response = client.get(people_url)
    # Include the body so a failing request is diagnosable from the test output
    assert response.status_code == 200, response.content
    return response.json()
def get_by_cohort_by_period_for_response(client: Client, response: RetentionResponse):
    """
    Resolve a retention response into the distinct ids behind every
    cohort/period.

    For each cohort its `people_url` is fetched once, and for every period the
    distinct ids of the people whose `appearances` entry for that period is
    truthy are collected. Periods are keyed by their 1-based index as a
    string, so the result looks like:

    ```
    {
        "<cohort-label>": {
            "1": ["person 1", ...]
            "2": [...]
            ...
        }
        ...
    }
    ```

    Each period's `count` is asserted to equal the number of distinct ids
    collected for it.
    """
    by_cohort = {}

    for cohort in response["result"]:
        cohort_label = cohort["label"]
        appearances = get_retention_table_people_from_url_ok(client=client, people_url=cohort["people_url"])[
            "result"
        ]

        by_period = {}
        for period_index, period_value in enumerate(cohort["values"]):
            distinct_ids = []
            for appearance in appearances:
                for distinct_id in appearance["person"]["distinct_ids"]:
                    if appearance["appearances"][period_index]:
                        distinct_ids.append(distinct_id)

            # Check the count is the same as the people size. We don't handle
            # any pagination so this could be wrong for large counts
            assert period_value["count"] == len(distinct_ids)

            by_period[str(period_index + 1)] = distinct_ids

        by_cohort[cohort_label] = by_period

    return by_cohort
def get_by_cohort_by_period_from_response(response: RetentionResponse):
    """
    Reduce a retention response to `{cohort label: {period: count}}`, with
    periods keyed by their 1-based index as a string. No requests are made.
    """
    counts_by_cohort = {}
    for cohort in response["result"]:
        cohort_label = cohort["label"]
        counts_by_period = {}
        for period_index, period_value in enumerate(cohort["values"]):
            counts_by_period[str(period_index + 1)] = period_value["count"]
        counts_by_cohort[cohort_label] = counts_by_period
    return counts_by_cohort