0
0
mirror of https://github.com/PostHog/posthog.git synced 2024-11-21 13:39:22 +01:00

Only include events in session recording query when needed (#7248)

* query for events with matching session_id if possible

* move test to clickhouse

* move event list to list instead of set for flaky test snapshot

* join events only when needed

* trigger benchmarking

* typo
This commit is contained in:
Rick Marron 2021-11-30 09:33:41 -08:00 committed by GitHub
parent 7520aca027
commit 6fc64efafa
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 121 additions and 123 deletions

View File

@ -24,17 +24,7 @@ class ClickhouseSessionRecordingList(ClickhouseEventQuery):
_filter: SessionRecordingsFilter
SESSION_RECORDINGS_DEFAULT_LIMIT = 50
_session_recordings_query_with_entity_filter: str = """
SELECT
session_recordings.session_id,
any(session_recordings.start_time) as start_time,
any(session_recordings.end_time) as end_time,
any(session_recordings.duration) as duration,
any(session_recordings.distinct_id) as distinct_id,
arrayElement(groupArray(current_url), 1) as start_url,
arrayElement(groupArray(current_url), -1) as end_url
{event_filter_aggregate_select_clause}
FROM (
_core_events_query = """
SELECT
distinct_id,
event,
@ -46,8 +36,29 @@ class ClickhouseSessionRecordingList(ClickhouseEventQuery):
team_id = %(team_id)s
{event_filter_where_conditions}
{events_timestamp_clause}
) AS events
RIGHT OUTER JOIN (
"""
_event_and_recording_match_conditions_clause = """
(
-- If there is a window_id on the recording, then it is newer data and we can match
-- the recording and events on session_id
(
notEmpty(session_recordings.window_id) AND
events.session_id == session_recordings.session_id
) OR
-- If there's no window_id on the recording, then it is older data and we should match
-- events and recordings on timestamps
(
empty(session_recordings.window_id) AND
(
events.timestamp >= session_recordings.start_time
AND events.timestamp <= session_recordings.end_time
)
)
)
"""
_core_session_recordings_query = """
SELECT
session_id,
any(window_id) as window_id,
@ -64,6 +75,21 @@ class ClickhouseSessionRecordingList(ClickhouseEventQuery):
HAVING full_snapshots > 0
{recording_start_time_clause}
{duration_clause}
"""
_session_recordings_query_with_events: str = """
SELECT
session_recordings.session_id,
any(session_recordings.start_time) as start_time,
any(session_recordings.end_time) as end_time,
any(session_recordings.duration) as duration,
any(session_recordings.distinct_id) as distinct_id
{event_filter_aggregate_select_clause}
FROM (
{core_events_query}
) AS events
JOIN (
{core_recordings_query}
) AS session_recordings
ON session_recordings.distinct_id = events.distinct_id
JOIN (
@ -72,25 +98,7 @@ class ClickhouseSessionRecordingList(ClickhouseEventQuery):
ON pdi.distinct_id = session_recordings.distinct_id
{person_query}
WHERE
(
-- If there is a window_id on the recording, then it is newer data and we can match
-- the recording and events on session_id
(
notEmpty(session_recordings.window_id) AND
events.session_id == session_recordings.session_id
) OR
-- If there is no window_id on the recording, then it is older data and we should match
-- events and recordings on timestamps
(
empty(session_recordings.window_id) AND
(
events.timestamp >= session_recordings.start_time
AND events.timestamp <= session_recordings.end_time
)
) OR
-- If there are no event matches, we don't want to filter out the recording itself
empty(events.event)
)
{event_and_recording_match_comditions_clause}
{prop_filter_clause}
{person_id_clause}
GROUP BY session_recordings.session_id
@ -100,6 +108,29 @@ class ClickhouseSessionRecordingList(ClickhouseEventQuery):
LIMIT %(limit)s OFFSET %(offset)s
"""
_session_recordings_query: str = """
SELECT
session_recordings.session_id,
any(session_recordings.start_time) as start_time,
any(session_recordings.end_time) as end_time,
any(session_recordings.duration) as duration,
any(session_recordings.distinct_id) as distinct_id
FROM (
{core_recordings_query}
) AS session_recordings
JOIN (
{person_distinct_id_query}
) as pdi
ON pdi.distinct_id = session_recordings.distinct_id
{person_query}
WHERE 1 = 1
{prop_filter_clause}
{person_id_clause}
GROUP BY session_recordings.session_id
ORDER BY start_time DESC
LIMIT %(limit)s OFFSET %(offset)s
"""
@property
def limit(self):
return self._filter.limit or self.SESSION_RECORDINGS_DEFAULT_LIMIT
@ -115,11 +146,12 @@ class ClickhouseSessionRecordingList(ClickhouseEventQuery):
self._should_join_persons = True
return
def _determine_should_join_events(self):
return self._filter.entities and len(self._filter.entities) > 0
def _get_properties_select_clause(self) -> str:
current_url_clause, _ = get_property_string_expr("events", "$current_url", "'$current_url'", "properties")
session_id_clause, _ = get_property_string_expr("events", "$session_id", "'$session_id'", "properties")
clause = f""",
{current_url_clause} as current_url,
{session_id_clause} as session_id
"""
clause += (
@ -132,9 +164,6 @@ class ClickhouseSessionRecordingList(ClickhouseEventQuery):
)
return clause
def _has_entity_filters(self):
return self._filter.entities and len(self._filter.entities) > 0
def _get_person_id_clause(self) -> Tuple[str, Dict[str, Any]]:
person_id_clause = ""
person_id_params = {}
@ -198,12 +227,11 @@ class ClickhouseSessionRecordingList(ClickhouseEventQuery):
return filter_sql, params
def format_event_filters(self) -> EventFiltersSQL:
aggregate_select_clause = ""
aggregate_having_clause = ""
where_conditions = "AND event IN %(event_names)s"
# Always include $pageview events so the start_url and end_url can be extracted
event_names_to_filter: List[Union[int, str]] = ["$pageview"]
event_names_to_filter: List[Union[int, str]] = []
params: Dict = {}
@ -227,9 +255,7 @@ class ClickhouseSessionRecordingList(ClickhouseEventQuery):
return EventFiltersSQL(aggregate_select_clause, aggregate_having_clause, where_conditions, params,)
def get_query(self) -> Tuple[str, Dict[str, Any]]:
offset = self._filter.offset or 0
# One more is added to the limit to check if there are more results available
base_params = {"team_id": self._team_id, "limit": self.limit + 1, "offset": offset}
person_query, person_query_params = self._get_person_query()
prop_query, prop_params = self._get_props(self._filter.properties)
@ -237,21 +263,52 @@ class ClickhouseSessionRecordingList(ClickhouseEventQuery):
recording_start_time_clause, recording_start_time_params = self._get_recording_start_time_clause()
person_id_clause, person_id_params = self._get_person_id_clause()
duration_clause, duration_params = self._get_duration_clause()
event_filters = self.format_event_filters()
properties_select_clause = self._get_properties_select_clause()
core_recordings_query = self._core_session_recordings_query.format(
recording_start_time_clause=recording_start_time_clause,
duration_clause=duration_clause,
events_timestamp_clause=events_timestamp_clause,
)
if not self._determine_should_join_events():
return (
self._session_recordings_query.format(
core_recordings_query=core_recordings_query,
person_distinct_id_query=GET_TEAM_PERSON_DISTINCT_IDS,
person_query=person_query,
prop_filter_clause=prop_query,
person_id_clause=person_id_clause,
),
{
**base_params,
**person_id_params,
**person_query_params,
**prop_params,
**events_timestamp_params,
**duration_params,
**recording_start_time_params,
},
)
event_filters = self.format_event_filters()
core_events_query = self._core_events_query.format(
properties_select_clause=properties_select_clause,
event_filter_where_conditions=event_filters.where_conditions,
events_timestamp_clause=events_timestamp_clause,
)
return (
self._session_recordings_query_with_entity_filter.format(
person_id_clause=person_id_clause,
prop_filter_clause=prop_query,
self._session_recordings_query_with_events.format(
event_filter_aggregate_select_clause=event_filters.aggregate_select_clause,
core_events_query=core_events_query,
core_recordings_query=core_recordings_query,
person_distinct_id_query=GET_TEAM_PERSON_DISTINCT_IDS,
person_query=person_query,
properties_select_clause=properties_select_clause,
events_timestamp_clause=events_timestamp_clause,
recording_start_time_clause=recording_start_time_clause,
duration_clause=duration_clause,
event_filter_where_conditions=event_filters.where_conditions,
event_filter_aggregate_select_clause=event_filters.aggregate_select_clause,
event_and_recording_match_comditions_clause=self._event_and_recording_match_conditions_clause,
prop_filter_clause=prop_query,
person_id_clause=person_id_clause,
event_filter_aggregate_having_clause=event_filters.aggregate_having_clause,
),
{

View File

@ -169,24 +169,8 @@
any(session_recordings.start_time) as start_time,
any(session_recordings.end_time) as end_time,
any(session_recordings.duration) as duration,
any(session_recordings.distinct_id) as distinct_id,
arrayElement(groupArray(current_url), 1) as start_url,
arrayElement(groupArray(current_url), -1) as end_url
any(session_recordings.distinct_id) as distinct_id
FROM
(SELECT distinct_id,
event,
team_id,
timestamp ,
trim(BOTH '"'
FROM JSONExtractRaw(properties, '$current_url')) as current_url,
trim(BOTH '"'
FROM JSONExtractRaw(properties, '$session_id')) as session_id
FROM events
WHERE team_id = 2
AND event IN ['$pageview']
AND timestamp >= '2021-08-13 12:00:00'
AND timestamp <= '2021-08-22 08:00:00' ) AS events
RIGHT OUTER JOIN
(SELECT session_id,
any(window_id) as window_id,
MIN(timestamp) AS start_time,
@ -201,7 +185,7 @@
GROUP BY session_id
HAVING full_snapshots > 0
AND start_time >= '2021-08-14 00:00:00'
AND start_time <= '2021-08-21 20:00:00') AS session_recordings ON session_recordings.distinct_id = events.distinct_id
AND start_time <= '2021-08-21 20:00:00') AS session_recordings
JOIN
(SELECT distinct_id,
argMax(person_id, _timestamp) as person_id
@ -222,12 +206,7 @@
WHERE team_id = 2
GROUP BY id
HAVING max(is_deleted) = 0) person ON person.id = pdi.person_id
WHERE ((notEmpty(session_recordings.window_id)
AND events.session_id == session_recordings.session_id)
OR (empty(session_recordings.window_id)
AND (events.timestamp >= session_recordings.start_time
AND events.timestamp <= session_recordings.end_time))
OR empty(events.event))
WHERE 1 = 1
AND person_id IN
(SELECT person_id
FROM cohortpeople
@ -238,7 +217,6 @@
team_id
HAVING sum(sign) > 0)
GROUP BY session_recordings.session_id
HAVING 1 = 1
ORDER BY start_time DESC
LIMIT 51
OFFSET 0
@ -251,17 +229,13 @@
any(session_recordings.start_time) as start_time,
any(session_recordings.end_time) as end_time,
any(session_recordings.duration) as duration,
any(session_recordings.distinct_id) as distinct_id,
arrayElement(groupArray(current_url), 1) as start_url,
arrayElement(groupArray(current_url), -1) as end_url ,
any(session_recordings.distinct_id) as distinct_id ,
sum(if(event = '$pageview', 1, 0)) as count_event_match_0
FROM
(SELECT distinct_id,
event,
team_id,
timestamp ,
trim(BOTH '"'
FROM JSONExtractRaw(properties, '$current_url')) as current_url,
trim(BOTH '"'
FROM JSONExtractRaw(properties, '$session_id')) as session_id
FROM events
@ -269,7 +243,7 @@
AND event IN ['$pageview']
AND timestamp >= '2021-01-13 12:00:00'
AND timestamp <= '2021-01-22 08:00:00' ) AS events
RIGHT OUTER JOIN
JOIN
(SELECT session_id,
any(window_id) as window_id,
MIN(timestamp) AS start_time,
@ -303,8 +277,7 @@
AND events.session_id == session_recordings.session_id)
OR (empty(session_recordings.window_id)
AND (events.timestamp >= session_recordings.start_time
AND events.timestamp <= session_recordings.end_time))
OR empty(events.event))
AND events.timestamp <= session_recordings.end_time)))
GROUP BY session_recordings.session_id
HAVING 1 = 1
AND count_event_match_0 > 0
@ -320,25 +293,21 @@
any(session_recordings.start_time) as start_time,
any(session_recordings.end_time) as end_time,
any(session_recordings.duration) as duration,
any(session_recordings.distinct_id) as distinct_id,
arrayElement(groupArray(current_url), 1) as start_url,
arrayElement(groupArray(current_url), -1) as end_url ,
any(session_recordings.distinct_id) as distinct_id ,
sum(if(event = '$autocapture', 1, 0)) as count_event_match_0
FROM
(SELECT distinct_id,
event,
team_id,
timestamp ,
trim(BOTH '"'
FROM JSONExtractRaw(properties, '$current_url')) as current_url,
trim(BOTH '"'
FROM JSONExtractRaw(properties, '$session_id')) as session_id
FROM events
WHERE team_id = 2
AND event IN ['$pageview', '$autocapture']
AND event IN ['$autocapture']
AND timestamp >= '2021-01-13 12:00:00'
AND timestamp <= '2021-01-22 08:00:00' ) AS events
RIGHT OUTER JOIN
JOIN
(SELECT session_id,
any(window_id) as window_id,
MIN(timestamp) AS start_time,
@ -372,8 +341,7 @@
AND events.session_id == session_recordings.session_id)
OR (empty(session_recordings.window_id)
AND (events.timestamp >= session_recordings.start_time
AND events.timestamp <= session_recordings.end_time))
OR empty(events.event))
AND events.timestamp <= session_recordings.end_time)))
GROUP BY session_recordings.session_id
HAVING 1 = 1
AND count_event_match_0 > 0
@ -389,24 +357,8 @@
any(session_recordings.start_time) as start_time,
any(session_recordings.end_time) as end_time,
any(session_recordings.duration) as duration,
any(session_recordings.distinct_id) as distinct_id,
arrayElement(groupArray(current_url), 1) as start_url,
arrayElement(groupArray(current_url), -1) as end_url
any(session_recordings.distinct_id) as distinct_id
FROM
(SELECT distinct_id,
event,
team_id,
timestamp ,
trim(BOTH '"'
FROM JSONExtractRaw(properties, '$current_url')) as current_url,
trim(BOTH '"'
FROM JSONExtractRaw(properties, '$session_id')) as session_id
FROM events
WHERE team_id = 2
AND event IN ['$pageview']
AND timestamp >= '2021-01-13 12:00:00'
AND timestamp <= '2021-01-22 08:00:00' ) AS events
RIGHT OUTER JOIN
(SELECT session_id,
any(window_id) as window_id,
MIN(timestamp) AS start_time,
@ -421,7 +373,7 @@
GROUP BY session_id
HAVING full_snapshots > 0
AND start_time >= '2021-01-14 00:00:00'
AND start_time <= '2021-01-21 20:00:00') AS session_recordings ON session_recordings.distinct_id = events.distinct_id
AND start_time <= '2021-01-21 20:00:00') AS session_recordings
JOIN
(SELECT distinct_id,
argMax(person_id, _timestamp) as person_id
@ -444,14 +396,8 @@
HAVING max(is_deleted) = 0
AND has(['bla'], trim(BOTH '"'
FROM JSONExtractRaw(argMax(person.properties, _timestamp), 'email')))) person ON person.id = pdi.person_id
WHERE ((notEmpty(session_recordings.window_id)
AND events.session_id == session_recordings.session_id)
OR (empty(session_recordings.window_id)
AND (events.timestamp >= session_recordings.start_time
AND events.timestamp <= session_recordings.end_time))
OR empty(events.event))
WHERE 1 = 1
GROUP BY session_recordings.session_id
HAVING 1 = 1
ORDER BY start_time DESC
LIMIT 51
OFFSET 0

View File

@ -86,9 +86,6 @@ class TestClickhouseSessionRecordingsList(ClickhouseTestMixin, factory_session_r
self.create_event("user", self.base_time, properties={"$session_id": "1"})
self.create_event("user", self.base_time, event_name="$autocapture", properties={"$session_id": "2"})
self.create_snapshot("user", "1", self.base_time + relativedelta(seconds=30), window_id="1")
# Filter for recordings that contain a pageview event - will get recording because the $pageview event
# and recording share a session id
filter = SessionRecordingsFilter(
team=self.team, data={"events": [{"id": "$pageview", "type": "events", "order": 0, "name": "$pageview"}]},
)
@ -97,8 +94,6 @@ class TestClickhouseSessionRecordingsList(ClickhouseTestMixin, factory_session_r
self.assertEqual(len(session_recordings), 1)
self.assertEqual(session_recordings[0]["session_id"], "1")
# Filter for recordings that contain an autocapture event - will not get the recording because the $autocapture event
# and recording have different session ids
filter = SessionRecordingsFilter(
team=self.team,
data={"events": [{"id": "$autocapture", "type": "events", "order": 0, "name": "$autocapture"}]},