mirror of
https://github.com/PostHog/posthog.git
synced 2024-11-21 13:39:22 +01:00
Only include events in session recording query when needed (#7248)
* query for events with matching session_id if possible * move test to clickhouse * move event list to list instead of set for flaky test snapshot * join events only when needed * trigger benchmarking * typo
This commit is contained in:
parent
7520aca027
commit
6fc64efafa
@ -24,17 +24,7 @@ class ClickhouseSessionRecordingList(ClickhouseEventQuery):
|
||||
_filter: SessionRecordingsFilter
|
||||
SESSION_RECORDINGS_DEFAULT_LIMIT = 50
|
||||
|
||||
_session_recordings_query_with_entity_filter: str = """
|
||||
SELECT
|
||||
session_recordings.session_id,
|
||||
any(session_recordings.start_time) as start_time,
|
||||
any(session_recordings.end_time) as end_time,
|
||||
any(session_recordings.duration) as duration,
|
||||
any(session_recordings.distinct_id) as distinct_id,
|
||||
arrayElement(groupArray(current_url), 1) as start_url,
|
||||
arrayElement(groupArray(current_url), -1) as end_url
|
||||
{event_filter_aggregate_select_clause}
|
||||
FROM (
|
||||
_core_events_query = """
|
||||
SELECT
|
||||
distinct_id,
|
||||
event,
|
||||
@ -46,8 +36,29 @@ class ClickhouseSessionRecordingList(ClickhouseEventQuery):
|
||||
team_id = %(team_id)s
|
||||
{event_filter_where_conditions}
|
||||
{events_timestamp_clause}
|
||||
) AS events
|
||||
RIGHT OUTER JOIN (
|
||||
"""
|
||||
|
||||
_event_and_recording_match_conditions_clause = """
|
||||
(
|
||||
-- If there is a window_id on the recording, then it is newer data and we can match
|
||||
-- the recording and events on session_id
|
||||
(
|
||||
notEmpty(session_recordings.window_id) AND
|
||||
events.session_id == session_recordings.session_id
|
||||
) OR
|
||||
-- If there's no window_id on the recording, then it is older data and we should match
|
||||
-- events and recordings on timestamps
|
||||
(
|
||||
empty(session_recordings.window_id) AND
|
||||
(
|
||||
events.timestamp >= session_recordings.start_time
|
||||
AND events.timestamp <= session_recordings.end_time
|
||||
)
|
||||
)
|
||||
)
|
||||
"""
|
||||
|
||||
_core_session_recordings_query = """
|
||||
SELECT
|
||||
session_id,
|
||||
any(window_id) as window_id,
|
||||
@ -64,6 +75,21 @@ class ClickhouseSessionRecordingList(ClickhouseEventQuery):
|
||||
HAVING full_snapshots > 0
|
||||
{recording_start_time_clause}
|
||||
{duration_clause}
|
||||
"""
|
||||
|
||||
_session_recordings_query_with_events: str = """
|
||||
SELECT
|
||||
session_recordings.session_id,
|
||||
any(session_recordings.start_time) as start_time,
|
||||
any(session_recordings.end_time) as end_time,
|
||||
any(session_recordings.duration) as duration,
|
||||
any(session_recordings.distinct_id) as distinct_id
|
||||
{event_filter_aggregate_select_clause}
|
||||
FROM (
|
||||
{core_events_query}
|
||||
) AS events
|
||||
JOIN (
|
||||
{core_recordings_query}
|
||||
) AS session_recordings
|
||||
ON session_recordings.distinct_id = events.distinct_id
|
||||
JOIN (
|
||||
@ -72,25 +98,7 @@ class ClickhouseSessionRecordingList(ClickhouseEventQuery):
|
||||
ON pdi.distinct_id = session_recordings.distinct_id
|
||||
{person_query}
|
||||
WHERE
|
||||
(
|
||||
-- If there is a window_id on the recording, then it is newer data and we can match
|
||||
-- the recording and events on session_id
|
||||
(
|
||||
notEmpty(session_recordings.window_id) AND
|
||||
events.session_id == session_recordings.session_id
|
||||
) OR
|
||||
-- If there is no window_id on the recording, then it is older data and we should match
|
||||
-- events and recordings on timestamps
|
||||
(
|
||||
empty(session_recordings.window_id) AND
|
||||
(
|
||||
events.timestamp >= session_recordings.start_time
|
||||
AND events.timestamp <= session_recordings.end_time
|
||||
)
|
||||
) OR
|
||||
-- If there are no event matches, we don't want to filter out the recording itself
|
||||
empty(events.event)
|
||||
)
|
||||
{event_and_recording_match_comditions_clause}
|
||||
{prop_filter_clause}
|
||||
{person_id_clause}
|
||||
GROUP BY session_recordings.session_id
|
||||
@ -100,6 +108,29 @@ class ClickhouseSessionRecordingList(ClickhouseEventQuery):
|
||||
LIMIT %(limit)s OFFSET %(offset)s
|
||||
"""
|
||||
|
||||
_session_recordings_query: str = """
|
||||
SELECT
|
||||
session_recordings.session_id,
|
||||
any(session_recordings.start_time) as start_time,
|
||||
any(session_recordings.end_time) as end_time,
|
||||
any(session_recordings.duration) as duration,
|
||||
any(session_recordings.distinct_id) as distinct_id
|
||||
FROM (
|
||||
{core_recordings_query}
|
||||
) AS session_recordings
|
||||
JOIN (
|
||||
{person_distinct_id_query}
|
||||
) as pdi
|
||||
ON pdi.distinct_id = session_recordings.distinct_id
|
||||
{person_query}
|
||||
WHERE 1 = 1
|
||||
{prop_filter_clause}
|
||||
{person_id_clause}
|
||||
GROUP BY session_recordings.session_id
|
||||
ORDER BY start_time DESC
|
||||
LIMIT %(limit)s OFFSET %(offset)s
|
||||
"""
|
||||
|
||||
@property
|
||||
def limit(self):
|
||||
return self._filter.limit or self.SESSION_RECORDINGS_DEFAULT_LIMIT
|
||||
@ -115,11 +146,12 @@ class ClickhouseSessionRecordingList(ClickhouseEventQuery):
|
||||
self._should_join_persons = True
|
||||
return
|
||||
|
||||
def _determine_should_join_events(self):
|
||||
return self._filter.entities and len(self._filter.entities) > 0
|
||||
|
||||
def _get_properties_select_clause(self) -> str:
|
||||
current_url_clause, _ = get_property_string_expr("events", "$current_url", "'$current_url'", "properties")
|
||||
session_id_clause, _ = get_property_string_expr("events", "$session_id", "'$session_id'", "properties")
|
||||
clause = f""",
|
||||
{current_url_clause} as current_url,
|
||||
{session_id_clause} as session_id
|
||||
"""
|
||||
clause += (
|
||||
@ -132,9 +164,6 @@ class ClickhouseSessionRecordingList(ClickhouseEventQuery):
|
||||
)
|
||||
return clause
|
||||
|
||||
def _has_entity_filters(self):
|
||||
return self._filter.entities and len(self._filter.entities) > 0
|
||||
|
||||
def _get_person_id_clause(self) -> Tuple[str, Dict[str, Any]]:
|
||||
person_id_clause = ""
|
||||
person_id_params = {}
|
||||
@ -198,12 +227,11 @@ class ClickhouseSessionRecordingList(ClickhouseEventQuery):
|
||||
return filter_sql, params
|
||||
|
||||
def format_event_filters(self) -> EventFiltersSQL:
|
||||
|
||||
aggregate_select_clause = ""
|
||||
aggregate_having_clause = ""
|
||||
where_conditions = "AND event IN %(event_names)s"
|
||||
# Always include $pageview events so the start_url and end_url can be extracted
|
||||
event_names_to_filter: List[Union[int, str]] = ["$pageview"]
|
||||
event_names_to_filter: List[Union[int, str]] = []
|
||||
|
||||
params: Dict = {}
|
||||
|
||||
@ -227,9 +255,7 @@ class ClickhouseSessionRecordingList(ClickhouseEventQuery):
|
||||
return EventFiltersSQL(aggregate_select_clause, aggregate_having_clause, where_conditions, params,)
|
||||
|
||||
def get_query(self) -> Tuple[str, Dict[str, Any]]:
|
||||
|
||||
offset = self._filter.offset or 0
|
||||
# One more is added to the limit to check if there are more results available
|
||||
base_params = {"team_id": self._team_id, "limit": self.limit + 1, "offset": offset}
|
||||
person_query, person_query_params = self._get_person_query()
|
||||
prop_query, prop_params = self._get_props(self._filter.properties)
|
||||
@ -237,21 +263,52 @@ class ClickhouseSessionRecordingList(ClickhouseEventQuery):
|
||||
recording_start_time_clause, recording_start_time_params = self._get_recording_start_time_clause()
|
||||
person_id_clause, person_id_params = self._get_person_id_clause()
|
||||
duration_clause, duration_params = self._get_duration_clause()
|
||||
event_filters = self.format_event_filters()
|
||||
properties_select_clause = self._get_properties_select_clause()
|
||||
|
||||
core_recordings_query = self._core_session_recordings_query.format(
|
||||
recording_start_time_clause=recording_start_time_clause,
|
||||
duration_clause=duration_clause,
|
||||
events_timestamp_clause=events_timestamp_clause,
|
||||
)
|
||||
|
||||
if not self._determine_should_join_events():
|
||||
return (
|
||||
self._session_recordings_query.format(
|
||||
core_recordings_query=core_recordings_query,
|
||||
person_distinct_id_query=GET_TEAM_PERSON_DISTINCT_IDS,
|
||||
person_query=person_query,
|
||||
prop_filter_clause=prop_query,
|
||||
person_id_clause=person_id_clause,
|
||||
),
|
||||
{
|
||||
**base_params,
|
||||
**person_id_params,
|
||||
**person_query_params,
|
||||
**prop_params,
|
||||
**events_timestamp_params,
|
||||
**duration_params,
|
||||
**recording_start_time_params,
|
||||
},
|
||||
)
|
||||
|
||||
event_filters = self.format_event_filters()
|
||||
|
||||
core_events_query = self._core_events_query.format(
|
||||
properties_select_clause=properties_select_clause,
|
||||
event_filter_where_conditions=event_filters.where_conditions,
|
||||
events_timestamp_clause=events_timestamp_clause,
|
||||
)
|
||||
|
||||
return (
|
||||
self._session_recordings_query_with_entity_filter.format(
|
||||
person_id_clause=person_id_clause,
|
||||
prop_filter_clause=prop_query,
|
||||
self._session_recordings_query_with_events.format(
|
||||
event_filter_aggregate_select_clause=event_filters.aggregate_select_clause,
|
||||
core_events_query=core_events_query,
|
||||
core_recordings_query=core_recordings_query,
|
||||
person_distinct_id_query=GET_TEAM_PERSON_DISTINCT_IDS,
|
||||
person_query=person_query,
|
||||
properties_select_clause=properties_select_clause,
|
||||
events_timestamp_clause=events_timestamp_clause,
|
||||
recording_start_time_clause=recording_start_time_clause,
|
||||
duration_clause=duration_clause,
|
||||
event_filter_where_conditions=event_filters.where_conditions,
|
||||
event_filter_aggregate_select_clause=event_filters.aggregate_select_clause,
|
||||
event_and_recording_match_comditions_clause=self._event_and_recording_match_conditions_clause,
|
||||
prop_filter_clause=prop_query,
|
||||
person_id_clause=person_id_clause,
|
||||
event_filter_aggregate_having_clause=event_filters.aggregate_having_clause,
|
||||
),
|
||||
{
|
||||
|
@ -169,24 +169,8 @@
|
||||
any(session_recordings.start_time) as start_time,
|
||||
any(session_recordings.end_time) as end_time,
|
||||
any(session_recordings.duration) as duration,
|
||||
any(session_recordings.distinct_id) as distinct_id,
|
||||
arrayElement(groupArray(current_url), 1) as start_url,
|
||||
arrayElement(groupArray(current_url), -1) as end_url
|
||||
any(session_recordings.distinct_id) as distinct_id
|
||||
FROM
|
||||
(SELECT distinct_id,
|
||||
event,
|
||||
team_id,
|
||||
timestamp ,
|
||||
trim(BOTH '"'
|
||||
FROM JSONExtractRaw(properties, '$current_url')) as current_url,
|
||||
trim(BOTH '"'
|
||||
FROM JSONExtractRaw(properties, '$session_id')) as session_id
|
||||
FROM events
|
||||
WHERE team_id = 2
|
||||
AND event IN ['$pageview']
|
||||
AND timestamp >= '2021-08-13 12:00:00'
|
||||
AND timestamp <= '2021-08-22 08:00:00' ) AS events
|
||||
RIGHT OUTER JOIN
|
||||
(SELECT session_id,
|
||||
any(window_id) as window_id,
|
||||
MIN(timestamp) AS start_time,
|
||||
@ -201,7 +185,7 @@
|
||||
GROUP BY session_id
|
||||
HAVING full_snapshots > 0
|
||||
AND start_time >= '2021-08-14 00:00:00'
|
||||
AND start_time <= '2021-08-21 20:00:00') AS session_recordings ON session_recordings.distinct_id = events.distinct_id
|
||||
AND start_time <= '2021-08-21 20:00:00') AS session_recordings
|
||||
JOIN
|
||||
(SELECT distinct_id,
|
||||
argMax(person_id, _timestamp) as person_id
|
||||
@ -222,12 +206,7 @@
|
||||
WHERE team_id = 2
|
||||
GROUP BY id
|
||||
HAVING max(is_deleted) = 0) person ON person.id = pdi.person_id
|
||||
WHERE ((notEmpty(session_recordings.window_id)
|
||||
AND events.session_id == session_recordings.session_id)
|
||||
OR (empty(session_recordings.window_id)
|
||||
AND (events.timestamp >= session_recordings.start_time
|
||||
AND events.timestamp <= session_recordings.end_time))
|
||||
OR empty(events.event))
|
||||
WHERE 1 = 1
|
||||
AND person_id IN
|
||||
(SELECT person_id
|
||||
FROM cohortpeople
|
||||
@ -238,7 +217,6 @@
|
||||
team_id
|
||||
HAVING sum(sign) > 0)
|
||||
GROUP BY session_recordings.session_id
|
||||
HAVING 1 = 1
|
||||
ORDER BY start_time DESC
|
||||
LIMIT 51
|
||||
OFFSET 0
|
||||
@ -251,17 +229,13 @@
|
||||
any(session_recordings.start_time) as start_time,
|
||||
any(session_recordings.end_time) as end_time,
|
||||
any(session_recordings.duration) as duration,
|
||||
any(session_recordings.distinct_id) as distinct_id,
|
||||
arrayElement(groupArray(current_url), 1) as start_url,
|
||||
arrayElement(groupArray(current_url), -1) as end_url ,
|
||||
any(session_recordings.distinct_id) as distinct_id ,
|
||||
sum(if(event = '$pageview', 1, 0)) as count_event_match_0
|
||||
FROM
|
||||
(SELECT distinct_id,
|
||||
event,
|
||||
team_id,
|
||||
timestamp ,
|
||||
trim(BOTH '"'
|
||||
FROM JSONExtractRaw(properties, '$current_url')) as current_url,
|
||||
trim(BOTH '"'
|
||||
FROM JSONExtractRaw(properties, '$session_id')) as session_id
|
||||
FROM events
|
||||
@ -269,7 +243,7 @@
|
||||
AND event IN ['$pageview']
|
||||
AND timestamp >= '2021-01-13 12:00:00'
|
||||
AND timestamp <= '2021-01-22 08:00:00' ) AS events
|
||||
RIGHT OUTER JOIN
|
||||
JOIN
|
||||
(SELECT session_id,
|
||||
any(window_id) as window_id,
|
||||
MIN(timestamp) AS start_time,
|
||||
@ -303,8 +277,7 @@
|
||||
AND events.session_id == session_recordings.session_id)
|
||||
OR (empty(session_recordings.window_id)
|
||||
AND (events.timestamp >= session_recordings.start_time
|
||||
AND events.timestamp <= session_recordings.end_time))
|
||||
OR empty(events.event))
|
||||
AND events.timestamp <= session_recordings.end_time)))
|
||||
GROUP BY session_recordings.session_id
|
||||
HAVING 1 = 1
|
||||
AND count_event_match_0 > 0
|
||||
@ -320,25 +293,21 @@
|
||||
any(session_recordings.start_time) as start_time,
|
||||
any(session_recordings.end_time) as end_time,
|
||||
any(session_recordings.duration) as duration,
|
||||
any(session_recordings.distinct_id) as distinct_id,
|
||||
arrayElement(groupArray(current_url), 1) as start_url,
|
||||
arrayElement(groupArray(current_url), -1) as end_url ,
|
||||
any(session_recordings.distinct_id) as distinct_id ,
|
||||
sum(if(event = '$autocapture', 1, 0)) as count_event_match_0
|
||||
FROM
|
||||
(SELECT distinct_id,
|
||||
event,
|
||||
team_id,
|
||||
timestamp ,
|
||||
trim(BOTH '"'
|
||||
FROM JSONExtractRaw(properties, '$current_url')) as current_url,
|
||||
trim(BOTH '"'
|
||||
FROM JSONExtractRaw(properties, '$session_id')) as session_id
|
||||
FROM events
|
||||
WHERE team_id = 2
|
||||
AND event IN ['$pageview', '$autocapture']
|
||||
AND event IN ['$autocapture']
|
||||
AND timestamp >= '2021-01-13 12:00:00'
|
||||
AND timestamp <= '2021-01-22 08:00:00' ) AS events
|
||||
RIGHT OUTER JOIN
|
||||
JOIN
|
||||
(SELECT session_id,
|
||||
any(window_id) as window_id,
|
||||
MIN(timestamp) AS start_time,
|
||||
@ -372,8 +341,7 @@
|
||||
AND events.session_id == session_recordings.session_id)
|
||||
OR (empty(session_recordings.window_id)
|
||||
AND (events.timestamp >= session_recordings.start_time
|
||||
AND events.timestamp <= session_recordings.end_time))
|
||||
OR empty(events.event))
|
||||
AND events.timestamp <= session_recordings.end_time)))
|
||||
GROUP BY session_recordings.session_id
|
||||
HAVING 1 = 1
|
||||
AND count_event_match_0 > 0
|
||||
@ -389,24 +357,8 @@
|
||||
any(session_recordings.start_time) as start_time,
|
||||
any(session_recordings.end_time) as end_time,
|
||||
any(session_recordings.duration) as duration,
|
||||
any(session_recordings.distinct_id) as distinct_id,
|
||||
arrayElement(groupArray(current_url), 1) as start_url,
|
||||
arrayElement(groupArray(current_url), -1) as end_url
|
||||
any(session_recordings.distinct_id) as distinct_id
|
||||
FROM
|
||||
(SELECT distinct_id,
|
||||
event,
|
||||
team_id,
|
||||
timestamp ,
|
||||
trim(BOTH '"'
|
||||
FROM JSONExtractRaw(properties, '$current_url')) as current_url,
|
||||
trim(BOTH '"'
|
||||
FROM JSONExtractRaw(properties, '$session_id')) as session_id
|
||||
FROM events
|
||||
WHERE team_id = 2
|
||||
AND event IN ['$pageview']
|
||||
AND timestamp >= '2021-01-13 12:00:00'
|
||||
AND timestamp <= '2021-01-22 08:00:00' ) AS events
|
||||
RIGHT OUTER JOIN
|
||||
(SELECT session_id,
|
||||
any(window_id) as window_id,
|
||||
MIN(timestamp) AS start_time,
|
||||
@ -421,7 +373,7 @@
|
||||
GROUP BY session_id
|
||||
HAVING full_snapshots > 0
|
||||
AND start_time >= '2021-01-14 00:00:00'
|
||||
AND start_time <= '2021-01-21 20:00:00') AS session_recordings ON session_recordings.distinct_id = events.distinct_id
|
||||
AND start_time <= '2021-01-21 20:00:00') AS session_recordings
|
||||
JOIN
|
||||
(SELECT distinct_id,
|
||||
argMax(person_id, _timestamp) as person_id
|
||||
@ -444,14 +396,8 @@
|
||||
HAVING max(is_deleted) = 0
|
||||
AND has(['bla'], trim(BOTH '"'
|
||||
FROM JSONExtractRaw(argMax(person.properties, _timestamp), 'email')))) person ON person.id = pdi.person_id
|
||||
WHERE ((notEmpty(session_recordings.window_id)
|
||||
AND events.session_id == session_recordings.session_id)
|
||||
OR (empty(session_recordings.window_id)
|
||||
AND (events.timestamp >= session_recordings.start_time
|
||||
AND events.timestamp <= session_recordings.end_time))
|
||||
OR empty(events.event))
|
||||
WHERE 1 = 1
|
||||
GROUP BY session_recordings.session_id
|
||||
HAVING 1 = 1
|
||||
ORDER BY start_time DESC
|
||||
LIMIT 51
|
||||
OFFSET 0
|
||||
|
@ -86,9 +86,6 @@ class TestClickhouseSessionRecordingsList(ClickhouseTestMixin, factory_session_r
|
||||
self.create_event("user", self.base_time, properties={"$session_id": "1"})
|
||||
self.create_event("user", self.base_time, event_name="$autocapture", properties={"$session_id": "2"})
|
||||
self.create_snapshot("user", "1", self.base_time + relativedelta(seconds=30), window_id="1")
|
||||
|
||||
# Filter for recordings that contain a pageview event - will get recording because the $pageview event
|
||||
# and recording share a session id
|
||||
filter = SessionRecordingsFilter(
|
||||
team=self.team, data={"events": [{"id": "$pageview", "type": "events", "order": 0, "name": "$pageview"}]},
|
||||
)
|
||||
@ -97,8 +94,6 @@ class TestClickhouseSessionRecordingsList(ClickhouseTestMixin, factory_session_r
|
||||
self.assertEqual(len(session_recordings), 1)
|
||||
self.assertEqual(session_recordings[0]["session_id"], "1")
|
||||
|
||||
# Filter for recordings that contain an autocapture event - will not get the recording because the $autocapture event
|
||||
# and recording have different session ids
|
||||
filter = SessionRecordingsFilter(
|
||||
team=self.team,
|
||||
data={"events": [{"id": "$autocapture", "type": "events", "order": 0, "name": "$autocapture"}]},
|
||||
|
Loading…
Reference in New Issue
Block a user