diff --git a/ee/clickhouse/queries/session_recordings/clickhouse_session_recording_list.py b/ee/clickhouse/queries/session_recordings/clickhouse_session_recording_list.py index ca6cb11409f..d6177aaf246 100644 --- a/ee/clickhouse/queries/session_recordings/clickhouse_session_recording_list.py +++ b/ee/clickhouse/queries/session_recordings/clickhouse_session_recording_list.py @@ -24,17 +24,7 @@ class ClickhouseSessionRecordingList(ClickhouseEventQuery): _filter: SessionRecordingsFilter SESSION_RECORDINGS_DEFAULT_LIMIT = 50 - _session_recordings_query_with_entity_filter: str = """ - SELECT - session_recordings.session_id, - any(session_recordings.start_time) as start_time, - any(session_recordings.end_time) as end_time, - any(session_recordings.duration) as duration, - any(session_recordings.distinct_id) as distinct_id, - arrayElement(groupArray(current_url), 1) as start_url, - arrayElement(groupArray(current_url), -1) as end_url - {event_filter_aggregate_select_clause} - FROM ( + _core_events_query = """ SELECT distinct_id, event, @@ -46,8 +36,29 @@ class ClickhouseSessionRecordingList(ClickhouseEventQuery): team_id = %(team_id)s {event_filter_where_conditions} {events_timestamp_clause} - ) AS events - RIGHT OUTER JOIN ( + """ + + _event_and_recording_match_conditions_clause = """ + ( + -- If there is a window_id on the recording, then it is newer data and we can match + -- the recording and events on session_id + ( + notEmpty(session_recordings.window_id) AND + events.session_id == session_recordings.session_id + ) OR + -- If there's no window_id on the recording, then it is older data and we should match + -- events and recordings on timestamps + ( + empty(session_recordings.window_id) AND + ( + events.timestamp >= session_recordings.start_time + AND events.timestamp <= session_recordings.end_time + ) + ) + ) + """ + + _core_session_recordings_query = """ SELECT session_id, any(window_id) as window_id, @@ -64,6 +75,21 @@ class ClickhouseSessionRecordingList(ClickhouseEventQuery): HAVING full_snapshots > 0 {recording_start_time_clause} {duration_clause} + """ + + _session_recordings_query_with_events: str = """ + SELECT + session_recordings.session_id, + any(session_recordings.start_time) as start_time, + any(session_recordings.end_time) as end_time, + any(session_recordings.duration) as duration, + any(session_recordings.distinct_id) as distinct_id + {event_filter_aggregate_select_clause} + FROM ( + {core_events_query} + ) AS events + JOIN ( + {core_recordings_query} ) AS session_recordings ON session_recordings.distinct_id = events.distinct_id JOIN ( @@ -72,25 +98,7 @@ class ClickhouseSessionRecordingList(ClickhouseEventQuery): ON pdi.distinct_id = session_recordings.distinct_id {person_query} WHERE - ( - -- If there is a window_id on the recording, then it is newer data and we can match - -- the recording and events on session_id - ( - notEmpty(session_recordings.window_id) AND - events.session_id == session_recordings.session_id - ) OR - -- If there is no window_id on the recording, then it is older data and we should match - -- events and recordings on timestamps - ( - empty(session_recordings.window_id) AND - ( - events.timestamp >= session_recordings.start_time - AND events.timestamp <= session_recordings.end_time - ) - ) OR - -- If there are no event matches, we don't want to filter out the recording itself - empty(events.event) - ) + {event_and_recording_match_comditions_clause} {prop_filter_clause} {person_id_clause} GROUP BY session_recordings.session_id @@ -100,6 +108,29 @@ class ClickhouseSessionRecordingList(ClickhouseEventQuery): LIMIT %(limit)s OFFSET %(offset)s """ + _session_recordings_query: str = """ + SELECT + session_recordings.session_id, + any(session_recordings.start_time) as start_time, + any(session_recordings.end_time) as end_time, + any(session_recordings.duration) as duration, + any(session_recordings.distinct_id) as distinct_id + FROM ( + {core_recordings_query} + ) AS session_recordings + JOIN ( + {person_distinct_id_query} + ) as pdi + ON pdi.distinct_id = session_recordings.distinct_id + {person_query} + WHERE 1 = 1 + {prop_filter_clause} + {person_id_clause} + GROUP BY session_recordings.session_id + ORDER BY start_time DESC + LIMIT %(limit)s OFFSET %(offset)s + """ + @property def limit(self): return self._filter.limit or self.SESSION_RECORDINGS_DEFAULT_LIMIT @@ -115,11 +146,12 @@ class ClickhouseSessionRecordingList(ClickhouseEventQuery): self._should_join_persons = True return + def _determine_should_join_events(self): + return self._filter.entities and len(self._filter.entities) > 0 + def _get_properties_select_clause(self) -> str: - current_url_clause, _ = get_property_string_expr("events", "$current_url", "'$current_url'", "properties") session_id_clause, _ = get_property_string_expr("events", "$session_id", "'$session_id'", "properties") clause = f""", - {current_url_clause} as current_url, {session_id_clause} as session_id """ clause += ( @@ -132,9 +164,6 @@ class ClickhouseSessionRecordingList(ClickhouseEventQuery): ) return clause - def _has_entity_filters(self): - return self._filter.entities and len(self._filter.entities) > 0 - def _get_person_id_clause(self) -> Tuple[str, Dict[str, Any]]: person_id_clause = "" person_id_params = {} @@ -198,12 +227,11 @@ class ClickhouseSessionRecordingList(ClickhouseEventQuery): return filter_sql, params def format_event_filters(self) -> EventFiltersSQL: - aggregate_select_clause = "" aggregate_having_clause = "" where_conditions = "AND event IN %(event_names)s" # Always include $pageview events so the start_url and end_url can be extracted - event_names_to_filter: List[Union[int, str]] = ["$pageview"] + event_names_to_filter: List[Union[int, str]] = [] params: Dict = {} @@ -227,9 +255,7 @@ class ClickhouseSessionRecordingList(ClickhouseEventQuery): return EventFiltersSQL(aggregate_select_clause, aggregate_having_clause, where_conditions, params,) def get_query(self) -> Tuple[str, Dict[str, Any]]: - offset = self._filter.offset or 0 - # One more is added to the limit to check if there are more results available base_params = {"team_id": self._team_id, "limit": self.limit + 1, "offset": offset} person_query, person_query_params = self._get_person_query() prop_query, prop_params = self._get_props(self._filter.properties) @@ -237,21 +263,52 @@ class ClickhouseSessionRecordingList(ClickhouseEventQuery): recording_start_time_clause, recording_start_time_params = self._get_recording_start_time_clause() person_id_clause, person_id_params = self._get_person_id_clause() duration_clause, duration_params = self._get_duration_clause() - event_filters = self.format_event_filters() properties_select_clause = self._get_properties_select_clause() + core_recordings_query = self._core_session_recordings_query.format( + recording_start_time_clause=recording_start_time_clause, + duration_clause=duration_clause, + events_timestamp_clause=events_timestamp_clause, + ) + + if not self._determine_should_join_events(): + return ( + self._session_recordings_query.format( + core_recordings_query=core_recordings_query, + person_distinct_id_query=GET_TEAM_PERSON_DISTINCT_IDS, + person_query=person_query, + prop_filter_clause=prop_query, + person_id_clause=person_id_clause, + ), + { + **base_params, + **person_id_params, + **person_query_params, + **prop_params, + **events_timestamp_params, + **duration_params, + **recording_start_time_params, + }, + ) + + event_filters = self.format_event_filters() + + core_events_query = self._core_events_query.format( + properties_select_clause=properties_select_clause, + event_filter_where_conditions=event_filters.where_conditions, + events_timestamp_clause=events_timestamp_clause, + ) + return ( - self._session_recordings_query_with_entity_filter.format( - person_id_clause=person_id_clause, - prop_filter_clause=prop_query, + self._session_recordings_query_with_events.format( + event_filter_aggregate_select_clause=event_filters.aggregate_select_clause, + core_events_query=core_events_query, + core_recordings_query=core_recordings_query, person_distinct_id_query=GET_TEAM_PERSON_DISTINCT_IDS, person_query=person_query, - properties_select_clause=properties_select_clause, - events_timestamp_clause=events_timestamp_clause, - recording_start_time_clause=recording_start_time_clause, - duration_clause=duration_clause, - event_filter_where_conditions=event_filters.where_conditions, - event_filter_aggregate_select_clause=event_filters.aggregate_select_clause, + event_and_recording_match_comditions_clause=self._event_and_recording_match_conditions_clause, + prop_filter_clause=prop_query, + person_id_clause=person_id_clause, event_filter_aggregate_having_clause=event_filters.aggregate_having_clause, ), { diff --git a/ee/clickhouse/queries/session_recordings/test/__snapshots__/test_clickhouse_session_recording_list.ambr b/ee/clickhouse/queries/session_recordings/test/__snapshots__/test_clickhouse_session_recording_list.ambr index 13863fbd8eb..5778a48e1c8 100644 --- a/ee/clickhouse/queries/session_recordings/test/__snapshots__/test_clickhouse_session_recording_list.ambr +++ b/ee/clickhouse/queries/session_recordings/test/__snapshots__/test_clickhouse_session_recording_list.ambr @@ -169,24 +169,8 @@ any(session_recordings.start_time) as start_time, any(session_recordings.end_time) as end_time, any(session_recordings.duration) as duration, - any(session_recordings.distinct_id) as distinct_id, - arrayElement(groupArray(current_url), 1) as start_url, - arrayElement(groupArray(current_url), -1) as end_url + any(session_recordings.distinct_id) as distinct_id FROM - (SELECT distinct_id, - event, - team_id, - timestamp , - trim(BOTH '"' - FROM JSONExtractRaw(properties, '$current_url')) as current_url, - trim(BOTH '"' - FROM JSONExtractRaw(properties, '$session_id')) as session_id - FROM events - WHERE team_id = 2 - AND event IN ['$pageview'] - AND timestamp >= '2021-08-13 12:00:00' - AND timestamp <= '2021-08-22 08:00:00' ) AS events - RIGHT OUTER JOIN (SELECT session_id, any(window_id) as window_id, MIN(timestamp) AS start_time, @@ -201,7 +185,7 @@ GROUP BY session_id HAVING full_snapshots > 0 AND start_time >= '2021-08-14 00:00:00' - AND start_time <= '2021-08-21 20:00:00') AS session_recordings ON session_recordings.distinct_id = events.distinct_id + AND start_time <= '2021-08-21 20:00:00') AS session_recordings JOIN (SELECT distinct_id, argMax(person_id, _timestamp) as person_id @@ -222,12 +206,7 @@ WHERE team_id = 2 GROUP BY id HAVING max(is_deleted) = 0) person ON person.id = pdi.person_id - WHERE ((notEmpty(session_recordings.window_id) - AND events.session_id == session_recordings.session_id) - OR (empty(session_recordings.window_id) - AND (events.timestamp >= session_recordings.start_time - AND events.timestamp <= session_recordings.end_time)) - OR empty(events.event)) + WHERE 1 = 1 AND person_id IN (SELECT person_id FROM cohortpeople @@ -238,7 +217,6 @@ team_id HAVING sum(sign) > 0) GROUP BY session_recordings.session_id - HAVING 1 = 1 ORDER BY start_time DESC LIMIT 51 OFFSET 0 @@ -251,17 +229,13 @@ any(session_recordings.start_time) as start_time, any(session_recordings.end_time) as end_time, any(session_recordings.duration) as duration, - any(session_recordings.distinct_id) as distinct_id, - arrayElement(groupArray(current_url), 1) as start_url, - arrayElement(groupArray(current_url), -1) as end_url , + any(session_recordings.distinct_id) as distinct_id , sum(if(event = '$pageview', 1, 0)) as count_event_match_0 FROM (SELECT distinct_id, event, team_id, timestamp , - trim(BOTH '"' - FROM JSONExtractRaw(properties, '$current_url')) as current_url, trim(BOTH '"' FROM JSONExtractRaw(properties, '$session_id')) as session_id FROM events @@ -269,7 +243,7 @@ AND event IN ['$pageview'] AND timestamp >= '2021-01-13 12:00:00' AND timestamp <= '2021-01-22 08:00:00' ) AS events - RIGHT OUTER JOIN + JOIN (SELECT session_id, any(window_id) as window_id, MIN(timestamp) AS start_time, @@ -303,8 +277,7 @@ AND events.session_id == session_recordings.session_id) OR (empty(session_recordings.window_id) AND (events.timestamp >= session_recordings.start_time - AND events.timestamp <= session_recordings.end_time)) - OR empty(events.event)) + AND events.timestamp <= session_recordings.end_time))) GROUP BY session_recordings.session_id HAVING 1 = 1 AND count_event_match_0 > 0 @@ -320,25 +293,21 @@ any(session_recordings.start_time) as start_time, any(session_recordings.end_time) as end_time, any(session_recordings.duration) as duration, - any(session_recordings.distinct_id) as distinct_id, - arrayElement(groupArray(current_url), 1) as start_url, - arrayElement(groupArray(current_url), -1) as end_url , + any(session_recordings.distinct_id) as distinct_id , sum(if(event = '$autocapture', 1, 0)) as count_event_match_0 FROM (SELECT distinct_id, event, team_id, timestamp , - trim(BOTH '"' - FROM JSONExtractRaw(properties, '$current_url')) as current_url, trim(BOTH '"' FROM JSONExtractRaw(properties, '$session_id')) as session_id FROM events WHERE team_id = 2 - AND event IN ['$pageview', '$autocapture'] + AND event IN ['$autocapture'] AND timestamp >= '2021-01-13 12:00:00' AND timestamp <= '2021-01-22 08:00:00' ) AS events - RIGHT OUTER JOIN + JOIN (SELECT session_id, any(window_id) as window_id, MIN(timestamp) AS start_time, @@ -372,8 +341,7 @@ AND events.session_id == session_recordings.session_id) OR (empty(session_recordings.window_id) AND (events.timestamp >= session_recordings.start_time - AND events.timestamp <= session_recordings.end_time)) - OR empty(events.event)) + AND events.timestamp <= session_recordings.end_time))) GROUP BY session_recordings.session_id HAVING 1 = 1 AND count_event_match_0 > 0 @@ -389,24 +357,8 @@ any(session_recordings.start_time) as start_time, any(session_recordings.end_time) as end_time, any(session_recordings.duration) as duration, - any(session_recordings.distinct_id) as distinct_id, - arrayElement(groupArray(current_url), 1) as start_url, - arrayElement(groupArray(current_url), -1) as end_url + any(session_recordings.distinct_id) as distinct_id FROM - (SELECT distinct_id, - event, - team_id, - timestamp , - trim(BOTH '"' - FROM JSONExtractRaw(properties, '$current_url')) as current_url, - trim(BOTH '"' - FROM JSONExtractRaw(properties, '$session_id')) as session_id - FROM events - WHERE team_id = 2 - AND event IN ['$pageview'] - AND timestamp >= '2021-01-13 12:00:00' - AND timestamp <= '2021-01-22 08:00:00' ) AS events - RIGHT OUTER JOIN (SELECT session_id, any(window_id) as window_id, MIN(timestamp) AS start_time, @@ -421,7 +373,7 @@ GROUP BY session_id HAVING full_snapshots > 0 AND start_time >= '2021-01-14 00:00:00' - AND start_time <= '2021-01-21 20:00:00') AS session_recordings ON session_recordings.distinct_id = events.distinct_id + AND start_time <= '2021-01-21 20:00:00') AS session_recordings JOIN (SELECT distinct_id, argMax(person_id, _timestamp) as person_id @@ -444,14 +396,8 @@ HAVING max(is_deleted) = 0 AND has(['bla'], trim(BOTH '"' FROM JSONExtractRaw(argMax(person.properties, _timestamp), 'email')))) person ON person.id = pdi.person_id - WHERE ((notEmpty(session_recordings.window_id) - AND events.session_id == session_recordings.session_id) - OR (empty(session_recordings.window_id) - AND (events.timestamp >= session_recordings.start_time - AND events.timestamp <= session_recordings.end_time)) - OR empty(events.event)) + WHERE 1 = 1 GROUP BY session_recordings.session_id - HAVING 1 = 1 ORDER BY start_time DESC LIMIT 51 OFFSET 0 diff --git a/ee/clickhouse/queries/session_recordings/test/test_clickhouse_session_recording_list.py b/ee/clickhouse/queries/session_recordings/test/test_clickhouse_session_recording_list.py index 1c5149f81c5..b24daa0d712 100644 --- a/ee/clickhouse/queries/session_recordings/test/test_clickhouse_session_recording_list.py +++ b/ee/clickhouse/queries/session_recordings/test/test_clickhouse_session_recording_list.py @@ -86,9 +86,6 @@ class TestClickhouseSessionRecordingsList(ClickhouseTestMixin, factory_session_r self.create_event("user", self.base_time, properties={"$session_id": "1"}) self.create_event("user", self.base_time, event_name="$autocapture", properties={"$session_id": "2"}) self.create_snapshot("user", "1", self.base_time + relativedelta(seconds=30), window_id="1") - - # Filter for recordings that contain a pageview event - will get recording because the $pageview event - # and recording share a session id filter = SessionRecordingsFilter( team=self.team, data={"events": [{"id": "$pageview", "type": "events", "order": 0, "name": "$pageview"}]}, ) @@ -97,8 +94,6 @@ class TestClickhouseSessionRecordingsList(ClickhouseTestMixin, factory_session_r self.assertEqual(len(session_recordings), 1) self.assertEqual(session_recordings[0]["session_id"], "1") - # Filter for recordings that contain an autocapture event - will not get the recording because the $autocapture event - # and recording have different session ids filter = SessionRecordingsFilter( team=self.team, data={"events": [{"id": "$autocapture", "type": "events", "order": 0, "name": "$autocapture"}]},