0
0
mirror of https://github.com/PostHog/posthog.git synced 2024-11-29 02:46:32 +01:00
posthog/ee/clickhouse/models/event.py

301 lines
9.2 KiB
Python

import json
import uuid
from typing import Dict, List, Optional, Tuple, Union
import celery
import pytz
from dateutil.parser import isoparse
from django.conf import settings
from django.utils import timezone
from rest_framework import serializers
from sentry_sdk import capture_exception
from statshog.defaults.django import statsd
from ee.clickhouse.client import sync_execute
from ee.clickhouse.models.element import chain_to_elements, elements_to_string
from ee.clickhouse.sql.events import GET_EVENTS_BY_TEAM_SQL, INSERT_EVENT_SQL
from ee.idl.gen import events_pb2
from ee.kafka_client.client import ClickhouseProducer
from ee.kafka_client.topics import KAFKA_EVENTS
from ee.models.hook import Hook
from posthog.constants import AvailableFeature
from posthog.models.action_step import ActionStep
from posthog.models.element import Element
from posthog.models.person import Person
from posthog.models.team import Team
def create_event(
event_uuid: uuid.UUID,
event: str,
team: Team,
distinct_id: str,
timestamp: Optional[Union[timezone.datetime, str]] = None,
properties: Optional[Dict] = {},
elements: Optional[List[Element]] = None,
site_url: Optional[str] = None,
) -> str:
if not timestamp:
timestamp = timezone.now()
assert timestamp is not None
# clickhouse specific formatting
if isinstance(timestamp, str):
timestamp = isoparse(timestamp)
else:
timestamp = timestamp.astimezone(pytz.utc)
elements_chain = ""
if elements and len(elements) > 0:
elements_chain = elements_to_string(elements=elements)
pb_event = events_pb2.Event()
pb_event.uuid = str(event_uuid)
pb_event.event = event
pb_event.properties = json.dumps(properties)
pb_event.timestamp = timestamp.strftime("%Y-%m-%d %H:%M:%S.%f")
pb_event.team_id = team.pk
pb_event.distinct_id = str(distinct_id)
pb_event.elements_chain = elements_chain
pb_event.created_at = timestamp.strftime("%Y-%m-%d %H:%M:%S.%f")
p = ClickhouseProducer()
p.produce_proto(sql=INSERT_EVENT_SQL, topic=KAFKA_EVENTS, data=pb_event)
return str(event_uuid)
def get_events_by_team(team_id: Union[str, int]):
events = sync_execute(GET_EVENTS_BY_TEAM_SQL, {"team_id": str(team_id)})
return ClickhouseEventSerializer(events, many=True, context={"elements": None, "people": None}).data
class ElementSerializer(serializers.ModelSerializer):
event = serializers.CharField()
class Meta:
model = Element
fields = [
"event",
"text",
"tag_name",
"attr_class",
"href",
"attr_id",
"nth_child",
"nth_of_type",
"attributes",
"order",
]
# reference raw sql for
class ClickhouseEventSerializer(serializers.Serializer):
id = serializers.SerializerMethodField()
distinct_id = serializers.SerializerMethodField()
properties = serializers.SerializerMethodField()
event = serializers.SerializerMethodField()
timestamp = serializers.SerializerMethodField()
person = serializers.SerializerMethodField()
elements = serializers.SerializerMethodField()
elements_chain = serializers.SerializerMethodField()
def get_id(self, event):
return str(event[0])
def get_distinct_id(self, event):
return event[5]
def get_properties(self, event):
if len(event) >= 10 and event[8] and event[9]:
prop_vals = [res.strip('"') for res in event[9]]
return dict(zip(event[8], prop_vals))
else:
# parse_constants gets called for any NaN, Infinity etc values
# we just want those to be returned as None
props = json.loads(event[2], parse_constant=lambda x: None)
unpadded = {key: value.strip('"') if isinstance(value, str) else value for key, value in props.items()}
return unpadded
def get_event(self, event):
return event[1]
def get_timestamp(self, event):
dt = event[3].replace(tzinfo=timezone.utc)
return dt.astimezone().isoformat()
def get_person(self, event):
if not self.context.get("people") or event[5] not in self.context["people"]:
return None
person = self.context["people"][event[5]]
return {
"is_identified": person.is_identified,
"distinct_ids": person.distinct_ids[:1], # only send the first one to avoid a payload bloat
"properties": {
key: person.properties[key] for key in ["email", "name", "username"] if key in person.properties
},
}
def get_elements(self, event):
if not event[6]:
return []
return ElementSerializer(chain_to_elements(event[6]), many=True).data
def get_elements_chain(self, event):
return event[6]
def determine_event_conditions(
team: Team, conditions: Dict[str, Union[str, List[str]]], long_date_from: bool
) -> Tuple[str, Dict]:
result = ""
params: Dict[str, Union[str, List[str]]] = {}
for idx, (k, v) in enumerate(conditions.items()):
if not isinstance(v, str):
continue
if k == "after" and not long_date_from:
timestamp = isoparse(v).strftime("%Y-%m-%d %H:%M:%S.%f")
result += "AND timestamp > %(after)s"
params.update({"after": timestamp})
elif k == "before":
timestamp = isoparse(v).strftime("%Y-%m-%d %H:%M:%S.%f")
result += "AND timestamp < %(before)s"
params.update({"before": timestamp})
elif k == "person_id":
result += """AND distinct_id IN (%(distinct_ids)s)"""
person = Person.objects.filter(pk=v, team_id=team.pk).first()
distinct_ids = person.distinct_ids if person is not None else []
params.update({"distinct_ids": list(map(str, distinct_ids))})
elif k == "distinct_id":
result += "AND distinct_id = %(distinct_id)s"
params.update({"distinct_id": v})
elif k == "event":
result += "AND event = %(event)s"
params.update({"event": v})
return result, params
def get_event_count_for_team_and_period(
team_id: Union[str, int], begin: timezone.datetime, end: timezone.datetime
) -> int:
result = sync_execute(
"""
SELECT count(1) as count
FROM events
WHERE team_id = %(team_id)s
AND timestamp between %(begin)s AND %(end)s
""",
{"team_id": str(team_id), "begin": begin, "end": end},
)[0][0]
return result
def get_agg_event_count_for_teams(team_ids: List[Union[str, int]]) -> int:
result = sync_execute(
"""
SELECT count(1) as count
FROM events
WHERE team_id IN (%(team_id_clause)s)
""",
{"team_id_clause": team_ids},
)[0][0]
return result
def get_agg_event_count_for_teams_and_period(
team_ids: List[Union[str, int]], begin: timezone.datetime, end: timezone.datetime
) -> int:
result = sync_execute(
"""
SELECT count(1) as count
FROM events
WHERE team_id IN (%(team_id_clause)s)
AND timestamp between %(begin)s AND %(end)s
""",
{"team_id_clause": team_ids, "begin": begin, "end": end},
)[0][0]
return result
def get_event_count_for_team(team_id: Union[str, int]) -> int:
result = sync_execute(
"""
SELECT count(1) as count
FROM events
WHERE team_id = %(team_id)s
""",
{"team_id": str(team_id)},
)[0][0]
return result
def get_event_count() -> int:
result = sync_execute(
"""
SELECT count(1) as count
FROM events
"""
)[0][0]
return result
def get_event_count_for_last_month() -> int:
result = sync_execute(
"""
-- count of events last month
SELECT
COUNT(1) freq
FROM events
WHERE
toStartOfMonth(timestamp) = toStartOfMonth(date_sub(MONTH, 1, now()))
"""
)[0][0]
return result
def get_event_count_month_to_date() -> int:
result = sync_execute(
"""
-- count of events month to date
SELECT
COUNT(1) freq
FROM events
WHERE toStartOfMonth(timestamp) = toStartOfMonth(now());
"""
)[0][0]
return result
def get_events_count_for_team_by_client_lib(
team_id: Union[str, int], begin: timezone.datetime, end: timezone.datetime
) -> dict:
results = sync_execute(
"""
SELECT JSONExtractString(properties, '$lib') as lib, COUNT(1) as freq
FROM events
WHERE team_id = %(team_id)s
AND timestamp between %(begin)s AND %(end)s
GROUP BY lib
""",
{"team_id": str(team_id), "begin": begin, "end": end},
)
return {result[0]: result[1] for result in results}
def get_events_count_for_team_by_event_type(
team_id: Union[str, int], begin: timezone.datetime, end: timezone.datetime
) -> dict:
results = sync_execute(
"""
SELECT event, COUNT(1) as freq
FROM events
WHERE team_id = %(team_id)s
AND timestamp between %(begin)s AND %(end)s
GROUP BY event
""",
{"team_id": str(team_id), "begin": begin, "end": end},
)
return {result[0]: result[1] for result in results}