0
0
mirror of https://github.com/PostHog/posthog.git synced 2024-12-01 04:12:23 +01:00
posthog/ee/clickhouse/models/element.py

85 lines
3.1 KiB
Python
Raw Normal View History

import datetime
import json
import re
"Clickhouse Features V2 (#1565)" (#1750) * initial * migration command * migrations working * add modelless views for clickhouse * initial testing structure * use test factory * scaffold for all tests * add insight and person api * add basic readme * add client * change how migrations are run * add base tables * ingesting events * restore delay * remove print * updated testing flow * changed sessions tests * update tests * reorganized sql * parametrize strings * element list query * change to seralizer * add values endpoint * retrieve with filter * pruned code to prepare for staged merge * working ingestion again * tests for ee * undo unneeded tests right now * fix linting * more typing errors * fix tests * add clickhouse image to workflow * move to right job * remove django_clickhouse * return database url * run super * remove keepdb * reordered calls * fix type * fractional seconds * fix type error * add checks * remove retention sql * fix tests * add property storage and tests * merge master * fix tests * fix tests * . * remove keepdb * format python files * update CI env vars * Override defaults and insecure tests * Update how ClickHouse database gets evaluated * remove bootstrapping clickhouse database routine * Don't initialize the clickhouse connection unless we say it's primary * . 
* fixed id generation * remove dump * black settings * empty client * add param * move docker-compose for ch to ee dir * Add _public_ key to repo for verifying self signed cert on server * update ee compose file for ee dir * fix a few issues with tls in migrations * update migrations to be flexible about storage profile and engine * black settings * add elements prop tables * add elements prop tables * working filter * refactored * better url handling * add mapping table * add processing to worker task * working cohort with actions * add cohort property filtering * add cohort property filtering * reformat and add cohort processing * prop clauses * add util * add more util * add clickhouse modifier * Clickhouse Sessions (#1623) * sessions sql * skeleton * add endpoint * better tests * sessions list * merge clickhouse-actions * added session endpoint * sessions sql working again * add clickhouse modifier * session avg with props working * add dist * tests working (no list) * list working * add formatting * more formatting * fix tests * dummy commit * fix types * remove unnecessary improt * ignore type when importing from ee in task * fix test running * Clickhouse Trends Base (#1609) * initial working * date param almost working * fix date range and labels * fixed monthly math * handle compare * change table * using new event ingestion * direct query actions working * remove interface * fix date range * properties initial working * handle operator * handle operator * move timestamp parse * move more to util * inital breaking down working * working cohort breakdown * some tests running * fix sessions * cohort tests * action and interval test * reorder cohort filtering * rename retention test * fix inits * change multitenancy tests * fix types * fix optional types * replace ch_client.execute with sync_execute * replace ch_client.execute with sync_execute, part 2 * Clickhouse Stickiness + Process Event (#1654) * generate clickhouse uuid script * set 
CLICKHOUSE_SECURE=False by default if running in TEST or DEBUG * convert person_id to UUID, make adding `person_id` optional, add distinct_ids already in the `create_person` function * Fix test_process_event_ee.py, remove all calls to Person.objects.* * add back util * fix broken imports * improve process_event test clickhouse queries * Basic stickiness query * Clickhouse Stickiness tests * stickiness test [WIP, actions fail] * generate clickhouse uuid script * change default test runner if PRIMARY_DB=clickhouse * fix stickiness test for actions * fix merge bug * remove _create_person stub; cohort person_id is UUID now * fix typing * Clickhouse trends process math (#1660) * most of process math works * all process math * fix ordering issue * unusued imports * update property comparison for process_event_ee * indentation wrong missing calls * demo users and events (#1661) * finish breakdown filtering tests and reformat label function * add increment to demo_data * update demo data populating * Add people endpoint for ch (#1670) * add people endpoint for ch * stickiness people * fix value padding * add process math to breakdown and * add limit * fix tests * condensed code * converted test to factory * add people tests * add month handling * add typing fix * change people test handling * fix tests * Clickhouse funnels 2 (#1668) * add elements to create_event * WIP closes #1663 Add funnels to clickhouse * Make funnels work * Clean up * Move filtering around * Add mypy tests and fix * Performance improvements * fix person tests again * add people for funnel endpoint * fix prop numbering Co-authored-by: Marius Andra <marius.andra@gmail.com> Co-authored-by: Eric <eeoneric@gmail.com> * merge master * add retention * update types * more typing errors * fix types * bug with kafka payload, elements insert, and demo data * Clickhouse Paths (#1657) * paths clickhouse test (fails) * add elements to create_event * make this fail for clickhouse * hardcoded query that returns good 
results for $pageviews, no filters yet * clean up queries * bound by time, fix 30min new session boundary * support screen and custom events * add properties filter * paths url * filter by path start * better path start test * even better path start test * start from the first "path start" in a group * test for person_id in paths * partition by person_id for POSTGRES paths * partition by person_id for Clickhouse paths * clean up order in paths test * clean up order in paths test * join elements * force element order on element group creation * remove "order" when creating elements in tests and demo * get list of elements for paths * add limit to paths query * use materialized view * rename "element_hash" to "elements_hash" (no change in db) * cull rows that are definitely unused * simplify query * New highly optimized paths clickhouse query * start_point for $autocapture paths * extract event property values from clickhouse * prevent crash * select one element sql * get elements for event * remove lodash * remove host from $pageview path elements if same domain as incoming path * show metadata based on loaded paths filter, not in flight filter * fix order (all soures and targets in order, not all sources first, then all targets after) - makes for a better looking graph * add test that makes the Postgres paths query fail * fix postgres paths --> no fuzzy matching, breaks "starts with" for urls and gives too many incorrect start points * create automatic /demo urls that match the real urls (no ending /) * fix elements queries * path element joins * create persons via postgres in paths test * change serializers back to id * fix tests with uuid * fix demo * more bugs * fix type * change now to timezone aware * [clickhouse] retention filters (#1725) * implemented target entity and prop filtering * add insight view override * fix endpoint and filters * include tests * fix tests * add period filtering * . 
* fix pg param name * add filtering params to both queries in retention sql * fix param again * change to todatetime * change tz to timezone * add back timezone in model/event * [clickhouse] feature flag endpoint requests (#1731) * add feature flags to endpoints * add flags to endpoints that check on request * remove magic strings and fill in missing flags * fix types * add missing flag * change from iso * fix more timestamps and comparator * change _people to get_people in actions view * remove action and cohort populating * change inheritance * "Clickhouse Features V2 (#1565)" This reverts commit 0b371d43eca149cd3632410ea5653b85b2173e39. * fix types * change to super * change to super x2 Co-authored-by: Eric <eeoneric@gmail.com> Co-authored-by: Marius Andra <marius.andra@gmail.com> Co-authored-by: Tim Glaser <tim.glaser@hiberly.com>
2020-09-29 16:17:26 +02:00
from datetime import timezone
from typing import List, Optional
from uuid import UUID
from django.utils.timezone import now
from rest_framework import serializers
from ee.clickhouse.client import sync_execute
from ee.kafka.client import ClickhouseProducer
from ee.kafka.topics import KAFKA_ELEMENTS
from posthog.cache import get_cached_value, set_cached_value
from posthog.models.element import Element
from posthog.models.element_group import hash_elements
from posthog.models.team import Team
from posthog.models.utils import UUIDT
# Tokenizes one serialized element. Every match fills exactly one of three branches:
#   * tag_name  - a run of ASCII letters/hyphens anchored at the start of a line
#                 (re.MULTILINE makes ^ match after every newline, not only at position 0);
#   * class     - the shortest text followed by ".", "|" or ":" (leading dots are skipped);
#                 the "|" inside the brackets is a literal character, not an alternation;
#   * attribute - key="value", where the non-empty value ends at the first double quote
#                 that is not preceded by a backslash.
# Branches are tried left to right, so the class branch wins over the attribute branch
# whenever a ".", "|" or ":" occurs later on the same line.
chain_to_elements_regex = re.compile(
r"(?P<tag_name>^[a-zA-Z\-]*)|\.*(?P<class>.*?)[\.|\:]|(?P<attribute>(?P<key>.*?)\=\"(?P<value>.*?[^\\])\")",
re.MULTILINE,
)
# Below splits all elements by ;, while ignoring escaped quotes and semicolons within quotes.
# Each match is a run of double-quoted strings and characters that are neither whitespace,
# ";" nor '"', so unquoted whitespace ends a match as well.
split_chain_regex = re.compile(r'(?:[^\s;"]|"(?:\\.|[^"])*")+')
def _escape(input: str) -> str:
    """Return ``input`` with a backslash inserted before every double quote.

    Only double quotes are touched; backslashes already present are left as-is.
    """
    quote_escapes = {ord('"'): r"\""}
    return input.translate(quote_escapes)
def elements_to_string(elements: List[Element]) -> str:
    """Serialize ``elements`` into a single chain string.

    Each element is rendered as its tag name (if any), then ``.<class>`` for
    every class in sorted order, then ``:``, then its attributes written as
    ``key="value"`` pairs back to back, sorted by key. The rendered elements
    are joined with ``;``.

    ``nth-child`` and ``nth-of-type`` are always emitted (falling back to 0);
    ``text``, ``href`` and ``attr_id`` are emitted only when truthy. Entries in
    ``element.attributes`` override the built-in keys of the same name.
    Double quotes in attribute keys and values are escaped with ``_escape``;
    tag and class names are written unescaped.
    """
    rendered = []
    for element in elements:
        pieces = [element.tag_name] if element.tag_name else []
        if element.attr_class:
            pieces.extend(".{}".format(class_name) for class_name in sorted(element.attr_class))

        # Built-in fields first so that custom attributes can override them below.
        builtin = {}
        if element.text:
            builtin["text"] = element.text
        builtin["nth-child"] = element.nth_child or 0
        builtin["nth-of-type"] = element.nth_of_type or 0
        if element.href:
            builtin["href"] = element.href
        if element.attr_id:
            builtin["attr_id"] = element.attr_id
        combined = {**builtin, **element.attributes}

        # Sorting makes the output independent of insertion order.
        escaped = {_escape(name): _escape(str(content)) for name, content in sorted(combined.items())}

        pieces.append(":")
        pieces.extend('{}="{}"'.format(name, content) for name, content in escaped.items())
        rendered.append("".join(pieces))
    return ";".join(rendered)
# Matches one serialized key="value" pair. The pattern is the attribute branch of
# chain_to_elements_regex on its own, so attribute text can no longer be claimed by the
# tag/class branches. DOTALL lets a quoted value contain newlines.
_attribute_regex = re.compile(r"(?P<key>.*?)\=\"(?P<value>.*?[^\\])\"", re.DOTALL)


def chain_to_elements(chain: str) -> List[Element]:
    """Parse a chain string into a list of ``Element`` instances.

    The expected layout of each ``;``-separated element is
    ``tag.class1.class2:key1="value1"key2="value2"``. Every element is first
    split into its selector (tag and classes) and its attribute string, and only
    then parsed. Parsing both parts with one combined pattern used to let
    attribute values containing ``.``, ``:`` or ``|`` (e.g. an absolute URL in
    ``href``) be read as class names, and cut tag names such as ``h1`` at the
    first digit.

    Args:
        chain: The serialized chain. Elements are extracted with
            ``split_chain_regex``.

    Returns:
        One ``Element`` per chain entry, with ``order`` set to its position in
        the chain. Instances are only constructed here; nothing is saved.
        Attribute values are returned exactly as serialized, i.e. escaped quotes
        (``\\"``) are not unescaped.

    Raises:
        ValueError: If an ``nth-child`` or ``nth-of-type`` value is not an integer.

    Known limitation: the format does not escape ``.`` or ``:`` in the selector,
    so a class name containing ``.`` is split in two, and a ``:`` inside the
    first attribute key is taken for the selector/attribute separator.
    """
    elements = []
    for idx, el_string in enumerate(split_chain_regex.findall(chain)):
        element = Element(order=idx)

        first_assignment = el_string.find('="')
        if first_assignment == -1:
            # No attributes at all: everything (minus a dangling separator) is selector.
            selector = el_string[:-1] if el_string.endswith(":") else el_string
            attribute_string = ""
        else:
            # The separator is the last ":" before the first attribute assignment, so
            # class names that themselves contain ":" stay in one piece.
            separator = el_string.rfind(":", 0, first_assignment)
            # separator == -1 means there is no selector; the slice below then starts at 0.
            selector = el_string[:separator] if separator != -1 else ""
            attribute_string = el_string[separator + 1 :]

        tag_name, *class_names = selector.split(".")
        if tag_name:
            element.tag_name = tag_name
        class_names = [class_name for class_name in class_names if class_name]
        if class_names:
            element.attr_class = class_names

        for match in _attribute_regex.finditer(attribute_string):
            key, value = match.group("key"), match.group("value")
            if key == "href":
                element.href = value
            elif key == "nth-child":
                element.nth_child = int(value)
            elif key == "nth-of-type":
                element.nth_of_type = int(value)
            elif key == "text":
                element.text = value
            elif key == "attr_id":
                element.attr_id = value
            elif key:
                element.attributes[key] = value

        elements.append(element)
    return elements