diff --git a/serde_elements/.gitignore b/serde_elements/.gitignore new file mode 100644 index 00000000000..c8f044299db --- /dev/null +++ b/serde_elements/.gitignore @@ -0,0 +1,72 @@ +/target + +# Byte-compiled / optimized / DLL files +__pycache__/ +.pytest_cache/ +*.py[cod] + +# C extensions +*.so + +# Distribution / packaging +.Python +.venv/ +env/ +bin/ +build/ +develop-eggs/ +dist/ +eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +include/ +man/ +venv/ +*.egg-info/ +.installed.cfg +*.egg + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt +pip-selfcheck.json + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.cache +nosetests.xml +coverage.xml + +# Translations +*.mo + +# Mr Developer +.mr.developer.cfg +.project +.pydevproject + +# Rope +.ropeproject + +# Django stuff: +*.log +*.pot + +.DS_Store + +# Sphinx documentation +docs/_build/ + +# PyCharm +.idea/ + +# VSCode +.vscode/ + +# Pyenv +.python-version diff --git a/elements/Cargo.lock b/serde_elements/Cargo.lock similarity index 99% rename from elements/Cargo.lock rename to serde_elements/Cargo.lock index a26064744bc..f15fbe06f75 100644 --- a/elements/Cargo.lock +++ b/serde_elements/Cargo.lock @@ -54,16 +54,6 @@ version = "1.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" -[[package]] -name = "elements" -version = "0.1.0" -dependencies = [ - "once_cell", - "pyo3", - "rayon", - "regex", -] - [[package]] name = "heck" version = "0.5.0" @@ -239,6 +229,16 @@ version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" +[[package]] +name = "serde-elements" +version = "0.1.0" +dependencies = [ + "once_cell", + "pyo3", + "rayon", + "regex", +] + [[package]] name = "syn" version = "2.0.87" diff --git a/elements/Cargo.toml b/serde_elements/Cargo.toml similarity index 82% rename from elements/Cargo.toml 
rename to serde_elements/Cargo.toml index 0c73136c4e5..092b5ba554a 100644 --- a/elements/Cargo.toml +++ b/serde_elements/Cargo.toml @@ -1,10 +1,10 @@ [package] -name = "elements" +name = "serde-elements" version = "0.1.0" edition = "2021" [lib] -name = "elements" +name = "serde_elements" crate-type = ["cdylib"] [dependencies] diff --git a/serde_elements/README.md b/serde_elements/README.md new file mode 100644 index 00000000000..d0f0fd2f630 --- /dev/null +++ b/serde_elements/README.md @@ -0,0 +1,22 @@ +# Serde Elements + +A Python extension for deserialization of PostHog HTML elements (as represented by the [`Element` model](../posthog/models/element/element.py)) + +# Build & Install +Build with `maturin`: +```sh +maturin build --release +``` + +Install Python wheel: +```sh +pip install target/wheels/serde_elements_chain-0.1.0-cp311-cp311-manylinux_2_34_x86_64.whl +``` + +# Usage +```Python +from serde_elements import deserialize + +elements_chain = ... +d = deserialize(elements_chain) +``` diff --git a/elements/pyproject.toml b/serde_elements/pyproject.toml similarity index 92% rename from elements/pyproject.toml rename to serde_elements/pyproject.toml index 7418950e766..3c770f970f1 100644 --- a/elements/pyproject.toml +++ b/serde_elements/pyproject.toml @@ -3,7 +3,7 @@ requires = ["maturin>=1.7,<2.0"] build-backend = "maturin" [project] -name = "elements" +name = "serde_elements_chain" requires-python = ">=3.11" classifiers = [ "Programming Language :: Rust", diff --git a/elements/src/lib.rs b/serde_elements/src/lib.rs similarity index 67% rename from elements/src/lib.rs rename to serde_elements/src/lib.rs index 61368e70f50..ecc3259db2f 100644 --- a/elements/src/lib.rs +++ b/serde_elements/src/lib.rs @@ -1,9 +1,16 @@ -//! Python extension to deserialize chains of HTML elements as serialized by PostHog +//! Python extension to deserialize chains of HTML elements as serialized by PostHog. +//! +//! 
The PostHog serialization format of HTML elements requires several regular +//! expressions to deserialize, and in hot parts of the application the reference +//! Python code can be very slow. This extension provides a parallelized +//! version of the deserialization running in Rust. During benchmarking, we saw +//! deserialization take one third of the time it takes with the reference Python +//! implementation. use std::collections; use once_cell::sync::Lazy; use pyo3::prelude::*; -use pyo3::types::{IntoPyDict, PyDict, PyList}; +use pyo3::types::IntoPyDict; use rayon::prelude::*; use regex::{Regex, RegexBuilder}; @@ -22,9 +29,11 @@ static PARSE_ATTRIBUTES_REGEX: Lazy = Lazy::new(|| { .expect("hard-coded regular expression should be valid") }); -/// Represents an HTML element. +/// Represents a PostHog HTML element. /// -/// Meant to replicate a PostHog `Element` model internally. +/// Meant to replicate a PostHog `Element` Django model internally. +/// This struct implements `IntoPy` to allow conversion into a Python +/// dictionary. struct Element { order: usize, text: Option, @@ -37,6 +46,9 @@ struct Element { attributes: collections::HashMap, } +/// Implementation of `Element`. +/// +/// Provides mutable access to all of the Element's attributes. impl Element { fn new_with_order(order: usize) -> Self { Self { @@ -93,38 +105,49 @@ impl Element { } impl IntoPy for Element { - /// Convert a Rust `Element` into a Python dictionary. + /// Convert an `Element` struct into a Python dictionary. + /// + /// This dictionary will match the Django `Element` model (except for + /// foreign keys). 
fn into_py(self, py: Python<'_>) -> PyObject { let dict = &[("order", self.order)].into_py_dict_bound(py); - dict.set_item("attributes", self.attributes); + dict.set_item("attributes", self.attributes) + .expect("Python dictionary failed to insert `attributes`"); if let Some(tag_name) = self.tag_name { - dict.set_item("tag_name", tag_name); + dict.set_item("tag_name", tag_name) + .expect("Python dictionary failed to insert `tag_name`"); } if self.attr_class.len() > 0 { - dict.set_item("attr_class", self.attr_class); + dict.set_item("attr_class", self.attr_class) + .expect("Python dictionary failed to insert `attr_class`"); } if let Some(href) = self.href { - dict.set_item("href", href); + dict.set_item("href", href) + .expect("Python dictionary failed to insert `href`"); } if let Some(nth_child) = self.nth_child { - dict.set_item("nth_child", nth_child); + dict.set_item("nth_child", nth_child) + .expect("Python dictionary failed to insert `nth_child`"); } if let Some(nth_of_type) = self.nth_of_type { - dict.set_item("nth_of_type", nth_of_type); + dict.set_item("nth_of_type", nth_of_type) + .expect("Python dictionary failed to insert `nth_of_type`"); } if let Some(text) = self.text { - dict.set_item("text", text); + dict.set_item("text", text) + .expect("Python dictionary failed to insert `text`"); } if let Some(attr_id) = self.attr_id { - dict.set_item("attr_id", attr_id); + dict.set_item("attr_id", attr_id) + .expect("Python dictionary failed to insert `attr_id`"); } dict.into_py(py) @@ -136,8 +159,20 @@ impl IntoPy for Element { /// This function mimics the `chain_to_elements` Python function provided /// by the `posthog.models.element.elements` module. The only difference is /// that this function returns a dictionary instead of a Django model. +/// +/// This function is divided into 3 parts (each associated with a regular +/// expression): +/// 1. Iterating over HTML elements stored in a chain. +/// 2. 
Recording HTML class tags and class attributes of each HTML element. +/// 3. Recording all other HTML attributes from the HTML element. +/// Each iteration creates an `Element` struct to record everything, and +/// only once we are done deserializing everything do we acquire the GIL and +/// output the result as a Python dictionary. +/// +/// We parallelize iteration over HTML elements using `rayon` for blazing +/// speed. #[pyfunction] -pub fn chain_to_elements_dict(chain: &str) -> PyResult { +pub fn deserialize(chain: &str) -> PyResult { let elements: Vec = SPLIT_CHAIN_REGEX .find_iter(chain) .collect::>>() @@ -189,8 +224,9 @@ pub fn chain_to_elements_dict(chain: &str) -> PyResult { Python::with_gil(|py| Ok(elements.into_py(py))) } +/// Provides the `serde_elements` Python module. #[pymodule] -fn elements(m: &Bound<'_, PyModule>) -> PyResult<()> { - m.add_function(wrap_pyfunction!(chain_to_elements_dict, m)?)?; +fn serde_elements(m: &Bound<'_, PyModule>) -> PyResult<()> { + m.add_function(wrap_pyfunction!(deserialize, m)?)?; Ok(()) }