0
0
mirror of https://github.com/PostHog/posthog.git synced 2024-11-21 13:39:22 +01:00

feat(hog): memory limits (#23564)

This commit is contained in:
Marius Andra 2024-07-10 15:11:15 +02:00 committed by GitHub
parent cf575b0129
commit 043aa16e5b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
21 changed files with 700 additions and 161 deletions

View File

@ -0,0 +1,4 @@
["_h", 32, "key", 32, "value", 32, "key2", 32, "value2", 42, 2, 32, "na", 33, 0, 33, 100, 36, 2, 15, 40, 45, 32, "na",
36, 1, 2, "concat", 2, 37, 1, 36, 0, 36, 2, 32, "key_", 2, "concat", 2, 32, "wasted", 32, " batman!", 36, 1, 32,
"memory: ", 2, "concat", 3, 32, "something", 36, 0, 42, 2, 46, 33, 1, 36, 2, 6, 37, 2, 39, -52, 35, 36, 0, 2, "print",
1, 35, 36, 0, 2, "jsonStringify", 1, 36, 2, 2, "jsonParse", 1, 2, "print", 1, 35, 35, 35, 35]

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,4 @@
["_h", 32, "key", 32, "value", 32, "key2", 32, "value2", 42, 2, 32, "key", 32, "value", 32, "key2", 32, "value2", 42, 2,
33, 0, 33, 30, 36, 2, 15, 40, 25, 36, 0, 36, 2, 32, "key_", 2, "concat", 2, 32, "something", 36, 1, 42, 1, 46, 33, 1,
36, 2, 6, 37, 2, 39, -32, 35, 36, 0, 2, "print", 1, 35, 36, 0, 2, "jsonStringify", 1, 2, "jsonParse", 1, 2, "print", 1,
35, 35, 35]

View File

@ -0,0 +1,2 @@
{'key': 'value', 'key2': 'value2', 'key_0': {'something': {'key': 'value', 'key2': 'value2'}}, 'key_1': {'something': {'key': 'value', 'key2': 'value2'}}, 'key_2': {'something': {'key': 'value', 'key2': 'value2'}}, 'key_3': {'something': {'key': 'value', 'key2': 'value2'}}, 'key_4': {'something': {'key': 'value', 'key2': 'value2'}}, 'key_5': {'something': {'key': 'value', 'key2': 'value2'}}, 'key_6': {'something': {'key': 'value', 'key2': 'value2'}}, 'key_7': {'something': {'key': 'value', 'key2': 'value2'}}, 'key_8': {'something': {'key': 'value', 'key2': 'value2'}}, 'key_9': {'something': {'key': 'value', 'key2': 'value2'}}, 'key_10': {'something': {'key': 'value', 'key2': 'value2'}}, 'key_11': {'something': {'key': 'value', 'key2': 'value2'}}, 'key_12': {'something': {'key': 'value', 'key2': 'value2'}}, 'key_13': {'something': {'key': 'value', 'key2': 'value2'}}, 'key_14': {'something': {'key': 'value', 'key2': 'value2'}}, 'key_15': {'something': {'key': 'value', 'key2': 'value2'}}, 'key_16': {'something': {'key': 'value', 'key2': 'value2'}}, 'key_17': {'something': {'key': 'value', 'key2': 'value2'}}, 'key_18': {'something': {'key': 'value', 'key2': 'value2'}}, 'key_19': {'something': {'key': 'value', 'key2': 'value2'}}, 'key_20': {'something': {'key': 'value', 'key2': 'value2'}}, 'key_21': {'something': {'key': 'value', 'key2': 'value2'}}, 'key_22': {'something': {'key': 'value', 'key2': 'value2'}}, 'key_23': {'something': {'key': 'value', 'key2': 'value2'}}, 'key_24': {'something': {'key': 'value', 'key2': 'value2'}}, 'key_25': {'something': {'key': 'value', 'key2': 'value2'}}, 'key_26': {'something': {'key': 'value', 'key2': 'value2'}}, 'key_27': {'something': {'key': 'value', 'key2': 'value2'}}, 'key_28': {'something': {'key': 'value', 'key2': 'value2'}}, 'key_29': {'something': {'key': 'value', 'key2': 'value2'}}}
{'key': 'value', 'key2': 'value2', 'key_0': {'something': {'key': 'value', 'key2': 'value2'}}, 'key_1': {'something': {'key': 'value', 'key2': 'value2'}}, 'key_2': {'something': {'key': 'value', 'key2': 'value2'}}, 'key_3': {'something': {'key': 'value', 'key2': 'value2'}}, 'key_4': {'something': {'key': 'value', 'key2': 'value2'}}, 'key_5': {'something': {'key': 'value', 'key2': 'value2'}}, 'key_6': {'something': {'key': 'value', 'key2': 'value2'}}, 'key_7': {'something': {'key': 'value', 'key2': 'value2'}}, 'key_8': {'something': {'key': 'value', 'key2': 'value2'}}, 'key_9': {'something': {'key': 'value', 'key2': 'value2'}}, 'key_10': {'something': {'key': 'value', 'key2': 'value2'}}, 'key_11': {'something': {'key': 'value', 'key2': 'value2'}}, 'key_12': {'something': {'key': 'value', 'key2': 'value2'}}, 'key_13': {'something': {'key': 'value', 'key2': 'value2'}}, 'key_14': {'something': {'key': 'value', 'key2': 'value2'}}, 'key_15': {'something': {'key': 'value', 'key2': 'value2'}}, 'key_16': {'something': {'key': 'value', 'key2': 'value2'}}, 'key_17': {'something': {'key': 'value', 'key2': 'value2'}}, 'key_18': {'something': {'key': 'value', 'key2': 'value2'}}, 'key_19': {'something': {'key': 'value', 'key2': 'value2'}}, 'key_20': {'something': {'key': 'value', 'key2': 'value2'}}, 'key_21': {'something': {'key': 'value', 'key2': 'value2'}}, 'key_22': {'something': {'key': 'value', 'key2': 'value2'}}, 'key_23': {'something': {'key': 'value', 'key2': 'value2'}}, 'key_24': {'something': {'key': 'value', 'key2': 'value2'}}, 'key_25': {'something': {'key': 'value', 'key2': 'value2'}}, 'key_26': {'something': {'key': 'value', 'key2': 'value2'}}, 'key_27': {'something': {'key': 'value', 'key2': 'value2'}}, 'key_28': {'something': {'key': 'value', 'key2': 'value2'}}, 'key_29': {'something': {'key': 'value', 'key2': 'value2'}}}

View File

@ -0,0 +1,22 @@
// Printing recursive objects: every obj['key_N'].something points back at obj itself.
let obj := {'key': 'value', 'key2': 'value2'}
let str := 'na'
for (let i := 0; i < 100; i := i + 1) {
str := str || 'na'
obj[f'key_{i}'] := {
'wasted': 'memory: ' || str || ' batman!',
'something': obj, // something links to obj
}
}
// Printing must terminate: a reference back to an object that is already being
// printed is shown as null instead of being followed forever.
print(obj)
// Stringifying the cyclic object must not crash either.
let json := jsonStringify(obj)
// Commented out because JSON output is slightly different in python vs nodejs
// print(json)
// Should be equal to the original printed object -> nulls instead of recursive nodes
print(jsonParse(json))

View File

@ -0,0 +1,11 @@
// Printing objects that share a child without any cycle: every root entry
// references the same leaf object.
let root := {'key': 'value', 'key2': 'value2'}
let leaf := {'key': 'value', 'key2': 'value2'}
for (let i := 0; i < 30; i := i + 1) {
root[f'key_{i}'] := {
'something': leaf,
}
}
// leaf is shared, not recursive, so it should NOT be replaced with nulls:
// every occurrence must be printed in full.
print(root)
print(jsonParse(jsonStringify(root)))

View File

@ -10,11 +10,13 @@ from hogvm.python.operation import Operation, HOGQL_BYTECODE_IDENTIFIER
from hogvm.python.stl import STL
from dataclasses import dataclass
from hogvm.python.utils import HogVMException, get_nested_value, like, set_nested_value
from hogvm.python.utils import HogVMException, get_nested_value, like, set_nested_value, calculate_cost
if TYPE_CHECKING:
from posthog.models import Team
MAX_MEMORY = 64 * 1024 * 1024 # 64 MB
@dataclass
class BytecodeResult:
@ -35,8 +37,11 @@ def execute_bytecode(
start_time = time.time()
last_op = len(bytecode) - 1
stack: list = []
mem_stack: list = []
call_stack: list[tuple[int, int, int]] = [] # (ip, stack_start, arg_len)
declared_functions: dict[str, tuple[int, int]] = {}
mem_used = 0
max_mem_used = 0
ip = -1
ops = 0
stdout: list[str] = []
@ -52,8 +57,20 @@ def execute_bytecode(
def pop_stack():
if not stack:
raise HogVMException("Stack underflow")
nonlocal mem_used
mem_used -= mem_stack.pop()
return stack.pop()
def push_stack(value):
stack.append(value)
mem_stack.append(calculate_cost(value))
nonlocal mem_used
mem_used += mem_stack[-1]
nonlocal max_mem_used
max_mem_used = max(mem_used, max_mem_used)
if mem_used > MAX_MEMORY:
raise HogVMException(f"Memory limit of {MAX_MEMORY} bytes exceeded. Tried to allocate {mem_used} bytes.")
if next_token() != HOGQL_BYTECODE_IDENTIFIER:
raise HogVMException(f"Invalid bytecode. Must start with '{HOGQL_BYTECODE_IDENTIFIER}'")
@ -75,72 +92,72 @@ def execute_bytecode(
case None:
break
case Operation.STRING:
stack.append(next_token())
push_stack(next_token())
case Operation.INTEGER:
stack.append(next_token())
push_stack(next_token())
case Operation.FLOAT:
stack.append(next_token())
push_stack(next_token())
case Operation.TRUE:
stack.append(True)
push_stack(True)
case Operation.FALSE:
stack.append(False)
push_stack(False)
case Operation.NULL:
stack.append(None)
push_stack(None)
case Operation.NOT:
stack.append(not pop_stack())
push_stack(not pop_stack())
case Operation.AND:
stack.append(all([pop_stack() for _ in range(next_token())])) # noqa: C419
push_stack(all([pop_stack() for _ in range(next_token())])) # noqa: C419
case Operation.OR:
stack.append(any([pop_stack() for _ in range(next_token())])) # noqa: C419
push_stack(any([pop_stack() for _ in range(next_token())])) # noqa: C419
case Operation.PLUS:
stack.append(pop_stack() + pop_stack())
push_stack(pop_stack() + pop_stack())
case Operation.MINUS:
stack.append(pop_stack() - pop_stack())
push_stack(pop_stack() - pop_stack())
case Operation.DIVIDE:
stack.append(pop_stack() / pop_stack())
push_stack(pop_stack() / pop_stack())
case Operation.MULTIPLY:
stack.append(pop_stack() * pop_stack())
push_stack(pop_stack() * pop_stack())
case Operation.MOD:
stack.append(pop_stack() % pop_stack())
push_stack(pop_stack() % pop_stack())
case Operation.EQ:
stack.append(pop_stack() == pop_stack())
push_stack(pop_stack() == pop_stack())
case Operation.NOT_EQ:
stack.append(pop_stack() != pop_stack())
push_stack(pop_stack() != pop_stack())
case Operation.GT:
stack.append(pop_stack() > pop_stack())
push_stack(pop_stack() > pop_stack())
case Operation.GT_EQ:
stack.append(pop_stack() >= pop_stack())
push_stack(pop_stack() >= pop_stack())
case Operation.LT:
stack.append(pop_stack() < pop_stack())
push_stack(pop_stack() < pop_stack())
case Operation.LT_EQ:
stack.append(pop_stack() <= pop_stack())
push_stack(pop_stack() <= pop_stack())
case Operation.LIKE:
stack.append(like(pop_stack(), pop_stack()))
push_stack(like(pop_stack(), pop_stack()))
case Operation.ILIKE:
stack.append(like(pop_stack(), pop_stack(), re.IGNORECASE))
push_stack(like(pop_stack(), pop_stack(), re.IGNORECASE))
case Operation.NOT_LIKE:
stack.append(not like(pop_stack(), pop_stack()))
push_stack(not like(pop_stack(), pop_stack()))
case Operation.NOT_ILIKE:
stack.append(not like(pop_stack(), pop_stack(), re.IGNORECASE))
push_stack(not like(pop_stack(), pop_stack(), re.IGNORECASE))
case Operation.IN:
stack.append(pop_stack() in pop_stack())
push_stack(pop_stack() in pop_stack())
case Operation.NOT_IN:
stack.append(pop_stack() not in pop_stack())
push_stack(pop_stack() not in pop_stack())
case Operation.REGEX:
args = [pop_stack(), pop_stack()]
stack.append(bool(re.search(re.compile(args[1]), args[0])))
push_stack(bool(re.search(re.compile(args[1]), args[0])))
case Operation.NOT_REGEX:
args = [pop_stack(), pop_stack()]
stack.append(not bool(re.search(re.compile(args[1]), args[0])))
push_stack(not bool(re.search(re.compile(args[1]), args[0])))
case Operation.IREGEX:
args = [pop_stack(), pop_stack()]
stack.append(bool(re.search(re.compile(args[1], re.RegexFlag.IGNORECASE), args[0])))
push_stack(bool(re.search(re.compile(args[1], re.RegexFlag.IGNORECASE), args[0])))
case Operation.NOT_IREGEX:
args = [pop_stack(), pop_stack()]
stack.append(not bool(re.search(re.compile(args[1], re.RegexFlag.IGNORECASE), args[0])))
push_stack(not bool(re.search(re.compile(args[1], re.RegexFlag.IGNORECASE), args[0])))
case Operation.GET_GLOBAL:
chain = [pop_stack() for _ in range(next_token())]
stack.append(deepcopy(get_nested_value(globals, chain)))
push_stack(deepcopy(get_nested_value(globals, chain)))
case Operation.POP:
pop_stack()
case Operation.RETURN:
@ -148,22 +165,29 @@ def execute_bytecode(
ip, stack_start, arg_len = call_stack.pop()
response = pop_stack()
stack = stack[0:stack_start]
stack.append(response)
mem_used -= sum(mem_stack[stack_start:])
mem_stack = mem_stack[0:stack_start]
push_stack(response)
else:
return BytecodeResult(result=pop_stack(), stdout=stdout, bytecode=bytecode)
case Operation.GET_LOCAL:
stack_start = 0 if not call_stack else call_stack[-1][1]
stack.append(stack[next_token() + stack_start])
push_stack(stack[next_token() + stack_start])
case Operation.SET_LOCAL:
stack_start = 0 if not call_stack else call_stack[-1][1]
value = pop_stack()
stack[next_token() + stack_start] = value
index = next_token() + stack_start
stack[index] = value
last_cost = mem_stack[index]
mem_stack[index] = calculate_cost(value)
mem_used += mem_stack[index] - last_cost
max_mem_used = max(mem_used, max_mem_used)
case Operation.GET_PROPERTY:
property = pop_stack()
stack.append(get_nested_value(pop_stack(), [property]))
push_stack(get_nested_value(pop_stack(), [property]))
case Operation.GET_PROPERTY_NULLISH:
property = pop_stack()
stack.append(get_nested_value(pop_stack(), [property], nullish=True))
push_stack(get_nested_value(pop_stack(), [property], nullish=True))
case Operation.SET_PROPERTY:
value = pop_stack()
field = pop_stack()
@ -173,19 +197,25 @@ def execute_bytecode(
if count > 0:
elems = stack[-(count * 2) :]
stack = stack[: -(count * 2)]
stack.append({elems[i]: elems[i + 1] for i in range(0, len(elems), 2)})
mem_used -= sum(mem_stack[-(count * 2) :])
mem_stack = mem_stack[: -(count * 2)]
push_stack({elems[i]: elems[i + 1] for i in range(0, len(elems), 2)})
else:
stack.append({})
push_stack({})
case Operation.ARRAY:
count = next_token()
elems = stack[-count:]
stack = stack[:-count]
stack.append(elems)
mem_used -= sum(mem_stack[-count:])
mem_stack = mem_stack[:-count]
push_stack(elems)
case Operation.TUPLE:
count = next_token()
elems = stack[-count:]
stack = stack[:-count]
stack.append(tuple(elems))
mem_used -= sum(mem_stack[-count:])
mem_stack = mem_stack[:-count]
push_stack(tuple(elems))
case Operation.JUMP:
count = next_token()
ip += count
@ -214,13 +244,13 @@ def execute_bytecode(
args = [pop_stack() for _ in range(next_token())]
if functions is not None and name in functions:
stack.append(functions[name](*args))
push_stack(functions[name](*args))
continue
if name not in STL:
raise HogVMException(f"Unsupported function call: {name}")
stack.append(STL[name](name, args, team, stdout, timeout))
push_stack(STL[name](name, args, team, stdout, timeout))
if ip == last_op:
break
if debug:

View File

@ -107,9 +107,28 @@ def jsonParse(name: str, args: list[Any], team: Optional["Team"], stdout: Option
def jsonStringify(name: str, args: list[Any], team: Optional["Team"], stdout: Optional[list[str]], timeout: int) -> str:
marked = set()
def json_safe(obj):
if isinstance(obj, dict) or isinstance(obj, list) or isinstance(obj, tuple):
if id(obj) in marked:
return None
else:
marked.add(id(obj))
try:
if isinstance(obj, dict):
return {json_safe(k): json_safe(v) for k, v in obj.items()}
elif isinstance(obj, list):
return [json_safe(v) for v in obj]
elif isinstance(obj, tuple):
return tuple(json_safe(v) for v in obj)
finally:
marked.remove(id(obj))
return obj
if len(args) > 1 and isinstance(args[1], int) and args[1] > 0:
return json.dumps(args[0], indent=args[1])
return json.dumps(args[0])
return json.dumps(json_safe(args[0]), indent=args[1])
return json.dumps(json_safe(args[0]))
def base64Encode(name: str, args: list[Any], team: Optional["Team"], stdout: Optional[list[str]], timeout: int) -> str:

View File

@ -31,15 +31,24 @@ def escape_identifier(identifier: str | int) -> str:
return "`{}`".format("".join(backquote_escape_chars_map.get(c, c) for c in identifier))
def print_hog_value(obj):
if isinstance(obj, list):
return f"[{', '.join(map(print_hog_value, obj))}]"
if isinstance(obj, dict):
return f"{{{', '.join([f'{print_hog_value(key)}: {print_hog_value(value)}' for key, value in obj.items()])}}}"
if isinstance(obj, tuple):
if len(obj) < 2:
return f"tuple({', '.join(map(print_hog_value, obj))})"
return f"({', '.join(map(print_hog_value, obj))})"
def print_hog_value(obj, marked: set | None = None):
if marked is None:
marked = set()
if isinstance(obj, list) or isinstance(obj, dict) or isinstance(obj, tuple):
if id(obj) in marked:
return "null"
marked.add(id(obj))
try:
if isinstance(obj, list):
return f"[{', '.join([print_hog_value(o, marked) for o in obj])}]"
if isinstance(obj, dict):
return f"{{{', '.join([f'{print_hog_value(key, marked)}: {print_hog_value(value, marked)}' for key, value in obj.items()])}}}"
if isinstance(obj, tuple):
if len(obj) < 2:
return f"tuple({', '.join([print_hog_value(o, marked) for o in obj])})"
return f"({', '.join([print_hog_value(o, marked) for o in obj])})"
finally:
marked.remove(id(obj))
if obj is True:
return "true"
if obj is False:

View File

@ -134,6 +134,143 @@ class TestBytecodeExecute:
else:
raise AssertionError("Expected Exception not raised")
def test_memory_limits_1(self):
# let string := 'banana'
# for (let i := 0; i < 100; i := i + 1) {
# string := string || string
# }
bytecode = [
"_h",
32,
"banana",
33,
0,
33,
100,
36,
1,
15,
40,
18,
36,
0,
36,
0,
2,
"concat",
2,
37,
0,
33,
1,
36,
1,
6,
37,
1,
39,
-25,
35,
35,
]
try:
execute_bytecode(bytecode, {})
except Exception as e:
assert str(e) == "Memory limit of 67108864 bytes exceeded. Tried to allocate 75497504 bytes."
else:
raise AssertionError("Expected Exception not raised")
def test_memory_limits_2(self):
# Hog program the bytecode below encodes:
#
# let obj := {'key': 'value', 'key2': 'value2'}
# let str := 'na'
# for (let i := 0; i < 10000; i := i + 1) {
#     if (i < 16) {
#         str := str || str
#     }
#     obj[f'key_{i}'] := {
#         'wasted': 'memory: ' || str || ' batman!',
#         'something': obj, // something links to obj
#     }
# }
bytecode = [
"_h",
32,
"key",
32,
"value",
32,
"key2",
32,
"value2",
42,
2,
32,
"na",
33,
0,
33,
10000,
36,
2,
15,
40,
52,
33,
16,
36,
2,
15,
40,
9,
36,
1,
36,
1,
2,
"concat",
2,
37,
1,
36,
0,
36,
2,
32,
"key_",
2,
"concat",
2,
32,
"wasted",
32,
" batman!",
36,
1,
32,
"memory: ",
2,
"concat",
3,
32,
"something",
36,
0,
42,
2,
46,
33,
1,
36,
2,
6,
37,
2,
39,
-59,
35,
35,
35,
]
# The run is expected to abort with the memory-limit error before the loop finishes.
try:
execute_bytecode(bytecode, {})
except Exception as e:
assert str(e) == "Memory limit of 67108864 bytes exceeded. Tried to allocate 67155164 bytes."
else:
raise AssertionError("Expected Exception not raised")
def test_functions(self):
def stringify(*args):
if args[0] == 1:

View File

@ -2,6 +2,9 @@ import re
from typing import Any
COST_PER_UNIT = 8
class HogVMException(Exception):
pass
@ -46,3 +49,24 @@ def set_nested_value(obj, chain, value) -> Any:
raise HogVMException(f'Can not set property "{chain[-1]}" on object of type "{type(obj).__name__}"')
return obj
def calculate_cost(object, marked: set | None = None) -> int:
    """Estimate the memory cost of a value, in the units used by the VM's memory limit.

    Every value costs COST_PER_UNIT. A string additionally costs its length,
    and a dict, list or tuple additionally costs the sum of its members
    (for a dict: each key plus each value).

    `marked` holds the ids of the containers currently being walked. Reaching
    a container that is still being walked (a cycle) costs a flat
    COST_PER_UNIT, which keeps the recursion finite. Ids are released on the
    way out, so a container referenced from several places without forming a
    cycle is counted once per reference.
    """
    in_progress = set() if marked is None else marked

    if isinstance(object, str):
        return COST_PER_UNIT + len(object)
    if not isinstance(object, (dict, list, tuple)):
        return COST_PER_UNIT

    identity = id(object)
    if identity in in_progress:
        # Already on the current path: charge only for the reference itself.
        return COST_PER_UNIT

    in_progress.add(identity)
    try:
        total = COST_PER_UNIT
        if isinstance(object, dict):
            for key, value in object.items():
                total += calculate_cost(key, in_progress) + calculate_cost(value, in_progress)
        else:
            for member in object:
                total += calculate_cost(member, in_progress)
        return total
    finally:
        # Release the id so the same container reached via another branch is costed again.
        in_progress.remove(identity)

View File

@ -1,8 +1,14 @@
#!/bin/bash
set -e
cd typescript
pnpm run build
cd ..
cd ..
rm -f hogvm/__tests__/__snapshots__/*.stdout.nodejs
rm -f hogvm/__tests__/__snapshots__/*.stdout.python
for file in hogvm/__tests__/*.hog; do
echo "Testing $file"
@ -21,7 +27,6 @@ for file in hogvm/__tests__/*.hog; do
rm $basename.stdout.python
else
echo "Test failed"
rm $basename.stdout.nodejs $basename.stdout.python
fi
set -e
done

View File

@ -1,6 +1,6 @@
{
"name": "@posthog/hogvm",
"version": "1.0.18",
"version": "1.0.20",
"description": "PostHog Hog Virtual Machine",
"types": "dist/index.d.ts",
"main": "dist/index.js",

View File

@ -13,7 +13,7 @@ const tuple = (array: any[]): any[] => {
return array
}
describe('HogQL Bytecode', () => {
describe('hogvm execute', () => {
test('execution results', async () => {
const globals = { properties: { foo: 'bar', nullValue: null } }
const options = { globals }
@ -133,6 +133,149 @@ describe('HogQL Bytecode', () => {
expect(() => execSync(bytecode2)).toThrow('Too many arguments')
})
test('memory limits 1', async () => {
// let string := 'banana'
// for (let i := 0; i < 100; i := i + 1) {
// string := string || string
// }
const bytecode: any[] = [
'_h',
32,
'banana',
33,
0,
33,
100,
36,
1,
15,
40,
18,
36,
0,
36,
0,
2,
'concat',
2,
37,
0,
33,
1,
36,
1,
6,
37,
1,
39,
-25,
35,
35,
]
await expect(execAsync(bytecode)).rejects.toThrow(
'Memory limit of 67108864 bytes exceeded. Tried to allocate 75497504 bytes.'
)
})
test('memory limits 2', async () => {
// // Printing recursive objects.
// let obj := {'key': 'value', 'key2': 'value2'}
// let str := 'na'
// for (let i := 0; i < 10000; i := i + 1) {
// if (i < 16) {
// str := str || str
// }
// obj[f'key_{i}'] := {
// 'wasted': 'memory: ' || str || ' batman!',
// 'something': obj, // something links to obj
// }
// }
const bytecode: any[] = [
'_h',
32,
'key',
32,
'value',
32,
'key2',
32,
'value2',
42,
2,
32,
'na',
33,
0,
33,
10000,
36,
2,
15,
40,
52,
33,
16,
36,
2,
15,
40,
9,
36,
1,
36,
1,
2,
'concat',
2,
37,
1,
36,
0,
36,
2,
32,
'key_',
2,
'concat',
2,
32,
'wasted',
32,
' batman!',
36,
1,
32,
'memory: ',
2,
'concat',
3,
32,
'something',
36,
0,
42,
2,
46,
33,
1,
36,
2,
6,
37,
2,
39,
-59,
35,
35,
35,
]
await expect(execAsync(bytecode)).rejects.toThrow(
'Memory limit of 67108864 bytes exceeded. Tried to allocate 67155164 bytes.'
)
})
test('should execute user-defined stringify function correctly', async () => {
const functions = {
stringify: (arg: any) => {
@ -383,6 +526,7 @@ describe('HogQL Bytecode', () => {
callStack: [],
declaredFunctions: {},
ip: 8,
maxMemUsed: 16,
ops: 3,
stack: [4.2],
syncDuration: 0,

View File

@ -0,0 +1,32 @@
import { calculateCost } from '../utils'
const PTR_COST = 8
describe('hogvm utils', () => {
test('calculateCost', async () => {
expect(calculateCost(1)).toBe(PTR_COST)
expect(calculateCost('hello')).toBe(PTR_COST + 5)
expect(calculateCost(true)).toBe(PTR_COST)
expect(calculateCost(null)).toBe(PTR_COST)
expect(calculateCost([])).toBe(PTR_COST)
expect(calculateCost([1])).toBe(PTR_COST * 2)
expect(calculateCost(['hello'])).toBe(PTR_COST * 2 + 5)
expect(calculateCost({})).toBe(PTR_COST)
expect(calculateCost({ key: 'value' })).toBe(PTR_COST * 3 + 3 + 5)
expect(calculateCost(new Map([['key', 'value']]))).toBe(PTR_COST * 3 + 3 + 5)
expect(
calculateCost(
new Map<any, any>([
['key', 'value'],
['key2', new Map<any, any>([['key', 'value']])],
])
)
).toBe(PTR_COST * 7 + 3 + 5 + 4 + 3 + 5)
})
test('calculateCost with cycles', async () => {
const obj: Record<string, any> = {}
obj['key'] = obj
expect(calculateCost(obj)).toBe(PTR_COST * 3 + 3)
})
})

View File

@ -2,10 +2,12 @@ import RE2 from 're2'
import { Operation } from './operation'
import { ASYNC_STL, STL } from './stl/stl'
import { convertHogToJS, convertJSToHog, getNestedValue, like, setNestedValue } from './utils'
import { calculateCost, convertHogToJS, convertJSToHog, getNestedValue, like, setNestedValue } from './utils'
const DEFAULT_MAX_ASYNC_STEPS = 100
const DEFAULT_MAX_MEMORY = 64 * 1024 * 1024 // 64 MB
const DEFAULT_TIMEOUT_MS = 5000 // ms
const MAX_FUNCTION_ARGS_LENGTH = 300
export interface VMState {
/** Bytecode running in the VM */
@ -24,6 +26,8 @@ export interface VMState {
asyncSteps: number
/** Combined duration of sync steps */
syncDuration: number
/** Max memory used */
maxMemUsed: number
}
export interface ExecOptions {
@ -35,6 +39,8 @@ export interface ExecOptions {
timeout?: number
/** Max number of async function that can happen. When reached the function will throw */
maxAsyncSteps?: number
/** Memory limit in bytes. This is calculated based on the size of the VM stack. */
memoryLimit?: number
}
export interface ExecResult {
@ -45,9 +51,6 @@ export interface ExecResult {
state?: VMState
}
/** Maximum function arguments allowed */
const MAX_ARGS_LENGTH = 300
export function execSync(bytecode: any[], options?: ExecOptions): any {
const response = exec(bytecode, options)
if (response.finished) {
@ -109,8 +112,12 @@ export function exec(code: any[] | VMState, options?: ExecOptions): ExecResult {
const asyncSteps = vmState ? vmState.asyncSteps : 0
const syncDuration = vmState ? vmState.syncDuration : 0
const stack: any[] = vmState ? vmState.stack : []
const memStack: number[] = stack.map((s) => calculateCost(s))
const callStack: [number, number, number][] = vmState ? vmState.callStack : []
const declaredFunctions: Record<string, [number, number]> = vmState ? vmState.declaredFunctions : {}
let memUsed = memStack.reduce((acc, val) => acc + val, 0)
let maxMemUsed = Math.max(vmState ? vmState.maxMemUsed : 0, memUsed)
const memLimit = options?.memoryLimit ?? DEFAULT_MAX_MEMORY
let ip = vmState ? vmState.ip : 1
let ops = vmState ? vmState.ops : 0
const timeout = options?.timeout ?? DEFAULT_TIMEOUT_MS
@ -120,15 +127,36 @@ export function exec(code: any[] | VMState, options?: ExecOptions): ExecResult {
if (stack.length === 0) {
throw new Error('Invalid HogQL bytecode, stack is empty')
}
memUsed -= memStack.pop() ?? 0
return stack.pop()
}
/**
 * Push a value onto the VM stack and record its cost in the parallel memStack.
 *
 * Updates the running and peak memory counters, and throws when the running
 * total exceeds memLimit (a memLimit of 0 or less disables the check). The
 * cost is recorded before the check, so the reported byte count includes the
 * value being pushed.
 *
 * @returns the new length of the stack, as returned by Array.prototype.push
 */
function pushStack(value: any): any {
    const cost = calculateCost(value)
    memStack.push(cost)
    memUsed += cost
    maxMemUsed = Math.max(maxMemUsed, memUsed)
    if (memLimit > 0 && memUsed > memLimit) {
        throw new Error(`Memory limit of ${memLimit} bytes exceeded. Tried to allocate ${memUsed} bytes.`)
    }
    return stack.push(value)
}
/**
 * Remove `deleteCount` entries starting at `start` from both the VM stack and
 * the parallel memStack, and subtract the removed costs from memUsed.
 *
 * @returns the values removed from the stack
 */
function spliceStack2(start: number, deleteCount?: number): any[] {
    const freedCosts = memStack.splice(start, deleteCount)
    memUsed -= freedCosts.reduce((total, cost) => total + cost, 0)
    return stack.splice(start, deleteCount)
}
/**
 * Remove everything from `start` to the end of both the VM stack and the
 * parallel memStack, and subtract the removed costs from memUsed.
 *
 * @returns the values removed from the stack
 */
function spliceStack1(start: number): any[] {
    const freedCosts = memStack.splice(start)
    memUsed -= freedCosts.reduce((total, cost) => total + cost, 0)
    return stack.splice(start)
}
/**
 * Advance the instruction pointer by one and return the bytecode entry it now points at.
 *
 * @throws Error when the pointer is already at (or past) the last bytecode entry
 */
function next(): any {
    const lastIndex = bytecode!.length - 1
    if (ip >= lastIndex) {
        throw new Error('Unexpected end of bytecode')
    }
    ip += 1
    return bytecode![ip]
}
function checkTimeout(): void {
if (syncDuration + Date.now() - startTime > timeout) {
throw new Error(`Execution timed out after ${timeout / 1000} seconds. Performed ${ops} ops.`)
@ -144,110 +172,110 @@ export function exec(code: any[] | VMState, options?: ExecOptions): ExecResult {
case null:
break
case Operation.STRING:
stack.push(next())
pushStack(next())
break
case Operation.FLOAT:
stack.push(next())
pushStack(next())
break
case Operation.INTEGER:
stack.push(next())
pushStack(next())
break
case Operation.TRUE:
stack.push(true)
pushStack(true)
break
case Operation.FALSE:
stack.push(false)
pushStack(false)
break
case Operation.NULL:
stack.push(null)
pushStack(null)
break
case Operation.NOT:
stack.push(!popStack())
pushStack(!popStack())
break
case Operation.AND:
stack.push(
Array(next())
.fill(null)
.map(() => popStack())
.every(Boolean)
)
temp = next()
temp2 = true
for (let i = 0; i < temp; i++) {
temp2 = !!popStack() && temp2
}
pushStack(temp2)
break
case Operation.OR:
stack.push(
Array(next())
.fill(null)
.map(() => popStack())
.some(Boolean)
)
temp = next()
temp2 = false
for (let i = 0; i < temp; i++) {
temp2 = !!popStack() || temp2
}
pushStack(temp2)
break
case Operation.PLUS:
stack.push(Number(popStack()) + Number(popStack()))
pushStack(Number(popStack()) + Number(popStack()))
break
case Operation.MINUS:
stack.push(Number(popStack()) - Number(popStack()))
pushStack(Number(popStack()) - Number(popStack()))
break
case Operation.DIVIDE:
stack.push(Number(popStack()) / Number(popStack()))
pushStack(Number(popStack()) / Number(popStack()))
break
case Operation.MULTIPLY:
stack.push(Number(popStack()) * Number(popStack()))
pushStack(Number(popStack()) * Number(popStack()))
break
case Operation.MOD:
stack.push(Number(popStack()) % Number(popStack()))
pushStack(Number(popStack()) % Number(popStack()))
break
case Operation.EQ:
stack.push(popStack() === popStack())
pushStack(popStack() === popStack())
break
case Operation.NOT_EQ:
stack.push(popStack() !== popStack())
pushStack(popStack() !== popStack())
break
case Operation.GT:
stack.push(popStack() > popStack())
pushStack(popStack() > popStack())
break
case Operation.GT_EQ:
stack.push(popStack() >= popStack())
pushStack(popStack() >= popStack())
break
case Operation.LT:
stack.push(popStack() < popStack())
pushStack(popStack() < popStack())
break
case Operation.LT_EQ:
stack.push(popStack() <= popStack())
pushStack(popStack() <= popStack())
break
case Operation.LIKE:
stack.push(like(popStack(), popStack()))
pushStack(like(popStack(), popStack()))
break
case Operation.ILIKE:
stack.push(like(popStack(), popStack(), true))
pushStack(like(popStack(), popStack(), true))
break
case Operation.NOT_LIKE:
stack.push(!like(popStack(), popStack()))
pushStack(!like(popStack(), popStack()))
break
case Operation.NOT_ILIKE:
stack.push(!like(popStack(), popStack(), true))
pushStack(!like(popStack(), popStack(), true))
break
case Operation.IN:
temp = popStack()
stack.push(popStack().includes(temp))
pushStack(popStack().includes(temp))
break
case Operation.NOT_IN:
temp = popStack()
stack.push(!popStack().includes(temp))
pushStack(!popStack().includes(temp))
break
case Operation.REGEX:
temp = popStack()
stack.push(new RE2(popStack()).test(temp))
pushStack(new RE2(popStack()).test(temp))
break
case Operation.NOT_REGEX:
temp = popStack()
stack.push(!new RE2(popStack()).test(temp))
pushStack(!new RE2(popStack()).test(temp))
break
case Operation.IREGEX:
temp = popStack()
stack.push(new RE2(popStack(), 'i').test(temp))
pushStack(new RE2(popStack(), 'i').test(temp))
break
case Operation.NOT_IREGEX:
temp = popStack()
stack.push(!new RE2(popStack(), 'i').test(temp))
pushStack(!new RE2(popStack(), 'i').test(temp))
break
case Operation.GET_GLOBAL: {
const count = next()
@ -255,7 +283,7 @@ export function exec(code: any[] | VMState, options?: ExecOptions): ExecResult {
for (let i = 0; i < count; i++) {
chain.push(popStack())
}
stack.push(options?.globals ? convertJSToHog(getNestedValue(options.globals, chain)) : null)
pushStack(options?.globals ? convertJSToHog(getNestedValue(options.globals, chain)) : null)
break
}
case Operation.POP:
@ -265,8 +293,8 @@ export function exec(code: any[] | VMState, options?: ExecOptions): ExecResult {
if (callStack.length > 0) {
const [newIp, stackStart, _] = callStack.pop()!
const response = popStack()
stack.splice(stackStart)
stack.push(response)
spliceStack1(stackStart)
pushStack(response)
ip = newIp
break
} else {
@ -277,19 +305,23 @@ export function exec(code: any[] | VMState, options?: ExecOptions): ExecResult {
}
case Operation.GET_LOCAL:
temp = callStack.length > 0 ? callStack[callStack.length - 1][1] : 0
stack.push(stack[next() + temp])
pushStack(stack[next() + temp])
break
case Operation.SET_LOCAL:
temp = callStack.length > 0 ? callStack[callStack.length - 1][1] : 0
stack[next() + temp] = popStack()
temp = (callStack.length > 0 ? callStack[callStack.length - 1][1] : 0) + next()
stack[temp] = popStack()
temp2 = memStack[temp]
memStack[temp] = calculateCost(stack[temp])
memUsed += memStack[temp] - temp2
maxMemUsed = Math.max(maxMemUsed, memUsed)
break
case Operation.GET_PROPERTY:
temp = popStack() // property
stack.push(getNestedValue(popStack(), [temp]))
pushStack(getNestedValue(popStack(), [temp]))
break
case Operation.GET_PROPERTY_NULLISH:
temp = popStack() // property
stack.push(getNestedValue(popStack(), [temp], true))
pushStack(getNestedValue(popStack(), [temp], true))
break
case Operation.SET_PROPERTY:
temp = popStack() // value
@ -298,23 +330,23 @@ export function exec(code: any[] | VMState, options?: ExecOptions): ExecResult {
break
case Operation.DICT:
temp = next() * 2 // number of elements to remove from the stack
tempArray = stack.splice(stack.length - temp, temp)
tempArray = spliceStack2(stack.length - temp, temp)
tempMap = new Map()
for (let i = 0; i < tempArray.length; i += 2) {
tempMap.set(tempArray[i], tempArray[i + 1])
}
stack.push(tempMap)
pushStack(tempMap)
break
case Operation.ARRAY:
temp = next()
tempArray = stack.splice(stack.length - temp, temp)
stack.push(tempArray)
tempArray = spliceStack2(stack.length - temp, temp)
pushStack(tempArray)
break
case Operation.TUPLE:
temp = next()
tempArray = stack.splice(stack.length - temp, temp)
tempArray = spliceStack2(stack.length - temp, temp)
;(tempArray as any).__isHogTuple = true
stack.push(tempArray)
pushStack(tempArray)
break
case Operation.JUMP:
temp = next()
@ -353,14 +385,14 @@ export function exec(code: any[] | VMState, options?: ExecOptions): ExecResult {
if (temp > stack.length) {
throw new Error('Not enough arguments on the stack')
}
if (temp > MAX_ARGS_LENGTH) {
if (temp > MAX_FUNCTION_ARGS_LENGTH) {
throw new Error('Too many arguments')
}
const args = Array(temp)
.fill(null)
.map(() => popStack())
if (options?.functions && options.functions.hasOwnProperty(name) && options.functions[name]) {
stack.push(convertJSToHog(options.functions[name](...args.map(convertHogToJS))))
pushStack(convertJSToHog(options.functions[name](...args.map(convertHogToJS))))
} else if (
name !== 'toString' &&
((options?.asyncFunctions &&
@ -386,10 +418,11 @@ export function exec(code: any[] | VMState, options?: ExecOptions): ExecResult {
ops,
asyncSteps: asyncSteps + 1,
syncDuration: syncDuration + (Date.now() - startTime),
maxMemUsed,
},
} satisfies ExecResult
} else if (name in STL) {
stack.push(STL[name](args, name, timeout))
pushStack(STL[name](args, name, timeout))
} else {
throw new Error(`Unsupported function call: ${name}`)
}

View File

@ -39,34 +39,41 @@ export function escapeIdentifier(identifier: string | number): string {
.join('')}\``
}
export function printHogValue(obj: any): string {
if (Array.isArray(obj)) {
if ((obj as any).__isHogTuple) {
if (obj.length < 2) {
return `tuple(${obj.map(printHogValue).join(', ')})`
}
return `(${obj.map(printHogValue).join(', ')})`
} else {
return `[${obj.map(printHogValue).join(', ')}]`
}
}
if (obj instanceof Map) {
return `{${Array.from(obj.entries())
.map(([key, value]) => `${printHogValue(key)}: ${printHogValue(value)}`)
.join(', ')}}`
export function printHogValue(obj: any, marked: Set<any> | undefined = undefined): string {
if (!marked) {
marked = new Set()
}
if (typeof obj === 'object' && obj !== null) {
return `{${Object.entries(obj)
.map(([key, value]) => `${printHogValue(key)}: ${printHogValue(value)}`)
.join(', ')}}`
}
if (typeof obj === 'boolean') {
if (marked.has(obj)) {
return 'null'
}
marked.add(obj)
try {
if (Array.isArray(obj)) {
if ((obj as any).__isHogTuple) {
if (obj.length < 2) {
return `tuple(${obj.map((o) => printHogValue(o, marked)).join(', ')})`
}
return `(${obj.map((o) => printHogValue(o, marked)).join(', ')})`
}
return `[${obj.map((o) => printHogValue(o, marked)).join(', ')}]`
}
if (obj instanceof Map) {
return `{${Array.from(obj.entries())
.map(([key, value]) => `${printHogValue(key, marked)}: ${printHogValue(value, marked)}`)
.join(', ')}}`
}
return `{${Object.entries(obj)
.map(([key, value]) => `${printHogValue(key, marked)}: ${printHogValue(value, marked)}`)
.join(', ')}}`
} finally {
marked.delete(obj)
}
} else if (typeof obj === 'boolean') {
return obj ? 'true' : 'false'
}
if (obj === null) {
} else if (obj === null) {
return 'null'
}
if (typeof obj === 'string') {
} else if (typeof obj === 'string') {
return escapeString(obj)
}
return obj.toString()

View File

@ -80,21 +80,35 @@ export const STL: Record<string, (args: any[], name: string, timeout: number) =>
},
jsonStringify: (args) => {
// Recursively convert maps to objects
function convert(x: any): any {
if (x instanceof Map) {
const obj: Record<string, any> = {}
x.forEach((value, key) => {
obj[key] = convert(value)
})
return obj
} else if (typeof x === 'object' && Array.isArray(x)) {
return x.map(convert)
} else if (typeof x === 'object' && x !== null) {
const obj: Record<string, any> = {}
for (const key in x) {
obj[key] = convert(x[key])
function convert(x: any, marked?: Set<any>): any {
if (!marked) {
marked = new Set()
}
if (typeof x === 'object' && x !== null) {
if (marked.has(x)) {
return null
}
marked.add(x)
try {
if (x instanceof Map) {
const obj: Record<string, any> = {}
x.forEach((value, key) => {
obj[convert(key, marked)] = convert(value, marked)
})
return obj
}
if (typeof x === 'object' && Array.isArray(x)) {
return x.map((v) => convert(v, marked))
}
const obj: Record<string, any> = {}
for (const key in x) {
obj[key] = convert(x[key], marked)
}
return obj
} finally {
marked.delete(x)
}
return obj
}
return x
}

View File

@ -1,3 +1,6 @@
/** Fixed cost per object in memory */
const COST_PER_UNIT = 8
export function like(string: string, pattern: string, caseInsensitive = false): boolean {
pattern = String(pattern)
.replaceAll(/[-/\\^$*+?.()|[\]{}]/g, '\\$&')
@ -79,3 +82,40 @@ export function convertHogToJS(x: any): any {
}
return x
}
/**
 * Estimates the memory cost of a value by walking it recursively.
 *
 * Cost model, as implemented below:
 * - every value (primitive, null, undefined or container) costs `COST_PER_UNIT`;
 * - a string additionally costs its `length`;
 * - a `Map` adds the cost of each key and each value;
 * - an array adds the cost of each element;
 * - any other object adds the cost of each own enumerable key and its value.
 *
 * @param object - The value to measure.
 * @param marked - Objects on the current traversal path; callers normally omit it.
 *   An object that is already on the path (a circular reference) is charged a flat
 *   `COST_PER_UNIT` instead of being descended into again.
 * @returns The estimated cost as a number.
 */
export function calculateCost(object: any, marked: Set<any> | undefined = undefined): number {
    const visiting = marked ?? new Set<any>()

    if (typeof object === 'string') {
        return COST_PER_UNIT + object.length
    }
    if (typeof object !== 'object' || object === null) {
        return COST_PER_UNIT
    }
    if (visiting.has(object)) {
        // Circular reference: charge the reference only, do not recurse forever.
        return COST_PER_UNIT
    }

    visiting.add(object)
    try {
        if (object instanceof Map) {
            let total: number = COST_PER_UNIT
            for (const [key, value] of object) {
                total += calculateCost(key, visiting) + calculateCost(value, visiting)
            }
            return total
        }
        if (Array.isArray(object)) {
            // `reduce` skips holes in sparse arrays, so empty slots are not charged.
            return object.reduce(
                (total: number, item: unknown) => total + calculateCost(item, visiting),
                COST_PER_UNIT
            )
        }
        let total: number = COST_PER_UNIT
        for (const key of Object.keys(object)) {
            total += calculateCost(key, visiting) + calculateCost(object[key], visiting)
        }
        return total
    } finally {
        // The set tracks the current path only, so an object that is referenced from
        // several places (without a cycle) is counted once per reference.
        visiting.delete(object)
    }
}

View File

@ -50,7 +50,7 @@
"@google-cloud/storage": "^5.8.5",
"@maxmind/geoip2-node": "^3.4.0",
"@posthog/clickhouse": "^1.7.0",
"@posthog/hogvm": "^1.0.18",
"@posthog/hogvm": "^1.0.20",
"@posthog/plugin-scaffold": "1.4.4",
"@sentry/node": "^7.49.0",
"@sentry/profiling-node": "^0.3.0",

View File

@ -44,8 +44,8 @@ dependencies:
specifier: ^1.7.0
version: 1.7.0
'@posthog/hogvm':
specifier: ^1.0.18
version: 1.0.18(re2@1.20.3)
specifier: ^1.0.20
version: 1.0.20(re2@1.20.3)
'@posthog/plugin-scaffold':
specifier: 1.4.4
version: 1.4.4
@ -3110,8 +3110,8 @@ packages:
engines: {node: '>=12'}
dev: false
/@posthog/hogvm@1.0.18(re2@1.20.3):
resolution: {integrity: sha512-h0C9AlpfDRYlSzmXFOZKXR5x5UD+sgXkiQ6CUCoBQX0TjxMYE7hU1lZ7cgWQTYWJwSHeLp5RTUN0BjRDEFhj/Q==}
/@posthog/hogvm@1.0.20(re2@1.20.3):
resolution: {integrity: sha512-NSy4EbjR0SyNCkHHA2wU7psI9oVQtEkxl1Tr8NBYxa2QBnmAS+yWcST0MhFIX38GrkYVWTsgrZUeSnEQTsvMxg==}
peerDependencies:
re2: ^1.21.3
dependencies: