mirror of https://github.com/PostHog/posthog.git synced 2024-11-21 13:39:22 +01:00

feat: cyclotron (#24228)

Co-authored-by: Brett Hoerner <brett@bretthoerner.com>
Co-authored-by: Ben White <ben@posthog.com>
Oliver Browne 2024-08-21 21:24:56 +03:00 committed by GitHub
parent e1def6e3c1
commit 9734a40c96
106 changed files with 6298 additions and 42 deletions

View File

@ -39,3 +39,11 @@
!test-runner-jest.config.js
!test-runner-jest-environment.js
!patches
!rust
rust/.env
rust/.github
rust/docker
rust/target
rust/cyclotron-node/dist
rust/cyclotron-node/node_modules
rust/cyclotron-node/index.node

2
.gitignore vendored
View File

@ -64,3 +64,5 @@ plugin-transpiler/dist
*-esbuild-bundle-visualization.html
.dlt
*.db
# Ignore any log files that happen to be present
*.log

24
bin/start-cyclotron Executable file
View File

@ -0,0 +1,24 @@
#!/bin/bash
set -ex
trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM EXIT
cd rust
cargo build
export RUST_LOG=${DEBUG:-debug}
SQLX_QUERY_LEVEL=${SQLX_QUERY_LEVEL:-warn}
export RUST_LOG=$RUST_LOG,sqlx::query=$SQLX_QUERY_LEVEL
export DATABASE_URL=${DATABASE_URL:-postgres://posthog:posthog@localhost:5432/posthog}
export ALLOW_INTERNAL_IPS=${ALLOW_INTERNAL_IPS:-true}
cd cyclotron-core
cargo sqlx migrate run
cd ..
./target/debug/cyclotron-fetch &
./target/debug/cyclotron-janitor &
wait

View File

@ -27,7 +27,9 @@
"services:start": "cd .. && docker compose -f docker-compose.dev.yml up",
"services:stop": "cd .. && docker compose -f docker-compose.dev.yml down",
"services:clean": "cd .. && docker compose -f docker-compose.dev.yml rm -v",
"services": "pnpm services:stop && pnpm services:clean && pnpm services:start"
"services": "pnpm services:stop && pnpm services:clean && pnpm services:start",
"build:cyclotron": "cd ../rust/cyclotron-node && pnpm run package",
"pnpm:devPreinstall": "pnpm run build:cyclotron"
},
"graphile-worker": {
"maxContiguousErrors": 300
@ -86,7 +88,8 @@
"uuid": "^9.0.1",
"v8-profiler-next": "^1.9.0",
"vm2": "3.9.18",
"detect-browser": "^5.3.0"
"detect-browser": "^5.3.0",
"@posthog/cyclotron": "file:../rust/cyclotron-node"
},
"devDependencies": {
"0x": "^5.5.0",

View File

@ -43,6 +43,9 @@ dependencies:
'@posthog/clickhouse':
specifier: ^1.7.0
version: 1.7.0
'@posthog/cyclotron':
specifier: file:../rust/cyclotron-node
version: file:../rust/cyclotron-node
'@posthog/hogvm':
specifier: ^1.0.32
version: 1.0.32(luxon@3.4.4)(re2@1.20.3)
@ -10731,3 +10734,8 @@ packages:
/yocto-queue@0.1.0:
resolution: {integrity: sha512-rVksvsnNCdJ/ohGc6xgPwyN8eheCxsiLM8mxuE/t/mOVqJewPuO1miLpTHQiRgTKCLexL4MeAFVagts7HmNZ2Q==}
engines: {node: '>=10'}
file:../rust/cyclotron-node:
resolution: {directory: ../rust/cyclotron-node, type: directory}
name: '@posthog/cyclotron'
dev: false

View File

@ -26,6 +26,7 @@ export function getPluginServerCapabilities(config: PluginsServerConfig): Plugin
cdpProcessedEvents: true,
cdpFunctionCallbacks: true,
cdpFunctionOverflow: true,
cdpCyclotronWorker: true,
syncInlinePlugins: true,
...sharedCapabilities,
}
@ -108,6 +109,11 @@ export function getPluginServerCapabilities(config: PluginsServerConfig): Plugin
cdpFunctionOverflow: true,
...sharedCapabilities,
}
case PluginServerMode.cdp_cyclotron_worker:
return {
cdpCyclotronWorker: true,
...sharedCapabilities,
}
// This is only for functional tests, which time out if all capabilities are used
// ideally we'd run just the specific capability needed per test, but that's not easy to do atm
case PluginServerMode.functional_tests:

View File

@ -1,3 +1,4 @@
import cyclotron from '@posthog/cyclotron'
import { Histogram } from 'prom-client'
import { buildIntegerMatcher } from '../config/config'
@ -27,9 +28,11 @@ export type AsyncFunctionExecutorOptions = {
export class AsyncFunctionExecutor {
hogHookEnabledForTeams: ValueMatcher<number>
cyclotronEnabledForTeams: ValueMatcher<number>
constructor(private serverConfig: PluginsServerConfig, private rustyHook: RustyHook) {
this.hogHookEnabledForTeams = buildIntegerMatcher(serverConfig.CDP_ASYNC_FUNCTIONS_RUSTY_HOOK_TEAMS, true)
this.cyclotronEnabledForTeams = buildIntegerMatcher(serverConfig.CDP_ASYNC_FUNCTIONS_CYCLOTRON_TEAMS, true)
}
async execute(
@ -99,8 +102,44 @@ export class AsyncFunctionExecutor {
histogramFetchPayloadSize.observe(body.length / 1024)
}
// If the caller hasn't forced it to be synchronous and the team has the rustyhook enabled, enqueue it
if (!options?.sync && this.hogHookEnabledForTeams(request.teamId)) {
// If the caller hasn't forced it to be synchronous and the team has the cyclotron or
// rustyhook enabled, enqueue it in one of those services.
if (!options?.sync && this.cyclotronEnabledForTeams(request.teamId)) {
try {
await cyclotron.createJob({
teamId: request.teamId,
functionId: request.hogFunctionId,
queueName: 'fetch',
// TODO: The async function compression changes happen upstream of this
// function. I guess we'll want to unwind that change because we actually
// want the `vmState` (and the rest of the state) so we can put it into PG here.
vmState: '',
parameters: JSON.stringify({
return_queue: 'hog',
url,
method,
headers,
body,
}),
metadata: JSON.stringify({
// TODO: It seems like Fetch expects metadata to have this shape, which
// I don't understand. I think `metadata` is where all the other Hog
// state is going to be stored? For now I'm just trying to make fetch
// work.
tries: 0,
trace: [],
}),
})
} catch (e) {
status.error(
'🦔',
`[HogExecutor] Cyclotron failed to enqueue async fetch function, sending directly instead`,
{
error: e,
}
)
}
} else if (!options?.sync && this.hogHookEnabledForTeams(request.teamId)) {
const hoghooksPayload = JSON.stringify(request)
histogramHogHooksPayloadSize.observe(hoghooksPayload.length / 1024)
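For reference, a hedged sketch of what the cyclotron.createJob call above maps to on the Rust side, using cyclotron-core's JobInit and create_job (both introduced later in this commit). The pool handle, team/function values, and the example URL are placeholders, and the cyclotron-core items are assumed to be in scope via the crate's exports.

use chrono::Utc;
use sqlx::PgPool;
use uuid::Uuid;

async fn enqueue_fetch(pool: &PgPool, team_id: i32, function_id: Uuid) -> Result<(), QueueError> {
    // Mirrors the Node-side createJob call: a 'fetch' job whose parameters tell the fetch
    // worker where to send the request and which queue to return the result to.
    let job = JobInit {
        team_id,
        queue_name: "fetch".to_string(),
        priority: 0,           // lower numbers are dequeued first
        scheduled: Utc::now(), // eligible to run immediately
        function_id: Some(function_id),
        vm_state: None, // left empty for now, as in the TS call above
        parameters: Some(r#"{"return_queue":"hog","url":"https://example.com","method":"POST"}"#.to_string()),
        metadata: Some(r#"{"tries":0,"trace":[]}"#.to_string()),
    };
    create_job(pool, job).await
}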

View File

@ -1,3 +1,4 @@
import cyclotron from '@posthog/cyclotron'
import { captureException } from '@sentry/node'
import { features, librdkafkaVersion, Message } from 'node-rdkafka'
import { Counter, Histogram } from 'prom-client'
@ -443,7 +444,12 @@ abstract class CdpConsumerBase {
const globalConnectionConfig = createRdConnectionConfigFromEnvVars(this.hub)
const globalProducerConfig = createRdProducerConfigFromEnvVars(this.hub)
await Promise.all([this.hogFunctionManager.start()])
await Promise.all([
this.hogFunctionManager.start(),
this.hub.CYCLOTRON_DATABASE_URL
? cyclotron.initManager({ shards: [{ dbUrl: this.hub.CYCLOTRON_DATABASE_URL }] })
: Promise.resolve(),
])
this.kafkaProducer = new KafkaProducerWrapper(
await createKafkaProducer(globalConnectionConfig, globalProducerConfig)
@ -693,3 +699,57 @@ export class CdpOverflowConsumer extends CdpConsumerBase {
return invocationGlobals
}
}
// TODO: Split out non-Kafka specific parts of CdpConsumerBase so that it can be used by the
// Cyclotron worker below. Or maybe we can just wait, and rip the Kafka bits out once Cyclotron is
// shipped (and rename it something other than consumer, probably). For now, this is an easy way to
// use existing code and get an end-to-end demo shipped.
export class CdpCyclotronWorker extends CdpConsumerBase {
protected name = 'CdpCyclotronWorker'
protected topic = 'UNUSED-CdpCyclotronWorker'
protected consumerGroupId = 'UNUSED-CdpCyclotronWorker'
private runningWorker: Promise<void> | undefined
private isUnhealthy = false
public async _handleEachBatch(_: Message[]): Promise<void> {
// Not called, we override `start` below to use Cyclotron instead.
}
private async innerStart() {
try {
const limit = 100 // TODO: Make configurable.
while (!this.isStopping) {
const jobs = await cyclotron.dequeueJobsWithVmState('hog', limit)
for (const job of jobs) {
// TODO: Reassemble a HogFunctionInvocationAsyncResponse (or whatever proper type)
// from the fields on the job, and then execute the next Hog step.
console.log(job.id)
}
}
} catch (err) {
this.isUnhealthy = true
console.error('Error in Cyclotron worker', err)
throw err
}
}
public async start() {
await cyclotron.initManager({ shards: [{ dbUrl: this.hub.CYCLOTRON_DATABASE_URL }] })
await cyclotron.initWorker({ dbUrl: this.hub.CYCLOTRON_DATABASE_URL })
// Consumer `start` expects that an async task is started, not that `start` itself blocks
// indefinitely.
this.runningWorker = this.innerStart()
return Promise.resolve()
}
public async stop() {
await super.stop()
await this.runningWorker
}
public isHealthy() {
return !this.isUnhealthy
}
}
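For comparison, a hedged sketch of the same dequeue/flush loop written directly against cyclotron-core (dequeue_with_vm_state, JobUpdate, JobState, and flush_job are all defined later in this commit). The pool handle, the batch size, and marking every job Completed are illustration-only assumptions; a real worker would execute the Hog step where the placeholder comment sits.

async fn hog_worker_loop(pool: sqlx::PgPool) -> Result<(), QueueError> {
    loop {
        // Dequeue up to 100 'hog' jobs along with their VM state, locking each with a fresh lock_id.
        let jobs = dequeue_with_vm_state(&pool, "hog", 100).await?;
        for job in jobs {
            // ... run the next Hog step here, using job.vm_state / job.parameters ...
            let lock_id = job.lock_id.expect("dequeued jobs always carry a lock_id");
            let mut update = JobUpdate::new(lock_id);
            update.state = Some(JobState::Completed); // flushing a non-running state releases the job
            // Assumes QueueError converts from sqlx::Error, as the core helpers rely on.
            let mut conn = pool.acquire().await?;
            flush_job(&mut *conn, job.id, update).await?;
        }
    }
}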

View File

@ -187,9 +187,13 @@ export function getDefaultConfig(): PluginsServerConfig {
CDP_WATCHER_REFILL_RATE: 10,
CDP_WATCHER_DISABLED_TEMPORARY_MAX_COUNT: 3,
CDP_ASYNC_FUNCTIONS_RUSTY_HOOK_TEAMS: '',
CDP_ASYNC_FUNCTIONS_CYCLOTRON_TEAMS: '',
CDP_REDIS_PASSWORD: '',
CDP_REDIS_HOST: '',
CDP_REDIS_PORT: 6479,
// Cyclotron
CYCLOTRON_DATABASE_URL: '',
}
}

View File

@ -11,7 +11,12 @@ import v8Profiler from 'v8-profiler-next'
import { getPluginServerCapabilities } from '../capabilities'
import { CdpApi } from '../cdp/cdp-api'
import { CdpFunctionCallbackConsumer, CdpOverflowConsumer, CdpProcessedEventsConsumer } from '../cdp/cdp-consumers'
import {
CdpCyclotronWorker,
CdpFunctionCallbackConsumer,
CdpOverflowConsumer,
CdpProcessedEventsConsumer,
} from '../cdp/cdp-consumers'
import { defaultConfig, sessionRecordingConsumerConfig } from '../config/config'
import { Hub, PluginServerCapabilities, PluginsServerConfig } from '../types'
import { createHub, createKafkaClient, createKafkaProducerWrapper } from '../utils/db/hub'
@ -571,6 +576,17 @@ export async function startPluginsServer(
healthChecks['cdp-overflow'] = () => consumer.isHealthy() ?? false
}
if (capabilities.cdpCyclotronWorker) {
;[hub, closeHub] = hub ? [hub, closeHub] : await createHub(serverConfig, capabilities)
if (hub.CYCLOTRON_DATABASE_URL) {
const worker = new CdpCyclotronWorker(hub)
await worker.start()
} else {
// This is a temporary solution until we *require* Cyclotron to be configured.
status.warn('💥', 'CYCLOTRON_DATABASE_URL is not set, not running Cyclotron worker')
}
}
if (capabilities.http) {
const app = setupCommonRoutes(healthChecks, analyticsEventsIngestionConsumer)

View File

@ -85,6 +85,7 @@ export enum PluginServerMode {
cdp_processed_events = 'cdp-processed-events',
cdp_function_callbacks = 'cdp-function-callbacks',
cdp_function_overflow = 'cdp-function-overflow',
cdp_cyclotron_worker = 'cdp-cyclotron-worker',
functional_tests = 'functional-tests',
}
@ -107,6 +108,7 @@ export type CdpConfig = {
CDP_WATCHER_DISABLED_TEMPORARY_TTL: number // How long a function should be temporarily disabled for
CDP_WATCHER_DISABLED_TEMPORARY_MAX_COUNT: number // How many times a function can be disabled before it is disabled permanently
CDP_ASYNC_FUNCTIONS_RUSTY_HOOK_TEAMS: string
CDP_ASYNC_FUNCTIONS_CYCLOTRON_TEAMS: string
CDP_REDIS_HOST: string
CDP_REDIS_PORT: number
CDP_REDIS_PASSWORD: string
@ -279,6 +281,8 @@ export interface PluginsServerConfig extends CdpConfig {
// kafka debug stats interval
SESSION_RECORDING_KAFKA_CONSUMPTION_STATISTICS_EVENT_INTERVAL_MS: number
CYCLOTRON_DATABASE_URL: string
}
export interface Hub extends PluginsServerConfig {
@ -345,6 +349,7 @@ export interface PluginServerCapabilities {
cdpProcessedEvents?: boolean
cdpFunctionCallbacks?: boolean
cdpFunctionOverflow?: boolean
cdpCyclotronWorker?: boolean
appManagementSingleton?: boolean
preflightSchedules?: boolean // Used for instance health checks on hobby deploy, not useful on cloud
http?: boolean

View File

@ -97,6 +97,7 @@ describe('server', () => {
cdpProcessedEvents: true,
cdpFunctionCallbacks: true,
cdpFunctionOverflow: true,
cdpCyclotronWorker: true,
syncInlinePlugins: true,
}
)

View File

@ -38,11 +38,12 @@ COPY ./bin/ ./bin/
COPY babel.config.js tsconfig.json webpack.config.js tailwind.config.js ./
RUN pnpm build
#
# ---------------------------------------------------------
#
FROM node:18.19.1-bullseye-slim AS plugin-server-build
FROM ghcr.io/posthog/rust-node-container:bullseye_rust_1.80.1-node_18.19.1 AS plugin-server-build
WORKDIR /code
COPY ./rust ./rust
WORKDIR /code/plugin-server
SHELL ["/bin/bash", "-e", "-o", "pipefail", "-c"]
@ -182,6 +183,7 @@ COPY --from=plugin-server-build --chown=posthog:posthog /code/plugin-server/dist
COPY --from=plugin-server-build --chown=posthog:posthog /code/plugin-server/node_modules /code/plugin-server/node_modules
COPY --from=plugin-server-build --chown=posthog:posthog /code/plugin-server/package.json /code/plugin-server/package.json
# Copy the Python dependencies and Django staticfiles from the posthog-build stage.
COPY --from=posthog-build --chown=posthog:posthog /code/staticfiles /code/staticfiles
COPY --from=posthog-build --chown=posthog:posthog /python-runtime /python-runtime

4
rust/.cargo/config.toml Normal file
View File

@ -0,0 +1,4 @@
[env]
# Force SQLX to run in offline mode for CI. Devs can change this if they want to develop against a live DB,
# but we set it at the workspace level here to allow use of sqlx macros across all crates
SQLX_OFFLINE = "true"

View File

@ -0,0 +1,12 @@
{
"db_name": "PostgreSQL",
"query": "UPDATE cyclotron_jobs SET last_heartbeat = NOW() WHERE id = $1 AND lock_id = $2",
"describe": {
"columns": [],
"parameters": {
"Left": ["Uuid", "Uuid"]
},
"nullable": []
},
"hash": "075421be22b51c50eb74ac1156175c285bc510766c175b1b8c4e4002e04ff503"
}

View File

@ -0,0 +1,12 @@
{
"db_name": "PostgreSQL",
"query": "UPDATE cyclotron_jobs SET vm_state = $1 WHERE id = $2 AND lock_id = $3",
"describe": {
"columns": [],
"parameters": {
"Left": ["Text", "Uuid", "Uuid"]
},
"nullable": []
},
"hash": "16d533b5a15b0b9926a181f578b5b577efe424710b45f02e1ddeece8bca96f87"
}

View File

@ -0,0 +1,12 @@
{
"db_name": "PostgreSQL",
"query": "UPDATE cyclotron_jobs SET queue_name = $1 WHERE id = $2 AND lock_id = $3",
"describe": {
"columns": [],
"parameters": {
"Left": ["Text", "Uuid", "Uuid"]
},
"nullable": []
},
"hash": "2b62adf40f8dd5758690c763df30fffa01b679951ec786f8ee2410454b9a2de0"
}

View File

@ -0,0 +1,12 @@
{
"db_name": "PostgreSQL",
"query": "UPDATE cyclotron_jobs SET priority = $1 WHERE id = $2 AND lock_id = $3",
"describe": {
"columns": [],
"parameters": {
"Left": ["Int2", "Uuid", "Uuid"]
},
"nullable": []
},
"hash": "2f6de0977357909dfd8d3d510c39a284f16421f77b77fe38e67143f28e270805"
}

View File

@ -0,0 +1,117 @@
{
"db_name": "PostgreSQL",
"query": "\nWITH available AS (\n SELECT\n id,\n state\n FROM cyclotron_jobs\n WHERE\n state = 'available'::JobState\n AND queue_name = $1\n AND scheduled <= NOW()\n ORDER BY\n priority ASC,\n scheduled ASC\n LIMIT $2\n FOR UPDATE SKIP LOCKED\n)\nUPDATE cyclotron_jobs\nSET\n state = 'running'::JobState,\n lock_id = $3,\n last_heartbeat = NOW(),\n last_transition = NOW(),\n transition_count = transition_count + 1\nFROM available\nWHERE\n cyclotron_jobs.id = available.id\nRETURNING\n cyclotron_jobs.id,\n team_id,\n available.state as \"state: JobState\",\n queue_name,\n priority,\n function_id,\n created,\n last_transition,\n scheduled,\n transition_count,\n NULL as vm_state,\n metadata,\n parameters,\n lock_id,\n last_heartbeat,\n janitor_touch_count\n ",
"describe": {
"columns": [
{
"ordinal": 0,
"name": "id",
"type_info": "Uuid"
},
{
"ordinal": 1,
"name": "team_id",
"type_info": "Int4"
},
{
"ordinal": 2,
"name": "state: JobState",
"type_info": {
"Custom": {
"name": "jobstate",
"kind": {
"Enum": ["available", "completed", "failed", "running", "paused"]
}
}
}
},
{
"ordinal": 3,
"name": "queue_name",
"type_info": "Text"
},
{
"ordinal": 4,
"name": "priority",
"type_info": "Int2"
},
{
"ordinal": 5,
"name": "function_id",
"type_info": "Uuid"
},
{
"ordinal": 6,
"name": "created",
"type_info": "Timestamptz"
},
{
"ordinal": 7,
"name": "last_transition",
"type_info": "Timestamptz"
},
{
"ordinal": 8,
"name": "scheduled",
"type_info": "Timestamptz"
},
{
"ordinal": 9,
"name": "transition_count",
"type_info": "Int2"
},
{
"ordinal": 10,
"name": "vm_state",
"type_info": "Text"
},
{
"ordinal": 11,
"name": "metadata",
"type_info": "Text"
},
{
"ordinal": 12,
"name": "parameters",
"type_info": "Text"
},
{
"ordinal": 13,
"name": "lock_id",
"type_info": "Uuid"
},
{
"ordinal": 14,
"name": "last_heartbeat",
"type_info": "Timestamptz"
},
{
"ordinal": 15,
"name": "janitor_touch_count",
"type_info": "Int2"
}
],
"parameters": {
"Left": ["Text", "Int8", "Uuid"]
},
"nullable": [
false,
false,
false,
false,
false,
true,
false,
false,
false,
false,
null,
true,
true,
true,
true,
false
]
},
"hash": "350983ef271029734aff70eb7e298bfe578ecaa8678268863bce917ced9d5d46"
}

View File

@ -0,0 +1,12 @@
{
"db_name": "PostgreSQL",
"query": "\nWITH stalled AS (\n SELECT id FROM cyclotron_jobs WHERE state = 'running' AND COALESCE(last_heartbeat, $1) <= $1 FOR UPDATE SKIP LOCKED\n)\nUPDATE cyclotron_jobs\nSET state = 'available', lock_id = NULL, last_heartbeat = NULL, janitor_touch_count = janitor_touch_count + 1\nFROM stalled\nWHERE cyclotron_jobs.id = stalled.id\n ",
"describe": {
"columns": [],
"parameters": {
"Left": ["Timestamptz"]
},
"nullable": []
},
"hash": "54d9afe6952f92b753fbce2c4e8554065b71152389f98d35532c6b332d5a4c9d"
}

View File

@ -0,0 +1,30 @@
{
"db_name": "PostgreSQL",
"query": "\nINSERT INTO cyclotron_jobs\n (\n id,\n team_id,\n function_id,\n created,\n lock_id,\n last_heartbeat,\n janitor_touch_count,\n transition_count,\n last_transition,\n queue_name,\n state,\n scheduled,\n priority,\n vm_state,\n metadata,\n parameters\n )\nVALUES\n ($1, $2, $3, NOW(), NULL, NULL, 0, 0, NOW(), $4, $5, $6, $7, $8, $9, $10)\n ",
"describe": {
"columns": [],
"parameters": {
"Left": [
"Uuid",
"Int4",
"Uuid",
"Text",
{
"Custom": {
"name": "jobstate",
"kind": {
"Enum": ["available", "completed", "failed", "running", "paused"]
}
}
},
"Timestamptz",
"Int2",
"Text",
"Text",
"Text"
]
},
"nullable": []
},
"hash": "7217e766aeb53298238222c0c71a2ce446cac731845c53cb926fc47ace708dd6"
}

View File

@ -0,0 +1,12 @@
{
"db_name": "PostgreSQL",
"query": "UPDATE cyclotron_jobs SET lock_id = NULL, last_heartbeat = NULL WHERE id = $1 AND lock_id = $2",
"describe": {
"columns": [],
"parameters": {
"Left": ["Uuid", "Uuid"]
},
"nullable": []
},
"hash": "884da9767d2992c7b279b4f8df5129b8c4d7020b7cb1999702aee1bfb1087efb"
}

View File

@ -0,0 +1,12 @@
{
"db_name": "PostgreSQL",
"query": "UPDATE cyclotron_jobs SET metadata = $1 WHERE id = $2 AND lock_id = $3",
"describe": {
"columns": [],
"parameters": {
"Left": ["Text", "Uuid", "Uuid"]
},
"nullable": []
},
"hash": "8ab11a89bc4720985e130c58021f46045c332cc45e69b08708b289cc933b3b5c"
}

View File

@ -0,0 +1,12 @@
{
"db_name": "PostgreSQL",
"query": "UPDATE cyclotron_jobs SET scheduled = $1 WHERE id = $2 AND lock_id = $3",
"describe": {
"columns": [],
"parameters": {
"Left": ["Timestamptz", "Uuid", "Uuid"]
},
"nullable": []
},
"hash": "98da1f12285a97a47ce88535c82a8c4eb4cb910b0c5ddbc6bdbd9156af7b4e23"
}

View File

@ -0,0 +1,18 @@
{
"db_name": "PostgreSQL",
"query": "SELECT vm_state FROM cyclotron_jobs WHERE id = $1 AND lock_id = $2",
"describe": {
"columns": [
{
"ordinal": 0,
"name": "vm_state",
"type_info": "Text"
}
],
"parameters": {
"Left": ["Uuid", "Uuid"]
},
"nullable": [true]
},
"hash": "aa595eaf28c1f4b872c278be407b59cc00f3125413f4032ac3647a6b5ee1a632"
}

View File

@ -0,0 +1,12 @@
{
"db_name": "PostgreSQL",
"query": "UPDATE cyclotron_jobs SET parameters = $1 WHERE id = $2 AND lock_id = $3",
"describe": {
"columns": [],
"parameters": {
"Left": ["Text", "Uuid", "Uuid"]
},
"nullable": []
},
"hash": "b160b785a0377b854341105e99e4ef7a5da523e168a5f9be47f6caaef09487d7"
}

View File

@ -0,0 +1,23 @@
{
"db_name": "PostgreSQL",
"query": "UPDATE cyclotron_jobs\n SET state = $1, last_transition = NOW(), transition_count = transition_count + 1\n WHERE id = $2 AND lock_id = $3",
"describe": {
"columns": [],
"parameters": {
"Left": [
{
"Custom": {
"name": "jobstate",
"kind": {
"Enum": ["available", "completed", "failed", "running", "paused"]
}
}
},
"Uuid",
"Uuid"
]
},
"nullable": []
},
"hash": "b3239c1dde9a88769ec488299612f7a3ad2b2ee57d8d2353c858299d7b6ffe13"
}

View File

@ -0,0 +1,117 @@
{
"db_name": "PostgreSQL",
"query": "\nWITH available AS (\n SELECT\n id,\n state\n FROM cyclotron_jobs\n WHERE\n state = 'available'::JobState\n AND queue_name = $1\n AND scheduled <= NOW()\n ORDER BY\n priority ASC,\n scheduled ASC\n LIMIT $2\n FOR UPDATE SKIP LOCKED\n)\nUPDATE cyclotron_jobs\nSET\n state = 'running'::JobState,\n lock_id = $3,\n last_heartbeat = NOW(),\n last_transition = NOW(),\n transition_count = transition_count + 1\nFROM available\nWHERE\n cyclotron_jobs.id = available.id\nRETURNING\n cyclotron_jobs.id,\n team_id,\n available.state as \"state: JobState\",\n queue_name,\n priority,\n function_id,\n created,\n last_transition,\n scheduled,\n transition_count,\n vm_state,\n metadata,\n parameters,\n lock_id,\n last_heartbeat,\n janitor_touch_count\n ",
"describe": {
"columns": [
{
"ordinal": 0,
"name": "id",
"type_info": "Uuid"
},
{
"ordinal": 1,
"name": "team_id",
"type_info": "Int4"
},
{
"ordinal": 2,
"name": "state: JobState",
"type_info": {
"Custom": {
"name": "jobstate",
"kind": {
"Enum": ["available", "completed", "failed", "running", "paused"]
}
}
}
},
{
"ordinal": 3,
"name": "queue_name",
"type_info": "Text"
},
{
"ordinal": 4,
"name": "priority",
"type_info": "Int2"
},
{
"ordinal": 5,
"name": "function_id",
"type_info": "Uuid"
},
{
"ordinal": 6,
"name": "created",
"type_info": "Timestamptz"
},
{
"ordinal": 7,
"name": "last_transition",
"type_info": "Timestamptz"
},
{
"ordinal": 8,
"name": "scheduled",
"type_info": "Timestamptz"
},
{
"ordinal": 9,
"name": "transition_count",
"type_info": "Int2"
},
{
"ordinal": 10,
"name": "vm_state",
"type_info": "Text"
},
{
"ordinal": 11,
"name": "metadata",
"type_info": "Text"
},
{
"ordinal": 12,
"name": "parameters",
"type_info": "Text"
},
{
"ordinal": 13,
"name": "lock_id",
"type_info": "Uuid"
},
{
"ordinal": 14,
"name": "last_heartbeat",
"type_info": "Timestamptz"
},
{
"ordinal": 15,
"name": "janitor_touch_count",
"type_info": "Int2"
}
],
"parameters": {
"Left": ["Text", "Int8", "Uuid"]
},
"nullable": [
false,
false,
false,
false,
false,
true,
false,
false,
false,
false,
true,
true,
true,
true,
true,
false
]
},
"hash": "c624261597b9356ff3e7c3e392a84bb0b551e91c503e8b21c29814f1eb660a8e"
}

View File

@ -0,0 +1,12 @@
{
"db_name": "PostgreSQL",
"query": "DELETE FROM cyclotron_jobs WHERE state = 'completed'",
"describe": {
"columns": [],
"parameters": {
"Left": []
},
"nullable": []
},
"hash": "f4e808f58dd290c6e2b49b63e9e0eb022936ba318021512a0cc0c2e0766abe7c"
}

View File

@ -0,0 +1,12 @@
{
"db_name": "PostgreSQL",
"query": "\nDELETE FROM cyclotron_jobs WHERE state = 'running' AND COALESCE(last_heartbeat, $1) <= $1 AND janitor_touch_count >= $2\n ",
"describe": {
"columns": [],
"parameters": {
"Left": ["Timestamptz", "Int2"]
},
"nullable": []
},
"hash": "fdda5a80f5495f2d4b15ce1a0963f990986c8b8433f01e449fbd1eee70ce6aeb"
}

View File

@ -0,0 +1,12 @@
{
"db_name": "PostgreSQL",
"query": "DELETE FROM cyclotron_jobs WHERE state = 'failed'",
"describe": {
"columns": [],
"parameters": {
"Left": []
},
"nullable": []
},
"hash": "ffb66bdedf6506f95b9293ef88b0c51e2f5fb7d3271e1287165d2a35b6aaa25e"
}

160
rust/Cargo.lock generated
View File

@ -673,6 +673,7 @@ dependencies = [
"iana-time-zone",
"js-sys",
"num-traits",
"serde",
"wasm-bindgen",
"windows-targets 0.52.0",
]
@ -700,6 +701,25 @@ dependencies = [
"tokio-util",
]
[[package]]
name = "common-dns"
version = "0.1.0"
dependencies = [
"futures",
"reqwest 0.12.3",
"tokio",
]
[[package]]
name = "common-metrics"
version = "0.1.0"
dependencies = [
"axum 0.7.5",
"metrics",
"metrics-exporter-prometheus",
"tokio",
]
[[package]]
name = "concurrent-queue"
version = "2.5.0"
@ -819,6 +839,80 @@ dependencies = [
"typenum",
]
[[package]]
name = "cyclotron-core"
version = "0.1.0"
dependencies = [
"chrono",
"futures",
"rand",
"serde",
"sqlx",
"thiserror",
"tokio",
"uuid",
]
[[package]]
name = "cyclotron-fetch"
version = "0.1.0"
dependencies = [
"axum 0.7.5",
"chrono",
"common-dns",
"common-metrics",
"cyclotron-core",
"envconfig",
"futures",
"health",
"http 1.1.0",
"httpmock",
"metrics",
"rand",
"reqwest 0.12.3",
"serde",
"serde_json",
"sqlx",
"thiserror",
"tokio",
"tracing",
"tracing-subscriber",
"uuid",
]
[[package]]
name = "cyclotron-janitor"
version = "0.1.0"
dependencies = [
"axum 0.7.5",
"chrono",
"common-metrics",
"cyclotron-core",
"envconfig",
"eyre",
"health",
"metrics",
"sqlx",
"tokio",
"tracing",
"tracing-subscriber",
"uuid",
]
[[package]]
name = "cyclotron-node"
version = "0.1.0"
dependencies = [
"chrono",
"cyclotron-core",
"neon",
"once_cell",
"serde",
"serde_json",
"tokio",
"uuid",
]
[[package]]
name = "dashmap"
version = "5.5.3"
@ -1468,6 +1562,7 @@ name = "hook-api"
version = "0.1.0"
dependencies = [
"axum 0.7.5",
"common-metrics",
"envconfig",
"eyre",
"hook-common",
@ -1489,13 +1584,10 @@ name = "hook-common"
version = "0.1.0"
dependencies = [
"async-trait",
"axum 0.7.5",
"chrono",
"envconfig",
"health",
"http 1.1.0",
"metrics",
"metrics-exporter-prometheus",
"rdkafka",
"reqwest 0.12.3",
"serde",
@ -1514,6 +1606,7 @@ version = "0.1.0"
dependencies = [
"async-trait",
"axum 0.7.5",
"common-metrics",
"envconfig",
"eyre",
"futures",
@ -1537,6 +1630,8 @@ version = "0.1.0"
dependencies = [
"axum 0.7.5",
"chrono",
"common-dns",
"common-metrics",
"envconfig",
"futures",
"health",
@ -1944,6 +2039,16 @@ version = "0.2.153"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd"
[[package]]
name = "libloading"
version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4979f22fdb869068da03c9f7528f8297c6fd2606bc3a4affe42e6a823fdb8da4"
dependencies = [
"cfg-if",
"windows-targets 0.52.0",
]
[[package]]
name = "libm"
version = "0.2.8"
@ -2160,6 +2265,32 @@ dependencies = [
"tempfile",
]
[[package]]
name = "neon"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7d75440242411c87dc39847b0e33e961ec1f10326a9d8ecf9c1ea64a3b3c13dc"
dependencies = [
"getrandom",
"libloading",
"neon-macros",
"once_cell",
"semver",
"send_wrapper",
"smallvec",
]
[[package]]
name = "neon-macros"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c6813fde79b646e47e7ad75f480aa80ef76a5d9599e2717407961531169ee38b"
dependencies = [
"quote",
"syn 2.0.48",
"syn-mid",
]
[[package]]
name = "new_debug_unreachable"
version = "1.0.6"
@ -3181,6 +3312,18 @@ dependencies = [
"libc",
]
[[package]]
name = "semver"
version = "1.0.23"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "61697e0a1c7e512e84a621326239844a24d8207b4669b41bc18b32ea5cbf988b"
[[package]]
name = "send_wrapper"
version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cd0b0ec5f1c1ca621c432a25813d8d60c88abe6d3e08a3eb9cf37d97a0fe3d73"
[[package]]
name = "serde"
version = "1.0.196"
@ -3660,6 +3803,17 @@ dependencies = [
"unicode-ident",
]
[[package]]
name = "syn-mid"
version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b5dc35bb08dd1ca3dfb09dce91fd2d13294d6711c88897d9a9d60acf39bce049"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.48",
]
[[package]]
name = "sync_wrapper"
version = "0.1.2"

View File

@ -4,11 +4,17 @@ resolver = "2"
members = [
"capture",
"common/health",
"common/metrics",
"common/dns",
"feature-flags",
"hook-api",
"hook-common",
"hook-janitor",
"hook-worker",
"cyclotron-core",
"cyclotron-node",
"cyclotron-janitor",
"cyclotron-fetch",
]
[workspace.lints.rust]
@ -34,7 +40,7 @@ axum = { version = "0.7.5", features = ["http2", "macros", "matched-path"] }
axum-client-ip = "0.6.0"
base64 = "0.22.0"
bytes = "1"
chrono = { version = "0.4" }
chrono = { version = "0.4", features = ["default", "serde"]}
envconfig = "0.10.0"
eyre = "0.6.9"
flate2 = "1.0"
@ -80,3 +86,4 @@ tracing-opentelemetry = "0.23.0"
tracing-subscriber = { version = "0.3.18", features = ["env-filter"] }
url = { version = "2.5.0 " }
uuid = { version = "1.6.1", features = ["v7", "serde"] }
neon = "1"

View File

@ -1,4 +1,4 @@
FROM docker.io/lukemathwalker/cargo-chef:latest-rust-1.77-bookworm AS chef
FROM docker.io/lukemathwalker/cargo-chef:latest-rust-1.80.1-bookworm AS chef
ARG BIN
WORKDIR /app

View File

@ -0,0 +1,12 @@
[package]
name = "common-dns"
version = "0.1.0"
edition = "2021"
[lints]
workspace = true
[dependencies]
futures = { workspace = true }
reqwest = { workspace = true }
tokio = { workspace = true }

View File

@ -86,7 +86,7 @@ impl Resolve for PublicIPv4Resolver {
#[cfg(test)]
mod tests {
use crate::dns::{NoPublicIPv4Error, PublicIPv4Resolver};
use crate::{NoPublicIPv4Error, PublicIPv4Resolver};
use reqwest::dns::{Name, Resolve};
use std::str::FromStr;

View File

@ -4,7 +4,7 @@ use std::sync::{Arc, RwLock};
use axum::http::StatusCode;
use axum::response::{IntoResponse, Response};
use time::Duration;
use std::time::Duration;
use tokio::sync::mpsc;
use tracing::{info, warn};
@ -143,7 +143,16 @@ impl HealthRegistry {
/// Registers a new component in the registry. The returned handle should be passed
/// to the component, to allow it to frequently report its health status.
pub async fn register(&self, component: String, deadline: Duration) -> HealthHandle {
pub async fn register<D>(&self, component: String, deadline: D) -> HealthHandle
where
// HACK: to let callers use time::Duration or std::time::Duration (and therefore chrono::Duration),
// since apparently we use all three
D: TryInto<Duration>,
{
let Ok(deadline) = deadline.try_into() else {
// TODO - I should return an error here, but I don't want to refactor everything that uses this right now
panic!("invalid deadline")
};
let handle = HealthHandle {
component,
deadline,
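A usage sketch of the relaxed bound, assuming a HealthRegistry value named registry is already in scope: a plain std::time::Duration now satisfies TryInto<Duration> directly, so callers no longer have to convert between the duration types mentioned in the HACK note above.

// Register a component with a 30-second deadline, passing std::time::Duration as-is.
let handle = registry
    .register("cyclotron-fetch".to_string(), std::time::Duration::from_secs(30))
    .await;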

View File

@ -0,0 +1,13 @@
[package]
name = "common-metrics"
version = "0.1.0"
edition = "2021"
[lints]
workspace = true
[dependencies]
axum = { workspace = true }
metrics-exporter-prometheus = { workspace = true }
tokio = { workspace = true }
metrics = { workspace = true }

View File

@ -0,0 +1 @@
Extracted from rusty-hook, since it'll be used across more or less all of the Cyclotron services, as well as rusty-hook itself

View File

@ -0,0 +1,12 @@
{
"db_name": "PostgreSQL",
"query": "UPDATE cyclotron_jobs SET last_heartbeat = NOW() WHERE id = $1 AND lock_id = $2",
"describe": {
"columns": [],
"parameters": {
"Left": ["Uuid", "Uuid"]
},
"nullable": []
},
"hash": "075421be22b51c50eb74ac1156175c285bc510766c175b1b8c4e4002e04ff503"
}

View File

@ -0,0 +1,12 @@
{
"db_name": "PostgreSQL",
"query": "UPDATE cyclotron_jobs SET vm_state = $1 WHERE id = $2 AND lock_id = $3",
"describe": {
"columns": [],
"parameters": {
"Left": ["Text", "Uuid", "Uuid"]
},
"nullable": []
},
"hash": "16d533b5a15b0b9926a181f578b5b577efe424710b45f02e1ddeece8bca96f87"
}

View File

@ -0,0 +1,18 @@
{
"db_name": "PostgreSQL",
"query": "SELECT COUNT(*) FROM cyclotron_jobs WHERE state = 'available' AND scheduled <= NOW()",
"describe": {
"columns": [
{
"ordinal": 0,
"name": "count",
"type_info": "Int8"
}
],
"parameters": {
"Left": []
},
"nullable": [null]
},
"hash": "213e9d70e145a01fb42d5c3a80f9126073113a4af03c4c9fd3a81004d898f883"
}

View File

@ -0,0 +1,12 @@
{
"db_name": "PostgreSQL",
"query": "UPDATE cyclotron_jobs SET queue_name = $1 WHERE id = $2 AND lock_id = $3",
"describe": {
"columns": [],
"parameters": {
"Left": ["Text", "Uuid", "Uuid"]
},
"nullable": []
},
"hash": "2b62adf40f8dd5758690c763df30fffa01b679951ec786f8ee2410454b9a2de0"
}

View File

@ -0,0 +1,12 @@
{
"db_name": "PostgreSQL",
"query": "UPDATE cyclotron_jobs SET priority = $1 WHERE id = $2 AND lock_id = $3",
"describe": {
"columns": [],
"parameters": {
"Left": ["Int2", "Uuid", "Uuid"]
},
"nullable": []
},
"hash": "2f6de0977357909dfd8d3d510c39a284f16421f77b77fe38e67143f28e270805"
}

View File

@ -0,0 +1,117 @@
{
"db_name": "PostgreSQL",
"query": "\nWITH available AS (\n SELECT\n id,\n state\n FROM cyclotron_jobs\n WHERE\n state = 'available'::JobState\n AND queue_name = $1\n AND scheduled <= NOW()\n ORDER BY\n priority ASC,\n scheduled ASC\n LIMIT $2\n FOR UPDATE SKIP LOCKED\n)\nUPDATE cyclotron_jobs\nSET\n state = 'running'::JobState,\n lock_id = $3,\n last_heartbeat = NOW(),\n last_transition = NOW(),\n transition_count = transition_count + 1\nFROM available\nWHERE\n cyclotron_jobs.id = available.id\nRETURNING\n cyclotron_jobs.id,\n team_id,\n available.state as \"state: JobState\",\n queue_name,\n priority,\n function_id,\n created,\n last_transition,\n scheduled,\n transition_count,\n NULL as vm_state,\n metadata,\n parameters,\n lock_id,\n last_heartbeat,\n janitor_touch_count\n ",
"describe": {
"columns": [
{
"ordinal": 0,
"name": "id",
"type_info": "Uuid"
},
{
"ordinal": 1,
"name": "team_id",
"type_info": "Int4"
},
{
"ordinal": 2,
"name": "state: JobState",
"type_info": {
"Custom": {
"name": "jobstate",
"kind": {
"Enum": ["available", "completed", "failed", "running", "paused"]
}
}
}
},
{
"ordinal": 3,
"name": "queue_name",
"type_info": "Text"
},
{
"ordinal": 4,
"name": "priority",
"type_info": "Int2"
},
{
"ordinal": 5,
"name": "function_id",
"type_info": "Uuid"
},
{
"ordinal": 6,
"name": "created",
"type_info": "Timestamptz"
},
{
"ordinal": 7,
"name": "last_transition",
"type_info": "Timestamptz"
},
{
"ordinal": 8,
"name": "scheduled",
"type_info": "Timestamptz"
},
{
"ordinal": 9,
"name": "transition_count",
"type_info": "Int2"
},
{
"ordinal": 10,
"name": "vm_state",
"type_info": "Text"
},
{
"ordinal": 11,
"name": "metadata",
"type_info": "Text"
},
{
"ordinal": 12,
"name": "parameters",
"type_info": "Text"
},
{
"ordinal": 13,
"name": "lock_id",
"type_info": "Uuid"
},
{
"ordinal": 14,
"name": "last_heartbeat",
"type_info": "Timestamptz"
},
{
"ordinal": 15,
"name": "janitor_touch_count",
"type_info": "Int2"
}
],
"parameters": {
"Left": ["Text", "Int8", "Uuid"]
},
"nullable": [
false,
false,
false,
false,
false,
true,
false,
false,
false,
false,
null,
true,
true,
true,
true,
false
]
},
"hash": "350983ef271029734aff70eb7e298bfe578ecaa8678268863bce917ced9d5d46"
}

View File

@ -0,0 +1,12 @@
{
"db_name": "PostgreSQL",
"query": "\nWITH stalled AS (\n SELECT id FROM cyclotron_jobs WHERE state = 'running' AND COALESCE(last_heartbeat, $1) <= $1 FOR UPDATE SKIP LOCKED\n)\nUPDATE cyclotron_jobs\nSET state = 'available', lock_id = NULL, last_heartbeat = NULL, janitor_touch_count = janitor_touch_count + 1\nFROM stalled\nWHERE cyclotron_jobs.id = stalled.id\n ",
"describe": {
"columns": [],
"parameters": {
"Left": ["Timestamptz"]
},
"nullable": []
},
"hash": "54d9afe6952f92b753fbce2c4e8554065b71152389f98d35532c6b332d5a4c9d"
}

View File

@ -0,0 +1,30 @@
{
"db_name": "PostgreSQL",
"query": "\nINSERT INTO cyclotron_jobs\n (\n id,\n team_id,\n function_id,\n created,\n lock_id,\n last_heartbeat,\n janitor_touch_count,\n transition_count,\n last_transition,\n queue_name,\n state,\n scheduled,\n priority,\n vm_state,\n metadata,\n parameters\n )\nVALUES\n ($1, $2, $3, NOW(), NULL, NULL, 0, 0, NOW(), $4, $5, $6, $7, $8, $9, $10)\n ",
"describe": {
"columns": [],
"parameters": {
"Left": [
"Uuid",
"Int4",
"Uuid",
"Text",
{
"Custom": {
"name": "jobstate",
"kind": {
"Enum": ["available", "completed", "failed", "running", "paused"]
}
}
},
"Timestamptz",
"Int2",
"Text",
"Text",
"Text"
]
},
"nullable": []
},
"hash": "7217e766aeb53298238222c0c71a2ce446cac731845c53cb926fc47ace708dd6"
}

View File

@ -0,0 +1,12 @@
{
"db_name": "PostgreSQL",
"query": "UPDATE cyclotron_jobs SET lock_id = NULL, last_heartbeat = NULL WHERE id = $1 AND lock_id = $2",
"describe": {
"columns": [],
"parameters": {
"Left": ["Uuid", "Uuid"]
},
"nullable": []
},
"hash": "884da9767d2992c7b279b4f8df5129b8c4d7020b7cb1999702aee1bfb1087efb"
}

View File

@ -0,0 +1,12 @@
{
"db_name": "PostgreSQL",
"query": "UPDATE cyclotron_jobs SET metadata = $1 WHERE id = $2 AND lock_id = $3",
"describe": {
"columns": [],
"parameters": {
"Left": ["Text", "Uuid", "Uuid"]
},
"nullable": []
},
"hash": "8ab11a89bc4720985e130c58021f46045c332cc45e69b08708b289cc933b3b5c"
}

View File

@ -0,0 +1,12 @@
{
"db_name": "PostgreSQL",
"query": "UPDATE cyclotron_jobs SET scheduled = $1 WHERE id = $2 AND lock_id = $3",
"describe": {
"columns": [],
"parameters": {
"Left": ["Timestamptz", "Uuid", "Uuid"]
},
"nullable": []
},
"hash": "98da1f12285a97a47ce88535c82a8c4eb4cb910b0c5ddbc6bdbd9156af7b4e23"
}

View File

@ -0,0 +1,18 @@
{
"db_name": "PostgreSQL",
"query": "SELECT vm_state FROM cyclotron_jobs WHERE id = $1 AND lock_id = $2",
"describe": {
"columns": [
{
"ordinal": 0,
"name": "vm_state",
"type_info": "Text"
}
],
"parameters": {
"Left": ["Uuid", "Uuid"]
},
"nullable": [true]
},
"hash": "aa595eaf28c1f4b872c278be407b59cc00f3125413f4032ac3647a6b5ee1a632"
}

View File

@ -0,0 +1,12 @@
{
"db_name": "PostgreSQL",
"query": "UPDATE cyclotron_jobs SET parameters = $1 WHERE id = $2 AND lock_id = $3",
"describe": {
"columns": [],
"parameters": {
"Left": ["Text", "Uuid", "Uuid"]
},
"nullable": []
},
"hash": "b160b785a0377b854341105e99e4ef7a5da523e168a5f9be47f6caaef09487d7"
}

View File

@ -0,0 +1,23 @@
{
"db_name": "PostgreSQL",
"query": "UPDATE cyclotron_jobs\n SET state = $1, last_transition = NOW(), transition_count = transition_count + 1\n WHERE id = $2 AND lock_id = $3",
"describe": {
"columns": [],
"parameters": {
"Left": [
{
"Custom": {
"name": "jobstate",
"kind": {
"Enum": ["available", "completed", "failed", "running", "paused"]
}
}
},
"Uuid",
"Uuid"
]
},
"nullable": []
},
"hash": "b3239c1dde9a88769ec488299612f7a3ad2b2ee57d8d2353c858299d7b6ffe13"
}

View File

@ -0,0 +1,117 @@
{
"db_name": "PostgreSQL",
"query": "\nWITH available AS (\n SELECT\n id,\n state\n FROM cyclotron_jobs\n WHERE\n state = 'available'::JobState\n AND queue_name = $1\n AND scheduled <= NOW()\n ORDER BY\n priority ASC,\n scheduled ASC\n LIMIT $2\n FOR UPDATE SKIP LOCKED\n)\nUPDATE cyclotron_jobs\nSET\n state = 'running'::JobState,\n lock_id = $3,\n last_heartbeat = NOW(),\n last_transition = NOW(),\n transition_count = transition_count + 1\nFROM available\nWHERE\n cyclotron_jobs.id = available.id\nRETURNING\n cyclotron_jobs.id,\n team_id,\n available.state as \"state: JobState\",\n queue_name,\n priority,\n function_id,\n created,\n last_transition,\n scheduled,\n transition_count,\n vm_state,\n metadata,\n parameters,\n lock_id,\n last_heartbeat,\n janitor_touch_count\n ",
"describe": {
"columns": [
{
"ordinal": 0,
"name": "id",
"type_info": "Uuid"
},
{
"ordinal": 1,
"name": "team_id",
"type_info": "Int4"
},
{
"ordinal": 2,
"name": "state: JobState",
"type_info": {
"Custom": {
"name": "jobstate",
"kind": {
"Enum": ["available", "completed", "failed", "running", "paused"]
}
}
}
},
{
"ordinal": 3,
"name": "queue_name",
"type_info": "Text"
},
{
"ordinal": 4,
"name": "priority",
"type_info": "Int2"
},
{
"ordinal": 5,
"name": "function_id",
"type_info": "Uuid"
},
{
"ordinal": 6,
"name": "created",
"type_info": "Timestamptz"
},
{
"ordinal": 7,
"name": "last_transition",
"type_info": "Timestamptz"
},
{
"ordinal": 8,
"name": "scheduled",
"type_info": "Timestamptz"
},
{
"ordinal": 9,
"name": "transition_count",
"type_info": "Int2"
},
{
"ordinal": 10,
"name": "vm_state",
"type_info": "Text"
},
{
"ordinal": 11,
"name": "metadata",
"type_info": "Text"
},
{
"ordinal": 12,
"name": "parameters",
"type_info": "Text"
},
{
"ordinal": 13,
"name": "lock_id",
"type_info": "Uuid"
},
{
"ordinal": 14,
"name": "last_heartbeat",
"type_info": "Timestamptz"
},
{
"ordinal": 15,
"name": "janitor_touch_count",
"type_info": "Int2"
}
],
"parameters": {
"Left": ["Text", "Int8", "Uuid"]
},
"nullable": [
false,
false,
false,
false,
false,
true,
false,
false,
false,
false,
true,
true,
true,
true,
true,
false
]
},
"hash": "c624261597b9356ff3e7c3e392a84bb0b551e91c503e8b21c29814f1eb660a8e"
}

View File

@ -0,0 +1,12 @@
{
"db_name": "PostgreSQL",
"query": "DELETE FROM cyclotron_jobs WHERE state = 'completed'",
"describe": {
"columns": [],
"parameters": {
"Left": []
},
"nullable": []
},
"hash": "f4e808f58dd290c6e2b49b63e9e0eb022936ba318021512a0cc0c2e0766abe7c"
}

View File

@ -0,0 +1,12 @@
{
"db_name": "PostgreSQL",
"query": "\nDELETE FROM cyclotron_jobs WHERE state = 'running' AND COALESCE(last_heartbeat, $1) <= $1 AND janitor_touch_count >= $2\n ",
"describe": {
"columns": [],
"parameters": {
"Left": ["Timestamptz", "Int2"]
},
"nullable": []
},
"hash": "fdda5a80f5495f2d4b15ce1a0963f990986c8b8433f01e449fbd1eee70ce6aeb"
}

View File

@ -0,0 +1,12 @@
{
"db_name": "PostgreSQL",
"query": "DELETE FROM cyclotron_jobs WHERE state = 'failed'",
"describe": {
"columns": [],
"parameters": {
"Left": []
},
"nullable": []
},
"hash": "ffb66bdedf6506f95b9293ef88b0c51e2f5fb7d3271e1287165d2a35b6aaa25e"
}

View File

@ -0,0 +1,17 @@
[package]
name = "cyclotron-core"
version = "0.1.0"
edition = "2021"
[lints]
workspace = true
[dependencies]
serde = { workspace = true }
sqlx = { workspace = true }
chrono = { workspace = true }
tokio = { workspace = true }
thiserror = { workspace = true }
uuid = { workspace = true }
rand = { workspace = true }
futures = { workspace = true }

View File

@ -0,0 +1,102 @@
CREATE TYPE JobState AS ENUM(
'available',
'completed',
'failed',
'running',
'paused'
);
---------------------------------------------------------------------
-- Job table
---------------------------------------------------------------------
-- When a job is dequeued, it is locked by generating a UUID and returning it to the dequeuing
-- worker. Any worker that can't provide the correct lock_id when updating will have their updates
-- rejected. The reason this is important is because if, e.g., a worker holds a job in a running
-- state without updating the heartbeat, the janitor will return the job to the queue eventually,
-- and if the worker /then/ tries to update the job after another worker has picked it up, that's a
-- race. We track transition count and times alongside lock_id's and heartbeats for reporting and
-- debugging purposes, and we track the number of times the janitor has touched a job to spot poison
-- pills.
CREATE TABLE IF NOT EXISTS cyclotron_jobs (
---------------------------------------------------------------------
-- Job metadata
---------------------------------------------------------------------
id UUID PRIMARY KEY,
team_id INT NOT NULL,
function_id UUID,
created TIMESTAMPTZ NOT NULL,
---------------------------------------------------------------------
-- Queue bookkeeping - invisible to the worker
---------------------------------------------------------------------
lock_id UUID,
-- This is set when a job is in a running state, and is required to update the job.
last_heartbeat TIMESTAMPTZ,
-- This is updated by the worker to indicate that the job is making forward progress even
-- without transitions (and should not be reaped)
janitor_touch_count SMALLINT NOT NULL,
transition_count SMALLINT NOT NULL,
last_transition TIMESTAMPTZ NOT NULL,
---------------------------------------------------------------------
-- Queue components - determines which workers will consume this job
---------------------------------------------------------------------
queue_name TEXT NOT NULL,
---------------------------------------------------------------------
-- Job availability and priority (can this job be dequeued, and in what order?)
---------------------------------------------------------------------
state JobState NOT NULL,
scheduled TIMESTAMPTZ NOT NULL,
priority SMALLINT NOT NULL,
---------------------------------------------------------------------
-- Job data
---------------------------------------------------------------------
vm_state TEXT,
-- This is meant for workers "talking to themselves", e.g. for tracking retries
metadata TEXT,
-- This is meant for "the next guy" - hog might fill it with a URL to fetch, for example
parameters TEXT
);
-- For a given worker, the set of "available" jobs depends on state, queue_name, and scheduled (so
-- we can exclude sleeping jobs). This index is partial, because we don't care about other states
-- for the purpose of dequeuing
CREATE INDEX idx_cyclotron_jobs_dequeue ON cyclotron_jobs (queue_name, state, scheduled, priority)
WHERE
state = 'available';
-- We create simple indexes on team_id, function_id and queue_name to support fast joins to future
-- control tables
CREATE INDEX idx_queue_team_id ON cyclotron_jobs(team_id);
CREATE INDEX idx_queue_function_id ON cyclotron_jobs(function_id);
CREATE INDEX idx_queue_queue_name ON cyclotron_jobs(queue_name);
---------------------------------------------------------------------
-- Control tables
---------------------------------------------------------------------
-- These are just a starting point, supporting overriding the state for a given team, function or queue
-- For now these are entirely unused
CREATE TABLE IF NOT EXISTS cyclotron_team_control (
team_id INT PRIMARY KEY,
state_override JobState,
-- If this is not null, it overrides the state of all jobs for this team (allowing for e.g. pausing or force failing all of a teams jobs)
state_override_expires TIMESTAMPTZ -- State override can be temporary or permanent
);
CREATE TABLE IF NOT EXISTS cyclotron_function_control (
function_id UUID PRIMARY KEY,
state_override JobState,
-- If this is not null, it overrides the state of all jobs for this function (allowing for e.g. pausing or force failing all of a functions jobs)
state_override_expires TIMESTAMPTZ -- State override can be temporary or permanent
);
CREATE TABLE IF NOT EXISTS cyclotron_queue_control (
queue_name TEXT PRIMARY KEY,
state_override JobState,
-- If this is not null, it overrides the state of all jobs for this queue (allowing for e.g. pausing or force failing all of a queues jobs)
state_override_expires TIMESTAMPTZ -- State override can be temporary or permanent
);
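A hedged Rust sketch of the lock_id protocol described in the comment at the top of this migration, using cyclotron-core helpers introduced later in this commit (dequeue_jobs, set_priority). The pool handle and queue name are placeholders; the point is that an update carrying a stale or foreign lock_id matches no rows and is surfaced as an error rather than silently racing.

async fn lock_protocol_demo(pool: &sqlx::PgPool) -> Result<(), QueueError> {
    let jobs = dequeue_jobs(pool, "fetch", 1).await?;
    if let Some(job) = jobs.into_iter().next() {
        let lock_id = job.lock_id.expect("set by dequeue");
        // Allowed: the caller still holds the lock issued when the job was dequeued.
        set_priority(pool, job.id, lock_id, 5).await?;
        // Rejected: a different lock_id updates zero rows, which the helpers treat as an error.
        let stale_lock = uuid::Uuid::now_v7();
        assert!(set_priority(pool, job.id, stale_lock, 5).await.is_err());
    }
    Ok(())
}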

View File

@ -0,0 +1,697 @@
//! # PgQueue
//!
//! A job queue implementation backed by a PostgreSQL table.
use std::str::FromStr;
use chrono::{self, DateTime, Utc};
use serde::{self, Deserialize, Serialize};
use sqlx::{
postgres::{PgArguments, PgHasArrayType, PgQueryResult, PgTypeInfo},
query::Query,
};
use uuid::Uuid;
use crate::error::QueueError;
#[derive(Debug, Deserialize, Serialize, sqlx::Type)]
#[serde(rename_all = "lowercase")]
#[sqlx(type_name = "JobState", rename_all = "lowercase")]
pub enum JobState {
Available,
Running,
Completed,
Failed,
Paused,
}
impl FromStr for JobState {
type Err = ();
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
"available" => Ok(JobState::Available),
"running" => Ok(JobState::Running),
"completed" => Ok(JobState::Completed),
"failed" => Ok(JobState::Failed),
_ => Err(()),
}
}
}
impl PgHasArrayType for JobState {
fn array_type_info() -> sqlx::postgres::PgTypeInfo {
// Postgres default naming convention for array types is "_typename"
PgTypeInfo::with_name("_JobState")
}
}
// The chunk of data needed to enqueue a job
#[derive(Debug, Deserialize, Serialize, Clone, Eq, PartialEq)]
pub struct JobInit {
pub team_id: i32,
pub queue_name: String,
pub priority: i16,
pub scheduled: DateTime<Utc>,
pub function_id: Option<Uuid>,
pub vm_state: Option<String>,
pub parameters: Option<String>,
pub metadata: Option<String>,
}
// TODO - there are certain things we might want to be on a per-team basis here... the ability to say
// "do not process any jobs for this team" independent of doing an operation on the job table seems powerful,
// but that requires a distinct team table. For now, I'm just making a note that it's something we might
// want (the command to modify the treatment of all jobs associated with a team should only need to be issued and
// processed /once/, not once per job, and should apply to all jobs both currently queued and any future ones). This
// can be added in a progressive way (by adding joins and clauses to the dequeue query), so we don't need to worry about
// it too much up front.
#[derive(Debug, Deserialize, Serialize)]
pub struct Job {
// Job metadata
pub id: Uuid,
pub team_id: i32,
pub function_id: Option<Uuid>, // Some jobs might not come from hog, and it doesn't /kill/ us to support that
pub created: DateTime<Utc>,
// Queue bookkeeping
// This will be set for any worker that ever has a job in the "running" state (so any worker that dequeues a job)
// but I don't want to do the work to encode that in the type system right now - later it should be
pub lock_id: Option<Uuid>,
pub last_heartbeat: Option<DateTime<Utc>>,
pub janitor_touch_count: i16,
pub transition_count: i16,
pub last_transition: DateTime<Utc>,
// Virtual queue components
pub queue_name: String, // We can have multiple "virtual queues" workers pull from
// Job availability
pub state: JobState,
pub priority: i16, // For sorting "available" jobs. Lower is higher priority
pub scheduled: DateTime<Utc>,
// Job data
pub vm_state: Option<String>, // The state of the VM this job is running on (if it exists)
pub metadata: Option<String>, // Additional fields a worker can tack onto a job, for e.g. tracking some state across retries (or number of retries in general by a given class of worker)
pub parameters: Option<String>, // The actual parameters of the job (function args for a hog function, http request for a fetch function)
}
pub async fn create_job<'c, E>(executor: E, data: JobInit) -> Result<(), QueueError>
where
E: sqlx::Executor<'c, Database = sqlx::Postgres>,
{
let id = Uuid::now_v7();
sqlx::query!(
r#"
INSERT INTO cyclotron_jobs
(
id,
team_id,
function_id,
created,
lock_id,
last_heartbeat,
janitor_touch_count,
transition_count,
last_transition,
queue_name,
state,
scheduled,
priority,
vm_state,
metadata,
parameters
)
VALUES
($1, $2, $3, NOW(), NULL, NULL, 0, 0, NOW(), $4, $5, $6, $7, $8, $9, $10)
"#,
id,
data.team_id,
data.function_id,
data.queue_name,
JobState::Available as _,
data.scheduled,
data.priority,
data.vm_state,
data.metadata,
data.parameters
)
.execute(executor)
.await?;
Ok(())
}
pub async fn bulk_create_jobs<'c, E>(executor: E, jobs: &[JobInit]) -> Result<(), QueueError>
where
E: sqlx::Executor<'c, Database = sqlx::Postgres>,
{
let now = Utc::now();
// Flatten these jobs into a series of vecs of arguments PG can unnest
let mut ids = Vec::with_capacity(jobs.len());
let mut team_ids = Vec::with_capacity(jobs.len());
let mut function_ids = Vec::with_capacity(jobs.len());
let mut created_at = Vec::with_capacity(jobs.len());
let mut lock_ids = Vec::with_capacity(jobs.len());
let mut last_heartbeats = Vec::with_capacity(jobs.len());
let mut janitor_touch_counts = Vec::with_capacity(jobs.len());
let mut transition_counts = Vec::with_capacity(jobs.len());
let mut last_transitions = Vec::with_capacity(jobs.len());
let mut queue_names = Vec::with_capacity(jobs.len());
let mut states = Vec::with_capacity(jobs.len());
let mut scheduleds = Vec::with_capacity(jobs.len());
let mut priorities = Vec::with_capacity(jobs.len());
let mut vm_states = Vec::with_capacity(jobs.len());
let mut metadatas = Vec::with_capacity(jobs.len());
let mut parameters = Vec::with_capacity(jobs.len());
for d in jobs {
ids.push(Uuid::now_v7());
team_ids.push(d.team_id);
function_ids.push(d.function_id);
created_at.push(now);
lock_ids.push(None::<Uuid>);
last_heartbeats.push(None::<DateTime<Utc>>);
janitor_touch_counts.push(0);
transition_counts.push(0);
last_transitions.push(now);
queue_names.push(d.queue_name.clone());
states.push(JobState::Available);
scheduleds.push(d.scheduled);
priorities.push(d.priority);
vm_states.push(d.vm_state.clone());
metadatas.push(d.metadata.clone());
parameters.push(d.parameters.clone());
}
// Using the "unnest" function to turn an array of rows into a set of rows
sqlx::query(
r#"
INSERT INTO cyclotron_jobs
(
id,
team_id,
function_id,
created,
lock_id,
last_heartbeat,
janitor_touch_count,
transition_count,
last_transition,
queue_name,
state,
scheduled,
priority,
vm_state,
metadata,
parameters
)
SELECT *
FROM UNNEST(
$1,
$2,
$3,
$4,
$5,
$6,
$7,
$8,
$9,
$10,
$11,
$12,
$13,
$14,
$15,
$16
)
"#,
)
.bind(ids)
.bind(team_ids)
.bind(function_ids)
.bind(created_at)
.bind(lock_ids)
.bind(last_heartbeats)
.bind(janitor_touch_counts)
.bind(transition_counts)
.bind(last_transitions)
.bind(queue_names)
.bind(states)
.bind(scheduleds)
.bind(priorities)
.bind(vm_states)
.bind(metadatas)
.bind(parameters)
.execute(executor)
.await?;
Ok(())
}
// Dequeue the next job batch from the queue, skipping VM state since it can be large
pub async fn dequeue_jobs<'c, E>(
executor: E,
queue: &str,
max: usize,
) -> Result<Vec<Job>, QueueError>
where
E: sqlx::Executor<'c, Database = sqlx::Postgres>,
{
// TODO - right now, locks are completely transient. We could instead have the lock_id act like a
// "worker_id", and be provided by the caller, which would let workers do less bookkeeping, and make
// some kinds of debugging easier, but I prefer locks being opaque to workers for now, to avoid any
// confusion or potential for accidental deadlocking (e.g. if someone persisted the worker_id across
// process restarts).
let lock_id = Uuid::now_v7();
Ok(sqlx::query_as!(
Job,
r#"
WITH available AS (
SELECT
id,
state
FROM cyclotron_jobs
WHERE
state = 'available'::JobState
AND queue_name = $1
AND scheduled <= NOW()
ORDER BY
priority ASC,
scheduled ASC
LIMIT $2
FOR UPDATE SKIP LOCKED
)
UPDATE cyclotron_jobs
SET
state = 'running'::JobState,
lock_id = $3,
last_heartbeat = NOW(),
last_transition = NOW(),
transition_count = transition_count + 1
FROM available
WHERE
cyclotron_jobs.id = available.id
RETURNING
cyclotron_jobs.id,
team_id,
available.state as "state: JobState",
queue_name,
priority,
function_id,
created,
last_transition,
scheduled,
transition_count,
NULL as vm_state,
metadata,
parameters,
lock_id,
last_heartbeat,
janitor_touch_count
"#,
queue,
max as i64,
lock_id
)
.fetch_all(executor)
.await?)
}
// Dequeue a batch of jobs, also returning their VM state. This is an optimisation - you could
// dequeue a batch of jobs and then fetch their VM state in a separate query, but this is hopefully less
// heavy on the DB, if a given worker knows it needs VM state for all dequeued jobs
pub async fn dequeue_with_vm_state<'c, E>(
executor: E,
queue: &str,
max: usize,
) -> Result<Vec<Job>, QueueError>
where
E: sqlx::Executor<'c, Database = sqlx::Postgres>,
{
let lock_id = Uuid::now_v7();
Ok(sqlx::query_as!(
Job,
r#"
WITH available AS (
SELECT
id,
state
FROM cyclotron_jobs
WHERE
state = 'available'::JobState
AND queue_name = $1
AND scheduled <= NOW()
ORDER BY
priority ASC,
scheduled ASC
LIMIT $2
FOR UPDATE SKIP LOCKED
)
UPDATE cyclotron_jobs
SET
state = 'running'::JobState,
lock_id = $3,
last_heartbeat = NOW(),
last_transition = NOW(),
transition_count = transition_count + 1
FROM available
WHERE
cyclotron_jobs.id = available.id
RETURNING
cyclotron_jobs.id,
team_id,
available.state as "state: JobState",
queue_name,
priority,
function_id,
created,
last_transition,
scheduled,
transition_count,
vm_state,
metadata,
parameters,
lock_id,
last_heartbeat,
janitor_touch_count
"#,
queue,
max as i64,
lock_id
)
.fetch_all(executor)
.await?)
}
// Grab a job's VM state - for workers that might sometimes need a job's vm state, but not always,
// this lets them use dequeue_jobs, and then fetch the states they need. VM state can only be retrieved
// by workers holding a job lock
pub async fn get_vm_state<'c, E>(
executor: E,
job_id: Uuid,
lock_id: Uuid,
) -> Result<Option<String>, QueueError>
where
E: sqlx::Executor<'c, Database = sqlx::Postgres>,
{
struct VMState {
vm_state: Option<String>,
}
// We use fetch_one here because being given an unknown ID is an error
let res = sqlx::query_as!(
VMState,
"SELECT vm_state FROM cyclotron_jobs WHERE id = $1 AND lock_id = $2",
job_id,
lock_id
)
.fetch_one(executor)
.await?;
Ok(res.vm_state)
}
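A short sketch of the two-step pattern described above, assuming a connected sqlx::PgPool named pool; the "does this job need its VM state" check is a stand-in condition for illustration.

async fn selective_vm_state(pool: &sqlx::PgPool) -> Result<(), QueueError> {
    let jobs = dequeue_jobs(pool, "fetch", 10).await?;
    for job in &jobs {
        // Stand-in for "this job actually needs its VM state".
        if job.parameters.is_some() {
            // VM state can only be read while holding the lock issued at dequeue time.
            let lock_id = job.lock_id.expect("set by dequeue");
            let _vm_state = get_vm_state(pool, job.id, lock_id).await?;
            // ... use the VM state here ...
        }
    }
    Ok(())
}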
// A struct representing a set of updates for a job. Outer None values mean "don't update this field",
// with nested None values meaning "set this field to null" for nullable fields
#[derive(Debug, Deserialize, Serialize)]
pub struct JobUpdate {
pub lock_id: Uuid, // The ID of the lock acquired when this worker dequeued the job, required for any update to be valid
pub state: Option<JobState>,
pub queue_name: Option<String>,
pub priority: Option<i16>,
pub scheduled: Option<DateTime<Utc>>,
pub vm_state: Option<Option<String>>,
pub metadata: Option<Option<String>>,
pub parameters: Option<Option<String>>,
}
impl JobUpdate {
pub fn new(lock_id: Uuid) -> Self {
Self {
lock_id,
state: None,
queue_name: None,
priority: None,
scheduled: None,
vm_state: None,
metadata: None,
parameters: None,
}
}
}
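A small sketch of the nested-Option semantics described above; the lock_id is assumed to come from a prior dequeue.

let mut update = JobUpdate::new(lock_id);
update.queue_name = None;                         // outer None: leave queue_name untouched
update.metadata = Some(None);                     // inner None: set metadata to NULL
update.parameters = Some(Some("{}".to_string())); // overwrite parameters with '{}'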
// TODO - I should think about a bulk-flush interface at /some/ point, although we expect jobs to be
// high variance with respect to work time, so maybe that wouldn't be that useful in the end.
// TODO - this isn't the cheapest way to update a row in a table... I could probably do better by instead
// using a query builder, but I wanted sqlx's nice macro handling, at least while iterating on the schema.
// If/when we start hitting perf issues, this is a good place to start.
// NOTE - this function permits multiple flushes to the same job without losing the lock on it, but
// high-level implementations are recommended to avoid this - ideally, for every dequeue/requeue, there should be
// exactly 2 database operations.
pub async fn flush_job<'c, C>(
connection: &mut C,
job_id: Uuid,
updates: JobUpdate,
) -> Result<(), QueueError>
where
C: sqlx::Connection<Database = sqlx::Postgres>,
{
let mut txn = connection.begin().await?;
// Flushing any job state except "running" is a signal that the worker no longer holds this job
let job_returned = !matches!(updates.state, Some(JobState::Running));
let lock_id = updates.lock_id;
if let Some(state) = updates.state {
set_state(&mut *txn, job_id, updates.lock_id, state).await?;
}
if let Some(queue_name) = updates.queue_name {
set_queue(&mut *txn, job_id, &queue_name, lock_id).await?;
}
if let Some(priority) = updates.priority {
set_priority(&mut *txn, job_id, lock_id, priority).await?;
}
if let Some(scheduled) = updates.scheduled {
set_scheduled(&mut *txn, job_id, scheduled, lock_id).await?;
}
if let Some(vm_state) = updates.vm_state {
set_vm_state(&mut *txn, job_id, vm_state, lock_id).await?;
}
if let Some(metadata) = updates.metadata {
set_metadata(&mut *txn, job_id, metadata, lock_id).await?;
}
if let Some(parameters) = updates.parameters {
set_parameters(&mut *txn, job_id, parameters, lock_id).await?;
}
// Calling flush indicates forward progress, so we should touch the heartbeat
set_heartbeat(&mut *txn, job_id, lock_id).await?;
// We do this here, instead of in the set_state call, because otherwise the lock_id passed to other
// updates would be invalid
if job_returned {
let query = sqlx::query!(
"UPDATE cyclotron_jobs SET lock_id = NULL, last_heartbeat = NULL WHERE id = $1 AND lock_id = $2",
job_id,
lock_id
);
assert_does_update(&mut *txn, job_id, lock_id, query).await?;
}
txn.commit().await?;
Ok(())
}
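// A minimal sketch of the dequeue -> work -> flush lifecycle the NOTE above recommends, with
// exactly one database operation to dequeue and one transaction to flush. Assumes a connected
// pool and treats the actual work as a placeholder.
async fn complete_one(pool: &sqlx::PgPool, queue: &str) -> Result<(), QueueError> {
    let Some(job) = dequeue_jobs(pool, queue, 1).await?.pop() else {
        return Ok(()); // nothing available right now
    };
    let lock_id = job.lock_id.expect("dequeued jobs always carry a lock_id");
    // ... the actual work for this job would happen here ...
    let mut update = JobUpdate::new(lock_id);
    update.state = Some(JobState::Completed); // flushing a non-running state releases the lock
    let mut connection = pool.acquire().await?;
    flush_job(connection.as_mut(), job.id, update).await
}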
// Simple wrapper, that just executes a query and throws an error if no rows were affected
async fn assert_does_update<'c, E>(
executor: E,
job_id: Uuid,
lock_id: Uuid,
query: Query<'_, sqlx::Postgres, PgArguments>,
) -> Result<(), QueueError>
where
E: sqlx::Executor<'c, Database = sqlx::Postgres>,
{
let res = query.execute(executor).await?;
throw_if_no_rows(res, job_id, lock_id)
}
// Most of the rest of these functions are designed to be used as part of larger transactions, e.g.
// "completing" a job means updating various rows and then marking it complete, and we can build that
// by composing a set of individual queries together using a transaction.
// Update the state of a job, also tracking the transition count and last transition time
pub async fn set_state<'c, E>(
executor: E,
job_id: Uuid,
lock_id: Uuid,
state: JobState,
) -> Result<(), QueueError>
where
E: sqlx::Executor<'c, Database = sqlx::Postgres>,
{
let q = sqlx::query!(
r#"UPDATE cyclotron_jobs
SET state = $1, last_transition = NOW(), transition_count = transition_count + 1
WHERE id = $2 AND lock_id = $3"#,
state as _,
job_id,
lock_id
);
assert_does_update(executor, job_id, lock_id, q).await
}
pub async fn set_queue<'c, E>(
executor: E,
job_id: Uuid,
queue: &str,
lock_id: Uuid,
) -> Result<(), QueueError>
where
E: sqlx::Executor<'c, Database = sqlx::Postgres>,
{
let q = sqlx::query!(
"UPDATE cyclotron_jobs SET queue_name = $1 WHERE id = $2 AND lock_id = $3",
queue,
job_id,
lock_id
);
assert_does_update(executor, job_id, lock_id, q).await
}
pub async fn set_priority<'c, E>(
executor: E,
job_id: Uuid,
lock_id: Uuid,
priority: i16,
) -> Result<(), QueueError>
where
E: sqlx::Executor<'c, Database = sqlx::Postgres>,
{
let q = sqlx::query!(
"UPDATE cyclotron_jobs SET priority = $1 WHERE id = $2 AND lock_id = $3",
priority,
job_id,
lock_id
);
assert_does_update(executor, job_id, lock_id, q).await
}
pub async fn set_scheduled<'c, E>(
executor: E,
job_id: Uuid,
scheduled: DateTime<Utc>,
lock_id: Uuid,
) -> Result<(), QueueError>
where
E: sqlx::Executor<'c, Database = sqlx::Postgres>,
{
let q = sqlx::query!(
"UPDATE cyclotron_jobs SET scheduled = $1 WHERE id = $2 AND lock_id = $3",
scheduled,
job_id,
lock_id
);
assert_does_update(executor, job_id, lock_id, q).await
}
pub async fn set_vm_state<'c, E>(
executor: E,
job_id: Uuid,
vm_state: Option<String>,
lock_id: Uuid,
) -> Result<(), QueueError>
where
E: sqlx::Executor<'c, Database = sqlx::Postgres>,
{
let q = sqlx::query!(
"UPDATE cyclotron_jobs SET vm_state = $1 WHERE id = $2 AND lock_id = $3",
vm_state,
job_id,
lock_id
);
assert_does_update(executor, job_id, lock_id, q).await
}
pub async fn set_metadata<'c, E>(
executor: E,
job_id: Uuid,
metadata: Option<String>,
lock_id: Uuid,
) -> Result<(), QueueError>
where
E: sqlx::Executor<'c, Database = sqlx::Postgres>,
{
let q = sqlx::query!(
"UPDATE cyclotron_jobs SET metadata = $1 WHERE id = $2 AND lock_id = $3",
metadata,
job_id,
lock_id
);
assert_does_update(executor, job_id, lock_id, q).await
}
pub async fn set_parameters<'c, E>(
executor: E,
job_id: Uuid,
parameters: Option<String>,
lock_id: Uuid,
) -> Result<(), QueueError>
where
E: sqlx::Executor<'c, Database = sqlx::Postgres>,
{
let q = sqlx::query!(
"UPDATE cyclotron_jobs SET parameters = $1 WHERE id = $2 AND lock_id = $3",
parameters,
job_id,
lock_id
);
assert_does_update(executor, job_id, lock_id, q).await
}
pub async fn set_heartbeat<'c, E>(
executor: E,
job_id: Uuid,
lock_id: Uuid,
) -> Result<(), QueueError>
where
E: sqlx::Executor<'c, Database = sqlx::Postgres>,
{
let q = sqlx::query!(
"UPDATE cyclotron_jobs SET last_heartbeat = NOW() WHERE id = $1 AND lock_id = $2",
job_id,
lock_id
);
assert_does_update(executor, job_id, lock_id, q).await
}
pub async fn count_total_waiting_jobs<'c, E>(executor: E) -> Result<u64, QueueError>
where
E: sqlx::Executor<'c, Database = sqlx::Postgres>,
{
let res = sqlx::query!(
"SELECT COUNT(*) FROM cyclotron_jobs WHERE state = 'available' AND scheduled <= NOW()",
)
.fetch_one(executor)
.await?;
let res = res.count.unwrap_or(0);
Ok(res as u64)
}
fn throw_if_no_rows(res: PgQueryResult, job: Uuid, lock: Uuid) -> Result<(), QueueError> {
if res.rows_affected() == 0 {
Err(QueueError::InvalidLock(lock, job))
} else {
Ok(())
}
}

View File

@ -0,0 +1,56 @@
use chrono::{Duration, Utc};
use cyclotron_core::{
base_ops::JobInit,
manager::{ManagerConfig, QueueManager},
PoolConfig,
};
use uuid::Uuid;
// Just inserts jobs as fast as it can, choosing randomly between hog and fetch workers, and between different priorities.
// prints every 100 jobs inserted.
#[tokio::main]
async fn main() {
let pool_config = PoolConfig {
db_url: "postgresql://posthog:posthog@localhost:5432/cyclotron".to_string(),
max_connections: None,
min_connections: None,
acquire_timeout_seconds: None,
max_lifetime_seconds: None,
idle_timeout_seconds: None,
};
let manager_config = ManagerConfig {
shards: vec![pool_config.clone()],
shard_depth_limit: None,
shard_depth_check_interval_seconds: None,
};
let manager = QueueManager::new(manager_config).await.unwrap();
let now = Utc::now() - Duration::minutes(1);
let start = Utc::now();
let mut count = 0;
loop {
let queue = if rand::random() { "fetch" } else { "hog" };
let priority = (rand::random::<u16>() % 3) as i16;
let test_job = JobInit {
team_id: 1,
queue_name: queue.to_string(),
priority,
scheduled: now,
function_id: Some(Uuid::now_v7()),
vm_state: None,
parameters: None,
metadata: None,
};
manager.create_job(test_job).await.unwrap();
count += 1;
if count % 100 == 0 {
println!("Elapsed: {:?}, count: {}", Utc::now() - start, count);
}
}
}

View File

@ -0,0 +1,167 @@
use std::{
sync::{atomic::AtomicUsize, Arc},
time::Instant,
};
use chrono::{Duration, Utc};
use cyclotron_core::{
base_ops::{JobInit, JobState},
manager::{ManagerConfig, QueueManager},
worker::Worker,
PoolConfig,
};
use futures::future::join_all;
use uuid::Uuid;
// This spins up a manager and 2 workers, and tries to simulate semi-realistic load (on the DB - the workers do nothing except complete jobs)
// - The manager inserts jobs as fast as it can, choosing randomly between hog and fetch workers, and between different priorities.
// - The workers will process jobs as fast as they can, in batches of 1000.
// - The manager and both workers track how long each insert and dequeue takes, in ms/job.
// - The manager never inserts more than 10,000 more jobs than the workers have processed.
const INSERT_BATCH_SIZE: usize = 1000;
struct SharedContext {
jobs_inserted: AtomicUsize,
jobs_dequeued: AtomicUsize,
}
async fn producer_loop(manager: QueueManager, shared_context: Arc<SharedContext>) {
let mut time_spent_inserting = Duration::zero();
let now = Utc::now() - Duration::minutes(1);
loop {
let mut to_insert = Vec::with_capacity(1000);
for _ in 0..INSERT_BATCH_SIZE {
let queue = if rand::random() { "fetch" } else { "hog" };
let priority = (rand::random::<u16>() % 3) as i16;
let test_job = JobInit {
team_id: 1,
queue_name: queue.to_string(),
priority,
scheduled: now,
function_id: Some(Uuid::now_v7()),
vm_state: None,
parameters: None,
metadata: None,
};
to_insert.push(test_job);
}
let start = Instant::now();
manager.bulk_create_jobs(to_insert).await;
let elapsed = start.elapsed();
time_spent_inserting += Duration::from_std(elapsed).unwrap();
let inserted = shared_context
.jobs_inserted
.fetch_add(INSERT_BATCH_SIZE, std::sync::atomic::Ordering::Relaxed);
println!("Inserted: {} in {}, ", inserted, time_spent_inserting);
let mut dequeued = shared_context
.jobs_dequeued
.load(std::sync::atomic::Ordering::Relaxed);
while inserted > dequeued + 10_000 {
println!(
"Waiting for workers to catch up, lagging by {}",
inserted - dequeued
);
tokio::time::sleep(Duration::milliseconds(100).to_std().unwrap()).await;
dequeued = shared_context
.jobs_dequeued
.load(std::sync::atomic::Ordering::Relaxed);
}
}
}
async fn worker_loop(worker: Worker, shared_context: Arc<SharedContext>, queue: &str) {
let mut time_spent_dequeuing = Duration::zero();
let start = Utc::now();
loop {
let loop_start = Instant::now();
let jobs = worker.dequeue_jobs(queue, 1000).await.unwrap();
if jobs.is_empty() {
println!(
"Worker {:?} outpacing inserts, got no jobs, sleeping!",
queue
);
tokio::time::sleep(Duration::milliseconds(100).to_std().unwrap()).await;
continue;
}
let mut futs = Vec::with_capacity(jobs.len());
for job in &jobs {
worker.set_state(job.id, JobState::Completed).unwrap();
futs.push(worker.flush_job(job.id));
}
for res in join_all(futs).await {
res.unwrap();
}
time_spent_dequeuing += Duration::from_std(loop_start.elapsed()).unwrap();
let dequeued = shared_context
.jobs_dequeued
.fetch_add(jobs.len(), std::sync::atomic::Ordering::Relaxed);
// To account for the bunch we just handled
let dequeued = dequeued + jobs.len();
println!(
"Dequeued, processed and completed {} jobs in {} for {:?}. Total time running: {}",
dequeued,
time_spent_dequeuing,
queue,
Utc::now() - start
);
if jobs.len() < 1000 {
println!(
"Worker {:?} outpacing manager, only got {} jobs, sleeping!",
queue,
jobs.len()
);
tokio::time::sleep(Duration::milliseconds(100).to_std().unwrap()).await;
}
}
}
#[tokio::main]
async fn main() {
let pool_config = PoolConfig {
db_url: "postgresql://posthog:posthog@localhost:5432/cyclotron".to_string(),
max_connections: None,
min_connections: None,
acquire_timeout_seconds: None,
max_lifetime_seconds: None,
idle_timeout_seconds: None,
};
let manager_config = ManagerConfig {
shards: vec![pool_config.clone()],
shard_depth_limit: None,
shard_depth_check_interval_seconds: None,
};
let shared_context = Arc::new(SharedContext {
jobs_inserted: AtomicUsize::new(0),
jobs_dequeued: AtomicUsize::new(0),
});
let manager = QueueManager::new(manager_config).await.unwrap();
let worker_1 = Worker::new(pool_config.clone()).await.unwrap();
let worker_2 = Worker::new(pool_config.clone()).await.unwrap();
let producer = producer_loop(manager, shared_context.clone());
let worker_1 = worker_loop(worker_1, shared_context.clone(), "fetch");
let worker_2 = worker_loop(worker_2, shared_context.clone(), "hog");
let producer = tokio::spawn(producer);
let worker_1 = tokio::spawn(worker_1);
let worker_2 = tokio::spawn(worker_2);
tokio::try_join!(producer, worker_1, worker_2).unwrap();
}

View File

@ -0,0 +1,17 @@
use uuid::Uuid;
#[derive(Debug, thiserror::Error)]
pub enum QueueError {
#[error("sqlx error: {0}")]
SqlxError(#[from] sqlx::Error),
#[error("Unknown job id: {0}")]
UnknownJobId(Uuid), // Happens when someone tries to update a job that this worker didn't dequeue, or that was already flushed
#[error("Job {0} flushed without a new state, which would leave it in a running state forever (or until reaped)")]
FlushWithoutNextState(Uuid),
#[error("Invalid lock {0} used to update job {1}. This usually means a job has been reaped from under a worker - did you forget to set the heartbeat?")]
InvalidLock(Uuid, Uuid),
#[error("Shard over capacity {0} for this manager, insert aborted")]
ShardFull(u64),
#[error("Timed waiting for shard to have capacity")]
TimedOutWaitingForCapacity,
}

View File

@ -0,0 +1,94 @@
use chrono::{Duration, Utc};
use crate::error::QueueError;
// As a general rule, janitor operations are not queue specific (as in, they don't account for the
// queue name). We can revisit this later, if we decide we need the ability to do janitor operations
// on a per-queue basis.
pub async fn delete_completed_jobs<'c, E>(executor: E) -> Result<u64, QueueError>
where
E: sqlx::Executor<'c, Database = sqlx::Postgres>,
{
let result = sqlx::query!("DELETE FROM cyclotron_jobs WHERE state = 'completed'")
.execute(executor)
.await
.map_err(QueueError::from)?;
Ok(result.rows_affected())
}
pub async fn delete_failed_jobs<'c, E>(executor: E) -> Result<u64, QueueError>
where
E: sqlx::Executor<'c, Database = sqlx::Postgres>,
{
let result = sqlx::query!("DELETE FROM cyclotron_jobs WHERE state = 'failed'")
.execute(executor)
.await
.map_err(QueueError::from)?;
Ok(result.rows_affected())
}
// Jobs are considered stalled if their lock is held and their last_heartbeat is older than `timeout`.
// NOTE - because this runs on running jobs, it can stall workers trying to flush updates as it
// executes. I need to use some of the load generators alongside explain/analyze to optimise this (and
// the set of DB indexes)
// TODO - this /could/ return the lock_ids held, which might help with debugging (if workers reported
// the lock_ids they dequeued), but let's not do that right now.
pub async fn reset_stalled_jobs<'c, E>(executor: E, timeout: Duration) -> Result<u64, QueueError>
where
E: sqlx::Executor<'c, Database = sqlx::Postgres>,
{
let oldest_valid_heartbeat = Utc::now() - timeout;
let result = sqlx::query!(r#"
WITH stalled AS (
SELECT id FROM cyclotron_jobs WHERE state = 'running' AND COALESCE(last_heartbeat, $1) <= $1 FOR UPDATE SKIP LOCKED
)
UPDATE cyclotron_jobs
SET state = 'available', lock_id = NULL, last_heartbeat = NULL, janitor_touch_count = janitor_touch_count + 1
FROM stalled
WHERE cyclotron_jobs.id = stalled.id
"#,
oldest_valid_heartbeat
)
.execute(executor)
.await
.map_err(QueueError::from)?;
Ok(result.rows_affected())
}
// Poison pills are jobs whose lock is held and whose heartbeat is older than `timeout`, and that have
// been returned to the queue by the janitor at least `max_janitor_touched` times.
// NOTE - this has the same performance caveat as reset_stalled_jobs
// TODO - This should, instead, move the job row to a dead letter table, for later investigation. Of course,
// rather than doing that, it could just put the job in a "dead letter" state, and no worker or janitor process
// will touch it... maybe the table move isn't needed? Either way, being able to debug jobs that cause workers
// to stall would be good (and, thinking about it, moving the row to a new table means we don't have to clear the lock,
// so we keep a potential way to trace back to the last worker that died holding the job)
pub async fn delete_poison_pills<'c, E>(
executor: E,
timeout: Duration,
max_janitor_touched: i16,
) -> Result<u64, QueueError>
where
E: sqlx::Executor<'c, Database = sqlx::Postgres>,
{
let oldest_valid_heartbeat = Utc::now() - timeout;
// NOTE - we don't check the lock_id here, because it probably doesn't matter (the lock_id should be set if the
// job state is "running"), but perhaps we should only delete jobs with a set lock_id, and report an error
// if we find a job with a state of "running" and no lock_id. Also, we delete jobs whose last_heartbeat is
// null, which again should never happen (dequeuing a job always sets the last_heartbeat), but for
// robustness' sake we may as well handle it
let result = sqlx::query!(
r#"
DELETE FROM cyclotron_jobs WHERE state = 'running' AND COALESCE(last_heartbeat, $1) <= $1 AND janitor_touch_count >= $2
"#,
oldest_valid_heartbeat,
max_janitor_touched
).execute(executor)
.await
.map_err(QueueError::from)?;
Ok(result.rows_affected())
}
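// A minimal sketch of how a janitor process might combine the operations above into one pass,
// assuming a connected pool. The 5 minute stall timeout and the touch limit of 3 are illustrative
// values, not the real defaults.
async fn run_janitor_pass(pool: &sqlx::PgPool) -> Result<(u64, u64, u64, u64), QueueError> {
    let completed = delete_completed_jobs(pool).await?;
    let failed = delete_failed_jobs(pool).await?;
    // Stalled jobs go back to 'available' (and get their janitor_touch_count bumped)...
    let stalled = reset_stalled_jobs(pool, Duration::minutes(5)).await?;
    // ...unless the janitor has already rescued them too many times, in which case they're dropped
    let poisoned = delete_poison_pills(pool, Duration::minutes(5), 3).await?;
    Ok((completed, failed, stalled, poisoned))
}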

View File

@ -0,0 +1,38 @@
use std::time::Duration;
use serde::{Deserialize, Serialize};
use sqlx::{pool::PoolOptions, PgPool};
pub mod base_ops;
pub mod error;
pub mod janitor_ops;
pub mod manager;
pub mod worker;
// A pool config object, designed to be passable across API boundaries
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct PoolConfig {
pub db_url: String,
pub max_connections: Option<u32>, // Default to 10
pub min_connections: Option<u32>, // Default to 1
pub acquire_timeout_seconds: Option<u64>, // Default to 30
pub max_lifetime_seconds: Option<u64>, // Default to 300
pub idle_timeout_seconds: Option<u64>, // Default to 60
}
impl PoolConfig {
pub async fn connect(&self) -> Result<PgPool, sqlx::Error> {
let builder = PoolOptions::new()
.max_connections(self.max_connections.unwrap_or(10))
.min_connections(self.min_connections.unwrap_or(1))
.max_lifetime(Duration::from_secs(
self.max_lifetime_seconds.unwrap_or(300),
))
.idle_timeout(Duration::from_secs(self.idle_timeout_seconds.unwrap_or(60)))
.acquire_timeout(Duration::from_secs(
self.acquire_timeout_seconds.unwrap_or(30),
));
builder.connect(&self.db_url).await
}
}

View File

@ -0,0 +1,262 @@
use std::sync::atomic::AtomicUsize;
use chrono::{DateTime, Duration, Utc};
use serde::{Deserialize, Serialize};
use sqlx::PgPool;
use tokio::sync::RwLock;
use crate::{
base_ops::{bulk_create_jobs, count_total_waiting_jobs, create_job, JobInit},
error::QueueError,
PoolConfig,
};
pub const DEFAULT_QUEUE_DEPTH_LIMIT: u64 = 10_000;
pub const DEFAULT_SHARD_HEALTH_CHECK_INTERVAL: u64 = 10;
// TODO - right now, a lot of this sharding stuff will be hollow, but later we'll add logic like
// e.g. routing work to alive shards if one is down, or reporting shard failure, etc.
// TODO - here's also where queue management commands will go, like "downgrade the priority of this function"
// or "pause jobs for this team", but we're going to add those ad-hoc as they're needed, not up front
#[derive(Debug, Serialize, Deserialize)]
pub struct ManagerConfig {
pub shards: Vec<PoolConfig>,
pub shard_depth_limit: Option<u64>, // Defaults to 10_000 available jobs per shard
pub shard_depth_check_interval_seconds: Option<u64>, // Defaults to 10 seconds - checking shard capacity
}
pub struct Shard {
pub pool: PgPool,
pub last_healthy: RwLock<DateTime<Utc>>,
pub check_interval: Duration,
pub depth_limit: u64,
}
pub struct QueueManager {
shards: RwLock<Vec<Shard>>,
next_shard: AtomicUsize,
}
// Bulk inserts across multiple shards can partially succeed, so we need to track failures
// and hand back failed job inits to the caller.
pub struct BulkInsertResult {
pub failures: Vec<(QueueError, Vec<JobInit>)>,
}
impl QueueManager {
pub async fn new(config: ManagerConfig) -> Result<Self, QueueError> {
let mut shards = vec![];
let depth_limit = config
.shard_depth_limit
.unwrap_or(DEFAULT_QUEUE_DEPTH_LIMIT);
let check_interval = Duration::seconds(
config
.shard_depth_check_interval_seconds
.unwrap_or(DEFAULT_SHARD_HEALTH_CHECK_INTERVAL) as i64,
);
for shard in config.shards {
let pool = shard.connect().await.unwrap();
let shard = Shard::new(pool, depth_limit, check_interval);
shards.push(shard);
}
Ok(Self {
shards: RwLock::new(shards),
next_shard: AtomicUsize::new(0),
})
}
// Designed mostly to be used for testing, but safe enough to expose publicly
pub fn from_pool(pool: PgPool) -> Self {
Self {
shards: RwLock::new(vec![Shard::new(
pool,
DEFAULT_QUEUE_DEPTH_LIMIT,
Duration::seconds(DEFAULT_SHARD_HEALTH_CHECK_INTERVAL as i64),
)]),
next_shard: AtomicUsize::new(0),
}
}
pub async fn create_job(&self, init: JobInit) -> Result<(), QueueError> {
// TODO - here is where a lot of shard health and failover logic will go, eventually.
let next = self
.next_shard
.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
let shards = self.shards.read().await;
let shard = &shards[next % shards.len()];
shard.create_job(init).await
}
pub async fn create_job_blocking(
&self,
init: JobInit,
timeout: Option<Duration>,
) -> Result<(), QueueError> {
let next = self
.next_shard
.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
let shards = self.shards.read().await;
let shard = &shards[next % shards.len()];
shard.create_job_blocking(init, timeout).await
}
pub async fn bulk_create_jobs(&self, inits: Vec<JobInit>) -> BulkInsertResult {
let shards = self.shards.read().await;
// chunks() panics on a chunk size of 0, so clamp in case there are fewer inits than shards
let chunk_size = (inits.len() / shards.len()).max(1);
let mut result = BulkInsertResult::new();
// TODO - at some point, we should dynamically re-acquire the lock each time, to allow
// for re-routing jobs away from a bad shard during a bulk insert, but right now, we
// don't even re-try inserts. Later work.
for chunk in inits.chunks(chunk_size) {
let next_shard = self
.next_shard
.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
let shard = &shards[next_shard % shards.len()];
let shard_result = shard.bulk_create_jobs(chunk).await;
if let Err(err) = shard_result {
result.add_failure(err, chunk.to_vec());
}
}
result
}
pub async fn bulk_create_jobs_blocking(
&self,
inits: Vec<JobInit>,
timeout: Option<Duration>,
) -> BulkInsertResult {
let shards = self.shards.read().await;
// chunks() panics on a chunk size of 0, so clamp in case there are fewer inits than shards
let chunk_size = (inits.len() / shards.len()).max(1);
let mut result = BulkInsertResult::new();
for chunk in inits.chunks(chunk_size) {
let next_shard = self
.next_shard
.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
let shard = &shards[next_shard % shards.len()];
// TODO - we sequentially try each shard, but we could try to parallelize this.
let shard_result = shard.bulk_create_jobs_blocking(chunk, timeout).await;
if let Err(err) = shard_result {
result.add_failure(err, chunk.to_vec());
}
}
result
}
}
impl Shard {
pub fn new(pool: PgPool, depth_limit: u64, check_interval: Duration) -> Self {
Self {
pool,
last_healthy: RwLock::new(Utc::now() - check_interval),
check_interval,
depth_limit,
}
}
// Inserts a job, failing if the shard is at capacity
pub async fn create_job(&self, init: JobInit) -> Result<(), QueueError> {
self.insert_guard().await?;
create_job(&self.pool, init).await
}
// Inserts a vec of jobs, failing if the shard is at capacity. Note "capacity" here just
// means "it isn't totally full" - if there's "capacity" for 1 job, and this is a vec of
// 1000, we still insert all 1000.
pub async fn bulk_create_jobs(&self, inits: &[JobInit]) -> Result<(), QueueError> {
self.insert_guard().await?;
bulk_create_jobs(&self.pool, inits).await
}
// Inserts a job, blocking until there's capacity (or until the timeout is reached)
pub async fn create_job_blocking(
&self,
init: JobInit,
timeout: Option<Duration>,
) -> Result<(), QueueError> {
let start = Utc::now();
while self.is_full().await? {
tokio::time::sleep(Duration::milliseconds(100).to_std().unwrap()).await;
if let Some(timeout) = &timeout {
if Utc::now() - start > *timeout {
return Err(QueueError::TimedOutWaitingForCapacity);
}
}
}
create_job(&self.pool, init).await
}
pub async fn bulk_create_jobs_blocking(
&self,
inits: &[JobInit],
timeout: Option<Duration>,
) -> Result<(), QueueError> {
let start = Utc::now();
while self.is_full().await? {
tokio::time::sleep(Duration::milliseconds(100).to_std().unwrap()).await;
if let Some(timeout) = &timeout {
if Utc::now() - start > *timeout {
return Err(QueueError::TimedOutWaitingForCapacity);
}
}
}
bulk_create_jobs(&self.pool, inits).await
}
pub async fn insert_guard(&self) -> Result<(), QueueError> {
if self.is_full().await? {
return Err(QueueError::ShardFull(self.depth_limit));
}
Ok(())
}
pub async fn is_full(&self) -> Result<bool, QueueError> {
let last_healthy = self.last_healthy.read().await;
// If we were healthy less than the check interval ago, assume we are still
if Utc::now() - *last_healthy < self.check_interval {
return Ok(false);
}
// Grab a write lock. This constrains the number of concurrent capacity checks
// to 1, purposefully - if someone spawns a thousand tasks to blockingly create
// a job, we don't want all of them to be querying the available count at once.
drop(last_healthy);
let mut last_healthy = self.last_healthy.write().await;
// TOCTOU - multiple tasks could be racing to re-do the check; the first time one
// succeeds, all the rest should skip it.
if Utc::now() - *last_healthy < self.check_interval {
return Ok(false);
}
let pending = count_total_waiting_jobs(&self.pool).await?;
let is_full = pending >= self.depth_limit;
if !is_full {
*last_healthy = Utc::now();
}
Ok(is_full)
}
}
impl BulkInsertResult {
pub fn new() -> Self {
Self { failures: vec![] }
}
pub fn add_failure(&mut self, err: QueueError, jobs: Vec<JobInit>) {
self.failures.push((err, jobs));
}
pub fn all_succeeded(&self) -> bool {
self.failures.is_empty()
}
}
impl Default for BulkInsertResult {
fn default() -> Self {
Self::new()
}
}
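// A minimal sketch of driving a QueueManager directly, assuming the caller already holds
// PoolConfigs for two shard databases. Jobs are round-robined across shards, and the blocking
// variant waits (up to the timeout) for the chosen shard to drop below its depth limit.
async fn example_manager_usage(
    shard_a: PoolConfig,
    shard_b: PoolConfig,
    init: JobInit,
) -> Result<(), QueueError> {
    let config = ManagerConfig {
        shards: vec![shard_a, shard_b],
        shard_depth_limit: Some(50_000),
        shard_depth_check_interval_seconds: Some(10),
    };
    let manager = QueueManager::new(config).await?;
    // Fail fast if the chosen shard is at capacity...
    manager.create_job(init.clone()).await?;
    // ...or wait up to 5 seconds for capacity to free up instead
    manager
        .create_job_blocking(init, Some(Duration::seconds(5)))
        .await?;
    Ok(())
}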

View File

@ -0,0 +1,229 @@
use std::{collections::HashMap, sync::Arc};
use chrono::{DateTime, Utc};
use sqlx::PgPool;
use std::sync::Mutex;
use uuid::Uuid;
use crate::{
base_ops::{
dequeue_jobs, dequeue_with_vm_state, flush_job, set_heartbeat, Job, JobState, JobUpdate,
},
error::QueueError,
PoolConfig,
};
// The worker's interface to the underlying queue system - a worker can do everything except
// create jobs (because job creation has to be shard-aware).
//
// This interface looks strange, because a lot of things that would normally be done with lifetimes
// and references are done with UUIDs instead (and we lose some nice RAII stuff as a result), but
// the reason for this is that this is designed to be embedded in other runtimes, where handing out
// lifetime'd references or things with Drop impls isn't really practical. This makes it a little
// awkward to use, but since it's meant to be the core of other abstractions, I think it's ok for
// now (client libraries should wrap this to provide better interfaces).
pub struct Worker {
pool: PgPool,
// All dequeued job IDs that haven't been flushed yet. The idea is this lets us
// manage, on the rust side of any API boundary, the "pending" update of any given
// job, such that a user can progressively build up a full update, and then flush it,
// rather than having to track the update state on their side and submit it all at once
// TODO - we don't handle people "forgetting" to abort a job, because we expect that to
// only happen if a process dies (in which case the job queue janitor should handle
// it)... this is a memory leak, but I think it's ok.
// TRICKY - this is a sync mutex, because we never hold it across an await point, and that
// radically simplifies using this for FFI (because there's no message passing across runtimes)
pending: Arc<Mutex<HashMap<Uuid, JobUpdate>>>,
}
impl Worker {
pub async fn new(config: PoolConfig) -> Result<Self, QueueError> {
let pool = config.connect().await?;
Ok(Self {
pool,
pending: Arc::new(Mutex::new(HashMap::new())),
})
}
pub fn from_pool(pool: PgPool) -> Self {
Self {
pool,
pending: Arc::new(Mutex::new(HashMap::new())),
}
}
/// Dequeues jobs from the queue, and returns them. Job sorting happens at the queue level,
/// workers can't provide any filtering or sorting criteria - queue managers decide which jobs are run,
/// workers just run them.
pub async fn dequeue_jobs(&self, queue: &str, limit: usize) -> Result<Vec<Job>, QueueError> {
let jobs = dequeue_jobs(&self.pool, queue, limit).await?;
let mut pending = self.pending.lock().unwrap();
for job in &jobs {
// We need to hang onto the locks for a job until we flush it, so we can send updates.
let update = JobUpdate::new(
job.lock_id
.expect("Yell at oliver that the dequeuing code is broken. He's very sorry that your process just panicked"),
);
pending.insert(job.id, update);
}
Ok(jobs)
}
/// This is the same as dequeue_jobs, but it also returns the vm_state of the job
pub async fn dequeue_with_vm_state(
&self,
queue: &str,
limit: usize,
) -> Result<Vec<Job>, QueueError> {
let jobs = dequeue_with_vm_state(&self.pool, queue, limit).await?;
let mut pending = self.pending.lock().unwrap();
for job in &jobs {
// We need to hang onto the locks for a job until we flush it, so we can send updates.
let update = JobUpdate::new(
job.lock_id
.expect("Yell at oliver that the dequeuing (with vm) code is broken. He's very sorry that your process just panicked"),
);
pending.insert(job.id, update);
}
Ok(jobs)
}
/// NOTE - This function can only be called once, even though the underlying
/// basic operation can be performed as many times as the caller likes (so long as
/// the job state is never set to something other than running, as that clears the
/// job lock). We're more strict here (flushes can only happen once, you must
/// flush some non-running state) to try and enforce a good interaction
/// pattern with the queue. I might return to this and loosen this constraint in the
/// future, if there's a motivating case for needing to flush partial job updates.
pub async fn flush_job(&self, job_id: Uuid) -> Result<(), QueueError> {
// TODO - this drops the job from the known jobs before the flush succeeds,
// which means that if the flush fails, we'll lose the job and can never
// update its state (leaving it to the reaper). This is a bug, but I'm not
// sure I want to make flushes retryable just yet, so I'm leaving it for now.
// NIT: this wrapping is to ensure pending is dropped prior to the await
let update = {
let mut pending = self.pending.lock().unwrap();
let update = pending
.remove(&job_id)
.ok_or(QueueError::UnknownJobId(job_id))?;
// It's a programming error to flush a job without setting a new state
match update.state {
Some(JobState::Running) | None => {
// Keep track of any /other/ updates that might have been stored, even in this case,
// so a user can queue up the appropriate state transition and flush properly
pending.insert(job_id, update);
return Err(QueueError::FlushWithoutNextState(job_id));
}
_ => update,
}
};
let mut connection = self.pool.acquire().await?;
flush_job(connection.as_mut(), job_id, update).await
}
/// Jobs are reaped after some seconds (the number is deployment specific, and may become
/// specific to job properties like queue name in the future, as we figure out what /kinds/ of
/// jobs are longer or shorter running). A job is considered "dead" if it's in a running state
/// and its last heartbeat was more than the reaping time ago. This, like flush, returns an
/// error if you try to set the heartbeat on a job whose lock you don't have (which can happen
/// if e.g. the job was reaped out from under you).
pub async fn heartbeat(&self, job_id: Uuid) -> Result<(), QueueError> {
let lock_id = {
let pending = self.pending.lock().unwrap();
pending
.get(&job_id)
.ok_or(QueueError::UnknownJobId(job_id))?
.lock_id
};
let mut connection = self.pool.acquire().await?;
set_heartbeat(connection.as_mut(), job_id, lock_id).await
}
/// Set the job's next state - e.g. this is how you "return" a job to the queue, by setting the state to "available"
pub fn set_state(&self, job_id: Uuid, state: JobState) -> Result<(), QueueError> {
let mut pending = self.pending.lock().unwrap();
pending
.get_mut(&job_id)
.ok_or(QueueError::UnknownJobId(job_id))?
.state = Some(state);
Ok(())
}
pub fn set_queue(&self, job_id: Uuid, queue: &str) -> Result<(), QueueError> {
let mut pending = self.pending.lock().unwrap();
pending
.get_mut(&job_id)
.ok_or(QueueError::UnknownJobId(job_id))?
.queue_name = Some(queue.to_string());
Ok(())
}
/// Jobs are dequeued lowest-priority-first, so this is how you change the "base" priority of a job
/// (control tables may apply further deltas if e.g. a given function is in a degraded state)
pub fn set_priority(&self, job_id: Uuid, priority: i16) -> Result<(), QueueError> {
let mut pending = self.pending.lock().unwrap();
pending
.get_mut(&job_id)
.ok_or(QueueError::UnknownJobId(job_id))?
.priority = Some(priority);
Ok(())
}
/// This is how you do e.g. retries after some time, by setting the scheduled time
/// to some time in the future. Sleeping, retry backoff, scheduling - it's all the same operation,
/// this one.
pub fn set_scheduled_at(
&self,
job_id: Uuid,
scheduled: DateTime<Utc>,
) -> Result<(), QueueError> {
let mut pending = self.pending.lock().unwrap();
pending
.get_mut(&job_id)
.ok_or(QueueError::UnknownJobId(job_id))?
.scheduled = Some(scheduled);
Ok(())
}
/// Passing None here will clear the vm_state
pub fn set_vm_state(
&self,
job_id: Uuid,
vm_state: Option<String>, // This (and the following) are Options, because the user can null them (by calling with None)
) -> Result<(), QueueError> {
let mut pending = self.pending.lock().unwrap();
pending
.get_mut(&job_id)
.ok_or(QueueError::UnknownJobId(job_id))?
.vm_state = Some(vm_state);
Ok(())
}
/// Passing None here will clear the metadata
pub fn set_metadata(&self, job_id: Uuid, metadata: Option<String>) -> Result<(), QueueError> {
let mut pending = self.pending.lock().unwrap();
pending
.get_mut(&job_id)
.ok_or(QueueError::UnknownJobId(job_id))?
.metadata = Some(metadata);
Ok(())
}
/// Passing None here will clear the parameters
pub fn set_parameters(
&self,
job_id: Uuid,
parameters: Option<String>,
) -> Result<(), QueueError> {
let mut pending = self.pending.lock().unwrap();
pending
.get_mut(&job_id)
.ok_or(QueueError::UnknownJobId(job_id))?
.parameters = Some(parameters);
Ok(())
}
}
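// A minimal sketch of the intended usage pattern for Worker, tying the methods above together:
// dequeue a batch, do the work (heartbeating as you go), queue up per-job updates, then flush each
// job exactly once. The work itself is a placeholder here.
async fn example_worker_pass(worker: &Worker, queue: &str) -> Result<(), QueueError> {
    let jobs = worker.dequeue_jobs(queue, 100).await?;
    for job in jobs {
        // ... long-running work goes here, calling worker.heartbeat(job.id).await? periodically ...
        worker.set_state(job.id, JobState::Completed)?; // or Failed, or Available to requeue
        worker.set_vm_state(job.id, None)?; // clear any stored VM state
        worker.flush_job(job.id).await?; // releases the lock, since the new state isn't Running
    }
    Ok(())
}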

View File

@ -0,0 +1,255 @@
use std::sync::Arc;
use chrono::{Duration, Utc};
use common::{assert_job_matches_init, create_new_job, dates_match};
use cyclotron_core::{
base_ops::{bulk_create_jobs, JobState},
manager::QueueManager,
worker::Worker,
};
use sqlx::PgPool;
use uuid::Uuid;
mod common;
// I know this should be a bunch of tests, but for hacking together stuff right now, it'll do
#[sqlx::test(migrations = "./migrations")]
async fn test_queue(db: PgPool) {
let manager = QueueManager::from_pool(db.clone());
let worker = Worker::from_pool(db);
let job_1 = create_new_job();
let mut job_2 = create_new_job();
job_2.priority = 2; // Lower priority jobs should be returned second
let queue_name = job_1.queue_name.clone();
manager
.create_job(job_1.clone())
.await
.expect("failed to create job");
manager
.create_job(job_2.clone())
.await
.expect("failed to create job");
let jobs = worker
.dequeue_jobs(&queue_name, 2)
.await
.expect("failed to dequeue job");
assert_eq!(jobs.len(), 2);
// This also asserts that the ordering is correct in terms of priority
assert_job_matches_init(&jobs[0], &job_1);
assert_job_matches_init(&jobs[1], &job_2);
// Now we can re-queue these jobs (imagine we had done work)
worker
.set_state(jobs[0].id, JobState::Available)
.expect("failed to set state");
worker
.set_state(jobs[1].id, JobState::Available)
.expect("failed to set state");
// Flush the two jobs, having made no other changes, then assert we can re-dequeue them
worker
.flush_job(jobs[0].id)
.await
.expect("failed to flush job");
worker
.flush_job(jobs[1].id)
.await
.expect("failed to flush job");
let jobs = worker
.dequeue_jobs(&queue_name, 2)
.await
.expect("failed to dequeue job");
assert_eq!(jobs.len(), 2);
assert_job_matches_init(&jobs[0], &job_1);
assert_job_matches_init(&jobs[1], &job_2);
// Re-queue them again
worker
.set_state(jobs[0].id, JobState::Available)
.expect("failed to set state");
worker
.set_state(jobs[1].id, JobState::Available)
.expect("failed to set state");
worker
.flush_job(jobs[0].id)
.await
.expect("failed to flush job");
worker
.flush_job(jobs[1].id)
.await
.expect("failed to flush job");
// Spin up two tasks to race on dequeuing, and assert at most 2 jobs are dequeued
let worker = Arc::new(worker);
let moved = worker.clone();
let queue_name_moved = queue_name.clone();
let fut_1 = async move {
moved
.dequeue_jobs(&queue_name_moved, 2)
.await
.expect("failed to dequeue job")
};
let moved = worker.clone();
let queue_name_moved = queue_name.clone();
let fut_2 = async move {
moved
.dequeue_jobs(&queue_name_moved, 2)
.await
.expect("failed to dequeue job")
};
let (jobs_1, jobs_2) = tokio::join!(fut_1, fut_2);
assert_eq!(jobs_1.len() + jobs_2.len(), 2);
let jobs = jobs_1
.into_iter()
.chain(jobs_2.into_iter())
.collect::<Vec<_>>();
// And now, any subsequent dequeues will return no jobs
let empty = worker
.dequeue_jobs(&queue_name, 2)
.await
.expect("failed to dequeue job");
assert_eq!(empty.len(), 0);
// If we try to flush a job without setting what its next state will be (or if we set that next state to be "running"),
// we should get an error
worker
.flush_job(jobs[0].id)
.await
.expect_err("expected error due to no-next-state");
worker
.set_state(jobs[1].id, JobState::Running)
.expect("failed to set state");
worker
.flush_job(jobs[1].id)
.await
.expect_err("expected error due to running state");
// But if we properly set the state to completed or failed, now we can flush
worker
.set_state(jobs[0].id, JobState::Completed)
.expect("failed to set state");
worker
.set_state(jobs[1].id, JobState::Failed)
.expect("failed to set state");
worker
.flush_job(jobs[0].id)
.await
.expect("failed to flush job");
worker
.flush_job(jobs[1].id)
.await
.expect("failed to flush job");
// And now, any subsequent dequeues will return no jobs (because these jobs are finished)
let empty = worker
.dequeue_jobs(&queue_name, 2)
.await
.expect("failed to dequeue job");
assert_eq!(empty.len(), 0);
// Now, lets check that we can set every variable on a job
// Set up some initial values
let now = Utc::now();
let mut job = create_new_job();
job.queue_name = "test".to_string();
job.priority = 0;
job.scheduled = now - Duration::minutes(2);
job.vm_state = None;
job.parameters = None;
job.metadata = None;
// Queue the job
manager
.create_job(job.clone())
.await
.expect("failed to create job");
// Then dequeue it
let job = worker
.dequeue_jobs("test", 1)
.await
.expect("failed to dequeue job")
.pop()
.expect("failed to dequeue job");
// Set everything we're able to set, including state to available, so we can dequeue it again
worker
.set_state(job.id, JobState::Available)
.expect("failed to set state");
worker
.set_queue(job.id, "test_2")
.expect("failed to set queue");
worker
.set_priority(job.id, 1)
.expect("failed to set priority");
worker
.set_scheduled_at(job.id, now - Duration::minutes(10))
.expect("failed to set scheduled_at");
worker
.set_vm_state(job.id, Some("test".to_string()))
.expect("failed to set vm_state");
worker
.set_parameters(job.id, Some("test".to_string()))
.expect("failed to set parameters");
worker
.set_metadata(job.id, Some("test".to_string()))
.expect("failed to set metadata");
// Flush the job
worker.flush_job(job.id).await.expect("failed to flush job");
// Then dequeue it again (this time being sure to grab the vm state too)
let job = worker
.dequeue_with_vm_state("test_2", 1)
.await
.expect("failed to dequeue job")
.pop()
.expect("failed to dequeue job");
// And every value should be the updated one
assert_eq!(job.queue_name, "test_2");
assert_eq!(job.priority, 1);
assert!(dates_match(&job.scheduled, &(now - Duration::minutes(10))),);
assert_eq!(job.vm_state, Some("test".to_string()));
assert_eq!(job.parameters, Some("test".to_string()));
assert_eq!(job.metadata, Some("test".to_string()));
}
#[sqlx::test(migrations = "./migrations")]
pub async fn test_bulk_insert(db: PgPool) {
let worker = Worker::from_pool(db.clone());
let job_template = create_new_job();
let jobs = (0..1000)
.map(|_| {
let mut job = job_template.clone();
job.function_id = Some(Uuid::now_v7());
job
})
.collect::<Vec<_>>();
bulk_create_jobs(&db, &jobs).await.unwrap();
let dequeue_jobs = worker
.dequeue_jobs(&job_template.queue_name, 1000)
.await
.expect("failed to dequeue job");
assert_eq!(dequeue_jobs.len(), 1000);
}

View File

@ -0,0 +1,40 @@
use chrono::{DateTime, Duration, Utc};
use cyclotron_core::base_ops::{Job, JobInit};
use uuid::Uuid;
#[allow(dead_code)]
pub fn create_new_job() -> JobInit {
JobInit {
team_id: 1,
function_id: Some(Uuid::now_v7()), // Lets us uniquely identify jobs without having the Uuid
queue_name: "test".to_string(),
priority: 0,
scheduled: Utc::now() - Duration::minutes(1),
vm_state: None,
parameters: None,
metadata: None,
}
}
#[allow(dead_code)]
pub fn dates_match(left: &DateTime<Utc>, right: &DateTime<Utc>) -> bool {
// Roundtripping a datetime to PG can cause sub-ms differences, so we need to check within a margin of error
// Seeing errors like this in CI:
// assertion `left == right` failed
// left: 2024-08-08T20:41:55.964936Z
// right: 2024-08-08T20:41:55.964936997Z
let diff = *left - *right;
diff.abs() < Duration::milliseconds(1)
}
#[allow(dead_code)]
pub fn assert_job_matches_init(job: &Job, init: &JobInit) {
assert_eq!(job.team_id, init.team_id);
assert_eq!(job.function_id, init.function_id);
assert_eq!(job.queue_name, init.queue_name);
assert_eq!(job.priority, init.priority);
assert!(dates_match(&job.scheduled, &init.scheduled));
assert_eq!(job.vm_state, init.vm_state);
assert_eq!(job.parameters, init.parameters);
assert_eq!(job.metadata, init.metadata);
}

View File

@ -0,0 +1,68 @@
use chrono::{Duration, Utc};
use common::create_new_job;
use cyclotron_core::manager::Shard;
use sqlx::PgPool;
use tokio::sync::RwLock;
mod common;
pub fn get_shard(db: PgPool) -> Shard {
Shard {
pool: db,
last_healthy: RwLock::new(Utc::now()),
check_interval: Duration::milliseconds(0), // We always want to check the limit, for these tests
depth_limit: 10,
}
}
#[sqlx::test(migrations = "./migrations")]
pub async fn test_shard_limiting(db: PgPool) {
let shard = get_shard(db.clone());
// We should be able to insert 10 jobs
for _ in 0..10 {
shard.create_job(create_new_job()).await.unwrap();
}
// And then we should fail on the 11th
let result = shard.create_job(create_new_job()).await;
assert!(result.is_err());
}
#[sqlx::test(migrations = "./migrations")]
pub async fn test_shard_blocking_insert_waits(db: PgPool) {
let shard = get_shard(db.clone());
// We should be able to insert 10 jobs
for _ in 0..10 {
shard.create_job(create_new_job()).await.unwrap();
}
let timeout = Some(Duration::milliseconds(50));
let start = Utc::now();
// And then we should fail on the 11th
let result = shard.create_job_blocking(create_new_job(), timeout).await;
assert!(result.is_err());
// We should have waited at least 50ms
assert!(Utc::now() - start >= Duration::milliseconds(50));
}
#[sqlx::test(migrations = "./migrations")]
pub async fn test_shard_allows_bulk_inserts_beyond_capacity(db: PgPool) {
let shard = get_shard(db.clone());
// We should be able to insert 10 jobs
for _ in 0..9 {
shard.create_job(create_new_job()).await.unwrap();
}
// And then we should be able to bulk insert 1000
let inits = (0..1000).map(|_| create_new_job()).collect::<Vec<_>>();
shard.bulk_create_jobs(&inits).await.unwrap();
// And the next insert should fail
let result = shard.create_job(create_new_job()).await;
assert!(result.is_err());
}

View File

@ -0,0 +1,32 @@
[package]
name = "cyclotron-fetch"
version = "0.1.0"
edition = "2021"
[lints]
workspace = true
[dependencies]
tracing-subscriber = { workspace = true }
chrono = { workspace = true }
tokio = { workspace = true }
tracing = { workspace = true }
uuid = { workspace = true }
envconfig = { workspace = true }
axum = { workspace = true }
thiserror = { workspace = true }
metrics = { workspace = true }
cyclotron-core = { path = "../cyclotron-core" }
common-metrics = { path = "../common/metrics" }
common-dns = { path = "../common/dns" }
health = { path = "../common/health" }
reqwest = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
http = { workspace = true }
rand = { workspace = true }
futures = { workspace = true }
[dev-dependencies]
sqlx = { workspace = true }
httpmock = { workspace = true }

View File

@ -0,0 +1,104 @@
use chrono::Duration;
use cyclotron_core::PoolConfig;
use envconfig::Envconfig;
use uuid::Uuid;
#[derive(Envconfig)]
pub struct Config {
#[envconfig(from = "BIND_HOST", default = "::")]
pub host: String,
#[envconfig(from = "BIND_PORT", default = "3304")]
pub port: u16,
#[envconfig(default = "postgres://posthog:posthog@localhost:5432/cyclotron")]
pub database_url: String,
#[envconfig(default = "10")]
pub pg_max_connections: u32,
#[envconfig(default = "1")]
pub pg_min_connections: u32,
#[envconfig(default = "30")]
pub pg_acquire_timeout_seconds: u64,
#[envconfig(default = "300")]
pub pg_max_lifetime_seconds: u64,
#[envconfig(default = "60")]
pub pg_idle_timeout_seconds: u64,
#[envconfig(default = "false")]
pub allow_internal_ips: bool,
pub worker_id: Option<String>, // Default to a UUID
pub job_poll_interval_seconds: Option<u32>, // Defaults to 1
pub concurrent_requests_limit: Option<u32>, // Defaults to 1000
pub fetch_timeout_seconds: Option<u32>, // Defaults to 30
pub max_retry_attempts: Option<u32>, // Defaults to 10
pub queue_served: Option<String>, // Default to "fetch"
pub batch_size: Option<usize>, // Defaults to 1000
pub max_response_bytes: Option<usize>, // Defaults to 1MB
pub retry_backoff_base_ms: Option<u32>, // Defaults to 4000
}
// I do this instead of using envconfig's defaults because
// envconfig doesn't support defaults provided by functions,
// which is frustrating when I want to use UUIDs, and if I'm
// going to break out one field, I might as well break out
// everything into "AppConfig" and "PoolConfig"
#[derive(Debug, Clone)]
pub struct AppConfig {
pub host: String,
pub port: u16,
pub worker_id: String,
pub job_poll_interval: Duration, // How long we wait to poll for new jobs, when we're at capacity or find no new jobs
pub concurrent_requests_limit: u32,
pub fetch_timeout: Duration,
pub max_retry_attempts: u32,
pub queue_served: String,
pub batch_size: usize,
pub max_response_bytes: usize,
pub retry_backoff_base: Duration, // Job retry backoff times are this * attempt count
pub allow_internal_ips: bool,
}
impl Config {
pub fn to_components(self) -> (AppConfig, PoolConfig) {
let worker_id = self.worker_id.unwrap_or_else(|| Uuid::now_v7().to_string());
let job_poll_interval_seconds = self.job_poll_interval_seconds.unwrap_or(1);
let concurrent_requests_limit = self.concurrent_requests_limit.unwrap_or(1000);
let fetch_timeout_seconds = self.fetch_timeout_seconds.unwrap_or(30);
let max_retry_attempts = self.max_retry_attempts.unwrap_or(10);
let queue_served = self.queue_served.unwrap_or_else(|| "fetch".to_string());
let app_config = AppConfig {
host: self.host,
port: self.port,
worker_id,
job_poll_interval: Duration::seconds(job_poll_interval_seconds as i64),
concurrent_requests_limit,
fetch_timeout: Duration::seconds(fetch_timeout_seconds as i64),
max_retry_attempts,
queue_served,
batch_size: self.batch_size.unwrap_or(1000),
max_response_bytes: self.max_response_bytes.unwrap_or(1024 * 1024),
retry_backoff_base: Duration::milliseconds(
self.retry_backoff_base_ms.unwrap_or(4000) as i64
),
allow_internal_ips: self.allow_internal_ips,
};
let pool_config = PoolConfig {
db_url: self.database_url,
max_connections: Some(self.pg_max_connections),
min_connections: Some(self.pg_min_connections),
acquire_timeout_seconds: Some(self.pg_acquire_timeout_seconds),
max_lifetime_seconds: Some(self.pg_max_lifetime_seconds),
idle_timeout_seconds: Some(self.pg_idle_timeout_seconds),
};
(app_config, pool_config)
}
}

View File

@ -0,0 +1,55 @@
use std::sync::Arc;
use cyclotron_core::{worker::Worker, PoolConfig};
use health::HealthHandle;
use tokio::sync::Semaphore;
use crate::{config::AppConfig, fetch::FetchError};
pub struct AppContext {
pub worker: Worker,
pub client: reqwest::Client,
pub concurrency_limit: Arc<Semaphore>,
pub liveness: HealthHandle,
pub config: AppConfig,
}
impl AppContext {
pub async fn create(
config: AppConfig,
pool_config: PoolConfig,
liveness: HealthHandle,
) -> Result<Self, FetchError> {
let concurrency_limit = Arc::new(Semaphore::new(config.concurrent_requests_limit as usize));
let resolver = Arc::new(common_dns::PublicIPv4Resolver {});
let mut client = reqwest::Client::builder().timeout(config.fetch_timeout.to_std().unwrap());
if !config.allow_internal_ips {
client = client.dns_resolver(resolver);
}
let client = client.build();
let client = match client {
Ok(c) => c,
Err(e) => {
return Err(FetchError::StartupError(format!(
"Failed to create reqwest client: {}",
e
)));
}
};
let worker = Worker::new(pool_config).await?;
Ok(Self {
worker,
client,
concurrency_limit,
liveness,
config,
})
}
}

View File

@ -0,0 +1,653 @@
use std::{cmp::min, collections::HashMap, sync::Arc};
use chrono::{DateTime, Duration, Utc};
use cyclotron_core::{
base_ops::{Job, JobState},
error::QueueError,
worker::Worker,
};
use futures::StreamExt;
use http::StatusCode;
use reqwest::Response;
use serde::{Deserialize, Serialize};
use thiserror::Error;
use tokio::sync::OwnedSemaphorePermit;
use tracing::error;
use crate::context::AppContext;
// TODO - a lot of these should maybe be configurable
pub const DEAD_LETTER_QUEUE: &str = "fetch-dead-letter";
pub const DEFAULT_RETRIES: u32 = 3;
pub const DEFAULT_ON_FINISH: OnFinish = OnFinish::Return;
pub const HEARTBEAT_INTERVAL_MS: i64 = 5000;
// Exclusively for errors in the worker - these will
// never be serialised into the job queue, and indicate
// bad worker health. As a general rule, if one of these
// is produced, we should let the worker fall over (as in,
// the outer worker loop should exit).
#[derive(Error, Debug)]
pub enum FetchError {
#[error("timeout fetching jobs")]
JobFetchTimeout,
#[error(transparent)]
QueueError(#[from] QueueError),
// TRICKY - in most cases, serde errors are a FetchError (something coming from the queue was
// invalid), but this is used in cases where /we/ fail to serialise something /to/ the queue
#[error(transparent)]
SerdeError(#[from] serde_json::Error),
// We failed doing some kind of setup, like creating a reqwest client
#[error("error during startup: {0}")]
StartupError(String),
}
#[derive(Debug, Serialize, Deserialize, Clone, Copy)]
#[serde(rename_all = "UPPERCASE")]
pub enum HttpMethod {
Get,
Post,
Patch,
Put,
Delete,
}
impl From<&HttpMethod> for http::Method {
fn from(method: &HttpMethod) -> Self {
match method {
HttpMethod::Get => http::Method::GET,
HttpMethod::Post => http::Method::POST,
HttpMethod::Patch => http::Method::PATCH,
HttpMethod::Put => http::Method::PUT,
HttpMethod::Delete => http::Method::DELETE,
}
}
}
// What does someone need to give us to execute a fetch?
#[derive(Debug, Serialize, Deserialize, Clone)]
#[serde(rename_all = "lowercase")]
pub struct FetchParameters {
pub url: String,
pub method: HttpMethod,
pub return_queue: String,
pub headers: Option<HashMap<String, String>>,
pub body: Option<String>,
pub max_tries: Option<u32>, // Defaults to 3
pub on_finish: Option<OnFinish>, // Defaults to Return
}
// What should we do when we get a result, or run out of tries for a given job?
// Return means re-queue to the return_worker, Complete means mark as Completed/Failed
#[derive(Debug, Serialize, Deserialize, Clone, Copy)]
#[serde(rename_all = "lowercase")]
pub enum OnFinish {
Return,
Complete,
}
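// A minimal sketch of how a caller might hand work to this service: serialise FetchParameters
// into a job's `parameters` field and enqueue it on the "fetch" queue. The URL, body, return
// queue name and team id below are illustrative assumptions.
fn example_fetch_job() -> Result<cyclotron_core::base_ops::JobInit, serde_json::Error> {
    let params = FetchParameters {
        url: "https://example.com/hook".to_string(),
        method: HttpMethod::Post,
        return_queue: "hog".to_string(),
        headers: None,
        body: Some("{\"hello\":\"world\"}".to_string()),
        max_tries: None, // defaults to 3
        on_finish: None, // defaults to Return
    };
    Ok(cyclotron_core::base_ops::JobInit {
        team_id: 1,
        queue_name: "fetch".to_string(),
        priority: 1,
        scheduled: Utc::now(),
        function_id: None,
        vm_state: None,
        parameters: Some(serde_json::to_string(&params)?),
        metadata: None,
    })
}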
// Internal bookkeeping for a fetch job
#[derive(Debug, Serialize, Deserialize, Clone)]
#[serde(rename_all = "lowercase")]
pub struct FetchMetadata {
tries: u32,
// The history of failures seen with this job
trace: Vec<FetchFailure>,
}
// This is what we put in the parameters of the job queue for the next
// worker to pick up
#[derive(Debug, Serialize, Deserialize, Clone)]
#[serde(tag = "status", rename_all = "lowercase")]
pub enum FetchResult {
Success { response: FetchResponse },
Failure { trace: Vec<FetchFailure> }, // If we failed entirely to fetch the job, we return the trace for user debugging
}
impl FetchResult {
pub fn is_success(&self) -> bool {
matches!(self, FetchResult::Success { .. })
}
}
// We distinguish between a "fetch failure" and a "worker failure" -
// worker failures are internal-only, and do not count against the
// retries of a job (generally, on worker failure, the job is either
// moved to the dead letter queue, or dropped and left to the janitor to
// reset). Fetch failures are, after retries, returned to the queue, and
// represent the result of the fetch operation.
#[derive(Debug, Serialize, Deserialize, Clone)]
#[serde(rename_all = "lowercase")]
pub struct FetchFailure {
pub kind: FetchFailureKind,
pub message: String,
pub body: Option<String>, // If we have a body, we include it in the failure
pub headers: Option<HashMap<String, String>>, // If we have headers, we include them in the failure
pub status: Option<u16>, // If we have a status, we include it in the failure
pub timestamp: DateTime<Utc>, // Useful for users to correlate logs when debugging
}
impl FetchFailure {
pub fn new(kind: FetchFailureKind, message: impl AsRef<str>) -> Self {
Self {
kind,
message: message.as_ref().to_string(),
timestamp: Utc::now(),
body: None,
headers: None,
status: None,
}
}
pub fn failure_status(status: StatusCode) -> Self {
Self {
kind: FetchFailureKind::FailureStatus,
message: format!("Received failure status: {}", status),
timestamp: Utc::now(),
body: None,
headers: None,
status: Some(status.as_u16()),
}
}
pub fn with_body(self, body: String) -> Self {
Self {
body: Some(body),
..self
}
}
pub fn with_headers(self, headers: HashMap<String, String>) -> Self {
Self {
headers: Some(headers),
..self
}
}
pub fn with_status(self, status: u16) -> Self {
Self {
status: Some(status),
..self
}
}
}
impl From<reqwest::Error> for FetchFailure {
fn from(e: reqwest::Error) -> Self {
let kind = if e.is_timeout() {
FetchFailureKind::Timeout
} else {
FetchFailureKind::RequestError
};
Self {
kind,
message: e.to_string(),
timestamp: Utc::now(),
body: None,
headers: None,
status: None,
}
}
}
#[derive(Debug, Serialize, Deserialize, Clone, Copy)]
#[serde(rename_all = "lowercase")]
pub enum FetchFailureKind {
Timeout,
TimeoutGettingBody,
MissingParameters,
InvalidParameters,
RequestError,
FailureStatus,
InvalidBody, // Generally means the body could not be parsed to a utf8 string
ResponseTooLarge,
}
#[derive(Debug, Serialize, Deserialize, Clone)]
#[serde(rename_all = "lowercase")]
pub struct FetchResponse {
pub status: u16,
pub headers: HashMap<String, String>,
pub body: String,
}
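// A minimal sketch of the JSON a return-queue consumer should expect in the job's parameters,
// given the serde attributes on FetchResult above (tag = "status", lowercase names). The values
// are illustrative.
fn example_result_json() -> Result<String, serde_json::Error> {
    let result = FetchResult::Success {
        response: FetchResponse {
            status: 200,
            headers: HashMap::new(),
            body: "ok".to_string(),
        },
    };
    // Produces: {"status":"success","response":{"status":200,"headers":{},"body":"ok"}}
    serde_json::to_string(&result)
}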
pub fn report_worker_saturation(context: &AppContext) {
metrics::gauge!("fetch_worker_available_permits")
.set(context.concurrency_limit.available_permits() as f64);
}
pub async fn tick(context: Arc<AppContext>) -> Result<usize, FetchError> {
report_worker_saturation(&context);
let max_jobs = min(
context.concurrency_limit.available_permits(),
context.config.batch_size,
);
let jobs = context
.worker
.dequeue_jobs(&context.config.queue_served, max_jobs)
.await?;
let num_jobs = jobs.len();
for job in jobs {
let context = context.clone();
// We grab job permits individually, so that as soon as a job is finished, the
// permit to run another job is immediately available. This call should
// never block, since we only ever dequeue as many jobs as we have permits
// available.
let permit = context
.concurrency_limit
.clone()
.acquire_owned()
.await
.unwrap();
tokio::spawn(async move {
// TODO - since worker errors are never an indication of a fetch failure,
// only of some internal worker issue, we should report unhealthy or fall
// over or something here.
if let Err(e) = run_job(context.clone(), job, permit).await {
error!("Error running job: {:?}", e);
}
});
}
Ok(num_jobs)
}
// Mostly a thin wrapper to make ser/de a bit easier
struct FetchJob<'a> {
_job: &'a Job,
metadata: FetchMetadata,
parameters: FetchParameters,
}
impl<'a> TryFrom<&'a Job> for FetchJob<'a> {
type Error = FetchFailure;
fn try_from(job: &'a Job) -> Result<Self, Self::Error> {
let Some(parameters) = &job.parameters else {
return Err(FetchFailure::new(
FetchFailureKind::MissingParameters,
"Job is missing parameters",
));
};
let parameters: FetchParameters = match serde_json::from_str(parameters) {
Ok(p) => p,
Err(e) => {
return Err(FetchFailure::new(
FetchFailureKind::InvalidParameters,
format!("Failed to parse parameters: {}", e),
))
}
};
let metadata = match &job.metadata {
Some(m) => match serde_json::from_str(m) {
Ok(m) => m,
Err(_) => {
// If we can't decode the metadata, assume this is the first time we've seen the job
// TODO - this is maybe too lenient, I'm not sure.
FetchMetadata {
tries: 0,
trace: vec![],
}
}
},
None => FetchMetadata {
tries: 0,
trace: vec![],
},
};
Ok(Self {
_job: job,
metadata,
parameters,
})
}
}
pub async fn run_job(
context: Arc<AppContext>,
job: Job,
_permit: OwnedSemaphorePermit,
) -> Result<(), FetchError> {
let parsed: FetchJob = match (&job).try_into() {
Ok(p) => p,
Err(e) => return dead_letter_job(&context.worker, job, vec![e]).await,
};
let method: http::Method = (&parsed.parameters.method).into();
// Parsing errors are always dead letters - it /will/ fail every time, so dump it
// TODO - We should probably decide whether to dead letter or return Failed on the basis of OnFinish,
// in case the caller wants to do any cleanup on broken jobs
let url: reqwest::Url = match (parsed.parameters.url).parse() {
Ok(u) => u,
Err(e) => {
return dead_letter_job(
&context.worker,
job,
vec![FetchFailure::new(
FetchFailureKind::InvalidParameters,
format!("Invalid url: {}", e),
)],
)
.await;
}
};
let headers: reqwest::header::HeaderMap =
match (&parsed.parameters.headers.unwrap_or_default()).try_into() {
Ok(h) => h,
Err(e) => {
return dead_letter_job(
&context.worker,
job,
vec![FetchFailure::new(
FetchFailureKind::InvalidParameters,
format!("Invalid headers: {}", e),
)],
)
.await;
}
};
let body = reqwest::Body::from(parsed.parameters.body.unwrap_or_default());
let send_fut = context
.client
.request(method, url)
.headers(headers)
.body(body)
.send();
let mut send_fut = Box::pin(send_fut);
let start = Utc::now();
let res = loop {
tokio::select! {
res = &mut send_fut => {
break res
}
_ = tokio::time::sleep(Duration::milliseconds(HEARTBEAT_INTERVAL_MS).to_std().unwrap()) => {
context.worker.heartbeat(job.id).await?;
}
}
};
// If we took, say, 25% of the heartbeat interval to send the request, we may as well heartbeat now
if Utc::now() - start > Duration::milliseconds(HEARTBEAT_INTERVAL_MS / 4) {
context.worker.heartbeat(job.id).await?;
}
let res = match res {
Ok(r) => r,
Err(e) => {
return handle_fetch_failure(
&context,
&job,
&parsed.metadata,
parsed.parameters.max_tries.unwrap_or(DEFAULT_RETRIES),
parsed.parameters.return_queue,
parsed.parameters.on_finish.unwrap_or(DEFAULT_ON_FINISH),
e,
)
.await
}
};
// Grab the response metadata, since getting the body moves it
let status = res.status();
let headers: HashMap<String, String> = res
.headers()
.iter()
.map(|(k, v)| {
(
k.as_str().to_string(),
v.to_str().unwrap_or_default().to_string(),
)
})
.collect();
// We pre-emptively get the response body, because we include it in the failure trace, even if we got a failure status
let body = first_n_bytes_of_response(
&context.worker,
&job,
res,
context.config.max_response_bytes,
)
.await?;
let body = match body {
Ok(b) => b,
Err(e) => {
// Tag the status and headers onto the failure
let e = e.with_status(status.as_u16()).with_headers(headers);
return handle_fetch_failure(
&context,
&job,
&parsed.metadata,
parsed.parameters.max_tries.unwrap_or(DEFAULT_RETRIES),
parsed.parameters.return_queue,
parsed.parameters.on_finish.unwrap_or(DEFAULT_ON_FINISH),
e,
)
.await;
}
};
// TODO - we should handle "retryable" and "permanent" failures differently, mostly
// to be polite - retrying a permanent failure isn't a correctness problem, but it's
// rude (and inefficient)
if !status.is_success() {
let failure = FetchFailure::failure_status(status)
.with_body(body)
.with_headers(headers);
return handle_fetch_failure(
&context,
&job,
&parsed.metadata,
parsed.parameters.max_tries.unwrap_or(DEFAULT_RETRIES),
parsed.parameters.return_queue,
parsed.parameters.on_finish.unwrap_or(DEFAULT_ON_FINISH),
failure,
)
.await;
}
let result = FetchResult::Success {
response: FetchResponse {
status: status.as_u16(),
headers,
body,
},
};
complete_job(
&context.worker,
&job,
parsed.parameters.return_queue,
parsed.parameters.on_finish.unwrap_or(DEFAULT_ON_FINISH),
result,
)
.await
}
// Checks if the retry limit has been reached, and does one of:
// - Schedule the job for retry, doing metadata bookkeeping
// - Complete the job, with the failure trace
#[allow(clippy::too_many_arguments)]
pub async fn handle_fetch_failure<F>(
context: &AppContext,
job: &Job,
metadata: &FetchMetadata,
max_tries: u32,
return_queue: String,
on_finish: OnFinish,
failure: F,
) -> Result<(), FetchError>
where
F: Into<FetchFailure>,
{
let failure = failure.into();
let mut metadata = metadata.clone();
metadata.tries += 1;
metadata.trace.push(failure);
// TODO - right now we treat all failures as retryable, but we should probably be more aggressive in
// culling retries for permanent failures (this is less of a correctness issue and more of an efficiency/
// politeness one). We might also want to make backoff configurable.
if metadata.tries < min(max_tries, context.config.max_retry_attempts) {
let next_available =
Utc::now() + (context.config.retry_backoff_base * (metadata.tries as i32));
// We back off for at most an hour (since callers can configure max retries to be very high)
let next_available = min(next_available, Utc::now() + Duration::hours(1));
// Add some seconds of jitter
let next_available =
next_available + Duration::seconds((rand::random::<u64>() % 30) as i64);
// Set us up for a retry - update metadata, reschedule, and put back in the queue we pulled from
context
.worker
.set_metadata(job.id, Some(serde_json::to_string(&metadata)?))?;
context.worker.set_state(job.id, JobState::Available)?;
context.worker.set_queue(job.id, &job.queue_name)?;
context.worker.set_scheduled_at(job.id, next_available)?;
// We downgrade the priority of jobs that fail, so first attempts at jobs get better QoS
context.worker.set_priority(job.id, job.priority + 1)?;
context.worker.flush_job(job.id).await?;
} else {
// Complete the job, with a Failed result
let result = FetchResult::Failure {
trace: metadata.trace.clone(),
};
complete_job(&context.worker, job, return_queue, on_finish, result).await?;
}
Ok(())
}
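// Worked example (illustrative numbers only): with a retry_backoff_base of 1 second, the
// first failure reschedules the job for roughly now + 1s (plus up to 30s of jitter) at
// priority + 1, the second for roughly now + 2s (plus jitter) at a further +1 priority, and
// so on, capped at an hour out, until min(max_tries, max_retry_attempts) failures have
// accumulated - at which point the job is completed with a FetchResult::Failure trace.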
// Complete the job, either because we got a good response, or because the job's retries
// have been exceeded.
pub async fn complete_job(
worker: &Worker,
job: &Job,
return_queue: String,
on_finish: OnFinish,
result: FetchResult,
) -> Result<(), FetchError> {
// Pre-emptively point the job at the DLQ - if serialising the result below fails, the flush sends it there and we bail; on success we overwrite the queue with return_queue before flushing
worker.set_state(job.id, JobState::Available)?;
worker.set_queue(job.id, DEAD_LETTER_QUEUE)?;
let is_success = result.is_success();
let result = match serde_json::to_string(&result) {
Ok(r) => r,
Err(e) => {
// Leave behind a hint for debugging
worker.set_metadata(job.id, Some(format!("Failed to serialise result: {}", e)))?;
worker.flush_job(job.id).await?;
return Err(FetchError::SerdeError(e));
}
};
worker.set_queue(job.id, &return_queue)?;
match (is_success, on_finish) {
(true, _) | (false, OnFinish::Return) => {
worker.set_state(job.id, JobState::Available)?;
}
(false, OnFinish::Complete) => {
worker.set_state(job.id, JobState::Failed)?;
}
}
worker.set_parameters(job.id, Some(result))?;
worker.set_metadata(job.id, None)?; // We're finished with the job, so clear our internal state
worker.flush_job(job.id).await?;
Ok(())
}
// This moves the job to a dead letter queue, and sets the state to Available (to prevent it
// from being deleted by the janitor). This is for debugging purposes - generally only jobs
// that fail to parse on dequeue end up here, since such failures indicate a programming error
// in the caller or the worker.
pub async fn dead_letter_job(
worker: &Worker,
job: Job,
errors: Vec<FetchFailure>,
) -> Result<(), FetchError> {
worker.set_state(job.id, JobState::Available)?;
worker.set_queue(job.id, DEAD_LETTER_QUEUE)?;
let result = FetchResult::Failure { trace: errors };
let result = match serde_json::to_string(&result) {
Ok(r) => r,
Err(e) => {
worker.set_metadata(
job.id,
Some(format!(
"Failed to serialise result during DLQ write: {}",
e
)),
)?;
worker.flush_job(job.id).await?;
return Err(FetchError::SerdeError(e));
}
};
worker.set_parameters(job.id, Some(result))?;
worker.flush_job(job.id).await?;
Ok(())
}
// Pulls the body, while maintaining the job heartbeat.
pub async fn first_n_bytes_of_response(
worker: &Worker,
job: &Job,
response: Response,
n: usize,
) -> Result<Result<String, FetchFailure>, FetchError> {
let mut body = response.bytes_stream();
// We buffer the response into a Vec<u8>, and then parse it to a string
let mut buffer = Vec::with_capacity(n);
worker.heartbeat(job.id).await?;
loop {
tokio::select! {
chunk = body.next() => {
let chunk = match chunk {
Some(Ok(c)) => c,
Some(Err(e)) => return Ok(Err(FetchFailure::from(e))),
None => break,
};
buffer.extend_from_slice(&chunk);
if buffer.len() >= n {
return Ok(Err(
FetchFailure::new(FetchFailureKind::ResponseTooLarge, "Response too large")
));
};
}
_ = tokio::time::sleep(Duration::milliseconds(HEARTBEAT_INTERVAL_MS).to_std().unwrap()) => {}
}
// Heartbeat every time we get a new body chunk, or every HEARTBEAT_INTERVAL_MS
worker.heartbeat(job.id).await?;
}
let Ok(body) = String::from_utf8(buffer) else {
return Ok(Err(FetchFailure::new(
FetchFailureKind::InvalidBody,
"Body could not be parsed as utf8",
)));
};
Ok(Ok(body))
}
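// For reference, a fetch job's `parameters` field is a JSON-serialised FetchParameters.
// A minimal payload (the shape exercised by the integration tests) looks roughly like
//   {"url": "https://example.com/hook", "method": "GET", "return_queue": "return"}
// with optional `headers`, `body`, `max_tries` and `on_finish` keys controlling the request
// and what happens to the job once it succeeds or exhausts its retries. The example URL is
// illustrative only.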

View File

@ -0,0 +1,3 @@
pub mod config;
pub mod context;
pub mod fetch;

View File

@ -0,0 +1,98 @@
use axum::{extract::State, routing::get, Router};
use common_metrics::setup_metrics_routes;
use cyclotron_fetch::{
config::Config,
context::AppContext,
fetch::{tick, FetchError},
};
use envconfig::Envconfig;
use health::HealthRegistry;
use std::{future::ready, sync::Arc};
use tracing::{error, info};
async fn listen(app: Router, bind: String) -> Result<(), std::io::Error> {
let listener = tokio::net::TcpListener::bind(bind).await?;
axum::serve(listener, app).await?;
Ok(())
}
// For axum's state extraction
#[derive(Clone)]
struct WorkerId(pub String);
pub fn app(liveness: HealthRegistry, worker_id: String) -> Router {
Router::new()
.route("/", get(index))
.route("/_readiness", get(index))
.route("/_liveness", get(move || ready(liveness.get_status())))
.with_state(WorkerId(worker_id))
}
async fn index(State(worker_id): State<WorkerId>) -> String {
format!("cyclotron janitor {}", worker_id.0)
}
async fn worker_loop(context: AppContext) -> Result<(), FetchError> {
let context = Arc::new(context);
loop {
context.liveness.report_healthy().await;
let started = tick(context.clone()).await?;
info!("started {} jobs", started);
// This will happen if 1) there are no jobs or 2) we have no capacity to start new jobs. Either way, we should sleep for a bit
if started == 0 {
tokio::time::sleep(context.config.job_poll_interval.to_std().unwrap()).await;
}
}
}
#[tokio::main]
async fn main() {
let config = Config::init_from_env().expect("failed to load configuration from env");
tracing_subscriber::fmt::init();
let liveness = HealthRegistry::new("liveness");
let (app_config, pool_config) = config.to_components();
let bind = format!("{}:{}", app_config.host, app_config.port);
info!(
"Fetch worker starting with ID {:?}, listening at {}",
app_config.worker_id, bind
);
let worker_liveness = liveness
.register(
"worker".to_string(),
(app_config.job_poll_interval * 4).to_std().unwrap(),
)
.await;
let app = setup_metrics_routes(app(liveness, app_config.worker_id.clone()));
let context = AppContext::create(app_config, pool_config, worker_liveness)
.await
.expect("failed to create app context");
let http_server = tokio::spawn(listen(app, bind));
let worker_loop = tokio::spawn(worker_loop(context));
tokio::select! {
res = worker_loop => {
error!("janitor loop exited");
if let Err(e) = res {
error!("janitor failed with: {}", e)
}
}
res = http_server => {
error!("http server exited");
if let Err(e) = res {
error!("server failed with: {}", e)
}
}
}
info!("exiting");
}

View File

@ -0,0 +1,293 @@
use std::{collections::HashMap, str::FromStr, sync::Arc};
use chrono::Duration;
use cyclotron_core::{manager::QueueManager, worker::Worker};
use cyclotron_fetch::fetch::{tick, FetchResult, HttpMethod};
use httpmock::{Method, MockServer};
use serde_json::json;
use sqlx::PgPool;
use utils::{
construct_job, construct_params, get_app_test_context, make_immediately_available,
wait_on_no_running, wait_on_return,
};
mod utils;
#[sqlx::test(migrations = "../cyclotron-core/migrations")]
pub async fn test_completes_fetch(db: PgPool) {
let context = Arc::new(get_app_test_context(db.clone()).await);
let producer = QueueManager::from_pool(db.clone());
let return_worker = Worker::from_pool(db.clone());
let server = MockServer::start();
let mock = server.mock(|when, then| {
when.method(Method::GET).path("/test");
then.status(200).body("Hello, world!");
});
let params = construct_params(server.url("/test"), HttpMethod::Get);
let job = construct_job(params);
producer.create_job(job).await.unwrap();
let started = tick(context).await.unwrap();
assert_eq!(started, 1);
let returned = wait_on_return(&return_worker, 1, false).await.unwrap();
let response: FetchResult =
serde_json::from_str(returned[0].parameters.as_ref().unwrap()).unwrap();
let FetchResult::Success { response } = response else {
panic!("Expected success response");
};
assert_eq!(response.status, 200);
assert_eq!(response.body, "Hello, world!");
mock.assert_hits(1);
}
#[sqlx::test(migrations = "../cyclotron-core/migrations")]
pub async fn test_returns_failure_after_retries(db: PgPool) {
let context = Arc::new(get_app_test_context(db.clone()).await);
let producer = QueueManager::from_pool(db.clone());
let return_worker = Worker::from_pool(db.clone());
let server = MockServer::start();
let mock = server.mock(|when, then| {
when.method(Method::GET).path("/test");
then.status(500).body("test server error body");
});
let mut params = construct_params(server.url("/test"), HttpMethod::Get);
params.max_tries = Some(2);
let job = construct_job(params);
producer.create_job(job).await.unwrap();
// Tick twice for retry
let started = tick(context.clone()).await.unwrap();
assert_eq!(started, 1);
wait_on_no_running(&db, Duration::milliseconds(100)).await;
make_immediately_available(&db).await;
let started = tick(context.clone()).await.unwrap();
assert_eq!(started, 1);
wait_on_no_running(&db, Duration::milliseconds(100)).await;
let returned = wait_on_return(&return_worker, 1, false).await.unwrap();
let response: FetchResult =
serde_json::from_str(returned[0].parameters.as_ref().unwrap()).unwrap();
let FetchResult::Failure { trace } = response else {
panic!("Expected failure response");
};
assert!(trace.len() == 2);
for attempt in trace {
assert_eq!(attempt.status, Some(500));
assert_eq!(attempt.body, Some("test server error body".to_string()));
}
mock.assert_hits(2);
}
#[sqlx::test(migrations = "../cyclotron-core/migrations")]
pub async fn fetch_discards_bad_metadata(db: PgPool) {
let context = Arc::new(get_app_test_context(db.clone()).await);
let producer = QueueManager::from_pool(db.clone());
let return_worker = Worker::from_pool(db.clone());
let server = MockServer::start();
let mock = server.mock(|when, then| {
when.method(Method::GET).path("/test");
then.status(200).body("Hello, world!");
});
let params = construct_params(server.url("/test"), HttpMethod::Get);
let mut job = construct_job(params);
job.metadata = Some("bad json".to_string());
producer.create_job(job).await.unwrap();
let started = tick(context).await.unwrap();
assert_eq!(started, 1);
let returned = wait_on_return(&return_worker, 1, false).await.unwrap();
let response: FetchResult =
serde_json::from_str(returned[0].parameters.as_ref().unwrap()).unwrap();
let FetchResult::Success { response } = response else {
panic!("Expected success response");
};
assert_eq!(response.status, 200);
assert_eq!(response.body, "Hello, world!");
mock.assert_hits(1);
}
#[sqlx::test(migrations = "../cyclotron-core/migrations")]
pub async fn fetch_with_minimum_params_works(db: PgPool) {
let context = Arc::new(get_app_test_context(db.clone()).await);
let producer = QueueManager::from_pool(db.clone());
let return_worker = Worker::from_pool(db.clone());
let server = MockServer::start();
let mock = server.mock(|when, then| {
when.method(Method::GET).path("/test");
then.status(200).body("Hello, world!");
});
let params = construct_params(server.url("/test"), HttpMethod::Get);
let mut job = construct_job(params);
let url = server.url("/test");
let manual_params = json!({
"url": url,
"method": "GET",
"return_queue": "return",
})
.to_string();
job.parameters = Some(manual_params);
producer.create_job(job).await.unwrap();
let started = tick(context).await.unwrap();
assert_eq!(started, 1);
let returned = wait_on_return(&return_worker, 1, false).await.unwrap();
let response: FetchResult =
serde_json::from_str(returned[0].parameters.as_ref().unwrap()).unwrap();
let FetchResult::Success { response } = response else {
panic!("Expected success response");
};
assert_eq!(response.status, 200);
assert_eq!(response.body, "Hello, world!");
mock.assert_hits(1);
}
#[sqlx::test(migrations = "../cyclotron-core/migrations")]
pub async fn test_completes_fetch_with_headers(db: PgPool) {
let context = Arc::new(get_app_test_context(db.clone()).await);
let producer = QueueManager::from_pool(db.clone());
let return_worker = Worker::from_pool(db.clone());
let server = MockServer::start();
let mock = server.mock(|when, then| {
when.method(Method::GET)
.path("/test")
.header("X-Test", "test");
then.status(200).body("Hello, world!");
});
let mut params = construct_params(server.url("/test"), HttpMethod::Get);
let mut headers = HashMap::new();
headers.insert("X-Test".to_string(), "test".to_string());
params.headers = Some(headers);
let job = construct_job(params);
producer.create_job(job).await.unwrap();
let started = tick(context).await.unwrap();
assert_eq!(started, 1);
let returned = wait_on_return(&return_worker, 1, false).await.unwrap();
let response: FetchResult =
serde_json::from_str(returned[0].parameters.as_ref().unwrap()).unwrap();
let FetchResult::Success { response } = response else {
panic!("Expected success response");
};
assert_eq!(response.status, 200);
assert_eq!(response.body, "Hello, world!");
mock.assert_hits(1);
}
#[sqlx::test(migrations = "../cyclotron-core/migrations")]
pub async fn test_completes_fetch_with_body(db: PgPool) {
let context = Arc::new(get_app_test_context(db.clone()).await);
let producer = QueueManager::from_pool(db.clone());
let return_worker = Worker::from_pool(db.clone());
let server = MockServer::start();
let mock = server.mock(|when, then| {
when.method(Method::POST).path("/test").body("test body");
then.status(200).body("Hello, world!");
});
let mut params = construct_params(server.url("/test"), HttpMethod::Post);
params.body = Some("test body".to_string());
let job = construct_job(params);
producer.create_job(job).await.unwrap();
let started = tick(context).await.unwrap();
assert_eq!(started, 1);
let returned = wait_on_return(&return_worker, 1, false).await.unwrap();
let response: FetchResult =
serde_json::from_str(returned[0].parameters.as_ref().unwrap()).unwrap();
let FetchResult::Success { response } = response else {
panic!("Expected success response");
};
assert_eq!(response.status, 200);
assert_eq!(response.body, "Hello, world!");
mock.assert_hits(1);
}
#[sqlx::test(migrations = "../cyclotron-core/migrations")]
pub async fn test_completes_fetch_with_vm_state(db: PgPool) {
let context = Arc::new(get_app_test_context(db.clone()).await);
let producer = QueueManager::from_pool(db.clone());
let return_worker = Worker::from_pool(db.clone());
let server = MockServer::start();
let mock = server.mock(|when, then| {
when.method(Method::GET).path("/test");
then.status(200).body("Hello, world!");
});
let params = construct_params(server.url("/test"), HttpMethod::Get);
let mut job = construct_job(params);
job.vm_state = Some(json!({"test": "state"}).to_string());
producer.create_job(job).await.unwrap();
let started = tick(context).await.unwrap();
assert_eq!(started, 1);
let returned = wait_on_return(&return_worker, 1, true).await.unwrap();
let state = serde_json::Value::from_str(returned[0].vm_state.as_ref().unwrap()).unwrap();
assert_eq!(state, json!({"test": "state"}));
let response: FetchResult =
serde_json::from_str(returned[0].parameters.as_ref().unwrap()).unwrap();
let FetchResult::Success { response } = response else {
panic!("Expected success response");
};
assert_eq!(response.status, 200);
assert_eq!(response.body, "Hello, world!");
mock.assert_hits(1);
}

View File

@ -0,0 +1,127 @@
use std::sync::Arc;
use chrono::{Duration, Utc};
use cyclotron_core::{
base_ops::{Job, JobInit},
error::QueueError,
worker::Worker,
};
use cyclotron_fetch::{
config::AppConfig,
context::AppContext,
fetch::{FetchParameters, HttpMethod},
};
use sqlx::PgPool;
use tokio::sync::Semaphore;
const FETCH_QUEUE: &str = "fetch";
const RETURN_QUEUE: &str = "return";
pub async fn get_app_test_context(db: PgPool) -> AppContext {
let worker = Worker::from_pool(db.clone());
let client = reqwest::Client::new();
let concurrency_limit = Arc::new(Semaphore::new(1));
let health = health::HealthRegistry::new("test");
let liveness = health
.register("test".to_string(), Duration::seconds(30).to_std().unwrap())
.await;
let config = AppConfig {
fetch_timeout: Duration::seconds(10),
concurrent_requests_limit: 1,
host: "localhost".to_string(),
port: 16,
worker_id: "test".to_string(),
job_poll_interval: Duration::seconds(10),
max_retry_attempts: 3,
queue_served: FETCH_QUEUE.to_string(),
batch_size: 1000,
max_response_bytes: 1024 * 1024,
retry_backoff_base: Duration::milliseconds(1000),
allow_internal_ips: true,
};
AppContext {
worker,
client,
concurrency_limit,
liveness,
config,
}
}
pub fn construct_params(url: String, method: HttpMethod) -> FetchParameters {
FetchParameters {
url,
method,
return_queue: RETURN_QUEUE.to_string(),
headers: None,
body: None,
max_tries: None,
on_finish: None,
}
}
pub fn construct_job(parameters: FetchParameters) -> JobInit {
JobInit {
team_id: 1,
queue_name: FETCH_QUEUE.to_string(),
priority: 0,
scheduled: Utc::now() - Duration::seconds(1),
function_id: None,
vm_state: None,
parameters: Some(serde_json::to_string(&parameters).unwrap()),
metadata: None,
}
}
pub async fn wait_on_return(
worker: &Worker,
count: usize,
with_vm: bool,
) -> Result<Vec<Job>, QueueError> {
let timeout = Duration::seconds(1);
let start = Utc::now();
let mut returned = vec![];
while start + timeout > Utc::now() {
let mut jobs = if with_vm {
worker.dequeue_with_vm_state(RETURN_QUEUE, 1).await?
} else {
worker.dequeue_jobs(RETURN_QUEUE, 1).await?
};
returned.append(&mut jobs);
if returned.len() == count {
return Ok(returned);
}
if returned.len() > count {
panic!("Too many jobs returned");
}
}
panic!("Timeout waiting for jobs to return");
}
pub async fn wait_on_no_running(pool: &PgPool, max_time: Duration) {
let start = Utc::now();
loop {
let running: i64 =
sqlx::query_scalar("SELECT COUNT(*) FROM cyclotron_jobs WHERE state = 'running'")
.fetch_one(pool)
.await
.unwrap();
if running == 0 {
return;
}
if Utc::now() - start > max_time {
panic!("Timeout waiting for jobs to finish");
}
}
}
pub async fn make_immediately_available(pool: &PgPool) {
sqlx::query(
"UPDATE cyclotron_jobs SET scheduled = NOW() - INTERVAL '1 second' WHERE state = 'available'",
)
.execute(pool)
.await
.unwrap();
}

View File

@ -0,0 +1,22 @@
[package]
name = "cyclotron-janitor"
version = "0.1.0"
edition = "2021"
[lints]
workspace = true
[dependencies]
tracing-subscriber = { workspace = true }
sqlx = { workspace = true }
chrono = { workspace = true }
tokio = { workspace = true }
tracing = { workspace = true }
uuid = { workspace = true }
envconfig = { workspace = true }
axum = { workspace = true }
eyre = { workspace = true }
metrics = { workspace = true }
cyclotron-core = { path = "../cyclotron-core" }
common-metrics = { path = "../common/metrics" }
health = { path = "../common/health" }

View File

@ -0,0 +1,23 @@
#!/bin/bash
set -e
# I set all possible env vars here, tune them as you like
export RUST_LOG="INFO"
export HOST="::"
export PORT="3302"
export DATABASE_URL="postgres://posthog:posthog@localhost:5432/cyclotron"
export CLEANUP_INTERVAL_SECONDS="10"
export PG_MAX_CONNECTIONS="10"
export PG_MIN_CONNECTIONS="1"
export PG_ACQUIRE_TIMEOUT_SECONDS="5"
export PG_MAX_LIFETIME_SECONDS="300"
export PG_IDLE_TIMEOUT_SECONDS="60"
export JANITOR_ID="test-janitor"
export JANITOR_MAX_TOUCHES="2"
export JANITOR_STALL_TIMEOUT_SECONDS="30"
# Comment this out if you don't want the database to be reset every time you start the janitor
sqlx database reset -y --source ../cyclotron-core/migrations
sqlx migrate run --source ../cyclotron-core/migrations
cargo run --release

View File

@ -0,0 +1,83 @@
use chrono::Duration;
use cyclotron_core::PoolConfig;
use envconfig::Envconfig;
use uuid::Uuid;
#[derive(Envconfig)]
pub struct Config {
#[envconfig(from = "BIND_HOST", default = "::")]
pub host: String,
#[envconfig(from = "BIND_PORT", default = "3303")]
pub port: u16,
#[envconfig(default = "postgres://posthog:posthog@localhost:5432/cyclotron")]
pub database_url: String,
#[envconfig(default = "30")]
pub cleanup_interval_secs: u64,
#[envconfig(default = "10")]
pub pg_max_connections: u32,
#[envconfig(default = "1")]
pub pg_min_connections: u32,
#[envconfig(default = "30")]
pub pg_acquire_timeout_seconds: u64,
#[envconfig(default = "300")]
pub pg_max_lifetime_seconds: u64,
#[envconfig(default = "60")]
pub pg_idle_timeout_seconds: u64,
// Generally, this should be equivalent to a "shard id", as only one janitor should be running
// per shard
pub janitor_id: Option<String>,
#[envconfig(default = "10")]
pub janitor_max_touches: i16,
#[envconfig(default = "60")]
pub janitor_stall_timeout_seconds: u16,
}
impl Config {
pub fn get_janitor_config(&self) -> JanitorConfig {
let pool_config = PoolConfig {
db_url: self.database_url.clone(),
max_connections: Some(self.pg_max_connections),
min_connections: Some(self.pg_min_connections),
acquire_timeout_seconds: Some(self.pg_acquire_timeout_seconds),
max_lifetime_seconds: Some(self.pg_max_lifetime_seconds),
idle_timeout_seconds: Some(self.pg_idle_timeout_seconds),
};
let settings = JanitorSettings {
stall_timeout: Duration::seconds(self.janitor_stall_timeout_seconds as i64),
max_touches: self.janitor_max_touches,
id: self
.janitor_id
.clone()
.unwrap_or_else(|| Uuid::now_v7().to_string()),
};
JanitorConfig {
pool: pool_config,
settings,
}
}
}
pub struct JanitorConfig {
pub pool: PoolConfig,
pub settings: JanitorSettings,
}
pub struct JanitorSettings {
pub stall_timeout: Duration,
pub max_touches: i16,
pub id: String,
}

View File

@ -0,0 +1,136 @@
use chrono::Utc;
use cyclotron_core::{
error::QueueError,
janitor_ops::{
delete_completed_jobs, delete_failed_jobs, delete_poison_pills, reset_stalled_jobs,
},
};
use sqlx::PgPool;
use tracing::{info, warn};
use crate::config::{JanitorConfig, JanitorSettings};
// The janitor reports its own metrics - this struct is mostly for testing purposes
#[derive(Debug, Clone, Eq, PartialEq)]
pub struct CleanupResult {
pub completed: u64,
pub failed: u64,
pub poisoned: u64,
pub stalled: u64,
}
pub struct Janitor {
pool: PgPool,
settings: JanitorSettings,
metrics_labels: Vec<(&'static str, String)>,
}
impl Janitor {
pub async fn new(config: JanitorConfig) -> Result<Self, QueueError> {
let settings = config.settings;
let pool = config.pool.connect().await?;
let metrics_labels = vec![("janitor_id", settings.id.clone())];
Ok(Self {
pool,
settings,
metrics_labels,
})
}
pub fn from_pool(pool: PgPool, settings: JanitorSettings) -> Self {
let metrics_labels = vec![("janitor_id", settings.id.clone())];
Self {
pool,
settings,
metrics_labels,
}
}
// TODO - right now, the metrics produced here are pretty rough - just per shard, without
// any per-queue or per-worker-type breakdown. It'd be nice to add that, eventually.
pub async fn run_once(&self) -> Result<CleanupResult, QueueError> {
info!("Running janitor loop");
let start = Utc::now();
metrics::counter!("cyclotron_janitor_run_starts", &self.metrics_labels).increment(1);
let before = Utc::now();
let completed = delete_completed_jobs(&self.pool).await?;
let taken = Utc::now() - before;
metrics::histogram!(
"cyclotron_janitor_completed_jobs_cleanup_duration_ms",
&self.metrics_labels
)
.record(taken.num_milliseconds() as f64);
metrics::counter!(
"cyclotron_janitor_completed_jobs_deleted",
&self.metrics_labels
)
.increment(completed);
let before = Utc::now();
let failed = delete_failed_jobs(&self.pool).await?;
let taken = Utc::now() - before;
metrics::histogram!(
"cyclotron_janitor_failed_jobs_cleanup_duration_ms",
&self.metrics_labels
)
.record(taken.num_milliseconds() as f64);
metrics::counter!(
"cyclotron_janitor_failed_jobs_deleted",
&self.metrics_labels
)
.increment(failed);
// Note - if we reset stalled jobs before deleting poison pills, we'll never delete poison
// pills, since resetting a stalled job clears the locked state.
let before = Utc::now();
let poisoned = delete_poison_pills(
&self.pool,
self.settings.stall_timeout,
self.settings.max_touches,
)
.await?;
let taken = Utc::now() - before;
metrics::histogram!(
"cyclotron_janitor_poison_pills_cleanup_duration_ms",
&self.metrics_labels
)
.record(taken.num_milliseconds() as f64);
metrics::counter!(
"cyclotron_janitor_poison_pills_deleted",
&self.metrics_labels
)
.increment(poisoned);
if poisoned > 0 {
warn!("Deleted {} poison pills", poisoned);
}
let before = Utc::now();
let stalled = reset_stalled_jobs(&self.pool, self.settings.stall_timeout).await?;
let taken = Utc::now() - before;
metrics::histogram!(
"cyclotron_janitor_stalled_jobs_reset_duration_ms",
&self.metrics_labels
)
.record(taken.num_milliseconds() as f64);
metrics::counter!("cyclotron_janitor_stalled_jobs_reset", &self.metrics_labels)
.increment(stalled);
if stalled > 0 {
warn!("Reset {} stalled jobs", stalled);
}
metrics::counter!("cyclotron_janitor_run_ends", &self.metrics_labels).increment(1);
let elapsed = Utc::now() - start;
metrics::histogram!("cyclotron_janitor_run_duration_ms", &self.metrics_labels)
.record(elapsed.num_milliseconds() as f64);
info!("Janitor loop complete");
Ok(CleanupResult {
completed,
failed,
poisoned,
stalled,
})
}
}

View File

@ -0,0 +1,2 @@
pub mod config;
pub mod janitor;

View File

@ -0,0 +1,105 @@
use axum::{extract::State, routing::get, Router};
use common_metrics::setup_metrics_routes;
use cyclotron_janitor::{config::Config, janitor::Janitor};
use envconfig::Envconfig;
use eyre::Result;
use health::{HealthHandle, HealthRegistry};
use std::{future::ready, time::Duration};
use tracing::{error, info};
/// Most of this stuff is stolen pretty shamelessly from the rustyhook janitor. It'll diverge more
/// once we introduce the management command stuff, but for now it's a good starting point.
async fn cleanup_loop(janitor: Janitor, liveness: HealthHandle, interval_secs: u64) -> Result<()> {
let mut interval = tokio::time::interval(Duration::from_secs(interval_secs));
loop {
interval.tick().await;
if let Err(e) = janitor.run_once().await {
// don't bother reporting unhealthy - a few times around this loop will put us in a stalled state
error!("janitor failed cleanup with: {}", e);
} else {
liveness.report_healthy().await;
}
}
}
async fn listen(app: Router, bind: String) -> Result<()> {
let listener = tokio::net::TcpListener::bind(bind).await?;
axum::serve(listener, app).await?;
Ok(())
}
// For axum's state extraction
#[derive(Clone)]
struct JanitorId(pub String);
pub fn app(liveness: HealthRegistry, janitor_id: String) -> Router {
Router::new()
.route("/", get(index))
.route("/_readiness", get(index))
.route("/_liveness", get(move || ready(liveness.get_status())))
.with_state(JanitorId(janitor_id))
}
async fn index(State(janitor_id): State<JanitorId>) -> String {
format!("cyclotron janitor {}", janitor_id.0)
}
#[tokio::main]
async fn main() {
let config = Config::init_from_env().expect("failed to load configuration from env");
tracing_subscriber::fmt::init();
let liveness = HealthRegistry::new("liveness");
let janitor_config = config.get_janitor_config();
let janitor_id = janitor_config.settings.id.clone();
let bind = format!("{}:{}", config.host, config.port);
info!(
"Starting janitor with ID {:?}, listening at {}",
janitor_id, bind
);
let janitor = Janitor::new(janitor_config)
.await
.expect("failed to create janitor");
let janitor_liveness = liveness
.register(
"janitor".to_string(),
Duration::from_secs(config.cleanup_interval_secs * 4),
)
.await;
let janitor_loop = tokio::spawn(cleanup_loop(
janitor,
janitor_liveness,
config.cleanup_interval_secs,
));
let app = setup_metrics_routes(app(liveness, janitor_id));
let http_server = tokio::spawn(listen(app, bind));
tokio::select! {
res = janitor_loop => {
error!("janitor loop exited");
if let Err(e) = res {
error!("janitor failed with: {}", e)
}
}
res = http_server => {
error!("http server exited");
if let Err(e) = res {
error!("server failed with: {}", e)
}
}
}
info!("exiting");
}

View File

@ -0,0 +1,226 @@
use chrono::{Duration, Utc};
use cyclotron_core::{
base_ops::{JobInit, JobState},
manager::QueueManager,
worker::Worker,
};
use cyclotron_janitor::{config::JanitorSettings, janitor::Janitor};
use sqlx::PgPool;
use uuid::Uuid;
#[sqlx::test(migrations = "../cyclotron-core/migrations")]
async fn janitor_test(db: PgPool) {
let worker = Worker::from_pool(db.clone());
let manager = QueueManager::from_pool(db.clone());
// Purposefully MUCH smaller than would be used in production, so
// we can simulate stalled or poison jobs quickly
let stall_timeout = Duration::milliseconds(10);
let max_touches = 3;
let settings = JanitorSettings {
stall_timeout,
max_touches,
id: "test_janitor".to_string(),
};
let janitor = Janitor::from_pool(db.clone(), settings);
let now = Utc::now() - Duration::seconds(10);
let queue_name = "default".to_string();
let job_init = JobInit {
team_id: 1,
queue_name: queue_name.clone(),
priority: 0,
scheduled: now,
function_id: Some(Uuid::now_v7()),
vm_state: None,
parameters: None,
metadata: None,
};
// First test - if we mark a job as completed, the janitor will clean it up
manager.create_job(job_init.clone()).await.unwrap();
let job = worker
.dequeue_jobs(&queue_name, 1)
.await
.unwrap()
.pop()
.unwrap();
worker.set_state(job.id, JobState::Completed).unwrap();
worker.flush_job(job.id).await.unwrap();
let result = janitor.run_once().await.unwrap();
assert_eq!(result.completed, 1);
assert_eq!(result.failed, 0);
assert_eq!(result.poisoned, 0);
assert_eq!(result.stalled, 0);
// Second test - if we mark a job as failed, the janitor will clean it up
manager.create_job(job_init.clone()).await.unwrap();
let job = worker
.dequeue_jobs(&queue_name, 1)
.await
.unwrap()
.pop()
.unwrap();
worker.set_state(job.id, JobState::Failed).unwrap();
worker.flush_job(job.id).await.unwrap();
let result = janitor.run_once().await.unwrap();
assert_eq!(result.completed, 0);
assert_eq!(result.failed, 1);
assert_eq!(result.poisoned, 0);
assert_eq!(result.stalled, 0);
// Third test - if we pick up a job, and then hold it for longer than
// the stall timeout, the janitor will reset it. After this, the worker
// cannot flush updates to the job, and must re-dequeue it.
manager.create_job(job_init.clone()).await.unwrap();
let job = worker
.dequeue_jobs(&queue_name, 1)
.await
.unwrap()
.pop()
.unwrap();
// First, cleanup won't do anything
let result = janitor.run_once().await.unwrap();
assert_eq!(result.completed, 0);
assert_eq!(result.failed, 0);
assert_eq!(result.poisoned, 0);
assert_eq!(result.stalled, 0);
// Then we stall on the job
tokio::time::sleep(stall_timeout.to_std().unwrap() * 2).await;
// Now, cleanup will reset the job
let result = janitor.run_once().await.unwrap();
assert_eq!(result.completed, 0);
assert_eq!(result.failed, 0);
assert_eq!(result.poisoned, 0);
assert_eq!(result.stalled, 1);
// Now, the worker can't flush the job
worker.set_state(job.id, JobState::Completed).unwrap();
let result = worker.flush_job(job.id).await;
assert!(result.is_err());
// But if we re-dequeue the job, we can flush it
let job = worker
.dequeue_jobs(&queue_name, 1)
.await
.unwrap()
.pop()
.unwrap();
worker.set_state(job.id, JobState::Completed).unwrap();
worker.flush_job(job.id).await.unwrap();
janitor.run_once().await.unwrap(); // Clean up the completed job to reset for the next test
// Fourth test - if a worker holds a job for longer than the stall
// time, but calls heartbeat, the job will not be reset
manager.create_job(job_init.clone()).await.unwrap();
let job = worker
.dequeue_jobs(&queue_name, 1)
.await
.unwrap()
.pop()
.unwrap();
let start = tokio::time::Instant::now();
loop {
worker.heartbeat(job.id).await.unwrap();
tokio::time::sleep(Duration::milliseconds(1).to_std().unwrap()).await;
if start.elapsed() > stall_timeout.to_std().unwrap() * 2 {
break;
}
}
let result = janitor.run_once().await.unwrap();
assert_eq!(result.completed, 0);
assert_eq!(result.failed, 0);
assert_eq!(result.poisoned, 0);
assert_eq!(result.stalled, 0);
// The worker can still flush the job
worker.set_state(job.id, JobState::Completed).unwrap();
worker.flush_job(job.id).await.unwrap();
// and now cleanup will work
let result = janitor.run_once().await.unwrap();
assert_eq!(result.completed, 1);
assert_eq!(result.failed, 0);
assert_eq!(result.poisoned, 0);
assert_eq!(result.stalled, 0);
// Fifth test - if a job stalls more than max_touches
// it will be marked as poisoned and deleted
manager.create_job(job_init.clone()).await.unwrap();
let mut job = worker
.dequeue_jobs(&queue_name, 1)
.await
.unwrap()
.pop()
.unwrap();
for _ in 0..max_touches {
tokio::time::sleep(stall_timeout.to_std().unwrap() * 2).await;
let result = janitor.run_once().await.unwrap();
assert_eq!(result.completed, 0);
assert_eq!(result.failed, 0);
assert_eq!(result.poisoned, 0);
assert_eq!(result.stalled, 1);
// assert we can't update the job (flush and heartbeat fail)
worker.set_state(job.id, JobState::Completed).unwrap();
let result = worker.heartbeat(job.id).await;
assert!(result.is_err());
let result = worker.flush_job(job.id).await;
assert!(result.is_err());
// re-dequeue the job
job = worker
.dequeue_jobs(&queue_name, 1)
.await
.unwrap()
.pop()
.unwrap();
}
// At this point, the "janitor touches" on the job is 3 (it's been stalled and reset 3 times), so one more cleanup loop will delete it
// Now stall one more time, and on cleanup, we should see the job was considered poison and deleted
tokio::time::sleep(stall_timeout.to_std().unwrap() * 2).await;
let result: cyclotron_janitor::janitor::CleanupResult = janitor.run_once().await.unwrap();
assert_eq!(result.completed, 0);
assert_eq!(result.failed, 0);
assert_eq!(result.poisoned, 1);
assert_eq!(result.stalled, 0);
// The worker can't flush the job
worker.set_state(job.id, JobState::Completed).unwrap();
let result = worker.flush_job(job.id).await;
assert!(result.is_err());
// Sixth test - the janitor can operate on multiple jobs at once
manager.create_job(job_init.clone()).await.unwrap();
manager.create_job(job_init.clone()).await.unwrap();
let jobs = worker.dequeue_jobs(&queue_name, 2).await.unwrap();
worker.set_state(jobs[0].id, JobState::Completed).unwrap();
worker.set_state(jobs[1].id, JobState::Failed).unwrap();
worker.flush_job(jobs[0].id).await.unwrap();
worker.flush_job(jobs[1].id).await.unwrap();
let result = janitor.run_once().await.unwrap();
assert_eq!(result.completed, 1);
assert_eq!(result.failed, 1);
assert_eq!(result.poisoned, 0);
assert_eq!(result.stalled, 0);
}

7
rust/cyclotron-node/.gitignore vendored Normal file
View File

@ -0,0 +1,7 @@
target
index.node
**/node_modules
**/.DS_Store
npm-debug.log*
cargo.log
cross.log
dist/

View File

@ -0,0 +1,22 @@
[package]
name = "cyclotron-node"
version = "0.1.0"
edition = "2021"
exclude = ["index.node"]
[lints]
workspace = true
[lib]
crate-type = ["cdylib"]
[dependencies]
cyclotron-core = { path = "../cyclotron-core" }
neon = { workspace = true }
once_cell = { workspace = true }
tokio = { workspace = true }
serde_json = { workspace = true }
serde = { workspace = true }
uuid = { workspace = true }
chrono = { workspace = true }

View File

@ -0,0 +1,144 @@
const assert = require('assert')
const cyclotron = require('../.')
const crypto = require('crypto')
// Set of available job states
const JOB_STATES = Object.freeze({
AVAILABLE: 'available',
RUNNING: 'running',
FAILED: 'failed',
COMPLETED: 'completed',
})
const AVAILABLE_WORKERS = Object.freeze({
FETCH: 'fetch',
HOG: 'hog',
})
async function main() {
let poolConfig = {
db_url: 'postgresql://posthog:posthog@localhost:5432/cyclotron',
}
let managerConfig = {
shards: [poolConfig],
}
// Most processes will only need to do one of these, but we can do both here for demonstration purposes
await cyclotron.initWorker(JSON.stringify(poolConfig))
await cyclotron.initManager(JSON.stringify(managerConfig))
// The maybeInit variants won't throw when called again, and short-circuit to be almost free, so they're safe to call frequently
// (although I still wouldn't call them in a loop)
await cyclotron.maybeInitWorker(JSON.stringify(poolConfig))
await cyclotron.maybeInitManager(JSON.stringify(managerConfig))
let five_minutes_ago = new Date(new Date().getTime() - 5 * 60000).toISOString()
let queue_name = 'default'
let job_1 = {
team_id: 1,
queue_name,
priority: 0,
scheduled: five_minutes_ago,
function_id: crypto.randomUUID(), // Is nullable
vm_state: null,
parameters: null,
metadata: null,
}
let job_2 = {
team_id: 1,
queue_name,
priority: 1,
scheduled: five_minutes_ago,
function_id: crypto.randomUUID(), // Is nullable
vm_state: null,
parameters: null,
metadata: null,
}
await cyclotron.createJob(JSON.stringify(job_1))
await cyclotron.createJob(JSON.stringify(job_2))
// Jobs (as well as any other 'complex' data shape) are serialized across the API boundary,
// because that's (according to the neon maintainers) /actually faster/ than doing a bunch
// of cross-runtime pointer chasing.
let jobs = JSON.parse(await cyclotron.dequeueJobs(queue_name, 2))
assert(jobs.length === 2)
assert(jobs[0].function_id === job_1.function_id)
assert(jobs[1].function_id === job_2.function_id)
job_1 = jobs[0]
job_2 = jobs[1]
// All of these throw if the job hasn't been dequeued by the worker created when init_worker was called,
// or if there's some serde error - generally, interacting with the cyclotron should involve try/catch in
// some far outer catch. We can iterate on this API to make it more ergonomic with time, but
// my js/ts is... rusty (co-pilot wrote this joke)
cyclotron.setState(job_1.id, JOB_STATES.AVAILABLE)
cyclotron.setState(job_2.id, JOB_STATES.AVAILABLE)
cyclotron.setQueue(job_1.id, 'non-default')
cyclotron.setQueue(job_2.id, 'non-default')
// Priority is lowest-first, so this means we can assert that job_2 will be returned first on subsequent dequeue_jobs
cyclotron.setPriority(job_1.id, 2)
cyclotron.setPriority(job_2.id, 1)
let ten_minutes_ago = new Date(new Date().getTime() - 10 * 60000).toISOString()
cyclotron.setScheduledAt(job_1.id, ten_minutes_ago)
cyclotron.setScheduledAt(job_2.id, ten_minutes_ago)
cyclotron.setVmState(job_1.id, JSON.stringify({ state: 'running' }))
cyclotron.setVmState(job_2.id, JSON.stringify({ state: 'running' }))
cyclotron.setParameters(job_1.id, JSON.stringify({ parameters: 'running' }))
cyclotron.setParameters(job_2.id, JSON.stringify({ parameters: 'running' }))
cyclotron.setMetadata(job_1.id, JSON.stringify({ metadata: 'running' }))
cyclotron.setMetadata(job_2.id, JSON.stringify({ metadata: 'running' }))
// Flush the updates queued up above back to the queue. Subsequent calls to flush
// will throw if a job isn't re-acquired. Flushes will fail if a job state update
// isn't included (workers should not purposefully leave jobs in a running state)
await cyclotron.flushJob(job_1.id)
await cyclotron.flushJob(job_2.id)
jobs = JSON.parse(await cyclotron.dequeueWithVmState('non-default', 2))
assert(jobs[0].id == job_2.id)
assert(jobs[1].id == job_1.id)
assert(jobs[0].function_id === job_2.function_id)
assert(jobs[1].function_id === job_1.function_id)
assert(jobs[0].team_id === job_2.team_id)
assert(jobs[1].team_id === job_1.team_id)
assert(jobs[0].queue_name === 'non-default')
assert(jobs[1].queue_name === 'non-default')
assert(jobs[0].priority === 1)
assert(jobs[1].priority === 2)
assert(jobs[0].scheduled === ten_minutes_ago)
assert(jobs[1].scheduled === ten_minutes_ago)
assert(jobs[0].vm_state === JSON.stringify({ state: 'running' }))
assert(jobs[1].vm_state === JSON.stringify({ state: 'running' }))
assert(jobs[0].parameters === JSON.stringify({ parameters: 'running' }))
assert(jobs[1].parameters === JSON.stringify({ parameters: 'running' }))
assert(jobs[0].metadata === JSON.stringify({ metadata: 'running' }))
assert(jobs[1].metadata === JSON.stringify({ metadata: 'running' }))
// Now we'll mark these jobs as completed
cyclotron.setState(job_1.id, JOB_STATES.COMPLETED)
cyclotron.setState(job_2.id, JOB_STATES.COMPLETED)
// And flush them back to the queue
await cyclotron.flushJob(job_1.id)
await cyclotron.flushJob(job_2.id)
}
main()

View File

@ -0,0 +1,27 @@
{
"name": "@posthog/cyclotron",
"version": "0.1.0",
"description": "Node bindings for cyclotron",
"main": "dist/index.js",
"types": "dist/index.d.ts",
"scripts": {
"test": "cargo test",
"build": "pnpm run build:cargo --release && pnpm run build:move-lib && pnpm run build:typescript",
"build:move-lib": "cp ../target/release/libcyclotron_node.dylib index.node || cp ../target/release/libcyclotron_node.so index.node",
"build:cargo": "cargo build --message-format=json > cargo.log",
"build:cargo:debug": "pnpm run build:cargo",
"build:cross": "cross build --message-format=json > cross.log",
"build:typescript": "tsc",
"package": "NODE_ENV=development pnpm i --dev && pnpm run build"
},
"author": "",
"license": "MIT",
"devDependencies": {
"@types/node": "^22.4.1",
"typescript": "^4.7.4"
},
"files": [
"dist",
"index.node"
]
}

View File

@ -0,0 +1,31 @@
lockfileVersion: '6.0'
settings:
autoInstallPeers: true
excludeLinksFromLockfile: false
devDependencies:
'@types/node':
specifier: ^22.4.1
version: 22.4.1
typescript:
specifier: ^4.7.4
version: 4.9.5
packages:
/@types/node@22.4.1:
resolution: {integrity: sha512-1tbpb9325+gPnKK0dMm+/LMriX0vKxf6RnB0SZUqfyVkQ4fMgUSySqhxE/y8Jvs4NyF1yHzTfG9KlnkIODxPKg==}
dependencies:
undici-types: 6.19.8
dev: true
/typescript@4.9.5:
resolution: {integrity: sha512-1FXk9E2Hm+QzZQ7z+McJiHL4NW1F2EzMu9Nq9i3zAaGqibafqYwCVU6WyWAuyQRRzOlxou8xZSyXLEN8oKj24g==}
engines: {node: '>=4.2.0'}
hasBin: true
dev: true
/undici-types@6.19.8:
resolution: {integrity: sha512-ve2KP6f/JnbPBFyobGHuerC9g1FYGn/F8n1LWTwNxCEzd6IfqTwUQcNXgEtmmQ6DlRrC1hrSrBnCZPokRrDHjw==}
dev: true

View File

@ -0,0 +1,257 @@
// eslint-disable-next-line @typescript-eslint/no-var-requires
const cyclotron = require('../index.node')
export interface PoolConfig {
dbUrl: string
maxConnections?: number
minConnections?: number
acquireTimeoutSeconds?: number
maxLifetimeSeconds?: number
idleTimeoutSeconds?: number
}
// Type as expected by Cyclotron.
interface InternalPoolConfig {
db_url: string
max_connections?: number
min_connections?: number
acquire_timeout_seconds?: number
max_lifetime_seconds?: number
idle_timeout_seconds?: number
}
export interface ManagerConfig {
shards: PoolConfig[]
}
// Type as expected by Cyclotron.
interface InternalManagerConfig {
shards: InternalPoolConfig[]
}
export interface JobInit {
teamId: number
functionId: string
queueName: string
priority?: number
scheduled?: Date
vmState?: string
parameters?: string
metadata?: string
}
// Type as expected by Cyclotron.
interface InternalJobInit {
team_id: number
function_id: string
queue_name: string
priority?: number
scheduled?: Date
vm_state?: string
parameters?: string
metadata?: string
}
export type JobState = 'available' | 'running' | 'completed' | 'failed' | 'paused'
export interface Job {
id: string
teamId: number
functionId: string | null
created: Date
lockId: string | null
lastHeartbeat: Date | null
janitorTouchCount: number
transitionCount: number
lastTransition: Date
queueName: string
state: JobState
priority: number
scheduled: Date
vmState: string | null
metadata: string | null
parameters: string | null
}
// Type as returned by Cyclotron.
interface InternalJob {
id: string
team_id: number
function_id: string | null
created: string
lock_id: string | null
last_heartbeat: string | null
janitor_touch_count: number
transition_count: number
last_transition: string
queue_name: string
state: JobState
priority: number
scheduled: string
vm_state: string | null
metadata: string | null
parameters: string | null
}
async function initWorker(poolConfig: PoolConfig): Promise<void> {
const initWorkerInternal: InternalPoolConfig = {
db_url: poolConfig.dbUrl,
max_connections: poolConfig.maxConnections,
min_connections: poolConfig.minConnections,
acquire_timeout_seconds: poolConfig.acquireTimeoutSeconds,
max_lifetime_seconds: poolConfig.maxLifetimeSeconds,
idle_timeout_seconds: poolConfig.idleTimeoutSeconds,
}
return await cyclotron.initWorker(JSON.stringify(initWorkerInternal))
}
async function initManager(managerConfig: ManagerConfig): Promise<void> {
const managerConfigInternal: InternalManagerConfig = {
shards: managerConfig.shards.map((shard) => ({
db_url: shard.dbUrl,
max_connections: shard.maxConnections,
min_connections: shard.minConnections,
acquire_timeout_seconds: shard.acquireTimeoutSeconds,
max_lifetime_seconds: shard.maxLifetimeSeconds,
idle_timeout_seconds: shard.idleTimeoutSeconds,
})),
}
return await cyclotron.initManager(JSON.stringify(managerConfigInternal))
}
async function maybeInitWorker(poolConfig: PoolConfig): Promise<void> {
const initWorkerInternal: InternalPoolConfig = {
db_url: poolConfig.dbUrl,
max_connections: poolConfig.maxConnections,
min_connections: poolConfig.minConnections,
acquire_timeout_seconds: poolConfig.acquireTimeoutSeconds,
max_lifetime_seconds: poolConfig.maxLifetimeSeconds,
idle_timeout_seconds: poolConfig.idleTimeoutSeconds,
}
return await cyclotron.maybeInitWorker(JSON.stringify(initWorkerInternal))
}
async function maybeInitManager(managerConfig: ManagerConfig): Promise<void> {
const managerConfigInternal: InternalManagerConfig = {
shards: managerConfig.shards.map((shard) => ({
db_url: shard.dbUrl,
max_connections: shard.maxConnections,
min_connections: shard.minConnections,
acquire_timeout_seconds: shard.acquireTimeoutSeconds,
max_lifetime_seconds: shard.maxLifetimeSeconds,
idle_timeout_seconds: shard.idleTimeoutSeconds,
})),
}
return await cyclotron.maybeInitManager(JSON.stringify(managerConfigInternal))
}
export async function createJob(job: JobInit): Promise<void> {
job.priority ??= 1
job.scheduled ??= new Date()
const jobInitInternal: InternalJobInit = {
team_id: job.teamId,
function_id: job.functionId,
queue_name: job.queueName,
priority: job.priority,
scheduled: job.scheduled,
vm_state: job.vmState,
parameters: job.parameters,
metadata: job.metadata,
}
return await cyclotron.createJob(JSON.stringify(jobInitInternal))
}
function convertInternalJobToJob(jobInternal: InternalJob): Job {
return {
id: jobInternal.id,
teamId: jobInternal.team_id,
functionId: jobInternal.function_id,
created: new Date(jobInternal.created),
lockId: jobInternal.lock_id,
lastHeartbeat: jobInternal.last_heartbeat ? new Date(jobInternal.last_heartbeat) : null,
janitorTouchCount: jobInternal.janitor_touch_count,
transitionCount: jobInternal.transition_count,
lastTransition: new Date(jobInternal.last_transition),
queueName: jobInternal.queue_name,
state: jobInternal.state,
priority: jobInternal.priority,
scheduled: new Date(jobInternal.scheduled),
vmState: jobInternal.vm_state,
metadata: jobInternal.metadata,
parameters: jobInternal.parameters,
}
}
async function dequeueJobs(queueName: string, limit: number): Promise<Job[]> {
const jobsStr = await cyclotron.dequeueJobs(queueName, limit)
const jobs: InternalJob[] = JSON.parse(jobsStr)
return jobs.map(convertInternalJobToJob)
}
async function dequeueJobsWithVmState(queueName: string, limit: number): Promise<Job[]> {
const jobsStr = await cyclotron.dequeueJobsWithVmState(queueName, limit)
const jobs: InternalJob[] = JSON.parse(jobsStr)
return jobs.map(convertInternalJobToJob)
}
async function flushJob(jobId: string): Promise<void> {
return await cyclotron.flushJob(jobId)
}
function setState(jobId: string, jobState: JobState): Promise<void> {
return cyclotron.setState(jobId, jobState)
}
function setQueue(jobId: string, queueName: string): Promise<void> {
return cyclotron.setQueue(jobId, queueName)
}
function setPriority(jobId: string, priority: number): Promise<void> {
return cyclotron.setPriority(jobId, priority)
}
function setScheduledAt(jobId: string, scheduledAt: Date): Promise<void> {
return cyclotron.setScheduledAt(jobId, scheduledAt.toISOString())
}
function serializeObject(name: string, obj: Record<string, any> | null): string | null {
if (obj === null) {
return null
} else if (typeof obj === 'object' && obj !== null) {
return JSON.stringify(obj)
}
throw new Error(`${name} must be either an object or null`)
}
function setVmState(jobId: string, vmState: Record<string, any> | null): Promise<void> {
const serialized = serializeObject('vmState', vmState)
return cyclotron.setVmState(jobId, serialized)
}
function setMetadata(jobId: string, metadata: Record<string, any> | null): Promise<void> {
const serialized = serializeObject('metadata', metadata)
return cyclotron.setMetadata(jobId, serialized)
}
function setParameters(jobId: string, parameters: Record<string, any> | null): Promise<void> {
const serialized = serializeObject('parameters', parameters)
return cyclotron.setParameters(jobId, serialized)
}
export default {
initWorker,
initManager,
maybeInitWorker,
maybeInitManager,
createJob,
dequeueJobs,
dequeueJobsWithVmState,
flushJob,
setState,
setQueue,
setPriority,
setScheduledAt,
setVmState,
setMetadata,
setParameters,
}
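For orientation, here is a minimal consumer sketch of the wrapper above. It is illustrative only: the package name, database URL, queue name, function UUID and parameters payload are assumptions, not part of this commit.
import cyclotron from '@posthog/cyclotron'

async function example(): Promise<void> {
    const dbUrl = 'postgresql://posthog:posthog@localhost:5432/cyclotron' // assumed local dev database
    // One-time setup; the maybeInit* variants are safe to call repeatedly.
    await cyclotron.maybeInitManager({ shards: [{ dbUrl }] })
    await cyclotron.maybeInitWorker({ dbUrl })

    // Enqueue a job, then pull it back off the queue (queue name is illustrative).
    await cyclotron.createJob({
        teamId: 1,
        functionId: '00000000-0000-0000-0000-000000000000', // hypothetical function id
        queueName: 'example',
        parameters: JSON.stringify({ some: 'payload' }),
    })
    const jobs = await cyclotron.dequeueJobs('example', 1)

    // Workers must set a terminal state before flushing updates back to the queue.
    for (const job of jobs) {
        await cyclotron.setState(job.id, 'completed')
        await cyclotron.flushJob(job.id)
    }
}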

View File

@ -0,0 +1,450 @@
use chrono::{DateTime, Utc};
use cyclotron_core::{
base_ops::{JobInit, JobState},
manager::{ManagerConfig, QueueManager},
worker::Worker,
PoolConfig,
};
use neon::{
handle::Handle,
prelude::{Context, FunctionContext, ModuleContext},
result::{JsResult, NeonResult},
types::{JsNull, JsNumber, JsPromise, JsString, JsValue},
};
use once_cell::sync::OnceCell;
use serde::de::DeserializeOwned;
use serde_json::Value;
use tokio::runtime::Runtime;
use uuid::Uuid;
static WORKER: OnceCell<Worker> = OnceCell::new();
static MANAGER: OnceCell<QueueManager> = OnceCell::new();
static RUNTIME: OnceCell<Runtime> = OnceCell::new();
fn runtime<'a, C: Context<'a>>(cx: &mut C) -> NeonResult<&'static Runtime> {
RUNTIME
.get_or_try_init(Runtime::new)
.or_else(|e| cx.throw_error(format!("failed to create tokio runtime: {}", e)))
}
// The general interface for calling our functions takes a JSON serialized string,
// because neon has no nice serde support for function arguments (and generally,
// ripping objects from the v8 runtime piece by piece is slower than just passing
// a single chunk of bytes). These are convenience functions for converting between
// JSON strings and Rust values.
pub fn from_json_string<'a, T, C>(cx: &mut C, object: Handle<JsString>) -> NeonResult<T>
where
T: DeserializeOwned,
C: Context<'a>,
{
let value: T =
serde_json::from_str(&object.value(cx)).or_else(|e| cx.throw_error(format!("{}", e)))?;
Ok(value)
}
pub fn to_json_string<'a, T, C>(cx: &mut C, value: T) -> NeonResult<String>
where
T: serde::Serialize,
C: Context<'a>,
{
let value = serde_json::to_string(&value)
.or_else(|e| cx.throw_error(format!("failed to serialize value: {}", e)))?;
Ok(value)
}
fn hello(mut cx: FunctionContext) -> JsResult<JsString> {
let arg1 = cx.argument::<JsString>(0)?;
let value: Value = from_json_string(&mut cx, arg1)?;
let string = to_json_string(&mut cx, value)?;
Ok(cx.string(string))
}
fn init_worker_impl(mut cx: FunctionContext, throw_on_reinit: bool) -> JsResult<JsPromise> {
let arg1 = cx.argument::<JsString>(0)?;
let config: PoolConfig = from_json_string(&mut cx, arg1)?;
let (deferred, promise) = cx.promise();
let channel = cx.channel();
let runtime = runtime(&mut cx)?;
let fut = async move {
let worker = Worker::new(config).await;
deferred.settle_with(&channel, move |mut cx| {
if WORKER.get().is_some() && !throw_on_reinit {
return Ok(cx.null()); // Short circuit to make using maybe_init a no-op
}
let worker = worker.or_else(|e| cx.throw_error(format!("{}", e)))?;
let already_set = WORKER.set(worker).is_err();
if already_set && throw_on_reinit {
cx.throw_error("worker already initialized")
} else {
Ok(cx.null())
}
});
};
runtime.spawn(fut);
Ok(promise)
}
fn init_manager_impl(mut cx: FunctionContext, throw_on_reinit: bool) -> JsResult<JsPromise> {
let arg1 = cx.argument::<JsString>(0)?;
let config: ManagerConfig = from_json_string(&mut cx, arg1)?;
let (deferred, promise) = cx.promise();
let channel = cx.channel();
let runtime = runtime(&mut cx)?;
let fut = async move {
let manager = QueueManager::new(config).await;
deferred.settle_with(&channel, move |mut cx| {
if MANAGER.get().is_some() && !throw_on_reinit {
return Ok(cx.null()); // Short circuit to make using maybe_init a no-op
}
let manager = manager.or_else(|e| cx.throw_error(format!("{}", e)))?;
let already_set = MANAGER.set(manager).is_err();
if already_set && throw_on_reinit {
cx.throw_error("manager already initialized")
} else {
Ok(cx.null())
}
});
};
runtime.spawn(fut);
Ok(promise)
}
fn init_worker(cx: FunctionContext) -> JsResult<JsPromise> {
init_worker_impl(cx, true)
}
fn init_manager(cx: FunctionContext) -> JsResult<JsPromise> {
init_manager_impl(cx, true)
}
fn maybe_init_worker(cx: FunctionContext) -> JsResult<JsPromise> {
init_worker_impl(cx, false)
}
fn maybe_init_manager(cx: FunctionContext) -> JsResult<JsPromise> {
init_manager_impl(cx, false)
}
// throw_error has a type signature that makes it inconvenient to use in closures, because
// it requires that you specify the V of the NeonResult<V> returned, even though it's always
// an error. This is a sane thing for it to do, but it's inconvenient for us, because we
// frequently settle promises early, before we have a V to use for type inference. This little
// wrapper makes that easier, by specifying the V as JsNull
fn throw_null_err<'c, C>(cx: &mut C, msg: &str) -> NeonResult<Handle<'c, JsNull>>
where
C: Context<'c>,
{
cx.throw_error(msg)
}
fn create_job(mut cx: FunctionContext) -> JsResult<JsPromise> {
let arg1: Handle<JsString> = cx.argument::<JsString>(0)?;
let job: JobInit = from_json_string(&mut cx, arg1)?;
let (deferred, promise) = cx.promise();
let channel = cx.channel();
let runtime = runtime(&mut cx)?;
let fut = async move {
let manager = match MANAGER.get() {
Some(manager) => manager,
None => {
deferred.settle_with(&channel, |mut cx| {
throw_null_err(&mut cx, "manager not initialized")
});
return;
}
};
let job = manager.create_job(job).await;
deferred.settle_with(&channel, move |mut cx| {
job.or_else(|e| cx.throw_error(format!("{}", e)))?;
Ok(cx.null())
});
};
runtime.spawn(fut);
Ok(promise)
}
fn dequeue_jobs(mut cx: FunctionContext) -> JsResult<JsPromise> {
let queue_name = cx.argument::<JsString>(0)?.value(&mut cx);
let limit = cx.argument::<JsNumber>(1)?.value(&mut cx) as usize; // TODO - I don't love this cast
let (deferred, promise) = cx.promise();
let channel = cx.channel();
let runtime = runtime(&mut cx)?;
let fut = async move {
let worker = match WORKER.get() {
Some(worker) => worker,
None => {
deferred.settle_with(&channel, |mut cx| {
throw_null_err(&mut cx, "worker not initialized")
});
return;
}
};
let jobs = worker.dequeue_jobs(&queue_name, limit).await;
deferred.settle_with(&channel, move |mut cx| {
let jobs = jobs.or_else(|e| cx.throw_error(format!("{}", e)))?;
let jobs = to_json_string(&mut cx, jobs)?;
Ok(cx.string(jobs))
});
};
runtime.spawn(fut);
Ok(promise)
}
fn dequeue_with_vm_state(mut cx: FunctionContext) -> JsResult<JsPromise> {
let queue_name = cx.argument::<JsString>(0)?.value(&mut cx);
let limit = cx.argument::<JsNumber>(1)?.value(&mut cx) as usize; // TODO - I don't love this cast
let (deferred, promise) = cx.promise();
let channel = cx.channel();
let runtime = runtime(&mut cx)?;
let fut = async move {
let worker = match WORKER.get() {
Some(worker) => worker,
None => {
deferred.settle_with(&channel, |mut cx| {
throw_null_err(&mut cx, "worker not initialized")
});
return;
}
};
let jobs = worker.dequeue_with_vm_state(&queue_name, limit).await;
deferred.settle_with(&channel, move |mut cx| {
let jobs = jobs.or_else(|e| cx.throw_error(format!("{}", e)))?;
let jobs = to_json_string(&mut cx, jobs)?;
Ok(cx.string(jobs))
});
};
runtime.spawn(fut);
Ok(promise)
}
fn flush_job(mut cx: FunctionContext) -> JsResult<JsPromise> {
let arg1 = cx.argument::<JsString>(0)?.value(&mut cx);
let job_id: Uuid = arg1
.parse()
.or_else(|_| cx.throw_error(format!("invalid job id: {}", arg1)))?;
let (deferred, promise) = cx.promise();
let channel = cx.channel();
let runtime = runtime(&mut cx)?;
let fut = async move {
let worker = match WORKER.get() {
Some(worker) => worker,
None => {
deferred.settle_with(&channel, |mut cx| {
throw_null_err(&mut cx, "worker not initialized")
});
return;
}
};
let res = worker.flush_job(job_id).await;
deferred.settle_with(&channel, move |mut cx| {
res.or_else(|e: cyclotron_core::error::QueueError| cx.throw_error(format!("{}", e)))?;
Ok(cx.null())
});
};
runtime.spawn(fut);
Ok(promise)
}
fn set_state(mut cx: FunctionContext) -> JsResult<JsNull> {
let arg = cx.argument::<JsString>(0)?.value(&mut cx);
let job_id: Uuid = arg
.parse()
.or_else(|_| cx.throw_error(format!("invalid job id: {}", arg)))?;
let arg = cx.argument::<JsString>(1)?.value(&mut cx);
let state: JobState = arg
.parse()
.or_else(|_| cx.throw_error(format!("invalid job state: {}", arg)))?;
WORKER
.get()
.map_or_else(|| cx.throw_error("worker not initialized"), Ok)?
.set_state(job_id, state)
.or_else(|e| cx.throw_error(format!("{}", e)))?;
Ok(cx.null())
}
fn set_queue(mut cx: FunctionContext) -> JsResult<JsNull> {
let arg = cx.argument::<JsString>(0)?.value(&mut cx);
let job_id: Uuid = arg
.parse()
.or_else(|_| cx.throw_error(format!("invalid job id: {}", arg)))?;
let queue = cx.argument::<JsString>(1)?.value(&mut cx);
WORKER
.get()
.map_or_else(|| cx.throw_error("worker not initialized"), Ok)?
.set_queue(job_id, &queue)
.or_else(|e| cx.throw_error(format!("{}", e)))?;
Ok(cx.null())
}
fn set_priority(mut cx: FunctionContext) -> JsResult<JsNull> {
let arg = cx.argument::<JsString>(0)?.value(&mut cx);
let job_id: Uuid = arg
.parse()
.or_else(|_| cx.throw_error(format!("invalid job id: {}", arg)))?;
let arg = cx.argument::<JsNumber>(1)?.value(&mut cx);
let priority = arg as i16; // TODO - I /really/ don't love this cast
WORKER
.get()
.map_or_else(|| cx.throw_error("worker not initialized"), Ok)?
.set_priority(job_id, priority)
.or_else(|e| cx.throw_error(format!("{}", e)))?;
Ok(cx.null())
}
fn set_scheduled_at(mut cx: FunctionContext) -> JsResult<JsNull> {
let arg = cx.argument::<JsString>(0)?.value(&mut cx);
let job_id: Uuid = arg
.parse()
.or_else(|_| cx.throw_error(format!("invalid job id: {}", arg)))?;
let arg = cx.argument::<JsString>(1)?.value(&mut cx);
let scheduled: DateTime<Utc> = arg
.parse()
.or_else(|_| cx.throw_error(format!("invalid scheduled at: {}", arg)))?;
WORKER
.get()
.map_or_else(|| cx.throw_error("worker not initialized"), Ok)?
.set_scheduled_at(job_id, scheduled)
.or_else(|e| cx.throw_error(format!("{}", e)))?;
Ok(cx.null())
}
fn set_vm_state(mut cx: FunctionContext) -> JsResult<JsNull> {
let arg = cx.argument::<JsString>(0)?.value(&mut cx);
let job_id: Uuid = arg
.parse()
.or_else(|_| cx.throw_error(format!("invalid job id: {}", arg)))?;
// Tricky - we have to support passing nulls here, because that's how you clear vm state.
let vm_state = cx.argument::<JsValue>(1)?;
let vm_state = if vm_state.is_a::<JsNull, _>(&mut cx) {
None
} else {
Some(
vm_state
.downcast_or_throw::<JsString, _>(&mut cx)?
.value(&mut cx),
)
};
WORKER
.get()
.map_or_else(|| cx.throw_error("worker not initialized"), Ok)?
.set_vm_state(job_id, vm_state)
.or_else(|e| cx.throw_error(format!("{}", e)))?;
Ok(cx.null())
}
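// A possible consolidation sketch, not called anywhere in this file: the
// "JsNull clears, JsString sets" handling repeated by set_vm_state, set_metadata
// and set_parameters, expressed as one helper.
fn optional_string<'a, C: Context<'a>>(
    cx: &mut C,
    value: Handle<JsValue>,
) -> NeonResult<Option<String>> {
    if value.is_a::<JsNull, _>(cx) {
        // Null clears the field on the job.
        Ok(None)
    } else {
        // Anything else must downcast to a string, or we throw.
        Ok(Some(value.downcast_or_throw::<JsString, _>(cx)?.value(cx)))
    }
}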
fn set_metadata(mut cx: FunctionContext) -> JsResult<JsNull> {
let arg = cx.argument::<JsString>(0)?.value(&mut cx);
let job_id: Uuid = arg
.parse()
.or_else(|_| cx.throw_error(format!("invalid job id: {}", arg)))?;
// Tricky - we have to support passing nulls here, because that's how you clear metadata.
let metadata = cx.argument::<JsValue>(1)?;
let metadata = if metadata.is_a::<JsNull, _>(&mut cx) {
None
} else {
Some(
metadata
.downcast_or_throw::<JsString, _>(&mut cx)?
.value(&mut cx),
)
};
WORKER
.get()
.map_or_else(|| cx.throw_error("worker not initialized"), Ok)?
.set_metadata(job_id, metadata)
.or_else(|e| cx.throw_error(format!("{}", e)))?;
Ok(cx.null())
}
fn set_parameters(mut cx: FunctionContext) -> JsResult<JsNull> {
let arg = cx.argument::<JsString>(0)?.value(&mut cx);
let job_id: Uuid = arg
.parse()
.or_else(|_| cx.throw_error(format!("invalid job id: {}", arg)))?;
// Tricky - we have to support passing nulls here, because that's how you clear parameters.
let parameters = cx.argument::<JsValue>(1)?;
let parameters = if parameters.is_a::<JsNull, _>(&mut cx) {
None
} else {
Some(
parameters
.downcast_or_throw::<JsString, _>(&mut cx)?
.value(&mut cx),
)
};
WORKER
.get()
.map_or_else(|| cx.throw_error("worker not initialized"), Ok)?
.set_parameters(job_id, parameters)
.or_else(|e| cx.throw_error(format!("{}", e)))?;
Ok(cx.null())
}
#[neon::main]
fn main(mut cx: ModuleContext) -> NeonResult<()> {
cx.export_function("hello", hello)?;
cx.export_function("initWorker", init_worker)?;
cx.export_function("initManager", init_manager)?;
cx.export_function("maybeInitWorker", maybe_init_worker)?;
cx.export_function("maybeInitManager", maybe_init_manager)?;
cx.export_function("createJob", create_job)?;
cx.export_function("dequeueJobs", dequeue_jobs)?;
cx.export_function("dequeueJobsWithVmState", dequeue_with_vm_state)?;
cx.export_function("flushJob", flush_job)?;
cx.export_function("setState", set_state)?;
cx.export_function("setQueue", set_queue)?;
cx.export_function("setPriority", set_priority)?;
cx.export_function("setScheduledAt", set_scheduled_at)?;
cx.export_function("setVmState", set_vm_state)?;
cx.export_function("setMetadata", set_metadata)?;
cx.export_function("setParameters", set_parameters)?;
Ok(())
}
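// Note on the JS side of this interface (an assumption about the cyclotron-node
// wrapper, which is not shown in this diff): because the exports above speak JSON
// strings, callers are expected to JSON.stringify job payloads and JSON.parse
// results, e.g. createJob(JSON.stringify(job)) and
// JSON.parse(await dequeueJobs(queueName, limit)).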

View File

@ -0,0 +1,24 @@
{
"compilerOptions": {
"module": "CommonJS",
"target": "ESNext",
"declaration": true,
"removeComments": true,
"emitDecoratorMetadata": true,
"experimentalDecorators": true,
"moduleResolution": "node",
"esModuleInterop": true,
"allowJs": true,
"sourceMap": true,
"baseUrl": "src/",
"rootDir": "src/",
"outDir": "dist/",
"types": ["node"],
"resolveJsonModule": true,
"strict": true,
"noImplicitAny": true,
"useUnknownInCatchVariables": false
},
"include": ["src"],
"exclude": ["node_modules", "dist", "bin"]
}

View File

@ -22,3 +22,4 @@ tower = { workspace = true }
tracing = { workspace = true }
tracing-subscriber = { workspace = true }
url = { workspace = true }
common-metrics = { path = "../common/metrics" }

View File

@ -3,7 +3,7 @@ use config::Config;
use envconfig::Envconfig;
use eyre::Result;
use hook_common::metrics::setup_metrics_routes;
use common_metrics::setup_metrics_routes;
use hook_common::pgqueue::PgQueue;
mod config;

View File

@ -8,13 +8,10 @@ workspace = true
[dependencies]
async-trait = { workspace = true }
axum = { workspace = true, features = ["http2"] }
chrono = { workspace = true }
envconfig = { workspace = true }
health = { path = "../common/health" }
http = { workspace = true }
metrics = { workspace = true }
metrics-exporter-prometheus = { workspace = true }
rdkafka = { workspace = true }
reqwest = { workspace = true }
serde = { workspace = true }

View File

@ -1,7 +1,6 @@
pub mod config;
pub mod kafka_messages;
pub mod kafka_producer;
pub mod metrics;
pub mod pgqueue;
pub mod retry;
pub mod test;

View File

@ -24,3 +24,4 @@ time = { workspace = true }
tokio = { workspace = true }
tracing = { workspace = true }
tracing-subscriber = { workspace = true }
common-metrics = { path = "../common/metrics" }

View File

@ -9,8 +9,8 @@ use std::{str::FromStr, time::Duration};
use tokio::sync::Semaphore;
use webhooks::WebhookCleaner;
use common_metrics::setup_metrics_routes;
use hook_common::kafka_producer::create_kafka_producer;
use hook_common::metrics::setup_metrics_routes;
mod cleanup;
mod config;

Some files were not shown because too many files have changed in this diff.