diff --git a/.dockerignore b/.dockerignore index cbfb4765f0e..5ff39789018 100644 --- a/.dockerignore +++ b/.dockerignore @@ -39,3 +39,11 @@ !test-runner-jest.config.js !test-runner-jest-environment.js !patches +!rust +rust/.env +rust/.github +rust/docker +rust/target +rust/cyclotron-node/dist +rust/cyclotron-node/node_modules +rust/cyclotron-node/index.node diff --git a/.gitignore b/.gitignore index bbd2fef53bc..7470edb7eeb 100644 --- a/.gitignore +++ b/.gitignore @@ -64,3 +64,5 @@ plugin-transpiler/dist *-esbuild-bundle-visualization.html .dlt *.db +# Ignore any log files that happen to be present +*.log \ No newline at end of file diff --git a/bin/start-cyclotron b/bin/start-cyclotron new file mode 100755 index 00000000000..cb447f0f24b --- /dev/null +++ b/bin/start-cyclotron @@ -0,0 +1,24 @@ +#!/bin/bash + +set -ex + +trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM EXIT + +cd rust + +cargo build + +export RUST_LOG=${DEBUG:-debug} +SQLX_QUERY_LEVEL=${SQLX_QUERY_LEVEL:-warn} +export RUST_LOG=$RUST_LOG,sqlx::query=$SQLX_QUERY_LEVEL + +export DATABASE_URL=${DATABASE_URL:-postgres://posthog:posthog@localhost:5432/posthog} +export ALLOW_INTERNAL_IPS=${ALLOW_INTERNAL_IPS:-true} +cd cyclotron-core +cargo sqlx migrate run +cd .. + +./target/debug/cyclotron-fetch & +./target/debug/cyclotron-janitor & + +wait diff --git a/plugin-server/package.json b/plugin-server/package.json index 25df603b90e..e5442bf51b6 100644 --- a/plugin-server/package.json +++ b/plugin-server/package.json @@ -27,7 +27,9 @@ "services:start": "cd .. && docker compose -f docker-compose.dev.yml up", "services:stop": "cd .. && docker compose -f docker-compose.dev.yml down", "services:clean": "cd .. && docker compose -f docker-compose.dev.yml rm -v", - "services": "pnpm services:stop && pnpm services:clean && pnpm services:start" + "services": "pnpm services:stop && pnpm services:clean && pnpm services:start", + "build:cyclotron": "cd ../rust/cyclotron-node && pnpm run package", + "pnpm:devPreinstall": "pnpm run build:cyclotron" }, "graphile-worker": { "maxContiguousErrors": 300 @@ -86,7 +88,8 @@ "uuid": "^9.0.1", "v8-profiler-next": "^1.9.0", "vm2": "3.9.18", - "detect-browser": "^5.3.0" + "detect-browser": "^5.3.0", + "@posthog/cyclotron": "file:../rust/cyclotron-node" }, "devDependencies": { "0x": "^5.5.0", diff --git a/plugin-server/pnpm-lock.yaml b/plugin-server/pnpm-lock.yaml index c2c4bc52093..f242e25145a 100644 --- a/plugin-server/pnpm-lock.yaml +++ b/plugin-server/pnpm-lock.yaml @@ -43,6 +43,9 @@ dependencies: '@posthog/clickhouse': specifier: ^1.7.0 version: 1.7.0 + '@posthog/cyclotron': + specifier: file:../rust/cyclotron-node + version: file:../rust/cyclotron-node '@posthog/hogvm': specifier: ^1.0.32 version: 1.0.32(luxon@3.4.4)(re2@1.20.3) @@ -10731,3 +10734,8 @@ packages: /yocto-queue@0.1.0: resolution: {integrity: sha512-rVksvsnNCdJ/ohGc6xgPwyN8eheCxsiLM8mxuE/t/mOVqJewPuO1miLpTHQiRgTKCLexL4MeAFVagts7HmNZ2Q==} engines: {node: '>=10'} + + file:../rust/cyclotron-node: + resolution: {directory: ../rust/cyclotron-node, type: directory} + name: '@posthog/cyclotron' + dev: false diff --git a/plugin-server/src/capabilities.ts b/plugin-server/src/capabilities.ts index 11158a284b9..7b8c8461b78 100644 --- a/plugin-server/src/capabilities.ts +++ b/plugin-server/src/capabilities.ts @@ -26,6 +26,7 @@ export function getPluginServerCapabilities(config: PluginsServerConfig): Plugin cdpProcessedEvents: true, cdpFunctionCallbacks: true, cdpFunctionOverflow: true, + cdpCyclotronWorker: true, syncInlinePlugins: true, 
...sharedCapabilities, } @@ -108,6 +109,11 @@ export function getPluginServerCapabilities(config: PluginsServerConfig): Plugin cdpFunctionOverflow: true, ...sharedCapabilities, } + case PluginServerMode.cdp_cyclotron_worker: + return { + cdpCyclotronWorker: true, + ...sharedCapabilities, + } // This is only for functional tests, which time out if all capabilities are used // ideally we'd run just the specific capability needed per test, but that's not easy to do atm case PluginServerMode.functional_tests: diff --git a/plugin-server/src/cdp/async-function-executor.ts b/plugin-server/src/cdp/async-function-executor.ts index 93dd7e285cb..78a9374a781 100644 --- a/plugin-server/src/cdp/async-function-executor.ts +++ b/plugin-server/src/cdp/async-function-executor.ts @@ -1,3 +1,4 @@ +import cyclotron from '@posthog/cyclotron' import { Histogram } from 'prom-client' import { buildIntegerMatcher } from '../config/config' @@ -27,9 +28,11 @@ export type AsyncFunctionExecutorOptions = { export class AsyncFunctionExecutor { hogHookEnabledForTeams: ValueMatcher + cyclotronEnabledForTeams: ValueMatcher constructor(private serverConfig: PluginsServerConfig, private rustyHook: RustyHook) { this.hogHookEnabledForTeams = buildIntegerMatcher(serverConfig.CDP_ASYNC_FUNCTIONS_RUSTY_HOOK_TEAMS, true) + this.cyclotronEnabledForTeams = buildIntegerMatcher(serverConfig.CDP_ASYNC_FUNCTIONS_CYCLOTRON_TEAMS, true) } async execute( @@ -99,8 +102,44 @@ export class AsyncFunctionExecutor { histogramFetchPayloadSize.observe(body.length / 1024) } - // If the caller hasn't forced it to be synchronous and the team has the rustyhook enabled, enqueue it - if (!options?.sync && this.hogHookEnabledForTeams(request.teamId)) { + // If the caller hasn't forced it to be synchronous and the team has the cyclotron or + // rustyhook enabled, enqueue it in one of those services. + if (!options?.sync && this.cyclotronEnabledForTeams(request.teamId)) { + try { + await cyclotron.createJob({ + teamId: request.teamId, + functionId: request.hogFunctionId, + queueName: 'fetch', + // TODO: The async function compression changes happen upstream of this + // function. I guess we'll want to unwind that change because we actually + // want the `vmState` (and the rest of state) so we can put it into PG here. + vmState: '', + parameters: JSON.stringify({ + return_queue: 'hog', + url, + method, + headers, + body, + }), + metadata: JSON.stringify({ + // TODO: It seems like Fetch expects metadata to have this shape, which + // I don't understand. I think `metadata` is where all the other Hog + // state is going to be stored? For now I'm just trying to make fetch + // work. 
+ tries: 0, + trace: [], + }), + }) + } catch (e) { + status.error( + '🦔', + `[HogExecutor] Cyclotron failed to enqueue async fetch function, sending directly instead`, + { + error: e, + } + ) + } + } else if (!options?.sync && this.hogHookEnabledForTeams(request.teamId)) { const hoghooksPayload = JSON.stringify(request) histogramHogHooksPayloadSize.observe(hoghooksPayload.length / 1024) diff --git a/plugin-server/src/cdp/cdp-consumers.ts b/plugin-server/src/cdp/cdp-consumers.ts index 89a5b3a3a59..fef401d4729 100644 --- a/plugin-server/src/cdp/cdp-consumers.ts +++ b/plugin-server/src/cdp/cdp-consumers.ts @@ -1,3 +1,4 @@ +import cyclotron from '@posthog/cyclotron' import { captureException } from '@sentry/node' import { features, librdkafkaVersion, Message } from 'node-rdkafka' import { Counter, Histogram } from 'prom-client' @@ -443,7 +444,12 @@ abstract class CdpConsumerBase { const globalConnectionConfig = createRdConnectionConfigFromEnvVars(this.hub) const globalProducerConfig = createRdProducerConfigFromEnvVars(this.hub) - await Promise.all([this.hogFunctionManager.start()]) + await Promise.all([ + this.hogFunctionManager.start(), + this.hub.CYCLOTRON_DATABASE_URL + ? cyclotron.initManager({ shards: [{ dbUrl: this.hub.CYCLOTRON_DATABASE_URL }] }) + : Promise.resolve(), + ]) this.kafkaProducer = new KafkaProducerWrapper( await createKafkaProducer(globalConnectionConfig, globalProducerConfig) @@ -693,3 +699,57 @@ export class CdpOverflowConsumer extends CdpConsumerBase { return invocationGlobals } } + +// TODO: Split out non-Kafka specific parts of CdpConsumerBase so that it can be used by the +// Cyclotron worker below. Or maybe we can just wait, and rip the Kafka bits out once Cyclotron is +// shipped (and rename it something other than consomer, probably). For now, this is an easy way to +// use existing code and get an end-to-end demo shipped. +export class CdpCyclotronWorker extends CdpConsumerBase { + protected name = 'CdpCyclotronWorker' + protected topic = 'UNUSED-CdpCyclotronWorker' + protected consumerGroupId = 'UNUSED-CdpCyclotronWorker' + private runningWorker: Promise | undefined + private isUnhealthy = false + + public async _handleEachBatch(_: Message[]): Promise { + // Not called, we override `start` below to use Cyclotron instead. + } + + private async innerStart() { + try { + const limit = 100 // TODO: Make configurable. + while (!this.isStopping) { + const jobs = await cyclotron.dequeueJobsWithVmState('hog', limit) + for (const job of jobs) { + // TODO: Reassemble a HogFunctionInvocationAsyncResponse (or whatever proper type) + // from the fields on the job, and then execute the next Hog step. + console.log(job.id) + } + } + } catch (err) { + this.isUnhealthy = true + console.error('Error in Cyclotron worker', err) + throw err + } + } + + public async start() { + await cyclotron.initManager({ shards: [{ dbUrl: this.hub.CYCLOTRON_DATABASE_URL }] }) + await cyclotron.initWorker({ dbUrl: this.hub.CYCLOTRON_DATABASE_URL }) + + // Consumer `start` expects an async task is started, and not that `start` itself blocks + // indefinitely. 
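For orientation, a dequeue like `cyclotron.dequeueJobsWithVmState('hog', limit)` above ultimately runs the locked-batch UPDATE that appears in the `.sqlx` cache files later in this diff. A minimal sqlx sketch of that operation (the real wrapper lives in cyclotron-core and is not shown in this excerpt; it assumes an already-connected `PgPool`):

```rust
use sqlx::PgPool;
use uuid::Uuid;

// Sketch only: grab a batch of available jobs, stamp them with a fresh lock_id and
// flip them to 'running' in a single statement, skipping rows other workers hold.
async fn dequeue_ids(pool: &PgPool, queue: &str, limit: i64) -> Result<Vec<Uuid>, sqlx::Error> {
    let lock_id = Uuid::now_v7(); // must be presented on every later update to these jobs
    let rows: Vec<(Uuid,)> = sqlx::query_as(
        r#"
        WITH available AS (
            SELECT id FROM cyclotron_jobs
            WHERE state = 'available'::JobState AND queue_name = $1 AND scheduled <= NOW()
            ORDER BY priority ASC, scheduled ASC
            LIMIT $2
            FOR UPDATE SKIP LOCKED
        )
        UPDATE cyclotron_jobs
        SET state = 'running'::JobState, lock_id = $3, last_heartbeat = NOW(),
            last_transition = NOW(), transition_count = transition_count + 1
        FROM available
        WHERE cyclotron_jobs.id = available.id
        RETURNING cyclotron_jobs.id
        "#,
    )
    .bind(queue)
    .bind(limit)
    .bind(lock_id)
    .fetch_all(pool)
    .await?;
    Ok(rows.into_iter().map(|(id,)| id).collect())
}
```

In the TypeScript above, all of this stays hidden behind the `@posthog/cyclotron` binding; the worker loop only sees the returned job objects.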
+ this.runningWorker = this.innerStart() + + return Promise.resolve() + } + + public async stop() { + await super.stop() + await this.runningWorker + } + + public isHealthy() { + return !this.isUnhealthy + } +} diff --git a/plugin-server/src/config/config.ts index c3fea809c61..7de2856530e 100644 --- a/plugin-server/src/config/config.ts +++ b/plugin-server/src/config/config.ts @@ -187,9 +187,13 @@ export function getDefaultConfig(): PluginsServerConfig { CDP_WATCHER_REFILL_RATE: 10, CDP_WATCHER_DISABLED_TEMPORARY_MAX_COUNT: 3, CDP_ASYNC_FUNCTIONS_RUSTY_HOOK_TEAMS: '', + CDP_ASYNC_FUNCTIONS_CYCLOTRON_TEAMS: '', CDP_REDIS_PASSWORD: '', CDP_REDIS_HOST: '', CDP_REDIS_PORT: 6479, + + // Cyclotron + CYCLOTRON_DATABASE_URL: '', } } diff --git a/plugin-server/src/main/pluginsServer.ts index 0bcbf0e6359..3a7e8851774 100644 --- a/plugin-server/src/main/pluginsServer.ts +++ b/plugin-server/src/main/pluginsServer.ts @@ -11,7 +11,12 @@ import v8Profiler from 'v8-profiler-next' import { getPluginServerCapabilities } from '../capabilities' import { CdpApi } from '../cdp/cdp-api' -import { CdpFunctionCallbackConsumer, CdpOverflowConsumer, CdpProcessedEventsConsumer } from '../cdp/cdp-consumers' +import { + CdpCyclotronWorker, + CdpFunctionCallbackConsumer, + CdpOverflowConsumer, + CdpProcessedEventsConsumer, +} from '../cdp/cdp-consumers' import { defaultConfig, sessionRecordingConsumerConfig } from '../config/config' import { Hub, PluginServerCapabilities, PluginsServerConfig } from '../types' import { createHub, createKafkaClient, createKafkaProducerWrapper } from '../utils/db/hub' @@ -571,6 +576,17 @@ export async function startPluginsServer( healthChecks['cdp-overflow'] = () => consumer.isHealthy() ?? false } + if (capabilities.cdpCyclotronWorker) { + ;[hub, closeHub] = hub ? [hub, closeHub] : await createHub(serverConfig, capabilities) + if (hub.CYCLOTRON_DATABASE_URL) { + const worker = new CdpCyclotronWorker(hub) + await worker.start() + } else { + // This is a temporary solution until we *require* Cyclotron to be configured.
+ status.warn('💥', 'CYCLOTRON_DATABASE_URL is not set, not running Cyclotron worker') + } + } + if (capabilities.http) { const app = setupCommonRoutes(healthChecks, analyticsEventsIngestionConsumer) diff --git a/plugin-server/src/types.ts b/plugin-server/src/types.ts index 953d45a56bc..1d596f034d8 100644 --- a/plugin-server/src/types.ts +++ b/plugin-server/src/types.ts @@ -85,6 +85,7 @@ export enum PluginServerMode { cdp_processed_events = 'cdp-processed-events', cdp_function_callbacks = 'cdp-function-callbacks', cdp_function_overflow = 'cdp-function-overflow', + cdp_cyclotron_worker = 'cdp-cyclotron-worker', functional_tests = 'functional-tests', } @@ -107,6 +108,7 @@ export type CdpConfig = { CDP_WATCHER_DISABLED_TEMPORARY_TTL: number // How long a function should be temporarily disabled for CDP_WATCHER_DISABLED_TEMPORARY_MAX_COUNT: number // How many times a function can be disabled before it is disabled permanently CDP_ASYNC_FUNCTIONS_RUSTY_HOOK_TEAMS: string + CDP_ASYNC_FUNCTIONS_CYCLOTRON_TEAMS: string CDP_REDIS_HOST: string CDP_REDIS_PORT: number CDP_REDIS_PASSWORD: string @@ -279,6 +281,8 @@ export interface PluginsServerConfig extends CdpConfig { // kafka debug stats interval SESSION_RECORDING_KAFKA_CONSUMPTION_STATISTICS_EVENT_INTERVAL_MS: number + + CYCLOTRON_DATABASE_URL: string } export interface Hub extends PluginsServerConfig { @@ -345,6 +349,7 @@ export interface PluginServerCapabilities { cdpProcessedEvents?: boolean cdpFunctionCallbacks?: boolean cdpFunctionOverflow?: boolean + cdpCyclotronWorker?: boolean appManagementSingleton?: boolean preflightSchedules?: boolean // Used for instance health checks on hobby deploy, not useful on cloud http?: boolean diff --git a/plugin-server/tests/server.test.ts b/plugin-server/tests/server.test.ts index 52fe0b989bf..009416547b3 100644 --- a/plugin-server/tests/server.test.ts +++ b/plugin-server/tests/server.test.ts @@ -97,6 +97,7 @@ describe('server', () => { cdpProcessedEvents: true, cdpFunctionCallbacks: true, cdpFunctionOverflow: true, + cdpCyclotronWorker: true, syncInlinePlugins: true, } ) diff --git a/production.Dockerfile b/production.Dockerfile index b64293dcb69..07906afd9bc 100644 --- a/production.Dockerfile +++ b/production.Dockerfile @@ -38,11 +38,12 @@ COPY ./bin/ ./bin/ COPY babel.config.js tsconfig.json webpack.config.js tailwind.config.js ./ RUN pnpm build - # # --------------------------------------------------------- # -FROM node:18.19.1-bullseye-slim AS plugin-server-build +FROM ghcr.io/posthog/rust-node-container:bullseye_rust_1.80.1-node_18.19.1 AS plugin-server-build +WORKDIR /code +COPY ./rust ./rust WORKDIR /code/plugin-server SHELL ["/bin/bash", "-e", "-o", "pipefail", "-c"] @@ -182,6 +183,7 @@ COPY --from=plugin-server-build --chown=posthog:posthog /code/plugin-server/dist COPY --from=plugin-server-build --chown=posthog:posthog /code/plugin-server/node_modules /code/plugin-server/node_modules COPY --from=plugin-server-build --chown=posthog:posthog /code/plugin-server/package.json /code/plugin-server/package.json + # Copy the Python dependencies and Django staticfiles from the posthog-build stage. COPY --from=posthog-build --chown=posthog:posthog /code/staticfiles /code/staticfiles COPY --from=posthog-build --chown=posthog:posthog /python-runtime /python-runtime diff --git a/rust/.cargo/config.toml b/rust/.cargo/config.toml new file mode 100644 index 00000000000..7a657288d3f --- /dev/null +++ b/rust/.cargo/config.toml @@ -0,0 +1,4 @@ +[env] +# Force SQLX to run in offline mode for CI. 
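For context on the many `rust/.sqlx/query-*.json` files that follow: they are the offline "describe" cache that sqlx's checked macros compile against when `SQLX_OFFLINE=true`, regenerated with `cargo sqlx prepare` against a migrated database. A minimal sketch of a macro call that resolves entirely from that cache, using one of the cached janitor statements below:

```rust
use sqlx::PgPool;

// With SQLX_OFFLINE=true this compiles against the cached describe data in rust/.sqlx
// rather than a live database; at runtime it still needs a real connection pool.
async fn delete_completed_jobs(pool: &PgPool) -> Result<u64, sqlx::Error> {
    let result = sqlx::query!("DELETE FROM cyclotron_jobs WHERE state = 'completed'")
        .execute(pool)
        .await?;
    Ok(result.rows_affected())
}
```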
Devs can change this if they want, to live code against the DB, +# but we use it at the workspace level here to allow use of sqlx macros across all crates +SQLX_OFFLINE = "true" \ No newline at end of file diff --git a/rust/.sqlx/query-075421be22b51c50eb74ac1156175c285bc510766c175b1b8c4e4002e04ff503.json b/rust/.sqlx/query-075421be22b51c50eb74ac1156175c285bc510766c175b1b8c4e4002e04ff503.json new file mode 100644 index 00000000000..a2cb4e3a0a8 --- /dev/null +++ b/rust/.sqlx/query-075421be22b51c50eb74ac1156175c285bc510766c175b1b8c4e4002e04ff503.json @@ -0,0 +1,12 @@ +{ + "db_name": "PostgreSQL", + "query": "UPDATE cyclotron_jobs SET last_heartbeat = NOW() WHERE id = $1 AND lock_id = $2", + "describe": { + "columns": [], + "parameters": { + "Left": ["Uuid", "Uuid"] + }, + "nullable": [] + }, + "hash": "075421be22b51c50eb74ac1156175c285bc510766c175b1b8c4e4002e04ff503" +} diff --git a/rust/.sqlx/query-16d533b5a15b0b9926a181f578b5b577efe424710b45f02e1ddeece8bca96f87.json b/rust/.sqlx/query-16d533b5a15b0b9926a181f578b5b577efe424710b45f02e1ddeece8bca96f87.json new file mode 100644 index 00000000000..7a3a8b98d9d --- /dev/null +++ b/rust/.sqlx/query-16d533b5a15b0b9926a181f578b5b577efe424710b45f02e1ddeece8bca96f87.json @@ -0,0 +1,12 @@ +{ + "db_name": "PostgreSQL", + "query": "UPDATE cyclotron_jobs SET vm_state = $1 WHERE id = $2 AND lock_id = $3", + "describe": { + "columns": [], + "parameters": { + "Left": ["Text", "Uuid", "Uuid"] + }, + "nullable": [] + }, + "hash": "16d533b5a15b0b9926a181f578b5b577efe424710b45f02e1ddeece8bca96f87" +} diff --git a/rust/.sqlx/query-2b62adf40f8dd5758690c763df30fffa01b679951ec786f8ee2410454b9a2de0.json b/rust/.sqlx/query-2b62adf40f8dd5758690c763df30fffa01b679951ec786f8ee2410454b9a2de0.json new file mode 100644 index 00000000000..3c2761eccb0 --- /dev/null +++ b/rust/.sqlx/query-2b62adf40f8dd5758690c763df30fffa01b679951ec786f8ee2410454b9a2de0.json @@ -0,0 +1,12 @@ +{ + "db_name": "PostgreSQL", + "query": "UPDATE cyclotron_jobs SET queue_name = $1 WHERE id = $2 AND lock_id = $3", + "describe": { + "columns": [], + "parameters": { + "Left": ["Text", "Uuid", "Uuid"] + }, + "nullable": [] + }, + "hash": "2b62adf40f8dd5758690c763df30fffa01b679951ec786f8ee2410454b9a2de0" +} diff --git a/rust/.sqlx/query-2f6de0977357909dfd8d3d510c39a284f16421f77b77fe38e67143f28e270805.json b/rust/.sqlx/query-2f6de0977357909dfd8d3d510c39a284f16421f77b77fe38e67143f28e270805.json new file mode 100644 index 00000000000..b0e1ef22104 --- /dev/null +++ b/rust/.sqlx/query-2f6de0977357909dfd8d3d510c39a284f16421f77b77fe38e67143f28e270805.json @@ -0,0 +1,12 @@ +{ + "db_name": "PostgreSQL", + "query": "UPDATE cyclotron_jobs SET priority = $1 WHERE id = $2 AND lock_id = $3", + "describe": { + "columns": [], + "parameters": { + "Left": ["Int2", "Uuid", "Uuid"] + }, + "nullable": [] + }, + "hash": "2f6de0977357909dfd8d3d510c39a284f16421f77b77fe38e67143f28e270805" +} diff --git a/rust/.sqlx/query-350983ef271029734aff70eb7e298bfe578ecaa8678268863bce917ced9d5d46.json b/rust/.sqlx/query-350983ef271029734aff70eb7e298bfe578ecaa8678268863bce917ced9d5d46.json new file mode 100644 index 00000000000..d3a54ba7ef2 --- /dev/null +++ b/rust/.sqlx/query-350983ef271029734aff70eb7e298bfe578ecaa8678268863bce917ced9d5d46.json @@ -0,0 +1,117 @@ +{ + "db_name": "PostgreSQL", + "query": "\nWITH available AS (\n SELECT\n id,\n state\n FROM cyclotron_jobs\n WHERE\n state = 'available'::JobState\n AND queue_name = $1\n AND scheduled <= NOW()\n ORDER BY\n priority ASC,\n scheduled ASC\n LIMIT $2\n FOR UPDATE SKIP LOCKED\n)\nUPDATE 
cyclotron_jobs\nSET\n state = 'running'::JobState,\n lock_id = $3,\n last_heartbeat = NOW(),\n last_transition = NOW(),\n transition_count = transition_count + 1\nFROM available\nWHERE\n cyclotron_jobs.id = available.id\nRETURNING\n cyclotron_jobs.id,\n team_id,\n available.state as \"state: JobState\",\n queue_name,\n priority,\n function_id,\n created,\n last_transition,\n scheduled,\n transition_count,\n NULL as vm_state,\n metadata,\n parameters,\n lock_id,\n last_heartbeat,\n janitor_touch_count\n ", + "describe": { + "columns": [ + { + "ordinal": 0, + "name": "id", + "type_info": "Uuid" + }, + { + "ordinal": 1, + "name": "team_id", + "type_info": "Int4" + }, + { + "ordinal": 2, + "name": "state: JobState", + "type_info": { + "Custom": { + "name": "jobstate", + "kind": { + "Enum": ["available", "completed", "failed", "running", "paused"] + } + } + } + }, + { + "ordinal": 3, + "name": "queue_name", + "type_info": "Text" + }, + { + "ordinal": 4, + "name": "priority", + "type_info": "Int2" + }, + { + "ordinal": 5, + "name": "function_id", + "type_info": "Uuid" + }, + { + "ordinal": 6, + "name": "created", + "type_info": "Timestamptz" + }, + { + "ordinal": 7, + "name": "last_transition", + "type_info": "Timestamptz" + }, + { + "ordinal": 8, + "name": "scheduled", + "type_info": "Timestamptz" + }, + { + "ordinal": 9, + "name": "transition_count", + "type_info": "Int2" + }, + { + "ordinal": 10, + "name": "vm_state", + "type_info": "Text" + }, + { + "ordinal": 11, + "name": "metadata", + "type_info": "Text" + }, + { + "ordinal": 12, + "name": "parameters", + "type_info": "Text" + }, + { + "ordinal": 13, + "name": "lock_id", + "type_info": "Uuid" + }, + { + "ordinal": 14, + "name": "last_heartbeat", + "type_info": "Timestamptz" + }, + { + "ordinal": 15, + "name": "janitor_touch_count", + "type_info": "Int2" + } + ], + "parameters": { + "Left": ["Text", "Int8", "Uuid"] + }, + "nullable": [ + false, + false, + false, + false, + false, + true, + false, + false, + false, + false, + null, + true, + true, + true, + true, + false + ] + }, + "hash": "350983ef271029734aff70eb7e298bfe578ecaa8678268863bce917ced9d5d46" +} diff --git a/rust/.sqlx/query-54d9afe6952f92b753fbce2c4e8554065b71152389f98d35532c6b332d5a4c9d.json b/rust/.sqlx/query-54d9afe6952f92b753fbce2c4e8554065b71152389f98d35532c6b332d5a4c9d.json new file mode 100644 index 00000000000..2ff58c66714 --- /dev/null +++ b/rust/.sqlx/query-54d9afe6952f92b753fbce2c4e8554065b71152389f98d35532c6b332d5a4c9d.json @@ -0,0 +1,12 @@ +{ + "db_name": "PostgreSQL", + "query": "\nWITH stalled AS (\n SELECT id FROM cyclotron_jobs WHERE state = 'running' AND COALESCE(last_heartbeat, $1) <= $1 FOR UPDATE SKIP LOCKED\n)\nUPDATE cyclotron_jobs\nSET state = 'available', lock_id = NULL, last_heartbeat = NULL, janitor_touch_count = janitor_touch_count + 1\nFROM stalled\nWHERE cyclotron_jobs.id = stalled.id\n ", + "describe": { + "columns": [], + "parameters": { + "Left": ["Timestamptz"] + }, + "nullable": [] + }, + "hash": "54d9afe6952f92b753fbce2c4e8554065b71152389f98d35532c6b332d5a4c9d" +} diff --git a/rust/.sqlx/query-7217e766aeb53298238222c0c71a2ce446cac731845c53cb926fc47ace708dd6.json b/rust/.sqlx/query-7217e766aeb53298238222c0c71a2ce446cac731845c53cb926fc47ace708dd6.json new file mode 100644 index 00000000000..230374e98d6 --- /dev/null +++ b/rust/.sqlx/query-7217e766aeb53298238222c0c71a2ce446cac731845c53cb926fc47ace708dd6.json @@ -0,0 +1,30 @@ +{ + "db_name": "PostgreSQL", + "query": "\nINSERT INTO cyclotron_jobs\n (\n id,\n team_id,\n function_id,\n created,\n 
lock_id,\n last_heartbeat,\n janitor_touch_count,\n transition_count,\n last_transition,\n queue_name,\n state,\n scheduled,\n priority,\n vm_state,\n metadata,\n parameters\n )\nVALUES\n ($1, $2, $3, NOW(), NULL, NULL, 0, 0, NOW(), $4, $5, $6, $7, $8, $9, $10)\n ", + "describe": { + "columns": [], + "parameters": { + "Left": [ + "Uuid", + "Int4", + "Uuid", + "Text", + { + "Custom": { + "name": "jobstate", + "kind": { + "Enum": ["available", "completed", "failed", "running", "paused"] + } + } + }, + "Timestamptz", + "Int2", + "Text", + "Text", + "Text" + ] + }, + "nullable": [] + }, + "hash": "7217e766aeb53298238222c0c71a2ce446cac731845c53cb926fc47ace708dd6" +} diff --git a/rust/.sqlx/query-884da9767d2992c7b279b4f8df5129b8c4d7020b7cb1999702aee1bfb1087efb.json b/rust/.sqlx/query-884da9767d2992c7b279b4f8df5129b8c4d7020b7cb1999702aee1bfb1087efb.json new file mode 100644 index 00000000000..b728d398568 --- /dev/null +++ b/rust/.sqlx/query-884da9767d2992c7b279b4f8df5129b8c4d7020b7cb1999702aee1bfb1087efb.json @@ -0,0 +1,12 @@ +{ + "db_name": "PostgreSQL", + "query": "UPDATE cyclotron_jobs SET lock_id = NULL, last_heartbeat = NULL WHERE id = $1 AND lock_id = $2", + "describe": { + "columns": [], + "parameters": { + "Left": ["Uuid", "Uuid"] + }, + "nullable": [] + }, + "hash": "884da9767d2992c7b279b4f8df5129b8c4d7020b7cb1999702aee1bfb1087efb" +} diff --git a/rust/.sqlx/query-8ab11a89bc4720985e130c58021f46045c332cc45e69b08708b289cc933b3b5c.json b/rust/.sqlx/query-8ab11a89bc4720985e130c58021f46045c332cc45e69b08708b289cc933b3b5c.json new file mode 100644 index 00000000000..8c3a3dbde8b --- /dev/null +++ b/rust/.sqlx/query-8ab11a89bc4720985e130c58021f46045c332cc45e69b08708b289cc933b3b5c.json @@ -0,0 +1,12 @@ +{ + "db_name": "PostgreSQL", + "query": "UPDATE cyclotron_jobs SET metadata = $1 WHERE id = $2 AND lock_id = $3", + "describe": { + "columns": [], + "parameters": { + "Left": ["Text", "Uuid", "Uuid"] + }, + "nullable": [] + }, + "hash": "8ab11a89bc4720985e130c58021f46045c332cc45e69b08708b289cc933b3b5c" +} diff --git a/rust/.sqlx/query-98da1f12285a97a47ce88535c82a8c4eb4cb910b0c5ddbc6bdbd9156af7b4e23.json b/rust/.sqlx/query-98da1f12285a97a47ce88535c82a8c4eb4cb910b0c5ddbc6bdbd9156af7b4e23.json new file mode 100644 index 00000000000..59a56c441cb --- /dev/null +++ b/rust/.sqlx/query-98da1f12285a97a47ce88535c82a8c4eb4cb910b0c5ddbc6bdbd9156af7b4e23.json @@ -0,0 +1,12 @@ +{ + "db_name": "PostgreSQL", + "query": "UPDATE cyclotron_jobs SET scheduled = $1 WHERE id = $2 AND lock_id = $3", + "describe": { + "columns": [], + "parameters": { + "Left": ["Timestamptz", "Uuid", "Uuid"] + }, + "nullable": [] + }, + "hash": "98da1f12285a97a47ce88535c82a8c4eb4cb910b0c5ddbc6bdbd9156af7b4e23" +} diff --git a/rust/.sqlx/query-aa595eaf28c1f4b872c278be407b59cc00f3125413f4032ac3647a6b5ee1a632.json b/rust/.sqlx/query-aa595eaf28c1f4b872c278be407b59cc00f3125413f4032ac3647a6b5ee1a632.json new file mode 100644 index 00000000000..bd8a7cdd902 --- /dev/null +++ b/rust/.sqlx/query-aa595eaf28c1f4b872c278be407b59cc00f3125413f4032ac3647a6b5ee1a632.json @@ -0,0 +1,18 @@ +{ + "db_name": "PostgreSQL", + "query": "SELECT vm_state FROM cyclotron_jobs WHERE id = $1 AND lock_id = $2", + "describe": { + "columns": [ + { + "ordinal": 0, + "name": "vm_state", + "type_info": "Text" + } + ], + "parameters": { + "Left": ["Uuid", "Uuid"] + }, + "nullable": [true] + }, + "hash": "aa595eaf28c1f4b872c278be407b59cc00f3125413f4032ac3647a6b5ee1a632" +} diff --git a/rust/.sqlx/query-b160b785a0377b854341105e99e4ef7a5da523e168a5f9be47f6caaef09487d7.json 
b/rust/.sqlx/query-b160b785a0377b854341105e99e4ef7a5da523e168a5f9be47f6caaef09487d7.json new file mode 100644 index 00000000000..ea9c7f8fceb --- /dev/null +++ b/rust/.sqlx/query-b160b785a0377b854341105e99e4ef7a5da523e168a5f9be47f6caaef09487d7.json @@ -0,0 +1,12 @@ +{ + "db_name": "PostgreSQL", + "query": "UPDATE cyclotron_jobs SET parameters = $1 WHERE id = $2 AND lock_id = $3", + "describe": { + "columns": [], + "parameters": { + "Left": ["Text", "Uuid", "Uuid"] + }, + "nullable": [] + }, + "hash": "b160b785a0377b854341105e99e4ef7a5da523e168a5f9be47f6caaef09487d7" +} diff --git a/rust/.sqlx/query-b3239c1dde9a88769ec488299612f7a3ad2b2ee57d8d2353c858299d7b6ffe13.json b/rust/.sqlx/query-b3239c1dde9a88769ec488299612f7a3ad2b2ee57d8d2353c858299d7b6ffe13.json new file mode 100644 index 00000000000..d2942f91b19 --- /dev/null +++ b/rust/.sqlx/query-b3239c1dde9a88769ec488299612f7a3ad2b2ee57d8d2353c858299d7b6ffe13.json @@ -0,0 +1,23 @@ +{ + "db_name": "PostgreSQL", + "query": "UPDATE cyclotron_jobs\n SET state = $1, last_transition = NOW(), transition_count = transition_count + 1\n WHERE id = $2 AND lock_id = $3", + "describe": { + "columns": [], + "parameters": { + "Left": [ + { + "Custom": { + "name": "jobstate", + "kind": { + "Enum": ["available", "completed", "failed", "running", "paused"] + } + } + }, + "Uuid", + "Uuid" + ] + }, + "nullable": [] + }, + "hash": "b3239c1dde9a88769ec488299612f7a3ad2b2ee57d8d2353c858299d7b6ffe13" +} diff --git a/rust/.sqlx/query-c624261597b9356ff3e7c3e392a84bb0b551e91c503e8b21c29814f1eb660a8e.json b/rust/.sqlx/query-c624261597b9356ff3e7c3e392a84bb0b551e91c503e8b21c29814f1eb660a8e.json new file mode 100644 index 00000000000..b94965873e7 --- /dev/null +++ b/rust/.sqlx/query-c624261597b9356ff3e7c3e392a84bb0b551e91c503e8b21c29814f1eb660a8e.json @@ -0,0 +1,117 @@ +{ + "db_name": "PostgreSQL", + "query": "\nWITH available AS (\n SELECT\n id,\n state\n FROM cyclotron_jobs\n WHERE\n state = 'available'::JobState\n AND queue_name = $1\n AND scheduled <= NOW()\n ORDER BY\n priority ASC,\n scheduled ASC\n LIMIT $2\n FOR UPDATE SKIP LOCKED\n)\nUPDATE cyclotron_jobs\nSET\n state = 'running'::JobState,\n lock_id = $3,\n last_heartbeat = NOW(),\n last_transition = NOW(),\n transition_count = transition_count + 1\nFROM available\nWHERE\n cyclotron_jobs.id = available.id\nRETURNING\n cyclotron_jobs.id,\n team_id,\n available.state as \"state: JobState\",\n queue_name,\n priority,\n function_id,\n created,\n last_transition,\n scheduled,\n transition_count,\n vm_state,\n metadata,\n parameters,\n lock_id,\n last_heartbeat,\n janitor_touch_count\n ", + "describe": { + "columns": [ + { + "ordinal": 0, + "name": "id", + "type_info": "Uuid" + }, + { + "ordinal": 1, + "name": "team_id", + "type_info": "Int4" + }, + { + "ordinal": 2, + "name": "state: JobState", + "type_info": { + "Custom": { + "name": "jobstate", + "kind": { + "Enum": ["available", "completed", "failed", "running", "paused"] + } + } + } + }, + { + "ordinal": 3, + "name": "queue_name", + "type_info": "Text" + }, + { + "ordinal": 4, + "name": "priority", + "type_info": "Int2" + }, + { + "ordinal": 5, + "name": "function_id", + "type_info": "Uuid" + }, + { + "ordinal": 6, + "name": "created", + "type_info": "Timestamptz" + }, + { + "ordinal": 7, + "name": "last_transition", + "type_info": "Timestamptz" + }, + { + "ordinal": 8, + "name": "scheduled", + "type_info": "Timestamptz" + }, + { + "ordinal": 9, + "name": "transition_count", + "type_info": "Int2" + }, + { + "ordinal": 10, + "name": "vm_state", + "type_info": "Text" + 
}, + { + "ordinal": 11, + "name": "metadata", + "type_info": "Text" + }, + { + "ordinal": 12, + "name": "parameters", + "type_info": "Text" + }, + { + "ordinal": 13, + "name": "lock_id", + "type_info": "Uuid" + }, + { + "ordinal": 14, + "name": "last_heartbeat", + "type_info": "Timestamptz" + }, + { + "ordinal": 15, + "name": "janitor_touch_count", + "type_info": "Int2" + } + ], + "parameters": { + "Left": ["Text", "Int8", "Uuid"] + }, + "nullable": [ + false, + false, + false, + false, + false, + true, + false, + false, + false, + false, + true, + true, + true, + true, + true, + false + ] + }, + "hash": "c624261597b9356ff3e7c3e392a84bb0b551e91c503e8b21c29814f1eb660a8e" +} diff --git a/rust/.sqlx/query-f4e808f58dd290c6e2b49b63e9e0eb022936ba318021512a0cc0c2e0766abe7c.json b/rust/.sqlx/query-f4e808f58dd290c6e2b49b63e9e0eb022936ba318021512a0cc0c2e0766abe7c.json new file mode 100644 index 00000000000..a585e9f7e7d --- /dev/null +++ b/rust/.sqlx/query-f4e808f58dd290c6e2b49b63e9e0eb022936ba318021512a0cc0c2e0766abe7c.json @@ -0,0 +1,12 @@ +{ + "db_name": "PostgreSQL", + "query": "DELETE FROM cyclotron_jobs WHERE state = 'completed'", + "describe": { + "columns": [], + "parameters": { + "Left": [] + }, + "nullable": [] + }, + "hash": "f4e808f58dd290c6e2b49b63e9e0eb022936ba318021512a0cc0c2e0766abe7c" +} diff --git a/rust/.sqlx/query-fdda5a80f5495f2d4b15ce1a0963f990986c8b8433f01e449fbd1eee70ce6aeb.json b/rust/.sqlx/query-fdda5a80f5495f2d4b15ce1a0963f990986c8b8433f01e449fbd1eee70ce6aeb.json new file mode 100644 index 00000000000..09fc24b340d --- /dev/null +++ b/rust/.sqlx/query-fdda5a80f5495f2d4b15ce1a0963f990986c8b8433f01e449fbd1eee70ce6aeb.json @@ -0,0 +1,12 @@ +{ + "db_name": "PostgreSQL", + "query": "\nDELETE FROM cyclotron_jobs WHERE state = 'running' AND COALESCE(last_heartbeat, $1) <= $1 AND janitor_touch_count >= $2\n ", + "describe": { + "columns": [], + "parameters": { + "Left": ["Timestamptz", "Int2"] + }, + "nullable": [] + }, + "hash": "fdda5a80f5495f2d4b15ce1a0963f990986c8b8433f01e449fbd1eee70ce6aeb" +} diff --git a/rust/.sqlx/query-ffb66bdedf6506f95b9293ef88b0c51e2f5fb7d3271e1287165d2a35b6aaa25e.json b/rust/.sqlx/query-ffb66bdedf6506f95b9293ef88b0c51e2f5fb7d3271e1287165d2a35b6aaa25e.json new file mode 100644 index 00000000000..605d79d57c0 --- /dev/null +++ b/rust/.sqlx/query-ffb66bdedf6506f95b9293ef88b0c51e2f5fb7d3271e1287165d2a35b6aaa25e.json @@ -0,0 +1,12 @@ +{ + "db_name": "PostgreSQL", + "query": "DELETE FROM cyclotron_jobs WHERE state = 'failed'", + "describe": { + "columns": [], + "parameters": { + "Left": [] + }, + "nullable": [] + }, + "hash": "ffb66bdedf6506f95b9293ef88b0c51e2f5fb7d3271e1287165d2a35b6aaa25e" +} diff --git a/rust/Cargo.lock b/rust/Cargo.lock index 923fbc29286..a9f87dc07b0 100644 --- a/rust/Cargo.lock +++ b/rust/Cargo.lock @@ -673,6 +673,7 @@ dependencies = [ "iana-time-zone", "js-sys", "num-traits", + "serde", "wasm-bindgen", "windows-targets 0.52.0", ] @@ -700,6 +701,25 @@ dependencies = [ "tokio-util", ] +[[package]] +name = "common-dns" +version = "0.1.0" +dependencies = [ + "futures", + "reqwest 0.12.3", + "tokio", +] + +[[package]] +name = "common-metrics" +version = "0.1.0" +dependencies = [ + "axum 0.7.5", + "metrics", + "metrics-exporter-prometheus", + "tokio", +] + [[package]] name = "concurrent-queue" version = "2.5.0" @@ -819,6 +839,80 @@ dependencies = [ "typenum", ] +[[package]] +name = "cyclotron-core" +version = "0.1.0" +dependencies = [ + "chrono", + "futures", + "rand", + "serde", + "sqlx", + "thiserror", + "tokio", + "uuid", +] + +[[package]] 
+name = "cyclotron-fetch" +version = "0.1.0" +dependencies = [ + "axum 0.7.5", + "chrono", + "common-dns", + "common-metrics", + "cyclotron-core", + "envconfig", + "futures", + "health", + "http 1.1.0", + "httpmock", + "metrics", + "rand", + "reqwest 0.12.3", + "serde", + "serde_json", + "sqlx", + "thiserror", + "tokio", + "tracing", + "tracing-subscriber", + "uuid", +] + +[[package]] +name = "cyclotron-janitor" +version = "0.1.0" +dependencies = [ + "axum 0.7.5", + "chrono", + "common-metrics", + "cyclotron-core", + "envconfig", + "eyre", + "health", + "metrics", + "sqlx", + "tokio", + "tracing", + "tracing-subscriber", + "uuid", +] + +[[package]] +name = "cyclotron-node" +version = "0.1.0" +dependencies = [ + "chrono", + "cyclotron-core", + "neon", + "once_cell", + "serde", + "serde_json", + "tokio", + "uuid", +] + [[package]] name = "dashmap" version = "5.5.3" @@ -1468,6 +1562,7 @@ name = "hook-api" version = "0.1.0" dependencies = [ "axum 0.7.5", + "common-metrics", "envconfig", "eyre", "hook-common", @@ -1489,13 +1584,10 @@ name = "hook-common" version = "0.1.0" dependencies = [ "async-trait", - "axum 0.7.5", "chrono", "envconfig", "health", "http 1.1.0", - "metrics", - "metrics-exporter-prometheus", "rdkafka", "reqwest 0.12.3", "serde", @@ -1514,6 +1606,7 @@ version = "0.1.0" dependencies = [ "async-trait", "axum 0.7.5", + "common-metrics", "envconfig", "eyre", "futures", @@ -1537,6 +1630,8 @@ version = "0.1.0" dependencies = [ "axum 0.7.5", "chrono", + "common-dns", + "common-metrics", "envconfig", "futures", "health", @@ -1944,6 +2039,16 @@ version = "0.2.153" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" +[[package]] +name = "libloading" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4979f22fdb869068da03c9f7528f8297c6fd2606bc3a4affe42e6a823fdb8da4" +dependencies = [ + "cfg-if", + "windows-targets 0.52.0", +] + [[package]] name = "libm" version = "0.2.8" @@ -2160,6 +2265,32 @@ dependencies = [ "tempfile", ] +[[package]] +name = "neon" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d75440242411c87dc39847b0e33e961ec1f10326a9d8ecf9c1ea64a3b3c13dc" +dependencies = [ + "getrandom", + "libloading", + "neon-macros", + "once_cell", + "semver", + "send_wrapper", + "smallvec", +] + +[[package]] +name = "neon-macros" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c6813fde79b646e47e7ad75f480aa80ef76a5d9599e2717407961531169ee38b" +dependencies = [ + "quote", + "syn 2.0.48", + "syn-mid", +] + [[package]] name = "new_debug_unreachable" version = "1.0.6" @@ -3181,6 +3312,18 @@ dependencies = [ "libc", ] +[[package]] +name = "semver" +version = "1.0.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61697e0a1c7e512e84a621326239844a24d8207b4669b41bc18b32ea5cbf988b" + +[[package]] +name = "send_wrapper" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd0b0ec5f1c1ca621c432a25813d8d60c88abe6d3e08a3eb9cf37d97a0fe3d73" + [[package]] name = "serde" version = "1.0.196" @@ -3660,6 +3803,17 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "syn-mid" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5dc35bb08dd1ca3dfb09dce91fd2d13294d6711c88897d9a9d60acf39bce049" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.48", 
+] + [[package]] name = "sync_wrapper" version = "0.1.2" diff --git a/rust/Cargo.toml b/rust/Cargo.toml index eae4b047f3d..712a5099b5b 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -4,11 +4,17 @@ resolver = "2" members = [ "capture", "common/health", + "common/metrics", + "common/dns", "feature-flags", "hook-api", "hook-common", "hook-janitor", "hook-worker", + "cyclotron-core", + "cyclotron-node", + "cyclotron-janitor", + "cyclotron-fetch", ] [workspace.lints.rust] @@ -34,7 +40,7 @@ axum = { version = "0.7.5", features = ["http2", "macros", "matched-path"] } axum-client-ip = "0.6.0" base64 = "0.22.0" bytes = "1" -chrono = { version = "0.4" } +chrono = { version = "0.4", features = ["default", "serde"]} envconfig = "0.10.0" eyre = "0.6.9" flate2 = "1.0" @@ -80,3 +86,4 @@ tracing-opentelemetry = "0.23.0" tracing-subscriber = { version = "0.3.18", features = ["env-filter"] } url = { version = "2.5.0 " } uuid = { version = "1.6.1", features = ["v7", "serde"] } +neon = "1" \ No newline at end of file diff --git a/rust/Dockerfile b/rust/Dockerfile index a6c59b11a0e..a19bd3a74ae 100644 --- a/rust/Dockerfile +++ b/rust/Dockerfile @@ -1,4 +1,4 @@ -FROM docker.io/lukemathwalker/cargo-chef:latest-rust-1.77-bookworm AS chef +FROM docker.io/lukemathwalker/cargo-chef:latest-rust-1.80.1-bookworm AS chef ARG BIN WORKDIR /app diff --git a/rust/common/dns/Cargo.toml b/rust/common/dns/Cargo.toml new file mode 100644 index 00000000000..b67a2d04e63 --- /dev/null +++ b/rust/common/dns/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "common-dns" +version = "0.1.0" +edition = "2021" + +[lints] +workspace = true + +[dependencies] +futures = { workspace = true } +reqwest = { workspace = true } +tokio = { workspace = true } \ No newline at end of file diff --git a/rust/hook-worker/src/dns.rs b/rust/common/dns/src/lib.rs similarity index 98% rename from rust/hook-worker/src/dns.rs rename to rust/common/dns/src/lib.rs index 36fd7a00539..12047437257 100644 --- a/rust/hook-worker/src/dns.rs +++ b/rust/common/dns/src/lib.rs @@ -86,7 +86,7 @@ impl Resolve for PublicIPv4Resolver { #[cfg(test)] mod tests { - use crate::dns::{NoPublicIPv4Error, PublicIPv4Resolver}; + use crate::{NoPublicIPv4Error, PublicIPv4Resolver}; use reqwest::dns::{Name, Resolve}; use std::str::FromStr; diff --git a/rust/common/health/src/lib.rs b/rust/common/health/src/lib.rs index 5d42bafa8ff..4a1370e5509 100644 --- a/rust/common/health/src/lib.rs +++ b/rust/common/health/src/lib.rs @@ -4,7 +4,7 @@ use std::sync::{Arc, RwLock}; use axum::http::StatusCode; use axum::response::{IntoResponse, Response}; -use time::Duration; +use std::time::Duration; use tokio::sync::mpsc; use tracing::{info, warn}; @@ -143,7 +143,16 @@ impl HealthRegistry { /// Registers a new component in the registry. The returned handle should be passed /// to the component, to allow it to frequently report its health status. 
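The hunk just below loosens `register` so the deadline can be anything convertible into a `std::time::Duration`, letting callers keep whichever duration type they already have. A small usage sketch (assuming a `HealthRegistry` from common/health and the workspace's `time` crate; the component names are just illustrative):

```rust
use health::HealthRegistry;

// Sketch: both calls compile against the generic signature introduced below.
// A negative time::Duration would fail the TryInto and hit the panic noted in the code.
async fn register_cyclotron_components(registry: &HealthRegistry) {
    let _fetch = registry
        .register("cyclotron-fetch".to_string(), std::time::Duration::from_secs(30))
        .await;
    let _janitor = registry
        .register("cyclotron-janitor".to_string(), time::Duration::seconds(30))
        .await;
}
```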
- pub async fn register(&self, component: String, deadline: Duration) -> HealthHandle { + pub async fn register(&self, component: String, deadline: D) -> HealthHandle + where + // HACK: to let callers user time::Duration or std::time::Duration (and therefore chrono::Duration), + // since apparently we use all three + D: TryInto, + { + let Ok(deadline) = deadline.try_into() else { + // TODO - I should return an error here, but I don't want to refactor everything that uses this right now + panic!("invalid deadline") + }; let handle = HealthHandle { component, deadline, diff --git a/rust/common/metrics/Cargo.toml b/rust/common/metrics/Cargo.toml new file mode 100644 index 00000000000..14ed059df0f --- /dev/null +++ b/rust/common/metrics/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "common-metrics" +version = "0.1.0" +edition = "2021" + +[lints] +workspace = true + +[dependencies] +axum = { workspace = true } +metrics-exporter-prometheus = { workspace = true } +tokio = { workspace = true } +metrics = { workspace = true } \ No newline at end of file diff --git a/rust/common/metrics/README.md b/rust/common/metrics/README.md new file mode 100644 index 00000000000..4788321ecdd --- /dev/null +++ b/rust/common/metrics/README.md @@ -0,0 +1 @@ +Ripped from rusty-hook, since it'll be used across more or less all cyclotron stuff, as well as rustyhook \ No newline at end of file diff --git a/rust/hook-common/src/metrics.rs b/rust/common/metrics/src/lib.rs similarity index 100% rename from rust/hook-common/src/metrics.rs rename to rust/common/metrics/src/lib.rs diff --git a/rust/cyclotron-core/.sqlx/query-075421be22b51c50eb74ac1156175c285bc510766c175b1b8c4e4002e04ff503.json b/rust/cyclotron-core/.sqlx/query-075421be22b51c50eb74ac1156175c285bc510766c175b1b8c4e4002e04ff503.json new file mode 100644 index 00000000000..a2cb4e3a0a8 --- /dev/null +++ b/rust/cyclotron-core/.sqlx/query-075421be22b51c50eb74ac1156175c285bc510766c175b1b8c4e4002e04ff503.json @@ -0,0 +1,12 @@ +{ + "db_name": "PostgreSQL", + "query": "UPDATE cyclotron_jobs SET last_heartbeat = NOW() WHERE id = $1 AND lock_id = $2", + "describe": { + "columns": [], + "parameters": { + "Left": ["Uuid", "Uuid"] + }, + "nullable": [] + }, + "hash": "075421be22b51c50eb74ac1156175c285bc510766c175b1b8c4e4002e04ff503" +} diff --git a/rust/cyclotron-core/.sqlx/query-16d533b5a15b0b9926a181f578b5b577efe424710b45f02e1ddeece8bca96f87.json b/rust/cyclotron-core/.sqlx/query-16d533b5a15b0b9926a181f578b5b577efe424710b45f02e1ddeece8bca96f87.json new file mode 100644 index 00000000000..7a3a8b98d9d --- /dev/null +++ b/rust/cyclotron-core/.sqlx/query-16d533b5a15b0b9926a181f578b5b577efe424710b45f02e1ddeece8bca96f87.json @@ -0,0 +1,12 @@ +{ + "db_name": "PostgreSQL", + "query": "UPDATE cyclotron_jobs SET vm_state = $1 WHERE id = $2 AND lock_id = $3", + "describe": { + "columns": [], + "parameters": { + "Left": ["Text", "Uuid", "Uuid"] + }, + "nullable": [] + }, + "hash": "16d533b5a15b0b9926a181f578b5b577efe424710b45f02e1ddeece8bca96f87" +} diff --git a/rust/cyclotron-core/.sqlx/query-213e9d70e145a01fb42d5c3a80f9126073113a4af03c4c9fd3a81004d898f883.json b/rust/cyclotron-core/.sqlx/query-213e9d70e145a01fb42d5c3a80f9126073113a4af03c4c9fd3a81004d898f883.json new file mode 100644 index 00000000000..f9150cfcda3 --- /dev/null +++ b/rust/cyclotron-core/.sqlx/query-213e9d70e145a01fb42d5c3a80f9126073113a4af03c4c9fd3a81004d898f883.json @@ -0,0 +1,18 @@ +{ + "db_name": "PostgreSQL", + "query": "SELECT COUNT(*) FROM cyclotron_jobs WHERE state = 'available' AND scheduled <= NOW()", + 
"describe": { + "columns": [ + { + "ordinal": 0, + "name": "count", + "type_info": "Int8" + } + ], + "parameters": { + "Left": [] + }, + "nullable": [null] + }, + "hash": "213e9d70e145a01fb42d5c3a80f9126073113a4af03c4c9fd3a81004d898f883" +} diff --git a/rust/cyclotron-core/.sqlx/query-2b62adf40f8dd5758690c763df30fffa01b679951ec786f8ee2410454b9a2de0.json b/rust/cyclotron-core/.sqlx/query-2b62adf40f8dd5758690c763df30fffa01b679951ec786f8ee2410454b9a2de0.json new file mode 100644 index 00000000000..3c2761eccb0 --- /dev/null +++ b/rust/cyclotron-core/.sqlx/query-2b62adf40f8dd5758690c763df30fffa01b679951ec786f8ee2410454b9a2de0.json @@ -0,0 +1,12 @@ +{ + "db_name": "PostgreSQL", + "query": "UPDATE cyclotron_jobs SET queue_name = $1 WHERE id = $2 AND lock_id = $3", + "describe": { + "columns": [], + "parameters": { + "Left": ["Text", "Uuid", "Uuid"] + }, + "nullable": [] + }, + "hash": "2b62adf40f8dd5758690c763df30fffa01b679951ec786f8ee2410454b9a2de0" +} diff --git a/rust/cyclotron-core/.sqlx/query-2f6de0977357909dfd8d3d510c39a284f16421f77b77fe38e67143f28e270805.json b/rust/cyclotron-core/.sqlx/query-2f6de0977357909dfd8d3d510c39a284f16421f77b77fe38e67143f28e270805.json new file mode 100644 index 00000000000..b0e1ef22104 --- /dev/null +++ b/rust/cyclotron-core/.sqlx/query-2f6de0977357909dfd8d3d510c39a284f16421f77b77fe38e67143f28e270805.json @@ -0,0 +1,12 @@ +{ + "db_name": "PostgreSQL", + "query": "UPDATE cyclotron_jobs SET priority = $1 WHERE id = $2 AND lock_id = $3", + "describe": { + "columns": [], + "parameters": { + "Left": ["Int2", "Uuid", "Uuid"] + }, + "nullable": [] + }, + "hash": "2f6de0977357909dfd8d3d510c39a284f16421f77b77fe38e67143f28e270805" +} diff --git a/rust/cyclotron-core/.sqlx/query-350983ef271029734aff70eb7e298bfe578ecaa8678268863bce917ced9d5d46.json b/rust/cyclotron-core/.sqlx/query-350983ef271029734aff70eb7e298bfe578ecaa8678268863bce917ced9d5d46.json new file mode 100644 index 00000000000..d3a54ba7ef2 --- /dev/null +++ b/rust/cyclotron-core/.sqlx/query-350983ef271029734aff70eb7e298bfe578ecaa8678268863bce917ced9d5d46.json @@ -0,0 +1,117 @@ +{ + "db_name": "PostgreSQL", + "query": "\nWITH available AS (\n SELECT\n id,\n state\n FROM cyclotron_jobs\n WHERE\n state = 'available'::JobState\n AND queue_name = $1\n AND scheduled <= NOW()\n ORDER BY\n priority ASC,\n scheduled ASC\n LIMIT $2\n FOR UPDATE SKIP LOCKED\n)\nUPDATE cyclotron_jobs\nSET\n state = 'running'::JobState,\n lock_id = $3,\n last_heartbeat = NOW(),\n last_transition = NOW(),\n transition_count = transition_count + 1\nFROM available\nWHERE\n cyclotron_jobs.id = available.id\nRETURNING\n cyclotron_jobs.id,\n team_id,\n available.state as \"state: JobState\",\n queue_name,\n priority,\n function_id,\n created,\n last_transition,\n scheduled,\n transition_count,\n NULL as vm_state,\n metadata,\n parameters,\n lock_id,\n last_heartbeat,\n janitor_touch_count\n ", + "describe": { + "columns": [ + { + "ordinal": 0, + "name": "id", + "type_info": "Uuid" + }, + { + "ordinal": 1, + "name": "team_id", + "type_info": "Int4" + }, + { + "ordinal": 2, + "name": "state: JobState", + "type_info": { + "Custom": { + "name": "jobstate", + "kind": { + "Enum": ["available", "completed", "failed", "running", "paused"] + } + } + } + }, + { + "ordinal": 3, + "name": "queue_name", + "type_info": "Text" + }, + { + "ordinal": 4, + "name": "priority", + "type_info": "Int2" + }, + { + "ordinal": 5, + "name": "function_id", + "type_info": "Uuid" + }, + { + "ordinal": 6, + "name": "created", + "type_info": "Timestamptz" + }, + { + "ordinal": 7, 
+ "name": "last_transition", + "type_info": "Timestamptz" + }, + { + "ordinal": 8, + "name": "scheduled", + "type_info": "Timestamptz" + }, + { + "ordinal": 9, + "name": "transition_count", + "type_info": "Int2" + }, + { + "ordinal": 10, + "name": "vm_state", + "type_info": "Text" + }, + { + "ordinal": 11, + "name": "metadata", + "type_info": "Text" + }, + { + "ordinal": 12, + "name": "parameters", + "type_info": "Text" + }, + { + "ordinal": 13, + "name": "lock_id", + "type_info": "Uuid" + }, + { + "ordinal": 14, + "name": "last_heartbeat", + "type_info": "Timestamptz" + }, + { + "ordinal": 15, + "name": "janitor_touch_count", + "type_info": "Int2" + } + ], + "parameters": { + "Left": ["Text", "Int8", "Uuid"] + }, + "nullable": [ + false, + false, + false, + false, + false, + true, + false, + false, + false, + false, + null, + true, + true, + true, + true, + false + ] + }, + "hash": "350983ef271029734aff70eb7e298bfe578ecaa8678268863bce917ced9d5d46" +} diff --git a/rust/cyclotron-core/.sqlx/query-54d9afe6952f92b753fbce2c4e8554065b71152389f98d35532c6b332d5a4c9d.json b/rust/cyclotron-core/.sqlx/query-54d9afe6952f92b753fbce2c4e8554065b71152389f98d35532c6b332d5a4c9d.json new file mode 100644 index 00000000000..2ff58c66714 --- /dev/null +++ b/rust/cyclotron-core/.sqlx/query-54d9afe6952f92b753fbce2c4e8554065b71152389f98d35532c6b332d5a4c9d.json @@ -0,0 +1,12 @@ +{ + "db_name": "PostgreSQL", + "query": "\nWITH stalled AS (\n SELECT id FROM cyclotron_jobs WHERE state = 'running' AND COALESCE(last_heartbeat, $1) <= $1 FOR UPDATE SKIP LOCKED\n)\nUPDATE cyclotron_jobs\nSET state = 'available', lock_id = NULL, last_heartbeat = NULL, janitor_touch_count = janitor_touch_count + 1\nFROM stalled\nWHERE cyclotron_jobs.id = stalled.id\n ", + "describe": { + "columns": [], + "parameters": { + "Left": ["Timestamptz"] + }, + "nullable": [] + }, + "hash": "54d9afe6952f92b753fbce2c4e8554065b71152389f98d35532c6b332d5a4c9d" +} diff --git a/rust/cyclotron-core/.sqlx/query-7217e766aeb53298238222c0c71a2ce446cac731845c53cb926fc47ace708dd6.json b/rust/cyclotron-core/.sqlx/query-7217e766aeb53298238222c0c71a2ce446cac731845c53cb926fc47ace708dd6.json new file mode 100644 index 00000000000..230374e98d6 --- /dev/null +++ b/rust/cyclotron-core/.sqlx/query-7217e766aeb53298238222c0c71a2ce446cac731845c53cb926fc47ace708dd6.json @@ -0,0 +1,30 @@ +{ + "db_name": "PostgreSQL", + "query": "\nINSERT INTO cyclotron_jobs\n (\n id,\n team_id,\n function_id,\n created,\n lock_id,\n last_heartbeat,\n janitor_touch_count,\n transition_count,\n last_transition,\n queue_name,\n state,\n scheduled,\n priority,\n vm_state,\n metadata,\n parameters\n )\nVALUES\n ($1, $2, $3, NOW(), NULL, NULL, 0, 0, NOW(), $4, $5, $6, $7, $8, $9, $10)\n ", + "describe": { + "columns": [], + "parameters": { + "Left": [ + "Uuid", + "Int4", + "Uuid", + "Text", + { + "Custom": { + "name": "jobstate", + "kind": { + "Enum": ["available", "completed", "failed", "running", "paused"] + } + } + }, + "Timestamptz", + "Int2", + "Text", + "Text", + "Text" + ] + }, + "nullable": [] + }, + "hash": "7217e766aeb53298238222c0c71a2ce446cac731845c53cb926fc47ace708dd6" +} diff --git a/rust/cyclotron-core/.sqlx/query-884da9767d2992c7b279b4f8df5129b8c4d7020b7cb1999702aee1bfb1087efb.json b/rust/cyclotron-core/.sqlx/query-884da9767d2992c7b279b4f8df5129b8c4d7020b7cb1999702aee1bfb1087efb.json new file mode 100644 index 00000000000..b728d398568 --- /dev/null +++ b/rust/cyclotron-core/.sqlx/query-884da9767d2992c7b279b4f8df5129b8c4d7020b7cb1999702aee1bfb1087efb.json @@ -0,0 +1,12 @@ +{ + 
"db_name": "PostgreSQL", + "query": "UPDATE cyclotron_jobs SET lock_id = NULL, last_heartbeat = NULL WHERE id = $1 AND lock_id = $2", + "describe": { + "columns": [], + "parameters": { + "Left": ["Uuid", "Uuid"] + }, + "nullable": [] + }, + "hash": "884da9767d2992c7b279b4f8df5129b8c4d7020b7cb1999702aee1bfb1087efb" +} diff --git a/rust/cyclotron-core/.sqlx/query-8ab11a89bc4720985e130c58021f46045c332cc45e69b08708b289cc933b3b5c.json b/rust/cyclotron-core/.sqlx/query-8ab11a89bc4720985e130c58021f46045c332cc45e69b08708b289cc933b3b5c.json new file mode 100644 index 00000000000..8c3a3dbde8b --- /dev/null +++ b/rust/cyclotron-core/.sqlx/query-8ab11a89bc4720985e130c58021f46045c332cc45e69b08708b289cc933b3b5c.json @@ -0,0 +1,12 @@ +{ + "db_name": "PostgreSQL", + "query": "UPDATE cyclotron_jobs SET metadata = $1 WHERE id = $2 AND lock_id = $3", + "describe": { + "columns": [], + "parameters": { + "Left": ["Text", "Uuid", "Uuid"] + }, + "nullable": [] + }, + "hash": "8ab11a89bc4720985e130c58021f46045c332cc45e69b08708b289cc933b3b5c" +} diff --git a/rust/cyclotron-core/.sqlx/query-98da1f12285a97a47ce88535c82a8c4eb4cb910b0c5ddbc6bdbd9156af7b4e23.json b/rust/cyclotron-core/.sqlx/query-98da1f12285a97a47ce88535c82a8c4eb4cb910b0c5ddbc6bdbd9156af7b4e23.json new file mode 100644 index 00000000000..59a56c441cb --- /dev/null +++ b/rust/cyclotron-core/.sqlx/query-98da1f12285a97a47ce88535c82a8c4eb4cb910b0c5ddbc6bdbd9156af7b4e23.json @@ -0,0 +1,12 @@ +{ + "db_name": "PostgreSQL", + "query": "UPDATE cyclotron_jobs SET scheduled = $1 WHERE id = $2 AND lock_id = $3", + "describe": { + "columns": [], + "parameters": { + "Left": ["Timestamptz", "Uuid", "Uuid"] + }, + "nullable": [] + }, + "hash": "98da1f12285a97a47ce88535c82a8c4eb4cb910b0c5ddbc6bdbd9156af7b4e23" +} diff --git a/rust/cyclotron-core/.sqlx/query-aa595eaf28c1f4b872c278be407b59cc00f3125413f4032ac3647a6b5ee1a632.json b/rust/cyclotron-core/.sqlx/query-aa595eaf28c1f4b872c278be407b59cc00f3125413f4032ac3647a6b5ee1a632.json new file mode 100644 index 00000000000..bd8a7cdd902 --- /dev/null +++ b/rust/cyclotron-core/.sqlx/query-aa595eaf28c1f4b872c278be407b59cc00f3125413f4032ac3647a6b5ee1a632.json @@ -0,0 +1,18 @@ +{ + "db_name": "PostgreSQL", + "query": "SELECT vm_state FROM cyclotron_jobs WHERE id = $1 AND lock_id = $2", + "describe": { + "columns": [ + { + "ordinal": 0, + "name": "vm_state", + "type_info": "Text" + } + ], + "parameters": { + "Left": ["Uuid", "Uuid"] + }, + "nullable": [true] + }, + "hash": "aa595eaf28c1f4b872c278be407b59cc00f3125413f4032ac3647a6b5ee1a632" +} diff --git a/rust/cyclotron-core/.sqlx/query-b160b785a0377b854341105e99e4ef7a5da523e168a5f9be47f6caaef09487d7.json b/rust/cyclotron-core/.sqlx/query-b160b785a0377b854341105e99e4ef7a5da523e168a5f9be47f6caaef09487d7.json new file mode 100644 index 00000000000..ea9c7f8fceb --- /dev/null +++ b/rust/cyclotron-core/.sqlx/query-b160b785a0377b854341105e99e4ef7a5da523e168a5f9be47f6caaef09487d7.json @@ -0,0 +1,12 @@ +{ + "db_name": "PostgreSQL", + "query": "UPDATE cyclotron_jobs SET parameters = $1 WHERE id = $2 AND lock_id = $3", + "describe": { + "columns": [], + "parameters": { + "Left": ["Text", "Uuid", "Uuid"] + }, + "nullable": [] + }, + "hash": "b160b785a0377b854341105e99e4ef7a5da523e168a5f9be47f6caaef09487d7" +} diff --git a/rust/cyclotron-core/.sqlx/query-b3239c1dde9a88769ec488299612f7a3ad2b2ee57d8d2353c858299d7b6ffe13.json b/rust/cyclotron-core/.sqlx/query-b3239c1dde9a88769ec488299612f7a3ad2b2ee57d8d2353c858299d7b6ffe13.json new file mode 100644 index 00000000000..d2942f91b19 --- /dev/null +++ 
b/rust/cyclotron-core/.sqlx/query-b3239c1dde9a88769ec488299612f7a3ad2b2ee57d8d2353c858299d7b6ffe13.json @@ -0,0 +1,23 @@ +{ + "db_name": "PostgreSQL", + "query": "UPDATE cyclotron_jobs\n SET state = $1, last_transition = NOW(), transition_count = transition_count + 1\n WHERE id = $2 AND lock_id = $3", + "describe": { + "columns": [], + "parameters": { + "Left": [ + { + "Custom": { + "name": "jobstate", + "kind": { + "Enum": ["available", "completed", "failed", "running", "paused"] + } + } + }, + "Uuid", + "Uuid" + ] + }, + "nullable": [] + }, + "hash": "b3239c1dde9a88769ec488299612f7a3ad2b2ee57d8d2353c858299d7b6ffe13" +} diff --git a/rust/cyclotron-core/.sqlx/query-c624261597b9356ff3e7c3e392a84bb0b551e91c503e8b21c29814f1eb660a8e.json b/rust/cyclotron-core/.sqlx/query-c624261597b9356ff3e7c3e392a84bb0b551e91c503e8b21c29814f1eb660a8e.json new file mode 100644 index 00000000000..b94965873e7 --- /dev/null +++ b/rust/cyclotron-core/.sqlx/query-c624261597b9356ff3e7c3e392a84bb0b551e91c503e8b21c29814f1eb660a8e.json @@ -0,0 +1,117 @@ +{ + "db_name": "PostgreSQL", + "query": "\nWITH available AS (\n SELECT\n id,\n state\n FROM cyclotron_jobs\n WHERE\n state = 'available'::JobState\n AND queue_name = $1\n AND scheduled <= NOW()\n ORDER BY\n priority ASC,\n scheduled ASC\n LIMIT $2\n FOR UPDATE SKIP LOCKED\n)\nUPDATE cyclotron_jobs\nSET\n state = 'running'::JobState,\n lock_id = $3,\n last_heartbeat = NOW(),\n last_transition = NOW(),\n transition_count = transition_count + 1\nFROM available\nWHERE\n cyclotron_jobs.id = available.id\nRETURNING\n cyclotron_jobs.id,\n team_id,\n available.state as \"state: JobState\",\n queue_name,\n priority,\n function_id,\n created,\n last_transition,\n scheduled,\n transition_count,\n vm_state,\n metadata,\n parameters,\n lock_id,\n last_heartbeat,\n janitor_touch_count\n ", + "describe": { + "columns": [ + { + "ordinal": 0, + "name": "id", + "type_info": "Uuid" + }, + { + "ordinal": 1, + "name": "team_id", + "type_info": "Int4" + }, + { + "ordinal": 2, + "name": "state: JobState", + "type_info": { + "Custom": { + "name": "jobstate", + "kind": { + "Enum": ["available", "completed", "failed", "running", "paused"] + } + } + } + }, + { + "ordinal": 3, + "name": "queue_name", + "type_info": "Text" + }, + { + "ordinal": 4, + "name": "priority", + "type_info": "Int2" + }, + { + "ordinal": 5, + "name": "function_id", + "type_info": "Uuid" + }, + { + "ordinal": 6, + "name": "created", + "type_info": "Timestamptz" + }, + { + "ordinal": 7, + "name": "last_transition", + "type_info": "Timestamptz" + }, + { + "ordinal": 8, + "name": "scheduled", + "type_info": "Timestamptz" + }, + { + "ordinal": 9, + "name": "transition_count", + "type_info": "Int2" + }, + { + "ordinal": 10, + "name": "vm_state", + "type_info": "Text" + }, + { + "ordinal": 11, + "name": "metadata", + "type_info": "Text" + }, + { + "ordinal": 12, + "name": "parameters", + "type_info": "Text" + }, + { + "ordinal": 13, + "name": "lock_id", + "type_info": "Uuid" + }, + { + "ordinal": 14, + "name": "last_heartbeat", + "type_info": "Timestamptz" + }, + { + "ordinal": 15, + "name": "janitor_touch_count", + "type_info": "Int2" + } + ], + "parameters": { + "Left": ["Text", "Int8", "Uuid"] + }, + "nullable": [ + false, + false, + false, + false, + false, + true, + false, + false, + false, + false, + true, + true, + true, + true, + true, + false + ] + }, + "hash": "c624261597b9356ff3e7c3e392a84bb0b551e91c503e8b21c29814f1eb660a8e" +} diff --git 
a/rust/cyclotron-core/.sqlx/query-f4e808f58dd290c6e2b49b63e9e0eb022936ba318021512a0cc0c2e0766abe7c.json b/rust/cyclotron-core/.sqlx/query-f4e808f58dd290c6e2b49b63e9e0eb022936ba318021512a0cc0c2e0766abe7c.json new file mode 100644 index 00000000000..a585e9f7e7d --- /dev/null +++ b/rust/cyclotron-core/.sqlx/query-f4e808f58dd290c6e2b49b63e9e0eb022936ba318021512a0cc0c2e0766abe7c.json @@ -0,0 +1,12 @@ +{ + "db_name": "PostgreSQL", + "query": "DELETE FROM cyclotron_jobs WHERE state = 'completed'", + "describe": { + "columns": [], + "parameters": { + "Left": [] + }, + "nullable": [] + }, + "hash": "f4e808f58dd290c6e2b49b63e9e0eb022936ba318021512a0cc0c2e0766abe7c" +} diff --git a/rust/cyclotron-core/.sqlx/query-fdda5a80f5495f2d4b15ce1a0963f990986c8b8433f01e449fbd1eee70ce6aeb.json b/rust/cyclotron-core/.sqlx/query-fdda5a80f5495f2d4b15ce1a0963f990986c8b8433f01e449fbd1eee70ce6aeb.json new file mode 100644 index 00000000000..09fc24b340d --- /dev/null +++ b/rust/cyclotron-core/.sqlx/query-fdda5a80f5495f2d4b15ce1a0963f990986c8b8433f01e449fbd1eee70ce6aeb.json @@ -0,0 +1,12 @@ +{ + "db_name": "PostgreSQL", + "query": "\nDELETE FROM cyclotron_jobs WHERE state = 'running' AND COALESCE(last_heartbeat, $1) <= $1 AND janitor_touch_count >= $2\n ", + "describe": { + "columns": [], + "parameters": { + "Left": ["Timestamptz", "Int2"] + }, + "nullable": [] + }, + "hash": "fdda5a80f5495f2d4b15ce1a0963f990986c8b8433f01e449fbd1eee70ce6aeb" +} diff --git a/rust/cyclotron-core/.sqlx/query-ffb66bdedf6506f95b9293ef88b0c51e2f5fb7d3271e1287165d2a35b6aaa25e.json b/rust/cyclotron-core/.sqlx/query-ffb66bdedf6506f95b9293ef88b0c51e2f5fb7d3271e1287165d2a35b6aaa25e.json new file mode 100644 index 00000000000..605d79d57c0 --- /dev/null +++ b/rust/cyclotron-core/.sqlx/query-ffb66bdedf6506f95b9293ef88b0c51e2f5fb7d3271e1287165d2a35b6aaa25e.json @@ -0,0 +1,12 @@ +{ + "db_name": "PostgreSQL", + "query": "DELETE FROM cyclotron_jobs WHERE state = 'failed'", + "describe": { + "columns": [], + "parameters": { + "Left": [] + }, + "nullable": [] + }, + "hash": "ffb66bdedf6506f95b9293ef88b0c51e2f5fb7d3271e1287165d2a35b6aaa25e" +} diff --git a/rust/cyclotron-core/Cargo.toml b/rust/cyclotron-core/Cargo.toml new file mode 100644 index 00000000000..bfec9301eee --- /dev/null +++ b/rust/cyclotron-core/Cargo.toml @@ -0,0 +1,17 @@ +[package] +name = "cyclotron-core" +version = "0.1.0" +edition = "2021" + +[lints] +workspace = true + +[dependencies] +serde = { workspace = true } +sqlx = { workspace = true } +chrono = { workspace = true } +tokio = { workspace = true } +thiserror = { workspace = true } +uuid = { workspace = true } +rand = { workspace = true } +futures = { workspace = true } \ No newline at end of file diff --git a/rust/cyclotron-core/migrations/20240804122549_initial_job_queue_schema.sql b/rust/cyclotron-core/migrations/20240804122549_initial_job_queue_schema.sql new file mode 100644 index 00000000000..26cbf1c94b3 --- /dev/null +++ b/rust/cyclotron-core/migrations/20240804122549_initial_job_queue_schema.sql @@ -0,0 +1,102 @@ +CREATE TYPE JobState AS ENUM( + 'available', + 'completed', + 'failed', + 'running', + 'paused' +); + + +--------------------------------------------------------------------- +-- Job table +--------------------------------------------------------------------- +-- When a job is dequeued, it is locked by generating a UUID and returning it to the dequeuing +-- worker. Any worker that can't provide the correct lock_id when updating will have their updates +-- rejected. 
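Concretely, every write a worker makes while it holds a job carries that lock: the guarded UPDATE statements cached in the `.sqlx` files above all include a `lock_id` check in their WHERE clause. A rough sqlx sketch of the happy path (assuming a `PgPool` plus the `id` and `lock_id` handed out at dequeue time; the state-transition statement is slightly simplified from the cached, parameterised version):

```rust
use sqlx::PgPool;
use uuid::Uuid;

// Sketch of the lock handshake described in the comment above. If the janitor has already
// reclaimed the job and another worker holds a newer lock_id, each statement matches zero rows.
async fn finish_job(
    pool: &PgPool,
    id: Uuid,
    lock_id: Uuid,
    final_vm_state: &str,
) -> Result<(), sqlx::Error> {
    // Keep the heartbeat fresh so the janitor doesn't move the job back to 'available'.
    sqlx::query("UPDATE cyclotron_jobs SET last_heartbeat = NOW() WHERE id = $1 AND lock_id = $2")
        .bind(id)
        .bind(lock_id)
        .execute(pool)
        .await?;

    // Write back whatever state the worker produced - still guarded by the lock.
    sqlx::query("UPDATE cyclotron_jobs SET vm_state = $1 WHERE id = $2 AND lock_id = $3")
        .bind(final_vm_state)
        .bind(id)
        .bind(lock_id)
        .execute(pool)
        .await?;

    // Move to a terminal state; completed (and failed) rows are later swept by the janitor.
    sqlx::query(
        "UPDATE cyclotron_jobs SET state = 'completed'::JobState, last_transition = NOW(), \
         transition_count = transition_count + 1 WHERE id = $1 AND lock_id = $2",
    )
    .bind(id)
    .bind(lock_id)
    .execute(pool)
    .await?;

    Ok(())
}
```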
The reason this is important is because if, e.g., a worker holds a job in a running +-- state without updating the heartbeat, the janitor will return the job to the queue eventually, +-- and if the worker /then/ tries to update the job after another worker has picked it up, that's a +-- race. We track transition count and times alongside lock_id's and heartbeats for reporting and +-- debugging purposes, and we track the number of times the janitor has touched a job to spot poison +-- pills. +CREATE TABLE IF NOT EXISTS cyclotron_jobs ( + --------------------------------------------------------------------- + -- Job metadata + --------------------------------------------------------------------- + id UUID PRIMARY KEY, + team_id INT NOT NULL, + function_id UUID, + created TIMESTAMPTZ NOT NULL, + --------------------------------------------------------------------- + -- Queue bookkeeping - invisible to the worker + --------------------------------------------------------------------- + lock_id UUID, + -- This is set when a job is in a running state, and is required to update the job. + last_heartbeat TIMESTAMPTZ, + -- This is updated by the worker to indicate that the job is making forward progress even + -- without transitions (and should not be reaped) + janitor_touch_count SMALLINT NOT NULL, + transition_count SMALLINT NOT NULL, + last_transition TIMESTAMPTZ NOT NULL, + --------------------------------------------------------------------- + -- Queue components - determines which workers will consume this job + --------------------------------------------------------------------- + queue_name TEXT NOT NULL, + --------------------------------------------------------------------- + -- Job availability and priority (can this job be dequeued, and in what order?) + --------------------------------------------------------------------- + state JobState NOT NULL, + scheduled TIMESTAMPTZ NOT NULL, + priority SMALLINT NOT NULL, + --------------------------------------------------------------------- + -- Job data + --------------------------------------------------------------------- + vm_state TEXT, + -- This is meant for workers "talking to themselves", e.g. tracking retries or something + metadata TEXT, + -- This is meant for "the next guy" - hog might fill it with a URL to fetch, for example + parameters TEXT +); + +-- For a given worker, the set of "available" jobs depends on state, queue_name, and scheduled (so +-- we can exclude sleeping jobs). This index is partial, because we don't care about other states +-- for the purpose of dequeuing +CREATE INDEX idx_cyclotron_jobs_dequeue ON cyclotron_jobs (queue_name, state, scheduled, priority) +WHERE + state = 'available'; + +-- We create simple indexes on team_id, function_id and queue_name to support fast joins to future +-- control tables +CREATE INDEX idx_queue_team_id ON cyclotron_jobs(team_id); + +CREATE INDEX idx_queue_function_id ON cyclotron_jobs(function_id); + +CREATE INDEX idx_queue_queue_name ON cyclotron_jobs(queue_name); + + +--------------------------------------------------------------------- +-- Control tables +--------------------------------------------------------------------- + + +-- These are just a starting point, supporting overriding the state for a given team, function or queue +-- For now these are entirely unused +CREATE TABLE IF NOT EXISTS cyclotron_team_control ( + team_id INT PRIMARY KEY, + state_override JobState, + -- If this is not null, it overrides the state of all jobs for this team (allowing for e.g. 
pausing or force failing all of a teams jobs) + state_override_expires TIMESTAMPTZ -- State override can be temporary or permanent +); + +CREATE TABLE IF NOT EXISTS cyclotron_function_control ( + function_id UUID PRIMARY KEY, + state_override JobState, + -- If this is not null, it overrides the state of all jobs for this function (allowing for e.g. pausing or force failing all of a functions jobs) + state_override_expires TIMESTAMPTZ -- State override can be temporary or permanent +); + +CREATE TABLE IF NOT EXISTS cyclotron_queue_control ( + queue_name TEXT PRIMARY KEY, + state_override JobState, + -- If this is not null, it overrides the state of all jobs for this queue (allowing for e.g. pausing or force failing all of a queues jobs) + state_override_expires TIMESTAMPTZ -- State override can be temporary or permanent +); \ No newline at end of file diff --git a/rust/cyclotron-core/src/base_ops.rs b/rust/cyclotron-core/src/base_ops.rs new file mode 100644 index 00000000000..5d1f194d88c --- /dev/null +++ b/rust/cyclotron-core/src/base_ops.rs @@ -0,0 +1,697 @@ +//! # PgQueue +//! +//! A job queue implementation backed by a PostgreSQL table. + +use std::str::FromStr; + +use chrono::{self, DateTime, Utc}; +use serde::{self, Deserialize, Serialize}; +use sqlx::{ + postgres::{PgArguments, PgHasArrayType, PgQueryResult, PgTypeInfo}, + query::Query, +}; +use uuid::Uuid; + +use crate::error::QueueError; + +#[derive(Debug, Deserialize, Serialize, sqlx::Type)] +#[serde(rename_all = "lowercase")] +#[sqlx(type_name = "JobState", rename_all = "lowercase")] +pub enum JobState { + Available, + Running, + Completed, + Failed, + Paused, +} + +impl FromStr for JobState { + type Err = (); + + fn from_str(s: &str) -> Result { + match s { + "available" => Ok(JobState::Available), + "running" => Ok(JobState::Running), + "completed" => Ok(JobState::Completed), + "failed" => Ok(JobState::Failed), + _ => Err(()), + } + } +} + +impl PgHasArrayType for JobState { + fn array_type_info() -> sqlx::postgres::PgTypeInfo { + // Postgres default naming convention for array types is "_typename" + PgTypeInfo::with_name("_JobState") + } +} + +// The chunk of data needed to enqueue a job +#[derive(Debug, Deserialize, Serialize, Clone, Eq, PartialEq)] +pub struct JobInit { + pub team_id: i32, + pub queue_name: String, + pub priority: i16, + pub scheduled: DateTime, + pub function_id: Option, + pub vm_state: Option, + pub parameters: Option, + pub metadata: Option, +} + +// TODO - there are certain things we might want to be on a per-team basis here... the ability to say +// "do not process any jobs for this team" independent of doing an operation on the job table seems powerful, +// but that requires a distinct team table. For now, I'm just making a note that it's something we might +// want (the command to modify the treatment of all jobs associated with a team should only need to be issued and +// processed /once/, not once per job, and should apply to all jobs both currently queued and any future ones). This +// can be added in a progressive way (by adding joins and clauses to the dequeue query), so we don't need to worry about +// it too much up front. 
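For illustration, a producer enqueueing a fetch job would build a JobInit roughly as follows (a minimal sketch; the pool handle, queue name, priority and parameter payload are assumptions, not fixed by this crate):

async fn enqueue_example(pool: &sqlx::PgPool) -> Result<(), QueueError> {
    let init = JobInit {
        team_id: 1,
        queue_name: "fetch".to_string(),
        priority: 0,                        // lower value = dequeued sooner
        scheduled: chrono::Utc::now(),      // eligible as soon as the insert commits
        function_id: Some(uuid::Uuid::now_v7()),
        vm_state: None,
        parameters: Some(r#"{"url":"https://example.com","method":"GET"}"#.to_string()),
        metadata: None,
    };
    create_job(pool, init).await
}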
+#[derive(Debug, Deserialize, Serialize)] +pub struct Job { + // Job metadata + pub id: Uuid, + pub team_id: i32, + pub function_id: Option, // Some jobs might not come from hog, and it doesn't /kill/ use to support that + pub created: DateTime, + + // Queue bookkeeping + // This will be set for any worker that ever has a job in the "running" state (so any worker that dequeues a job) + // but I don't want to do the work to encode that in the type system right now - later it should be + pub lock_id: Option, + pub last_heartbeat: Option>, + pub janitor_touch_count: i16, + pub transition_count: i16, + pub last_transition: DateTime, + + // Virtual queue components + pub queue_name: String, // We can have multiple "virtual queues" workers pull from + + // Job availability + pub state: JobState, + pub priority: i16, // For sorting "available" jobs. Lower is higher priority + pub scheduled: DateTime, + + // Job data + pub vm_state: Option, // The state of the VM this job is running on (if it exists) + pub metadata: Option, // Additional fields a worker can tack onto a job, for e.g. tracking some state across retries (or number of retries in general by a given class of worker) + pub parameters: Option, // The actual parameters of the job (function args for a hog function, http request for a fetch function) +} + +pub async fn create_job<'c, E>(executor: E, data: JobInit) -> Result<(), QueueError> +where + E: sqlx::Executor<'c, Database = sqlx::Postgres>, +{ + let id = Uuid::now_v7(); + sqlx::query!( + r#" +INSERT INTO cyclotron_jobs + ( + id, + team_id, + function_id, + created, + lock_id, + last_heartbeat, + janitor_touch_count, + transition_count, + last_transition, + queue_name, + state, + scheduled, + priority, + vm_state, + metadata, + parameters + ) +VALUES + ($1, $2, $3, NOW(), NULL, NULL, 0, 0, NOW(), $4, $5, $6, $7, $8, $9, $10) + "#, + id, + data.team_id, + data.function_id, + data.queue_name, + JobState::Available as _, + data.scheduled, + data.priority, + data.vm_state, + data.metadata, + data.parameters + ) + .execute(executor) + .await?; + + Ok(()) +} + +pub async fn bulk_create_jobs<'c, E>(executor: E, jobs: &[JobInit]) -> Result<(), QueueError> +where + E: sqlx::Executor<'c, Database = sqlx::Postgres>, +{ + let now = Utc::now(); + // Flatten these jobs into a series of vecs of arguments PG can unnest + let mut ids = Vec::with_capacity(jobs.len()); + let mut team_ids = Vec::with_capacity(jobs.len()); + let mut function_ids = Vec::with_capacity(jobs.len()); + let mut created_at = Vec::with_capacity(jobs.len()); + let mut lock_ids = Vec::with_capacity(jobs.len()); + let mut last_heartbeats = Vec::with_capacity(jobs.len()); + let mut janitor_touch_counts = Vec::with_capacity(jobs.len()); + let mut transition_counts = Vec::with_capacity(jobs.len()); + let mut last_transitions = Vec::with_capacity(jobs.len()); + let mut queue_names = Vec::with_capacity(jobs.len()); + let mut states = Vec::with_capacity(jobs.len()); + let mut scheduleds = Vec::with_capacity(jobs.len()); + let mut priorities = Vec::with_capacity(jobs.len()); + let mut vm_states = Vec::with_capacity(jobs.len()); + let mut metadatas = Vec::with_capacity(jobs.len()); + let mut parameters = Vec::with_capacity(jobs.len()); + + for d in jobs { + ids.push(Uuid::now_v7()); + team_ids.push(d.team_id); + function_ids.push(d.function_id); + created_at.push(now); + lock_ids.push(None::); + last_heartbeats.push(None::>); + janitor_touch_counts.push(0); + transition_counts.push(0); + last_transitions.push(now); + 
queue_names.push(d.queue_name.clone()); + states.push(JobState::Available); + scheduleds.push(d.scheduled); + priorities.push(d.priority); + vm_states.push(d.vm_state.clone()); + metadatas.push(d.metadata.clone()); + parameters.push(d.parameters.clone()); + } + + // Using the "unnest" function to turn an array of rows into a set of rows + sqlx::query( + r#" +INSERT INTO cyclotron_jobs + ( + id, + team_id, + function_id, + created, + lock_id, + last_heartbeat, + janitor_touch_count, + transition_count, + last_transition, + queue_name, + state, + scheduled, + priority, + vm_state, + metadata, + parameters + ) +SELECT * +FROM UNNEST( + $1, + $2, + $3, + $4, + $5, + $6, + $7, + $8, + $9, + $10, + $11, + $12, + $13, + $14, + $15, + $16 + ) +"#, + ) + .bind(ids) + .bind(team_ids) + .bind(function_ids) + .bind(created_at) + .bind(lock_ids) + .bind(last_heartbeats) + .bind(janitor_touch_counts) + .bind(transition_counts) + .bind(last_transitions) + .bind(queue_names) + .bind(states) + .bind(scheduleds) + .bind(priorities) + .bind(vm_states) + .bind(metadatas) + .bind(parameters) + .execute(executor) + .await?; + + Ok(()) +} + +// Dequeue the next job batch from the queue, skipping VM state since it can be large +pub async fn dequeue_jobs<'c, E>( + executor: E, + queue: &str, + max: usize, +) -> Result, QueueError> +where + E: sqlx::Executor<'c, Database = sqlx::Postgres>, +{ + // TODO - right now, locks are completely transient. We could instead have the lock_id act like a + // "worker_id", and be provided by the caller, which would let workers do less bookkeeping, and make + // some kinds of debugging easier, but I prefer locks being opaque to workers for now, to avoid any + // confusion or potential for accidental deadlocking (e.g. if someone persisted the worker_id across + // process restarts). + let lock_id = Uuid::now_v7(); + Ok(sqlx::query_as!( + Job, + r#" +WITH available AS ( + SELECT + id, + state + FROM cyclotron_jobs + WHERE + state = 'available'::JobState + AND queue_name = $1 + AND scheduled <= NOW() + ORDER BY + priority ASC, + scheduled ASC + LIMIT $2 + FOR UPDATE SKIP LOCKED +) +UPDATE cyclotron_jobs +SET + state = 'running'::JobState, + lock_id = $3, + last_heartbeat = NOW(), + last_transition = NOW(), + transition_count = transition_count + 1 +FROM available +WHERE + cyclotron_jobs.id = available.id +RETURNING + cyclotron_jobs.id, + team_id, + available.state as "state: JobState", + queue_name, + priority, + function_id, + created, + last_transition, + scheduled, + transition_count, + NULL as vm_state, + metadata, + parameters, + lock_id, + last_heartbeat, + janitor_touch_count + "#, + queue, + max as i64, + lock_id + ) + .fetch_all(executor) + .await?) +} + +// Dequeue a batch of jobs, also returning their VM state. 
This is an optimisation - you could +// dequeue a batch of jobs and then fetch their VM state in a separate query, but this is hopefully less +// heavy on the DB, if a given worker knows it needs VM state for all dequeue jobs +pub async fn dequeue_with_vm_state<'c, E>( + executor: E, + queue: &str, + max: usize, +) -> Result, QueueError> +where + E: sqlx::Executor<'c, Database = sqlx::Postgres>, +{ + let lock_id = Uuid::now_v7(); + Ok(sqlx::query_as!( + Job, + r#" +WITH available AS ( + SELECT + id, + state + FROM cyclotron_jobs + WHERE + state = 'available'::JobState + AND queue_name = $1 + AND scheduled <= NOW() + ORDER BY + priority ASC, + scheduled ASC + LIMIT $2 + FOR UPDATE SKIP LOCKED +) +UPDATE cyclotron_jobs +SET + state = 'running'::JobState, + lock_id = $3, + last_heartbeat = NOW(), + last_transition = NOW(), + transition_count = transition_count + 1 +FROM available +WHERE + cyclotron_jobs.id = available.id +RETURNING + cyclotron_jobs.id, + team_id, + available.state as "state: JobState", + queue_name, + priority, + function_id, + created, + last_transition, + scheduled, + transition_count, + vm_state, + metadata, + parameters, + lock_id, + last_heartbeat, + janitor_touch_count + "#, + queue, + max as i64, + lock_id + ) + .fetch_all(executor) + .await?) +} + +// Grab a jobs VM state - for workers that might sometimes need a jobs vm state, but not always, +// this lets them use dequeue_jobs, and then fetch the states they need. VM state can only be retrieved +// by workers holding a job lock +pub async fn get_vm_state<'c, E>( + executor: E, + job_id: Uuid, + lock_id: Uuid, +) -> Result, QueueError> +where + E: sqlx::Executor<'c, Database = sqlx::Postgres>, +{ + struct VMState { + vm_state: Option, + } + + // We use fetch_on here because giving us an unknown ID is an error + let res = sqlx::query_as!( + VMState, + "SELECT vm_state FROM cyclotron_jobs WHERE id = $1 AND lock_id = $2", + job_id, + lock_id + ) + .fetch_one(executor) + .await?; + + Ok(res.vm_state) +} + +// A struct representing a set of updates for a job. Outer none values mean "don't update this field", +// with nested none values meaning "set this field to null" for nullable fields +#[derive(Debug, Deserialize, Serialize)] +pub struct JobUpdate { + pub lock_id: Uuid, // The ID of the lock acquired when this worker dequeued the job, required for any update to be valid + pub state: Option, + pub queue_name: Option, + pub priority: Option, + pub scheduled: Option>, + pub vm_state: Option>, + pub metadata: Option>, + pub parameters: Option>, +} + +impl JobUpdate { + pub fn new(lock_id: Uuid) -> Self { + Self { + lock_id, + state: None, + queue_name: None, + priority: None, + scheduled: None, + vm_state: None, + metadata: None, + parameters: None, + } + } +} + +// TODO - I should think about a bulk-flush interface at /some/ point, although we expect jobs to be +// high variance with respect to work time, so maybe that wouldn't be that useful in the end. +// TODO - this isn't the cheapest way to update a row in a table... I could probably do better by instead +// using a query builder, but I wanted sqlx's nice macro handling, at least while iterating on the schema. +// If/when we start hitting perf issues, this is a good place to start. +// NOTE - this function permits multiple flushes to the same job, without losing the lock on it, but +// high level implementations are recommended to avoid this - ideally, for every de/requeue, there should be +// exactly 2 database operations. 
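As a usage sketch of that contract (one dequeue, one flush), a worker completing a job might do the following; the queue name and metadata payload are assumptions, and the connection handling mirrors what the higher-level Worker wrapper does:

async fn run_one(pool: &sqlx::PgPool) -> Result<(), QueueError> {
    let job = dequeue_jobs(pool, "fetch", 1)
        .await?
        .pop()
        .expect("a job was available");

    let mut update = JobUpdate::new(job.lock_id.expect("dequeued jobs always carry a lock"));
    update.state = Some(JobState::Completed);
    // Outer Some means "update this field", inner value is what gets written.
    update.metadata = Some(Some(r#"{"attempts":1}"#.to_string()));

    let mut conn = pool.acquire().await?;
    flush_job(conn.as_mut(), job.id, update).await
}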
+pub async fn flush_job<'c, C>( + connection: &mut C, + job_id: Uuid, + updates: JobUpdate, +) -> Result<(), QueueError> +where + C: sqlx::Connection, +{ + let mut txn = connection.begin().await?; + + // Flushing any job state except "running" is a signal that the worker no longer holds this job + let job_returned = !matches!(updates.state, Some(JobState::Running)); + let lock_id = updates.lock_id; + + if let Some(state) = updates.state { + set_state(&mut *txn, job_id, updates.lock_id, state).await?; + } + + if let Some(queue_name) = updates.queue_name { + set_queue(&mut *txn, job_id, &queue_name, lock_id).await?; + } + + if let Some(priority) = updates.priority { + set_priority(&mut *txn, job_id, lock_id, priority).await?; + } + + if let Some(scheduled) = updates.scheduled { + set_scheduled(&mut *txn, job_id, scheduled, lock_id).await?; + } + + if let Some(vm_state) = updates.vm_state { + set_vm_state(&mut *txn, job_id, vm_state, lock_id).await?; + } + + if let Some(metadata) = updates.metadata { + set_metadata(&mut *txn, job_id, metadata, lock_id).await?; + } + + if let Some(parameters) = updates.parameters { + set_parameters(&mut *txn, job_id, parameters, lock_id).await?; + } + + // Calling flush indicates forward progress, so we should touch the heartbeat + set_heartbeat(&mut *txn, job_id, lock_id).await?; + + // We do this here, instead of in the set_state call, because otherwise the lock_id passed to other + // updates would be invalid + if job_returned { + let query = sqlx::query!( + "UPDATE cyclotron_jobs SET lock_id = NULL, last_heartbeat = NULL WHERE id = $1 AND lock_id = $2", + job_id, + lock_id + ); + assert_does_update(&mut *txn, job_id, lock_id, query).await?; + } + + txn.commit().await?; + + Ok(()) +} + +// Simple wrapper, that just executes a query and throws an error if no rows were affected +async fn assert_does_update<'c, E>( + executor: E, + job_id: Uuid, + lock_id: Uuid, + query: Query<'_, sqlx::Postgres, PgArguments>, +) -> Result<(), QueueError> +where + E: sqlx::Executor<'c, Database = sqlx::Postgres>, +{ + let res = query.execute(executor).await?; + throw_if_no_rows(res, job_id, lock_id) +} + +// Most of the rest of these functions are designed to be used as part of larger transactions, e.g. +// "completing" a job means updating various rows and then marking it complete, and we can build that +// by composing a set of individual queries together using a transaction. 
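For instance, "complete this job and record some metadata" could be composed like so (an illustrative sketch only; flush_job above remains the canonical path, since it also bumps the heartbeat and releases the lock once the state leaves "running"):

use sqlx::Connection;

async fn complete_with_metadata(
    conn: &mut sqlx::PgConnection,
    job_id: Uuid,
    lock_id: Uuid,
) -> Result<(), QueueError> {
    let mut txn = conn.begin().await?;
    set_metadata(&mut *txn, job_id, Some("finished after 3 attempts".to_string()), lock_id).await?;
    set_state(&mut *txn, job_id, lock_id, JobState::Completed).await?;
    txn.commit().await?;
    Ok(())
}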
+// Update the state of a job, also tracking the transition count and last transition time +pub async fn set_state<'c, E>( + executor: E, + job_id: Uuid, + lock_id: Uuid, + state: JobState, +) -> Result<(), QueueError> +where + E: sqlx::Executor<'c, Database = sqlx::Postgres>, +{ + let q = sqlx::query!( + r#"UPDATE cyclotron_jobs + SET state = $1, last_transition = NOW(), transition_count = transition_count + 1 + WHERE id = $2 AND lock_id = $3"#, + state as _, + job_id, + lock_id + ); + + assert_does_update(executor, job_id, lock_id, q).await +} + +pub async fn set_queue<'c, E>( + executor: E, + job_id: Uuid, + queue: &str, + lock_id: Uuid, +) -> Result<(), QueueError> +where + E: sqlx::Executor<'c, Database = sqlx::Postgres>, +{ + let q = sqlx::query!( + "UPDATE cyclotron_jobs SET queue_name = $1 WHERE id = $2 AND lock_id = $3", + queue, + job_id, + lock_id + ); + assert_does_update(executor, job_id, lock_id, q).await +} + +pub async fn set_priority<'c, E>( + executor: E, + job_id: Uuid, + lock_id: Uuid, + priority: i16, +) -> Result<(), QueueError> +where + E: sqlx::Executor<'c, Database = sqlx::Postgres>, +{ + let q = sqlx::query!( + "UPDATE cyclotron_jobs SET priority = $1 WHERE id = $2 AND lock_id = $3", + priority, + job_id, + lock_id + ); + assert_does_update(executor, job_id, lock_id, q).await +} + +pub async fn set_scheduled<'c, E>( + executor: E, + job_id: Uuid, + scheduled: DateTime, + lock_id: Uuid, +) -> Result<(), QueueError> +where + E: sqlx::Executor<'c, Database = sqlx::Postgres>, +{ + let q = sqlx::query!( + "UPDATE cyclotron_jobs SET scheduled = $1 WHERE id = $2 AND lock_id = $3", + scheduled, + job_id, + lock_id + ); + assert_does_update(executor, job_id, lock_id, q).await +} + +pub async fn set_vm_state<'c, E>( + executor: E, + job_id: Uuid, + vm_state: Option, + lock_id: Uuid, +) -> Result<(), QueueError> +where + E: sqlx::Executor<'c, Database = sqlx::Postgres>, +{ + let q = sqlx::query!( + "UPDATE cyclotron_jobs SET vm_state = $1 WHERE id = $2 AND lock_id = $3", + vm_state, + job_id, + lock_id + ); + assert_does_update(executor, job_id, lock_id, q).await +} + +pub async fn set_metadata<'c, E>( + executor: E, + job_id: Uuid, + metadata: Option, + lock_id: Uuid, +) -> Result<(), QueueError> +where + E: sqlx::Executor<'c, Database = sqlx::Postgres>, +{ + let q = sqlx::query!( + "UPDATE cyclotron_jobs SET metadata = $1 WHERE id = $2 AND lock_id = $3", + metadata, + job_id, + lock_id + ); + assert_does_update(executor, job_id, lock_id, q).await +} + +pub async fn set_parameters<'c, E>( + executor: E, + job_id: Uuid, + parameters: Option, + lock_id: Uuid, +) -> Result<(), QueueError> +where + E: sqlx::Executor<'c, Database = sqlx::Postgres>, +{ + let q = sqlx::query!( + "UPDATE cyclotron_jobs SET parameters = $1 WHERE id = $2 AND lock_id = $3", + parameters, + job_id, + lock_id + ); + assert_does_update(executor, job_id, lock_id, q).await +} + +pub async fn set_heartbeat<'c, E>( + executor: E, + job_id: Uuid, + lock_id: Uuid, +) -> Result<(), QueueError> +where + E: sqlx::Executor<'c, Database = sqlx::Postgres>, +{ + let q = sqlx::query!( + "UPDATE cyclotron_jobs SET last_heartbeat = NOW() WHERE id = $1 AND lock_id = $2", + job_id, + lock_id + ); + assert_does_update(executor, job_id, lock_id, q).await +} + +pub async fn count_total_waiting_jobs<'c, E>(executor: E) -> Result +where + E: sqlx::Executor<'c, Database = sqlx::Postgres>, +{ + let res = sqlx::query!( + "SELECT COUNT(*) FROM cyclotron_jobs WHERE state = 'available' AND scheduled <= NOW()", + ) + 
.fetch_one(executor) + .await?; + + let res = res.count.unwrap_or(0); + Ok(res as u64) +} + +fn throw_if_no_rows(res: PgQueryResult, job: Uuid, lock: Uuid) -> Result<(), QueueError> { + if res.rows_affected() == 0 { + Err(QueueError::InvalidLock(lock, job)) + } else { + Ok(()) + } +} diff --git a/rust/cyclotron-core/src/bin/create_test_data.rs b/rust/cyclotron-core/src/bin/create_test_data.rs new file mode 100644 index 00000000000..e3010fe245a --- /dev/null +++ b/rust/cyclotron-core/src/bin/create_test_data.rs @@ -0,0 +1,56 @@ +use chrono::{Duration, Utc}; +use cyclotron_core::{ + base_ops::JobInit, + manager::{ManagerConfig, QueueManager}, + PoolConfig, +}; +use uuid::Uuid; + +// Just inserts jobs as fast as it can, choosing randomly between hog and fetch workers, and between different priorities. +// prints every 100 jobs inserted. +#[tokio::main] +async fn main() { + let pool_config = PoolConfig { + db_url: "postgresql://posthog:posthog@localhost:5432/cyclotron".to_string(), + max_connections: None, + min_connections: None, + acquire_timeout_seconds: None, + max_lifetime_seconds: None, + idle_timeout_seconds: None, + }; + + let manager_config = ManagerConfig { + shards: vec![pool_config.clone()], + shard_depth_limit: None, + shard_depth_check_interval_seconds: None, + }; + + let manager = QueueManager::new(manager_config).await.unwrap(); + + let now = Utc::now() - Duration::minutes(1); + let start = Utc::now(); + let mut count = 0; + loop { + let queue = if rand::random() { "fetch" } else { "hog" }; + + let priority = (rand::random::() % 3) as i16; + + let test_job = JobInit { + team_id: 1, + queue_name: queue.to_string(), + priority, + scheduled: now, + function_id: Some(Uuid::now_v7()), + vm_state: None, + parameters: None, + metadata: None, + }; + + manager.create_job(test_job).await.unwrap(); + + count += 1; + if count % 100 == 0 { + println!("Elapsed: {:?}, count: {}", Utc::now() - start, count); + } + } +} diff --git a/rust/cyclotron-core/src/bin/load_test.rs b/rust/cyclotron-core/src/bin/load_test.rs new file mode 100644 index 00000000000..21a47774e48 --- /dev/null +++ b/rust/cyclotron-core/src/bin/load_test.rs @@ -0,0 +1,167 @@ +use std::{ + sync::{atomic::AtomicUsize, Arc}, + time::Instant, +}; + +use chrono::{Duration, Utc}; +use cyclotron_core::{ + base_ops::{JobInit, JobState}, + manager::{ManagerConfig, QueueManager}, + worker::Worker, + PoolConfig, +}; +use futures::future::join_all; +use uuid::Uuid; + +// This spins up a manager and 2 workers, and tries to simulate semi-realistic load (on the DB - the workers do nothing except complete jobs) +// - The manager inserts jobs as fast as it can, choosing randomly between hog and fetch workers, and between different priorities. +// - The workers will process jobs as fast as they can, in batches of 1000. +// - The manager and both workers track how long each insert and dequeue takes, in ms/job. +// - The manager never inserts more than 10,000 more jobs than the workers have processed. 
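The cycle this binary hammers on reduces to the sketch below (single pool, mirroring the integration tests; the queue name, batch size and job init are assumptions):

async fn run_cycle(pool: sqlx::PgPool, job_init: JobInit) -> Result<(), cyclotron_core::error::QueueError> {
    let manager = QueueManager::from_pool(pool.clone());
    let worker = Worker::from_pool(pool);

    manager.create_job(job_init).await?;

    for job in worker.dequeue_jobs("fetch", 100).await? {
        // ... the actual work for the job would happen here ...
        worker.set_state(job.id, JobState::Completed)?;
        worker.flush_job(job.id).await?;
    }
    Ok(())
}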
+const INSERT_BATCH_SIZE: usize = 1000; + +struct SharedContext { + jobs_inserted: AtomicUsize, + jobs_dequeued: AtomicUsize, +} + +async fn producer_loop(manager: QueueManager, shared_context: Arc) { + let mut time_spent_inserting = Duration::zero(); + let now = Utc::now() - Duration::minutes(1); + loop { + let mut to_insert = Vec::with_capacity(1000); + for _ in 0..INSERT_BATCH_SIZE { + let queue = if rand::random() { "fetch" } else { "hog" }; + + let priority = (rand::random::() % 3) as i16; + + let test_job = JobInit { + team_id: 1, + queue_name: queue.to_string(), + priority, + scheduled: now, + function_id: Some(Uuid::now_v7()), + vm_state: None, + parameters: None, + metadata: None, + }; + + to_insert.push(test_job); + } + + let start = Instant::now(); + manager.bulk_create_jobs(to_insert).await; + let elapsed = start.elapsed(); + time_spent_inserting += Duration::from_std(elapsed).unwrap(); + + let inserted = shared_context + .jobs_inserted + .fetch_add(INSERT_BATCH_SIZE, std::sync::atomic::Ordering::Relaxed); + + println!("Inserted: {} in {}, ", inserted, time_spent_inserting); + let mut dequeued = shared_context + .jobs_dequeued + .load(std::sync::atomic::Ordering::Relaxed); + while inserted > dequeued + 10_000 { + println!( + "Waiting for workers to catch up, lagging by {}", + inserted - dequeued + ); + tokio::time::sleep(Duration::milliseconds(100).to_std().unwrap()).await; + dequeued = shared_context + .jobs_dequeued + .load(std::sync::atomic::Ordering::Relaxed); + } + } +} + +async fn worker_loop(worker: Worker, shared_context: Arc, queue: &str) { + let mut time_spent_dequeuing = Duration::zero(); + let start = Utc::now(); + loop { + let loop_start = Instant::now(); + let jobs = worker.dequeue_jobs(queue, 1000).await.unwrap(); + + if jobs.is_empty() { + println!( + "Worker {:?} outpacing inserts, got no jobs, sleeping!", + queue + ); + tokio::time::sleep(Duration::milliseconds(100).to_std().unwrap()).await; + continue; + } + + let mut futs = Vec::with_capacity(jobs.len()); + for job in &jobs { + worker.set_state(job.id, JobState::Completed).unwrap(); + futs.push(worker.flush_job(job.id)); + } + + for res in join_all(futs).await { + res.unwrap(); + } + + time_spent_dequeuing += Duration::from_std(loop_start.elapsed()).unwrap(); + + let dequeued = shared_context + .jobs_dequeued + .fetch_add(jobs.len(), std::sync::atomic::Ordering::Relaxed); + + // To account for the bunch we just handled + let dequeued = dequeued + jobs.len(); + + println!( + "Dequeued, processed and completed {} jobs in {} for {:?}. 
Total time running: {}", + dequeued, + time_spent_dequeuing, + queue, + Utc::now() - start + ); + + if jobs.len() < 1000 { + println!( + "Worker {:?} outpacing manager, only got {} jobs, sleeping!", + queue, + jobs.len() + ); + tokio::time::sleep(Duration::milliseconds(100).to_std().unwrap()).await; + } + } +} + +#[tokio::main] +async fn main() { + let pool_config = PoolConfig { + db_url: "postgresql://posthog:posthog@localhost:5432/cyclotron".to_string(), + max_connections: None, + min_connections: None, + acquire_timeout_seconds: None, + max_lifetime_seconds: None, + idle_timeout_seconds: None, + }; + + let manager_config = ManagerConfig { + shards: vec![pool_config.clone()], + shard_depth_limit: None, + shard_depth_check_interval_seconds: None, + }; + + let shared_context = Arc::new(SharedContext { + jobs_inserted: AtomicUsize::new(0), + jobs_dequeued: AtomicUsize::new(0), + }); + + let manager = QueueManager::new(manager_config).await.unwrap(); + let worker_1 = Worker::new(pool_config.clone()).await.unwrap(); + let worker_2 = Worker::new(pool_config.clone()).await.unwrap(); + + let producer = producer_loop(manager, shared_context.clone()); + let worker_1 = worker_loop(worker_1, shared_context.clone(), "fetch"); + let worker_2 = worker_loop(worker_2, shared_context.clone(), "hog"); + + let producer = tokio::spawn(producer); + let worker_1 = tokio::spawn(worker_1); + let worker_2 = tokio::spawn(worker_2); + + tokio::try_join!(producer, worker_1, worker_2).unwrap(); +} diff --git a/rust/cyclotron-core/src/error.rs b/rust/cyclotron-core/src/error.rs new file mode 100644 index 00000000000..1a95305f4fd --- /dev/null +++ b/rust/cyclotron-core/src/error.rs @@ -0,0 +1,17 @@ +use uuid::Uuid; + +#[derive(Debug, thiserror::Error)] +pub enum QueueError { + #[error("sqlx error: {0}")] + SqlxError(#[from] sqlx::Error), + #[error("Unknown job id: {0}")] + UnknownJobId(Uuid), // Happens when someone tries to update a job through a QueueManager that wasn't dequeue or was already flushed + #[error("Job {0} flushed without a new state, which would leave it in a running state forever (or until reaped)")] + FlushWithoutNextState(Uuid), + #[error("Invalid lock {0} used to update job {1}. This usually means a job has been reaped from under a worker - did you forget to set the heartbeat?")] + InvalidLock(Uuid, Uuid), + #[error("Shard over capacity {0} for this manager, insert aborted")] + ShardFull(u64), + #[error("Timed waiting for shard to have capacity")] + TimedOutWaitingForCapacity, +} diff --git a/rust/cyclotron-core/src/janitor_ops.rs b/rust/cyclotron-core/src/janitor_ops.rs new file mode 100644 index 00000000000..d2d1c947dd5 --- /dev/null +++ b/rust/cyclotron-core/src/janitor_ops.rs @@ -0,0 +1,94 @@ +use chrono::{Duration, Utc}; + +use crate::error::QueueError; + +// As a general rule, janitor operations are not queue specific (as in, they don't account for the +// queue name). We can revisit this later, if we decide we need the ability to do janitor operations +// on a per-queue basis. 
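Taken together, one janitor pass over this module might look like the following sketch (the stall timeout and touch limit are illustrative assumptions; the actual cyclotron-janitor binary supplies its own configuration):

async fn janitor_pass(pool: &sqlx::PgPool) -> Result<(), QueueError> {
    let completed = delete_completed_jobs(pool).await?;
    let failed = delete_failed_jobs(pool).await?;
    let stalled = reset_stalled_jobs(pool, Duration::seconds(30)).await?;
    let poisoned = delete_poison_pills(pool, Duration::seconds(30), 3).await?;
    println!(
        "janitor pass: removed {completed} completed and {failed} failed jobs, reset {stalled} stalled jobs, deleted {poisoned} poison pills"
    );
    Ok(())
}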
+pub async fn delete_completed_jobs<'c, E>(executor: E) -> Result +where + E: sqlx::Executor<'c, Database = sqlx::Postgres>, +{ + let result = sqlx::query!("DELETE FROM cyclotron_jobs WHERE state = 'completed'") + .execute(executor) + .await + .map_err(QueueError::from)?; + + Ok(result.rows_affected()) +} + +pub async fn delete_failed_jobs<'c, E>(executor: E) -> Result +where + E: sqlx::Executor<'c, Database = sqlx::Postgres>, +{ + let result = sqlx::query!("DELETE FROM cyclotron_jobs WHERE state = 'failed'") + .execute(executor) + .await + .map_err(QueueError::from)?; + + Ok(result.rows_affected()) +} + +// Jobs are considered stalled if their lock is held and their last_heartbeat is older than `timeout`. +// NOTE - because this runs on running jobs, it can stall workers trying to flush updates as it +// executes. I need to use some of the load generators alongside explain/analyze to optimise this (and +// the set of DB indexes) +// TODO - this /could/ return the lock_id's held, which might help with debugging (if workers reported +// the lock_id's they dequeue'd), but lets not do that right now. +pub async fn reset_stalled_jobs<'c, E>(executor: E, timeout: Duration) -> Result +where + E: sqlx::Executor<'c, Database = sqlx::Postgres>, +{ + let oldest_valid_heartbeat = Utc::now() - timeout; + let result = sqlx::query!(r#" +WITH stalled AS ( + SELECT id FROM cyclotron_jobs WHERE state = 'running' AND COALESCE(last_heartbeat, $1) <= $1 FOR UPDATE SKIP LOCKED +) +UPDATE cyclotron_jobs +SET state = 'available', lock_id = NULL, last_heartbeat = NULL, janitor_touch_count = janitor_touch_count + 1 +FROM stalled +WHERE cyclotron_jobs.id = stalled.id + "#, + oldest_valid_heartbeat + ) + .execute(executor) + .await + .map_err(QueueError::from)?; + + Ok(result.rows_affected()) +} + +// Poison pills are jobs whose lock is held and whose heartbeat is older than `timeout`, that have +// been returned to the queue by the janitor more than `max_janitor_touched` times. +// NOTE - this has the same performance caveat as reset_stalled_jobs +// TODO - This shoud, instead, move the job row to a dead letter table, for later investigation. Of course, +// rather than doing that, it could just put the job in a "dead letter" state, and no worker or janitor process +// will touch it... maybe the table moving isn't needed? but either way, being able to debug jobs that cause workers +// to stall would be good (and, thinking about it, moving it to a new table means we don't have to clear the lock, +// so have a potential way to trace back to the last worker that died holding the job) +pub async fn delete_poison_pills<'c, E>( + executor: E, + timeout: Duration, + max_janitor_touched: i16, +) -> Result +where + E: sqlx::Executor<'c, Database = sqlx::Postgres>, +{ + let oldest_valid_heartbeat = Utc::now() - timeout; + // NOTE - we don't check the lock_id here, because it probably doesn't matter (the lock_id should be set if the + // job state is "running"), but perhaps we should only delete jobs with a set lock_id, and report an error + // if we find a job with a state of "running" and no lock_id. 
Also, we delete jobs whose last_heartbeat is + // null, which again should never happen (dequeuing a job should always set the last_heartbeat), but for + // robustness sake we may as well handle it + let result = sqlx::query!( + r#" +DELETE FROM cyclotron_jobs WHERE state = 'running' AND COALESCE(last_heartbeat, $1) <= $1 AND janitor_touch_count >= $2 + "#, + oldest_valid_heartbeat, + max_janitor_touched + ).execute(executor) + .await + .map_err(QueueError::from)?; + + Ok(result.rows_affected()) +} diff --git a/rust/cyclotron-core/src/lib.rs b/rust/cyclotron-core/src/lib.rs new file mode 100644 index 00000000000..a62d69b304b --- /dev/null +++ b/rust/cyclotron-core/src/lib.rs @@ -0,0 +1,38 @@ +use std::time::Duration; + +use serde::{Deserialize, Serialize}; +use sqlx::{pool::PoolOptions, PgPool}; + +pub mod base_ops; +pub mod error; +pub mod janitor_ops; +pub mod manager; +pub mod worker; + +// A pool config object, designed to be passable across API boundaries +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct PoolConfig { + pub db_url: String, + pub max_connections: Option, // Default to 10 + pub min_connections: Option, // Default to 1 + pub acquire_timeout_seconds: Option, // Default to 30 + pub max_lifetime_seconds: Option, // Default to 300 + pub idle_timeout_seconds: Option, // Default to 60 +} + +impl PoolConfig { + pub async fn connect(&self) -> Result { + let builder = PoolOptions::new() + .max_connections(self.max_connections.unwrap_or(10)) + .min_connections(self.min_connections.unwrap_or(1)) + .max_lifetime(Duration::from_secs( + self.max_lifetime_seconds.unwrap_or(300), + )) + .idle_timeout(Duration::from_secs(self.idle_timeout_seconds.unwrap_or(60))) + .acquire_timeout(Duration::from_secs( + self.acquire_timeout_seconds.unwrap_or(30), + )); + + builder.connect(&self.db_url).await + } +} diff --git a/rust/cyclotron-core/src/manager.rs b/rust/cyclotron-core/src/manager.rs new file mode 100644 index 00000000000..2cafdee9c91 --- /dev/null +++ b/rust/cyclotron-core/src/manager.rs @@ -0,0 +1,262 @@ +use std::sync::atomic::AtomicUsize; + +use chrono::{DateTime, Duration, Utc}; +use serde::{Deserialize, Serialize}; +use sqlx::PgPool; +use tokio::sync::RwLock; + +use crate::{ + base_ops::{bulk_create_jobs, count_total_waiting_jobs, create_job, JobInit}, + error::QueueError, + PoolConfig, +}; + +pub const DEFAULT_QUEUE_DEPTH_LIMIT: u64 = 10_000; +pub const DEFAULT_SHARD_HEALTH_CHECK_INTERVAL: u64 = 10; + +// TODO - right now, a lot of this sharding stuff will be hollow, but later we'll add logic like +// e.g. routing work to alive shards if one is down, or reporting shard failure, etc. +// TODO - here's also where queue management commands will go, like "downgrade the priority of this function" +// or "pause jobs for this team", but we're going to add those ad-hoc as they're needed, not up front +#[derive(Debug, Serialize, Deserialize)] +pub struct ManagerConfig { + pub shards: Vec, + pub shard_depth_limit: Option, // Defaults to 10_000 available jobs per shard + pub shard_depth_check_interval_seconds: Option, // Defaults to 10 seconds - checking shard capacity +} + +pub struct Shard { + pub pool: PgPool, + pub last_healthy: RwLock>, + pub check_interval: Duration, + pub depth_limit: u64, +} + +pub struct QueueManager { + shards: RwLock>, + next_shard: AtomicUsize, +} + +// Bulk inserts across multiple shards can partially succeed, so we need to track failures +// and hand back failed job inits to the caller. 
+pub struct BulkInsertResult { + pub failures: Vec<(QueueError, Vec)>, +} + +impl QueueManager { + pub async fn new(config: ManagerConfig) -> Result { + let mut shards = vec![]; + let depth_limit = config + .shard_depth_limit + .unwrap_or(DEFAULT_QUEUE_DEPTH_LIMIT); + let check_interval = Duration::seconds( + config + .shard_depth_check_interval_seconds + .unwrap_or(DEFAULT_SHARD_HEALTH_CHECK_INTERVAL) as i64, + ); + for shard in config.shards { + let pool = shard.connect().await.unwrap(); + let shard = Shard::new(pool, depth_limit, check_interval); + shards.push(shard); + } + Ok(Self { + shards: RwLock::new(shards), + next_shard: AtomicUsize::new(0), + }) + } + + // Designed mostly to be used for testing, but safe enough to expose publicly + pub fn from_pool(pool: PgPool) -> Self { + Self { + shards: RwLock::new(vec![Shard::new( + pool, + DEFAULT_QUEUE_DEPTH_LIMIT, + Duration::seconds(DEFAULT_SHARD_HEALTH_CHECK_INTERVAL as i64), + )]), + next_shard: AtomicUsize::new(0), + } + } + + pub async fn create_job(&self, init: JobInit) -> Result<(), QueueError> { + // TODO - here is where a lot of shard health and failover logic will go, eventually. + let next = self + .next_shard + .fetch_add(1, std::sync::atomic::Ordering::Relaxed); + let shards = self.shards.read().await; + let shard = &shards[next % shards.len()]; + shard.create_job(init).await + } + + pub async fn create_job_blocking( + &self, + init: JobInit, + timeout: Option, + ) -> Result<(), QueueError> { + let next = self + .next_shard + .fetch_add(1, std::sync::atomic::Ordering::Relaxed); + let shards = self.shards.read().await; + let shard = &shards[next % shards.len()]; + shard.create_job_blocking(init, timeout).await + } + + pub async fn bulk_create_jobs(&self, inits: Vec) -> BulkInsertResult { + let shards = self.shards.read().await; + let chunk_size = inits.len() / shards.len(); + let mut result = BulkInsertResult::new(); + // TODO - at some point, we should dynamically re-acquire the lock each time, to allow + // for re-routing jobs away from a bad shard during a bulk insert, but right now, we + // don't even re-try inserts. Later work. + for chunk in inits.chunks(chunk_size) { + let next_shard = self + .next_shard + .fetch_add(1, std::sync::atomic::Ordering::Relaxed); + let shard = &shards[next_shard % shards.len()]; + let shard_result = shard.bulk_create_jobs(chunk).await; + if let Err(err) = shard_result { + result.add_failure(err, chunk.to_vec()); + } + } + + result + } + + pub async fn bulk_create_jobs_blocking( + &self, + inits: Vec, + timeout: Option, + ) -> BulkInsertResult { + let shards = self.shards.read().await; + let chunk_size = inits.len() / shards.len(); + let mut result = BulkInsertResult::new(); + for chunk in inits.chunks(chunk_size) { + let next_shard = self + .next_shard + .fetch_add(1, std::sync::atomic::Ordering::Relaxed); + let shard = &shards[next_shard % shards.len()]; + // TODO - we sequentially try each shard, but we could try to parallelize this. 
+ let shard_result = shard.bulk_create_jobs_blocking(chunk, timeout).await; + if let Err(err) = shard_result { + result.add_failure(err, chunk.to_vec()); + } + } + + result + } +} + +impl Shard { + pub fn new(pool: PgPool, depth_limit: u64, check_interval: Duration) -> Self { + Self { + pool, + last_healthy: RwLock::new(Utc::now() - check_interval), + check_interval, + depth_limit, + } + } + + // Inserts a job, failing if the shard is at capacity + pub async fn create_job(&self, init: JobInit) -> Result<(), QueueError> { + self.insert_guard().await?; + create_job(&self.pool, init).await + } + + // Inserts a vec of jobs, failing if the shard is at capacity. Note "capacity" here just + // means "it isn't totally full" - if there's "capacity" for 1 job, and this is a vec of + // 1000, we still insert all 1000. + pub async fn bulk_create_jobs(&self, inits: &[JobInit]) -> Result<(), QueueError> { + self.insert_guard().await?; + bulk_create_jobs(&self.pool, inits).await + } + + // Inserts a job, blocking until there's capacity (or until the timeout is reached) + pub async fn create_job_blocking( + &self, + init: JobInit, + timeout: Option, + ) -> Result<(), QueueError> { + let start = Utc::now(); + while self.is_full().await? { + tokio::time::sleep(Duration::milliseconds(100).to_std().unwrap()).await; + if let Some(timeout) = &timeout { + if Utc::now() - start > *timeout { + return Err(QueueError::TimedOutWaitingForCapacity); + } + } + } + + create_job(&self.pool, init).await + } + + pub async fn bulk_create_jobs_blocking( + &self, + inits: &[JobInit], + timeout: Option, + ) -> Result<(), QueueError> { + let start = Utc::now(); + while self.is_full().await? { + tokio::time::sleep(Duration::milliseconds(100).to_std().unwrap()).await; + if let Some(timeout) = &timeout { + if Utc::now() - start > *timeout { + return Err(QueueError::TimedOutWaitingForCapacity); + } + } + } + + bulk_create_jobs(&self.pool, inits).await + } + + pub async fn insert_guard(&self) -> Result<(), QueueError> { + if self.is_full().await? { + return Err(QueueError::ShardFull(self.depth_limit)); + } + + Ok(()) + } + + pub async fn is_full(&self) -> Result { + let last_healthy = self.last_healthy.read().await; + // If we were healthy less than the check interval ago, assume we are still + if Utc::now() - *last_healthy < self.check_interval { + return Ok(false); + } + + // Grab a write lock. This constrains the number of concurrent capacity checks + // to 1, purposefully - if someone spawns a thousand tasks to blockingly create + // a job, we don't want all of them to be querying the available count at once. + drop(last_healthy); + let mut last_healthy = self.last_healthy.write().await; + // TOCTOU - multiple tasks could be racing to re-do the check, and the firs time one + // succeeds all the rest should skip it. 
+ if Utc::now() - *last_healthy < self.check_interval { + return Ok(false); + } + + let pending = count_total_waiting_jobs(&self.pool).await?; + let is_full = pending >= self.depth_limit; + if !is_full { + *last_healthy = Utc::now(); + } + Ok(is_full) + } +} + +impl BulkInsertResult { + pub fn new() -> Self { + Self { failures: vec![] } + } + + pub fn add_failure(&mut self, err: QueueError, jobs: Vec) { + self.failures.push((err, jobs)); + } + + pub fn all_succeeded(&self) -> bool { + self.failures.is_empty() + } +} + +impl Default for BulkInsertResult { + fn default() -> Self { + Self::new() + } +} diff --git a/rust/cyclotron-core/src/worker.rs b/rust/cyclotron-core/src/worker.rs new file mode 100644 index 00000000000..431bd22f447 --- /dev/null +++ b/rust/cyclotron-core/src/worker.rs @@ -0,0 +1,229 @@ +use std::{collections::HashMap, sync::Arc}; + +use chrono::{DateTime, Utc}; +use sqlx::PgPool; +use std::sync::Mutex; +use uuid::Uuid; + +use crate::{ + base_ops::{ + dequeue_jobs, dequeue_with_vm_state, flush_job, set_heartbeat, Job, JobState, JobUpdate, + }, + error::QueueError, + PoolConfig, +}; + +// The worker's interface to the underlying queue system - a worker can do everything except +// create jobs (because job creation has to be shard-aware). +// +// This interface looks stange, because a lot of things that would normally be done with lifetimes +// and references are done with uuid's instead (and we lose some nice raii stuff as a result), but +// the reason for this is that this is designed to be embedded in other runtimes, where handing out +// lifetime'd references or things with drop impls isn't really practical. This makes it a little +// awkward to use, but since it's meant to be the core of other abstractions, I think it's ok for +// now (client libraries should wrap this to provide better interfaces). +pub struct Worker { + pool: PgPool, + // All dequeued job IDs that haven't been flushed yet. The idea is this lets us + // manage, on the rust side of any API boundary, the "pending" update of any given + // job, such that a user can progressively build up a full update, and then flush it, + // rather than having to track the update state on their side and submit it all at once + // TODO - we don't handle people "forgetting" to abort a job, because we expect that to + // only happen if a process dies (in which case the job queue janitor should handle + // it)... this is a memory leak, but I think it's ok. + // TRICKY - this is a sync mutex, because we never hold it across an await point, and that + // radically simplifies using this for FFI (because there's no message passing across runtimes) + pending: Arc>>, +} + +impl Worker { + pub async fn new(config: PoolConfig) -> Result { + let pool = config.connect().await?; + Ok(Self { + pool, + pending: Arc::new(Mutex::new(HashMap::new())), + }) + } + + pub fn from_pool(pool: PgPool) -> Self { + Self { + pool, + pending: Arc::new(Mutex::new(HashMap::new())), + } + } + + /// Dequeues jobs from the queue, and returns them. Job sorting happens at the queue level, + /// workers can't provide any filtering or sorting criteria - queue managers decide which jobs are run, + /// workers just run them. + pub async fn dequeue_jobs(&self, queue: &str, limit: usize) -> Result, QueueError> { + let jobs = dequeue_jobs(&self.pool, queue, limit).await?; + + let mut pending = self.pending.lock().unwrap(); + for job in &jobs { + // We need to hang onto the locks for a job until we flush it, so we can send updates. 
+ let update = JobUpdate::new( + job.lock_id + .expect("Yell at oliver that the dequeuing code is broken. He's very sorry that your process just panicked"), + ); + pending.insert(job.id, update); + } + + Ok(jobs) + } + + /// This is the same as dequeue_jobs, but it also returns the vm_state of the job + pub async fn dequeue_with_vm_state( + &self, + queue: &str, + limit: usize, + ) -> Result, QueueError> { + let jobs = dequeue_with_vm_state(&self.pool, queue, limit).await?; + + let mut pending = self.pending.lock().unwrap(); + for job in &jobs { + // We need to hang onto the locks for a job until we flush it, so we can send updates. + let update = JobUpdate::new( + job.lock_id + .expect("Yell at oliver that the dequeuing (with vm) code is broken. He's very sorry that your process just panicked"), + ); + pending.insert(job.id, update); + } + + Ok(jobs) + } + + /// NOTE - This function can only be called once, even though the underlying + /// basic operation can be performed as many times as the caller likes (so long as + /// the job state is never set to something other than running, as that clears the + /// job lock). We're more strict here (flushes can only happen once, you must + /// flush some non-running state) to try and enforce a good interaction + /// pattern with the queue. I might return to this and loosen this constraint in the + /// future, if there's a motivating case for needing to flush partial job updates. + pub async fn flush_job(&self, job_id: Uuid) -> Result<(), QueueError> { + // TODO - this drops the job from the known jobs before the flush succeeds, + // which means that if the flush fails, we'll lose the job and can never + // update it's state (leaving it to the reaper). This is a bug, but I'm not + // sure I want to make flushes retryable just yet, so I'm leaving it for now. + // NIT: this wrapping is to ensure pending is dropped prior to the await + let update = { + let mut pending = self.pending.lock().unwrap(); + let update = pending + .remove(&job_id) + .ok_or(QueueError::UnknownJobId(job_id))?; + // It's a programming error to flush a job without setting a new state + match update.state { + Some(JobState::Running) | None => { + // Keep track of any /other/ updates that might have been stored, even in this case, + // so a user can queue up the appropriate state transition and flush properly + pending.insert(job_id, update); + return Err(QueueError::FlushWithoutNextState(job_id)); + } + _ => update, + } + }; + let mut connection = self.pool.acquire().await?; + flush_job(connection.as_mut(), job_id, update).await + } + + /// Jobs are reaped after some seconds (the number is deployment specific, and may become + /// specific on job properties like queue name in the future, as we figure out what /kinds/ of + /// jobs are longer or shorter running). A job is considered "dead" if it's in a running state, + /// and it's last heartbeat was more than the reaping time ago. This, like flush, returns an + /// error if you try to set the heartbeat on a job whose lock you don't have (which can happen + /// if e.g. the job was reaped out from under you). + pub async fn heartbeat(&self, job_id: Uuid) -> Result<(), QueueError> { + let lock_id = { + let pending = self.pending.lock().unwrap(); + pending + .get(&job_id) + .ok_or(QueueError::UnknownJobId(job_id))? 
+ .lock_id + }; + let mut connection = self.pool.acquire().await?; + set_heartbeat(connection.as_mut(), job_id, lock_id).await + } + + /// This is how you "return" a job to the queue, by setting the state to "available" + pub fn set_state(&self, job_id: Uuid, state: JobState) -> Result<(), QueueError> { + let mut pending = self.pending.lock().unwrap(); + pending + .get_mut(&job_id) + .ok_or(QueueError::UnknownJobId(job_id))? + .state = Some(state); + Ok(()) + } + + pub fn set_queue(&self, job_id: Uuid, queue: &str) -> Result<(), QueueError> { + let mut pending = self.pending.lock().unwrap(); + pending + .get_mut(&job_id) + .ok_or(QueueError::UnknownJobId(job_id))? + .queue_name = Some(queue.to_string()); + Ok(()) + } + + /// Jobs are dequeued lowest-priority-first, so this is how you change the "base" priority of a job + /// (control tables may apply further deltas if e.g. a given function is in a degraded state) + pub fn set_priority(&self, job_id: Uuid, priority: i16) -> Result<(), QueueError> { + let mut pending = self.pending.lock().unwrap(); + pending + .get_mut(&job_id) + .ok_or(QueueError::UnknownJobId(job_id))? + .priority = Some(priority); + Ok(()) + } + + /// This is how you do e.g. retries after some time, by setting the scheduled time + /// to some time in the future. Sleeping, retry backoff, scheduling - it's all the same operation, + /// this one. + pub fn set_scheduled_at( + &self, + job_id: Uuid, + scheduled: DateTime, + ) -> Result<(), QueueError> { + let mut pending = self.pending.lock().unwrap(); + pending + .get_mut(&job_id) + .ok_or(QueueError::UnknownJobId(job_id))? + .scheduled = Some(scheduled); + Ok(()) + } + + /// Passing None here will clear the vm_state + pub fn set_vm_state( + &self, + job_id: Uuid, + vm_state: Option, // This (and the following) are Options, because the user can null them (by calling with None) + ) -> Result<(), QueueError> { + let mut pending = self.pending.lock().unwrap(); + pending + .get_mut(&job_id) + .ok_or(QueueError::UnknownJobId(job_id))? + .vm_state = Some(vm_state); + Ok(()) + } + + /// Passing None here will clear the metadata + pub fn set_metadata(&self, job_id: Uuid, metadata: Option) -> Result<(), QueueError> { + let mut pending = self.pending.lock().unwrap(); + pending + .get_mut(&job_id) + .ok_or(QueueError::UnknownJobId(job_id))? + .metadata = Some(metadata); + Ok(()) + } + + /// Passing None here will clear the parameters + pub fn set_parameters( + &self, + job_id: Uuid, + parameters: Option, + ) -> Result<(), QueueError> { + let mut pending = self.pending.lock().unwrap(); + pending + .get_mut(&job_id) + .ok_or(QueueError::UnknownJobId(job_id))? 
+ .parameters = Some(parameters); + Ok(()) + } +} diff --git a/rust/cyclotron-core/tests/base_ops.rs b/rust/cyclotron-core/tests/base_ops.rs new file mode 100644 index 00000000000..4b5684f6bea --- /dev/null +++ b/rust/cyclotron-core/tests/base_ops.rs @@ -0,0 +1,255 @@ +use std::sync::Arc; + +use chrono::{Duration, Utc}; +use common::{assert_job_matches_init, create_new_job, dates_match}; +use cyclotron_core::{ + base_ops::{bulk_create_jobs, JobState}, + manager::QueueManager, + worker::Worker, +}; +use sqlx::PgPool; +use uuid::Uuid; + +mod common; + +// I know this should be a bunch of tests, but for hacking together stuff right now, it'll do +#[sqlx::test(migrations = "./migrations")] +async fn test_queue(db: PgPool) { + let manager = QueueManager::from_pool(db.clone()); + let worker = Worker::from_pool(db); + + let job_1 = create_new_job(); + let mut job_2 = create_new_job(); + + job_2.priority = 2; // Lower priority jobs should be returned second + + let queue_name = job_1.queue_name.clone(); + + manager + .create_job(job_1.clone()) + .await + .expect("failed to create job"); + manager + .create_job(job_2.clone()) + .await + .expect("failed to create job"); + + let jobs = worker + .dequeue_jobs(&queue_name, 2) + .await + .expect("failed to dequeue job"); + + assert_eq!(jobs.len(), 2); + // This also assert that the ordering is correct in terms of priority + assert_job_matches_init(&jobs[0], &job_1); + assert_job_matches_init(&jobs[1], &job_2); + + // Now we can re-queue these jobs (imagine we had done work) + worker + .set_state(jobs[0].id, JobState::Available) + .expect("failed to set state"); + worker + .set_state(jobs[1].id, JobState::Available) + .expect("failed to set state"); + + // Flush the two jobs, having made no other changes, then assert we can re-dequeue them + worker + .flush_job(jobs[0].id) + .await + .expect("failed to flush job"); + worker + .flush_job(jobs[1].id) + .await + .expect("failed to flush job"); + + let jobs = worker + .dequeue_jobs(&queue_name, 2) + .await + .expect("failed to dequeue job"); + + assert_eq!(jobs.len(), 2); + assert_job_matches_init(&jobs[0], &job_1); + assert_job_matches_init(&jobs[1], &job_2); + + // Re-queue them again + worker + .set_state(jobs[0].id, JobState::Available) + .expect("failed to set state"); + worker + .set_state(jobs[1].id, JobState::Available) + .expect("failed to set state"); + + worker + .flush_job(jobs[0].id) + .await + .expect("failed to flush job"); + worker + .flush_job(jobs[1].id) + .await + .expect("failed to flush job"); + + // Spin up two tasks to race on dequeuing, and assert at most 2 jobs are dequeued + let worker = Arc::new(worker); + let moved = worker.clone(); + let queue_name_moved = queue_name.clone(); + let fut_1 = async move { + moved + .dequeue_jobs(&queue_name_moved, 2) + .await + .expect("failed to dequeue job") + }; + let moved = worker.clone(); + let queue_name_moved = queue_name.clone(); + let fut_2 = async move { + moved + .dequeue_jobs(&queue_name_moved, 2) + .await + .expect("failed to dequeue job") + }; + + let (jobs_1, jobs_2) = tokio::join!(fut_1, fut_2); + assert_eq!(jobs_1.len() + jobs_2.len(), 2); + + let jobs = jobs_1 + .into_iter() + .chain(jobs_2.into_iter()) + .collect::>(); + + // And now, any subsequent dequeues will return no jobs + let empty = worker + .dequeue_jobs(&queue_name, 2) + .await + .expect("failed to dequeue job"); + assert_eq!(empty.len(), 0); + + // If we try to flush a job without setting what it's next state will be (or if we set that next state to be "running"), + 
// we should get an error + worker + .flush_job(jobs[0].id) + .await + .expect_err("expected error due to no-next-state"); + + worker + .set_state(jobs[1].id, JobState::Running) + .expect("failed to set state"); + worker + .flush_job(jobs[1].id) + .await + .expect_err("expected error due to running state"); + + // But if we properly set the state to completed or failed, now we can flush + worker + .set_state(jobs[0].id, JobState::Completed) + .expect("failed to set state"); + worker + .set_state(jobs[1].id, JobState::Failed) + .expect("failed to set state"); + + worker + .flush_job(jobs[0].id) + .await + .expect("failed to flush job"); + worker + .flush_job(jobs[1].id) + .await + .expect("failed to flush job"); + + // And now, any subsequent dequeues will return no jobs (because these jobs are finished) + let empty = worker + .dequeue_jobs(&queue_name, 2) + .await + .expect("failed to dequeue job"); + assert_eq!(empty.len(), 0); + + // Now, lets check that we can set every variable on a job + + // Set up some initial values + let now = Utc::now(); + let mut job = create_new_job(); + job.queue_name = "test".to_string(); + job.priority = 0; + job.scheduled = now - Duration::minutes(2); + job.vm_state = None; + job.parameters = None; + job.metadata = None; + + // Queue the job + manager + .create_job(job.clone()) + .await + .expect("failed to create job"); + + // Then dequeue it + let job = worker + .dequeue_jobs("test", 1) + .await + .expect("failed to dequeue job") + .pop() + .expect("failed to dequeue job"); + + // Set everything we're able to set, including state to available, so we can dequeue it again + worker + .set_state(job.id, JobState::Available) + .expect("failed to set state"); + worker + .set_queue(job.id, "test_2") + .expect("failed to set queue"); + worker + .set_priority(job.id, 1) + .expect("failed to set priority"); + worker + .set_scheduled_at(job.id, now - Duration::minutes(10)) + .expect("failed to set scheduled_at"); + worker + .set_vm_state(job.id, Some("test".to_string())) + .expect("failed to set vm_state"); + worker + .set_parameters(job.id, Some("test".to_string())) + .expect("failed to set parameters"); + worker + .set_metadata(job.id, Some("test".to_string())) + .expect("failed to set metadata"); + + // Flush the job + worker.flush_job(job.id).await.expect("failed to flush job"); + + // Then dequeue it again (this time being sure to grab the vm state too) + let job = worker + .dequeue_with_vm_state("test_2", 1) + .await + .expect("failed to dequeue job") + .pop() + .expect("failed to dequeue job"); + + // And every value should be the updated one + assert_eq!(job.queue_name, "test_2"); + assert_eq!(job.priority, 1); + assert!(dates_match(&job.scheduled, &(now - Duration::minutes(10))),); + assert_eq!(job.vm_state, Some("test".to_string())); + assert_eq!(job.parameters, Some("test".to_string())); + assert_eq!(job.metadata, Some("test".to_string())); +} + +#[sqlx::test(migrations = "./migrations")] +pub async fn test_bulk_insert(db: PgPool) { + let worker = Worker::from_pool(db.clone()); + + let job_template = create_new_job(); + + let jobs = (0..1000) + .map(|_| { + let mut job = job_template.clone(); + job.function_id = Some(Uuid::now_v7()); + job + }) + .collect::>(); + + bulk_create_jobs(&db, &jobs).await.unwrap(); + + let dequeue_jobs = worker + .dequeue_jobs(&job_template.queue_name, 1000) + .await + .expect("failed to dequeue job"); + + assert_eq!(dequeue_jobs.len(), 1000); +} diff --git a/rust/cyclotron-core/tests/common.rs 
b/rust/cyclotron-core/tests/common.rs new file mode 100644 index 00000000000..0746e27590d --- /dev/null +++ b/rust/cyclotron-core/tests/common.rs @@ -0,0 +1,40 @@ +use chrono::{DateTime, Duration, Utc}; +use cyclotron_core::base_ops::{Job, JobInit}; +use uuid::Uuid; + +#[allow(dead_code)] +pub fn create_new_job() -> JobInit { + JobInit { + team_id: 1, + function_id: Some(Uuid::now_v7()), // Lets us uniquely identify jobs without having the Uuid + queue_name: "test".to_string(), + priority: 0, + scheduled: Utc::now() - Duration::minutes(1), + vm_state: None, + parameters: None, + metadata: None, + } +} + +#[allow(dead_code)] +pub fn dates_match(left: &DateTime, right: &DateTime) -> bool { + // Roundtripping a datetime to PG can cause sub-ms differences, so we need to check within a margin of error + // Seeing errors like this in CI: + // assertion `left == right` failed + // left: 2024-08-08T20:41:55.964936Z + // right: 2024-08-08T20:41:55.964936997Z + let diff = *left - *right; + diff.abs() < Duration::milliseconds(1) +} + +#[allow(dead_code)] +pub fn assert_job_matches_init(job: &Job, init: &JobInit) { + assert_eq!(job.team_id, init.team_id); + assert_eq!(job.function_id, init.function_id); + assert_eq!(job.queue_name, init.queue_name); + assert_eq!(job.priority, init.priority); + assert!(dates_match(&job.scheduled, &init.scheduled)); + assert_eq!(job.vm_state, init.vm_state); + assert_eq!(job.parameters, init.parameters); + assert_eq!(job.metadata, init.metadata); +} diff --git a/rust/cyclotron-core/tests/shard.rs b/rust/cyclotron-core/tests/shard.rs new file mode 100644 index 00000000000..cade4458162 --- /dev/null +++ b/rust/cyclotron-core/tests/shard.rs @@ -0,0 +1,68 @@ +use chrono::{Duration, Utc}; +use common::create_new_job; +use cyclotron_core::manager::Shard; +use sqlx::PgPool; +use tokio::sync::RwLock; + +mod common; + +pub fn get_shard(db: PgPool) -> Shard { + Shard { + pool: db, + last_healthy: RwLock::new(Utc::now()), + check_interval: Duration::milliseconds(0), // We always want to check the limit, for these tests + depth_limit: 10, + } +} + +#[sqlx::test(migrations = "./migrations")] +pub async fn test_shard_limiting(db: PgPool) { + let shard = get_shard(db.clone()); + + // We should be able to insert 10 jobs + for _ in 0..10 { + shard.create_job(create_new_job()).await.unwrap(); + } + + // And then we should fail on the 11th + let result = shard.create_job(create_new_job()).await; + assert!(result.is_err()); +} + +#[sqlx::test(migrations = "./migrations")] +pub async fn test_shard_blocking_insert_waits(db: PgPool) { + let shard = get_shard(db.clone()); + + // We should be able to insert 10 jobs + for _ in 0..10 { + shard.create_job(create_new_job()).await.unwrap(); + } + + let timeout = Some(Duration::milliseconds(50)); + + let start = Utc::now(); + // And then we should fail on the 11th + let result = shard.create_job_blocking(create_new_job(), timeout).await; + assert!(result.is_err()); + + // We should have waited at least 50ms + assert!(Utc::now() - start >= Duration::milliseconds(50)); +} + +#[sqlx::test(migrations = "./migrations")] +pub async fn test_shard_allows_bulk_inserts_beyond_capacity(db: PgPool) { + let shard = get_shard(db.clone()); + + // We should be able to insert 10 jobs + for _ in 0..9 { + shard.create_job(create_new_job()).await.unwrap(); + } + + // And then we should be able to bulk insert 1000 + let inits = (0..1000).map(|_| create_new_job()).collect::>(); + shard.bulk_create_jobs(&inits).await.unwrap(); + + // And the next insert should fail + let 
result = shard.create_job(create_new_job()).await; + assert!(result.is_err()); +} diff --git a/rust/cyclotron-fetch/Cargo.toml b/rust/cyclotron-fetch/Cargo.toml new file mode 100644 index 00000000000..d29188e9bef --- /dev/null +++ b/rust/cyclotron-fetch/Cargo.toml @@ -0,0 +1,32 @@ +[package] +name = "cyclotron-fetch" +version = "0.1.0" +edition = "2021" + +[lints] +workspace = true + +[dependencies] +tracing-subscriber = { workspace = true } +chrono = { workspace = true } +tokio = { workspace = true } +tracing = { workspace = true } +uuid = { workspace = true } +envconfig = { workspace = true } +axum = { workspace = true } +thiserror = { workspace = true } +metrics = { workspace = true } +cyclotron-core = { path = "../cyclotron-core" } +common-metrics = { path = "../common/metrics" } +common-dns = { path = "../common/dns" } +health = { path = "../common/health" } +reqwest = { workspace = true } +serde = { workspace = true } +serde_json = { workspace = true } +http = { workspace = true } +rand = { workspace = true } +futures = { workspace = true } + +[dev-dependencies] +sqlx = { workspace = true } +httpmock = { workspace = true } \ No newline at end of file diff --git a/rust/cyclotron-fetch/src/config.rs b/rust/cyclotron-fetch/src/config.rs new file mode 100644 index 00000000000..4f981359823 --- /dev/null +++ b/rust/cyclotron-fetch/src/config.rs @@ -0,0 +1,104 @@ +use chrono::Duration; +use cyclotron_core::PoolConfig; +use envconfig::Envconfig; +use uuid::Uuid; + +#[derive(Envconfig)] +pub struct Config { + #[envconfig(from = "BIND_HOST", default = "::")] + pub host: String, + + #[envconfig(from = "BIND_PORT", default = "3304")] + pub port: u16, + + #[envconfig(default = "postgres://posthog:posthog@localhost:5432/cyclotron")] + pub database_url: String, + + #[envconfig(default = "10")] + pub pg_max_connections: u32, + + #[envconfig(default = "1")] + pub pg_min_connections: u32, + + #[envconfig(default = "30")] + pub pg_acquire_timeout_seconds: u64, + + #[envconfig(default = "300")] + pub pg_max_lifetime_seconds: u64, + + #[envconfig(default = "60")] + pub pg_idle_timeout_seconds: u64, + + #[envconfig(default = "false")] + pub allow_internal_ips: bool, + + pub worker_id: Option, // Default to a UUID + pub job_poll_interval_seconds: Option, // Defaults to 1 + pub concurrent_requests_limit: Option, // Defaults to 1000 + pub fetch_timeout_seconds: Option, // Defaults to 30 + pub max_retry_attempts: Option, // Defaults to 10 + pub queue_served: Option, // Default to "fetch" + pub batch_size: Option, // Defaults to 1000 + pub max_response_bytes: Option, // Defaults to 1MB + pub retry_backoff_base_ms: Option, // Defaults to 4000 +} + +// I do this instead of using envconfig's defaults because +// envconfig doesn't support defaults provided by functions, +// which is frustrating when I want to use UUIDs, and if I'm +// going to break out one field, I might as well break out +// everything into "AppConfig" and "PoolConfig" +#[derive(Debug, Clone)] +pub struct AppConfig { + pub host: String, + pub port: u16, + pub worker_id: String, + pub job_poll_interval: Duration, // How long we wait to poll for new jobs, when we're at capacity or find no new jobs + pub concurrent_requests_limit: u32, + pub fetch_timeout: Duration, + pub max_retry_attempts: u32, + pub queue_served: String, + pub batch_size: usize, + pub max_response_bytes: usize, + pub retry_backoff_base: Duration, // Job retry backoff times are this * attempt count + pub allow_internal_ips: bool, +} + +impl Config { + pub fn to_components(self) 
-> (AppConfig, PoolConfig) { + let worker_id = self.worker_id.unwrap_or_else(|| Uuid::now_v7().to_string()); + let job_poll_interval_seconds = self.job_poll_interval_seconds.unwrap_or(1); + let concurrent_requests_limit = self.concurrent_requests_limit.unwrap_or(1000); + let fetch_timeout_seconds = self.fetch_timeout_seconds.unwrap_or(30); + let max_retry_attempts = self.max_retry_attempts.unwrap_or(10); + let queue_served = self.queue_served.unwrap_or_else(|| "fetch".to_string()); + + let app_config = AppConfig { + host: self.host, + port: self.port, + worker_id, + job_poll_interval: Duration::seconds(job_poll_interval_seconds as i64), + concurrent_requests_limit, + fetch_timeout: Duration::seconds(fetch_timeout_seconds as i64), + max_retry_attempts, + queue_served, + batch_size: self.batch_size.unwrap_or(1000), + max_response_bytes: self.max_response_bytes.unwrap_or(1024 * 1024), + retry_backoff_base: Duration::milliseconds( + self.retry_backoff_base_ms.unwrap_or(4000) as i64 + ), + allow_internal_ips: self.allow_internal_ips, + }; + + let pool_config = PoolConfig { + db_url: self.database_url, + max_connections: Some(self.pg_max_connections), + min_connections: Some(self.pg_min_connections), + acquire_timeout_seconds: Some(self.pg_acquire_timeout_seconds), + max_lifetime_seconds: Some(self.pg_max_lifetime_seconds), + idle_timeout_seconds: Some(self.pg_idle_timeout_seconds), + }; + + (app_config, pool_config) + } +} diff --git a/rust/cyclotron-fetch/src/context.rs b/rust/cyclotron-fetch/src/context.rs new file mode 100644 index 00000000000..36bb64678c1 --- /dev/null +++ b/rust/cyclotron-fetch/src/context.rs @@ -0,0 +1,55 @@ +use std::sync::Arc; + +use cyclotron_core::{worker::Worker, PoolConfig}; +use health::HealthHandle; +use tokio::sync::Semaphore; + +use crate::{config::AppConfig, fetch::FetchError}; + +pub struct AppContext { + pub worker: Worker, + pub client: reqwest::Client, + pub concurrency_limit: Arc, + pub liveness: HealthHandle, + pub config: AppConfig, +} + +impl AppContext { + pub async fn create( + config: AppConfig, + pool_config: PoolConfig, + liveness: HealthHandle, + ) -> Result { + let concurrency_limit = Arc::new(Semaphore::new(config.concurrent_requests_limit as usize)); + + let resolver = Arc::new(common_dns::PublicIPv4Resolver {}); + + let mut client = reqwest::Client::builder().timeout(config.fetch_timeout.to_std().unwrap()); + + if !config.allow_internal_ips { + client = client.dns_resolver(resolver); + } + + let client = client.build(); + + let client = match client { + Ok(c) => c, + Err(e) => { + return Err(FetchError::StartupError(format!( + "Failed to create reqwest client: {}", + e + ))); + } + }; + + let worker = Worker::new(pool_config).await?; + + Ok(Self { + worker, + client, + concurrency_limit, + liveness, + config, + }) + } +} diff --git a/rust/cyclotron-fetch/src/fetch.rs b/rust/cyclotron-fetch/src/fetch.rs new file mode 100644 index 00000000000..3245c8221c7 --- /dev/null +++ b/rust/cyclotron-fetch/src/fetch.rs @@ -0,0 +1,653 @@ +use std::{cmp::min, collections::HashMap, sync::Arc}; + +use chrono::{DateTime, Duration, Utc}; +use cyclotron_core::{ + base_ops::{Job, JobState}, + error::QueueError, + worker::Worker, +}; +use futures::StreamExt; +use http::StatusCode; +use reqwest::Response; +use serde::{Deserialize, Serialize}; +use thiserror::Error; +use tokio::sync::OwnedSemaphorePermit; +use tracing::error; + +use crate::context::AppContext; + +// TODO - a lot of these should maybe be configurable +pub const DEAD_LETTER_QUEUE: &str = 
"fetch-dead-letter"; +pub const DEFAULT_RETRIES: u32 = 3; +pub const DEFAULT_ON_FINISH: OnFinish = OnFinish::Return; +pub const HEARTBEAT_INTERVAL_MS: i64 = 5000; + +// Exclusively for errors in the worker - these will +// never be serialised into the job queue, and indicate +// bad worker health. As a general rule, if one of these +// is produced, we should let the worker fall over (as in, +// the outer worker loop should exit). +#[derive(Error, Debug)] +pub enum FetchError { + #[error("timeout fetching jobs")] + JobFetchTimeout, + #[error(transparent)] + QueueError(#[from] QueueError), + // TRICKY - in most cases, serde errors are a FetchError (something coming from the queue was + // invalid), but this is used in cases where /we/ fail to serialise something /to/ the queue + #[error(transparent)] + SerdeError(#[from] serde_json::Error), + // We failed doing some kind of setup, like creating a reqwest client + #[error("error during startup: {0}")] + StartupError(String), +} + +#[derive(Debug, Serialize, Deserialize, Clone, Copy)] +#[serde(rename_all = "UPPERCASE")] +pub enum HttpMethod { + Get, + Post, + Patch, + Put, + Delete, +} + +impl From<&HttpMethod> for http::Method { + fn from(method: &HttpMethod) -> Self { + match method { + HttpMethod::Get => http::Method::GET, + HttpMethod::Post => http::Method::POST, + HttpMethod::Patch => http::Method::PATCH, + HttpMethod::Put => http::Method::PUT, + HttpMethod::Delete => http::Method::DELETE, + } + } +} + +// What does someone need to give us to execute a fetch? +#[derive(Debug, Serialize, Deserialize, Clone)] +#[serde(rename_all = "lowercase")] +pub struct FetchParameters { + pub url: String, + pub method: HttpMethod, + pub return_queue: String, + pub headers: Option>, + pub body: Option, + pub max_tries: Option, // Defaults to 3 + pub on_finish: Option, // Defaults to Return +} + +// What should we do when we get a result, or run out of tries for a given job? +// Return means re-queue to the return_worker, Complete means mark as Completed/Failed +#[derive(Debug, Serialize, Deserialize, Clone, Copy)] +#[serde(rename_all = "lowercase")] +pub enum OnFinish { + Return, + Complete, +} + +// Internal bookkeeping for a fetch job +#[derive(Debug, Serialize, Deserialize, Clone)] +#[serde(rename_all = "lowercase")] +pub struct FetchMetadata { + tries: u32, + // The history of failures seen with this job + trace: Vec, +} + +// This is what we put in the parameters of the job queue for the next +// worker to pick up +#[derive(Debug, Serialize, Deserialize, Clone)] +#[serde(tag = "status", rename_all = "lowercase")] +pub enum FetchResult { + Success { response: FetchResponse }, + Failure { trace: Vec }, // If we failed entirely to fetch the job, we return the trace for user debugging +} + +impl FetchResult { + pub fn is_success(&self) -> bool { + matches!(self, FetchResult::Success { .. }) + } +} + +// We distinguish between a "fetch failure" and a "worker failure" - +// worker failures are internal-only, and do not count against the +// retries of a job (generally, on worker failure, the job is either +// moved to the dead letter queue, or dropped and left to the janitor to +// reset). Feture failures are, after retries, returned to the queue, and +// represent the result of the fetch operation. 
+#[derive(Debug, Serialize, Deserialize, Clone)] +#[serde(rename_all = "lowercase")] +pub struct FetchFailure { + pub kind: FetchFailureKind, + pub message: String, + pub body: Option, // If we have a body, we include it in the failure + pub headers: Option>, // If we have headers, we include them in the failure + pub status: Option, // If we have a status, we include it in the failure + pub timestamp: DateTime, // Useful for users to correlate logs when debugging +} + +impl FetchFailure { + pub fn new(kind: FetchFailureKind, message: impl AsRef) -> Self { + Self { + kind, + message: message.as_ref().to_string(), + timestamp: Utc::now(), + body: None, + headers: None, + status: None, + } + } + + pub fn failure_status(status: StatusCode) -> Self { + Self { + kind: FetchFailureKind::FailureStatus, + message: format!("Received failure status: {}", status), + timestamp: Utc::now(), + body: None, + headers: None, + status: Some(status.as_u16()), + } + } + + pub fn with_body(self, body: String) -> Self { + Self { + body: Some(body), + ..self + } + } + + pub fn with_headers(self, headers: HashMap) -> Self { + Self { + headers: Some(headers), + ..self + } + } + + pub fn with_status(self, status: u16) -> Self { + Self { + status: Some(status), + ..self + } + } +} + +impl From for FetchFailure { + fn from(e: reqwest::Error) -> Self { + let kind = if e.is_timeout() { + FetchFailureKind::Timeout + } else { + FetchFailureKind::RequestError + }; + Self { + kind, + message: e.to_string(), + timestamp: Utc::now(), + body: None, + headers: None, + status: None, + } + } +} + +#[derive(Debug, Serialize, Deserialize, Clone, Copy)] +#[serde(rename_all = "lowercase")] +pub enum FetchFailureKind { + Timeout, + TimeoutGettingBody, + MissingParameters, + InvalidParameters, + RequestError, + FailureStatus, + InvalidBody, // Generally means the body could not be parsed toa utf8 string + ResponseTooLarge, +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +#[serde(rename_all = "lowercase")] +pub struct FetchResponse { + pub status: u16, + pub headers: HashMap, + pub body: String, +} + +pub fn report_worker_saturation(context: &AppContext) { + metrics::gauge!("fetch_worker_available_permits") + .set(context.concurrency_limit.available_permits() as f64); +} + +pub async fn tick(context: Arc) -> Result { + report_worker_saturation(&context); + + let max_jobs = min( + context.concurrency_limit.available_permits(), + context.config.batch_size, + ); + + let jobs = context + .worker + .dequeue_jobs(&context.config.queue_served, max_jobs) + .await?; + + let num_jobs = jobs.len(); + + for job in jobs { + let context = context.clone(); + // We grab job permits individually, so that as soon as a job is finished, the + // permit to run another job is immediately available. This call should + // never block, since we only ever dequeue as many jobs as we have permits + // available. + let permit = context + .concurrency_limit + .clone() + .acquire_owned() + .await + .unwrap(); + tokio::spawn(async move { + // TODO - since worker errors are never an indication of a fetch failure, + // only of some internal worker issue, we should report unhealthy or fall + // over or something here. 
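            // The permit acquired above is moved into run_job and held for the whole
            // request; once run_job returns (or the spawned future is dropped), the
            // OwnedSemaphorePermit is dropped and the slot is released back to the
            // semaphore, so capacity frees up as soon as each job finishes.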
+ if let Err(e) = run_job(context.clone(), job, permit).await { + error!("Error running job: {:?}", e); + } + }); + } + + Ok(num_jobs) +} + +// Mostly a thin wrapper to make ser/de a bit easier +struct FetchJob<'a> { + _job: &'a Job, + metadata: FetchMetadata, + parameters: FetchParameters, +} + +impl<'a> TryFrom<&'a Job> for FetchJob<'a> { + type Error = FetchFailure; + + fn try_from(job: &'a Job) -> Result { + let Some(parameters) = &job.parameters else { + return Err(FetchFailure::new( + FetchFailureKind::MissingParameters, + "Job is missing parameters", + )); + }; + let parameters: FetchParameters = match serde_json::from_str(parameters) { + Ok(p) => p, + Err(e) => { + return Err(FetchFailure::new( + FetchFailureKind::InvalidParameters, + format!("Failed to parse parameters: {}", e), + )) + } + }; + let metadata = match &job.metadata { + Some(m) => match serde_json::from_str(m) { + Ok(m) => m, + Err(_) => { + // If we can't decode the metadata, assume this is the first time we've seen the job + // TODO - this is maybe too lenient, I'm not sure. + FetchMetadata { + tries: 0, + trace: vec![], + } + } + }, + None => FetchMetadata { + tries: 0, + trace: vec![], + }, + }; + Ok(Self { + _job: job, + metadata, + parameters, + }) + } +} + +pub async fn run_job( + context: Arc, + job: Job, + _permit: OwnedSemaphorePermit, +) -> Result<(), FetchError> { + let parsed: FetchJob = match (&job).try_into() { + Ok(p) => p, + Err(e) => return dead_letter_job(&context.worker, job, vec![e]).await, + }; + + let method: http::Method = (&parsed.parameters.method).into(); + + // Parsing errors are always dead letters - it /will/ fail every time, so dump it + // TODO - We should probably decide whether to dead letter or return Failed on the basis of OnFinish, + // in case the caller wants to do any cleanup on broken jobs + let url: reqwest::Url = match (parsed.parameters.url).parse() { + Ok(u) => u, + Err(e) => { + return dead_letter_job( + &context.worker, + job, + vec![FetchFailure::new( + FetchFailureKind::InvalidParameters, + format!("Invalid url: {}", e), + )], + ) + .await; + } + }; + let headers: reqwest::header::HeaderMap = + match (&parsed.parameters.headers.unwrap_or_default()).try_into() { + Ok(h) => h, + Err(e) => { + return dead_letter_job( + &context.worker, + job, + vec![FetchFailure::new( + FetchFailureKind::InvalidParameters, + format!("Invalid headers: {}", e), + )], + ) + .await; + } + }; + + let body = reqwest::Body::from(parsed.parameters.body.unwrap_or_default()); + + let send_fut = context + .client + .request(method, url) + .headers(headers) + .body(body) + .send(); + + let mut send_fut = Box::pin(send_fut); + + let start = Utc::now(); + let res = loop { + tokio::select! 
{ + res = &mut send_fut => { + break res + } + _ = tokio::time::sleep(Duration::milliseconds(HEARTBEAT_INTERVAL_MS).to_std().unwrap()) => { + context.worker.heartbeat(job.id).await?; + } + } + }; + + // If we took, say, 25% of the heartbeat interval to send the request, we may as well heartbeat now + if Utc::now() - start > Duration::milliseconds(HEARTBEAT_INTERVAL_MS / 4) { + context.worker.heartbeat(job.id).await?; + } + + let res = match res { + Ok(r) => r, + Err(e) => { + return handle_fetch_failure( + &context, + &job, + &parsed.metadata, + parsed.parameters.max_tries.unwrap_or(DEFAULT_RETRIES), + parsed.parameters.return_queue, + parsed.parameters.on_finish.unwrap_or(DEFAULT_ON_FINISH), + e, + ) + .await + } + }; + + // Grab the response metadata, since getting the body moves it + let status = res.status(); + let headers: HashMap = res + .headers() + .iter() + .map(|(k, v)| { + ( + k.as_str().to_string(), + v.to_str().unwrap_or_default().to_string(), + ) + }) + .collect(); + + // We pre-emptively get the response body, because we incldued it in the failure trace, even if we got a failure status + let body = first_n_bytes_of_response( + &context.worker, + &job, + res, + context.config.max_response_bytes, + ) + .await?; + let body = match body { + Ok(b) => b, + Err(e) => { + // Tag the status and headers onto the failure + let e = e.with_status(status.as_u16()).with_headers(headers); + return handle_fetch_failure( + &context, + &job, + &parsed.metadata, + parsed.parameters.max_tries.unwrap_or(DEFAULT_RETRIES), + parsed.parameters.return_queue, + parsed.parameters.on_finish.unwrap_or(DEFAULT_ON_FINISH), + e, + ) + .await; + } + }; + + // TODO - we should handle "retryable" and "permanent" failures differently, mostly + // to be polite - retrying a permanent failure isn't a correctness problem, but it's + // rude (and inefficient) + if !status.is_success() { + let failure = FetchFailure::failure_status(status) + .with_body(body) + .with_headers(headers); + return handle_fetch_failure( + &context, + &job, + &parsed.metadata, + parsed.parameters.max_tries.unwrap_or(DEFAULT_RETRIES), + parsed.parameters.return_queue, + parsed.parameters.on_finish.unwrap_or(DEFAULT_ON_FINISH), + failure, + ) + .await; + } + + let result = FetchResult::Success { + response: FetchResponse { + status: status.as_u16(), + headers, + body, + }, + }; + + complete_job( + &context.worker, + &job, + parsed.parameters.return_queue, + parsed.parameters.on_finish.unwrap_or(DEFAULT_ON_FINISH), + result, + ) + .await +} + +// Checks if the retry limit has been reached, and does one of: +// - Schedule the job for retry, doing metadata bookkeeping +// - Complete the job, with the failure trace +#[allow(clippy::too_many_arguments)] +pub async fn handle_fetch_failure( + context: &AppContext, + job: &Job, + metadata: &FetchMetadata, + max_tries: u32, + return_queue: String, + on_finish: OnFinish, + failure: F, +) -> Result<(), FetchError> +where + F: Into, +{ + let failure = failure.into(); + let mut metadata = metadata.clone(); + metadata.tries += 1; + metadata.trace.push(failure); + + // TODO - right now we treat all failures as retryable, but we should probably be more aggressive in + // culling retries for permanent failures (this is less of a correctness issue and more of an efficiency/ + // politeness one). We might also want to make backoff configurable. 
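    // As a worked example of the backoff below: with the default retry_backoff_base of
    // 4000ms (see config.rs) and no caller override, a job whose 2nd attempt has just
    // failed has tries == 2, so it is rescheduled roughly 8s out plus 0-29s of jitter,
    // with the delay always capped at one hour. Retries stop once tries reaches
    // min(max_tries, max_retry_attempts).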
+ if metadata.tries < min(max_tries, context.config.max_retry_attempts) { + let next_available = + Utc::now() + (context.config.retry_backoff_base * (metadata.tries as i32)); + // We back off for at most an hour (since callers can configure max retries to be very high) + let next_available = min(next_available, Utc::now() + Duration::hours(1)); + // Add some seconds of jitter + let next_available = + next_available + Duration::seconds((rand::random::() % 30) as i64); + + // Set us up for a retry - update metadata, reschedule, and put back in the queue we pulled from + context + .worker + .set_metadata(job.id, Some(serde_json::to_string(&metadata)?))?; + context.worker.set_state(job.id, JobState::Available)?; + context.worker.set_queue(job.id, &job.queue_name)?; + context.worker.set_scheduled_at(job.id, next_available)?; + + // We downgrade the priority of jobs that fail, so first attempts at jobs get better QoS + context.worker.set_priority(job.id, job.priority + 1)?; + + context.worker.flush_job(job.id).await?; + } else { + // Complete the job, with a Failed result + let result = FetchResult::Failure { + trace: metadata.trace.clone(), + }; + complete_job(&context.worker, job, return_queue, on_finish, result).await?; + } + + Ok(()) +} + +// Complete the job, either because we got a good response, or because the jobs retries +// have been exceeded. +pub async fn complete_job( + worker: &Worker, + job: &Job, + return_queue: String, + on_finish: OnFinish, + result: FetchResult, +) -> Result<(), FetchError> { + // If we fail any serde, we just want to flush to the DLQ and bail + worker.set_state(job.id, JobState::Available)?; + worker.set_queue(job.id, DEAD_LETTER_QUEUE)?; + + let is_success = result.is_success(); + + let result = match serde_json::to_string(&result) { + Ok(r) => r, + Err(e) => { + // Leave behind a hint for debugging + worker.set_metadata(job.id, Some(format!("Failed to serialise result: {}", e)))?; + worker.flush_job(job.id).await?; + return Err(FetchError::SerdeError(e)); + } + }; + + worker.set_queue(job.id, &return_queue)?; + + match (is_success, on_finish) { + (true, _) | (false, OnFinish::Return) => { + worker.set_state(job.id, JobState::Available)?; + } + (false, OnFinish::Complete) => { + worker.set_state(job.id, JobState::Failed)?; + } + } + + worker.set_parameters(job.id, Some(result))?; + worker.set_metadata(job.id, None)?; // We're finished with the job, so clear our internal state + worker.flush_job(job.id).await?; + + Ok(()) +} + +// This moves the job to a dead letter queue, and sets the state to Available (to prevent it +// from being deleted by the janitor). This is for debugging purposes, and only really jobs +// that have some parsing failure on dequeue end up here (as they indicate a programming error +// in the caller, or the worker) +pub async fn dead_letter_job( + worker: &Worker, + job: Job, + errors: Vec, +) -> Result<(), FetchError> { + worker.set_state(job.id, JobState::Available)?; + worker.set_queue(job.id, DEAD_LETTER_QUEUE)?; + + let result = FetchResult::Failure { trace: errors }; + let result = match serde_json::to_string(&result) { + Ok(r) => r, + Err(e) => { + worker.set_metadata( + job.id, + Some(format!( + "Failed to serialise result during DLQ write: {}", + e + )), + )?; + worker.flush_job(job.id).await?; + return Err(FetchError::SerdeError(e)); + } + }; + + worker.set_parameters(job.id, Some(result))?; + + worker.flush_job(job.id).await?; + + Ok(()) +} + +// Pulls the body, while maintaining the job heartbeat. 
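// The nested return type below separates the two failure domains used in this file:
// the outer error is a worker-level FetchError (e.g. a heartbeat/queue failure, which
// indicates bad worker health), while an inner Err is a FetchFailure (stream error,
// response too large, non-utf8 body) that is charged against the job's retries via
// handle_fetch_failure rather than treated as a worker problem.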
+pub async fn first_n_bytes_of_response( + worker: &Worker, + job: &Job, + response: Response, + n: usize, +) -> Result, FetchError> { + let mut body = response.bytes_stream(); + // We deserialize into a vec, and then parse to a string + let mut buffer = Vec::with_capacity(n); + + worker.heartbeat(job.id).await?; + + loop { + tokio::select! { + chunk = body.next() => { + let chunk = match chunk { + Some(Ok(c)) => c, + Some(Err(e)) => return Ok(Err(FetchFailure::from(e))), + None => break, + }; + + buffer.extend_from_slice(&chunk); + + if buffer.len() >= n { + return Ok(Err( + FetchFailure::new(FetchFailureKind::ResponseTooLarge, "Response too large") + )); + }; + } + _ = tokio::time::sleep(Duration::milliseconds(HEARTBEAT_INTERVAL_MS).to_std().unwrap()) => {} + } + // Heartbeat every time we get a new body chunk, or every HEARTBEAT_INTERVAL_MS + worker.heartbeat(job.id).await?; + } + + let Ok(body) = String::from_utf8(buffer) else { + return Ok(Err(FetchFailure::new( + FetchFailureKind::InvalidBody, + "Body could not be parsed as utf8", + ))); + }; + + Ok(Ok(body)) +} diff --git a/rust/cyclotron-fetch/src/lib.rs b/rust/cyclotron-fetch/src/lib.rs new file mode 100644 index 00000000000..a2752ee2141 --- /dev/null +++ b/rust/cyclotron-fetch/src/lib.rs @@ -0,0 +1,3 @@ +pub mod config; +pub mod context; +pub mod fetch; diff --git a/rust/cyclotron-fetch/src/main.rs b/rust/cyclotron-fetch/src/main.rs new file mode 100644 index 00000000000..7e32ca29293 --- /dev/null +++ b/rust/cyclotron-fetch/src/main.rs @@ -0,0 +1,98 @@ +use axum::{extract::State, routing::get, Router}; +use common_metrics::setup_metrics_routes; +use cyclotron_fetch::{ + config::Config, + context::AppContext, + fetch::{tick, FetchError}, +}; +use envconfig::Envconfig; +use health::HealthRegistry; +use std::{future::ready, sync::Arc}; +use tracing::{error, info}; + +async fn listen(app: Router, bind: String) -> Result<(), std::io::Error> { + let listener = tokio::net::TcpListener::bind(bind).await?; + + axum::serve(listener, app).await?; + + Ok(()) +} + +// For axums state stuff +#[derive(Clone)] +struct WorkerId(pub String); + +pub fn app(liveness: HealthRegistry, worker_id: String) -> Router { + Router::new() + .route("/", get(index)) + .route("/_readiness", get(index)) + .route("/_liveness", get(move || ready(liveness.get_status()))) + .with_state(WorkerId(worker_id)) +} + +async fn index(State(worker_id): State) -> String { + format!("cyclotron janitor {}", worker_id.0) +} + +async fn worker_loop(context: AppContext) -> Result<(), FetchError> { + let context = Arc::new(context); + loop { + context.liveness.report_healthy().await; + let started = tick(context.clone()).await?; + info!("started {} jobs", started); + // This will happen if 1) there are no jobs or 2) we have no capacity to start new jobs. 
Either way, we should sleep for a bit + if started == 0 { + tokio::time::sleep(context.config.job_poll_interval.to_std().unwrap()).await; + } + } +} + +#[tokio::main] +async fn main() { + let config = Config::init_from_env().expect("failed to load configuration from env"); + tracing_subscriber::fmt::init(); + + let liveness = HealthRegistry::new("liveness"); + + let (app_config, pool_config) = config.to_components(); + let bind = format!("{}:{}", app_config.host, app_config.port); + + info!( + "Fetch worker starting with ID {:?}, listening at {}", + app_config.worker_id, bind + ); + + let worker_liveness = liveness + .register( + "worker".to_string(), + (app_config.job_poll_interval * 4).to_std().unwrap(), + ) + .await; + + let app = setup_metrics_routes(app(liveness, app_config.worker_id.clone())); + + let context = AppContext::create(app_config, pool_config, worker_liveness) + .await + .expect("failed to create app context"); + + let http_server = tokio::spawn(listen(app, bind)); + + let worker_loop = tokio::spawn(worker_loop(context)); + + tokio::select! { + res = worker_loop => { + error!("janitor loop exited"); + if let Err(e) = res { + error!("janitor failed with: {}", e) + } + } + res = http_server => { + error!("http server exited"); + if let Err(e) = res { + error!("server failed with: {}", e) + } + } + } + + info!("exiting"); +} diff --git a/rust/cyclotron-fetch/tests/fetch.rs b/rust/cyclotron-fetch/tests/fetch.rs new file mode 100644 index 00000000000..5d148a8f7d8 --- /dev/null +++ b/rust/cyclotron-fetch/tests/fetch.rs @@ -0,0 +1,293 @@ +use std::{collections::HashMap, str::FromStr, sync::Arc}; + +use chrono::Duration; +use cyclotron_core::{manager::QueueManager, worker::Worker}; +use cyclotron_fetch::fetch::{tick, FetchResult, HttpMethod}; +use httpmock::{Method, MockServer}; +use serde_json::json; +use sqlx::PgPool; +use utils::{ + construct_job, construct_params, get_app_test_context, make_immediately_available, + wait_on_no_running, wait_on_return, +}; + +mod utils; + +#[sqlx::test(migrations = "../cyclotron-core/migrations")] +pub async fn test_completes_fetch(db: PgPool) { + let context = Arc::new(get_app_test_context(db.clone()).await); + let producer = QueueManager::from_pool(db.clone()); + let return_worker = Worker::from_pool(db.clone()); + let server = MockServer::start(); + + let mock = server.mock(|when, then| { + when.method(Method::GET).path("/test"); + then.status(200).body("Hello, world!"); + }); + + let params = construct_params(server.url("/test"), HttpMethod::Get); + let job = construct_job(params); + producer.create_job(job).await.unwrap(); + + let started = tick(context).await.unwrap(); + + assert_eq!(started, 1); + + let returned = wait_on_return(&return_worker, 1, false).await.unwrap(); + + let response: FetchResult = + serde_json::from_str(returned[0].parameters.as_ref().unwrap()).unwrap(); + + let FetchResult::Success { response } = response else { + panic!("Expected success response"); + }; + + assert_eq!(response.status, 200); + assert_eq!(response.body, "Hello, world!"); + + mock.assert_hits(1); +} + +#[sqlx::test(migrations = "../cyclotron-core/migrations")] +pub async fn test_returns_failure_after_retries(db: PgPool) { + let context = Arc::new(get_app_test_context(db.clone()).await); + let producer = QueueManager::from_pool(db.clone()); + let return_worker = Worker::from_pool(db.clone()); + let server = MockServer::start(); + + let mock = server.mock(|when, then| { + when.method(Method::GET).path("/test"); + then.status(500).body("test server error 
body"); + }); + + let mut params = construct_params(server.url("/test"), HttpMethod::Get); + params.max_tries = Some(2); + + let job = construct_job(params); + producer.create_job(job).await.unwrap(); + + // Tick twice for retry + let started = tick(context.clone()).await.unwrap(); + assert_eq!(started, 1); + wait_on_no_running(&db, Duration::milliseconds(100)).await; + make_immediately_available(&db).await; + let started = tick(context.clone()).await.unwrap(); + assert_eq!(started, 1); + wait_on_no_running(&db, Duration::milliseconds(100)).await; + + let returned = wait_on_return(&return_worker, 1, false).await.unwrap(); + + let response: FetchResult = + serde_json::from_str(returned[0].parameters.as_ref().unwrap()).unwrap(); + + let FetchResult::Failure { trace } = response else { + panic!("Expected failure response"); + }; + + assert!(trace.len() == 2); + for attempt in trace { + assert_eq!(attempt.status, Some(500)); + assert_eq!(attempt.body, Some("test server error body".to_string())); + } + + mock.assert_hits(2); +} + +#[sqlx::test(migrations = "../cyclotron-core/migrations")] +pub fn fetch_discards_bad_metadata(db: PgPool) { + let context = Arc::new(get_app_test_context(db.clone()).await); + let producer = QueueManager::from_pool(db.clone()); + let return_worker = Worker::from_pool(db.clone()); + let server = MockServer::start(); + + let mock = server.mock(|when, then| { + when.method(Method::GET).path("/test"); + then.status(200).body("Hello, world!"); + }); + + let params = construct_params(server.url("/test"), HttpMethod::Get); + let mut job = construct_job(params); + job.metadata = Some("bad json".to_string()); + producer.create_job(job).await.unwrap(); + + let started = tick(context).await.unwrap(); + + assert_eq!(started, 1); + + let returned = wait_on_return(&return_worker, 1, false).await.unwrap(); + + let response: FetchResult = + serde_json::from_str(returned[0].parameters.as_ref().unwrap()).unwrap(); + + let FetchResult::Success { response } = response else { + panic!("Expected success response"); + }; + + assert_eq!(response.status, 200); + assert_eq!(response.body, "Hello, world!"); + + mock.assert_hits(1); +} + +#[sqlx::test(migrations = "../cyclotron-core/migrations")] +pub fn fetch_with_minimum_params_works(db: PgPool) { + let context = Arc::new(get_app_test_context(db.clone()).await); + let producer = QueueManager::from_pool(db.clone()); + let return_worker = Worker::from_pool(db.clone()); + let server = MockServer::start(); + + let mock = server.mock(|when, then| { + when.method(Method::GET).path("/test"); + then.status(200).body("Hello, world!"); + }); + + let params = construct_params(server.url("/test"), HttpMethod::Get); + let mut job = construct_job(params); + + let url = server.url("/test"); + let manual_params = json!({ + "url": url, + "method": "GET", + "return_queue": "return", + }) + .to_string(); + + job.parameters = Some(manual_params); + + producer.create_job(job).await.unwrap(); + + let started = tick(context).await.unwrap(); + + assert_eq!(started, 1); + + let returned = wait_on_return(&return_worker, 1, false).await.unwrap(); + + let response: FetchResult = + serde_json::from_str(returned[0].parameters.as_ref().unwrap()).unwrap(); + + let FetchResult::Success { response } = response else { + panic!("Expected success response"); + }; + + assert_eq!(response.status, 200); + assert_eq!(response.body, "Hello, world!"); + + mock.assert_hits(1); +} + +#[sqlx::test(migrations = "../cyclotron-core/migrations")] +pub async fn 
test_completes_fetch_with_headers(db: PgPool) { + let context = Arc::new(get_app_test_context(db.clone()).await); + let producer = QueueManager::from_pool(db.clone()); + let return_worker = Worker::from_pool(db.clone()); + let server = MockServer::start(); + + let mock = server.mock(|when, then| { + when.method(Method::GET) + .path("/test") + .header("X-Test", "test"); + then.status(200).body("Hello, world!"); + }); + + let mut params = construct_params(server.url("/test"), HttpMethod::Get); + let mut headers = HashMap::new(); + headers.insert("X-Test".to_string(), "test".to_string()); + params.headers = Some(headers); + + let job = construct_job(params); + producer.create_job(job).await.unwrap(); + + let started = tick(context).await.unwrap(); + + assert_eq!(started, 1); + + let returned = wait_on_return(&return_worker, 1, false).await.unwrap(); + + let response: FetchResult = + serde_json::from_str(returned[0].parameters.as_ref().unwrap()).unwrap(); + + let FetchResult::Success { response } = response else { + panic!("Expected success response"); + }; + + assert_eq!(response.status, 200); + assert_eq!(response.body, "Hello, world!"); + + mock.assert_hits(1); +} + +#[sqlx::test(migrations = "../cyclotron-core/migrations")] +pub async fn test_completes_fetch_with_body(db: PgPool) { + let context = Arc::new(get_app_test_context(db.clone()).await); + let producer = QueueManager::from_pool(db.clone()); + let return_worker = Worker::from_pool(db.clone()); + let server = MockServer::start(); + + let mock = server.mock(|when, then| { + when.method(Method::POST).path("/test").body("test body"); + then.status(200).body("Hello, world!"); + }); + + let mut params = construct_params(server.url("/test"), HttpMethod::Post); + params.body = Some("test body".to_string()); + + let job = construct_job(params); + producer.create_job(job).await.unwrap(); + + let started = tick(context).await.unwrap(); + + assert_eq!(started, 1); + + let returned = wait_on_return(&return_worker, 1, false).await.unwrap(); + + let response: FetchResult = + serde_json::from_str(returned[0].parameters.as_ref().unwrap()).unwrap(); + + let FetchResult::Success { response } = response else { + panic!("Expected success response"); + }; + + assert_eq!(response.status, 200); + assert_eq!(response.body, "Hello, world!"); + + mock.assert_hits(1); +} + +#[sqlx::test(migrations = "../cyclotron-core/migrations")] +pub async fn test_completes_fetch_with_vm_state(db: PgPool) { + let context = Arc::new(get_app_test_context(db.clone()).await); + let producer = QueueManager::from_pool(db.clone()); + let return_worker = Worker::from_pool(db.clone()); + let server = MockServer::start(); + + let mock = server.mock(|when, then| { + when.method(Method::GET).path("/test"); + then.status(200).body("Hello, world!"); + }); + + let params = construct_params(server.url("/test"), HttpMethod::Get); + let mut job = construct_job(params); + job.vm_state = Some(json!({"test": "state"}).to_string()); + producer.create_job(job).await.unwrap(); + + let started = tick(context).await.unwrap(); + + assert_eq!(started, 1); + + let returned = wait_on_return(&return_worker, 1, true).await.unwrap(); + + let state = serde_json::Value::from_str(returned[0].vm_state.as_ref().unwrap()).unwrap(); + assert_eq!(state, json!({"test": "state"})); + + let response: FetchResult = + serde_json::from_str(returned[0].parameters.as_ref().unwrap()).unwrap(); + + let FetchResult::Success { response } = response else { + panic!("Expected success response"); + }; + + 
assert_eq!(response.status, 200); + assert_eq!(response.body, "Hello, world!"); + + mock.assert_hits(1); +} diff --git a/rust/cyclotron-fetch/tests/utils.rs b/rust/cyclotron-fetch/tests/utils.rs new file mode 100644 index 00000000000..306bfdf2571 --- /dev/null +++ b/rust/cyclotron-fetch/tests/utils.rs @@ -0,0 +1,127 @@ +use std::sync::Arc; + +use chrono::{Duration, Utc}; +use cyclotron_core::{ + base_ops::{Job, JobInit}, + error::QueueError, + worker::Worker, +}; +use cyclotron_fetch::{ + config::AppConfig, + context::AppContext, + fetch::{FetchParameters, HttpMethod}, +}; +use sqlx::PgPool; +use tokio::sync::Semaphore; + +const FETCH_QUEUE: &str = "fetch"; +const RETURN_QUEUE: &str = "return"; + +pub async fn get_app_test_context(db: PgPool) -> AppContext { + let worker = Worker::from_pool(db.clone()); + let client = reqwest::Client::new(); + let concurrency_limit = Arc::new(Semaphore::new(1)); + let health = health::HealthRegistry::new("test"); + let liveness = health + .register("test".to_string(), Duration::seconds(30).to_std().unwrap()) + .await; + + let config = AppConfig { + fetch_timeout: Duration::seconds(10), + concurrent_requests_limit: 1, + host: "localhost".to_string(), + port: 16, + worker_id: "test".to_string(), + job_poll_interval: Duration::seconds(10), + max_retry_attempts: 3, + queue_served: FETCH_QUEUE.to_string(), + batch_size: 1000, + max_response_bytes: 1024 * 1024, + retry_backoff_base: Duration::milliseconds(1000), + allow_internal_ips: true, + }; + + AppContext { + worker, + client, + concurrency_limit, + liveness, + config, + } +} + +pub fn construct_params(url: String, method: HttpMethod) -> FetchParameters { + FetchParameters { + url, + method, + return_queue: RETURN_QUEUE.to_string(), + headers: None, + body: None, + max_tries: None, + on_finish: None, + } +} + +pub fn construct_job(parameters: FetchParameters) -> JobInit { + JobInit { + team_id: 1, + queue_name: FETCH_QUEUE.to_string(), + priority: 0, + scheduled: Utc::now() - Duration::seconds(1), + function_id: None, + vm_state: None, + parameters: Some(serde_json::to_string(¶meters).unwrap()), + metadata: None, + } +} + +pub async fn wait_on_return( + worker: &Worker, + count: usize, + with_vm: bool, +) -> Result, QueueError> { + let timeout = Duration::seconds(1); + let start = Utc::now(); + let mut returned = vec![]; + while start + timeout > Utc::now() { + let mut jobs = if with_vm { + worker.dequeue_with_vm_state(RETURN_QUEUE, 1).await? + } else { + worker.dequeue_jobs(RETURN_QUEUE, 1).await? 
+ }; + returned.append(&mut jobs); + if returned.len() == count { + return Ok(returned); + } + if returned.len() > count { + panic!("Too many jobs returned"); + } + } + panic!("Timeout waiting for jobs to return"); +} + +pub async fn wait_on_no_running(pool: &PgPool, max_time: Duration) { + let start = Utc::now(); + loop { + let running: i64 = + sqlx::query_scalar("SELECT COUNT(*) FROM cyclotron_jobs WHERE state = 'running'") + .fetch_one(pool) + .await + .unwrap(); + if running == 0 { + return; + } + if Utc::now() - start > max_time { + panic!("Timeout waiting for jobs to finish"); + } + } +} + +pub async fn make_immediately_available(pool: &PgPool) { + sqlx::query( + "UPDATE cyclotron_jobs SET scheduled = NOW() - INTERVAL '1 second' WHERE state = 'available'", + ) + .execute(pool) + .await + .unwrap(); +} diff --git a/rust/cyclotron-janitor/Cargo.toml b/rust/cyclotron-janitor/Cargo.toml new file mode 100644 index 00000000000..76279b14ce1 --- /dev/null +++ b/rust/cyclotron-janitor/Cargo.toml @@ -0,0 +1,22 @@ +[package] +name = "cyclotron-janitor" +version = "0.1.0" +edition = "2021" + +[lints] +workspace = true + +[dependencies] +tracing-subscriber = { workspace = true } +sqlx = { workspace = true } +chrono = { workspace = true } +tokio = { workspace = true } +tracing = { workspace = true } +uuid = { workspace = true } +envconfig = { workspace = true } +axum = { workspace = true } +eyre = { workspace = true } +metrics = { workspace = true } +cyclotron-core = { path = "../cyclotron-core" } +common-metrics = { path = "../common/metrics" } +health = { path = "../common/health" } diff --git a/rust/cyclotron-janitor/bin/entrypoint.sh b/rust/cyclotron-janitor/bin/entrypoint.sh new file mode 100755 index 00000000000..afbe62cd468 --- /dev/null +++ b/rust/cyclotron-janitor/bin/entrypoint.sh @@ -0,0 +1,23 @@ +#!/bin/bash +set -e + +# I set all possible env vars here, tune them as you like +export RUST_LOG="INFO" +export HOST="::" +export PORT="3302" +export DATABASE_URL="postgres://posthog:posthog@localhost:5432/cyclotron" +export CLEANUP_INTERVAL_SECONDS="10" +export PG_MAX_CONNECTIONS="10" +export PG_MIN_CONNECTIONS="1" +export PG_ACQUIRE_TIMEOUT_SECONDS="5" +export PG_MAX_LIFETIME_SECONDS="300" +export PG_IDLE_TIMEOUT_SECONDS="60" +export JANITOR_ID="test-janitor" +export JANITOR_MAX_TOUCHES="2" +export JANITOR_STALL_TIMEOUT_SECONDS="30" + +# Uncomment this to have the database be reset every time you start the janitor +sqlx database reset -y --source ../cyclotron-core/migrations +sqlx migrate run --source ../cyclotron-core/migrations + +cargo run --release \ No newline at end of file diff --git a/rust/cyclotron-janitor/src/config.rs b/rust/cyclotron-janitor/src/config.rs new file mode 100644 index 00000000000..12f3b0796db --- /dev/null +++ b/rust/cyclotron-janitor/src/config.rs @@ -0,0 +1,83 @@ +use chrono::Duration; + +use cyclotron_core::PoolConfig; +use envconfig::Envconfig; +use uuid::Uuid; + +#[derive(Envconfig)] +pub struct Config { + #[envconfig(from = "BIND_HOST", default = "::")] + pub host: String, + + #[envconfig(from = "BIND_PORT", default = "3303")] + pub port: u16, + + #[envconfig(default = "postgres://posthog:posthog@localhost:5432/cyclotron")] + pub database_url: String, + + #[envconfig(default = "30")] + pub cleanup_interval_secs: u64, + + #[envconfig(default = "10")] + pub pg_max_connections: u32, + + #[envconfig(default = "1")] + pub pg_min_connections: u32, + + #[envconfig(default = "30")] + pub pg_acquire_timeout_seconds: u64, + + #[envconfig(default = "300")] + pub 
pg_max_lifetime_seconds: u64, + + #[envconfig(default = "60")] + pub pg_idle_timeout_seconds: u64, + + // Generally, this should be equivalent to a "shard id", as only one janitor should be running + // per shard + pub janitor_id: Option, + + #[envconfig(default = "10")] + pub janitor_max_touches: i16, + + #[envconfig(default = "60")] + pub janitor_stall_timeout_seconds: u16, +} + +impl Config { + pub fn get_janitor_config(&self) -> JanitorConfig { + let pool_config = PoolConfig { + db_url: self.database_url.clone(), + max_connections: Some(self.pg_max_connections), + min_connections: Some(self.pg_min_connections), + acquire_timeout_seconds: Some(self.pg_acquire_timeout_seconds), + max_lifetime_seconds: Some(self.pg_max_lifetime_seconds), + idle_timeout_seconds: Some(self.pg_idle_timeout_seconds), + }; + + let settings = JanitorSettings { + stall_timeout: Duration::seconds(self.janitor_stall_timeout_seconds as i64), + max_touches: self.janitor_max_touches, + id: self + .janitor_id + .clone() + .unwrap_or_else(|| Uuid::now_v7().to_string()), + }; + + JanitorConfig { + pool: pool_config, + settings, + } + } +} + +pub struct JanitorConfig { + pub pool: PoolConfig, + pub settings: JanitorSettings, +} + +pub struct JanitorSettings { + pub stall_timeout: Duration, + pub max_touches: i16, + pub id: String, +} diff --git a/rust/cyclotron-janitor/src/janitor.rs b/rust/cyclotron-janitor/src/janitor.rs new file mode 100644 index 00000000000..7cccd17e886 --- /dev/null +++ b/rust/cyclotron-janitor/src/janitor.rs @@ -0,0 +1,136 @@ +use chrono::Utc; +use cyclotron_core::{ + error::QueueError, + janitor_ops::{ + delete_completed_jobs, delete_failed_jobs, delete_poison_pills, reset_stalled_jobs, + }, +}; +use sqlx::PgPool; +use tracing::{info, warn}; + +use crate::config::{JanitorConfig, JanitorSettings}; + +// The janitor reports it's own metrics, this is mostly for testing purposes +#[derive(Debug, Clone, Eq, PartialEq)] +pub struct CleanupResult { + pub completed: u64, + pub failed: u64, + pub poisoned: u64, + pub stalled: u64, +} + +pub struct Janitor { + pool: PgPool, + settings: JanitorSettings, + metrics_labels: Vec<(&'static str, String)>, +} + +impl Janitor { + pub async fn new(config: JanitorConfig) -> Result { + let settings = config.settings; + let pool = config.pool.connect().await?; + + let metrics_labels = vec![("janitor_id", settings.id.clone())]; + + Ok(Self { + pool, + settings, + metrics_labels, + }) + } + + pub fn from_pool(pool: PgPool, settings: JanitorSettings) -> Self { + let metrics_labels = vec![("janitor_id", settings.id.clone())]; + Self { + pool, + settings, + metrics_labels, + } + } + + // TODO - right now, the metrics produced here are pretty rough - just per shard, without + // any per-queue or per-worker-type breakdown. It'd be nice to add that, eventually. 
+ pub async fn run_once(&self) -> Result { + info!("Running janitor loop"); + let start = Utc::now(); + metrics::counter!("cyclotron_janitor_run_starts", &self.metrics_labels).increment(1); + + let before = Utc::now(); + let completed = delete_completed_jobs(&self.pool).await?; + let taken = Utc::now() - before; + metrics::histogram!( + "cyclotron_janitor_completed_jobs_cleanup_duration_ms", + &self.metrics_labels + ) + .record(taken.num_milliseconds() as f64); + metrics::counter!( + "cyclotron_janitor_completed_jobs_deleted", + &self.metrics_labels + ) + .increment(completed); + + let before = Utc::now(); + let failed = delete_failed_jobs(&self.pool).await?; + let taken = Utc::now() - before; + metrics::histogram!( + "cyclotron_janitor_failed_jobs_cleanup_duration_ms", + &self.metrics_labels + ) + .record(taken.num_milliseconds() as f64); + metrics::counter!( + "cyclotron_janitor_failed_jobs_deleted", + &self.metrics_labels + ) + .increment(failed); + + // Note - if we reset stalled jobs before deleting poison pills, we'll never delete poision + // pills, since resetting a stalled job clears the locked state. + let before = Utc::now(); + let poisoned = delete_poison_pills( + &self.pool, + self.settings.stall_timeout, + self.settings.max_touches, + ) + .await?; + let taken = Utc::now() - before; + metrics::histogram!( + "cyclotron_janitor_poison_pills_cleanup_duration_ms", + &self.metrics_labels + ) + .record(taken.num_milliseconds() as f64); + metrics::counter!( + "cyclotron_janitor_poison_pills_deleted", + &self.metrics_labels + ) + .increment(poisoned); + if poisoned > 0 { + warn!("Deleted {} poison pills", poisoned); + } + + let before = Utc::now(); + let stalled = reset_stalled_jobs(&self.pool, self.settings.stall_timeout).await?; + let taken = Utc::now() - before; + metrics::histogram!( + "cyclotron_janitor_stalled_jobs_reset_duration_ms", + &self.metrics_labels + ) + .record(taken.num_milliseconds() as f64); + metrics::counter!("cyclotron_janitor_stalled_jobs_reset", &self.metrics_labels) + .increment(stalled); + if stalled > 0 { + warn!("Reset {} stalled jobs", stalled); + } + + metrics::counter!("cyclotron_janitor_run_ends", &self.metrics_labels).increment(1); + let elapsed = Utc::now() - start; + metrics::histogram!("cyclotron_janitor_run_duration_ms", &self.metrics_labels) + .record(elapsed.num_milliseconds() as f64); + info!("Janitor loop complete"); + Ok(CleanupResult { + completed, + failed, + poisoned, + stalled, + }) + } +} diff --git a/rust/cyclotron-janitor/src/lib.rs b/rust/cyclotron-janitor/src/lib.rs new file mode 100644 index 00000000000..00db120a31c --- /dev/null +++ b/rust/cyclotron-janitor/src/lib.rs @@ -0,0 +1,2 @@ +pub mod config; +pub mod janitor; diff --git a/rust/cyclotron-janitor/src/main.rs b/rust/cyclotron-janitor/src/main.rs new file mode 100644 index 00000000000..e46a158c58d --- /dev/null +++ b/rust/cyclotron-janitor/src/main.rs @@ -0,0 +1,105 @@ +use axum::{extract::State, routing::get, Router}; +use common_metrics::setup_metrics_routes; +use cyclotron_janitor::{config::Config, janitor::Janitor}; +use envconfig::Envconfig; +use eyre::Result; +use health::{HealthHandle, HealthRegistry}; +use std::{future::ready, time::Duration}; +use tracing::{error, info}; + +/// Most of this stuff is stolen pretty shamelessly from the rustyhook janitor. It'll diverge more +/// once we introduce the management command stuff, but for now it's a good starting point. 
+ +async fn cleanup_loop(janitor: Janitor, livenes: HealthHandle, interval_secs: u64) -> Result<()> { + let mut interval = tokio::time::interval(Duration::from_secs(interval_secs)); + + loop { + interval.tick().await; + + if let Err(e) = janitor.run_once().await { + // don't bother reporting unhealthy - a few times around this loop will put us in a stalled state + error!("janitor failed cleanup with: {}", e); + } else { + livenes.report_healthy().await; + } + } +} + +async fn listen(app: Router, bind: String) -> Result<()> { + let listener = tokio::net::TcpListener::bind(bind).await?; + + axum::serve(listener, app).await?; + + Ok(()) +} + +// For axums state stuff +#[derive(Clone)] +struct JanitorId(pub String); + +pub fn app(liveness: HealthRegistry, janitor_id: String) -> Router { + Router::new() + .route("/", get(index)) + .route("/_readiness", get(index)) + .route("/_liveness", get(move || ready(liveness.get_status()))) + .with_state(JanitorId(janitor_id)) +} + +async fn index(State(janitor_id): State) -> String { + format!("cyclotron janitor {}", janitor_id.0) +} + +#[tokio::main] +async fn main() { + let config = Config::init_from_env().expect("failed to load configuration from env"); + tracing_subscriber::fmt::init(); + + let liveness = HealthRegistry::new("liveness"); + + let janitor_config = config.get_janitor_config(); + + let janitor_id = janitor_config.settings.id.clone(); + let bind = format!("{}:{}", config.host, config.port); + + info!( + "Starting janitor with ID {:?}, listening at {}", + janitor_id, bind + ); + + let janitor = Janitor::new(janitor_config) + .await + .expect("failed to create janitor"); + + let janitor_liveness = liveness + .register( + "janitor".to_string(), + Duration::from_secs(config.cleanup_interval_secs * 4), + ) + .await; + + let janitor_loop = tokio::spawn(cleanup_loop( + janitor, + janitor_liveness, + config.cleanup_interval_secs, + )); + + let app = setup_metrics_routes(app(liveness, janitor_id)); + let http_server = tokio::spawn(listen(app, bind)); + + tokio::select! 
{ + res = janitor_loop => { + error!("janitor loop exited"); + if let Err(e) = res { + error!("janitor failed with: {}", e) + } + } + res = http_server => { + error!("http server exited"); + if let Err(e) = res { + error!("server failed with: {}", e) + } + } + } + + info!("exiting"); +} diff --git a/rust/cyclotron-janitor/tests/janitor.rs b/rust/cyclotron-janitor/tests/janitor.rs new file mode 100644 index 00000000000..fb77a7faf23 --- /dev/null +++ b/rust/cyclotron-janitor/tests/janitor.rs @@ -0,0 +1,226 @@ +use chrono::{Duration, Utc}; +use cyclotron_core::{ + base_ops::{JobInit, JobState}, + manager::QueueManager, + worker::Worker, +}; +use cyclotron_janitor::{config::JanitorSettings, janitor::Janitor}; +use sqlx::PgPool; +use uuid::Uuid; + +#[sqlx::test(migrations = "../cyclotron-core/migrations")] +async fn janitor_test(db: PgPool) { + let worker = Worker::from_pool(db.clone()); + let manager = QueueManager::from_pool(db.clone()); + + // Purposefully MUCH smaller than would be used in production, so + // we can simulate stalled or poison jobs quickly + let stall_timeout = Duration::milliseconds(10); + let max_touches = 3; + + let settings = JanitorSettings { + stall_timeout, + max_touches, + id: "test_janitor".to_string(), + }; + let janitor = Janitor::from_pool(db.clone(), settings); + + let now = Utc::now() - Duration::seconds(10); + let queue_name = "default".to_string(); + + let job_init = JobInit { + team_id: 1, + queue_name: queue_name.clone(), + priority: 0, + scheduled: now, + function_id: Some(Uuid::now_v7()), + vm_state: None, + parameters: None, + metadata: None, + }; + + // First test - if we mark a job as completed, the janitor will clean it up + manager.create_job(job_init.clone()).await.unwrap(); + let job = worker + .dequeue_jobs(&queue_name, 1) + .await + .unwrap() + .pop() + .unwrap(); + + worker.set_state(job.id, JobState::Completed).unwrap(); + worker.flush_job(job.id).await.unwrap(); + + let result = janitor.run_once().await.unwrap(); + assert_eq!(result.completed, 1); + assert_eq!(result.failed, 0); + assert_eq!(result.poisoned, 0); + assert_eq!(result.stalled, 0); + + // Second test - if we mark a job as failed, the janitor will clean it up + manager.create_job(job_init.clone()).await.unwrap(); + let job = worker + .dequeue_jobs(&queue_name, 1) + .await + .unwrap() + .pop() + .unwrap(); + + worker.set_state(job.id, JobState::Failed).unwrap(); + worker.flush_job(job.id).await.unwrap(); + + let result = janitor.run_once().await.unwrap(); + assert_eq!(result.completed, 0); + assert_eq!(result.failed, 1); + assert_eq!(result.poisoned, 0); + assert_eq!(result.stalled, 0); + + // Third test - if we pick up a job, and then hold it for longer than + // the stall timeout, the janitor will reset it. After this, the worker + // cannot flush updates to the job, and must re-dequeue it. 
+ + manager.create_job(job_init.clone()).await.unwrap(); + let job = worker + .dequeue_jobs(&queue_name, 1) + .await + .unwrap() + .pop() + .unwrap(); + + // First, cleanup won't do anything + let result = janitor.run_once().await.unwrap(); + assert_eq!(result.completed, 0); + assert_eq!(result.failed, 0); + assert_eq!(result.poisoned, 0); + assert_eq!(result.stalled, 0); + + // Then we stall on the job + tokio::time::sleep(stall_timeout.to_std().unwrap() * 2).await; + + // Now, cleanup will reset the job + let result = janitor.run_once().await.unwrap(); + assert_eq!(result.completed, 0); + assert_eq!(result.failed, 0); + assert_eq!(result.poisoned, 0); + assert_eq!(result.stalled, 1); + + // Now, the worker can't flush the job + worker.set_state(job.id, JobState::Completed).unwrap(); + let result = worker.flush_job(job.id).await; + assert!(result.is_err()); + + // But if we re-dequeue the job, we can flush it + let job = worker + .dequeue_jobs(&queue_name, 1) + .await + .unwrap() + .pop() + .unwrap(); + worker.set_state(job.id, JobState::Completed).unwrap(); + worker.flush_job(job.id).await.unwrap(); + + janitor.run_once().await.unwrap(); // Clean up the completed job to reset for the next test + + // Fourth test - if a worker holds a job for longer than the stall + // time, but calls heartbeat, the job will not be reset + + manager.create_job(job_init.clone()).await.unwrap(); + let job = worker + .dequeue_jobs(&queue_name, 1) + .await + .unwrap() + .pop() + .unwrap(); + + let start = tokio::time::Instant::now(); + loop { + worker.heartbeat(job.id).await.unwrap(); + tokio::time::sleep(Duration::milliseconds(1).to_std().unwrap()).await; + if start.elapsed() > stall_timeout.to_std().unwrap() * 2 { + break; + } + } + + let result = janitor.run_once().await.unwrap(); + assert_eq!(result.completed, 0); + assert_eq!(result.failed, 0); + assert_eq!(result.poisoned, 0); + assert_eq!(result.stalled, 0); + + // The worker can still flush the job + worker.set_state(job.id, JobState::Completed).unwrap(); + worker.flush_job(job.id).await.unwrap(); + + // and now cleanup will work + let result = janitor.run_once().await.unwrap(); + assert_eq!(result.completed, 1); + assert_eq!(result.failed, 0); + assert_eq!(result.poisoned, 0); + assert_eq!(result.stalled, 0); + + // Fifth test - if a job stalls more than max_touches + // it will be marked as poisoned and deleted + + manager.create_job(job_init.clone()).await.unwrap(); + let mut job = worker + .dequeue_jobs(&queue_name, 1) + .await + .unwrap() + .pop() + .unwrap(); + + for _ in 0..max_touches { + tokio::time::sleep(stall_timeout.to_std().unwrap() * 2).await; + let result = janitor.run_once().await.unwrap(); + assert_eq!(result.completed, 0); + assert_eq!(result.failed, 0); + assert_eq!(result.poisoned, 0); + assert_eq!(result.stalled, 1); + + // assert we can't update the job (flush and heartbeat fail) + worker.set_state(job.id, JobState::Completed).unwrap(); + let result = worker.heartbeat(job.id).await; + assert!(result.is_err()); + let result = worker.flush_job(job.id).await; + assert!(result.is_err()); + + // re-dequeue the job + job = worker + .dequeue_jobs(&queue_name, 1) + .await + .unwrap() + .pop() + .unwrap(); + } + // At this point, the "janitor touches" on the job is 3 (it's been stalled and reset 3 times), so one more cleanup loop will delete it + + // Now stall one more time, and on cleanup, we should see the job was considered poison and deleted + tokio::time::sleep(stall_timeout.to_std().unwrap() * 2).await; + let result: 
cyclotron_janitor::janitor::CleanupResult = janitor.run_once().await.unwrap();
+    assert_eq!(result.completed, 0);
+    assert_eq!(result.failed, 0);
+    assert_eq!(result.poisoned, 1);
+    assert_eq!(result.stalled, 0);
+
+    // The worker can't flush the job
+    worker.set_state(job.id, JobState::Completed).unwrap();
+    let result = worker.flush_job(job.id).await;
+    assert!(result.is_err());
+
+    // Sixth test - the janitor can operate on multiple jobs at once
+    manager.create_job(job_init.clone()).await.unwrap();
+    manager.create_job(job_init.clone()).await.unwrap();
+    let jobs = worker.dequeue_jobs(&queue_name, 2).await.unwrap();
+
+    worker.set_state(jobs[0].id, JobState::Completed).unwrap();
+    worker.set_state(jobs[1].id, JobState::Failed).unwrap();
+
+    worker.flush_job(jobs[0].id).await.unwrap();
+    worker.flush_job(jobs[1].id).await.unwrap();
+
+    let result = janitor.run_once().await.unwrap();
+    assert_eq!(result.completed, 1);
+    assert_eq!(result.failed, 1);
+    assert_eq!(result.poisoned, 0);
+    assert_eq!(result.stalled, 0);
+}
diff --git a/rust/cyclotron-node/.gitignore b/rust/cyclotron-node/.gitignore
new file mode 100644
index 00000000000..01f3230c629
--- /dev/null
+++ b/rust/cyclotron-node/.gitignore
@@ -0,0 +1,8 @@
+target
+index.node
+**/node_modules
+**/.DS_Store
+npm-debug.log*
+cargo.log
+cross.log
+dist/
diff --git a/rust/cyclotron-node/Cargo.toml b/rust/cyclotron-node/Cargo.toml
new file mode 100644
index 00000000000..0ae89199680
--- /dev/null
+++ b/rust/cyclotron-node/Cargo.toml
@@ -0,0 +1,22 @@
+[package]
+name = "cyclotron-node"
+version = "0.1.0"
+edition = "2021"
+exclude = ["index.node"]
+
+[lints]
+workspace = true
+
+
+[lib]
+crate-type = ["cdylib"]
+
+[dependencies]
+cyclotron-core = { path = "../cyclotron-core" }
+neon = { workspace = true }
+once_cell = { workspace = true }
+tokio = { workspace = true }
+serde_json = { workspace = true }
+serde = { workspace = true }
+uuid = { workspace = true }
+chrono = { workspace = true }
\ No newline at end of file
diff --git a/rust/cyclotron-node/examples/basic.js b/rust/cyclotron-node/examples/basic.js
new file mode 100644
index 00000000000..270f0e5385a
--- /dev/null
+++ b/rust/cyclotron-node/examples/basic.js
@@ -0,0 +1,144 @@
+const assert = require('assert')
+const cyclotron = require('../.')
+const crypto = require('crypto')
+
+// Set of available job states
+const JOB_STATES = Object.freeze({
+    AVAILABLE: 'available',
+    RUNNING: 'running',
+    FAILED: 'failed',
+    COMPLETED: 'completed',
+})
+
+const AVAILABLE_WORKERS = Object.freeze({
+    FETCH: 'fetch',
+    HOG: 'hog',
+})
+
+async function main() {
+    let poolConfig = {
+        db_url: 'postgresql://posthog:posthog@localhost:5432/cyclotron',
+    }
+
+    let managerConfig = {
+        shards: [poolConfig],
+    }
+
+    // Most processes will only need to do one of these, but we can do both here for demonstration purposes
+    await cyclotron.initWorker(JSON.stringify(poolConfig))
+    await cyclotron.initManager(JSON.stringify(managerConfig))
+
+    // The maybeInit variants won't throw on re-calling, and short-circuit to be almost free, so they're safe
+    // to call frequently (although I still wouldn't call them in a loop)
+    await cyclotron.maybeInitWorker(JSON.stringify(poolConfig))
+    await cyclotron.maybeInitManager(JSON.stringify(managerConfig))
+
+    let five_minutes_ago = new Date(new Date().getTime() - 5 * 60000).toISOString()
+    let queue_name = 'default'
+
+    let job_1 = {
+        team_id: 1,
+        queue_name,
+        priority: 0,
+        scheduled: five_minutes_ago,
+        function_id: crypto.randomUUID(), // Is nullable
+        vm_state: null,
+        parameters: null,
+        metadata: null,
+    }
+
+    let job_2 = {
+        team_id: 1,
+        queue_name,
+        priority: 1,
+        scheduled: five_minutes_ago,
+        function_id: crypto.randomUUID(), // Is nullable
+        vm_state: null,
+        parameters: null,
+        metadata: null,
+    }
+
+    await cyclotron.createJob(JSON.stringify(job_1))
+    await cyclotron.createJob(JSON.stringify(job_2))
+
+    // Jobs (as well as any other 'complex' data shape) are serialized across the API boundary,
+    // because that's (according to the neon maintainers) /actually faster/ than doing a bunch
+    // of cross-runtime pointer chasing.
+    let jobs = JSON.parse(await cyclotron.dequeueJobs(queue_name, 2))
+    assert(jobs.length === 2)
+    assert(jobs[0].function_id === job_1.function_id)
+    assert(jobs[1].function_id === job_2.function_id)
+
+    job_1 = jobs[0]
+    job_2 = jobs[1]
+
+    // All of these throw if the job hasn't been dequeued by the worker created when init_worker was called,
+    // or if there's some serde error - generally, interacting with cyclotron should involve try/catch in
+    // some far outer catch. We can iterate on this API to make it more ergonomic with time, but
+    // my js/ts is... rusty (co-pilot wrote this joke)
+    cyclotron.setState(job_1.id, JOB_STATES.AVAILABLE)
+    cyclotron.setState(job_2.id, JOB_STATES.AVAILABLE)
+
+    cyclotron.setQueue(job_1.id, 'non-default')
+    cyclotron.setQueue(job_2.id, 'non-default')
+
+    // Priority is lowest-first, so this means we can assert that job_2 will be returned first on subsequent dequeue_jobs
+    cyclotron.setPriority(job_1.id, 2)
+    cyclotron.setPriority(job_2.id, 1)
+
+    let ten_minutes_ago = new Date(new Date().getTime() - 10 * 60000).toISOString()
+    cyclotron.setScheduledAt(job_1.id, ten_minutes_ago)
+    cyclotron.setScheduledAt(job_2.id, ten_minutes_ago)
+
+    cyclotron.setVmState(job_1.id, JSON.stringify({ state: 'running' }))
+    cyclotron.setVmState(job_2.id, JSON.stringify({ state: 'running' }))
+
+    cyclotron.setParameters(job_1.id, JSON.stringify({ parameters: 'running' }))
+    cyclotron.setParameters(job_2.id, JSON.stringify({ parameters: 'running' }))
+
+    cyclotron.setMetadata(job_1.id, JSON.stringify({ metadata: 'running' }))
+    cyclotron.setMetadata(job_2.id, JSON.stringify({ metadata: 'running' }))
+
+    // Flush the updates queued up above back to the queue. Subsequent calls to flush
+    // will throw if a job isn't re-acquired. Flushes will fail if a job state update
+    // isn't included (workers should not purposefully leave jobs in a running state)
+    await cyclotron.flushJob(job_1.id)
+    await cyclotron.flushJob(job_2.id)
+
+    jobs = JSON.parse(await cyclotron.dequeueJobsWithVmState('non-default', 2))
+
+    assert(jobs[0].id === job_2.id)
+    assert(jobs[1].id === job_1.id)
+
+    assert(jobs[0].function_id === job_2.function_id)
+    assert(jobs[1].function_id === job_1.function_id)
+
+    assert(jobs[0].team_id === job_2.team_id)
+    assert(jobs[1].team_id === job_1.team_id)
+
+    assert(jobs[0].queue_name === 'non-default')
+    assert(jobs[1].queue_name === 'non-default')
+
+    assert(jobs[0].priority === 1)
+    assert(jobs[1].priority === 2)
+
+    assert(jobs[0].scheduled === ten_minutes_ago)
+    assert(jobs[1].scheduled === ten_minutes_ago)
+
+    assert(jobs[0].vm_state === JSON.stringify({ state: 'running' }))
+    assert(jobs[1].vm_state === JSON.stringify({ state: 'running' }))
+    assert(jobs[0].parameters === JSON.stringify({ parameters: 'running' }))
+    assert(jobs[1].parameters === JSON.stringify({ parameters: 'running' }))
+    assert(jobs[0].metadata === JSON.stringify({ metadata: 'running' }))
+    assert(jobs[1].metadata === JSON.stringify({ metadata: 'running' }))
+
+    // Now we'll mark these jobs as completed
+    cyclotron.setState(job_1.id, JOB_STATES.COMPLETED)
+    cyclotron.setState(job_2.id, JOB_STATES.COMPLETED)
+
+    // And flush them back to the queue
+    await cyclotron.flushJob(job_1.id)
+    await cyclotron.flushJob(job_2.id)
+}
+
+main()
diff --git a/rust/cyclotron-node/package.json b/rust/cyclotron-node/package.json
new file mode 100644
index 00000000000..a445cae4e92
--- /dev/null
+++ b/rust/cyclotron-node/package.json
@@ -0,0 +1,27 @@
+{
+    "name": "@posthog/cyclotron",
+    "version": "0.1.0",
+    "description": "Node bindings for cyclotron",
+    "main": "dist/index.js",
+    "types": "dist/index.d.ts",
+    "scripts": {
+        "test": "cargo test",
+        "build": "pnpm run build:cargo --release && pnpm run build:move-lib && pnpm run build:typescript",
+        "build:move-lib": "cp ../target/release/libcyclotron_node.dylib index.node || cp ../target/release/libcyclotron_node.so index.node",
+        "build:cargo": "cargo build --message-format=json > cargo.log",
+        "build:cargo:debug": "pnpm run build:cargo",
+        "build:cross": "cross build --message-format=json > cross.log",
+        "build:typescript": "tsc",
+        "package": "NODE_ENV=development pnpm i --dev && pnpm run build"
+    },
+    "author": "",
+    "license": "MIT",
+    "devDependencies": {
+        "@types/node": "^22.4.1",
+        "typescript": "^4.7.4"
+    },
+    "files": [
+        "dist",
+        "index.node"
+    ]
+}
diff --git a/rust/cyclotron-node/pnpm-lock.yaml b/rust/cyclotron-node/pnpm-lock.yaml
new file mode 100644
index 00000000000..9866808970b
--- /dev/null
+++ b/rust/cyclotron-node/pnpm-lock.yaml
@@ -0,0 +1,31 @@
+lockfileVersion: '6.0'
+
+settings:
+  autoInstallPeers: true
+  excludeLinksFromLockfile: false
+
+devDependencies:
+  '@types/node':
+    specifier: ^22.4.1
+    version: 22.4.1
+  typescript:
+    specifier: ^4.7.4
+    version: 4.9.5
+
+packages:
+
+  /@types/node@22.4.1:
+    resolution: {integrity: sha512-1tbpb9325+gPnKK0dMm+/LMriX0vKxf6RnB0SZUqfyVkQ4fMgUSySqhxE/y8Jvs4NyF1yHzTfG9KlnkIODxPKg==}
+    dependencies:
+      undici-types: 6.19.8
+    dev: true
+
+  /typescript@4.9.5:
+    resolution: {integrity: sha512-1FXk9E2Hm+QzZQ7z+McJiHL4NW1F2EzMu9Nq9i3zAaGqibafqYwCVU6WyWAuyQRRzOlxou8xZSyXLEN8oKj24g==}
+    engines: {node: '>=4.2.0'}
+    hasBin: true
+    dev: true
+
+  /undici-types@6.19.8:
+    resolution: {integrity:
sha512-ve2KP6f/JnbPBFyobGHuerC9g1FYGn/F8n1LWTwNxCEzd6IfqTwUQcNXgEtmmQ6DlRrC1hrSrBnCZPokRrDHjw==} + dev: true diff --git a/rust/cyclotron-node/src/index.ts b/rust/cyclotron-node/src/index.ts new file mode 100644 index 00000000000..5f4c38e7454 --- /dev/null +++ b/rust/cyclotron-node/src/index.ts @@ -0,0 +1,257 @@ +// eslint-disable-next-line @typescript-eslint/no-var-requires +const cyclotron = require('../index.node') + +export interface PoolConfig { + dbUrl: string + maxConnections?: number + minConnections?: number + acquireTimeoutSeconds?: number + maxLifetimeSeconds?: number + idleTimeoutSeconds?: number +} + +// Type as expected by Cyclotron. +interface InternalPoolConfig { + db_url: string + max_connections?: number + min_connections?: number + acquire_timeout_seconds?: number + max_lifetime_seconds?: number + idle_timeout_seconds?: number +} + +export interface ManagerConfig { + shards: PoolConfig[] +} + +// Type as expected by Cyclotron. +interface InternalManagerConfig { + shards: InternalPoolConfig[] +} + +export interface JobInit { + teamId: number + functionId: string + queueName: string + priority?: number + scheduled?: Date + vmState?: string + parameters?: string + metadata?: string +} + +// Type as expected by Cyclotron. +interface InternalJobInit { + team_id: number + function_id: string + queue_name: string + priority?: number + scheduled?: Date + vm_state?: string + parameters?: string + metadata?: string +} + +export type JobState = 'available' | 'running' | 'completed' | 'failed' | 'paused' + +export interface Job { + id: string + teamId: number + functionId: string | null + created: Date + lockId: string | null + lastHeartbeat: Date | null + janitorTouchCount: number + transitionCount: number + lastTransition: Date + queueName: string + state: JobState + priority: number + scheduled: Date + vmState: string | null + metadata: string | null + parameters: string | null +} + +// Type as returned by Cyclotron. 
+interface InternalJob {
+    id: string
+    team_id: number
+    function_id: string | null
+    created: string
+    lock_id: string | null
+    last_heartbeat: string | null
+    janitor_touch_count: number
+    transition_count: number
+    last_transition: string
+    queue_name: string
+    state: JobState
+    priority: number
+    scheduled: string
+    vm_state: string | null
+    metadata: string | null
+    parameters: string | null
+}
+
+async function initWorker(poolConfig: PoolConfig): Promise<void> {
+    const initWorkerInternal: InternalPoolConfig = {
+        db_url: poolConfig.dbUrl,
+        max_connections: poolConfig.maxConnections,
+        min_connections: poolConfig.minConnections,
+        acquire_timeout_seconds: poolConfig.acquireTimeoutSeconds,
+        max_lifetime_seconds: poolConfig.maxLifetimeSeconds,
+        idle_timeout_seconds: poolConfig.idleTimeoutSeconds,
+    }
+    return await cyclotron.initWorker(JSON.stringify(initWorkerInternal))
+}
+
+async function initManager(managerConfig: ManagerConfig): Promise<void> {
+    const managerConfigInternal: InternalManagerConfig = {
+        shards: managerConfig.shards.map((shard) => ({
+            db_url: shard.dbUrl,
+            max_connections: shard.maxConnections,
+            min_connections: shard.minConnections,
+            acquire_timeout_seconds: shard.acquireTimeoutSeconds,
+            max_lifetime_seconds: shard.maxLifetimeSeconds,
+            idle_timeout_seconds: shard.idleTimeoutSeconds,
+        })),
+    }
+    return await cyclotron.initManager(JSON.stringify(managerConfigInternal))
+}
+
+async function maybeInitWorker(poolConfig: PoolConfig): Promise<void> {
+    const initWorkerInternal: InternalPoolConfig = {
+        db_url: poolConfig.dbUrl,
+        max_connections: poolConfig.maxConnections,
+        min_connections: poolConfig.minConnections,
+        acquire_timeout_seconds: poolConfig.acquireTimeoutSeconds,
+        max_lifetime_seconds: poolConfig.maxLifetimeSeconds,
+        idle_timeout_seconds: poolConfig.idleTimeoutSeconds,
+    }
+    return await cyclotron.maybeInitWorker(JSON.stringify(initWorkerInternal))
+}
+
+async function maybeInitManager(managerConfig: ManagerConfig): Promise<void> {
+    const managerConfigInternal: InternalManagerConfig = {
+        shards: managerConfig.shards.map((shard) => ({
+            db_url: shard.dbUrl,
+            max_connections: shard.maxConnections,
+            min_connections: shard.minConnections,
+            acquire_timeout_seconds: shard.acquireTimeoutSeconds,
+            max_lifetime_seconds: shard.maxLifetimeSeconds,
+            idle_timeout_seconds: shard.idleTimeoutSeconds,
+        })),
+    }
+    return await cyclotron.maybeInitManager(JSON.stringify(managerConfigInternal))
+}
+
+export async function createJob(job: JobInit): Promise<void> {
+    job.priority ??= 1
+    job.scheduled ??= new Date()
+
+    const jobInitInternal: InternalJobInit = {
+        team_id: job.teamId,
+        function_id: job.functionId,
+        queue_name: job.queueName,
+        priority: job.priority,
+        scheduled: job.scheduled,
+        vm_state: job.vmState,
+        parameters: job.parameters,
+        metadata: job.metadata,
+    }
+    return await cyclotron.createJob(JSON.stringify(jobInitInternal))
+}
+
+function convertInternalJobToJob(jobInternal: InternalJob): Job {
+    return {
+        id: jobInternal.id,
+        teamId: jobInternal.team_id,
+        functionId: jobInternal.function_id,
+        created: new Date(jobInternal.created),
+        lockId: jobInternal.lock_id,
+        lastHeartbeat: jobInternal.last_heartbeat ? new Date(jobInternal.last_heartbeat) : null,
+        janitorTouchCount: jobInternal.janitor_touch_count,
+        transitionCount: jobInternal.transition_count,
+        lastTransition: new Date(jobInternal.last_transition),
+        queueName: jobInternal.queue_name,
+        state: jobInternal.state,
+        priority: jobInternal.priority,
+        scheduled: new Date(jobInternal.scheduled),
+        vmState: jobInternal.vm_state,
+        metadata: jobInternal.metadata,
+        parameters: jobInternal.parameters,
+    }
+}
+
+async function dequeueJobs(queueName: string, limit: number): Promise<Job[]> {
+    const jobsStr = await cyclotron.dequeueJobs(queueName, limit)
+    const jobs: InternalJob[] = JSON.parse(jobsStr)
+    return jobs.map(convertInternalJobToJob)
+}
+async function dequeueJobsWithVmState(queueName: string, limit: number): Promise<Job[]> {
+    const jobsStr = await cyclotron.dequeueJobsWithVmState(queueName, limit)
+    const jobs: InternalJob[] = JSON.parse(jobsStr)
+    return jobs.map(convertInternalJobToJob)
+}
+
+async function flushJob(jobId: string): Promise<void> {
+    return await cyclotron.flushJob(jobId)
+}
+
+function setState(jobId: string, jobState: JobState): Promise<void> {
+    return cyclotron.setState(jobId, jobState)
+}
+
+function setQueue(jobId: string, queueName: string): Promise<void> {
+    return cyclotron.setQueue(jobId, queueName)
+}
+
+function setPriority(jobId: string, priority: number): Promise<void> {
+    return cyclotron.setPriority(jobId, priority)
+}
+
+function setScheduledAt(jobId: string, scheduledAt: Date): Promise<void> {
+    return cyclotron.setScheduledAt(jobId, scheduledAt.toISOString())
+}
+
+function serializeObject(name: string, obj: Record<string, any> | null): string | null {
+    if (obj === null) {
+        return null
+    } else if (typeof obj === 'object' && obj !== null) {
+        return JSON.stringify(obj)
+    }
+    throw new Error(`${name} must be either an object or null`)
+}
+
+function setVmState(jobId: string, vmState: Record<string, any> | null): Promise<void> {
+    const serialized = serializeObject('vmState', vmState)
+    return cyclotron.setVmState(jobId, serialized)
+}
+
+function setMetadata(jobId: string, metadata: Record<string, any> | null): Promise<void> {
+    const serialized = serializeObject('metadata', metadata)
+    return cyclotron.setMetadata(jobId, serialized)
+}
+
+function setParameters(jobId: string, parameters: Record<string, any> | null): Promise<void> {
+    const serialized = serializeObject('parameters', parameters)
+    return cyclotron.setParameters(jobId, serialized)
+}
+
+export default {
+    initWorker,
+    initManager,
+    maybeInitWorker,
+    maybeInitManager,
+    createJob,
+    dequeueJobs,
+    dequeueJobsWithVmState,
+    flushJob,
+    setState,
+    setQueue,
+    setPriority,
+    setScheduledAt,
+    setVmState,
+    setMetadata,
+    setParameters,
+}
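For reference, a minimal end-to-end sketch of how a consumer might drive the wrapper above once it is packaged as @posthog/cyclotron (the package name declared in this diff). The database URL, queue name, function id and error handling here are illustrative assumptions, not part of this PR:

// sketch.ts - illustrative only; assumes a local Postgres reachable at DB_URL
import cyclotron from '@posthog/cyclotron'

const DB_URL = 'postgresql://posthog:posthog@localhost:5432/cyclotron' // assumed dev database

async function run(): Promise<void> {
    // One process can act as both manager (producer) and worker (consumer)
    await cyclotron.maybeInitManager({ shards: [{ dbUrl: DB_URL }] })
    await cyclotron.maybeInitWorker({ dbUrl: DB_URL })

    // Enqueue a job on the 'fetch' queue; parameters are a pre-serialized JSON string
    await cyclotron.createJob({
        teamId: 1,
        functionId: '00000000-0000-0000-0000-000000000000', // hypothetical hog function id
        queueName: 'fetch',
        parameters: JSON.stringify({ url: 'https://example.com', method: 'GET' }),
    })

    // Dequeue up to 10 jobs, do the work, then mark a terminal state and flush each one
    const jobs = await cyclotron.dequeueJobs('fetch', 10)
    for (const job of jobs) {
        // ... perform the fetch described by job.parameters ...
        await cyclotron.setState(job.id, 'completed')
        await cyclotron.flushJob(job.id)
    }
}

run().catch((e) => {
    console.error(e)
    process.exit(1)
})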
diff --git a/rust/cyclotron-node/src/lib.rs b/rust/cyclotron-node/src/lib.rs
new file mode 100644
index 00000000000..212053d5fa7
--- /dev/null
+++ b/rust/cyclotron-node/src/lib.rs
@@ -0,0 +1,450 @@
+use chrono::{DateTime, Utc};
+use cyclotron_core::{
+    base_ops::{JobInit, JobState},
+    manager::{ManagerConfig, QueueManager},
+    worker::Worker,
+    PoolConfig,
+};
+
+use neon::{
+    handle::Handle,
+    prelude::{Context, FunctionContext, ModuleContext},
+    result::{JsResult, NeonResult},
+    types::{JsNull, JsNumber, JsPromise, JsString, JsValue},
+};
+use once_cell::sync::OnceCell;
+use serde::de::DeserializeOwned;
+use serde_json::Value;
+use tokio::runtime::Runtime;
+use uuid::Uuid;
+
+static WORKER: OnceCell<Worker> = OnceCell::new();
+static MANAGER: OnceCell<QueueManager> = OnceCell::new();
+static RUNTIME: OnceCell<Runtime> = OnceCell::new();
+
+fn runtime<'a, C: Context<'a>>(cx: &mut C) -> NeonResult<&'static Runtime> {
+    RUNTIME
+        .get_or_try_init(Runtime::new)
+        .or_else(|e| cx.throw_error(format!("failed to create tokio runtime: {}", e)))
+}
+
+// The general interface for calling our functions takes a JSON serialized string,
+// because neon has no nice serde support for function arguments (and, generally,
+// ripping objects from the v8 runtime piece by piece is slower than just passing
+// a single chunk of bytes). These are convenience functions for converting between the two.
+pub fn from_json_string<'a, T, C>(cx: &mut C, object: Handle<JsString>) -> NeonResult<T>
+where
+    T: DeserializeOwned,
+    C: Context<'a>,
+{
+    let value: T =
+        serde_json::from_str(&object.value(cx)).or_else(|e| cx.throw_error(format!("{}", e)))?;
+    Ok(value)
+}
+
+pub fn to_json_string<'a, T, C>(cx: &mut C, value: T) -> NeonResult<String>
+where
+    T: serde::Serialize,
+    C: Context<'a>,
+{
+    let value = serde_json::to_string(&value)
+        .or_else(|e| cx.throw_error(format!("failed to serialize value: {}", e)))?;
+    Ok(value)
+}
+
+fn hello(mut cx: FunctionContext) -> JsResult<JsString> {
+    let arg1 = cx.argument::<JsString>(0)?;
+    let value: Value = from_json_string(&mut cx, arg1)?;
+    let string = to_json_string(&mut cx, value)?;
+    Ok(cx.string(string))
+}
+
+fn init_worker_impl(mut cx: FunctionContext, throw_on_reinit: bool) -> JsResult<JsPromise> {
+    let arg1 = cx.argument::<JsString>(0)?;
+    let config: PoolConfig = from_json_string(&mut cx, arg1)?;
+
+    let (deferred, promise) = cx.promise();
+    let channel = cx.channel();
+    let runtime = runtime(&mut cx)?;
+
+    let fut = async move {
+        let worker = Worker::new(config).await;
+        deferred.settle_with(&channel, move |mut cx| {
+            if WORKER.get().is_some() && !throw_on_reinit {
+                return Ok(cx.null()); // Short circuit to make using maybe_init a no-op
+            }
+            let worker = worker.or_else(|e| cx.throw_error(format!("{}", e)))?;
+            let already_set = WORKER.set(worker).is_err();
+            if already_set && throw_on_reinit {
+                cx.throw_error("worker already initialized")
+            } else {
+                Ok(cx.null())
+            }
+        });
+    };
+
+    runtime.spawn(fut);
+
+    Ok(promise)
+}
+
+fn init_manager_impl(mut cx: FunctionContext, throw_on_reinit: bool) -> JsResult<JsPromise> {
+    let arg1 = cx.argument::<JsString>(0)?;
+    let config: ManagerConfig = from_json_string(&mut cx, arg1)?;
+
+    let (deferred, promise) = cx.promise();
+    let channel = cx.channel();
+    let runtime = runtime(&mut cx)?;
+
+    let fut = async move {
+        let manager = QueueManager::new(config).await;
+        deferred.settle_with(&channel, move |mut cx| {
+            if MANAGER.get().is_some() && !throw_on_reinit {
+                return Ok(cx.null()); // Short circuit to make using maybe_init a no-op
+            }
+            let manager = manager.or_else(|e| cx.throw_error(format!("{}", e)))?;
+            let already_set = MANAGER.set(manager).is_err();
+            if already_set && throw_on_reinit {
+                cx.throw_error("manager already initialized")
+            } else {
+                Ok(cx.null())
+            }
+        });
+    };
+
+    runtime.spawn(fut);
+
+    Ok(promise)
+}
+
+fn init_worker(cx: FunctionContext) -> JsResult<JsPromise> {
+    init_worker_impl(cx, true)
+}
+
+fn init_manager(cx: FunctionContext) -> JsResult<JsPromise> {
+    init_manager_impl(cx, true)
+}
+
+fn maybe_init_worker(cx: FunctionContext) -> JsResult<JsPromise> {
+    init_worker_impl(cx, false)
+}
+
+fn maybe_init_manager(cx: FunctionContext) -> JsResult<JsPromise> {
+    init_manager_impl(cx, false)
+}
+
+// throw_error has a type signature that makes it inconvenient to use in closures, because
+// it requires that you specify the V of the NeonResult returned, even though it's always
+// an error. This is a sane thing for it to do, but it's inconvenient for us, because we
+// frequently settle promises early, before we have a V to use for type inference. This little
+// wrapper makes that easier, by specifying the V as JsNull
+fn throw_null_err<'c, C>(cx: &mut C, msg: &str) -> NeonResult<Handle<'c, JsNull>>
+where
+    C: Context<'c>,
+{
+    cx.throw_error(msg)
+}
+
+fn create_job(mut cx: FunctionContext) -> JsResult<JsPromise> {
+    let arg1: Handle<JsString> = cx.argument::<JsString>(0)?;
+    let job: JobInit = from_json_string(&mut cx, arg1)?;
+
+    let (deferred, promise) = cx.promise();
+    let channel = cx.channel();
+    let runtime = runtime(&mut cx)?;
+
+    let fut = async move {
+        let manager = match MANAGER.get() {
+            Some(manager) => manager,
+            None => {
+                deferred.settle_with(&channel, |mut cx| {
+                    throw_null_err(&mut cx, "manager not initialized")
+                });
+                return;
+            }
+        };
+        let job = manager.create_job(job).await;
+        deferred.settle_with(&channel, move |mut cx| {
+            job.or_else(|e| cx.throw_error(format!("{}", e)))?;
+            Ok(cx.null())
+        });
+    };
+
+    runtime.spawn(fut);
+
+    Ok(promise)
+}
+
+fn dequeue_jobs(mut cx: FunctionContext) -> JsResult<JsPromise> {
+    let queue_name = cx.argument::<JsString>(0)?.value(&mut cx);
+
+    let limit = cx.argument::<JsNumber>(1)?.value(&mut cx) as usize; // TODO - I don't love this cast
+
+    let (deferred, promise) = cx.promise();
+    let channel = cx.channel();
+    let runtime = runtime(&mut cx)?;
+
+    let fut = async move {
+        let worker = match WORKER.get() {
+            Some(worker) => worker,
+            None => {
+                deferred.settle_with(&channel, |mut cx| {
+                    throw_null_err(&mut cx, "worker not initialized")
+                });
+                return;
+            }
+        };
+        let jobs = worker.dequeue_jobs(&queue_name, limit).await;
+        deferred.settle_with(&channel, move |mut cx| {
+            let jobs = jobs.or_else(|e| cx.throw_error(format!("{}", e)))?;
+            let jobs = to_json_string(&mut cx, jobs)?;
+            Ok(cx.string(jobs))
+        });
+    };
+
+    runtime.spawn(fut);
+
+    Ok(promise)
+}
+
+fn dequeue_with_vm_state(mut cx: FunctionContext) -> JsResult<JsPromise> {
+    let queue_name = cx.argument::<JsString>(0)?.value(&mut cx);
+
+    let limit = cx.argument::<JsNumber>(1)?.value(&mut cx) as usize; // TODO - I don't love this cast
+
+    let (deferred, promise) = cx.promise();
+    let channel = cx.channel();
+    let runtime = runtime(&mut cx)?;
+
+    let fut = async move {
+        let worker = match WORKER.get() {
+            Some(worker) => worker,
+            None => {
+                deferred.settle_with(&channel, |mut cx| {
+                    throw_null_err(&mut cx, "worker not initialized")
+                });
+                return;
+            }
+        };
+        let jobs = worker.dequeue_with_vm_state(&queue_name, limit).await;
+        deferred.settle_with(&channel, move |mut cx| {
+            let jobs = jobs.or_else(|e| cx.throw_error(format!("{}", e)))?;
+            let jobs = to_json_string(&mut cx, jobs)?;
+            Ok(cx.string(jobs))
+        });
+    };
+
+    runtime.spawn(fut);
+
+    Ok(promise)
+}
+
+fn flush_job(mut cx: FunctionContext) -> JsResult<JsPromise> {
+    let arg1 = cx.argument::<JsString>(0)?.value(&mut cx);
+    let job_id: Uuid = arg1
+        .parse()
+        .or_else(|_| cx.throw_error(format!("invalid job id: {}", arg1)))?;
+
+    let (deferred, promise) = cx.promise();
+    let channel = cx.channel();
+    let runtime = runtime(&mut cx)?;
+
+    let fut = async move {
+        let worker = match WORKER.get() {
+            Some(worker) => worker,
+            None => {
+                deferred.settle_with(&channel, |mut cx| {
+                    throw_null_err(&mut cx, "worker not initialized")
+                });
+                return;
+            }
+        };
+        let res = worker.flush_job(job_id).await;
+        deferred.settle_with(&channel, move |mut cx| {
+            res.or_else(|e: cyclotron_core::error::QueueError| cx.throw_error(format!("{}", e)))?;
+            Ok(cx.null())
+        });
+    };
+
+    runtime.spawn(fut);
+
+    Ok(promise)
+}
+
+fn set_state(mut cx: FunctionContext) -> JsResult<JsNull> {
+    let arg = cx.argument::<JsString>(0)?.value(&mut cx);
+    let job_id: Uuid = arg
+        .parse()
+        .or_else(|_| cx.throw_error(format!("invalid job id: {}", arg)))?;
+
+    let arg = cx.argument::<JsString>(1)?.value(&mut cx);
+    let state: JobState = arg
+        .parse()
+        .or_else(|_| cx.throw_error(format!("invalid job state: {}", arg)))?;
+
+    WORKER
+        .get()
+        .map_or_else(|| cx.throw_error("worker not initialized"), Ok)?
+        .set_state(job_id, state)
+        .or_else(|e| cx.throw_error(format!("{}", e)))?;
+
+    Ok(cx.null())
+}
+
+fn set_queue(mut cx: FunctionContext) -> JsResult<JsNull> {
+    let arg = cx.argument::<JsString>(0)?.value(&mut cx);
+    let job_id: Uuid = arg
+        .parse()
+        .or_else(|_| cx.throw_error(format!("invalid job id: {}", arg)))?;
+
+    let queue = cx.argument::<JsString>(1)?.value(&mut cx);
+
+    WORKER
+        .get()
+        .map_or_else(|| cx.throw_error("worker not initialized"), Ok)?
+        .set_queue(job_id, &queue)
+        .or_else(|e| cx.throw_error(format!("{}", e)))?;
+
+    Ok(cx.null())
+}
+
+fn set_priority(mut cx: FunctionContext) -> JsResult<JsNull> {
+    let arg = cx.argument::<JsString>(0)?.value(&mut cx);
+    let job_id: Uuid = arg
+        .parse()
+        .or_else(|_| cx.throw_error(format!("invalid job id: {}", arg)))?;
+
+    let arg = cx.argument::<JsNumber>(1)?.value(&mut cx);
+    let priority = arg as i16; // TODO - I /really/ don't love this cast
+
+    WORKER
+        .get()
+        .map_or_else(|| cx.throw_error("worker not initialized"), Ok)?
+        .set_priority(job_id, priority)
+        .or_else(|e| cx.throw_error(format!("{}", e)))?;
+
+    Ok(cx.null())
+}
+
+fn set_scheduled_at(mut cx: FunctionContext) -> JsResult<JsNull> {
+    let arg = cx.argument::<JsString>(0)?.value(&mut cx);
+    let job_id: Uuid = arg
+        .parse()
+        .or_else(|_| cx.throw_error(format!("invalid job id: {}", arg)))?;
+
+    let arg = cx.argument::<JsString>(1)?.value(&mut cx);
+    let scheduled: DateTime<Utc> = arg
+        .parse()
+        .or_else(|_| cx.throw_error(format!("invalid scheduled at: {}", arg)))?;
+
+    WORKER
+        .get()
+        .map_or_else(|| cx.throw_error("worker not initialized"), Ok)?
+        .set_scheduled_at(job_id, scheduled)
+        .or_else(|e| cx.throw_error(format!("{}", e)))?;
+
+    Ok(cx.null())
+}
+
+fn set_vm_state(mut cx: FunctionContext) -> JsResult<JsNull> {
+    let arg = cx.argument::<JsString>(0)?.value(&mut cx);
+    let job_id: Uuid = arg
+        .parse()
+        .or_else(|_| cx.throw_error(format!("invalid job id: {}", arg)))?;
+
+    // Tricky - we have to support passing nulls here, because that's how you clear vm state.
+    let vm_state = cx.argument::<JsValue>(1)?;
+    let vm_state = if vm_state.is_a::<JsNull, _>(&mut cx) {
+        None
+    } else {
+        Some(
+            vm_state
+                .downcast_or_throw::<JsString, _>(&mut cx)?
+                .value(&mut cx),
+        )
+    };
+
+    WORKER
+        .get()
+        .map_or_else(|| cx.throw_error("worker not initialized"), Ok)?
+        .set_vm_state(job_id, vm_state)
+        .or_else(|e| cx.throw_error(format!("{}", e)))?;
+
+    Ok(cx.null())
+}
+
+fn set_metadata(mut cx: FunctionContext) -> JsResult<JsNull> {
+    let arg = cx.argument::<JsString>(0)?.value(&mut cx);
+    let job_id: Uuid = arg
+        .parse()
+        .or_else(|_| cx.throw_error(format!("invalid job id: {}", arg)))?;
+
+    // Tricky - we have to support passing nulls here, because that's how you clear metadata.
+    let metadata = cx.argument::<JsValue>(1)?;
+    let metadata = if metadata.is_a::<JsNull, _>(&mut cx) {
+        None
+    } else {
+        Some(
+            metadata
+                .downcast_or_throw::<JsString, _>(&mut cx)?
+                .value(&mut cx),
+        )
+    };
+
+    WORKER
+        .get()
+        .map_or_else(|| cx.throw_error("worker not initialized"), Ok)?
+        .set_metadata(job_id, metadata)
+        .or_else(|e| cx.throw_error(format!("{}", e)))?;
+
+    Ok(cx.null())
+}
+
+fn set_parameters(mut cx: FunctionContext) -> JsResult<JsNull> {
+    let arg = cx.argument::<JsString>(0)?.value(&mut cx);
+    let job_id: Uuid = arg
+        .parse()
+        .or_else(|_| cx.throw_error(format!("invalid job id: {}", arg)))?;
+
+    // Tricky - we have to support passing nulls here, because that's how you clear parameters.
+
+    let parameters = cx.argument::<JsValue>(1)?;
+    let parameters = if parameters.is_a::<JsNull, _>(&mut cx) {
+        None
+    } else {
+        Some(
+            parameters
+                .downcast_or_throw::<JsString, _>(&mut cx)?
+                .value(&mut cx),
+        )
+    };
+
+    WORKER
+        .get()
+        .map_or_else(|| cx.throw_error("worker not initialized"), Ok)?
+        .set_parameters(job_id, parameters)
+        .or_else(|e| cx.throw_error(format!("{}", e)))?;
+
+    Ok(cx.null())
+}
+
+#[neon::main]
+fn main(mut cx: ModuleContext) -> NeonResult<()> {
+    cx.export_function("hello", hello)?;
+    cx.export_function("initWorker", init_worker)?;
+    cx.export_function("initManager", init_manager)?;
+    cx.export_function("maybeInitWorker", maybe_init_worker)?;
+    cx.export_function("maybeInitManager", maybe_init_manager)?;
+    cx.export_function("createJob", create_job)?;
+    cx.export_function("dequeueJobs", dequeue_jobs)?;
+    cx.export_function("dequeueJobsWithVmState", dequeue_with_vm_state)?;
+    cx.export_function("flushJob", flush_job)?;
+    cx.export_function("setState", set_state)?;
+    cx.export_function("setQueue", set_queue)?;
+    cx.export_function("setPriority", set_priority)?;
+    cx.export_function("setScheduledAt", set_scheduled_at)?;
+    cx.export_function("setVmState", set_vm_state)?;
+    cx.export_function("setMetadata", set_metadata)?;
+    cx.export_function("setParameters", set_parameters)?;
+
+    Ok(())
+}
diff --git a/rust/cyclotron-node/tsconfig.json b/rust/cyclotron-node/tsconfig.json
new file mode 100644
index 00000000000..4fa58397f06
--- /dev/null
+++ b/rust/cyclotron-node/tsconfig.json
@@ -0,0 +1,24 @@
+{
+    "compilerOptions": {
+        "module": "CommonJS",
+        "target": "ESNext",
+        "declaration": true,
+        "removeComments": true,
+        "emitDecoratorMetadata": true,
+        "experimentalDecorators": true,
+        "moduleResolution": "node",
+        "esModuleInterop": true,
+        "allowJs": true,
+        "sourceMap": true,
+        "baseUrl": "src/",
+        "rootDir": "src/",
+        "outDir": "dist/",
+        "types": ["node"],
+        "resolveJsonModule": true,
+        "strict": true,
+        "noImplicitAny": true,
+        "useUnknownInCatchVariables": false
+    },
+    "include": ["src"],
+    "exclude": ["node_modules", "dist", "bin"]
+}
diff --git a/rust/hook-api/Cargo.toml b/rust/hook-api/Cargo.toml
index c3528d23da5..7887e8e49a8 100644
--- a/rust/hook-api/Cargo.toml
+++ b/rust/hook-api/Cargo.toml
@@ -22,3 +22,4 @@ tower = { workspace = true }
 tracing = { workspace = true }
 tracing-subscriber = { workspace = true }
 url = { workspace = true }
+common-metrics = { path = "../common/metrics" }
diff --git a/rust/hook-api/src/main.rs b/rust/hook-api/src/main.rs
index 7ca8de09513..1f84abb4e46 100644
--- a/rust/hook-api/src/main.rs
+++ b/rust/hook-api/src/main.rs
@@ -3,7 +3,7 @@ use config::Config;
 use envconfig::Envconfig;
 use eyre::Result;
 
-use hook_common::metrics::setup_metrics_routes;
+use common_metrics::setup_metrics_routes;
 use hook_common::pgqueue::PgQueue;
 
 mod config;
diff --git a/rust/hook-common/Cargo.toml b/rust/hook-common/Cargo.toml
index e5c27fd5982..e6b2625c239 100644
--- a/rust/hook-common/Cargo.toml
+++ b/rust/hook-common/Cargo.toml
@@ -8,13 +8,10 @@ workspace = true
 
 [dependencies]
 async-trait = { workspace = true }
-axum = { workspace = true, features = ["http2"] }
 chrono = { workspace = true }
 envconfig = { workspace = true }
 health = { path = "../common/health" }
 http = { workspace = true }
-metrics = { workspace = true }
-metrics-exporter-prometheus = { workspace = true }
 rdkafka = { workspace = true }
 reqwest = { workspace = true }
 serde = { workspace = true }
diff --git a/rust/hook-common/src/lib.rs b/rust/hook-common/src/lib.rs
index 5531ceb7346..e1446d80c33 100644
---
a/rust/hook-common/src/lib.rs +++ b/rust/hook-common/src/lib.rs @@ -1,7 +1,6 @@ pub mod config; pub mod kafka_messages; pub mod kafka_producer; -pub mod metrics; pub mod pgqueue; pub mod retry; pub mod test; diff --git a/rust/hook-janitor/Cargo.toml b/rust/hook-janitor/Cargo.toml index 21894a38f80..a4fa315da70 100644 --- a/rust/hook-janitor/Cargo.toml +++ b/rust/hook-janitor/Cargo.toml @@ -24,3 +24,4 @@ time = { workspace = true } tokio = { workspace = true } tracing = { workspace = true } tracing-subscriber = { workspace = true } +common-metrics = { path = "../common/metrics" } \ No newline at end of file diff --git a/rust/hook-janitor/src/main.rs b/rust/hook-janitor/src/main.rs index 200e5a00305..b7ea4db85ec 100644 --- a/rust/hook-janitor/src/main.rs +++ b/rust/hook-janitor/src/main.rs @@ -9,8 +9,8 @@ use std::{str::FromStr, time::Duration}; use tokio::sync::Semaphore; use webhooks::WebhookCleaner; +use common_metrics::setup_metrics_routes; use hook_common::kafka_producer::create_kafka_producer; -use hook_common::metrics::setup_metrics_routes; mod cleanup; mod config; diff --git a/rust/hook-janitor/src/webhooks.rs b/rust/hook-janitor/src/webhooks.rs index c40c7441c5b..c523a4c59da 100644 --- a/rust/hook-janitor/src/webhooks.rs +++ b/rust/hook-janitor/src/webhooks.rs @@ -17,10 +17,10 @@ use tracing::{debug, error, info}; use crate::cleanup::Cleaner; +use common_metrics::get_current_timestamp_seconds; use hook_common::kafka_messages::app_metrics::{AppMetric, AppMetricCategory}; use hook_common::kafka_messages::app_metrics2::{self, AppMetric2}; use hook_common::kafka_producer::KafkaContext; -use hook_common::metrics::get_current_timestamp_seconds; #[derive(Error, Debug)] pub enum WebhookCleanerError { @@ -1080,7 +1080,7 @@ mod tests { let mut conn = db.acquire().await.unwrap(); let count: i64 = sqlx::query("SELECT count(*) FROM job_queue WHERE status = $1::job_status") - .bind(&status) + .bind(status) .fetch_one(&mut *conn) .await .unwrap() @@ -1105,7 +1105,7 @@ mod tests { { // The fixtures include an available job, so let's complete it while the txn is open. 
let mut batch: PgTransactionBatch<'_, WebhookJobParameters, WebhookJobMetadata> = queue - .dequeue_tx(&"worker_id", 1) + .dequeue_tx("worker_id", 1) .await .expect("failed to dequeue job") .expect("didn't find a job to dequeue"); @@ -1130,10 +1130,10 @@ mod tests { plugin_id: 2, plugin_config_id: 3, }; - let new_job = NewJob::new(1, job_metadata, job_parameters, &"target"); + let new_job = NewJob::new(1, job_metadata, job_parameters, "target"); queue.enqueue(new_job).await.expect("failed to enqueue job"); let mut batch: PgTransactionBatch<'_, WebhookJobParameters, WebhookJobMetadata> = queue - .dequeue_tx(&"worker_id", 1) + .dequeue_tx("worker_id", 1) .await .expect("failed to dequeue job") .expect("didn't find a job to dequeue"); @@ -1158,7 +1158,7 @@ mod tests { plugin_id: 2, plugin_config_id: 3, }; - let new_job = NewJob::new(1, job_metadata, job_parameters, &"target"); + let new_job = NewJob::new(1, job_metadata, job_parameters, "target"); queue.enqueue(new_job).await.expect("failed to enqueue job"); } diff --git a/rust/hook-worker/Cargo.toml b/rust/hook-worker/Cargo.toml index d09a2412069..f7aaf59c752 100644 --- a/rust/hook-worker/Cargo.toml +++ b/rust/hook-worker/Cargo.toml @@ -25,6 +25,8 @@ tokio = { workspace = true } tracing = { workspace = true } tracing-subscriber = { workspace = true } url = { version = "2.2" } +common-metrics = { path = "../common/metrics" } +common-dns = { path = "../common/dns" } [dev-dependencies] httpmock = { workspace = true } diff --git a/rust/hook-worker/src/error.rs b/rust/hook-worker/src/error.rs index 70877f2e1c3..3b12bf28977 100644 --- a/rust/hook-worker/src/error.rs +++ b/rust/hook-worker/src/error.rs @@ -2,7 +2,7 @@ use std::error::Error; use std::fmt; use std::time; -use crate::dns::NoPublicIPv4Error; +use common_dns::NoPublicIPv4Error; use hook_common::{pgqueue, webhook::WebhookJobError}; use http::StatusCode; use thiserror::Error; diff --git a/rust/hook-worker/src/lib.rs b/rust/hook-worker/src/lib.rs index 94a07584f1d..8488d15b20a 100644 --- a/rust/hook-worker/src/lib.rs +++ b/rust/hook-worker/src/lib.rs @@ -1,5 +1,4 @@ pub mod config; -pub mod dns; pub mod error; pub mod util; pub mod worker; diff --git a/rust/hook-worker/src/main.rs b/rust/hook-worker/src/main.rs index 5400ff93bf6..798586bc6ed 100644 --- a/rust/hook-worker/src/main.rs +++ b/rust/hook-worker/src/main.rs @@ -2,13 +2,13 @@ use axum::routing::get; use axum::Router; use envconfig::Envconfig; +use hook_common::pgqueue::PgQueue; +use hook_common::retry::RetryPolicy; use std::future::ready; +use common_metrics::{serve, setup_metrics_routes}; use health::HealthRegistry; use hook_common::kafka_producer::create_kafka_producer; -use hook_common::{ - metrics::serve, metrics::setup_metrics_routes, pgqueue::PgQueue, retry::RetryPolicy, -}; use hook_worker::config::Config; use hook_worker::error::WorkerError; use hook_worker::worker::WebhookWorker; diff --git a/rust/hook-worker/src/worker.rs b/rust/hook-worker/src/worker.rs index f59f2dec627..bba15cd67c9 100644 --- a/rust/hook-worker/src/worker.rs +++ b/rust/hook-worker/src/worker.rs @@ -23,11 +23,11 @@ use hook_common::{ webhook::{HttpMethod, WebhookJobError, WebhookJobParameters}, }; -use crate::dns::{NoPublicIPv4Error, PublicIPv4Resolver}; use crate::error::{ is_error_source, WebhookError, WebhookParseError, WebhookRequestError, WorkerError, }; use crate::util::first_n_bytes_of_response; +use common_dns::{NoPublicIPv4Error, PublicIPv4Resolver}; // TODO: Either make this configurable or adjust it once we don't produce results to Kafka, 
where // our size limit is relatively low. @@ -1026,7 +1026,7 @@ mod tests { .unwrap() .as_array() .unwrap() - .get(0) + .first() .unwrap(); first_timing .get("duration_ms") @@ -1142,7 +1142,7 @@ mod tests { .unwrap() .as_array() .unwrap() - .get(0) + .first() .unwrap(); first_timing .get("duration_ms") @@ -1255,8 +1255,7 @@ mod tests { let err = send_webhook(localhost_client(), &method, url, &headers, body.to_owned()) .await - .err() - .expect("request didn't fail when it should have failed"); + .expect_err("request didn't fail when it should have failed"); assert!(matches!(err, WebhookError::Request(..))); if let WebhookError::Request(request_error) = err { @@ -1281,8 +1280,7 @@ mod tests { let err = send_webhook(localhost_client(), &method, url, &headers, body.to_owned()) .await - .err() - .expect("request didn't fail when it should have failed"); + .expect_err("request didn't fail when it should have failed"); assert!(matches!(err, WebhookError::Request(..))); if let WebhookError::Request(request_error) = err { @@ -1309,8 +1307,7 @@ mod tests { let err = send_webhook(filtering_client, &method, url, &headers, body.to_owned()) .await - .err() - .expect("request didn't fail when it should have failed"); + .expect_err("request didn't fail when it should have failed"); assert!(matches!(err, WebhookError::Request(..))); if let WebhookError::Request(request_error) = err {