feat: cyclotron (#24228)

Co-authored-by: Brett Hoerner <brett@bretthoerner.com>
Co-authored-by: Ben White <ben@posthog.com>

parent e1def6e3c1
commit 9734a40c96
@@ -39,3 +39,11 @@
 !test-runner-jest.config.js
 !test-runner-jest-environment.js
 !patches
+!rust
+rust/.env
+rust/.github
+rust/docker
+rust/target
+rust/cyclotron-node/dist
+rust/cyclotron-node/node_modules
+rust/cyclotron-node/index.node
.gitignore (vendored): 2 lines changed
@@ -64,3 +64,5 @@ plugin-transpiler/dist
 *-esbuild-bundle-visualization.html
 .dlt
 *.db
+# Ignore any log files that happen to be present
+*.log
bin/start-cyclotron (new executable file): 24 lines
@@ -0,0 +1,24 @@
#!/bin/bash

set -ex

trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM EXIT

cd rust

cargo build

export RUST_LOG=${DEBUG:-debug}
SQLX_QUERY_LEVEL=${SQLX_QUERY_LEVEL:-warn}
export RUST_LOG=$RUST_LOG,sqlx::query=$SQLX_QUERY_LEVEL

export DATABASE_URL=${DATABASE_URL:-postgres://posthog:posthog@localhost:5432/posthog}
export ALLOW_INTERNAL_IPS=${ALLOW_INTERNAL_IPS:-true}
cd cyclotron-core
cargo sqlx migrate run
cd ..

./target/debug/cyclotron-fetch &
./target/debug/cyclotron-janitor &

wait
@@ -27,7 +27,9 @@
         "services:start": "cd .. && docker compose -f docker-compose.dev.yml up",
         "services:stop": "cd .. && docker compose -f docker-compose.dev.yml down",
         "services:clean": "cd .. && docker compose -f docker-compose.dev.yml rm -v",
-        "services": "pnpm services:stop && pnpm services:clean && pnpm services:start"
+        "services": "pnpm services:stop && pnpm services:clean && pnpm services:start",
+        "build:cyclotron": "cd ../rust/cyclotron-node && pnpm run package",
+        "pnpm:devPreinstall": "pnpm run build:cyclotron"
     },
     "graphile-worker": {
         "maxContiguousErrors": 300
@@ -86,7 +88,8 @@
         "uuid": "^9.0.1",
         "v8-profiler-next": "^1.9.0",
         "vm2": "3.9.18",
-        "detect-browser": "^5.3.0"
+        "detect-browser": "^5.3.0",
+        "@posthog/cyclotron": "file:../rust/cyclotron-node"
     },
     "devDependencies": {
         "0x": "^5.5.0",
@@ -43,6 +43,9 @@ dependencies:
   '@posthog/clickhouse':
     specifier: ^1.7.0
     version: 1.7.0
+  '@posthog/cyclotron':
+    specifier: file:../rust/cyclotron-node
+    version: file:../rust/cyclotron-node
   '@posthog/hogvm':
     specifier: ^1.0.32
     version: 1.0.32(luxon@3.4.4)(re2@1.20.3)
@@ -10731,3 +10734,8 @@ packages:
   /yocto-queue@0.1.0:
     resolution: {integrity: sha512-rVksvsnNCdJ/ohGc6xgPwyN8eheCxsiLM8mxuE/t/mOVqJewPuO1miLpTHQiRgTKCLexL4MeAFVagts7HmNZ2Q==}
     engines: {node: '>=10'}
+
+  file:../rust/cyclotron-node:
+    resolution: {directory: ../rust/cyclotron-node, type: directory}
+    name: '@posthog/cyclotron'
+    dev: false
@@ -26,6 +26,7 @@ export function getPluginServerCapabilities(config: PluginsServerConfig): Plugin
                 cdpProcessedEvents: true,
                 cdpFunctionCallbacks: true,
                 cdpFunctionOverflow: true,
+                cdpCyclotronWorker: true,
                 syncInlinePlugins: true,
                 ...sharedCapabilities,
             }
@@ -108,6 +109,11 @@ export function getPluginServerCapabilities(config: PluginsServerConfig): Plugin
                 cdpFunctionOverflow: true,
                 ...sharedCapabilities,
             }
+        case PluginServerMode.cdp_cyclotron_worker:
+            return {
+                cdpCyclotronWorker: true,
+                ...sharedCapabilities,
+            }
        // This is only for functional tests, which time out if all capabilities are used
        // ideally we'd run just the specific capability needed per test, but that's not easy to do atm
        case PluginServerMode.functional_tests:
@@ -1,3 +1,4 @@
+import cyclotron from '@posthog/cyclotron'
 import { Histogram } from 'prom-client'

 import { buildIntegerMatcher } from '../config/config'
@@ -27,9 +28,11 @@ export type AsyncFunctionExecutorOptions = {

 export class AsyncFunctionExecutor {
     hogHookEnabledForTeams: ValueMatcher<number>
+    cyclotronEnabledForTeams: ValueMatcher<number>

     constructor(private serverConfig: PluginsServerConfig, private rustyHook: RustyHook) {
         this.hogHookEnabledForTeams = buildIntegerMatcher(serverConfig.CDP_ASYNC_FUNCTIONS_RUSTY_HOOK_TEAMS, true)
+        this.cyclotronEnabledForTeams = buildIntegerMatcher(serverConfig.CDP_ASYNC_FUNCTIONS_CYCLOTRON_TEAMS, true)
     }

     async execute(
@@ -99,8 +102,44 @@ export class AsyncFunctionExecutor {
             histogramFetchPayloadSize.observe(body.length / 1024)
         }

-        // If the caller hasn't forced it to be synchronous and the team has the rustyhook enabled, enqueue it
-        if (!options?.sync && this.hogHookEnabledForTeams(request.teamId)) {
+        // If the caller hasn't forced it to be synchronous and the team has the cyclotron or
+        // rustyhook enabled, enqueue it in one of those services.
+        if (!options?.sync && this.cyclotronEnabledForTeams(request.teamId)) {
+            try {
+                await cyclotron.createJob({
+                    teamId: request.teamId,
+                    functionId: request.hogFunctionId,
+                    queueName: 'fetch',
+                    // TODO: The async function compression changes happen upstream of this
+                    // function. I guess we'll want to unwind that change because we actually
+                    // want the `vmState` (and the rest of state) so we can put it into PG here.
+                    vmState: '',
+                    parameters: JSON.stringify({
+                        return_queue: 'hog',
+                        url,
+                        method,
+                        headers,
+                        body,
+                    }),
+                    metadata: JSON.stringify({
+                        // TODO: It seems like Fetch expects metadata to have this shape, which
+                        // I don't understand. I think `metadata` is where all the other Hog
+                        // state is going to be stored? For now I'm just trying to make fetch
+                        // work.
+                        tries: 0,
+                        trace: [],
+                    }),
+                })
+            } catch (e) {
+                status.error(
+                    '🦔',
+                    `[HogExecutor] Cyclotron failed to enqueue async fetch function, sending directly instead`,
+                    {
+                        error: e,
+                    }
+                )
+            }
+        } else if (!options?.sync && this.hogHookEnabledForTeams(request.teamId)) {
             const hoghooksPayload = JSON.stringify(request)

             histogramHogHooksPayloadSize.observe(hoghooksPayload.length / 1024)
@@ -1,3 +1,4 @@
+import cyclotron from '@posthog/cyclotron'
 import { captureException } from '@sentry/node'
 import { features, librdkafkaVersion, Message } from 'node-rdkafka'
 import { Counter, Histogram } from 'prom-client'
@@ -443,7 +444,12 @@ abstract class CdpConsumerBase {
         const globalConnectionConfig = createRdConnectionConfigFromEnvVars(this.hub)
         const globalProducerConfig = createRdProducerConfigFromEnvVars(this.hub)

-        await Promise.all([this.hogFunctionManager.start()])
+        await Promise.all([
+            this.hogFunctionManager.start(),
+            this.hub.CYCLOTRON_DATABASE_URL
+                ? cyclotron.initManager({ shards: [{ dbUrl: this.hub.CYCLOTRON_DATABASE_URL }] })
+                : Promise.resolve(),
+        ])

         this.kafkaProducer = new KafkaProducerWrapper(
             await createKafkaProducer(globalConnectionConfig, globalProducerConfig)
@@ -693,3 +699,57 @@ export class CdpOverflowConsumer extends CdpConsumerBase {
         return invocationGlobals
     }
 }
+
+// TODO: Split out non-Kafka specific parts of CdpConsumerBase so that it can be used by the
+// Cyclotron worker below. Or maybe we can just wait, and rip the Kafka bits out once Cyclotron is
+// shipped (and rename it something other than consumer, probably). For now, this is an easy way to
+// use existing code and get an end-to-end demo shipped.
+export class CdpCyclotronWorker extends CdpConsumerBase {
+    protected name = 'CdpCyclotronWorker'
+    protected topic = 'UNUSED-CdpCyclotronWorker'
+    protected consumerGroupId = 'UNUSED-CdpCyclotronWorker'
+    private runningWorker: Promise<void> | undefined
+    private isUnhealthy = false
+
+    public async _handleEachBatch(_: Message[]): Promise<void> {
+        // Not called, we override `start` below to use Cyclotron instead.
+    }
+
+    private async innerStart() {
+        try {
+            const limit = 100 // TODO: Make configurable.
+            while (!this.isStopping) {
+                const jobs = await cyclotron.dequeueJobsWithVmState('hog', limit)
+                for (const job of jobs) {
+                    // TODO: Reassemble a HogFunctionInvocationAsyncResponse (or whatever proper type)
+                    // from the fields on the job, and then execute the next Hog step.
+                    console.log(job.id)
+                }
+            }
+        } catch (err) {
+            this.isUnhealthy = true
+            console.error('Error in Cyclotron worker', err)
+            throw err
+        }
+    }
+
+    public async start() {
+        await cyclotron.initManager({ shards: [{ dbUrl: this.hub.CYCLOTRON_DATABASE_URL }] })
+        await cyclotron.initWorker({ dbUrl: this.hub.CYCLOTRON_DATABASE_URL })
+
+        // Consumer `start` expects an async task is started, and not that `start` itself blocks
+        // indefinitely.
+        this.runningWorker = this.innerStart()
+
+        return Promise.resolve()
+    }
+
+    public async stop() {
+        await super.stop()
+        await this.runningWorker
+    }
+
+    public isHealthy() {
+        return this.isUnhealthy
+    }
+}
@@ -187,9 +187,13 @@ export function getDefaultConfig(): PluginsServerConfig {
         CDP_WATCHER_REFILL_RATE: 10,
         CDP_WATCHER_DISABLED_TEMPORARY_MAX_COUNT: 3,
         CDP_ASYNC_FUNCTIONS_RUSTY_HOOK_TEAMS: '',
+        CDP_ASYNC_FUNCTIONS_CYCLOTRON_TEAMS: '',
         CDP_REDIS_PASSWORD: '',
         CDP_REDIS_HOST: '',
         CDP_REDIS_PORT: 6479,
+
+        // Cyclotron
+        CYCLOTRON_DATABASE_URL: '',
     }
 }

@@ -11,7 +11,12 @@ import v8Profiler from 'v8-profiler-next'

 import { getPluginServerCapabilities } from '../capabilities'
 import { CdpApi } from '../cdp/cdp-api'
-import { CdpFunctionCallbackConsumer, CdpOverflowConsumer, CdpProcessedEventsConsumer } from '../cdp/cdp-consumers'
+import {
+    CdpCyclotronWorker,
+    CdpFunctionCallbackConsumer,
+    CdpOverflowConsumer,
+    CdpProcessedEventsConsumer,
+} from '../cdp/cdp-consumers'
 import { defaultConfig, sessionRecordingConsumerConfig } from '../config/config'
 import { Hub, PluginServerCapabilities, PluginsServerConfig } from '../types'
 import { createHub, createKafkaClient, createKafkaProducerWrapper } from '../utils/db/hub'
@@ -571,6 +576,17 @@ export async function startPluginsServer(
             healthChecks['cdp-overflow'] = () => consumer.isHealthy() ?? false
         }

+        if (capabilities.cdpCyclotronWorker) {
+            ;[hub, closeHub] = hub ? [hub, closeHub] : await createHub(serverConfig, capabilities)
+            if (hub.CYCLOTRON_DATABASE_URL) {
+                const worker = new CdpCyclotronWorker(hub)
+                await worker.start()
+            } else {
+                // This is a temporary solution until we *require* Cyclotron to be configured.
+                status.warn('💥', 'CYCLOTRON_DATABASE_URL is not set, not running Cyclotron worker')
+            }
+        }
+
         if (capabilities.http) {
             const app = setupCommonRoutes(healthChecks, analyticsEventsIngestionConsumer)

@@ -85,6 +85,7 @@ export enum PluginServerMode {
     cdp_processed_events = 'cdp-processed-events',
     cdp_function_callbacks = 'cdp-function-callbacks',
     cdp_function_overflow = 'cdp-function-overflow',
+    cdp_cyclotron_worker = 'cdp-cyclotron-worker',
     functional_tests = 'functional-tests',
 }

@@ -107,6 +108,7 @@ export type CdpConfig = {
     CDP_WATCHER_DISABLED_TEMPORARY_TTL: number // How long a function should be temporarily disabled for
     CDP_WATCHER_DISABLED_TEMPORARY_MAX_COUNT: number // How many times a function can be disabled before it is disabled permanently
     CDP_ASYNC_FUNCTIONS_RUSTY_HOOK_TEAMS: string
+    CDP_ASYNC_FUNCTIONS_CYCLOTRON_TEAMS: string
     CDP_REDIS_HOST: string
     CDP_REDIS_PORT: number
     CDP_REDIS_PASSWORD: string
@@ -279,6 +281,8 @@ export interface PluginsServerConfig extends CdpConfig {

     // kafka debug stats interval
     SESSION_RECORDING_KAFKA_CONSUMPTION_STATISTICS_EVENT_INTERVAL_MS: number
+
+    CYCLOTRON_DATABASE_URL: string
 }

 export interface Hub extends PluginsServerConfig {
@@ -345,6 +349,7 @@ export interface PluginServerCapabilities {
     cdpProcessedEvents?: boolean
     cdpFunctionCallbacks?: boolean
     cdpFunctionOverflow?: boolean
+    cdpCyclotronWorker?: boolean
     appManagementSingleton?: boolean
     preflightSchedules?: boolean // Used for instance health checks on hobby deploy, not useful on cloud
     http?: boolean
@@ -97,6 +97,7 @@ describe('server', () => {
                 cdpProcessedEvents: true,
                 cdpFunctionCallbacks: true,
                 cdpFunctionOverflow: true,
+                cdpCyclotronWorker: true,
                 syncInlinePlugins: true,
             }
         )
@@ -38,11 +38,12 @@ COPY ./bin/ ./bin/
 COPY babel.config.js tsconfig.json webpack.config.js tailwind.config.js ./
 RUN pnpm build

 #
 # ---------------------------------------------------------
 #
-FROM node:18.19.1-bullseye-slim AS plugin-server-build
+FROM ghcr.io/posthog/rust-node-container:bullseye_rust_1.80.1-node_18.19.1 AS plugin-server-build
+WORKDIR /code
+COPY ./rust ./rust
 WORKDIR /code/plugin-server
 SHELL ["/bin/bash", "-e", "-o", "pipefail", "-c"]

@@ -182,6 +183,7 @@ COPY --from=plugin-server-build --chown=posthog:posthog /code/plugin-server/dist
 COPY --from=plugin-server-build --chown=posthog:posthog /code/plugin-server/node_modules /code/plugin-server/node_modules
 COPY --from=plugin-server-build --chown=posthog:posthog /code/plugin-server/package.json /code/plugin-server/package.json

+
 # Copy the Python dependencies and Django staticfiles from the posthog-build stage.
 COPY --from=posthog-build --chown=posthog:posthog /code/staticfiles /code/staticfiles
 COPY --from=posthog-build --chown=posthog:posthog /python-runtime /python-runtime
rust/.cargo/config.toml (new file): 4 lines
@@ -0,0 +1,4 @@
[env]
# Force SQLX to run in offline mode for CI. Devs can change this if they want, to live code against the DB,
# but we use it at the workspace level here to allow use of sqlx macros across all crates
SQLX_OFFLINE = "true"
New sqlx offline query cache files (one JSON file per query; file names were not preserved in this view):

@@ -0,0 +1,12 @@
{
  "db_name": "PostgreSQL",
  "query": "UPDATE cyclotron_jobs SET last_heartbeat = NOW() WHERE id = $1 AND lock_id = $2",
  "describe": {
    "columns": [],
    "parameters": { "Left": ["Uuid", "Uuid"] },
    "nullable": []
  },
  "hash": "075421be22b51c50eb74ac1156175c285bc510766c175b1b8c4e4002e04ff503"
}

@@ -0,0 +1,12 @@
{
  "db_name": "PostgreSQL",
  "query": "UPDATE cyclotron_jobs SET vm_state = $1 WHERE id = $2 AND lock_id = $3",
  "describe": {
    "columns": [],
    "parameters": { "Left": ["Text", "Uuid", "Uuid"] },
    "nullable": []
  },
  "hash": "16d533b5a15b0b9926a181f578b5b577efe424710b45f02e1ddeece8bca96f87"
}

@@ -0,0 +1,12 @@
{
  "db_name": "PostgreSQL",
  "query": "UPDATE cyclotron_jobs SET queue_name = $1 WHERE id = $2 AND lock_id = $3",
  "describe": {
    "columns": [],
    "parameters": { "Left": ["Text", "Uuid", "Uuid"] },
    "nullable": []
  },
  "hash": "2b62adf40f8dd5758690c763df30fffa01b679951ec786f8ee2410454b9a2de0"
}

@@ -0,0 +1,12 @@
{
  "db_name": "PostgreSQL",
  "query": "UPDATE cyclotron_jobs SET priority = $1 WHERE id = $2 AND lock_id = $3",
  "describe": {
    "columns": [],
    "parameters": { "Left": ["Int2", "Uuid", "Uuid"] },
    "nullable": []
  },
  "hash": "2f6de0977357909dfd8d3d510c39a284f16421f77b77fe38e67143f28e270805"
}

@@ -0,0 +1,117 @@
{
  "db_name": "PostgreSQL",
  "query": "\nWITH available AS (\n SELECT\n id,\n state\n FROM cyclotron_jobs\n WHERE\n state = 'available'::JobState\n AND queue_name = $1\n AND scheduled <= NOW()\n ORDER BY\n priority ASC,\n scheduled ASC\n LIMIT $2\n FOR UPDATE SKIP LOCKED\n)\nUPDATE cyclotron_jobs\nSET\n state = 'running'::JobState,\n lock_id = $3,\n last_heartbeat = NOW(),\n last_transition = NOW(),\n transition_count = transition_count + 1\nFROM available\nWHERE\n cyclotron_jobs.id = available.id\nRETURNING\n cyclotron_jobs.id,\n team_id,\n available.state as \"state: JobState\",\n queue_name,\n priority,\n function_id,\n created,\n last_transition,\n scheduled,\n transition_count,\n NULL as vm_state,\n metadata,\n parameters,\n lock_id,\n last_heartbeat,\n janitor_touch_count\n ",
  "describe": {
    "columns": [
      { "ordinal": 0, "name": "id", "type_info": "Uuid" },
      { "ordinal": 1, "name": "team_id", "type_info": "Int4" },
      { "ordinal": 2, "name": "state: JobState", "type_info": { "Custom": { "name": "jobstate", "kind": { "Enum": ["available", "completed", "failed", "running", "paused"] } } } },
      { "ordinal": 3, "name": "queue_name", "type_info": "Text" },
      { "ordinal": 4, "name": "priority", "type_info": "Int2" },
      { "ordinal": 5, "name": "function_id", "type_info": "Uuid" },
      { "ordinal": 6, "name": "created", "type_info": "Timestamptz" },
      { "ordinal": 7, "name": "last_transition", "type_info": "Timestamptz" },
      { "ordinal": 8, "name": "scheduled", "type_info": "Timestamptz" },
      { "ordinal": 9, "name": "transition_count", "type_info": "Int2" },
      { "ordinal": 10, "name": "vm_state", "type_info": "Text" },
      { "ordinal": 11, "name": "metadata", "type_info": "Text" },
      { "ordinal": 12, "name": "parameters", "type_info": "Text" },
      { "ordinal": 13, "name": "lock_id", "type_info": "Uuid" },
      { "ordinal": 14, "name": "last_heartbeat", "type_info": "Timestamptz" },
      { "ordinal": 15, "name": "janitor_touch_count", "type_info": "Int2" }
    ],
    "parameters": { "Left": ["Text", "Int8", "Uuid"] },
    "nullable": [false, false, false, false, false, true, false, false, false, false, null, true, true, true, true, false]
  },
  "hash": "350983ef271029734aff70eb7e298bfe578ecaa8678268863bce917ced9d5d46"
}

@@ -0,0 +1,12 @@
{
  "db_name": "PostgreSQL",
  "query": "\nWITH stalled AS (\n SELECT id FROM cyclotron_jobs WHERE state = 'running' AND COALESCE(last_heartbeat, $1) <= $1 FOR UPDATE SKIP LOCKED\n)\nUPDATE cyclotron_jobs\nSET state = 'available', lock_id = NULL, last_heartbeat = NULL, janitor_touch_count = janitor_touch_count + 1\nFROM stalled\nWHERE cyclotron_jobs.id = stalled.id\n    ",
  "describe": {
    "columns": [],
    "parameters": { "Left": ["Timestamptz"] },
    "nullable": []
  },
  "hash": "54d9afe6952f92b753fbce2c4e8554065b71152389f98d35532c6b332d5a4c9d"
}

@@ -0,0 +1,30 @@
{
  "db_name": "PostgreSQL",
  "query": "\nINSERT INTO cyclotron_jobs\n    (\n        id,\n        team_id,\n        function_id,\n        created,\n        lock_id,\n        last_heartbeat,\n        janitor_touch_count,\n        transition_count,\n        last_transition,\n        queue_name,\n        state,\n        scheduled,\n        priority,\n        vm_state,\n        metadata,\n        parameters\n    )\nVALUES\n    ($1, $2, $3, NOW(), NULL, NULL, 0, 0, NOW(), $4, $5, $6, $7, $8, $9, $10)\n    ",
  "describe": {
    "columns": [],
    "parameters": {
      "Left": [
        "Uuid",
        "Int4",
        "Uuid",
        "Text",
        { "Custom": { "name": "jobstate", "kind": { "Enum": ["available", "completed", "failed", "running", "paused"] } } },
        "Timestamptz",
        "Int2",
        "Text",
        "Text",
        "Text"
      ]
    },
    "nullable": []
  },
  "hash": "7217e766aeb53298238222c0c71a2ce446cac731845c53cb926fc47ace708dd6"
}

@@ -0,0 +1,12 @@
{
  "db_name": "PostgreSQL",
  "query": "UPDATE cyclotron_jobs SET lock_id = NULL, last_heartbeat = NULL WHERE id = $1 AND lock_id = $2",
  "describe": {
    "columns": [],
    "parameters": { "Left": ["Uuid", "Uuid"] },
    "nullable": []
  },
  "hash": "884da9767d2992c7b279b4f8df5129b8c4d7020b7cb1999702aee1bfb1087efb"
}

@@ -0,0 +1,12 @@
{
  "db_name": "PostgreSQL",
  "query": "UPDATE cyclotron_jobs SET metadata = $1 WHERE id = $2 AND lock_id = $3",
  "describe": {
    "columns": [],
    "parameters": { "Left": ["Text", "Uuid", "Uuid"] },
    "nullable": []
  },
  "hash": "8ab11a89bc4720985e130c58021f46045c332cc45e69b08708b289cc933b3b5c"
}

@@ -0,0 +1,12 @@
{
  "db_name": "PostgreSQL",
  "query": "UPDATE cyclotron_jobs SET scheduled = $1 WHERE id = $2 AND lock_id = $3",
  "describe": {
    "columns": [],
    "parameters": { "Left": ["Timestamptz", "Uuid", "Uuid"] },
    "nullable": []
  },
  "hash": "98da1f12285a97a47ce88535c82a8c4eb4cb910b0c5ddbc6bdbd9156af7b4e23"
}

@@ -0,0 +1,18 @@
{
  "db_name": "PostgreSQL",
  "query": "SELECT vm_state FROM cyclotron_jobs WHERE id = $1 AND lock_id = $2",
  "describe": {
    "columns": [{ "ordinal": 0, "name": "vm_state", "type_info": "Text" }],
    "parameters": { "Left": ["Uuid", "Uuid"] },
    "nullable": [true]
  },
  "hash": "aa595eaf28c1f4b872c278be407b59cc00f3125413f4032ac3647a6b5ee1a632"
}

@@ -0,0 +1,12 @@
{
  "db_name": "PostgreSQL",
  "query": "UPDATE cyclotron_jobs SET parameters = $1 WHERE id = $2 AND lock_id = $3",
  "describe": {
    "columns": [],
    "parameters": { "Left": ["Text", "Uuid", "Uuid"] },
    "nullable": []
  },
  "hash": "b160b785a0377b854341105e99e4ef7a5da523e168a5f9be47f6caaef09487d7"
}

@@ -0,0 +1,23 @@
{
  "db_name": "PostgreSQL",
  "query": "UPDATE cyclotron_jobs\n    SET state = $1, last_transition = NOW(), transition_count = transition_count + 1\n    WHERE id = $2 AND lock_id = $3",
  "describe": {
    "columns": [],
    "parameters": {
      "Left": [
        { "Custom": { "name": "jobstate", "kind": { "Enum": ["available", "completed", "failed", "running", "paused"] } } },
        "Uuid",
        "Uuid"
      ]
    },
    "nullable": []
  },
  "hash": "b3239c1dde9a88769ec488299612f7a3ad2b2ee57d8d2353c858299d7b6ffe13"
}

@@ -0,0 +1,117 @@
{
  "db_name": "PostgreSQL",
  "query": "\nWITH available AS (\n SELECT\n id,\n state\n FROM cyclotron_jobs\n WHERE\n state = 'available'::JobState\n AND queue_name = $1\n AND scheduled <= NOW()\n ORDER BY\n priority ASC,\n scheduled ASC\n LIMIT $2\n FOR UPDATE SKIP LOCKED\n)\nUPDATE cyclotron_jobs\nSET\n state = 'running'::JobState,\n lock_id = $3,\n last_heartbeat = NOW(),\n last_transition = NOW(),\n transition_count = transition_count + 1\nFROM available\nWHERE\n cyclotron_jobs.id = available.id\nRETURNING\n cyclotron_jobs.id,\n team_id,\n available.state as \"state: JobState\",\n queue_name,\n priority,\n function_id,\n created,\n last_transition,\n scheduled,\n transition_count,\n vm_state,\n metadata,\n parameters,\n lock_id,\n last_heartbeat,\n janitor_touch_count\n ",
  "describe": {
    "columns": [
      { "ordinal": 0, "name": "id", "type_info": "Uuid" },
      { "ordinal": 1, "name": "team_id", "type_info": "Int4" },
      { "ordinal": 2, "name": "state: JobState", "type_info": { "Custom": { "name": "jobstate", "kind": { "Enum": ["available", "completed", "failed", "running", "paused"] } } } },
      { "ordinal": 3, "name": "queue_name", "type_info": "Text" },
      { "ordinal": 4, "name": "priority", "type_info": "Int2" },
      { "ordinal": 5, "name": "function_id", "type_info": "Uuid" },
      { "ordinal": 6, "name": "created", "type_info": "Timestamptz" },
      { "ordinal": 7, "name": "last_transition", "type_info": "Timestamptz" },
      { "ordinal": 8, "name": "scheduled", "type_info": "Timestamptz" },
      { "ordinal": 9, "name": "transition_count", "type_info": "Int2" },
      { "ordinal": 10, "name": "vm_state", "type_info": "Text" },
      { "ordinal": 11, "name": "metadata", "type_info": "Text" },
      { "ordinal": 12, "name": "parameters", "type_info": "Text" },
      { "ordinal": 13, "name": "lock_id", "type_info": "Uuid" },
      { "ordinal": 14, "name": "last_heartbeat", "type_info": "Timestamptz" },
      { "ordinal": 15, "name": "janitor_touch_count", "type_info": "Int2" }
    ],
    "parameters": { "Left": ["Text", "Int8", "Uuid"] },
    "nullable": [false, false, false, false, false, true, false, false, false, false, true, true, true, true, true, false]
  },
  "hash": "c624261597b9356ff3e7c3e392a84bb0b551e91c503e8b21c29814f1eb660a8e"
}

@@ -0,0 +1,12 @@
{
  "db_name": "PostgreSQL",
  "query": "DELETE FROM cyclotron_jobs WHERE state = 'completed'",
  "describe": {
    "columns": [],
    "parameters": { "Left": [] },
    "nullable": []
  },
  "hash": "f4e808f58dd290c6e2b49b63e9e0eb022936ba318021512a0cc0c2e0766abe7c"
}

@@ -0,0 +1,12 @@
{
  "db_name": "PostgreSQL",
  "query": "\nDELETE FROM cyclotron_jobs WHERE state = 'running' AND COALESCE(last_heartbeat, $1) <= $1 AND janitor_touch_count >= $2\n    ",
  "describe": {
    "columns": [],
    "parameters": { "Left": ["Timestamptz", "Int2"] },
    "nullable": []
  },
  "hash": "fdda5a80f5495f2d4b15ce1a0963f990986c8b8433f01e449fbd1eee70ce6aeb"
}

@@ -0,0 +1,12 @@
{
  "db_name": "PostgreSQL",
  "query": "DELETE FROM cyclotron_jobs WHERE state = 'failed'",
  "describe": {
    "columns": [],
    "parameters": { "Left": [] },
    "nullable": []
  },
  "hash": "ffb66bdedf6506f95b9293ef88b0c51e2f5fb7d3271e1287165d2a35b6aaa25e"
}
rust/Cargo.lock (generated): 160 lines changed
@@ -673,6 +673,7 @@ dependencies = [
 "iana-time-zone",
 "js-sys",
 "num-traits",
+ "serde",
 "wasm-bindgen",
 "windows-targets 0.52.0",
]
@@ -700,6 +701,25 @@ dependencies = [
 "tokio-util",
]

+[[package]]
+name = "common-dns"
+version = "0.1.0"
+dependencies = [
+ "futures",
+ "reqwest 0.12.3",
+ "tokio",
+]
+
+[[package]]
+name = "common-metrics"
+version = "0.1.0"
+dependencies = [
+ "axum 0.7.5",
+ "metrics",
+ "metrics-exporter-prometheus",
+ "tokio",
+]
+
 [[package]]
 name = "concurrent-queue"
 version = "2.5.0"
@@ -819,6 +839,80 @@ dependencies = [
 "typenum",
]

+[[package]]
+name = "cyclotron-core"
+version = "0.1.0"
+dependencies = [
+ "chrono",
+ "futures",
+ "rand",
+ "serde",
+ "sqlx",
+ "thiserror",
+ "tokio",
+ "uuid",
+]
+
+[[package]]
+name = "cyclotron-fetch"
+version = "0.1.0"
+dependencies = [
+ "axum 0.7.5",
+ "chrono",
+ "common-dns",
+ "common-metrics",
+ "cyclotron-core",
+ "envconfig",
+ "futures",
+ "health",
+ "http 1.1.0",
+ "httpmock",
+ "metrics",
+ "rand",
+ "reqwest 0.12.3",
+ "serde",
+ "serde_json",
+ "sqlx",
+ "thiserror",
+ "tokio",
+ "tracing",
+ "tracing-subscriber",
+ "uuid",
+]
+
+[[package]]
+name = "cyclotron-janitor"
+version = "0.1.0"
+dependencies = [
+ "axum 0.7.5",
+ "chrono",
+ "common-metrics",
+ "cyclotron-core",
+ "envconfig",
+ "eyre",
+ "health",
+ "metrics",
+ "sqlx",
+ "tokio",
+ "tracing",
+ "tracing-subscriber",
+ "uuid",
+]
+
+[[package]]
+name = "cyclotron-node"
+version = "0.1.0"
+dependencies = [
+ "chrono",
+ "cyclotron-core",
+ "neon",
+ "once_cell",
+ "serde",
+ "serde_json",
+ "tokio",
+ "uuid",
+]
+
 [[package]]
 name = "dashmap"
 version = "5.5.3"
@@ -1468,6 +1562,7 @@ name = "hook-api"
 version = "0.1.0"
 dependencies = [
  "axum 0.7.5",
+ "common-metrics",
 "envconfig",
 "eyre",
 "hook-common",
@@ -1489,13 +1584,10 @@ name = "hook-common"
 version = "0.1.0"
 dependencies = [
  "async-trait",
  "axum 0.7.5",
  "chrono",
  "envconfig",
  "health",
  "http 1.1.0",
  "metrics",
  "metrics-exporter-prometheus",
  "rdkafka",
  "reqwest 0.12.3",
  "serde",
@@ -1514,6 +1606,7 @@ version = "0.1.0"
 dependencies = [
  "async-trait",
  "axum 0.7.5",
+ "common-metrics",
 "envconfig",
 "eyre",
 "futures",
@@ -1537,6 +1630,8 @@ version = "0.1.0"
 dependencies = [
  "axum 0.7.5",
  "chrono",
+ "common-dns",
+ "common-metrics",
 "envconfig",
 "futures",
 "health",
@@ -1944,6 +2039,16 @@ version = "0.2.153"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd"

+[[package]]
+name = "libloading"
+version = "0.8.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4979f22fdb869068da03c9f7528f8297c6fd2606bc3a4affe42e6a823fdb8da4"
+dependencies = [
+ "cfg-if",
+ "windows-targets 0.52.0",
+]
+
 [[package]]
 name = "libm"
 version = "0.2.8"
@@ -2160,6 +2265,32 @@ dependencies = [
 "tempfile",
]

+[[package]]
+name = "neon"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7d75440242411c87dc39847b0e33e961ec1f10326a9d8ecf9c1ea64a3b3c13dc"
+dependencies = [
+ "getrandom",
+ "libloading",
+ "neon-macros",
+ "once_cell",
+ "semver",
+ "send_wrapper",
+ "smallvec",
+]
+
+[[package]]
+name = "neon-macros"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c6813fde79b646e47e7ad75f480aa80ef76a5d9599e2717407961531169ee38b"
+dependencies = [
+ "quote",
+ "syn 2.0.48",
+ "syn-mid",
+]
+
 [[package]]
 name = "new_debug_unreachable"
 version = "1.0.6"
@@ -3181,6 +3312,18 @@ dependencies = [
 "libc",
]

+[[package]]
+name = "semver"
+version = "1.0.23"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "61697e0a1c7e512e84a621326239844a24d8207b4669b41bc18b32ea5cbf988b"
+
+[[package]]
+name = "send_wrapper"
+version = "0.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cd0b0ec5f1c1ca621c432a25813d8d60c88abe6d3e08a3eb9cf37d97a0fe3d73"
+
 [[package]]
 name = "serde"
 version = "1.0.196"
@@ -3660,6 +3803,17 @@ dependencies = [
 "unicode-ident",
]

+[[package]]
+name = "syn-mid"
+version = "0.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b5dc35bb08dd1ca3dfb09dce91fd2d13294d6711c88897d9a9d60acf39bce049"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.48",
+]
+
 [[package]]
 name = "sync_wrapper"
 version = "0.1.2"
@@ -4,11 +4,17 @@ resolver = "2"
 members = [
     "capture",
     "common/health",
+    "common/metrics",
+    "common/dns",
     "feature-flags",
     "hook-api",
     "hook-common",
     "hook-janitor",
     "hook-worker",
+    "cyclotron-core",
+    "cyclotron-node",
+    "cyclotron-janitor",
+    "cyclotron-fetch",
 ]

 [workspace.lints.rust]
@@ -34,7 +40,7 @@ axum = { version = "0.7.5", features = ["http2", "macros", "matched-path"] }
 axum-client-ip = "0.6.0"
 base64 = "0.22.0"
 bytes = "1"
-chrono = { version = "0.4" }
+chrono = { version = "0.4", features = ["default", "serde"] }
 envconfig = "0.10.0"
 eyre = "0.6.9"
 flate2 = "1.0"
@@ -80,3 +86,4 @@ tracing-opentelemetry = "0.23.0"
 tracing-subscriber = { version = "0.3.18", features = ["env-filter"] }
 url = { version = "2.5.0 " }
 uuid = { version = "1.6.1", features = ["v7", "serde"] }
+neon = "1"
@@ -1,4 +1,4 @@
-FROM docker.io/lukemathwalker/cargo-chef:latest-rust-1.77-bookworm AS chef
+FROM docker.io/lukemathwalker/cargo-chef:latest-rust-1.80.1-bookworm AS chef
 ARG BIN
 WORKDIR /app

rust/common/dns/Cargo.toml (new file): 12 lines
@@ -0,0 +1,12 @@
[package]
name = "common-dns"
version = "0.1.0"
edition = "2021"

[lints]
workspace = true

[dependencies]
futures = { workspace = true }
reqwest = { workspace = true }
tokio = { workspace = true }
@@ -86,7 +86,7 @@ impl Resolve for PublicIPv4Resolver {

 #[cfg(test)]
 mod tests {
-    use crate::dns::{NoPublicIPv4Error, PublicIPv4Resolver};
+    use crate::{NoPublicIPv4Error, PublicIPv4Resolver};
     use reqwest::dns::{Name, Resolve};
     use std::str::FromStr;

@@ -4,7 +4,7 @@ use std::sync::{Arc, RwLock};

 use axum::http::StatusCode;
 use axum::response::{IntoResponse, Response};
-use time::Duration;
+use std::time::Duration;
 use tokio::sync::mpsc;
 use tracing::{info, warn};

@@ -143,7 +143,16 @@ impl HealthRegistry {

     /// Registers a new component in the registry. The returned handle should be passed
     /// to the component, to allow it to frequently report its health status.
-    pub async fn register(&self, component: String, deadline: Duration) -> HealthHandle {
+    pub async fn register<D>(&self, component: String, deadline: D) -> HealthHandle
+    where
+        // HACK: to let callers use time::Duration or std::time::Duration (and therefore chrono::Duration),
+        // since apparently we use all three
+        D: TryInto<Duration>,
+    {
+        let Ok(deadline) = deadline.try_into() else {
+            // TODO - I should return an error here, but I don't want to refactor everything that uses this right now
+            panic!("invalid deadline")
+        };
         let handle = HealthHandle {
             component,
             deadline,
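A hypothetical pair of call sites showing what the relaxed `TryInto<Duration>` bound accepts; the component names here are illustrative, not from this commit:

```rust
use std::time::Duration as StdDuration;
use time::Duration as TimeDuration;

// Sketch only: assumes a HealthRegistry from the `health` crate as modified above.
async fn register_components(registry: &health::HealthRegistry) {
    // std::time::Duration satisfies TryInto<std::time::Duration> through the
    // blanket reflexive conversion, so this call is infallible in practice.
    let _fetch = registry
        .register("cyclotron-fetch".to_string(), StdDuration::from_secs(30))
        .await;

    // time::Duration converts fallibly (negative durations fail), which is the
    // case the `panic!("invalid deadline")` above guards against.
    let _janitor = registry
        .register("cyclotron-janitor".to_string(), TimeDuration::seconds(30))
        .await;
}
```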
rust/common/metrics/Cargo.toml (new file): 13 lines
@@ -0,0 +1,13 @@
[package]
name = "common-metrics"
version = "0.1.0"
edition = "2021"

[lints]
workspace = true

[dependencies]
axum = { workspace = true }
metrics-exporter-prometheus = { workspace = true }
tokio = { workspace = true }
metrics = { workspace = true }
rust/common/metrics/README.md (new file): 1 line
@@ -0,0 +1 @@
Ripped from rusty-hook, since it'll be used across more or less all cyclotron stuff, as well as rustyhook
New sqlx offline query cache files for a second crate. One query is new here:

@@ -0,0 +1,18 @@
{
  "db_name": "PostgreSQL",
  "query": "SELECT COUNT(*) FROM cyclotron_jobs WHERE state = 'available' AND scheduled <= NOW()",
  "describe": {
    "columns": [{ "ordinal": 0, "name": "count", "type_info": "Int8" }],
    "parameters": { "Left": [] },
    "nullable": [null]
  },
  "hash": "213e9d70e145a01fb42d5c3a80f9126073113a4af03c4c9fd3a81004d898f883"
}

The remaining query files in this group are byte-for-byte copies of the query files listed earlier in this commit, with the same hashes: 075421be, 16d533b5, 2b62adf4, 2f6de097, 350983ef, 54d9afe6, 7217e766, 884da976, 8ab11a89, 98da1f12, aa595eaf, b160b785, b3239c1d, c6242615, f4e808f5, fdda5a80, ffb66bde.
rust/cyclotron-core/Cargo.toml (new file): 17 lines
@@ -0,0 +1,17 @@
[package]
name = "cyclotron-core"
version = "0.1.0"
edition = "2021"

[lints]
workspace = true

[dependencies]
serde = { workspace = true }
sqlx = { workspace = true }
chrono = { workspace = true }
tokio = { workspace = true }
thiserror = { workspace = true }
uuid = { workspace = true }
rand = { workspace = true }
futures = { workspace = true }
@ -0,0 +1,102 @@
|
||||
CREATE TYPE JobState AS ENUM(
|
||||
'available',
|
||||
'completed',
|
||||
'failed',
|
||||
'running',
|
||||
'paused'
|
||||
);
|
||||
|
||||
|
||||
---------------------------------------------------------------------
|
||||
-- Job table
|
||||
---------------------------------------------------------------------
|
||||
-- When a job is dequeued, it is locked by generating a UUID and returning it to the dequeuing
|
||||
-- worker. Any worker that can't provide the correct lock_id when updating will have their updates
|
||||
-- rejected. The reason this is important is because if, e.g., a worker holds a job in a running
|
||||
-- state without updating the heartbeat, the janitor will return the job to the queue eventually,
|
||||
-- and if the worker /then/ tries to update the job after another worker has picked it up, that's a
|
||||
-- race. We track transition count and times alongside lock_id's and heartbeats for reporting and
|
||||
-- debugging purposes, and we track the number of times the janitor has touched a job to spot poison
|
||||
-- pills.
|
||||
CREATE TABLE IF NOT EXISTS cyclotron_jobs (
|
||||
---------------------------------------------------------------------
|
||||
-- Job metadata
|
||||
---------------------------------------------------------------------
|
||||
id UUID PRIMARY KEY,
|
||||
    team_id INT NOT NULL,
    function_id UUID,
    created TIMESTAMPTZ NOT NULL,
    ---------------------------------------------------------------------
    -- Queue bookkeeping - invisible to the worker
    ---------------------------------------------------------------------
    lock_id UUID,
    -- This is set when a job is in a running state, and is required to update the job.
    last_heartbeat TIMESTAMPTZ,
    -- This is updated by the worker to indicate that the job is making forward progress even
    -- without transitions (and should not be reaped)
    janitor_touch_count SMALLINT NOT NULL,
    transition_count SMALLINT NOT NULL,
    last_transition TIMESTAMPTZ NOT NULL,
    ---------------------------------------------------------------------
    -- Queue components - determines which workers will consume this job
    ---------------------------------------------------------------------
    queue_name TEXT NOT NULL,
    ---------------------------------------------------------------------
    -- Job availability and priority (can this job be dequeued, and in what order?)
    ---------------------------------------------------------------------
    state JobState NOT NULL,
    scheduled TIMESTAMPTZ NOT NULL,
    priority SMALLINT NOT NULL,
    ---------------------------------------------------------------------
    -- Job data
    ---------------------------------------------------------------------
    vm_state TEXT,
    -- This is meant for workers "talking to themselves", e.g. tracking retries or something
    metadata TEXT,
    -- This is meant for "the next guy" - hog might fill it with a URL to fetch, for example
    parameters TEXT
);

-- For a given worker, the set of "available" jobs depends on state, queue_name, and scheduled (so
-- we can exclude sleeping jobs). This index is partial, because we don't care about other states
-- for the purpose of dequeuing.
CREATE INDEX idx_cyclotron_jobs_dequeue ON cyclotron_jobs (queue_name, state, scheduled, priority)
WHERE
    state = 'available';

-- We create simple indexes on team_id, function_id and queue_name to support fast joins to future
-- control tables.
CREATE INDEX idx_queue_team_id ON cyclotron_jobs(team_id);

CREATE INDEX idx_queue_function_id ON cyclotron_jobs(function_id);

CREATE INDEX idx_queue_queue_name ON cyclotron_jobs(queue_name);


---------------------------------------------------------------------
-- Control tables
---------------------------------------------------------------------


-- These are just a starting point, supporting overriding the state for a given team, function or queue.
-- For now these are entirely unused.
CREATE TABLE IF NOT EXISTS cyclotron_team_control (
    team_id INT PRIMARY KEY,
    state_override JobState,
    -- If this is not null, it overrides the state of all jobs for this team (allowing for e.g. pausing or force-failing all of a team's jobs)
    state_override_expires TIMESTAMPTZ -- State override can be temporary or permanent
);

CREATE TABLE IF NOT EXISTS cyclotron_function_control (
    function_id UUID PRIMARY KEY,
    state_override JobState,
    -- If this is not null, it overrides the state of all jobs for this function (allowing for e.g. pausing or force-failing all of a function's jobs)
    state_override_expires TIMESTAMPTZ -- State override can be temporary or permanent
);

CREATE TABLE IF NOT EXISTS cyclotron_queue_control (
    queue_name TEXT PRIMARY KEY,
    state_override JobState,
    -- If this is not null, it overrides the state of all jobs for this queue (allowing for e.g. pausing or force-failing all of a queue's jobs)
    state_override_expires TIMESTAMPTZ -- State override can be temporary or permanent
);
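
An illustrative sketch of how these control tables are meant to be used once wired up - nothing below is part of the migration, and the team id is a made-up example value:

-- Hedged sketch only: temporarily pause every job belonging to team 42,
-- assuming 'paused' exists on the JobState enum created earlier in this migration.
-- Clearing the override early is just a DELETE on the same row.
INSERT INTO cyclotron_team_control (team_id, state_override, state_override_expires)
VALUES (42, 'paused', NOW() + INTERVAL '1 hour')
ON CONFLICT (team_id) DO UPDATE
SET state_override = EXCLUDED.state_override,
    state_override_expires = EXCLUDED.state_override_expires;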
697
rust/cyclotron-core/src/base_ops.rs
Normal file
@ -0,0 +1,697 @@
//! # PgQueue
//!
//! A job queue implementation backed by a PostgreSQL table.

use std::str::FromStr;

use chrono::{self, DateTime, Utc};
use serde::{self, Deserialize, Serialize};
use sqlx::{
    postgres::{PgArguments, PgHasArrayType, PgQueryResult, PgTypeInfo},
    query::Query,
};
use uuid::Uuid;

use crate::error::QueueError;

#[derive(Debug, Deserialize, Serialize, sqlx::Type)]
#[serde(rename_all = "lowercase")]
#[sqlx(type_name = "JobState", rename_all = "lowercase")]
pub enum JobState {
    Available,
    Running,
    Completed,
    Failed,
    Paused,
}

impl FromStr for JobState {
    type Err = ();

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s {
            "available" => Ok(JobState::Available),
            "running" => Ok(JobState::Running),
            "completed" => Ok(JobState::Completed),
            "failed" => Ok(JobState::Failed),
            "paused" => Ok(JobState::Paused),
            _ => Err(()),
        }
    }
}

impl PgHasArrayType for JobState {
    fn array_type_info() -> sqlx::postgres::PgTypeInfo {
        // Postgres default naming convention for array types is "_typename"
        PgTypeInfo::with_name("_JobState")
    }
}

// The chunk of data needed to enqueue a job
#[derive(Debug, Deserialize, Serialize, Clone, Eq, PartialEq)]
pub struct JobInit {
    pub team_id: i32,
    pub queue_name: String,
    pub priority: i16,
    pub scheduled: DateTime<Utc>,
    pub function_id: Option<Uuid>,
    pub vm_state: Option<String>,
    pub parameters: Option<String>,
    pub metadata: Option<String>,
}

// TODO - there are certain things we might want to be on a per-team basis here... the ability to say
// "do not process any jobs for this team" independent of doing an operation on the job table seems powerful,
// but that requires a distinct team table. For now, I'm just making a note that it's something we might
// want (the command to modify the treatment of all jobs associated with a team should only need to be issued and
// processed /once/, not once per job, and should apply to all jobs both currently queued and any future ones). This
// can be added in a progressive way (by adding joins and clauses to the dequeue query), so we don't need to worry about
// it too much up front.
#[derive(Debug, Deserialize, Serialize)]
pub struct Job {
    // Job metadata
    pub id: Uuid,
    pub team_id: i32,
    pub function_id: Option<Uuid>, // Some jobs might not come from hog, and it doesn't /kill/ us to support that
    pub created: DateTime<Utc>,

    // Queue bookkeeping
    // This will be set for any worker that ever has a job in the "running" state (so any worker that dequeues a job),
    // but I don't want to do the work to encode that in the type system right now - later it should be
    pub lock_id: Option<Uuid>,
    pub last_heartbeat: Option<DateTime<Utc>>,
    pub janitor_touch_count: i16,
    pub transition_count: i16,
    pub last_transition: DateTime<Utc>,

    // Virtual queue components
    pub queue_name: String, // We can have multiple "virtual queues" workers pull from

    // Job availability
    pub state: JobState,
    pub priority: i16, // For sorting "available" jobs. Lower is higher priority
    pub scheduled: DateTime<Utc>,

    // Job data
    pub vm_state: Option<String>, // The state of the VM this job is running on (if it exists)
    pub metadata: Option<String>, // Additional fields a worker can tack onto a job, e.g. for tracking state across retries (or the number of retries in general by a given class of worker)
    pub parameters: Option<String>, // The actual parameters of the job (function args for a hog function, http request for a fetch function)
}

pub async fn create_job<'c, E>(executor: E, data: JobInit) -> Result<(), QueueError>
where
    E: sqlx::Executor<'c, Database = sqlx::Postgres>,
{
    let id = Uuid::now_v7();
    sqlx::query!(
        r#"
INSERT INTO cyclotron_jobs
    (
        id,
        team_id,
        function_id,
        created,
        lock_id,
        last_heartbeat,
        janitor_touch_count,
        transition_count,
        last_transition,
        queue_name,
        state,
        scheduled,
        priority,
        vm_state,
        metadata,
        parameters
    )
VALUES
    ($1, $2, $3, NOW(), NULL, NULL, 0, 0, NOW(), $4, $5, $6, $7, $8, $9, $10)
        "#,
        id,
        data.team_id,
        data.function_id,
        data.queue_name,
        JobState::Available as _,
        data.scheduled,
        data.priority,
        data.vm_state,
        data.metadata,
        data.parameters
    )
    .execute(executor)
    .await?;

    Ok(())
}

pub async fn bulk_create_jobs<'c, E>(executor: E, jobs: &[JobInit]) -> Result<(), QueueError>
where
    E: sqlx::Executor<'c, Database = sqlx::Postgres>,
{
    let now = Utc::now();
    // Flatten these jobs into a series of vecs of arguments PG can unnest
    let mut ids = Vec::with_capacity(jobs.len());
    let mut team_ids = Vec::with_capacity(jobs.len());
    let mut function_ids = Vec::with_capacity(jobs.len());
    let mut created_at = Vec::with_capacity(jobs.len());
    let mut lock_ids = Vec::with_capacity(jobs.len());
    let mut last_heartbeats = Vec::with_capacity(jobs.len());
    let mut janitor_touch_counts = Vec::with_capacity(jobs.len());
    let mut transition_counts = Vec::with_capacity(jobs.len());
    let mut last_transitions = Vec::with_capacity(jobs.len());
    let mut queue_names = Vec::with_capacity(jobs.len());
    let mut states = Vec::with_capacity(jobs.len());
    let mut scheduleds = Vec::with_capacity(jobs.len());
    let mut priorities = Vec::with_capacity(jobs.len());
    let mut vm_states = Vec::with_capacity(jobs.len());
    let mut metadatas = Vec::with_capacity(jobs.len());
    let mut parameters = Vec::with_capacity(jobs.len());

    for d in jobs {
        ids.push(Uuid::now_v7());
        team_ids.push(d.team_id);
        function_ids.push(d.function_id);
        created_at.push(now);
        lock_ids.push(None::<Uuid>);
        last_heartbeats.push(None::<DateTime<Utc>>);
        janitor_touch_counts.push(0);
        transition_counts.push(0);
        last_transitions.push(now);
        queue_names.push(d.queue_name.clone());
        states.push(JobState::Available);
        scheduleds.push(d.scheduled);
        priorities.push(d.priority);
        vm_states.push(d.vm_state.clone());
        metadatas.push(d.metadata.clone());
        parameters.push(d.parameters.clone());
    }

    // Using the "unnest" function to turn an array of rows into a set of rows
    sqlx::query(
        r#"
INSERT INTO cyclotron_jobs
    (
        id,
        team_id,
        function_id,
        created,
        lock_id,
        last_heartbeat,
        janitor_touch_count,
        transition_count,
        last_transition,
        queue_name,
        state,
        scheduled,
        priority,
        vm_state,
        metadata,
        parameters
    )
SELECT *
FROM UNNEST(
        $1,
        $2,
        $3,
        $4,
        $5,
        $6,
        $7,
        $8,
        $9,
        $10,
        $11,
        $12,
        $13,
        $14,
        $15,
        $16
    )
        "#,
    )
    .bind(ids)
    .bind(team_ids)
    .bind(function_ids)
    .bind(created_at)
    .bind(lock_ids)
    .bind(last_heartbeats)
    .bind(janitor_touch_counts)
    .bind(transition_counts)
    .bind(last_transitions)
    .bind(queue_names)
    .bind(states)
    .bind(scheduleds)
    .bind(priorities)
    .bind(vm_states)
    .bind(metadatas)
    .bind(parameters)
    .execute(executor)
    .await?;

    Ok(())
}

// Dequeue the next job batch from the queue, skipping VM state since it can be large
pub async fn dequeue_jobs<'c, E>(
    executor: E,
    queue: &str,
    max: usize,
) -> Result<Vec<Job>, QueueError>
where
    E: sqlx::Executor<'c, Database = sqlx::Postgres>,
{
    // TODO - right now, locks are completely transient. We could instead have the lock_id act like a
    // "worker_id", and be provided by the caller, which would let workers do less bookkeeping, and make
    // some kinds of debugging easier, but I prefer locks being opaque to workers for now, to avoid any
    // confusion or potential for accidental deadlocking (e.g. if someone persisted the worker_id across
    // process restarts).
    let lock_id = Uuid::now_v7();
    Ok(sqlx::query_as!(
        Job,
        r#"
WITH available AS (
    SELECT
        id,
        state
    FROM cyclotron_jobs
    WHERE
        state = 'available'::JobState
        AND queue_name = $1
        AND scheduled <= NOW()
    ORDER BY
        priority ASC,
        scheduled ASC
    LIMIT $2
    FOR UPDATE SKIP LOCKED
)
UPDATE cyclotron_jobs
SET
    state = 'running'::JobState,
    lock_id = $3,
    last_heartbeat = NOW(),
    last_transition = NOW(),
    transition_count = transition_count + 1
FROM available
WHERE
    cyclotron_jobs.id = available.id
RETURNING
    cyclotron_jobs.id,
    team_id,
    available.state as "state: JobState",
    queue_name,
    priority,
    function_id,
    created,
    last_transition,
    scheduled,
    transition_count,
    NULL as vm_state,
    metadata,
    parameters,
    lock_id,
    last_heartbeat,
    janitor_touch_count
        "#,
        queue,
        max as i64,
        lock_id
    )
    .fetch_all(executor)
    .await?)
}

// Dequeue a batch of jobs, also returning their VM state. This is an optimisation - you could
// dequeue a batch of jobs and then fetch their VM state in a separate query, but this is hopefully less
// heavy on the DB, if a given worker knows it needs VM state for all dequeued jobs.
pub async fn dequeue_with_vm_state<'c, E>(
    executor: E,
    queue: &str,
    max: usize,
) -> Result<Vec<Job>, QueueError>
where
    E: sqlx::Executor<'c, Database = sqlx::Postgres>,
{
    let lock_id = Uuid::now_v7();
    Ok(sqlx::query_as!(
        Job,
        r#"
WITH available AS (
    SELECT
        id,
        state
    FROM cyclotron_jobs
    WHERE
        state = 'available'::JobState
        AND queue_name = $1
        AND scheduled <= NOW()
    ORDER BY
        priority ASC,
        scheduled ASC
    LIMIT $2
    FOR UPDATE SKIP LOCKED
)
UPDATE cyclotron_jobs
SET
    state = 'running'::JobState,
    lock_id = $3,
    last_heartbeat = NOW(),
    last_transition = NOW(),
    transition_count = transition_count + 1
FROM available
WHERE
    cyclotron_jobs.id = available.id
RETURNING
    cyclotron_jobs.id,
    team_id,
    available.state as "state: JobState",
    queue_name,
    priority,
    function_id,
    created,
    last_transition,
    scheduled,
    transition_count,
    vm_state,
    metadata,
    parameters,
    lock_id,
    last_heartbeat,
    janitor_touch_count
        "#,
        queue,
        max as i64,
        lock_id
    )
    .fetch_all(executor)
    .await?)
}

// Grab a job's VM state - for workers that might sometimes need a job's vm state, but not always,
// this lets them use dequeue_jobs, and then fetch the states they need. VM state can only be retrieved
// by workers holding a job lock.
pub async fn get_vm_state<'c, E>(
    executor: E,
    job_id: Uuid,
    lock_id: Uuid,
) -> Result<Option<String>, QueueError>
where
    E: sqlx::Executor<'c, Database = sqlx::Postgres>,
{
    struct VMState {
        vm_state: Option<String>,
    }

    // We use fetch_one here because giving us an unknown ID is an error
    let res = sqlx::query_as!(
        VMState,
        "SELECT vm_state FROM cyclotron_jobs WHERE id = $1 AND lock_id = $2",
        job_id,
        lock_id
    )
    .fetch_one(executor)
    .await?;

    Ok(res.vm_state)
}

// A struct representing a set of updates for a job. Outer None values mean "don't update this field",
// with nested None values meaning "set this field to null" for nullable fields.
#[derive(Debug, Deserialize, Serialize)]
pub struct JobUpdate {
    pub lock_id: Uuid, // The ID of the lock acquired when this worker dequeued the job, required for any update to be valid
    pub state: Option<JobState>,
    pub queue_name: Option<String>,
    pub priority: Option<i16>,
    pub scheduled: Option<DateTime<Utc>>,
    pub vm_state: Option<Option<String>>,
    pub metadata: Option<Option<String>>,
    pub parameters: Option<Option<String>>,
}

impl JobUpdate {
    pub fn new(lock_id: Uuid) -> Self {
        Self {
            lock_id,
            state: None,
            queue_name: None,
            priority: None,
            scheduled: None,
            vm_state: None,
            metadata: None,
            parameters: None,
        }
    }
}
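
// Illustrative sketch only (not part of this file): how the two Option layers of
// JobUpdate compose. The outer Option means "should this column be written at all";
// the inner Option is the nullable column value itself.
fn example_updates(lock_id: Uuid) -> (JobUpdate, JobUpdate, JobUpdate) {
    let untouched = JobUpdate::new(lock_id); // metadata: None - column left alone
    let mut cleared = JobUpdate::new(lock_id);
    cleared.metadata = Some(None); // column set to NULL
    let mut written = JobUpdate::new(lock_id);
    written.metadata = Some(Some("retries=2".to_string())); // column set to a value
    (untouched, cleared, written)
}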

// TODO - I should think about a bulk-flush interface at /some/ point, although we expect jobs to be
// high variance with respect to work time, so maybe that wouldn't be that useful in the end.
// TODO - this isn't the cheapest way to update a row in a table... I could probably do better by instead
// using a query builder, but I wanted sqlx's nice macro handling, at least while iterating on the schema.
// If/when we start hitting perf issues, this is a good place to start.
// NOTE - this function permits multiple flushes to the same job, without losing the lock on it, but
// high-level implementations are recommended to avoid this - ideally, for every de/requeue, there should be
// exactly 2 database operations.
pub async fn flush_job<'c, C>(
    connection: &mut C,
    job_id: Uuid,
    updates: JobUpdate,
) -> Result<(), QueueError>
where
    C: sqlx::Connection<Database = sqlx::Postgres>,
{
    let mut txn = connection.begin().await?;

    // Flushing any job state except "running" is a signal that the worker no longer holds this job
    let job_returned = !matches!(updates.state, Some(JobState::Running));
    let lock_id = updates.lock_id;

    if let Some(state) = updates.state {
        set_state(&mut *txn, job_id, updates.lock_id, state).await?;
    }

    if let Some(queue_name) = updates.queue_name {
        set_queue(&mut *txn, job_id, &queue_name, lock_id).await?;
    }

    if let Some(priority) = updates.priority {
        set_priority(&mut *txn, job_id, lock_id, priority).await?;
    }

    if let Some(scheduled) = updates.scheduled {
        set_scheduled(&mut *txn, job_id, scheduled, lock_id).await?;
    }

    if let Some(vm_state) = updates.vm_state {
        set_vm_state(&mut *txn, job_id, vm_state, lock_id).await?;
    }

    if let Some(metadata) = updates.metadata {
        set_metadata(&mut *txn, job_id, metadata, lock_id).await?;
    }

    if let Some(parameters) = updates.parameters {
        set_parameters(&mut *txn, job_id, parameters, lock_id).await?;
    }

    // Calling flush indicates forward progress, so we should touch the heartbeat
    set_heartbeat(&mut *txn, job_id, lock_id).await?;

    // We do this here, instead of in the set_state call, because otherwise the lock_id passed to other
    // updates would be invalid
    if job_returned {
        let query = sqlx::query!(
            "UPDATE cyclotron_jobs SET lock_id = NULL, last_heartbeat = NULL WHERE id = $1 AND lock_id = $2",
            job_id,
            lock_id
        );
        assert_does_update(&mut *txn, job_id, lock_id, query).await?;
    }

    txn.commit().await?;

    Ok(())
}

// Simple wrapper that just executes a query and throws an error if no rows were affected
async fn assert_does_update<'c, E>(
    executor: E,
    job_id: Uuid,
    lock_id: Uuid,
    query: Query<'_, sqlx::Postgres, PgArguments>,
) -> Result<(), QueueError>
where
    E: sqlx::Executor<'c, Database = sqlx::Postgres>,
{
    let res = query.execute(executor).await?;
    throw_if_no_rows(res, job_id, lock_id)
}

// Most of the rest of these functions are designed to be used as part of larger transactions, e.g.
// "completing" a job means updating various rows and then marking it complete, and we can build that
// by composing a set of individual queries together using a transaction.
// Update the state of a job, also tracking the transition count and last transition time
pub async fn set_state<'c, E>(
    executor: E,
    job_id: Uuid,
    lock_id: Uuid,
    state: JobState,
) -> Result<(), QueueError>
where
    E: sqlx::Executor<'c, Database = sqlx::Postgres>,
{
    let q = sqlx::query!(
        r#"UPDATE cyclotron_jobs
            SET state = $1, last_transition = NOW(), transition_count = transition_count + 1
            WHERE id = $2 AND lock_id = $3"#,
        state as _,
        job_id,
        lock_id
    );

    assert_does_update(executor, job_id, lock_id, q).await
}

pub async fn set_queue<'c, E>(
    executor: E,
    job_id: Uuid,
    queue: &str,
    lock_id: Uuid,
) -> Result<(), QueueError>
where
    E: sqlx::Executor<'c, Database = sqlx::Postgres>,
{
    let q = sqlx::query!(
        "UPDATE cyclotron_jobs SET queue_name = $1 WHERE id = $2 AND lock_id = $3",
        queue,
        job_id,
        lock_id
    );
    assert_does_update(executor, job_id, lock_id, q).await
}

pub async fn set_priority<'c, E>(
    executor: E,
    job_id: Uuid,
    lock_id: Uuid,
    priority: i16,
) -> Result<(), QueueError>
where
    E: sqlx::Executor<'c, Database = sqlx::Postgres>,
{
    let q = sqlx::query!(
        "UPDATE cyclotron_jobs SET priority = $1 WHERE id = $2 AND lock_id = $3",
        priority,
        job_id,
        lock_id
    );
    assert_does_update(executor, job_id, lock_id, q).await
}

pub async fn set_scheduled<'c, E>(
    executor: E,
    job_id: Uuid,
    scheduled: DateTime<Utc>,
    lock_id: Uuid,
) -> Result<(), QueueError>
where
    E: sqlx::Executor<'c, Database = sqlx::Postgres>,
{
    let q = sqlx::query!(
        "UPDATE cyclotron_jobs SET scheduled = $1 WHERE id = $2 AND lock_id = $3",
        scheduled,
        job_id,
        lock_id
    );
    assert_does_update(executor, job_id, lock_id, q).await
}

pub async fn set_vm_state<'c, E>(
    executor: E,
    job_id: Uuid,
    vm_state: Option<String>,
    lock_id: Uuid,
) -> Result<(), QueueError>
where
    E: sqlx::Executor<'c, Database = sqlx::Postgres>,
{
    let q = sqlx::query!(
        "UPDATE cyclotron_jobs SET vm_state = $1 WHERE id = $2 AND lock_id = $3",
        vm_state,
        job_id,
        lock_id
    );
    assert_does_update(executor, job_id, lock_id, q).await
}

pub async fn set_metadata<'c, E>(
    executor: E,
    job_id: Uuid,
    metadata: Option<String>,
    lock_id: Uuid,
) -> Result<(), QueueError>
where
    E: sqlx::Executor<'c, Database = sqlx::Postgres>,
{
    let q = sqlx::query!(
        "UPDATE cyclotron_jobs SET metadata = $1 WHERE id = $2 AND lock_id = $3",
        metadata,
        job_id,
        lock_id
    );
    assert_does_update(executor, job_id, lock_id, q).await
}

pub async fn set_parameters<'c, E>(
    executor: E,
    job_id: Uuid,
    parameters: Option<String>,
    lock_id: Uuid,
) -> Result<(), QueueError>
where
    E: sqlx::Executor<'c, Database = sqlx::Postgres>,
{
    let q = sqlx::query!(
        "UPDATE cyclotron_jobs SET parameters = $1 WHERE id = $2 AND lock_id = $3",
        parameters,
        job_id,
        lock_id
    );
    assert_does_update(executor, job_id, lock_id, q).await
}

pub async fn set_heartbeat<'c, E>(
    executor: E,
    job_id: Uuid,
    lock_id: Uuid,
) -> Result<(), QueueError>
where
    E: sqlx::Executor<'c, Database = sqlx::Postgres>,
{
    let q = sqlx::query!(
        "UPDATE cyclotron_jobs SET last_heartbeat = NOW() WHERE id = $1 AND lock_id = $2",
        job_id,
        lock_id
    );
    assert_does_update(executor, job_id, lock_id, q).await
}

pub async fn count_total_waiting_jobs<'c, E>(executor: E) -> Result<u64, QueueError>
where
    E: sqlx::Executor<'c, Database = sqlx::Postgres>,
{
    let res = sqlx::query!(
        "SELECT COUNT(*) FROM cyclotron_jobs WHERE state = 'available' AND scheduled <= NOW()",
    )
    .fetch_one(executor)
    .await?;

    let res = res.count.unwrap_or(0);
    Ok(res as u64)
}

fn throw_if_no_rows(res: PgQueryResult, job: Uuid, lock: Uuid) -> Result<(), QueueError> {
    if res.rows_affected() == 0 {
        Err(QueueError::InvalidLock(lock, job))
    } else {
        Ok(())
    }
}
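
Taken together, these primitives form the whole job lifecycle. A minimal, hedged sketch of the intended call pattern follows - it is not part of the commit, and assumes the items above are in scope, a connected pool, and a made-up queue name:

// Hedged sketch: create a job, dequeue it, and flush a terminal state,
// using the base_ops primitives directly.
async fn lifecycle_sketch(pool: sqlx::PgPool) -> Result<(), QueueError> {
    let init = JobInit {
        team_id: 1,
        queue_name: "example".to_string(), // made-up queue name
        priority: 0,
        scheduled: Utc::now(),
        function_id: None,
        vm_state: None,
        parameters: None,
        metadata: None,
    };
    create_job(&pool, init).await?;

    for job in dequeue_jobs(&pool, "example", 1).await? {
        let lock_id = job.lock_id.expect("dequeued jobs always carry a lock");
        let mut update = JobUpdate::new(lock_id);
        update.state = Some(JobState::Completed); // must leave the running state
        let mut conn = pool.acquire().await?;
        flush_job(conn.as_mut(), job.id, update).await?;
    }
    Ok(())
}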
56
rust/cyclotron-core/src/bin/create_test_data.rs
Normal file
@ -0,0 +1,56 @@
use chrono::{Duration, Utc};
use cyclotron_core::{
    base_ops::JobInit,
    manager::{ManagerConfig, QueueManager},
    PoolConfig,
};
use uuid::Uuid;

// Just inserts jobs as fast as it can, choosing randomly between hog and fetch workers, and between
// different priorities. Prints every 100 jobs inserted.
#[tokio::main]
async fn main() {
    let pool_config = PoolConfig {
        db_url: "postgresql://posthog:posthog@localhost:5432/cyclotron".to_string(),
        max_connections: None,
        min_connections: None,
        acquire_timeout_seconds: None,
        max_lifetime_seconds: None,
        idle_timeout_seconds: None,
    };

    let manager_config = ManagerConfig {
        shards: vec![pool_config.clone()],
        shard_depth_limit: None,
        shard_depth_check_interval_seconds: None,
    };

    let manager = QueueManager::new(manager_config).await.unwrap();

    let now = Utc::now() - Duration::minutes(1);
    let start = Utc::now();
    let mut count = 0;
    loop {
        let queue = if rand::random() { "fetch" } else { "hog" };

        let priority = (rand::random::<u16>() % 3) as i16;

        let test_job = JobInit {
            team_id: 1,
            queue_name: queue.to_string(),
            priority,
            scheduled: now,
            function_id: Some(Uuid::now_v7()),
            vm_state: None,
            parameters: None,
            metadata: None,
        };

        manager.create_job(test_job).await.unwrap();

        count += 1;
        if count % 100 == 0 {
            println!("Elapsed: {:?}, count: {}", Utc::now() - start, count);
        }
    }
}
167
rust/cyclotron-core/src/bin/load_test.rs
Normal file
@ -0,0 +1,167 @@
use std::{
    sync::{atomic::AtomicUsize, Arc},
    time::Instant,
};

use chrono::{Duration, Utc};
use cyclotron_core::{
    base_ops::{JobInit, JobState},
    manager::{ManagerConfig, QueueManager},
    worker::Worker,
    PoolConfig,
};
use futures::future::join_all;
use uuid::Uuid;

// This spins up a manager and 2 workers, and tries to simulate semi-realistic load (on the DB - the workers do nothing except complete jobs):
// - The manager inserts jobs as fast as it can, choosing randomly between hog and fetch workers, and between different priorities.
// - The workers will process jobs as fast as they can, in batches of 1000.
// - The manager and both workers track how long each insert and dequeue takes, in ms/job.
// - The manager never inserts more than 10,000 more jobs than the workers have processed.
const INSERT_BATCH_SIZE: usize = 1000;

struct SharedContext {
    jobs_inserted: AtomicUsize,
    jobs_dequeued: AtomicUsize,
}

async fn producer_loop(manager: QueueManager, shared_context: Arc<SharedContext>) {
    let mut time_spent_inserting = Duration::zero();
    let now = Utc::now() - Duration::minutes(1);
    loop {
        let mut to_insert = Vec::with_capacity(INSERT_BATCH_SIZE);
        for _ in 0..INSERT_BATCH_SIZE {
            let queue = if rand::random() { "fetch" } else { "hog" };

            let priority = (rand::random::<u16>() % 3) as i16;

            let test_job = JobInit {
                team_id: 1,
                queue_name: queue.to_string(),
                priority,
                scheduled: now,
                function_id: Some(Uuid::now_v7()),
                vm_state: None,
                parameters: None,
                metadata: None,
            };

            to_insert.push(test_job);
        }

        let start = Instant::now();
        manager.bulk_create_jobs(to_insert).await;
        let elapsed = start.elapsed();
        time_spent_inserting += Duration::from_std(elapsed).unwrap();

        let inserted = shared_context
            .jobs_inserted
            .fetch_add(INSERT_BATCH_SIZE, std::sync::atomic::Ordering::Relaxed);

        println!("Inserted: {} in {}", inserted, time_spent_inserting);
        let mut dequeued = shared_context
            .jobs_dequeued
            .load(std::sync::atomic::Ordering::Relaxed);
        while inserted > dequeued + 10_000 {
            println!(
                "Waiting for workers to catch up, lagging by {}",
                inserted - dequeued
            );
            tokio::time::sleep(Duration::milliseconds(100).to_std().unwrap()).await;
            dequeued = shared_context
                .jobs_dequeued
                .load(std::sync::atomic::Ordering::Relaxed);
        }
    }
}

async fn worker_loop(worker: Worker, shared_context: Arc<SharedContext>, queue: &str) {
    let mut time_spent_dequeuing = Duration::zero();
    let start = Utc::now();
    loop {
        let loop_start = Instant::now();
        let jobs = worker.dequeue_jobs(queue, 1000).await.unwrap();

        if jobs.is_empty() {
            println!(
                "Worker {:?} outpacing inserts, got no jobs, sleeping!",
                queue
            );
            tokio::time::sleep(Duration::milliseconds(100).to_std().unwrap()).await;
            continue;
        }

        let mut futs = Vec::with_capacity(jobs.len());
        for job in &jobs {
            worker.set_state(job.id, JobState::Completed).unwrap();
            futs.push(worker.flush_job(job.id));
        }

        for res in join_all(futs).await {
            res.unwrap();
        }

        time_spent_dequeuing += Duration::from_std(loop_start.elapsed()).unwrap();

        let dequeued = shared_context
            .jobs_dequeued
            .fetch_add(jobs.len(), std::sync::atomic::Ordering::Relaxed);

        // To account for the batch we just handled
        let dequeued = dequeued + jobs.len();

        println!(
            "Dequeued, processed and completed {} jobs in {} for {:?}. Total time running: {}",
            dequeued,
            time_spent_dequeuing,
            queue,
            Utc::now() - start
        );

        if jobs.len() < 1000 {
            println!(
                "Worker {:?} outpacing manager, only got {} jobs, sleeping!",
                queue,
                jobs.len()
            );
            tokio::time::sleep(Duration::milliseconds(100).to_std().unwrap()).await;
        }
    }
}

#[tokio::main]
async fn main() {
    let pool_config = PoolConfig {
        db_url: "postgresql://posthog:posthog@localhost:5432/cyclotron".to_string(),
        max_connections: None,
        min_connections: None,
        acquire_timeout_seconds: None,
        max_lifetime_seconds: None,
        idle_timeout_seconds: None,
    };

    let manager_config = ManagerConfig {
        shards: vec![pool_config.clone()],
        shard_depth_limit: None,
        shard_depth_check_interval_seconds: None,
    };

    let shared_context = Arc::new(SharedContext {
        jobs_inserted: AtomicUsize::new(0),
        jobs_dequeued: AtomicUsize::new(0),
    });

    let manager = QueueManager::new(manager_config).await.unwrap();
    let worker_1 = Worker::new(pool_config.clone()).await.unwrap();
    let worker_2 = Worker::new(pool_config.clone()).await.unwrap();

    let producer = producer_loop(manager, shared_context.clone());
    let worker_1 = worker_loop(worker_1, shared_context.clone(), "fetch");
    let worker_2 = worker_loop(worker_2, shared_context.clone(), "hog");

    let producer = tokio::spawn(producer);
    let worker_1 = tokio::spawn(worker_1);
    let worker_2 = tokio::spawn(worker_2);

    tokio::try_join!(producer, worker_1, worker_2).unwrap();
}
17
rust/cyclotron-core/src/error.rs
Normal file
@ -0,0 +1,17 @@
use uuid::Uuid;

#[derive(Debug, thiserror::Error)]
pub enum QueueError {
    #[error("sqlx error: {0}")]
    SqlxError(#[from] sqlx::Error),
    #[error("Unknown job id: {0}")]
    UnknownJobId(Uuid), // Happens when someone tries to update a job that wasn't dequeued (or was already flushed) by this worker
    #[error("Job {0} flushed without a new state, which would leave it in a running state forever (or until reaped)")]
    FlushWithoutNextState(Uuid),
    #[error("Invalid lock {0} used to update job {1}. This usually means a job has been reaped from under a worker - did you forget to set the heartbeat?")]
    InvalidLock(Uuid, Uuid),
    #[error("Shard over capacity {0} for this manager, insert aborted")]
    ShardFull(u64),
    #[error("Timed out waiting for shard to have capacity")]
    TimedOutWaitingForCapacity,
}
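
A hedged sketch of how a caller might react to each variant - the guidance strings below are editorial, not from the crate:

// Illustrative only: rough triage of each error variant.
fn describe(err: &QueueError) -> &'static str {
    match err {
        QueueError::SqlxError(_) => "database problem - usually worth a retry with backoff",
        QueueError::UnknownJobId(_) => "caller bug - job was never dequeued here, or already flushed",
        QueueError::FlushWithoutNextState(_) => "caller bug - set a terminal state before flushing",
        QueueError::InvalidLock(_, _) => "lock lost - the janitor likely reaped the job; drop it",
        QueueError::ShardFull(_) => "backpressure - wait, or use the blocking create path",
        QueueError::TimedOutWaitingForCapacity => "backpressure persisted past the timeout",
    }
}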
94
rust/cyclotron-core/src/janitor_ops.rs
Normal file
@ -0,0 +1,94 @@
use chrono::{Duration, Utc};

use crate::error::QueueError;

// As a general rule, janitor operations are not queue specific (as in, they don't account for the
// queue name). We can revisit this later, if we decide we need the ability to do janitor operations
// on a per-queue basis.
pub async fn delete_completed_jobs<'c, E>(executor: E) -> Result<u64, QueueError>
where
    E: sqlx::Executor<'c, Database = sqlx::Postgres>,
{
    let result = sqlx::query!("DELETE FROM cyclotron_jobs WHERE state = 'completed'")
        .execute(executor)
        .await
        .map_err(QueueError::from)?;

    Ok(result.rows_affected())
}

pub async fn delete_failed_jobs<'c, E>(executor: E) -> Result<u64, QueueError>
where
    E: sqlx::Executor<'c, Database = sqlx::Postgres>,
{
    let result = sqlx::query!("DELETE FROM cyclotron_jobs WHERE state = 'failed'")
        .execute(executor)
        .await
        .map_err(QueueError::from)?;

    Ok(result.rows_affected())
}

// Jobs are considered stalled if their lock is held and their last_heartbeat is older than `timeout`.
// NOTE - because this runs on running jobs, it can stall workers trying to flush updates as it
// executes. I need to use some of the load generators alongside explain/analyze to optimise this (and
// the set of DB indexes).
// TODO - this /could/ return the lock_ids held, which might help with debugging (if workers reported
// the lock_ids they dequeued), but let's not do that right now.
pub async fn reset_stalled_jobs<'c, E>(executor: E, timeout: Duration) -> Result<u64, QueueError>
where
    E: sqlx::Executor<'c, Database = sqlx::Postgres>,
{
    let oldest_valid_heartbeat = Utc::now() - timeout;
    let result = sqlx::query!(
        r#"
WITH stalled AS (
    SELECT id FROM cyclotron_jobs WHERE state = 'running' AND COALESCE(last_heartbeat, $1) <= $1 FOR UPDATE SKIP LOCKED
)
UPDATE cyclotron_jobs
SET state = 'available', lock_id = NULL, last_heartbeat = NULL, janitor_touch_count = janitor_touch_count + 1
FROM stalled
WHERE cyclotron_jobs.id = stalled.id
        "#,
        oldest_valid_heartbeat
    )
    .execute(executor)
    .await
    .map_err(QueueError::from)?;

    Ok(result.rows_affected())
}

// Poison pills are jobs whose lock is held and whose heartbeat is older than `timeout`, that have
// been returned to the queue by the janitor more than `max_janitor_touched` times.
// NOTE - this has the same performance caveat as reset_stalled_jobs.
// TODO - This should, instead, move the job row to a dead letter table, for later investigation. Of course,
// rather than doing that, it could just put the job in a "dead letter" state, and no worker or janitor process
// will touch it... maybe the table move isn't needed? Either way, being able to debug jobs that cause workers
// to stall would be good (and, thinking about it, moving the row to a new table means we don't have to clear the
// lock, so we'd have a potential way to trace back to the last worker that died holding the job).
pub async fn delete_poison_pills<'c, E>(
    executor: E,
    timeout: Duration,
    max_janitor_touched: i16,
) -> Result<u64, QueueError>
where
    E: sqlx::Executor<'c, Database = sqlx::Postgres>,
{
    let oldest_valid_heartbeat = Utc::now() - timeout;
    // NOTE - we don't check the lock_id here, because it probably doesn't matter (the lock_id should be set if the
    // job state is "running"), but perhaps we should only delete jobs with a set lock_id, and report an error
    // if we find a job with a state of "running" and no lock_id. Also, we delete jobs whose last_heartbeat is
    // null, which again should never happen (dequeuing a job should always set the last_heartbeat), but for
    // robustness' sake we may as well handle it.
    let result = sqlx::query!(
        r#"
DELETE FROM cyclotron_jobs WHERE state = 'running' AND COALESCE(last_heartbeat, $1) <= $1 AND janitor_touch_count >= $2
        "#,
        oldest_valid_heartbeat,
        max_janitor_touched
    )
    .execute(executor)
    .await
    .map_err(QueueError::from)?;

    Ok(result.rows_affected())
}
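
A minimal sketch of a janitor tick stringing these operations together - not part of the commit, and the interval and touch threshold below are made-up example values:

// Hedged sketch: one pass of a periodic janitor loop, assuming the functions
// above are in scope and `pool` is a connected PgPool.
async fn janitor_tick(pool: &sqlx::PgPool) -> Result<(), QueueError> {
    let completed = delete_completed_jobs(pool).await?;
    let failed = delete_failed_jobs(pool).await?;
    // Return jobs whose workers stopped heartbeating, then drop repeat offenders.
    let reset = reset_stalled_jobs(pool, Duration::seconds(30)).await?;
    let poisoned = delete_poison_pills(pool, Duration::seconds(30), 3).await?;
    println!(
        "janitor: deleted {} completed / {} failed, reset {} stalled, dropped {} poison pills",
        completed, failed, reset, poisoned
    );
    Ok(())
}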
38
rust/cyclotron-core/src/lib.rs
Normal file
@ -0,0 +1,38 @@
use std::time::Duration;

use serde::{Deserialize, Serialize};
use sqlx::{pool::PoolOptions, PgPool};

pub mod base_ops;
pub mod error;
pub mod janitor_ops;
pub mod manager;
pub mod worker;

// A pool config object, designed to be passable across API boundaries
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct PoolConfig {
    pub db_url: String,
    pub max_connections: Option<u32>,         // Defaults to 10
    pub min_connections: Option<u32>,         // Defaults to 1
    pub acquire_timeout_seconds: Option<u64>, // Defaults to 30
    pub max_lifetime_seconds: Option<u64>,    // Defaults to 300
    pub idle_timeout_seconds: Option<u64>,    // Defaults to 60
}

impl PoolConfig {
    pub async fn connect(&self) -> Result<PgPool, sqlx::Error> {
        let builder = PoolOptions::new()
            .max_connections(self.max_connections.unwrap_or(10))
            .min_connections(self.min_connections.unwrap_or(1))
            .max_lifetime(Duration::from_secs(
                self.max_lifetime_seconds.unwrap_or(300),
            ))
            .idle_timeout(Duration::from_secs(self.idle_timeout_seconds.unwrap_or(60)))
            .acquire_timeout(Duration::from_secs(
                self.acquire_timeout_seconds.unwrap_or(30),
            ));

        builder.connect(&self.db_url).await
    }
}
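
A small usage sketch (not part of the commit) - unset fields fall back to the defaults noted above, and the connection string is an example only:

// Hedged sketch: build a PoolConfig and open a pool.
async fn connect_sketch() -> Result<PgPool, sqlx::Error> {
    let config = PoolConfig {
        db_url: "postgresql://posthog:posthog@localhost:5432/cyclotron".to_string(),
        max_connections: Some(4), // override the default of 10
        min_connections: None,
        acquire_timeout_seconds: None,
        max_lifetime_seconds: None,
        idle_timeout_seconds: None,
    };
    config.connect().await
}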
262
rust/cyclotron-core/src/manager.rs
Normal file
@ -0,0 +1,262 @@
use std::sync::atomic::AtomicUsize;

use chrono::{DateTime, Duration, Utc};
use serde::{Deserialize, Serialize};
use sqlx::PgPool;
use tokio::sync::RwLock;

use crate::{
    base_ops::{bulk_create_jobs, count_total_waiting_jobs, create_job, JobInit},
    error::QueueError,
    PoolConfig,
};

pub const DEFAULT_QUEUE_DEPTH_LIMIT: u64 = 10_000;
pub const DEFAULT_SHARD_HEALTH_CHECK_INTERVAL: u64 = 10;

// TODO - right now, a lot of this sharding stuff will be hollow, but later we'll add logic like
// e.g. routing work to alive shards if one is down, or reporting shard failure, etc.
// TODO - here's also where queue management commands will go, like "downgrade the priority of this function"
// or "pause jobs for this team", but we're going to add those ad-hoc as they're needed, not up front
#[derive(Debug, Serialize, Deserialize)]
pub struct ManagerConfig {
    pub shards: Vec<PoolConfig>,
    pub shard_depth_limit: Option<u64>, // Defaults to 10_000 available jobs per shard
    pub shard_depth_check_interval_seconds: Option<u64>, // Defaults to 10 seconds - checking shard capacity
}

pub struct Shard {
    pub pool: PgPool,
    pub last_healthy: RwLock<DateTime<Utc>>,
    pub check_interval: Duration,
    pub depth_limit: u64,
}

pub struct QueueManager {
    shards: RwLock<Vec<Shard>>,
    next_shard: AtomicUsize,
}

// Bulk inserts across multiple shards can partially succeed, so we need to track failures
// and hand back failed job inits to the caller.
pub struct BulkInsertResult {
    pub failures: Vec<(QueueError, Vec<JobInit>)>,
}

impl QueueManager {
    pub async fn new(config: ManagerConfig) -> Result<Self, QueueError> {
        let mut shards = vec![];
        let depth_limit = config
            .shard_depth_limit
            .unwrap_or(DEFAULT_QUEUE_DEPTH_LIMIT);
        let check_interval = Duration::seconds(
            config
                .shard_depth_check_interval_seconds
                .unwrap_or(DEFAULT_SHARD_HEALTH_CHECK_INTERVAL) as i64,
        );
        for shard in config.shards {
            // Bubble connection failures up to the caller, rather than panicking
            let pool = shard.connect().await?;
            let shard = Shard::new(pool, depth_limit, check_interval);
            shards.push(shard);
        }
        Ok(Self {
            shards: RwLock::new(shards),
            next_shard: AtomicUsize::new(0),
        })
    }

    // Designed mostly to be used for testing, but safe enough to expose publicly
    pub fn from_pool(pool: PgPool) -> Self {
        Self {
            shards: RwLock::new(vec![Shard::new(
                pool,
                DEFAULT_QUEUE_DEPTH_LIMIT,
                Duration::seconds(DEFAULT_SHARD_HEALTH_CHECK_INTERVAL as i64),
            )]),
            next_shard: AtomicUsize::new(0),
        }
    }

    pub async fn create_job(&self, init: JobInit) -> Result<(), QueueError> {
        // TODO - here is where a lot of shard health and failover logic will go, eventually.
        let next = self
            .next_shard
            .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
        let shards = self.shards.read().await;
        let shard = &shards[next % shards.len()];
        shard.create_job(init).await
    }

    pub async fn create_job_blocking(
        &self,
        init: JobInit,
        timeout: Option<Duration>,
    ) -> Result<(), QueueError> {
        let next = self
            .next_shard
            .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
        let shards = self.shards.read().await;
        let shard = &shards[next % shards.len()];
        shard.create_job_blocking(init, timeout).await
    }

    pub async fn bulk_create_jobs(&self, inits: Vec<JobInit>) -> BulkInsertResult {
        let shards = self.shards.read().await;
        // Use a chunk size of at least 1, so `chunks` can't panic when there are
        // fewer inits than shards
        let chunk_size = (inits.len() / shards.len()).max(1);
        let mut result = BulkInsertResult::new();
        // TODO - at some point, we should dynamically re-acquire the lock each time, to allow
        // for re-routing jobs away from a bad shard during a bulk insert, but right now, we
        // don't even re-try inserts. Later work.
        for chunk in inits.chunks(chunk_size) {
            let next_shard = self
                .next_shard
                .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
            let shard = &shards[next_shard % shards.len()];
            let shard_result = shard.bulk_create_jobs(chunk).await;
            if let Err(err) = shard_result {
                result.add_failure(err, chunk.to_vec());
            }
        }

        result
    }

    pub async fn bulk_create_jobs_blocking(
        &self,
        inits: Vec<JobInit>,
        timeout: Option<Duration>,
    ) -> BulkInsertResult {
        let shards = self.shards.read().await;
        // As above, guard against a zero chunk size
        let chunk_size = (inits.len() / shards.len()).max(1);
        let mut result = BulkInsertResult::new();
        for chunk in inits.chunks(chunk_size) {
            let next_shard = self
                .next_shard
                .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
            let shard = &shards[next_shard % shards.len()];
            // TODO - we sequentially try each shard, but we could try to parallelize this.
            let shard_result = shard.bulk_create_jobs_blocking(chunk, timeout).await;
            if let Err(err) = shard_result {
                result.add_failure(err, chunk.to_vec());
            }
        }

        result
    }
}

impl Shard {
    pub fn new(pool: PgPool, depth_limit: u64, check_interval: Duration) -> Self {
        Self {
            pool,
            last_healthy: RwLock::new(Utc::now() - check_interval),
            check_interval,
            depth_limit,
        }
    }

    // Inserts a job, failing if the shard is at capacity
    pub async fn create_job(&self, init: JobInit) -> Result<(), QueueError> {
        self.insert_guard().await?;
        create_job(&self.pool, init).await
    }

    // Inserts a vec of jobs, failing if the shard is at capacity. Note "capacity" here just
    // means "it isn't totally full" - if there's "capacity" for 1 job, and this is a vec of
    // 1000, we still insert all 1000.
    pub async fn bulk_create_jobs(&self, inits: &[JobInit]) -> Result<(), QueueError> {
        self.insert_guard().await?;
        bulk_create_jobs(&self.pool, inits).await
    }

    // Inserts a job, blocking until there's capacity (or until the timeout is reached)
    pub async fn create_job_blocking(
        &self,
        init: JobInit,
        timeout: Option<Duration>,
    ) -> Result<(), QueueError> {
        let start = Utc::now();
        while self.is_full().await? {
            tokio::time::sleep(Duration::milliseconds(100).to_std().unwrap()).await;
            if let Some(timeout) = &timeout {
                if Utc::now() - start > *timeout {
                    return Err(QueueError::TimedOutWaitingForCapacity);
                }
            }
        }

        create_job(&self.pool, init).await
    }

    pub async fn bulk_create_jobs_blocking(
        &self,
        inits: &[JobInit],
        timeout: Option<Duration>,
    ) -> Result<(), QueueError> {
        let start = Utc::now();
        while self.is_full().await? {
            tokio::time::sleep(Duration::milliseconds(100).to_std().unwrap()).await;
            if let Some(timeout) = &timeout {
                if Utc::now() - start > *timeout {
                    return Err(QueueError::TimedOutWaitingForCapacity);
                }
            }
        }

        bulk_create_jobs(&self.pool, inits).await
    }

    pub async fn insert_guard(&self) -> Result<(), QueueError> {
        if self.is_full().await? {
            return Err(QueueError::ShardFull(self.depth_limit));
        }

        Ok(())
    }

    pub async fn is_full(&self) -> Result<bool, QueueError> {
        let last_healthy = self.last_healthy.read().await;
        // If we were healthy less than the check interval ago, assume we still are
        if Utc::now() - *last_healthy < self.check_interval {
            return Ok(false);
        }

        // Grab a write lock. This constrains the number of concurrent capacity checks
        // to 1, purposefully - if someone spawns a thousand tasks to blockingly create
        // a job, we don't want all of them to be querying the available count at once.
        drop(last_healthy);
        let mut last_healthy = self.last_healthy.write().await;
        // TOCTOU - multiple tasks could be racing to re-do the check, and the first time one
        // succeeds all the rest should skip it.
        if Utc::now() - *last_healthy < self.check_interval {
            return Ok(false);
        }

        let pending = count_total_waiting_jobs(&self.pool).await?;
        let is_full = pending >= self.depth_limit;
        if !is_full {
            *last_healthy = Utc::now();
        }
        Ok(is_full)
    }
}

impl BulkInsertResult {
    pub fn new() -> Self {
        Self { failures: vec![] }
    }

    pub fn add_failure(&mut self, err: QueueError, jobs: Vec<JobInit>) {
        self.failures.push((err, jobs));
    }

    pub fn all_succeeded(&self) -> bool {
        self.failures.is_empty()
    }
}

impl Default for BulkInsertResult {
    fn default() -> Self {
        Self::new()
    }
}
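
Because bulk inserts can partially fail per shard, callers are expected to inspect the result. A hedged sketch (not part of the commit) of a naive single retry:

// Illustrative only - real callers would want backoff and a retry budget.
async fn insert_with_one_retry(manager: &QueueManager, jobs: Vec<JobInit>) {
    let result = manager.bulk_create_jobs(jobs).await;
    if result.all_succeeded() {
        return;
    }
    for (err, failed) in result.failures {
        eprintln!("shard rejected {} jobs: {}", failed.len(), err);
        let retry = manager.bulk_create_jobs(failed).await;
        if !retry.all_succeeded() {
            eprintln!("retry failed too, dropping batch");
        }
    }
}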
229
rust/cyclotron-core/src/worker.rs
Normal file
@ -0,0 +1,229 @@
|
||||
use std::{collections::HashMap, sync::Arc};
|
||||
|
||||
use chrono::{DateTime, Utc};
|
||||
use sqlx::PgPool;
|
||||
use std::sync::Mutex;
|
||||
use uuid::Uuid;
|
||||
|
||||
use crate::{
|
||||
base_ops::{
|
||||
dequeue_jobs, dequeue_with_vm_state, flush_job, set_heartbeat, Job, JobState, JobUpdate,
|
||||
},
|
||||
error::QueueError,
|
||||
PoolConfig,
|
||||
};
|
||||
|
||||
// The worker's interface to the underlying queue system - a worker can do everything except
|
||||
// create jobs (because job creation has to be shard-aware).
|
||||
//
|
||||
// This interface looks stange, because a lot of things that would normally be done with lifetimes
|
||||
// and references are done with uuid's instead (and we lose some nice raii stuff as a result), but
|
||||
// the reason for this is that this is designed to be embedded in other runtimes, where handing out
|
||||
// lifetime'd references or things with drop impls isn't really practical. This makes it a little
|
||||
// awkward to use, but since it's meant to be the core of other abstractions, I think it's ok for
|
||||
// now (client libraries should wrap this to provide better interfaces).
|
||||
pub struct Worker {
|
||||
pool: PgPool,
|
||||
// All dequeued job IDs that haven't been flushed yet. The idea is this lets us
|
||||
// manage, on the rust side of any API boundary, the "pending" update of any given
|
||||
// job, such that a user can progressively build up a full update, and then flush it,
|
||||
// rather than having to track the update state on their side and submit it all at once
|
||||
// TODO - we don't handle people "forgetting" to abort a job, because we expect that to
|
||||
// only happen if a process dies (in which case the job queue janitor should handle
|
||||
// it)... this is a memory leak, but I think it's ok.
|
||||
// TRICKY - this is a sync mutex, because we never hold it across an await point, and that
|
||||
// radically simplifies using this for FFI (because there's no message passing across runtimes)
|
||||
pending: Arc<Mutex<HashMap<Uuid, JobUpdate>>>,
|
||||
}
|
||||
|
||||
impl Worker {
|
||||
pub async fn new(config: PoolConfig) -> Result<Self, QueueError> {
|
||||
let pool = config.connect().await?;
|
||||
Ok(Self {
|
||||
pool,
|
||||
pending: Arc::new(Mutex::new(HashMap::new())),
|
||||
})
|
||||
}
|
||||
|
||||
pub fn from_pool(pool: PgPool) -> Self {
|
||||
Self {
|
||||
pool,
|
||||
pending: Arc::new(Mutex::new(HashMap::new())),
|
||||
}
|
||||
}
|
||||
|
||||
/// Dequeues jobs from the queue, and returns them. Job sorting happens at the queue level,
|
||||
/// workers can't provide any filtering or sorting criteria - queue managers decide which jobs are run,
|
||||
/// workers just run them.
|
||||
pub async fn dequeue_jobs(&self, queue: &str, limit: usize) -> Result<Vec<Job>, QueueError> {
|
||||
let jobs = dequeue_jobs(&self.pool, queue, limit).await?;
|
||||
|
||||
let mut pending = self.pending.lock().unwrap();
|
||||
for job in &jobs {
|
||||
// We need to hang onto the locks for a job until we flush it, so we can send updates.
|
||||
let update = JobUpdate::new(
|
||||
job.lock_id
|
||||
.expect("Yell at oliver that the dequeuing code is broken. He's very sorry that your process just panicked"),
|
||||
);
|
||||
pending.insert(job.id, update);
|
||||
}
|
||||
|
||||
Ok(jobs)
|
||||
}
|
||||
|
||||
/// This is the same as dequeue_jobs, but it also returns the vm_state of the job
|
||||
pub async fn dequeue_with_vm_state(
|
||||
&self,
|
||||
queue: &str,
|
||||
limit: usize,
|
||||
) -> Result<Vec<Job>, QueueError> {
|
||||
let jobs = dequeue_with_vm_state(&self.pool, queue, limit).await?;
|
||||
|
||||
let mut pending = self.pending.lock().unwrap();
|
||||
for job in &jobs {
|
||||
// We need to hang onto the locks for a job until we flush it, so we can send updates.
|
||||
let update = JobUpdate::new(
|
||||
job.lock_id
|
||||
.expect("Yell at oliver that the dequeuing (with vm) code is broken. He's very sorry that your process just panicked"),
|
||||
);
|
||||
pending.insert(job.id, update);
|
||||
}
|
||||
|
||||
Ok(jobs)
|
||||
}
|
||||
|
||||
/// NOTE - This function can only be called once, even though the underlying
|
||||
/// basic operation can be performed as many times as the caller likes (so long as
|
||||
/// the job state is never set to something other than running, as that clears the
|
||||
/// job lock). We're more strict here (flushes can only happen once, you must
|
||||
/// flush some non-running state) to try and enforce a good interaction
|
||||
/// pattern with the queue. I might return to this and loosen this constraint in the
|
||||
/// future, if there's a motivating case for needing to flush partial job updates.
|
||||
pub async fn flush_job(&self, job_id: Uuid) -> Result<(), QueueError> {
|
||||
// TODO - this drops the job from the known jobs before the flush succeeds,
|
||||
// which means that if the flush fails, we'll lose the job and can never
|
||||
// update it's state (leaving it to the reaper). This is a bug, but I'm not
|
||||
// sure I want to make flushes retryable just yet, so I'm leaving it for now.
|
||||
// NIT: this wrapping is to ensure pending is dropped prior to the await
|
||||
let update = {
|
||||
let mut pending = self.pending.lock().unwrap();
|
||||
let update = pending
|
||||
.remove(&job_id)
|
||||
.ok_or(QueueError::UnknownJobId(job_id))?;
|
||||
// It's a programming error to flush a job without setting a new state
|
||||
match update.state {
|
||||
Some(JobState::Running) | None => {
|
||||
// Keep track of any /other/ updates that might have been stored, even in this case,
|
||||
// so a user can queue up the appropriate state transition and flush properly
|
||||
pending.insert(job_id, update);
|
||||
return Err(QueueError::FlushWithoutNextState(job_id));
|
||||
}
|
||||
_ => update,
|
||||
}
|
||||
};
|
||||
let mut connection = self.pool.acquire().await?;
|
||||
flush_job(connection.as_mut(), job_id, update).await
|
||||
}
|
||||
|
||||
    /// Jobs are reaped after some seconds (the number is deployment specific, and may become
    /// specific to job properties like queue name in the future, as we figure out what /kinds/ of
    /// jobs are longer or shorter running). A job is considered "dead" if it's in a running state,
    /// and its last heartbeat was more than the reaping time ago. This, like flush, returns an
    /// error if you try to set the heartbeat on a job whose lock you don't have (which can happen
    /// if e.g. the job was reaped out from under you).
    pub async fn heartbeat(&self, job_id: Uuid) -> Result<(), QueueError> {
        let lock_id = {
            let pending = self.pending.lock().unwrap();
            pending
                .get(&job_id)
                .ok_or(QueueError::UnknownJobId(job_id))?
                .lock_id
        };
        let mut connection = self.pool.acquire().await?;
        set_heartbeat(connection.as_mut(), job_id, lock_id).await
    }

    /// This is how you "return" a job to the queue, by setting the state to "available"
    pub fn set_state(&self, job_id: Uuid, state: JobState) -> Result<(), QueueError> {
        let mut pending = self.pending.lock().unwrap();
        pending
            .get_mut(&job_id)
            .ok_or(QueueError::UnknownJobId(job_id))?
            .state = Some(state);
        Ok(())
    }

    pub fn set_queue(&self, job_id: Uuid, queue: &str) -> Result<(), QueueError> {
        let mut pending = self.pending.lock().unwrap();
        pending
            .get_mut(&job_id)
            .ok_or(QueueError::UnknownJobId(job_id))?
            .queue_name = Some(queue.to_string());
        Ok(())
    }

    /// Jobs are dequeued lowest-priority-first, so this is how you change the "base" priority of a job
    /// (control tables may apply further deltas if e.g. a given function is in a degraded state)
    pub fn set_priority(&self, job_id: Uuid, priority: i16) -> Result<(), QueueError> {
        let mut pending = self.pending.lock().unwrap();
        pending
            .get_mut(&job_id)
            .ok_or(QueueError::UnknownJobId(job_id))?
            .priority = Some(priority);
        Ok(())
    }

    /// This is how you do e.g. retries after some time, by setting the scheduled time
    /// to some time in the future. Sleeping, retry backoff, scheduling - it's all the same operation,
    /// this one.
    pub fn set_scheduled_at(
        &self,
        job_id: Uuid,
        scheduled: DateTime<Utc>,
    ) -> Result<(), QueueError> {
        let mut pending = self.pending.lock().unwrap();
        pending
            .get_mut(&job_id)
            .ok_or(QueueError::UnknownJobId(job_id))?
            .scheduled = Some(scheduled);
        Ok(())
    }

    /// Passing None here will clear the vm_state
    pub fn set_vm_state(
        &self,
        job_id: Uuid,
        vm_state: Option<String>, // This (and the following) are Options, because the user can null them (by calling with None)
    ) -> Result<(), QueueError> {
        let mut pending = self.pending.lock().unwrap();
        pending
            .get_mut(&job_id)
            .ok_or(QueueError::UnknownJobId(job_id))?
            .vm_state = Some(vm_state);
        Ok(())
    }

    /// Passing None here will clear the metadata
    pub fn set_metadata(&self, job_id: Uuid, metadata: Option<String>) -> Result<(), QueueError> {
        let mut pending = self.pending.lock().unwrap();
        pending
            .get_mut(&job_id)
            .ok_or(QueueError::UnknownJobId(job_id))?
            .metadata = Some(metadata);
        Ok(())
    }

    /// Passing None here will clear the parameters
    pub fn set_parameters(
        &self,
        job_id: Uuid,
        parameters: Option<String>,
    ) -> Result<(), QueueError> {
        let mut pending = self.pending.lock().unwrap();
        pending
            .get_mut(&job_id)
            .ok_or(QueueError::UnknownJobId(job_id))?
            .parameters = Some(parameters);
        Ok(())
    }
}
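
Taken together, these methods give workers a buffered-update API: dequeuing takes the job's lock, the set_* calls stage changes locally, and flush_job writes them back and releases the lock. A minimal sketch of that cycle, assuming a Worker built via Worker::from_pool (the queue name and surrounding function are illustrative, not part of this commit):

use cyclotron_core::{base_ops::JobState, error::QueueError, worker::Worker};

async fn work_one_batch(worker: &Worker) -> Result<(), QueueError> {
    // Dequeuing marks the jobs as running and takes their locks
    let jobs = worker.dequeue_jobs("example", 10).await?;
    for job in jobs {
        worker.heartbeat(job.id).await?; // keeps the janitor from reaping us mid-work
        // ... do the actual work for this job here ...
        worker.set_state(job.id, JobState::Completed)?; // staged locally, like all set_* calls
        worker.flush_job(job.id).await?; // writes the staged changes and releases the lock
    }
    Ok(())
}
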
255
rust/cyclotron-core/tests/base_ops.rs
Normal file
@ -0,0 +1,255 @@
use std::sync::Arc;

use chrono::{Duration, Utc};
use common::{assert_job_matches_init, create_new_job, dates_match};
use cyclotron_core::{
    base_ops::{bulk_create_jobs, JobState},
    manager::QueueManager,
    worker::Worker,
};
use sqlx::PgPool;
use uuid::Uuid;

mod common;

// I know this should be a bunch of tests, but for hacking together stuff right now, it'll do
#[sqlx::test(migrations = "./migrations")]
async fn test_queue(db: PgPool) {
    let manager = QueueManager::from_pool(db.clone());
    let worker = Worker::from_pool(db);

    let job_1 = create_new_job();
    let mut job_2 = create_new_job();

    job_2.priority = 2; // Lower priority jobs should be returned second

    let queue_name = job_1.queue_name.clone();

    manager
        .create_job(job_1.clone())
        .await
        .expect("failed to create job");
    manager
        .create_job(job_2.clone())
        .await
        .expect("failed to create job");

    let jobs = worker
        .dequeue_jobs(&queue_name, 2)
        .await
        .expect("failed to dequeue job");

    assert_eq!(jobs.len(), 2);
    // This also asserts that the ordering is correct in terms of priority
    assert_job_matches_init(&jobs[0], &job_1);
    assert_job_matches_init(&jobs[1], &job_2);

    // Now we can re-queue these jobs (imagine we had done work)
    worker
        .set_state(jobs[0].id, JobState::Available)
        .expect("failed to set state");
    worker
        .set_state(jobs[1].id, JobState::Available)
        .expect("failed to set state");

    // Flush the two jobs, having made no other changes, then assert we can re-dequeue them
    worker
        .flush_job(jobs[0].id)
        .await
        .expect("failed to flush job");
    worker
        .flush_job(jobs[1].id)
        .await
        .expect("failed to flush job");

    let jobs = worker
        .dequeue_jobs(&queue_name, 2)
        .await
        .expect("failed to dequeue job");

    assert_eq!(jobs.len(), 2);
    assert_job_matches_init(&jobs[0], &job_1);
    assert_job_matches_init(&jobs[1], &job_2);

    // Re-queue them again
    worker
        .set_state(jobs[0].id, JobState::Available)
        .expect("failed to set state");
    worker
        .set_state(jobs[1].id, JobState::Available)
        .expect("failed to set state");

    worker
        .flush_job(jobs[0].id)
        .await
        .expect("failed to flush job");
    worker
        .flush_job(jobs[1].id)
        .await
        .expect("failed to flush job");

    // Spin up two tasks to race on dequeuing, and assert at most 2 jobs are dequeued
    let worker = Arc::new(worker);
    let moved = worker.clone();
    let queue_name_moved = queue_name.clone();
    let fut_1 = async move {
        moved
            .dequeue_jobs(&queue_name_moved, 2)
            .await
            .expect("failed to dequeue job")
    };
    let moved = worker.clone();
    let queue_name_moved = queue_name.clone();
    let fut_2 = async move {
        moved
            .dequeue_jobs(&queue_name_moved, 2)
            .await
            .expect("failed to dequeue job")
    };

    let (jobs_1, jobs_2) = tokio::join!(fut_1, fut_2);
    assert_eq!(jobs_1.len() + jobs_2.len(), 2);

    let jobs = jobs_1
        .into_iter()
        .chain(jobs_2.into_iter())
        .collect::<Vec<_>>();

    // And now, any subsequent dequeues will return no jobs
    let empty = worker
        .dequeue_jobs(&queue_name, 2)
        .await
        .expect("failed to dequeue job");
    assert_eq!(empty.len(), 0);

    // If we try to flush a job without setting what its next state will be (or if we set that next state to be "running"),
    // we should get an error
    worker
        .flush_job(jobs[0].id)
        .await
        .expect_err("expected error due to no-next-state");

    worker
        .set_state(jobs[1].id, JobState::Running)
        .expect("failed to set state");
    worker
        .flush_job(jobs[1].id)
        .await
        .expect_err("expected error due to running state");

    // But if we properly set the state to completed or failed, now we can flush
    worker
        .set_state(jobs[0].id, JobState::Completed)
        .expect("failed to set state");
    worker
        .set_state(jobs[1].id, JobState::Failed)
        .expect("failed to set state");

    worker
        .flush_job(jobs[0].id)
        .await
        .expect("failed to flush job");
    worker
        .flush_job(jobs[1].id)
        .await
        .expect("failed to flush job");

    // And now, any subsequent dequeues will return no jobs (because these jobs are finished)
    let empty = worker
        .dequeue_jobs(&queue_name, 2)
        .await
        .expect("failed to dequeue job");
    assert_eq!(empty.len(), 0);

    // Now, let's check that we can set every variable on a job

    // Set up some initial values
    let now = Utc::now();
    let mut job = create_new_job();
    job.queue_name = "test".to_string();
    job.priority = 0;
    job.scheduled = now - Duration::minutes(2);
    job.vm_state = None;
    job.parameters = None;
    job.metadata = None;

    // Queue the job
    manager
        .create_job(job.clone())
        .await
        .expect("failed to create job");

    // Then dequeue it
    let job = worker
        .dequeue_jobs("test", 1)
        .await
        .expect("failed to dequeue job")
        .pop()
        .expect("failed to dequeue job");

    // Set everything we're able to set, including state to available, so we can dequeue it again
    worker
        .set_state(job.id, JobState::Available)
        .expect("failed to set state");
    worker
        .set_queue(job.id, "test_2")
        .expect("failed to set queue");
    worker
        .set_priority(job.id, 1)
        .expect("failed to set priority");
    worker
        .set_scheduled_at(job.id, now - Duration::minutes(10))
        .expect("failed to set scheduled_at");
    worker
        .set_vm_state(job.id, Some("test".to_string()))
        .expect("failed to set vm_state");
    worker
        .set_parameters(job.id, Some("test".to_string()))
        .expect("failed to set parameters");
    worker
        .set_metadata(job.id, Some("test".to_string()))
        .expect("failed to set metadata");

    // Flush the job
    worker.flush_job(job.id).await.expect("failed to flush job");

    // Then dequeue it again (this time being sure to grab the vm state too)
    let job = worker
        .dequeue_with_vm_state("test_2", 1)
        .await
        .expect("failed to dequeue job")
        .pop()
        .expect("failed to dequeue job");

    // And every value should be the updated one
    assert_eq!(job.queue_name, "test_2");
    assert_eq!(job.priority, 1);
    assert!(dates_match(&job.scheduled, &(now - Duration::minutes(10))));
    assert_eq!(job.vm_state, Some("test".to_string()));
    assert_eq!(job.parameters, Some("test".to_string()));
    assert_eq!(job.metadata, Some("test".to_string()));
}

#[sqlx::test(migrations = "./migrations")]
pub async fn test_bulk_insert(db: PgPool) {
    let worker = Worker::from_pool(db.clone());

    let job_template = create_new_job();

    let jobs = (0..1000)
        .map(|_| {
            let mut job = job_template.clone();
            job.function_id = Some(Uuid::now_v7());
            job
        })
        .collect::<Vec<_>>();

    bulk_create_jobs(&db, &jobs).await.unwrap();

    let dequeue_jobs = worker
        .dequeue_jobs(&job_template.queue_name, 1000)
        .await
        .expect("failed to dequeue job");

    assert_eq!(dequeue_jobs.len(), 1000);
}
40
rust/cyclotron-core/tests/common.rs
Normal file
@ -0,0 +1,40 @@
use chrono::{DateTime, Duration, Utc};
use cyclotron_core::base_ops::{Job, JobInit};
use uuid::Uuid;

#[allow(dead_code)]
pub fn create_new_job() -> JobInit {
    JobInit {
        team_id: 1,
        function_id: Some(Uuid::now_v7()), // Lets us uniquely identify jobs without having the Uuid
        queue_name: "test".to_string(),
        priority: 0,
        scheduled: Utc::now() - Duration::minutes(1),
        vm_state: None,
        parameters: None,
        metadata: None,
    }
}

#[allow(dead_code)]
pub fn dates_match(left: &DateTime<Utc>, right: &DateTime<Utc>) -> bool {
    // Roundtripping a datetime to PG can cause sub-ms differences, so we need to check within a margin of error
    // Seeing errors like this in CI:
    // assertion `left == right` failed
    //   left: 2024-08-08T20:41:55.964936Z
    //  right: 2024-08-08T20:41:55.964936997Z
    let diff = *left - *right;
    diff.abs() < Duration::milliseconds(1)
}

#[allow(dead_code)]
pub fn assert_job_matches_init(job: &Job, init: &JobInit) {
    assert_eq!(job.team_id, init.team_id);
    assert_eq!(job.function_id, init.function_id);
    assert_eq!(job.queue_name, init.queue_name);
    assert_eq!(job.priority, init.priority);
    assert!(dates_match(&job.scheduled, &init.scheduled));
    assert_eq!(job.vm_state, init.vm_state);
    assert_eq!(job.parameters, init.parameters);
    assert_eq!(job.metadata, init.metadata);
}
68
rust/cyclotron-core/tests/shard.rs
Normal file
@ -0,0 +1,68 @@
use chrono::{Duration, Utc};
use common::create_new_job;
use cyclotron_core::manager::Shard;
use sqlx::PgPool;
use tokio::sync::RwLock;

mod common;

pub fn get_shard(db: PgPool) -> Shard {
    Shard {
        pool: db,
        last_healthy: RwLock::new(Utc::now()),
        check_interval: Duration::milliseconds(0), // We always want to check the limit, for these tests
        depth_limit: 10,
    }
}

#[sqlx::test(migrations = "./migrations")]
pub async fn test_shard_limiting(db: PgPool) {
    let shard = get_shard(db.clone());

    // We should be able to insert 10 jobs
    for _ in 0..10 {
        shard.create_job(create_new_job()).await.unwrap();
    }

    // And then we should fail on the 11th
    let result = shard.create_job(create_new_job()).await;
    assert!(result.is_err());
}

#[sqlx::test(migrations = "./migrations")]
pub async fn test_shard_blocking_insert_waits(db: PgPool) {
    let shard = get_shard(db.clone());

    // We should be able to insert 10 jobs
    for _ in 0..10 {
        shard.create_job(create_new_job()).await.unwrap();
    }

    let timeout = Some(Duration::milliseconds(50));

    let start = Utc::now();
    // And then the 11th should block until the timeout expires, then fail
    let result = shard.create_job_blocking(create_new_job(), timeout).await;
    assert!(result.is_err());

    // We should have waited at least 50ms
    assert!(Utc::now() - start >= Duration::milliseconds(50));
}

#[sqlx::test(migrations = "./migrations")]
pub async fn test_shard_allows_bulk_inserts_beyond_capacity(db: PgPool) {
    let shard = get_shard(db.clone());

    // Insert 9 jobs, leaving one slot below the depth limit
    for _ in 0..9 {
        shard.create_job(create_new_job()).await.unwrap();
    }

    // And then we should be able to bulk insert 1000
    let inits = (0..1000).map(|_| create_new_job()).collect::<Vec<_>>();
    shard.bulk_create_jobs(&inits).await.unwrap();

    // And the next insert should fail
    let result = shard.create_job(create_new_job()).await;
    assert!(result.is_err());
}
32
rust/cyclotron-fetch/Cargo.toml
Normal file
@ -0,0 +1,32 @@
[package]
name = "cyclotron-fetch"
version = "0.1.0"
edition = "2021"

[lints]
workspace = true

[dependencies]
tracing-subscriber = { workspace = true }
chrono = { workspace = true }
tokio = { workspace = true }
tracing = { workspace = true }
uuid = { workspace = true }
envconfig = { workspace = true }
axum = { workspace = true }
thiserror = { workspace = true }
metrics = { workspace = true }
cyclotron-core = { path = "../cyclotron-core" }
common-metrics = { path = "../common/metrics" }
common-dns = { path = "../common/dns" }
health = { path = "../common/health" }
reqwest = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
http = { workspace = true }
rand = { workspace = true }
futures = { workspace = true }

[dev-dependencies]
sqlx = { workspace = true }
httpmock = { workspace = true }
104
rust/cyclotron-fetch/src/config.rs
Normal file
@ -0,0 +1,104 @@
use chrono::Duration;
use cyclotron_core::PoolConfig;
use envconfig::Envconfig;
use uuid::Uuid;

#[derive(Envconfig)]
pub struct Config {
    #[envconfig(from = "BIND_HOST", default = "::")]
    pub host: String,

    #[envconfig(from = "BIND_PORT", default = "3304")]
    pub port: u16,

    #[envconfig(default = "postgres://posthog:posthog@localhost:5432/cyclotron")]
    pub database_url: String,

    #[envconfig(default = "10")]
    pub pg_max_connections: u32,

    #[envconfig(default = "1")]
    pub pg_min_connections: u32,

    #[envconfig(default = "30")]
    pub pg_acquire_timeout_seconds: u64,

    #[envconfig(default = "300")]
    pub pg_max_lifetime_seconds: u64,

    #[envconfig(default = "60")]
    pub pg_idle_timeout_seconds: u64,

    #[envconfig(default = "false")]
    pub allow_internal_ips: bool,

    pub worker_id: Option<String>,              // Defaults to a UUID
    pub job_poll_interval_seconds: Option<u32>, // Defaults to 1
    pub concurrent_requests_limit: Option<u32>, // Defaults to 1000
    pub fetch_timeout_seconds: Option<u32>,     // Defaults to 30
    pub max_retry_attempts: Option<u32>,        // Defaults to 10
    pub queue_served: Option<String>,           // Defaults to "fetch"
    pub batch_size: Option<usize>,              // Defaults to 1000
    pub max_response_bytes: Option<usize>,      // Defaults to 1MB
    pub retry_backoff_base_ms: Option<u32>,     // Defaults to 4000
}

// I do this instead of using envconfig's defaults because
// envconfig doesn't support defaults provided by functions,
// which is frustrating when I want to use UUIDs, and if I'm
// going to break out one field, I might as well break out
// everything into "AppConfig" and "PoolConfig"
#[derive(Debug, Clone)]
pub struct AppConfig {
    pub host: String,
    pub port: u16,
    pub worker_id: String,
    pub job_poll_interval: Duration, // How long we wait to poll for new jobs, when we're at capacity or find no new jobs
    pub concurrent_requests_limit: u32,
    pub fetch_timeout: Duration,
    pub max_retry_attempts: u32,
    pub queue_served: String,
    pub batch_size: usize,
    pub max_response_bytes: usize,
    pub retry_backoff_base: Duration, // Job retry backoff times are this * attempt count
    pub allow_internal_ips: bool,
}

impl Config {
    pub fn to_components(self) -> (AppConfig, PoolConfig) {
        let worker_id = self.worker_id.unwrap_or_else(|| Uuid::now_v7().to_string());
        let job_poll_interval_seconds = self.job_poll_interval_seconds.unwrap_or(1);
        let concurrent_requests_limit = self.concurrent_requests_limit.unwrap_or(1000);
        let fetch_timeout_seconds = self.fetch_timeout_seconds.unwrap_or(30);
        let max_retry_attempts = self.max_retry_attempts.unwrap_or(10);
        let queue_served = self.queue_served.unwrap_or_else(|| "fetch".to_string());

        let app_config = AppConfig {
            host: self.host,
            port: self.port,
            worker_id,
            job_poll_interval: Duration::seconds(job_poll_interval_seconds as i64),
            concurrent_requests_limit,
            fetch_timeout: Duration::seconds(fetch_timeout_seconds as i64),
            max_retry_attempts,
            queue_served,
            batch_size: self.batch_size.unwrap_or(1000),
            max_response_bytes: self.max_response_bytes.unwrap_or(1024 * 1024),
            retry_backoff_base: Duration::milliseconds(
                self.retry_backoff_base_ms.unwrap_or(4000) as i64
            ),
            allow_internal_ips: self.allow_internal_ips,
        };

        let pool_config = PoolConfig {
            db_url: self.database_url,
            max_connections: Some(self.pg_max_connections),
            min_connections: Some(self.pg_min_connections),
            acquire_timeout_seconds: Some(self.pg_acquire_timeout_seconds),
            max_lifetime_seconds: Some(self.pg_max_lifetime_seconds),
            idle_timeout_seconds: Some(self.pg_idle_timeout_seconds),
        };

        (app_config, pool_config)
    }
}
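
A short sketch of the intended call pattern, mirroring the startup path in main.rs below (the assertion is illustrative, and assumes envconfig's usual field-name-to-env-var mapping for the optional fields):

use cyclotron_fetch::config::Config;
use envconfig::Envconfig;

fn load_config() {
    // Reads BIND_HOST, BIND_PORT, DATABASE_URL and friends from the environment,
    // then fills in the function-style defaults for the Option fields
    let config = Config::init_from_env().expect("failed to load configuration from env");
    let (app_config, pool_config) = config.to_components();
    assert_eq!(app_config.queue_served, "fetch"); // the default when no queue is configured
    // app_config drives the worker loop; pool_config is handed to Worker::new
    let _ = pool_config;
}
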
55
rust/cyclotron-fetch/src/context.rs
Normal file
@ -0,0 +1,55 @@
use std::sync::Arc;

use cyclotron_core::{worker::Worker, PoolConfig};
use health::HealthHandle;
use tokio::sync::Semaphore;

use crate::{config::AppConfig, fetch::FetchError};

pub struct AppContext {
    pub worker: Worker,
    pub client: reqwest::Client,
    pub concurrency_limit: Arc<Semaphore>,
    pub liveness: HealthHandle,
    pub config: AppConfig,
}

impl AppContext {
    pub async fn create(
        config: AppConfig,
        pool_config: PoolConfig,
        liveness: HealthHandle,
    ) -> Result<Self, FetchError> {
        let concurrency_limit = Arc::new(Semaphore::new(config.concurrent_requests_limit as usize));

        let resolver = Arc::new(common_dns::PublicIPv4Resolver {});

        let mut client = reqwest::Client::builder().timeout(config.fetch_timeout.to_std().unwrap());

        if !config.allow_internal_ips {
            client = client.dns_resolver(resolver);
        }

        let client = client.build();

        let client = match client {
            Ok(c) => c,
            Err(e) => {
                return Err(FetchError::StartupError(format!(
                    "Failed to create reqwest client: {}",
                    e
                )));
            }
        };

        let worker = Worker::new(pool_config).await?;

        Ok(Self {
            worker,
            client,
            concurrency_limit,
            liveness,
            config,
        })
    }
}
653
rust/cyclotron-fetch/src/fetch.rs
Normal file
@ -0,0 +1,653 @@
use std::{cmp::min, collections::HashMap, sync::Arc};

use chrono::{DateTime, Duration, Utc};
use cyclotron_core::{
    base_ops::{Job, JobState},
    error::QueueError,
    worker::Worker,
};
use futures::StreamExt;
use http::StatusCode;
use reqwest::Response;
use serde::{Deserialize, Serialize};
use thiserror::Error;
use tokio::sync::OwnedSemaphorePermit;
use tracing::error;

use crate::context::AppContext;

// TODO - a lot of these should maybe be configurable
pub const DEAD_LETTER_QUEUE: &str = "fetch-dead-letter";
pub const DEFAULT_RETRIES: u32 = 3;
pub const DEFAULT_ON_FINISH: OnFinish = OnFinish::Return;
pub const HEARTBEAT_INTERVAL_MS: i64 = 5000;

// Exclusively for errors in the worker - these will
// never be serialised into the job queue, and indicate
// bad worker health. As a general rule, if one of these
// is produced, we should let the worker fall over (as in,
// the outer worker loop should exit).
#[derive(Error, Debug)]
pub enum FetchError {
    #[error("timeout fetching jobs")]
    JobFetchTimeout,
    #[error(transparent)]
    QueueError(#[from] QueueError),
    // TRICKY - in most cases, serde errors are a FetchFailure (something coming from the queue was
    // invalid), but this is used in cases where /we/ fail to serialise something /to/ the queue
    #[error(transparent)]
    SerdeError(#[from] serde_json::Error),
    // We failed doing some kind of setup, like creating a reqwest client
    #[error("error during startup: {0}")]
    StartupError(String),
}

#[derive(Debug, Serialize, Deserialize, Clone, Copy)]
#[serde(rename_all = "UPPERCASE")]
pub enum HttpMethod {
    Get,
    Post,
    Patch,
    Put,
    Delete,
}

impl From<&HttpMethod> for http::Method {
    fn from(method: &HttpMethod) -> Self {
        match method {
            HttpMethod::Get => http::Method::GET,
            HttpMethod::Post => http::Method::POST,
            HttpMethod::Patch => http::Method::PATCH,
            HttpMethod::Put => http::Method::PUT,
            HttpMethod::Delete => http::Method::DELETE,
        }
    }
}

// What does someone need to give us to execute a fetch?
#[derive(Debug, Serialize, Deserialize, Clone)]
#[serde(rename_all = "lowercase")]
pub struct FetchParameters {
    pub url: String,
    pub method: HttpMethod,
    pub return_queue: String,
    pub headers: Option<HashMap<String, String>>,
    pub body: Option<String>,
    pub max_tries: Option<u32>,      // Defaults to 3
    pub on_finish: Option<OnFinish>, // Defaults to Return
}

// What should we do when we get a result, or run out of tries for a given job?
// Return means re-queue to the return_worker, Complete means mark as Completed/Failed
#[derive(Debug, Serialize, Deserialize, Clone, Copy)]
#[serde(rename_all = "lowercase")]
pub enum OnFinish {
    Return,
    Complete,
}

// Internal bookkeeping for a fetch job
#[derive(Debug, Serialize, Deserialize, Clone)]
#[serde(rename_all = "lowercase")]
pub struct FetchMetadata {
    tries: u32,
    // The history of failures seen with this job
    trace: Vec<FetchFailure>,
}

// This is what we put in the parameters of the job queue for the next
// worker to pick up
#[derive(Debug, Serialize, Deserialize, Clone)]
#[serde(tag = "status", rename_all = "lowercase")]
pub enum FetchResult {
    Success { response: FetchResponse },
    Failure { trace: Vec<FetchFailure> }, // If we failed entirely to fetch the job, we return the trace for user debugging
}

impl FetchResult {
    pub fn is_success(&self) -> bool {
        matches!(self, FetchResult::Success { .. })
    }
}

// We distinguish between a "fetch failure" and a "worker failure" -
// worker failures are internal-only, and do not count against the
// retries of a job (generally, on worker failure, the job is either
// moved to the dead letter queue, or dropped and left to the janitor to
// reset). Fetch failures are, after retries, returned to the queue, and
// represent the result of the fetch operation.
#[derive(Debug, Serialize, Deserialize, Clone)]
#[serde(rename_all = "lowercase")]
pub struct FetchFailure {
    pub kind: FetchFailureKind,
    pub message: String,
    pub body: Option<String>, // If we have a body, we include it in the failure
    pub headers: Option<HashMap<String, String>>, // If we have headers, we include them in the failure
    pub status: Option<u16>, // If we have a status, we include it in the failure
    pub timestamp: DateTime<Utc>, // Useful for users to correlate logs when debugging
}

impl FetchFailure {
    pub fn new(kind: FetchFailureKind, message: impl AsRef<str>) -> Self {
        Self {
            kind,
            message: message.as_ref().to_string(),
            timestamp: Utc::now(),
            body: None,
            headers: None,
            status: None,
        }
    }

    pub fn failure_status(status: StatusCode) -> Self {
        Self {
            kind: FetchFailureKind::FailureStatus,
            message: format!("Received failure status: {}", status),
            timestamp: Utc::now(),
            body: None,
            headers: None,
            status: Some(status.as_u16()),
        }
    }

    pub fn with_body(self, body: String) -> Self {
        Self {
            body: Some(body),
            ..self
        }
    }

    pub fn with_headers(self, headers: HashMap<String, String>) -> Self {
        Self {
            headers: Some(headers),
            ..self
        }
    }

    pub fn with_status(self, status: u16) -> Self {
        Self {
            status: Some(status),
            ..self
        }
    }
}

impl From<reqwest::Error> for FetchFailure {
    fn from(e: reqwest::Error) -> Self {
        let kind = if e.is_timeout() {
            FetchFailureKind::Timeout
        } else {
            FetchFailureKind::RequestError
        };
        Self {
            kind,
            message: e.to_string(),
            timestamp: Utc::now(),
            body: None,
            headers: None,
            status: None,
        }
    }
}

#[derive(Debug, Serialize, Deserialize, Clone, Copy)]
#[serde(rename_all = "lowercase")]
pub enum FetchFailureKind {
    Timeout,
    TimeoutGettingBody,
    MissingParameters,
    InvalidParameters,
    RequestError,
    FailureStatus,
    InvalidBody, // Generally means the body could not be parsed to a utf8 string
    ResponseTooLarge,
}

#[derive(Debug, Serialize, Deserialize, Clone)]
#[serde(rename_all = "lowercase")]
pub struct FetchResponse {
    pub status: u16,
    pub headers: HashMap<String, String>,
    pub body: String,
}

pub fn report_worker_saturation(context: &AppContext) {
    metrics::gauge!("fetch_worker_available_permits")
        .set(context.concurrency_limit.available_permits() as f64);
}

pub async fn tick(context: Arc<AppContext>) -> Result<usize, FetchError> {
    report_worker_saturation(&context);

    let max_jobs = min(
        context.concurrency_limit.available_permits(),
        context.config.batch_size,
    );

    let jobs = context
        .worker
        .dequeue_jobs(&context.config.queue_served, max_jobs)
        .await?;

    let num_jobs = jobs.len();

    for job in jobs {
        let context = context.clone();
        // We grab job permits individually, so that as soon as a job is finished, the
        // permit to run another job is immediately available. This call should
        // never block, since we only ever dequeue as many jobs as we have permits
        // available.
        let permit = context
            .concurrency_limit
            .clone()
            .acquire_owned()
            .await
            .unwrap();
        tokio::spawn(async move {
            // TODO - since worker errors are never an indication of a fetch failure,
            // only of some internal worker issue, we should report unhealthy or fall
            // over or something here.
            if let Err(e) = run_job(context.clone(), job, permit).await {
                error!("Error running job: {:?}", e);
            }
        });
    }

    Ok(num_jobs)
}

// Mostly a thin wrapper to make ser/de a bit easier
struct FetchJob<'a> {
    _job: &'a Job,
    metadata: FetchMetadata,
    parameters: FetchParameters,
}

impl<'a> TryFrom<&'a Job> for FetchJob<'a> {
    type Error = FetchFailure;

    fn try_from(job: &'a Job) -> Result<Self, Self::Error> {
        let Some(parameters) = &job.parameters else {
            return Err(FetchFailure::new(
                FetchFailureKind::MissingParameters,
                "Job is missing parameters",
            ));
        };
        let parameters: FetchParameters = match serde_json::from_str(parameters) {
            Ok(p) => p,
            Err(e) => {
                return Err(FetchFailure::new(
                    FetchFailureKind::InvalidParameters,
                    format!("Failed to parse parameters: {}", e),
                ))
            }
        };
        let metadata = match &job.metadata {
            Some(m) => match serde_json::from_str(m) {
                Ok(m) => m,
                Err(_) => {
                    // If we can't decode the metadata, assume this is the first time we've seen the job
                    // TODO - this is maybe too lenient, I'm not sure.
                    FetchMetadata {
                        tries: 0,
                        trace: vec![],
                    }
                }
            },
            None => FetchMetadata {
                tries: 0,
                trace: vec![],
            },
        };
        Ok(Self {
            _job: job,
            metadata,
            parameters,
        })
    }
}

pub async fn run_job(
    context: Arc<AppContext>,
    job: Job,
    _permit: OwnedSemaphorePermit,
) -> Result<(), FetchError> {
    let parsed: FetchJob = match (&job).try_into() {
        Ok(p) => p,
        Err(e) => return dead_letter_job(&context.worker, job, vec![e]).await,
    };

    let method: http::Method = (&parsed.parameters.method).into();

    // Parsing errors are always dead letters - it /will/ fail every time, so dump it
    // TODO - We should probably decide whether to dead letter or return Failed on the basis of OnFinish,
    // in case the caller wants to do any cleanup on broken jobs
    let url: reqwest::Url = match (parsed.parameters.url).parse() {
        Ok(u) => u,
        Err(e) => {
            return dead_letter_job(
                &context.worker,
                job,
                vec![FetchFailure::new(
                    FetchFailureKind::InvalidParameters,
                    format!("Invalid url: {}", e),
                )],
            )
            .await;
        }
    };
    let headers: reqwest::header::HeaderMap =
        match (&parsed.parameters.headers.unwrap_or_default()).try_into() {
            Ok(h) => h,
            Err(e) => {
                return dead_letter_job(
                    &context.worker,
                    job,
                    vec![FetchFailure::new(
                        FetchFailureKind::InvalidParameters,
                        format!("Invalid headers: {}", e),
                    )],
                )
                .await;
            }
        };

    let body = reqwest::Body::from(parsed.parameters.body.unwrap_or_default());

    let send_fut = context
        .client
        .request(method, url)
        .headers(headers)
        .body(body)
        .send();

    let mut send_fut = Box::pin(send_fut);

    let start = Utc::now();
    let res = loop {
        tokio::select! {
            res = &mut send_fut => {
                break res
            }
            _ = tokio::time::sleep(Duration::milliseconds(HEARTBEAT_INTERVAL_MS).to_std().unwrap()) => {
                context.worker.heartbeat(job.id).await?;
            }
        }
    };

    // If we took, say, 25% of the heartbeat interval to send the request, we may as well heartbeat now
    if Utc::now() - start > Duration::milliseconds(HEARTBEAT_INTERVAL_MS / 4) {
        context.worker.heartbeat(job.id).await?;
    }

    let res = match res {
        Ok(r) => r,
        Err(e) => {
            return handle_fetch_failure(
                &context,
                &job,
                &parsed.metadata,
                parsed.parameters.max_tries.unwrap_or(DEFAULT_RETRIES),
                parsed.parameters.return_queue,
                parsed.parameters.on_finish.unwrap_or(DEFAULT_ON_FINISH),
                e,
            )
            .await
        }
    };

    // Grab the response metadata, since getting the body moves it
    let status = res.status();
    let headers: HashMap<String, String> = res
        .headers()
        .iter()
        .map(|(k, v)| {
            (
                k.as_str().to_string(),
                v.to_str().unwrap_or_default().to_string(),
            )
        })
        .collect();

    // We pre-emptively get the response body, because we include it in the failure trace, even if we got a failure status
    let body = first_n_bytes_of_response(
        &context.worker,
        &job,
        res,
        context.config.max_response_bytes,
    )
    .await?;
    let body = match body {
        Ok(b) => b,
        Err(e) => {
            // Tag the status and headers onto the failure
            let e = e.with_status(status.as_u16()).with_headers(headers);
            return handle_fetch_failure(
                &context,
                &job,
                &parsed.metadata,
                parsed.parameters.max_tries.unwrap_or(DEFAULT_RETRIES),
                parsed.parameters.return_queue,
                parsed.parameters.on_finish.unwrap_or(DEFAULT_ON_FINISH),
                e,
            )
            .await;
        }
    };

    // TODO - we should handle "retryable" and "permanent" failures differently, mostly
    // to be polite - retrying a permanent failure isn't a correctness problem, but it's
    // rude (and inefficient)
    if !status.is_success() {
        let failure = FetchFailure::failure_status(status)
            .with_body(body)
            .with_headers(headers);
        return handle_fetch_failure(
            &context,
            &job,
            &parsed.metadata,
            parsed.parameters.max_tries.unwrap_or(DEFAULT_RETRIES),
            parsed.parameters.return_queue,
            parsed.parameters.on_finish.unwrap_or(DEFAULT_ON_FINISH),
            failure,
        )
        .await;
    }

    let result = FetchResult::Success {
        response: FetchResponse {
            status: status.as_u16(),
            headers,
            body,
        },
    };

    complete_job(
        &context.worker,
        &job,
        parsed.parameters.return_queue,
        parsed.parameters.on_finish.unwrap_or(DEFAULT_ON_FINISH),
        result,
    )
    .await
}

// Checks if the retry limit has been reached, and does one of:
// - Schedule the job for retry, doing metadata bookkeeping
// - Complete the job, with the failure trace
#[allow(clippy::too_many_arguments)]
pub async fn handle_fetch_failure<F>(
    context: &AppContext,
    job: &Job,
    metadata: &FetchMetadata,
    max_tries: u32,
    return_queue: String,
    on_finish: OnFinish,
    failure: F,
) -> Result<(), FetchError>
where
    F: Into<FetchFailure>,
{
    let failure = failure.into();
    let mut metadata = metadata.clone();
    metadata.tries += 1;
    metadata.trace.push(failure);

    // TODO - right now we treat all failures as retryable, but we should probably be more aggressive in
    // culling retries for permanent failures (this is less of a correctness issue and more of an efficiency/
    // politeness one). We might also want to make backoff configurable.
    if metadata.tries < min(max_tries, context.config.max_retry_attempts) {
        let next_available =
            Utc::now() + (context.config.retry_backoff_base * (metadata.tries as i32));
        // We back off for at most an hour (since callers can configure max retries to be very high)
        let next_available = min(next_available, Utc::now() + Duration::hours(1));
        // Add some seconds of jitter
        let next_available =
            next_available + Duration::seconds((rand::random::<u64>() % 30) as i64);

        // Set us up for a retry - update metadata, reschedule, and put back in the queue we pulled from
        context
            .worker
            .set_metadata(job.id, Some(serde_json::to_string(&metadata)?))?;
        context.worker.set_state(job.id, JobState::Available)?;
        context.worker.set_queue(job.id, &job.queue_name)?;
        context.worker.set_scheduled_at(job.id, next_available)?;

        // We downgrade the priority of jobs that fail, so first attempts at jobs get better QoS
        context.worker.set_priority(job.id, job.priority + 1)?;

        context.worker.flush_job(job.id).await?;
    } else {
        // Complete the job, with a Failed result
        let result = FetchResult::Failure {
            trace: metadata.trace.clone(),
        };
        complete_job(&context.worker, job, return_queue, on_finish, result).await?;
    }

    Ok(())
}

// Complete the job, either because we got a good response, or because the job's retries
// have been exceeded.
pub async fn complete_job(
    worker: &Worker,
    job: &Job,
    return_queue: String,
    on_finish: OnFinish,
    result: FetchResult,
) -> Result<(), FetchError> {
    // If we fail any serde, we just want to flush to the DLQ and bail
    worker.set_state(job.id, JobState::Available)?;
    worker.set_queue(job.id, DEAD_LETTER_QUEUE)?;

    let is_success = result.is_success();

    let result = match serde_json::to_string(&result) {
        Ok(r) => r,
        Err(e) => {
            // Leave behind a hint for debugging
            worker.set_metadata(job.id, Some(format!("Failed to serialise result: {}", e)))?;
            worker.flush_job(job.id).await?;
            return Err(FetchError::SerdeError(e));
        }
    };

    worker.set_queue(job.id, &return_queue)?;

    match (is_success, on_finish) {
        (true, _) | (false, OnFinish::Return) => {
            worker.set_state(job.id, JobState::Available)?;
        }
        (false, OnFinish::Complete) => {
            worker.set_state(job.id, JobState::Failed)?;
        }
    }

    worker.set_parameters(job.id, Some(result))?;
    worker.set_metadata(job.id, None)?; // We're finished with the job, so clear our internal state
    worker.flush_job(job.id).await?;

    Ok(())
}

// This moves the job to a dead letter queue, and sets the state to Available (to prevent it
// from being deleted by the janitor). This is for debugging purposes, and only really jobs
// that have some parsing failure on dequeue end up here (as they indicate a programming error
// in the caller, or the worker)
pub async fn dead_letter_job(
    worker: &Worker,
    job: Job,
    errors: Vec<FetchFailure>,
) -> Result<(), FetchError> {
    worker.set_state(job.id, JobState::Available)?;
    worker.set_queue(job.id, DEAD_LETTER_QUEUE)?;

    let result = FetchResult::Failure { trace: errors };
    let result = match serde_json::to_string(&result) {
        Ok(r) => r,
        Err(e) => {
            worker.set_metadata(
                job.id,
                Some(format!(
                    "Failed to serialise result during DLQ write: {}",
                    e
                )),
            )?;
            worker.flush_job(job.id).await?;
            return Err(FetchError::SerdeError(e));
        }
    };

    worker.set_parameters(job.id, Some(result))?;

    worker.flush_job(job.id).await?;

    Ok(())
}

// Pulls the body, while maintaining the job heartbeat.
pub async fn first_n_bytes_of_response(
    worker: &Worker,
    job: &Job,
    response: Response,
    n: usize,
) -> Result<Result<String, FetchFailure>, FetchError> {
    let mut body = response.bytes_stream();
    // We deserialize into a Vec<u8>, and then parse to a string
    let mut buffer = Vec::with_capacity(n);

    worker.heartbeat(job.id).await?;

    loop {
        tokio::select! {
            chunk = body.next() => {
                let chunk = match chunk {
                    Some(Ok(c)) => c,
                    Some(Err(e)) => return Ok(Err(FetchFailure::from(e))),
                    None => break,
                };

                buffer.extend_from_slice(&chunk);

                if buffer.len() >= n {
                    return Ok(Err(
                        FetchFailure::new(FetchFailureKind::ResponseTooLarge, "Response too large")
                    ));
                };
            }
            _ = tokio::time::sleep(Duration::milliseconds(HEARTBEAT_INTERVAL_MS).to_std().unwrap()) => {}
        }
        // Heartbeat every time we get a new body chunk, or every HEARTBEAT_INTERVAL_MS
        worker.heartbeat(job.id).await?;
    }

    let Ok(body) = String::from_utf8(buffer) else {
        return Ok(Err(FetchFailure::new(
            FetchFailureKind::InvalidBody,
            "Body could not be parsed as utf8",
        )));
    };

    Ok(Ok(body))
}
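
For orientation, here is a minimal sketch of what a caller enqueues to drive this worker: a job on the served queue whose parameters field is a serialised FetchParameters. The team id, URL, and return queue are placeholders, not part of this commit (compare construct_job in the test utilities below):

use chrono::Utc;
use cyclotron_core::{base_ops::JobInit, manager::QueueManager};
use cyclotron_fetch::fetch::{FetchParameters, HttpMethod};

async fn enqueue_fetch(manager: &QueueManager) {
    let params = FetchParameters {
        url: "https://example.com/hook".to_string(), // placeholder target
        method: HttpMethod::Post,
        return_queue: "hog".to_string(), // where the FetchResult gets re-queued
        headers: None,
        body: Some("{}".to_string()),
        max_tries: None, // defaults to 3
        on_finish: None, // defaults to OnFinish::Return
    };
    let job = JobInit {
        team_id: 1,
        function_id: None,
        queue_name: "fetch".to_string(), // the queue this worker serves by default
        priority: 0,
        scheduled: Utc::now(), // available immediately
        vm_state: None,
        parameters: Some(serde_json::to_string(&params).unwrap()),
        metadata: None, // the fetch worker owns this field for retry bookkeeping
    };
    manager.create_job(job).await.expect("failed to create job");
}
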
3
rust/cyclotron-fetch/src/lib.rs
Normal file
@ -0,0 +1,3 @@
pub mod config;
pub mod context;
pub mod fetch;
98
rust/cyclotron-fetch/src/main.rs
Normal file
@ -0,0 +1,98 @@
use axum::{extract::State, routing::get, Router};
use common_metrics::setup_metrics_routes;
use cyclotron_fetch::{
    config::Config,
    context::AppContext,
    fetch::{tick, FetchError},
};
use envconfig::Envconfig;
use health::HealthRegistry;
use std::{future::ready, sync::Arc};
use tracing::{error, info};

async fn listen(app: Router, bind: String) -> Result<(), std::io::Error> {
    let listener = tokio::net::TcpListener::bind(bind).await?;

    axum::serve(listener, app).await?;

    Ok(())
}

// For axum's state stuff
#[derive(Clone)]
struct WorkerId(pub String);

pub fn app(liveness: HealthRegistry, worker_id: String) -> Router {
    Router::new()
        .route("/", get(index))
        .route("/_readiness", get(index))
        .route("/_liveness", get(move || ready(liveness.get_status())))
        .with_state(WorkerId(worker_id))
}

async fn index(State(worker_id): State<WorkerId>) -> String {
    format!("cyclotron-fetch {}", worker_id.0)
}

async fn worker_loop(context: AppContext) -> Result<(), FetchError> {
    let context = Arc::new(context);
    loop {
        context.liveness.report_healthy().await;
        let started = tick(context.clone()).await?;
        info!("started {} jobs", started);
        // This will happen if 1) there are no jobs or 2) we have no capacity to start new jobs. Either way, we should sleep for a bit
        if started == 0 {
            tokio::time::sleep(context.config.job_poll_interval.to_std().unwrap()).await;
        }
    }
}

#[tokio::main]
async fn main() {
    let config = Config::init_from_env().expect("failed to load configuration from env");
    tracing_subscriber::fmt::init();

    let liveness = HealthRegistry::new("liveness");

    let (app_config, pool_config) = config.to_components();
    let bind = format!("{}:{}", app_config.host, app_config.port);

    info!(
        "Fetch worker starting with ID {:?}, listening at {}",
        app_config.worker_id, bind
    );

    let worker_liveness = liveness
        .register(
            "worker".to_string(),
            (app_config.job_poll_interval * 4).to_std().unwrap(),
        )
        .await;

    let app = setup_metrics_routes(app(liveness, app_config.worker_id.clone()));

    let context = AppContext::create(app_config, pool_config, worker_liveness)
        .await
        .expect("failed to create app context");

    let http_server = tokio::spawn(listen(app, bind));

    let worker_loop = tokio::spawn(worker_loop(context));

    tokio::select! {
        res = worker_loop => {
            error!("worker loop exited");
            if let Err(e) = res {
                error!("worker loop failed with: {}", e)
            }
        }
        res = http_server => {
            error!("http server exited");
            if let Err(e) = res {
                error!("server failed with: {}", e)
            }
        }
    }

    info!("exiting");
}
293
rust/cyclotron-fetch/tests/fetch.rs
Normal file
@ -0,0 +1,293 @@
use std::{collections::HashMap, str::FromStr, sync::Arc};

use chrono::Duration;
use cyclotron_core::{manager::QueueManager, worker::Worker};
use cyclotron_fetch::fetch::{tick, FetchResult, HttpMethod};
use httpmock::{Method, MockServer};
use serde_json::json;
use sqlx::PgPool;
use utils::{
    construct_job, construct_params, get_app_test_context, make_immediately_available,
    wait_on_no_running, wait_on_return,
};

mod utils;

#[sqlx::test(migrations = "../cyclotron-core/migrations")]
pub async fn test_completes_fetch(db: PgPool) {
    let context = Arc::new(get_app_test_context(db.clone()).await);
    let producer = QueueManager::from_pool(db.clone());
    let return_worker = Worker::from_pool(db.clone());
    let server = MockServer::start();

    let mock = server.mock(|when, then| {
        when.method(Method::GET).path("/test");
        then.status(200).body("Hello, world!");
    });

    let params = construct_params(server.url("/test"), HttpMethod::Get);
    let job = construct_job(params);
    producer.create_job(job).await.unwrap();

    let started = tick(context).await.unwrap();

    assert_eq!(started, 1);

    let returned = wait_on_return(&return_worker, 1, false).await.unwrap();

    let response: FetchResult =
        serde_json::from_str(returned[0].parameters.as_ref().unwrap()).unwrap();

    let FetchResult::Success { response } = response else {
        panic!("Expected success response");
    };

    assert_eq!(response.status, 200);
    assert_eq!(response.body, "Hello, world!");

    mock.assert_hits(1);
}

#[sqlx::test(migrations = "../cyclotron-core/migrations")]
pub async fn test_returns_failure_after_retries(db: PgPool) {
    let context = Arc::new(get_app_test_context(db.clone()).await);
    let producer = QueueManager::from_pool(db.clone());
    let return_worker = Worker::from_pool(db.clone());
    let server = MockServer::start();

    let mock = server.mock(|when, then| {
        when.method(Method::GET).path("/test");
        then.status(500).body("test server error body");
    });

    let mut params = construct_params(server.url("/test"), HttpMethod::Get);
    params.max_tries = Some(2);

    let job = construct_job(params);
    producer.create_job(job).await.unwrap();

    // Tick twice for retry
    let started = tick(context.clone()).await.unwrap();
    assert_eq!(started, 1);
    wait_on_no_running(&db, Duration::milliseconds(100)).await;
    make_immediately_available(&db).await;
    let started = tick(context.clone()).await.unwrap();
    assert_eq!(started, 1);
    wait_on_no_running(&db, Duration::milliseconds(100)).await;

    let returned = wait_on_return(&return_worker, 1, false).await.unwrap();

    let response: FetchResult =
        serde_json::from_str(returned[0].parameters.as_ref().unwrap()).unwrap();

    let FetchResult::Failure { trace } = response else {
        panic!("Expected failure response");
    };

    assert!(trace.len() == 2);
    for attempt in trace {
        assert_eq!(attempt.status, Some(500));
        assert_eq!(attempt.body, Some("test server error body".to_string()));
    }

    mock.assert_hits(2);
}

#[sqlx::test(migrations = "../cyclotron-core/migrations")]
pub async fn fetch_discards_bad_metadata(db: PgPool) {
    let context = Arc::new(get_app_test_context(db.clone()).await);
    let producer = QueueManager::from_pool(db.clone());
    let return_worker = Worker::from_pool(db.clone());
    let server = MockServer::start();

    let mock = server.mock(|when, then| {
        when.method(Method::GET).path("/test");
        then.status(200).body("Hello, world!");
    });

    let params = construct_params(server.url("/test"), HttpMethod::Get);
    let mut job = construct_job(params);
    job.metadata = Some("bad json".to_string());
    producer.create_job(job).await.unwrap();

    let started = tick(context).await.unwrap();

    assert_eq!(started, 1);

    let returned = wait_on_return(&return_worker, 1, false).await.unwrap();

    let response: FetchResult =
        serde_json::from_str(returned[0].parameters.as_ref().unwrap()).unwrap();

    let FetchResult::Success { response } = response else {
        panic!("Expected success response");
    };

    assert_eq!(response.status, 200);
    assert_eq!(response.body, "Hello, world!");

    mock.assert_hits(1);
}

#[sqlx::test(migrations = "../cyclotron-core/migrations")]
pub async fn fetch_with_minimum_params_works(db: PgPool) {
    let context = Arc::new(get_app_test_context(db.clone()).await);
    let producer = QueueManager::from_pool(db.clone());
    let return_worker = Worker::from_pool(db.clone());
    let server = MockServer::start();

    let mock = server.mock(|when, then| {
        when.method(Method::GET).path("/test");
        then.status(200).body("Hello, world!");
    });

    let params = construct_params(server.url("/test"), HttpMethod::Get);
    let mut job = construct_job(params);

    let url = server.url("/test");
    let manual_params = json!({
        "url": url,
        "method": "GET",
        "return_queue": "return",
    })
    .to_string();

    job.parameters = Some(manual_params);

    producer.create_job(job).await.unwrap();

    let started = tick(context).await.unwrap();

    assert_eq!(started, 1);

    let returned = wait_on_return(&return_worker, 1, false).await.unwrap();

    let response: FetchResult =
        serde_json::from_str(returned[0].parameters.as_ref().unwrap()).unwrap();

    let FetchResult::Success { response } = response else {
        panic!("Expected success response");
    };

    assert_eq!(response.status, 200);
    assert_eq!(response.body, "Hello, world!");

    mock.assert_hits(1);
}

#[sqlx::test(migrations = "../cyclotron-core/migrations")]
pub async fn test_completes_fetch_with_headers(db: PgPool) {
    let context = Arc::new(get_app_test_context(db.clone()).await);
    let producer = QueueManager::from_pool(db.clone());
    let return_worker = Worker::from_pool(db.clone());
    let server = MockServer::start();

    let mock = server.mock(|when, then| {
        when.method(Method::GET)
            .path("/test")
            .header("X-Test", "test");
        then.status(200).body("Hello, world!");
    });

    let mut params = construct_params(server.url("/test"), HttpMethod::Get);
    let mut headers = HashMap::new();
    headers.insert("X-Test".to_string(), "test".to_string());
    params.headers = Some(headers);

    let job = construct_job(params);
    producer.create_job(job).await.unwrap();

    let started = tick(context).await.unwrap();

    assert_eq!(started, 1);

    let returned = wait_on_return(&return_worker, 1, false).await.unwrap();

    let response: FetchResult =
        serde_json::from_str(returned[0].parameters.as_ref().unwrap()).unwrap();

    let FetchResult::Success { response } = response else {
        panic!("Expected success response");
    };

    assert_eq!(response.status, 200);
    assert_eq!(response.body, "Hello, world!");

    mock.assert_hits(1);
}

#[sqlx::test(migrations = "../cyclotron-core/migrations")]
pub async fn test_completes_fetch_with_body(db: PgPool) {
    let context = Arc::new(get_app_test_context(db.clone()).await);
    let producer = QueueManager::from_pool(db.clone());
    let return_worker = Worker::from_pool(db.clone());
    let server = MockServer::start();

    let mock = server.mock(|when, then| {
        when.method(Method::POST).path("/test").body("test body");
        then.status(200).body("Hello, world!");
    });

    let mut params = construct_params(server.url("/test"), HttpMethod::Post);
    params.body = Some("test body".to_string());

    let job = construct_job(params);
    producer.create_job(job).await.unwrap();

    let started = tick(context).await.unwrap();

    assert_eq!(started, 1);

    let returned = wait_on_return(&return_worker, 1, false).await.unwrap();

    let response: FetchResult =
        serde_json::from_str(returned[0].parameters.as_ref().unwrap()).unwrap();

    let FetchResult::Success { response } = response else {
        panic!("Expected success response");
    };

    assert_eq!(response.status, 200);
    assert_eq!(response.body, "Hello, world!");

    mock.assert_hits(1);
}

#[sqlx::test(migrations = "../cyclotron-core/migrations")]
pub async fn test_completes_fetch_with_vm_state(db: PgPool) {
    let context = Arc::new(get_app_test_context(db.clone()).await);
    let producer = QueueManager::from_pool(db.clone());
    let return_worker = Worker::from_pool(db.clone());
    let server = MockServer::start();

    let mock = server.mock(|when, then| {
        when.method(Method::GET).path("/test");
        then.status(200).body("Hello, world!");
    });

    let params = construct_params(server.url("/test"), HttpMethod::Get);
    let mut job = construct_job(params);
    job.vm_state = Some(json!({"test": "state"}).to_string());
    producer.create_job(job).await.unwrap();

    let started = tick(context).await.unwrap();

    assert_eq!(started, 1);

    let returned = wait_on_return(&return_worker, 1, true).await.unwrap();

    let state = serde_json::Value::from_str(returned[0].vm_state.as_ref().unwrap()).unwrap();
    assert_eq!(state, json!({"test": "state"}));

    let response: FetchResult =
        serde_json::from_str(returned[0].parameters.as_ref().unwrap()).unwrap();

    let FetchResult::Success { response } = response else {
        panic!("Expected success response");
    };

    assert_eq!(response.status, 200);
    assert_eq!(response.body, "Hello, world!");

    mock.assert_hits(1);
}
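
On the consumer side, the return queue holds the serialised FetchResult in the job's parameters. With the serde tagging above, that JSON looks like {"status":"success","response":{"status":200,"headers":{...},"body":"..."}} or {"status":"failure","trace":[...]}. A minimal decoding sketch (the queue name is illustrative):

use cyclotron_core::{error::QueueError, worker::Worker};
use cyclotron_fetch::fetch::FetchResult;

async fn read_results(worker: &Worker) -> Result<(), QueueError> {
    for job in worker.dequeue_jobs("return", 10).await? {
        // complete_job always writes the result into parameters before flushing
        let raw = job.parameters.as_deref().expect("fetch worker always sets parameters");
        match serde_json::from_str::<FetchResult>(raw).expect("invalid FetchResult") {
            FetchResult::Success { response } => println!("{}: {}", response.status, response.body),
            FetchResult::Failure { trace } => println!("failed after {} attempts", trace.len()),
        }
    }
    Ok(())
}
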
127
rust/cyclotron-fetch/tests/utils.rs
Normal file
@ -0,0 +1,127 @@
use std::sync::Arc;

use chrono::{Duration, Utc};
use cyclotron_core::{
    base_ops::{Job, JobInit},
    error::QueueError,
    worker::Worker,
};
use cyclotron_fetch::{
    config::AppConfig,
    context::AppContext,
    fetch::{FetchParameters, HttpMethod},
};
use sqlx::PgPool;
use tokio::sync::Semaphore;

const FETCH_QUEUE: &str = "fetch";
const RETURN_QUEUE: &str = "return";

pub async fn get_app_test_context(db: PgPool) -> AppContext {
    let worker = Worker::from_pool(db.clone());
    let client = reqwest::Client::new();
    let concurrency_limit = Arc::new(Semaphore::new(1));
    let health = health::HealthRegistry::new("test");
    let liveness = health
        .register("test".to_string(), Duration::seconds(30).to_std().unwrap())
        .await;

    let config = AppConfig {
        fetch_timeout: Duration::seconds(10),
        concurrent_requests_limit: 1,
        host: "localhost".to_string(),
        port: 16,
        worker_id: "test".to_string(),
        job_poll_interval: Duration::seconds(10),
        max_retry_attempts: 3,
        queue_served: FETCH_QUEUE.to_string(),
        batch_size: 1000,
        max_response_bytes: 1024 * 1024,
        retry_backoff_base: Duration::milliseconds(1000),
        allow_internal_ips: true,
    };

    AppContext {
        worker,
        client,
        concurrency_limit,
        liveness,
        config,
    }
}

pub fn construct_params(url: String, method: HttpMethod) -> FetchParameters {
    FetchParameters {
        url,
        method,
        return_queue: RETURN_QUEUE.to_string(),
        headers: None,
        body: None,
        max_tries: None,
        on_finish: None,
    }
}

pub fn construct_job(parameters: FetchParameters) -> JobInit {
    JobInit {
        team_id: 1,
        queue_name: FETCH_QUEUE.to_string(),
        priority: 0,
        scheduled: Utc::now() - Duration::seconds(1),
        function_id: None,
        vm_state: None,
        parameters: Some(serde_json::to_string(&parameters).unwrap()),
        metadata: None,
    }
}

pub async fn wait_on_return(
    worker: &Worker,
    count: usize,
    with_vm: bool,
) -> Result<Vec<Job>, QueueError> {
    let timeout = Duration::seconds(1);
    let start = Utc::now();
    let mut returned = vec![];
    while start + timeout > Utc::now() {
        let mut jobs = if with_vm {
            worker.dequeue_with_vm_state(RETURN_QUEUE, 1).await?
        } else {
            worker.dequeue_jobs(RETURN_QUEUE, 1).await?
        };
        returned.append(&mut jobs);
        if returned.len() == count {
            return Ok(returned);
        }
        if returned.len() > count {
            panic!("Too many jobs returned");
        }
    }
    panic!("Timeout waiting for jobs to return");
}

pub async fn wait_on_no_running(pool: &PgPool, max_time: Duration) {
    let start = Utc::now();
    loop {
        let running: i64 =
            sqlx::query_scalar("SELECT COUNT(*) FROM cyclotron_jobs WHERE state = 'running'")
                .fetch_one(pool)
                .await
                .unwrap();
        if running == 0 {
            return;
        }
        if Utc::now() - start > max_time {
            panic!("Timeout waiting for jobs to finish");
        }
    }
}

pub async fn make_immediately_available(pool: &PgPool) {
    sqlx::query(
        "UPDATE cyclotron_jobs SET scheduled = NOW() - INTERVAL '1 second' WHERE state = 'available'",
    )
    .execute(pool)
    .await
    .unwrap();
}
22 rust/cyclotron-janitor/Cargo.toml Normal file
@ -0,0 +1,22 @@
[package]
name = "cyclotron-janitor"
version = "0.1.0"
edition = "2021"

[lints]
workspace = true

[dependencies]
tracing-subscriber = { workspace = true }
sqlx = { workspace = true }
chrono = { workspace = true }
tokio = { workspace = true }
tracing = { workspace = true }
uuid = { workspace = true }
envconfig = { workspace = true }
axum = { workspace = true }
eyre = { workspace = true }
metrics = { workspace = true }
cyclotron-core = { path = "../cyclotron-core" }
common-metrics = { path = "../common/metrics" }
health = { path = "../common/health" }
23 rust/cyclotron-janitor/bin/entrypoint.sh Executable file
@ -0,0 +1,23 @@
#!/bin/bash
set -e

# All possible env vars are set here; tune them as you like.
# Names match the envconfig mappings in src/config.rs.
export RUST_LOG="INFO"
export BIND_HOST="::"
export BIND_PORT="3302"
export DATABASE_URL="postgres://posthog:posthog@localhost:5432/cyclotron"
export CLEANUP_INTERVAL_SECS="10"
export PG_MAX_CONNECTIONS="10"
export PG_MIN_CONNECTIONS="1"
export PG_ACQUIRE_TIMEOUT_SECONDS="5"
export PG_MAX_LIFETIME_SECONDS="300"
export PG_IDLE_TIMEOUT_SECONDS="60"
export JANITOR_ID="test-janitor"
export JANITOR_MAX_TOUCHES="2"
export JANITOR_STALL_TIMEOUT_SECONDS="30"

# Uncomment this to have the database be reset every time you start the janitor
# sqlx database reset -y --source ../cyclotron-core/migrations
sqlx migrate run --source ../cyclotron-core/migrations

cargo run --release
83 rust/cyclotron-janitor/src/config.rs Normal file
@ -0,0 +1,83 @@
use chrono::Duration;

use cyclotron_core::PoolConfig;
use envconfig::Envconfig;
use uuid::Uuid;

#[derive(Envconfig)]
pub struct Config {
    #[envconfig(from = "BIND_HOST", default = "::")]
    pub host: String,

    #[envconfig(from = "BIND_PORT", default = "3303")]
    pub port: u16,

    #[envconfig(default = "postgres://posthog:posthog@localhost:5432/cyclotron")]
    pub database_url: String,

    #[envconfig(default = "30")]
    pub cleanup_interval_secs: u64,

    #[envconfig(default = "10")]
    pub pg_max_connections: u32,

    #[envconfig(default = "1")]
    pub pg_min_connections: u32,

    #[envconfig(default = "30")]
    pub pg_acquire_timeout_seconds: u64,

    #[envconfig(default = "300")]
    pub pg_max_lifetime_seconds: u64,

    #[envconfig(default = "60")]
    pub pg_idle_timeout_seconds: u64,

    // Generally, this should be equivalent to a "shard id", as only one janitor should be running
    // per shard
    pub janitor_id: Option<String>,

    #[envconfig(default = "10")]
    pub janitor_max_touches: i16,

    #[envconfig(default = "60")]
    pub janitor_stall_timeout_seconds: u16,
}

impl Config {
    pub fn get_janitor_config(&self) -> JanitorConfig {
        let pool_config = PoolConfig {
            db_url: self.database_url.clone(),
            max_connections: Some(self.pg_max_connections),
            min_connections: Some(self.pg_min_connections),
            acquire_timeout_seconds: Some(self.pg_acquire_timeout_seconds),
            max_lifetime_seconds: Some(self.pg_max_lifetime_seconds),
            idle_timeout_seconds: Some(self.pg_idle_timeout_seconds),
        };

        let settings = JanitorSettings {
            stall_timeout: Duration::seconds(self.janitor_stall_timeout_seconds as i64),
            max_touches: self.janitor_max_touches,
            id: self
                .janitor_id
                .clone()
                .unwrap_or_else(|| Uuid::now_v7().to_string()),
        };

        JanitorConfig {
            pool: pool_config,
            settings,
        }
    }
}

pub struct JanitorConfig {
    pub pool: PoolConfig,
    pub settings: JanitorSettings,
}

pub struct JanitorSettings {
    pub stall_timeout: Duration,
    pub max_touches: i16,
    pub id: String,
}
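One non-obvious detail above: envconfig derives each environment variable name from the upper-cased field name unless a `from` override is given, so `cleanup_interval_secs` reads `CLEANUP_INTERVAL_SECS`, while host and port are explicitly mapped to `BIND_HOST`/`BIND_PORT`. A quick sketch of exercising that mapping, assuming the `Config` type above (the values are placeholders):

use envconfig::Envconfig;

// Sketch only: set a couple of env vars and load the janitor Config above.
// JANITOR_ID is deliberately left unset, so get_janitor_config() falls back
// to a fresh UUIDv7 for the janitor id.
fn load_config() -> Config {
    std::env::set_var("BIND_PORT", "3999");
    std::env::set_var("CLEANUP_INTERVAL_SECS", "15");
    Config::init_from_env().expect("failed to load configuration from env")
}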
136 rust/cyclotron-janitor/src/janitor.rs Normal file
@ -0,0 +1,136 @@
use chrono::Utc;
use cyclotron_core::{
    error::QueueError,
    janitor_ops::{
        delete_completed_jobs, delete_failed_jobs, delete_poison_pills, reset_stalled_jobs,
    },
};
use sqlx::PgPool;
use tracing::{info, warn};

use crate::config::{JanitorConfig, JanitorSettings};

// The janitor reports its own metrics; this struct is mostly for testing purposes
#[derive(Debug, Clone, Eq, PartialEq)]
pub struct CleanupResult {
    pub completed: u64,
    pub failed: u64,
    pub poisoned: u64,
    pub stalled: u64,
}

pub struct Janitor {
    pool: PgPool,
    settings: JanitorSettings,
    metrics_labels: Vec<(&'static str, String)>,
}

impl Janitor {
    pub async fn new(config: JanitorConfig) -> Result<Self, QueueError> {
        let settings = config.settings;
        let pool = config.pool.connect().await?;

        let metrics_labels = vec![("janitor_id", settings.id.clone())];

        Ok(Self {
            pool,
            settings,
            metrics_labels,
        })
    }

    pub fn from_pool(pool: PgPool, settings: JanitorSettings) -> Self {
        let metrics_labels = vec![("janitor_id", settings.id.clone())];
        Self {
            pool,
            settings,
            metrics_labels,
        }
    }

    // TODO - right now, the metrics produced here are pretty rough - just per shard, without
    // any per-queue or per-worker-type breakdown. It'd be nice to add that, eventually.
    pub async fn run_once(&self) -> Result<CleanupResult, QueueError> {
        info!("Running janitor loop");
        let start = Utc::now();
        metrics::counter!("cyclotron_janitor_run_starts", &self.metrics_labels).increment(1);

        let before = Utc::now();
        let completed = delete_completed_jobs(&self.pool).await?;
        let taken = Utc::now() - before;
        metrics::histogram!(
            "cyclotron_janitor_completed_jobs_cleanup_duration_ms",
            &self.metrics_labels
        )
        .record(taken.num_milliseconds() as f64);
        metrics::counter!(
            "cyclotron_janitor_completed_jobs_deleted",
            &self.metrics_labels
        )
        .increment(completed);

        let before = Utc::now();
        let failed = delete_failed_jobs(&self.pool).await?;
        let taken = Utc::now() - before;
        metrics::histogram!(
            "cyclotron_janitor_failed_jobs_cleanup_duration_ms",
            &self.metrics_labels
        )
        .record(taken.num_milliseconds() as f64);
        metrics::counter!(
            "cyclotron_janitor_failed_jobs_deleted",
            &self.metrics_labels
        )
        .increment(failed);

        // Note - if we reset stalled jobs before deleting poison pills, we'll never delete poison
        // pills, since resetting a stalled job clears the locked state.
        let before = Utc::now();
        let poisoned = delete_poison_pills(
            &self.pool,
            self.settings.stall_timeout,
            self.settings.max_touches,
        )
        .await?;
        let taken = Utc::now() - before;
        metrics::histogram!(
            "cyclotron_janitor_poison_pills_cleanup_duration_ms",
            &self.metrics_labels
        )
        .record(taken.num_milliseconds() as f64);
        metrics::counter!(
            "cyclotron_janitor_poison_pills_deleted",
            &self.metrics_labels
        )
        .increment(poisoned);
        if poisoned > 0 {
            warn!("Deleted {} poison pills", poisoned);
        }

        let before = Utc::now();
        let stalled = reset_stalled_jobs(&self.pool, self.settings.stall_timeout).await?;
        let taken = Utc::now() - before;
        metrics::histogram!(
            "cyclotron_janitor_stalled_jobs_reset_duration_ms",
            &self.metrics_labels
        )
        .record(taken.num_milliseconds() as f64);
        metrics::counter!("cyclotron_janitor_stalled_jobs_reset", &self.metrics_labels)
            .increment(stalled);
        if stalled > 0 {
            warn!("Reset {} stalled jobs", stalled);
        }

        metrics::counter!("cyclotron_janitor_run_ends", &self.metrics_labels).increment(1);
        let elapsed = Utc::now() - start;
        metrics::histogram!("cyclotron_janitor_run_duration_ms", &self.metrics_labels)
            .record(elapsed.num_milliseconds() as f64);
        info!("Janitor loop complete");
        Ok(CleanupResult {
            completed,
            failed,
            poisoned,
            stalled,
        })
    }
}
2 rust/cyclotron-janitor/src/lib.rs Normal file
@ -0,0 +1,2 @@
pub mod config;
pub mod janitor;
105 rust/cyclotron-janitor/src/main.rs Normal file
@ -0,0 +1,105 @@
use axum::{extract::State, routing::get, Router};
use common_metrics::setup_metrics_routes;
use cyclotron_janitor::{config::Config, janitor::Janitor};
use envconfig::Envconfig;
use eyre::Result;
use health::{HealthHandle, HealthRegistry};
use std::{future::ready, time::Duration};
use tracing::{error, info};

/// Most of this stuff is stolen pretty shamelessly from the rustyhook janitor. It'll diverge more
/// once we introduce the management command stuff, but for now it's a good starting point.

async fn cleanup_loop(janitor: Janitor, liveness: HealthHandle, interval_secs: u64) -> Result<()> {
    let mut interval = tokio::time::interval(Duration::from_secs(interval_secs));

    loop {
        interval.tick().await;

        if let Err(e) = janitor.run_once().await {
            // don't bother reporting unhealthy - a few times around this loop will put us in a stalled state
            error!("janitor failed cleanup with: {}", e);
        } else {
            liveness.report_healthy().await;
        }
    }
}

async fn listen(app: Router, bind: String) -> Result<()> {
    let listener = tokio::net::TcpListener::bind(bind).await?;

    axum::serve(listener, app).await?;

    Ok(())
}

// For axum's state stuff
#[derive(Clone)]
struct JanitorId(pub String);

pub fn app(liveness: HealthRegistry, janitor_id: String) -> Router {
    Router::new()
        .route("/", get(index))
        .route("/_readiness", get(index))
        .route("/_liveness", get(move || ready(liveness.get_status())))
        .with_state(JanitorId(janitor_id))
}

async fn index(State(janitor_id): State<JanitorId>) -> String {
    format!("cyclotron janitor {}", janitor_id.0)
}

#[tokio::main]
async fn main() {
    let config = Config::init_from_env().expect("failed to load configuration from env");
    tracing_subscriber::fmt::init();

    let liveness = HealthRegistry::new("liveness");

    let janitor_config = config.get_janitor_config();

    let janitor_id = janitor_config.settings.id.clone();
    let bind = format!("{}:{}", config.host, config.port);

    info!(
        "Starting janitor with ID {:?}, listening at {}",
        janitor_id, bind
    );

    let janitor = Janitor::new(janitor_config)
        .await
        .expect("failed to create janitor");

    let janitor_liveness = liveness
        .register(
            "janitor".to_string(),
            Duration::from_secs(config.cleanup_interval_secs * 4),
        )
        .await;

    let janitor_loop = tokio::spawn(cleanup_loop(
        janitor,
        janitor_liveness,
        config.cleanup_interval_secs,
    ));

    let app = setup_metrics_routes(app(liveness, janitor_id));
    let http_server = tokio::spawn(listen(app, bind));

    tokio::select! {
        res = janitor_loop => {
            error!("janitor loop exited");
            if let Err(e) = res {
                error!("janitor failed with: {}", e)
            }
        }
        res = http_server => {
            error!("http server exited");
            if let Err(e) = res {
                error!("server failed with: {}", e)
            }
        }
    }

    info!("exiting");
}
226 rust/cyclotron-janitor/tests/janitor.rs Normal file
@ -0,0 +1,226 @@
use chrono::{Duration, Utc};
use cyclotron_core::{
    base_ops::{JobInit, JobState},
    manager::QueueManager,
    worker::Worker,
};
use cyclotron_janitor::{config::JanitorSettings, janitor::Janitor};
use sqlx::PgPool;
use uuid::Uuid;

#[sqlx::test(migrations = "../cyclotron-core/migrations")]
async fn janitor_test(db: PgPool) {
    let worker = Worker::from_pool(db.clone());
    let manager = QueueManager::from_pool(db.clone());

    // Purposefully MUCH smaller than would be used in production, so
    // we can simulate stalled or poison jobs quickly
    let stall_timeout = Duration::milliseconds(10);
    let max_touches = 3;

    let settings = JanitorSettings {
        stall_timeout,
        max_touches,
        id: "test_janitor".to_string(),
    };
    let janitor = Janitor::from_pool(db.clone(), settings);

    let now = Utc::now() - Duration::seconds(10);
    let queue_name = "default".to_string();

    let job_init = JobInit {
        team_id: 1,
        queue_name: queue_name.clone(),
        priority: 0,
        scheduled: now,
        function_id: Some(Uuid::now_v7()),
        vm_state: None,
        parameters: None,
        metadata: None,
    };

    // First test - if we mark a job as completed, the janitor will clean it up
    manager.create_job(job_init.clone()).await.unwrap();
    let job = worker
        .dequeue_jobs(&queue_name, 1)
        .await
        .unwrap()
        .pop()
        .unwrap();

    worker.set_state(job.id, JobState::Completed).unwrap();
    worker.flush_job(job.id).await.unwrap();

    let result = janitor.run_once().await.unwrap();
    assert_eq!(result.completed, 1);
    assert_eq!(result.failed, 0);
    assert_eq!(result.poisoned, 0);
    assert_eq!(result.stalled, 0);

    // Second test - if we mark a job as failed, the janitor will clean it up
    manager.create_job(job_init.clone()).await.unwrap();
    let job = worker
        .dequeue_jobs(&queue_name, 1)
        .await
        .unwrap()
        .pop()
        .unwrap();

    worker.set_state(job.id, JobState::Failed).unwrap();
    worker.flush_job(job.id).await.unwrap();

    let result = janitor.run_once().await.unwrap();
    assert_eq!(result.completed, 0);
    assert_eq!(result.failed, 1);
    assert_eq!(result.poisoned, 0);
    assert_eq!(result.stalled, 0);

    // Third test - if we pick up a job, and then hold it for longer than
    // the stall timeout, the janitor will reset it. After this, the worker
    // cannot flush updates to the job, and must re-dequeue it.

    manager.create_job(job_init.clone()).await.unwrap();
    let job = worker
        .dequeue_jobs(&queue_name, 1)
        .await
        .unwrap()
        .pop()
        .unwrap();

    // First, cleanup won't do anything
    let result = janitor.run_once().await.unwrap();
    assert_eq!(result.completed, 0);
    assert_eq!(result.failed, 0);
    assert_eq!(result.poisoned, 0);
    assert_eq!(result.stalled, 0);

    // Then we stall on the job
    tokio::time::sleep(stall_timeout.to_std().unwrap() * 2).await;

    // Now, cleanup will reset the job
    let result = janitor.run_once().await.unwrap();
    assert_eq!(result.completed, 0);
    assert_eq!(result.failed, 0);
    assert_eq!(result.poisoned, 0);
    assert_eq!(result.stalled, 1);

    // Now, the worker can't flush the job
    worker.set_state(job.id, JobState::Completed).unwrap();
    let result = worker.flush_job(job.id).await;
    assert!(result.is_err());

    // But if we re-dequeue the job, we can flush it
    let job = worker
        .dequeue_jobs(&queue_name, 1)
        .await
        .unwrap()
        .pop()
        .unwrap();
    worker.set_state(job.id, JobState::Completed).unwrap();
    worker.flush_job(job.id).await.unwrap();

    janitor.run_once().await.unwrap(); // Clean up the completed job to reset for the next test

    // Fourth test - if a worker holds a job for longer than the stall
    // time, but calls heartbeat, the job will not be reset

    manager.create_job(job_init.clone()).await.unwrap();
    let job = worker
        .dequeue_jobs(&queue_name, 1)
        .await
        .unwrap()
        .pop()
        .unwrap();

    let start = tokio::time::Instant::now();
    loop {
        worker.heartbeat(job.id).await.unwrap();
        tokio::time::sleep(Duration::milliseconds(1).to_std().unwrap()).await;
        if start.elapsed() > stall_timeout.to_std().unwrap() * 2 {
            break;
        }
    }

    let result = janitor.run_once().await.unwrap();
    assert_eq!(result.completed, 0);
    assert_eq!(result.failed, 0);
    assert_eq!(result.poisoned, 0);
    assert_eq!(result.stalled, 0);

    // The worker can still flush the job
    worker.set_state(job.id, JobState::Completed).unwrap();
    worker.flush_job(job.id).await.unwrap();

    // and now cleanup will work
    let result = janitor.run_once().await.unwrap();
    assert_eq!(result.completed, 1);
    assert_eq!(result.failed, 0);
    assert_eq!(result.poisoned, 0);
    assert_eq!(result.stalled, 0);

    // Fifth test - if a job stalls more than max_touches
    // it will be marked as poisoned and deleted

    manager.create_job(job_init.clone()).await.unwrap();
    let mut job = worker
        .dequeue_jobs(&queue_name, 1)
        .await
        .unwrap()
        .pop()
        .unwrap();

    for _ in 0..max_touches {
        tokio::time::sleep(stall_timeout.to_std().unwrap() * 2).await;
        let result = janitor.run_once().await.unwrap();
        assert_eq!(result.completed, 0);
        assert_eq!(result.failed, 0);
        assert_eq!(result.poisoned, 0);
        assert_eq!(result.stalled, 1);

        // assert we can't update the job (flush and heartbeat fail)
        worker.set_state(job.id, JobState::Completed).unwrap();
        let result = worker.heartbeat(job.id).await;
        assert!(result.is_err());
        let result = worker.flush_job(job.id).await;
        assert!(result.is_err());

        // re-dequeue the job
        job = worker
            .dequeue_jobs(&queue_name, 1)
            .await
            .unwrap()
            .pop()
            .unwrap();
    }
    // At this point, the "janitor touches" on the job is 3 (it's been stalled and reset 3 times), so one more cleanup loop will delete it

    // Now stall one more time, and on cleanup, we should see the job was considered poison and deleted
    tokio::time::sleep(stall_timeout.to_std().unwrap() * 2).await;
    let result: cyclotron_janitor::janitor::CleanupResult = janitor.run_once().await.unwrap();
    assert_eq!(result.completed, 0);
    assert_eq!(result.failed, 0);
    assert_eq!(result.poisoned, 1);
    assert_eq!(result.stalled, 0);

    // The worker can't flush the job
    worker.set_state(job.id, JobState::Completed).unwrap();
    let result = worker.flush_job(job.id).await;
    assert!(result.is_err());

    // Sixth test - the janitor can operate on multiple jobs at once
    manager.create_job(job_init.clone()).await.unwrap();
    manager.create_job(job_init.clone()).await.unwrap();
    let jobs = worker.dequeue_jobs(&queue_name, 2).await.unwrap();

    worker.set_state(jobs[0].id, JobState::Completed).unwrap();
    worker.set_state(jobs[1].id, JobState::Failed).unwrap();

    worker.flush_job(jobs[0].id).await.unwrap();
    worker.flush_job(jobs[1].id).await.unwrap();

    let result = janitor.run_once().await.unwrap();
    assert_eq!(result.completed, 1);
    assert_eq!(result.failed, 1);
    assert_eq!(result.poisoned, 0);
    assert_eq!(result.stalled, 0);
}
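The test above walks the entire lifecycle the janitor enforces. As a compact restatement of the per-pass rules it checks, here is a sketch phrased as a pure decision function; this is an intuition aid under assumed field names, not code from this commit.

// Condensed restatement of the janitor invariants exercised by the test:
// terminal jobs are deleted, stalled jobs are reset until they exhaust
// max_touches, after which they are treated as poison pills and deleted.
struct JobView {
    state: &'static str,       // "available" | "running" | "completed" | "failed"
    stalled: bool,             // lock older than the stall timeout
    janitor_touch_count: i16,  // how many times the janitor has reset this job
}

enum JanitorAction {
    Delete,           // completed/failed jobs, and stalled jobs past max_touches
    ResetToAvailable, // stalled jobs with touches remaining (increments the count)
    LeaveAlone,       // healthy jobs, including stalled-but-heartbeating ones
}

fn decide(job: &JobView, max_touches: i16) -> JanitorAction {
    match (job.state, job.stalled) {
        ("completed", _) | ("failed", _) => JanitorAction::Delete,
        ("running", true) if job.janitor_touch_count >= max_touches => JanitorAction::Delete,
        ("running", true) => JanitorAction::ResetToAvailable,
        _ => JanitorAction::LeaveAlone,
    }
}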
7 rust/cyclotron-node/.gitignore vendored Normal file
@ -0,0 +1,7 @@
target
index.node
**/node_modules
**/.DS_Store
npm-debug.log*
cargo.log
cross.log
dist/
22 rust/cyclotron-node/Cargo.toml Normal file
@ -0,0 +1,22 @@
[package]
name = "cyclotron-node"
version = "0.1.0"
edition = "2021"
exclude = ["index.node"]

[lints]
workspace = true

[lib]
crate-type = ["cdylib"]

[dependencies]
cyclotron-core = { path = "../cyclotron-core" }
neon = { workspace = true }
once_cell = { workspace = true }
tokio = { workspace = true }
serde_json = { workspace = true }
serde = { workspace = true }
uuid = { workspace = true }
chrono = { workspace = true }
144 rust/cyclotron-node/examples/basic.js Normal file
@ -0,0 +1,144 @@
const assert = require('assert')
const cyclotron = require('../.')
const crypto = require('crypto')

// Set of available job states
const JOB_STATES = Object.freeze({
    AVAILABLE: 'available',
    RUNNING: 'running',
    FAILED: 'failed',
    COMPLETED: 'completed',
})

const AVAILABLE_WORKERS = Object.freeze({
    FETCH: 'fetch',
    HOG: 'hog',
})

async function main() {
    let poolConfig = {
        db_url: 'postgresql://posthog:posthog@localhost:5432/cyclotron',
    }

    let managerConfig = {
        shards: [poolConfig],
    }

    // Most processes will only need to do one of these, but we can do both here for demonstration purposes
    await cyclotron.initWorker(JSON.stringify(poolConfig))
    await cyclotron.initManager(JSON.stringify(managerConfig))

    // The maybe* inits won't throw on re-calling, and are also short-circuiting to be almost free,
    // so they're safe to call frequently (although I still wouldn't call them in a loop)
    await cyclotron.maybeInitWorker(JSON.stringify(poolConfig))
    await cyclotron.maybeInitManager(JSON.stringify(managerConfig))

    let five_minutes_ago = new Date(new Date().getTime() - 5 * 60000).toISOString()
    let queue_name = 'default'

    let job_1 = {
        team_id: 1,
        queue_name,
        priority: 0,
        scheduled: five_minutes_ago,
        function_id: crypto.randomUUID(), // Is nullable
        vm_state: null,
        parameters: null,
        metadata: null,
    }

    let job_2 = {
        team_id: 1,
        queue_name,
        priority: 1,
        scheduled: five_minutes_ago,
        function_id: crypto.randomUUID(), // Is nullable
        vm_state: null,
        parameters: null,
        metadata: null,
    }

    await cyclotron.createJob(JSON.stringify(job_1))
    await cyclotron.createJob(JSON.stringify(job_2))

    // Jobs (as well as any other 'complex' data shape) are serialized across the API boundary,
    // because that's (according to the neon maintainers) /actually faster/ than doing a bunch
    // of cross-runtime pointer chasing.
    let jobs = JSON.parse(await cyclotron.dequeueJobs(queue_name, 2))
    assert(jobs.length === 2)
    assert(jobs[0].function_id === job_1.function_id)
    assert(jobs[1].function_id === job_2.function_id)

    job_1 = jobs[0]
    job_2 = jobs[1]

    // All of these throw if the job hasn't been dequeued by the worker created when init_worker was called,
    // or if there's some serde error - generally, interacting with the cyclotron should involve try/catch in
    // some far outer catch. We can iterate on this API to make it more ergonomic with time, but
    // my js/ts is... rusty (co-pilot wrote this joke)
    cyclotron.setState(job_1.id, JOB_STATES.AVAILABLE)
    cyclotron.setState(job_2.id, JOB_STATES.AVAILABLE)

    cyclotron.setQueue(job_1.id, 'non-default')
    cyclotron.setQueue(job_2.id, 'non-default')

    // Priority is lowest-first, so this means we can assert that job_2 will be returned first on subsequent dequeue_jobs
    cyclotron.setPriority(job_1.id, 2)
    cyclotron.setPriority(job_2.id, 1)

    let ten_minutes_ago = new Date(new Date().getTime() - 10 * 60000).toISOString()
    cyclotron.setScheduledAt(job_1.id, ten_minutes_ago)
    cyclotron.setScheduledAt(job_2.id, ten_minutes_ago)

    cyclotron.setVmState(job_1.id, JSON.stringify({ state: 'running' }))
    cyclotron.setVmState(job_2.id, JSON.stringify({ state: 'running' }))

    cyclotron.setParameters(job_1.id, JSON.stringify({ parameters: 'running' }))
    cyclotron.setParameters(job_2.id, JSON.stringify({ parameters: 'running' }))

    cyclotron.setMetadata(job_1.id, JSON.stringify({ metadata: 'running' }))
    cyclotron.setMetadata(job_2.id, JSON.stringify({ metadata: 'running' }))

    // Flush the updates queued up above back to the queue. Subsequent calls to flush
    // will throw if a job isn't re-acquired. Flushes will fail if a job state update
    // isn't included (workers should not purposefully leave jobs in a running state)
    await cyclotron.flushJob(job_1.id)
    await cyclotron.flushJob(job_2.id)

    jobs = JSON.parse(await cyclotron.dequeueJobsWithVmState('non-default', 2))

    assert(jobs[0].id == job_2.id)
    assert(jobs[1].id == job_1.id)

    assert(jobs[0].function_id === job_2.function_id)
    assert(jobs[1].function_id === job_1.function_id)

    assert(jobs[0].team_id === job_2.team_id)
    assert(jobs[1].team_id === job_1.team_id)

    assert(jobs[0].queue_name === 'non-default')
    assert(jobs[1].queue_name === 'non-default')

    assert(jobs[0].priority === 1)
    assert(jobs[1].priority === 2)

    assert(jobs[0].scheduled === ten_minutes_ago)
    assert(jobs[1].scheduled === ten_minutes_ago)

    assert(jobs[0].vm_state === JSON.stringify({ state: 'running' }))
    assert(jobs[1].vm_state === JSON.stringify({ state: 'running' }))
    assert(jobs[0].parameters === JSON.stringify({ parameters: 'running' }))
    assert(jobs[1].parameters === JSON.stringify({ parameters: 'running' }))
    assert(jobs[0].metadata === JSON.stringify({ metadata: 'running' }))
    assert(jobs[1].metadata === JSON.stringify({ metadata: 'running' }))

    // Now we'll mark these jobs as completed
    cyclotron.setState(job_1.id, JOB_STATES.COMPLETED)
    cyclotron.setState(job_2.id, JOB_STATES.COMPLETED)

    // And flush them back to the queue
    await cyclotron.flushJob(job_1.id)
    await cyclotron.flushJob(job_2.id)
}

main()
27 rust/cyclotron-node/package.json Normal file
@ -0,0 +1,27 @@
{
    "name": "@posthog/cyclotron",
    "version": "0.1.0",
    "description": "Node bindings for cyclotron",
    "main": "dist/index.js",
    "types": "dist/index.d.ts",
    "scripts": {
        "test": "cargo test",
        "build": "pnpm run build:cargo --release && pnpm run build:move-lib && pnpm run build:typescript",
        "build:move-lib": "cp ../target/release/libcyclotron_node.dylib index.node || cp ../target/release/libcyclotron_node.so index.node",
        "build:cargo": "cargo build --message-format=json > cargo.log",
        "build:cargo:debug": "pnpm run build:cargo",
        "build:cross": "cross build --message-format=json > cross.log",
        "build:typescript": "tsc",
        "package": "NODE_ENV=development pnpm i --dev && pnpm run build"
    },
    "author": "",
    "license": "MIT",
    "devDependencies": {
        "@types/node": "^22.4.1",
        "typescript": "^4.7.4"
    },
    "files": [
        "dist",
        "index.node"
    ]
}
31 rust/cyclotron-node/pnpm-lock.yaml Normal file
@ -0,0 +1,31 @@
lockfileVersion: '6.0'

settings:
  autoInstallPeers: true
  excludeLinksFromLockfile: false

devDependencies:
  '@types/node':
    specifier: ^22.4.1
    version: 22.4.1
  typescript:
    specifier: ^4.7.4
    version: 4.9.5

packages:

  /@types/node@22.4.1:
    resolution: {integrity: sha512-1tbpb9325+gPnKK0dMm+/LMriX0vKxf6RnB0SZUqfyVkQ4fMgUSySqhxE/y8Jvs4NyF1yHzTfG9KlnkIODxPKg==}
    dependencies:
      undici-types: 6.19.8
    dev: true

  /typescript@4.9.5:
    resolution: {integrity: sha512-1FXk9E2Hm+QzZQ7z+McJiHL4NW1F2EzMu9Nq9i3zAaGqibafqYwCVU6WyWAuyQRRzOlxou8xZSyXLEN8oKj24g==}
    engines: {node: '>=4.2.0'}
    hasBin: true
    dev: true

  /undici-types@6.19.8:
    resolution: {integrity: sha512-ve2KP6f/JnbPBFyobGHuerC9g1FYGn/F8n1LWTwNxCEzd6IfqTwUQcNXgEtmmQ6DlRrC1hrSrBnCZPokRrDHjw==}
    dev: true
257 rust/cyclotron-node/src/index.ts Normal file
@ -0,0 +1,257 @@
// eslint-disable-next-line @typescript-eslint/no-var-requires
const cyclotron = require('../index.node')

export interface PoolConfig {
    dbUrl: string
    maxConnections?: number
    minConnections?: number
    acquireTimeoutSeconds?: number
    maxLifetimeSeconds?: number
    idleTimeoutSeconds?: number
}

// Type as expected by Cyclotron.
interface InternalPoolConfig {
    db_url: string
    max_connections?: number
    min_connections?: number
    acquire_timeout_seconds?: number
    max_lifetime_seconds?: number
    idle_timeout_seconds?: number
}

export interface ManagerConfig {
    shards: PoolConfig[]
}

// Type as expected by Cyclotron.
interface InternalManagerConfig {
    shards: InternalPoolConfig[]
}

export interface JobInit {
    teamId: number
    functionId: string
    queueName: string
    priority?: number
    scheduled?: Date
    vmState?: string
    parameters?: string
    metadata?: string
}

// Type as expected by Cyclotron.
interface InternalJobInit {
    team_id: number
    function_id: string
    queue_name: string
    priority?: number
    scheduled?: Date
    vm_state?: string
    parameters?: string
    metadata?: string
}

export type JobState = 'available' | 'running' | 'completed' | 'failed' | 'paused'

export interface Job {
    id: string
    teamId: number
    functionId: string | null
    created: Date
    lockId: string | null
    lastHeartbeat: Date | null
    janitorTouchCount: number
    transitionCount: number
    lastTransition: Date
    queueName: string
    state: JobState
    priority: number
    scheduled: Date
    vmState: string | null
    metadata: string | null
    parameters: string | null
}

// Type as returned by Cyclotron.
interface InternalJob {
    id: string
    team_id: number
    function_id: string | null
    created: string
    lock_id: string | null
    last_heartbeat: string | null
    janitor_touch_count: number
    transition_count: number
    last_transition: string
    queue_name: string
    state: JobState
    priority: number
    scheduled: string
    vm_state: string | null
    metadata: string | null
    parameters: string | null
}

async function initWorker(poolConfig: PoolConfig): Promise<void> {
    const initWorkerInternal: InternalPoolConfig = {
        db_url: poolConfig.dbUrl,
        max_connections: poolConfig.maxConnections,
        min_connections: poolConfig.minConnections,
        acquire_timeout_seconds: poolConfig.acquireTimeoutSeconds,
        max_lifetime_seconds: poolConfig.maxLifetimeSeconds,
        idle_timeout_seconds: poolConfig.idleTimeoutSeconds,
    }
    return await cyclotron.initWorker(JSON.stringify(initWorkerInternal))
}

async function initManager(managerConfig: ManagerConfig): Promise<void> {
    const managerConfigInternal: InternalManagerConfig = {
        shards: managerConfig.shards.map((shard) => ({
            db_url: shard.dbUrl,
            max_connections: shard.maxConnections,
            min_connections: shard.minConnections,
            acquire_timeout_seconds: shard.acquireTimeoutSeconds,
            max_lifetime_seconds: shard.maxLifetimeSeconds,
            idle_timeout_seconds: shard.idleTimeoutSeconds,
        })),
    }
    return await cyclotron.initManager(JSON.stringify(managerConfigInternal))
}

async function maybeInitWorker(poolConfig: PoolConfig): Promise<void> {
    const initWorkerInternal: InternalPoolConfig = {
        db_url: poolConfig.dbUrl,
        max_connections: poolConfig.maxConnections,
        min_connections: poolConfig.minConnections,
        acquire_timeout_seconds: poolConfig.acquireTimeoutSeconds,
        max_lifetime_seconds: poolConfig.maxLifetimeSeconds,
        idle_timeout_seconds: poolConfig.idleTimeoutSeconds,
    }
    return await cyclotron.maybeInitWorker(JSON.stringify(initWorkerInternal))
}

async function maybeInitManager(managerConfig: ManagerConfig): Promise<void> {
    const managerConfigInternal: InternalManagerConfig = {
        shards: managerConfig.shards.map((shard) => ({
            db_url: shard.dbUrl,
            max_connections: shard.maxConnections,
            min_connections: shard.minConnections,
            acquire_timeout_seconds: shard.acquireTimeoutSeconds,
            max_lifetime_seconds: shard.maxLifetimeSeconds,
            idle_timeout_seconds: shard.idleTimeoutSeconds,
        })),
    }
    return await cyclotron.maybeInitManager(JSON.stringify(managerConfigInternal))
}

export async function createJob(job: JobInit): Promise<void> {
    job.priority ??= 1
    job.scheduled ??= new Date()

    const jobInitInternal: InternalJobInit = {
        team_id: job.teamId,
        function_id: job.functionId,
        queue_name: job.queueName,
        priority: job.priority,
        scheduled: job.scheduled,
        vm_state: job.vmState,
        parameters: job.parameters,
        metadata: job.metadata,
    }
    return await cyclotron.createJob(JSON.stringify(jobInitInternal))
}

function convertInternalJobToJob(jobInternal: InternalJob): Job {
    return {
        id: jobInternal.id,
        teamId: jobInternal.team_id,
        functionId: jobInternal.function_id,
        created: new Date(jobInternal.created),
        lockId: jobInternal.lock_id,
        lastHeartbeat: jobInternal.last_heartbeat ? new Date(jobInternal.last_heartbeat) : null,
        janitorTouchCount: jobInternal.janitor_touch_count,
        transitionCount: jobInternal.transition_count,
        lastTransition: new Date(jobInternal.last_transition),
        queueName: jobInternal.queue_name,
        state: jobInternal.state,
        priority: jobInternal.priority,
        scheduled: new Date(jobInternal.scheduled),
        vmState: jobInternal.vm_state,
        metadata: jobInternal.metadata,
        parameters: jobInternal.parameters,
    }
}

async function dequeueJobs(queueName: string, limit: number): Promise<Job[]> {
    const jobsStr = await cyclotron.dequeueJobs(queueName, limit)
    const jobs: InternalJob[] = JSON.parse(jobsStr)
    return jobs.map(convertInternalJobToJob)
}

async function dequeueJobsWithVmState(queueName: string, limit: number): Promise<Job[]> {
    const jobsStr = await cyclotron.dequeueJobsWithVmState(queueName, limit)
    const jobs: InternalJob[] = JSON.parse(jobsStr)
    return jobs.map(convertInternalJobToJob)
}

async function flushJob(jobId: string): Promise<void> {
    return await cyclotron.flushJob(jobId)
}

function setState(jobId: string, jobState: JobState): Promise<void> {
    return cyclotron.setState(jobId, jobState)
}

function setQueue(jobId: string, queueName: string): Promise<void> {
    return cyclotron.setQueue(jobId, queueName)
}

function setPriority(jobId: string, priority: number): Promise<void> {
    return cyclotron.setPriority(jobId, priority)
}

function setScheduledAt(jobId: string, scheduledAt: Date): Promise<void> {
    return cyclotron.setScheduledAt(jobId, scheduledAt.toISOString())
}

function serializeObject(name: string, obj: Record<string, any> | null): string | null {
    if (obj === null) {
        return null
    } else if (typeof obj === 'object') {
        return JSON.stringify(obj)
    }
    throw new Error(`${name} must be either an object or null`)
}

function setVmState(jobId: string, vmState: Record<string, any> | null): Promise<void> {
    const serialized = serializeObject('vmState', vmState)
    return cyclotron.setVmState(jobId, serialized)
}

function setMetadata(jobId: string, metadata: Record<string, any> | null): Promise<void> {
    const serialized = serializeObject('metadata', metadata)
    return cyclotron.setMetadata(jobId, serialized)
}

function setParameters(jobId: string, parameters: Record<string, any> | null): Promise<void> {
    const serialized = serializeObject('parameters', parameters)
    return cyclotron.setParameters(jobId, serialized)
}

export default {
    initWorker,
    initManager,
    maybeInitWorker,
    maybeInitManager,
    createJob,
    dequeueJobs,
    dequeueJobsWithVmState,
    flushJob,
    setState,
    setQueue,
    setPriority,
    setScheduledAt,
    setVmState,
    setMetadata,
    setParameters,
}
450 rust/cyclotron-node/src/lib.rs Normal file
@ -0,0 +1,450 @@
use chrono::{DateTime, Utc};
use cyclotron_core::{
    base_ops::{JobInit, JobState},
    manager::{ManagerConfig, QueueManager},
    worker::Worker,
    PoolConfig,
};

use neon::{
    handle::Handle,
    prelude::{Context, FunctionContext, ModuleContext},
    result::{JsResult, NeonResult},
    types::{JsNull, JsNumber, JsPromise, JsString, JsValue},
};
use once_cell::sync::OnceCell;
use serde::de::DeserializeOwned;
use serde_json::Value;
use tokio::runtime::Runtime;
use uuid::Uuid;

static WORKER: OnceCell<Worker> = OnceCell::new();
static MANAGER: OnceCell<QueueManager> = OnceCell::new();
static RUNTIME: OnceCell<Runtime> = OnceCell::new();

fn runtime<'a, C: Context<'a>>(cx: &mut C) -> NeonResult<&'static Runtime> {
    RUNTIME
        .get_or_try_init(Runtime::new)
        .or_else(|e| cx.throw_error(format!("failed to create tokio runtime: {}", e)))
}

// The general interface for calling our functions takes a JSON serialized string,
// because neon has no nice serde support for function arguments (and generally,
// ripping objects from the v8 runtime piece by piece is slower than just passing
// a single chunk of bytes). These are convenience functions for converting between
// JSON strings and Rust values.
pub fn from_json_string<'a, T, C>(cx: &mut C, object: Handle<JsString>) -> NeonResult<T>
where
    T: DeserializeOwned,
    C: Context<'a>,
{
    let value: T =
        serde_json::from_str(&object.value(cx)).or_else(|e| cx.throw_error(format!("{}", e)))?;
    Ok(value)
}

pub fn to_json_string<'a, T, C>(cx: &mut C, value: T) -> NeonResult<String>
where
    T: serde::Serialize,
    C: Context<'a>,
{
    let value = serde_json::to_string(&value)
        .or_else(|e| cx.throw_error(format!("failed to serialize value: {}", e)))?;
    Ok(value)
}

fn hello(mut cx: FunctionContext) -> JsResult<JsString> {
    let arg1 = cx.argument::<JsString>(0)?;
    let value: Value = from_json_string(&mut cx, arg1)?;
    let string = to_json_string(&mut cx, value)?;
    Ok(cx.string(string))
}

fn init_worker_impl(mut cx: FunctionContext, throw_on_reinit: bool) -> JsResult<JsPromise> {
    let arg1 = cx.argument::<JsString>(0)?;
    let config: PoolConfig = from_json_string(&mut cx, arg1)?;

    let (deferred, promise) = cx.promise();
    let channel = cx.channel();
    let runtime = runtime(&mut cx)?;

    let fut = async move {
        let worker = Worker::new(config).await;
        deferred.settle_with(&channel, move |mut cx| {
            if WORKER.get().is_some() && !throw_on_reinit {
                return Ok(cx.null()); // Short circuit to make using maybe_init a no-op
            }
            let worker = worker.or_else(|e| cx.throw_error(format!("{}", e)))?;
            let already_set = WORKER.set(worker).is_err();
            if already_set && throw_on_reinit {
                cx.throw_error("worker already initialized")
            } else {
                Ok(cx.null())
            }
        });
    };

    runtime.spawn(fut);

    Ok(promise)
}

fn init_manager_impl(mut cx: FunctionContext, throw_on_reinit: bool) -> JsResult<JsPromise> {
    let arg1 = cx.argument::<JsString>(0)?;
    let config: ManagerConfig = from_json_string(&mut cx, arg1)?;

    let (deferred, promise) = cx.promise();
    let channel = cx.channel();
    let runtime = runtime(&mut cx)?;

    let fut = async move {
        let manager = QueueManager::new(config).await;
        deferred.settle_with(&channel, move |mut cx| {
            if MANAGER.get().is_some() && !throw_on_reinit {
                return Ok(cx.null()); // Short circuit to make using maybe_init a no-op
            }
            let manager = manager.or_else(|e| cx.throw_error(format!("{}", e)))?;
            let already_set = MANAGER.set(manager).is_err();
            if already_set && throw_on_reinit {
                cx.throw_error("manager already initialized")
            } else {
                Ok(cx.null())
            }
        });
    };

    runtime.spawn(fut);

    Ok(promise)
}

fn init_worker(cx: FunctionContext) -> JsResult<JsPromise> {
    init_worker_impl(cx, true)
}

fn init_manager(cx: FunctionContext) -> JsResult<JsPromise> {
    init_manager_impl(cx, true)
}

fn maybe_init_worker(cx: FunctionContext) -> JsResult<JsPromise> {
    init_worker_impl(cx, false)
}

fn maybe_init_manager(cx: FunctionContext) -> JsResult<JsPromise> {
    init_manager_impl(cx, false)
}

// throw_error has a type signature that makes it inconvenient to use in closures, because
// it requires that you specify the V of the NeonResult<V> returned, even though it's always
// an error. This is a sane thing for it to do, but it's inconvenient for us, because we
// frequently settle promises early, before we have a V to use for type inference. This little
// wrapper makes that easier, by specifying the V as JsNull
fn throw_null_err<'c, C>(cx: &mut C, msg: &str) -> NeonResult<Handle<'c, JsNull>>
where
    C: Context<'c>,
{
    cx.throw_error(msg)
}

fn create_job(mut cx: FunctionContext) -> JsResult<JsPromise> {
    let arg1: Handle<JsString> = cx.argument::<JsString>(0)?;
    let job: JobInit = from_json_string(&mut cx, arg1)?;

    let (deferred, promise) = cx.promise();
    let channel = cx.channel();
    let runtime = runtime(&mut cx)?;

    let fut = async move {
        let manager = match MANAGER.get() {
            Some(manager) => manager,
            None => {
                deferred.settle_with(&channel, |mut cx| {
                    throw_null_err(&mut cx, "manager not initialized")
                });
                return;
            }
        };
        let job = manager.create_job(job).await;
        deferred.settle_with(&channel, move |mut cx| {
            job.or_else(|e| cx.throw_error(format!("{}", e)))?;
            Ok(cx.null())
        });
    };

    runtime.spawn(fut);

    Ok(promise)
}

fn dequeue_jobs(mut cx: FunctionContext) -> JsResult<JsPromise> {
    let queue_name = cx.argument::<JsString>(0)?.value(&mut cx);

    let limit = cx.argument::<JsNumber>(1)?.value(&mut cx) as usize; // TODO - I don't love this cast

    let (deferred, promise) = cx.promise();
    let channel = cx.channel();
    let runtime = runtime(&mut cx)?;

    let fut = async move {
        let worker = match WORKER.get() {
            Some(worker) => worker,
            None => {
                deferred.settle_with(&channel, |mut cx| {
                    throw_null_err(&mut cx, "worker not initialized")
                });
                return;
            }
        };
        let jobs = worker.dequeue_jobs(&queue_name, limit).await;
        deferred.settle_with(&channel, move |mut cx| {
            let jobs = jobs.or_else(|e| cx.throw_error(format!("{}", e)))?;
            let jobs = to_json_string(&mut cx, jobs)?;
            Ok(cx.string(jobs))
        });
    };

    runtime.spawn(fut);

    Ok(promise)
}

fn dequeue_with_vm_state(mut cx: FunctionContext) -> JsResult<JsPromise> {
    let queue_name = cx.argument::<JsString>(0)?.value(&mut cx);

    let limit = cx.argument::<JsNumber>(1)?.value(&mut cx) as usize; // TODO - I don't love this cast

    let (deferred, promise) = cx.promise();
    let channel = cx.channel();
    let runtime = runtime(&mut cx)?;

    let fut = async move {
        let worker = match WORKER.get() {
            Some(worker) => worker,
            None => {
                deferred.settle_with(&channel, |mut cx| {
                    throw_null_err(&mut cx, "worker not initialized")
                });
                return;
            }
        };
        let jobs = worker.dequeue_with_vm_state(&queue_name, limit).await;
        deferred.settle_with(&channel, move |mut cx| {
            let jobs = jobs.or_else(|e| cx.throw_error(format!("{}", e)))?;
            let jobs = to_json_string(&mut cx, jobs)?;
            Ok(cx.string(jobs))
        });
    };

    runtime.spawn(fut);

    Ok(promise)
}

fn flush_job(mut cx: FunctionContext) -> JsResult<JsPromise> {
    let arg1 = cx.argument::<JsString>(0)?.value(&mut cx);
    let job_id: Uuid = arg1
        .parse()
        .or_else(|_| cx.throw_error(format!("invalid job id: {}", arg1)))?;

    let (deferred, promise) = cx.promise();
    let channel = cx.channel();
    let runtime = runtime(&mut cx)?;

    let fut = async move {
        let worker = match WORKER.get() {
            Some(worker) => worker,
            None => {
                deferred.settle_with(&channel, |mut cx| {
                    throw_null_err(&mut cx, "worker not initialized")
                });
                return;
            }
        };
        let res = worker.flush_job(job_id).await;
        deferred.settle_with(&channel, move |mut cx| {
            res.or_else(|e: cyclotron_core::error::QueueError| cx.throw_error(format!("{}", e)))?;
            Ok(cx.null())
        });
    };

    runtime.spawn(fut);

    Ok(promise)
}

fn set_state(mut cx: FunctionContext) -> JsResult<JsNull> {
    let arg = cx.argument::<JsString>(0)?.value(&mut cx);
    let job_id: Uuid = arg
        .parse()
        .or_else(|_| cx.throw_error(format!("invalid job id: {}", arg)))?;

    let arg = cx.argument::<JsString>(1)?.value(&mut cx);
    let state: JobState = arg
        .parse()
        .or_else(|_| cx.throw_error(format!("invalid job state: {}", arg)))?;

    WORKER
        .get()
        .map_or_else(|| cx.throw_error("worker not initialized"), Ok)?
        .set_state(job_id, state)
        .or_else(|e| cx.throw_error(format!("{}", e)))?;

    Ok(cx.null())
}

fn set_queue(mut cx: FunctionContext) -> JsResult<JsNull> {
    let arg = cx.argument::<JsString>(0)?.value(&mut cx);
    let job_id: Uuid = arg
        .parse()
        .or_else(|_| cx.throw_error(format!("invalid job id: {}", arg)))?;

    let queue = cx.argument::<JsString>(1)?.value(&mut cx);

    WORKER
        .get()
        .map_or_else(|| cx.throw_error("worker not initialized"), Ok)?
        .set_queue(job_id, &queue)
        .or_else(|e| cx.throw_error(format!("{}", e)))?;

    Ok(cx.null())
}

fn set_priority(mut cx: FunctionContext) -> JsResult<JsNull> {
    let arg = cx.argument::<JsString>(0)?.value(&mut cx);
    let job_id: Uuid = arg
        .parse()
        .or_else(|_| cx.throw_error(format!("invalid job id: {}", arg)))?;

    let arg = cx.argument::<JsNumber>(1)?.value(&mut cx);
    let priority = arg as i16; // TODO - I /really/ don't love this cast

    WORKER
        .get()
        .map_or_else(|| cx.throw_error("worker not initialized"), Ok)?
        .set_priority(job_id, priority)
        .or_else(|e| cx.throw_error(format!("{}", e)))?;

    Ok(cx.null())
}

fn set_scheduled_at(mut cx: FunctionContext) -> JsResult<JsNull> {
    let arg = cx.argument::<JsString>(0)?.value(&mut cx);
    let job_id: Uuid = arg
        .parse()
        .or_else(|_| cx.throw_error(format!("invalid job id: {}", arg)))?;

    let arg = cx.argument::<JsString>(1)?.value(&mut cx);
    let scheduled: DateTime<Utc> = arg
        .parse()
        .or_else(|_| cx.throw_error(format!("invalid scheduled at: {}", arg)))?;

    WORKER
        .get()
        .map_or_else(|| cx.throw_error("worker not initialized"), Ok)?
        .set_scheduled_at(job_id, scheduled)
        .or_else(|e| cx.throw_error(format!("{}", e)))?;

    Ok(cx.null())
}

fn set_vm_state(mut cx: FunctionContext) -> JsResult<JsNull> {
    let arg = cx.argument::<JsString>(0)?.value(&mut cx);
    let job_id: Uuid = arg
        .parse()
        .or_else(|_| cx.throw_error(format!("invalid job id: {}", arg)))?;

    // Tricky - we have to support passing nulls here, because that's how you clear vm state.
    let vm_state = cx.argument::<JsValue>(1)?;
    let vm_state = if vm_state.is_a::<JsNull, _>(&mut cx) {
        None
    } else {
        Some(
            vm_state
                .downcast_or_throw::<JsString, _>(&mut cx)?
                .value(&mut cx),
        )
    };

    WORKER
        .get()
        .map_or_else(|| cx.throw_error("worker not initialized"), Ok)?
        .set_vm_state(job_id, vm_state)
        .or_else(|e| cx.throw_error(format!("{}", e)))?;

    Ok(cx.null())
}

fn set_metadata(mut cx: FunctionContext) -> JsResult<JsNull> {
    let arg = cx.argument::<JsString>(0)?.value(&mut cx);
    let job_id: Uuid = arg
        .parse()
        .or_else(|_| cx.throw_error(format!("invalid job id: {}", arg)))?;

    // Tricky - we have to support passing nulls here, because that's how you clear metadata.
    let metadata = cx.argument::<JsValue>(1)?;
    let metadata = if metadata.is_a::<JsNull, _>(&mut cx) {
        None
    } else {
        Some(
            metadata
                .downcast_or_throw::<JsString, _>(&mut cx)?
                .value(&mut cx),
        )
    };

    WORKER
        .get()
        .map_or_else(|| cx.throw_error("worker not initialized"), Ok)?
        .set_metadata(job_id, metadata)
        .or_else(|e| cx.throw_error(format!("{}", e)))?;

    Ok(cx.null())
}

fn set_parameters(mut cx: FunctionContext) -> JsResult<JsNull> {
    let arg = cx.argument::<JsString>(0)?.value(&mut cx);
    let job_id: Uuid = arg
        .parse()
        .or_else(|_| cx.throw_error(format!("invalid job id: {}", arg)))?;

    // Tricky - we have to support passing nulls here, because that's how you clear parameters.
    let parameters = cx.argument::<JsValue>(1)?;
    let parameters = if parameters.is_a::<JsNull, _>(&mut cx) {
        None
    } else {
        Some(
            parameters
                .downcast_or_throw::<JsString, _>(&mut cx)?
                .value(&mut cx),
        )
    };

    WORKER
        .get()
        .map_or_else(|| cx.throw_error("worker not initialized"), Ok)?
        .set_parameters(job_id, parameters)
        .or_else(|e| cx.throw_error(format!("{}", e)))?;

    Ok(cx.null())
}

#[neon::main]
fn main(mut cx: ModuleContext) -> NeonResult<()> {
    cx.export_function("hello", hello)?;
    cx.export_function("initWorker", init_worker)?;
    cx.export_function("initManager", init_manager)?;
    cx.export_function("maybeInitWorker", maybe_init_worker)?;
    cx.export_function("maybeInitManager", maybe_init_manager)?;
    cx.export_function("createJob", create_job)?;
    cx.export_function("dequeueJobs", dequeue_jobs)?;
    cx.export_function("dequeueJobsWithVmState", dequeue_with_vm_state)?;
    cx.export_function("flushJob", flush_job)?;
    cx.export_function("setState", set_state)?;
    cx.export_function("setQueue", set_queue)?;
    cx.export_function("setPriority", set_priority)?;
    cx.export_function("setScheduledAt", set_scheduled_at)?;
    cx.export_function("setVmState", set_vm_state)?;
    cx.export_function("setMetadata", set_metadata)?;
    cx.export_function("setParameters", set_parameters)?;

    Ok(())
}
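Every async export above follows one skeleton: parse the JS arguments on the main thread, hand the work to the shared Tokio runtime, and settle the deferred promise from the async side. Below is a condensed sketch of adding one more binding in that style: a hypothetical `heartbeat` export (not part of this commit) wrapping the worker method the janitor tests rely on, reusing the statics and helpers defined above.

// Hypothetical extra binding, following the file's pattern: JS args are parsed
// up front, the worker call runs on the shared runtime, and the promise is
// settled from the Tokio side.
fn heartbeat(mut cx: FunctionContext) -> JsResult<JsPromise> {
    let arg = cx.argument::<JsString>(0)?.value(&mut cx);
    let job_id: Uuid = arg
        .parse()
        .or_else(|_| cx.throw_error(format!("invalid job id: {}", arg)))?;

    let (deferred, promise) = cx.promise();
    let channel = cx.channel();
    let runtime = runtime(&mut cx)?;

    runtime.spawn(async move {
        let res = match WORKER.get() {
            Some(worker) => worker.heartbeat(job_id).await,
            None => {
                deferred.settle_with(&channel, |mut cx| {
                    throw_null_err(&mut cx, "worker not initialized")
                });
                return;
            }
        };
        deferred.settle_with(&channel, move |mut cx| {
            res.or_else(|e| cx.throw_error(format!("{}", e)))?;
            Ok(cx.null())
        });
    });

    Ok(promise)
}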
24 rust/cyclotron-node/tsconfig.json Normal file
@ -0,0 +1,24 @@
{
    "compilerOptions": {
        "module": "CommonJS",
        "target": "ESNext",
        "declaration": true,
        "removeComments": true,
        "emitDecoratorMetadata": true,
        "experimentalDecorators": true,
        "moduleResolution": "node",
        "esModuleInterop": true,
        "allowJs": true,
        "sourceMap": true,
        "baseUrl": "src/",
        "rootDir": "src/",
        "outDir": "dist/",
        "types": ["node"],
        "resolveJsonModule": true,
        "strict": true,
        "noImplicitAny": true,
        "useUnknownInCatchVariables": false
    },
    "include": ["src"],
    "exclude": ["node_modules", "dist", "bin"]
}
@ -22,3 +22,4 @@ tower = { workspace = true }
tracing = { workspace = true }
tracing-subscriber = { workspace = true }
url = { workspace = true }
common-metrics = { path = "../common/metrics" }

@ -3,7 +3,7 @@ use config::Config;
use envconfig::Envconfig;
use eyre::Result;

use hook_common::metrics::setup_metrics_routes;
use common_metrics::setup_metrics_routes;
use hook_common::pgqueue::PgQueue;

mod config;

@ -8,13 +8,10 @@ workspace = true

[dependencies]
async-trait = { workspace = true }
axum = { workspace = true, features = ["http2"] }
chrono = { workspace = true }
envconfig = { workspace = true }
health = { path = "../common/health" }
http = { workspace = true }
metrics = { workspace = true }
metrics-exporter-prometheus = { workspace = true }
rdkafka = { workspace = true }
reqwest = { workspace = true }
serde = { workspace = true }

@ -1,7 +1,6 @@
pub mod config;
pub mod kafka_messages;
pub mod kafka_producer;
pub mod metrics;
pub mod pgqueue;
pub mod retry;
pub mod test;

@ -24,3 +24,4 @@ time = { workspace = true }
tokio = { workspace = true }
tracing = { workspace = true }
tracing-subscriber = { workspace = true }
common-metrics = { path = "../common/metrics" }

@ -9,8 +9,8 @@ use std::{str::FromStr, time::Duration};
use tokio::sync::Semaphore;
use webhooks::WebhookCleaner;

use common_metrics::setup_metrics_routes;
use hook_common::kafka_producer::create_kafka_producer;
use hook_common::metrics::setup_metrics_routes;

mod cleanup;
mod config;
Some files were not shown because too many files have changed in this diff