feat: cyclotron (#24228)

Co-authored-by: Brett Hoerner <brett@bretthoerner.com>
Co-authored-by: Ben White <ben@posthog.com>

parent e1def6e3c1
commit 9734a40c96
@@ -39,3 +39,11 @@
 !test-runner-jest.config.js
 !test-runner-jest-environment.js
 !patches
+!rust
+rust/.env
+rust/.github
+rust/docker
+rust/target
+rust/cyclotron-node/dist
+rust/cyclotron-node/node_modules
+rust/cyclotron-node/index.node
.gitignore (vendored): 2 lines changed
@@ -64,3 +64,5 @@ plugin-transpiler/dist
 *-esbuild-bundle-visualization.html
 .dlt
 *.db
+# Ignore any log files that happen to be present
+*.log
bin/start-cyclotron (new executable file): 24 lines
@@ -0,0 +1,24 @@
#!/bin/bash

set -ex

trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM EXIT

cd rust

cargo build

export RUST_LOG=${DEBUG:-debug}
SQLX_QUERY_LEVEL=${SQLX_QUERY_LEVEL:-warn}
export RUST_LOG=$RUST_LOG,sqlx::query=$SQLX_QUERY_LEVEL

export DATABASE_URL=${DATABASE_URL:-postgres://posthog:posthog@localhost:5432/posthog}
export ALLOW_INTERNAL_IPS=${ALLOW_INTERNAL_IPS:-true}
cd cyclotron-core
cargo sqlx migrate run
cd ..

./target/debug/cyclotron-fetch &
./target/debug/cyclotron-janitor &

wait
@@ -27,7 +27,9 @@
         "services:start": "cd .. && docker compose -f docker-compose.dev.yml up",
         "services:stop": "cd .. && docker compose -f docker-compose.dev.yml down",
         "services:clean": "cd .. && docker compose -f docker-compose.dev.yml rm -v",
-        "services": "pnpm services:stop && pnpm services:clean && pnpm services:start"
+        "services": "pnpm services:stop && pnpm services:clean && pnpm services:start",
+        "build:cyclotron": "cd ../rust/cyclotron-node && pnpm run package",
+        "pnpm:devPreinstall": "pnpm run build:cyclotron"
     },
     "graphile-worker": {
         "maxContiguousErrors": 300
@@ -86,7 +88,8 @@
         "uuid": "^9.0.1",
         "v8-profiler-next": "^1.9.0",
         "vm2": "3.9.18",
-        "detect-browser": "^5.3.0"
+        "detect-browser": "^5.3.0",
+        "@posthog/cyclotron": "file:../rust/cyclotron-node"
     },
     "devDependencies": {
         "0x": "^5.5.0",
@@ -43,6 +43,9 @@ dependencies:
   '@posthog/clickhouse':
     specifier: ^1.7.0
     version: 1.7.0
+  '@posthog/cyclotron':
+    specifier: file:../rust/cyclotron-node
+    version: file:../rust/cyclotron-node
   '@posthog/hogvm':
     specifier: ^1.0.32
     version: 1.0.32(luxon@3.4.4)(re2@1.20.3)
@@ -10731,3 +10734,8 @@ packages:
   /yocto-queue@0.1.0:
     resolution: {integrity: sha512-rVksvsnNCdJ/ohGc6xgPwyN8eheCxsiLM8mxuE/t/mOVqJewPuO1miLpTHQiRgTKCLexL4MeAFVagts7HmNZ2Q==}
     engines: {node: '>=10'}
+
+  file:../rust/cyclotron-node:
+    resolution: {directory: ../rust/cyclotron-node, type: directory}
+    name: '@posthog/cyclotron'
+    dev: false
@@ -26,6 +26,7 @@ export function getPluginServerCapabilities(config: PluginsServerConfig): Plugin
                 cdpProcessedEvents: true,
                 cdpFunctionCallbacks: true,
                 cdpFunctionOverflow: true,
+                cdpCyclotronWorker: true,
                 syncInlinePlugins: true,
                 ...sharedCapabilities,
             }
@@ -108,6 +109,11 @@ export function getPluginServerCapabilities(config: PluginsServerConfig): Plugin
                 cdpFunctionOverflow: true,
                 ...sharedCapabilities,
             }
+        case PluginServerMode.cdp_cyclotron_worker:
+            return {
+                cdpCyclotronWorker: true,
+                ...sharedCapabilities,
+            }
        // This is only for functional tests, which time out if all capabilities are used
        // ideally we'd run just the specific capability needed per test, but that's not easy to do atm
        case PluginServerMode.functional_tests:
@@ -1,3 +1,4 @@
+import cyclotron from '@posthog/cyclotron'
 import { Histogram } from 'prom-client'

 import { buildIntegerMatcher } from '../config/config'
@@ -27,9 +28,11 @@ export type AsyncFunctionExecutorOptions = {

 export class AsyncFunctionExecutor {
     hogHookEnabledForTeams: ValueMatcher<number>
+    cyclotronEnabledForTeams: ValueMatcher<number>

     constructor(private serverConfig: PluginsServerConfig, private rustyHook: RustyHook) {
         this.hogHookEnabledForTeams = buildIntegerMatcher(serverConfig.CDP_ASYNC_FUNCTIONS_RUSTY_HOOK_TEAMS, true)
+        this.cyclotronEnabledForTeams = buildIntegerMatcher(serverConfig.CDP_ASYNC_FUNCTIONS_CYCLOTRON_TEAMS, true)
     }

     async execute(
@@ -99,8 +102,44 @@ export class AsyncFunctionExecutor {
             histogramFetchPayloadSize.observe(body.length / 1024)
         }

-        // If the caller hasn't forced it to be synchronous and the team has the rustyhook enabled, enqueue it
-        if (!options?.sync && this.hogHookEnabledForTeams(request.teamId)) {
+        // If the caller hasn't forced it to be synchronous and the team has the cyclotron or
+        // rustyhook enabled, enqueue it in one of those services.
+        if (!options?.sync && this.cyclotronEnabledForTeams(request.teamId)) {
+            try {
+                await cyclotron.createJob({
+                    teamId: request.teamId,
+                    functionId: request.hogFunctionId,
+                    queueName: 'fetch',
+                    // TODO: The async function compression changes happen upstream of this
+                    // function. I guess we'll want to unwind that change because we actually
+                    // want the `vmState` (and the rest of state) so we can put it into PG here.
+                    vmState: '',
+                    parameters: JSON.stringify({
+                        return_queue: 'hog',
+                        url,
+                        method,
+                        headers,
+                        body,
+                    }),
+                    metadata: JSON.stringify({
+                        // TODO: It seems like Fetch expects metadata to have this shape, which
+                        // I don't understand. I think `metadata` is where all the other Hog
+                        // state is going to be stored? For now I'm just trying to make fetch
+                        // work.
+                        tries: 0,
+                        trace: [],
+                    }),
+                })
+            } catch (e) {
+                status.error(
+                    '🦔',
+                    `[HogExecutor] Cyclotron failed to enqueue async fetch function, sending directly instead`,
+                    {
+                        error: e,
+                    }
+                )
+            }
+        } else if (!options?.sync && this.hogHookEnabledForTeams(request.teamId)) {
             const hoghooksPayload = JSON.stringify(request)

             histogramHogHooksPayloadSize.observe(hoghooksPayload.length / 1024)
@@ -1,3 +1,4 @@
+import cyclotron from '@posthog/cyclotron'
 import { captureException } from '@sentry/node'
 import { features, librdkafkaVersion, Message } from 'node-rdkafka'
 import { Counter, Histogram } from 'prom-client'
@@ -443,7 +444,12 @@ abstract class CdpConsumerBase {
         const globalConnectionConfig = createRdConnectionConfigFromEnvVars(this.hub)
         const globalProducerConfig = createRdProducerConfigFromEnvVars(this.hub)

-        await Promise.all([this.hogFunctionManager.start()])
+        await Promise.all([
+            this.hogFunctionManager.start(),
+            this.hub.CYCLOTRON_DATABASE_URL
+                ? cyclotron.initManager({ shards: [{ dbUrl: this.hub.CYCLOTRON_DATABASE_URL }] })
+                : Promise.resolve(),
+        ])

         this.kafkaProducer = new KafkaProducerWrapper(
             await createKafkaProducer(globalConnectionConfig, globalProducerConfig)
@@ -693,3 +699,57 @@ export class CdpOverflowConsumer extends CdpConsumerBase {
         return invocationGlobals
     }
 }
+
+// TODO: Split out non-Kafka specific parts of CdpConsumerBase so that it can be used by the
+// Cyclotron worker below. Or maybe we can just wait, and rip the Kafka bits out once Cyclotron is
+// shipped (and rename it something other than consumer, probably). For now, this is an easy way to
+// use existing code and get an end-to-end demo shipped.
+export class CdpCyclotronWorker extends CdpConsumerBase {
+    protected name = 'CdpCyclotronWorker'
+    protected topic = 'UNUSED-CdpCyclotronWorker'
+    protected consumerGroupId = 'UNUSED-CdpCyclotronWorker'
+    private runningWorker: Promise<void> | undefined
+    private isUnhealthy = false
+
+    public async _handleEachBatch(_: Message[]): Promise<void> {
+        // Not called, we override `start` below to use Cyclotron instead.
+    }
+
+    private async innerStart() {
+        try {
+            const limit = 100 // TODO: Make configurable.
+            while (!this.isStopping) {
+                const jobs = await cyclotron.dequeueJobsWithVmState('hog', limit)
+                for (const job of jobs) {
+                    // TODO: Reassemble a HogFunctionInvocationAsyncResponse (or whatever proper type)
+                    // from the fields on the job, and then execute the next Hog step.
+                    console.log(job.id)
+                }
+            }
+        } catch (err) {
+            this.isUnhealthy = true
+            console.error('Error in Cyclotron worker', err)
+            throw err
+        }
+    }
+
+    public async start() {
+        await cyclotron.initManager({ shards: [{ dbUrl: this.hub.CYCLOTRON_DATABASE_URL }] })
+        await cyclotron.initWorker({ dbUrl: this.hub.CYCLOTRON_DATABASE_URL })
+
+        // Consumer `start` expects an async task is started, and not that `start` itself blocks
+        // indefinitely.
+        this.runningWorker = this.innerStart()
+
+        return Promise.resolve()
+    }
+
+    public async stop() {
+        await super.stop()
+        await this.runningWorker
+    }
+
+    public isHealthy() {
+        return this.isUnhealthy
+    }
+}
@@ -187,9 +187,13 @@ export function getDefaultConfig(): PluginsServerConfig {
         CDP_WATCHER_REFILL_RATE: 10,
         CDP_WATCHER_DISABLED_TEMPORARY_MAX_COUNT: 3,
         CDP_ASYNC_FUNCTIONS_RUSTY_HOOK_TEAMS: '',
+        CDP_ASYNC_FUNCTIONS_CYCLOTRON_TEAMS: '',
         CDP_REDIS_PASSWORD: '',
         CDP_REDIS_HOST: '',
         CDP_REDIS_PORT: 6479,
+
+        // Cyclotron
+        CYCLOTRON_DATABASE_URL: '',
     }
 }

@@ -11,7 +11,12 @@ import v8Profiler from 'v8-profiler-next'

 import { getPluginServerCapabilities } from '../capabilities'
 import { CdpApi } from '../cdp/cdp-api'
-import { CdpFunctionCallbackConsumer, CdpOverflowConsumer, CdpProcessedEventsConsumer } from '../cdp/cdp-consumers'
+import {
+    CdpCyclotronWorker,
+    CdpFunctionCallbackConsumer,
+    CdpOverflowConsumer,
+    CdpProcessedEventsConsumer,
+} from '../cdp/cdp-consumers'
 import { defaultConfig, sessionRecordingConsumerConfig } from '../config/config'
 import { Hub, PluginServerCapabilities, PluginsServerConfig } from '../types'
 import { createHub, createKafkaClient, createKafkaProducerWrapper } from '../utils/db/hub'
@@ -571,6 +576,17 @@ export async function startPluginsServer(
             healthChecks['cdp-overflow'] = () => consumer.isHealthy() ?? false
         }

+        if (capabilities.cdpCyclotronWorker) {
+            ;[hub, closeHub] = hub ? [hub, closeHub] : await createHub(serverConfig, capabilities)
+            if (hub.CYCLOTRON_DATABASE_URL) {
+                const worker = new CdpCyclotronWorker(hub)
+                await worker.start()
+            } else {
+                // This is a temporary solution until we *require* Cyclotron to be configured.
+                status.warn('💥', 'CYCLOTRON_DATABASE_URL is not set, not running Cyclotron worker')
+            }
+        }
+
         if (capabilities.http) {
             const app = setupCommonRoutes(healthChecks, analyticsEventsIngestionConsumer)

@@ -85,6 +85,7 @@ export enum PluginServerMode {
     cdp_processed_events = 'cdp-processed-events',
     cdp_function_callbacks = 'cdp-function-callbacks',
     cdp_function_overflow = 'cdp-function-overflow',
+    cdp_cyclotron_worker = 'cdp-cyclotron-worker',
     functional_tests = 'functional-tests',
 }

@@ -107,6 +108,7 @@ export type CdpConfig = {
     CDP_WATCHER_DISABLED_TEMPORARY_TTL: number // How long a function should be temporarily disabled for
     CDP_WATCHER_DISABLED_TEMPORARY_MAX_COUNT: number // How many times a function can be disabled before it is disabled permanently
     CDP_ASYNC_FUNCTIONS_RUSTY_HOOK_TEAMS: string
+    CDP_ASYNC_FUNCTIONS_CYCLOTRON_TEAMS: string
     CDP_REDIS_HOST: string
     CDP_REDIS_PORT: number
     CDP_REDIS_PASSWORD: string
@@ -279,6 +281,8 @@ export interface PluginsServerConfig extends CdpConfig {

     // kafka debug stats interval
     SESSION_RECORDING_KAFKA_CONSUMPTION_STATISTICS_EVENT_INTERVAL_MS: number
+
+    CYCLOTRON_DATABASE_URL: string
 }

 export interface Hub extends PluginsServerConfig {
@@ -345,6 +349,7 @@ export interface PluginServerCapabilities {
     cdpProcessedEvents?: boolean
     cdpFunctionCallbacks?: boolean
     cdpFunctionOverflow?: boolean
+    cdpCyclotronWorker?: boolean
     appManagementSingleton?: boolean
     preflightSchedules?: boolean // Used for instance health checks on hobby deploy, not useful on cloud
     http?: boolean
@@ -97,6 +97,7 @@ describe('server', () => {
                 cdpProcessedEvents: true,
                 cdpFunctionCallbacks: true,
                 cdpFunctionOverflow: true,
+                cdpCyclotronWorker: true,
                 syncInlinePlugins: true,
             }
         )
@@ -38,11 +38,12 @@ COPY ./bin/ ./bin/
 COPY babel.config.js tsconfig.json webpack.config.js tailwind.config.js ./
 RUN pnpm build

 #
 # ---------------------------------------------------------
 #
-FROM node:18.19.1-bullseye-slim AS plugin-server-build
+FROM ghcr.io/posthog/rust-node-container:bullseye_rust_1.80.1-node_18.19.1 AS plugin-server-build
+WORKDIR /code
+COPY ./rust ./rust
 WORKDIR /code/plugin-server
 SHELL ["/bin/bash", "-e", "-o", "pipefail", "-c"]

@@ -182,6 +183,7 @@ COPY --from=plugin-server-build --chown=posthog:posthog /code/plugin-server/dist
 COPY --from=plugin-server-build --chown=posthog:posthog /code/plugin-server/node_modules /code/plugin-server/node_modules
 COPY --from=plugin-server-build --chown=posthog:posthog /code/plugin-server/package.json /code/plugin-server/package.json

+
 # Copy the Python dependencies and Django staticfiles from the posthog-build stage.
 COPY --from=posthog-build --chown=posthog:posthog /code/staticfiles /code/staticfiles
 COPY --from=posthog-build --chown=posthog:posthog /python-runtime /python-runtime
rust/.cargo/config.toml (new file): 4 lines
@@ -0,0 +1,4 @@
[env]
# Force SQLX to run in offline mode for CI. Devs can change this if they want, to live code against the DB,
# but we use it at the workspace level here to allow use of sqlx macros across all crates
SQLX_OFFLINE = "true"
New sqlx offline query cache files (one JSON file per query; file names were not preserved in this view):

@@ -0,0 +1,12 @@
{
  "db_name": "PostgreSQL",
  "query": "UPDATE cyclotron_jobs SET last_heartbeat = NOW() WHERE id = $1 AND lock_id = $2",
  "describe": {
    "columns": [],
    "parameters": { "Left": ["Uuid", "Uuid"] },
    "nullable": []
  },
  "hash": "075421be22b51c50eb74ac1156175c285bc510766c175b1b8c4e4002e04ff503"
}

@@ -0,0 +1,12 @@
{
  "db_name": "PostgreSQL",
  "query": "UPDATE cyclotron_jobs SET vm_state = $1 WHERE id = $2 AND lock_id = $3",
  "describe": {
    "columns": [],
    "parameters": { "Left": ["Text", "Uuid", "Uuid"] },
    "nullable": []
  },
  "hash": "16d533b5a15b0b9926a181f578b5b577efe424710b45f02e1ddeece8bca96f87"
}

@@ -0,0 +1,12 @@
{
  "db_name": "PostgreSQL",
  "query": "UPDATE cyclotron_jobs SET queue_name = $1 WHERE id = $2 AND lock_id = $3",
  "describe": {
    "columns": [],
    "parameters": { "Left": ["Text", "Uuid", "Uuid"] },
    "nullable": []
  },
  "hash": "2b62adf40f8dd5758690c763df30fffa01b679951ec786f8ee2410454b9a2de0"
}

@@ -0,0 +1,12 @@
{
  "db_name": "PostgreSQL",
  "query": "UPDATE cyclotron_jobs SET priority = $1 WHERE id = $2 AND lock_id = $3",
  "describe": {
    "columns": [],
    "parameters": { "Left": ["Int2", "Uuid", "Uuid"] },
    "nullable": []
  },
  "hash": "2f6de0977357909dfd8d3d510c39a284f16421f77b77fe38e67143f28e270805"
}

@@ -0,0 +1,117 @@
{
  "db_name": "PostgreSQL",
  "query": "\nWITH available AS (\n SELECT\n id,\n state\n FROM cyclotron_jobs\n WHERE\n state = 'available'::JobState\n AND queue_name = $1\n AND scheduled <= NOW()\n ORDER BY\n priority ASC,\n scheduled ASC\n LIMIT $2\n FOR UPDATE SKIP LOCKED\n)\nUPDATE cyclotron_jobs\nSET\n state = 'running'::JobState,\n lock_id = $3,\n last_heartbeat = NOW(),\n last_transition = NOW(),\n transition_count = transition_count + 1\nFROM available\nWHERE\n cyclotron_jobs.id = available.id\nRETURNING\n cyclotron_jobs.id,\n team_id,\n available.state as \"state: JobState\",\n queue_name,\n priority,\n function_id,\n created,\n last_transition,\n scheduled,\n transition_count,\n NULL as vm_state,\n metadata,\n parameters,\n lock_id,\n last_heartbeat,\n janitor_touch_count\n ",
  "describe": {
    "columns": [
      { "ordinal": 0, "name": "id", "type_info": "Uuid" },
      { "ordinal": 1, "name": "team_id", "type_info": "Int4" },
      { "ordinal": 2, "name": "state: JobState", "type_info": { "Custom": { "name": "jobstate", "kind": { "Enum": ["available", "completed", "failed", "running", "paused"] } } } },
      { "ordinal": 3, "name": "queue_name", "type_info": "Text" },
      { "ordinal": 4, "name": "priority", "type_info": "Int2" },
      { "ordinal": 5, "name": "function_id", "type_info": "Uuid" },
      { "ordinal": 6, "name": "created", "type_info": "Timestamptz" },
      { "ordinal": 7, "name": "last_transition", "type_info": "Timestamptz" },
      { "ordinal": 8, "name": "scheduled", "type_info": "Timestamptz" },
      { "ordinal": 9, "name": "transition_count", "type_info": "Int2" },
      { "ordinal": 10, "name": "vm_state", "type_info": "Text" },
      { "ordinal": 11, "name": "metadata", "type_info": "Text" },
      { "ordinal": 12, "name": "parameters", "type_info": "Text" },
      { "ordinal": 13, "name": "lock_id", "type_info": "Uuid" },
      { "ordinal": 14, "name": "last_heartbeat", "type_info": "Timestamptz" },
      { "ordinal": 15, "name": "janitor_touch_count", "type_info": "Int2" }
    ],
    "parameters": { "Left": ["Text", "Int8", "Uuid"] },
    "nullable": [false, false, false, false, false, true, false, false, false, false, null, true, true, true, true, false]
  },
  "hash": "350983ef271029734aff70eb7e298bfe578ecaa8678268863bce917ced9d5d46"
}

@@ -0,0 +1,12 @@
{
  "db_name": "PostgreSQL",
  "query": "\nWITH stalled AS (\n SELECT id FROM cyclotron_jobs WHERE state = 'running' AND COALESCE(last_heartbeat, $1) <= $1 FOR UPDATE SKIP LOCKED\n)\nUPDATE cyclotron_jobs\nSET state = 'available', lock_id = NULL, last_heartbeat = NULL, janitor_touch_count = janitor_touch_count + 1\nFROM stalled\nWHERE cyclotron_jobs.id = stalled.id\n    ",
  "describe": {
    "columns": [],
    "parameters": { "Left": ["Timestamptz"] },
    "nullable": []
  },
  "hash": "54d9afe6952f92b753fbce2c4e8554065b71152389f98d35532c6b332d5a4c9d"
}

@@ -0,0 +1,30 @@
{
  "db_name": "PostgreSQL",
  "query": "\nINSERT INTO cyclotron_jobs\n    (\n        id,\n        team_id,\n        function_id,\n        created,\n        lock_id,\n        last_heartbeat,\n        janitor_touch_count,\n        transition_count,\n        last_transition,\n        queue_name,\n        state,\n        scheduled,\n        priority,\n        vm_state,\n        metadata,\n        parameters\n    )\nVALUES\n    ($1, $2, $3, NOW(), NULL, NULL, 0, 0, NOW(), $4, $5, $6, $7, $8, $9, $10)\n    ",
  "describe": {
    "columns": [],
    "parameters": {
      "Left": [
        "Uuid",
        "Int4",
        "Uuid",
        "Text",
        { "Custom": { "name": "jobstate", "kind": { "Enum": ["available", "completed", "failed", "running", "paused"] } } },
        "Timestamptz",
        "Int2",
        "Text",
        "Text",
        "Text"
      ]
    },
    "nullable": []
  },
  "hash": "7217e766aeb53298238222c0c71a2ce446cac731845c53cb926fc47ace708dd6"
}

@@ -0,0 +1,12 @@
{
  "db_name": "PostgreSQL",
  "query": "UPDATE cyclotron_jobs SET lock_id = NULL, last_heartbeat = NULL WHERE id = $1 AND lock_id = $2",
  "describe": {
    "columns": [],
    "parameters": { "Left": ["Uuid", "Uuid"] },
    "nullable": []
  },
  "hash": "884da9767d2992c7b279b4f8df5129b8c4d7020b7cb1999702aee1bfb1087efb"
}

@@ -0,0 +1,12 @@
{
  "db_name": "PostgreSQL",
  "query": "UPDATE cyclotron_jobs SET metadata = $1 WHERE id = $2 AND lock_id = $3",
  "describe": {
    "columns": [],
    "parameters": { "Left": ["Text", "Uuid", "Uuid"] },
    "nullable": []
  },
  "hash": "8ab11a89bc4720985e130c58021f46045c332cc45e69b08708b289cc933b3b5c"
}

@@ -0,0 +1,12 @@
{
  "db_name": "PostgreSQL",
  "query": "UPDATE cyclotron_jobs SET scheduled = $1 WHERE id = $2 AND lock_id = $3",
  "describe": {
    "columns": [],
    "parameters": { "Left": ["Timestamptz", "Uuid", "Uuid"] },
    "nullable": []
  },
  "hash": "98da1f12285a97a47ce88535c82a8c4eb4cb910b0c5ddbc6bdbd9156af7b4e23"
}

@@ -0,0 +1,18 @@
{
  "db_name": "PostgreSQL",
  "query": "SELECT vm_state FROM cyclotron_jobs WHERE id = $1 AND lock_id = $2",
  "describe": {
    "columns": [{ "ordinal": 0, "name": "vm_state", "type_info": "Text" }],
    "parameters": { "Left": ["Uuid", "Uuid"] },
    "nullable": [true]
  },
  "hash": "aa595eaf28c1f4b872c278be407b59cc00f3125413f4032ac3647a6b5ee1a632"
}

@@ -0,0 +1,12 @@
{
  "db_name": "PostgreSQL",
  "query": "UPDATE cyclotron_jobs SET parameters = $1 WHERE id = $2 AND lock_id = $3",
  "describe": {
    "columns": [],
    "parameters": { "Left": ["Text", "Uuid", "Uuid"] },
    "nullable": []
  },
  "hash": "b160b785a0377b854341105e99e4ef7a5da523e168a5f9be47f6caaef09487d7"
}

@@ -0,0 +1,23 @@
{
  "db_name": "PostgreSQL",
  "query": "UPDATE cyclotron_jobs\n    SET state = $1, last_transition = NOW(), transition_count = transition_count + 1\n    WHERE id = $2 AND lock_id = $3",
  "describe": {
    "columns": [],
    "parameters": {
      "Left": [
        { "Custom": { "name": "jobstate", "kind": { "Enum": ["available", "completed", "failed", "running", "paused"] } } },
        "Uuid",
        "Uuid"
      ]
    },
    "nullable": []
  },
  "hash": "b3239c1dde9a88769ec488299612f7a3ad2b2ee57d8d2353c858299d7b6ffe13"
}

@@ -0,0 +1,117 @@
{
  "db_name": "PostgreSQL",
  "query": "\nWITH available AS (\n SELECT\n id,\n state\n FROM cyclotron_jobs\n WHERE\n state = 'available'::JobState\n AND queue_name = $1\n AND scheduled <= NOW()\n ORDER BY\n priority ASC,\n scheduled ASC\n LIMIT $2\n FOR UPDATE SKIP LOCKED\n)\nUPDATE cyclotron_jobs\nSET\n state = 'running'::JobState,\n lock_id = $3,\n last_heartbeat = NOW(),\n last_transition = NOW(),\n transition_count = transition_count + 1\nFROM available\nWHERE\n cyclotron_jobs.id = available.id\nRETURNING\n cyclotron_jobs.id,\n team_id,\n available.state as \"state: JobState\",\n queue_name,\n priority,\n function_id,\n created,\n last_transition,\n scheduled,\n transition_count,\n vm_state,\n metadata,\n parameters,\n lock_id,\n last_heartbeat,\n janitor_touch_count\n ",
  "describe": {
    "columns": [
      { "ordinal": 0, "name": "id", "type_info": "Uuid" },
      { "ordinal": 1, "name": "team_id", "type_info": "Int4" },
      { "ordinal": 2, "name": "state: JobState", "type_info": { "Custom": { "name": "jobstate", "kind": { "Enum": ["available", "completed", "failed", "running", "paused"] } } } },
      { "ordinal": 3, "name": "queue_name", "type_info": "Text" },
      { "ordinal": 4, "name": "priority", "type_info": "Int2" },
      { "ordinal": 5, "name": "function_id", "type_info": "Uuid" },
      { "ordinal": 6, "name": "created", "type_info": "Timestamptz" },
      { "ordinal": 7, "name": "last_transition", "type_info": "Timestamptz" },
      { "ordinal": 8, "name": "scheduled", "type_info": "Timestamptz" },
      { "ordinal": 9, "name": "transition_count", "type_info": "Int2" },
      { "ordinal": 10, "name": "vm_state", "type_info": "Text" },
      { "ordinal": 11, "name": "metadata", "type_info": "Text" },
      { "ordinal": 12, "name": "parameters", "type_info": "Text" },
      { "ordinal": 13, "name": "lock_id", "type_info": "Uuid" },
      { "ordinal": 14, "name": "last_heartbeat", "type_info": "Timestamptz" },
      { "ordinal": 15, "name": "janitor_touch_count", "type_info": "Int2" }
    ],
    "parameters": { "Left": ["Text", "Int8", "Uuid"] },
    "nullable": [false, false, false, false, false, true, false, false, false, false, true, true, true, true, true, false]
  },
  "hash": "c624261597b9356ff3e7c3e392a84bb0b551e91c503e8b21c29814f1eb660a8e"
}

@@ -0,0 +1,12 @@
{
  "db_name": "PostgreSQL",
  "query": "DELETE FROM cyclotron_jobs WHERE state = 'completed'",
  "describe": {
    "columns": [],
    "parameters": { "Left": [] },
    "nullable": []
  },
  "hash": "f4e808f58dd290c6e2b49b63e9e0eb022936ba318021512a0cc0c2e0766abe7c"
}

@@ -0,0 +1,12 @@
{
  "db_name": "PostgreSQL",
  "query": "\nDELETE FROM cyclotron_jobs WHERE state = 'running' AND COALESCE(last_heartbeat, $1) <= $1 AND janitor_touch_count >= $2\n    ",
  "describe": {
    "columns": [],
    "parameters": { "Left": ["Timestamptz", "Int2"] },
    "nullable": []
  },
  "hash": "fdda5a80f5495f2d4b15ce1a0963f990986c8b8433f01e449fbd1eee70ce6aeb"
}

@@ -0,0 +1,12 @@
{
  "db_name": "PostgreSQL",
  "query": "DELETE FROM cyclotron_jobs WHERE state = 'failed'",
  "describe": {
    "columns": [],
    "parameters": { "Left": [] },
    "nullable": []
  },
  "hash": "ffb66bdedf6506f95b9293ef88b0c51e2f5fb7d3271e1287165d2a35b6aaa25e"
}
rust/Cargo.lock (generated): 160 lines changed
@@ -673,6 +673,7 @@ dependencies = [
 "iana-time-zone",
 "js-sys",
 "num-traits",
+ "serde",
 "wasm-bindgen",
 "windows-targets 0.52.0",
]
@@ -700,6 +701,25 @@ dependencies = [
 "tokio-util",
]

+[[package]]
+name = "common-dns"
+version = "0.1.0"
+dependencies = [
+ "futures",
+ "reqwest 0.12.3",
+ "tokio",
+]
+
+[[package]]
+name = "common-metrics"
+version = "0.1.0"
+dependencies = [
+ "axum 0.7.5",
+ "metrics",
+ "metrics-exporter-prometheus",
+ "tokio",
+]
+
 [[package]]
 name = "concurrent-queue"
 version = "2.5.0"
@@ -819,6 +839,80 @@ dependencies = [
 "typenum",
]

+[[package]]
+name = "cyclotron-core"
+version = "0.1.0"
+dependencies = [
+ "chrono",
+ "futures",
+ "rand",
+ "serde",
+ "sqlx",
+ "thiserror",
+ "tokio",
+ "uuid",
+]
+
+[[package]]
+name = "cyclotron-fetch"
+version = "0.1.0"
+dependencies = [
+ "axum 0.7.5",
+ "chrono",
+ "common-dns",
+ "common-metrics",
+ "cyclotron-core",
+ "envconfig",
+ "futures",
+ "health",
+ "http 1.1.0",
+ "httpmock",
+ "metrics",
+ "rand",
+ "reqwest 0.12.3",
+ "serde",
+ "serde_json",
+ "sqlx",
+ "thiserror",
+ "tokio",
+ "tracing",
+ "tracing-subscriber",
+ "uuid",
+]
+
+[[package]]
+name = "cyclotron-janitor"
+version = "0.1.0"
+dependencies = [
+ "axum 0.7.5",
+ "chrono",
+ "common-metrics",
+ "cyclotron-core",
+ "envconfig",
+ "eyre",
+ "health",
+ "metrics",
+ "sqlx",
+ "tokio",
+ "tracing",
+ "tracing-subscriber",
+ "uuid",
+]
+
+[[package]]
+name = "cyclotron-node"
+version = "0.1.0"
+dependencies = [
+ "chrono",
+ "cyclotron-core",
+ "neon",
+ "once_cell",
+ "serde",
+ "serde_json",
+ "tokio",
+ "uuid",
+]
+
 [[package]]
 name = "dashmap"
 version = "5.5.3"
@@ -1468,6 +1562,7 @@ name = "hook-api"
 version = "0.1.0"
 dependencies = [
  "axum 0.7.5",
+ "common-metrics",
 "envconfig",
 "eyre",
 "hook-common",
@@ -1489,13 +1584,10 @@ name = "hook-common"
 version = "0.1.0"
 dependencies = [
  "async-trait",
  "axum 0.7.5",
  "chrono",
  "envconfig",
  "health",
  "http 1.1.0",
  "metrics",
  "metrics-exporter-prometheus",
  "rdkafka",
  "reqwest 0.12.3",
  "serde",
@@ -1514,6 +1606,7 @@ version = "0.1.0"
 dependencies = [
  "async-trait",
  "axum 0.7.5",
+ "common-metrics",
 "envconfig",
 "eyre",
 "futures",
@@ -1537,6 +1630,8 @@ version = "0.1.0"
 dependencies = [
  "axum 0.7.5",
  "chrono",
+ "common-dns",
+ "common-metrics",
 "envconfig",
 "futures",
 "health",
@@ -1944,6 +2039,16 @@ version = "0.2.153"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd"

+[[package]]
+name = "libloading"
+version = "0.8.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4979f22fdb869068da03c9f7528f8297c6fd2606bc3a4affe42e6a823fdb8da4"
+dependencies = [
+ "cfg-if",
+ "windows-targets 0.52.0",
+]
+
 [[package]]
 name = "libm"
 version = "0.2.8"
@@ -2160,6 +2265,32 @@ dependencies = [
 "tempfile",
]

+[[package]]
+name = "neon"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7d75440242411c87dc39847b0e33e961ec1f10326a9d8ecf9c1ea64a3b3c13dc"
+dependencies = [
+ "getrandom",
+ "libloading",
+ "neon-macros",
+ "once_cell",
+ "semver",
+ "send_wrapper",
+ "smallvec",
+]
+
+[[package]]
+name = "neon-macros"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c6813fde79b646e47e7ad75f480aa80ef76a5d9599e2717407961531169ee38b"
+dependencies = [
+ "quote",
+ "syn 2.0.48",
+ "syn-mid",
+]
+
 [[package]]
 name = "new_debug_unreachable"
 version = "1.0.6"
@@ -3181,6 +3312,18 @@ dependencies = [
 "libc",
]

+[[package]]
+name = "semver"
+version = "1.0.23"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "61697e0a1c7e512e84a621326239844a24d8207b4669b41bc18b32ea5cbf988b"
+
+[[package]]
+name = "send_wrapper"
+version = "0.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cd0b0ec5f1c1ca621c432a25813d8d60c88abe6d3e08a3eb9cf37d97a0fe3d73"
+
 [[package]]
 name = "serde"
 version = "1.0.196"
@@ -3660,6 +3803,17 @@ dependencies = [
 "unicode-ident",
]

+[[package]]
+name = "syn-mid"
+version = "0.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b5dc35bb08dd1ca3dfb09dce91fd2d13294d6711c88897d9a9d60acf39bce049"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.48",
+]
+
 [[package]]
 name = "sync_wrapper"
 version = "0.1.2"
@@ -4,11 +4,17 @@ resolver = "2"
 members = [
     "capture",
     "common/health",
+    "common/metrics",
+    "common/dns",
     "feature-flags",
     "hook-api",
     "hook-common",
     "hook-janitor",
     "hook-worker",
+    "cyclotron-core",
+    "cyclotron-node",
+    "cyclotron-janitor",
+    "cyclotron-fetch",
 ]

 [workspace.lints.rust]
@@ -34,7 +40,7 @@ axum = { version = "0.7.5", features = ["http2", "macros", "matched-path"] }
 axum-client-ip = "0.6.0"
 base64 = "0.22.0"
 bytes = "1"
-chrono = { version = "0.4" }
+chrono = { version = "0.4", features = ["default", "serde"] }
 envconfig = "0.10.0"
 eyre = "0.6.9"
 flate2 = "1.0"
@@ -80,3 +86,4 @@ tracing-opentelemetry = "0.23.0"
 tracing-subscriber = { version = "0.3.18", features = ["env-filter"] }
 url = { version = "2.5.0 " }
 uuid = { version = "1.6.1", features = ["v7", "serde"] }
+neon = "1"
@@ -1,4 +1,4 @@
-FROM docker.io/lukemathwalker/cargo-chef:latest-rust-1.77-bookworm AS chef
+FROM docker.io/lukemathwalker/cargo-chef:latest-rust-1.80.1-bookworm AS chef
 ARG BIN
 WORKDIR /app

rust/common/dns/Cargo.toml (new file): 12 lines
@@ -0,0 +1,12 @@
[package]
name = "common-dns"
version = "0.1.0"
edition = "2021"

[lints]
workspace = true

[dependencies]
futures = { workspace = true }
reqwest = { workspace = true }
tokio = { workspace = true }
@@ -86,7 +86,7 @@ impl Resolve for PublicIPv4Resolver {

 #[cfg(test)]
 mod tests {
-    use crate::dns::{NoPublicIPv4Error, PublicIPv4Resolver};
+    use crate::{NoPublicIPv4Error, PublicIPv4Resolver};
     use reqwest::dns::{Name, Resolve};
     use std::str::FromStr;

@@ -4,7 +4,7 @@ use std::sync::{Arc, RwLock};

 use axum::http::StatusCode;
 use axum::response::{IntoResponse, Response};
-use time::Duration;
+use std::time::Duration;
 use tokio::sync::mpsc;
 use tracing::{info, warn};

@@ -143,7 +143,16 @@ impl HealthRegistry {

     /// Registers a new component in the registry. The returned handle should be passed
     /// to the component, to allow it to frequently report its health status.
-    pub async fn register(&self, component: String, deadline: Duration) -> HealthHandle {
+    pub async fn register<D>(&self, component: String, deadline: D) -> HealthHandle
+    where
+        // HACK: to let callers use time::Duration or std::time::Duration (and therefore chrono::Duration),
+        // since apparently we use all three
+        D: TryInto<Duration>,
+    {
+        let Ok(deadline) = deadline.try_into() else {
+            // TODO - I should return an error here, but I don't want to refactor everything that uses this right now
+            panic!("invalid deadline")
+        };
         let handle = HealthHandle {
             component,
             deadline,
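A hypothetical pair of call sites showing what the relaxed `TryInto<Duration>` bound accepts; the component names here are illustrative, not from this commit:

```rust
use std::time::Duration as StdDuration;
use time::Duration as TimeDuration;

// Sketch only: assumes a HealthRegistry from the `health` crate as modified above.
async fn register_components(registry: &health::HealthRegistry) {
    // std::time::Duration satisfies TryInto<std::time::Duration> through the
    // blanket reflexive conversion, so this call is infallible in practice.
    let _fetch = registry
        .register("cyclotron-fetch".to_string(), StdDuration::from_secs(30))
        .await;

    // time::Duration converts fallibly (negative durations fail), which is the
    // case the `panic!("invalid deadline")` above guards against.
    let _janitor = registry
        .register("cyclotron-janitor".to_string(), TimeDuration::seconds(30))
        .await;
}
```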
rust/common/metrics/Cargo.toml (new file): 13 lines
@@ -0,0 +1,13 @@
[package]
name = "common-metrics"
version = "0.1.0"
edition = "2021"

[lints]
workspace = true

[dependencies]
axum = { workspace = true }
metrics-exporter-prometheus = { workspace = true }
tokio = { workspace = true }
metrics = { workspace = true }
rust/common/metrics/README.md (new file): 1 line
@@ -0,0 +1 @@
Ripped from rusty-hook, since it'll be used across more or less all cyclotron stuff, as well as rustyhook
New sqlx offline query cache files for a second crate. One query is new here:

@@ -0,0 +1,18 @@
{
  "db_name": "PostgreSQL",
  "query": "SELECT COUNT(*) FROM cyclotron_jobs WHERE state = 'available' AND scheduled <= NOW()",
  "describe": {
    "columns": [{ "ordinal": 0, "name": "count", "type_info": "Int8" }],
    "parameters": { "Left": [] },
    "nullable": [null]
  },
  "hash": "213e9d70e145a01fb42d5c3a80f9126073113a4af03c4c9fd3a81004d898f883"
}

The remaining query files in this group are byte-for-byte copies of the query files listed earlier in this commit, with the same hashes: 075421be, 16d533b5, 2b62adf4, 2f6de097, 350983ef, 54d9afe6, 7217e766, 884da976, 8ab11a89, 98da1f12, aa595eaf, b160b785, b3239c1d, c6242615, f4e808f5, fdda5a80, ffb66bde.
rust/cyclotron-core/Cargo.toml (new file): 17 lines
@@ -0,0 +1,17 @@
[package]
name = "cyclotron-core"
version = "0.1.0"
edition = "2021"

[lints]
workspace = true

[dependencies]
serde = { workspace = true }
sqlx = { workspace = true }
chrono = { workspace = true }
tokio = { workspace = true }
thiserror = { workspace = true }
uuid = { workspace = true }
rand = { workspace = true }
futures = { workspace = true }
@ -0,0 +1,102 @@
|
||||
CREATE TYPE JobState AS ENUM(
|
||||
'available',
|
||||
'completed',
|
||||
'failed',
|
||||
'running',
|
||||
'paused'
|
||||
);
|
||||
|
||||
|
||||
---------------------------------------------------------------------
|
||||
-- Job table
|
||||
---------------------------------------------------------------------
|
||||
-- When a job is dequeued, it is locked by generating a UUID and returning it to the dequeuing
|
||||
-- worker. Any worker that can't provide the correct lock_id when updating will have their updates
|
||||
-- rejected. The reason this is important is because if, e.g., a worker holds a job in a running
|
||||
-- state without updating the heartbeat, the janitor will return the job to the queue eventually,
|
||||
-- and if the worker /then/ tries to update the job after another worker has picked it up, that's a
|
||||
-- race. We track transition count and times alongside lock_id's and heartbeats for reporting and
|
||||
-- debugging purposes, and we track the number of times the janitor has touched a job to spot poison
|
||||
-- pills.
|
||||
CREATE TABLE IF NOT EXISTS cyclotron_jobs (
|
||||
---------------------------------------------------------------------
|
||||
-- Job metadata
|
||||
---------------------------------------------------------------------
|
||||
id UUID PRIMARY KEY,
|
||||
    team_id INT NOT NULL,
    function_id UUID,
    created TIMESTAMPTZ NOT NULL,
    ---------------------------------------------------------------------
    -- Queue bookkeeping - invisible to the worker
    ---------------------------------------------------------------------
    lock_id UUID,
    -- This is set when a job is in a running state, and is required to update the job.
    last_heartbeat TIMESTAMPTZ,
    -- This is updated by the worker to indicate that the job is making forward progress even
    -- without transitions (and should not be reaped)
    janitor_touch_count SMALLINT NOT NULL,
    transition_count SMALLINT NOT NULL,
    last_transition TIMESTAMPTZ NOT NULL,
    ---------------------------------------------------------------------
    -- Queue components - determines which workers will consume this job
    ---------------------------------------------------------------------
    queue_name TEXT NOT NULL,
    ---------------------------------------------------------------------
    -- Job availability and priority (can this job be dequeued, and in what order?)
    ---------------------------------------------------------------------
    state JobState NOT NULL,
    scheduled TIMESTAMPTZ NOT NULL,
    priority SMALLINT NOT NULL,
    ---------------------------------------------------------------------
    -- Job data
    ---------------------------------------------------------------------
    vm_state TEXT,
    -- This is meant for workers "talking to themselves", e.g. tracking retries or something
    metadata TEXT,
    -- This is meant for "the next guy" - hog might fill it with a URL to fetch, for example
    parameters TEXT
);

-- For a given worker, the set of "available" jobs depends on state, queue_name, and scheduled (so
-- we can exclude sleeping jobs). This index is partial, because we don't care about other states
-- for the purpose of dequeuing.
CREATE INDEX idx_cyclotron_jobs_dequeue ON cyclotron_jobs (queue_name, state, scheduled, priority)
WHERE
    state = 'available';

-- We create simple indexes on team_id, function_id and queue_name to support fast joins to future
-- control tables.
CREATE INDEX idx_queue_team_id ON cyclotron_jobs(team_id);

CREATE INDEX idx_queue_function_id ON cyclotron_jobs(function_id);

CREATE INDEX idx_queue_queue_name ON cyclotron_jobs(queue_name);


---------------------------------------------------------------------
-- Control tables
---------------------------------------------------------------------


-- These are just a starting point, supporting overriding the state for a given team, function or queue.
-- For now these are entirely unused.
CREATE TABLE IF NOT EXISTS cyclotron_team_control (
    team_id INT PRIMARY KEY,
    state_override JobState,
    -- If this is not null, it overrides the state of all jobs for this team (allowing for e.g. pausing or force-failing all of a team's jobs)
    state_override_expires TIMESTAMPTZ -- State override can be temporary or permanent
);

CREATE TABLE IF NOT EXISTS cyclotron_function_control (
    function_id UUID PRIMARY KEY,
    state_override JobState,
    -- If this is not null, it overrides the state of all jobs for this function (allowing for e.g. pausing or force-failing all of a function's jobs)
    state_override_expires TIMESTAMPTZ -- State override can be temporary or permanent
);

CREATE TABLE IF NOT EXISTS cyclotron_queue_control (
    queue_name TEXT PRIMARY KEY,
    state_override JobState,
    -- If this is not null, it overrides the state of all jobs for this queue (allowing for e.g. pausing or force-failing all of a queue's jobs)
    state_override_expires TIMESTAMPTZ -- State override can be temporary or permanent
);
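
An illustrative sketch of how these control tables are meant to be used once wired up - nothing below is part of the migration, and the team id is a made-up example value:

-- Hedged sketch only: temporarily pause every job belonging to team 42,
-- assuming 'paused' exists on the JobState enum created earlier in this migration.
-- Clearing the override early is just a DELETE on the same row.
INSERT INTO cyclotron_team_control (team_id, state_override, state_override_expires)
VALUES (42, 'paused', NOW() + INTERVAL '1 hour')
ON CONFLICT (team_id) DO UPDATE
SET state_override = EXCLUDED.state_override,
    state_override_expires = EXCLUDED.state_override_expires;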
697
rust/cyclotron-core/src/base_ops.rs
Normal file
@ -0,0 +1,697 @@
//! # PgQueue
//!
//! A job queue implementation backed by a PostgreSQL table.

use std::str::FromStr;

use chrono::{self, DateTime, Utc};
use serde::{self, Deserialize, Serialize};
use sqlx::{
    postgres::{PgArguments, PgHasArrayType, PgQueryResult, PgTypeInfo},
    query::Query,
};
use uuid::Uuid;

use crate::error::QueueError;

#[derive(Debug, Deserialize, Serialize, sqlx::Type)]
#[serde(rename_all = "lowercase")]
#[sqlx(type_name = "JobState", rename_all = "lowercase")]
pub enum JobState {
    Available,
    Running,
    Completed,
    Failed,
    Paused,
}

impl FromStr for JobState {
    type Err = ();

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s {
            "available" => Ok(JobState::Available),
            "running" => Ok(JobState::Running),
            "completed" => Ok(JobState::Completed),
            "failed" => Ok(JobState::Failed),
            "paused" => Ok(JobState::Paused),
            _ => Err(()),
        }
    }
}

impl PgHasArrayType for JobState {
    fn array_type_info() -> sqlx::postgres::PgTypeInfo {
        // Postgres default naming convention for array types is "_typename"
        PgTypeInfo::with_name("_JobState")
    }
}

// The chunk of data needed to enqueue a job
#[derive(Debug, Deserialize, Serialize, Clone, Eq, PartialEq)]
pub struct JobInit {
    pub team_id: i32,
    pub queue_name: String,
    pub priority: i16,
    pub scheduled: DateTime<Utc>,
    pub function_id: Option<Uuid>,
    pub vm_state: Option<String>,
    pub parameters: Option<String>,
    pub metadata: Option<String>,
}

// TODO - there are certain things we might want to be on a per-team basis here... the ability to say
// "do not process any jobs for this team" independent of doing an operation on the job table seems powerful,
// but that requires a distinct team table. For now, I'm just making a note that it's something we might
// want (the command to modify the treatment of all jobs associated with a team should only need to be issued and
// processed /once/, not once per job, and should apply to all jobs both currently queued and any future ones). This
// can be added in a progressive way (by adding joins and clauses to the dequeue query), so we don't need to worry about
// it too much up front.
#[derive(Debug, Deserialize, Serialize)]
pub struct Job {
    // Job metadata
    pub id: Uuid,
    pub team_id: i32,
    pub function_id: Option<Uuid>, // Some jobs might not come from hog, and it doesn't /kill/ us to support that
    pub created: DateTime<Utc>,

    // Queue bookkeeping
    // This will be set for any worker that ever has a job in the "running" state (so any worker that dequeues a job),
    // but I don't want to do the work to encode that in the type system right now - later it should be
    pub lock_id: Option<Uuid>,
    pub last_heartbeat: Option<DateTime<Utc>>,
    pub janitor_touch_count: i16,
    pub transition_count: i16,
    pub last_transition: DateTime<Utc>,

    // Virtual queue components
    pub queue_name: String, // We can have multiple "virtual queues" workers pull from

    // Job availability
    pub state: JobState,
    pub priority: i16, // For sorting "available" jobs. Lower is higher priority
    pub scheduled: DateTime<Utc>,

    // Job data
    pub vm_state: Option<String>, // The state of the VM this job is running on (if it exists)
    pub metadata: Option<String>, // Additional fields a worker can tack onto a job, e.g. for tracking state across retries (or the number of retries in general by a given class of worker)
    pub parameters: Option<String>, // The actual parameters of the job (function args for a hog function, http request for a fetch function)
}

pub async fn create_job<'c, E>(executor: E, data: JobInit) -> Result<(), QueueError>
where
    E: sqlx::Executor<'c, Database = sqlx::Postgres>,
{
    let id = Uuid::now_v7();
    sqlx::query!(
        r#"
INSERT INTO cyclotron_jobs
    (
        id,
        team_id,
        function_id,
        created,
        lock_id,
        last_heartbeat,
        janitor_touch_count,
        transition_count,
        last_transition,
        queue_name,
        state,
        scheduled,
        priority,
        vm_state,
        metadata,
        parameters
    )
VALUES
    ($1, $2, $3, NOW(), NULL, NULL, 0, 0, NOW(), $4, $5, $6, $7, $8, $9, $10)
        "#,
        id,
        data.team_id,
        data.function_id,
        data.queue_name,
        JobState::Available as _,
        data.scheduled,
        data.priority,
        data.vm_state,
        data.metadata,
        data.parameters
    )
    .execute(executor)
    .await?;

    Ok(())
}

pub async fn bulk_create_jobs<'c, E>(executor: E, jobs: &[JobInit]) -> Result<(), QueueError>
where
    E: sqlx::Executor<'c, Database = sqlx::Postgres>,
{
    let now = Utc::now();
    // Flatten these jobs into a series of vecs of arguments PG can unnest
    let mut ids = Vec::with_capacity(jobs.len());
    let mut team_ids = Vec::with_capacity(jobs.len());
    let mut function_ids = Vec::with_capacity(jobs.len());
    let mut created_at = Vec::with_capacity(jobs.len());
    let mut lock_ids = Vec::with_capacity(jobs.len());
    let mut last_heartbeats = Vec::with_capacity(jobs.len());
    let mut janitor_touch_counts = Vec::with_capacity(jobs.len());
    let mut transition_counts = Vec::with_capacity(jobs.len());
    let mut last_transitions = Vec::with_capacity(jobs.len());
    let mut queue_names = Vec::with_capacity(jobs.len());
    let mut states = Vec::with_capacity(jobs.len());
    let mut scheduleds = Vec::with_capacity(jobs.len());
    let mut priorities = Vec::with_capacity(jobs.len());
    let mut vm_states = Vec::with_capacity(jobs.len());
    let mut metadatas = Vec::with_capacity(jobs.len());
    let mut parameters = Vec::with_capacity(jobs.len());

    for d in jobs {
        ids.push(Uuid::now_v7());
        team_ids.push(d.team_id);
        function_ids.push(d.function_id);
        created_at.push(now);
        lock_ids.push(None::<Uuid>);
        last_heartbeats.push(None::<DateTime<Utc>>);
        janitor_touch_counts.push(0);
        transition_counts.push(0);
        last_transitions.push(now);
        queue_names.push(d.queue_name.clone());
        states.push(JobState::Available);
        scheduleds.push(d.scheduled);
        priorities.push(d.priority);
        vm_states.push(d.vm_state.clone());
        metadatas.push(d.metadata.clone());
        parameters.push(d.parameters.clone());
    }

    // Using the "unnest" function to turn an array of rows into a set of rows
    sqlx::query(
        r#"
INSERT INTO cyclotron_jobs
    (
        id,
        team_id,
        function_id,
        created,
        lock_id,
        last_heartbeat,
        janitor_touch_count,
        transition_count,
        last_transition,
        queue_name,
        state,
        scheduled,
        priority,
        vm_state,
        metadata,
        parameters
    )
SELECT *
FROM UNNEST(
        $1,
        $2,
        $3,
        $4,
        $5,
        $6,
        $7,
        $8,
        $9,
        $10,
        $11,
        $12,
        $13,
        $14,
        $15,
        $16
    )
        "#,
    )
    .bind(ids)
    .bind(team_ids)
    .bind(function_ids)
    .bind(created_at)
    .bind(lock_ids)
    .bind(last_heartbeats)
    .bind(janitor_touch_counts)
    .bind(transition_counts)
    .bind(last_transitions)
    .bind(queue_names)
    .bind(states)
    .bind(scheduleds)
    .bind(priorities)
    .bind(vm_states)
    .bind(metadatas)
    .bind(parameters)
    .execute(executor)
    .await?;

    Ok(())
}

// Dequeue the next job batch from the queue, skipping VM state since it can be large
pub async fn dequeue_jobs<'c, E>(
    executor: E,
    queue: &str,
    max: usize,
) -> Result<Vec<Job>, QueueError>
where
    E: sqlx::Executor<'c, Database = sqlx::Postgres>,
{
    // TODO - right now, locks are completely transient. We could instead have the lock_id act like a
    // "worker_id", and be provided by the caller, which would let workers do less bookkeeping, and make
    // some kinds of debugging easier, but I prefer locks being opaque to workers for now, to avoid any
    // confusion or potential for accidental deadlocking (e.g. if someone persisted the worker_id across
    // process restarts).
    let lock_id = Uuid::now_v7();
    Ok(sqlx::query_as!(
        Job,
        r#"
WITH available AS (
    SELECT
        id,
        state
    FROM cyclotron_jobs
    WHERE
        state = 'available'::JobState
        AND queue_name = $1
        AND scheduled <= NOW()
    ORDER BY
        priority ASC,
        scheduled ASC
    LIMIT $2
    FOR UPDATE SKIP LOCKED
)
UPDATE cyclotron_jobs
SET
    state = 'running'::JobState,
    lock_id = $3,
    last_heartbeat = NOW(),
    last_transition = NOW(),
    transition_count = transition_count + 1
FROM available
WHERE
    cyclotron_jobs.id = available.id
RETURNING
    cyclotron_jobs.id,
    team_id,
    available.state as "state: JobState",
    queue_name,
    priority,
    function_id,
    created,
    last_transition,
    scheduled,
    transition_count,
    NULL as vm_state,
    metadata,
    parameters,
    lock_id,
    last_heartbeat,
    janitor_touch_count
        "#,
        queue,
        max as i64,
        lock_id
    )
    .fetch_all(executor)
    .await?)
}

// Dequeue a batch of jobs, also returning their VM state. This is an optimisation - you could
// dequeue a batch of jobs and then fetch their VM state in a separate query, but this is hopefully less
// heavy on the DB, if a given worker knows it needs VM state for all dequeued jobs.
pub async fn dequeue_with_vm_state<'c, E>(
    executor: E,
    queue: &str,
    max: usize,
) -> Result<Vec<Job>, QueueError>
where
    E: sqlx::Executor<'c, Database = sqlx::Postgres>,
{
    let lock_id = Uuid::now_v7();
    Ok(sqlx::query_as!(
        Job,
        r#"
WITH available AS (
    SELECT
        id,
        state
    FROM cyclotron_jobs
    WHERE
        state = 'available'::JobState
        AND queue_name = $1
        AND scheduled <= NOW()
    ORDER BY
        priority ASC,
        scheduled ASC
    LIMIT $2
    FOR UPDATE SKIP LOCKED
)
UPDATE cyclotron_jobs
SET
    state = 'running'::JobState,
    lock_id = $3,
    last_heartbeat = NOW(),
    last_transition = NOW(),
    transition_count = transition_count + 1
FROM available
WHERE
    cyclotron_jobs.id = available.id
RETURNING
    cyclotron_jobs.id,
    team_id,
    available.state as "state: JobState",
    queue_name,
    priority,
    function_id,
    created,
    last_transition,
    scheduled,
    transition_count,
    vm_state,
    metadata,
    parameters,
    lock_id,
    last_heartbeat,
    janitor_touch_count
        "#,
        queue,
        max as i64,
        lock_id
    )
    .fetch_all(executor)
    .await?)
}

// Grab a job's VM state - for workers that might sometimes need a job's vm state, but not always,
// this lets them use dequeue_jobs, and then fetch the states they need. VM state can only be retrieved
// by workers holding a job lock.
pub async fn get_vm_state<'c, E>(
    executor: E,
    job_id: Uuid,
    lock_id: Uuid,
) -> Result<Option<String>, QueueError>
where
    E: sqlx::Executor<'c, Database = sqlx::Postgres>,
{
    struct VMState {
        vm_state: Option<String>,
    }

    // We use fetch_one here because giving us an unknown ID is an error
    let res = sqlx::query_as!(
        VMState,
        "SELECT vm_state FROM cyclotron_jobs WHERE id = $1 AND lock_id = $2",
        job_id,
        lock_id
    )
    .fetch_one(executor)
    .await?;

    Ok(res.vm_state)
}

// A struct representing a set of updates for a job. Outer None values mean "don't update this field",
// with nested None values meaning "set this field to null" for nullable fields.
#[derive(Debug, Deserialize, Serialize)]
pub struct JobUpdate {
    pub lock_id: Uuid, // The ID of the lock acquired when this worker dequeued the job, required for any update to be valid
    pub state: Option<JobState>,
    pub queue_name: Option<String>,
    pub priority: Option<i16>,
    pub scheduled: Option<DateTime<Utc>>,
    pub vm_state: Option<Option<String>>,
    pub metadata: Option<Option<String>>,
    pub parameters: Option<Option<String>>,
}

impl JobUpdate {
    pub fn new(lock_id: Uuid) -> Self {
        Self {
            lock_id,
            state: None,
            queue_name: None,
            priority: None,
            scheduled: None,
            vm_state: None,
            metadata: None,
            parameters: None,
        }
    }
}
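
// Illustrative sketch only (not part of this file): how the two Option layers of
// JobUpdate compose. The outer Option means "should this column be written at all";
// the inner Option is the nullable column value itself.
fn example_updates(lock_id: Uuid) -> (JobUpdate, JobUpdate, JobUpdate) {
    let untouched = JobUpdate::new(lock_id); // metadata: None - column left alone
    let mut cleared = JobUpdate::new(lock_id);
    cleared.metadata = Some(None); // column set to NULL
    let mut written = JobUpdate::new(lock_id);
    written.metadata = Some(Some("retries=2".to_string())); // column set to a value
    (untouched, cleared, written)
}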

// TODO - I should think about a bulk-flush interface at /some/ point, although we expect jobs to be
// high variance with respect to work time, so maybe that wouldn't be that useful in the end.
// TODO - this isn't the cheapest way to update a row in a table... I could probably do better by instead
// using a query builder, but I wanted sqlx's nice macro handling, at least while iterating on the schema.
// If/when we start hitting perf issues, this is a good place to start.
// NOTE - this function permits multiple flushes to the same job, without losing the lock on it, but
// high-level implementations are recommended to avoid this - ideally, for every de/requeue, there should be
// exactly 2 database operations.
pub async fn flush_job<'c, C>(
    connection: &mut C,
    job_id: Uuid,
    updates: JobUpdate,
) -> Result<(), QueueError>
where
    C: sqlx::Connection<Database = sqlx::Postgres>,
{
    let mut txn = connection.begin().await?;

    // Flushing any job state except "running" is a signal that the worker no longer holds this job
    let job_returned = !matches!(updates.state, Some(JobState::Running));
    let lock_id = updates.lock_id;

    if let Some(state) = updates.state {
        set_state(&mut *txn, job_id, updates.lock_id, state).await?;
    }

    if let Some(queue_name) = updates.queue_name {
        set_queue(&mut *txn, job_id, &queue_name, lock_id).await?;
    }

    if let Some(priority) = updates.priority {
        set_priority(&mut *txn, job_id, lock_id, priority).await?;
    }

    if let Some(scheduled) = updates.scheduled {
        set_scheduled(&mut *txn, job_id, scheduled, lock_id).await?;
    }

    if let Some(vm_state) = updates.vm_state {
        set_vm_state(&mut *txn, job_id, vm_state, lock_id).await?;
    }

    if let Some(metadata) = updates.metadata {
        set_metadata(&mut *txn, job_id, metadata, lock_id).await?;
    }

    if let Some(parameters) = updates.parameters {
        set_parameters(&mut *txn, job_id, parameters, lock_id).await?;
    }

    // Calling flush indicates forward progress, so we should touch the heartbeat
    set_heartbeat(&mut *txn, job_id, lock_id).await?;

    // We do this here, instead of in the set_state call, because otherwise the lock_id passed to other
    // updates would be invalid
    if job_returned {
        let query = sqlx::query!(
            "UPDATE cyclotron_jobs SET lock_id = NULL, last_heartbeat = NULL WHERE id = $1 AND lock_id = $2",
            job_id,
            lock_id
        );
        assert_does_update(&mut *txn, job_id, lock_id, query).await?;
    }

    txn.commit().await?;

    Ok(())
}

// Simple wrapper that just executes a query and throws an error if no rows were affected
async fn assert_does_update<'c, E>(
    executor: E,
    job_id: Uuid,
    lock_id: Uuid,
    query: Query<'_, sqlx::Postgres, PgArguments>,
) -> Result<(), QueueError>
where
    E: sqlx::Executor<'c, Database = sqlx::Postgres>,
{
    let res = query.execute(executor).await?;
    throw_if_no_rows(res, job_id, lock_id)
}

// Most of the rest of these functions are designed to be used as part of larger transactions, e.g.
// "completing" a job means updating various rows and then marking it complete, and we can build that
// by composing a set of individual queries together using a transaction.
// Update the state of a job, also tracking the transition count and last transition time
pub async fn set_state<'c, E>(
    executor: E,
    job_id: Uuid,
    lock_id: Uuid,
    state: JobState,
) -> Result<(), QueueError>
where
    E: sqlx::Executor<'c, Database = sqlx::Postgres>,
{
    let q = sqlx::query!(
        r#"UPDATE cyclotron_jobs
            SET state = $1, last_transition = NOW(), transition_count = transition_count + 1
            WHERE id = $2 AND lock_id = $3"#,
        state as _,
        job_id,
        lock_id
    );

    assert_does_update(executor, job_id, lock_id, q).await
}

pub async fn set_queue<'c, E>(
    executor: E,
    job_id: Uuid,
    queue: &str,
    lock_id: Uuid,
) -> Result<(), QueueError>
where
    E: sqlx::Executor<'c, Database = sqlx::Postgres>,
{
    let q = sqlx::query!(
        "UPDATE cyclotron_jobs SET queue_name = $1 WHERE id = $2 AND lock_id = $3",
        queue,
        job_id,
        lock_id
    );
    assert_does_update(executor, job_id, lock_id, q).await
}

pub async fn set_priority<'c, E>(
    executor: E,
    job_id: Uuid,
    lock_id: Uuid,
    priority: i16,
) -> Result<(), QueueError>
where
    E: sqlx::Executor<'c, Database = sqlx::Postgres>,
{
    let q = sqlx::query!(
        "UPDATE cyclotron_jobs SET priority = $1 WHERE id = $2 AND lock_id = $3",
        priority,
        job_id,
        lock_id
    );
    assert_does_update(executor, job_id, lock_id, q).await
}

pub async fn set_scheduled<'c, E>(
    executor: E,
    job_id: Uuid,
    scheduled: DateTime<Utc>,
    lock_id: Uuid,
) -> Result<(), QueueError>
where
    E: sqlx::Executor<'c, Database = sqlx::Postgres>,
{
    let q = sqlx::query!(
        "UPDATE cyclotron_jobs SET scheduled = $1 WHERE id = $2 AND lock_id = $3",
        scheduled,
        job_id,
        lock_id
    );
    assert_does_update(executor, job_id, lock_id, q).await
}

pub async fn set_vm_state<'c, E>(
    executor: E,
    job_id: Uuid,
    vm_state: Option<String>,
    lock_id: Uuid,
) -> Result<(), QueueError>
where
    E: sqlx::Executor<'c, Database = sqlx::Postgres>,
{
    let q = sqlx::query!(
        "UPDATE cyclotron_jobs SET vm_state = $1 WHERE id = $2 AND lock_id = $3",
        vm_state,
        job_id,
        lock_id
    );
    assert_does_update(executor, job_id, lock_id, q).await
}

pub async fn set_metadata<'c, E>(
    executor: E,
    job_id: Uuid,
    metadata: Option<String>,
    lock_id: Uuid,
) -> Result<(), QueueError>
where
    E: sqlx::Executor<'c, Database = sqlx::Postgres>,
{
    let q = sqlx::query!(
        "UPDATE cyclotron_jobs SET metadata = $1 WHERE id = $2 AND lock_id = $3",
        metadata,
        job_id,
        lock_id
    );
    assert_does_update(executor, job_id, lock_id, q).await
}

pub async fn set_parameters<'c, E>(
    executor: E,
    job_id: Uuid,
    parameters: Option<String>,
    lock_id: Uuid,
) -> Result<(), QueueError>
where
    E: sqlx::Executor<'c, Database = sqlx::Postgres>,
{
    let q = sqlx::query!(
        "UPDATE cyclotron_jobs SET parameters = $1 WHERE id = $2 AND lock_id = $3",
        parameters,
        job_id,
        lock_id
    );
    assert_does_update(executor, job_id, lock_id, q).await
}

pub async fn set_heartbeat<'c, E>(
    executor: E,
    job_id: Uuid,
    lock_id: Uuid,
) -> Result<(), QueueError>
where
    E: sqlx::Executor<'c, Database = sqlx::Postgres>,
{
    let q = sqlx::query!(
        "UPDATE cyclotron_jobs SET last_heartbeat = NOW() WHERE id = $1 AND lock_id = $2",
        job_id,
        lock_id
    );
    assert_does_update(executor, job_id, lock_id, q).await
}

pub async fn count_total_waiting_jobs<'c, E>(executor: E) -> Result<u64, QueueError>
where
    E: sqlx::Executor<'c, Database = sqlx::Postgres>,
{
    let res = sqlx::query!(
        "SELECT COUNT(*) FROM cyclotron_jobs WHERE state = 'available' AND scheduled <= NOW()",
    )
    .fetch_one(executor)
    .await?;

    let res = res.count.unwrap_or(0);
    Ok(res as u64)
}

fn throw_if_no_rows(res: PgQueryResult, job: Uuid, lock: Uuid) -> Result<(), QueueError> {
    if res.rows_affected() == 0 {
        Err(QueueError::InvalidLock(lock, job))
    } else {
        Ok(())
    }
}
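
Taken together, these primitives form the whole job lifecycle. A minimal, hedged sketch of the intended call pattern follows - it is not part of the commit, and assumes the items above are in scope, a connected pool, and a made-up queue name:

// Hedged sketch: create a job, dequeue it, and flush a terminal state,
// using the base_ops primitives directly.
async fn lifecycle_sketch(pool: sqlx::PgPool) -> Result<(), QueueError> {
    let init = JobInit {
        team_id: 1,
        queue_name: "example".to_string(), // made-up queue name
        priority: 0,
        scheduled: Utc::now(),
        function_id: None,
        vm_state: None,
        parameters: None,
        metadata: None,
    };
    create_job(&pool, init).await?;

    for job in dequeue_jobs(&pool, "example", 1).await? {
        let lock_id = job.lock_id.expect("dequeued jobs always carry a lock");
        let mut update = JobUpdate::new(lock_id);
        update.state = Some(JobState::Completed); // must leave the running state
        let mut conn = pool.acquire().await?;
        flush_job(conn.as_mut(), job.id, update).await?;
    }
    Ok(())
}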
56
rust/cyclotron-core/src/bin/create_test_data.rs
Normal file
@ -0,0 +1,56 @@
use chrono::{Duration, Utc};
use cyclotron_core::{
    base_ops::JobInit,
    manager::{ManagerConfig, QueueManager},
    PoolConfig,
};
use uuid::Uuid;

// Just inserts jobs as fast as it can, choosing randomly between hog and fetch workers, and between
// different priorities. Prints every 100 jobs inserted.
#[tokio::main]
async fn main() {
    let pool_config = PoolConfig {
        db_url: "postgresql://posthog:posthog@localhost:5432/cyclotron".to_string(),
        max_connections: None,
        min_connections: None,
        acquire_timeout_seconds: None,
        max_lifetime_seconds: None,
        idle_timeout_seconds: None,
    };

    let manager_config = ManagerConfig {
        shards: vec![pool_config.clone()],
        shard_depth_limit: None,
        shard_depth_check_interval_seconds: None,
    };

    let manager = QueueManager::new(manager_config).await.unwrap();

    let now = Utc::now() - Duration::minutes(1);
    let start = Utc::now();
    let mut count = 0;
    loop {
        let queue = if rand::random() { "fetch" } else { "hog" };

        let priority = (rand::random::<u16>() % 3) as i16;

        let test_job = JobInit {
            team_id: 1,
            queue_name: queue.to_string(),
            priority,
            scheduled: now,
            function_id: Some(Uuid::now_v7()),
            vm_state: None,
            parameters: None,
            metadata: None,
        };

        manager.create_job(test_job).await.unwrap();

        count += 1;
        if count % 100 == 0 {
            println!("Elapsed: {:?}, count: {}", Utc::now() - start, count);
        }
    }
}
167
rust/cyclotron-core/src/bin/load_test.rs
Normal file
@ -0,0 +1,167 @@
use std::{
    sync::{atomic::AtomicUsize, Arc},
    time::Instant,
};

use chrono::{Duration, Utc};
use cyclotron_core::{
    base_ops::{JobInit, JobState},
    manager::{ManagerConfig, QueueManager},
    worker::Worker,
    PoolConfig,
};
use futures::future::join_all;
use uuid::Uuid;

// This spins up a manager and 2 workers, and tries to simulate semi-realistic load (on the DB - the workers do nothing except complete jobs):
// - The manager inserts jobs as fast as it can, choosing randomly between hog and fetch workers, and between different priorities.
// - The workers will process jobs as fast as they can, in batches of 1000.
// - The manager and both workers track how long each insert and dequeue takes, in ms/job.
// - The manager never inserts more than 10,000 more jobs than the workers have processed.
const INSERT_BATCH_SIZE: usize = 1000;

struct SharedContext {
    jobs_inserted: AtomicUsize,
    jobs_dequeued: AtomicUsize,
}

async fn producer_loop(manager: QueueManager, shared_context: Arc<SharedContext>) {
    let mut time_spent_inserting = Duration::zero();
    let now = Utc::now() - Duration::minutes(1);
    loop {
        let mut to_insert = Vec::with_capacity(INSERT_BATCH_SIZE);
        for _ in 0..INSERT_BATCH_SIZE {
            let queue = if rand::random() { "fetch" } else { "hog" };

            let priority = (rand::random::<u16>() % 3) as i16;

            let test_job = JobInit {
                team_id: 1,
                queue_name: queue.to_string(),
                priority,
                scheduled: now,
                function_id: Some(Uuid::now_v7()),
                vm_state: None,
                parameters: None,
                metadata: None,
            };

            to_insert.push(test_job);
        }

        let start = Instant::now();
        manager.bulk_create_jobs(to_insert).await;
        let elapsed = start.elapsed();
        time_spent_inserting += Duration::from_std(elapsed).unwrap();

        let inserted = shared_context
            .jobs_inserted
            .fetch_add(INSERT_BATCH_SIZE, std::sync::atomic::Ordering::Relaxed);

        println!("Inserted: {} in {}", inserted, time_spent_inserting);
        let mut dequeued = shared_context
            .jobs_dequeued
            .load(std::sync::atomic::Ordering::Relaxed);
        while inserted > dequeued + 10_000 {
            println!(
                "Waiting for workers to catch up, lagging by {}",
                inserted - dequeued
            );
            tokio::time::sleep(Duration::milliseconds(100).to_std().unwrap()).await;
            dequeued = shared_context
                .jobs_dequeued
                .load(std::sync::atomic::Ordering::Relaxed);
        }
    }
}

async fn worker_loop(worker: Worker, shared_context: Arc<SharedContext>, queue: &str) {
    let mut time_spent_dequeuing = Duration::zero();
    let start = Utc::now();
    loop {
        let loop_start = Instant::now();
        let jobs = worker.dequeue_jobs(queue, 1000).await.unwrap();

        if jobs.is_empty() {
            println!(
                "Worker {:?} outpacing inserts, got no jobs, sleeping!",
                queue
            );
            tokio::time::sleep(Duration::milliseconds(100).to_std().unwrap()).await;
            continue;
        }

        let mut futs = Vec::with_capacity(jobs.len());
        for job in &jobs {
            worker.set_state(job.id, JobState::Completed).unwrap();
            futs.push(worker.flush_job(job.id));
        }

        for res in join_all(futs).await {
            res.unwrap();
        }

        time_spent_dequeuing += Duration::from_std(loop_start.elapsed()).unwrap();

        let dequeued = shared_context
            .jobs_dequeued
            .fetch_add(jobs.len(), std::sync::atomic::Ordering::Relaxed);

        // To account for the batch we just handled
        let dequeued = dequeued + jobs.len();

        println!(
            "Dequeued, processed and completed {} jobs in {} for {:?}. Total time running: {}",
            dequeued,
            time_spent_dequeuing,
            queue,
            Utc::now() - start
        );

        if jobs.len() < 1000 {
            println!(
                "Worker {:?} outpacing manager, only got {} jobs, sleeping!",
                queue,
                jobs.len()
            );
            tokio::time::sleep(Duration::milliseconds(100).to_std().unwrap()).await;
        }
    }
}

#[tokio::main]
async fn main() {
    let pool_config = PoolConfig {
        db_url: "postgresql://posthog:posthog@localhost:5432/cyclotron".to_string(),
        max_connections: None,
        min_connections: None,
        acquire_timeout_seconds: None,
        max_lifetime_seconds: None,
        idle_timeout_seconds: None,
    };

    let manager_config = ManagerConfig {
        shards: vec![pool_config.clone()],
        shard_depth_limit: None,
        shard_depth_check_interval_seconds: None,
    };

    let shared_context = Arc::new(SharedContext {
        jobs_inserted: AtomicUsize::new(0),
        jobs_dequeued: AtomicUsize::new(0),
    });

    let manager = QueueManager::new(manager_config).await.unwrap();
    let worker_1 = Worker::new(pool_config.clone()).await.unwrap();
    let worker_2 = Worker::new(pool_config.clone()).await.unwrap();

    let producer = producer_loop(manager, shared_context.clone());
    let worker_1 = worker_loop(worker_1, shared_context.clone(), "fetch");
    let worker_2 = worker_loop(worker_2, shared_context.clone(), "hog");

    let producer = tokio::spawn(producer);
    let worker_1 = tokio::spawn(worker_1);
    let worker_2 = tokio::spawn(worker_2);

    tokio::try_join!(producer, worker_1, worker_2).unwrap();
}
17
rust/cyclotron-core/src/error.rs
Normal file
@ -0,0 +1,17 @@
use uuid::Uuid;

#[derive(Debug, thiserror::Error)]
pub enum QueueError {
    #[error("sqlx error: {0}")]
    SqlxError(#[from] sqlx::Error),
    #[error("Unknown job id: {0}")]
    UnknownJobId(Uuid), // Happens when someone tries to update a job that wasn't dequeued (or was already flushed) by this worker
    #[error("Job {0} flushed without a new state, which would leave it in a running state forever (or until reaped)")]
    FlushWithoutNextState(Uuid),
    #[error("Invalid lock {0} used to update job {1}. This usually means a job has been reaped from under a worker - did you forget to set the heartbeat?")]
    InvalidLock(Uuid, Uuid),
    #[error("Shard over capacity {0} for this manager, insert aborted")]
    ShardFull(u64),
    #[error("Timed out waiting for shard to have capacity")]
    TimedOutWaitingForCapacity,
}
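
A hedged sketch of how a caller might react to each variant - the guidance strings below are editorial, not from the crate:

// Illustrative only: rough triage of each error variant.
fn describe(err: &QueueError) -> &'static str {
    match err {
        QueueError::SqlxError(_) => "database problem - usually worth a retry with backoff",
        QueueError::UnknownJobId(_) => "caller bug - job was never dequeued here, or already flushed",
        QueueError::FlushWithoutNextState(_) => "caller bug - set a terminal state before flushing",
        QueueError::InvalidLock(_, _) => "lock lost - the janitor likely reaped the job; drop it",
        QueueError::ShardFull(_) => "backpressure - wait, or use the blocking create path",
        QueueError::TimedOutWaitingForCapacity => "backpressure persisted past the timeout",
    }
}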
94
rust/cyclotron-core/src/janitor_ops.rs
Normal file
@ -0,0 +1,94 @@
use chrono::{Duration, Utc};

use crate::error::QueueError;

// As a general rule, janitor operations are not queue specific (as in, they don't account for the
// queue name). We can revisit this later, if we decide we need the ability to do janitor operations
// on a per-queue basis.
pub async fn delete_completed_jobs<'c, E>(executor: E) -> Result<u64, QueueError>
where
    E: sqlx::Executor<'c, Database = sqlx::Postgres>,
{
    let result = sqlx::query!("DELETE FROM cyclotron_jobs WHERE state = 'completed'")
        .execute(executor)
        .await
        .map_err(QueueError::from)?;

    Ok(result.rows_affected())
}

pub async fn delete_failed_jobs<'c, E>(executor: E) -> Result<u64, QueueError>
where
    E: sqlx::Executor<'c, Database = sqlx::Postgres>,
{
    let result = sqlx::query!("DELETE FROM cyclotron_jobs WHERE state = 'failed'")
        .execute(executor)
        .await
        .map_err(QueueError::from)?;

    Ok(result.rows_affected())
}

// Jobs are considered stalled if their lock is held and their last_heartbeat is older than `timeout`.
// NOTE - because this runs on running jobs, it can stall workers trying to flush updates as it
// executes. I need to use some of the load generators alongside explain/analyze to optimise this (and
// the set of DB indexes).
// TODO - this /could/ return the lock_ids held, which might help with debugging (if workers reported
// the lock_ids they dequeued), but let's not do that right now.
pub async fn reset_stalled_jobs<'c, E>(executor: E, timeout: Duration) -> Result<u64, QueueError>
where
    E: sqlx::Executor<'c, Database = sqlx::Postgres>,
{
    let oldest_valid_heartbeat = Utc::now() - timeout;
    let result = sqlx::query!(
        r#"
WITH stalled AS (
    SELECT id FROM cyclotron_jobs WHERE state = 'running' AND COALESCE(last_heartbeat, $1) <= $1 FOR UPDATE SKIP LOCKED
)
UPDATE cyclotron_jobs
SET state = 'available', lock_id = NULL, last_heartbeat = NULL, janitor_touch_count = janitor_touch_count + 1
FROM stalled
WHERE cyclotron_jobs.id = stalled.id
        "#,
        oldest_valid_heartbeat
    )
    .execute(executor)
    .await
    .map_err(QueueError::from)?;

    Ok(result.rows_affected())
}

// Poison pills are jobs whose lock is held and whose heartbeat is older than `timeout`, that have
// been returned to the queue by the janitor more than `max_janitor_touched` times.
// NOTE - this has the same performance caveat as reset_stalled_jobs.
// TODO - This should, instead, move the job row to a dead letter table, for later investigation. Of course,
// rather than doing that, it could just put the job in a "dead letter" state, and no worker or janitor process
// will touch it... maybe the table move isn't needed? Either way, being able to debug jobs that cause workers
// to stall would be good (and, thinking about it, moving the row to a new table means we don't have to clear the
// lock, so we'd have a potential way to trace back to the last worker that died holding the job).
pub async fn delete_poison_pills<'c, E>(
    executor: E,
    timeout: Duration,
    max_janitor_touched: i16,
) -> Result<u64, QueueError>
where
    E: sqlx::Executor<'c, Database = sqlx::Postgres>,
{
    let oldest_valid_heartbeat = Utc::now() - timeout;
    // NOTE - we don't check the lock_id here, because it probably doesn't matter (the lock_id should be set if the
    // job state is "running"), but perhaps we should only delete jobs with a set lock_id, and report an error
    // if we find a job with a state of "running" and no lock_id. Also, we delete jobs whose last_heartbeat is
    // null, which again should never happen (dequeuing a job should always set the last_heartbeat), but for
    // robustness' sake we may as well handle it.
    let result = sqlx::query!(
        r#"
DELETE FROM cyclotron_jobs WHERE state = 'running' AND COALESCE(last_heartbeat, $1) <= $1 AND janitor_touch_count >= $2
        "#,
        oldest_valid_heartbeat,
        max_janitor_touched
    )
    .execute(executor)
    .await
    .map_err(QueueError::from)?;

    Ok(result.rows_affected())
}
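
A minimal sketch of a janitor tick stringing these operations together - not part of the commit, and the interval and touch threshold below are made-up example values:

// Hedged sketch: one pass of a periodic janitor loop, assuming the functions
// above are in scope and `pool` is a connected PgPool.
async fn janitor_tick(pool: &sqlx::PgPool) -> Result<(), QueueError> {
    let completed = delete_completed_jobs(pool).await?;
    let failed = delete_failed_jobs(pool).await?;
    // Return jobs whose workers stopped heartbeating, then drop repeat offenders.
    let reset = reset_stalled_jobs(pool, Duration::seconds(30)).await?;
    let poisoned = delete_poison_pills(pool, Duration::seconds(30), 3).await?;
    println!(
        "janitor: deleted {} completed / {} failed, reset {} stalled, dropped {} poison pills",
        completed, failed, reset, poisoned
    );
    Ok(())
}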
38
rust/cyclotron-core/src/lib.rs
Normal file
@ -0,0 +1,38 @@
use std::time::Duration;

use serde::{Deserialize, Serialize};
use sqlx::{pool::PoolOptions, PgPool};

pub mod base_ops;
pub mod error;
pub mod janitor_ops;
pub mod manager;
pub mod worker;

// A pool config object, designed to be passable across API boundaries
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct PoolConfig {
    pub db_url: String,
    pub max_connections: Option<u32>,         // Defaults to 10
    pub min_connections: Option<u32>,         // Defaults to 1
    pub acquire_timeout_seconds: Option<u64>, // Defaults to 30
    pub max_lifetime_seconds: Option<u64>,    // Defaults to 300
    pub idle_timeout_seconds: Option<u64>,    // Defaults to 60
}

impl PoolConfig {
    pub async fn connect(&self) -> Result<PgPool, sqlx::Error> {
        let builder = PoolOptions::new()
            .max_connections(self.max_connections.unwrap_or(10))
            .min_connections(self.min_connections.unwrap_or(1))
            .max_lifetime(Duration::from_secs(
                self.max_lifetime_seconds.unwrap_or(300),
            ))
            .idle_timeout(Duration::from_secs(self.idle_timeout_seconds.unwrap_or(60)))
            .acquire_timeout(Duration::from_secs(
                self.acquire_timeout_seconds.unwrap_or(30),
            ));

        builder.connect(&self.db_url).await
    }
}
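
A small usage sketch (not part of the commit) - unset fields fall back to the defaults noted above, and the connection string is an example only:

// Hedged sketch: build a PoolConfig and open a pool.
async fn connect_sketch() -> Result<PgPool, sqlx::Error> {
    let config = PoolConfig {
        db_url: "postgresql://posthog:posthog@localhost:5432/cyclotron".to_string(),
        max_connections: Some(4), // override the default of 10
        min_connections: None,
        acquire_timeout_seconds: None,
        max_lifetime_seconds: None,
        idle_timeout_seconds: None,
    };
    config.connect().await
}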
262
rust/cyclotron-core/src/manager.rs
Normal file
@ -0,0 +1,262 @@
use std::sync::atomic::AtomicUsize;

use chrono::{DateTime, Duration, Utc};
use serde::{Deserialize, Serialize};
use sqlx::PgPool;
use tokio::sync::RwLock;

use crate::{
    base_ops::{bulk_create_jobs, count_total_waiting_jobs, create_job, JobInit},
    error::QueueError,
    PoolConfig,
};

pub const DEFAULT_QUEUE_DEPTH_LIMIT: u64 = 10_000;
pub const DEFAULT_SHARD_HEALTH_CHECK_INTERVAL: u64 = 10;

// TODO - right now, a lot of this sharding stuff will be hollow, but later we'll add logic like
// e.g. routing work to alive shards if one is down, or reporting shard failure, etc.
// TODO - here's also where queue management commands will go, like "downgrade the priority of this function"
// or "pause jobs for this team", but we're going to add those ad-hoc as they're needed, not up front
#[derive(Debug, Serialize, Deserialize)]
pub struct ManagerConfig {
    pub shards: Vec<PoolConfig>,
    pub shard_depth_limit: Option<u64>, // Defaults to 10_000 available jobs per shard
    pub shard_depth_check_interval_seconds: Option<u64>, // Defaults to 10 seconds - checking shard capacity
}

pub struct Shard {
    pub pool: PgPool,
    pub last_healthy: RwLock<DateTime<Utc>>,
    pub check_interval: Duration,
    pub depth_limit: u64,
}

pub struct QueueManager {
    shards: RwLock<Vec<Shard>>,
    next_shard: AtomicUsize,
}

// Bulk inserts across multiple shards can partially succeed, so we need to track failures
// and hand back failed job inits to the caller.
pub struct BulkInsertResult {
    pub failures: Vec<(QueueError, Vec<JobInit>)>,
}

impl QueueManager {
    pub async fn new(config: ManagerConfig) -> Result<Self, QueueError> {
        let mut shards = vec![];
        let depth_limit = config
            .shard_depth_limit
            .unwrap_or(DEFAULT_QUEUE_DEPTH_LIMIT);
        let check_interval = Duration::seconds(
            config
                .shard_depth_check_interval_seconds
                .unwrap_or(DEFAULT_SHARD_HEALTH_CHECK_INTERVAL) as i64,
        );
        for shard in config.shards {
            // Bubble connection failures up to the caller, rather than panicking
            let pool = shard.connect().await?;
            let shard = Shard::new(pool, depth_limit, check_interval);
            shards.push(shard);
        }
        Ok(Self {
            shards: RwLock::new(shards),
            next_shard: AtomicUsize::new(0),
        })
    }

    // Designed mostly to be used for testing, but safe enough to expose publicly
    pub fn from_pool(pool: PgPool) -> Self {
        Self {
            shards: RwLock::new(vec![Shard::new(
                pool,
                DEFAULT_QUEUE_DEPTH_LIMIT,
                Duration::seconds(DEFAULT_SHARD_HEALTH_CHECK_INTERVAL as i64),
            )]),
            next_shard: AtomicUsize::new(0),
        }
    }

    pub async fn create_job(&self, init: JobInit) -> Result<(), QueueError> {
        // TODO - here is where a lot of shard health and failover logic will go, eventually.
        let next = self
            .next_shard
            .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
        let shards = self.shards.read().await;
        let shard = &shards[next % shards.len()];
        shard.create_job(init).await
    }

    pub async fn create_job_blocking(
        &self,
        init: JobInit,
        timeout: Option<Duration>,
    ) -> Result<(), QueueError> {
        let next = self
            .next_shard
            .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
        let shards = self.shards.read().await;
        let shard = &shards[next % shards.len()];
        shard.create_job_blocking(init, timeout).await
    }

    pub async fn bulk_create_jobs(&self, inits: Vec<JobInit>) -> BulkInsertResult {
        let shards = self.shards.read().await;
        // Use a chunk size of at least 1, so `chunks` can't panic when there are
        // fewer inits than shards
        let chunk_size = (inits.len() / shards.len()).max(1);
        let mut result = BulkInsertResult::new();
        // TODO - at some point, we should dynamically re-acquire the lock each time, to allow
        // for re-routing jobs away from a bad shard during a bulk insert, but right now, we
        // don't even re-try inserts. Later work.
        for chunk in inits.chunks(chunk_size) {
            let next_shard = self
                .next_shard
                .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
            let shard = &shards[next_shard % shards.len()];
            let shard_result = shard.bulk_create_jobs(chunk).await;
            if let Err(err) = shard_result {
                result.add_failure(err, chunk.to_vec());
            }
        }

        result
    }

    pub async fn bulk_create_jobs_blocking(
        &self,
        inits: Vec<JobInit>,
        timeout: Option<Duration>,
    ) -> BulkInsertResult {
        let shards = self.shards.read().await;
        // As above, guard against a zero chunk size
        let chunk_size = (inits.len() / shards.len()).max(1);
        let mut result = BulkInsertResult::new();
        for chunk in inits.chunks(chunk_size) {
            let next_shard = self
                .next_shard
                .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
            let shard = &shards[next_shard % shards.len()];
            // TODO - we sequentially try each shard, but we could try to parallelize this.
            let shard_result = shard.bulk_create_jobs_blocking(chunk, timeout).await;
            if let Err(err) = shard_result {
                result.add_failure(err, chunk.to_vec());
            }
        }

        result
    }
}

impl Shard {
    pub fn new(pool: PgPool, depth_limit: u64, check_interval: Duration) -> Self {
        Self {
            pool,
            last_healthy: RwLock::new(Utc::now() - check_interval),
            check_interval,
            depth_limit,
        }
    }

    // Inserts a job, failing if the shard is at capacity
    pub async fn create_job(&self, init: JobInit) -> Result<(), QueueError> {
        self.insert_guard().await?;
        create_job(&self.pool, init).await
    }

    // Inserts a vec of jobs, failing if the shard is at capacity. Note "capacity" here just
    // means "it isn't totally full" - if there's "capacity" for 1 job, and this is a vec of
    // 1000, we still insert all 1000.
    pub async fn bulk_create_jobs(&self, inits: &[JobInit]) -> Result<(), QueueError> {
        self.insert_guard().await?;
        bulk_create_jobs(&self.pool, inits).await
    }

    // Inserts a job, blocking until there's capacity (or until the timeout is reached)
    pub async fn create_job_blocking(
        &self,
        init: JobInit,
        timeout: Option<Duration>,
    ) -> Result<(), QueueError> {
        let start = Utc::now();
        while self.is_full().await? {
            tokio::time::sleep(Duration::milliseconds(100).to_std().unwrap()).await;
            if let Some(timeout) = &timeout {
                if Utc::now() - start > *timeout {
                    return Err(QueueError::TimedOutWaitingForCapacity);
                }
            }
        }

        create_job(&self.pool, init).await
    }

    pub async fn bulk_create_jobs_blocking(
        &self,
        inits: &[JobInit],
        timeout: Option<Duration>,
    ) -> Result<(), QueueError> {
        let start = Utc::now();
        while self.is_full().await? {
            tokio::time::sleep(Duration::milliseconds(100).to_std().unwrap()).await;
            if let Some(timeout) = &timeout {
                if Utc::now() - start > *timeout {
                    return Err(QueueError::TimedOutWaitingForCapacity);
                }
            }
        }

        bulk_create_jobs(&self.pool, inits).await
    }

    pub async fn insert_guard(&self) -> Result<(), QueueError> {
        if self.is_full().await? {
            return Err(QueueError::ShardFull(self.depth_limit));
        }

        Ok(())
    }

    pub async fn is_full(&self) -> Result<bool, QueueError> {
        let last_healthy = self.last_healthy.read().await;
        // If we were healthy less than the check interval ago, assume we still are
        if Utc::now() - *last_healthy < self.check_interval {
            return Ok(false);
        }

        // Grab a write lock. This constrains the number of concurrent capacity checks
        // to 1, purposefully - if someone spawns a thousand tasks to blockingly create
        // a job, we don't want all of them to be querying the available count at once.
        drop(last_healthy);
        let mut last_healthy = self.last_healthy.write().await;
        // TOCTOU - multiple tasks could be racing to re-do the check, and the first time one
        // succeeds all the rest should skip it.
        if Utc::now() - *last_healthy < self.check_interval {
            return Ok(false);
        }

        let pending = count_total_waiting_jobs(&self.pool).await?;
        let is_full = pending >= self.depth_limit;
        if !is_full {
            *last_healthy = Utc::now();
        }
        Ok(is_full)
    }
}

impl BulkInsertResult {
    pub fn new() -> Self {
        Self { failures: vec![] }
    }

    pub fn add_failure(&mut self, err: QueueError, jobs: Vec<JobInit>) {
        self.failures.push((err, jobs));
    }

    pub fn all_succeeded(&self) -> bool {
        self.failures.is_empty()
    }
}

impl Default for BulkInsertResult {
    fn default() -> Self {
        Self::new()
    }
}
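
Because bulk inserts can partially fail per shard, callers are expected to inspect the result. A hedged sketch (not part of the commit) of a naive single retry:

// Illustrative only - real callers would want backoff and a retry budget.
async fn insert_with_one_retry(manager: &QueueManager, jobs: Vec<JobInit>) {
    let result = manager.bulk_create_jobs(jobs).await;
    if result.all_succeeded() {
        return;
    }
    for (err, failed) in result.failures {
        eprintln!("shard rejected {} jobs: {}", failed.len(), err);
        let retry = manager.bulk_create_jobs(failed).await;
        if !retry.all_succeeded() {
            eprintln!("retry failed too, dropping batch");
        }
    }
}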
229
rust/cyclotron-core/src/worker.rs
Normal file
@ -0,0 +1,229 @@
|
||||
use std::{collections::HashMap, sync::Arc};
|
||||
|
||||
use chrono::{DateTime, Utc};
|
||||
use sqlx::PgPool;
|
||||
use std::sync::Mutex;
|
||||
use uuid::Uuid;
|
||||
|
||||
use crate::{
|
||||
base_ops::{
|
||||
dequeue_jobs, dequeue_with_vm_state, flush_job, set_heartbeat, Job, JobState, JobUpdate,
|
||||
},
|
||||
error::QueueError,
|
||||
PoolConfig,
|
||||
};
|
||||
|
||||
// The worker's interface to the underlying queue system - a worker can do everything except
|
||||
// create jobs (because job creation has to be shard-aware).
|
||||
//
|
||||
// This interface looks stange, because a lot of things that would normally be done with lifetimes
|
||||
// and references are done with uuid's instead (and we lose some nice raii stuff as a result), but
|
||||
// the reason for this is that this is designed to be embedded in other runtimes, where handing out
|
||||
// lifetime'd references or things with drop impls isn't really practical. This makes it a little
|
||||
// awkward to use, but since it's meant to be the core of other abstractions, I think it's ok for
|
||||
// now (client libraries should wrap this to provide better interfaces).
|
||||
pub struct Worker {
|
||||
pool: PgPool,
|
||||
// All dequeued job IDs that haven't been flushed yet. The idea is this lets us
|
||||
// manage, on the rust side of any API boundary, the "pending" update of any given
|
||||
// job, such that a user can progressively build up a full update, and then flush it,
|
||||
// rather than having to track the update state on their side and submit it all at once
|
||||
// TODO - we don't handle people "forgetting" to abort a job, because we expect that to
|
||||
// only happen if a process dies (in which case the job queue janitor should handle
|
||||
// it)... this is a memory leak, but I think it's ok.
|
||||
// TRICKY - this is a sync mutex, because we never hold it across an await point, and that
|
||||
// radically simplifies using this for FFI (because there's no message passing across runtimes)
|
||||
pending: Arc<Mutex<HashMap<Uuid, JobUpdate>>>,
|
||||
}
|
||||
|
||||
impl Worker {
|
||||
pub async fn new(config: PoolConfig) -> Result<Self, QueueError> {
|
||||
let pool = config.connect().await?;
|
||||
Ok(Self {
|
||||
pool,
|
||||
pending: Arc::new(Mutex::new(HashMap::new())),
|
||||
})
|
||||
}
|
||||
|
||||
pub fn from_pool(pool: PgPool) -> Self {
|
||||
Self {
|
||||
pool,
|
||||
pending: Arc::new(Mutex::new(HashMap::new())),
|
||||
}
|
||||
}
|
||||
|
||||
/// Dequeues jobs from the queue, and returns them. Job sorting happens at the queue level,
|
||||
/// workers can't provide any filtering or sorting criteria - queue managers decide which jobs are run,
|
||||
/// workers just run them.
|
||||
pub async fn dequeue_jobs(&self, queue: &str, limit: usize) -> Result<Vec<Job>, QueueError> {
|
||||
let jobs = dequeue_jobs(&self.pool, queue, limit).await?;
|
||||
|
||||
let mut pending = self.pending.lock().unwrap();
|
||||
for job in &jobs {
|
||||
// We need to hang onto the locks for a job until we flush it, so we can send updates.
|
||||
let update = JobUpdate::new(
|
||||
job.lock_id
|
||||
.expect("Yell at oliver that the dequeuing code is broken. He's very sorry that your process just panicked"),
|
||||
);
|
||||
pending.insert(job.id, update);
|
||||
}
|
||||
|
||||
Ok(jobs)
|
||||
}
|
||||
|
||||
/// This is the same as dequeue_jobs, but it also returns the vm_state of the job
|
||||
pub async fn dequeue_with_vm_state(
|
||||
&self,
|
||||
queue: &str,
|
||||
limit: usize,
|
||||
) -> Result<Vec<Job>, QueueError> {
|
||||
let jobs = dequeue_with_vm_state(&self.pool, queue, limit).await?;
|
||||
|
||||
let mut pending = self.pending.lock().unwrap();
|
||||
for job in &jobs {
|
||||
// We need to hang onto the locks for a job until we flush it, so we can send updates.
|
||||
let update = JobUpdate::new(
|
||||
job.lock_id
|
||||
.expect("Yell at oliver that the dequeuing (with vm) code is broken. He's very sorry that your process just panicked"),
|
||||
);
|
||||
pending.insert(job.id, update);
|
||||
}
|
||||
|
||||
Ok(jobs)
|
||||
}
|
||||
|
||||
/// NOTE - This function can only be called once, even though the underlying
|
||||
/// basic operation can be performed as many times as the caller likes (so long as
|
||||
/// the job state is never set to something other than running, as that clears the
|
||||
/// job lock). We're more strict here (flushes can only happen once, you must
|
||||
/// flush some non-running state) to try and enforce a good interaction
|
||||
/// pattern with the queue. I might return to this and loosen this constraint in the
|
||||
/// future, if there's a motivating case for needing to flush partial job updates.
|
||||
pub async fn flush_job(&self, job_id: Uuid) -> Result<(), QueueError> {
|
||||
// TODO - this drops the job from the known jobs before the flush succeeds,
|
||||
// which means that if the flush fails, we'll lose the job and can never
|
||||
// update it's state (leaving it to the reaper). This is a bug, but I'm not
|
||||
// sure I want to make flushes retryable just yet, so I'm leaving it for now.
|
||||
// NIT: this wrapping is to ensure pending is dropped prior to the await
|
||||
let update = {
|
||||
let mut pending = self.pending.lock().unwrap();
|
||||
let update = pending
|
||||
.remove(&job_id)
|
||||
.ok_or(QueueError::UnknownJobId(job_id))?;
|
||||
// It's a programming error to flush a job without setting a new state
|
||||
match update.state {
|
||||
Some(JobState::Running) | None => {
|
||||
// Keep track of any /other/ updates that might have been stored, even in this case,
|
||||
// so a user can queue up the appropriate state transition and flush properly
|
||||
pending.insert(job_id, update);
|
||||
return Err(QueueError::FlushWithoutNextState(job_id));
|
||||
}
|
||||
_ => update,
|
||||
}
|
||||
};
|
||||
let mut connection = self.pool.acquire().await?;
|
||||
flush_job(connection.as_mut(), job_id, update).await
|
||||
}
|
||||
|
||||
    /// Jobs are reaped after some seconds (the number is deployment specific, and may become
    /// specific to job properties like queue name in the future, as we figure out what /kinds/ of
    /// jobs are longer or shorter running). A job is considered "dead" if it's in a running state,
    /// and its last heartbeat was more than the reaping time ago. This, like flush, returns an
    /// error if you try to set the heartbeat on a job whose lock you don't have (which can happen
    /// if e.g. the job was reaped out from under you).
    pub async fn heartbeat(&self, job_id: Uuid) -> Result<(), QueueError> {
        let lock_id = {
            let pending = self.pending.lock().unwrap();
            pending
                .get(&job_id)
                .ok_or(QueueError::UnknownJobId(job_id))?
                .lock_id
        };
        let mut connection = self.pool.acquire().await?;
        set_heartbeat(connection.as_mut(), job_id, lock_id).await
    }

    /// This is how you "return" a job to the queue, by setting the state to "available"
    pub fn set_state(&self, job_id: Uuid, state: JobState) -> Result<(), QueueError> {
        let mut pending = self.pending.lock().unwrap();
        pending
            .get_mut(&job_id)
            .ok_or(QueueError::UnknownJobId(job_id))?
            .state = Some(state);
        Ok(())
    }

    pub fn set_queue(&self, job_id: Uuid, queue: &str) -> Result<(), QueueError> {
        let mut pending = self.pending.lock().unwrap();
        pending
            .get_mut(&job_id)
            .ok_or(QueueError::UnknownJobId(job_id))?
            .queue_name = Some(queue.to_string());
        Ok(())
    }

    /// Jobs are dequeued lowest-priority-first, so this is how you change the "base" priority of a job
    /// (control tables may apply further deltas if e.g. a given function is in a degraded state)
    pub fn set_priority(&self, job_id: Uuid, priority: i16) -> Result<(), QueueError> {
        let mut pending = self.pending.lock().unwrap();
        pending
            .get_mut(&job_id)
            .ok_or(QueueError::UnknownJobId(job_id))?
            .priority = Some(priority);
        Ok(())
    }

    /// This is how you do e.g. retries after some time, by setting the scheduled time
    /// to some time in the future. Sleeping, retry backoff, scheduling - it's all the same operation,
    /// this one.
    pub fn set_scheduled_at(
        &self,
        job_id: Uuid,
        scheduled: DateTime<Utc>,
    ) -> Result<(), QueueError> {
        let mut pending = self.pending.lock().unwrap();
        pending
            .get_mut(&job_id)
            .ok_or(QueueError::UnknownJobId(job_id))?
            .scheduled = Some(scheduled);
        Ok(())
    }

    /// Passing None here will clear the vm_state
    pub fn set_vm_state(
        &self,
        job_id: Uuid,
        vm_state: Option<String>, // This (and the following) are Options, because the user can null them (by calling with None)
    ) -> Result<(), QueueError> {
        let mut pending = self.pending.lock().unwrap();
        pending
            .get_mut(&job_id)
            .ok_or(QueueError::UnknownJobId(job_id))?
            .vm_state = Some(vm_state);
        Ok(())
    }

    /// Passing None here will clear the metadata
    pub fn set_metadata(&self, job_id: Uuid, metadata: Option<String>) -> Result<(), QueueError> {
        let mut pending = self.pending.lock().unwrap();
        pending
            .get_mut(&job_id)
            .ok_or(QueueError::UnknownJobId(job_id))?
            .metadata = Some(metadata);
        Ok(())
    }

    /// Passing None here will clear the parameters
    pub fn set_parameters(
        &self,
        job_id: Uuid,
        parameters: Option<String>,
    ) -> Result<(), QueueError> {
        let mut pending = self.pending.lock().unwrap();
        pending
            .get_mut(&job_id)
            .ok_or(QueueError::UnknownJobId(job_id))?
            .parameters = Some(parameters);
        Ok(())
    }
}
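
Taken together, these methods give workers a buffered-update API: dequeuing takes the job's lock, the set_* calls stage changes locally, and flush_job writes them back and releases the lock. A minimal sketch of that cycle, assuming a Worker built via Worker::from_pool (the queue name and surrounding function are illustrative, not part of this commit):

use cyclotron_core::{base_ops::JobState, error::QueueError, worker::Worker};

async fn work_one_batch(worker: &Worker) -> Result<(), QueueError> {
    // Dequeuing marks the jobs as running and takes their locks
    let jobs = worker.dequeue_jobs("example", 10).await?;
    for job in jobs {
        worker.heartbeat(job.id).await?; // keeps the janitor from reaping us mid-work
        // ... do the actual work for this job here ...
        worker.set_state(job.id, JobState::Completed)?; // staged locally, like all set_* calls
        worker.flush_job(job.id).await?; // writes the staged changes and releases the lock
    }
    Ok(())
}
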
255
rust/cyclotron-core/tests/base_ops.rs
Normal file
@ -0,0 +1,255 @@
use std::sync::Arc;

use chrono::{Duration, Utc};
use common::{assert_job_matches_init, create_new_job, dates_match};
use cyclotron_core::{
    base_ops::{bulk_create_jobs, JobState},
    manager::QueueManager,
    worker::Worker,
};
use sqlx::PgPool;
use uuid::Uuid;

mod common;

// I know this should be a bunch of tests, but for hacking together stuff right now, it'll do
#[sqlx::test(migrations = "./migrations")]
async fn test_queue(db: PgPool) {
    let manager = QueueManager::from_pool(db.clone());
    let worker = Worker::from_pool(db);

    let job_1 = create_new_job();
    let mut job_2 = create_new_job();

    job_2.priority = 2; // Lower priority jobs should be returned second

    let queue_name = job_1.queue_name.clone();

    manager
        .create_job(job_1.clone())
        .await
        .expect("failed to create job");
    manager
        .create_job(job_2.clone())
        .await
        .expect("failed to create job");

    let jobs = worker
        .dequeue_jobs(&queue_name, 2)
        .await
        .expect("failed to dequeue job");

    assert_eq!(jobs.len(), 2);
    // This also asserts that the ordering is correct in terms of priority
    assert_job_matches_init(&jobs[0], &job_1);
    assert_job_matches_init(&jobs[1], &job_2);

    // Now we can re-queue these jobs (imagine we had done work)
    worker
        .set_state(jobs[0].id, JobState::Available)
        .expect("failed to set state");
    worker
        .set_state(jobs[1].id, JobState::Available)
        .expect("failed to set state");

    // Flush the two jobs, having made no other changes, then assert we can re-dequeue them
    worker
        .flush_job(jobs[0].id)
        .await
        .expect("failed to flush job");
    worker
        .flush_job(jobs[1].id)
        .await
        .expect("failed to flush job");

    let jobs = worker
        .dequeue_jobs(&queue_name, 2)
        .await
        .expect("failed to dequeue job");

    assert_eq!(jobs.len(), 2);
    assert_job_matches_init(&jobs[0], &job_1);
    assert_job_matches_init(&jobs[1], &job_2);

    // Re-queue them again
    worker
        .set_state(jobs[0].id, JobState::Available)
        .expect("failed to set state");
    worker
        .set_state(jobs[1].id, JobState::Available)
        .expect("failed to set state");

    worker
        .flush_job(jobs[0].id)
        .await
        .expect("failed to flush job");
    worker
        .flush_job(jobs[1].id)
        .await
        .expect("failed to flush job");

    // Spin up two tasks to race on dequeuing, and assert at most 2 jobs are dequeued
    let worker = Arc::new(worker);
    let moved = worker.clone();
    let queue_name_moved = queue_name.clone();
    let fut_1 = async move {
        moved
            .dequeue_jobs(&queue_name_moved, 2)
            .await
            .expect("failed to dequeue job")
    };
    let moved = worker.clone();
    let queue_name_moved = queue_name.clone();
    let fut_2 = async move {
        moved
            .dequeue_jobs(&queue_name_moved, 2)
            .await
            .expect("failed to dequeue job")
    };

    let (jobs_1, jobs_2) = tokio::join!(fut_1, fut_2);
    assert_eq!(jobs_1.len() + jobs_2.len(), 2);

    let jobs = jobs_1
        .into_iter()
        .chain(jobs_2.into_iter())
        .collect::<Vec<_>>();

    // And now, any subsequent dequeues will return no jobs
    let empty = worker
        .dequeue_jobs(&queue_name, 2)
        .await
        .expect("failed to dequeue job");
    assert_eq!(empty.len(), 0);

    // If we try to flush a job without setting what its next state will be (or if we set that next state to be "running"),
    // we should get an error
    worker
        .flush_job(jobs[0].id)
        .await
        .expect_err("expected error due to no-next-state");

    worker
        .set_state(jobs[1].id, JobState::Running)
        .expect("failed to set state");
    worker
        .flush_job(jobs[1].id)
        .await
        .expect_err("expected error due to running state");

    // But if we properly set the state to completed or failed, now we can flush
    worker
        .set_state(jobs[0].id, JobState::Completed)
        .expect("failed to set state");
    worker
        .set_state(jobs[1].id, JobState::Failed)
        .expect("failed to set state");

    worker
        .flush_job(jobs[0].id)
        .await
        .expect("failed to flush job");
    worker
        .flush_job(jobs[1].id)
        .await
        .expect("failed to flush job");

    // And now, any subsequent dequeues will return no jobs (because these jobs are finished)
    let empty = worker
        .dequeue_jobs(&queue_name, 2)
        .await
        .expect("failed to dequeue job");
    assert_eq!(empty.len(), 0);

    // Now, let's check that we can set every variable on a job

    // Set up some initial values
    let now = Utc::now();
    let mut job = create_new_job();
    job.queue_name = "test".to_string();
    job.priority = 0;
    job.scheduled = now - Duration::minutes(2);
    job.vm_state = None;
    job.parameters = None;
    job.metadata = None;

    // Queue the job
    manager
        .create_job(job.clone())
        .await
        .expect("failed to create job");

    // Then dequeue it
    let job = worker
        .dequeue_jobs("test", 1)
        .await
        .expect("failed to dequeue job")
        .pop()
        .expect("failed to dequeue job");

    // Set everything we're able to set, including state to available, so we can dequeue it again
    worker
        .set_state(job.id, JobState::Available)
        .expect("failed to set state");
    worker
        .set_queue(job.id, "test_2")
        .expect("failed to set queue");
    worker
        .set_priority(job.id, 1)
        .expect("failed to set priority");
    worker
        .set_scheduled_at(job.id, now - Duration::minutes(10))
        .expect("failed to set scheduled_at");
    worker
        .set_vm_state(job.id, Some("test".to_string()))
        .expect("failed to set vm_state");
    worker
        .set_parameters(job.id, Some("test".to_string()))
        .expect("failed to set parameters");
    worker
        .set_metadata(job.id, Some("test".to_string()))
        .expect("failed to set metadata");

    // Flush the job
    worker.flush_job(job.id).await.expect("failed to flush job");

    // Then dequeue it again (this time being sure to grab the vm state too)
    let job = worker
        .dequeue_with_vm_state("test_2", 1)
        .await
        .expect("failed to dequeue job")
        .pop()
        .expect("failed to dequeue job");

    // And every value should be the updated one
    assert_eq!(job.queue_name, "test_2");
    assert_eq!(job.priority, 1);
    assert!(dates_match(&job.scheduled, &(now - Duration::minutes(10))));
    assert_eq!(job.vm_state, Some("test".to_string()));
    assert_eq!(job.parameters, Some("test".to_string()));
    assert_eq!(job.metadata, Some("test".to_string()));
}

#[sqlx::test(migrations = "./migrations")]
pub async fn test_bulk_insert(db: PgPool) {
    let worker = Worker::from_pool(db.clone());

    let job_template = create_new_job();

    let jobs = (0..1000)
        .map(|_| {
            let mut job = job_template.clone();
            job.function_id = Some(Uuid::now_v7());
            job
        })
        .collect::<Vec<_>>();

    bulk_create_jobs(&db, &jobs).await.unwrap();

    let dequeue_jobs = worker
        .dequeue_jobs(&job_template.queue_name, 1000)
        .await
        .expect("failed to dequeue job");

    assert_eq!(dequeue_jobs.len(), 1000);
}
40
rust/cyclotron-core/tests/common.rs
Normal file
@ -0,0 +1,40 @@
use chrono::{DateTime, Duration, Utc};
use cyclotron_core::base_ops::{Job, JobInit};
use uuid::Uuid;

#[allow(dead_code)]
pub fn create_new_job() -> JobInit {
    JobInit {
        team_id: 1,
        function_id: Some(Uuid::now_v7()), // Lets us uniquely identify jobs without having the Uuid
        queue_name: "test".to_string(),
        priority: 0,
        scheduled: Utc::now() - Duration::minutes(1),
        vm_state: None,
        parameters: None,
        metadata: None,
    }
}

#[allow(dead_code)]
pub fn dates_match(left: &DateTime<Utc>, right: &DateTime<Utc>) -> bool {
    // Roundtripping a datetime to PG can cause sub-ms differences, so we need to check within a margin of error
    // Seeing errors like this in CI:
    // assertion `left == right` failed
    //   left: 2024-08-08T20:41:55.964936Z
    //  right: 2024-08-08T20:41:55.964936997Z
    let diff = *left - *right;
    diff.abs() < Duration::milliseconds(1)
}

#[allow(dead_code)]
pub fn assert_job_matches_init(job: &Job, init: &JobInit) {
    assert_eq!(job.team_id, init.team_id);
    assert_eq!(job.function_id, init.function_id);
    assert_eq!(job.queue_name, init.queue_name);
    assert_eq!(job.priority, init.priority);
    assert!(dates_match(&job.scheduled, &init.scheduled));
    assert_eq!(job.vm_state, init.vm_state);
    assert_eq!(job.parameters, init.parameters);
    assert_eq!(job.metadata, init.metadata);
}
68
rust/cyclotron-core/tests/shard.rs
Normal file
@ -0,0 +1,68 @@
use chrono::{Duration, Utc};
use common::create_new_job;
use cyclotron_core::manager::Shard;
use sqlx::PgPool;
use tokio::sync::RwLock;

mod common;

pub fn get_shard(db: PgPool) -> Shard {
    Shard {
        pool: db,
        last_healthy: RwLock::new(Utc::now()),
        check_interval: Duration::milliseconds(0), // We always want to check the limit, for these tests
        depth_limit: 10,
    }
}

#[sqlx::test(migrations = "./migrations")]
pub async fn test_shard_limiting(db: PgPool) {
    let shard = get_shard(db.clone());

    // We should be able to insert 10 jobs
    for _ in 0..10 {
        shard.create_job(create_new_job()).await.unwrap();
    }

    // And then we should fail on the 11th
    let result = shard.create_job(create_new_job()).await;
    assert!(result.is_err());
}

#[sqlx::test(migrations = "./migrations")]
pub async fn test_shard_blocking_insert_waits(db: PgPool) {
    let shard = get_shard(db.clone());

    // We should be able to insert 10 jobs
    for _ in 0..10 {
        shard.create_job(create_new_job()).await.unwrap();
    }

    let timeout = Some(Duration::milliseconds(50));

    let start = Utc::now();
    // And then the 11th should block until the timeout expires, then fail
    let result = shard.create_job_blocking(create_new_job(), timeout).await;
    assert!(result.is_err());

    // We should have waited at least 50ms
    assert!(Utc::now() - start >= Duration::milliseconds(50));
}

#[sqlx::test(migrations = "./migrations")]
pub async fn test_shard_allows_bulk_inserts_beyond_capacity(db: PgPool) {
    let shard = get_shard(db.clone());

    // Insert 9 jobs, leaving one slot below the depth limit
    for _ in 0..9 {
        shard.create_job(create_new_job()).await.unwrap();
    }

    // And then we should be able to bulk insert 1000
    let inits = (0..1000).map(|_| create_new_job()).collect::<Vec<_>>();
    shard.bulk_create_jobs(&inits).await.unwrap();

    // And the next insert should fail
    let result = shard.create_job(create_new_job()).await;
    assert!(result.is_err());
}
32
rust/cyclotron-fetch/Cargo.toml
Normal file
@ -0,0 +1,32 @@
[package]
name = "cyclotron-fetch"
version = "0.1.0"
edition = "2021"

[lints]
workspace = true

[dependencies]
tracing-subscriber = { workspace = true }
chrono = { workspace = true }
tokio = { workspace = true }
tracing = { workspace = true }
uuid = { workspace = true }
envconfig = { workspace = true }
axum = { workspace = true }
thiserror = { workspace = true }
metrics = { workspace = true }
cyclotron-core = { path = "../cyclotron-core" }
common-metrics = { path = "../common/metrics" }
common-dns = { path = "../common/dns" }
health = { path = "../common/health" }
reqwest = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
http = { workspace = true }
rand = { workspace = true }
futures = { workspace = true }

[dev-dependencies]
sqlx = { workspace = true }
httpmock = { workspace = true }
104
rust/cyclotron-fetch/src/config.rs
Normal file
@ -0,0 +1,104 @@
use chrono::Duration;
use cyclotron_core::PoolConfig;
use envconfig::Envconfig;
use uuid::Uuid;

#[derive(Envconfig)]
pub struct Config {
    #[envconfig(from = "BIND_HOST", default = "::")]
    pub host: String,

    #[envconfig(from = "BIND_PORT", default = "3304")]
    pub port: u16,

    #[envconfig(default = "postgres://posthog:posthog@localhost:5432/cyclotron")]
    pub database_url: String,

    #[envconfig(default = "10")]
    pub pg_max_connections: u32,

    #[envconfig(default = "1")]
    pub pg_min_connections: u32,

    #[envconfig(default = "30")]
    pub pg_acquire_timeout_seconds: u64,

    #[envconfig(default = "300")]
    pub pg_max_lifetime_seconds: u64,

    #[envconfig(default = "60")]
    pub pg_idle_timeout_seconds: u64,

    #[envconfig(default = "false")]
    pub allow_internal_ips: bool,

    pub worker_id: Option<String>,              // Defaults to a UUID
    pub job_poll_interval_seconds: Option<u32>, // Defaults to 1
    pub concurrent_requests_limit: Option<u32>, // Defaults to 1000
    pub fetch_timeout_seconds: Option<u32>,     // Defaults to 30
    pub max_retry_attempts: Option<u32>,        // Defaults to 10
    pub queue_served: Option<String>,           // Defaults to "fetch"
    pub batch_size: Option<usize>,              // Defaults to 1000
    pub max_response_bytes: Option<usize>,      // Defaults to 1MB
    pub retry_backoff_base_ms: Option<u32>,     // Defaults to 4000
}

// I do this instead of using envconfig's defaults because
// envconfig doesn't support defaults provided by functions,
// which is frustrating when I want to use UUIDs, and if I'm
// going to break out one field, I might as well break out
// everything into "AppConfig" and "PoolConfig"
#[derive(Debug, Clone)]
pub struct AppConfig {
    pub host: String,
    pub port: u16,
    pub worker_id: String,
    pub job_poll_interval: Duration, // How long we wait to poll for new jobs, when we're at capacity or find no new jobs
    pub concurrent_requests_limit: u32,
    pub fetch_timeout: Duration,
    pub max_retry_attempts: u32,
    pub queue_served: String,
    pub batch_size: usize,
    pub max_response_bytes: usize,
    pub retry_backoff_base: Duration, // Job retry backoff times are this * attempt count
    pub allow_internal_ips: bool,
}

impl Config {
    pub fn to_components(self) -> (AppConfig, PoolConfig) {
        let worker_id = self.worker_id.unwrap_or_else(|| Uuid::now_v7().to_string());
        let job_poll_interval_seconds = self.job_poll_interval_seconds.unwrap_or(1);
        let concurrent_requests_limit = self.concurrent_requests_limit.unwrap_or(1000);
        let fetch_timeout_seconds = self.fetch_timeout_seconds.unwrap_or(30);
        let max_retry_attempts = self.max_retry_attempts.unwrap_or(10);
        let queue_served = self.queue_served.unwrap_or_else(|| "fetch".to_string());

        let app_config = AppConfig {
            host: self.host,
            port: self.port,
            worker_id,
            job_poll_interval: Duration::seconds(job_poll_interval_seconds as i64),
            concurrent_requests_limit,
            fetch_timeout: Duration::seconds(fetch_timeout_seconds as i64),
            max_retry_attempts,
            queue_served,
            batch_size: self.batch_size.unwrap_or(1000),
            max_response_bytes: self.max_response_bytes.unwrap_or(1024 * 1024),
            retry_backoff_base: Duration::milliseconds(
                self.retry_backoff_base_ms.unwrap_or(4000) as i64
            ),
            allow_internal_ips: self.allow_internal_ips,
        };

        let pool_config = PoolConfig {
            db_url: self.database_url,
            max_connections: Some(self.pg_max_connections),
            min_connections: Some(self.pg_min_connections),
            acquire_timeout_seconds: Some(self.pg_acquire_timeout_seconds),
            max_lifetime_seconds: Some(self.pg_max_lifetime_seconds),
            idle_timeout_seconds: Some(self.pg_idle_timeout_seconds),
        };

        (app_config, pool_config)
    }
}
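
A short sketch of the intended call pattern, mirroring the startup path in main.rs below (the assertion is illustrative, and assumes envconfig's usual field-name-to-env-var mapping for the optional fields):

use cyclotron_fetch::config::Config;
use envconfig::Envconfig;

fn load_config() {
    // Reads BIND_HOST, BIND_PORT, DATABASE_URL and friends from the environment,
    // then fills in the function-style defaults for the Option fields
    let config = Config::init_from_env().expect("failed to load configuration from env");
    let (app_config, pool_config) = config.to_components();
    assert_eq!(app_config.queue_served, "fetch"); // the default when no queue is configured
    // app_config drives the worker loop; pool_config is handed to Worker::new
    let _ = pool_config;
}
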
55
rust/cyclotron-fetch/src/context.rs
Normal file
@ -0,0 +1,55 @@
use std::sync::Arc;

use cyclotron_core::{worker::Worker, PoolConfig};
use health::HealthHandle;
use tokio::sync::Semaphore;

use crate::{config::AppConfig, fetch::FetchError};

pub struct AppContext {
    pub worker: Worker,
    pub client: reqwest::Client,
    pub concurrency_limit: Arc<Semaphore>,
    pub liveness: HealthHandle,
    pub config: AppConfig,
}

impl AppContext {
    pub async fn create(
        config: AppConfig,
        pool_config: PoolConfig,
        liveness: HealthHandle,
    ) -> Result<Self, FetchError> {
        let concurrency_limit = Arc::new(Semaphore::new(config.concurrent_requests_limit as usize));

        let resolver = Arc::new(common_dns::PublicIPv4Resolver {});

        let mut client = reqwest::Client::builder().timeout(config.fetch_timeout.to_std().unwrap());

        if !config.allow_internal_ips {
            client = client.dns_resolver(resolver);
        }

        let client = client.build();

        let client = match client {
            Ok(c) => c,
            Err(e) => {
                return Err(FetchError::StartupError(format!(
                    "Failed to create reqwest client: {}",
                    e
                )));
            }
        };

        let worker = Worker::new(pool_config).await?;

        Ok(Self {
            worker,
            client,
            concurrency_limit,
            liveness,
            config,
        })
    }
}
653
rust/cyclotron-fetch/src/fetch.rs
Normal file
@ -0,0 +1,653 @@
use std::{cmp::min, collections::HashMap, sync::Arc};

use chrono::{DateTime, Duration, Utc};
use cyclotron_core::{
    base_ops::{Job, JobState},
    error::QueueError,
    worker::Worker,
};
use futures::StreamExt;
use http::StatusCode;
use reqwest::Response;
use serde::{Deserialize, Serialize};
use thiserror::Error;
use tokio::sync::OwnedSemaphorePermit;
use tracing::error;

use crate::context::AppContext;

// TODO - a lot of these should maybe be configurable
pub const DEAD_LETTER_QUEUE: &str = "fetch-dead-letter";
pub const DEFAULT_RETRIES: u32 = 3;
pub const DEFAULT_ON_FINISH: OnFinish = OnFinish::Return;
pub const HEARTBEAT_INTERVAL_MS: i64 = 5000;

// Exclusively for errors in the worker - these will
// never be serialised into the job queue, and indicate
// bad worker health. As a general rule, if one of these
// is produced, we should let the worker fall over (as in,
// the outer worker loop should exit).
#[derive(Error, Debug)]
pub enum FetchError {
    #[error("timeout fetching jobs")]
    JobFetchTimeout,
    #[error(transparent)]
    QueueError(#[from] QueueError),
    // TRICKY - in most cases, serde errors are a FetchFailure (something coming from the queue was
    // invalid), but this is used in cases where /we/ fail to serialise something /to/ the queue
    #[error(transparent)]
    SerdeError(#[from] serde_json::Error),
    // We failed doing some kind of setup, like creating a reqwest client
    #[error("error during startup: {0}")]
    StartupError(String),
}

#[derive(Debug, Serialize, Deserialize, Clone, Copy)]
#[serde(rename_all = "UPPERCASE")]
pub enum HttpMethod {
    Get,
    Post,
    Patch,
    Put,
    Delete,
}

impl From<&HttpMethod> for http::Method {
    fn from(method: &HttpMethod) -> Self {
        match method {
            HttpMethod::Get => http::Method::GET,
            HttpMethod::Post => http::Method::POST,
            HttpMethod::Patch => http::Method::PATCH,
            HttpMethod::Put => http::Method::PUT,
            HttpMethod::Delete => http::Method::DELETE,
        }
    }
}

// What does someone need to give us to execute a fetch?
#[derive(Debug, Serialize, Deserialize, Clone)]
#[serde(rename_all = "lowercase")]
pub struct FetchParameters {
    pub url: String,
    pub method: HttpMethod,
    pub return_queue: String,
    pub headers: Option<HashMap<String, String>>,
    pub body: Option<String>,
    pub max_tries: Option<u32>,      // Defaults to 3
    pub on_finish: Option<OnFinish>, // Defaults to Return
}

// What should we do when we get a result, or run out of tries for a given job?
// Return means re-queue to the return_worker, Complete means mark as Completed/Failed
#[derive(Debug, Serialize, Deserialize, Clone, Copy)]
#[serde(rename_all = "lowercase")]
pub enum OnFinish {
    Return,
    Complete,
}

// Internal bookkeeping for a fetch job
#[derive(Debug, Serialize, Deserialize, Clone)]
#[serde(rename_all = "lowercase")]
pub struct FetchMetadata {
    tries: u32,
    // The history of failures seen with this job
    trace: Vec<FetchFailure>,
}

// This is what we put in the parameters of the job queue for the next
// worker to pick up
#[derive(Debug, Serialize, Deserialize, Clone)]
#[serde(tag = "status", rename_all = "lowercase")]
pub enum FetchResult {
    Success { response: FetchResponse },
    Failure { trace: Vec<FetchFailure> }, // If we failed entirely to fetch the job, we return the trace for user debugging
}

impl FetchResult {
    pub fn is_success(&self) -> bool {
        matches!(self, FetchResult::Success { .. })
    }
}

// We distinguish between a "fetch failure" and a "worker failure" -
// worker failures are internal-only, and do not count against the
// retries of a job (generally, on worker failure, the job is either
// moved to the dead letter queue, or dropped and left to the janitor to
// reset). Fetch failures are, after retries, returned to the queue, and
// represent the result of the fetch operation.
#[derive(Debug, Serialize, Deserialize, Clone)]
#[serde(rename_all = "lowercase")]
pub struct FetchFailure {
    pub kind: FetchFailureKind,
    pub message: String,
    pub body: Option<String>, // If we have a body, we include it in the failure
    pub headers: Option<HashMap<String, String>>, // If we have headers, we include them in the failure
    pub status: Option<u16>, // If we have a status, we include it in the failure
    pub timestamp: DateTime<Utc>, // Useful for users to correlate logs when debugging
}

impl FetchFailure {
    pub fn new(kind: FetchFailureKind, message: impl AsRef<str>) -> Self {
        Self {
            kind,
            message: message.as_ref().to_string(),
            timestamp: Utc::now(),
            body: None,
            headers: None,
            status: None,
        }
    }

    pub fn failure_status(status: StatusCode) -> Self {
        Self {
            kind: FetchFailureKind::FailureStatus,
            message: format!("Received failure status: {}", status),
            timestamp: Utc::now(),
            body: None,
            headers: None,
            status: Some(status.as_u16()),
        }
    }

    pub fn with_body(self, body: String) -> Self {
        Self {
            body: Some(body),
            ..self
        }
    }

    pub fn with_headers(self, headers: HashMap<String, String>) -> Self {
        Self {
            headers: Some(headers),
            ..self
        }
    }

    pub fn with_status(self, status: u16) -> Self {
        Self {
            status: Some(status),
            ..self
        }
    }
}

impl From<reqwest::Error> for FetchFailure {
    fn from(e: reqwest::Error) -> Self {
        let kind = if e.is_timeout() {
            FetchFailureKind::Timeout
        } else {
            FetchFailureKind::RequestError
        };
        Self {
            kind,
            message: e.to_string(),
            timestamp: Utc::now(),
            body: None,
            headers: None,
            status: None,
        }
    }
}

#[derive(Debug, Serialize, Deserialize, Clone, Copy)]
#[serde(rename_all = "lowercase")]
pub enum FetchFailureKind {
    Timeout,
    TimeoutGettingBody,
    MissingParameters,
    InvalidParameters,
    RequestError,
    FailureStatus,
    InvalidBody, // Generally means the body could not be parsed to a utf8 string
    ResponseTooLarge,
}

#[derive(Debug, Serialize, Deserialize, Clone)]
#[serde(rename_all = "lowercase")]
pub struct FetchResponse {
    pub status: u16,
    pub headers: HashMap<String, String>,
    pub body: String,
}

pub fn report_worker_saturation(context: &AppContext) {
    metrics::gauge!("fetch_worker_available_permits")
        .set(context.concurrency_limit.available_permits() as f64);
}

pub async fn tick(context: Arc<AppContext>) -> Result<usize, FetchError> {
    report_worker_saturation(&context);

    let max_jobs = min(
        context.concurrency_limit.available_permits(),
        context.config.batch_size,
    );

    let jobs = context
        .worker
        .dequeue_jobs(&context.config.queue_served, max_jobs)
        .await?;

    let num_jobs = jobs.len();

    for job in jobs {
        let context = context.clone();
        // We grab job permits individually, so that as soon as a job is finished, the
        // permit to run another job is immediately available. This call should
        // never block, since we only ever dequeue as many jobs as we have permits
        // available.
        let permit = context
            .concurrency_limit
            .clone()
            .acquire_owned()
            .await
            .unwrap();
        tokio::spawn(async move {
            // TODO - since worker errors are never an indication of a fetch failure,
            // only of some internal worker issue, we should report unhealthy or fall
            // over or something here.
            if let Err(e) = run_job(context.clone(), job, permit).await {
                error!("Error running job: {:?}", e);
            }
        });
    }

    Ok(num_jobs)
}

// Mostly a thin wrapper to make ser/de a bit easier
struct FetchJob<'a> {
    _job: &'a Job,
    metadata: FetchMetadata,
    parameters: FetchParameters,
}

impl<'a> TryFrom<&'a Job> for FetchJob<'a> {
    type Error = FetchFailure;

    fn try_from(job: &'a Job) -> Result<Self, Self::Error> {
        let Some(parameters) = &job.parameters else {
            return Err(FetchFailure::new(
                FetchFailureKind::MissingParameters,
                "Job is missing parameters",
            ));
        };
        let parameters: FetchParameters = match serde_json::from_str(parameters) {
            Ok(p) => p,
            Err(e) => {
                return Err(FetchFailure::new(
                    FetchFailureKind::InvalidParameters,
                    format!("Failed to parse parameters: {}", e),
                ))
            }
        };
        let metadata = match &job.metadata {
            Some(m) => match serde_json::from_str(m) {
                Ok(m) => m,
                Err(_) => {
                    // If we can't decode the metadata, assume this is the first time we've seen the job
                    // TODO - this is maybe too lenient, I'm not sure.
                    FetchMetadata {
                        tries: 0,
                        trace: vec![],
                    }
                }
            },
            None => FetchMetadata {
                tries: 0,
                trace: vec![],
            },
        };
        Ok(Self {
            _job: job,
            metadata,
            parameters,
        })
    }
}

pub async fn run_job(
    context: Arc<AppContext>,
    job: Job,
    _permit: OwnedSemaphorePermit,
) -> Result<(), FetchError> {
    let parsed: FetchJob = match (&job).try_into() {
        Ok(p) => p,
        Err(e) => return dead_letter_job(&context.worker, job, vec![e]).await,
    };

    let method: http::Method = (&parsed.parameters.method).into();

    // Parsing errors are always dead letters - it /will/ fail every time, so dump it
    // TODO - We should probably decide whether to dead letter or return Failed on the basis of OnFinish,
    // in case the caller wants to do any cleanup on broken jobs
    let url: reqwest::Url = match (parsed.parameters.url).parse() {
        Ok(u) => u,
        Err(e) => {
            return dead_letter_job(
                &context.worker,
                job,
                vec![FetchFailure::new(
                    FetchFailureKind::InvalidParameters,
                    format!("Invalid url: {}", e),
                )],
            )
            .await;
        }
    };
    let headers: reqwest::header::HeaderMap =
        match (&parsed.parameters.headers.unwrap_or_default()).try_into() {
            Ok(h) => h,
            Err(e) => {
                return dead_letter_job(
                    &context.worker,
                    job,
                    vec![FetchFailure::new(
                        FetchFailureKind::InvalidParameters,
                        format!("Invalid headers: {}", e),
                    )],
                )
                .await;
            }
        };

    let body = reqwest::Body::from(parsed.parameters.body.unwrap_or_default());

    let send_fut = context
        .client
        .request(method, url)
        .headers(headers)
        .body(body)
        .send();

    let mut send_fut = Box::pin(send_fut);

    let start = Utc::now();
    let res = loop {
        tokio::select! {
            res = &mut send_fut => {
                break res
            }
            _ = tokio::time::sleep(Duration::milliseconds(HEARTBEAT_INTERVAL_MS).to_std().unwrap()) => {
                context.worker.heartbeat(job.id).await?;
            }
        }
    };

    // If we took, say, 25% of the heartbeat interval to send the request, we may as well heartbeat now
    if Utc::now() - start > Duration::milliseconds(HEARTBEAT_INTERVAL_MS / 4) {
        context.worker.heartbeat(job.id).await?;
    }

    let res = match res {
        Ok(r) => r,
        Err(e) => {
            return handle_fetch_failure(
                &context,
                &job,
                &parsed.metadata,
                parsed.parameters.max_tries.unwrap_or(DEFAULT_RETRIES),
                parsed.parameters.return_queue,
                parsed.parameters.on_finish.unwrap_or(DEFAULT_ON_FINISH),
                e,
            )
            .await
        }
    };

    // Grab the response metadata, since getting the body moves it
    let status = res.status();
    let headers: HashMap<String, String> = res
        .headers()
        .iter()
        .map(|(k, v)| {
            (
                k.as_str().to_string(),
                v.to_str().unwrap_or_default().to_string(),
            )
        })
        .collect();

    // We pre-emptively get the response body, because we include it in the failure trace, even if we got a failure status
    let body = first_n_bytes_of_response(
        &context.worker,
        &job,
        res,
        context.config.max_response_bytes,
    )
    .await?;
    let body = match body {
        Ok(b) => b,
        Err(e) => {
            // Tag the status and headers onto the failure
            let e = e.with_status(status.as_u16()).with_headers(headers);
            return handle_fetch_failure(
                &context,
                &job,
                &parsed.metadata,
                parsed.parameters.max_tries.unwrap_or(DEFAULT_RETRIES),
                parsed.parameters.return_queue,
                parsed.parameters.on_finish.unwrap_or(DEFAULT_ON_FINISH),
                e,
            )
            .await;
        }
    };

    // TODO - we should handle "retryable" and "permanent" failures differently, mostly
    // to be polite - retrying a permanent failure isn't a correctness problem, but it's
    // rude (and inefficient)
    if !status.is_success() {
        let failure = FetchFailure::failure_status(status)
            .with_body(body)
            .with_headers(headers);
        return handle_fetch_failure(
            &context,
            &job,
            &parsed.metadata,
            parsed.parameters.max_tries.unwrap_or(DEFAULT_RETRIES),
            parsed.parameters.return_queue,
            parsed.parameters.on_finish.unwrap_or(DEFAULT_ON_FINISH),
            failure,
        )
        .await;
    }

    let result = FetchResult::Success {
        response: FetchResponse {
            status: status.as_u16(),
            headers,
            body,
        },
    };

    complete_job(
        &context.worker,
        &job,
        parsed.parameters.return_queue,
        parsed.parameters.on_finish.unwrap_or(DEFAULT_ON_FINISH),
        result,
    )
    .await
}

// Checks if the retry limit has been reached, and does one of:
// - Schedule the job for retry, doing metadata bookkeeping
// - Complete the job, with the failure trace
#[allow(clippy::too_many_arguments)]
pub async fn handle_fetch_failure<F>(
    context: &AppContext,
    job: &Job,
    metadata: &FetchMetadata,
    max_tries: u32,
    return_queue: String,
    on_finish: OnFinish,
    failure: F,
) -> Result<(), FetchError>
where
    F: Into<FetchFailure>,
{
    let failure = failure.into();
    let mut metadata = metadata.clone();
    metadata.tries += 1;
    metadata.trace.push(failure);

    // TODO - right now we treat all failures as retryable, but we should probably be more aggressive in
    // culling retries for permanent failures (this is less of a correctness issue and more of an efficiency/
    // politeness one). We might also want to make backoff configurable.
    if metadata.tries < min(max_tries, context.config.max_retry_attempts) {
        let next_available =
            Utc::now() + (context.config.retry_backoff_base * (metadata.tries as i32));
        // We back off for at most an hour (since callers can configure max retries to be very high)
        let next_available = min(next_available, Utc::now() + Duration::hours(1));
        // Add some seconds of jitter
        let next_available =
            next_available + Duration::seconds((rand::random::<u64>() % 30) as i64);

        // Set us up for a retry - update metadata, reschedule, and put back in the queue we pulled from
        context
            .worker
            .set_metadata(job.id, Some(serde_json::to_string(&metadata)?))?;
        context.worker.set_state(job.id, JobState::Available)?;
        context.worker.set_queue(job.id, &job.queue_name)?;
        context.worker.set_scheduled_at(job.id, next_available)?;

        // We downgrade the priority of jobs that fail, so first attempts at jobs get better QoS
        context.worker.set_priority(job.id, job.priority + 1)?;

        context.worker.flush_job(job.id).await?;
    } else {
        // Complete the job, with a Failed result
        let result = FetchResult::Failure {
            trace: metadata.trace.clone(),
        };
        complete_job(&context.worker, job, return_queue, on_finish, result).await?;
    }

    Ok(())
}

// Complete the job, either because we got a good response, or because the job's retries
// have been exceeded.
pub async fn complete_job(
    worker: &Worker,
    job: &Job,
    return_queue: String,
    on_finish: OnFinish,
    result: FetchResult,
) -> Result<(), FetchError> {
    // If we fail any serde, we just want to flush to the DLQ and bail
    worker.set_state(job.id, JobState::Available)?;
    worker.set_queue(job.id, DEAD_LETTER_QUEUE)?;

    let is_success = result.is_success();

    let result = match serde_json::to_string(&result) {
        Ok(r) => r,
        Err(e) => {
            // Leave behind a hint for debugging
            worker.set_metadata(job.id, Some(format!("Failed to serialise result: {}", e)))?;
            worker.flush_job(job.id).await?;
            return Err(FetchError::SerdeError(e));
        }
    };

    worker.set_queue(job.id, &return_queue)?;

    match (is_success, on_finish) {
        (true, _) | (false, OnFinish::Return) => {
            worker.set_state(job.id, JobState::Available)?;
        }
        (false, OnFinish::Complete) => {
            worker.set_state(job.id, JobState::Failed)?;
        }
    }

    worker.set_parameters(job.id, Some(result))?;
    worker.set_metadata(job.id, None)?; // We're finished with the job, so clear our internal state
    worker.flush_job(job.id).await?;

    Ok(())
}

// This moves the job to a dead letter queue, and sets the state to Available (to prevent it
// from being deleted by the janitor). This is for debugging purposes, and only really jobs
// that have some parsing failure on dequeue end up here (as they indicate a programming error
// in the caller, or the worker)
pub async fn dead_letter_job(
    worker: &Worker,
    job: Job,
    errors: Vec<FetchFailure>,
) -> Result<(), FetchError> {
    worker.set_state(job.id, JobState::Available)?;
    worker.set_queue(job.id, DEAD_LETTER_QUEUE)?;

    let result = FetchResult::Failure { trace: errors };
    let result = match serde_json::to_string(&result) {
        Ok(r) => r,
        Err(e) => {
            worker.set_metadata(
                job.id,
                Some(format!(
                    "Failed to serialise result during DLQ write: {}",
                    e
                )),
            )?;
            worker.flush_job(job.id).await?;
            return Err(FetchError::SerdeError(e));
        }
    };

    worker.set_parameters(job.id, Some(result))?;

    worker.flush_job(job.id).await?;

    Ok(())
}

// Pulls the body, while maintaining the job heartbeat.
pub async fn first_n_bytes_of_response(
    worker: &Worker,
    job: &Job,
    response: Response,
    n: usize,
) -> Result<Result<String, FetchFailure>, FetchError> {
    let mut body = response.bytes_stream();
    // We deserialize into a Vec<u8>, and then parse to a string
    let mut buffer = Vec::with_capacity(n);

    worker.heartbeat(job.id).await?;

    loop {
        tokio::select! {
            chunk = body.next() => {
                let chunk = match chunk {
                    Some(Ok(c)) => c,
                    Some(Err(e)) => return Ok(Err(FetchFailure::from(e))),
                    None => break,
                };

                buffer.extend_from_slice(&chunk);

                if buffer.len() >= n {
                    return Ok(Err(
                        FetchFailure::new(FetchFailureKind::ResponseTooLarge, "Response too large")
                    ));
                };
            }
            _ = tokio::time::sleep(Duration::milliseconds(HEARTBEAT_INTERVAL_MS).to_std().unwrap()) => {}
        }
        // Heartbeat every time we get a new body chunk, or every HEARTBEAT_INTERVAL_MS
        worker.heartbeat(job.id).await?;
    }

    let Ok(body) = String::from_utf8(buffer) else {
        return Ok(Err(FetchFailure::new(
            FetchFailureKind::InvalidBody,
            "Body could not be parsed as utf8",
        )));
    };

    Ok(Ok(body))
}
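
For orientation, here is a minimal sketch of what a caller enqueues to drive this worker: a job on the served queue whose parameters field is a serialised FetchParameters. The team id, URL, and return queue are placeholders, not part of this commit (compare construct_job in the test utilities below):

use chrono::Utc;
use cyclotron_core::{base_ops::JobInit, manager::QueueManager};
use cyclotron_fetch::fetch::{FetchParameters, HttpMethod};

async fn enqueue_fetch(manager: &QueueManager) {
    let params = FetchParameters {
        url: "https://example.com/hook".to_string(), // placeholder target
        method: HttpMethod::Post,
        return_queue: "hog".to_string(), // where the FetchResult gets re-queued
        headers: None,
        body: Some("{}".to_string()),
        max_tries: None, // defaults to 3
        on_finish: None, // defaults to OnFinish::Return
    };
    let job = JobInit {
        team_id: 1,
        function_id: None,
        queue_name: "fetch".to_string(), // the queue this worker serves by default
        priority: 0,
        scheduled: Utc::now(), // available immediately
        vm_state: None,
        parameters: Some(serde_json::to_string(&params).unwrap()),
        metadata: None, // the fetch worker owns this field for retry bookkeeping
    };
    manager.create_job(job).await.expect("failed to create job");
}
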
3
rust/cyclotron-fetch/src/lib.rs
Normal file
@ -0,0 +1,3 @@
pub mod config;
pub mod context;
pub mod fetch;
98
rust/cyclotron-fetch/src/main.rs
Normal file
@ -0,0 +1,98 @@
use axum::{extract::State, routing::get, Router};
use common_metrics::setup_metrics_routes;
use cyclotron_fetch::{
    config::Config,
    context::AppContext,
    fetch::{tick, FetchError},
};
use envconfig::Envconfig;
use health::HealthRegistry;
use std::{future::ready, sync::Arc};
use tracing::{error, info};

async fn listen(app: Router, bind: String) -> Result<(), std::io::Error> {
    let listener = tokio::net::TcpListener::bind(bind).await?;

    axum::serve(listener, app).await?;

    Ok(())
}

// For axum's state stuff
#[derive(Clone)]
struct WorkerId(pub String);

pub fn app(liveness: HealthRegistry, worker_id: String) -> Router {
    Router::new()
        .route("/", get(index))
        .route("/_readiness", get(index))
        .route("/_liveness", get(move || ready(liveness.get_status())))
        .with_state(WorkerId(worker_id))
}

async fn index(State(worker_id): State<WorkerId>) -> String {
    format!("cyclotron-fetch {}", worker_id.0)
}

async fn worker_loop(context: AppContext) -> Result<(), FetchError> {
    let context = Arc::new(context);
    loop {
        context.liveness.report_healthy().await;
        let started = tick(context.clone()).await?;
        info!("started {} jobs", started);
        // This will happen if 1) there are no jobs or 2) we have no capacity to start new jobs. Either way, we should sleep for a bit
        if started == 0 {
            tokio::time::sleep(context.config.job_poll_interval.to_std().unwrap()).await;
        }
    }
}

#[tokio::main]
async fn main() {
    let config = Config::init_from_env().expect("failed to load configuration from env");
    tracing_subscriber::fmt::init();

    let liveness = HealthRegistry::new("liveness");

    let (app_config, pool_config) = config.to_components();
    let bind = format!("{}:{}", app_config.host, app_config.port);

    info!(
        "Fetch worker starting with ID {:?}, listening at {}",
        app_config.worker_id, bind
    );

    let worker_liveness = liveness
        .register(
            "worker".to_string(),
            (app_config.job_poll_interval * 4).to_std().unwrap(),
        )
        .await;

    let app = setup_metrics_routes(app(liveness, app_config.worker_id.clone()));

    let context = AppContext::create(app_config, pool_config, worker_liveness)
        .await
        .expect("failed to create app context");

    let http_server = tokio::spawn(listen(app, bind));

    let worker_loop = tokio::spawn(worker_loop(context));

    tokio::select! {
        res = worker_loop => {
            error!("worker loop exited");
            if let Err(e) = res {
                error!("worker loop failed with: {}", e)
            }
        }
        res = http_server => {
            error!("http server exited");
            if let Err(e) = res {
                error!("server failed with: {}", e)
            }
        }
    }

    info!("exiting");
}
293
rust/cyclotron-fetch/tests/fetch.rs
Normal file
@ -0,0 +1,293 @@
use std::{collections::HashMap, str::FromStr, sync::Arc};

use chrono::Duration;
use cyclotron_core::{manager::QueueManager, worker::Worker};
use cyclotron_fetch::fetch::{tick, FetchResult, HttpMethod};
use httpmock::{Method, MockServer};
use serde_json::json;
use sqlx::PgPool;
use utils::{
    construct_job, construct_params, get_app_test_context, make_immediately_available,
    wait_on_no_running, wait_on_return,
};

mod utils;

#[sqlx::test(migrations = "../cyclotron-core/migrations")]
pub async fn test_completes_fetch(db: PgPool) {
    let context = Arc::new(get_app_test_context(db.clone()).await);
    let producer = QueueManager::from_pool(db.clone());
    let return_worker = Worker::from_pool(db.clone());
    let server = MockServer::start();

    let mock = server.mock(|when, then| {
        when.method(Method::GET).path("/test");
        then.status(200).body("Hello, world!");
    });

    let params = construct_params(server.url("/test"), HttpMethod::Get);
    let job = construct_job(params);
    producer.create_job(job).await.unwrap();

    let started = tick(context).await.unwrap();

    assert_eq!(started, 1);

    let returned = wait_on_return(&return_worker, 1, false).await.unwrap();

    let response: FetchResult =
        serde_json::from_str(returned[0].parameters.as_ref().unwrap()).unwrap();

    let FetchResult::Success { response } = response else {
        panic!("Expected success response");
    };

    assert_eq!(response.status, 200);
    assert_eq!(response.body, "Hello, world!");

    mock.assert_hits(1);
}

#[sqlx::test(migrations = "../cyclotron-core/migrations")]
pub async fn test_returns_failure_after_retries(db: PgPool) {
    let context = Arc::new(get_app_test_context(db.clone()).await);
    let producer = QueueManager::from_pool(db.clone());
    let return_worker = Worker::from_pool(db.clone());
    let server = MockServer::start();

    let mock = server.mock(|when, then| {
        when.method(Method::GET).path("/test");
        then.status(500).body("test server error body");
    });

    let mut params = construct_params(server.url("/test"), HttpMethod::Get);
    params.max_tries = Some(2);

    let job = construct_job(params);
    producer.create_job(job).await.unwrap();

    // Tick twice for retry
    let started = tick(context.clone()).await.unwrap();
    assert_eq!(started, 1);
    wait_on_no_running(&db, Duration::milliseconds(100)).await;
    make_immediately_available(&db).await;
    let started = tick(context.clone()).await.unwrap();
    assert_eq!(started, 1);
    wait_on_no_running(&db, Duration::milliseconds(100)).await;

    let returned = wait_on_return(&return_worker, 1, false).await.unwrap();

    let response: FetchResult =
        serde_json::from_str(returned[0].parameters.as_ref().unwrap()).unwrap();

    let FetchResult::Failure { trace } = response else {
        panic!("Expected failure response");
    };

    assert!(trace.len() == 2);
    for attempt in trace {
        assert_eq!(attempt.status, Some(500));
        assert_eq!(attempt.body, Some("test server error body".to_string()));
    }

    mock.assert_hits(2);
}

#[sqlx::test(migrations = "../cyclotron-core/migrations")]
pub async fn fetch_discards_bad_metadata(db: PgPool) {
    let context = Arc::new(get_app_test_context(db.clone()).await);
    let producer = QueueManager::from_pool(db.clone());
    let return_worker = Worker::from_pool(db.clone());
    let server = MockServer::start();

    let mock = server.mock(|when, then| {
        when.method(Method::GET).path("/test");
        then.status(200).body("Hello, world!");
    });

    let params = construct_params(server.url("/test"), HttpMethod::Get);
    let mut job = construct_job(params);
    job.metadata = Some("bad json".to_string());
    producer.create_job(job).await.unwrap();

    let started = tick(context).await.unwrap();

    assert_eq!(started, 1);

    let returned = wait_on_return(&return_worker, 1, false).await.unwrap();

    let response: FetchResult =
        serde_json::from_str(returned[0].parameters.as_ref().unwrap()).unwrap();

    let FetchResult::Success { response } = response else {
        panic!("Expected success response");
    };

    assert_eq!(response.status, 200);
    assert_eq!(response.body, "Hello, world!");

    mock.assert_hits(1);
}

#[sqlx::test(migrations = "../cyclotron-core/migrations")]
pub async fn fetch_with_minimum_params_works(db: PgPool) {
    let context = Arc::new(get_app_test_context(db.clone()).await);
    let producer = QueueManager::from_pool(db.clone());
    let return_worker = Worker::from_pool(db.clone());
    let server = MockServer::start();

    let mock = server.mock(|when, then| {
        when.method(Method::GET).path("/test");
        then.status(200).body("Hello, world!");
    });

    let params = construct_params(server.url("/test"), HttpMethod::Get);
    let mut job = construct_job(params);

    let url = server.url("/test");
    let manual_params = json!({
        "url": url,
        "method": "GET",
        "return_queue": "return",
    })
    .to_string();

    job.parameters = Some(manual_params);

    producer.create_job(job).await.unwrap();

    let started = tick(context).await.unwrap();

    assert_eq!(started, 1);

    let returned = wait_on_return(&return_worker, 1, false).await.unwrap();

    let response: FetchResult =
        serde_json::from_str(returned[0].parameters.as_ref().unwrap()).unwrap();

    let FetchResult::Success { response } = response else {
        panic!("Expected success response");
    };

    assert_eq!(response.status, 200);
    assert_eq!(response.body, "Hello, world!");

    mock.assert_hits(1);
}

#[sqlx::test(migrations = "../cyclotron-core/migrations")]
pub async fn test_completes_fetch_with_headers(db: PgPool) {
    let context = Arc::new(get_app_test_context(db.clone()).await);
    let producer = QueueManager::from_pool(db.clone());
    let return_worker = Worker::from_pool(db.clone());
    let server = MockServer::start();

    let mock = server.mock(|when, then| {
        when.method(Method::GET)
            .path("/test")
            .header("X-Test", "test");
        then.status(200).body("Hello, world!");
    });

    let mut params = construct_params(server.url("/test"), HttpMethod::Get);
    let mut headers = HashMap::new();
    headers.insert("X-Test".to_string(), "test".to_string());
    params.headers = Some(headers);

    let job = construct_job(params);
    producer.create_job(job).await.unwrap();

    let started = tick(context).await.unwrap();

    assert_eq!(started, 1);

    let returned = wait_on_return(&return_worker, 1, false).await.unwrap();

    let response: FetchResult =
        serde_json::from_str(returned[0].parameters.as_ref().unwrap()).unwrap();

    let FetchResult::Success { response } = response else {
        panic!("Expected success response");
    };

    assert_eq!(response.status, 200);
    assert_eq!(response.body, "Hello, world!");

    mock.assert_hits(1);
}

#[sqlx::test(migrations = "../cyclotron-core/migrations")]
pub async fn test_completes_fetch_with_body(db: PgPool) {
    let context = Arc::new(get_app_test_context(db.clone()).await);
    let producer = QueueManager::from_pool(db.clone());
    let return_worker = Worker::from_pool(db.clone());
    let server = MockServer::start();

    let mock = server.mock(|when, then| {
        when.method(Method::POST).path("/test").body("test body");
        then.status(200).body("Hello, world!");
    });

    let mut params = construct_params(server.url("/test"), HttpMethod::Post);
    params.body = Some("test body".to_string());

    let job = construct_job(params);
    producer.create_job(job).await.unwrap();

    let started = tick(context).await.unwrap();

    assert_eq!(started, 1);

    let returned = wait_on_return(&return_worker, 1, false).await.unwrap();

    let response: FetchResult =
        serde_json::from_str(returned[0].parameters.as_ref().unwrap()).unwrap();

    let FetchResult::Success { response } = response else {
        panic!("Expected success response");
    };

    assert_eq!(response.status, 200);
    assert_eq!(response.body, "Hello, world!");

    mock.assert_hits(1);
}

#[sqlx::test(migrations = "../cyclotron-core/migrations")]
pub async fn test_completes_fetch_with_vm_state(db: PgPool) {
    let context = Arc::new(get_app_test_context(db.clone()).await);
    let producer = QueueManager::from_pool(db.clone());
    let return_worker = Worker::from_pool(db.clone());
    let server = MockServer::start();

    let mock = server.mock(|when, then| {
        when.method(Method::GET).path("/test");
        then.status(200).body("Hello, world!");
    });

    let params = construct_params(server.url("/test"), HttpMethod::Get);
    let mut job = construct_job(params);
    job.vm_state = Some(json!({"test": "state"}).to_string());
    producer.create_job(job).await.unwrap();

    let started = tick(context).await.unwrap();

    assert_eq!(started, 1);

    let returned = wait_on_return(&return_worker, 1, true).await.unwrap();

    let state = serde_json::Value::from_str(returned[0].vm_state.as_ref().unwrap()).unwrap();
    assert_eq!(state, json!({"test": "state"}));

    let response: FetchResult =
        serde_json::from_str(returned[0].parameters.as_ref().unwrap()).unwrap();

    let FetchResult::Success { response } = response else {
        panic!("Expected success response");
    };

    assert_eq!(response.status, 200);
    assert_eq!(response.body, "Hello, world!");

    mock.assert_hits(1);
}
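
On the consumer side, the return queue holds the serialised FetchResult in the job's parameters. With the serde tagging above, that JSON looks like {"status":"success","response":{"status":200,"headers":{...},"body":"..."}} or {"status":"failure","trace":[...]}. A minimal decoding sketch (the queue name is illustrative):

use cyclotron_core::{error::QueueError, worker::Worker};
use cyclotron_fetch::fetch::FetchResult;

async fn read_results(worker: &Worker) -> Result<(), QueueError> {
    for job in worker.dequeue_jobs("return", 10).await? {
        // complete_job always writes the result into parameters before flushing
        let raw = job.parameters.as_deref().expect("fetch worker always sets parameters");
        match serde_json::from_str::<FetchResult>(raw).expect("invalid FetchResult") {
            FetchResult::Success { response } => println!("{}: {}", response.status, response.body),
            FetchResult::Failure { trace } => println!("failed after {} attempts", trace.len()),
        }
    }
    Ok(())
}
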
127
rust/cyclotron-fetch/tests/utils.rs
Normal file
@ -0,0 +1,127 @@
use std::sync::Arc;

use chrono::{Duration, Utc};
use cyclotron_core::{
    base_ops::{Job, JobInit},
    error::QueueError,
    worker::Worker,
};
use cyclotron_fetch::{
    config::AppConfig,
    context::AppContext,
    fetch::{FetchParameters, HttpMethod},
};
use sqlx::PgPool;
use tokio::sync::Semaphore;

const FETCH_QUEUE: &str = "fetch";
const RETURN_QUEUE: &str = "return";

pub async fn get_app_test_context(db: PgPool) -> AppContext {
    let worker = Worker::from_pool(db.clone());
    let client = reqwest::Client::new();
    let concurrency_limit = Arc::new(Semaphore::new(1));
    let health = health::HealthRegistry::new("test");
    let liveness = health
        .register("test".to_string(), Duration::seconds(30).to_std().unwrap())
        .await;

    let config = AppConfig {
        fetch_timeout: Duration::seconds(10),
        concurrent_requests_limit: 1,
        host: "localhost".to_string(),
        port: 16,
        worker_id: "test".to_string(),
        job_poll_interval: Duration::seconds(10),
        max_retry_attempts: 3,
        queue_served: FETCH_QUEUE.to_string(),
        batch_size: 1000,
        max_response_bytes: 1024 * 1024,
        retry_backoff_base: Duration::milliseconds(1000),
        allow_internal_ips: true,
    };

    AppContext {
        worker,
        client,
        concurrency_limit,
        liveness,
        config,
    }
}

pub fn construct_params(url: String, method: HttpMethod) -> FetchParameters {
    FetchParameters {
        url,
        method,
        return_queue: RETURN_QUEUE.to_string(),
        headers: None,
        body: None,
        max_tries: None,
        on_finish: None,
    }
}

pub fn construct_job(parameters: FetchParameters) -> JobInit {
    JobInit {
        team_id: 1,
        queue_name: FETCH_QUEUE.to_string(),
        priority: 0,
        scheduled: Utc::now() - Duration::seconds(1),
        function_id: None,
        vm_state: None,
        parameters: Some(serde_json::to_string(&parameters).unwrap()),
        metadata: None,
    }
}

pub async fn wait_on_return(
    worker: &Worker,
    count: usize,
    with_vm: bool,
) -> Result<Vec<Job>, QueueError> {
    let timeout = Duration::seconds(1);
    let start = Utc::now();
    let mut returned = vec![];
    while start + timeout > Utc::now() {
        let mut jobs = if with_vm {
            worker.dequeue_with_vm_state(RETURN_QUEUE, 1).await?
        } else {
            worker.dequeue_jobs(RETURN_QUEUE, 1).await?
        };
        returned.append(&mut jobs);
        if returned.len() == count {
            return Ok(returned);
        }
        if returned.len() > count {
            panic!("Too many jobs returned");
        }
    }
    panic!("Timeout waiting for jobs to return");
}

pub async fn wait_on_no_running(pool: &PgPool, max_time: Duration) {
    let start = Utc::now();
    loop {
        let running: i64 =
            sqlx::query_scalar("SELECT COUNT(*) FROM cyclotron_jobs WHERE state = 'running'")
                .fetch_one(pool)
                .await
                .unwrap();
        if running == 0 {
            return;
        }
        if Utc::now() - start > max_time {
            panic!("Timeout waiting for jobs to finish");
        }
    }
}

pub async fn make_immediately_available(pool: &PgPool) {
    sqlx::query(
        "UPDATE cyclotron_jobs SET scheduled = NOW() - INTERVAL '1 second' WHERE state = 'available'",
    )
    .execute(pool)
    .await
    .unwrap();
}
22 rust/cyclotron-janitor/Cargo.toml Normal file
@ -0,0 +1,22 @@
[package]
name = "cyclotron-janitor"
version = "0.1.0"
edition = "2021"

[lints]
workspace = true

[dependencies]
tracing-subscriber = { workspace = true }
sqlx = { workspace = true }
chrono = { workspace = true }
tokio = { workspace = true }
tracing = { workspace = true }
uuid = { workspace = true }
envconfig = { workspace = true }
axum = { workspace = true }
eyre = { workspace = true }
metrics = { workspace = true }
cyclotron-core = { path = "../cyclotron-core" }
common-metrics = { path = "../common/metrics" }
health = { path = "../common/health" }
23 rust/cyclotron-janitor/bin/entrypoint.sh Executable file
@ -0,0 +1,23 @@
#!/bin/bash
set -e

# All possible env vars are set here; tune them as you like.
# Names match the envconfig mappings in src/config.rs.
export RUST_LOG="INFO"
export BIND_HOST="::"
export BIND_PORT="3302"
export DATABASE_URL="postgres://posthog:posthog@localhost:5432/cyclotron"
export CLEANUP_INTERVAL_SECS="10"
export PG_MAX_CONNECTIONS="10"
export PG_MIN_CONNECTIONS="1"
export PG_ACQUIRE_TIMEOUT_SECONDS="5"
export PG_MAX_LIFETIME_SECONDS="300"
export PG_IDLE_TIMEOUT_SECONDS="60"
export JANITOR_ID="test-janitor"
export JANITOR_MAX_TOUCHES="2"
export JANITOR_STALL_TIMEOUT_SECONDS="30"

# Uncomment this to have the database be reset every time you start the janitor
# sqlx database reset -y --source ../cyclotron-core/migrations
sqlx migrate run --source ../cyclotron-core/migrations

cargo run --release
83 rust/cyclotron-janitor/src/config.rs Normal file
@ -0,0 +1,83 @@
use chrono::Duration;

use cyclotron_core::PoolConfig;
use envconfig::Envconfig;
use uuid::Uuid;

#[derive(Envconfig)]
pub struct Config {
    #[envconfig(from = "BIND_HOST", default = "::")]
    pub host: String,

    #[envconfig(from = "BIND_PORT", default = "3303")]
    pub port: u16,

    #[envconfig(default = "postgres://posthog:posthog@localhost:5432/cyclotron")]
    pub database_url: String,

    #[envconfig(default = "30")]
    pub cleanup_interval_secs: u64,

    #[envconfig(default = "10")]
    pub pg_max_connections: u32,

    #[envconfig(default = "1")]
    pub pg_min_connections: u32,

    #[envconfig(default = "30")]
    pub pg_acquire_timeout_seconds: u64,

    #[envconfig(default = "300")]
    pub pg_max_lifetime_seconds: u64,

    #[envconfig(default = "60")]
    pub pg_idle_timeout_seconds: u64,

    // Generally, this should be equivalent to a "shard id", as only one janitor should be running
    // per shard
    pub janitor_id: Option<String>,

    #[envconfig(default = "10")]
    pub janitor_max_touches: i16,

    #[envconfig(default = "60")]
    pub janitor_stall_timeout_seconds: u16,
}

impl Config {
    pub fn get_janitor_config(&self) -> JanitorConfig {
        let pool_config = PoolConfig {
            db_url: self.database_url.clone(),
            max_connections: Some(self.pg_max_connections),
            min_connections: Some(self.pg_min_connections),
            acquire_timeout_seconds: Some(self.pg_acquire_timeout_seconds),
            max_lifetime_seconds: Some(self.pg_max_lifetime_seconds),
            idle_timeout_seconds: Some(self.pg_idle_timeout_seconds),
        };

        let settings = JanitorSettings {
            stall_timeout: Duration::seconds(self.janitor_stall_timeout_seconds as i64),
            max_touches: self.janitor_max_touches,
            id: self
                .janitor_id
                .clone()
                .unwrap_or_else(|| Uuid::now_v7().to_string()),
        };

        JanitorConfig {
            pool: pool_config,
            settings,
        }
    }
}

pub struct JanitorConfig {
    pub pool: PoolConfig,
    pub settings: JanitorSettings,
}

pub struct JanitorSettings {
    pub stall_timeout: Duration,
    pub max_touches: i16,
    pub id: String,
}
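One non-obvious detail above: envconfig derives each environment variable name from the upper-cased field name unless a `from` override is given, so `cleanup_interval_secs` reads `CLEANUP_INTERVAL_SECS`, while host and port are explicitly mapped to `BIND_HOST`/`BIND_PORT`. A quick sketch of exercising that mapping, assuming the `Config` type above (the values are placeholders):

use envconfig::Envconfig;

// Sketch only: set a couple of env vars and load the janitor Config above.
// JANITOR_ID is deliberately left unset, so get_janitor_config() falls back
// to a fresh UUIDv7 for the janitor id.
fn load_config() -> Config {
    std::env::set_var("BIND_PORT", "3999");
    std::env::set_var("CLEANUP_INTERVAL_SECS", "15");
    Config::init_from_env().expect("failed to load configuration from env")
}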
136 rust/cyclotron-janitor/src/janitor.rs Normal file
@ -0,0 +1,136 @@
use chrono::Utc;
use cyclotron_core::{
    error::QueueError,
    janitor_ops::{
        delete_completed_jobs, delete_failed_jobs, delete_poison_pills, reset_stalled_jobs,
    },
};
use sqlx::PgPool;
use tracing::{info, warn};

use crate::config::{JanitorConfig, JanitorSettings};

// The janitor reports its own metrics; this struct is mostly for testing purposes
#[derive(Debug, Clone, Eq, PartialEq)]
pub struct CleanupResult {
    pub completed: u64,
    pub failed: u64,
    pub poisoned: u64,
    pub stalled: u64,
}

pub struct Janitor {
    pool: PgPool,
    settings: JanitorSettings,
    metrics_labels: Vec<(&'static str, String)>,
}

impl Janitor {
    pub async fn new(config: JanitorConfig) -> Result<Self, QueueError> {
        let settings = config.settings;
        let pool = config.pool.connect().await?;

        let metrics_labels = vec![("janitor_id", settings.id.clone())];

        Ok(Self {
            pool,
            settings,
            metrics_labels,
        })
    }

    pub fn from_pool(pool: PgPool, settings: JanitorSettings) -> Self {
        let metrics_labels = vec![("janitor_id", settings.id.clone())];
        Self {
            pool,
            settings,
            metrics_labels,
        }
    }

    // TODO - right now, the metrics produced here are pretty rough - just per shard, without
    // any per-queue or per-worker-type breakdown. It'd be nice to add that, eventually.
    pub async fn run_once(&self) -> Result<CleanupResult, QueueError> {
        info!("Running janitor loop");
        let start = Utc::now();
        metrics::counter!("cyclotron_janitor_run_starts", &self.metrics_labels).increment(1);

        let before = Utc::now();
        let completed = delete_completed_jobs(&self.pool).await?;
        let taken = Utc::now() - before;
        metrics::histogram!(
            "cyclotron_janitor_completed_jobs_cleanup_duration_ms",
            &self.metrics_labels
        )
        .record(taken.num_milliseconds() as f64);
        metrics::counter!(
            "cyclotron_janitor_completed_jobs_deleted",
            &self.metrics_labels
        )
        .increment(completed);

        let before = Utc::now();
        let failed = delete_failed_jobs(&self.pool).await?;
        let taken = Utc::now() - before;
        metrics::histogram!(
            "cyclotron_janitor_failed_jobs_cleanup_duration_ms",
            &self.metrics_labels
        )
        .record(taken.num_milliseconds() as f64);
        metrics::counter!(
            "cyclotron_janitor_failed_jobs_deleted",
            &self.metrics_labels
        )
        .increment(failed);

        // Note - if we reset stalled jobs before deleting poison pills, we'll never delete poison
        // pills, since resetting a stalled job clears the locked state.
        let before = Utc::now();
        let poisoned = delete_poison_pills(
            &self.pool,
            self.settings.stall_timeout,
            self.settings.max_touches,
        )
        .await?;
        let taken = Utc::now() - before;
        metrics::histogram!(
            "cyclotron_janitor_poison_pills_cleanup_duration_ms",
            &self.metrics_labels
        )
        .record(taken.num_milliseconds() as f64);
        metrics::counter!(
            "cyclotron_janitor_poison_pills_deleted",
            &self.metrics_labels
        )
        .increment(poisoned);
        if poisoned > 0 {
            warn!("Deleted {} poison pills", poisoned);
        }

        let before = Utc::now();
        let stalled = reset_stalled_jobs(&self.pool, self.settings.stall_timeout).await?;
        let taken = Utc::now() - before;
        metrics::histogram!(
            "cyclotron_janitor_stalled_jobs_reset_duration_ms",
            &self.metrics_labels
        )
        .record(taken.num_milliseconds() as f64);
        metrics::counter!("cyclotron_janitor_stalled_jobs_reset", &self.metrics_labels)
            .increment(stalled);
        if stalled > 0 {
            warn!("Reset {} stalled jobs", stalled);
        }

        metrics::counter!("cyclotron_janitor_run_ends", &self.metrics_labels).increment(1);
        let elapsed = Utc::now() - start;
        metrics::histogram!("cyclotron_janitor_run_duration_ms", &self.metrics_labels)
            .record(elapsed.num_milliseconds() as f64);
        info!("Janitor loop complete");
        Ok(CleanupResult {
            completed,
            failed,
            poisoned,
            stalled,
        })
    }
}
2 rust/cyclotron-janitor/src/lib.rs Normal file
@ -0,0 +1,2 @@
pub mod config;
pub mod janitor;
105 rust/cyclotron-janitor/src/main.rs Normal file
@ -0,0 +1,105 @@
use axum::{extract::State, routing::get, Router};
use common_metrics::setup_metrics_routes;
use cyclotron_janitor::{config::Config, janitor::Janitor};
use envconfig::Envconfig;
use eyre::Result;
use health::{HealthHandle, HealthRegistry};
use std::{future::ready, time::Duration};
use tracing::{error, info};

/// Most of this stuff is stolen pretty shamelessly from the rustyhook janitor. It'll diverge more
/// once we introduce the management command stuff, but for now it's a good starting point.

async fn cleanup_loop(janitor: Janitor, liveness: HealthHandle, interval_secs: u64) -> Result<()> {
    let mut interval = tokio::time::interval(Duration::from_secs(interval_secs));

    loop {
        interval.tick().await;

        if let Err(e) = janitor.run_once().await {
            // don't bother reporting unhealthy - a few times around this loop will put us in a stalled state
            error!("janitor failed cleanup with: {}", e);
        } else {
            liveness.report_healthy().await;
        }
    }
}

async fn listen(app: Router, bind: String) -> Result<()> {
    let listener = tokio::net::TcpListener::bind(bind).await?;

    axum::serve(listener, app).await?;

    Ok(())
}

// For axum's state stuff
#[derive(Clone)]
struct JanitorId(pub String);

pub fn app(liveness: HealthRegistry, janitor_id: String) -> Router {
    Router::new()
        .route("/", get(index))
        .route("/_readiness", get(index))
        .route("/_liveness", get(move || ready(liveness.get_status())))
        .with_state(JanitorId(janitor_id))
}

async fn index(State(janitor_id): State<JanitorId>) -> String {
    format!("cyclotron janitor {}", janitor_id.0)
}

#[tokio::main]
async fn main() {
    let config = Config::init_from_env().expect("failed to load configuration from env");
    tracing_subscriber::fmt::init();

    let liveness = HealthRegistry::new("liveness");

    let janitor_config = config.get_janitor_config();

    let janitor_id = janitor_config.settings.id.clone();
    let bind = format!("{}:{}", config.host, config.port);

    info!(
        "Starting janitor with ID {:?}, listening at {}",
        janitor_id, bind
    );

    let janitor = Janitor::new(janitor_config)
        .await
        .expect("failed to create janitor");

    let janitor_liveness = liveness
        .register(
            "janitor".to_string(),
            Duration::from_secs(config.cleanup_interval_secs * 4),
        )
        .await;

    let janitor_loop = tokio::spawn(cleanup_loop(
        janitor,
        janitor_liveness,
        config.cleanup_interval_secs,
    ));

    let app = setup_metrics_routes(app(liveness, janitor_id));
    let http_server = tokio::spawn(listen(app, bind));

    tokio::select! {
        res = janitor_loop => {
            error!("janitor loop exited");
            if let Err(e) = res {
                error!("janitor failed with: {}", e)
            }
        }
        res = http_server => {
            error!("http server exited");
            if let Err(e) = res {
                error!("server failed with: {}", e)
            }
        }
    }

    info!("exiting");
}
226 rust/cyclotron-janitor/tests/janitor.rs Normal file
@ -0,0 +1,226 @@
use chrono::{Duration, Utc};
use cyclotron_core::{
    base_ops::{JobInit, JobState},
    manager::QueueManager,
    worker::Worker,
};
use cyclotron_janitor::{config::JanitorSettings, janitor::Janitor};
use sqlx::PgPool;
use uuid::Uuid;

#[sqlx::test(migrations = "../cyclotron-core/migrations")]
async fn janitor_test(db: PgPool) {
    let worker = Worker::from_pool(db.clone());
    let manager = QueueManager::from_pool(db.clone());

    // Purposefully MUCH smaller than would be used in production, so
    // we can simulate stalled or poison jobs quickly
    let stall_timeout = Duration::milliseconds(10);
    let max_touches = 3;

    let settings = JanitorSettings {
        stall_timeout,
        max_touches,
        id: "test_janitor".to_string(),
    };
    let janitor = Janitor::from_pool(db.clone(), settings);

    let now = Utc::now() - Duration::seconds(10);
    let queue_name = "default".to_string();

    let job_init = JobInit {
        team_id: 1,
        queue_name: queue_name.clone(),
        priority: 0,
        scheduled: now,
        function_id: Some(Uuid::now_v7()),
        vm_state: None,
        parameters: None,
        metadata: None,
    };

    // First test - if we mark a job as completed, the janitor will clean it up
    manager.create_job(job_init.clone()).await.unwrap();
    let job = worker
        .dequeue_jobs(&queue_name, 1)
        .await
        .unwrap()
        .pop()
        .unwrap();

    worker.set_state(job.id, JobState::Completed).unwrap();
    worker.flush_job(job.id).await.unwrap();

    let result = janitor.run_once().await.unwrap();
    assert_eq!(result.completed, 1);
    assert_eq!(result.failed, 0);
    assert_eq!(result.poisoned, 0);
    assert_eq!(result.stalled, 0);

    // Second test - if we mark a job as failed, the janitor will clean it up
    manager.create_job(job_init.clone()).await.unwrap();
    let job = worker
        .dequeue_jobs(&queue_name, 1)
        .await
        .unwrap()
        .pop()
        .unwrap();

    worker.set_state(job.id, JobState::Failed).unwrap();
    worker.flush_job(job.id).await.unwrap();

    let result = janitor.run_once().await.unwrap();
    assert_eq!(result.completed, 0);
    assert_eq!(result.failed, 1);
    assert_eq!(result.poisoned, 0);
    assert_eq!(result.stalled, 0);

    // Third test - if we pick up a job, and then hold it for longer than
    // the stall timeout, the janitor will reset it. After this, the worker
    // cannot flush updates to the job, and must re-dequeue it.

    manager.create_job(job_init.clone()).await.unwrap();
    let job = worker
        .dequeue_jobs(&queue_name, 1)
        .await
        .unwrap()
        .pop()
        .unwrap();

    // First, cleanup won't do anything
    let result = janitor.run_once().await.unwrap();
    assert_eq!(result.completed, 0);
    assert_eq!(result.failed, 0);
    assert_eq!(result.poisoned, 0);
    assert_eq!(result.stalled, 0);

    // Then we stall on the job
    tokio::time::sleep(stall_timeout.to_std().unwrap() * 2).await;

    // Now, cleanup will reset the job
    let result = janitor.run_once().await.unwrap();
    assert_eq!(result.completed, 0);
    assert_eq!(result.failed, 0);
    assert_eq!(result.poisoned, 0);
    assert_eq!(result.stalled, 1);

    // Now, the worker can't flush the job
    worker.set_state(job.id, JobState::Completed).unwrap();
    let result = worker.flush_job(job.id).await;
    assert!(result.is_err());

    // But if we re-dequeue the job, we can flush it
    let job = worker
        .dequeue_jobs(&queue_name, 1)
        .await
        .unwrap()
        .pop()
        .unwrap();
    worker.set_state(job.id, JobState::Completed).unwrap();
    worker.flush_job(job.id).await.unwrap();

    janitor.run_once().await.unwrap(); // Clean up the completed job to reset for the next test

    // Fourth test - if a worker holds a job for longer than the stall
    // time, but calls heartbeat, the job will not be reset

    manager.create_job(job_init.clone()).await.unwrap();
    let job = worker
        .dequeue_jobs(&queue_name, 1)
        .await
        .unwrap()
        .pop()
        .unwrap();

    let start = tokio::time::Instant::now();
    loop {
        worker.heartbeat(job.id).await.unwrap();
        tokio::time::sleep(Duration::milliseconds(1).to_std().unwrap()).await;
        if start.elapsed() > stall_timeout.to_std().unwrap() * 2 {
            break;
        }
    }

    let result = janitor.run_once().await.unwrap();
    assert_eq!(result.completed, 0);
    assert_eq!(result.failed, 0);
    assert_eq!(result.poisoned, 0);
    assert_eq!(result.stalled, 0);

    // The worker can still flush the job
    worker.set_state(job.id, JobState::Completed).unwrap();
    worker.flush_job(job.id).await.unwrap();

    // and now cleanup will work
    let result = janitor.run_once().await.unwrap();
    assert_eq!(result.completed, 1);
    assert_eq!(result.failed, 0);
    assert_eq!(result.poisoned, 0);
    assert_eq!(result.stalled, 0);

    // Fifth test - if a job stalls more than max_touches
    // it will be marked as poisoned and deleted

    manager.create_job(job_init.clone()).await.unwrap();
    let mut job = worker
        .dequeue_jobs(&queue_name, 1)
        .await
        .unwrap()
        .pop()
        .unwrap();

    for _ in 0..max_touches {
        tokio::time::sleep(stall_timeout.to_std().unwrap() * 2).await;
        let result = janitor.run_once().await.unwrap();
        assert_eq!(result.completed, 0);
        assert_eq!(result.failed, 0);
        assert_eq!(result.poisoned, 0);
        assert_eq!(result.stalled, 1);

        // assert we can't update the job (flush and heartbeat fail)
        worker.set_state(job.id, JobState::Completed).unwrap();
        let result = worker.heartbeat(job.id).await;
        assert!(result.is_err());
        let result = worker.flush_job(job.id).await;
        assert!(result.is_err());

        // re-dequeue the job
        job = worker
            .dequeue_jobs(&queue_name, 1)
            .await
            .unwrap()
            .pop()
            .unwrap();
    }
    // At this point, the "janitor touches" on the job is 3 (it's been stalled and reset 3 times), so one more cleanup loop will delete it

    // Now stall one more time, and on cleanup, we should see the job was considered poison and deleted
    tokio::time::sleep(stall_timeout.to_std().unwrap() * 2).await;
    let result: cyclotron_janitor::janitor::CleanupResult = janitor.run_once().await.unwrap();
    assert_eq!(result.completed, 0);
    assert_eq!(result.failed, 0);
    assert_eq!(result.poisoned, 1);
    assert_eq!(result.stalled, 0);

    // The worker can't flush the job
    worker.set_state(job.id, JobState::Completed).unwrap();
    let result = worker.flush_job(job.id).await;
    assert!(result.is_err());

    // Sixth test - the janitor can operate on multiple jobs at once
    manager.create_job(job_init.clone()).await.unwrap();
    manager.create_job(job_init.clone()).await.unwrap();
    let jobs = worker.dequeue_jobs(&queue_name, 2).await.unwrap();

    worker.set_state(jobs[0].id, JobState::Completed).unwrap();
    worker.set_state(jobs[1].id, JobState::Failed).unwrap();

    worker.flush_job(jobs[0].id).await.unwrap();
    worker.flush_job(jobs[1].id).await.unwrap();

    let result = janitor.run_once().await.unwrap();
    assert_eq!(result.completed, 1);
    assert_eq!(result.failed, 1);
    assert_eq!(result.poisoned, 0);
    assert_eq!(result.stalled, 0);
}
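The test above walks the entire lifecycle the janitor enforces. As a compact restatement of the per-pass rules it checks, here is a sketch phrased as a pure decision function; this is an intuition aid under assumed field names, not code from this commit.

// Condensed restatement of the janitor invariants exercised by the test:
// terminal jobs are deleted, stalled jobs are reset until they exhaust
// max_touches, after which they are treated as poison pills and deleted.
struct JobView {
    state: &'static str,       // "available" | "running" | "completed" | "failed"
    stalled: bool,             // lock older than the stall timeout
    janitor_touch_count: i16,  // how many times the janitor has reset this job
}

enum JanitorAction {
    Delete,           // completed/failed jobs, and stalled jobs past max_touches
    ResetToAvailable, // stalled jobs with touches remaining (increments the count)
    LeaveAlone,       // healthy jobs, including stalled-but-heartbeating ones
}

fn decide(job: &JobView, max_touches: i16) -> JanitorAction {
    match (job.state, job.stalled) {
        ("completed", _) | ("failed", _) => JanitorAction::Delete,
        ("running", true) if job.janitor_touch_count >= max_touches => JanitorAction::Delete,
        ("running", true) => JanitorAction::ResetToAvailable,
        _ => JanitorAction::LeaveAlone,
    }
}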
7 rust/cyclotron-node/.gitignore vendored Normal file
@ -0,0 +1,7 @@
target
index.node
**/node_modules
**/.DS_Store
npm-debug.log*
cargo.log
cross.log
dist/
22 rust/cyclotron-node/Cargo.toml Normal file
@ -0,0 +1,22 @@
[package]
name = "cyclotron-node"
version = "0.1.0"
edition = "2021"
exclude = ["index.node"]

[lints]
workspace = true

[lib]
crate-type = ["cdylib"]

[dependencies]
cyclotron-core = { path = "../cyclotron-core" }
neon = { workspace = true }
once_cell = { workspace = true }
tokio = { workspace = true }
serde_json = { workspace = true }
serde = { workspace = true }
uuid = { workspace = true }
chrono = { workspace = true }
144 rust/cyclotron-node/examples/basic.js Normal file
@ -0,0 +1,144 @@
const assert = require('assert')
const cyclotron = require('../.')
const crypto = require('crypto')

// Set of available job states
const JOB_STATES = Object.freeze({
    AVAILABLE: 'available',
    RUNNING: 'running',
    FAILED: 'failed',
    COMPLETED: 'completed',
})

const AVAILABLE_WORKERS = Object.freeze({
    FETCH: 'fetch',
    HOG: 'hog',
})

async function main() {
    let poolConfig = {
        db_url: 'postgresql://posthog:posthog@localhost:5432/cyclotron',
    }

    let managerConfig = {
        shards: [poolConfig],
    }

    // Most processes will only need to do one of these, but we can do both here for demonstration purposes
    await cyclotron.initWorker(JSON.stringify(poolConfig))
    await cyclotron.initManager(JSON.stringify(managerConfig))

    // The maybe* inits won't throw on re-calling, and are also short-circuiting to be almost free,
    // so they're safe to call frequently (although I still wouldn't call them in a loop)
    await cyclotron.maybeInitWorker(JSON.stringify(poolConfig))
    await cyclotron.maybeInitManager(JSON.stringify(managerConfig))

    let five_minutes_ago = new Date(new Date().getTime() - 5 * 60000).toISOString()
    let queue_name = 'default'

    let job_1 = {
        team_id: 1,
        queue_name,
        priority: 0,
        scheduled: five_minutes_ago,
        function_id: crypto.randomUUID(), // Is nullable
        vm_state: null,
        parameters: null,
        metadata: null,
    }

    let job_2 = {
        team_id: 1,
        queue_name,
        priority: 1,
        scheduled: five_minutes_ago,
        function_id: crypto.randomUUID(), // Is nullable
        vm_state: null,
        parameters: null,
        metadata: null,
    }

    await cyclotron.createJob(JSON.stringify(job_1))
    await cyclotron.createJob(JSON.stringify(job_2))

    // Jobs (as well as any other 'complex' data shape) are serialized across the API boundary,
    // because that's (according to the neon maintainers) /actually faster/ than doing a bunch
    // of cross-runtime pointer chasing.
    let jobs = JSON.parse(await cyclotron.dequeueJobs(queue_name, 2))
    assert(jobs.length === 2)
    assert(jobs[0].function_id === job_1.function_id)
    assert(jobs[1].function_id === job_2.function_id)

    job_1 = jobs[0]
    job_2 = jobs[1]

    // All of these throw if the job hasn't been dequeued by the worker created when init_worker was called,
    // or if there's some serde error - generally, interacting with the cyclotron should involve try/catch in
    // some far outer catch. We can iterate on this API to make it more ergonomic with time, but
    // my js/ts is... rusty (co-pilot wrote this joke)
    cyclotron.setState(job_1.id, JOB_STATES.AVAILABLE)
    cyclotron.setState(job_2.id, JOB_STATES.AVAILABLE)

    cyclotron.setQueue(job_1.id, 'non-default')
    cyclotron.setQueue(job_2.id, 'non-default')

    // Priority is lowest-first, so this means we can assert that job_2 will be returned first on subsequent dequeue_jobs
    cyclotron.setPriority(job_1.id, 2)
    cyclotron.setPriority(job_2.id, 1)

    let ten_minutes_ago = new Date(new Date().getTime() - 10 * 60000).toISOString()
    cyclotron.setScheduledAt(job_1.id, ten_minutes_ago)
    cyclotron.setScheduledAt(job_2.id, ten_minutes_ago)

    cyclotron.setVmState(job_1.id, JSON.stringify({ state: 'running' }))
    cyclotron.setVmState(job_2.id, JSON.stringify({ state: 'running' }))

    cyclotron.setParameters(job_1.id, JSON.stringify({ parameters: 'running' }))
    cyclotron.setParameters(job_2.id, JSON.stringify({ parameters: 'running' }))

    cyclotron.setMetadata(job_1.id, JSON.stringify({ metadata: 'running' }))
    cyclotron.setMetadata(job_2.id, JSON.stringify({ metadata: 'running' }))

    // Flush the updates queued up above back to the queue. Subsequent calls to flush
    // will throw if a job isn't re-acquired. Flushes will fail if a job state update
    // isn't included (workers should not purposefully leave jobs in a running state)
    await cyclotron.flushJob(job_1.id)
    await cyclotron.flushJob(job_2.id)

    jobs = JSON.parse(await cyclotron.dequeueJobsWithVmState('non-default', 2))

    assert(jobs[0].id == job_2.id)
    assert(jobs[1].id == job_1.id)

    assert(jobs[0].function_id === job_2.function_id)
    assert(jobs[1].function_id === job_1.function_id)

    assert(jobs[0].team_id === job_2.team_id)
    assert(jobs[1].team_id === job_1.team_id)

    assert(jobs[0].queue_name === 'non-default')
    assert(jobs[1].queue_name === 'non-default')

    assert(jobs[0].priority === 1)
    assert(jobs[1].priority === 2)

    assert(jobs[0].scheduled === ten_minutes_ago)
    assert(jobs[1].scheduled === ten_minutes_ago)

    assert(jobs[0].vm_state === JSON.stringify({ state: 'running' }))
    assert(jobs[1].vm_state === JSON.stringify({ state: 'running' }))
    assert(jobs[0].parameters === JSON.stringify({ parameters: 'running' }))
    assert(jobs[1].parameters === JSON.stringify({ parameters: 'running' }))
    assert(jobs[0].metadata === JSON.stringify({ metadata: 'running' }))
    assert(jobs[1].metadata === JSON.stringify({ metadata: 'running' }))

    // Now we'll mark these jobs as completed
    cyclotron.setState(job_1.id, JOB_STATES.COMPLETED)
    cyclotron.setState(job_2.id, JOB_STATES.COMPLETED)

    // And flush them back to the queue
    await cyclotron.flushJob(job_1.id)
    await cyclotron.flushJob(job_2.id)
}

main()
27 rust/cyclotron-node/package.json Normal file
@ -0,0 +1,27 @@
{
    "name": "@posthog/cyclotron",
    "version": "0.1.0",
    "description": "Node bindings for cyclotron",
    "main": "dist/index.js",
    "types": "dist/index.d.ts",
    "scripts": {
        "test": "cargo test",
        "build": "pnpm run build:cargo --release && pnpm run build:move-lib && pnpm run build:typescript",
        "build:move-lib": "cp ../target/release/libcyclotron_node.dylib index.node || cp ../target/release/libcyclotron_node.so index.node",
        "build:cargo": "cargo build --message-format=json > cargo.log",
        "build:cargo:debug": "pnpm run build:cargo",
        "build:cross": "cross build --message-format=json > cross.log",
        "build:typescript": "tsc",
        "package": "NODE_ENV=development pnpm i --dev && pnpm run build"
    },
    "author": "",
    "license": "MIT",
    "devDependencies": {
        "@types/node": "^22.4.1",
        "typescript": "^4.7.4"
    },
    "files": [
        "dist",
        "index.node"
    ]
}
31 rust/cyclotron-node/pnpm-lock.yaml Normal file
@ -0,0 +1,31 @@
lockfileVersion: '6.0'

settings:
  autoInstallPeers: true
  excludeLinksFromLockfile: false

devDependencies:
  '@types/node':
    specifier: ^22.4.1
    version: 22.4.1
  typescript:
    specifier: ^4.7.4
    version: 4.9.5

packages:

  /@types/node@22.4.1:
    resolution: {integrity: sha512-1tbpb9325+gPnKK0dMm+/LMriX0vKxf6RnB0SZUqfyVkQ4fMgUSySqhxE/y8Jvs4NyF1yHzTfG9KlnkIODxPKg==}
    dependencies:
      undici-types: 6.19.8
    dev: true

  /typescript@4.9.5:
    resolution: {integrity: sha512-1FXk9E2Hm+QzZQ7z+McJiHL4NW1F2EzMu9Nq9i3zAaGqibafqYwCVU6WyWAuyQRRzOlxou8xZSyXLEN8oKj24g==}
    engines: {node: '>=4.2.0'}
    hasBin: true
    dev: true

  /undici-types@6.19.8:
    resolution: {integrity: sha512-ve2KP6f/JnbPBFyobGHuerC9g1FYGn/F8n1LWTwNxCEzd6IfqTwUQcNXgEtmmQ6DlRrC1hrSrBnCZPokRrDHjw==}
    dev: true
257 rust/cyclotron-node/src/index.ts Normal file
@ -0,0 +1,257 @@
// eslint-disable-next-line @typescript-eslint/no-var-requires
const cyclotron = require('../index.node')

export interface PoolConfig {
    dbUrl: string
    maxConnections?: number
    minConnections?: number
    acquireTimeoutSeconds?: number
    maxLifetimeSeconds?: number
    idleTimeoutSeconds?: number
}

// Type as expected by Cyclotron.
interface InternalPoolConfig {
    db_url: string
    max_connections?: number
    min_connections?: number
    acquire_timeout_seconds?: number
    max_lifetime_seconds?: number
    idle_timeout_seconds?: number
}

export interface ManagerConfig {
    shards: PoolConfig[]
}

// Type as expected by Cyclotron.
interface InternalManagerConfig {
    shards: InternalPoolConfig[]
}

export interface JobInit {
    teamId: number
    functionId: string
    queueName: string
    priority?: number
    scheduled?: Date
    vmState?: string
    parameters?: string
    metadata?: string
}

// Type as expected by Cyclotron.
interface InternalJobInit {
    team_id: number
    function_id: string
    queue_name: string
    priority?: number
    scheduled?: Date
    vm_state?: string
    parameters?: string
    metadata?: string
}

export type JobState = 'available' | 'running' | 'completed' | 'failed' | 'paused'

export interface Job {
    id: string
    teamId: number
    functionId: string | null
    created: Date
    lockId: string | null
    lastHeartbeat: Date | null
    janitorTouchCount: number
    transitionCount: number
    lastTransition: Date
    queueName: string
    state: JobState
    priority: number
    scheduled: Date
    vmState: string | null
    metadata: string | null
    parameters: string | null
}

// Type as returned by Cyclotron.
interface InternalJob {
    id: string
    team_id: number
    function_id: string | null
    created: string
    lock_id: string | null
    last_heartbeat: string | null
    janitor_touch_count: number
    transition_count: number
    last_transition: string
    queue_name: string
    state: JobState
    priority: number
    scheduled: string
    vm_state: string | null
    metadata: string | null
    parameters: string | null
}

async function initWorker(poolConfig: PoolConfig): Promise<void> {
    const initWorkerInternal: InternalPoolConfig = {
        db_url: poolConfig.dbUrl,
        max_connections: poolConfig.maxConnections,
        min_connections: poolConfig.minConnections,
        acquire_timeout_seconds: poolConfig.acquireTimeoutSeconds,
        max_lifetime_seconds: poolConfig.maxLifetimeSeconds,
        idle_timeout_seconds: poolConfig.idleTimeoutSeconds,
    }
    return await cyclotron.initWorker(JSON.stringify(initWorkerInternal))
}

async function initManager(managerConfig: ManagerConfig): Promise<void> {
    const managerConfigInternal: InternalManagerConfig = {
        shards: managerConfig.shards.map((shard) => ({
            db_url: shard.dbUrl,
            max_connections: shard.maxConnections,
            min_connections: shard.minConnections,
            acquire_timeout_seconds: shard.acquireTimeoutSeconds,
            max_lifetime_seconds: shard.maxLifetimeSeconds,
            idle_timeout_seconds: shard.idleTimeoutSeconds,
        })),
    }
    return await cyclotron.initManager(JSON.stringify(managerConfigInternal))
}

async function maybeInitWorker(poolConfig: PoolConfig): Promise<void> {
    const initWorkerInternal: InternalPoolConfig = {
        db_url: poolConfig.dbUrl,
        max_connections: poolConfig.maxConnections,
        min_connections: poolConfig.minConnections,
        acquire_timeout_seconds: poolConfig.acquireTimeoutSeconds,
        max_lifetime_seconds: poolConfig.maxLifetimeSeconds,
        idle_timeout_seconds: poolConfig.idleTimeoutSeconds,
    }
    return await cyclotron.maybeInitWorker(JSON.stringify(initWorkerInternal))
}

async function maybeInitManager(managerConfig: ManagerConfig): Promise<void> {
    const managerConfigInternal: InternalManagerConfig = {
        shards: managerConfig.shards.map((shard) => ({
            db_url: shard.dbUrl,
            max_connections: shard.maxConnections,
            min_connections: shard.minConnections,
            acquire_timeout_seconds: shard.acquireTimeoutSeconds,
            max_lifetime_seconds: shard.maxLifetimeSeconds,
            idle_timeout_seconds: shard.idleTimeoutSeconds,
        })),
    }
    return await cyclotron.maybeInitManager(JSON.stringify(managerConfigInternal))
}

export async function createJob(job: JobInit): Promise<void> {
    job.priority ??= 1
    job.scheduled ??= new Date()

    const jobInitInternal: InternalJobInit = {
        team_id: job.teamId,
        function_id: job.functionId,
        queue_name: job.queueName,
        priority: job.priority,
        scheduled: job.scheduled,
        vm_state: job.vmState,
        parameters: job.parameters,
        metadata: job.metadata,
    }
    return await cyclotron.createJob(JSON.stringify(jobInitInternal))
}

function convertInternalJobToJob(jobInternal: InternalJob): Job {
    return {
        id: jobInternal.id,
        teamId: jobInternal.team_id,
        functionId: jobInternal.function_id,
        created: new Date(jobInternal.created),
        lockId: jobInternal.lock_id,
        lastHeartbeat: jobInternal.last_heartbeat ? new Date(jobInternal.last_heartbeat) : null,
        janitorTouchCount: jobInternal.janitor_touch_count,
        transitionCount: jobInternal.transition_count,
        lastTransition: new Date(jobInternal.last_transition),
        queueName: jobInternal.queue_name,
        state: jobInternal.state,
        priority: jobInternal.priority,
        scheduled: new Date(jobInternal.scheduled),
        vmState: jobInternal.vm_state,
        metadata: jobInternal.metadata,
        parameters: jobInternal.parameters,
    }
}

async function dequeueJobs(queueName: string, limit: number): Promise<Job[]> {
    const jobsStr = await cyclotron.dequeueJobs(queueName, limit)
    const jobs: InternalJob[] = JSON.parse(jobsStr)
    return jobs.map(convertInternalJobToJob)
}

async function dequeueJobsWithVmState(queueName: string, limit: number): Promise<Job[]> {
    const jobsStr = await cyclotron.dequeueJobsWithVmState(queueName, limit)
    const jobs: InternalJob[] = JSON.parse(jobsStr)
    return jobs.map(convertInternalJobToJob)
}

async function flushJob(jobId: string): Promise<void> {
    return await cyclotron.flushJob(jobId)
}

function setState(jobId: string, jobState: JobState): Promise<void> {
    return cyclotron.setState(jobId, jobState)
}

function setQueue(jobId: string, queueName: string): Promise<void> {
    return cyclotron.setQueue(jobId, queueName)
}

function setPriority(jobId: string, priority: number): Promise<void> {
    return cyclotron.setPriority(jobId, priority)
}

function setScheduledAt(jobId: string, scheduledAt: Date): Promise<void> {
    return cyclotron.setScheduledAt(jobId, scheduledAt.toISOString())
}

function serializeObject(name: string, obj: Record<string, any> | null): string | null {
    if (obj === null) {
        return null
    } else if (typeof obj === 'object') {
        return JSON.stringify(obj)
    }
    throw new Error(`${name} must be either an object or null`)
}

function setVmState(jobId: string, vmState: Record<string, any> | null): Promise<void> {
    const serialized = serializeObject('vmState', vmState)
    return cyclotron.setVmState(jobId, serialized)
}

function setMetadata(jobId: string, metadata: Record<string, any> | null): Promise<void> {
    const serialized = serializeObject('metadata', metadata)
    return cyclotron.setMetadata(jobId, serialized)
}

function setParameters(jobId: string, parameters: Record<string, any> | null): Promise<void> {
    const serialized = serializeObject('parameters', parameters)
    return cyclotron.setParameters(jobId, serialized)
}

export default {
    initWorker,
    initManager,
    maybeInitWorker,
    maybeInitManager,
    createJob,
    dequeueJobs,
    dequeueJobsWithVmState,
    flushJob,
    setState,
    setQueue,
    setPriority,
    setScheduledAt,
    setVmState,
    setMetadata,
    setParameters,
}
450 rust/cyclotron-node/src/lib.rs Normal file
@ -0,0 +1,450 @@
use chrono::{DateTime, Utc};
use cyclotron_core::{
    base_ops::{JobInit, JobState},
    manager::{ManagerConfig, QueueManager},
    worker::Worker,
    PoolConfig,
};

use neon::{
    handle::Handle,
    prelude::{Context, FunctionContext, ModuleContext},
    result::{JsResult, NeonResult},
    types::{JsNull, JsNumber, JsPromise, JsString, JsValue},
};
use once_cell::sync::OnceCell;
use serde::de::DeserializeOwned;
use serde_json::Value;
use tokio::runtime::Runtime;
use uuid::Uuid;

static WORKER: OnceCell<Worker> = OnceCell::new();
static MANAGER: OnceCell<QueueManager> = OnceCell::new();
static RUNTIME: OnceCell<Runtime> = OnceCell::new();

fn runtime<'a, C: Context<'a>>(cx: &mut C) -> NeonResult<&'static Runtime> {
    RUNTIME
        .get_or_try_init(Runtime::new)
        .or_else(|e| cx.throw_error(format!("failed to create tokio runtime: {}", e)))
}

// The general interface for calling our functions takes a JSON serialized string,
// because neon has no nice serde support for function arguments (and generally,
// ripping objects from the v8 runtime piece by piece is slower than just passing
// a single chunk of bytes). These are convenience functions for converting between
// JSON strings and Rust values.
pub fn from_json_string<'a, T, C>(cx: &mut C, object: Handle<JsString>) -> NeonResult<T>
where
    T: DeserializeOwned,
    C: Context<'a>,
{
    let value: T =
        serde_json::from_str(&object.value(cx)).or_else(|e| cx.throw_error(format!("{}", e)))?;
    Ok(value)
}

pub fn to_json_string<'a, T, C>(cx: &mut C, value: T) -> NeonResult<String>
where
    T: serde::Serialize,
    C: Context<'a>,
{
    let value = serde_json::to_string(&value)
        .or_else(|e| cx.throw_error(format!("failed to serialize value: {}", e)))?;
    Ok(value)
}

fn hello(mut cx: FunctionContext) -> JsResult<JsString> {
    let arg1 = cx.argument::<JsString>(0)?;
    let value: Value = from_json_string(&mut cx, arg1)?;
    let string = to_json_string(&mut cx, value)?;
    Ok(cx.string(string))
}

fn init_worker_impl(mut cx: FunctionContext, throw_on_reinit: bool) -> JsResult<JsPromise> {
    let arg1 = cx.argument::<JsString>(0)?;
    let config: PoolConfig = from_json_string(&mut cx, arg1)?;

    let (deferred, promise) = cx.promise();
    let channel = cx.channel();
    let runtime = runtime(&mut cx)?;

    let fut = async move {
        let worker = Worker::new(config).await;
        deferred.settle_with(&channel, move |mut cx| {
            if WORKER.get().is_some() && !throw_on_reinit {
                return Ok(cx.null()); // Short circuit to make using maybe_init a no-op
            }
            let worker = worker.or_else(|e| cx.throw_error(format!("{}", e)))?;
            let already_set = WORKER.set(worker).is_err();
            if already_set && throw_on_reinit {
                cx.throw_error("worker already initialized")
            } else {
                Ok(cx.null())
            }
        });
    };

    runtime.spawn(fut);

    Ok(promise)
}

fn init_manager_impl(mut cx: FunctionContext, throw_on_reinit: bool) -> JsResult<JsPromise> {
    let arg1 = cx.argument::<JsString>(0)?;
    let config: ManagerConfig = from_json_string(&mut cx, arg1)?;

    let (deferred, promise) = cx.promise();
    let channel = cx.channel();
    let runtime = runtime(&mut cx)?;

    let fut = async move {
        let manager = QueueManager::new(config).await;
        deferred.settle_with(&channel, move |mut cx| {
            if MANAGER.get().is_some() && !throw_on_reinit {
                return Ok(cx.null()); // Short circuit to make using maybe_init a no-op
            }
            let manager = manager.or_else(|e| cx.throw_error(format!("{}", e)))?;
            let already_set = MANAGER.set(manager).is_err();
            if already_set && throw_on_reinit {
                cx.throw_error("manager already initialized")
            } else {
                Ok(cx.null())
            }
        });
    };

    runtime.spawn(fut);

    Ok(promise)
}

fn init_worker(cx: FunctionContext) -> JsResult<JsPromise> {
    init_worker_impl(cx, true)
}

fn init_manager(cx: FunctionContext) -> JsResult<JsPromise> {
    init_manager_impl(cx, true)
}

fn maybe_init_worker(cx: FunctionContext) -> JsResult<JsPromise> {
    init_worker_impl(cx, false)
}

fn maybe_init_manager(cx: FunctionContext) -> JsResult<JsPromise> {
    init_manager_impl(cx, false)
}

// throw_error has a type signature that makes it inconvenient to use in closures, because
// it requires that you specify the V of the NeonResult<V> returned, even though it's always
// an error. This is a sane thing for it to do, but it's inconvenient for us, because we
// frequently settle promises early, before we have a V to use for type inference. This little
// wrapper makes that easier, by specifying the V as JsNull
fn throw_null_err<'c, C>(cx: &mut C, msg: &str) -> NeonResult<Handle<'c, JsNull>>
where
    C: Context<'c>,
{
    cx.throw_error(msg)
}

fn create_job(mut cx: FunctionContext) -> JsResult<JsPromise> {
    let arg1: Handle<JsString> = cx.argument::<JsString>(0)?;
    let job: JobInit = from_json_string(&mut cx, arg1)?;

    let (deferred, promise) = cx.promise();
    let channel = cx.channel();
    let runtime = runtime(&mut cx)?;

    let fut = async move {
        let manager = match MANAGER.get() {
            Some(manager) => manager,
            None => {
                deferred.settle_with(&channel, |mut cx| {
                    throw_null_err(&mut cx, "manager not initialized")
                });
                return;
            }
        };
        let job = manager.create_job(job).await;
        deferred.settle_with(&channel, move |mut cx| {
            job.or_else(|e| cx.throw_error(format!("{}", e)))?;
            Ok(cx.null())
        });
    };

    runtime.spawn(fut);

    Ok(promise)
}

fn dequeue_jobs(mut cx: FunctionContext) -> JsResult<JsPromise> {
    let queue_name = cx.argument::<JsString>(0)?.value(&mut cx);

    let limit = cx.argument::<JsNumber>(1)?.value(&mut cx) as usize; // TODO - I don't love this cast

    let (deferred, promise) = cx.promise();
    let channel = cx.channel();
    let runtime = runtime(&mut cx)?;

    let fut = async move {
        let worker = match WORKER.get() {
            Some(worker) => worker,
            None => {
                deferred.settle_with(&channel, |mut cx| {
                    throw_null_err(&mut cx, "worker not initialized")
                });
                return;
            }
        };
        let jobs = worker.dequeue_jobs(&queue_name, limit).await;
        deferred.settle_with(&channel, move |mut cx| {
            let jobs = jobs.or_else(|e| cx.throw_error(format!("{}", e)))?;
            let jobs = to_json_string(&mut cx, jobs)?;
            Ok(cx.string(jobs))
        });
    };

    runtime.spawn(fut);

    Ok(promise)
}

fn dequeue_with_vm_state(mut cx: FunctionContext) -> JsResult<JsPromise> {
    let queue_name = cx.argument::<JsString>(0)?.value(&mut cx);

    let limit = cx.argument::<JsNumber>(1)?.value(&mut cx) as usize; // TODO - I don't love this cast

    let (deferred, promise) = cx.promise();
    let channel = cx.channel();
    let runtime = runtime(&mut cx)?;

    let fut = async move {
        let worker = match WORKER.get() {
            Some(worker) => worker,
            None => {
                deferred.settle_with(&channel, |mut cx| {
                    throw_null_err(&mut cx, "worker not initialized")
                });
                return;
            }
        };
        let jobs = worker.dequeue_with_vm_state(&queue_name, limit).await;
        deferred.settle_with(&channel, move |mut cx| {
            let jobs = jobs.or_else(|e| cx.throw_error(format!("{}", e)))?;
            let jobs = to_json_string(&mut cx, jobs)?;
            Ok(cx.string(jobs))
        });
    };

    runtime.spawn(fut);

    Ok(promise)
}

fn flush_job(mut cx: FunctionContext) -> JsResult<JsPromise> {
    let arg1 = cx.argument::<JsString>(0)?.value(&mut cx);
    let job_id: Uuid = arg1
        .parse()
        .or_else(|_| cx.throw_error(format!("invalid job id: {}", arg1)))?;

    let (deferred, promise) = cx.promise();
    let channel = cx.channel();
    let runtime = runtime(&mut cx)?;

    let fut = async move {
        let worker = match WORKER.get() {
            Some(worker) => worker,
            None => {
                deferred.settle_with(&channel, |mut cx| {
                    throw_null_err(&mut cx, "worker not initialized")
                });
                return;
            }
        };
        let res = worker.flush_job(job_id).await;
        deferred.settle_with(&channel, move |mut cx| {
            res.or_else(|e: cyclotron_core::error::QueueError| cx.throw_error(format!("{}", e)))?;
            Ok(cx.null())
        });
    };

    runtime.spawn(fut);

    Ok(promise)
}

fn set_state(mut cx: FunctionContext) -> JsResult<JsNull> {
    let arg = cx.argument::<JsString>(0)?.value(&mut cx);
    let job_id: Uuid = arg
        .parse()
        .or_else(|_| cx.throw_error(format!("invalid job id: {}", arg)))?;

    let arg = cx.argument::<JsString>(1)?.value(&mut cx);
    let state: JobState = arg
        .parse()
        .or_else(|_| cx.throw_error(format!("invalid job state: {}", arg)))?;

    WORKER
        .get()
        .map_or_else(|| cx.throw_error("worker not initialized"), Ok)?
        .set_state(job_id, state)
        .or_else(|e| cx.throw_error(format!("{}", e)))?;

    Ok(cx.null())
}

fn set_queue(mut cx: FunctionContext) -> JsResult<JsNull> {
    let arg = cx.argument::<JsString>(0)?.value(&mut cx);
    let job_id: Uuid = arg
        .parse()
        .or_else(|_| cx.throw_error(format!("invalid job id: {}", arg)))?;

    let queue = cx.argument::<JsString>(1)?.value(&mut cx);

    WORKER
        .get()
        .map_or_else(|| cx.throw_error("worker not initialized"), Ok)?
        .set_queue(job_id, &queue)
        .or_else(|e| cx.throw_error(format!("{}", e)))?;

    Ok(cx.null())
}

fn set_priority(mut cx: FunctionContext) -> JsResult<JsNull> {
    let arg = cx.argument::<JsString>(0)?.value(&mut cx);
    let job_id: Uuid = arg
        .parse()
        .or_else(|_| cx.throw_error(format!("invalid job id: {}", arg)))?;

    let arg = cx.argument::<JsNumber>(1)?.value(&mut cx);
    let priority = arg as i16; // TODO - I /really/ don't love this cast

    WORKER
        .get()
        .map_or_else(|| cx.throw_error("worker not initialized"), Ok)?
        .set_priority(job_id, priority)
        .or_else(|e| cx.throw_error(format!("{}", e)))?;

    Ok(cx.null())
}

fn set_scheduled_at(mut cx: FunctionContext) -> JsResult<JsNull> {
    let arg = cx.argument::<JsString>(0)?.value(&mut cx);
    let job_id: Uuid = arg
        .parse()
        .or_else(|_| cx.throw_error(format!("invalid job id: {}", arg)))?;

    let arg = cx.argument::<JsString>(1)?.value(&mut cx);
    let scheduled: DateTime<Utc> = arg
        .parse()
        .or_else(|_| cx.throw_error(format!("invalid scheduled at: {}", arg)))?;

    WORKER
        .get()
        .map_or_else(|| cx.throw_error("worker not initialized"), Ok)?
        .set_scheduled_at(job_id, scheduled)
        .or_else(|e| cx.throw_error(format!("{}", e)))?;

    Ok(cx.null())
}

fn set_vm_state(mut cx: FunctionContext) -> JsResult<JsNull> {
    let arg = cx.argument::<JsString>(0)?.value(&mut cx);
    let job_id: Uuid = arg
        .parse()
        .or_else(|_| cx.throw_error(format!("invalid job id: {}", arg)))?;

    // Tricky - we have to support passing nulls here, because that's how you clear vm state.
    let vm_state = cx.argument::<JsValue>(1)?;
    let vm_state = if vm_state.is_a::<JsNull, _>(&mut cx) {
        None
    } else {
        Some(
            vm_state
                .downcast_or_throw::<JsString, _>(&mut cx)?
                .value(&mut cx),
        )
    };

    WORKER
        .get()
        .map_or_else(|| cx.throw_error("worker not initialized"), Ok)?
        .set_vm_state(job_id, vm_state)
        .or_else(|e| cx.throw_error(format!("{}", e)))?;

    Ok(cx.null())
}

fn set_metadata(mut cx: FunctionContext) -> JsResult<JsNull> {
    let arg = cx.argument::<JsString>(0)?.value(&mut cx);
    let job_id: Uuid = arg
        .parse()
        .or_else(|_| cx.throw_error(format!("invalid job id: {}", arg)))?;

    // Tricky - we have to support passing nulls here, because that's how you clear metadata.
    let metadata = cx.argument::<JsValue>(1)?;
    let metadata = if metadata.is_a::<JsNull, _>(&mut cx) {
        None
    } else {
        Some(
            metadata
                .downcast_or_throw::<JsString, _>(&mut cx)?
                .value(&mut cx),
        )
    };

    WORKER
        .get()
        .map_or_else(|| cx.throw_error("worker not initialized"), Ok)?
        .set_metadata(job_id, metadata)
        .or_else(|e| cx.throw_error(format!("{}", e)))?;

    Ok(cx.null())
}

fn set_parameters(mut cx: FunctionContext) -> JsResult<JsNull> {
    let arg = cx.argument::<JsString>(0)?.value(&mut cx);
    let job_id: Uuid = arg
        .parse()
        .or_else(|_| cx.throw_error(format!("invalid job id: {}", arg)))?;

    // Tricky - we have to support passing nulls here, because that's how you clear parameters.
    let parameters = cx.argument::<JsValue>(1)?;
    let parameters = if parameters.is_a::<JsNull, _>(&mut cx) {
        None
    } else {
        Some(
            parameters
                .downcast_or_throw::<JsString, _>(&mut cx)?
                .value(&mut cx),
        )
    };

    WORKER
        .get()
        .map_or_else(|| cx.throw_error("worker not initialized"), Ok)?
        .set_parameters(job_id, parameters)
        .or_else(|e| cx.throw_error(format!("{}", e)))?;

    Ok(cx.null())
}

#[neon::main]
fn main(mut cx: ModuleContext) -> NeonResult<()> {
    cx.export_function("hello", hello)?;
    cx.export_function("initWorker", init_worker)?;
    cx.export_function("initManager", init_manager)?;
    cx.export_function("maybeInitWorker", maybe_init_worker)?;
    cx.export_function("maybeInitManager", maybe_init_manager)?;
    cx.export_function("createJob", create_job)?;
    cx.export_function("dequeueJobs", dequeue_jobs)?;
    cx.export_function("dequeueJobsWithVmState", dequeue_with_vm_state)?;
    cx.export_function("flushJob", flush_job)?;
    cx.export_function("setState", set_state)?;
    cx.export_function("setQueue", set_queue)?;
    cx.export_function("setPriority", set_priority)?;
    cx.export_function("setScheduledAt", set_scheduled_at)?;
    cx.export_function("setVmState", set_vm_state)?;
    cx.export_function("setMetadata", set_metadata)?;
    cx.export_function("setParameters", set_parameters)?;

    Ok(())
}
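Every async export above follows one skeleton: parse the JS arguments on the main thread, hand the work to the shared Tokio runtime, and settle the deferred promise from the async side. Below is a condensed sketch of adding one more binding in that style: a hypothetical `heartbeat` export (not part of this commit) wrapping the worker method the janitor tests rely on, reusing the statics and helpers defined above.

// Hypothetical extra binding, following the file's pattern: JS args are parsed
// up front, the worker call runs on the shared runtime, and the promise is
// settled from the Tokio side.
fn heartbeat(mut cx: FunctionContext) -> JsResult<JsPromise> {
    let arg = cx.argument::<JsString>(0)?.value(&mut cx);
    let job_id: Uuid = arg
        .parse()
        .or_else(|_| cx.throw_error(format!("invalid job id: {}", arg)))?;

    let (deferred, promise) = cx.promise();
    let channel = cx.channel();
    let runtime = runtime(&mut cx)?;

    runtime.spawn(async move {
        let res = match WORKER.get() {
            Some(worker) => worker.heartbeat(job_id).await,
            None => {
                deferred.settle_with(&channel, |mut cx| {
                    throw_null_err(&mut cx, "worker not initialized")
                });
                return;
            }
        };
        deferred.settle_with(&channel, move |mut cx| {
            res.or_else(|e| cx.throw_error(format!("{}", e)))?;
            Ok(cx.null())
        });
    });

    Ok(promise)
}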
24 rust/cyclotron-node/tsconfig.json Normal file
@ -0,0 +1,24 @@
{
    "compilerOptions": {
        "module": "CommonJS",
        "target": "ESNext",
        "declaration": true,
        "removeComments": true,
        "emitDecoratorMetadata": true,
        "experimentalDecorators": true,
        "moduleResolution": "node",
        "esModuleInterop": true,
        "allowJs": true,
        "sourceMap": true,
        "baseUrl": "src/",
        "rootDir": "src/",
        "outDir": "dist/",
        "types": ["node"],
        "resolveJsonModule": true,
        "strict": true,
        "noImplicitAny": true,
        "useUnknownInCatchVariables": false
    },
    "include": ["src"],
    "exclude": ["node_modules", "dist", "bin"]
}
@ -22,3 +22,4 @@ tower = { workspace = true }
tracing = { workspace = true }
tracing-subscriber = { workspace = true }
url = { workspace = true }
common-metrics = { path = "../common/metrics" }

@ -3,7 +3,7 @@ use config::Config;
use envconfig::Envconfig;
use eyre::Result;

use hook_common::metrics::setup_metrics_routes;
use common_metrics::setup_metrics_routes;
use hook_common::pgqueue::PgQueue;

mod config;

@ -8,13 +8,10 @@ workspace = true

[dependencies]
async-trait = { workspace = true }
axum = { workspace = true, features = ["http2"] }
chrono = { workspace = true }
envconfig = { workspace = true }
health = { path = "../common/health" }
http = { workspace = true }
metrics = { workspace = true }
metrics-exporter-prometheus = { workspace = true }
rdkafka = { workspace = true }
reqwest = { workspace = true }
serde = { workspace = true }

@ -1,7 +1,6 @@
pub mod config;
pub mod kafka_messages;
pub mod kafka_producer;
pub mod metrics;
pub mod pgqueue;
pub mod retry;
pub mod test;

@ -24,3 +24,4 @@ time = { workspace = true }
tokio = { workspace = true }
tracing = { workspace = true }
tracing-subscriber = { workspace = true }
common-metrics = { path = "../common/metrics" }

@ -9,8 +9,8 @@ use std::{str::FromStr, time::Duration};
use tokio::sync::Semaphore;
use webhooks::WebhookCleaner;

use common_metrics::setup_metrics_routes;
use hook_common::kafka_producer::create_kafka_producer;
use hook_common::metrics::setup_metrics_routes;

mod cleanup;
mod config;
Some files were not shown because too many files have changed in this diff