From 9643cb197b54427206f280aa973a54db8d50093f Mon Sep 17 00:00:00 2001 From: Gabriel Brown Date: Wed, 24 Jun 2026 05:38:35 -0400 Subject: [PATCH] Fix agent empty-response in prod: workdir mount, image freshness, error surfacing - Pin codex@0.142.0 + opencode-ai@1.17.9 in the job image (was @latest, causing dev/prod drift) - Worker now pulls the job image once per process so prod stops running a stale Codex - Surface Codex error/turn.failed events instead of swallowing them, so the real failure reason is reported rather than 'no assistant response' - Harden the Codex JSON parser to also handle the legacy msg-wrapped shape - Fix the docker-in-docker workdir: bind-mount identical host:container path and set SPOON_AGENT_HOST_WORKDIR (named volume can't be mounted by sibling job containers) - Add docs/compose.prod.yml as a documented reference deployment --- apps/agent-worker/src/agent-events.ts | 60 +++++++ apps/agent-worker/src/runtime/docker.ts | 23 +++ apps/agent-worker/src/worker.ts | 29 +++- .../tests/unit/agent-events.test.ts | 26 +++ docker/agent-job.Dockerfile | 2 +- docker/compose.local.yml | 6 +- docker/compose.yml | 15 +- docs/compose.prod.yml | 162 ++++++++++++++++++ 8 files changed, 315 insertions(+), 8 deletions(-) create mode 100644 docs/compose.prod.yml diff --git a/apps/agent-worker/src/agent-events.ts b/apps/agent-worker/src/agent-events.ts index 9505b4d..7322265 100644 --- a/apps/agent-worker/src/agent-events.ts +++ b/apps/agent-worker/src/agent-events.ts @@ -140,6 +140,57 @@ const isCodexConfigWarning = (message: string) => message.includes('`[features].codex_hooks` is deprecated') || message.includes('Use `[features].hooks` instead'); +// Handles the legacy `codex-rs` `{ id, msg: { type, ... } }` envelope. 
+const normalizeCodexMsgEvent = ( + msg: Record, + envelope: Record, +): NormalizedAgentEvent[] => { + const msgType = stringify(msg.type).toLowerCase(); + const events: NormalizedAgentEvent[] = []; + if (msgType === 'session_configured' || msgType.includes('session')) { + const sessionId = stringify( + msg.session_id ?? envelope.session_id ?? envelope.id, + ); + if (sessionId) events.push({ kind: 'session', sessionId }); + } + if (msgType === 'agent_message_delta' || msgType === 'agent_reasoning_delta') { + const delta = stringify(msg.delta ?? msg.text); + if (delta) events.push({ kind: 'assistant_delta', content: delta }); + } + if (msgType === 'agent_message') { + const text = stringify(msg.message ?? msg.text); + if (text) { + events.push({ kind: 'assistant_delta', content: `${text.trim()}\n\n` }); + } + } + if (msgType === 'exec_command_begin') { + events.push({ + kind: 'tool_started', + name: 'Command', + input: commandString(msg.command), + }); + } + if (msgType === 'exec_command_end') { + events.push({ + kind: 'tool_completed', + name: 'Command', + output: toolOutputFromRecord(msg), + }); + } + if (msgType === 'error' || msgType === 'turn_failed' || msgType === 'task_error') { + const message = stringify(msg.message ?? msg.error ?? msg); + if (isCodexConfigWarning(message)) { + events.push({ kind: 'status', status: message }); + } else { + events.push({ kind: 'error', message }); + } + } + if (msgType === 'task_complete' || msgType === 'turn_complete') { + events.push({ kind: 'assistant_completed' }); + } + return events; +}; + export const normalizeCodexJsonLine = ( line: string, ): NormalizedAgentEvent[] => { @@ -152,6 +203,15 @@ export const normalizeCodexJsonLine = ( } const event = asRecord(parsed); if (!event) return []; + // Older Codex (`codex-rs`) protocol wraps events as `{ id, msg: { type, ... } }` + // instead of the newer `{ type, item: { ... } }` shape. 
Unwrap it so version + // skew between the pinned image and an upstream build degrades gracefully + // instead of silently producing an empty assistant response. + const msg = asRecord(event.msg); + if (msg) { + const msgEvents = normalizeCodexMsgEvent(msg, event); + if (msgEvents.length > 0) return msgEvents; + } const type = stringify(event.type ?? event.event); const id = event.id ?? diff --git a/apps/agent-worker/src/runtime/docker.ts b/apps/agent-worker/src/runtime/docker.ts index b4f9b14..a90fc31 100644 --- a/apps/agent-worker/src/runtime/docker.ts +++ b/apps/agent-worker/src/runtime/docker.ts @@ -18,6 +18,26 @@ const networkArgs = () => (env.network ? ['--network', env.network] : []); const containerRuntime = () => env.containerRuntime; +// `docker run` reuses a stale local `:latest` forever, so without an explicit +// pull the job image never updates in production. Pull once per worker process +// (i.e. once per deploy/restart) so a fresh worker always runs a fresh job +// image. Best-effort: if the registry is unreachable we fall back to whatever +// image is present locally rather than failing the job. +let jobImagePullPromise: Promise | undefined; +export const ensureJobImagePulled = () => { + jobImagePullPromise ??= (async () => { + try { + await execa(containerRuntime(), ['pull', env.jobImage], { + reject: false, + stdin: 'ignore', + }); + } catch { + // Ignore: keep running with the locally cached image. 
+ } + })(); + return jobImagePullPromise; +}; + const hostWorkspacePath = (workdir: string) => { if (!env.hostWorkdir) return workdir; const workerRoot = path.resolve(env.workdir); @@ -46,6 +66,7 @@ export const runInJobContainer = async (args: { redact: (value: string) => string; timeoutMs: number; }): Promise => { + await ensureJobImagePulled(); const result = await execa( containerRuntime(), [ @@ -84,6 +105,7 @@ export const startWorkspaceContainer = async (args: { command?: string[]; publishTcpPort?: number; }) => { + await ensureJobImagePulled(); await execa( containerRuntime(), [ @@ -180,6 +202,7 @@ export const streamInJobContainer = async (args: { onStdoutLine?: (line: string) => Promise; onStderrLine?: (line: string) => Promise; }): Promise => { + await ensureJobImagePulled(); const subprocess = execa( containerRuntime(), [ diff --git a/apps/agent-worker/src/worker.ts b/apps/agent-worker/src/worker.ts index b0d712c..1c72699 100644 --- a/apps/agent-worker/src/worker.ts +++ b/apps/agent-worker/src/worker.ts @@ -111,6 +111,10 @@ type ActiveWorkspace = { agentTurnActive?: boolean; resolveTurn?: () => void; lastRecordedDiffSignature?: string; + // Captures the most recent Codex `error`/`turn.failed` event for the active + // turn so the failure surfaces the real reason instead of a generic + // "no assistant response" message. + codexTurnError?: string; }; type FileTreeNode = { @@ -599,6 +603,11 @@ const handleAgentEvent = async (args: { ); return; } + // event.kind === 'error' + // Record the real Codex failure reason on the workspace so the turn can + // surface it (Codex can emit `error`/`turn.failed` events and still exit 0 + // in some versions, which otherwise looks like an empty response). 
+ workspace.codexTurnError = event.message; await appendEvent(jobId, 'error', 'plan', truncate(event.message, 20_000)); }; @@ -683,6 +692,12 @@ const workspaceCurrentContent = new Map< } >(); +// Reading through a function boundary prevents TypeScript from narrowing the +// field to `undefined` after the synchronous reset in `runCodexTurn`; it is set +// asynchronously by the stream event handler. +const readCodexTurnError = (workspace: ActiveWorkspace) => + workspace.codexTurnError; + const runCodexTurn = async (args: { workspace: ActiveWorkspace; prompt: string; @@ -691,6 +706,7 @@ const runCodexTurn = async (args: { }) => { const { workspace, prompt, assistantMessageId, assistantContent } = args; workspace.runtimeMode = 'codex_exec'; + workspace.codexTurnError = undefined; await setRuntimeSession({ jobId: workspace.claim.job._id, agentRuntimeMode: 'codex_exec', @@ -813,6 +829,15 @@ const runCodexTurn = async (args: { ); } } + // Codex can report a failure via a JSON `error`/`turn.failed` event while + // still exiting 0. If the turn produced no assistant text but did report an + // error, surface that real reason rather than a generic empty response. + // Read through a helper so it is not narrowed away by the reset above (the + // field is mutated asynchronously inside the stream handler). + const codexTurnError = readCodexTurnError(workspace); + if (!assistantContent.value.trim() && codexTurnError) { + throw new Error(`codex failed:\n${codexTurnError}`); + } }; const runOpenCodeTurn = async (args: { @@ -1593,7 +1618,9 @@ export const sendWorkspaceMessage = async ( `Codex completed without producing an assistant response for job ${claim.job._id}.`, ); throw new Error( - 'Codex completed without producing an assistant response.', + workspace.codexTurnError + ? 
`Codex failed: ${workspace.codexTurnError}` + : 'Codex completed without producing an assistant response.', ); } await updateMessage({ diff --git a/apps/agent-worker/tests/unit/agent-events.test.ts b/apps/agent-worker/tests/unit/agent-events.test.ts index f12c880..d0b2fdf 100644 --- a/apps/agent-worker/tests/unit/agent-events.test.ts +++ b/apps/agent-worker/tests/unit/agent-events.test.ts @@ -26,6 +26,32 @@ describe('agent event normalization', () => { ).toContainEqual({ kind: 'assistant_delta', content: 'hello' }); }); + test('normalizes legacy codex-rs msg-wrapped events', () => { + expect( + normalizeCodexJsonLine( + JSON.stringify({ + id: '0', + msg: { type: 'agent_message', message: 'hello there' }, + }), + ), + ).toContainEqual({ kind: 'assistant_delta', content: 'hello there\n\n' }); + + expect( + normalizeCodexJsonLine( + JSON.stringify({ + id: '1', + msg: { type: 'error', message: 'usage limit reached' }, + }), + ), + ).toContainEqual({ kind: 'error', message: 'usage limit reached' }); + + expect( + normalizeCodexJsonLine( + JSON.stringify({ id: '2', msg: { type: 'task_complete' } }), + ), + ).toContainEqual({ kind: 'assistant_completed' }); + }); + test('normalizes Codex CLI thread lifecycle events', () => { expect( normalizeCodexJsonLine( diff --git a/docker/agent-job.Dockerfile b/docker/agent-job.Dockerfile index b25185a..a5f2deb 100644 --- a/docker/agent-job.Dockerfile +++ b/docker/agent-job.Dockerfile @@ -17,7 +17,7 @@ RUN apt-get update \ && corepack enable \ && corepack prepare pnpm@latest --activate \ && corepack prepare yarn@stable --activate \ - && npm install -g bun@1.3.10 opencode-ai@latest @openai/codex@latest \ + && npm install -g bun@1.3.10 opencode-ai@1.17.9 @openai/codex@0.142.0 \ && rm -rf /var/lib/apt/lists/* WORKDIR /workspace diff --git a/docker/compose.local.yml b/docker/compose.local.yml index 5b10d43..44e6e7a 100644 --- a/docker/compose.local.yml +++ b/docker/compose.local.yml @@ -77,11 +77,14 @@ services: - 
SPOON_AGENT_MAX_CONCURRENT_JOBS=${SPOON_AGENT_MAX_CONCURRENT_JOBS:-1} - SPOON_AGENT_JOB_TIMEOUT_MS=${SPOON_AGENT_JOB_TIMEOUT_MS:-1800000} - SPOON_AGENT_WORKDIR=${SPOON_AGENT_WORKDIR:-/var/lib/spoon-agent/work} + # See compose.yml: the host-side path must match SPOON_AGENT_WORKDIR so the + # sibling job containers' bind mounts resolve on the host Docker daemon. + - SPOON_AGENT_HOST_WORKDIR=${SPOON_AGENT_HOST_WORKDIR:-/var/lib/spoon-agent/work} - GITHUB_APP_ID=${GITHUB_APP_ID} - GITHUB_APP_PRIVATE_KEY=${GITHUB_APP_PRIVATE_KEY} volumes: - /var/run/docker.sock:/var/run/docker.sock - - agent-work:/var/lib/spoon-agent/work + - ${SPOON_AGENT_HOST_WORKDIR:-/var/lib/spoon-agent/work}:/var/lib/spoon-agent/work depends_on: convex-backend: condition: service_healthy @@ -90,4 +93,3 @@ services: volumes: postgres-data: convex-data: - agent-work: diff --git a/docker/compose.yml b/docker/compose.yml index 58c9dce..b781c18 100644 --- a/docker/compose.yml +++ b/docker/compose.yml @@ -20,6 +20,7 @@ services: image: spoon-next:latest #image: git.gbrown.org/gib/spoon-next:latest container_name: ${NEXT_CONTAINER_NAME} + labels: ['com.centurylinklabs.watchtower.enable=true'] environment: - NODE_ENV=${NODE_ENV} - SENTRY_AUTH_TOKEN=${SENTRY_AUTH_TOKEN} @@ -95,6 +96,7 @@ services: image: spoon-agent-worker:latest container_name: ${AGENT_WORKER_CONTAINER_NAME:-spoon-agent-worker} hostname: ${AGENT_WORKER_CONTAINER_NAME:-spoon-agent-worker} + labels: ['com.centurylinklabs.watchtower.enable=true'] networks: ['${NETWORK:-nginx-bridge}'] environment: - NEXT_PUBLIC_CONVEX_URL=${CONVEX_SELF_HOSTED_URL:-http://${BACKEND_CONTAINER_NAME:-spoon-backend}:${BACKEND_PORT:-3210}} @@ -108,15 +110,20 @@ services: - SPOON_AGENT_MAX_CONCURRENT_JOBS=${SPOON_AGENT_MAX_CONCURRENT_JOBS:-1} - SPOON_AGENT_JOB_TIMEOUT_MS=${SPOON_AGENT_JOB_TIMEOUT_MS:-1800000} - SPOON_AGENT_WORKDIR=${SPOON_AGENT_WORKDIR:-/var/lib/spoon-agent/work} + # Required when the worker controls the host Docker socket: bind-mount + # source 
paths are resolved on the host, not inside this container, so the + # worker must know the host-side path backing SPOON_AGENT_WORKDIR. We bind + # the same host path at the same location below so they are identical. + - SPOON_AGENT_HOST_WORKDIR=${SPOON_AGENT_HOST_WORKDIR:-/var/lib/spoon-agent/work} - GITHUB_APP_ID=${GITHUB_APP_ID} - GITHUB_APP_PRIVATE_KEY=${GITHUB_APP_PRIVATE_KEY} volumes: - /var/run/docker.sock:/var/run/docker.sock - - spoon-agent-work:/var/lib/spoon-agent/work + # Host bind mount (not a named volume) so the path is identical on the + # host and inside the worker, which is what the sibling job containers + # need for their `-v <host-path>:/workspace` mounts to resolve correctly. + - ${SPOON_AGENT_HOST_WORKDIR:-/var/lib/spoon-agent/work}:/var/lib/spoon-agent/work depends_on: spoon-backend: condition: service_healthy restart: unless-stopped - -volumes: - spoon-agent-work: diff --git a/docs/compose.prod.yml b/docs/compose.prod.yml new file mode 100644 index 0000000..5c75e47 --- /dev/null +++ b/docs/compose.prod.yml @@ -0,0 +1,162 @@ +# Production Compose for Spoon +# ----------------------------------------------------------------------------- +# Reference deployment for the production host. Copy this to the server and run +# with `docker compose -f compose.prod.yml up -d` (alongside your prod `.env`). +# +# Two things in here are load-bearing for the agent ("run a thread") to work. +# If you change them, read the comments first: +# +# 1. AGENT WORKDIR (spoon-agent-worker): the worker is containerized but +# launches the Codex job container by talking to the HOST Docker daemon. +# The host can only bind-mount real HOST paths, so the work directory MUST +# be a bind mount whose path is IDENTICAL inside and outside the container, +# and SPOON_AGENT_HOST_WORKDIR must match it. A named volume does NOT work +# here because its real host path is hidden from the worker. 
All three +# references to /var/lib/spoon-agent/work below must stay in sync; change +# them together if you want the data somewhere else. +# +# 2. IMAGE FRESHNESS: services use `pull_policy: always` + Watchtower labels so +# a redeploy / new push always lands. The Codex *job* image is pulled by the +# worker itself before its first job (see SPOON_AGENT_JOB_IMAGE); restarting the worker +# (which Watchtower does on a new image) re-pulls a fresh job image. + +networks: + nginx-bridge: # Change to network you plan to use + external: true + +services: + spoon-next: + image: git.gbrown.org/gib/${NEXT_CONTAINER_NAME}:latest + container_name: ${NEXT_CONTAINER_NAME} + hostname: ${NEXT_CONTAINER_NAME} + domainname: ${NEXT_DOMAIN} + networks: ['${NETWORK:-nginx-bridge}'] + #ports: ['${NEXT_PORT}:${NEXT_PORT}'] + pull_policy: always + environment: + - NODE_ENV=${NODE_ENV} + - SENTRY_AUTH_TOKEN=${SENTRY_AUTH_TOKEN} + - NEXT_PUBLIC_SITE_URL=${NEXT_PUBLIC_SITE_URL:-http://localhost:${NEXT_PORT:-3000}} + - NEXT_PUBLIC_CONVEX_URL=${NEXT_PUBLIC_CONVEX_URL:-http://${BACKEND_CONTAINER_NAME:-spoon-backend}:${BACKEND_PORT:-3210}} + - NEXT_PUBLIC_PLAUSIBLE_URL=${NEXT_PUBLIC_PLAUSIBLE_URL:-https://plausible.gbrown.org} + - NEXT_PUBLIC_SENTRY_DSN=${NEXT_PUBLIC_SENTRY_DSN} + - NEXT_PUBLIC_SENTRY_URL=${NEXT_PUBLIC_SENTRY_URL} + - NEXT_PUBLIC_SENTRY_ORG=${NEXT_PUBLIC_SENTRY_ORG:-sentry} + - NEXT_PUBLIC_SENTRY_PROJECT_NAME=${NEXT_PUBLIC_SENTRY_PROJECT_NAME} + - SPOON_AGENT_WORKER_URL=${SPOON_AGENT_WORKER_URL:-http://spoon-agent-worker:3921} + - SPOON_AGENT_WORKER_INTERNAL_TOKEN=${SPOON_AGENT_WORKER_INTERNAL_TOKEN} + - SPOON_WORKER_TOKEN=${SPOON_WORKER_TOKEN} + depends_on: ['spoon-backend', 'spoon-postgres'] + labels: ['com.centurylinklabs.watchtower.enable=true'] + tty: true + stdin_open: true + restart: unless-stopped + + spoon-agent-worker: + image: git.gbrown.org/gib/spoon-agent-worker:latest + container_name: spoon-agent-worker + hostname: spoon-agent-worker + domainname: 
worker.${NEXT_DOMAIN:-spoon.gbrown.org} + networks: ['${NETWORK:-nginx-bridge}'] + pull_policy: always + environment: + - GITHUB_APP_ID=${GITHUB_APP_ID} + - GITHUB_APP_PRIVATE_KEY=${GITHUB_APP_PRIVATE_KEY} + - NEXT_PUBLIC_CONVEX_URL=https://api.spoon.gbrown.org + - SPOON_AGENT_WORKER_ID=${SPOON_AGENT_WORKER_ID:-production-worker} + - SPOON_AGENT_JOB_IMAGE=${SPOON_AGENT_JOB_IMAGE:-git.gbrown.org/gib/spoon-agent-job:latest} + - SPOON_AGENT_RUNTIME=docker + - SPOON_AGENT_NETWORK=${NETWORK:-nginx-bridge} + # The work directory MUST be the same absolute path here, in the bind mount + # below, and in SPOON_AGENT_HOST_WORKDIR. See header note (1). + - SPOON_AGENT_WORKDIR=/var/lib/spoon-agent/work + - SPOON_AGENT_HOST_WORKDIR=/var/lib/spoon-agent/work + - SPOON_AGENT_WORKER_HTTP_PORT=${SPOON_AGENT_WORKER_HTTP_PORT:-3921} + - SPOON_AGENT_WORKER_INTERNAL_TOKEN=${SPOON_AGENT_WORKER_INTERNAL_TOKEN} + - SPOON_AGENT_MAX_CONCURRENT_JOBS=${SPOON_AGENT_MAX_CONCURRENT_JOBS:-1} + - SPOON_AGENT_JOB_TIMEOUT_MS=${SPOON_AGENT_JOB_TIMEOUT_MS:-1800000} + - SPOON_WORKER_TOKEN=${SPOON_WORKER_TOKEN} + volumes: + - /var/run/docker.sock:/var/run/docker.sock + # Identical host:container path so the sibling Codex job containers can + # bind-mount the workspace via the host daemon. Do NOT switch this to a + # named volume. See header note (1). 
+ - /var/lib/spoon-agent/work:/var/lib/spoon-agent/work + labels: ['com.centurylinklabs.watchtower.enable=true'] + tty: true + stdin_open: true + restart: unless-stopped + + spoon-backend: + image: ghcr.io/get-convex/convex-backend:${BACKEND_TAG:-latest} + container_name: ${BACKEND_CONTAINER_NAME:-spoon-backend} + hostname: ${BACKEND_CONTAINER_NAME:-spoon-backend} + domainname: ${BACKEND_DOMAIN:-convex.spoon.gbrown.org} + networks: ['${NETWORK:-nginx-bridge}'] + #user: '1000:1000' + #ports: ['${BACKEND_PORT:-3210}:3210','${SITE_PROXY_PORT:-3211}:3211'] + volumes: [./volumes/convex:/convex/data] + pull_policy: always + environment: + - INSTANCE_NAME=${INSTANCE_NAME} + - CONVEX_CLOUD_ORIGIN=${CONVEX_CLOUD_ORIGIN:-http://${BACKEND_CONTAINER_NAME:-spoon-backend}:${BACKEND_PORT:-3210}} + - CONVEX_SITE_ORIGIN=${CONVEX_SITE_ORIGIN:-http://${BACKEND_CONTAINER_NAME:-spoon-backend}:${SITE_PROXY_PORT:-3211}} + - DISABLE_BEACON=${DISABLE_BEACON:-true} + - REDACT_LOGS_TO_CLIENT=${REDACT_LOGS_TO_CLIENT:-true} + - DO_NOT_REQUIRE_SSL=${DO_NOT_REQUIRE_SSL:-false} + - POSTGRES_URL=${POSTGRES_URL} + depends_on: ['spoon-postgres'] + labels: ['com.centurylinklabs.watchtower.enable=true'] + stdin_open: true + tty: true + restart: unless-stopped + healthcheck: + test: curl -f http://localhost:3210/version + interval: 5s + start_period: 10s + stop_grace_period: 10s + stop_signal: SIGINT + + spoon-dashboard: + image: ghcr.io/get-convex/convex-dashboard:${DASHBOARD_TAG:-latest} + container_name: ${DASHBOARD_CONTAINER_NAME:-spoon-dashboard} + hostname: ${DASHBOARD_CONTAINER_NAME:-spoon-dashboard} + domainname: ${DASHBOARD_DOMAIN:-dashboard.${BACKEND_DOMAIN:-spoon.gbrown.org}} + networks: ['${NETWORK:-nginx-bridge}'] + #user: 1000:1000 + #ports: ['${DASHBOARD_PORT:-6791}:6791'] + pull_policy: always + environment: + - NEXT_PUBLIC_DEPLOYMENT_URL=${NEXT_PUBLIC_DEPLOYMENT_URL:-http://${BACKEND_CONTAINER_NAME:-spoon-backend}:${PORT:-3210}} + depends_on: + spoon-backend: + condition: 
service_healthy + labels: ['com.centurylinklabs.watchtower.enable=true'] + stdin_open: true + tty: true + restart: unless-stopped + stop_grace_period: 10s + stop_signal: SIGINT + + spoon-postgres: + image: postgres:17 + container_name: ${POSTGRES_CONTAINER_NAME:-spoon-postgres} + hostname: ${POSTGRES_CONTAINER_NAME:-spoon-postgres} + domainname: postgres.${NEXT_DOMAIN:-spoon.gbrown.org} + networks: ['${NETWORK:-nginx-bridge}'] + # ports: ["5434:5432"] + environment: + - POSTGRES_USER=${POSTGRES_USER:-spoon} + - POSTGRES_PASSWORD=${POSTGRES_PASSWORD} + - POSTGRES_DB=${POSTGRES_DB:-spoon_convex} + volumes: ['./volumes/postgres:/var/lib/postgresql/data'] + labels: ['com.centurylinklabs.watchtower.enable=true'] + tty: true + stdin_open: true + restart: unless-stopped + healthcheck: + test: ['CMD-SHELL', 'pg_isready -d $${POSTGRES_DB} -U $${POSTGRES_USER}'] + start_period: 20s + interval: 30s + retries: 5 + timeout: 5s