From 40a6dd78e4a73d72817b28b0f44b9622152c2e21 Mon Sep 17 00:00:00 2001 From: Gabriel Brown Date: Wed, 24 Jun 2026 06:31:17 -0400 Subject: [PATCH] Fix worker image missing docker CLI; harden spawn-failure handling Root cause of the prod empty-response: the spoon-agent-worker image shipped without a docker CLI binary, so it could never launch the codex job container. On Debian trixie (the bun base) 'docker.io' + --no-install-recommends installs the daemon package but omits the client (split into 'docker-cli'), leaving no 'docker' on PATH. execa('docker', ...) hit ENOENT, and with reject:false that resolves with exitCode undefined -> coerced to 0 -> looked like a successful empty run -> 'Codex completed without producing an assistant response'. - agent-worker.Dockerfile: drop docker.io, install the official static docker CLI client pinned to 29.5.3 (matches the host daemon) to /usr/local/bin/docker - runtime/docker.ts: normalizeRunResult() so a spawn failure (exitCode undefined) is always a non-zero exit carrying the real reason, never a silent empty success - tests: cover the spawn-failure and normal-result paths --- apps/agent-worker/src/runtime/docker.ts | 35 ++++++++++++++----- .../tests/unit/docker-runtime.test.ts | 23 ++++++++++++ docker/agent-worker.Dockerfile | 14 +++++++- 3 files changed, 63 insertions(+), 9 deletions(-) diff --git a/apps/agent-worker/src/runtime/docker.ts b/apps/agent-worker/src/runtime/docker.ts index a90fc31..a32288a 100644 --- a/apps/agent-worker/src/runtime/docker.ts +++ b/apps/agent-worker/src/runtime/docker.ts @@ -38,6 +38,31 @@ export const ensureJobImagePulled = () => { return jobImagePullPromise; }; +// execa with `reject: false` resolves (does not throw) even when the runtime +// binary is missing (ENOENT) — `exitCode` is then `undefined`. Coercing that to +// 0 makes a failed spawn look like a successful empty run, which is exactly how +// a worker image without a `docker` CLI silently produced empty agent +// responses.
Normalize so any spawn failure is a non-zero exit carrying the +// real reason. +export const normalizeRunResult = ( + // Declared optional on purpose: execa's types claim these are always present, + // but on a spawn failure (e.g. missing `docker` binary) `exitCode`/`all` are + // actually undefined at runtime. + result: { exitCode?: number; shortMessage?: string }, + output: string | undefined, + redact: (value: string) => string, +): CommandResult => { + const text = output ?? ''; + if (result.exitCode == null) { + const reason = result.shortMessage ?? 'container runtime failed to start'; + return { + exitCode: 1, + output: redact(`${text}${text ? '\n' : ''}${reason}`), + }; + } + return { exitCode: result.exitCode, output: redact(text) }; +}; + const hostWorkspacePath = (workdir: string) => { if (!env.hostWorkdir) return workdir; const workerRoot = path.resolve(env.workdir); @@ -92,10 +117,7 @@ export const runInJobContainer = async (args: { timeout: args.timeoutMs, }, ); - return { - exitCode: result.exitCode ?? 0, - output: args.redact(result.all), - }; + return normalizeRunResult(result, result.all, args.redact); }; export const startWorkspaceContainer = async (args: { @@ -279,10 +301,7 @@ export const streamInJobContainer = async (args: { if (stderrBuffer && args.onStderrLine) { await args.onStderrLine(args.redact(stderrBuffer)); } - return { - exitCode: result.exitCode ?? 
0, - output: args.redact(output.join('')), - }; + return normalizeRunResult(result, output.join(''), args.redact); }; export const stopWorkspaceContainer = async (containerName: string) => { diff --git a/apps/agent-worker/tests/unit/docker-runtime.test.ts b/apps/agent-worker/tests/unit/docker-runtime.test.ts index f002abf..62bcc5d 100644 --- a/apps/agent-worker/tests/unit/docker-runtime.test.ts +++ b/apps/agent-worker/tests/unit/docker-runtime.test.ts @@ -43,4 +43,27 @@ describe('Docker runtime', () => { '/tmp/spoon-job:/workspace:z', ); }); + + test('treats a spawn failure (no exitCode) as a non-zero exit, not empty success', async () => { + const { normalizeRunResult } = await loadVolumeSpec(); + // This is what execa returns with `reject: false` when the runtime binary is + // missing (e.g. no `docker` CLI in the worker image): exitCode is undefined. + const result = normalizeRunResult( + { exitCode: undefined, shortMessage: 'spawn docker ENOENT' }, + undefined, + (value) => value, + ); + expect(result.exitCode).toBe(1); + expect(result.output).toContain('spawn docker ENOENT'); + }); + + test('passes through a normal command result unchanged', async () => { + const { normalizeRunResult } = await loadVolumeSpec(); + const result = normalizeRunResult( + { exitCode: 0, shortMessage: undefined }, + 'hello', + (value) => value, + ); + expect(result).toEqual({ exitCode: 0, output: 'hello' }); + }); }); diff --git a/docker/agent-worker.Dockerfile b/docker/agent-worker.Dockerfile index f5367aa..b83019a 100644 --- a/docker/agent-worker.Dockerfile +++ b/docker/agent-worker.Dockerfile @@ -11,12 +11,24 @@ RUN apt-get update \ bash \ ca-certificates \ curl \ - docker.io \ git \ jq \ openssh-client \ && rm -rf /var/lib/apt/lists/* +# Docker CLI client only — the daemon is the host's, reached via the bind-mounted +# /var/run/docker.sock. 
The Debian `docker.io` package does NOT install the +# client under `--no-install-recommends` (trixie split it into `docker-cli`), +# which left the worker with no `docker` binary and silently broke every job. +# Install the official static client pinned to the host daemon's version. +ARG DOCKER_CLI_VERSION=29.5.3 +RUN arch="$(uname -m)" \ + && curl -fsSL "https://download.docker.com/linux/static/stable/${arch}/docker-${DOCKER_CLI_VERSION}.tgz" -o /tmp/docker.tgz \ + && tar -xzf /tmp/docker.tgz -C /tmp \ + && install -m0755 /tmp/docker/docker /usr/local/bin/docker \ + && rm -rf /tmp/docker /tmp/docker.tgz \ + && docker --version + WORKDIR /app COPY package.json bun.lock* turbo.json ./