diff --git a/apps/agent-worker/src/runtime/docker.ts b/apps/agent-worker/src/runtime/docker.ts index a90fc31..a32288a 100644 --- a/apps/agent-worker/src/runtime/docker.ts +++ b/apps/agent-worker/src/runtime/docker.ts @@ -38,6 +38,31 @@ export const ensureJobImagePulled = () => { return jobImagePullPromise; }; +// execa with `reject: false` resolves (does not throw) even when the runtime +// binary is missing (ENOENT) — `exitCode` is then `undefined`. Coercing that to +// 0 makes a failed spawn look like a successful empty run, which is exactly how +// a worker image without a `docker` CLI silently produced empty agent +// responses. Normalize so any spawn failure is a non-zero exit carrying the +// real reason. +export const normalizeRunResult = ( + // Declared nullable on purpose: execa's types claim these are always present, + // but on a spawn failure (e.g. missing `docker` binary) `exitCode`/`all` are + // actually undefined at runtime. + result: { exitCode?: number; shortMessage?: string }, + output: string | undefined, + redact: (value: string) => string, +): CommandResult => { + const text = output ?? ''; + if (result.exitCode == null) { + const reason = result.shortMessage ?? 'container runtime failed to start'; + return { + exitCode: 1, + output: redact(`${text}${text ? '\n' : ''}${reason}`), + }; + } + return { exitCode: result.exitCode, output: redact(text) }; +}; + const hostWorkspacePath = (workdir: string) => { if (!env.hostWorkdir) return workdir; const workerRoot = path.resolve(env.workdir); @@ -92,10 +117,7 @@ export const runInJobContainer = async (args: { timeout: args.timeoutMs, }, ); - return { - exitCode: result.exitCode ?? 0, - output: args.redact(result.all), - }; + return normalizeRunResult(result, result.all, args.redact); }; export const startWorkspaceContainer = async (args: { @@ -279,10 +301,7 @@ export const streamInJobContainer = async (args: { if (stderrBuffer && args.onStderrLine) { await args.onStderrLine(args.redact(stderrBuffer)); } - return { - exitCode: result.exitCode ?? 0, - output: args.redact(output.join('')), - }; + return normalizeRunResult(result, output.join(''), args.redact); }; export const stopWorkspaceContainer = async (containerName: string) => { diff --git a/apps/agent-worker/tests/unit/docker-runtime.test.ts b/apps/agent-worker/tests/unit/docker-runtime.test.ts index f002abf..62bcc5d 100644 --- a/apps/agent-worker/tests/unit/docker-runtime.test.ts +++ b/apps/agent-worker/tests/unit/docker-runtime.test.ts @@ -43,4 +43,27 @@ describe('Docker runtime', () => { '/tmp/spoon-job:/workspace:z', ); }); + + test('treats a spawn failure (no exitCode) as a non-zero exit, not empty success', async () => { + const { normalizeRunResult } = await loadVolumeSpec(); + // This is what execa returns with `reject: false` when the runtime binary is + // missing (e.g. no `docker` CLI in the worker image): exitCode is undefined. + const result = normalizeRunResult( + { exitCode: undefined, shortMessage: 'spawn docker ENOENT' }, + undefined, + (value) => value, + ); + expect(result.exitCode).toBe(1); + expect(result.output).toContain('spawn docker ENOENT'); + }); + + test('passes through a normal command result unchanged', async () => { + const { normalizeRunResult } = await loadVolumeSpec(); + const result = normalizeRunResult( + { exitCode: 0, shortMessage: undefined }, + 'hello', + (value) => value, + ); + expect(result).toEqual({ exitCode: 0, output: 'hello' }); + }); }); diff --git a/docker/agent-worker.Dockerfile b/docker/agent-worker.Dockerfile index f5367aa..b83019a 100644 --- a/docker/agent-worker.Dockerfile +++ b/docker/agent-worker.Dockerfile @@ -11,12 +11,24 @@ RUN apt-get update \ bash \ ca-certificates \ curl \ - docker.io \ git \ jq \ openssh-client \ && rm -rf /var/lib/apt/lists/* +# Docker CLI client only — the daemon is the host's, reached via the bind-mounted +# /var/run/docker.sock. The Debian `docker.io` package does NOT install the +# client under `--no-install-recommends` (trixie split it into `docker-cli`), +# which left the worker with no `docker` binary and silently broke every job. +# Install the official static client pinned to the host daemon's version. +ARG DOCKER_CLI_VERSION=29.5.3 +RUN arch="$(uname -m)" \ + && curl -fsSL "https://download.docker.com/linux/static/stable/${arch}/docker-${DOCKER_CLI_VERSION}.tgz" -o /tmp/docker.tgz \ + && tar -xzf /tmp/docker.tgz -C /tmp \ + && install -m0755 /tmp/docker/docker /usr/local/bin/docker \ + && rm -rf /tmp/docker /tmp/docker.tgz \ + && docker --version + WORKDIR /app COPY package.json bun.lock* turbo.json ./