Fix worker image missing docker CLI; harden spawn-failure handling
Build and Push Spoon Images / quality (push) Successful in 1m47s
Build and Push Spoon Images / build-images (push) Successful in 6m33s

Root cause of the prod empty-response: the spoon-agent-worker image shipped
without a docker CLI binary, so it could never launch the codex job container.
On Debian trixie (the bun base) 'docker.io' + --no-install-recommends installs
the daemon package but omits the client (split into 'docker-cli'), leaving no
'docker' on PATH. execa('docker', ...) hit ENOENT, and with reject:false that
resolves with exitCode undefined -> coerced to 0 -> looked like a successful
empty run -> 'Codex completed without producing an assistant response'.

- agent-worker.Dockerfile: drop docker.io, install the official static docker
  CLI client pinned to 29.5.3 (matches the host daemon) to /usr/local/bin/docker
- runtime/docker.ts: normalizeRunResult() so a spawn failure (exitCode undefined)
  is always a non-zero exit carrying the real reason, never a silent empty success
- tests: cover the spawn-failure and normal-result paths
This commit is contained in:
Gabriel Brown
2026-06-24 06:31:17 -04:00
parent a2976481d7
commit 40a6dd78e4
3 changed files with 63 additions and 9 deletions
+27 -8
View File
@@ -38,6 +38,31 @@ export const ensureJobImagePulled = () => {
return jobImagePullPromise;
};
/**
 * Converts a raw execa result into a `CommandResult`, guaranteeing that a
 * process which never produced an exit code is reported as a failure.
 *
 * execa with `reject: false` resolves (does not throw) even when the runtime
 * binary is missing (ENOENT); `exitCode` is then `undefined`. Coercing that to
 * 0 makes a failed spawn look like a successful empty run, which is exactly
 * how a worker image without a `docker` CLI silently produced empty agent
 * responses. Any result without an exit code is therefore mapped to exit code
 * 1, with the failure reason appended to whatever output was captured.
 *
 * @param result - The execa result. Its fields are typed as optional and
 *   nullable on purpose: execa's types claim they are always present, but on a
 *   spawn failure they are missing at runtime.
 * @param output - Captured command output, if any; a missing value is treated
 *   as the empty string.
 * @param redact - Applied to the final output text (including the appended
 *   failure reason) before it is returned.
 * @returns The command's own exit code with redacted output, or exit code 1
 *   with the redacted output plus failure reason when no exit code exists.
 */
export const normalizeRunResult = (
  result: {
    exitCode?: number | null | undefined;
    shortMessage?: string | null | undefined;
  },
  output: string | undefined,
  redact: (value: string) => string,
): CommandResult => {
  const text = output ?? '';
  if (result.exitCode == null) {
    // `||` rather than `??` on purpose: an empty message explains nothing, so
    // it falls back to the generic reason just like a missing one.
    const reason = result.shortMessage || 'container runtime failed to start';
    return {
      exitCode: 1,
      // Keep any partial output and put the reason on its own line after it.
      output: redact(`${text}${text ? '\n' : ''}${reason}`),
    };
  }
  return { exitCode: result.exitCode, output: redact(text) };
};
const hostWorkspacePath = (workdir: string) => {
if (!env.hostWorkdir) return workdir;
const workerRoot = path.resolve(env.workdir);
@@ -92,10 +117,7 @@ export const runInJobContainer = async (args: {
timeout: args.timeoutMs,
},
);
return {
exitCode: result.exitCode ?? 0,
output: args.redact(result.all),
};
return normalizeRunResult(result, result.all, args.redact);
};
export const startWorkspaceContainer = async (args: {
@@ -279,10 +301,7 @@ export const streamInJobContainer = async (args: {
if (stderrBuffer && args.onStderrLine) {
await args.onStderrLine(args.redact(stderrBuffer));
}
return {
exitCode: result.exitCode ?? 0,
output: args.redact(output.join('')),
};
return normalizeRunResult(result, output.join(''), args.redact);
};
export const stopWorkspaceContainer = async (containerName: string) => {
@@ -43,4 +43,27 @@ describe('Docker runtime', () => {
'/tmp/spoon-job:/workspace:z',
);
});
test('treats a spawn failure (no exitCode) as a non-zero exit, not empty success', async () => {
  const runtime = await loadVolumeSpec();
  // Mirrors the execa result seen with `reject: false` when the runtime binary
  // cannot be spawned (e.g. the worker image has no `docker` CLI): there is no
  // exit code at all, only a short failure message.
  const spawnFailure = {
    exitCode: undefined,
    shortMessage: 'spawn docker ENOENT',
  };
  const passthrough = (value: string) => value;

  const { exitCode, output } = runtime.normalizeRunResult(
    spawnFailure,
    undefined,
    passthrough,
  );

  expect(exitCode).toBe(1);
  expect(output).toContain('spawn docker ENOENT');
});
test('passes through a normal command result unchanged', async () => {
  const runtime = await loadVolumeSpec();
  // A command that ran to completion: exit code 0 and no failure message.
  const completed = { exitCode: 0, shortMessage: undefined };
  const passthrough = (value: string) => value;

  const normalized = runtime.normalizeRunResult(completed, 'hello', passthrough);

  // Exit code and output must come back exactly as they went in.
  expect(normalized).toEqual({ exitCode: 0, output: 'hello' });
});
});
+13 -1
View File
@@ -11,12 +11,24 @@ RUN apt-get update \
bash \
ca-certificates \
curl \
docker.io \
git \
jq \
openssh-client \
&& rm -rf /var/lib/apt/lists/*
# The worker needs only the Docker *client*; the daemon belongs to the host and
# is reached through the bind-mounted /var/run/docker.sock.
#
# Debian's `docker.io` package is not used here: under
# `--no-install-recommends` it does NOT install the client (trixie split it out
# into `docker-cli`), which left the worker with no `docker` binary and
# silently broke every job. Instead, fetch the official static client, pinned
# to the host daemon's version.
ARG DOCKER_CLI_VERSION=29.5.3
# `set -e` aborts the build on the first failing step; the final
# `docker --version` fails the build if the client did not land on PATH.
RUN set -e; \
    machine="$(uname -m)"; \
    curl -fsSL "https://download.docker.com/linux/static/stable/${machine}/docker-${DOCKER_CLI_VERSION}.tgz" -o /tmp/docker.tgz; \
    tar -xzf /tmp/docker.tgz -C /tmp; \
    install -m0755 /tmp/docker/docker /usr/local/bin/docker; \
    rm -rf /tmp/docker /tmp/docker.tgz; \
    docker --version
WORKDIR /app
COPY package.json bun.lock* turbo.json ./