Fix worker image missing docker CLI; harden spawn-failure handling
Root cause of the prod empty-response: the spoon-agent-worker image shipped
without a docker CLI binary, so it could never launch the codex job container.
On Debian trixie (the bun base) 'docker.io' + --no-install-recommends installs
the daemon package but omits the client (split into 'docker-cli'), leaving no
'docker' on PATH. execa('docker', ...) hit ENOENT, and with reject:false that
resolves with exitCode undefined -> coerced to 0 -> looked like a successful
empty run -> 'Codex completed without producing an assistant response'.
- agent-worker.Dockerfile: drop docker.io, install the official static docker
CLI client pinned to 29.5.3 (matches the host daemon) to /usr/local/bin/docker
- runtime/docker.ts: add normalizeRunResult() so that a spawn failure (exitCode
undefined/null) is always reported as a non-zero exit carrying the real reason,
never as a silent empty success
- tests: cover the spawn-failure and normal-result paths
This commit is contained in:
@@ -38,6 +38,31 @@ export const ensureJobImagePulled = () => {
|
||||
return jobImagePullPromise;
|
||||
};
|
||||
|
||||
/**
 * Converts a raw execa result into a `CommandResult`, guaranteeing that a
 * process which never produced an exit code is reported as a failure.
 *
 * execa with `reject: false` resolves (does not throw) even when the runtime
 * binary is missing (ENOENT); `exitCode` is then `undefined`. Coercing that to
 * 0 makes a failed spawn look like a successful empty run, which is exactly how
 * a worker image without a `docker` CLI silently produced empty agent
 * responses. Normalizing here means any spawn failure becomes a non-zero exit
 * carrying the real reason.
 *
 * @param result - The subset of the execa result that is inspected. The fields
 *   are declared optional/nullable on purpose: execa's types claim they are
 *   always present, but on a spawn failure (e.g. missing `docker` binary) they
 *   are actually undefined at runtime.
 * @param output - Captured command output; `undefined` is treated as empty.
 * @param redact - Applied to the final output text before it is returned.
 * @returns The command's own exit code with redacted output, or exit code 1
 *   with the failure reason appended to any captured output when the process
 *   produced no exit code.
 */
export const normalizeRunResult = (
  result: { exitCode?: number | null; shortMessage?: string },
  output: string | undefined,
  redact: (value: string) => string,
): CommandResult => {
  const captured = output ?? '';
  const { exitCode } = result;

  // Normal case: the process ran and reported an exit code (0 included).
  if (exitCode != null) {
    return { exitCode, output: redact(captured) };
  }

  // No exit code at all: surface the reason instead of an empty "success".
  const reason = result.shortMessage ?? 'container runtime failed to start';
  const combined = captured ? `${captured}\n${reason}` : reason;
  return { exitCode: 1, output: redact(combined) };
};
|
||||
|
||||
const hostWorkspacePath = (workdir: string) => {
|
||||
if (!env.hostWorkdir) return workdir;
|
||||
const workerRoot = path.resolve(env.workdir);
|
||||
@@ -92,10 +117,7 @@ export const runInJobContainer = async (args: {
|
||||
timeout: args.timeoutMs,
|
||||
},
|
||||
);
|
||||
return {
|
||||
exitCode: result.exitCode ?? 0,
|
||||
output: args.redact(result.all),
|
||||
};
|
||||
return normalizeRunResult(result, result.all, args.redact);
|
||||
};
|
||||
|
||||
export const startWorkspaceContainer = async (args: {
|
||||
@@ -279,10 +301,7 @@ export const streamInJobContainer = async (args: {
|
||||
if (stderrBuffer && args.onStderrLine) {
|
||||
await args.onStderrLine(args.redact(stderrBuffer));
|
||||
}
|
||||
return {
|
||||
exitCode: result.exitCode ?? 0,
|
||||
output: args.redact(output.join('')),
|
||||
};
|
||||
return normalizeRunResult(result, output.join(''), args.redact);
|
||||
};
|
||||
|
||||
export const stopWorkspaceContainer = async (containerName: string) => {
|
||||
|
||||
@@ -43,4 +43,27 @@ describe('Docker runtime', () => {
|
||||
'/tmp/spoon-job:/workspace:z',
|
||||
);
|
||||
});
|
||||
|
||||
// Regression guard: a result without an exit code must surface as a failure
// that carries the spawn error text, never as an empty successful run.
test('treats a spawn failure (no exitCode) as a non-zero exit, not empty success', async () => {
  const { normalizeRunResult } = await loadVolumeSpec();

  // This is what execa returns with `reject: false` when the runtime binary is
  // missing (e.g. no `docker` CLI in the worker image): exitCode is undefined.
  const spawnFailure = { exitCode: undefined, shortMessage: 'spawn docker ENOENT' };

  // No captured output, and an identity redactor so the reason is visible as-is.
  const result = normalizeRunResult(spawnFailure, undefined, (value) => value);

  expect(result.exitCode).toBe(1);
  expect(result.output).toContain('spawn docker ENOENT');
});
|
||||
|
||||
// A result that does carry an exit code is returned untouched: same code, same
// output (the redactor used here is the identity function).
test('passes through a normal command result unchanged', async () => {
  const { normalizeRunResult } = await loadVolumeSpec();
  const identity = (value: string) => value;

  const succeeded = normalizeRunResult(
    { exitCode: 0, shortMessage: undefined },
    'hello',
    identity,
  );
  expect(succeeded).toEqual({ exitCode: 0, output: 'hello' });

  // A real non-zero exit is also a "normal" result and must not be rewritten.
  const failed = normalizeRunResult(
    { exitCode: 2, shortMessage: undefined },
    'boom',
    identity,
  );
  expect(failed).toEqual({ exitCode: 2, output: 'boom' });
});
|
||||
});
|
||||
|
||||
@@ -11,12 +11,24 @@ RUN apt-get update \
|
||||
bash \
|
||||
ca-certificates \
|
||||
curl \
|
||||
docker.io \
|
||||
git \
|
||||
jq \
|
||||
openssh-client \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Docker CLI client only — the daemon is the host's, reached via the bind-mounted
# /var/run/docker.sock. The Debian `docker.io` package does NOT install the
# client under `--no-install-recommends` (trixie split it into `docker-cli`),
# which left the worker with no `docker` binary and silently broke every job.
# Install the official static client pinned to the host daemon's version.
ARG DOCKER_CLI_VERSION=29.5.3
# Download the static bundle for this build's architecture, install only the
# `docker` client binary, clean up, then run `docker --version` so the build
# fails immediately if the binary is missing or not executable.
RUN arch="$(uname -m)" \
    && curl -fsSL "https://download.docker.com/linux/static/stable/${arch}/docker-${DOCKER_CLI_VERSION}.tgz" -o /tmp/docker.tgz \
    && tar -xzf /tmp/docker.tgz -C /tmp \
    && install -m0755 /tmp/docker/docker /usr/local/bin/docker \
    && rm -rf /tmp/docker /tmp/docker.tgz \
    && docker --version
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
COPY package.json bun.lock* turbo.json ./
|
||||
|
||||
Reference in New Issue
Block a user