Fix worker image missing docker CLI; harden spawn-failure handling
Root cause of the prod empty-response: the spoon-agent-worker image shipped
without a docker CLI binary, so it could never launch the codex job container.
On Debian trixie (the bun base) 'docker.io' + --no-install-recommends installs
the daemon package but omits the client (split into 'docker-cli'), leaving no
'docker' on PATH. execa('docker', ...) hit ENOENT, and with reject:false that
resolves with exitCode undefined -> coerced to 0 -> looked like a successful
empty run -> 'Codex completed without producing an assistant response'.
- agent-worker.Dockerfile: drop docker.io, install the official static docker
CLI client pinned to 29.5.3 (matches the host daemon) to /usr/local/bin/docker
- runtime/docker.ts: normalizeRunResult() so a spawn failure (exitCode undefined) is
always a non-zero exit carrying the real reason, never a silent empty success
- tests: cover the spawn-failure and normal-result paths
This commit is contained in:
@@ -38,6 +38,31 @@ export const ensureJobImagePulled = () => {
|
|||||||
return jobImagePullPromise;
|
return jobImagePullPromise;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// execa with `reject: false` resolves (does not throw) even when the runtime
|
||||||
|
// binary is missing (ENOENT) — `exitCode` is then `undefined`. Coercing that to
|
||||||
|
// 0 makes a failed spawn look like a successful empty run, which is exactly how
|
||||||
|
// a worker image without a `docker` CLI silently produced empty agent
|
||||||
|
// responses. Normalize so any spawn failure is a non-zero exit carrying the
|
||||||
|
// real reason.
|
||||||
|
/**
 * Converts a finished runtime invocation into a `CommandResult`.
 *
 * When `result.exitCode` is missing (null or undefined) the run is reported
 * as exit code 1, and the failure reason is appended to whatever output was
 * captured. Otherwise the exit code is passed through unchanged. In both
 * cases the final output string is passed through `redact`.
 *
 * @param result - Exit information from the run; `shortMessage` is used as
 *   the failure reason when `exitCode` is missing.
 * @param output - Captured output; `undefined` is treated as an empty string.
 * @param redact - Applied to the complete output string before it is returned.
 * @returns The normalized exit code and redacted output.
 */
export const normalizeRunResult = (
|
||||||
|
// Declared optional on purpose: execa's types claim `exitCode` is always
|
||||||
|
// present, but on a spawn failure (e.g. missing `docker` binary) it is
|
||||||
|
// actually undefined at runtime. The same holds for the captured `output`.
|
||||||
|
result: { exitCode?: number; shortMessage?: string },
|
||||||
|
output: string | undefined,
|
||||||
|
redact: (value: string) => string,
|
||||||
|
): CommandResult => {
|
||||||
|
const text = output ?? '';
|
||||||
|
// `== null` matches both `undefined` and `null`.
if (result.exitCode == null) {
|
||||||
|
// Fall back to a generic reason when no `shortMessage` is provided.
const reason = result.shortMessage ?? 'container runtime failed to start';
|
||||||
|
return {
|
||||||
|
exitCode: 1,
|
||||||
|
// The reason goes after any captured output, separated by a newline only
// when there is output. Redaction covers the reason as well.
output: redact(`${text}${text ? '\n' : ''}${reason}`),
|
||||||
|
};
|
||||||
|
}
|
||||||
|
return { exitCode: result.exitCode, output: redact(text) };
|
||||||
|
};
|
||||||
|
|
||||||
const hostWorkspacePath = (workdir: string) => {
|
const hostWorkspacePath = (workdir: string) => {
|
||||||
if (!env.hostWorkdir) return workdir;
|
if (!env.hostWorkdir) return workdir;
|
||||||
const workerRoot = path.resolve(env.workdir);
|
const workerRoot = path.resolve(env.workdir);
|
||||||
@@ -92,10 +117,7 @@ export const runInJobContainer = async (args: {
|
|||||||
timeout: args.timeoutMs,
|
timeout: args.timeoutMs,
|
||||||
},
|
},
|
||||||
);
|
);
|
||||||
return {
|
return normalizeRunResult(result, result.all, args.redact);
|
||||||
exitCode: result.exitCode ?? 0,
|
|
||||||
output: args.redact(result.all),
|
|
||||||
};
|
|
||||||
};
|
};
|
||||||
|
|
||||||
export const startWorkspaceContainer = async (args: {
|
export const startWorkspaceContainer = async (args: {
|
||||||
@@ -279,10 +301,7 @@ export const streamInJobContainer = async (args: {
|
|||||||
if (stderrBuffer && args.onStderrLine) {
|
if (stderrBuffer && args.onStderrLine) {
|
||||||
await args.onStderrLine(args.redact(stderrBuffer));
|
await args.onStderrLine(args.redact(stderrBuffer));
|
||||||
}
|
}
|
||||||
return {
|
return normalizeRunResult(result, output.join(''), args.redact);
|
||||||
exitCode: result.exitCode ?? 0,
|
|
||||||
output: args.redact(output.join('')),
|
|
||||||
};
|
|
||||||
};
|
};
|
||||||
|
|
||||||
export const stopWorkspaceContainer = async (containerName: string) => {
|
export const stopWorkspaceContainer = async (containerName: string) => {
|
||||||
|
|||||||
@@ -43,4 +43,27 @@ describe('Docker runtime', () => {
|
|||||||
'/tmp/spoon-job:/workspace:z',
|
'/tmp/spoon-job:/workspace:z',
|
||||||
);
|
);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
test('treats a spawn failure (no exitCode) as a non-zero exit, not empty success', async () => {
|
||||||
|
const { normalizeRunResult } = await loadVolumeSpec();
|
||||||
|
// This is what execa returns with `reject: false` when the runtime binary is
|
||||||
|
// missing (e.g. no `docker` CLI in the worker image): exitCode is undefined.
|
||||||
|
const result = normalizeRunResult(
|
||||||
|
{ exitCode: undefined, shortMessage: 'spawn docker ENOENT' },
|
||||||
|
undefined,
|
||||||
|
(value) => value,
|
||||||
|
);
|
||||||
|
expect(result.exitCode).toBe(1);
|
||||||
|
expect(result.output).toContain('spawn docker ENOENT');
|
||||||
|
});
|
||||||
|
|
||||||
|
test('passes through a normal command result unchanged', async () => {
|
||||||
|
const { normalizeRunResult } = await loadVolumeSpec();
|
||||||
|
const result = normalizeRunResult(
|
||||||
|
{ exitCode: 0, shortMessage: undefined },
|
||||||
|
'hello',
|
||||||
|
(value) => value,
|
||||||
|
);
|
||||||
|
expect(result).toEqual({ exitCode: 0, output: 'hello' });
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -11,12 +11,24 @@ RUN apt-get update \
|
|||||||
bash \
|
bash \
|
||||||
ca-certificates \
|
ca-certificates \
|
||||||
curl \
|
curl \
|
||||||
docker.io \
|
|
||||||
git \
|
git \
|
||||||
jq \
|
jq \
|
||||||
openssh-client \
|
openssh-client \
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
# Docker CLI client only — the daemon is the host's, reached via the bind-mounted
|
||||||
|
# /var/run/docker.sock. The Debian `docker.io` package does NOT install the
|
||||||
|
# client under `--no-install-recommends` (trixie split it into `docker-cli`),
|
||||||
|
# which left the worker with no `docker` binary and silently broke every job.
|
||||||
|
# Install the official static client pinned to the host daemon's version.
|
||||||
|
ARG DOCKER_CLI_VERSION=29.5.3
|
||||||
|
RUN arch="$(uname -m)" \
|
||||||
|
&& curl -fsSL "https://download.docker.com/linux/static/stable/${arch}/docker-${DOCKER_CLI_VERSION}.tgz" -o /tmp/docker.tgz \
|
||||||
|
&& tar -xzf /tmp/docker.tgz -C /tmp \
|
||||||
|
&& install -m0755 /tmp/docker/docker /usr/local/bin/docker \
|
||||||
|
&& rm -rf /tmp/docker /tmp/docker.tgz \
|
||||||
|
&& docker --version
|
||||||
|
|
||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
|
|
||||||
COPY package.json bun.lock* turbo.json ./
|
COPY package.json bun.lock* turbo.json ./
|
||||||
|
|||||||
Reference in New Issue
Block a user