Files
spoon/apps/agent-worker/src/runtime/docker.ts
T
Gabriel Brown 40a6dd78e4
Build and Push Spoon Images / quality (push) Successful in 1m47s
Build and Push Spoon Images / build-images (push) Successful in 6m33s
Fix worker image missing docker CLI; harden spawn-failure handling
Root cause of the prod empty-response: the spoon-agent-worker image shipped
without a docker CLI binary, so it could never launch the codex job container.
On Debian trixie (the bun base) 'docker.io' + --no-install-recommends installs
the daemon package but omits the client (split into 'docker-cli'), leaving no
'docker' on PATH. execa('docker', ...) hit ENOENT, and with reject:false that
resolves with exitCode undefined -> coerced to 0 -> looked like a successful
empty run -> 'Codex completed without producing an assistant response'.

- agent-worker.Dockerfile: drop docker.io, install the official static docker
  CLI client pinned to 29.5.3 (matches the host daemon) to /usr/local/bin/docker
- runtime/docker.ts: normalizeRunResult() so a spawn failure (exitCode null) is
  always a non-zero exit carrying the real reason, never a silent empty success
- tests: cover the spawn-failure and normal-result paths
2026-06-24 06:31:17 -04:00

340 lines
9.2 KiB
TypeScript

import { execa } from 'execa';
import path from 'node:path';
import { env } from '../env';
/** Outcome of one container CLI invocation, as returned by the run/exec helpers in this module. */
type CommandResult = {
// Numeric exit status reported for the command.
exitCode: number;
// Output text captured from the command.
output: string;
};
/**
 * Expands an environment map into repeated `-e NAME=value` CLI arguments,
 * one pair per entry, in the map's own iteration order.
 */
const environmentArgs = (environment: Record<string, string>) => {
  const flags: string[] = [];
  for (const [name, value] of Object.entries(environment)) {
    flags.push('-e', `${name}=${value}`);
  }
  return flags;
};
/** `--network <name>` arguments when `env.network` is set; otherwise no arguments. */
const networkArgs = () => {
  if (!env.network) return [];
  return ['--network', env.network];
};
/** The container CLI to invoke, read from `env.containerRuntime` on every call. */
const containerRuntime = () => {
  return env.containerRuntime;
};
// Memoized result of the single image pull this worker process performs.
// `docker run` keeps using whatever local `:latest` it already has, so unless
// we pull explicitly the job image would never update in production. Pulling
// once per process (i.e. once per deploy/restart) means a fresh worker always
// starts from a fresh job image.
let jobImagePullPromise: Promise<void> | undefined;

/**
 * Pulls `env.jobImage` at most once per worker process and returns the shared
 * promise for that attempt.
 *
 * The pull is best-effort: a failed pull (non-zero exit, unreachable registry,
 * missing runtime) is swallowed so jobs fall back to the locally cached image
 * instead of failing. The returned promise therefore never rejects.
 */
export const ensureJobImagePulled = () => {
  const pullOnce = async (): Promise<void> => {
    try {
      await execa(containerRuntime(), ['pull', env.jobImage], {
        stdin: 'ignore',
        reject: false,
      });
    } catch {
      // Deliberately ignored: continue with the locally cached image.
    }
  };

  if (jobImagePullPromise == null) {
    jobImagePullPromise = pullOnce();
  }
  const pending: Promise<void> = jobImagePullPromise;
  return pending;
};
/**
 * Converts a raw execa result into a `CommandResult` that can never report a
 * failed spawn as success.
 *
 * With `reject: false`, execa resolves instead of throwing even when the
 * runtime binary is missing (ENOENT); `exitCode` is then `undefined`. Treating
 * that as 0 makes a failed spawn look like a successful run with empty output,
 * which is how a worker image without a `docker` CLI silently produced empty
 * agent responses. Here a missing exit code always becomes exit code 1, with
 * the failure reason appended to the output.
 *
 * @param result - The execa result. Its fields are typed as optional on
 *   purpose: execa's own types claim they are always present, but on a spawn
 *   failure they are undefined at runtime.
 * @param output - Captured output text, or `undefined` when nothing was captured.
 * @param redact - Applied to the final output text before it is returned.
 * @returns The real exit code with redacted output, or exit code 1 with the
 *   redacted output plus reason when no exit code is available.
 */
export const normalizeRunResult = (
  result: { exitCode?: number; shortMessage?: string },
  output: string | undefined,
  redact: (value: string) => string,
): CommandResult => {
  const captured = output ?? '';
  const { exitCode, shortMessage } = result;

  // Normal case: the process ran and reported a status.
  if (exitCode != null) {
    return { exitCode, output: redact(captured) };
  }

  // No exit code: surface why, on its own line after anything already captured.
  const reason = shortMessage ?? 'container runtime failed to start';
  const separator = captured ? '\n' : '';
  return {
    exitCode: 1,
    output: redact(`${captured}${separator}${reason}`),
  };
};
/**
 * Maps a workspace path under the worker's `env.workdir` to the matching path
 * under `env.hostWorkdir`.
 *
 * Returns `workdir` unchanged when `env.hostWorkdir` is not set, or when
 * `workdir` does not lie inside `env.workdir`.
 */
const hostWorkspacePath = (workdir: string) => {
  if (!env.hostWorkdir) return workdir;
  const workerRoot = path.resolve(env.workdir);
  const relative = path.relative(workerRoot, path.resolve(workdir));
  // Only a leading `..` path *segment* means the path leaves the root. A plain
  // `startsWith('..')` would also reject in-root names such as `..cache`.
  const escapesRoot =
    relative === '..' ||
    relative.startsWith(`..${path.sep}`) ||
    path.isAbsolute(relative);
  if (escapesRoot) return workdir;
  return path.join(env.hostWorkdir, relative);
};
/**
 * Builds the `-v` volume spec that mounts a job's workspace at `/workspace`.
 *
 * Mount options come from `env.containerVolumeOptions`; when that is not set
 * and the runtime name ends with `podman`, `Z` is used. With no options the
 * spec is just `<source>:/workspace`.
 *
 * @param workdir - Workspace directory as seen by the worker; it is passed
 *   through `hostWorkspacePath` before being used as the mount source.
 */
export const jobWorkspaceVolumeSpec = (workdir: string) => {
  let options = env.containerVolumeOptions;
  if (options == null && containerRuntime().endsWith('podman')) {
    options = 'Z';
  }
  const mount = `${hostWorkspacePath(workdir)}:/workspace`;
  if (options) {
    return `${mount}:${options}`;
  }
  return mount;
};
/**
 * Runs a command to completion in a throwaway (`--rm`) job container and
 * returns its exit code and combined output.
 *
 * The container is limited to 4g of memory and 2 CPUs, mounts the workspace at
 * `/workspace`, and starts in `/workspace/repo`. A non-zero exit does not
 * throw (`reject: false`); the result goes through `normalizeRunResult`, so a
 * runtime that fails to start is reported as a non-zero exit.
 *
 * @param args.workdir - Workspace directory to mount into the container.
 * @param args.command - Command and arguments executed inside the job image.
 * @param args.environment - Variables passed to the container via `-e`.
 * @param args.redact - Applied to the output before it is returned.
 * @param args.timeoutMs - Passed to execa as the command timeout.
 */
export const runInJobContainer = async (args: {
  workdir: string;
  command: string[];
  environment: Record<string, string>;
  redact: (value: string) => string;
  timeoutMs: number;
}): Promise<CommandResult> => {
  await ensureJobImagePulled();

  const runtime = containerRuntime();
  const runArgs = [
    'run',
    '--rm',
    '--memory',
    '4g',
    '--cpus',
    '2',
    ...networkArgs(),
    ...environmentArgs(args.environment),
    '-v',
    jobWorkspaceVolumeSpec(args.workdir),
    '-w',
    '/workspace/repo',
    env.jobImage,
    ...args.command,
  ];

  const result = await execa(runtime, runArgs, {
    all: true,
    reject: false,
    stdin: 'ignore',
    timeout: args.timeoutMs,
  });
  return normalizeRunResult(result, result.all, args.redact);
};
/**
 * Starts a detached, named workspace container from the job image.
 *
 * Any existing container with the same name is force-removed first (failures
 * of that removal are ignored). The new container is limited to 4g of memory
 * and 2 CPUs, mounts the workspace at `/workspace`, starts in
 * `/workspace/repo`, and runs `args.command` or `sleep infinity` by default.
 * Unlike the run/exec helpers, the `run` call here does not set
 * `reject: false`, so a failed start rejects.
 *
 * @param args.publishTcpPort - When set, the container port is published with
 *   `-p 127.0.0.1::<port>` and the resulting host port is looked up afterwards.
 * @returns The container id printed by the runtime, the container name, and
 *   the published host port (or `undefined` when no port was requested).
 */
export const startWorkspaceContainer = async (args: {
  workdir: string;
  containerName: string;
  environment: Record<string, string>;
  command?: string[];
  publishTcpPort?: number;
}) => {
  await ensureJobImagePulled();
  const { containerName, publishTcpPort } = args;

  // Clear out any leftover container that would collide on the name.
  await execa(containerRuntime(), ['rm', '-f', containerName], {
    reject: false,
  });

  const runtime = containerRuntime();
  const networkFlags = networkArgs();
  const publishFlags = publishTcpPort
    ? ['-p', `127.0.0.1::${publishTcpPort}`]
    : [];
  const containerCommand = args.command ?? ['sleep', 'infinity'];

  const result = await execa(
    runtime,
    [
      'run',
      '-d',
      '--name',
      containerName,
      '--memory',
      '4g',
      '--cpus',
      '2',
      ...networkFlags,
      ...publishFlags,
      ...environmentArgs(args.environment),
      '-v',
      jobWorkspaceVolumeSpec(args.workdir),
      '-w',
      '/workspace/repo',
      env.jobImage,
      ...containerCommand,
    ],
    { all: true, stdin: 'ignore' },
  );

  const containerId = result.stdout.trim();
  const hostPort = publishTcpPort
    ? await getPublishedPort(containerName, publishTcpPort)
    : undefined;
  return { containerId, containerName, hostPort };
};
/**
 * Looks up the host port the runtime assigned to a published container TCP port.
 *
 * Runs `<runtime> port <name> <port>/tcp` and takes the digits after the last
 * `:` at the end of the output.
 *
 * @returns The host port as a string of digits.
 * @throws Error when the output does not end in `:<digits>`, including when
 *   the runtime command fails or cannot be started.
 */
const getPublishedPort = async (containerName: string, containerPort: number) => {
  const result = await execa(
    containerRuntime(),
    ['port', containerName, `${containerPort}/tcp`],
    { all: true, reject: false, stdin: 'ignore' },
  );
  // execa's types say `all` is a string, but when the runtime fails to spawn
  // it is undefined at runtime. Treat that as empty output so the descriptive
  // error below is raised instead of a TypeError from `.trim()`.
  const combined: string | undefined = result.all;
  const output = (combined ?? '').trim();
  const match = /:(\d+)\s*$/.exec(output);
  if (!match?.[1]) {
    throw new Error(
      `Could not determine published port for ${containerName}:${containerPort}.`,
    );
  }
  return match[1];
};
/**
 * Executes a command inside an already-running workspace container and
 * returns its exit code and combined output.
 *
 * A non-zero exit does not throw (`reject: false`). The result goes through
 * `normalizeRunResult`, so an invocation with no exit code (the runtime binary
 * could not be spawned, or the command was killed by the timeout) is reported
 * as exit code 1 with the reason appended, never as a successful empty run.
 *
 * @param args.containerName - Name of the target container.
 * @param args.command - Command and arguments to execute in the container.
 * @param args.environment - Optional variables passed to `exec` via `-e`.
 * @param args.redact - Applied to the output before it is returned.
 * @param args.timeoutMs - Passed to execa as the command timeout.
 */
export const execInWorkspaceContainer = async (args: {
  containerName: string;
  command: string[];
  environment?: Record<string, string>;
  redact: (value: string) => string;
  timeoutMs: number;
}): Promise<CommandResult> => {
  const result = await execa(
    containerRuntime(),
    [
      'exec',
      ...(args.environment ? environmentArgs(args.environment) : []),
      args.containerName,
      ...args.command,
    ],
    {
      all: true,
      reject: false,
      stdin: 'ignore',
      timeout: args.timeoutMs,
    },
  );
  // Do not coerce a missing exit code to 0: that would hide spawn failures
  // and timeouts behind an apparently successful result.
  return normalizeRunResult(result, result.all, args.redact);
};
/**
 * Runs a command in a throwaway (`--rm`) job container while streaming its
 * stdout/stderr line by line to the optional callbacks.
 *
 * Container setup matches the non-streaming run: 4g memory, 2 CPUs, workspace
 * mounted at `/workspace`, working directory `/workspace/repo`. Lines are
 * split on `\n` or `\r\n`; each line is redacted before it reaches a callback,
 * and callbacks are awaited one at a time in arrival order. A trailing line
 * without a newline is delivered after the process finishes.
 *
 * Bytes are decoded with one streaming UTF-8 decoder per pipe, so a
 * multi-byte character split across two chunks is reassembled rather than
 * turned into replacement characters.
 *
 * @param args.redact - Applied to every streamed line and to the final output.
 * @param args.timeoutMs - Passed to execa as the command timeout.
 * @param args.onStdoutLine - Called with each complete (redacted) stdout line.
 * @param args.onStderrLine - Called with each complete (redacted) stderr line.
 * @returns The exit code and the redacted text of everything received on both
 *   pipes, normalized by `normalizeRunResult`.
 */
export const streamInJobContainer = async (args: {
  workdir: string;
  command: string[];
  environment: Record<string, string>;
  redact: (value: string) => string;
  timeoutMs: number;
  onStdoutLine?: (line: string) => Promise<void>;
  onStderrLine?: (line: string) => Promise<void>;
}): Promise<CommandResult> => {
  await ensureJobImagePulled();
  const subprocess = execa(
    containerRuntime(),
    [
      'run',
      '--rm',
      '--memory',
      '4g',
      '--cpus',
      '2',
      ...networkArgs(),
      ...environmentArgs(args.environment),
      '-v',
      jobWorkspaceVolumeSpec(args.workdir),
      '-w',
      '/workspace/repo',
      env.jobImage,
      ...args.command,
    ],
    {
      all: true,
      reject: false,
      stdin: 'ignore',
      timeout: args.timeoutMs,
    },
  );

  // Each pipe gets its own decoder because their bytes interleave
  // independently. `ignoreBOM: true` keeps a leading BOM in the text, matching
  // what `Buffer#toString('utf8')` would have produced.
  const decoders = {
    stdout: new TextDecoder('utf-8', { ignoreBOM: true }),
    stderr: new TextDecoder('utf-8', { ignoreBOM: true }),
  };
  // Text received after the last newline on each pipe (an unfinished line).
  const partialLines = { stdout: '', stderr: '' };
  const output: string[] = [];
  // Serializes line callbacks so they run one at a time, in arrival order.
  let lineHandlers = Promise.resolve();

  const consume = async (
    text: string,
    source: 'stdout' | 'stderr',
    handler?: (line: string) => Promise<void>,
  ) => {
    output.push(text);
    const lines = `${partialLines[source]}${text}`.split(/\r?\n/);
    // The last element is whatever follows the final newline; keep it pending.
    partialLines[source] = lines.pop() ?? '';
    if (!handler) return;
    for (const line of lines) {
      await handler(args.redact(line));
    }
  };

  // Decode synchronously in the event handler so bytes are fed to the decoder
  // in arrival order; `stream: true` holds back an incomplete trailing
  // character until the next chunk completes it.
  subprocess.stdout.on('data', (chunk: Buffer) => {
    const text = decoders.stdout.decode(chunk, { stream: true });
    lineHandlers = lineHandlers.then(() =>
      consume(text, 'stdout', args.onStdoutLine),
    );
  });
  subprocess.stderr.on('data', (chunk: Buffer) => {
    const text = decoders.stderr.decode(chunk, { stream: true });
    lineHandlers = lineHandlers.then(() =>
      consume(text, 'stderr', args.onStderrLine),
    );
  });

  // Emits whatever the decoders still hold (only non-empty when a pipe ended
  // in the middle of a multi-byte character).
  const flushDecoders = () => {
    for (const source of ['stdout', 'stderr'] as const) {
      const tail = decoders[source].decode();
      if (tail) {
        output.push(tail);
        partialLines[source] += tail;
      }
    }
  };

  let result: Awaited<typeof subprocess>;
  try {
    result = await subprocess;
  } catch (error) {
    await lineHandlers;
    flushDecoders();
    const outputText = output.join('');
    const message =
      error instanceof Error ? error.message : 'Container command failed.';
    return {
      exitCode: 1,
      output: args.redact(`${outputText}${outputText ? '\n' : ''}${message}`),
    };
  }

  await lineHandlers;
  flushDecoders();
  // Deliver a final line that was not newline-terminated.
  if (partialLines.stdout && args.onStdoutLine) {
    await args.onStdoutLine(args.redact(partialLines.stdout));
  }
  if (partialLines.stderr && args.onStderrLine) {
    await args.onStderrLine(args.redact(partialLines.stderr));
  }
  return normalizeRunResult(result, output.join(''), args.redact);
};
/**
 * Force-removes the named workspace container (`rm -f`).
 *
 * A non-zero exit from the runtime is ignored (`reject: false`).
 */
export const stopWorkspaceContainer = async (containerName: string) => {
  const runtime = containerRuntime();
  const removeArgs = ['rm', '-f', containerName];
  await execa(runtime, removeArgs, { reject: false });
};
/**
 * Checks whether the named container exists by running `<runtime> inspect`.
 *
 * @returns `exists` is true only when the inspect command exits with 0;
 *   `output` is the command's combined output, or an empty string when none
 *   was captured.
 */
export const inspectWorkspaceContainer = async (containerName: string) => {
  const result = await execa(
    containerRuntime(),
    ['inspect', containerName],
    {
      all: true,
      reject: false,
    },
  );
  // execa's types say `all` is a string, but when the runtime fails to spawn
  // it is undefined at runtime; normalize so callers always get a string.
  const combined: string | undefined = result.all;
  return {
    exists: result.exitCode === 0,
    output: combined ?? '',
  };
};
/**
 * Lists the names of all containers (running or not) that start with `prefix`.
 *
 * Runs `<runtime> ps -a --format {{.Names}}` and trims each output line
 * before matching it against the prefix.
 *
 * @returns The matching names, or an empty array when the listing command
 *   does not exit with 0.
 */
export const listWorkspaceContainerNames = async (prefix: string) => {
  const result = await execa(
    containerRuntime(),
    ['ps', '-a', '--format', '{{.Names}}'],
    { all: true, reject: false },
  );
  if (result.exitCode !== 0) return [];

  const matches: string[] = [];
  for (const rawLine of result.all.split('\n')) {
    const name = rawLine.trim();
    if (name.startsWith(prefix)) {
      matches.push(name);
    }
  }
  return matches;
};