Fix agent empty-response in prod: workdir mount, image freshness, error surfacing

- Pin codex@0.142.0 + opencode-ai@1.17.9 in the job image (was @latest,
  causing dev/prod drift)
- Worker now pulls the job image once per process so prod stops
  running a stale Codex
- Surface Codex error/turn.failed events instead of swallowing them, so the
  real failure reason is reported rather than 'no assistant response'
- Harden the Codex JSON parser to also handle the legacy msg-wrapped shape
- Fix the docker-in-docker workdir: bind-mount identical host:container path
  and set SPOON_AGENT_HOST_WORKDIR (named volume can't be mounted by sibling
  job containers)
- Add docs/compose.prod.yml as a documented reference deployment
This commit is contained in:
Gabriel Brown
2026-06-24 05:38:35 -04:00
parent 980a2c07e8
commit 9643cb197b
8 changed files with 315 additions and 8 deletions
+162
View File
@@ -0,0 +1,162 @@
# Production Compose for Spoon
# -----------------------------------------------------------------------------
# Reference deployment for the production host. Copy this to the server and run
# with `docker compose -f compose.prod.yml up -d` (alongside your prod `.env`).
#
# Two things in here are load-bearing for the agent ("run a thread") to work.
# If you change them, read the comments first:
#
# 1. AGENT WORKDIR (spoon-agent-worker): the worker is containerized but
# launches the Codex job container by talking to the HOST Docker daemon.
# The host can only bind-mount real HOST paths, so the work directory MUST
# be a bind mount whose path is IDENTICAL inside and outside the container,
# and SPOON_AGENT_HOST_WORKDIR must match it. A named volume does NOT work
# here because its real host path is hidden from the worker. All three
# references to /var/lib/spoon-agent/work below must stay in sync; change
# them together if you want the data somewhere else.
#
# 2. IMAGE FRESHNESS: services use `pull_policy: always` + Watchtower labels so
# a redeploy / new push always lands. The Codex *job* image is pulled by the
# worker itself on startup (see SPOON_AGENT_JOB_IMAGE); restarting the worker
# (which Watchtower does on a new image) re-pulls a fresh job image.
networks:
  # Pre-existing (external) network that every service in this file joins via
  # `${NETWORK:-nginx-bridge}`. Rename this key to the network you plan to use
  # and keep it in sync with NETWORK in your `.env`.
  nginx-bridge:
    external: true
services:
spoon-next:
  # Web app container. The image name, container name and hostname all derive
  # from NEXT_CONTAINER_NAME, which has no default in this file.
  image: git.gbrown.org/gib/${NEXT_CONTAINER_NAME}:latest
  pull_policy: always
  container_name: ${NEXT_CONTAINER_NAME}
  hostname: ${NEXT_CONTAINER_NAME}
  domainname: ${NEXT_DOMAIN}
  restart: unless-stopped
  tty: true
  stdin_open: true
  networks:
    - '${NETWORK:-nginx-bridge}'
  # No host port is published; uncomment to expose the app directly.
  # ports: ['${NEXT_PORT}:${NEXT_PORT}']
  depends_on:
    - 'spoon-backend'
    - 'spoon-postgres'
  labels:
    - 'com.centurylinklabs.watchtower.enable=true'
  environment:
    # Runtime mode and build/upload credentials.
    - NODE_ENV=${NODE_ENV}
    - SENTRY_AUTH_TOKEN=${SENTRY_AUTH_TOKEN}
    # Public URLs; the defaults point at the in-network backend container.
    - NEXT_PUBLIC_SITE_URL=${NEXT_PUBLIC_SITE_URL:-http://localhost:${NEXT_PORT:-3000}}
    - NEXT_PUBLIC_CONVEX_URL=${NEXT_PUBLIC_CONVEX_URL:-http://${BACKEND_CONTAINER_NAME:-spoon-backend}:${BACKEND_PORT:-3210}}
    - NEXT_PUBLIC_PLAUSIBLE_URL=${NEXT_PUBLIC_PLAUSIBLE_URL:-https://plausible.gbrown.org}
    # Sentry client configuration.
    - NEXT_PUBLIC_SENTRY_DSN=${NEXT_PUBLIC_SENTRY_DSN}
    - NEXT_PUBLIC_SENTRY_URL=${NEXT_PUBLIC_SENTRY_URL}
    - NEXT_PUBLIC_SENTRY_ORG=${NEXT_PUBLIC_SENTRY_ORG:-sentry}
    - NEXT_PUBLIC_SENTRY_PROJECT_NAME=${NEXT_PUBLIC_SENTRY_PROJECT_NAME}
    # Agent worker endpoint and the tokens shared with the worker service.
    - SPOON_AGENT_WORKER_URL=${SPOON_AGENT_WORKER_URL:-http://spoon-agent-worker:3921}
    - SPOON_AGENT_WORKER_INTERNAL_TOKEN=${SPOON_AGENT_WORKER_INTERNAL_TOKEN}
    - SPOON_WORKER_TOKEN=${SPOON_WORKER_TOKEN}
spoon-agent-worker:
  # Agent worker. It talks to the HOST Docker daemon through the mounted
  # socket to launch job containers from SPOON_AGENT_JOB_IMAGE.
  image: git.gbrown.org/gib/spoon-agent-worker:latest
  pull_policy: always
  container_name: spoon-agent-worker
  hostname: spoon-agent-worker
  domainname: worker.${NEXT_DOMAIN:-spoon.gbrown.org}
  restart: unless-stopped
  tty: true
  stdin_open: true
  networks:
    - '${NETWORK:-nginx-bridge}'
  labels:
    - 'com.centurylinklabs.watchtower.enable=true'
  volumes:
    # Host Docker socket: this is how the worker starts sibling containers.
    - /var/run/docker.sock:/var/run/docker.sock
    # The host path and the container path are deliberately identical so the
    # sibling Codex job containers can bind-mount the workspace through the
    # host daemon. Do NOT replace this with a named volume.
    # See header note (1).
    - /var/lib/spoon-agent/work:/var/lib/spoon-agent/work
  environment:
    - GITHUB_APP_ID=${GITHUB_APP_ID}
    - GITHUB_APP_PRIVATE_KEY=${GITHUB_APP_PRIVATE_KEY}
    # NOTE(review): hard-coded, unlike the other values in this service;
    # confirm it should not be read from `.env`.
    - NEXT_PUBLIC_CONVEX_URL=https://api.spoon.gbrown.org
    - SPOON_AGENT_WORKER_ID=${SPOON_AGENT_WORKER_ID:-production-worker}
    - SPOON_AGENT_JOB_IMAGE=${SPOON_AGENT_JOB_IMAGE:-git.gbrown.org/gib/spoon-agent-job:latest}
    - SPOON_AGENT_RUNTIME=docker
    - SPOON_AGENT_NETWORK=${NETWORK:-nginx-bridge}
    # Keep these two values and the work-directory bind mount above on the
    # same absolute path. See header note (1).
    - SPOON_AGENT_WORKDIR=/var/lib/spoon-agent/work
    - SPOON_AGENT_HOST_WORKDIR=/var/lib/spoon-agent/work
    - SPOON_AGENT_WORKER_HTTP_PORT=${SPOON_AGENT_WORKER_HTTP_PORT:-3921}
    - SPOON_AGENT_WORKER_INTERNAL_TOKEN=${SPOON_AGENT_WORKER_INTERNAL_TOKEN}
    - SPOON_AGENT_MAX_CONCURRENT_JOBS=${SPOON_AGENT_MAX_CONCURRENT_JOBS:-1}
    - SPOON_AGENT_JOB_TIMEOUT_MS=${SPOON_AGENT_JOB_TIMEOUT_MS:-1800000}
    - SPOON_WORKER_TOKEN=${SPOON_WORKER_TOKEN}
spoon-backend:
  # Self-hosted Convex backend, configured with POSTGRES_URL.
  image: ghcr.io/get-convex/convex-backend:${BACKEND_TAG:-latest}
  container_name: ${BACKEND_CONTAINER_NAME:-spoon-backend}
  hostname: ${BACKEND_CONTAINER_NAME:-spoon-backend}
  domainname: ${BACKEND_DOMAIN:-convex.spoon.gbrown.org}
  networks:
    - '${NETWORK:-nginx-bridge}'
  # user: '1000:1000'
  # ports: ['${BACKEND_PORT:-3210}:3210','${SITE_PROXY_PORT:-3211}:3211']
  volumes:
    - ./volumes/convex:/convex/data
  pull_policy: always
  environment:
    - INSTANCE_NAME=${INSTANCE_NAME}
    - CONVEX_CLOUD_ORIGIN=${CONVEX_CLOUD_ORIGIN:-http://${BACKEND_CONTAINER_NAME:-spoon-backend}:${BACKEND_PORT:-3210}}
    - CONVEX_SITE_ORIGIN=${CONVEX_SITE_ORIGIN:-http://${BACKEND_CONTAINER_NAME:-spoon-backend}:${SITE_PROXY_PORT:-3211}}
    - DISABLE_BEACON=${DISABLE_BEACON:-true}
    - REDACT_LOGS_TO_CLIENT=${REDACT_LOGS_TO_CLIENT:-true}
    - DO_NOT_REQUIRE_SSL=${DO_NOT_REQUIRE_SSL:-false}
    - POSTGRES_URL=${POSTGRES_URL}
  # Wait until the spoon-postgres healthcheck passes rather than merely until
  # its container has been created, so the backend does not start against a
  # database that is not accepting connections yet.
  depends_on:
    spoon-postgres:
      condition: service_healthy
  labels:
    - 'com.centurylinklabs.watchtower.enable=true'
  stdin_open: true
  tty: true
  restart: unless-stopped
  # spoon-dashboard gates its own startup on this check (service_healthy).
  healthcheck:
    test: curl -f http://localhost:3210/version
    interval: 5s
    start_period: 10s
  stop_grace_period: 10s
  stop_signal: SIGINT
spoon-dashboard:
  # Convex dashboard UI; started only once spoon-backend reports healthy.
  image: ghcr.io/get-convex/convex-dashboard:${DASHBOARD_TAG:-latest}
  container_name: ${DASHBOARD_CONTAINER_NAME:-spoon-dashboard}
  hostname: ${DASHBOARD_CONTAINER_NAME:-spoon-dashboard}
  # NOTE(review): the fallback for BACKEND_DOMAIN here (spoon.gbrown.org)
  # differs from the one used by spoon-backend (convex.spoon.gbrown.org);
  # confirm which variable/default is intended.
  domainname: ${DASHBOARD_DOMAIN:-dashboard.${BACKEND_DOMAIN:-spoon.gbrown.org}}
  networks:
    - '${NETWORK:-nginx-bridge}'
  # user: 1000:1000
  # ports: ['${DASHBOARD_PORT:-6791}:6791']
  pull_policy: always
  environment:
    # The default URL uses BACKEND_PORT, the same variable the other backend
    # URLs in this file use. PORT is still honoured as a fallback so an
    # existing `.env` that sets only PORT keeps working.
    - NEXT_PUBLIC_DEPLOYMENT_URL=${NEXT_PUBLIC_DEPLOYMENT_URL:-http://${BACKEND_CONTAINER_NAME:-spoon-backend}:${BACKEND_PORT:-${PORT:-3210}}}
  depends_on:
    spoon-backend:
      condition: service_healthy
  labels:
    - 'com.centurylinklabs.watchtower.enable=true'
  stdin_open: true
  tty: true
  restart: unless-stopped
  stop_grace_period: 10s
  stop_signal: SIGINT
spoon-postgres:
  # PostgreSQL 17 instance; data is persisted under ./volumes/postgres.
  image: postgres:17
  container_name: ${POSTGRES_CONTAINER_NAME:-spoon-postgres}
  hostname: ${POSTGRES_CONTAINER_NAME:-spoon-postgres}
  domainname: postgres.${NEXT_DOMAIN:-spoon.gbrown.org}
  restart: unless-stopped
  tty: true
  stdin_open: true
  networks:
    - '${NETWORK:-nginx-bridge}'
  # ports: ["5434:5432"]
  labels:
    - 'com.centurylinklabs.watchtower.enable=true'
  volumes:
    - './volumes/postgres:/var/lib/postgresql/data'
  environment:
    - POSTGRES_USER=${POSTGRES_USER:-spoon}
    - POSTGRES_PASSWORD=${POSTGRES_PASSWORD}
    - POSTGRES_DB=${POSTGRES_DB:-spoon_convex}
  healthcheck:
    # `$$` escapes Compose interpolation, so the shell inside the container
    # expands POSTGRES_DB / POSTGRES_USER from the container environment.
    test:
      - 'CMD-SHELL'
      - 'pg_isready -d $${POSTGRES_DB} -U $${POSTGRES_USER}'
    interval: 30s
    timeout: 5s
    retries: 5
    start_period: 20s