From 66f340a7a378429a1fb64d6cd055052ef49ca961 Mon Sep 17 00:00:00 2001 From: dave Date: Thu, 30 Apr 2026 18:19:01 +0000 Subject: [PATCH] fix: prune session_store on stdio abort, respawn cold MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The bug 882 abort-respawn safeguard caps consecutive crashes at 5 then blocks the story — but the underlying stdio abort itself stays unfixed: each respawn calls start_agent which reads session_store.json, finds the prior session id, passes --resume to claude-code, and re-triggers the same crash. Five identical respawns later, the story is blocked. Now: when an abort+no-session exit triggers respawn, we first call session_store::remove_sessions_for_story to drop every entry for the story. The next spawn starts cold (no --resume), which avoids the bloated stdio replay claude-code is choking on. The function was already implemented but #[cfg(test)] only — promoted to a non-test pub fn. Existing remove_sessions_for_story_cleans_up test unchanged and still green. Net effect: instead of "5 retries, then blocked", we get "1 abort, prune, respawn cold, agent runs normally". The story can resume work without losing its worktree state. --- server/src/agents/pool/start/spawn.rs | 11 ++++++++++- server/src/agents/session_store.rs | 8 ++++++-- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/server/src/agents/pool/start/spawn.rs b/server/src/agents/pool/start/spawn.rs index 43238e64..946d6b24 100644 --- a/server/src/agents/pool/start/spawn.rs +++ b/server/src/agents/pool/start/spawn.rs @@ -463,10 +463,19 @@ pub(super) async fn run_agent_spawn( reason, }); } else { + // Prune session_store entries for this story so the next + // spawn starts cold (no `--resume` flag). The crash likely + // came from claude-code choking on the bloated stdio + // replay; resuming again would re-trigger the same abort. + crate::agents::session_store::remove_sessions_for_story( + &project_root_clone, + &sid, + ); slog!( "[agents] CLI crashed before session for '{sid}:{aname}' \ (abort respawn {count}/{ABORT_RESPAWN_CAP}). \ - Respawning without consuming a retry slot." + Pruned session_store and respawning cold without \ + consuming a retry slot." ); let agents_for_respawn = Arc::clone(&agents_ref); let watcher_for_respawn = watcher_tx_clone.clone(); diff --git a/server/src/agents/session_store.rs b/server/src/agents/session_store.rs index 0a3f2c03..717e556f 100644 --- a/server/src/agents/session_store.rs +++ b/server/src/agents/session_store.rs @@ -73,8 +73,12 @@ pub fn lookup_session( read_store(project_root).get(&key).cloned() } -/// Remove all session entries for a story (called when a story reaches done/archived). -#[cfg(test)] +/// Remove all session entries for a story. +/// +/// Called when the story reaches done/archived, OR when claude-code keeps +/// crashing on session resume — in the latter case the next spawn must start +/// cold (no `--resume` flag) so the bloated stdio replay doesn't re-trigger +/// the same abort. See bug 882 follow-up. pub fn remove_sessions_for_story(project_root: &Path, story_id: &str) { let mut data = read_store(project_root); let prefix = format!("{story_id}:");