From b0de86767a2280d391fc244daca72dbfb99be138 Mon Sep 17 00:00:00 2001 From: dave Date: Thu, 30 Apr 2026 00:31:08 +0000 Subject: [PATCH] huskies: merge 882 --- server/src/agents/pool/start/spawn.rs | 144 ++++++++++++++++++++++- server/src/agents/pty/runner.rs | 7 +- server/src/agents/pty/types.rs | 3 + server/src/agents/runtime/claude_code.rs | 5 + server/src/agents/runtime/gemini/mod.rs | 4 + server/src/agents/runtime/mod.rs | 10 ++ server/src/agents/runtime/openai.rs | 3 + 7 files changed, 173 insertions(+), 3 deletions(-) diff --git a/server/src/agents/pool/start/spawn.rs b/server/src/agents/pool/start/spawn.rs index d0ba5afe..43238e64 100644 --- a/server/src/agents/pool/start/spawn.rs +++ b/server/src/agents/pool/start/spawn.rs @@ -15,7 +15,7 @@ use crate::agent_log::AgentLogWriter; use crate::config::ProjectConfig; use crate::http::context::AppContext; use crate::io::watcher::WatcherEvent; -use crate::slog_error; +use crate::{slog, slog_error}; use super::super::super::runtime::{ AgentRuntime, ClaudeCodeRuntime, GeminiRuntime, OpenAiRuntime, RuntimeContext, @@ -413,6 +413,95 @@ pub(super) async fn run_agent_spawn( .find_agent(&aname) .map(agent_config_stage) .unwrap_or_else(|| pipeline_stage(&aname)); + + // AC1/AC2 (bug 882): CLI crashed (SIGABRT) before establishing a + // session. Respawn immediately without running gates or incrementing + // retry_count. Cap consecutive crash-respawns at 5 to avoid + // infinite loops; after the cap, block the story with a clear reason. + if result.aborted_signal && stage != PipelineStage::Mergemaster { + const ABORT_RESPAWN_CAP: u32 = 5; + let db_key = format!("{sid}:abort_respawn_count"); + let count = crate::db::read_content(&db_key) + .and_then(|s| s.trim().parse::().ok()) + .unwrap_or(0) + + 1; + crate::db::write_content(&db_key, &count.to_string()); + + // Remove the agent entry from the pool and emit Done so that + // any caller blocked on wait_for_agent is unblocked. + let tx_done = { + let mut lock = match agents_ref.lock() { + Ok(a) => a, + Err(_) => return, + }; + if let Some(agent) = lock.remove(&key_clone) { + agent.tx + } else { + tx_clone.clone() + } + }; + let _ = tx_done.send(AgentEvent::Done { + story_id: sid.clone(), + agent_name: aname.clone(), + session_id: None, + }); + AgentPool::notify_agent_state_changed(&watcher_tx_clone); + + if count >= ABORT_RESPAWN_CAP { + let reason = format!( + "CLI crashed before establishing a session (signal=Aborted, no session) \ + {count} times in a row. Stopping to avoid an infinite respawn loop." + ); + slog_error!( + "[agents] Story '{sid}' blocked after {count} consecutive CLI crashes." + ); + if let Err(e) = crate::agents::lifecycle::transition_to_blocked(&sid, &reason) { + slog_error!("[agents] Failed to block '{sid}' after abort cap: {e}"); + } + let _ = watcher_tx_clone.send(WatcherEvent::StoryBlocked { + story_id: sid.clone(), + reason, + }); + } else { + slog!( + "[agents] CLI crashed before session for '{sid}:{aname}' \ + (abort respawn {count}/{ABORT_RESPAWN_CAP}). \ + Respawning without consuming a retry slot." + ); + let agents_for_respawn = Arc::clone(&agents_ref); + let watcher_for_respawn = watcher_tx_clone.clone(); + let sid_r = sid.clone(); + let aname_r = aname.clone(); + let root_r = project_root_clone.clone(); + let port_r = port_for_task; + tokio::spawn(async move { + let pool = AgentPool { + agents: agents_for_respawn, + port: port_r, + child_killers: Arc::new(Mutex::new(HashMap::new())), + watcher_tx: watcher_for_respawn, + status_broadcaster: Arc::new( + crate::service::status::StatusBroadcaster::new(), + ), + }; + if let Err(e) = pool + .start_agent(&root_r, &sid_r, Some(&aname_r), None, None) + .await + { + slog_error!( + "[agents] Failed to respawn '{aname_r}' for '{sid_r}' \ + after CLI crash: {e}" + ); + } + }); + } + return; + } + + // Reset the abort-respawn counter on any non-aborted exit so that + // a single successful run clears the consecutive-crash history. + crate::db::delete_content(&format!("{sid}:abort_respawn_count")); + if stage == PipelineStage::Mergemaster { let (tx_done, done_session_id) = { let mut lock = match agents_ref.lock() { @@ -582,4 +671,57 @@ mod tests { "gate_output must appear in value" ); } + + /// AC3 (bug 882): simulates the abort-respawn counter mechanism to verify that + /// retry_count is never bumped during consecutive aborted+no-session exits and + /// that the abort counter reaches the cap (5) before blocking. + #[test] + fn abort_respawn_leaves_retry_count_unchanged_and_caps_at_five() { + crate::crdt_state::init_for_test(); + crate::db::ensure_content_store(); + + let story_id = "9962_story_abort_respawn_882"; + crate::db::write_item_with_content(story_id, "2_current", "---\nname: Test\n---\n"); + + let db_key = format!("{story_id}:abort_respawn_count"); + const CAP: u32 = 5; + + // Simulate CAP consecutive abort-before-session exits. + for expected_count in 1u32..=CAP { + // This is exactly the counter logic in run_agent_spawn's abort path. + let count = crate::db::read_content(&db_key) + .and_then(|s| s.trim().parse::().ok()) + .unwrap_or(0) + + 1; + crate::db::write_content(&db_key, &count.to_string()); + assert_eq!( + count, expected_count, + "abort counter must increment by 1 each time" + ); + + // retry_count must remain 0 — the abort path never calls bump_retry_count. + let retry_count = crate::crdt_state::read_item(story_id) + .and_then(|item| item.retry_count) + .map(|r| r as u32) + .unwrap_or(0); + assert_eq!( + retry_count, 0, + "retry_count must not be incremented by the abort-respawn path \ + (got {retry_count} on cycle {expected_count})" + ); + } + + // After CAP cycles the counter equals the cap — the story would be blocked. + let final_count: u32 = crate::db::read_content(&db_key) + .and_then(|s| s.trim().parse().ok()) + .unwrap_or(0); + assert_eq!( + final_count, CAP, + "counter must equal {CAP} after {CAP} abort cycles" + ); + assert!( + final_count >= CAP, + "a count of {final_count} triggers blocking (>= {CAP})" + ); + } } diff --git a/server/src/agents/pty/runner.rs b/server/src/agents/pty/runner.rs index cd94625c..3c000edf 100644 --- a/server/src/agents/pty/runner.rs +++ b/server/src/agents/pty/runner.rs @@ -399,14 +399,16 @@ fn run_agent_pty_blocking( let _ = child.kill(); let wait_result = child.wait(); - match &wait_result { + let exit_ok = match &wait_result { Ok(status) => { slog!("[agent:{story_id}:{agent_name}] Child exited: {status:?}"); + status.success() } Err(e) => { slog!("[agent:{story_id}:{agent_name}] Child wait error: {e}"); + false } - } + }; // Wait for the reader thread to finish so it releases the cloned PTY // master fd before we return. Without this, the next PTY spawn for the @@ -434,5 +436,6 @@ fn run_agent_pty_blocking( Ok(PtyResult { session_id, token_usage, + exit_ok, }) } diff --git a/server/src/agents/pty/types.rs b/server/src/agents/pty/types.rs index b628e3b3..6f8a7cf3 100644 --- a/server/src/agents/pty/types.rs +++ b/server/src/agents/pty/types.rs @@ -10,6 +10,9 @@ use crate::agents::TokenUsage; pub(in crate::agents) struct PtyResult { pub session_id: Option, pub token_usage: Option, + /// `true` if the child process exited with a zero exit code (normal completion). + /// `false` if the process was killed by a signal or exited with a non-zero code. + pub exit_ok: bool, } pub(super) fn composite_key(story_id: &str, agent_name: &str) -> String { diff --git a/server/src/agents/runtime/claude_code.rs b/server/src/agents/runtime/claude_code.rs index 06e70e1a..a5f35239 100644 --- a/server/src/agents/runtime/claude_code.rs +++ b/server/src/agents/runtime/claude_code.rs @@ -61,6 +61,9 @@ impl AgentRuntime for ClaudeCodeRuntime { match pty_result { Ok(result) => Ok(RuntimeResult { + // Abort+no-session: CLI crashed (e.g. SIGABRT) before emitting its + // first "system" event. Detected by: non-zero exit AND no session. + aborted_signal: !result.exit_ok && result.session_id.is_none(), session_id: result.session_id, token_usage: result.token_usage, }), @@ -90,6 +93,8 @@ impl AgentRuntime for ClaudeCodeRuntime { ) .await?; Ok(RuntimeResult { + aborted_signal: !fallback_result.exit_ok + && fallback_result.session_id.is_none(), session_id: fallback_result.session_id, token_usage: fallback_result.token_usage, }) diff --git a/server/src/agents/runtime/gemini/mod.rs b/server/src/agents/runtime/gemini/mod.rs index d7c8a963..d6e8310a 100644 --- a/server/src/agents/runtime/gemini/mod.rs +++ b/server/src/agents/runtime/gemini/mod.rs @@ -135,6 +135,7 @@ impl AgentRuntime for GeminiRuntime { return Ok(RuntimeResult { session_id: None, token_usage: Some(total_usage), + aborted_signal: false, }); } @@ -148,6 +149,7 @@ impl AgentRuntime for GeminiRuntime { return Ok(RuntimeResult { session_id: None, token_usage: Some(total_usage), + aborted_signal: false, }); } @@ -248,6 +250,7 @@ impl AgentRuntime for GeminiRuntime { return Ok(RuntimeResult { session_id: None, token_usage: Some(total_usage), + aborted_signal: false, }); } @@ -330,6 +333,7 @@ impl AgentRuntime for GeminiRuntime { Ok(RuntimeResult { session_id: None, token_usage: Some(total_usage), + aborted_signal: false, }) } diff --git a/server/src/agents/runtime/mod.rs b/server/src/agents/runtime/mod.rs index 9d5cfd60..9e1b1dc6 100644 --- a/server/src/agents/runtime/mod.rs +++ b/server/src/agents/runtime/mod.rs @@ -47,6 +47,14 @@ pub struct RuntimeContext { pub struct RuntimeResult { pub session_id: Option, pub token_usage: Option, + /// `true` when the process exited with a failure AND no session was established. + /// + /// This indicates the Claude Code CLI crashed (e.g. SIGABRT from an assertion + /// failure) before it could emit its first "system" event — the classic + /// "signal=Aborted, Session: None" case (bug 882). The completion handler + /// uses this flag to skip acceptance gates and respawn without consuming a + /// retry slot. Always `false` for API-based runtimes (Gemini, OpenAI). + pub aborted_signal: bool, } /// Runtime status reported by the backend. @@ -138,6 +146,7 @@ mod tests { cache_read_input_tokens: 0, total_cost_usd: 0.01, }), + aborted_signal: false, }; assert_eq!(result.session_id, Some("sess-123".to_string())); assert!(result.token_usage.is_some()); @@ -152,6 +161,7 @@ mod tests { let result = RuntimeResult { session_id: None, token_usage: None, + aborted_signal: false, }; assert!(result.session_id.is_none()); assert!(result.token_usage.is_none()); diff --git a/server/src/agents/runtime/openai.rs b/server/src/agents/runtime/openai.rs index cf267f8d..2840052c 100644 --- a/server/src/agents/runtime/openai.rs +++ b/server/src/agents/runtime/openai.rs @@ -122,6 +122,7 @@ impl AgentRuntime for OpenAiRuntime { return Ok(RuntimeResult { session_id: None, token_usage: Some(total_usage), + aborted_signal: false, }); } @@ -135,6 +136,7 @@ impl AgentRuntime for OpenAiRuntime { return Ok(RuntimeResult { session_id: None, token_usage: Some(total_usage), + aborted_signal: false, }); } @@ -218,6 +220,7 @@ impl AgentRuntime for OpenAiRuntime { return Ok(RuntimeResult { session_id: None, token_usage: Some(total_usage), + aborted_signal: false, }); }