From c4010854a52a9554ebbfae951c4bde3ed88002d2 Mon Sep 17 00:00:00 2001 From: dave Date: Fri, 15 May 2026 11:34:50 +0000 Subject: [PATCH] =?UTF-8?q?huskies:=20merge=201089=20bug=20Stuck-agent=20d?= =?UTF-8?q?etector=20blocks=20stories=20on=20legitimate=20exploration=20/?= =?UTF-8?q?=20debugging=20=E2=80=94=20uses=20too=20narrow=20a=20"progress"?= =?UTF-8?q?=20signal?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../src/agents/pool/pipeline/advance/mod.rs | 194 +++++++++++++++--- .../pool/pipeline/advance/tests_regression.rs | 4 +- server/src/agents/pool/start/spawn.rs | 15 +- server/src/agents/runtime/claude_code.rs | 2 + server/src/agents/runtime/gemini/mod.rs | 4 + server/src/agents/runtime/mod.rs | 8 + server/src/agents/runtime/openai.rs | 3 + server/src/db/content_store.rs | 15 ++ 8 files changed, 217 insertions(+), 28 deletions(-) diff --git a/server/src/agents/pool/pipeline/advance/mod.rs b/server/src/agents/pool/pipeline/advance/mod.rs index cc128b58..c71332ab 100644 --- a/server/src/agents/pool/pipeline/advance/mod.rs +++ b/server/src/agents/pool/pipeline/advance/mod.rs @@ -78,21 +78,34 @@ impl AgentPool { // The coder exited with uncommitted content but no commits // (typical "claude-code session boundary mid-sweep" pattern). // Use a PROGRESS-AWARE retry cap: the agent gets unlimited - // respawns as long as file edits keep growing between - // attempts; only when the worktree diff is byte-identical - // to the previous attempt do we count it as "no progress". - // After NO_PROGRESS_CAP consecutive no-progress respawns, - // block for human attention. + // respawns as long as progress is being made between attempts. + // Progress is satisfied if EITHER (a) the worktree diff grew, + // OR (b) the set of files the agent read grew. Raw tool-call + // count does NOT count — a looping agent can produce many calls. + // Only self-exited sessions with no file or read progress count + // toward the cap; forced exits (API error, network, budget + // exhaustion) are excluded (story 1089). + // After NO_PROGRESS_CAP consecutive qualifying no-progress + // respawns, block for human attention. // // TOTAL_ATTEMPTS_CAP is the OUTER bound: even if the agent // keeps making file-edit progress every session, after this - // many total respawns without a commit we escalate — caught - // the "agent flaps between different edits but never - // commits" pattern that the progress-aware counter would - // never trigger. + // many total respawns without a commit we escalate — catches + // the "agent flaps between different edits but never commits" + // pattern that the progress-aware counter would never trigger. const NO_PROGRESS_CAP: u32 = 3; const TOTAL_ATTEMPTS_CAP: u32 = 8; + // AC1: consume the forced-exit flag written by spawn.rs when + // the agent process exited with a non-zero code. + let forced_exit = crate::db::read_content( + crate::db::ContentKey::CommitRecoveryForcedExit(story_id), + ) + .is_some(); + crate::db::delete_content(crate::db::ContentKey::CommitRecoveryForcedExit( + story_id, + )); + let current_fingerprint = worktree_path.as_deref().and_then(|p| { std::process::Command::new("git") .args(["diff", "master"]) @@ -104,18 +117,31 @@ impl AgentPool { let stored_fingerprint = crate::db::read_content( crate::db::ContentKey::CommitRecoveryDiffFingerprint(story_id), ); - let made_progress = current_fingerprint.is_some() + let diff_progress = current_fingerprint.is_some() && stored_fingerprint.as_ref() != current_fingerprint.as_ref(); - let no_progress_count = if made_progress || stored_fingerprint.is_none() { + + // AC2: check read-file set progress as an additional signal. + let read_progress = previous_session_id.as_deref().is_some_and(|session_id| { + collect_read_progress(&project_root, story_id, agent_name, session_id) + }); + + let made_progress = diff_progress || read_progress; + + let prev_no_progress_count = crate::db::read_content( + crate::db::ContentKey::CommitRecoveryPending(story_id), + ) + .and_then(|s| s.trim().parse::().ok()) + .unwrap_or(0); + + // AC1: forced exits do not increment the stuck-respawn counter. + let no_progress_count = if forced_exit { + prev_no_progress_count + } else if made_progress || stored_fingerprint.is_none() { 1 } else { - crate::db::read_content(crate::db::ContentKey::CommitRecoveryPending( - story_id, - )) - .and_then(|s| s.trim().parse::().ok()) - .unwrap_or(0) - + 1 + prev_no_progress_count + 1 }; + let total_attempts = crate::db::read_content( crate::db::ContentKey::CommitRecoveryTotalAttempts(story_id), ) @@ -136,13 +162,17 @@ impl AgentPool { crate::db::delete_content( crate::db::ContentKey::CommitRecoveryTotalAttempts(story_id), ); + crate::db::delete_content(crate::db::ContentKey::CommitRecoveryReadSet( + story_id, + )); slog!( "[pipeline] Coder '{agent_name}' for '{story_id}' hit total \ commit-recovery cap ({total_attempts}/{TOTAL_ATTEMPTS_CAP}) \ without a commit. Blocking story." ); let reason = format!( - "agent flapped — {total_attempts} respawns without ever committing" + "commit absent after {total_attempts} respawns — \ + agent kept making edits but never committed" ); if let Err(e) = crate::agents::lifecycle::transition_to_blocked(story_id, &reason) @@ -167,14 +197,18 @@ impl AgentPool { crate::db::delete_content( crate::db::ContentKey::CommitRecoveryTotalAttempts(story_id), ); + crate::db::delete_content(crate::db::ContentKey::CommitRecoveryReadSet( + story_id, + )); slog!( "[pipeline] Coder '{agent_name}' for '{story_id}' made no \ - file-edit progress over {no_progress_count} consecutive \ - commit-recovery respawns. Blocking story." + file or read progress over {no_progress_count} consecutive \ + self-exit commit-recovery respawns. Blocking story." ); + // AC4: block message names the specific cause. let reason = format!( - "agent stuck — {no_progress_count} respawns without commits or \ - new file edits" + "stuck-respawn cap reached: {NO_PROGRESS_CAP} consecutive \ + self-exits with no file or read progress" ); if let Err(e) = crate::agents::lifecycle::transition_to_blocked(story_id, &reason) @@ -206,7 +240,8 @@ impl AgentPool { "[pipeline] Coder '{agent_name}' exited with uncommitted work \ for '{story_id}' (no-progress {no_progress_count}/\ {NO_PROGRESS_CAP}, total {total_attempts}/\ - {TOTAL_ATTEMPTS_CAP}; progress_made={made_progress}). \ + {TOTAL_ATTEMPTS_CAP}; diff_progress={diff_progress}, \ + read_progress={read_progress}, forced_exit={forced_exit}). \ Issuing commit-only respawn." ); let addendum = "\n\nYou have uncommitted work in this worktree. \ @@ -302,10 +337,13 @@ impl AgentPool { }); } } else if completion.gates_passed { - // Clear any stale recovery key when the coder succeeds normally. + // Clear any stale recovery keys when the coder succeeds normally. crate::db::delete_content(crate::db::ContentKey::CommitRecoveryPending( story_id, )); + crate::db::delete_content(crate::db::ContentKey::CommitRecoveryReadSet( + story_id, + )); // Determine effective QA mode for this story. let qa_mode = { let item_type = crate::agents::lifecycle::item_type_from_id(story_id); @@ -361,11 +399,14 @@ impl AgentPool { } } } else { - // Clear any stale recovery key when gates fail normally (agent committed + // Clear any stale recovery keys when gates fail normally (agent committed // but the build is broken — treat as a standard retry, not a recovery). crate::db::delete_content(crate::db::ContentKey::CommitRecoveryPending( story_id, )); + crate::db::delete_content(crate::db::ContentKey::CommitRecoveryReadSet( + story_id, + )); // Bug 645 / 668: Before retry/block, check if the agent left committed // work AND the agent had a passing run_tests result captured during its // session. An agent may crash mid-output (e.g. Claude Code CLI PTY write @@ -724,6 +765,109 @@ mod helpers; use helpers::{resolve_qa_mode_from_store, write_review_hold_to_store}; pub(crate) use helpers::{should_block_story, spawn_pipeline_advance}; +/// Parse a huskies agent log and return the set of file paths passed to the +/// Read tool in that session. Returns an empty set if the log cannot be read. +/// +/// Used by [`collect_read_progress`] to detect read-exploration progress even +/// when the worktree diff did not grow (story 1089, AC2). +fn collect_read_files_from_log( + project_root: &std::path::Path, + story_id: &str, + agent_name: &str, + session_id: &str, +) -> std::collections::HashSet { + let log_path = crate::agent_log::log_file_path(project_root, story_id, agent_name, session_id); + let mut files = std::collections::HashSet::new(); + + let log_text = match std::fs::read_to_string(&log_path) { + Ok(t) => t, + Err(_) => return files, + }; + + for line in log_text.lines() { + let trimmed = line.trim(); + if trimmed.is_empty() { + continue; + } + let entry: serde_json::Value = match serde_json::from_str(trimmed) { + Ok(v) => v, + Err(_) => continue, + }; + // Only look at agent_json events where data.type == "assistant". + if entry.get("type").and_then(|t| t.as_str()) != Some("agent_json") { + continue; + } + let data = match entry.get("data") { + Some(d) => d, + None => continue, + }; + if data.get("type").and_then(|t| t.as_str()) != Some("assistant") { + continue; + } + let content = match data.pointer("/message/content").and_then(|c| c.as_array()) { + Some(c) => c, + None => continue, + }; + for item in content { + if item.get("type").and_then(|t| t.as_str()) != Some("tool_use") { + continue; + } + if item.get("name").and_then(|n| n.as_str()) != Some("Read") { + continue; + } + if let Some(path) = item.pointer("/input/file_path").and_then(|p| p.as_str()) { + files.insert(path.to_string()); + } + } + } + + files +} + +/// Return `true` if the agent read any files in `session_id` that were not in +/// the cumulative read set for `story_id`. Updates the stored cumulative set +/// when new files are found (story 1089, AC2). +fn collect_read_progress( + project_root: &std::path::Path, + story_id: &str, + agent_name: &str, + session_id: &str, +) -> bool { + let session_files = collect_read_files_from_log(project_root, story_id, agent_name, session_id); + if session_files.is_empty() { + return false; + } + + let stored_set: std::collections::HashSet = + crate::db::read_content(crate::db::ContentKey::CommitRecoveryReadSet(story_id)) + .map(|s| { + s.lines() + .filter(|l| !l.is_empty()) + .map(str::to_string) + .collect() + }) + .unwrap_or_default(); + + let union: std::collections::HashSet = + stored_set.union(&session_files).cloned().collect(); + + if union.len() > stored_set.len() { + let mut sorted: Vec<&String> = union.iter().collect(); + sorted.sort(); + crate::db::write_content( + crate::db::ContentKey::CommitRecoveryReadSet(story_id), + &sorted + .into_iter() + .map(String::as_str) + .collect::>() + .join("\n"), + ); + true + } else { + false + } +} + #[cfg(test)] mod tests; #[cfg(test)] diff --git a/server/src/agents/pool/pipeline/advance/tests_regression.rs b/server/src/agents/pool/pipeline/advance/tests_regression.rs index feccd9b6..2defcd6d 100644 --- a/server/src/agents/pool/pipeline/advance/tests_regression.rs +++ b/server/src/agents/pool/pipeline/advance/tests_regression.rs @@ -1077,7 +1077,7 @@ stage = "coder" "Story must be blocked after NO_PROGRESS_CAP consecutive no-progress respawns" ); assert!( - block_reason.contains("without commits or new file edits"), + block_reason.contains("self-exits with no file or read progress"), "Block reason should describe the no-progress condition, got: {block_reason}" ); @@ -1193,7 +1193,7 @@ stage = "coder" "Story must be blocked once total commit-recovery attempts hits the outer cap" ); assert!( - block_reason.contains("flapped") && block_reason.contains("without ever committing"), + block_reason.contains("commit absent") && block_reason.contains("never committed"), "Block reason should describe the flapping pattern, got: {block_reason}" ); diff --git a/server/src/agents/pool/start/spawn.rs b/server/src/agents/pool/start/spawn.rs index 130e9f9e..313b2db8 100644 --- a/server/src/agents/pool/start/spawn.rs +++ b/server/src/agents/pool/start/spawn.rs @@ -808,6 +808,7 @@ pub(super) async fn run_agent_spawn( crate::db::delete_content(crate::db::ContentKey::CommitRecoveryTotalAttempts( &sid, )); + crate::db::delete_content(crate::db::ContentKey::CommitRecoveryReadSet(&sid)); // Remove agent from the pool and unblock any wait_for_agent callers. let tx_done = { @@ -873,6 +874,17 @@ pub(super) async fn run_agent_spawn( return; } + // AC1 (story 1089): mark forced exits so the commit-recovery + // stuck counter is not incremented for API errors, network + // failures, or Claude-API budget exhaustion. A non-zero exit + // code means the CLI was forced out, not that it chose to stop. + if !result.exit_ok { + crate::db::write_content( + crate::db::ContentKey::CommitRecoveryForcedExit(&sid), + "1", + ); + } + // Server-owned completion: run acceptance gates automatically // when the agent process exits normally. super::super::pipeline::run_server_owned_completion( @@ -1246,12 +1258,13 @@ mod tests { "abc123", ); - // Rate-limit exit handler: reset all three counters (the fix). + // Rate-limit exit handler: reset all counters (the fix). crate::db::delete_content(crate::db::ContentKey::CommitRecoveryPending(story_id)); crate::db::delete_content(crate::db::ContentKey::CommitRecoveryDiffFingerprint( story_id, )); crate::db::delete_content(crate::db::ContentKey::CommitRecoveryTotalAttempts(story_id)); + crate::db::delete_content(crate::db::ContentKey::CommitRecoveryReadSet(story_id)); // CommitRecoveryPending must be cleared after each rate-limit exit. assert!( diff --git a/server/src/agents/runtime/claude_code.rs b/server/src/agents/runtime/claude_code.rs index 3e9a4dae..0138c6b7 100644 --- a/server/src/agents/runtime/claude_code.rs +++ b/server/src/agents/runtime/claude_code.rs @@ -59,6 +59,7 @@ impl AgentRuntime for ClaudeCodeRuntime { // Abort+no-session: CLI crashed (e.g. SIGABRT) before emitting its // first "system" event. Detected by: non-zero exit AND no session. aborted_signal: !result.exit_ok && result.session_id.is_none(), + exit_ok: result.exit_ok, session_id: result.session_id, token_usage: result.token_usage, rate_limit_exit: result.rate_limit_exit, @@ -92,6 +93,7 @@ impl AgentRuntime for ClaudeCodeRuntime { Ok(RuntimeResult { aborted_signal: !fallback_result.exit_ok && fallback_result.session_id.is_none(), + exit_ok: fallback_result.exit_ok, session_id: fallback_result.session_id, token_usage: fallback_result.token_usage, rate_limit_exit: fallback_result.rate_limit_exit, diff --git a/server/src/agents/runtime/gemini/mod.rs b/server/src/agents/runtime/gemini/mod.rs index d7841556..88e7cf62 100644 --- a/server/src/agents/runtime/gemini/mod.rs +++ b/server/src/agents/runtime/gemini/mod.rs @@ -135,6 +135,7 @@ impl AgentRuntime for GeminiRuntime { return Ok(RuntimeResult { session_id: None, token_usage: Some(total_usage), + exit_ok: true, aborted_signal: false, rate_limit_exit: false, rate_limit_reset_at: None, @@ -151,6 +152,7 @@ impl AgentRuntime for GeminiRuntime { return Ok(RuntimeResult { session_id: None, token_usage: Some(total_usage), + exit_ok: true, aborted_signal: false, rate_limit_exit: false, rate_limit_reset_at: None, @@ -254,6 +256,7 @@ impl AgentRuntime for GeminiRuntime { return Ok(RuntimeResult { session_id: None, token_usage: Some(total_usage), + exit_ok: true, aborted_signal: false, rate_limit_exit: false, rate_limit_reset_at: None, @@ -339,6 +342,7 @@ impl AgentRuntime for GeminiRuntime { Ok(RuntimeResult { session_id: None, token_usage: Some(total_usage), + exit_ok: true, aborted_signal: false, rate_limit_exit: false, rate_limit_reset_at: None, diff --git a/server/src/agents/runtime/mod.rs b/server/src/agents/runtime/mod.rs index 41996035..6e9d89fb 100644 --- a/server/src/agents/runtime/mod.rs +++ b/server/src/agents/runtime/mod.rs @@ -55,6 +55,12 @@ pub struct RuntimeContext { pub struct RuntimeResult { pub session_id: Option, pub token_usage: Option, + /// `true` when the process exited with exit code 0; `false` for non-zero exits + /// (API errors, network failures, or Claude-API-level budget exhaustion). Always + /// `true` for API-based runtimes (OpenAI, Gemini) which have no exit-code concept. + /// Used by the commit-recovery path to skip the stuck-respawn counter for forced + /// exits (story 1089, AC1). + pub exit_ok: bool, /// `true` when the process exited with a failure AND no session was established. /// /// This indicates the Claude Code CLI crashed (e.g. SIGABRT from an assertion @@ -169,6 +175,7 @@ mod tests { cache_read_input_tokens: 0, total_cost_usd: 0.01, }), + exit_ok: true, aborted_signal: false, rate_limit_exit: false, rate_limit_reset_at: None, @@ -186,6 +193,7 @@ mod tests { let result = RuntimeResult { session_id: None, token_usage: None, + exit_ok: true, aborted_signal: false, rate_limit_exit: false, rate_limit_reset_at: None, diff --git a/server/src/agents/runtime/openai.rs b/server/src/agents/runtime/openai.rs index 7a54dbf2..f251b26b 100644 --- a/server/src/agents/runtime/openai.rs +++ b/server/src/agents/runtime/openai.rs @@ -122,6 +122,7 @@ impl AgentRuntime for OpenAiRuntime { return Ok(RuntimeResult { session_id: None, token_usage: Some(total_usage), + exit_ok: true, aborted_signal: false, rate_limit_exit: false, rate_limit_reset_at: None, @@ -138,6 +139,7 @@ impl AgentRuntime for OpenAiRuntime { return Ok(RuntimeResult { session_id: None, token_usage: Some(total_usage), + exit_ok: true, aborted_signal: false, rate_limit_exit: false, rate_limit_reset_at: None, @@ -224,6 +226,7 @@ impl AgentRuntime for OpenAiRuntime { return Ok(RuntimeResult { session_id: None, token_usage: Some(total_usage), + exit_ok: true, aborted_signal: false, rate_limit_exit: false, rate_limit_reset_at: None, diff --git a/server/src/db/content_store.rs b/server/src/db/content_store.rs index 143f8fc6..3fd53d31 100644 --- a/server/src/db/content_store.rs +++ b/server/src/db/content_store.rs @@ -60,6 +60,17 @@ pub enum ContentKey<'a> { /// completion. Read by `get_merge_status` to surface gate output for the /// "completed" state without a separate MergeJob CRDT register (story 1036). MergeReport(&'a str), + /// Flag written by spawn.rs when a coder session exits with a non-zero exit + /// code (API error, network failure, or Claude-API-level budget exhaustion). + /// Prevents the stuck-respawn counter from incrementing for forced exits — + /// only self-exits with no file or read changes count toward the cap. + /// Consumed (read + deleted) by the commit-recovery path in pipeline advance. + CommitRecoveryForcedExit(&'a str), + /// Cumulative set of files read across all commit-recovery sessions for a + /// story, stored as a newline-separated sorted list. Used to detect whether + /// the agent made read-exploration progress even when the worktree diff did + /// not grow (story 1089, AC2). Cleared when a commit lands or the story blocks. + CommitRecoveryReadSet(&'a str), } impl<'a> ContentKey<'a> { @@ -85,6 +96,10 @@ impl<'a> ContentKey<'a> { ContentKey::MergeFailureKind(id) => format!("{id}:merge_failure_kind"), ContentKey::MergeSuccess(id) => format!("{id}:merge_success"), ContentKey::MergeReport(id) => format!("{id}:merge_report"), + ContentKey::CommitRecoveryForcedExit(id) => { + format!("{id}:commit_recovery_forced_exit") + } + ContentKey::CommitRecoveryReadSet(id) => format!("{id}:commit_recovery_read_set"), } } }