huskies: merge 1089 bug Stuck-agent detector blocks stories on legitimate exploration / debugging — uses too narrow a "progress" signal

This commit is contained in:
dave
2026-05-15 11:34:50 +00:00
parent fb1311cdae
commit c4010854a5
8 changed files with 217 additions and 28 deletions
+169 -25
View File
@@ -78,21 +78,34 @@ impl AgentPool {
// The coder exited with uncommitted content but no commits // The coder exited with uncommitted content but no commits
// (typical "claude-code session boundary mid-sweep" pattern). // (typical "claude-code session boundary mid-sweep" pattern).
// Use a PROGRESS-AWARE retry cap: the agent gets unlimited // Use a PROGRESS-AWARE retry cap: the agent gets unlimited
// respawns as long as file edits keep growing between // respawns as long as progress is being made between attempts.
// attempts; only when the worktree diff is byte-identical // Progress is satisfied if EITHER (a) the worktree diff grew,
// to the previous attempt do we count it as "no progress". // OR (b) the set of files the agent read grew. Raw tool-call
// After NO_PROGRESS_CAP consecutive no-progress respawns, // count does NOT count — a looping agent can produce many calls.
// block for human attention. // Only self-exited sessions with no file or read progress count
// toward the cap; forced exits (API error, network, budget
// exhaustion) are excluded (story 1089).
// After NO_PROGRESS_CAP consecutive qualifying no-progress
// respawns, block for human attention.
// //
// TOTAL_ATTEMPTS_CAP is the OUTER bound: even if the agent // TOTAL_ATTEMPTS_CAP is the OUTER bound: even if the agent
// keeps making file-edit progress every session, after this // keeps making file-edit progress every session, after this
// many total respawns without a commit we escalate — caught // many total respawns without a commit we escalate — catches
// the "agent flaps between different edits but never // the "agent flaps between different edits but never commits"
// commits" pattern that the progress-aware counter would // pattern that the progress-aware counter would never trigger.
// never trigger.
const NO_PROGRESS_CAP: u32 = 3; const NO_PROGRESS_CAP: u32 = 3;
const TOTAL_ATTEMPTS_CAP: u32 = 8; const TOTAL_ATTEMPTS_CAP: u32 = 8;
// AC1: consume the forced-exit flag written by spawn.rs when
// the agent process exited with a non-zero code.
let forced_exit = crate::db::read_content(
crate::db::ContentKey::CommitRecoveryForcedExit(story_id),
)
.is_some();
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryForcedExit(
story_id,
));
let current_fingerprint = worktree_path.as_deref().and_then(|p| { let current_fingerprint = worktree_path.as_deref().and_then(|p| {
std::process::Command::new("git") std::process::Command::new("git")
.args(["diff", "master"]) .args(["diff", "master"])
@@ -104,18 +117,31 @@ impl AgentPool {
let stored_fingerprint = crate::db::read_content( let stored_fingerprint = crate::db::read_content(
crate::db::ContentKey::CommitRecoveryDiffFingerprint(story_id), crate::db::ContentKey::CommitRecoveryDiffFingerprint(story_id),
); );
let made_progress = current_fingerprint.is_some() let diff_progress = current_fingerprint.is_some()
&& stored_fingerprint.as_ref() != current_fingerprint.as_ref(); && stored_fingerprint.as_ref() != current_fingerprint.as_ref();
let no_progress_count = if made_progress || stored_fingerprint.is_none() {
// AC2: check read-file set progress as an additional signal.
let read_progress = previous_session_id.as_deref().is_some_and(|session_id| {
collect_read_progress(&project_root, story_id, agent_name, session_id)
});
let made_progress = diff_progress || read_progress;
let prev_no_progress_count = crate::db::read_content(
crate::db::ContentKey::CommitRecoveryPending(story_id),
)
.and_then(|s| s.trim().parse::<u32>().ok())
.unwrap_or(0);
// AC1: forced exits do not increment the stuck-respawn counter.
let no_progress_count = if forced_exit {
prev_no_progress_count
} else if made_progress || stored_fingerprint.is_none() {
1 1
} else { } else {
crate::db::read_content(crate::db::ContentKey::CommitRecoveryPending( prev_no_progress_count + 1
story_id,
))
.and_then(|s| s.trim().parse::<u32>().ok())
.unwrap_or(0)
+ 1
}; };
let total_attempts = crate::db::read_content( let total_attempts = crate::db::read_content(
crate::db::ContentKey::CommitRecoveryTotalAttempts(story_id), crate::db::ContentKey::CommitRecoveryTotalAttempts(story_id),
) )
@@ -136,13 +162,17 @@ impl AgentPool {
crate::db::delete_content( crate::db::delete_content(
crate::db::ContentKey::CommitRecoveryTotalAttempts(story_id), crate::db::ContentKey::CommitRecoveryTotalAttempts(story_id),
); );
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryReadSet(
story_id,
));
slog!( slog!(
"[pipeline] Coder '{agent_name}' for '{story_id}' hit total \ "[pipeline] Coder '{agent_name}' for '{story_id}' hit total \
commit-recovery cap ({total_attempts}/{TOTAL_ATTEMPTS_CAP}) \ commit-recovery cap ({total_attempts}/{TOTAL_ATTEMPTS_CAP}) \
without a commit. Blocking story." without a commit. Blocking story."
); );
let reason = format!( let reason = format!(
"agent flapped — {total_attempts} respawns without ever committing" "commit absent after {total_attempts} respawns \
agent kept making edits but never committed"
); );
if let Err(e) = if let Err(e) =
crate::agents::lifecycle::transition_to_blocked(story_id, &reason) crate::agents::lifecycle::transition_to_blocked(story_id, &reason)
@@ -167,14 +197,18 @@ impl AgentPool {
crate::db::delete_content( crate::db::delete_content(
crate::db::ContentKey::CommitRecoveryTotalAttempts(story_id), crate::db::ContentKey::CommitRecoveryTotalAttempts(story_id),
); );
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryReadSet(
story_id,
));
slog!( slog!(
"[pipeline] Coder '{agent_name}' for '{story_id}' made no \ "[pipeline] Coder '{agent_name}' for '{story_id}' made no \
file-edit progress over {no_progress_count} consecutive \ file or read progress over {no_progress_count} consecutive \
commit-recovery respawns. Blocking story." self-exit commit-recovery respawns. Blocking story."
); );
// AC4: block message names the specific cause.
let reason = format!( let reason = format!(
"agent stuck — {no_progress_count} respawns without commits or \ "stuck-respawn cap reached: {NO_PROGRESS_CAP} consecutive \
new file edits" self-exits with no file or read progress"
); );
if let Err(e) = if let Err(e) =
crate::agents::lifecycle::transition_to_blocked(story_id, &reason) crate::agents::lifecycle::transition_to_blocked(story_id, &reason)
@@ -206,7 +240,8 @@ impl AgentPool {
"[pipeline] Coder '{agent_name}' exited with uncommitted work \ "[pipeline] Coder '{agent_name}' exited with uncommitted work \
for '{story_id}' (no-progress {no_progress_count}/\ for '{story_id}' (no-progress {no_progress_count}/\
{NO_PROGRESS_CAP}, total {total_attempts}/\ {NO_PROGRESS_CAP}, total {total_attempts}/\
{TOTAL_ATTEMPTS_CAP}; progress_made={made_progress}). \ {TOTAL_ATTEMPTS_CAP}; diff_progress={diff_progress}, \
read_progress={read_progress}, forced_exit={forced_exit}). \
Issuing commit-only respawn." Issuing commit-only respawn."
); );
let addendum = "\n\nYou have uncommitted work in this worktree. \ let addendum = "\n\nYou have uncommitted work in this worktree. \
@@ -302,10 +337,13 @@ impl AgentPool {
}); });
} }
} else if completion.gates_passed { } else if completion.gates_passed {
// Clear any stale recovery key when the coder succeeds normally. // Clear any stale recovery keys when the coder succeeds normally.
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryPending( crate::db::delete_content(crate::db::ContentKey::CommitRecoveryPending(
story_id, story_id,
)); ));
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryReadSet(
story_id,
));
// Determine effective QA mode for this story. // Determine effective QA mode for this story.
let qa_mode = { let qa_mode = {
let item_type = crate::agents::lifecycle::item_type_from_id(story_id); let item_type = crate::agents::lifecycle::item_type_from_id(story_id);
@@ -361,11 +399,14 @@ impl AgentPool {
} }
} }
} else { } else {
// Clear any stale recovery key when gates fail normally (agent committed // Clear any stale recovery keys when gates fail normally (agent committed
// but the build is broken — treat as a standard retry, not a recovery). // but the build is broken — treat as a standard retry, not a recovery).
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryPending( crate::db::delete_content(crate::db::ContentKey::CommitRecoveryPending(
story_id, story_id,
)); ));
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryReadSet(
story_id,
));
// Bug 645 / 668: Before retry/block, check if the agent left committed // Bug 645 / 668: Before retry/block, check if the agent left committed
// work AND the agent had a passing run_tests result captured during its // work AND the agent had a passing run_tests result captured during its
// session. An agent may crash mid-output (e.g. Claude Code CLI PTY write // session. An agent may crash mid-output (e.g. Claude Code CLI PTY write
@@ -724,6 +765,109 @@ mod helpers;
use helpers::{resolve_qa_mode_from_store, write_review_hold_to_store}; use helpers::{resolve_qa_mode_from_store, write_review_hold_to_store};
pub(crate) use helpers::{should_block_story, spawn_pipeline_advance}; pub(crate) use helpers::{should_block_story, spawn_pipeline_advance};
/// Parse a huskies agent log and return the set of file paths passed to the
/// Read tool in that session. Returns an empty set if the log cannot be read.
///
/// Used by [`collect_read_progress`] to detect read-exploration progress even
/// when the worktree diff did not grow (story 1089, AC2).
fn collect_read_files_from_log(
project_root: &std::path::Path,
story_id: &str,
agent_name: &str,
session_id: &str,
) -> std::collections::HashSet<String> {
let log_path = crate::agent_log::log_file_path(project_root, story_id, agent_name, session_id);
let mut files = std::collections::HashSet::new();
let log_text = match std::fs::read_to_string(&log_path) {
Ok(t) => t,
Err(_) => return files,
};
for line in log_text.lines() {
let trimmed = line.trim();
if trimmed.is_empty() {
continue;
}
let entry: serde_json::Value = match serde_json::from_str(trimmed) {
Ok(v) => v,
Err(_) => continue,
};
// Only look at agent_json events where data.type == "assistant".
if entry.get("type").and_then(|t| t.as_str()) != Some("agent_json") {
continue;
}
let data = match entry.get("data") {
Some(d) => d,
None => continue,
};
if data.get("type").and_then(|t| t.as_str()) != Some("assistant") {
continue;
}
let content = match data.pointer("/message/content").and_then(|c| c.as_array()) {
Some(c) => c,
None => continue,
};
for item in content {
if item.get("type").and_then(|t| t.as_str()) != Some("tool_use") {
continue;
}
if item.get("name").and_then(|n| n.as_str()) != Some("Read") {
continue;
}
if let Some(path) = item.pointer("/input/file_path").and_then(|p| p.as_str()) {
files.insert(path.to_string());
}
}
}
files
}
/// Return `true` if the agent read any files in `session_id` that were not in
/// the cumulative read set for `story_id`. Updates the stored cumulative set
/// when new files are found (story 1089, AC2).
fn collect_read_progress(
project_root: &std::path::Path,
story_id: &str,
agent_name: &str,
session_id: &str,
) -> bool {
let session_files = collect_read_files_from_log(project_root, story_id, agent_name, session_id);
if session_files.is_empty() {
return false;
}
let stored_set: std::collections::HashSet<String> =
crate::db::read_content(crate::db::ContentKey::CommitRecoveryReadSet(story_id))
.map(|s| {
s.lines()
.filter(|l| !l.is_empty())
.map(str::to_string)
.collect()
})
.unwrap_or_default();
let union: std::collections::HashSet<String> =
stored_set.union(&session_files).cloned().collect();
if union.len() > stored_set.len() {
let mut sorted: Vec<&String> = union.iter().collect();
sorted.sort();
crate::db::write_content(
crate::db::ContentKey::CommitRecoveryReadSet(story_id),
&sorted
.into_iter()
.map(String::as_str)
.collect::<Vec<_>>()
.join("\n"),
);
true
} else {
false
}
}
#[cfg(test)] #[cfg(test)]
mod tests; mod tests;
#[cfg(test)] #[cfg(test)]
@@ -1077,7 +1077,7 @@ stage = "coder"
"Story must be blocked after NO_PROGRESS_CAP consecutive no-progress respawns" "Story must be blocked after NO_PROGRESS_CAP consecutive no-progress respawns"
); );
assert!( assert!(
block_reason.contains("without commits or new file edits"), block_reason.contains("self-exits with no file or read progress"),
"Block reason should describe the no-progress condition, got: {block_reason}" "Block reason should describe the no-progress condition, got: {block_reason}"
); );
@@ -1193,7 +1193,7 @@ stage = "coder"
"Story must be blocked once total commit-recovery attempts hits the outer cap" "Story must be blocked once total commit-recovery attempts hits the outer cap"
); );
assert!( assert!(
block_reason.contains("flapped") && block_reason.contains("without ever committing"), block_reason.contains("commit absent") && block_reason.contains("never committed"),
"Block reason should describe the flapping pattern, got: {block_reason}" "Block reason should describe the flapping pattern, got: {block_reason}"
); );
+14 -1
View File
@@ -808,6 +808,7 @@ pub(super) async fn run_agent_spawn(
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryTotalAttempts( crate::db::delete_content(crate::db::ContentKey::CommitRecoveryTotalAttempts(
&sid, &sid,
)); ));
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryReadSet(&sid));
// Remove agent from the pool and unblock any wait_for_agent callers. // Remove agent from the pool and unblock any wait_for_agent callers.
let tx_done = { let tx_done = {
@@ -873,6 +874,17 @@ pub(super) async fn run_agent_spawn(
return; return;
} }
// AC1 (story 1089): mark forced exits so the commit-recovery
// stuck counter is not incremented for API errors, network
// failures, or Claude-API budget exhaustion. A non-zero exit
// code means the CLI was forced out, not that it chose to stop.
if !result.exit_ok {
crate::db::write_content(
crate::db::ContentKey::CommitRecoveryForcedExit(&sid),
"1",
);
}
// Server-owned completion: run acceptance gates automatically // Server-owned completion: run acceptance gates automatically
// when the agent process exits normally. // when the agent process exits normally.
super::super::pipeline::run_server_owned_completion( super::super::pipeline::run_server_owned_completion(
@@ -1246,12 +1258,13 @@ mod tests {
"abc123", "abc123",
); );
// Rate-limit exit handler: reset all three counters (the fix). // Rate-limit exit handler: reset all counters (the fix).
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryPending(story_id)); crate::db::delete_content(crate::db::ContentKey::CommitRecoveryPending(story_id));
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryDiffFingerprint( crate::db::delete_content(crate::db::ContentKey::CommitRecoveryDiffFingerprint(
story_id, story_id,
)); ));
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryTotalAttempts(story_id)); crate::db::delete_content(crate::db::ContentKey::CommitRecoveryTotalAttempts(story_id));
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryReadSet(story_id));
// CommitRecoveryPending must be cleared after each rate-limit exit. // CommitRecoveryPending must be cleared after each rate-limit exit.
assert!( assert!(
+2
View File
@@ -59,6 +59,7 @@ impl AgentRuntime for ClaudeCodeRuntime {
// Abort+no-session: CLI crashed (e.g. SIGABRT) before emitting its // Abort+no-session: CLI crashed (e.g. SIGABRT) before emitting its
// first "system" event. Detected by: non-zero exit AND no session. // first "system" event. Detected by: non-zero exit AND no session.
aborted_signal: !result.exit_ok && result.session_id.is_none(), aborted_signal: !result.exit_ok && result.session_id.is_none(),
exit_ok: result.exit_ok,
session_id: result.session_id, session_id: result.session_id,
token_usage: result.token_usage, token_usage: result.token_usage,
rate_limit_exit: result.rate_limit_exit, rate_limit_exit: result.rate_limit_exit,
@@ -92,6 +93,7 @@ impl AgentRuntime for ClaudeCodeRuntime {
Ok(RuntimeResult { Ok(RuntimeResult {
aborted_signal: !fallback_result.exit_ok aborted_signal: !fallback_result.exit_ok
&& fallback_result.session_id.is_none(), && fallback_result.session_id.is_none(),
exit_ok: fallback_result.exit_ok,
session_id: fallback_result.session_id, session_id: fallback_result.session_id,
token_usage: fallback_result.token_usage, token_usage: fallback_result.token_usage,
rate_limit_exit: fallback_result.rate_limit_exit, rate_limit_exit: fallback_result.rate_limit_exit,
+4
View File
@@ -135,6 +135,7 @@ impl AgentRuntime for GeminiRuntime {
return Ok(RuntimeResult { return Ok(RuntimeResult {
session_id: None, session_id: None,
token_usage: Some(total_usage), token_usage: Some(total_usage),
exit_ok: true,
aborted_signal: false, aborted_signal: false,
rate_limit_exit: false, rate_limit_exit: false,
rate_limit_reset_at: None, rate_limit_reset_at: None,
@@ -151,6 +152,7 @@ impl AgentRuntime for GeminiRuntime {
return Ok(RuntimeResult { return Ok(RuntimeResult {
session_id: None, session_id: None,
token_usage: Some(total_usage), token_usage: Some(total_usage),
exit_ok: true,
aborted_signal: false, aborted_signal: false,
rate_limit_exit: false, rate_limit_exit: false,
rate_limit_reset_at: None, rate_limit_reset_at: None,
@@ -254,6 +256,7 @@ impl AgentRuntime for GeminiRuntime {
return Ok(RuntimeResult { return Ok(RuntimeResult {
session_id: None, session_id: None,
token_usage: Some(total_usage), token_usage: Some(total_usage),
exit_ok: true,
aborted_signal: false, aborted_signal: false,
rate_limit_exit: false, rate_limit_exit: false,
rate_limit_reset_at: None, rate_limit_reset_at: None,
@@ -339,6 +342,7 @@ impl AgentRuntime for GeminiRuntime {
Ok(RuntimeResult { Ok(RuntimeResult {
session_id: None, session_id: None,
token_usage: Some(total_usage), token_usage: Some(total_usage),
exit_ok: true,
aborted_signal: false, aborted_signal: false,
rate_limit_exit: false, rate_limit_exit: false,
rate_limit_reset_at: None, rate_limit_reset_at: None,
+8
View File
@@ -55,6 +55,12 @@ pub struct RuntimeContext {
pub struct RuntimeResult { pub struct RuntimeResult {
pub session_id: Option<String>, pub session_id: Option<String>,
pub token_usage: Option<TokenUsage>, pub token_usage: Option<TokenUsage>,
/// `true` when the process exited with exit code 0; `false` for non-zero exits
/// (API errors, network failures, or Claude-API-level budget exhaustion). Always
/// `true` for API-based runtimes (OpenAI, Gemini) which have no exit-code concept.
/// Used by the commit-recovery path to skip the stuck-respawn counter for forced
/// exits (story 1089, AC1).
pub exit_ok: bool,
/// `true` when the process exited with a failure AND no session was established. /// `true` when the process exited with a failure AND no session was established.
/// ///
/// This indicates the Claude Code CLI crashed (e.g. SIGABRT from an assertion /// This indicates the Claude Code CLI crashed (e.g. SIGABRT from an assertion
@@ -169,6 +175,7 @@ mod tests {
cache_read_input_tokens: 0, cache_read_input_tokens: 0,
total_cost_usd: 0.01, total_cost_usd: 0.01,
}), }),
exit_ok: true,
aborted_signal: false, aborted_signal: false,
rate_limit_exit: false, rate_limit_exit: false,
rate_limit_reset_at: None, rate_limit_reset_at: None,
@@ -186,6 +193,7 @@ mod tests {
let result = RuntimeResult { let result = RuntimeResult {
session_id: None, session_id: None,
token_usage: None, token_usage: None,
exit_ok: true,
aborted_signal: false, aborted_signal: false,
rate_limit_exit: false, rate_limit_exit: false,
rate_limit_reset_at: None, rate_limit_reset_at: None,
+3
View File
@@ -122,6 +122,7 @@ impl AgentRuntime for OpenAiRuntime {
return Ok(RuntimeResult { return Ok(RuntimeResult {
session_id: None, session_id: None,
token_usage: Some(total_usage), token_usage: Some(total_usage),
exit_ok: true,
aborted_signal: false, aborted_signal: false,
rate_limit_exit: false, rate_limit_exit: false,
rate_limit_reset_at: None, rate_limit_reset_at: None,
@@ -138,6 +139,7 @@ impl AgentRuntime for OpenAiRuntime {
return Ok(RuntimeResult { return Ok(RuntimeResult {
session_id: None, session_id: None,
token_usage: Some(total_usage), token_usage: Some(total_usage),
exit_ok: true,
aborted_signal: false, aborted_signal: false,
rate_limit_exit: false, rate_limit_exit: false,
rate_limit_reset_at: None, rate_limit_reset_at: None,
@@ -224,6 +226,7 @@ impl AgentRuntime for OpenAiRuntime {
return Ok(RuntimeResult { return Ok(RuntimeResult {
session_id: None, session_id: None,
token_usage: Some(total_usage), token_usage: Some(total_usage),
exit_ok: true,
aborted_signal: false, aborted_signal: false,
rate_limit_exit: false, rate_limit_exit: false,
rate_limit_reset_at: None, rate_limit_reset_at: None,
+15
View File
@@ -60,6 +60,17 @@ pub enum ContentKey<'a> {
/// completion. Read by `get_merge_status` to surface gate output for the /// completion. Read by `get_merge_status` to surface gate output for the
/// "completed" state without a separate MergeJob CRDT register (story 1036). /// "completed" state without a separate MergeJob CRDT register (story 1036).
MergeReport(&'a str), MergeReport(&'a str),
/// Flag written by spawn.rs when a coder session exits with a non-zero exit
/// code (API error, network failure, or Claude-API-level budget exhaustion).
/// Prevents the stuck-respawn counter from incrementing for forced exits —
/// only self-exits with no file or read changes count toward the cap.
/// Consumed (read + deleted) by the commit-recovery path in pipeline advance.
CommitRecoveryForcedExit(&'a str),
/// Cumulative set of files read across all commit-recovery sessions for a
/// story, stored as a newline-separated sorted list. Used to detect whether
/// the agent made read-exploration progress even when the worktree diff did
/// not grow (story 1089, AC2). Cleared when a commit lands or the story blocks.
CommitRecoveryReadSet(&'a str),
} }
impl<'a> ContentKey<'a> { impl<'a> ContentKey<'a> {
@@ -85,6 +96,10 @@ impl<'a> ContentKey<'a> {
ContentKey::MergeFailureKind(id) => format!("{id}:merge_failure_kind"), ContentKey::MergeFailureKind(id) => format!("{id}:merge_failure_kind"),
ContentKey::MergeSuccess(id) => format!("{id}:merge_success"), ContentKey::MergeSuccess(id) => format!("{id}:merge_success"),
ContentKey::MergeReport(id) => format!("{id}:merge_report"), ContentKey::MergeReport(id) => format!("{id}:merge_report"),
ContentKey::CommitRecoveryForcedExit(id) => {
format!("{id}:commit_recovery_forced_exit")
}
ContentKey::CommitRecoveryReadSet(id) => format!("{id}:commit_recovery_read_set"),
} }
} }
} }