huskies: merge 1089 bug Stuck-agent detector blocks stories on legitimate exploration / debugging — uses too narrow a "progress" signal

This commit is contained in:
dave
2026-05-15 11:34:50 +00:00
parent fb1311cdae
commit c4010854a5
8 changed files with 217 additions and 28 deletions
+169 -25
View File
@@ -78,21 +78,34 @@ impl AgentPool {
// The coder exited with uncommitted content but no commits
// (typical "claude-code session boundary mid-sweep" pattern).
// Use a PROGRESS-AWARE retry cap: the agent gets unlimited
// respawns as long as file edits keep growing between
// attempts; only when the worktree diff is byte-identical
// to the previous attempt do we count it as "no progress".
// After NO_PROGRESS_CAP consecutive no-progress respawns,
// block for human attention.
// respawns as long as progress is being made between attempts.
// Progress is satisfied if EITHER (a) the worktree diff grew,
// OR (b) the set of files the agent read grew. Raw tool-call
// count does NOT count — a looping agent can produce many calls.
// Only self-exited sessions with no file or read progress count
// toward the cap; forced exits (API error, network, budget
// exhaustion) are excluded (story 1089).
// After NO_PROGRESS_CAP consecutive qualifying no-progress
// respawns, block for human attention.
//
// TOTAL_ATTEMPTS_CAP is the OUTER bound: even if the agent
// keeps making file-edit progress every session, after this
// many total respawns without a commit we escalate — caught
// the "agent flaps between different edits but never
// commits" pattern that the progress-aware counter would
// never trigger.
// many total respawns without a commit we escalate — catches
// the "agent flaps between different edits but never commits"
// pattern that the progress-aware counter would never trigger.
const NO_PROGRESS_CAP: u32 = 3;
const TOTAL_ATTEMPTS_CAP: u32 = 8;
// AC1: consume the forced-exit flag written by spawn.rs when
// the agent process exited with a non-zero code.
let forced_exit = crate::db::read_content(
crate::db::ContentKey::CommitRecoveryForcedExit(story_id),
)
.is_some();
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryForcedExit(
story_id,
));
let current_fingerprint = worktree_path.as_deref().and_then(|p| {
std::process::Command::new("git")
.args(["diff", "master"])
@@ -104,18 +117,31 @@ impl AgentPool {
let stored_fingerprint = crate::db::read_content(
crate::db::ContentKey::CommitRecoveryDiffFingerprint(story_id),
);
let made_progress = current_fingerprint.is_some()
let diff_progress = current_fingerprint.is_some()
&& stored_fingerprint.as_ref() != current_fingerprint.as_ref();
let no_progress_count = if made_progress || stored_fingerprint.is_none() {
// AC2: check read-file set progress as an additional signal.
let read_progress = previous_session_id.as_deref().is_some_and(|session_id| {
collect_read_progress(&project_root, story_id, agent_name, session_id)
});
let made_progress = diff_progress || read_progress;
let prev_no_progress_count = crate::db::read_content(
crate::db::ContentKey::CommitRecoveryPending(story_id),
)
.and_then(|s| s.trim().parse::<u32>().ok())
.unwrap_or(0);
// AC1: forced exits do not increment the stuck-respawn counter.
let no_progress_count = if forced_exit {
prev_no_progress_count
} else if made_progress || stored_fingerprint.is_none() {
1
} else {
crate::db::read_content(crate::db::ContentKey::CommitRecoveryPending(
story_id,
))
.and_then(|s| s.trim().parse::<u32>().ok())
.unwrap_or(0)
+ 1
prev_no_progress_count + 1
};
let total_attempts = crate::db::read_content(
crate::db::ContentKey::CommitRecoveryTotalAttempts(story_id),
)
@@ -136,13 +162,17 @@ impl AgentPool {
crate::db::delete_content(
crate::db::ContentKey::CommitRecoveryTotalAttempts(story_id),
);
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryReadSet(
story_id,
));
slog!(
"[pipeline] Coder '{agent_name}' for '{story_id}' hit total \
commit-recovery cap ({total_attempts}/{TOTAL_ATTEMPTS_CAP}) \
without a commit. Blocking story."
);
let reason = format!(
"agent flapped — {total_attempts} respawns without ever committing"
"commit absent after {total_attempts} respawns \
agent kept making edits but never committed"
);
if let Err(e) =
crate::agents::lifecycle::transition_to_blocked(story_id, &reason)
@@ -167,14 +197,18 @@ impl AgentPool {
crate::db::delete_content(
crate::db::ContentKey::CommitRecoveryTotalAttempts(story_id),
);
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryReadSet(
story_id,
));
slog!(
"[pipeline] Coder '{agent_name}' for '{story_id}' made no \
file-edit progress over {no_progress_count} consecutive \
commit-recovery respawns. Blocking story."
file or read progress over {no_progress_count} consecutive \
self-exit commit-recovery respawns. Blocking story."
);
// AC4: block message names the specific cause.
let reason = format!(
"agent stuck — {no_progress_count} respawns without commits or \
new file edits"
"stuck-respawn cap reached: {NO_PROGRESS_CAP} consecutive \
self-exits with no file or read progress"
);
if let Err(e) =
crate::agents::lifecycle::transition_to_blocked(story_id, &reason)
@@ -206,7 +240,8 @@ impl AgentPool {
"[pipeline] Coder '{agent_name}' exited with uncommitted work \
for '{story_id}' (no-progress {no_progress_count}/\
{NO_PROGRESS_CAP}, total {total_attempts}/\
{TOTAL_ATTEMPTS_CAP}; progress_made={made_progress}). \
{TOTAL_ATTEMPTS_CAP}; diff_progress={diff_progress}, \
read_progress={read_progress}, forced_exit={forced_exit}). \
Issuing commit-only respawn."
);
let addendum = "\n\nYou have uncommitted work in this worktree. \
@@ -302,10 +337,13 @@ impl AgentPool {
});
}
} else if completion.gates_passed {
// Clear any stale recovery key when the coder succeeds normally.
// Clear any stale recovery keys when the coder succeeds normally.
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryPending(
story_id,
));
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryReadSet(
story_id,
));
// Determine effective QA mode for this story.
let qa_mode = {
let item_type = crate::agents::lifecycle::item_type_from_id(story_id);
@@ -361,11 +399,14 @@ impl AgentPool {
}
}
} else {
// Clear any stale recovery key when gates fail normally (agent committed
// Clear any stale recovery keys when gates fail normally (agent committed
// but the build is broken — treat as a standard retry, not a recovery).
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryPending(
story_id,
));
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryReadSet(
story_id,
));
// Bug 645 / 668: Before retry/block, check if the agent left committed
// work AND the agent had a passing run_tests result captured during its
// session. An agent may crash mid-output (e.g. Claude Code CLI PTY write
@@ -724,6 +765,109 @@ mod helpers;
use helpers::{resolve_qa_mode_from_store, write_review_hold_to_store};
pub(crate) use helpers::{should_block_story, spawn_pipeline_advance};
/// Parse a huskies agent log and return the set of file paths passed to the
/// Read tool in that session. Returns an empty set if the log cannot be read.
///
/// Used by [`collect_read_progress`] to detect read-exploration progress even
/// when the worktree diff did not grow (story 1089, AC2).
fn collect_read_files_from_log(
project_root: &std::path::Path,
story_id: &str,
agent_name: &str,
session_id: &str,
) -> std::collections::HashSet<String> {
let log_path = crate::agent_log::log_file_path(project_root, story_id, agent_name, session_id);
let mut files = std::collections::HashSet::new();
let log_text = match std::fs::read_to_string(&log_path) {
Ok(t) => t,
Err(_) => return files,
};
for line in log_text.lines() {
let trimmed = line.trim();
if trimmed.is_empty() {
continue;
}
let entry: serde_json::Value = match serde_json::from_str(trimmed) {
Ok(v) => v,
Err(_) => continue,
};
// Only look at agent_json events where data.type == "assistant".
if entry.get("type").and_then(|t| t.as_str()) != Some("agent_json") {
continue;
}
let data = match entry.get("data") {
Some(d) => d,
None => continue,
};
if data.get("type").and_then(|t| t.as_str()) != Some("assistant") {
continue;
}
let content = match data.pointer("/message/content").and_then(|c| c.as_array()) {
Some(c) => c,
None => continue,
};
for item in content {
if item.get("type").and_then(|t| t.as_str()) != Some("tool_use") {
continue;
}
if item.get("name").and_then(|n| n.as_str()) != Some("Read") {
continue;
}
if let Some(path) = item.pointer("/input/file_path").and_then(|p| p.as_str()) {
files.insert(path.to_string());
}
}
}
files
}
/// Return `true` if the agent read any files in `session_id` that were not in
/// the cumulative read set for `story_id`. Updates the stored cumulative set
/// when new files are found (story 1089, AC2).
fn collect_read_progress(
project_root: &std::path::Path,
story_id: &str,
agent_name: &str,
session_id: &str,
) -> bool {
let session_files = collect_read_files_from_log(project_root, story_id, agent_name, session_id);
if session_files.is_empty() {
return false;
}
let stored_set: std::collections::HashSet<String> =
crate::db::read_content(crate::db::ContentKey::CommitRecoveryReadSet(story_id))
.map(|s| {
s.lines()
.filter(|l| !l.is_empty())
.map(str::to_string)
.collect()
})
.unwrap_or_default();
let union: std::collections::HashSet<String> =
stored_set.union(&session_files).cloned().collect();
if union.len() > stored_set.len() {
let mut sorted: Vec<&String> = union.iter().collect();
sorted.sort();
crate::db::write_content(
crate::db::ContentKey::CommitRecoveryReadSet(story_id),
&sorted
.into_iter()
.map(String::as_str)
.collect::<Vec<_>>()
.join("\n"),
);
true
} else {
false
}
}
#[cfg(test)]
mod tests;
#[cfg(test)]
@@ -1077,7 +1077,7 @@ stage = "coder"
"Story must be blocked after NO_PROGRESS_CAP consecutive no-progress respawns"
);
assert!(
block_reason.contains("without commits or new file edits"),
block_reason.contains("self-exits with no file or read progress"),
"Block reason should describe the no-progress condition, got: {block_reason}"
);
@@ -1193,7 +1193,7 @@ stage = "coder"
"Story must be blocked once total commit-recovery attempts hits the outer cap"
);
assert!(
block_reason.contains("flapped") && block_reason.contains("without ever committing"),
block_reason.contains("commit absent") && block_reason.contains("never committed"),
"Block reason should describe the flapping pattern, got: {block_reason}"
);
+14 -1
View File
@@ -808,6 +808,7 @@ pub(super) async fn run_agent_spawn(
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryTotalAttempts(
&sid,
));
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryReadSet(&sid));
// Remove agent from the pool and unblock any wait_for_agent callers.
let tx_done = {
@@ -873,6 +874,17 @@ pub(super) async fn run_agent_spawn(
return;
}
// AC1 (story 1089): mark forced exits so the commit-recovery
// stuck counter is not incremented for API errors, network
// failures, or Claude-API budget exhaustion. A non-zero exit
// code means the CLI was forced out, not that it chose to stop.
if !result.exit_ok {
crate::db::write_content(
crate::db::ContentKey::CommitRecoveryForcedExit(&sid),
"1",
);
}
// Server-owned completion: run acceptance gates automatically
// when the agent process exits normally.
super::super::pipeline::run_server_owned_completion(
@@ -1246,12 +1258,13 @@ mod tests {
"abc123",
);
// Rate-limit exit handler: reset all three counters (the fix).
// Rate-limit exit handler: reset all counters (the fix).
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryPending(story_id));
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryDiffFingerprint(
story_id,
));
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryTotalAttempts(story_id));
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryReadSet(story_id));
// CommitRecoveryPending must be cleared after each rate-limit exit.
assert!(
+2
View File
@@ -59,6 +59,7 @@ impl AgentRuntime for ClaudeCodeRuntime {
// Abort+no-session: CLI crashed (e.g. SIGABRT) before emitting its
// first "system" event. Detected by: non-zero exit AND no session.
aborted_signal: !result.exit_ok && result.session_id.is_none(),
exit_ok: result.exit_ok,
session_id: result.session_id,
token_usage: result.token_usage,
rate_limit_exit: result.rate_limit_exit,
@@ -92,6 +93,7 @@ impl AgentRuntime for ClaudeCodeRuntime {
Ok(RuntimeResult {
aborted_signal: !fallback_result.exit_ok
&& fallback_result.session_id.is_none(),
exit_ok: fallback_result.exit_ok,
session_id: fallback_result.session_id,
token_usage: fallback_result.token_usage,
rate_limit_exit: fallback_result.rate_limit_exit,
+4
View File
@@ -135,6 +135,7 @@ impl AgentRuntime for GeminiRuntime {
return Ok(RuntimeResult {
session_id: None,
token_usage: Some(total_usage),
exit_ok: true,
aborted_signal: false,
rate_limit_exit: false,
rate_limit_reset_at: None,
@@ -151,6 +152,7 @@ impl AgentRuntime for GeminiRuntime {
return Ok(RuntimeResult {
session_id: None,
token_usage: Some(total_usage),
exit_ok: true,
aborted_signal: false,
rate_limit_exit: false,
rate_limit_reset_at: None,
@@ -254,6 +256,7 @@ impl AgentRuntime for GeminiRuntime {
return Ok(RuntimeResult {
session_id: None,
token_usage: Some(total_usage),
exit_ok: true,
aborted_signal: false,
rate_limit_exit: false,
rate_limit_reset_at: None,
@@ -339,6 +342,7 @@ impl AgentRuntime for GeminiRuntime {
Ok(RuntimeResult {
session_id: None,
token_usage: Some(total_usage),
exit_ok: true,
aborted_signal: false,
rate_limit_exit: false,
rate_limit_reset_at: None,
+8
View File
@@ -55,6 +55,12 @@ pub struct RuntimeContext {
pub struct RuntimeResult {
pub session_id: Option<String>,
pub token_usage: Option<TokenUsage>,
/// `true` when the process exited with exit code 0; `false` for non-zero exits
/// (API errors, network failures, or Claude-API-level budget exhaustion). Always
/// `true` for API-based runtimes (OpenAI, Gemini) which have no exit-code concept.
/// Used by the commit-recovery path to skip the stuck-respawn counter for forced
/// exits (story 1089, AC1).
pub exit_ok: bool,
/// `true` when the process exited with a failure AND no session was established.
///
/// This indicates the Claude Code CLI crashed (e.g. SIGABRT from an assertion
@@ -169,6 +175,7 @@ mod tests {
cache_read_input_tokens: 0,
total_cost_usd: 0.01,
}),
exit_ok: true,
aborted_signal: false,
rate_limit_exit: false,
rate_limit_reset_at: None,
@@ -186,6 +193,7 @@ mod tests {
let result = RuntimeResult {
session_id: None,
token_usage: None,
exit_ok: true,
aborted_signal: false,
rate_limit_exit: false,
rate_limit_reset_at: None,
+3
View File
@@ -122,6 +122,7 @@ impl AgentRuntime for OpenAiRuntime {
return Ok(RuntimeResult {
session_id: None,
token_usage: Some(total_usage),
exit_ok: true,
aborted_signal: false,
rate_limit_exit: false,
rate_limit_reset_at: None,
@@ -138,6 +139,7 @@ impl AgentRuntime for OpenAiRuntime {
return Ok(RuntimeResult {
session_id: None,
token_usage: Some(total_usage),
exit_ok: true,
aborted_signal: false,
rate_limit_exit: false,
rate_limit_reset_at: None,
@@ -224,6 +226,7 @@ impl AgentRuntime for OpenAiRuntime {
return Ok(RuntimeResult {
session_id: None,
token_usage: Some(total_usage),
exit_ok: true,
aborted_signal: false,
rate_limit_exit: false,
rate_limit_reset_at: None,