huskies: merge 1089 bug Stuck-agent detector blocks stories on legitimate exploration / debugging — uses too narrow a "progress" signal
This commit is contained in:
@@ -78,21 +78,34 @@ impl AgentPool {
|
||||
// The coder exited with uncommitted content but no commits
|
||||
// (typical "claude-code session boundary mid-sweep" pattern).
|
||||
// Use a PROGRESS-AWARE retry cap: the agent gets unlimited
|
||||
// respawns as long as file edits keep growing between
|
||||
// attempts; only when the worktree diff is byte-identical
|
||||
// to the previous attempt do we count it as "no progress".
|
||||
// After NO_PROGRESS_CAP consecutive no-progress respawns,
|
||||
// block for human attention.
|
||||
// respawns as long as progress is being made between attempts.
|
||||
// Progress is satisfied if EITHER (a) the worktree diff grew,
|
||||
// OR (b) the set of files the agent read grew. Raw tool-call
|
||||
// count does NOT count — a looping agent can produce many calls.
|
||||
// Only self-exited sessions with no file or read progress count
|
||||
// toward the cap; forced exits (API error, network, budget
|
||||
// exhaustion) are excluded (story 1089).
|
||||
// After NO_PROGRESS_CAP consecutive qualifying no-progress
|
||||
// respawns, block for human attention.
|
||||
//
|
||||
// TOTAL_ATTEMPTS_CAP is the OUTER bound: even if the agent
|
||||
// keeps making file-edit progress every session, after this
|
||||
// many total respawns without a commit we escalate — caught
|
||||
// the "agent flaps between different edits but never
|
||||
// commits" pattern that the progress-aware counter would
|
||||
// never trigger.
|
||||
// many total respawns without a commit we escalate — catches
|
||||
// the "agent flaps between different edits but never commits"
|
||||
// pattern that the progress-aware counter would never trigger.
|
||||
const NO_PROGRESS_CAP: u32 = 3;
|
||||
const TOTAL_ATTEMPTS_CAP: u32 = 8;
|
||||
|
||||
// AC1: consume the forced-exit flag written by spawn.rs when
|
||||
// the agent process exited with a non-zero code.
|
||||
let forced_exit = crate::db::read_content(
|
||||
crate::db::ContentKey::CommitRecoveryForcedExit(story_id),
|
||||
)
|
||||
.is_some();
|
||||
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryForcedExit(
|
||||
story_id,
|
||||
));
|
||||
|
||||
let current_fingerprint = worktree_path.as_deref().and_then(|p| {
|
||||
std::process::Command::new("git")
|
||||
.args(["diff", "master"])
|
||||
@@ -104,18 +117,31 @@ impl AgentPool {
|
||||
let stored_fingerprint = crate::db::read_content(
|
||||
crate::db::ContentKey::CommitRecoveryDiffFingerprint(story_id),
|
||||
);
|
||||
let made_progress = current_fingerprint.is_some()
|
||||
let diff_progress = current_fingerprint.is_some()
|
||||
&& stored_fingerprint.as_ref() != current_fingerprint.as_ref();
|
||||
let no_progress_count = if made_progress || stored_fingerprint.is_none() {
|
||||
|
||||
// AC2: check read-file set progress as an additional signal.
|
||||
let read_progress = previous_session_id.as_deref().is_some_and(|session_id| {
|
||||
collect_read_progress(&project_root, story_id, agent_name, session_id)
|
||||
});
|
||||
|
||||
let made_progress = diff_progress || read_progress;
|
||||
|
||||
let prev_no_progress_count = crate::db::read_content(
|
||||
crate::db::ContentKey::CommitRecoveryPending(story_id),
|
||||
)
|
||||
.and_then(|s| s.trim().parse::<u32>().ok())
|
||||
.unwrap_or(0);
|
||||
|
||||
// AC1: forced exits do not increment the stuck-respawn counter.
|
||||
let no_progress_count = if forced_exit {
|
||||
prev_no_progress_count
|
||||
} else if made_progress || stored_fingerprint.is_none() {
|
||||
1
|
||||
} else {
|
||||
crate::db::read_content(crate::db::ContentKey::CommitRecoveryPending(
|
||||
story_id,
|
||||
))
|
||||
.and_then(|s| s.trim().parse::<u32>().ok())
|
||||
.unwrap_or(0)
|
||||
+ 1
|
||||
prev_no_progress_count + 1
|
||||
};
|
||||
|
||||
let total_attempts = crate::db::read_content(
|
||||
crate::db::ContentKey::CommitRecoveryTotalAttempts(story_id),
|
||||
)
|
||||
@@ -136,13 +162,17 @@ impl AgentPool {
|
||||
crate::db::delete_content(
|
||||
crate::db::ContentKey::CommitRecoveryTotalAttempts(story_id),
|
||||
);
|
||||
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryReadSet(
|
||||
story_id,
|
||||
));
|
||||
slog!(
|
||||
"[pipeline] Coder '{agent_name}' for '{story_id}' hit total \
|
||||
commit-recovery cap ({total_attempts}/{TOTAL_ATTEMPTS_CAP}) \
|
||||
without a commit. Blocking story."
|
||||
);
|
||||
let reason = format!(
|
||||
"agent flapped — {total_attempts} respawns without ever committing"
|
||||
"commit absent after {total_attempts} respawns — \
|
||||
agent kept making edits but never committed"
|
||||
);
|
||||
if let Err(e) =
|
||||
crate::agents::lifecycle::transition_to_blocked(story_id, &reason)
|
||||
@@ -167,14 +197,18 @@ impl AgentPool {
|
||||
crate::db::delete_content(
|
||||
crate::db::ContentKey::CommitRecoveryTotalAttempts(story_id),
|
||||
);
|
||||
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryReadSet(
|
||||
story_id,
|
||||
));
|
||||
slog!(
|
||||
"[pipeline] Coder '{agent_name}' for '{story_id}' made no \
|
||||
file-edit progress over {no_progress_count} consecutive \
|
||||
commit-recovery respawns. Blocking story."
|
||||
file or read progress over {no_progress_count} consecutive \
|
||||
self-exit commit-recovery respawns. Blocking story."
|
||||
);
|
||||
// AC4: block message names the specific cause.
|
||||
let reason = format!(
|
||||
"agent stuck — {no_progress_count} respawns without commits or \
|
||||
new file edits"
|
||||
"stuck-respawn cap reached: {NO_PROGRESS_CAP} consecutive \
|
||||
self-exits with no file or read progress"
|
||||
);
|
||||
if let Err(e) =
|
||||
crate::agents::lifecycle::transition_to_blocked(story_id, &reason)
|
||||
@@ -206,7 +240,8 @@ impl AgentPool {
|
||||
"[pipeline] Coder '{agent_name}' exited with uncommitted work \
|
||||
for '{story_id}' (no-progress {no_progress_count}/\
|
||||
{NO_PROGRESS_CAP}, total {total_attempts}/\
|
||||
{TOTAL_ATTEMPTS_CAP}; progress_made={made_progress}). \
|
||||
{TOTAL_ATTEMPTS_CAP}; diff_progress={diff_progress}, \
|
||||
read_progress={read_progress}, forced_exit={forced_exit}). \
|
||||
Issuing commit-only respawn."
|
||||
);
|
||||
let addendum = "\n\nYou have uncommitted work in this worktree. \
|
||||
@@ -302,10 +337,13 @@ impl AgentPool {
|
||||
});
|
||||
}
|
||||
} else if completion.gates_passed {
|
||||
// Clear any stale recovery key when the coder succeeds normally.
|
||||
// Clear any stale recovery keys when the coder succeeds normally.
|
||||
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryPending(
|
||||
story_id,
|
||||
));
|
||||
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryReadSet(
|
||||
story_id,
|
||||
));
|
||||
// Determine effective QA mode for this story.
|
||||
let qa_mode = {
|
||||
let item_type = crate::agents::lifecycle::item_type_from_id(story_id);
|
||||
@@ -361,11 +399,14 @@ impl AgentPool {
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Clear any stale recovery key when gates fail normally (agent committed
|
||||
// Clear any stale recovery keys when gates fail normally (agent committed
|
||||
// but the build is broken — treat as a standard retry, not a recovery).
|
||||
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryPending(
|
||||
story_id,
|
||||
));
|
||||
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryReadSet(
|
||||
story_id,
|
||||
));
|
||||
// Bug 645 / 668: Before retry/block, check if the agent left committed
|
||||
// work AND the agent had a passing run_tests result captured during its
|
||||
// session. An agent may crash mid-output (e.g. Claude Code CLI PTY write
|
||||
@@ -724,6 +765,109 @@ mod helpers;
|
||||
use helpers::{resolve_qa_mode_from_store, write_review_hold_to_store};
|
||||
pub(crate) use helpers::{should_block_story, spawn_pipeline_advance};
|
||||
|
||||
/// Parse a huskies agent log and return the set of file paths passed to the
|
||||
/// Read tool in that session. Returns an empty set if the log cannot be read.
|
||||
///
|
||||
/// Used by [`collect_read_progress`] to detect read-exploration progress even
|
||||
/// when the worktree diff did not grow (story 1089, AC2).
|
||||
fn collect_read_files_from_log(
|
||||
project_root: &std::path::Path,
|
||||
story_id: &str,
|
||||
agent_name: &str,
|
||||
session_id: &str,
|
||||
) -> std::collections::HashSet<String> {
|
||||
let log_path = crate::agent_log::log_file_path(project_root, story_id, agent_name, session_id);
|
||||
let mut files = std::collections::HashSet::new();
|
||||
|
||||
let log_text = match std::fs::read_to_string(&log_path) {
|
||||
Ok(t) => t,
|
||||
Err(_) => return files,
|
||||
};
|
||||
|
||||
for line in log_text.lines() {
|
||||
let trimmed = line.trim();
|
||||
if trimmed.is_empty() {
|
||||
continue;
|
||||
}
|
||||
let entry: serde_json::Value = match serde_json::from_str(trimmed) {
|
||||
Ok(v) => v,
|
||||
Err(_) => continue,
|
||||
};
|
||||
// Only look at agent_json events where data.type == "assistant".
|
||||
if entry.get("type").and_then(|t| t.as_str()) != Some("agent_json") {
|
||||
continue;
|
||||
}
|
||||
let data = match entry.get("data") {
|
||||
Some(d) => d,
|
||||
None => continue,
|
||||
};
|
||||
if data.get("type").and_then(|t| t.as_str()) != Some("assistant") {
|
||||
continue;
|
||||
}
|
||||
let content = match data.pointer("/message/content").and_then(|c| c.as_array()) {
|
||||
Some(c) => c,
|
||||
None => continue,
|
||||
};
|
||||
for item in content {
|
||||
if item.get("type").and_then(|t| t.as_str()) != Some("tool_use") {
|
||||
continue;
|
||||
}
|
||||
if item.get("name").and_then(|n| n.as_str()) != Some("Read") {
|
||||
continue;
|
||||
}
|
||||
if let Some(path) = item.pointer("/input/file_path").and_then(|p| p.as_str()) {
|
||||
files.insert(path.to_string());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
files
|
||||
}
|
||||
|
||||
/// Return `true` if the agent read any files in `session_id` that were not in
|
||||
/// the cumulative read set for `story_id`. Updates the stored cumulative set
|
||||
/// when new files are found (story 1089, AC2).
|
||||
fn collect_read_progress(
|
||||
project_root: &std::path::Path,
|
||||
story_id: &str,
|
||||
agent_name: &str,
|
||||
session_id: &str,
|
||||
) -> bool {
|
||||
let session_files = collect_read_files_from_log(project_root, story_id, agent_name, session_id);
|
||||
if session_files.is_empty() {
|
||||
return false;
|
||||
}
|
||||
|
||||
let stored_set: std::collections::HashSet<String> =
|
||||
crate::db::read_content(crate::db::ContentKey::CommitRecoveryReadSet(story_id))
|
||||
.map(|s| {
|
||||
s.lines()
|
||||
.filter(|l| !l.is_empty())
|
||||
.map(str::to_string)
|
||||
.collect()
|
||||
})
|
||||
.unwrap_or_default();
|
||||
|
||||
let union: std::collections::HashSet<String> =
|
||||
stored_set.union(&session_files).cloned().collect();
|
||||
|
||||
if union.len() > stored_set.len() {
|
||||
let mut sorted: Vec<&String> = union.iter().collect();
|
||||
sorted.sort();
|
||||
crate::db::write_content(
|
||||
crate::db::ContentKey::CommitRecoveryReadSet(story_id),
|
||||
&sorted
|
||||
.into_iter()
|
||||
.map(String::as_str)
|
||||
.collect::<Vec<_>>()
|
||||
.join("\n"),
|
||||
);
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests;
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -1077,7 +1077,7 @@ stage = "coder"
|
||||
"Story must be blocked after NO_PROGRESS_CAP consecutive no-progress respawns"
|
||||
);
|
||||
assert!(
|
||||
block_reason.contains("without commits or new file edits"),
|
||||
block_reason.contains("self-exits with no file or read progress"),
|
||||
"Block reason should describe the no-progress condition, got: {block_reason}"
|
||||
);
|
||||
|
||||
@@ -1193,7 +1193,7 @@ stage = "coder"
|
||||
"Story must be blocked once total commit-recovery attempts hits the outer cap"
|
||||
);
|
||||
assert!(
|
||||
block_reason.contains("flapped") && block_reason.contains("without ever committing"),
|
||||
block_reason.contains("commit absent") && block_reason.contains("never committed"),
|
||||
"Block reason should describe the flapping pattern, got: {block_reason}"
|
||||
);
|
||||
|
||||
|
||||
@@ -808,6 +808,7 @@ pub(super) async fn run_agent_spawn(
|
||||
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryTotalAttempts(
|
||||
&sid,
|
||||
));
|
||||
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryReadSet(&sid));
|
||||
|
||||
// Remove agent from the pool and unblock any wait_for_agent callers.
|
||||
let tx_done = {
|
||||
@@ -873,6 +874,17 @@ pub(super) async fn run_agent_spawn(
|
||||
return;
|
||||
}
|
||||
|
||||
// AC1 (story 1089): mark forced exits so the commit-recovery
|
||||
// stuck counter is not incremented for API errors, network
|
||||
// failures, or Claude-API budget exhaustion. A non-zero exit
|
||||
// code means the CLI was forced out, not that it chose to stop.
|
||||
if !result.exit_ok {
|
||||
crate::db::write_content(
|
||||
crate::db::ContentKey::CommitRecoveryForcedExit(&sid),
|
||||
"1",
|
||||
);
|
||||
}
|
||||
|
||||
// Server-owned completion: run acceptance gates automatically
|
||||
// when the agent process exits normally.
|
||||
super::super::pipeline::run_server_owned_completion(
|
||||
@@ -1246,12 +1258,13 @@ mod tests {
|
||||
"abc123",
|
||||
);
|
||||
|
||||
// Rate-limit exit handler: reset all three counters (the fix).
|
||||
// Rate-limit exit handler: reset all counters (the fix).
|
||||
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryPending(story_id));
|
||||
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryDiffFingerprint(
|
||||
story_id,
|
||||
));
|
||||
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryTotalAttempts(story_id));
|
||||
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryReadSet(story_id));
|
||||
|
||||
// CommitRecoveryPending must be cleared after each rate-limit exit.
|
||||
assert!(
|
||||
|
||||
Reference in New Issue
Block a user