feat: outer cap on commit-recovery respawns catches flapping agents
The progress-aware no-progress cap (3 consecutive byte-identical diffs) doesn't catch the degenerate pattern where the agent keeps making DIFFERENT file edits each session but never commits — every respawn resets the no-progress counter, infinite loop, budget burns. Adds ContentKey::CommitRecoveryTotalAttempts: an absolute counter that increments on every commit-recovery respawn regardless of progress. TOTAL_ATTEMPTS_CAP = 8; when hit, block with reason 'agent flapped — N respawns without ever committing'. Two caps now bound the recovery loop: - NO_PROGRESS_CAP (3): catches stuck-agent (same diff repeatedly) - TOTAL_ATTEMPTS_CAP (8): catches flapping-agent (different diffs, no commits) Easy to tune the constant lower if we see runaway in practice. All 2936 tests pass.
This commit is contained in:
@@ -83,7 +83,15 @@ impl AgentPool {
|
||||
// to the previous attempt do we count it as "no progress".
|
||||
// After NO_PROGRESS_CAP consecutive no-progress respawns,
|
||||
// block for human attention.
|
||||
//
|
||||
// TOTAL_ATTEMPTS_CAP is the OUTER bound: even if the agent
|
||||
// keeps making file-edit progress every session, after this
|
||||
// many total respawns without a commit we escalate — caught
|
||||
// the "agent flaps between different edits but never
|
||||
// commits" pattern that the progress-aware counter would
|
||||
// never trigger.
|
||||
const NO_PROGRESS_CAP: u32 = 3;
|
||||
const TOTAL_ATTEMPTS_CAP: u32 = 8;
|
||||
|
||||
let current_fingerprint = worktree_path.as_deref().and_then(|p| {
|
||||
std::process::Command::new("git")
|
||||
@@ -108,6 +116,45 @@ impl AgentPool {
|
||||
.unwrap_or(0)
|
||||
+ 1
|
||||
};
|
||||
let total_attempts = crate::db::read_content(
|
||||
crate::db::ContentKey::CommitRecoveryTotalAttempts(story_id),
|
||||
)
|
||||
.and_then(|s| s.trim().parse::<u32>().ok())
|
||||
.unwrap_or(0)
|
||||
+ 1;
|
||||
|
||||
if total_attempts >= TOTAL_ATTEMPTS_CAP {
|
||||
// Outer cap reached: agent has been respawned too many
|
||||
// times without ever committing. Block regardless of
|
||||
// whether file-edit progress is still happening.
|
||||
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryPending(
|
||||
story_id,
|
||||
));
|
||||
crate::db::delete_content(
|
||||
crate::db::ContentKey::CommitRecoveryDiffFingerprint(story_id),
|
||||
);
|
||||
crate::db::delete_content(
|
||||
crate::db::ContentKey::CommitRecoveryTotalAttempts(story_id),
|
||||
);
|
||||
slog!(
|
||||
"[pipeline] Coder '{agent_name}' for '{story_id}' hit total \
|
||||
commit-recovery cap ({total_attempts}/{TOTAL_ATTEMPTS_CAP}) \
|
||||
without a commit. Blocking story."
|
||||
);
|
||||
let reason = format!(
|
||||
"agent flapped — {total_attempts} respawns without ever committing"
|
||||
);
|
||||
if let Err(e) =
|
||||
crate::agents::lifecycle::transition_to_blocked(story_id, &reason)
|
||||
{
|
||||
slog_error!("[pipeline] Failed to block '{story_id}': {e}");
|
||||
}
|
||||
let _ = self.watcher_tx.send(WatcherEvent::StoryBlocked {
|
||||
story_id: story_id.to_string(),
|
||||
reason,
|
||||
});
|
||||
return;
|
||||
}
|
||||
|
||||
if no_progress_count >= NO_PROGRESS_CAP {
|
||||
// Cap reached → block for human attention.
|
||||
@@ -117,6 +164,9 @@ impl AgentPool {
|
||||
crate::db::delete_content(
|
||||
crate::db::ContentKey::CommitRecoveryDiffFingerprint(story_id),
|
||||
);
|
||||
crate::db::delete_content(
|
||||
crate::db::ContentKey::CommitRecoveryTotalAttempts(story_id),
|
||||
);
|
||||
slog!(
|
||||
"[pipeline] Coder '{agent_name}' for '{story_id}' made no \
|
||||
file-edit progress over {no_progress_count} consecutive \
|
||||
@@ -148,10 +198,15 @@ impl AgentPool {
|
||||
fp,
|
||||
);
|
||||
}
|
||||
crate::db::write_content(
|
||||
crate::db::ContentKey::CommitRecoveryTotalAttempts(story_id),
|
||||
&total_attempts.to_string(),
|
||||
);
|
||||
slog!(
|
||||
"[pipeline] Coder '{agent_name}' exited with uncommitted work \
|
||||
for '{story_id}' (no-progress {no_progress_count}/\
|
||||
{NO_PROGRESS_CAP}; progress_made={made_progress}). \
|
||||
{NO_PROGRESS_CAP}, total {total_attempts}/\
|
||||
{TOTAL_ATTEMPTS_CAP}; progress_made={made_progress}). \
|
||||
Issuing commit-only respawn."
|
||||
);
|
||||
let addendum = "\n\nYou have uncommitted work in this worktree. \
|
||||
|
||||
@@ -1107,6 +1107,113 @@ stage = "coder"
|
||||
);
|
||||
}
|
||||
|
||||
/// Outer-cap path: even if the agent keeps making file-edit progress between
|
||||
/// every commit-recovery respawn (different fingerprint each time), after
|
||||
/// TOTAL_ATTEMPTS_CAP respawns without a commit the story must block. This
|
||||
/// catches the "flapping agent" pattern where the progress-aware counter
|
||||
/// would never trigger because the diff keeps changing.
|
||||
#[tokio::test]
|
||||
async fn total_attempts_cap_blocks_flapping_agent_without_commit() {
|
||||
use std::fs;
|
||||
|
||||
let tmp = tempfile::tempdir().unwrap();
|
||||
let root = tmp.path();
|
||||
|
||||
fs::create_dir_all(root.join(".huskies")).unwrap();
|
||||
fs::write(
|
||||
root.join(".huskies/project.toml"),
|
||||
r#"
|
||||
max_retries = 3
|
||||
|
||||
[[agent]]
|
||||
name = "coder-1"
|
||||
role = "Coder"
|
||||
command = "echo"
|
||||
args = ["noop"]
|
||||
prompt = "test"
|
||||
stage = "coder"
|
||||
"#,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
crate::crdt_state::init_for_test();
|
||||
crate::db::ensure_content_store();
|
||||
crate::db::write_item_with_content(
|
||||
"9956_story_flapper",
|
||||
"2_current",
|
||||
"---\nname: Flapper Test\n---\n",
|
||||
crate::db::ItemMeta::named("Flapper Test"),
|
||||
);
|
||||
|
||||
// Simulate 7 previous respawns (one short of the outer cap of 8). The
|
||||
// no-progress counter is at 1 (the agent has been making progress every
|
||||
// attempt) but total attempts is at the threshold.
|
||||
crate::db::write_content(
|
||||
crate::db::ContentKey::CommitRecoveryTotalAttempts("9956_story_flapper"),
|
||||
"7",
|
||||
);
|
||||
crate::db::write_content(
|
||||
crate::db::ContentKey::CommitRecoveryPending("9956_story_flapper"),
|
||||
"1",
|
||||
);
|
||||
|
||||
let pool = AgentPool::new_test(3001);
|
||||
let mut rx = pool.watcher_tx.subscribe();
|
||||
|
||||
pool.run_pipeline_advance(
|
||||
"9956_story_flapper",
|
||||
"coder-1",
|
||||
CompletionReport {
|
||||
summary: "still no commit".to_string(),
|
||||
gates_passed: false,
|
||||
gate_output: "Worktree has uncommitted changes".to_string(),
|
||||
needs_commit_recovery: true,
|
||||
},
|
||||
Some(root.to_path_buf()),
|
||||
None,
|
||||
false,
|
||||
None,
|
||||
)
|
||||
.await;
|
||||
|
||||
// Outer cap reached (7 + 1 = 8) → block.
|
||||
let mut got_blocked = false;
|
||||
let mut block_reason = String::new();
|
||||
while let Ok(evt) = rx.try_recv() {
|
||||
if let WatcherEvent::StoryBlocked { story_id, reason } = evt
|
||||
&& story_id == "9956_story_flapper"
|
||||
{
|
||||
got_blocked = true;
|
||||
block_reason = reason;
|
||||
break;
|
||||
}
|
||||
}
|
||||
assert!(
|
||||
got_blocked,
|
||||
"Story must be blocked once total commit-recovery attempts hits the outer cap"
|
||||
);
|
||||
assert!(
|
||||
block_reason.contains("flapped") && block_reason.contains("without ever committing"),
|
||||
"Block reason should describe the flapping pattern, got: {block_reason}"
|
||||
);
|
||||
|
||||
// All three recovery keys must be cleared after blocking.
|
||||
assert!(
|
||||
crate::db::read_content(crate::db::ContentKey::CommitRecoveryTotalAttempts(
|
||||
"9956_story_flapper"
|
||||
))
|
||||
.is_none(),
|
||||
"total_attempts key must be cleared after blocking"
|
||||
);
|
||||
assert!(
|
||||
crate::db::read_content(crate::db::ContentKey::CommitRecoveryPending(
|
||||
"9956_story_flapper"
|
||||
))
|
||||
.is_none(),
|
||||
"commit_recovery_pending key must be cleared after blocking"
|
||||
);
|
||||
}
|
||||
|
||||
// ── bug 953: bug-645 path must not advance when feature branch has zero commits ──
|
||||
|
||||
/// Regression test for bug 953: when a coder agent exits with gates_passed=false
|
||||
|
||||
Reference in New Issue
Block a user