feat: outer cap on commit-recovery respawns catches flapping agents

The progress-aware no-progress cap (3 consecutive byte-identical diffs)
doesn't catch the degenerate pattern where the agent keeps making
DIFFERENT file edits each session but never commits — every respawn
resets the no-progress counter, infinite loop, budget burns.

Adds ContentKey::CommitRecoveryTotalAttempts: an absolute counter that
increments on every commit-recovery respawn regardless of progress.
TOTAL_ATTEMPTS_CAP = 8; when hit, block with reason 'agent flapped — N
respawns without ever committing'.

Two caps now bound the recovery loop:
- NO_PROGRESS_CAP (3): catches stuck-agent (same diff repeatedly)
- TOTAL_ATTEMPTS_CAP (8): catches flapping-agent (different diffs, no commits)

Easy to tune the constant lower if we see runaway in practice.
All 2936 tests pass.
This commit is contained in:
Timmy
2026-05-14 11:34:17 +01:00
parent bab337b289
commit 0572af2193
3 changed files with 172 additions and 1 deletions
+56 -1
View File
@@ -83,7 +83,15 @@ impl AgentPool {
// to the previous attempt do we count it as "no progress".
// After NO_PROGRESS_CAP consecutive no-progress respawns,
// block for human attention.
//
// TOTAL_ATTEMPTS_CAP is the OUTER bound: even if the agent
// keeps making file-edit progress every session, after this
// many total respawns without a commit we escalate — caught
// the "agent flaps between different edits but never
// commits" pattern that the progress-aware counter would
// never trigger.
const NO_PROGRESS_CAP: u32 = 3;
const TOTAL_ATTEMPTS_CAP: u32 = 8;
let current_fingerprint = worktree_path.as_deref().and_then(|p| {
std::process::Command::new("git")
@@ -108,6 +116,45 @@ impl AgentPool {
.unwrap_or(0)
+ 1
};
let total_attempts = crate::db::read_content(
crate::db::ContentKey::CommitRecoveryTotalAttempts(story_id),
)
.and_then(|s| s.trim().parse::<u32>().ok())
.unwrap_or(0)
+ 1;
if total_attempts >= TOTAL_ATTEMPTS_CAP {
// Outer cap reached: agent has been respawned too many
// times without ever committing. Block regardless of
// whether file-edit progress is still happening.
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryPending(
story_id,
));
crate::db::delete_content(
crate::db::ContentKey::CommitRecoveryDiffFingerprint(story_id),
);
crate::db::delete_content(
crate::db::ContentKey::CommitRecoveryTotalAttempts(story_id),
);
slog!(
"[pipeline] Coder '{agent_name}' for '{story_id}' hit total \
commit-recovery cap ({total_attempts}/{TOTAL_ATTEMPTS_CAP}) \
without a commit. Blocking story."
);
let reason = format!(
"agent flapped — {total_attempts} respawns without ever committing"
);
if let Err(e) =
crate::agents::lifecycle::transition_to_blocked(story_id, &reason)
{
slog_error!("[pipeline] Failed to block '{story_id}': {e}");
}
let _ = self.watcher_tx.send(WatcherEvent::StoryBlocked {
story_id: story_id.to_string(),
reason,
});
return;
}
if no_progress_count >= NO_PROGRESS_CAP {
// Cap reached → block for human attention.
@@ -117,6 +164,9 @@ impl AgentPool {
crate::db::delete_content(
crate::db::ContentKey::CommitRecoveryDiffFingerprint(story_id),
);
crate::db::delete_content(
crate::db::ContentKey::CommitRecoveryTotalAttempts(story_id),
);
slog!(
"[pipeline] Coder '{agent_name}' for '{story_id}' made no \
file-edit progress over {no_progress_count} consecutive \
@@ -148,10 +198,15 @@ impl AgentPool {
fp,
);
}
crate::db::write_content(
crate::db::ContentKey::CommitRecoveryTotalAttempts(story_id),
&total_attempts.to_string(),
);
slog!(
"[pipeline] Coder '{agent_name}' exited with uncommitted work \
for '{story_id}' (no-progress {no_progress_count}/\
{NO_PROGRESS_CAP}; progress_made={made_progress}). \
{NO_PROGRESS_CAP}, total {total_attempts}/\
{TOTAL_ATTEMPTS_CAP}; progress_made={made_progress}). \
Issuing commit-only respawn."
);
let addendum = "\n\nYou have uncommitted work in this worktree. \
@@ -1107,6 +1107,113 @@ stage = "coder"
);
}
/// Outer-cap path: even if the agent keeps making file-edit progress between
/// every commit-recovery respawn (different fingerprint each time), after
/// TOTAL_ATTEMPTS_CAP respawns without a commit the story must block. This
/// catches the "flapping agent" pattern where the progress-aware counter
/// would never trigger because the diff keeps changing.
#[tokio::test]
async fn total_attempts_cap_blocks_flapping_agent_without_commit() {
use std::fs;
let tmp = tempfile::tempdir().unwrap();
let root = tmp.path();
fs::create_dir_all(root.join(".huskies")).unwrap();
fs::write(
root.join(".huskies/project.toml"),
r#"
max_retries = 3
[[agent]]
name = "coder-1"
role = "Coder"
command = "echo"
args = ["noop"]
prompt = "test"
stage = "coder"
"#,
)
.unwrap();
crate::crdt_state::init_for_test();
crate::db::ensure_content_store();
crate::db::write_item_with_content(
"9956_story_flapper",
"2_current",
"---\nname: Flapper Test\n---\n",
crate::db::ItemMeta::named("Flapper Test"),
);
// Simulate 7 previous respawns (one short of the outer cap of 8). The
// no-progress counter is at 1 (the agent has been making progress every
// attempt) but total attempts is at the threshold.
crate::db::write_content(
crate::db::ContentKey::CommitRecoveryTotalAttempts("9956_story_flapper"),
"7",
);
crate::db::write_content(
crate::db::ContentKey::CommitRecoveryPending("9956_story_flapper"),
"1",
);
let pool = AgentPool::new_test(3001);
let mut rx = pool.watcher_tx.subscribe();
pool.run_pipeline_advance(
"9956_story_flapper",
"coder-1",
CompletionReport {
summary: "still no commit".to_string(),
gates_passed: false,
gate_output: "Worktree has uncommitted changes".to_string(),
needs_commit_recovery: true,
},
Some(root.to_path_buf()),
None,
false,
None,
)
.await;
// Outer cap reached (7 + 1 = 8) → block.
let mut got_blocked = false;
let mut block_reason = String::new();
while let Ok(evt) = rx.try_recv() {
if let WatcherEvent::StoryBlocked { story_id, reason } = evt
&& story_id == "9956_story_flapper"
{
got_blocked = true;
block_reason = reason;
break;
}
}
assert!(
got_blocked,
"Story must be blocked once total commit-recovery attempts hits the outer cap"
);
assert!(
block_reason.contains("flapped") && block_reason.contains("without ever committing"),
"Block reason should describe the flapping pattern, got: {block_reason}"
);
// All three recovery keys must be cleared after blocking.
assert!(
crate::db::read_content(crate::db::ContentKey::CommitRecoveryTotalAttempts(
"9956_story_flapper"
))
.is_none(),
"total_attempts key must be cleared after blocking"
);
assert!(
crate::db::read_content(crate::db::ContentKey::CommitRecoveryPending(
"9956_story_flapper"
))
.is_none(),
"commit_recovery_pending key must be cleared after blocking"
);
}
// ── bug 953: bug-645 path must not advance when feature branch has zero commits ──
/// Regression test for bug 953: when a coder agent exits with gates_passed=false