feat: outer cap on commit-recovery respawns catches flapping agents

The progress-aware no-progress cap (3 consecutive byte-identical diffs)
doesn't catch the degenerate pattern where the agent keeps making
DIFFERENT file edits each session but never commits — every respawn
resets the no-progress counter, infinite loop, budget burns.

Adds ContentKey::CommitRecoveryTotalAttempts: an absolute counter that
increments on every commit-recovery respawn regardless of progress.
TOTAL_ATTEMPTS_CAP = 8; when hit, block with reason 'agent flapped — N
respawns without ever committing'.

Two caps now bound the recovery loop:
- NO_PROGRESS_CAP (3): catches stuck-agent (same diff repeatedly)
- TOTAL_ATTEMPTS_CAP (8): catches flapping-agent (different diffs, no commits)

Easy to tune the constant lower if we see runaway in practice.
All 2936 tests pass.
This commit is contained in:
Timmy
2026-05-14 11:34:17 +01:00
parent bab337b289
commit 0572af2193
3 changed files with 172 additions and 1 deletions
+56 -1
View File
@@ -83,7 +83,15 @@ impl AgentPool {
// to the previous attempt do we count it as "no progress".
// After NO_PROGRESS_CAP consecutive no-progress respawns,
// block for human attention.
//
// TOTAL_ATTEMPTS_CAP is the OUTER bound: even if the agent
// keeps making file-edit progress every session, after this
// many total respawns without a commit we escalate — caught
// the "agent flaps between different edits but never
// commits" pattern that the progress-aware counter would
// never trigger.
const NO_PROGRESS_CAP: u32 = 3;
const TOTAL_ATTEMPTS_CAP: u32 = 8;
let current_fingerprint = worktree_path.as_deref().and_then(|p| {
std::process::Command::new("git")
@@ -108,6 +116,45 @@ impl AgentPool {
.unwrap_or(0)
+ 1
};
let total_attempts = crate::db::read_content(
crate::db::ContentKey::CommitRecoveryTotalAttempts(story_id),
)
.and_then(|s| s.trim().parse::<u32>().ok())
.unwrap_or(0)
+ 1;
if total_attempts >= TOTAL_ATTEMPTS_CAP {
// Outer cap reached: agent has been respawned too many
// times without ever committing. Block regardless of
// whether file-edit progress is still happening.
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryPending(
story_id,
));
crate::db::delete_content(
crate::db::ContentKey::CommitRecoveryDiffFingerprint(story_id),
);
crate::db::delete_content(
crate::db::ContentKey::CommitRecoveryTotalAttempts(story_id),
);
slog!(
"[pipeline] Coder '{agent_name}' for '{story_id}' hit total \
commit-recovery cap ({total_attempts}/{TOTAL_ATTEMPTS_CAP}) \
without a commit. Blocking story."
);
let reason = format!(
"agent flapped — {total_attempts} respawns without ever committing"
);
if let Err(e) =
crate::agents::lifecycle::transition_to_blocked(story_id, &reason)
{
slog_error!("[pipeline] Failed to block '{story_id}': {e}");
}
let _ = self.watcher_tx.send(WatcherEvent::StoryBlocked {
story_id: story_id.to_string(),
reason,
});
return;
}
if no_progress_count >= NO_PROGRESS_CAP {
// Cap reached → block for human attention.
@@ -117,6 +164,9 @@ impl AgentPool {
crate::db::delete_content(
crate::db::ContentKey::CommitRecoveryDiffFingerprint(story_id),
);
crate::db::delete_content(
crate::db::ContentKey::CommitRecoveryTotalAttempts(story_id),
);
slog!(
"[pipeline] Coder '{agent_name}' for '{story_id}' made no \
file-edit progress over {no_progress_count} consecutive \
@@ -148,10 +198,15 @@ impl AgentPool {
fp,
);
}
crate::db::write_content(
crate::db::ContentKey::CommitRecoveryTotalAttempts(story_id),
&total_attempts.to_string(),
);
slog!(
"[pipeline] Coder '{agent_name}' exited with uncommitted work \
for '{story_id}' (no-progress {no_progress_count}/\
{NO_PROGRESS_CAP}; progress_made={made_progress}). \
{NO_PROGRESS_CAP}, total {total_attempts}/\
{TOTAL_ATTEMPTS_CAP}; progress_made={made_progress}). \
Issuing commit-only respawn."
);
let addendum = "\n\nYou have uncommitted work in this worktree. \