feat: outer cap on commit-recovery respawns catches flapping agents
The progress-aware no-progress cap (3 consecutive byte-identical diffs) doesn't catch the degenerate pattern where the agent keeps making DIFFERENT file edits each session but never commits: every respawn resets the no-progress counter, so the loop never terminates and the budget burns.

Adds ContentKey::CommitRecoveryTotalAttempts: an absolute counter that increments on every commit-recovery respawn regardless of progress. TOTAL_ATTEMPTS_CAP = 8; when hit, the story is blocked with reason 'agent flapped — N respawns without ever committing'.

Two caps now bound the recovery loop:
- NO_PROGRESS_CAP (3): catches stuck-agent (same diff repeatedly)
- TOTAL_ATTEMPTS_CAP (8): catches flapping-agent (different diffs, no commits)

The constant is easy to tune lower if we see runaway in practice. All 2936 tests pass.
This commit is contained in:
@@ -83,7 +83,15 @@ impl AgentPool {
|
||||
// to the previous attempt do we count it as "no progress".
|
||||
// After NO_PROGRESS_CAP consecutive no-progress respawns,
|
||||
// block for human attention.
|
||||
//
|
||||
// TOTAL_ATTEMPTS_CAP is the OUTER bound: even if the agent
|
||||
// keeps making file-edit progress every session, after this
|
||||
// many total respawns without a commit we escalate — this catches
|
||||
// the "agent flaps between different edits but never
|
||||
// commits" pattern that the progress-aware counter would
|
||||
// never trigger on.
|
||||
const NO_PROGRESS_CAP: u32 = 3;
|
||||
const TOTAL_ATTEMPTS_CAP: u32 = 8;
|
||||
|
||||
let current_fingerprint = worktree_path.as_deref().and_then(|p| {
|
||||
std::process::Command::new("git")
|
||||
@@ -108,6 +116,45 @@ impl AgentPool {
|
||||
.unwrap_or(0)
|
||||
+ 1
|
||||
};
|
||||
let total_attempts = crate::db::read_content(
|
||||
crate::db::ContentKey::CommitRecoveryTotalAttempts(story_id),
|
||||
)
|
||||
.and_then(|s| s.trim().parse::<u32>().ok())
|
||||
.unwrap_or(0)
|
||||
+ 1;
|
||||
|
||||
if total_attempts >= TOTAL_ATTEMPTS_CAP {
|
||||
// Outer cap reached: agent has been respawned too many
|
||||
// times without ever committing. Block regardless of
|
||||
// whether file-edit progress is still happening.
|
||||
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryPending(
|
||||
story_id,
|
||||
));
|
||||
crate::db::delete_content(
|
||||
crate::db::ContentKey::CommitRecoveryDiffFingerprint(story_id),
|
||||
);
|
||||
crate::db::delete_content(
|
||||
crate::db::ContentKey::CommitRecoveryTotalAttempts(story_id),
|
||||
);
|
||||
slog!(
|
||||
"[pipeline] Coder '{agent_name}' for '{story_id}' hit total \
|
||||
commit-recovery cap ({total_attempts}/{TOTAL_ATTEMPTS_CAP}) \
|
||||
without a commit. Blocking story."
|
||||
);
|
||||
let reason = format!(
|
||||
"agent flapped — {total_attempts} respawns without ever committing"
|
||||
);
|
||||
if let Err(e) =
|
||||
crate::agents::lifecycle::transition_to_blocked(story_id, &reason)
|
||||
{
|
||||
slog_error!("[pipeline] Failed to block '{story_id}': {e}");
|
||||
}
|
||||
let _ = self.watcher_tx.send(WatcherEvent::StoryBlocked {
|
||||
story_id: story_id.to_string(),
|
||||
reason,
|
||||
});
|
||||
return;
|
||||
}
|
||||
|
||||
if no_progress_count >= NO_PROGRESS_CAP {
|
||||
// Cap reached → block for human attention.
|
||||
@@ -117,6 +164,9 @@ impl AgentPool {
|
||||
crate::db::delete_content(
|
||||
crate::db::ContentKey::CommitRecoveryDiffFingerprint(story_id),
|
||||
);
|
||||
crate::db::delete_content(
|
||||
crate::db::ContentKey::CommitRecoveryTotalAttempts(story_id),
|
||||
);
|
||||
slog!(
|
||||
"[pipeline] Coder '{agent_name}' for '{story_id}' made no \
|
||||
file-edit progress over {no_progress_count} consecutive \
|
||||
@@ -148,10 +198,15 @@ impl AgentPool {
|
||||
fp,
|
||||
);
|
||||
}
|
||||
crate::db::write_content(
|
||||
crate::db::ContentKey::CommitRecoveryTotalAttempts(story_id),
|
||||
&total_attempts.to_string(),
|
||||
);
|
||||
slog!(
|
||||
"[pipeline] Coder '{agent_name}' exited with uncommitted work \
|
||||
for '{story_id}' (no-progress {no_progress_count}/\
|
||||
{NO_PROGRESS_CAP}; progress_made={made_progress}). \
|
||||
{NO_PROGRESS_CAP}, total {total_attempts}/\
|
||||
{TOTAL_ATTEMPTS_CAP}; progress_made={made_progress}). \
|
||||
Issuing commit-only respawn."
|
||||
);
|
||||
let addendum = "\n\nYou have uncommitted work in this worktree. \
|
||||
|
||||
Reference in New Issue
Block a user