feat: progress-aware commit-recovery cap (no longer block on 2nd attempt)

The existing commit-recovery path blocked stories on the 2nd consecutive
exit-without-commit. For long sweep refactors (e.g. story 997, the typed
retries payload migration), claude-code's session-length boundary
naturally terminates the coder mid-sweep before it can commit — even
though substantial file-edit progress is being made each session. The
old cap-of-1 misclassified normal mid-flight progress as 'agent declined
to commit'.

New behaviour:
- Each commit-recovery respawn captures a worktree-diff byte-length
  fingerprint (git diff master | wc -c).
- If the fingerprint differs from the previous attempt the agent made
  file-edit progress, the no-progress counter resets to 1.
- If the fingerprint is byte-identical (no new edits between exits),
  increment the no-progress counter.
- Block only when the counter reaches NO_PROGRESS_CAP (3) — i.e. three
  consecutive respawns where the agent did literally nothing.

Adds ContentKey::CommitRecoveryDiffFingerprint to store the prior
fingerprint. Updates the existing block-test to reflect the new cap
semantics; existing 'first respawn issued' test continues to pass.

All 2935 tests pass.
This commit is contained in:
Timmy
2026-05-14 11:24:02 +01:00
parent 5e5c5a0e08
commit bab337b289
3 changed files with 100 additions and 32 deletions
+59 -17
View File
@@ -75,23 +75,57 @@ impl AgentPool {
} }
PipelineStage::Coder => { PipelineStage::Coder => {
if completion.needs_commit_recovery { if completion.needs_commit_recovery {
// The coder exited with uncommitted content but no commits. // The coder exited with uncommitted content but no commits
// Check if this is already a second recovery attempt (the // (typical "claude-code session boundary mid-sweep" pattern).
// first recovery respawn also produced no commits). // Use a PROGRESS-AWARE retry cap: the agent gets unlimited
if crate::db::read_content(crate::db::ContentKey::CommitRecoveryPending( // respawns as long as file edits keep growing between
story_id, // attempts; only when the worktree diff is byte-identical
)) // to the previous attempt do we count it as "no progress".
.is_some() // After NO_PROGRESS_CAP consecutive no-progress respawns,
{ // block for human attention.
// Second attempt still produced no commits → block. const NO_PROGRESS_CAP: u32 = 3;
let current_fingerprint = worktree_path.as_deref().and_then(|p| {
std::process::Command::new("git")
.args(["diff", "master"])
.current_dir(p)
.output()
.ok()
.map(|out| out.stdout.len().to_string())
});
let stored_fingerprint = crate::db::read_content(
crate::db::ContentKey::CommitRecoveryDiffFingerprint(story_id),
);
let made_progress = current_fingerprint.is_some()
&& stored_fingerprint.as_ref() != current_fingerprint.as_ref();
let no_progress_count = if made_progress || stored_fingerprint.is_none() {
1
} else {
crate::db::read_content(crate::db::ContentKey::CommitRecoveryPending(
story_id,
))
.and_then(|s| s.trim().parse::<u32>().ok())
.unwrap_or(0)
+ 1
};
if no_progress_count >= NO_PROGRESS_CAP {
// Cap reached → block for human attention.
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryPending( crate::db::delete_content(crate::db::ContentKey::CommitRecoveryPending(
story_id, story_id,
)); ));
slog!( crate::db::delete_content(
"[pipeline] Coder '{agent_name}' (commit-recovery respawn) \ crate::db::ContentKey::CommitRecoveryDiffFingerprint(story_id),
still produced no commits for '{story_id}'. Blocking story." );
slog!(
"[pipeline] Coder '{agent_name}' for '{story_id}' made no \
file-edit progress over {no_progress_count} consecutive \
commit-recovery respawns. Blocking story."
);
let reason = format!(
"agent stuck — {no_progress_count} respawns without commits or \
new file edits"
); );
let reason = "agent declined to commit recoverable work".to_string();
if let Err(e) = if let Err(e) =
crate::agents::lifecycle::transition_to_blocked(story_id, &reason) crate::agents::lifecycle::transition_to_blocked(story_id, &reason)
{ {
@@ -102,15 +136,23 @@ impl AgentPool {
reason, reason,
}); });
} else { } else {
// First occurrence: issue a commit-only recovery respawn. // Below cap: respawn with commit-only prompt. Does NOT
// This does NOT consume a retry_count slot. // consume a retry_count slot.
crate::db::write_content( crate::db::write_content(
crate::db::ContentKey::CommitRecoveryPending(story_id), crate::db::ContentKey::CommitRecoveryPending(story_id),
"1", &no_progress_count.to_string(),
); );
if let Some(ref fp) = current_fingerprint {
crate::db::write_content(
crate::db::ContentKey::CommitRecoveryDiffFingerprint(story_id),
fp,
);
}
slog!( slog!(
"[pipeline] Coder '{agent_name}' exited with uncommitted work \ "[pipeline] Coder '{agent_name}' exited with uncommitted work \
for '{story_id}'. Issuing commit-only recovery respawn." for '{story_id}' (no-progress {no_progress_count}/\
{NO_PROGRESS_CAP}; progress_made={made_progress}). \
Issuing commit-only respawn."
); );
let addendum = "\n\nYou have uncommitted work in this worktree. \ let addendum = "\n\nYou have uncommitted work in this worktree. \
Your only task this session is run_tests → git_add → git_commit. \ Your only task this session is run_tests → git_add → git_commit. \
@@ -991,10 +991,12 @@ stage = "coder"
); );
} }
/// AC3: when the commit-recovery respawn also exits with `needs_commit_recovery=true`, /// AC3: when consecutive commit-recovery respawns make NO file-edit progress
/// the story moves to `blocked` with reason "agent declined to commit recoverable work". /// (worktree diff byte-identical across attempts), the story moves to `blocked`
/// after the no-progress cap is hit. The agent gets unlimited respawns while
/// progress is being made, only stalling triggers escalation.
#[tokio::test] #[tokio::test]
async fn second_commit_recovery_failure_blocks_story() { async fn no_progress_commit_recovery_blocks_story_at_cap() {
use std::fs; use std::fs;
let tmp = tempfile::tempdir().unwrap(); let tmp = tempfile::tempdir().unwrap();
@@ -1026,11 +1028,17 @@ stage = "coder"
crate::db::ItemMeta::named("Recovery2 Test"), crate::db::ItemMeta::named("Recovery2 Test"),
); );
// Simulate the recovery key already being set (first recovery respawn was // Simulate two previous consecutive no-progress respawns: counter=2 and a
// issued previously). // fingerprint stored that matches what the current (worktree-less) attempt
// will produce (None vs Some(stored) differ, but the path with stored=Some
// and current=None enters the else branch where we increment the counter).
crate::db::write_content( crate::db::write_content(
crate::db::ContentKey::CommitRecoveryPending("9955_story_recovery2"), crate::db::ContentKey::CommitRecoveryPending("9955_story_recovery2"),
"1", "2",
);
crate::db::write_content(
crate::db::ContentKey::CommitRecoveryDiffFingerprint("9955_story_recovery2"),
"0",
); );
let pool = AgentPool::new_test(3001); let pool = AgentPool::new_test(3001);
@@ -1052,7 +1060,7 @@ stage = "coder"
) )
.await; .await;
// The story must be blocked (not retried again). // The story must be blocked once the cap is reached (counter 2 + 1 = 3).
let mut got_blocked = false; let mut got_blocked = false;
let mut block_reason = String::new(); let mut block_reason = String::new();
while let Ok(evt) = rx.try_recv() { while let Ok(evt) = rx.try_recv() {
@@ -1066,23 +1074,30 @@ stage = "coder"
} }
assert!( assert!(
got_blocked, got_blocked,
"Story must be blocked when commit-recovery respawn also produces no commits (AC 3)" "Story must be blocked after NO_PROGRESS_CAP consecutive no-progress respawns"
); );
assert_eq!( assert!(
block_reason, "agent declined to commit recoverable work", block_reason.contains("without commits or new file edits"),
"Block reason must match AC 3 spec" "Block reason should describe the no-progress condition, got: {block_reason}"
); );
// The recovery key must be cleared after blocking. // Both recovery keys must be cleared after blocking.
assert!( assert!(
crate::db::read_content(crate::db::ContentKey::CommitRecoveryPending( crate::db::read_content(crate::db::ContentKey::CommitRecoveryPending(
"9955_story_recovery2" "9955_story_recovery2"
)) ))
.is_none(), .is_none(),
"commit_recovery_pending key must be cleared after blocking the story" "commit_recovery_pending key must be cleared after blocking"
);
assert!(
crate::db::read_content(crate::db::ContentKey::CommitRecoveryDiffFingerprint(
"9955_story_recovery2"
))
.is_none(),
"commit_recovery_diff_fingerprint key must be cleared after blocking"
); );
// retry_count must NOT have been incremented (AC 2: recovery never consumes a slot). // retry_count must NOT have been incremented (recovery never consumes a slot).
let item = crate::crdt_state::read_item("9955_story_recovery2").expect("story must be in CRDT"); let item = crate::crdt_state::read_item("9955_story_recovery2").expect("story must be in CRDT");
assert_eq!( assert_eq!(
item.retry_count(), item.retry_count(),
+12 -1
View File
@@ -24,8 +24,16 @@ pub enum ContentKey<'a> {
MergeMasterSpawnCount(&'a str), MergeMasterSpawnCount(&'a str),
/// Evidence that `run_tests` passed during an agent session. /// Evidence that `run_tests` passed during an agent session.
RunTestsOk(&'a str), RunTestsOk(&'a str),
/// Flag indicating a commit-recovery respawn is in progress. /// Flag indicating a commit-recovery respawn is in progress. Stored as
/// a decimal string counting consecutive respawns that made NO file-edit
/// progress (worktree diff byte-identical to the previous attempt). Reset
/// to "1" whenever a respawn produces a different diff fingerprint.
CommitRecoveryPending(&'a str), CommitRecoveryPending(&'a str),
/// Worktree diff byte-length captured at the last commit-recovery respawn
/// trigger. Used to detect whether the agent made any file-edit progress
/// between consecutive session-boundary-clean exits. Same byte length on
/// two consecutive attempts → no progress → increment CommitRecoveryPending.
CommitRecoveryDiffFingerprint(&'a str),
/// Flag indicating a merge gate fixup coder session is in progress. /// Flag indicating a merge gate fixup coder session is in progress.
/// ///
/// Set when the merge gate fails with a self-evident-fix class of failure /// Set when the merge gate fails with a self-evident-fix class of failure
@@ -57,6 +65,9 @@ impl<'a> ContentKey<'a> {
ContentKey::MergeMasterSpawnCount(id) => format!("{id}:mergemaster_spawn_count"), ContentKey::MergeMasterSpawnCount(id) => format!("{id}:mergemaster_spawn_count"),
ContentKey::RunTestsOk(id) => format!("{id}:run_tests_ok"), ContentKey::RunTestsOk(id) => format!("{id}:run_tests_ok"),
ContentKey::CommitRecoveryPending(id) => format!("{id}:commit_recovery_pending"), ContentKey::CommitRecoveryPending(id) => format!("{id}:commit_recovery_pending"),
ContentKey::CommitRecoveryDiffFingerprint(id) => {
format!("{id}:commit_recovery_diff_fingerprint")
}
ContentKey::MergeFixupPending(id) => format!("{id}:merge_fixup_pending"), ContentKey::MergeFixupPending(id) => format!("{id}:merge_fixup_pending"),
ContentKey::MergeFailureKind(id) => format!("{id}:merge_failure_kind"), ContentKey::MergeFailureKind(id) => format!("{id}:merge_failure_kind"),
ContentKey::MergeSuccess(id) => format!("{id}:merge_success"), ContentKey::MergeSuccess(id) => format!("{id}:merge_success"),