feat: progress-aware commit-recovery cap (no longer block on 2nd attempt)

The existing commit-recovery path blocked stories on the 2nd consecutive
exit-without-commit. For long sweep refactors (e.g. story 997, the typed
retries payload migration), claude-code's session-length boundary
naturally terminates the coder mid-sweep before it can commit — even
though substantial file-edit progress is being made each session. The
old cap-of-1 misclassified normal mid-flight progress as 'agent declined
to commit'.

New behaviour:
- Each commit-recovery respawn captures a worktree-diff byte-length
  fingerprint (git diff master | wc -c).
- If the fingerprint differs from the previous attempt (i.e. the agent
  made file-edit progress), the no-progress counter resets to 1.
- If the fingerprint is unchanged (same diff byte length, treated as no
  new edits between exits), increment the no-progress counter.
- Block only when the counter reaches NO_PROGRESS_CAP (3) — i.e. three
  consecutive uncommitted exits, the last two of which showed no
  change in the diff fingerprint.

Adds ContentKey::CommitRecoveryDiffFingerprint to store the prior
fingerprint. Updates the existing block-test to reflect the new cap
semantics; existing 'first respawn issued' test continues to pass.

All 2935 tests pass.
This commit is contained in:
Timmy
2026-05-14 11:24:02 +01:00
parent 5e5c5a0e08
commit bab337b289
3 changed files with 100 additions and 32 deletions
+57 -15
View File
@@ -75,23 +75,57 @@ impl AgentPool {
}
PipelineStage::Coder => {
if completion.needs_commit_recovery {
// The coder exited with uncommitted content but no commits.
// Check if this is already a second recovery attempt (the
// first recovery respawn also produced no commits).
if crate::db::read_content(crate::db::ContentKey::CommitRecoveryPending(
// The coder exited with uncommitted content but no commits
// (typical "claude-code session boundary mid-sweep" pattern).
// Use a PROGRESS-AWARE retry cap: the agent gets unlimited
// respawns as long as the worktree diff changes size between
// attempts; only when its byte length equals that of the
// previous attempt do we count it as "no progress".
// After NO_PROGRESS_CAP consecutive no-progress respawns,
// block for human attention.
const NO_PROGRESS_CAP: u32 = 3;
let current_fingerprint = worktree_path.as_deref().and_then(|p| {
std::process::Command::new("git")
.args(["diff", "master"])
.current_dir(p)
.output()
.ok()
.map(|out| out.stdout.len().to_string())
});
let stored_fingerprint = crate::db::read_content(
crate::db::ContentKey::CommitRecoveryDiffFingerprint(story_id),
);
let made_progress = current_fingerprint.is_some()
&& stored_fingerprint.as_ref() != current_fingerprint.as_ref();
let no_progress_count = if made_progress || stored_fingerprint.is_none() {
1
} else {
crate::db::read_content(crate::db::ContentKey::CommitRecoveryPending(
story_id,
))
.is_some()
{
// Second attempt still produced no commits → block.
.and_then(|s| s.trim().parse::<u32>().ok())
.unwrap_or(0)
+ 1
};
if no_progress_count >= NO_PROGRESS_CAP {
// Cap reached → block for human attention.
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryPending(
story_id,
));
slog!(
"[pipeline] Coder '{agent_name}' (commit-recovery respawn) \
still produced no commits for '{story_id}'. Blocking story."
crate::db::delete_content(
crate::db::ContentKey::CommitRecoveryDiffFingerprint(story_id),
);
slog!(
"[pipeline] Coder '{agent_name}' for '{story_id}' made no \
file-edit progress over {no_progress_count} consecutive \
commit-recovery respawns. Blocking story."
);
let reason = format!(
"agent stuck — {no_progress_count} respawns without commits or \
new file edits"
);
let reason = "agent declined to commit recoverable work".to_string();
if let Err(e) =
crate::agents::lifecycle::transition_to_blocked(story_id, &reason)
{
@@ -102,15 +136,23 @@ impl AgentPool {
reason,
});
} else {
// First occurrence: issue a commit-only recovery respawn.
// This does NOT consume a retry_count slot.
// Below cap: respawn with commit-only prompt. Does NOT
// consume a retry_count slot.
crate::db::write_content(
crate::db::ContentKey::CommitRecoveryPending(story_id),
"1",
&no_progress_count.to_string(),
);
if let Some(ref fp) = current_fingerprint {
crate::db::write_content(
crate::db::ContentKey::CommitRecoveryDiffFingerprint(story_id),
fp,
);
}
slog!(
"[pipeline] Coder '{agent_name}' exited with uncommitted work \
for '{story_id}'. Issuing commit-only recovery respawn."
for '{story_id}' (no-progress {no_progress_count}/\
{NO_PROGRESS_CAP}; progress_made={made_progress}). \
Issuing commit-only respawn."
);
let addendum = "\n\nYou have uncommitted work in this worktree. \
Your only task this session is run_tests → git_add → git_commit. \
@@ -991,10 +991,12 @@ stage = "coder"
);
}
/// AC3: when the commit-recovery respawn also exits with `needs_commit_recovery=true`,
/// the story moves to `blocked` with reason "agent declined to commit recoverable work".
/// AC3: when consecutive commit-recovery respawns make NO file-edit progress
/// (worktree diff byte length unchanged across attempts), the story moves to
/// `blocked` after the no-progress cap is hit. The agent gets unlimited respawns
/// while progress is being made; only stalling triggers escalation.
#[tokio::test]
async fn second_commit_recovery_failure_blocks_story() {
async fn no_progress_commit_recovery_blocks_story_at_cap() {
use std::fs;
let tmp = tempfile::tempdir().unwrap();
@@ -1026,11 +1028,17 @@ stage = "coder"
crate::db::ItemMeta::named("Recovery2 Test"),
);
// Simulate the recovery key already being set (first recovery respawn was
// issued previously).
// Simulate two previous consecutive no-progress respawns: counter=2 and a
// stored fingerprint. The current (worktree-less) attempt yields no
// fingerprint (None); progress is only recorded when a current fingerprint
// exists, so with stored=Some we take the else branch and increment the counter.
crate::db::write_content(
crate::db::ContentKey::CommitRecoveryPending("9955_story_recovery2"),
"1",
"2",
);
crate::db::write_content(
crate::db::ContentKey::CommitRecoveryDiffFingerprint("9955_story_recovery2"),
"0",
);
let pool = AgentPool::new_test(3001);
@@ -1052,7 +1060,7 @@ stage = "coder"
)
.await;
// The story must be blocked (not retried again).
// The story must be blocked once the cap is reached (counter 2 + 1 = 3).
let mut got_blocked = false;
let mut block_reason = String::new();
while let Ok(evt) = rx.try_recv() {
@@ -1066,23 +1074,30 @@ stage = "coder"
}
assert!(
got_blocked,
"Story must be blocked when commit-recovery respawn also produces no commits (AC 3)"
"Story must be blocked after NO_PROGRESS_CAP consecutive no-progress respawns"
);
assert_eq!(
block_reason, "agent declined to commit recoverable work",
"Block reason must match AC 3 spec"
assert!(
block_reason.contains("without commits or new file edits"),
"Block reason should describe the no-progress condition, got: {block_reason}"
);
// The recovery key must be cleared after blocking.
// Both recovery keys must be cleared after blocking.
assert!(
crate::db::read_content(crate::db::ContentKey::CommitRecoveryPending(
"9955_story_recovery2"
))
.is_none(),
"commit_recovery_pending key must be cleared after blocking the story"
"commit_recovery_pending key must be cleared after blocking"
);
assert!(
crate::db::read_content(crate::db::ContentKey::CommitRecoveryDiffFingerprint(
"9955_story_recovery2"
))
.is_none(),
"commit_recovery_diff_fingerprint key must be cleared after blocking"
);
// retry_count must NOT have been incremented (AC 2: recovery never consumes a slot).
// retry_count must NOT have been incremented (recovery never consumes a slot).
let item = crate::crdt_state::read_item("9955_story_recovery2").expect("story must be in CRDT");
assert_eq!(
item.retry_count(),
+12 -1
View File
@@ -24,8 +24,16 @@ pub enum ContentKey<'a> {
MergeMasterSpawnCount(&'a str),
/// Evidence that `run_tests` passed during an agent session.
RunTestsOk(&'a str),
/// Flag indicating a commit-recovery respawn is in progress.
/// Flag indicating a commit-recovery respawn is in progress. Stored as
/// a decimal string counting consecutive respawns that made NO file-edit
/// progress (worktree diff byte-identical to the previous attempt). Reset
/// to "1" whenever a respawn produces a different diff fingerprint.
CommitRecoveryPending(&'a str),
/// Worktree diff byte-length captured at the last commit-recovery respawn
/// trigger. Used to detect whether the agent made any file-edit progress
/// between consecutive session-boundary-clean exits. Same byte length on
/// two consecutive attempts → no progress → increment CommitRecoveryPending.
CommitRecoveryDiffFingerprint(&'a str),
/// Flag indicating a merge gate fixup coder session is in progress.
///
/// Set when the merge gate fails with a self-evident-fix class of failure
@@ -57,6 +65,9 @@ impl<'a> ContentKey<'a> {
ContentKey::MergeMasterSpawnCount(id) => format!("{id}:mergemaster_spawn_count"),
ContentKey::RunTestsOk(id) => format!("{id}:run_tests_ok"),
ContentKey::CommitRecoveryPending(id) => format!("{id}:commit_recovery_pending"),
ContentKey::CommitRecoveryDiffFingerprint(id) => {
format!("{id}:commit_recovery_diff_fingerprint")
}
ContentKey::MergeFixupPending(id) => format!("{id}:merge_fixup_pending"),
ContentKey::MergeFailureKind(id) => format!("{id}:merge_failure_kind"),
ContentKey::MergeSuccess(id) => format!("{id}:merge_success"),