feat: progress-aware commit-recovery cap (no longer block on 2nd attempt)
The existing commit-recovery path blocked stories on the 2nd consecutive exit-without-commit. For long sweep refactors (e.g. story 997, the typed retries payload migration), claude-code's session-length boundary naturally terminates the coder mid-sweep before it can commit — even though substantial file-edit progress is being made each session. The old cap-of-1 misclassified normal mid-flight progress as 'agent declined to commit'. New behaviour: - Each commit-recovery respawn captures a worktree-diff byte-length fingerprint (git diff master | wc -c). - If the fingerprint differs from the previous attempt the agent made file-edit progress, the no-progress counter resets to 1. - If the fingerprint is byte-identical (no new edits between exits), increment the no-progress counter. - Block only when the counter reaches NO_PROGRESS_CAP (3) — i.e. three consecutive respawns where the agent did literally nothing. Adds ContentKey::CommitRecoveryDiffFingerprint to store the prior fingerprint. Updates the existing block-test to reflect the new cap semantics; existing 'first respawn issued' test continues to pass. All 2935 tests pass.
This commit is contained in:
@@ -75,23 +75,57 @@ impl AgentPool {
|
||||
}
|
||||
PipelineStage::Coder => {
|
||||
if completion.needs_commit_recovery {
|
||||
// The coder exited with uncommitted content but no commits.
|
||||
// Check if this is already a second recovery attempt (the
|
||||
// first recovery respawn also produced no commits).
|
||||
if crate::db::read_content(crate::db::ContentKey::CommitRecoveryPending(
|
||||
story_id,
|
||||
))
|
||||
.is_some()
|
||||
{
|
||||
// Second attempt still produced no commits → block.
|
||||
// The coder exited with uncommitted content but no commits
|
||||
// (typical "claude-code session boundary mid-sweep" pattern).
|
||||
// Use a PROGRESS-AWARE retry cap: the agent gets unlimited
|
||||
// respawns as long as file edits keep growing between
|
||||
// attempts; only when the worktree diff is byte-identical
|
||||
// to the previous attempt do we count it as "no progress".
|
||||
// After NO_PROGRESS_CAP consecutive no-progress respawns,
|
||||
// block for human attention.
|
||||
const NO_PROGRESS_CAP: u32 = 3;
|
||||
|
||||
let current_fingerprint = worktree_path.as_deref().and_then(|p| {
|
||||
std::process::Command::new("git")
|
||||
.args(["diff", "master"])
|
||||
.current_dir(p)
|
||||
.output()
|
||||
.ok()
|
||||
.map(|out| out.stdout.len().to_string())
|
||||
});
|
||||
let stored_fingerprint = crate::db::read_content(
|
||||
crate::db::ContentKey::CommitRecoveryDiffFingerprint(story_id),
|
||||
);
|
||||
let made_progress = current_fingerprint.is_some()
|
||||
&& stored_fingerprint.as_ref() != current_fingerprint.as_ref();
|
||||
let no_progress_count = if made_progress || stored_fingerprint.is_none() {
|
||||
1
|
||||
} else {
|
||||
crate::db::read_content(crate::db::ContentKey::CommitRecoveryPending(
|
||||
story_id,
|
||||
))
|
||||
.and_then(|s| s.trim().parse::<u32>().ok())
|
||||
.unwrap_or(0)
|
||||
+ 1
|
||||
};
|
||||
|
||||
if no_progress_count >= NO_PROGRESS_CAP {
|
||||
// Cap reached → block for human attention.
|
||||
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryPending(
|
||||
story_id,
|
||||
));
|
||||
slog!(
|
||||
"[pipeline] Coder '{agent_name}' (commit-recovery respawn) \
|
||||
still produced no commits for '{story_id}'. Blocking story."
|
||||
crate::db::delete_content(
|
||||
crate::db::ContentKey::CommitRecoveryDiffFingerprint(story_id),
|
||||
);
|
||||
slog!(
|
||||
"[pipeline] Coder '{agent_name}' for '{story_id}' made no \
|
||||
file-edit progress over {no_progress_count} consecutive \
|
||||
commit-recovery respawns. Blocking story."
|
||||
);
|
||||
let reason = format!(
|
||||
"agent stuck — {no_progress_count} respawns without commits or \
|
||||
new file edits"
|
||||
);
|
||||
let reason = "agent declined to commit recoverable work".to_string();
|
||||
if let Err(e) =
|
||||
crate::agents::lifecycle::transition_to_blocked(story_id, &reason)
|
||||
{
|
||||
@@ -102,15 +136,23 @@ impl AgentPool {
|
||||
reason,
|
||||
});
|
||||
} else {
|
||||
// First occurrence: issue a commit-only recovery respawn.
|
||||
// This does NOT consume a retry_count slot.
|
||||
// Below cap: respawn with commit-only prompt. Does NOT
|
||||
// consume a retry_count slot.
|
||||
crate::db::write_content(
|
||||
crate::db::ContentKey::CommitRecoveryPending(story_id),
|
||||
"1",
|
||||
&no_progress_count.to_string(),
|
||||
);
|
||||
if let Some(ref fp) = current_fingerprint {
|
||||
crate::db::write_content(
|
||||
crate::db::ContentKey::CommitRecoveryDiffFingerprint(story_id),
|
||||
fp,
|
||||
);
|
||||
}
|
||||
slog!(
|
||||
"[pipeline] Coder '{agent_name}' exited with uncommitted work \
|
||||
for '{story_id}'. Issuing commit-only recovery respawn."
|
||||
for '{story_id}' (no-progress {no_progress_count}/\
|
||||
{NO_PROGRESS_CAP}; progress_made={made_progress}). \
|
||||
Issuing commit-only respawn."
|
||||
);
|
||||
let addendum = "\n\nYou have uncommitted work in this worktree. \
|
||||
Your only task this session is run_tests → git_add → git_commit. \
|
||||
|
||||
@@ -991,10 +991,12 @@ stage = "coder"
|
||||
);
|
||||
}
|
||||
|
||||
/// AC3: when the commit-recovery respawn also exits with `needs_commit_recovery=true`,
|
||||
/// the story moves to `blocked` with reason "agent declined to commit recoverable work".
|
||||
/// AC3: when consecutive commit-recovery respawns make NO file-edit progress
|
||||
/// (worktree diff byte-identical across attempts), the story moves to `blocked`
|
||||
/// after the no-progress cap is hit. The agent gets unlimited respawns while
|
||||
/// progress is being made, only stalling triggers escalation.
|
||||
#[tokio::test]
|
||||
async fn second_commit_recovery_failure_blocks_story() {
|
||||
async fn no_progress_commit_recovery_blocks_story_at_cap() {
|
||||
use std::fs;
|
||||
|
||||
let tmp = tempfile::tempdir().unwrap();
|
||||
@@ -1026,11 +1028,17 @@ stage = "coder"
|
||||
crate::db::ItemMeta::named("Recovery2 Test"),
|
||||
);
|
||||
|
||||
// Simulate the recovery key already being set (first recovery respawn was
|
||||
// issued previously).
|
||||
// Simulate two previous consecutive no-progress respawns: counter=2 and a
|
||||
// fingerprint stored that matches what the current (worktree-less) attempt
|
||||
// will produce (None vs Some(stored) differ, but the path with stored=Some
|
||||
// and current=None enters the else branch where we increment the counter).
|
||||
crate::db::write_content(
|
||||
crate::db::ContentKey::CommitRecoveryPending("9955_story_recovery2"),
|
||||
"1",
|
||||
"2",
|
||||
);
|
||||
crate::db::write_content(
|
||||
crate::db::ContentKey::CommitRecoveryDiffFingerprint("9955_story_recovery2"),
|
||||
"0",
|
||||
);
|
||||
|
||||
let pool = AgentPool::new_test(3001);
|
||||
@@ -1052,7 +1060,7 @@ stage = "coder"
|
||||
)
|
||||
.await;
|
||||
|
||||
// The story must be blocked (not retried again).
|
||||
// The story must be blocked once the cap is reached (counter 2 + 1 = 3).
|
||||
let mut got_blocked = false;
|
||||
let mut block_reason = String::new();
|
||||
while let Ok(evt) = rx.try_recv() {
|
||||
@@ -1066,23 +1074,30 @@ stage = "coder"
|
||||
}
|
||||
assert!(
|
||||
got_blocked,
|
||||
"Story must be blocked when commit-recovery respawn also produces no commits (AC 3)"
|
||||
"Story must be blocked after NO_PROGRESS_CAP consecutive no-progress respawns"
|
||||
);
|
||||
assert_eq!(
|
||||
block_reason, "agent declined to commit recoverable work",
|
||||
"Block reason must match AC 3 spec"
|
||||
assert!(
|
||||
block_reason.contains("without commits or new file edits"),
|
||||
"Block reason should describe the no-progress condition, got: {block_reason}"
|
||||
);
|
||||
|
||||
// The recovery key must be cleared after blocking.
|
||||
// Both recovery keys must be cleared after blocking.
|
||||
assert!(
|
||||
crate::db::read_content(crate::db::ContentKey::CommitRecoveryPending(
|
||||
"9955_story_recovery2"
|
||||
))
|
||||
.is_none(),
|
||||
"commit_recovery_pending key must be cleared after blocking the story"
|
||||
"commit_recovery_pending key must be cleared after blocking"
|
||||
);
|
||||
assert!(
|
||||
crate::db::read_content(crate::db::ContentKey::CommitRecoveryDiffFingerprint(
|
||||
"9955_story_recovery2"
|
||||
))
|
||||
.is_none(),
|
||||
"commit_recovery_diff_fingerprint key must be cleared after blocking"
|
||||
);
|
||||
|
||||
// retry_count must NOT have been incremented (AC 2: recovery never consumes a slot).
|
||||
// retry_count must NOT have been incremented (recovery never consumes a slot).
|
||||
let item = crate::crdt_state::read_item("9955_story_recovery2").expect("story must be in CRDT");
|
||||
assert_eq!(
|
||||
item.retry_count(),
|
||||
|
||||
@@ -24,8 +24,16 @@ pub enum ContentKey<'a> {
|
||||
MergeMasterSpawnCount(&'a str),
|
||||
/// Evidence that `run_tests` passed during an agent session.
|
||||
RunTestsOk(&'a str),
|
||||
/// Flag indicating a commit-recovery respawn is in progress.
|
||||
/// Flag indicating a commit-recovery respawn is in progress. Stored as
|
||||
/// a decimal string counting consecutive respawns that made NO file-edit
|
||||
/// progress (worktree diff byte-identical to the previous attempt). Reset
|
||||
/// to "1" whenever a respawn produces a different diff fingerprint.
|
||||
CommitRecoveryPending(&'a str),
|
||||
/// Worktree diff byte-length captured at the last commit-recovery respawn
|
||||
/// trigger. Used to detect whether the agent made any file-edit progress
|
||||
/// between consecutive session-boundary-clean exits. Same byte length on
|
||||
/// two consecutive attempts → no progress → increment CommitRecoveryPending.
|
||||
CommitRecoveryDiffFingerprint(&'a str),
|
||||
/// Flag indicating a merge gate fixup coder session is in progress.
|
||||
///
|
||||
/// Set when the merge gate fails with a self-evident-fix class of failure
|
||||
@@ -57,6 +65,9 @@ impl<'a> ContentKey<'a> {
|
||||
ContentKey::MergeMasterSpawnCount(id) => format!("{id}:mergemaster_spawn_count"),
|
||||
ContentKey::RunTestsOk(id) => format!("{id}:run_tests_ok"),
|
||||
ContentKey::CommitRecoveryPending(id) => format!("{id}:commit_recovery_pending"),
|
||||
ContentKey::CommitRecoveryDiffFingerprint(id) => {
|
||||
format!("{id}:commit_recovery_diff_fingerprint")
|
||||
}
|
||||
ContentKey::MergeFixupPending(id) => format!("{id}:merge_fixup_pending"),
|
||||
ContentKey::MergeFailureKind(id) => format!("{id}:merge_failure_kind"),
|
||||
ContentKey::MergeSuccess(id) => format!("{id}:merge_success"),
|
||||
|
||||
Reference in New Issue
Block a user