feat: progress-aware commit-recovery cap (no longer block on 2nd attempt)
The existing commit-recovery path blocked stories on the 2nd consecutive exit-without-commit. For long sweep refactors (e.g. story 997, the typed retries payload migration), claude-code's session-length boundary naturally terminates the coder mid-sweep before it can commit — even though substantial file-edit progress is being made each session. The old cap-of-1 misclassified normal mid-flight progress as 'agent declined to commit'. New behaviour: - Each commit-recovery respawn captures a worktree-diff byte-length fingerprint (git diff master | wc -c). - If the fingerprint differs from the previous attempt the agent made file-edit progress, the no-progress counter resets to 1. - If the fingerprint is byte-identical (no new edits between exits), increment the no-progress counter. - Block only when the counter reaches NO_PROGRESS_CAP (3) — i.e. three consecutive respawns where the agent did literally nothing. Adds ContentKey::CommitRecoveryDiffFingerprint to store the prior fingerprint. Updates the existing block-test to reflect the new cap semantics; existing 'first respawn issued' test continues to pass. All 2935 tests pass.
This commit is contained in:
@@ -75,23 +75,57 @@ impl AgentPool {
|
|||||||
}
|
}
|
||||||
PipelineStage::Coder => {
|
PipelineStage::Coder => {
|
||||||
if completion.needs_commit_recovery {
|
if completion.needs_commit_recovery {
|
||||||
// The coder exited with uncommitted content but no commits.
|
// The coder exited with uncommitted content but no commits
|
||||||
// Check if this is already a second recovery attempt (the
|
// (typical "claude-code session boundary mid-sweep" pattern).
|
||||||
// first recovery respawn also produced no commits).
|
// Use a PROGRESS-AWARE retry cap: the agent gets unlimited
|
||||||
if crate::db::read_content(crate::db::ContentKey::CommitRecoveryPending(
|
// respawns as long as file edits keep growing between
|
||||||
|
// attempts; only when the worktree diff is byte-identical
|
||||||
|
// to the previous attempt do we count it as "no progress".
|
||||||
|
// After NO_PROGRESS_CAP consecutive no-progress respawns,
|
||||||
|
// block for human attention.
|
||||||
|
const NO_PROGRESS_CAP: u32 = 3;
|
||||||
|
|
||||||
|
let current_fingerprint = worktree_path.as_deref().and_then(|p| {
|
||||||
|
std::process::Command::new("git")
|
||||||
|
.args(["diff", "master"])
|
||||||
|
.current_dir(p)
|
||||||
|
.output()
|
||||||
|
.ok()
|
||||||
|
.map(|out| out.stdout.len().to_string())
|
||||||
|
});
|
||||||
|
let stored_fingerprint = crate::db::read_content(
|
||||||
|
crate::db::ContentKey::CommitRecoveryDiffFingerprint(story_id),
|
||||||
|
);
|
||||||
|
let made_progress = current_fingerprint.is_some()
|
||||||
|
&& stored_fingerprint.as_ref() != current_fingerprint.as_ref();
|
||||||
|
let no_progress_count = if made_progress || stored_fingerprint.is_none() {
|
||||||
|
1
|
||||||
|
} else {
|
||||||
|
crate::db::read_content(crate::db::ContentKey::CommitRecoveryPending(
|
||||||
story_id,
|
story_id,
|
||||||
))
|
))
|
||||||
.is_some()
|
.and_then(|s| s.trim().parse::<u32>().ok())
|
||||||
{
|
.unwrap_or(0)
|
||||||
// Second attempt still produced no commits → block.
|
+ 1
|
||||||
|
};
|
||||||
|
|
||||||
|
if no_progress_count >= NO_PROGRESS_CAP {
|
||||||
|
// Cap reached → block for human attention.
|
||||||
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryPending(
|
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryPending(
|
||||||
story_id,
|
story_id,
|
||||||
));
|
));
|
||||||
slog!(
|
crate::db::delete_content(
|
||||||
"[pipeline] Coder '{agent_name}' (commit-recovery respawn) \
|
crate::db::ContentKey::CommitRecoveryDiffFingerprint(story_id),
|
||||||
still produced no commits for '{story_id}'. Blocking story."
|
);
|
||||||
|
slog!(
|
||||||
|
"[pipeline] Coder '{agent_name}' for '{story_id}' made no \
|
||||||
|
file-edit progress over {no_progress_count} consecutive \
|
||||||
|
commit-recovery respawns. Blocking story."
|
||||||
|
);
|
||||||
|
let reason = format!(
|
||||||
|
"agent stuck — {no_progress_count} respawns without commits or \
|
||||||
|
new file edits"
|
||||||
);
|
);
|
||||||
let reason = "agent declined to commit recoverable work".to_string();
|
|
||||||
if let Err(e) =
|
if let Err(e) =
|
||||||
crate::agents::lifecycle::transition_to_blocked(story_id, &reason)
|
crate::agents::lifecycle::transition_to_blocked(story_id, &reason)
|
||||||
{
|
{
|
||||||
@@ -102,15 +136,23 @@ impl AgentPool {
|
|||||||
reason,
|
reason,
|
||||||
});
|
});
|
||||||
} else {
|
} else {
|
||||||
// First occurrence: issue a commit-only recovery respawn.
|
// Below cap: respawn with commit-only prompt. Does NOT
|
||||||
// This does NOT consume a retry_count slot.
|
// consume a retry_count slot.
|
||||||
crate::db::write_content(
|
crate::db::write_content(
|
||||||
crate::db::ContentKey::CommitRecoveryPending(story_id),
|
crate::db::ContentKey::CommitRecoveryPending(story_id),
|
||||||
"1",
|
&no_progress_count.to_string(),
|
||||||
);
|
);
|
||||||
|
if let Some(ref fp) = current_fingerprint {
|
||||||
|
crate::db::write_content(
|
||||||
|
crate::db::ContentKey::CommitRecoveryDiffFingerprint(story_id),
|
||||||
|
fp,
|
||||||
|
);
|
||||||
|
}
|
||||||
slog!(
|
slog!(
|
||||||
"[pipeline] Coder '{agent_name}' exited with uncommitted work \
|
"[pipeline] Coder '{agent_name}' exited with uncommitted work \
|
||||||
for '{story_id}'. Issuing commit-only recovery respawn."
|
for '{story_id}' (no-progress {no_progress_count}/\
|
||||||
|
{NO_PROGRESS_CAP}; progress_made={made_progress}). \
|
||||||
|
Issuing commit-only respawn."
|
||||||
);
|
);
|
||||||
let addendum = "\n\nYou have uncommitted work in this worktree. \
|
let addendum = "\n\nYou have uncommitted work in this worktree. \
|
||||||
Your only task this session is run_tests → git_add → git_commit. \
|
Your only task this session is run_tests → git_add → git_commit. \
|
||||||
|
|||||||
@@ -991,10 +991,12 @@ stage = "coder"
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// AC3: when the commit-recovery respawn also exits with `needs_commit_recovery=true`,
|
/// AC3: when consecutive commit-recovery respawns make NO file-edit progress
|
||||||
/// the story moves to `blocked` with reason "agent declined to commit recoverable work".
|
/// (worktree diff byte-identical across attempts), the story moves to `blocked`
|
||||||
|
/// after the no-progress cap is hit. The agent gets unlimited respawns while
|
||||||
|
/// progress is being made, only stalling triggers escalation.
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
async fn second_commit_recovery_failure_blocks_story() {
|
async fn no_progress_commit_recovery_blocks_story_at_cap() {
|
||||||
use std::fs;
|
use std::fs;
|
||||||
|
|
||||||
let tmp = tempfile::tempdir().unwrap();
|
let tmp = tempfile::tempdir().unwrap();
|
||||||
@@ -1026,11 +1028,17 @@ stage = "coder"
|
|||||||
crate::db::ItemMeta::named("Recovery2 Test"),
|
crate::db::ItemMeta::named("Recovery2 Test"),
|
||||||
);
|
);
|
||||||
|
|
||||||
// Simulate the recovery key already being set (first recovery respawn was
|
// Simulate two previous consecutive no-progress respawns: counter=2 and a
|
||||||
// issued previously).
|
// fingerprint stored that matches what the current (worktree-less) attempt
|
||||||
|
// will produce (None vs Some(stored) differ, but the path with stored=Some
|
||||||
|
// and current=None enters the else branch where we increment the counter).
|
||||||
crate::db::write_content(
|
crate::db::write_content(
|
||||||
crate::db::ContentKey::CommitRecoveryPending("9955_story_recovery2"),
|
crate::db::ContentKey::CommitRecoveryPending("9955_story_recovery2"),
|
||||||
"1",
|
"2",
|
||||||
|
);
|
||||||
|
crate::db::write_content(
|
||||||
|
crate::db::ContentKey::CommitRecoveryDiffFingerprint("9955_story_recovery2"),
|
||||||
|
"0",
|
||||||
);
|
);
|
||||||
|
|
||||||
let pool = AgentPool::new_test(3001);
|
let pool = AgentPool::new_test(3001);
|
||||||
@@ -1052,7 +1060,7 @@ stage = "coder"
|
|||||||
)
|
)
|
||||||
.await;
|
.await;
|
||||||
|
|
||||||
// The story must be blocked (not retried again).
|
// The story must be blocked once the cap is reached (counter 2 + 1 = 3).
|
||||||
let mut got_blocked = false;
|
let mut got_blocked = false;
|
||||||
let mut block_reason = String::new();
|
let mut block_reason = String::new();
|
||||||
while let Ok(evt) = rx.try_recv() {
|
while let Ok(evt) = rx.try_recv() {
|
||||||
@@ -1066,23 +1074,30 @@ stage = "coder"
|
|||||||
}
|
}
|
||||||
assert!(
|
assert!(
|
||||||
got_blocked,
|
got_blocked,
|
||||||
"Story must be blocked when commit-recovery respawn also produces no commits (AC 3)"
|
"Story must be blocked after NO_PROGRESS_CAP consecutive no-progress respawns"
|
||||||
);
|
);
|
||||||
assert_eq!(
|
assert!(
|
||||||
block_reason, "agent declined to commit recoverable work",
|
block_reason.contains("without commits or new file edits"),
|
||||||
"Block reason must match AC 3 spec"
|
"Block reason should describe the no-progress condition, got: {block_reason}"
|
||||||
);
|
);
|
||||||
|
|
||||||
// The recovery key must be cleared after blocking.
|
// Both recovery keys must be cleared after blocking.
|
||||||
assert!(
|
assert!(
|
||||||
crate::db::read_content(crate::db::ContentKey::CommitRecoveryPending(
|
crate::db::read_content(crate::db::ContentKey::CommitRecoveryPending(
|
||||||
"9955_story_recovery2"
|
"9955_story_recovery2"
|
||||||
))
|
))
|
||||||
.is_none(),
|
.is_none(),
|
||||||
"commit_recovery_pending key must be cleared after blocking the story"
|
"commit_recovery_pending key must be cleared after blocking"
|
||||||
|
);
|
||||||
|
assert!(
|
||||||
|
crate::db::read_content(crate::db::ContentKey::CommitRecoveryDiffFingerprint(
|
||||||
|
"9955_story_recovery2"
|
||||||
|
))
|
||||||
|
.is_none(),
|
||||||
|
"commit_recovery_diff_fingerprint key must be cleared after blocking"
|
||||||
);
|
);
|
||||||
|
|
||||||
// retry_count must NOT have been incremented (AC 2: recovery never consumes a slot).
|
// retry_count must NOT have been incremented (recovery never consumes a slot).
|
||||||
let item = crate::crdt_state::read_item("9955_story_recovery2").expect("story must be in CRDT");
|
let item = crate::crdt_state::read_item("9955_story_recovery2").expect("story must be in CRDT");
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
item.retry_count(),
|
item.retry_count(),
|
||||||
|
|||||||
@@ -24,8 +24,16 @@ pub enum ContentKey<'a> {
|
|||||||
MergeMasterSpawnCount(&'a str),
|
MergeMasterSpawnCount(&'a str),
|
||||||
/// Evidence that `run_tests` passed during an agent session.
|
/// Evidence that `run_tests` passed during an agent session.
|
||||||
RunTestsOk(&'a str),
|
RunTestsOk(&'a str),
|
||||||
/// Flag indicating a commit-recovery respawn is in progress.
|
/// Flag indicating a commit-recovery respawn is in progress. Stored as
|
||||||
|
/// a decimal string counting consecutive respawns that made NO file-edit
|
||||||
|
/// progress (worktree diff byte-identical to the previous attempt). Reset
|
||||||
|
/// to "1" whenever a respawn produces a different diff fingerprint.
|
||||||
CommitRecoveryPending(&'a str),
|
CommitRecoveryPending(&'a str),
|
||||||
|
/// Worktree diff byte-length captured at the last commit-recovery respawn
|
||||||
|
/// trigger. Used to detect whether the agent made any file-edit progress
|
||||||
|
/// between consecutive session-boundary-clean exits. Same byte length on
|
||||||
|
/// two consecutive attempts → no progress → increment CommitRecoveryPending.
|
||||||
|
CommitRecoveryDiffFingerprint(&'a str),
|
||||||
/// Flag indicating a merge gate fixup coder session is in progress.
|
/// Flag indicating a merge gate fixup coder session is in progress.
|
||||||
///
|
///
|
||||||
/// Set when the merge gate fails with a self-evident-fix class of failure
|
/// Set when the merge gate fails with a self-evident-fix class of failure
|
||||||
@@ -57,6 +65,9 @@ impl<'a> ContentKey<'a> {
|
|||||||
ContentKey::MergeMasterSpawnCount(id) => format!("{id}:mergemaster_spawn_count"),
|
ContentKey::MergeMasterSpawnCount(id) => format!("{id}:mergemaster_spawn_count"),
|
||||||
ContentKey::RunTestsOk(id) => format!("{id}:run_tests_ok"),
|
ContentKey::RunTestsOk(id) => format!("{id}:run_tests_ok"),
|
||||||
ContentKey::CommitRecoveryPending(id) => format!("{id}:commit_recovery_pending"),
|
ContentKey::CommitRecoveryPending(id) => format!("{id}:commit_recovery_pending"),
|
||||||
|
ContentKey::CommitRecoveryDiffFingerprint(id) => {
|
||||||
|
format!("{id}:commit_recovery_diff_fingerprint")
|
||||||
|
}
|
||||||
ContentKey::MergeFixupPending(id) => format!("{id}:merge_fixup_pending"),
|
ContentKey::MergeFixupPending(id) => format!("{id}:merge_fixup_pending"),
|
||||||
ContentKey::MergeFailureKind(id) => format!("{id}:merge_failure_kind"),
|
ContentKey::MergeFailureKind(id) => format!("{id}:merge_failure_kind"),
|
||||||
ContentKey::MergeSuccess(id) => format!("{id}:merge_success"),
|
ContentKey::MergeSuccess(id) => format!("{id}:merge_success"),
|
||||||
|
|||||||
Reference in New Issue
Block a user