diff --git a/server/src/agents/pool/pipeline/advance/mod.rs b/server/src/agents/pool/pipeline/advance/mod.rs index fdc596d6..5cf5fab2 100644 --- a/server/src/agents/pool/pipeline/advance/mod.rs +++ b/server/src/agents/pool/pipeline/advance/mod.rs @@ -75,23 +75,57 @@ impl AgentPool { } PipelineStage::Coder => { if completion.needs_commit_recovery { - // The coder exited with uncommitted content but no commits. - // Check if this is already a second recovery attempt (the - // first recovery respawn also produced no commits). - if crate::db::read_content(crate::db::ContentKey::CommitRecoveryPending( - story_id, - )) - .is_some() - { - // Second attempt still produced no commits → block. + // The coder exited with uncommitted content but no commits + // (typical "claude-code session boundary mid-sweep" pattern). + // Use a PROGRESS-AWARE retry cap: the agent gets unlimited + // respawns as long as file edits keep growing between + // attempts; only when the worktree diff is byte-identical + // to the previous attempt do we count it as "no progress". + // After NO_PROGRESS_CAP consecutive no-progress respawns, + // block for human attention. + const NO_PROGRESS_CAP: u32 = 3; + + let current_fingerprint = worktree_path.as_deref().and_then(|p| { + std::process::Command::new("git") + .args(["diff", "master"]) + .current_dir(p) + .output() + .ok() + .map(|out| out.stdout.len().to_string()) + }); + let stored_fingerprint = crate::db::read_content( + crate::db::ContentKey::CommitRecoveryDiffFingerprint(story_id), + ); + let made_progress = current_fingerprint.is_some() + && stored_fingerprint.as_ref() != current_fingerprint.as_ref(); + let no_progress_count = if made_progress || stored_fingerprint.is_none() { + 1 + } else { + crate::db::read_content(crate::db::ContentKey::CommitRecoveryPending( + story_id, + )) + .and_then(|s| s.trim().parse::().ok()) + .unwrap_or(0) + + 1 + }; + + if no_progress_count >= NO_PROGRESS_CAP { + // Cap reached → block for human attention. crate::db::delete_content(crate::db::ContentKey::CommitRecoveryPending( story_id, )); - slog!( - "[pipeline] Coder '{agent_name}' (commit-recovery respawn) \ - still produced no commits for '{story_id}'. Blocking story." + crate::db::delete_content( + crate::db::ContentKey::CommitRecoveryDiffFingerprint(story_id), + ); + slog!( + "[pipeline] Coder '{agent_name}' for '{story_id}' made no \ + file-edit progress over {no_progress_count} consecutive \ + commit-recovery respawns. Blocking story." + ); + let reason = format!( + "agent stuck — {no_progress_count} respawns without commits or \ + new file edits" ); - let reason = "agent declined to commit recoverable work".to_string(); if let Err(e) = crate::agents::lifecycle::transition_to_blocked(story_id, &reason) { @@ -102,15 +136,23 @@ impl AgentPool { reason, }); } else { - // First occurrence: issue a commit-only recovery respawn. - // This does NOT consume a retry_count slot. + // Below cap: respawn with commit-only prompt. Does NOT + // consume a retry_count slot. crate::db::write_content( crate::db::ContentKey::CommitRecoveryPending(story_id), - "1", + &no_progress_count.to_string(), ); + if let Some(ref fp) = current_fingerprint { + crate::db::write_content( + crate::db::ContentKey::CommitRecoveryDiffFingerprint(story_id), + fp, + ); + } slog!( "[pipeline] Coder '{agent_name}' exited with uncommitted work \ - for '{story_id}'. Issuing commit-only recovery respawn." + for '{story_id}' (no-progress {no_progress_count}/\ + {NO_PROGRESS_CAP}; progress_made={made_progress}). \ + Issuing commit-only respawn." ); let addendum = "\n\nYou have uncommitted work in this worktree. \ Your only task this session is run_tests → git_add → git_commit. \ diff --git a/server/src/agents/pool/pipeline/advance/tests_regression.rs b/server/src/agents/pool/pipeline/advance/tests_regression.rs index d0d2f316..b528d390 100644 --- a/server/src/agents/pool/pipeline/advance/tests_regression.rs +++ b/server/src/agents/pool/pipeline/advance/tests_regression.rs @@ -991,10 +991,12 @@ stage = "coder" ); } -/// AC3: when the commit-recovery respawn also exits with `needs_commit_recovery=true`, -/// the story moves to `blocked` with reason "agent declined to commit recoverable work". +/// AC3: when consecutive commit-recovery respawns make NO file-edit progress +/// (worktree diff byte-identical across attempts), the story moves to `blocked` +/// after the no-progress cap is hit. The agent gets unlimited respawns while +/// progress is being made, only stalling triggers escalation. #[tokio::test] -async fn second_commit_recovery_failure_blocks_story() { +async fn no_progress_commit_recovery_blocks_story_at_cap() { use std::fs; let tmp = tempfile::tempdir().unwrap(); @@ -1026,11 +1028,17 @@ stage = "coder" crate::db::ItemMeta::named("Recovery2 Test"), ); - // Simulate the recovery key already being set (first recovery respawn was - // issued previously). + // Simulate two previous consecutive no-progress respawns: counter=2 and a + // fingerprint stored that matches what the current (worktree-less) attempt + // will produce (None vs Some(stored) differ, but the path with stored=Some + // and current=None enters the else branch where we increment the counter). crate::db::write_content( crate::db::ContentKey::CommitRecoveryPending("9955_story_recovery2"), - "1", + "2", + ); + crate::db::write_content( + crate::db::ContentKey::CommitRecoveryDiffFingerprint("9955_story_recovery2"), + "0", ); let pool = AgentPool::new_test(3001); @@ -1052,7 +1060,7 @@ stage = "coder" ) .await; - // The story must be blocked (not retried again). + // The story must be blocked once the cap is reached (counter 2 + 1 = 3). let mut got_blocked = false; let mut block_reason = String::new(); while let Ok(evt) = rx.try_recv() { @@ -1066,23 +1074,30 @@ stage = "coder" } assert!( got_blocked, - "Story must be blocked when commit-recovery respawn also produces no commits (AC 3)" + "Story must be blocked after NO_PROGRESS_CAP consecutive no-progress respawns" ); - assert_eq!( - block_reason, "agent declined to commit recoverable work", - "Block reason must match AC 3 spec" + assert!( + block_reason.contains("without commits or new file edits"), + "Block reason should describe the no-progress condition, got: {block_reason}" ); - // The recovery key must be cleared after blocking. + // Both recovery keys must be cleared after blocking. assert!( crate::db::read_content(crate::db::ContentKey::CommitRecoveryPending( "9955_story_recovery2" )) .is_none(), - "commit_recovery_pending key must be cleared after blocking the story" + "commit_recovery_pending key must be cleared after blocking" + ); + assert!( + crate::db::read_content(crate::db::ContentKey::CommitRecoveryDiffFingerprint( + "9955_story_recovery2" + )) + .is_none(), + "commit_recovery_diff_fingerprint key must be cleared after blocking" ); - // retry_count must NOT have been incremented (AC 2: recovery never consumes a slot). + // retry_count must NOT have been incremented (recovery never consumes a slot). let item = crate::crdt_state::read_item("9955_story_recovery2").expect("story must be in CRDT"); assert_eq!( item.retry_count(), diff --git a/server/src/db/content_store.rs b/server/src/db/content_store.rs index ccfefccb..0fd9771e 100644 --- a/server/src/db/content_store.rs +++ b/server/src/db/content_store.rs @@ -24,8 +24,16 @@ pub enum ContentKey<'a> { MergeMasterSpawnCount(&'a str), /// Evidence that `run_tests` passed during an agent session. RunTestsOk(&'a str), - /// Flag indicating a commit-recovery respawn is in progress. + /// Flag indicating a commit-recovery respawn is in progress. Stored as + /// a decimal string counting consecutive respawns that made NO file-edit + /// progress (worktree diff byte-identical to the previous attempt). Reset + /// to "1" whenever a respawn produces a different diff fingerprint. CommitRecoveryPending(&'a str), + /// Worktree diff byte-length captured at the last commit-recovery respawn + /// trigger. Used to detect whether the agent made any file-edit progress + /// between consecutive session-boundary-clean exits. Same byte length on + /// two consecutive attempts → no progress → increment CommitRecoveryPending. + CommitRecoveryDiffFingerprint(&'a str), /// Flag indicating a merge gate fixup coder session is in progress. /// /// Set when the merge gate fails with a self-evident-fix class of failure @@ -57,6 +65,9 @@ impl<'a> ContentKey<'a> { ContentKey::MergeMasterSpawnCount(id) => format!("{id}:mergemaster_spawn_count"), ContentKey::RunTestsOk(id) => format!("{id}:run_tests_ok"), ContentKey::CommitRecoveryPending(id) => format!("{id}:commit_recovery_pending"), + ContentKey::CommitRecoveryDiffFingerprint(id) => { + format!("{id}:commit_recovery_diff_fingerprint") + } ContentKey::MergeFixupPending(id) => format!("{id}:merge_fixup_pending"), ContentKey::MergeFailureKind(id) => format!("{id}:merge_failure_kind"), ContentKey::MergeSuccess(id) => format!("{id}:merge_success"),