diff --git a/server/src/agents/pool/pipeline/advance/mod.rs b/server/src/agents/pool/pipeline/advance/mod.rs index 5cf5fab2..cc128b58 100644 --- a/server/src/agents/pool/pipeline/advance/mod.rs +++ b/server/src/agents/pool/pipeline/advance/mod.rs @@ -83,7 +83,15 @@ impl AgentPool { // to the previous attempt do we count it as "no progress". // After NO_PROGRESS_CAP consecutive no-progress respawns, // block for human attention. + // + // TOTAL_ATTEMPTS_CAP is the OUTER bound: even if the agent + // keeps making file-edit progress every session, after this + // many total respawns without a commit we escalate — caught + // the "agent flaps between different edits but never + // commits" pattern that the progress-aware counter would + // never trigger. const NO_PROGRESS_CAP: u32 = 3; + const TOTAL_ATTEMPTS_CAP: u32 = 8; let current_fingerprint = worktree_path.as_deref().and_then(|p| { std::process::Command::new("git") @@ -108,6 +116,45 @@ impl AgentPool { .unwrap_or(0) + 1 }; + let total_attempts = crate::db::read_content( + crate::db::ContentKey::CommitRecoveryTotalAttempts(story_id), + ) + .and_then(|s| s.trim().parse::().ok()) + .unwrap_or(0) + + 1; + + if total_attempts >= TOTAL_ATTEMPTS_CAP { + // Outer cap reached: agent has been respawned too many + // times without ever committing. Block regardless of + // whether file-edit progress is still happening. + crate::db::delete_content(crate::db::ContentKey::CommitRecoveryPending( + story_id, + )); + crate::db::delete_content( + crate::db::ContentKey::CommitRecoveryDiffFingerprint(story_id), + ); + crate::db::delete_content( + crate::db::ContentKey::CommitRecoveryTotalAttempts(story_id), + ); + slog!( + "[pipeline] Coder '{agent_name}' for '{story_id}' hit total \ + commit-recovery cap ({total_attempts}/{TOTAL_ATTEMPTS_CAP}) \ + without a commit. Blocking story." + ); + let reason = format!( + "agent flapped — {total_attempts} respawns without ever committing" + ); + if let Err(e) = + crate::agents::lifecycle::transition_to_blocked(story_id, &reason) + { + slog_error!("[pipeline] Failed to block '{story_id}': {e}"); + } + let _ = self.watcher_tx.send(WatcherEvent::StoryBlocked { + story_id: story_id.to_string(), + reason, + }); + return; + } if no_progress_count >= NO_PROGRESS_CAP { // Cap reached → block for human attention. @@ -117,6 +164,9 @@ impl AgentPool { crate::db::delete_content( crate::db::ContentKey::CommitRecoveryDiffFingerprint(story_id), ); + crate::db::delete_content( + crate::db::ContentKey::CommitRecoveryTotalAttempts(story_id), + ); slog!( "[pipeline] Coder '{agent_name}' for '{story_id}' made no \ file-edit progress over {no_progress_count} consecutive \ @@ -148,10 +198,15 @@ impl AgentPool { fp, ); } + crate::db::write_content( + crate::db::ContentKey::CommitRecoveryTotalAttempts(story_id), + &total_attempts.to_string(), + ); slog!( "[pipeline] Coder '{agent_name}' exited with uncommitted work \ for '{story_id}' (no-progress {no_progress_count}/\ - {NO_PROGRESS_CAP}; progress_made={made_progress}). \ + {NO_PROGRESS_CAP}, total {total_attempts}/\ + {TOTAL_ATTEMPTS_CAP}; progress_made={made_progress}). \ Issuing commit-only respawn." ); let addendum = "\n\nYou have uncommitted work in this worktree. \ diff --git a/server/src/agents/pool/pipeline/advance/tests_regression.rs b/server/src/agents/pool/pipeline/advance/tests_regression.rs index b528d390..feccd9b6 100644 --- a/server/src/agents/pool/pipeline/advance/tests_regression.rs +++ b/server/src/agents/pool/pipeline/advance/tests_regression.rs @@ -1107,6 +1107,113 @@ stage = "coder" ); } +/// Outer-cap path: even if the agent keeps making file-edit progress between +/// every commit-recovery respawn (different fingerprint each time), after +/// TOTAL_ATTEMPTS_CAP respawns without a commit the story must block. This +/// catches the "flapping agent" pattern where the progress-aware counter +/// would never trigger because the diff keeps changing. +#[tokio::test] +async fn total_attempts_cap_blocks_flapping_agent_without_commit() { + use std::fs; + + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + + fs::create_dir_all(root.join(".huskies")).unwrap(); + fs::write( + root.join(".huskies/project.toml"), + r#" +max_retries = 3 + +[[agent]] +name = "coder-1" +role = "Coder" +command = "echo" +args = ["noop"] +prompt = "test" +stage = "coder" +"#, + ) + .unwrap(); + + crate::crdt_state::init_for_test(); + crate::db::ensure_content_store(); + crate::db::write_item_with_content( + "9956_story_flapper", + "2_current", + "---\nname: Flapper Test\n---\n", + crate::db::ItemMeta::named("Flapper Test"), + ); + + // Simulate 7 previous respawns (one short of the outer cap of 8). The + // no-progress counter is at 1 (the agent has been making progress every + // attempt) but total attempts is at the threshold. + crate::db::write_content( + crate::db::ContentKey::CommitRecoveryTotalAttempts("9956_story_flapper"), + "7", + ); + crate::db::write_content( + crate::db::ContentKey::CommitRecoveryPending("9956_story_flapper"), + "1", + ); + + let pool = AgentPool::new_test(3001); + let mut rx = pool.watcher_tx.subscribe(); + + pool.run_pipeline_advance( + "9956_story_flapper", + "coder-1", + CompletionReport { + summary: "still no commit".to_string(), + gates_passed: false, + gate_output: "Worktree has uncommitted changes".to_string(), + needs_commit_recovery: true, + }, + Some(root.to_path_buf()), + None, + false, + None, + ) + .await; + + // Outer cap reached (7 + 1 = 8) → block. + let mut got_blocked = false; + let mut block_reason = String::new(); + while let Ok(evt) = rx.try_recv() { + if let WatcherEvent::StoryBlocked { story_id, reason } = evt + && story_id == "9956_story_flapper" + { + got_blocked = true; + block_reason = reason; + break; + } + } + assert!( + got_blocked, + "Story must be blocked once total commit-recovery attempts hits the outer cap" + ); + assert!( + block_reason.contains("flapped") && block_reason.contains("without ever committing"), + "Block reason should describe the flapping pattern, got: {block_reason}" + ); + + // All three recovery keys must be cleared after blocking. + assert!( + crate::db::read_content(crate::db::ContentKey::CommitRecoveryTotalAttempts( + "9956_story_flapper" + )) + .is_none(), + "total_attempts key must be cleared after blocking" + ); + assert!( + crate::db::read_content(crate::db::ContentKey::CommitRecoveryPending( + "9956_story_flapper" + )) + .is_none(), + "commit_recovery_pending key must be cleared after blocking" + ); +} + // ── bug 953: bug-645 path must not advance when feature branch has zero commits ── /// Regression test for bug 953: when a coder agent exits with gates_passed=false diff --git a/server/src/db/content_store.rs b/server/src/db/content_store.rs index 0fd9771e..d79dce99 100644 --- a/server/src/db/content_store.rs +++ b/server/src/db/content_store.rs @@ -34,6 +34,12 @@ pub enum ContentKey<'a> { /// between consecutive session-boundary-clean exits. Same byte length on /// two consecutive attempts → no progress → increment CommitRecoveryPending. CommitRecoveryDiffFingerprint(&'a str), + /// Absolute count of commit-recovery respawns issued for a story since the + /// last successful commit. Increments every respawn regardless of whether + /// the diff fingerprint changed. Outer cap that catches the "agent flaps + /// between different file edits each session but never commits" pattern + /// where the progress-aware counter would never trigger. + CommitRecoveryTotalAttempts(&'a str), /// Flag indicating a merge gate fixup coder session is in progress. /// /// Set when the merge gate fails with a self-evident-fix class of failure @@ -68,6 +74,9 @@ impl<'a> ContentKey<'a> { ContentKey::CommitRecoveryDiffFingerprint(id) => { format!("{id}:commit_recovery_diff_fingerprint") } + ContentKey::CommitRecoveryTotalAttempts(id) => { + format!("{id}:commit_recovery_total_attempts") + } ContentKey::MergeFixupPending(id) => format!("{id}:merge_fixup_pending"), ContentKey::MergeFailureKind(id) => format!("{id}:merge_failure_kind"), ContentKey::MergeSuccess(id) => format!("{id}:merge_success"),