diff --git a/server/src/agents/mod.rs b/server/src/agents/mod.rs index 28be75f7..83f9ae12 100644 --- a/server/src/agents/mod.rs +++ b/server/src/agents/mod.rs @@ -157,6 +157,11 @@ pub struct CompletionReport { pub summary: String, pub gates_passed: bool, pub gate_output: String, + /// True when the coder exited with no commits but left uncommitted content in + /// the worktree. The pipeline advance will issue a commit-only recovery respawn + /// rather than a normal retry, and will NOT consume a `retry_count` slot. + #[serde(default)] + pub needs_commit_recovery: bool, } /// Token usage from a Claude Code session's `result` event. diff --git a/server/src/agents/pool/auto_assign/auto_assign.rs b/server/src/agents/pool/auto_assign/auto_assign.rs index 153babf6..1cc9c7c3 100644 --- a/server/src/agents/pool/auto_assign/auto_assign.rs +++ b/server/src/agents/pool/auto_assign/auto_assign.rs @@ -528,6 +528,12 @@ mod tests { "CONFLICT (content): server/src/lib.rs", crate::db::ItemMeta::named("Conflict"), ); + // After master c228ae16, has_content_conflict_failure reads from + // {story_id}:gate_output (not the story description), so seed it there. + crate::db::write_content( + "9860_story_conflict:gate_output", + "CONFLICT (content): server/src/lib.rs", + ); let pool = AgentPool::new_test(3001); pool.auto_assign_available_work(tmp.path()).await; @@ -681,6 +687,12 @@ mod tests { "CONFLICT (content): foo.rs", crate::db::ItemMeta::named("Transient"), ); + // After master c228ae16, has_content_conflict_failure reads from + // {story_id}:gate_output (not the story description), so seed it there. + crate::db::write_content( + "920_story_transient:gate_output", + "CONFLICT (content): foo.rs", + ); // Simulate two previous transient exits (below cap of 3) recorded in DB. crate::db::write_content("920_story_transient:mergemaster_spawn_count", "2"); diff --git a/server/src/agents/pool/pipeline/advance/mod.rs b/server/src/agents/pool/pipeline/advance/mod.rs index 7f97cd42..3a07681e 100644 --- a/server/src/agents/pool/pipeline/advance/mod.rs +++ b/server/src/agents/pool/pipeline/advance/mod.rs @@ -74,7 +74,58 @@ impl AgentPool { // Supervisors and unknown agents do not advance the pipeline. } PipelineStage::Coder => { - if completion.gates_passed { + if completion.needs_commit_recovery { + // The coder exited with uncommitted content but no commits. + // Check if this is already a second recovery attempt (the + // first recovery respawn also produced no commits). + let recovery_key = format!("{story_id}:commit_recovery_pending"); + if crate::db::read_content(&recovery_key).is_some() { + // Second attempt still produced no commits → block. + crate::db::delete_content(&recovery_key); + slog!( + "[pipeline] Coder '{agent_name}' (commit-recovery respawn) \ + still produced no commits for '{story_id}'. Blocking story." + ); + let reason = "agent declined to commit recoverable work".to_string(); + if let Err(e) = + crate::agents::lifecycle::transition_to_blocked(story_id, &reason) + { + slog_error!("[pipeline] Failed to block '{story_id}': {e}"); + } + let _ = self.watcher_tx.send(WatcherEvent::StoryBlocked { + story_id: story_id.to_string(), + reason, + }); + } else { + // First occurrence: issue a commit-only recovery respawn. + // This does NOT consume a retry_count slot. + crate::db::write_content(&recovery_key, "1"); + slog!( + "[pipeline] Coder '{agent_name}' exited with uncommitted work \ + for '{story_id}'. Issuing commit-only recovery respawn." + ); + let addendum = "\n\nYou have uncommitted work in this worktree. \ + Your only task this session is run_tests → git_add → git_commit. \ + Do not explore further."; + if let Err(e) = self + .start_agent( + &project_root, + story_id, + Some(agent_name), + Some(addendum), + previous_session_id, + ) + .await + { + slog_error!( + "[pipeline] Failed to start commit-recovery respawn \ + for '{story_id}': {e}" + ); + } + } + } else if completion.gates_passed { + // Clear any stale recovery key when the coder succeeds normally. + crate::db::delete_content(&format!("{story_id}:commit_recovery_pending")); // Determine effective QA mode for this story. let qa_mode = { let item_type = crate::agents::lifecycle::item_type_from_id(story_id); @@ -130,6 +181,9 @@ impl AgentPool { } } } else { + // Clear any stale recovery key when gates fail normally (agent committed + // but the build is broken — treat as a standard retry, not a recovery). + crate::db::delete_content(&format!("{story_id}:commit_recovery_pending")); // Bug 645 / 668: Before retry/block, check if the agent left committed // work AND the agent had a passing run_tests result captured during its // session. An agent may crash mid-output (e.g. Claude Code CLI PTY write diff --git a/server/src/agents/pool/pipeline/advance/tests.rs b/server/src/agents/pool/pipeline/advance/tests.rs index 0dbbd7e9..49e1b8f5 100644 --- a/server/src/agents/pool/pipeline/advance/tests.rs +++ b/server/src/agents/pool/pipeline/advance/tests.rs @@ -27,6 +27,7 @@ async fn pipeline_advance_coder_gates_pass_server_qa_moves_to_merge() { summary: "done".to_string(), gates_passed: true, gate_output: String::new(), + needs_commit_recovery: false, }, Some(root.to_path_buf()), None, @@ -72,6 +73,7 @@ async fn pipeline_advance_coder_gates_pass_agent_qa_moves_to_qa() { summary: "done".to_string(), gates_passed: true, gate_output: String::new(), + needs_commit_recovery: false, }, Some(root.to_path_buf()), None, @@ -114,6 +116,7 @@ async fn pipeline_advance_qa_gates_pass_moves_story_to_merge() { summary: "QA done".to_string(), gates_passed: true, gate_output: String::new(), + needs_commit_recovery: false, }, Some(root.to_path_buf()), None, @@ -148,6 +151,7 @@ async fn pipeline_advance_supervisor_does_not_advance() { summary: "supervised".to_string(), gates_passed: true, gate_output: String::new(), + needs_commit_recovery: false, }, Some(root.to_path_buf()), None, @@ -216,6 +220,7 @@ stage = "qa" summary: "done".to_string(), gates_passed: true, gate_output: String::new(), + needs_commit_recovery: false, }, Some(root.to_path_buf()), None, diff --git a/server/src/agents/pool/pipeline/advance/tests_regression.rs b/server/src/agents/pool/pipeline/advance/tests_regression.rs index db292143..01e44c3d 100644 --- a/server/src/agents/pool/pipeline/advance/tests_regression.rs +++ b/server/src/agents/pool/pipeline/advance/tests_regression.rs @@ -65,6 +65,7 @@ async fn mergemaster_blocks_and_sends_story_blocked_when_no_commits_ahead() { summary: "done".to_string(), gates_passed: true, gate_output: String::new(), + needs_commit_recovery: false, }, Some(root.to_path_buf()), None, @@ -182,6 +183,7 @@ stage = "qa" summary: "QA done".to_string(), gates_passed: true, gate_output: String::new(), + needs_commit_recovery: false, }, Some(root.to_path_buf()), None, @@ -266,6 +268,7 @@ async fn stale_mergemaster_advance_for_done_story_is_noop() { summary: "stale advance".to_string(), gates_passed: true, gate_output: String::new(), + needs_commit_recovery: false, }, Some(root.to_path_buf()), None, @@ -406,6 +409,7 @@ async fn work_survived_advances_to_qa_instead_of_blocking() { summary: "Agent crashed".to_string(), gates_passed: false, gate_output: "Worktree has uncommitted changes".to_string(), + needs_commit_recovery: false, }, Some(root.to_path_buf()), Some(wt_path), @@ -505,6 +509,7 @@ async fn no_committed_work_still_retries_and_blocks() { summary: "Agent crashed".to_string(), gates_passed: false, gate_output: "Tests failed".to_string(), + needs_commit_recovery: false, }, Some(root.to_path_buf()), Some(wt_path), @@ -635,6 +640,7 @@ async fn gates_failed_no_test_evidence_does_not_advance() { summary: "Gates failed".to_string(), gates_passed: false, gate_output: "Tests failed".to_string(), + needs_commit_recovery: false, }, Some(root.to_path_buf()), Some(wt_path), @@ -758,6 +764,7 @@ async fn gates_failed_with_test_evidence_and_committed_work_advances() { summary: "Agent crashed".to_string(), gates_passed: false, gate_output: "PTY write assertion failed".to_string(), + needs_commit_recovery: false, }, Some(root.to_path_buf()), Some(wt_path), @@ -839,6 +846,7 @@ stage = "coder" summary: "Tests failed".to_string(), gates_passed: false, gate_output: "error[E0308]: mismatched types\n --> src/lib.rs:5:10".to_string(), + needs_commit_recovery: false, }, Some(root.to_path_buf()), None, @@ -873,6 +881,191 @@ stage = "coder" ); } +// ── story 954: commit-only recovery respawn ─────────────────────────────── + +/// AC1+AC2: when a coder exits with `needs_commit_recovery=true` (uncommitted +/// work, zero commits), the pipeline issues a commit-only recovery respawn +/// WITHOUT consuming a retry_count slot. +#[tokio::test] +async fn commit_recovery_respawn_does_not_consume_retry_count() { + use std::fs; + + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + + fs::create_dir_all(root.join(".huskies")).unwrap(); + fs::write( + root.join(".huskies/project.toml"), + r#" +max_retries = 3 + +[[agent]] +name = "coder-1" +role = "Coder" +command = "echo" +args = ["noop"] +prompt = "test" +stage = "coder" +"#, + ) + .unwrap(); + + crate::crdt_state::init_for_test(); + crate::db::ensure_content_store(); + crate::db::write_item_with_content( + "9954_story_recovery", + "2_current", + "---\nname: Recovery Test\n---\n", + crate::db::ItemMeta::named("Recovery Test"), + ); + // Ensure no stale recovery key exists. + crate::db::delete_content("9954_story_recovery:commit_recovery_pending"); + + let pool = AgentPool::new_test(3001); + + pool.run_pipeline_advance( + "9954_story_recovery", + "coder-1", + CompletionReport { + summary: "exited".to_string(), + gates_passed: false, + gate_output: "Worktree has uncommitted changes".to_string(), + needs_commit_recovery: true, + }, + Some(root.to_path_buf()), + None, + false, + None, + ) + .await; + + // The recovery respawn must have been issued — coder-1 should be Pending/Running. + let agents = pool.agents.lock().unwrap(); + let coder_restarted = agents.values().any(|a| { + a.agent_name == "coder-1" && matches!(a.status, AgentStatus::Pending | AgentStatus::Running) + }); + assert!( + coder_restarted, + "Commit-recovery respawn must be issued when needs_commit_recovery=true. \ + Pool: {:?}", + agents + .iter() + .map(|(k, a)| format!("{k}: {} ({})", a.agent_name, a.status)) + .collect::>() + ); + drop(agents); + + // retry_count must NOT have been incremented (AC 2). + let item = crate::crdt_state::read_item("9954_story_recovery").expect("story must be in CRDT"); + assert_eq!( + item.retry_count(), + 0, + "retry_count must NOT be incremented for a commit-recovery respawn (AC 2): got {}", + item.retry_count() + ); + + // The recovery key must be set so a second failure triggers a block. + assert!( + crate::db::read_content("9954_story_recovery:commit_recovery_pending").is_some(), + "commit_recovery_pending key must be set after issuing recovery respawn" + ); +} + +/// AC3: when the commit-recovery respawn also exits with `needs_commit_recovery=true`, +/// the story moves to `blocked` with reason "agent declined to commit recoverable work". +#[tokio::test] +async fn second_commit_recovery_failure_blocks_story() { + use std::fs; + + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + + fs::create_dir_all(root.join(".huskies")).unwrap(); + fs::write( + root.join(".huskies/project.toml"), + r#" +max_retries = 3 + +[[agent]] +name = "coder-1" +role = "Coder" +command = "echo" +args = ["noop"] +prompt = "test" +stage = "coder" +"#, + ) + .unwrap(); + + crate::crdt_state::init_for_test(); + crate::db::ensure_content_store(); + crate::db::write_item_with_content( + "9955_story_recovery2", + "2_current", + "---\nname: Recovery2 Test\n---\n", + crate::db::ItemMeta::named("Recovery2 Test"), + ); + + // Simulate the recovery key already being set (first recovery respawn was + // issued previously). + crate::db::write_content("9955_story_recovery2:commit_recovery_pending", "1"); + + let pool = AgentPool::new_test(3001); + let mut rx = pool.watcher_tx.subscribe(); + + pool.run_pipeline_advance( + "9955_story_recovery2", + "coder-1", + CompletionReport { + summary: "exited again".to_string(), + gates_passed: false, + gate_output: "Worktree has uncommitted changes".to_string(), + needs_commit_recovery: true, + }, + Some(root.to_path_buf()), + None, + false, + None, + ) + .await; + + // The story must be blocked (not retried again). + let mut got_blocked = false; + let mut block_reason = String::new(); + while let Ok(evt) = rx.try_recv() { + if let WatcherEvent::StoryBlocked { story_id, reason } = evt + && story_id == "9955_story_recovery2" + { + got_blocked = true; + block_reason = reason; + break; + } + } + assert!( + got_blocked, + "Story must be blocked when commit-recovery respawn also produces no commits (AC 3)" + ); + assert_eq!( + block_reason, "agent declined to commit recoverable work", + "Block reason must match AC 3 spec" + ); + + // The recovery key must be cleared after blocking. + assert!( + crate::db::read_content("9955_story_recovery2:commit_recovery_pending").is_none(), + "commit_recovery_pending key must be cleared after blocking the story" + ); + + // retry_count must NOT have been incremented (AC 2: recovery never consumes a slot). + let item = crate::crdt_state::read_item("9955_story_recovery2").expect("story must be in CRDT"); + assert_eq!( + item.retry_count(), + 0, + "retry_count must NOT be incremented during commit-recovery path: got {}", + item.retry_count() + ); +} + // ── bug 953: bug-645 path must not advance when feature branch has zero commits ── /// Regression test for bug 953: when a coder agent exits with gates_passed=false @@ -957,6 +1150,7 @@ async fn coder_completion_with_test_evidence_and_zero_commits_does_not_advance() summary: "Agent crashed mid-output".to_string(), gates_passed: false, gate_output: "PTY write assertion failed".to_string(), + needs_commit_recovery: false, }, Some(root.to_path_buf()), Some(wt_path), diff --git a/server/src/agents/pool/pipeline/completion/legacy.rs b/server/src/agents/pool/pipeline/completion/legacy.rs index f75eaee1..75d428d0 100644 --- a/server/src/agents/pool/pipeline/completion/legacy.rs +++ b/server/src/agents/pool/pipeline/completion/legacy.rs @@ -69,6 +69,7 @@ impl AgentPool { summary: summary.to_string(), gates_passed, gate_output, + needs_commit_recovery: false, }; // Extract data for pipeline advance, then remove the entry so diff --git a/server/src/agents/pool/pipeline/completion/server.rs b/server/src/agents/pool/pipeline/completion/server.rs index 0e0697db..573592df 100644 --- a/server/src/agents/pool/pipeline/completion/server.rs +++ b/server/src/agents/pool/pipeline/completion/server.rs @@ -95,10 +95,13 @@ pub(in crate::agents::pool) async fn run_server_owned_completion( } } - // Run acceptance gates. - let (gates_passed, gate_output) = if let Some(wt_path) = worktree_path { + // Run acceptance gates. Third element of the tuple is `needs_commit_recovery`: + // true when the coder exited with uncommitted content but zero commits — the + // pipeline advance will issue a commit-only recovery respawn rather than a + // normal retry, and will NOT consume a `retry_count` slot. + let (gates_passed, gate_output, needs_commit_recovery) = if let Some(wt_path) = worktree_path { let path = wt_path; - match tokio::task::spawn_blocking(move || { + match tokio::task::spawn_blocking(move || -> Result<(bool, String, bool), String> { // If the worktree is dirty, check whether committed work survived. // An agent crash (e.g. Claude Code CLI's `output.write(&bytes).is_ok()` // assertion — bug 645) can leave uncommitted files behind even though @@ -133,7 +136,10 @@ pub(in crate::agents::pool) async fn run_server_owned_completion( }) .unwrap_or(false) } else { - return Ok((false, dirty_msg)); + // Dirty worktree AND no committed work: the coder exited + // without committing. Signal a commit-only recovery respawn + // rather than consuming a normal retry slot. + return Ok((false, dirty_msg, true)); } } else { false @@ -152,9 +158,10 @@ pub(in crate::agents::pool) async fn run_server_owned_completion( "Agent exited with no commits on the feature branch. \ The agent did not produce any code changes." .to_string(), + false, )); } - let result = crate::agents::gates::run_acceptance_gates(&path); + let (passed, output) = crate::agents::gates::run_acceptance_gates(&path)?; // Restore stashed uncommitted changes. if stashed { let _ = std::process::Command::new("git") @@ -162,18 +169,19 @@ pub(in crate::agents::pool) async fn run_server_owned_completion( .current_dir(&path) .output(); } - result + Ok((passed, output, false)) }) .await { Ok(Ok(result)) => result, - Ok(Err(e)) => (false, e), - Err(e) => (false, format!("Gate check task panicked: {e}")), + Ok(Err(e)) => (false, e, false), + Err(e) => (false, format!("Gate check task panicked: {e}"), false), } } else { ( false, "No worktree path available to run acceptance gates".to_string(), + false, ) }; @@ -192,6 +200,7 @@ pub(in crate::agents::pool) async fn run_server_owned_completion( summary: "Agent process exited normally".to_string(), gates_passed, gate_output, + needs_commit_recovery, }; // Store completion report, extract data for pipeline advance, then diff --git a/server/src/agents/pool/pipeline/completion/tests.rs b/server/src/agents/pool/pipeline/completion/tests.rs index 72cf91c7..39f3dbda 100644 --- a/server/src/agents/pool/pipeline/completion/tests.rs +++ b/server/src/agents/pool/pipeline/completion/tests.rs @@ -97,6 +97,7 @@ async fn server_owned_completion_skips_when_already_completed() { summary: "Already done".to_string(), gates_passed: true, gate_output: String::new(), + needs_commit_recovery: false, }; pool.inject_test_agent_with_completion( "s10",