diff --git a/server/src/agents/pool/auto_assign/auto_assign.rs b/server/src/agents/pool/auto_assign/auto_assign.rs index 6d08b27f..6b4a896d 100644 --- a/server/src/agents/pool/auto_assign/auto_assign.rs +++ b/server/src/agents/pool/auto_assign/auto_assign.rs @@ -657,6 +657,92 @@ mod tests { ); } + // ── Story 920: transient vs genuine mergemaster termination ────────────── + + /// AC4 (transient): a mergemaster that was killed transiently (no + /// report_merge_failure, spawn count below cap) must be re-spawned by the + /// next auto-assign pass — `mergemaster_attempted` stays false. + #[tokio::test] + async fn transient_mergemaster_exit_allows_respawn() { + let tmp = tempfile::tempdir().unwrap(); + let sk = tmp.path().join(".huskies"); + std::fs::create_dir_all(&sk).unwrap(); + std::fs::write( + sk.join("project.toml"), + "[[agent]]\nname = \"mergemaster\"\nstage = \"mergemaster\"\n", + ) + .unwrap(); + crate::crdt_state::init_for_test(); + crate::db::ensure_content_store(); + crate::db::write_item_with_content( + "920_story_transient", + "4_merge_failure", + "---\nname: Transient\nmerge_failure: \"CONFLICT (content): foo.rs\"\n---\n", + crate::db::ItemMeta::from_yaml( + "---\nname: Transient\nmerge_failure: \"CONFLICT (content): foo.rs\"\n---\n", + ), + ); + // Simulate two previous transient exits (below cap of 3) recorded in DB. + crate::db::write_content("920_story_transient:mergemaster_spawn_count", "2"); + + // mergemaster_attempted must still be false (transient exits don't set it). + let pool = AgentPool::new_test(3001); + pool.auto_assign_available_work(tmp.path()).await; + + let agents = pool.agents.lock().unwrap(); + let respawned = agents.iter().any(|(key, a)| { + key.contains("920_story_transient") + && a.agent_name == "mergemaster" + && matches!(a.status, AgentStatus::Pending | AgentStatus::Running) + }); + assert!( + respawned, + "mergemaster must re-spawn after transient terminations while below cap" + ); + } + + /// AC4 (genuine): after report_merge_failure, mergemaster_attempted is set + /// to true and auto-assign must not trigger another re-spawn. + #[tokio::test] + async fn genuine_mergemaster_exit_no_respawn() { + let tmp = tempfile::tempdir().unwrap(); + let sk = tmp.path().join(".huskies"); + std::fs::create_dir_all(&sk).unwrap(); + std::fs::write( + sk.join("project.toml"), + "[[agent]]\nname = \"mergemaster\"\nstage = \"mergemaster\"\n", + ) + .unwrap(); + crate::crdt_state::init_for_test(); + crate::db::ensure_content_store(); + // mergemaster_attempted is set by the exit path when genuine give-up occurs. + crate::db::write_item_with_content( + "920_story_genuine", + "4_merge_failure", + "---\nname: Genuine\nmerge_failure: \"CONFLICT (content): bar.rs\"\n---\n", + crate::db::ItemMeta::from_yaml( + "---\nname: Genuine\nmerge_failure: \"CONFLICT (content): bar.rs\"\n---\n", + ), + ); + // The CRDT register is the sole authority; set it explicitly as the + // spawn exit path would after report_merge_failure. + crate::crdt_state::set_mergemaster_attempted("920_story_genuine", true); + + let pool = AgentPool::new_test(3001); + pool.auto_assign_available_work(tmp.path()).await; + + let agents = pool.agents.lock().unwrap(); + let spawned = agents.iter().any(|(key, a)| { + key.contains("920_story_genuine") + && a.agent_name == "mergemaster" + && matches!(a.status, AgentStatus::Pending | AgentStatus::Running) + }); + assert!( + !spawned, + "mergemaster must not re-spawn after genuine give-up (mergemaster_attempted=true)" + ); + } + /// Two concurrent auto_assign_available_work calls must not assign the same /// agent to two stories simultaneously. After both complete, at most one /// Pending/Running entry must exist per agent name. diff --git a/server/src/agents/pool/auto_assign/merge.rs b/server/src/agents/pool/auto_assign/merge.rs index cd6a4e69..8207331f 100644 --- a/server/src/agents/pool/auto_assign/merge.rs +++ b/server/src/agents/pool/auto_assign/merge.rs @@ -146,7 +146,6 @@ impl AgentPool { "[auto-assign] Content conflict on '{story_id}'; \ auto-spawning mergemaster '{agent_name}'." ); - crate::crdt_state::set_mergemaster_attempted(story_id, true); if let Err(e) = self .start_agent(project_root, story_id, Some(&agent_name), None, None) .await diff --git a/server/src/agents/pool/start/spawn.rs b/server/src/agents/pool/start/spawn.rs index 52ece712..c36e095d 100644 --- a/server/src/agents/pool/start/spawn.rs +++ b/server/src/agents/pool/start/spawn.rs @@ -512,15 +512,19 @@ pub(super) async fn run_agent_spawn( crate::db::delete_content(&format!("{sid}:abort_respawn_count")); if stage == PipelineStage::Mergemaster { - let (tx_done, done_session_id) = { + let (tx_done, done_session_id, merge_failure_reported) = { let mut lock = match agents_ref.lock() { Ok(a) => a, Err(_) => return, }; if let Some(agent) = lock.remove(&key_clone) { - (agent.tx, agent.session_id.or(result.session_id)) + ( + agent.tx, + agent.session_id.or(result.session_id), + agent.merge_failure_reported, + ) } else { - (tx_clone.clone(), result.session_id) + (tx_clone.clone(), result.session_id, false) } }; // Clear any stale Running merge job so the next mergemaster @@ -531,6 +535,44 @@ pub(super) async fn run_agent_spawn( { crate::crdt_state::delete_merge_job(&sid); } + // Classify termination: genuine (report_merge_failure called, or + // the transient-respawn budget is exhausted) vs transient + // (watchdog / rate-limit / crash without an explicit give-up call). + // Only mark mergemaster_attempted on a genuine give-up so that + // transient exits can be re-spawned up to the cap (story 920). + const MERGEMASTER_RESPAWN_CAP: u32 = 3; + let spawn_count_key = format!("{sid}:mergemaster_spawn_count"); + let is_genuine = if merge_failure_reported { + slog!( + "[agents] Mergemaster '{aname}' for '{sid}' gave up genuinely \ + (report_merge_failure called)." + ); + true + } else { + let count = crate::db::read_content(&spawn_count_key) + .and_then(|s| s.trim().parse::().ok()) + .unwrap_or(0) + + 1; + crate::db::write_content(&spawn_count_key, &count.to_string()); + if count >= MERGEMASTER_RESPAWN_CAP { + slog!( + "[agents] Mergemaster '{aname}' for '{sid}' exhausted \ + respawn budget ({count}/{MERGEMASTER_RESPAWN_CAP}); \ + marking as permanently blocked." + ); + true + } else { + slog!( + "[agents] Mergemaster '{aname}' for '{sid}' terminated \ + transiently (spawn {count}/{MERGEMASTER_RESPAWN_CAP}); \ + will re-spawn." + ); + false + } + }; + if is_genuine { + crate::crdt_state::set_mergemaster_attempted(&sid, true); + } let _ = tx_done.send(AgentEvent::Done { story_id: sid.clone(), agent_name: aname.clone(),