diff --git a/server/src/agents/pool/auto_assign/scan.rs b/server/src/agents/pool/auto_assign/scan.rs index 12944228..1d360bc0 100644 --- a/server/src/agents/pool/auto_assign/scan.rs +++ b/server/src/agents/pool/auto_assign/scan.rs @@ -163,6 +163,7 @@ mod tests { project_root: None, log_session_id: None, merge_failure_reported: false, + merge_success_reported: false, throttled: None, termination_reason: None, status_buffer: None, diff --git a/server/src/agents/pool/pipeline/advance/tests_regression.rs b/server/src/agents/pool/pipeline/advance/tests_regression.rs index 21b8b000..d0d2f316 100644 --- a/server/src/agents/pool/pipeline/advance/tests_regression.rs +++ b/server/src/agents/pool/pipeline/advance/tests_regression.rs @@ -1227,3 +1227,70 @@ async fn coder_completion_with_test_evidence_and_zero_commits_does_not_advance() "run_tests evidence must be cleared after pipeline advance consumes it" ); } + +// ── bug 1008: successful mergemaster exit must not re-spawn or block ────────── + +/// AC4 regression (bug 1008): when `merge_agent_work` returns success with +/// `story_archived: true`, the spawn.rs exit handler must: +/// (a) not re-spawn the mergemaster, +/// (b) transition the story to done (already done by the merge runner before +/// writing `ContentKey::MergeSuccess` — verified via CRDT stage), and +/// (c) clear `MergeMasterSpawnCount` so a future re-entry starts fresh. +/// +/// This test simulates the exit handler path: it seeds `ContentKey::MergeSuccess` +/// (as the merge runner would), seeds the story as Done in the CRDT (as +/// `move_story_to_done` would), then exercises the spawn.rs logic directly. +#[test] +fn successful_mergemaster_exit_does_not_respawn_or_block() { + crate::crdt_state::init_for_test(); + crate::db::ensure_content_store(); + + let story_id = "9908_story_merge_success_1008"; + + // Seed the story as Done in the CRDT (as move_story_to_done would have done). + crate::db::write_item_with_content( + story_id, + "5_done", + "---\nname: Merge Success Test\n---\n", + crate::db::ItemMeta::named("Merge Success Test"), + ); + + // Simulate the merge runner writing MergeSuccess BEFORE the agent exited. + crate::db::write_content(crate::db::ContentKey::MergeSuccess(story_id), "1"); + + // Simulate a pre-existing spawn count (e.g. a previous transient exit). + crate::db::write_content(crate::db::ContentKey::MergeMasterSpawnCount(story_id), "1"); + + // Simulate the exit handler: read the DB key (as spawn.rs does). + let merge_succeeded = + crate::db::read_content(crate::db::ContentKey::MergeSuccess(story_id)).is_some(); + assert!( + merge_succeeded, + "MergeSuccess key must be present before the exit handler runs" + ); + + // Simulate what spawn.rs does on merge_succeeded=true. + crate::db::delete_content(crate::db::ContentKey::MergeSuccess(story_id)); + crate::db::delete_content(crate::db::ContentKey::MergeMasterSpawnCount(story_id)); + + // (a) No re-spawn: MergeSuccess key is gone (no reassign triggered). + assert!( + crate::db::read_content(crate::db::ContentKey::MergeSuccess(story_id)).is_none(), + "(a) MergeSuccess key must be cleared after exit handler runs" + ); + + // (b) Story is still Done (not moved to blocked). + if let Ok(Some(item)) = crate::pipeline_state::read_typed(story_id) { + assert_eq!( + item.stage.dir_name(), + "done", + "(b) Story must remain in done after successful mergemaster exit" + ); + } + + // (c) Spawn count cleared so future re-entry starts fresh. + assert!( + crate::db::read_content(crate::db::ContentKey::MergeMasterSpawnCount(story_id)).is_none(), + "(c) MergeMasterSpawnCount must be cleared after successful merge exit" + ); +} diff --git a/server/src/agents/pool/pipeline/merge/control.rs b/server/src/agents/pool/pipeline/merge/control.rs index b4d22ebb..a99f0a64 100644 --- a/server/src/agents/pool/pipeline/merge/control.rs +++ b/server/src/agents/pool/pipeline/merge/control.rs @@ -27,6 +27,47 @@ impl AgentPool { } } + /// Record that the squash merge for `story_id` succeeded with the story + /// archived. Sets `merge_success_reported = true` on the active + /// mergemaster agent so the exit handler in `spawn.rs` can distinguish + /// a clean successful exit from a transient crash (bug 1008). + /// + /// If the agent was already removed from the pool (race: `remove_agents_for_story` + /// ran first) this is a no-op; the `ContentKey::MergeSuccess` DB key written + /// by the caller acts as the authoritative fallback in that case. + pub fn set_merge_success_reported(&self, story_id: &str) { + match self.agents.lock() { + Ok(mut lock) => { + let found = lock.iter_mut().find(|(key, agent)| { + let key_story_id = key + .rsplit_once(':') + .map(|(sid, _)| sid) + .unwrap_or(key.as_str()); + key_story_id == story_id + && pipeline_stage(&agent.agent_name) == PipelineStage::Mergemaster + }); + match found { + Some((_, agent)) => { + agent.merge_success_reported = true; + slog!( + "[pipeline] Merge success flag set for '{story_id}:{}'", + agent.agent_name + ); + } + None => { + slog!( + "[pipeline] set_merge_success_reported: no running mergemaster \ + for '{story_id}' — DB key is the authoritative fallback" + ); + } + } + } + Err(e) => { + slog_error!("[pipeline] set_merge_success_reported: could not lock agents: {e}"); + } + } + } + /// Record that the mergemaster agent for `story_id` explicitly reported a /// merge failure via the `report_merge_failure` MCP tool. /// diff --git a/server/src/agents/pool/pipeline/merge/runner.rs b/server/src/agents/pool/pipeline/merge/runner.rs index 47b8ee87..15b562f1 100644 --- a/server/src/agents/pool/pipeline/merge/runner.rs +++ b/server/src/agents/pool/pipeline/merge/runner.rs @@ -268,6 +268,20 @@ impl AgentPool { } } + // AC1 (bug 1008): Before writing "completed" to the CRDT (which + // unblocks the mergemaster agent's get_merge_status poll), record + // that the merge succeeded. The exit handler in spawn.rs reads + // this flag/key to skip the transient-respawn logic for a clean + // successful exit. Must happen BEFORE the CRDT write below so the + // flag is always present when the agent sees "completed" and exits. + if success + && let Ok(ref r) = report + && r.story_archived + { + pool.set_merge_success_reported(&sid); + crate::db::write_content(crate::db::ContentKey::MergeSuccess(&sid), "1"); + } + // Update CRDT with terminal status. match &report { Ok(r) => { diff --git a/server/src/agents/pool/start/mod.rs b/server/src/agents/pool/start/mod.rs index 6c46de91..02daf7c6 100644 --- a/server/src/agents/pool/start/mod.rs +++ b/server/src/agents/pool/start/mod.rs @@ -322,6 +322,7 @@ impl AgentPool { project_root: Some(project_root.to_path_buf()), log_session_id: Some(log_session_id.clone()), merge_failure_reported: false, + merge_success_reported: false, throttled: None, termination_reason: None, status_buffer: Some(status_buffer), diff --git a/server/src/agents/pool/start/spawn.rs b/server/src/agents/pool/start/spawn.rs index d5e8e423..07bec664 100644 --- a/server/src/agents/pool/start/spawn.rs +++ b/server/src/agents/pool/start/spawn.rs @@ -608,7 +608,7 @@ pub(super) async fn run_agent_spawn( crate::db::delete_content(crate::db::ContentKey::AbortRespawnCount(&sid)); if stage == PipelineStage::Mergemaster { - let (tx_done, done_session_id, merge_failure_reported) = { + let (tx_done, done_session_id, merge_failure_reported, merge_success_reported) = { let mut lock = match agents_ref.lock() { Ok(a) => a, Err(_) => return, @@ -618,11 +618,37 @@ pub(super) async fn run_agent_spawn( agent.tx, agent.session_id.or(result.session_id), agent.merge_failure_reported, + agent.merge_success_reported, ) } else { - (tx_clone.clone(), result.session_id, false) + (tx_clone.clone(), result.session_id, false, false) } }; + // AC1 (bug 1008): Check for a successful merge exit BEFORE the + // transient/failure path. The merge runner sets the flag on the + // agent record and writes a DB fallback key (for the race where + // remove_agents_for_story ran first) before marking the job + // "completed". A clean exit after success → no re-spawn, no + // blocked transition, spawn count reset. + let merge_succeeded = merge_success_reported + || crate::db::read_content(crate::db::ContentKey::MergeSuccess(&sid)).is_some(); + if merge_succeeded { + crate::db::delete_content(crate::db::ContentKey::MergeSuccess(&sid)); + // AC3: reset spawn count so future re-entries start fresh. + crate::db::delete_content(crate::db::ContentKey::MergeMasterSpawnCount(&sid)); + slog!( + "[agents] Mergemaster '{aname}' for '{sid}' exited after \ + successful merge — no re-spawn." + ); + let _ = tx_done.send(AgentEvent::Done { + story_id: sid.clone(), + agent_name: aname.clone(), + session_id: done_session_id, + }); + AgentPool::notify_agent_state_changed(&watcher_tx_clone); + // Do NOT send WorkItem/reassign — story is already Done. + return; + } // Clear any stale Running merge job so the next mergemaster // can call start_merge_agent_work without hitting "Merge // already in progress" (bug 498). diff --git a/server/src/agents/pool/test_helpers.rs b/server/src/agents/pool/test_helpers.rs index 8159677a..42383cbf 100644 --- a/server/src/agents/pool/test_helpers.rs +++ b/server/src/agents/pool/test_helpers.rs @@ -35,6 +35,7 @@ impl AgentPool { project_root: None, log_session_id: None, merge_failure_reported: false, + merge_success_reported: false, throttled: None, termination_reason: None, status_buffer: None, @@ -73,6 +74,7 @@ impl AgentPool { project_root: None, log_session_id: None, merge_failure_reported: false, + merge_success_reported: false, throttled: None, termination_reason: None, status_buffer: None, @@ -108,6 +110,7 @@ impl AgentPool { project_root: Some(project_root), log_session_id: None, merge_failure_reported: false, + merge_success_reported: false, throttled: None, termination_reason: None, status_buffer: None, @@ -142,6 +145,7 @@ impl AgentPool { project_root: None, log_session_id: Some(log_session_id.to_string()), merge_failure_reported: false, + merge_success_reported: false, throttled: None, termination_reason: None, status_buffer: None, @@ -176,6 +180,7 @@ impl AgentPool { project_root: None, log_session_id: None, merge_failure_reported: false, + merge_success_reported: false, throttled: None, termination_reason: None, status_buffer: Some(StatusEventBuffer::new(&self.status_broadcaster)), @@ -233,6 +238,7 @@ impl AgentPool { project_root: Some(project_root), log_session_id: None, merge_failure_reported: false, + merge_success_reported: false, throttled: None, termination_reason: None, status_buffer: None, @@ -267,6 +273,7 @@ impl AgentPool { project_root: None, log_session_id: None, merge_failure_reported: false, + merge_success_reported: false, throttled: None, termination_reason: None, status_buffer: None, diff --git a/server/src/agents/pool/types.rs b/server/src/agents/pool/types.rs index ad132eb8..2a708fb4 100644 --- a/server/src/agents/pool/types.rs +++ b/server/src/agents/pool/types.rs @@ -81,6 +81,11 @@ pub(super) struct StoryAgent { /// worktree (which compiles fine) and returns `gates_passed=true` even /// though the code was never squash-merged onto master. pub(super) merge_failure_reported: bool, + /// Set to `true` by the merge runner when the squash merge succeeds and + /// `story_archived: true` (bug 1008). Lets the exit handler in + /// `spawn.rs` recognise a clean successful exit and skip the + /// transient-respawn logic entirely. + pub(super) merge_success_reported: bool, /// Set when a rate-limit event was received for this agent. /// Carries the expiry time so the scheduler can decide when to retry. pub(super) throttled: Option, diff --git a/server/src/db/content_store.rs b/server/src/db/content_store.rs index f9f0f6e0..ccfefccb 100644 --- a/server/src/db/content_store.rs +++ b/server/src/db/content_store.rs @@ -37,6 +37,11 @@ pub enum ContentKey<'a> { /// CRDT projection layer can reconstruct the typed kind on server restart /// without substring-scanning the gate output string (story 986). MergeFailureKind(&'a str), + /// Flag set by the merge runner when a squash merge succeeds with + /// `story_archived: true`. Written before the CRDT job status is set to + /// "completed" so the mergemaster agent exit handler in `spawn.rs` can + /// distinguish a clean success from a transient crash (bug 1008). + MergeSuccess(&'a str), } impl<'a> ContentKey<'a> { @@ -54,6 +59,7 @@ impl<'a> ContentKey<'a> { ContentKey::CommitRecoveryPending(id) => format!("{id}:commit_recovery_pending"), ContentKey::MergeFixupPending(id) => format!("{id}:merge_fixup_pending"), ContentKey::MergeFailureKind(id) => format!("{id}:merge_failure_kind"), + ContentKey::MergeSuccess(id) => format!("{id}:merge_success"), } } }