diff --git a/server/src/agents/pool/pipeline/merge.rs b/server/src/agents/pool/pipeline/merge.rs index f482c5a5..006b8916 100644 --- a/server/src/agents/pool/pipeline/merge.rs +++ b/server/src/agents/pool/pipeline/merge.rs @@ -212,6 +212,70 @@ mod tests { .unwrap(); } + // ── bug 498: stale Running job blocks retry ─────────────────────────────── + + /// Regression test for bug 498: a Running merge job left behind by a killed + /// mergemaster must not block the next call to start_merge_agent_work. + /// + /// Before the fix: start_merge_agent_work would return "Merge already in + /// progress" when a Running entry existed, even after the mergemaster died. + /// After the fix: the entry is cleared when the mergemaster exits, so a new + /// call succeeds. + #[tokio::test] + async fn stale_running_merge_job_is_cleared_and_retry_succeeds() { + use tempfile::tempdir; + + let tmp = tempdir().unwrap(); + let repo = tmp.path(); + init_git_repo(repo); + + let pool = Arc::new(AgentPool::new_test(3001)); + + // Inject a stale Running entry, simulating a mergemaster that died + // before the merge pipeline completed. + { + let mut jobs = pool.merge_jobs.lock().unwrap(); + jobs.insert( + "77_story_stale".to_string(), + MergeJob { + story_id: "77_story_stale".to_string(), + status: MergeJobStatus::Running, + }, + ); + } + + // With a stale Running entry, start_merge_agent_work must be blocked. + let blocked = pool.start_merge_agent_work(repo, "77_story_stale"); + assert!( + blocked.is_err(), + "start_merge_agent_work must be blocked while Running job exists" + ); + let err_msg = blocked.unwrap_err(); + assert!( + err_msg.contains("already in progress"), + "unexpected error: {err_msg}" + ); + + // Simulate the mergemaster exit path: clear the stale Running entry. + { + let mut jobs = pool.merge_jobs.lock().unwrap(); + if let Some(job) = jobs.get("77_story_stale") + && matches!(job.status, MergeJobStatus::Running) + { + jobs.remove("77_story_stale"); + } + } + + // After clearing, start_merge_agent_work must succeed (it will fail + // the pipeline because there's no feature branch, but it must not be + // blocked by "Merge already in progress"). + let result = pool.start_merge_agent_work(repo, "77_story_stale"); + assert!( + result.is_ok(), + "start_merge_agent_work must succeed after stale Running job is cleared; got: {result:?}" + ); + } + // ── merge_agent_work tests ──────────────────────────────────────────────── /// Helper: start a merge and poll until terminal state. diff --git a/server/src/agents/pool/start.rs b/server/src/agents/pool/start.rs index b1bf6934..afc9f489 100644 --- a/server/src/agents/pool/start.rs +++ b/server/src/agents/pool/start.rs @@ -320,6 +320,7 @@ impl AgentPool { let log_writer_clone = log_writer.clone(); let child_killers_clone = self.child_killers.clone(); let watcher_tx_clone = self.watcher_tx.clone(); + let merge_jobs_clone = Arc::clone(&self.merge_jobs); // Spawn the background task. Worktree creation and agent launch happen here // so `start_agent` returns immediately after registering the agent as @@ -524,6 +525,15 @@ impl AgentPool { (tx_clone.clone(), result.session_id) } }; + // Clear any stale Running merge job so the next mergemaster + // can call start_merge_agent_work without hitting "Merge + // already in progress" (bug 498). + if let Ok(mut jobs) = merge_jobs_clone.lock() + && let Some(job) = jobs.get(&sid) + && matches!(job.status, crate::agents::merge::MergeJobStatus::Running) + { + jobs.remove(&sid); + } let _ = tx_done.send(AgentEvent::Done { story_id: sid.clone(), agent_name: aname.clone(),