From 3048d26e66c53e5346a51e4ae84648fc8500d6b9 Mon Sep 17 00:00:00 2001 From: dave Date: Sat, 28 Mar 2026 20:05:10 +0000 Subject: [PATCH] storkit: merge 445_bug_rate_limited_mergemaster_exits_advance_stories_to_done_without_merging --- server/src/agents/pool/lifecycle.rs | 69 ++++++++++--- server/src/agents/pool/pipeline/completion.rs | 96 ++++++++++++++++++- 2 files changed, 152 insertions(+), 13 deletions(-) diff --git a/server/src/agents/pool/lifecycle.rs b/server/src/agents/pool/lifecycle.rs index ecfa1272..c6584820 100644 --- a/server/src/agents/pool/lifecycle.rs +++ b/server/src/agents/pool/lifecycle.rs @@ -498,18 +498,63 @@ impl AgentPool { } } - // Server-owned completion: run acceptance gates automatically - // when the agent process exits normally. - super::pipeline::run_server_owned_completion( - &agents_ref, - port_for_task, - &sid, - &aname, - result.session_id, - watcher_tx_clone.clone(), - ) - .await; - AgentPool::notify_agent_state_changed(&watcher_tx_clone); + // Mergemaster agents have their own completion path via + // start_merge_agent_work / run_merge_pipeline and must NOT go + // through server-owned gates. When a mergemaster exits early + // (e.g. rate-limited before calling start_merge_agent_work) the + // feature-branch worktree compiles fine and post-merge tests on + // master pass (nothing changed), which would wrongly advance the + // story to 5_done/ without any squash merge having occurred. + // Instead: just remove the agent from the pool and let + // auto-assign restart a new mergemaster for the story. + let stage = config_clone + .find_agent(&aname) + .map(agent_config_stage) + .unwrap_or_else(|| pipeline_stage(&aname)); + if stage == PipelineStage::Mergemaster { + let (tx_done, done_session_id) = { + let mut lock = match agents_ref.lock() { + Ok(a) => a, + Err(_) => return, + }; + if let Some(agent) = lock.remove(&key_clone) { + (agent.tx, agent.session_id.or(result.session_id)) + } else { + (tx_clone.clone(), result.session_id) + } + }; + let _ = tx_done.send(AgentEvent::Done { + story_id: sid.clone(), + agent_name: aname.clone(), + session_id: done_session_id, + }); + AgentPool::notify_agent_state_changed(&watcher_tx_clone); + // Send a WorkItem event so the auto-assign watcher loop + // re-dispatches a new mergemaster if the story still needs + // merging. This avoids an async call to start_agent inside + // a tokio::spawn (which would require Send). + let _ = watcher_tx_clone.send( + crate::io::watcher::WatcherEvent::WorkItem { + stage: "4_merge".to_string(), + item_id: sid.clone(), + action: "reassign".to_string(), + commit_msg: String::new(), + }, + ); + } else { + // Server-owned completion: run acceptance gates automatically + // when the agent process exits normally. + super::pipeline::run_server_owned_completion( + &agents_ref, + port_for_task, + &sid, + &aname, + result.session_id, + watcher_tx_clone.clone(), + ) + .await; + AgentPool::notify_agent_state_changed(&watcher_tx_clone); + } } Err(e) => { slog_error!("[agents] Agent process error for {aname} on {sid}: {e}"); diff --git a/server/src/agents/pool/pipeline/completion.rs b/server/src/agents/pool/pipeline/completion.rs index df88a9c1..70aef62d 100644 --- a/server/src/agents/pool/pipeline/completion.rs +++ b/server/src/agents/pool/pipeline/completion.rs @@ -4,7 +4,7 @@ use std::collections::HashMap; use std::sync::{Arc, Mutex}; use tokio::sync::broadcast; -use super::super::super::{AgentEvent, AgentStatus, CompletionReport}; +use super::super::super::{AgentEvent, AgentStatus, CompletionReport, PipelineStage, pipeline_stage}; use super::super::{AgentPool, StoryAgent, composite_key}; use super::advance::spawn_pipeline_advance; @@ -155,6 +155,21 @@ pub(in crate::agents::pool) async fn run_server_owned_completion( ) { let key = composite_key(story_id, agent_name); + // Guard: mergemaster agents have their own completion path via + // start_merge_agent_work / run_merge_pipeline. Running server-owned gates + // for a mergemaster would wrongly advance the story to 5_done/ even when + // no squash merge has occurred (e.g. rate-limited exit before the agent + // called start_merge_agent_work). The lifecycle caller is responsible for + // cleaning up the agent entry and triggering auto-assign. + if pipeline_stage(agent_name) == PipelineStage::Mergemaster { + slog!( + "[agents] run_server_owned_completion skipped for mergemaster \ + '{story_id}:{agent_name}'; mergemaster completion is handled by \ + start_merge_agent_work." + ); + return; + } + // Guard: skip if completion was already recorded (legacy path). { let lock = match agents.lock() { @@ -516,4 +531,83 @@ mod tests { ) .await; } + + /// Regression test for bug 445: a rate-limited mergemaster exits before + /// calling start_merge_agent_work. run_server_owned_completion must be a + /// no-op for mergemaster agents — it must not run acceptance gates and must + /// not advance the story to 5_done/ even when a passing script/test exists. + /// + /// Before the fix: run_server_owned_completion would call run_pipeline_advance + /// for the Mergemaster stage, which ran post-merge tests on master (they pass + /// because nothing changed), then called move_story_to_done — advancing the + /// story without any squash merge having occurred. + #[cfg(unix)] + #[tokio::test] + async fn server_owned_completion_is_noop_for_mergemaster() { + use std::fs; + use std::os::unix::fs::PermissionsExt; + use tempfile::tempdir; + + let tmp = tempdir().unwrap(); + let root = tmp.path(); + init_git_repo(root); + + // Create a passing script/test so post-merge tests would succeed if + // run_pipeline_advance were incorrectly called for this mergemaster. + let script_dir = root.join("script"); + fs::create_dir_all(&script_dir).unwrap(); + let script_test = script_dir.join("test"); + fs::write(&script_test, "#!/usr/bin/env sh\nexit 0\n").unwrap(); + let mut perms = fs::metadata(&script_test).unwrap().permissions(); + perms.set_mode(0o755); + fs::set_permissions(&script_test, perms).unwrap(); + + // Story in 4_merge/ — must NOT be moved to 5_done/. + let merge_dir = root.join(".storkit/work/4_merge"); + fs::create_dir_all(&merge_dir).unwrap(); + let story_path = merge_dir.join("99_story_merge445.md"); + fs::write(&story_path, "---\nname: Merge 445 Test\n---\n").unwrap(); + + let pool = AgentPool::new_test(3001); + pool.inject_test_agent_with_path( + "99_story_merge445", + "mergemaster", + AgentStatus::Running, + root.to_path_buf(), + ); + + run_server_owned_completion( + &pool.agents, + pool.port, + "99_story_merge445", + "mergemaster", + None, + pool.watcher_tx.clone(), + ) + .await; + + // Wait briefly in case any background task fires. + tokio::time::sleep(std::time::Duration::from_millis(150)).await; + + // Story must remain in 4_merge/ — not moved to 5_done/. + let done_path = root.join(".storkit/work/5_done/99_story_merge445.md"); + assert!( + !done_path.exists(), + "Story must NOT be moved to 5_done/ when run_server_owned_completion \ + is (incorrectly) called for a mergemaster agent" + ); + assert!( + story_path.exists(), + "Story must remain in 4_merge/ when mergemaster completion is a no-op" + ); + + // The agent entry should remain in the pool (lifecycle cleanup is the + // caller's responsibility, not run_server_owned_completion's). + let agents = pool.agents.lock().unwrap(); + let key = composite_key("99_story_merge445", "mergemaster"); + assert!( + agents.get(&key).is_some(), + "Agent must remain in pool — run_server_owned_completion is a no-op for mergemaster" + ); + } }