storkit: merge 445_bug_rate_limited_mergemaster_exits_advance_stories_to_done_without_merging

This commit is contained in:
dave
2026-03-28 20:05:10 +00:00
parent 8e45b2a08d
commit 3048d26e66
2 changed files with 152 additions and 13 deletions
+57 -12
View File
@@ -498,18 +498,63 @@ impl AgentPool {
}
}
// Server-owned completion: run acceptance gates automatically
// when the agent process exits normally.
super::pipeline::run_server_owned_completion(
&agents_ref,
port_for_task,
&sid,
&aname,
result.session_id,
watcher_tx_clone.clone(),
)
.await;
AgentPool::notify_agent_state_changed(&watcher_tx_clone);
// Mergemaster agents have their own completion path via
// start_merge_agent_work / run_merge_pipeline and must NOT go
// through server-owned gates. When a mergemaster exits early
// (e.g. rate-limited before calling start_merge_agent_work) the
// feature-branch worktree compiles fine and post-merge tests on
// master pass (nothing changed), which would wrongly advance the
// story to 5_done/ without any squash merge having occurred.
// Instead: just remove the agent from the pool and let
// auto-assign restart a new mergemaster for the story.
let stage = config_clone
.find_agent(&aname)
.map(agent_config_stage)
.unwrap_or_else(|| pipeline_stage(&aname));
if stage == PipelineStage::Mergemaster {
let (tx_done, done_session_id) = {
let mut lock = match agents_ref.lock() {
Ok(a) => a,
Err(_) => return,
};
if let Some(agent) = lock.remove(&key_clone) {
(agent.tx, agent.session_id.or(result.session_id))
} else {
(tx_clone.clone(), result.session_id)
}
};
let _ = tx_done.send(AgentEvent::Done {
story_id: sid.clone(),
agent_name: aname.clone(),
session_id: done_session_id,
});
AgentPool::notify_agent_state_changed(&watcher_tx_clone);
// Send a WorkItem event so the auto-assign watcher loop
// re-dispatches a new mergemaster if the story still needs
// merging. This avoids an async call to start_agent inside
// a tokio::spawn (which would require Send).
let _ = watcher_tx_clone.send(
crate::io::watcher::WatcherEvent::WorkItem {
stage: "4_merge".to_string(),
item_id: sid.clone(),
action: "reassign".to_string(),
commit_msg: String::new(),
},
);
} else {
// Server-owned completion: run acceptance gates automatically
// when the agent process exits normally.
super::pipeline::run_server_owned_completion(
&agents_ref,
port_for_task,
&sid,
&aname,
result.session_id,
watcher_tx_clone.clone(),
)
.await;
AgentPool::notify_agent_state_changed(&watcher_tx_clone);
}
}
Err(e) => {
slog_error!("[agents] Agent process error for {aname} on {sid}: {e}");
+95 -1
View File
@@ -4,7 +4,7 @@ use std::collections::HashMap;
use std::sync::{Arc, Mutex};
use tokio::sync::broadcast;
use super::super::super::{AgentEvent, AgentStatus, CompletionReport};
use super::super::super::{AgentEvent, AgentStatus, CompletionReport, PipelineStage, pipeline_stage};
use super::super::{AgentPool, StoryAgent, composite_key};
use super::advance::spawn_pipeline_advance;
@@ -155,6 +155,21 @@ pub(in crate::agents::pool) async fn run_server_owned_completion(
) {
let key = composite_key(story_id, agent_name);
// Guard: mergemaster agents have their own completion path via
// start_merge_agent_work / run_merge_pipeline. Running server-owned gates
// for a mergemaster would wrongly advance the story to 5_done/ even when
// no squash merge has occurred (e.g. rate-limited exit before the agent
// called start_merge_agent_work). The lifecycle caller is responsible for
// cleaning up the agent entry and triggering auto-assign.
if pipeline_stage(agent_name) == PipelineStage::Mergemaster {
slog!(
"[agents] run_server_owned_completion skipped for mergemaster \
'{story_id}:{agent_name}'; mergemaster completion is handled by \
start_merge_agent_work."
);
return;
}
// Guard: skip if completion was already recorded (legacy path).
{
let lock = match agents.lock() {
@@ -516,4 +531,83 @@ mod tests {
)
.await;
}
/// Regression test for bug 445: a rate-limited mergemaster exits before
/// calling start_merge_agent_work. run_server_owned_completion must be a
/// no-op for mergemaster agents — it must not run acceptance gates and must
/// not advance the story to 5_done/ even when a passing script/test exists.
///
/// Before the fix: run_server_owned_completion would call run_pipeline_advance
/// for the Mergemaster stage, which ran post-merge tests on master (they pass
/// because nothing changed), then called move_story_to_done — advancing the
/// story without any squash merge having occurred.
#[cfg(unix)]
#[tokio::test]
async fn server_owned_completion_is_noop_for_mergemaster() {
use std::fs;
use std::os::unix::fs::PermissionsExt;
use tempfile::tempdir;
let tmp = tempdir().unwrap();
let root = tmp.path();
init_git_repo(root);
// Create a passing script/test so post-merge tests would succeed if
// run_pipeline_advance were incorrectly called for this mergemaster.
let script_dir = root.join("script");
fs::create_dir_all(&script_dir).unwrap();
let script_test = script_dir.join("test");
fs::write(&script_test, "#!/usr/bin/env sh\nexit 0\n").unwrap();
let mut perms = fs::metadata(&script_test).unwrap().permissions();
perms.set_mode(0o755);
fs::set_permissions(&script_test, perms).unwrap();
// Story in 4_merge/ — must NOT be moved to 5_done/.
let merge_dir = root.join(".storkit/work/4_merge");
fs::create_dir_all(&merge_dir).unwrap();
let story_path = merge_dir.join("99_story_merge445.md");
fs::write(&story_path, "---\nname: Merge 445 Test\n---\n").unwrap();
let pool = AgentPool::new_test(3001);
pool.inject_test_agent_with_path(
"99_story_merge445",
"mergemaster",
AgentStatus::Running,
root.to_path_buf(),
);
run_server_owned_completion(
&pool.agents,
pool.port,
"99_story_merge445",
"mergemaster",
None,
pool.watcher_tx.clone(),
)
.await;
// Wait briefly in case any background task fires.
tokio::time::sleep(std::time::Duration::from_millis(150)).await;
// Story must remain in 4_merge/ — not moved to 5_done/.
let done_path = root.join(".storkit/work/5_done/99_story_merge445.md");
assert!(
!done_path.exists(),
"Story must NOT be moved to 5_done/ when run_server_owned_completion \
is (incorrectly) called for a mergemaster agent"
);
assert!(
story_path.exists(),
"Story must remain in 4_merge/ when mergemaster completion is a no-op"
);
// The agent entry should remain in the pool (lifecycle cleanup is the
// caller's responsibility, not run_server_owned_completion's).
let agents = pool.agents.lock().unwrap();
let key = composite_key("99_story_merge445", "mergemaster");
assert!(
agents.get(&key).is_some(),
"Agent must remain in pool — run_server_owned_completion is a no-op for mergemaster"
);
}
}