From b09b6ce4f1e3e770d3b9efb9894b52aaea34fd38 Mon Sep 17 00:00:00 2001
From: Dave
Date: Mon, 23 Feb 2026 20:46:51 +0000
Subject: [PATCH] fix(agents): enforce single-instance concurrency per agent
 name
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The agent pool allowed the same agent (e.g. "qa") to run concurrently on
multiple stories because start_agent() only checked whether that
story+agent combo was already active. It did not check whether the agent
was busy on a different story. Two concurrent QA runs each spawn cargo
clippy + cargo test + vitest, causing extreme CPU load (load average >33
on M1 Mac).

Fix: before registering a new agent as Pending, scan all active entries
for any Running or Pending entry with the same agent_name. If one is
found, return an error explaining that the story will be picked up when
the agent becomes available.

The existing auto_assign_available_work() mechanism already scans
pipeline directories (3_qa/, 4_merge/, etc.) for unassigned stories and
uses find_free_agent_for_stage() — which respects single-instance
limits — to assign work when an agent slot opens up. So the queuing
behaviour is naturally provided: the story stays in its directory, and
auto-assign picks it up when the previous run completes.

Adds two regression tests:
- start_agent_rejects_when_same_agent_already_running_on_another_story
- start_agent_allows_new_story_when_previous_run_is_completed

Co-Authored-By: Claude Sonnet 4.6
---
 server/src/agents.rs | 101 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 101 insertions(+)

diff --git a/server/src/agents.rs b/server/src/agents.rs
index ebe0f2a..e122702 100644
--- a/server/src/agents.rs
+++ b/server/src/agents.rs
@@ -204,6 +204,29 @@ impl AgentPool {
                     agent.status
                 ));
             }
+            // Enforce single-instance concurrency: if this agent is already running on
+            // any other story, reject the request. The story remains in its current pipeline
+            // directory and `auto_assign_available_work` will pick it up when the agent
+            // becomes free.
+            if let Some(busy_story) = agents.iter().find_map(|(k, a)| {
+                if a.agent_name == resolved_name
+                    && matches!(a.status, AgentStatus::Running | AgentStatus::Pending)
+                {
+                    Some(
+                        k.rsplit_once(':')
+                            .map(|(sid, _)| sid)
+                            .unwrap_or(k)
+                            .to_string(),
+                    )
+                } else {
+                    None
+                }
+            }) {
+                return Err(format!(
+                    "Agent '{resolved_name}' is already running on story '{busy_story}'; \
+                     story '{story_id}' will be picked up when the agent becomes available"
+                ));
+            }
         }
 
         let (tx, _) = broadcast::channel::(1024);
@@ -3327,6 +3350,84 @@ name = "qa"
         assert_eq!(find_active_story_stage(tmp.path(), "99_nonexistent"), None);
     }
 
+    // ── start_agent single-instance concurrency tests ─────────────────────────
+
+    /// Regression test for bug 97: the agent pool must reject a second concurrent
+    /// instance of the same agent name even if it would run on a different story.
+    #[tokio::test]
+    async fn start_agent_rejects_when_same_agent_already_running_on_another_story() {
+        use std::fs;
+
+        let tmp = tempfile::tempdir().unwrap();
+        let root = tmp.path();
+
+        // Write a minimal project.toml so ProjectConfig::load can find the "qa" agent.
+        let sk_dir = root.join(".story_kit");
+        fs::create_dir_all(&sk_dir).unwrap();
+        fs::write(
+            sk_dir.join("project.toml"),
+            "[[agent]]\nname = \"qa\"\n",
+        )
+        .unwrap();
+
+        let pool = AgentPool::new(3001);
+        // Simulate qa already running on story-a.
+        pool.inject_test_agent("story-a", "qa", AgentStatus::Running);
+
+        // Attempt to start qa on story-b — must be rejected.
+        let result = pool
+            .start_agent(root, "story-b", Some("qa"), None)
+            .await;
+
+        assert!(
+            result.is_err(),
+            "start_agent should fail when qa is already running on another story"
+        );
+        let err = result.unwrap_err();
+        assert!(
+            err.contains("already running") || err.contains("becomes available"),
+            "error message should explain why: got '{err}'"
+        );
+    }
+
+    /// Verify that the concurrency guard does NOT block an agent that is merely
+    /// Completed (not Running/Pending) — completed agents are free for new work.
+    #[tokio::test]
+    async fn start_agent_allows_new_story_when_previous_run_is_completed() {
+        use std::fs;
+
+        let tmp = tempfile::tempdir().unwrap();
+        let root = tmp.path();
+
+        let sk_dir = root.join(".story_kit");
+        fs::create_dir_all(&sk_dir).unwrap();
+        fs::write(
+            sk_dir.join("project.toml"),
+            "[[agent]]\nname = \"qa\"\n",
+        )
+        .unwrap();
+
+        let pool = AgentPool::new(3001);
+        // Previous run completed — should NOT block a new story.
+        pool.inject_test_agent("story-a", "qa", AgentStatus::Completed);
+
+        // The call will fail eventually (no real worktree / Claude CLI), but it must
+        // NOT fail at the concurrency check. We detect the difference by inspecting
+        // the error message: a concurrency rejection says "already running", while a
+        // later failure (missing story file, missing claude binary, etc.) says something else.
+        let result = pool
+            .start_agent(root, "story-b", Some("qa"), None)
+            .await;
+
+        if let Err(ref e) = result {
+            assert!(
+                !e.contains("already running") && !e.contains("becomes available"),
+                "completed agent must not trigger the concurrency guard: got '{e}'"
+            );
+        }
+        // result may be Ok (unlikely in test env) or Err for infra reasons — both fine.
+    }
+
     // ── worktree_has_committed_work tests ─────────────────────────────────────
 
     #[test]