fix(agents): enforce single-instance concurrency per agent name
The agent pool allowed the same agent (e.g. "qa") to run concurrently on multiple stories because start_agent() only checked whether that story+agent combo was already active. It did not check whether the agent was busy on a different story. Two concurrent QA runs each spawn cargo clippy + cargo test + vitest, causing extreme CPU load (load average >33 on M1 Mac). Fix: before registering a new agent as Pending, scan all active entries for any Running or Pending entry with the same agent_name. If one is found, return an error explaining that the story will be picked up when the agent becomes available. The existing auto_assign_available_work() mechanism already scans pipeline directories (3_qa/, 4_merge/, etc.) for unassigned stories and uses find_free_agent_for_stage() — which respects single-instance limits — to assign work when an agent slot opens up. So the queuing behaviour is naturally provided: the story stays in its directory, and auto-assign picks it up when the previous run completes. Adds two regression tests: - start_agent_rejects_when_same_agent_already_running_on_another_story - start_agent_allows_new_story_when_previous_run_is_completed Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -204,6 +204,29 @@ impl AgentPool {
|
||||
agent.status
|
||||
));
|
||||
}
|
||||
// Enforce single-instance concurrency: if this agent is already running on
|
||||
// any other story, reject the request. The story remains in its current pipeline
|
||||
// directory and `auto_assign_available_work` will pick it up when the agent
|
||||
// becomes free.
|
||||
if let Some(busy_story) = agents.iter().find_map(|(k, a)| {
|
||||
if a.agent_name == resolved_name
|
||||
&& matches!(a.status, AgentStatus::Running | AgentStatus::Pending)
|
||||
{
|
||||
Some(
|
||||
k.rsplit_once(':')
|
||||
.map(|(sid, _)| sid)
|
||||
.unwrap_or(k)
|
||||
.to_string(),
|
||||
)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}) {
|
||||
return Err(format!(
|
||||
"Agent '{resolved_name}' is already running on story '{busy_story}'; \
|
||||
story '{story_id}' will be picked up when the agent becomes available"
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
let (tx, _) = broadcast::channel::<AgentEvent>(1024);
|
||||
@@ -3327,6 +3350,84 @@ name = "qa"
|
||||
assert_eq!(find_active_story_stage(tmp.path(), "99_nonexistent"), None);
|
||||
}
|
||||
|
||||
// ── start_agent single-instance concurrency tests ─────────────────────────
|
||||
|
||||
/// Regression test for bug 97: the agent pool must reject a second concurrent
|
||||
/// instance of the same agent name even if it would run on a different story.
|
||||
#[tokio::test]
|
||||
async fn start_agent_rejects_when_same_agent_already_running_on_another_story() {
|
||||
use std::fs;
|
||||
|
||||
let tmp = tempfile::tempdir().unwrap();
|
||||
let root = tmp.path();
|
||||
|
||||
// Write a minimal project.toml so ProjectConfig::load can find the "qa" agent.
|
||||
let sk_dir = root.join(".story_kit");
|
||||
fs::create_dir_all(&sk_dir).unwrap();
|
||||
fs::write(
|
||||
sk_dir.join("project.toml"),
|
||||
"[[agent]]\nname = \"qa\"\n",
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let pool = AgentPool::new(3001);
|
||||
// Simulate qa already running on story-a.
|
||||
pool.inject_test_agent("story-a", "qa", AgentStatus::Running);
|
||||
|
||||
// Attempt to start qa on story-b — must be rejected.
|
||||
let result = pool
|
||||
.start_agent(root, "story-b", Some("qa"), None)
|
||||
.await;
|
||||
|
||||
assert!(
|
||||
result.is_err(),
|
||||
"start_agent should fail when qa is already running on another story"
|
||||
);
|
||||
let err = result.unwrap_err();
|
||||
assert!(
|
||||
err.contains("already running") || err.contains("becomes available"),
|
||||
"error message should explain why: got '{err}'"
|
||||
);
|
||||
}
|
||||
|
||||
/// Verify that the concurrency guard does NOT block an agent that is merely
|
||||
/// Completed (not Running/Pending) — completed agents are free for new work.
|
||||
#[tokio::test]
|
||||
async fn start_agent_allows_new_story_when_previous_run_is_completed() {
|
||||
use std::fs;
|
||||
|
||||
let tmp = tempfile::tempdir().unwrap();
|
||||
let root = tmp.path();
|
||||
|
||||
let sk_dir = root.join(".story_kit");
|
||||
fs::create_dir_all(&sk_dir).unwrap();
|
||||
fs::write(
|
||||
sk_dir.join("project.toml"),
|
||||
"[[agent]]\nname = \"qa\"\n",
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let pool = AgentPool::new(3001);
|
||||
// Previous run completed — should NOT block a new story.
|
||||
pool.inject_test_agent("story-a", "qa", AgentStatus::Completed);
|
||||
|
||||
// The call will fail eventually (no real worktree / Claude CLI), but it must
|
||||
// NOT fail at the concurrency check. We detect the difference by inspecting
|
||||
// the error message: a concurrency rejection says "already running", while a
|
||||
// later failure (missing story file, missing claude binary, etc.) says something else.
|
||||
let result = pool
|
||||
.start_agent(root, "story-b", Some("qa"), None)
|
||||
.await;
|
||||
|
||||
if let Err(ref e) = result {
|
||||
assert!(
|
||||
!e.contains("already running") && !e.contains("becomes available"),
|
||||
"completed agent must not trigger the concurrency guard: got '{e}'"
|
||||
);
|
||||
}
|
||||
// result may be Ok (unlikely in test env) or Err for infra reasons — both fine.
|
||||
}
|
||||
|
||||
// ── worktree_has_committed_work tests ─────────────────────────────────────
|
||||
|
||||
#[test]
|
||||
|
||||
Reference in New Issue
Block a user