From b09b6ce4f1e3e770d3b9efb9894b52aaea34fd38 Mon Sep 17 00:00:00 2001
From: Dave
Date: Mon, 23 Feb 2026 20:46:51 +0000
Subject: [PATCH] fix(agents): enforce single-instance concurrency per agent
 name
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The agent pool allowed the same agent (e.g. "qa") to run concurrently on
multiple stories because start_agent() only checked whether that
story+agent combo was already active. It did not check whether the agent
was busy on a different story. Two concurrent QA runs each spawn cargo
clippy + cargo test + vitest, causing extreme CPU load (load average >33
on M1 Mac).

Fix: before registering a new agent as Pending, scan all active entries
for any Running or Pending entry with the same agent_name. If one is
found, return an error explaining that the story will be picked up when
the agent becomes available.

The existing auto_assign_available_work() mechanism already scans
pipeline directories (3_qa/, 4_merge/, etc.) for unassigned stories and
uses find_free_agent_for_stage() — which respects single-instance
limits — to assign work when an agent slot opens up. So the queuing
behaviour is naturally provided: the story stays in its directory, and
auto-assign picks it up when the previous run completes.

Adds two regression tests:
- start_agent_rejects_when_same_agent_already_running_on_another_story
- start_agent_allows_new_story_when_previous_run_is_completed

Co-Authored-By: Claude Sonnet 4.6
---
 server/src/agents.rs | 101 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 101 insertions(+)

diff --git a/server/src/agents.rs b/server/src/agents.rs
index ebe0f2a..e122702 100644
--- a/server/src/agents.rs
+++ b/server/src/agents.rs
@@ -204,6 +204,29 @@ impl AgentPool {
                     agent.status
                 ));
             }
+            // Enforce single-instance concurrency: if this agent is already running on
+            // any other story, reject the request. The story remains in its current pipeline
+            // directory and `auto_assign_available_work` will pick it up when the agent
+            // becomes free.
+            if let Some(busy_story) = agents.iter().find_map(|(k, a)| {
+                if a.agent_name == resolved_name
+                    && matches!(a.status, AgentStatus::Running | AgentStatus::Pending)
+                {
+                    Some(
+                        k.rsplit_once(':')
+                            .map(|(sid, _)| sid)
+                            .unwrap_or(k)
+                            .to_string(),
+                    )
+                } else {
+                    None
+                }
+            }) {
+                return Err(format!(
+                    "Agent '{resolved_name}' is already running on story '{busy_story}'; \
+                     story '{story_id}' will be picked up when the agent becomes available"
+                ));
+            }
         }
 
         let (tx, _) = broadcast::channel::(1024);
@@ -3327,6 +3350,84 @@ name = "qa"
         assert_eq!(find_active_story_stage(tmp.path(), "99_nonexistent"), None);
     }
 
+    // ── start_agent single-instance concurrency tests ─────────────────────────
+
+    /// Regression test for bug 97: the agent pool must reject a second concurrent
+    /// instance of the same agent name even if it would run on a different story.
+    #[tokio::test]
+    async fn start_agent_rejects_when_same_agent_already_running_on_another_story() {
+        use std::fs;
+
+        let tmp = tempfile::tempdir().unwrap();
+        let root = tmp.path();
+
+        // Write a minimal project.toml so ProjectConfig::load can find the "qa" agent.
+        let sk_dir = root.join(".story_kit");
+        fs::create_dir_all(&sk_dir).unwrap();
+        fs::write(
+            sk_dir.join("project.toml"),
+            "[[agent]]\nname = \"qa\"\n",
+        )
+        .unwrap();
+
+        let pool = AgentPool::new(3001);
+        // Simulate qa already running on story-a.
+        pool.inject_test_agent("story-a", "qa", AgentStatus::Running);
+
+        // Attempt to start qa on story-b — must be rejected.
+        let result = pool
+            .start_agent(root, "story-b", Some("qa"), None)
+            .await;
+
+        assert!(
+            result.is_err(),
+            "start_agent should fail when qa is already running on another story"
+        );
+        let err = result.unwrap_err();
+        assert!(
+            err.contains("already running") || err.contains("becomes available"),
+            "error message should explain why: got '{err}'"
+        );
+    }
+
+    /// Verify that the concurrency guard does NOT block an agent that is merely
+    /// Completed (not Running/Pending) — completed agents are free for new work.
+    #[tokio::test]
+    async fn start_agent_allows_new_story_when_previous_run_is_completed() {
+        use std::fs;
+
+        let tmp = tempfile::tempdir().unwrap();
+        let root = tmp.path();
+
+        let sk_dir = root.join(".story_kit");
+        fs::create_dir_all(&sk_dir).unwrap();
+        fs::write(
+            sk_dir.join("project.toml"),
+            "[[agent]]\nname = \"qa\"\n",
+        )
+        .unwrap();
+
+        let pool = AgentPool::new(3001);
+        // Previous run completed — should NOT block a new story.
+        pool.inject_test_agent("story-a", "qa", AgentStatus::Completed);
+
+        // The call will fail eventually (no real worktree / Claude CLI), but it must
+        // NOT fail at the concurrency check. We detect the difference by inspecting
+        // the error message: a concurrency rejection says "already running", while a
+        // later failure (missing story file, missing claude binary, etc.) says something else.
+        let result = pool
+            .start_agent(root, "story-b", Some("qa"), None)
+            .await;
+
+        if let Err(ref e) = result {
+            assert!(
+                !e.contains("already running") && !e.contains("becomes available"),
+                "completed agent must not trigger the concurrency guard: got '{e}'"
+            );
+        }
+        // result may be Ok (unlikely in test env) or Err for infra reasons — both fine.
+    }
+
     // ── worktree_has_committed_work tests ─────────────────────────────────────
 
     #[test]