story-kit: merge 190_story_auto_select_available_agent_for_stage_in_start_agent

2026-02-25 16:17:38 +00:00
parent 8c338a6c63
commit 411653ab15
3 changed files with 316 additions and 168 deletions
--- a/server/src/agents.rs
+++ b/server/src/agents.rs
@@ -323,7 +323,11 @@ impl AgentPool {
    }

    /// Start an agent for a story: load config, create worktree, spawn agent.
-    /// If `agent_name` is None, defaults to the first configured agent.
+    ///
+    /// When `agent_name` is `None`, automatically selects the first idle coder
+    /// agent (story 190). If all coders are busy the call fails with an error
+    /// indicating the story will be picked up when one becomes available.
+    ///
    /// If `resume_context` is provided, it is appended to the rendered prompt
    /// so the agent can pick up from a previous failed attempt.
    pub async fn start_agent(
@@ -335,59 +339,56 @@ impl AgentPool {
    ) -> Result<AgentInfo, String> {
        let config = ProjectConfig::load(project_root)?;

-        // Resolve agent name from config
-        let resolved_name = match agent_name {
-            Some(name) => {
+        // Validate explicit agent name early (no lock needed).
+        if let Some(name) = agent_name {
            config
                .find_agent(name)
                .ok_or_else(|| format!("No agent named '{name}' in config"))?;
-                name.to_string()
        }
-            None => config
-                .default_coder_agent()
-                .ok_or_else(|| {
-                    "No coder agent configured. Specify an agent_name explicitly.".to_string()
-                })?
-                .name
-                .clone(),
-        };

-        let key = composite_key(story_id, &resolved_name);
-
-        // Create shared resources before the atomic check-and-insert so the
-        // agents lock is held continuously from the availability check through
-        // the Pending insert, eliminating the TOCTOU race (story 132).
+        // Create name-independent shared resources before the lock so they are
+        // ready for the atomic check-and-insert (story 132).
        let (tx, _) = broadcast::channel::<AgentEvent>(1024);
-
        let event_log: Arc<Mutex<Vec<AgentEvent>>> = Arc::new(Mutex::new(Vec::new()));
-
-        // Generate a unique session ID for the persistent log file.
        let log_session_id = uuid::Uuid::new_v4().to_string();

-        // Create persistent log writer.
-        let log_writer = match AgentLogWriter::new(
-            project_root,
-            story_id,
-            &resolved_name,
-            &log_session_id,
-        ) {
-            Ok(w) => Some(Arc::new(Mutex::new(w))),
-            Err(e) => {
-                eprintln!("[agents] Failed to create log writer for {story_id}:{resolved_name}: {e}");
-                None
-            }
-        };
-
-        // Atomically check availability and register as Pending.  The lock is
-        // held continuously from the duplicate check through the HashMap insert
-        // so no concurrent start_agent call can slip through the check before
-        // this insert completes (fixes TOCTOU race, story 132).
+        // Atomically resolve agent name, check availability, and register as
+        // Pending.  When `agent_name` is `None` the first idle coder is
+        // selected inside the lock so no TOCTOU race can occur between the
+        // availability check and the Pending insert (story 132, story 190).
        //
        // The `PendingGuard` ensures that if any step below fails the entry is
        // removed from the pool so it does not permanently block auto-assign
        // (bug 118).
+        let resolved_name: String;
+        let key: String;
        {
            let mut agents = self.agents.lock().map_err(|e| e.to_string())?;
+
+            resolved_name = match agent_name {
+                Some(name) => name.to_string(),
+                None => find_free_agent_for_stage(&config, &agents, &PipelineStage::Coder)
+                    .map(|s| s.to_string())
+                    .ok_or_else(|| {
+                        if config
+                            .agent
+                            .iter()
+                            .any(|a| agent_config_stage(a) == PipelineStage::Coder)
+                        {
+                            format!(
+                                "All coder agents are busy; story '{story_id}' will be \
+                                 picked up when one becomes available"
+                            )
+                        } else {
+                            "No coder agent configured. Specify an agent_name explicitly."
+                                .to_string()
+                        }
+                    })?,
+            };
+
+            key = composite_key(story_id, &resolved_name);
+
+            // Check for duplicate assignment (same story + same agent already active).
            if let Some(agent) = agents.get(&key)
                && (agent.status == AgentStatus::Running || agent.status == AgentStatus::Pending)
            {
@@ -396,11 +397,13 @@ impl AgentPool {
                    agent.status
                ));
            }
-            // Enforce single-instance concurrency: if this agent is already running on
-            // any other story, reject the request. The story remains in its current pipeline
-            // directory and `auto_assign_available_work` will pick it up when the agent
-            // becomes free.
-            if let Some(busy_story) = agents.iter().find_map(|(k, a)| {
+            // Enforce single-instance concurrency for explicitly-named agents:
+            // if this agent is already running on any other story, reject.
+            // Auto-selected agents are already guaranteed idle by
+            // find_free_agent_for_stage, so this check is only needed for
+            // explicit requests.
+            if agent_name.is_some()
+                && let Some(busy_story) = agents.iter().find_map(|(k, a)| {
                    if a.agent_name == resolved_name
                        && matches!(a.status, AgentStatus::Running | AgentStatus::Pending)
                    {
@@ -413,7 +416,8 @@ impl AgentPool {
                    } else {
                        None
                    }
-            }) {
+                })
+            {
                return Err(format!(
                    "Agent '{resolved_name}' is already running on story '{busy_story}'; \
                     story '{story_id}' will be picked up when the agent becomes available"
@@ -437,6 +441,23 @@ impl AgentPool {
        }
        let mut pending_guard = PendingGuard::new(self.agents.clone(), key.clone());

+        // Create persistent log writer (needs resolved_name, so must be after
+        // the atomic resolution above).
+        let log_writer = match AgentLogWriter::new(
+            project_root,
+            story_id,
+            &resolved_name,
+            &log_session_id,
+        ) {
+            Ok(w) => Some(Arc::new(Mutex::new(w))),
+            Err(e) => {
+                eprintln!(
+                    "[agents] Failed to create log writer for {story_id}:{resolved_name}: {e}"
+                );
+                None
+            }
+        };
+
        // Notify WebSocket clients that a new agent is pending.
        Self::notify_agent_state_changed(&self.watcher_tx);

@@ -674,6 +695,28 @@ impl AgentPool {
        Ok(())
    }

+    /// Return the names of configured agents for `stage` that are not currently
+    /// running or pending.
+    pub fn available_agents_for_stage(
+        &self,
+        config: &ProjectConfig,
+        stage: &PipelineStage,
+    ) -> Result<Vec<String>, String> {
+        let agents = self.agents.lock().map_err(|e| e.to_string())?;
+        Ok(config
+            .agent
+            .iter()
+            .filter(|cfg| agent_config_stage(cfg) == *stage)
+            .filter(|cfg| {
+                !agents.values().any(|a| {
+                    a.agent_name == cfg.name
+                        && matches!(a.status, AgentStatus::Running | AgentStatus::Pending)
+                })
+            })
+            .map(|cfg| cfg.name.clone())
+            .collect())
+    }
+
    /// List all agents with their status.
    pub fn list_agents(&self) -> Result<Vec<AgentInfo>, String> {
        let agents = self.agents.lock().map_err(|e| e.to_string())?;
@@ -6299,4 +6342,198 @@ stage = "qa"
             (bug 173: lozenges must update when agents are assigned during pipeline advance)"
        );
    }
+
+    // ── available_agents_for_stage tests (story 190) ──────────────────────────
+
+    fn make_config(toml_str: &str) -> ProjectConfig {
+        ProjectConfig::parse(toml_str).unwrap()
+    }
+
+    #[test]
+    fn available_agents_for_stage_returns_idle_agents() {
+        let config = make_config(
+            r#"
+[[agent]]
+name = "coder-1"
+stage = "coder"
+
+[[agent]]
+name = "coder-2"
+stage = "coder"
+
+[[agent]]
+name = "qa"
+stage = "qa"
+"#,
+        );
+        let pool = AgentPool::new_test(3001);
+        // coder-1 is busy on story-1
+        pool.inject_test_agent("story-1", "coder-1", AgentStatus::Running);
+
+        let available = pool
+            .available_agents_for_stage(&config, &PipelineStage::Coder)
+            .unwrap();
+        assert_eq!(available, vec!["coder-2"]);
+
+        let available_qa = pool
+            .available_agents_for_stage(&config, &PipelineStage::Qa)
+            .unwrap();
+        assert_eq!(available_qa, vec!["qa"]);
+    }
+
+    #[test]
+    fn available_agents_for_stage_returns_empty_when_all_busy() {
+        let config = make_config(
+            r#"
+[[agent]]
+name = "coder-1"
+stage = "coder"
+"#,
+        );
+        let pool = AgentPool::new_test(3001);
+        pool.inject_test_agent("story-1", "coder-1", AgentStatus::Running);
+
+        let available = pool
+            .available_agents_for_stage(&config, &PipelineStage::Coder)
+            .unwrap();
+        assert!(available.is_empty());
+    }
+
+    #[test]
+    fn available_agents_for_stage_ignores_completed_agents() {
+        let config = make_config(
+            r#"
+[[agent]]
+name = "coder-1"
+stage = "coder"
+"#,
+        );
+        let pool = AgentPool::new_test(3001);
+        // Completed agents should not count as busy.
+        pool.inject_test_agent("story-1", "coder-1", AgentStatus::Completed);
+
+        let available = pool
+            .available_agents_for_stage(&config, &PipelineStage::Coder)
+            .unwrap();
+        assert_eq!(available, vec!["coder-1"]);
+    }
+
+    #[tokio::test]
+    async fn start_agent_auto_selects_second_coder_when_first_busy() {
+        let tmp = tempfile::tempdir().unwrap();
+        let sk = tmp.path().join(".story_kit");
+        std::fs::create_dir_all(&sk).unwrap();
+        std::fs::write(
+            sk.join("project.toml"),
+            r#"
+[[agent]]
+name = "supervisor"
+stage = "other"
+
+[[agent]]
+name = "coder-1"
+stage = "coder"
+
+[[agent]]
+name = "coder-2"
+stage = "coder"
+"#,
+        )
+        .unwrap();
+
+        let pool = AgentPool::new_test(3001);
+        // coder-1 is busy on another story
+        pool.inject_test_agent("other-story", "coder-1", AgentStatus::Running);
+
+        // Call start_agent without agent_name — should pick coder-2
+        let result = pool
+            .start_agent(tmp.path(), "42_my_story", None, None)
+            .await;
+        // Will fail for infrastructure reasons (no git repo), but should NOT
+        // fail with "All coder agents are busy" — that would mean it didn't
+        // try coder-2.
+        match result {
+            Ok(info) => {
+                assert_eq!(info.agent_name, "coder-2");
+            }
+            Err(err) => {
+                assert!(
+                    !err.contains("All coder agents are busy"),
+                    "should have selected coder-2 but got: {err}"
+                );
+                assert!(
+                    !err.contains("No coder agent configured"),
+                    "should not fail on agent selection, got: {err}"
+                );
+            }
+        }
+    }
+
+    #[tokio::test]
+    async fn start_agent_returns_busy_when_all_coders_occupied() {
+        let tmp = tempfile::tempdir().unwrap();
+        let sk = tmp.path().join(".story_kit");
+        std::fs::create_dir_all(&sk).unwrap();
+        std::fs::write(
+            sk.join("project.toml"),
+            r#"
+[[agent]]
+name = "coder-1"
+stage = "coder"
+
+[[agent]]
+name = "coder-2"
+stage = "coder"
+"#,
+        )
+        .unwrap();
+
+        let pool = AgentPool::new_test(3001);
+        pool.inject_test_agent("story-1", "coder-1", AgentStatus::Running);
+        pool.inject_test_agent("story-2", "coder-2", AgentStatus::Pending);
+
+        let result = pool
+            .start_agent(tmp.path(), "story-3", None, None)
+            .await;
+        assert!(result.is_err());
+        let err = result.unwrap_err();
+        assert!(
+            err.contains("All coder agents are busy"),
+            "expected busy error, got: {err}"
+        );
+    }
+
+    #[tokio::test]
+    async fn start_agent_explicit_name_unchanged_when_busy() {
+        let tmp = tempfile::tempdir().unwrap();
+        let sk = tmp.path().join(".story_kit");
+        std::fs::create_dir_all(&sk).unwrap();
+        std::fs::write(
+            sk.join("project.toml"),
+            r#"
+[[agent]]
+name = "coder-1"
+stage = "coder"
+
+[[agent]]
+name = "coder-2"
+stage = "coder"
+"#,
+        )
+        .unwrap();
+
+        let pool = AgentPool::new_test(3001);
+        pool.inject_test_agent("story-1", "coder-1", AgentStatus::Running);
+
+        // Explicit request for coder-1 (busy) should fail even though coder-2 is free.
+        let result = pool
+            .start_agent(tmp.path(), "story-2", Some("coder-1"), None)
+            .await;
+        assert!(result.is_err());
+        let err = result.unwrap_err();
+        assert!(
+            err.contains("coder-1") && err.contains("already running"),
+            "expected explicit busy error, got: {err}"
+        );
+    }
 }
--- a/server/src/config.rs
+++ b/server/src/config.rs
@@ -57,20 +57,6 @@ pub struct AgentConfig {
    pub inactivity_timeout_secs: u64,
 }

-impl AgentConfig {
-    /// Returns true if this agent is a coder.
-    ///
-    /// Prefers the explicit `stage` field; falls back to checking whether the
-    /// agent name starts with `"coder"` for backwards compatibility.
-    pub fn is_coder(&self) -> bool {
-        match self.stage.as_deref() {
-            Some("coder") => true,
-            Some(_) => false,
-            None => self.name.starts_with("coder"),
-        }
-    }
-}
-
 fn default_path() -> String {
    ".".to_string()
 }
@@ -203,15 +189,6 @@ impl ProjectConfig {
        self.agent.first()
    }

-    /// Get the first coder agent config.
-    ///
-    /// Prefers the explicit `stage = "coder"` field over the legacy name-based
-    /// heuristic (names starting with "coder").  Returns `None` if no coder
-    /// agent is configured.
-    pub fn default_coder_agent(&self) -> Option<&AgentConfig> {
-        self.agent.iter().find(|a| a.is_coder())
-    }
-
    /// Render template variables in agent args and prompt for the given agent.
    /// If `agent_name` is None, uses the first (default) agent.
    pub fn render_agent_args(
@@ -503,89 +480,6 @@ name = "second"
        assert!(config.find_agent("missing").is_none());
    }

-    #[test]
-    fn default_coder_agent_skips_supervisor() {
-        let toml_str = r#"
-[[agent]]
-name = "supervisor"
-stage = "other"
-
-[[agent]]
-name = "coder-1"
-stage = "coder"
-"#;
-        let config = ProjectConfig::parse(toml_str).unwrap();
-        assert_eq!(config.default_coder_agent().unwrap().name, "coder-1");
-    }
-
-    #[test]
-    fn default_coder_agent_uses_name_heuristic_when_no_stage() {
-        let toml_str = r#"
-[[agent]]
-name = "supervisor"
-
-[[agent]]
-name = "coder-1"
-"#;
-        let config = ProjectConfig::parse(toml_str).unwrap();
-        assert_eq!(config.default_coder_agent().unwrap().name, "coder-1");
-    }
-
-    #[test]
-    fn default_coder_agent_returns_none_when_no_coders() {
-        let toml_str = r#"
-[[agent]]
-name = "supervisor"
-stage = "other"
-"#;
-        let config = ProjectConfig::parse(toml_str).unwrap();
-        assert!(config.default_coder_agent().is_none());
-    }
-
-    #[test]
-    fn is_coder_explicit_stage() {
-        let mut agent = AgentConfig {
-            name: "my-agent".to_string(),
-            role: String::new(),
-            command: "claude".to_string(),
-            args: vec![],
-            prompt: String::new(),
-            model: None,
-            allowed_tools: None,
-            max_turns: None,
-            max_budget_usd: None,
-            system_prompt: None,
-            stage: Some("coder".to_string()),
-            inactivity_timeout_secs: 300,
-        };
-        assert!(agent.is_coder());
-        agent.stage = Some("other".to_string());
-        assert!(!agent.is_coder());
-    }
-
-    #[test]
-    fn is_coder_name_heuristic() {
-        let make = |name: &str| AgentConfig {
-            name: name.to_string(),
-            role: String::new(),
-            command: "claude".to_string(),
-            args: vec![],
-            prompt: String::new(),
-            model: None,
-            allowed_tools: None,
-            max_turns: None,
-            max_budget_usd: None,
-            system_prompt: None,
-            stage: None,
-            inactivity_timeout_secs: 300,
-        };
-        assert!(make("coder-1").is_coder());
-        assert!(make("coder-opus").is_coder());
-        assert!(!make("supervisor").is_coder());
-        assert!(!make("qa").is_coder());
-        assert!(!make("mergemaster").is_coder());
-    }
-
    #[test]
    fn parse_project_toml_from_file() {
        let tmp = tempfile::tempdir().unwrap();
--- a/server/src/http/mcp.rs
+++ b/server/src/http/mcp.rs
@@ -1,4 +1,4 @@
-use crate::agents::{close_bug_to_archive, move_story_to_archived, move_story_to_merge, move_story_to_qa};
+use crate::agents::{close_bug_to_archive, move_story_to_archived, move_story_to_merge, move_story_to_qa, PipelineStage};
 use crate::config::ProjectConfig;
 use crate::log_buffer;
 use crate::slog_warn;
@@ -1269,6 +1269,22 @@ fn get_agent_output_from_log(
 fn tool_get_agent_config(ctx: &AppContext) -> Result<String, String> {
    let project_root = ctx.agents.get_project_root(&ctx.state)?;
    let config = ProjectConfig::load(&project_root)?;
+
+    // Collect available (idle) agent names across all stages so the caller can
+    // see at a glance which agents are free to start (story 190).
+    let mut available_names: std::collections::HashSet<String> =
+        std::collections::HashSet::new();
+    for stage in &[
+        PipelineStage::Coder,
+        PipelineStage::Qa,
+        PipelineStage::Mergemaster,
+        PipelineStage::Other,
+    ] {
+        if let Ok(names) = ctx.agents.available_agents_for_stage(&config, stage) {
+            available_names.extend(names);
+        }
+    }
+
    serde_json::to_string_pretty(&json!(config
        .agent
        .iter()
@@ -1279,6 +1295,7 @@ fn tool_get_agent_config(ctx: &AppContext) -> Result<String, String> {
            "allowed_tools": a.allowed_tools,
            "max_turns": a.max_turns,
            "max_budget_usd": a.max_budget_usd,
+            "available": available_names.contains(&a.name),
        }))
        .collect::<Vec<_>>()))
    .map_err(|e| format!("Serialization error: {e}"))