From 5af3c1752251b6f556d524167b407d02e7051f29 Mon Sep 17 00:00:00 2001 From: Dave Date: Thu, 19 Mar 2026 22:41:17 +0000 Subject: [PATCH] story-kit: merge 317_refactor_split_pool_rs_into_pipeline_auto_assign_and_agent_management_modules --- server/src/agents/pool.rs | 5852 ------------------------- server/src/agents/pool/auto_assign.rs | 1813 ++++++++ server/src/agents/pool/mod.rs | 2187 +++++++++ server/src/agents/pool/pipeline.rs | 1771 ++++++++ 4 files changed, 5771 insertions(+), 5852 deletions(-) delete mode 100644 server/src/agents/pool.rs create mode 100644 server/src/agents/pool/auto_assign.rs create mode 100644 server/src/agents/pool/mod.rs create mode 100644 server/src/agents/pool/pipeline.rs diff --git a/server/src/agents/pool.rs b/server/src/agents/pool.rs deleted file mode 100644 index 5148d66..0000000 --- a/server/src/agents/pool.rs +++ /dev/null @@ -1,5852 +0,0 @@ -use crate::agent_log::AgentLogWriter; -use crate::config::ProjectConfig; -use crate::io::watcher::WatcherEvent; -use crate::slog; -use crate::slog_error; -use crate::slog_warn; -use crate::worktree::{self, WorktreeInfo}; -use portable_pty::ChildKiller; -use std::collections::HashMap; -use std::path::{Path, PathBuf}; -use std::sync::{Arc, Mutex}; -use tokio::sync::broadcast; - -use super::{ - AgentEvent, AgentInfo, AgentStatus, CompletionReport, PipelineStage, ReconciliationEvent, - agent_config_stage, pipeline_stage, -}; - -/// Build the composite key used to track agents in the pool. -fn composite_key(story_id: &str, agent_name: &str) -> String { - format!("{story_id}:{agent_name}") -} - -/// RAII guard that removes a pending agent entry from the pool on drop. -/// -/// Created after inserting a `Pending` entry into the agent HashMap. -/// If `start_agent` succeeds (the agent process is spawned and status -/// transitions to `Running`), call [`disarm`](Self::disarm) to prevent -/// cleanup. 
If any intermediate step fails and the guard is dropped -/// without being disarmed, the pending entry is removed so it cannot -/// block future auto-assign dispatches. -struct PendingGuard { - agents: Arc>>, - key: String, - armed: bool, -} - -impl PendingGuard { - fn new(agents: Arc>>, key: String) -> Self { - Self { - agents, - key, - armed: true, - } - } - - /// Prevent the guard from cleaning up the entry (call after - /// successful spawn). - fn disarm(&mut self) { - self.armed = false; - } -} - -impl Drop for PendingGuard { - fn drop(&mut self) { - if self.armed - && let Ok(mut agents) = self.agents.lock() - && agents - .get(&self.key) - .is_some_and(|a| a.status == AgentStatus::Pending) - { - agents.remove(&self.key); - slog!( - "[agents] Cleaned up leaked Pending entry for '{}'", - self.key - ); - } - } -} - -struct StoryAgent { - agent_name: String, - status: AgentStatus, - worktree_info: Option, - session_id: Option, - tx: broadcast::Sender, - task_handle: Option>, - /// Accumulated events for polling via get_agent_output. - event_log: Arc>>, - /// Set when the agent calls report_completion. - completion: Option, - /// Project root, stored for pipeline advancement after completion. - project_root: Option, - /// UUID identifying the log file for this session. - log_session_id: Option, - /// Set to `true` when the agent calls `report_merge_failure`. - /// Prevents the pipeline from blindly advancing to `5_done/` after a - /// failed merge: the server-owned gate check runs in the feature-branch - /// worktree (which compiles fine) and returns `gates_passed=true` even - /// though the code was never squash-merged onto master. - merge_failure_reported: bool, -} - -/// Build an `AgentInfo` snapshot from a `StoryAgent` map entry. 
-fn agent_info_from_entry(story_id: &str, agent: &StoryAgent) -> AgentInfo { - AgentInfo { - story_id: story_id.to_string(), - agent_name: agent.agent_name.clone(), - status: agent.status.clone(), - session_id: agent.session_id.clone(), - worktree_path: agent - .worktree_info - .as_ref() - .map(|wt| wt.path.to_string_lossy().to_string()), - base_branch: agent - .worktree_info - .as_ref() - .map(|wt| wt.base_branch.clone()), - completion: agent.completion.clone(), - log_session_id: agent.log_session_id.clone(), - } -} - -/// Manages concurrent story agents, each in its own worktree. -pub struct AgentPool { - agents: Arc>>, - port: u16, - /// Registry of active PTY child process killers, keyed by "{story_id}:{agent_name}". - /// Used to terminate child processes on server shutdown or agent stop, preventing - /// orphaned Claude Code processes from running after the server exits. - child_killers: Arc>>>, - /// Broadcast channel for notifying WebSocket clients of agent state changes. - /// When an agent transitions state (Pending, Running, Completed, Failed, Stopped), - /// an `AgentStateChanged` event is emitted so the frontend can refresh the - /// pipeline board without waiting for a filesystem event. - watcher_tx: broadcast::Sender, - /// Tracks background merge jobs started by `merge_agent_work`, keyed by story_id. - /// The MCP tool returns immediately and the mergemaster agent polls - /// `get_merge_status` until the job reaches a terminal state. - merge_jobs: Arc>>, -} - -impl AgentPool { - pub fn new(port: u16, watcher_tx: broadcast::Sender) -> Self { - Self { - agents: Arc::new(Mutex::new(HashMap::new())), - port, - child_killers: Arc::new(Mutex::new(HashMap::new())), - watcher_tx, - merge_jobs: Arc::new(Mutex::new(HashMap::new())), - } - } - - /// Create a pool with a dummy watcher channel for unit tests. 
- #[cfg(test)] - pub fn new_test(port: u16) -> Self { - let (watcher_tx, _) = broadcast::channel(16); - Self::new(port, watcher_tx) - } - - /// Notify WebSocket clients that agent state has changed, so the pipeline - /// board and agent panel can refresh. - fn notify_agent_state_changed(watcher_tx: &broadcast::Sender) { - let _ = watcher_tx.send(WatcherEvent::AgentStateChanged); - } - - /// Kill all active PTY child processes. - /// - /// Called on server shutdown to prevent orphaned Claude Code processes from - /// continuing to run after the server exits. Each registered killer is called - /// once, then the registry is cleared. - pub fn kill_all_children(&self) { - if let Ok(mut killers) = self.child_killers.lock() { - for (key, killer) in killers.iter_mut() { - slog!("[agents] Killing child process for {key} on shutdown"); - let _ = killer.kill(); - } - killers.clear(); - } - } - - /// Kill and deregister the child process for a specific agent key. - /// - /// Used by `stop_agent` to ensure the PTY child is terminated even though - /// aborting a `spawn_blocking` task handle does not interrupt the blocking thread. - fn kill_child_for_key(&self, key: &str) { - if let Ok(mut killers) = self.child_killers.lock() - && let Some(mut killer) = killers.remove(key) - { - slog!("[agents] Killing child process for {key} on stop"); - let _ = killer.kill(); - } - } - - /// Start an agent for a story: load config, create worktree, spawn agent. - /// - /// When `agent_name` is `None`, automatically selects the first idle coder - /// agent (story 190). If all coders are busy the call fails with an error - /// indicating the story will be picked up when one becomes available. - /// - /// If `resume_context` is provided, it is appended to the rendered prompt - /// so the agent can pick up from a previous failed attempt. 
- pub async fn start_agent( - &self, - project_root: &Path, - story_id: &str, - agent_name: Option<&str>, - resume_context: Option<&str>, - ) -> Result { - let config = ProjectConfig::load(project_root)?; - - // Validate explicit agent name early (no lock needed). - if let Some(name) = agent_name { - config - .find_agent(name) - .ok_or_else(|| format!("No agent named '{name}' in config"))?; - } - - // Create name-independent shared resources before the lock so they are - // ready for the atomic check-and-insert (story 132). - let (tx, _) = broadcast::channel::(1024); - let event_log: Arc>> = Arc::new(Mutex::new(Vec::new())); - let log_session_id = uuid::Uuid::new_v4().to_string(); - - // Move story from backlog/ to current/ before checking agent - // availability so that auto_assign_available_work can pick it up even - // when all coders are currently busy (story 203). This is idempotent: - // if the story is already in 2_current/ or a later stage, the call is - // a no-op. - super::lifecycle::move_story_to_current(project_root, story_id)?; - - // Validate that the agent's configured stage matches the story's - // pipeline stage. This prevents any caller (auto-assign, MCP tool, - // pipeline advance, supervisor) from starting a wrong-stage agent on - // a story — e.g. mergemaster on a coding-stage story (bug 312). 
- if let Some(name) = agent_name { - let agent_stage = config - .find_agent(name) - .map(agent_config_stage) - .unwrap_or_else(|| pipeline_stage(name)); - if agent_stage != PipelineStage::Other - && let Some(story_stage_dir) = find_active_story_stage(project_root, story_id) - { - let expected_stage = match story_stage_dir { - "2_current" => PipelineStage::Coder, - "3_qa" => PipelineStage::Qa, - "4_merge" => PipelineStage::Mergemaster, - _ => PipelineStage::Other, - }; - if expected_stage != PipelineStage::Other && expected_stage != agent_stage { - return Err(format!( - "Agent '{name}' (stage: {agent_stage:?}) cannot be assigned to \ - story '{story_id}' in {story_stage_dir}/ (requires stage: {expected_stage:?})" - )); - } - } - } - - // Atomically resolve agent name, check availability, and register as - // Pending. When `agent_name` is `None` the first idle coder is - // selected inside the lock so no TOCTOU race can occur between the - // availability check and the Pending insert (story 132, story 190). - // - // The `PendingGuard` ensures that if any step below fails the entry is - // removed from the pool so it does not permanently block auto-assign - // (bug 118). - let resolved_name: String; - let key: String; - { - let mut agents = self.agents.lock().map_err(|e| e.to_string())?; - - resolved_name = match agent_name { - Some(name) => name.to_string(), - None => find_free_agent_for_stage(&config, &agents, &PipelineStage::Coder) - .map(|s| s.to_string()) - .ok_or_else(|| { - if config - .agent - .iter() - .any(|a| agent_config_stage(a) == PipelineStage::Coder) - { - format!( - "All coder agents are busy; story '{story_id}' has been \ - queued in work/2_current/ and will be auto-assigned when \ - one becomes available" - ) - } else { - "No coder agent configured. Specify an agent_name explicitly." - .to_string() - } - })?, - }; - - key = composite_key(story_id, &resolved_name); - - // Check for duplicate assignment (same story + same agent already active). 
- if let Some(agent) = agents.get(&key) - && (agent.status == AgentStatus::Running || agent.status == AgentStatus::Pending) - { - return Err(format!( - "Agent '{resolved_name}' for story '{story_id}' is already {}", - agent.status - )); - } - // Enforce single-stage concurrency: reject if there is already a - // Running/Pending agent at the same pipeline stage for this story. - // This prevents two coders (or two QA/mergemaster agents) from - // corrupting each other's work in the same worktree. - // Applies to both explicit and auto-selected agents; the Other - // stage (supervisors, unknown agents) is exempt. - let resolved_stage = config - .find_agent(&resolved_name) - .map(agent_config_stage) - .unwrap_or_else(|| pipeline_stage(&resolved_name)); - if resolved_stage != PipelineStage::Other - && let Some(conflicting_name) = agents.iter().find_map(|(k, a)| { - let k_story = k.rsplit_once(':').map(|(s, _)| s).unwrap_or(k); - if k_story == story_id - && a.agent_name != resolved_name - && matches!(a.status, AgentStatus::Running | AgentStatus::Pending) - { - let a_stage = config - .find_agent(&a.agent_name) - .map(agent_config_stage) - .unwrap_or_else(|| pipeline_stage(&a.agent_name)); - if a_stage == resolved_stage { - Some(a.agent_name.clone()) - } else { - None - } - } else { - None - } - }) - { - return Err(format!( - "Cannot start '{resolved_name}' on story '{story_id}': \ - '{conflicting_name}' is already active at the same pipeline stage" - )); - } - // Enforce single-instance concurrency for explicitly-named agents: - // if this agent is already running on any other story, reject. - // Auto-selected agents are already guaranteed idle by - // find_free_agent_for_stage, so this check is only needed for - // explicit requests. 
- if agent_name.is_some() - && let Some(busy_story) = agents.iter().find_map(|(k, a)| { - if a.agent_name == resolved_name - && matches!(a.status, AgentStatus::Running | AgentStatus::Pending) - { - Some( - k.rsplit_once(':') - .map(|(sid, _)| sid) - .unwrap_or(k) - .to_string(), - ) - } else { - None - } - }) - { - return Err(format!( - "Agent '{resolved_name}' is already running on story '{busy_story}'; \ - story '{story_id}' will be picked up when the agent becomes available" - )); - } - agents.insert( - key.clone(), - StoryAgent { - agent_name: resolved_name.clone(), - status: AgentStatus::Pending, - worktree_info: None, - session_id: None, - tx: tx.clone(), - task_handle: None, - event_log: event_log.clone(), - completion: None, - project_root: Some(project_root.to_path_buf()), - log_session_id: Some(log_session_id.clone()), - merge_failure_reported: false, - }, - ); - } - let mut pending_guard = PendingGuard::new(self.agents.clone(), key.clone()); - - // Create persistent log writer (needs resolved_name, so must be after - // the atomic resolution above). - let log_writer = - match AgentLogWriter::new(project_root, story_id, &resolved_name, &log_session_id) { - Ok(w) => Some(Arc::new(Mutex::new(w))), - Err(e) => { - eprintln!( - "[agents] Failed to create log writer for {story_id}:{resolved_name}: {e}" - ); - None - } - }; - - // Notify WebSocket clients that a new agent is pending. - Self::notify_agent_state_changed(&self.watcher_tx); - - let _ = tx.send(AgentEvent::Status { - story_id: story_id.to_string(), - agent_name: resolved_name.clone(), - status: "pending".to_string(), - }); - - // Extract inactivity timeout from the agent config before cloning config. - let inactivity_timeout_secs = config - .find_agent(&resolved_name) - .map(|a| a.inactivity_timeout_secs) - .unwrap_or(300); - - // Clone all values needed inside the background spawn. 
- let project_root_clone = project_root.to_path_buf(); - let config_clone = config.clone(); - let resume_context_owned = resume_context.map(str::to_string); - let sid = story_id.to_string(); - let aname = resolved_name.clone(); - let tx_clone = tx.clone(); - let agents_ref = self.agents.clone(); - let key_clone = key.clone(); - let log_clone = event_log.clone(); - let port_for_task = self.port; - let log_writer_clone = log_writer.clone(); - let child_killers_clone = self.child_killers.clone(); - let watcher_tx_clone = self.watcher_tx.clone(); - - // Spawn the background task. Worktree creation and agent launch happen here - // so `start_agent` returns immediately after registering the agent as - // Pending — non-blocking by design (story 157). - let handle = tokio::spawn(async move { - // Step 1: create the worktree (slow — git checkout, pnpm install, etc.) - let wt_info = match worktree::create_worktree( - &project_root_clone, - &sid, - &config_clone, - port_for_task, - ) - .await - { - Ok(wt) => wt, - Err(e) => { - let error_msg = format!("Failed to create worktree: {e}"); - slog_error!("[agents] {error_msg}"); - let event = AgentEvent::Error { - story_id: sid.clone(), - agent_name: aname.clone(), - message: error_msg, - }; - if let Ok(mut log) = log_clone.lock() { - log.push(event.clone()); - } - let _ = tx_clone.send(event); - if let Ok(mut agents) = agents_ref.lock() - && let Some(agent) = agents.get_mut(&key_clone) - { - agent.status = AgentStatus::Failed; - } - Self::notify_agent_state_changed(&watcher_tx_clone); - return; - } - }; - - // Step 2: store worktree info and render agent command/args/prompt. 
- let wt_path_str = wt_info.path.to_string_lossy().to_string(); - { - if let Ok(mut agents) = agents_ref.lock() - && let Some(agent) = agents.get_mut(&key_clone) - { - agent.worktree_info = Some(wt_info.clone()); - } - } - - let (command, args, mut prompt) = match config_clone.render_agent_args( - &wt_path_str, - &sid, - Some(&aname), - Some(&wt_info.base_branch), - ) { - Ok(result) => result, - Err(e) => { - let error_msg = format!("Failed to render agent args: {e}"); - slog_error!("[agents] {error_msg}"); - let event = AgentEvent::Error { - story_id: sid.clone(), - agent_name: aname.clone(), - message: error_msg, - }; - if let Ok(mut log) = log_clone.lock() { - log.push(event.clone()); - } - let _ = tx_clone.send(event); - if let Ok(mut agents) = agents_ref.lock() - && let Some(agent) = agents.get_mut(&key_clone) - { - agent.status = AgentStatus::Failed; - } - Self::notify_agent_state_changed(&watcher_tx_clone); - return; - } - }; - - // Append resume context if this is a restart with failure information. - if let Some(ctx) = resume_context_owned { - prompt.push_str(&ctx); - } - - // Step 3: transition to Running now that the worktree is ready. - { - if let Ok(mut agents) = agents_ref.lock() - && let Some(agent) = agents.get_mut(&key_clone) - { - agent.status = AgentStatus::Running; - } - } - let _ = tx_clone.send(AgentEvent::Status { - story_id: sid.clone(), - agent_name: aname.clone(), - status: "running".to_string(), - }); - Self::notify_agent_state_changed(&watcher_tx_clone); - - // Step 4: launch the agent process. - match super::pty::run_agent_pty_streaming( - &sid, - &aname, - &command, - &args, - &prompt, - &wt_path_str, - &tx_clone, - &log_clone, - log_writer_clone, - inactivity_timeout_secs, - child_killers_clone, - ) - .await - { - Ok(pty_result) => { - // Persist token usage if the agent reported it. 
- if let Some(ref usage) = pty_result.token_usage - && let Ok(agents) = agents_ref.lock() - && let Some(agent) = agents.get(&key_clone) - && let Some(ref pr) = agent.project_root - { - let model = config_clone - .find_agent(&aname) - .and_then(|a| a.model.clone()); - let record = super::token_usage::build_record( - &sid, &aname, model, usage.clone(), - ); - if let Err(e) = super::token_usage::append_record(pr, &record) { - slog_error!( - "[agents] Failed to persist token usage for \ - {sid}:{aname}: {e}" - ); - } - } - - // Server-owned completion: run acceptance gates automatically - // when the agent process exits normally. - run_server_owned_completion( - &agents_ref, - port_for_task, - &sid, - &aname, - pty_result.session_id, - watcher_tx_clone.clone(), - ) - .await; - Self::notify_agent_state_changed(&watcher_tx_clone); - } - Err(e) => { - slog_error!("[agents] Agent process error for {aname} on {sid}: {e}"); - let event = AgentEvent::Error { - story_id: sid.clone(), - agent_name: aname.clone(), - message: e, - }; - if let Ok(mut log) = log_clone.lock() { - log.push(event.clone()); - } - let _ = tx_clone.send(event); - if let Ok(mut agents) = agents_ref.lock() - && let Some(agent) = agents.get_mut(&key_clone) - { - agent.status = AgentStatus::Failed; - } - Self::notify_agent_state_changed(&watcher_tx_clone); - } - } - }); - - // Store the task handle while the agent is still Pending. - { - let mut agents = self.agents.lock().map_err(|e| e.to_string())?; - if let Some(agent) = agents.get_mut(&key) { - agent.task_handle = Some(handle); - } - } - - // Agent successfully spawned — prevent the guard from removing the entry. - pending_guard.disarm(); - - Ok(AgentInfo { - story_id: story_id.to_string(), - agent_name: resolved_name, - status: AgentStatus::Pending, - session_id: None, - worktree_path: None, - base_branch: None, - completion: None, - log_session_id: Some(log_session_id), - }) - } - - /// Stop a running agent. Worktree is preserved for inspection. 
- pub async fn stop_agent( - &self, - _project_root: &Path, - story_id: &str, - agent_name: &str, - ) -> Result<(), String> { - let key = composite_key(story_id, agent_name); - - let (worktree_info, task_handle, tx) = { - let mut agents = self.agents.lock().map_err(|e| e.to_string())?; - let agent = agents - .get_mut(&key) - .ok_or_else(|| format!("No agent '{agent_name}' for story '{story_id}'"))?; - - let wt = agent.worktree_info.clone(); - let handle = agent.task_handle.take(); - let tx = agent.tx.clone(); - agent.status = AgentStatus::Failed; - (wt, handle, tx) - }; - - // Abort the task and kill the PTY child process. - // Note: aborting a spawn_blocking task handle does not interrupt the blocking - // thread, so we must also kill the child process directly via the killer registry. - if let Some(handle) = task_handle { - handle.abort(); - let _ = handle.await; - } - self.kill_child_for_key(&key); - - // Preserve worktree for inspection — don't destroy agent's work on stop. - if let Some(ref wt) = worktree_info { - slog!( - "[agents] Worktree preserved for {story_id}:{agent_name}: {}", - wt.path.display() - ); - } - - let _ = tx.send(AgentEvent::Status { - story_id: story_id.to_string(), - agent_name: agent_name.to_string(), - status: "stopped".to_string(), - }); - - // Remove from map - { - let mut agents = self.agents.lock().map_err(|e| e.to_string())?; - agents.remove(&key); - } - - // Notify WebSocket clients so pipeline board and agent panel update. - Self::notify_agent_state_changed(&self.watcher_tx); - - Ok(()) - } - - /// Return the names of configured agents for `stage` that are not currently - /// running or pending. 
- pub fn available_agents_for_stage( - &self, - config: &ProjectConfig, - stage: &PipelineStage, - ) -> Result, String> { - let agents = self.agents.lock().map_err(|e| e.to_string())?; - Ok(config - .agent - .iter() - .filter(|cfg| agent_config_stage(cfg) == *stage) - .filter(|cfg| { - !agents.values().any(|a| { - a.agent_name == cfg.name - && matches!(a.status, AgentStatus::Running | AgentStatus::Pending) - }) - }) - .map(|cfg| cfg.name.clone()) - .collect()) - } - - /// List all agents with their status. - pub fn list_agents(&self) -> Result, String> { - let agents = self.agents.lock().map_err(|e| e.to_string())?; - Ok(agents - .iter() - .map(|(key, agent)| { - // Extract story_id from composite key "story_id:agent_name" - let story_id = key - .rsplit_once(':') - .map(|(sid, _)| sid.to_string()) - .unwrap_or_else(|| key.clone()); - agent_info_from_entry(&story_id, agent) - }) - .collect()) - } - - /// Subscribe to events for a story agent. - pub fn subscribe( - &self, - story_id: &str, - agent_name: &str, - ) -> Result, String> { - let key = composite_key(story_id, agent_name); - let agents = self.agents.lock().map_err(|e| e.to_string())?; - let agent = agents - .get(&key) - .ok_or_else(|| format!("No agent '{agent_name}' for story '{story_id}'"))?; - Ok(agent.tx.subscribe()) - } - - /// Drain accumulated events for polling. Returns all events since the last drain. - pub fn drain_events( - &self, - story_id: &str, - agent_name: &str, - ) -> Result, String> { - let key = composite_key(story_id, agent_name); - let agents = self.agents.lock().map_err(|e| e.to_string())?; - let agent = agents - .get(&key) - .ok_or_else(|| format!("No agent '{agent_name}' for story '{story_id}'"))?; - let mut log = agent.event_log.lock().map_err(|e| e.to_string())?; - Ok(log.drain(..).collect()) - } - - /// Block until the agent reaches a terminal state (completed, failed, stopped). - /// Returns the agent's final `AgentInfo`. 
- /// `timeout_ms` caps how long to wait; returns an error if the deadline passes. - pub async fn wait_for_agent( - &self, - story_id: &str, - agent_name: &str, - timeout_ms: u64, - ) -> Result { - // Subscribe before checking status so we don't miss the terminal event - // if the agent completes in the window between the two operations. - let mut rx = self.subscribe(story_id, agent_name)?; - - // Return immediately if already in a terminal state. - { - let agents = self.agents.lock().map_err(|e| e.to_string())?; - let key = composite_key(story_id, agent_name); - if let Some(agent) = agents.get(&key) - && matches!(agent.status, AgentStatus::Completed | AgentStatus::Failed) - { - return Ok(agent_info_from_entry(story_id, agent)); - } - } - - let deadline = tokio::time::Instant::now() + std::time::Duration::from_millis(timeout_ms); - - loop { - let remaining = deadline.saturating_duration_since(tokio::time::Instant::now()); - if remaining.is_zero() { - return Err(format!( - "Timed out after {timeout_ms}ms waiting for agent '{agent_name}' on story '{story_id}'" - )); - } - - match tokio::time::timeout(remaining, rx.recv()).await { - Ok(Ok(event)) => { - let is_terminal = match &event { - AgentEvent::Done { .. } | AgentEvent::Error { .. } => true, - AgentEvent::Status { status, .. } if status == "stopped" => true, - _ => false, - }; - if is_terminal { - let agents = self.agents.lock().map_err(|e| e.to_string())?; - let key = composite_key(story_id, agent_name); - return Ok(if let Some(agent) = agents.get(&key) { - agent_info_from_entry(story_id, agent) - } else { - // Agent was removed from map (e.g. stop_agent removes it after - // the "stopped" status event is sent). - let (status, session_id) = match event { - AgentEvent::Done { session_id, .. 
} => { - (AgentStatus::Completed, session_id) - } - _ => (AgentStatus::Failed, None), - }; - AgentInfo { - story_id: story_id.to_string(), - agent_name: agent_name.to_string(), - status, - session_id, - worktree_path: None, - base_branch: None, - completion: None, - log_session_id: None, - } - }); - } - } - Ok(Err(broadcast::error::RecvError::Lagged(_))) => { - // Missed some buffered events — check current status before resuming. - let agents = self.agents.lock().map_err(|e| e.to_string())?; - let key = composite_key(story_id, agent_name); - if let Some(agent) = agents.get(&key) - && matches!(agent.status, AgentStatus::Completed | AgentStatus::Failed) - { - return Ok(agent_info_from_entry(story_id, agent)); - } - // Still running — continue the loop. - } - Ok(Err(broadcast::error::RecvError::Closed)) => { - // Channel closed: no more events will arrive. Return current state. - let agents = self.agents.lock().map_err(|e| e.to_string())?; - let key = composite_key(story_id, agent_name); - if let Some(agent) = agents.get(&key) { - return Ok(agent_info_from_entry(story_id, agent)); - } - return Err(format!( - "Agent '{agent_name}' for story '{story_id}' channel closed unexpectedly" - )); - } - Err(_) => { - return Err(format!( - "Timed out after {timeout_ms}ms waiting for agent '{agent_name}' on story '{story_id}'" - )); - } - } - } - } - - /// Create a worktree for the given story using the server port (writes .mcp.json). - pub async fn create_worktree( - &self, - project_root: &Path, - story_id: &str, - ) -> Result { - let config = ProjectConfig::load(project_root)?; - worktree::create_worktree(project_root, story_id, &config, self.port).await - } - - /// Advance the pipeline after an agent completes. - /// - /// Called internally by `report_completion` as a background task. 
- /// Reads the stored completion report and project_root from the agent, - /// then drives the next pipeline stage based on the agent's role: - /// - /// - **Coder** + gates passed → move story to `work/3_qa/`, start `qa` agent. - /// - **Coder** + gates failed → restart the same coder agent with failure context. - /// - **QA** + gates passed + coverage passed → move story to `work/4_merge/`, start `mergemaster` agent. - /// - **QA** + gates passed + coverage failed → restart `qa` with coverage failure context. - /// - **QA** + gates failed → restart `qa` with failure context. - /// - **Mergemaster** → run `script/test` on master; if pass: archive + cleanup worktree; - /// if fail: restart `mergemaster` with failure context. - /// - **Other** (supervisor, unknown) → no automatic advancement. - async fn run_pipeline_advance( - &self, - story_id: &str, - agent_name: &str, - completion: CompletionReport, - project_root: Option, - worktree_path: Option, - merge_failure_reported: bool, - ) { - let project_root = match project_root { - Some(p) => p, - None => { - slog_warn!("[pipeline] No project_root for '{story_id}:{agent_name}'"); - return; - } - }; - - let config = ProjectConfig::load(&project_root).unwrap_or_default(); - let stage = config - .find_agent(agent_name) - .map(agent_config_stage) - .unwrap_or_else(|| pipeline_stage(agent_name)); - - match stage { - PipelineStage::Other => { - // Supervisors and unknown agents do not advance the pipeline. - } - PipelineStage::Coder => { - if completion.gates_passed { - // Determine effective QA mode for this story. - let qa_mode = { - let item_type = super::lifecycle::item_type_from_id(story_id); - if item_type == "spike" { - crate::io::story_metadata::QaMode::Human - } else { - let default_qa = config.default_qa_mode(); - // Story is in 2_current/ when a coder completes. 
- let story_path = project_root - .join(".story_kit/work/2_current") - .join(format!("{story_id}.md")); - crate::io::story_metadata::resolve_qa_mode(&story_path, default_qa) - } - }; - - match qa_mode { - crate::io::story_metadata::QaMode::Server => { - slog!( - "[pipeline] Coder '{agent_name}' passed gates for '{story_id}'. \ - qa: server — moving directly to merge." - ); - if let Err(e) = - super::lifecycle::move_story_to_merge(&project_root, story_id) - { - slog_error!( - "[pipeline] Failed to move '{story_id}' to 4_merge/: {e}" - ); - } else if let Err(e) = self - .start_agent(&project_root, story_id, Some("mergemaster"), None) - .await - { - slog_error!( - "[pipeline] Failed to start mergemaster for '{story_id}': {e}" - ); - } - } - crate::io::story_metadata::QaMode::Agent => { - slog!( - "[pipeline] Coder '{agent_name}' passed gates for '{story_id}'. \ - qa: agent — moving to QA." - ); - if let Err(e) = super::lifecycle::move_story_to_qa(&project_root, story_id) { - slog_error!("[pipeline] Failed to move '{story_id}' to 3_qa/: {e}"); - } else if let Err(e) = self - .start_agent(&project_root, story_id, Some("qa"), None) - .await - { - slog_error!("[pipeline] Failed to start qa agent for '{story_id}': {e}"); - } - } - crate::io::story_metadata::QaMode::Human => { - slog!( - "[pipeline] Coder '{agent_name}' passed gates for '{story_id}'. \ - qa: human — holding for human review." - ); - if let Err(e) = super::lifecycle::move_story_to_qa(&project_root, story_id) { - slog_error!("[pipeline] Failed to move '{story_id}' to 3_qa/: {e}"); - } else { - let qa_dir = project_root.join(".story_kit/work/3_qa"); - let story_path = qa_dir.join(format!("{story_id}.md")); - if let Err(e) = - crate::io::story_metadata::write_review_hold(&story_path) - { - slog_error!( - "[pipeline] Failed to set review_hold on '{story_id}': {e}" - ); - } - } - } - } - } else { - // Increment retry count and check if blocked. 
- let story_path = project_root - .join(".story_kit/work/2_current") - .join(format!("{story_id}.md")); - if should_block_story(&story_path, config.max_retries, story_id, "coder") { - // Story has exceeded retry limit — do not restart. - } else { - slog!( - "[pipeline] Coder '{agent_name}' failed gates for '{story_id}'. Restarting." - ); - let context = format!( - "\n\n---\n## Previous Attempt Failed\n\ - The acceptance gates failed with the following output:\n{}\n\n\ - Please review the failures above, fix the issues, and try again.", - completion.gate_output - ); - if let Err(e) = self - .start_agent(&project_root, story_id, Some(agent_name), Some(&context)) - .await - { - slog_error!( - "[pipeline] Failed to restart coder '{agent_name}' for '{story_id}': {e}" - ); - } - } - } - } - PipelineStage::Qa => { - if completion.gates_passed { - // Run coverage gate in the QA worktree before advancing to merge. - let coverage_path = worktree_path - .clone() - .unwrap_or_else(|| project_root.clone()); - let cp = coverage_path.clone(); - let coverage_result = - tokio::task::spawn_blocking(move || super::gates::run_coverage_gate(&cp)) - .await - .unwrap_or_else(|e| { - slog_warn!("[pipeline] Coverage gate task panicked: {e}"); - Ok((false, format!("Coverage gate task panicked: {e}"))) - }); - let (coverage_passed, coverage_output) = match coverage_result { - Ok(pair) => pair, - Err(e) => (false, e), - }; - - if coverage_passed { - // Check whether this item needs human review before merging. - let needs_human_review = { - let item_type = super::lifecycle::item_type_from_id(story_id); - if item_type == "spike" { - true // Spikes always need human review. 
- } else { - let qa_dir = project_root.join(".story_kit/work/3_qa"); - let story_path = qa_dir.join(format!("{story_id}.md")); - let default_qa = config.default_qa_mode(); - matches!( - crate::io::story_metadata::resolve_qa_mode(&story_path, default_qa), - crate::io::story_metadata::QaMode::Human - ) - } - }; - - if needs_human_review { - // Hold in 3_qa/ for human review. - let qa_dir = project_root.join(".story_kit/work/3_qa"); - let story_path = qa_dir.join(format!("{story_id}.md")); - if let Err(e) = - crate::io::story_metadata::write_review_hold(&story_path) - { - slog_error!( - "[pipeline] Failed to set review_hold on '{story_id}': {e}" - ); - } - slog!( - "[pipeline] QA passed for '{story_id}'. \ - Holding for human review. \ - Worktree preserved at: {worktree_path:?}" - ); - } else { - slog!( - "[pipeline] QA passed gates and coverage for '{story_id}'. \ - Moving directly to merge." - ); - if let Err(e) = - super::lifecycle::move_story_to_merge(&project_root, story_id) - { - slog_error!( - "[pipeline] Failed to move '{story_id}' to 4_merge/: {e}" - ); - } else if let Err(e) = self - .start_agent(&project_root, story_id, Some("mergemaster"), None) - .await - { - slog_error!( - "[pipeline] Failed to start mergemaster for '{story_id}': {e}" - ); - } - } - } else { - let story_path = project_root - .join(".story_kit/work/3_qa") - .join(format!("{story_id}.md")); - if should_block_story(&story_path, config.max_retries, story_id, "qa-coverage") { - // Story has exceeded retry limit — do not restart. - } else { - slog!( - "[pipeline] QA coverage gate failed for '{story_id}'. Restarting QA." 
- ); - let context = format!( - "\n\n---\n## Coverage Gate Failed\n\ - The coverage gate (script/test_coverage) failed with the following output:\n{}\n\n\ - Please improve test coverage until the coverage gate passes.", - coverage_output - ); - if let Err(e) = self - .start_agent(&project_root, story_id, Some("qa"), Some(&context)) - .await - { - slog_error!("[pipeline] Failed to restart qa for '{story_id}': {e}"); - } - } - } - } else { - let story_path = project_root - .join(".story_kit/work/3_qa") - .join(format!("{story_id}.md")); - if should_block_story(&story_path, config.max_retries, story_id, "qa") { - // Story has exceeded retry limit — do not restart. - } else { - slog!("[pipeline] QA failed gates for '{story_id}'. Restarting."); - let context = format!( - "\n\n---\n## Previous QA Attempt Failed\n\ - The acceptance gates failed with the following output:\n{}\n\n\ - Please re-run and fix the issues.", - completion.gate_output - ); - if let Err(e) = self - .start_agent(&project_root, story_id, Some("qa"), Some(&context)) - .await - { - slog_error!("[pipeline] Failed to restart qa for '{story_id}': {e}"); - } - } - } - } - PipelineStage::Mergemaster => { - // Block advancement if the mergemaster explicitly reported a failure. - // The server-owned gate check runs in the feature-branch worktree (not - // master), so `gates_passed=true` is misleading when no code was merged. - if merge_failure_reported { - slog!( - "[pipeline] Pipeline advancement blocked for '{story_id}': \ - mergemaster explicitly reported a merge failure. \ - Story stays in 4_merge/ for human review." - ); - } else { - // Run script/test on master (project_root) as the post-merge verification. - slog!( - "[pipeline] Mergemaster completed for '{story_id}'. Running post-merge tests on master." 
- ); - let root = project_root.clone(); - let test_result = - tokio::task::spawn_blocking(move || super::gates::run_project_tests(&root)) - .await - .unwrap_or_else(|e| { - slog_warn!("[pipeline] Post-merge test task panicked: {e}"); - Ok((false, format!("Test task panicked: {e}"))) - }); - let (passed, output) = match test_result { - Ok(pair) => pair, - Err(e) => (false, e), - }; - - if passed { - slog!( - "[pipeline] Post-merge tests passed for '{story_id}'. Moving to done." - ); - if let Err(e) = - super::lifecycle::move_story_to_archived(&project_root, story_id) - { - slog_error!("[pipeline] Failed to move '{story_id}' to done: {e}"); - } - self.remove_agents_for_story(story_id); - // TODO: Re-enable worktree cleanup once we have persistent agent logs. - // Removing worktrees destroys evidence needed to debug empty-commit agents. - // let config = - // crate::config::ProjectConfig::load(&project_root).unwrap_or_default(); - // if let Err(e) = - // worktree::remove_worktree_by_story_id(&project_root, story_id, &config) - // .await - // { - // slog!( - // "[pipeline] Failed to remove worktree for '{story_id}': {e}" - // ); - // } - slog!( - "[pipeline] Story '{story_id}' done. Worktree preserved for inspection." - ); - } else { - let story_path = project_root - .join(".story_kit/work/4_merge") - .join(format!("{story_id}.md")); - if should_block_story(&story_path, config.max_retries, story_id, "mergemaster") { - // Story has exceeded retry limit — do not restart. - } else { - slog!( - "[pipeline] Post-merge tests failed for '{story_id}'. Restarting mergemaster." 
                        );
                        let context = format!(
                            "\n\n---\n## Post-Merge Test Failed\n\
                            The tests on master failed with the following output:\n{}\n\n\
                            Please investigate and resolve the failures, then call merge_agent_work again.",
                            output
                        );
                        if let Err(e) = self
                            .start_agent(
                                &project_root,
                                story_id,
                                Some("mergemaster"),
                                Some(&context),
                            )
                            .await
                        {
                            slog_error!(
                                "[pipeline] Failed to restart mergemaster for '{story_id}': {e}"
                            );
                        }
                    }
                }
            }
        }

        // Always scan for unassigned work after any agent completes, regardless
        // of the outcome (success, failure, restart). This ensures stories that
        // failed agent assignment due to busy agents are retried when agents
        // become available (bug 295).
        self.auto_assign_available_work(&project_root).await;
    }

    /// Internal: report that an agent has finished work on a story.
    ///
    /// **Note:** This is no longer exposed as an MCP tool. The server now
    /// automatically runs completion gates when an agent process exits
    /// (see `run_server_owned_completion`). This method is retained for
    /// backwards compatibility and testing.
    ///
    /// - Rejects with an error if the worktree has uncommitted changes.
    /// - Runs acceptance gates (cargo clippy + cargo nextest run / cargo test).
    /// - Stores the `CompletionReport` on the agent record.
    /// - Transitions status to `Completed` (gates passed) or `Failed` (gates failed).
    /// - Emits a `Done` event so `wait_for_agent` unblocks.
    #[allow(dead_code)]
    pub async fn report_completion(
        &self,
        story_id: &str,
        agent_name: &str,
        summary: &str,
    ) -> Result<CompletionReport, String> {
        let key = composite_key(story_id, agent_name);

        // Verify agent exists, is Running, and grab its worktree path.
        // The lock is released at the end of this block — gates must NOT run
        // while holding it, since they take minutes.
        let worktree_path = {
            let agents = self.agents.lock().map_err(|e| e.to_string())?;
            let agent = agents
                .get(&key)
                .ok_or_else(|| format!("No agent '{agent_name}' for story '{story_id}'"))?;

            if agent.status != AgentStatus::Running {
                return Err(format!(
                    "Agent '{agent_name}' for story '{story_id}' is not running (status: {}). \
                    report_completion can only be called by a running agent.",
                    agent.status
                ));
            }

            agent
                .worktree_info
                .as_ref()
                .map(|wt| wt.path.clone())
                .ok_or_else(|| {
                    format!(
                        "Agent '{agent_name}' for story '{story_id}' has no worktree. \
                        Cannot run acceptance gates."
                    )
                })?
        };

        let path = worktree_path.clone();

        // Run gate checks in a blocking thread to avoid stalling the async runtime.
        let (gates_passed, gate_output) = tokio::task::spawn_blocking(move || {
            // Step 1: Reject if worktree is dirty.
            super::gates::check_uncommitted_changes(&path)?;
            // Step 2: Run clippy + tests and return (passed, output).
            super::gates::run_acceptance_gates(&path)
        })
        .await
        .map_err(|e| format!("Gate check task panicked: {e}"))??;

        let report = CompletionReport {
            summary: summary.to_string(),
            gates_passed,
            gate_output,
        };

        // Extract data for pipeline advance, then remove the entry so
        // completed agents never appear in list_agents.
        // NOTE: the lock was dropped while gates ran, so the entry may have
        // been removed concurrently — that case is surfaced as an error below.
        let (
            tx,
            session_id,
            project_root_for_advance,
            wt_path_for_advance,
            merge_failure_reported_for_advance,
        ) = {
            let mut agents = self.agents.lock().map_err(|e| e.to_string())?;
            let agent = agents.get_mut(&key).ok_or_else(|| {
                format!("Agent '{agent_name}' for story '{story_id}' disappeared during gate check")
            })?;
            agent.completion = Some(report.clone());
            let tx = agent.tx.clone();
            let sid = agent.session_id.clone();
            let pr = agent.project_root.clone();
            let wt = agent.worktree_info.as_ref().map(|w| w.path.clone());
            let mfr = agent.merge_failure_reported;
            agents.remove(&key);
            (tx, sid, pr, wt, mfr)
        };

        // Emit Done so wait_for_agent unblocks.
        let _ = tx.send(AgentEvent::Done {
            story_id: story_id.to_string(),
            agent_name: agent_name.to_string(),
            session_id,
        });

        // Notify WebSocket clients that the agent is gone.
        Self::notify_agent_state_changed(&self.watcher_tx);

        // Advance the pipeline state machine in a background task.
        // `pool_clone` is a shallow handle: every field is an Arc/Sender clone,
        // so it shares state with `self` rather than copying the pool.
        let pool_clone = Self {
            agents: Arc::clone(&self.agents),
            port: self.port,
            child_killers: Arc::clone(&self.child_killers),
            watcher_tx: self.watcher_tx.clone(),
            merge_jobs: Arc::clone(&self.merge_jobs),
        };
        let sid = story_id.to_string();
        let aname = agent_name.to_string();
        let report_for_advance = report.clone();
        tokio::spawn(async move {
            pool_clone
                .run_pipeline_advance(
                    &sid,
                    &aname,
                    report_for_advance,
                    project_root_for_advance,
                    wt_path_for_advance,
                    merge_failure_reported_for_advance,
                )
                .await;
        });

        Ok(report)
    }

    /// Run the full mergemaster pipeline for a completed story:
    ///
    /// 1. Squash-merge the story's feature branch into the current branch (master).
    /// 2. If conflicts are found: abort the merge and report them.
    /// 3. Quality gates run **inside the merge worktree** before master is touched.
    /// 4. If gates pass: cherry-pick the squash commit onto master and archive the story.
- /// - /// Returns a `MergeReport` with full details of what happened. - /// Start the merge pipeline as a background task. - /// - /// Returns immediately so the MCP tool call doesn't time out (the full - /// pipeline — squash merge + quality gates — takes well over 60 seconds, - /// exceeding Claude Code's MCP tool-call timeout). - /// - /// The mergemaster agent should poll [`get_merge_status`](Self::get_merge_status) - /// until the job reaches a terminal state. - pub fn start_merge_agent_work( - self: &Arc, - project_root: &Path, - story_id: &str, - ) -> Result<(), String> { - // Guard against double-starts. - { - let jobs = self.merge_jobs.lock().map_err(|e| e.to_string())?; - if let Some(job) = jobs.get(story_id) - && matches!(job.status, super::merge::MergeJobStatus::Running) - { - return Err(format!( - "Merge already in progress for '{story_id}'. \ - Use get_merge_status to poll for completion." - )); - } - } - - // Insert Running job. - { - let mut jobs = self.merge_jobs.lock().map_err(|e| e.to_string())?; - jobs.insert( - story_id.to_string(), - super::merge::MergeJob { - story_id: story_id.to_string(), - status: super::merge::MergeJobStatus::Running, - }, - ); - } - - let pool = Arc::clone(self); - let root = project_root.to_path_buf(); - let sid = story_id.to_string(); - - tokio::spawn(async move { - let report = pool.run_merge_pipeline(&root, &sid).await; - let failed = report.is_err(); - let status = match report { - Ok(r) => super::merge::MergeJobStatus::Completed(r), - Err(e) => super::merge::MergeJobStatus::Failed(e), - }; - if let Ok(mut jobs) = pool.merge_jobs.lock() - && let Some(job) = jobs.get_mut(&sid) - { - job.status = status; - } - if failed { - pool.auto_assign_available_work(&root).await; - } - }); - - Ok(()) - } - - /// The actual merge pipeline, run inside a background task. 
- async fn run_merge_pipeline( - self: &Arc, - project_root: &Path, - story_id: &str, - ) -> Result { - let branch = format!("feature/story-{story_id}"); - let wt_path = worktree::worktree_path(project_root, story_id); - let root = project_root.to_path_buf(); - let sid = story_id.to_string(); - let br = branch.clone(); - - let merge_result = - tokio::task::spawn_blocking(move || super::merge::run_squash_merge(&root, &br, &sid)) - .await - .map_err(|e| format!("Merge task panicked: {e}"))??; - - if !merge_result.success { - return Ok(super::merge::MergeReport { - story_id: story_id.to_string(), - success: false, - had_conflicts: merge_result.had_conflicts, - conflicts_resolved: merge_result.conflicts_resolved, - conflict_details: merge_result.conflict_details, - gates_passed: merge_result.gates_passed, - gate_output: merge_result.output, - worktree_cleaned_up: false, - story_archived: false, - }); - } - - let story_archived = - super::lifecycle::move_story_to_archived(project_root, story_id).is_ok(); - if story_archived { - self.remove_agents_for_story(story_id); - } - - let worktree_cleaned_up = if wt_path.exists() { - let config = crate::config::ProjectConfig::load(project_root).unwrap_or_default(); - worktree::remove_worktree_by_story_id(project_root, story_id, &config) - .await - .is_ok() - } else { - false - }; - - self.auto_assign_available_work(project_root).await; - - Ok(super::merge::MergeReport { - story_id: story_id.to_string(), - success: true, - had_conflicts: merge_result.had_conflicts, - conflicts_resolved: merge_result.conflicts_resolved, - conflict_details: merge_result.conflict_details, - gates_passed: true, - gate_output: merge_result.output, - worktree_cleaned_up, - story_archived, - }) - } - - /// Check the status of a background merge job. - pub fn get_merge_status(&self, story_id: &str) -> Option { - self.merge_jobs - .lock() - .ok() - .and_then(|jobs| jobs.get(story_id).cloned()) - } - - /// Get project root helper. 
- pub fn get_project_root(&self, state: &crate::state::SessionState) -> Result { - state.get_project_root() - } - - /// Get the log session ID and project root for an agent, if available. - /// - /// Used by MCP tools to find the persistent log file for a completed agent. - pub fn get_log_info(&self, story_id: &str, agent_name: &str) -> Option<(String, PathBuf)> { - let key = composite_key(story_id, agent_name); - let agents = self.agents.lock().ok()?; - let agent = agents.get(&key)?; - let session_id = agent.log_session_id.clone()?; - let project_root = agent.project_root.clone()?; - Some((session_id, project_root)) - } - - /// Record that the mergemaster agent for `story_id` explicitly reported a - /// merge failure via the `report_merge_failure` MCP tool. - /// - /// Sets `merge_failure_reported = true` on the active mergemaster agent so - /// that `run_pipeline_advance` can block advancement to `5_done/` even when - /// the server-owned gate check returns `gates_passed=true` (those gates run - /// in the feature-branch worktree, not on master). 
- pub fn set_merge_failure_reported(&self, story_id: &str) { - match self.agents.lock() { - Ok(mut lock) => { - let found = lock.iter_mut().find(|(key, agent)| { - let key_story_id = key - .rsplit_once(':') - .map(|(sid, _)| sid) - .unwrap_or(key.as_str()); - key_story_id == story_id - && pipeline_stage(&agent.agent_name) == PipelineStage::Mergemaster - }); - match found { - Some((_, agent)) => { - agent.merge_failure_reported = true; - slog!( - "[pipeline] Merge failure flag set for '{story_id}:{}'", - agent.agent_name - ); - } - None => { - slog_warn!( - "[pipeline] set_merge_failure_reported: no running mergemaster found \ - for story '{story_id}' — flag not set" - ); - } - } - } - Err(e) => { - slog_error!("[pipeline] set_merge_failure_reported: could not lock agents: {e}"); - } - } - } - - /// Test helper: inject a pre-built agent entry so unit tests can exercise - /// wait/subscribe logic without spawning a real process. - #[cfg(test)] - pub fn inject_test_agent( - &self, - story_id: &str, - agent_name: &str, - status: AgentStatus, - ) -> broadcast::Sender { - let (tx, _) = broadcast::channel::(64); - let key = composite_key(story_id, agent_name); - let mut agents = self.agents.lock().unwrap(); - agents.insert( - key, - StoryAgent { - agent_name: agent_name.to_string(), - status, - worktree_info: None, - session_id: None, - tx: tx.clone(), - task_handle: None, - event_log: Arc::new(Mutex::new(Vec::new())), - completion: None, - project_root: None, - log_session_id: None, - merge_failure_reported: false, - }, - ); - tx - } - - /// Test helper: inject an agent with a specific worktree path for testing - /// gate-related logic. 
    #[cfg(test)]
    pub fn inject_test_agent_with_path(
        &self,
        story_id: &str,
        agent_name: &str,
        status: AgentStatus,
        worktree_path: PathBuf,
    ) -> broadcast::Sender<AgentEvent> {
        let (tx, _) = broadcast::channel::<AgentEvent>(64);
        let key = composite_key(story_id, agent_name);
        let mut agents = self.agents.lock().unwrap();
        agents.insert(
            key,
            StoryAgent {
                agent_name: agent_name.to_string(),
                status,
                worktree_info: Some(WorktreeInfo {
                    path: worktree_path,
                    branch: format!("feature/story-{story_id}"),
                    base_branch: "master".to_string(),
                }),
                session_id: None,
                tx: tx.clone(),
                task_handle: None,
                event_log: Arc::new(Mutex::new(Vec::new())),
                completion: None,
                project_root: None,
                log_session_id: None,
                merge_failure_reported: false,
            },
        );
        tx
    }

    /// Automatically assign free agents to stories waiting in the active pipeline stages.
    ///
    /// Scans `work/2_current/`, `work/3_qa/`, and `work/4_merge/` for items that have no
    /// active agent and assigns the first free agent of the appropriate role. Items in
    /// `work/1_backlog/` are never auto-started.
    ///
    /// Respects the configured agent roster: the maximum number of concurrently active agents
    /// per role is bounded by the count of agents of that role defined in `project.toml`.
    pub async fn auto_assign_available_work(&self, project_root: &Path) {
        let config = match ProjectConfig::load(project_root) {
            Ok(c) => c,
            Err(e) => {
                slog_warn!("[auto-assign] Failed to load project config: {e}");
                return;
            }
        };

        // Process each active pipeline stage in order.
        let stages: [(&str, PipelineStage); 3] = [
            ("2_current", PipelineStage::Coder),
            ("3_qa", PipelineStage::Qa),
            ("4_merge", PipelineStage::Mergemaster),
        ];

        for (stage_dir, stage) in &stages {
            let items = scan_stage_items(project_root, stage_dir);
            if items.is_empty() {
                continue;
            }

            for story_id in &items {
                // Items marked with review_hold (e.g. spikes after QA passes) stay
                // in their current stage for human review — don't auto-assign agents.
                if has_review_hold(project_root, stage_dir, story_id) {
                    continue;
                }

                // Skip blocked stories (retry limit exceeded).
                if is_story_blocked(project_root, stage_dir, story_id) {
                    continue;
                }

                // Skip stories in 4_merge/ that already have a reported merge failure.
                // These need human intervention — auto-assigning a new mergemaster
                // would just waste tokens on the same broken merge.
                if *stage == PipelineStage::Mergemaster
                    && has_merge_failure(project_root, stage_dir, story_id)
                {
                    continue;
                }

                // AC6: Detect empty-diff stories in 4_merge/ before starting a
                // mergemaster. If the worktree has no commits on the feature branch,
                // write a merge_failure and block the story immediately.
                if *stage == PipelineStage::Mergemaster
                    && let Some(wt_path) = worktree::find_worktree_path(project_root, story_id)
                    && !super::gates::worktree_has_committed_work(&wt_path)
                {
                    slog_warn!(
                        "[auto-assign] Story '{story_id}' in 4_merge/ has no commits \
                        on feature branch. Writing merge_failure and blocking."
                    );
                    let story_path = project_root
                        .join(".story_kit/work")
                        .join(stage_dir)
                        .join(format!("{story_id}.md"));
                    let _ = crate::io::story_metadata::write_merge_failure(
                        &story_path,
                        "Feature branch has no code changes — the coder agent \
                        did not produce any commits.",
                    );
                    let _ = crate::io::story_metadata::write_blocked(&story_path);
                    continue;
                }

                // Re-acquire the lock on each iteration to see state changes
                // from previous start_agent calls in the same pass.
                let preferred_agent =
                    read_story_front_matter_agent(project_root, stage_dir, story_id);

                // Check max_coders limit for the Coder stage before agent selection.
                // If the pool is full, all remaining items in this stage wait
                // (hence `break`, not `continue`).
                if *stage == PipelineStage::Coder
                    && let Some(max) = config.max_coders
                {
                    let agents_lock = match self.agents.lock() {
                        Ok(a) => a,
                        Err(e) => {
                            slog_error!("[auto-assign] Failed to lock agents: {e}");
                            break;
                        }
                    };
                    let active = count_active_agents_for_stage(&config, &agents_lock, stage);
                    if active >= max {
                        slog!(
                            "[auto-assign] Coder pool full ({active}/{max}); remaining items in {stage_dir}/ will wait."
                        );
                        break;
                    }
                }

                // Outcome: (already_assigned, chosen_agent, preferred_busy, stage_mismatch)
                // preferred_busy=true means the story has a specific agent requested but it is
                // currently occupied — the story should wait rather than fall back.
                // stage_mismatch=true means the preferred agent's stage doesn't match the
                // pipeline stage, so we fell back to a generic stage agent.
                // The agents lock is held only inside this block so start_agent
                // (below) can take it again without deadlocking.
                let (already_assigned, free_agent, preferred_busy, stage_mismatch) = {
                    let agents = match self.agents.lock() {
                        Ok(a) => a,
                        Err(e) => {
                            slog_error!("[auto-assign] Failed to lock agents: {e}");
                            break;
                        }
                    };
                    let assigned = is_story_assigned_for_stage(&config, &agents, story_id, stage);
                    if assigned {
                        (true, None, false, false)
                    } else if let Some(ref pref) = preferred_agent {
                        // Story has a front-matter agent preference.
                        // Verify the preferred agent's stage matches the current
                        // pipeline stage — a coder shouldn't be assigned to QA.
                        let pref_stage_matches = config
                            .find_agent(pref)
                            .map(|cfg| agent_config_stage(cfg) == *stage)
                            .unwrap_or(false);
                        if !pref_stage_matches {
                            // Stage mismatch — fall back to any free agent for this stage.
                            let free = find_free_agent_for_stage(&config, &agents, stage)
                                .map(|s| s.to_string());
                            (false, free, false, true)
                        } else if is_agent_free(&agents, pref) {
                            (false, Some(pref.clone()), false, false)
                        } else {
                            (false, None, true, false)
                        }
                    } else {
                        let free = find_free_agent_for_stage(&config, &agents, stage)
                            .map(|s| s.to_string());
                        (false, free, false, false)
                    }
                };

                if already_assigned {
                    // Story already has an active agent — skip silently.
                    continue;
                }

                if preferred_busy {
                    // The story requests a specific agent that is currently busy.
                    // Do not fall back to a different agent; let this story wait.
                    slog!(
                        "[auto-assign] Preferred agent '{}' busy for '{story_id}'; story will wait.",
                        preferred_agent.as_deref().unwrap_or("?")
                    );
                    continue;
                }

                if stage_mismatch {
                    slog!(
                        "[auto-assign] Preferred agent '{}' stage mismatch for '{story_id}' in {stage_dir}/; falling back to stage-appropriate agent.",
                        preferred_agent.as_deref().unwrap_or("?")
                    );
                }

                match free_agent {
                    Some(agent_name) => {
                        slog!(
                            "[auto-assign] Assigning '{agent_name}' to '{story_id}' in {stage_dir}/"
                        );
                        if let Err(e) = self
                            .start_agent(project_root, story_id, Some(&agent_name), None)
                            .await
                        {
                            slog!(
                                "[auto-assign] Failed to start '{agent_name}' for '{story_id}': {e}"
                            );
                        }
                    }
                    None => {
                        // No free agents of this type — stop scanning this stage.
                        slog!(
                            "[auto-assign] All {:?} agents busy; remaining items in {stage_dir}/ will wait.",
                            stage
                        );
                        break;
                    }
                }
            }
        }
    }

    /// Reconcile stories whose agent work was committed while the server was offline.
    ///
    /// On server startup the in-memory agent pool is empty, so any story that an agent
    /// completed during a previous session is stuck: the worktree has committed work but
    /// the pipeline never advanced. This method detects those stories, re-runs the
    /// acceptance gates, and advances the pipeline stage so that `auto_assign_available_work`
    /// (called immediately after) picks up the right next-stage agents.
    ///
    /// Algorithm:
    /// 1. List all worktree directories under `{project_root}/.story_kit/worktrees/`.
    /// 2. For each worktree, check whether its feature branch has commits ahead of the
    ///    base branch (`master` / `main`).
    /// 3. If committed work is found AND the story is in `2_current/` or `3_qa/`:
    ///    - Run acceptance gates (uncommitted-change check + clippy + tests).
    ///    - On pass + `2_current/`: move the story to `3_qa/`.
    ///    - On pass + `3_qa/`: run the coverage gate; if that also passes move to `4_merge/`.
    ///    - On failure: leave the story where it is so `auto_assign_available_work` can
    ///      start a fresh agent to retry.
    /// 4. Stories in `4_merge/` are left for `auto_assign_available_work` to handle via a
    ///    fresh mergemaster (squash-merge must be re-executed by the mergemaster agent).
    pub async fn reconcile_on_startup(
        &self,
        project_root: &Path,
        progress_tx: &broadcast::Sender<ReconciliationEvent>,
    ) {
        let worktrees = match worktree::list_worktrees(project_root) {
            Ok(wt) => wt,
            Err(e) => {
                eprintln!("[startup:reconcile] Failed to list worktrees: {e}");
                let _ = progress_tx.send(ReconciliationEvent {
                    story_id: String::new(),
                    status: "done".to_string(),
                    message: format!("Reconciliation failed: {e}"),
                });
                return;
            }
        };

        for wt_entry in &worktrees {
            let story_id = &wt_entry.story_id;
            let wt_path = wt_entry.path.clone();

            // Determine which active stage the story is in.
            let stage_dir = match find_active_story_stage(project_root, story_id) {
                Some(s) => s,
                None => continue, // Not in any active stage (backlog/archived or unknown).
            };

            // 4_merge/ is left for auto_assign to handle with a fresh mergemaster.
            if stage_dir == "4_merge" {
                continue;
            }

            let _ = progress_tx.send(ReconciliationEvent {
                story_id: story_id.clone(),
                status: "checking".to_string(),
                message: format!("Checking for committed work in {stage_dir}/"),
            });

            // Check whether the worktree has commits ahead of the base branch.
            let wt_path_for_check = wt_path.clone();
            let has_work = tokio::task::spawn_blocking(move || {
                super::gates::worktree_has_committed_work(&wt_path_for_check)
            })
            .await
            .unwrap_or(false);

            if !has_work {
                eprintln!(
                    "[startup:reconcile] No committed work for '{story_id}' in {stage_dir}/; skipping."
                );
                let _ = progress_tx.send(ReconciliationEvent {
                    story_id: story_id.clone(),
                    status: "skipped".to_string(),
                    message: "No committed work found; skipping.".to_string(),
                });
                continue;
            }

            eprintln!(
                "[startup:reconcile] Found committed work for '{story_id}' in {stage_dir}/. Running acceptance gates."
            );
            let _ = progress_tx.send(ReconciliationEvent {
                story_id: story_id.clone(),
                status: "gates_running".to_string(),
                message: "Running acceptance gates…".to_string(),
            });

            // Run acceptance gates on the worktree.
            let wt_path_for_gates = wt_path.clone();
            let gates_result = tokio::task::spawn_blocking(move || {
                super::gates::check_uncommitted_changes(&wt_path_for_gates)?;
                super::gates::run_acceptance_gates(&wt_path_for_gates)
            })
            .await;

            let (gates_passed, gate_output) = match gates_result {
                Ok(Ok(pair)) => pair,
                Ok(Err(e)) => {
                    eprintln!("[startup:reconcile] Gate check error for '{story_id}': {e}");
                    let _ = progress_tx.send(ReconciliationEvent {
                        story_id: story_id.clone(),
                        status: "failed".to_string(),
                        message: format!("Gate error: {e}"),
                    });
                    continue;
                }
                Err(e) => {
                    eprintln!("[startup:reconcile] Gate check task panicked for '{story_id}': {e}");
                    let _ = progress_tx.send(ReconciliationEvent {
                        story_id: story_id.clone(),
                        status: "failed".to_string(),
                        message: format!("Gate task panicked: {e}"),
                    });
                    continue;
                }
            };

            if !gates_passed {
                eprintln!(
                    "[startup:reconcile] Gates failed for '{story_id}': {gate_output}\n\
                    Leaving in {stage_dir}/ for auto-assign to restart the agent."
                );
                let _ = progress_tx.send(ReconciliationEvent {
                    story_id: story_id.clone(),
                    status: "failed".to_string(),
                    message: "Gates failed; will be retried by auto-assign.".to_string(),
                });
                continue;
            }

            eprintln!("[startup:reconcile] Gates passed for '{story_id}' (stage: {stage_dir}/).");

            if stage_dir == "2_current" {
                // Coder stage — determine qa mode to decide next step.
                let qa_mode = {
                    let item_type = super::lifecycle::item_type_from_id(story_id);
                    if item_type == "spike" {
                        // Spikes always get human review.
                        crate::io::story_metadata::QaMode::Human
                    } else {
                        let default_qa = crate::config::ProjectConfig::load(project_root)
                            .unwrap_or_default()
                            .default_qa_mode();
                        let story_path = project_root
                            .join(".story_kit/work/2_current")
                            .join(format!("{story_id}.md"));
                        crate::io::story_metadata::resolve_qa_mode(&story_path, default_qa)
                    }
                };

                match qa_mode {
                    crate::io::story_metadata::QaMode::Server => {
                        if let Err(e) = super::lifecycle::move_story_to_merge(project_root, story_id) {
                            eprintln!("[startup:reconcile] Failed to move '{story_id}' to 4_merge/: {e}");
                            let _ = progress_tx.send(ReconciliationEvent {
                                story_id: story_id.clone(),
                                status: "failed".to_string(),
                                message: format!("Failed to advance to merge: {e}"),
                            });
                        } else {
                            eprintln!("[startup:reconcile] Moved '{story_id}' → 4_merge/ (qa: server).");
                            let _ = progress_tx.send(ReconciliationEvent {
                                story_id: story_id.clone(),
                                status: "advanced".to_string(),
                                message: "Gates passed — moved to merge (qa: server).".to_string(),
                            });
                        }
                    }
                    crate::io::story_metadata::QaMode::Agent => {
                        if let Err(e) = super::lifecycle::move_story_to_qa(project_root, story_id) {
                            eprintln!("[startup:reconcile] Failed to move '{story_id}' to 3_qa/: {e}");
                            let _ = progress_tx.send(ReconciliationEvent {
                                story_id: story_id.clone(),
                                status: "failed".to_string(),
                                message: format!("Failed to advance to QA: {e}"),
                            });
                        } else {
                            eprintln!("[startup:reconcile] Moved '{story_id}' → 3_qa/.");
                            let _ = progress_tx.send(ReconciliationEvent {
                                story_id: story_id.clone(),
                                status: "advanced".to_string(),
                                message: "Gates passed — moved to QA.".to_string(),
                            });
                        }
                    }
                    crate::io::story_metadata::QaMode::Human => {
                        if let Err(e) = super::lifecycle::move_story_to_qa(project_root, story_id) {
                            eprintln!("[startup:reconcile] Failed to move '{story_id}' to 3_qa/: {e}");
                            let _ = progress_tx.send(ReconciliationEvent {
                                story_id: story_id.clone(),
                                status: "failed".to_string(),
                                message: format!("Failed to advance to QA: {e}"),
                            });
                        } else {
                            let story_path = project_root
                                .join(".story_kit/work/3_qa")
                                .join(format!("{story_id}.md"));
                            if let Err(e) = crate::io::story_metadata::write_review_hold(&story_path) {
                                eprintln!(
                                    "[startup:reconcile] Failed to set review_hold on '{story_id}': {e}"
                                );
                            }
                            eprintln!("[startup:reconcile] Moved '{story_id}' → 3_qa/ (qa: human — holding for review).");
                            let _ = progress_tx.send(ReconciliationEvent {
                                story_id: story_id.clone(),
                                status: "review_hold".to_string(),
                                message: "Gates passed — holding for human review.".to_string(),
                            });
                        }
                    }
                }
            } else if stage_dir == "3_qa" {
                // QA stage → run coverage gate before advancing to merge.
                let wt_path_for_cov = wt_path.clone();
                let coverage_result = tokio::task::spawn_blocking(move || {
                    super::gates::run_coverage_gate(&wt_path_for_cov)
                })
                .await;

                let (coverage_passed, coverage_output) = match coverage_result {
                    Ok(Ok(pair)) => pair,
                    Ok(Err(e)) => {
                        eprintln!("[startup:reconcile] Coverage gate error for '{story_id}': {e}");
                        let _ = progress_tx.send(ReconciliationEvent {
                            story_id: story_id.clone(),
                            status: "failed".to_string(),
                            message: format!("Coverage gate error: {e}"),
                        });
                        continue;
                    }
                    Err(e) => {
                        eprintln!(
                            "[startup:reconcile] Coverage gate panicked for '{story_id}': {e}"
                        );
                        let _ = progress_tx.send(ReconciliationEvent {
                            story_id: story_id.clone(),
                            status: "failed".to_string(),
                            message: format!("Coverage gate panicked: {e}"),
                        });
                        continue;
                    }
                };

                if coverage_passed {
                    // Check whether this item needs human review before merging.
                    let needs_human_review = {
                        let item_type = super::lifecycle::item_type_from_id(story_id);
                        if item_type == "spike" {
                            true
                        } else {
                            let story_path = project_root
                                .join(".story_kit/work/3_qa")
                                .join(format!("{story_id}.md"));
                            let default_qa = crate::config::ProjectConfig::load(project_root)
                                .unwrap_or_default()
                                .default_qa_mode();
                            matches!(
                                crate::io::story_metadata::resolve_qa_mode(&story_path, default_qa),
                                crate::io::story_metadata::QaMode::Human
                            )
                        }
                    };

                    if needs_human_review {
                        let story_path = project_root
                            .join(".story_kit/work/3_qa")
                            .join(format!("{story_id}.md"));
                        if let Err(e) = crate::io::story_metadata::write_review_hold(&story_path) {
                            eprintln!(
                                "[startup:reconcile] Failed to set review_hold on '{story_id}': {e}"
                            );
                        }
                        eprintln!(
                            "[startup:reconcile] '{story_id}' passed QA — holding for human review."
                        );
                        let _ = progress_tx.send(ReconciliationEvent {
                            story_id: story_id.clone(),
                            status: "review_hold".to_string(),
                            message: "Passed QA — waiting for human review.".to_string(),
                        });
                    } else if let Err(e) =
                        super::lifecycle::move_story_to_merge(project_root, story_id)
                    {
                        eprintln!(
                            "[startup:reconcile] Failed to move '{story_id}' to 4_merge/: {e}"
                        );
                        let _ = progress_tx.send(ReconciliationEvent {
                            story_id: story_id.clone(),
                            status: "failed".to_string(),
                            message: format!("Failed to advance to merge: {e}"),
                        });
                    } else {
                        eprintln!("[startup:reconcile] Moved '{story_id}' → 4_merge/.");
                        let _ = progress_tx.send(ReconciliationEvent {
                            story_id: story_id.clone(),
                            status: "advanced".to_string(),
                            message: "Gates passed — moved to merge.".to_string(),
                        });
                    }
                } else {
                    eprintln!(
                        "[startup:reconcile] Coverage gate failed for '{story_id}': {coverage_output}\n\
                        Leaving in 3_qa/ for auto-assign to restart the QA agent."
                    );
                    let _ = progress_tx.send(ReconciliationEvent {
                        story_id: story_id.clone(),
                        status: "failed".to_string(),
                        message: "Coverage gate failed; will be retried.".to_string(),
                    });
                }
            }
        }

        // Signal that reconciliation is complete.
        let _ = progress_tx.send(ReconciliationEvent {
            story_id: String::new(),
            status: "done".to_string(),
            message: "Startup reconciliation complete.".to_string(),
        });
    }

    /// Test helper: inject an agent with a completion report and project_root
    /// for testing pipeline advance logic without spawning real agents.
    #[cfg(test)]
    pub fn inject_test_agent_with_completion(
        &self,
        story_id: &str,
        agent_name: &str,
        status: AgentStatus,
        project_root: PathBuf,
        completion: CompletionReport,
    ) -> broadcast::Sender<AgentEvent> {
        let (tx, _) = broadcast::channel::<AgentEvent>(64);
        let key = composite_key(story_id, agent_name);
        let mut agents = self.agents.lock().unwrap();
        agents.insert(
            key,
            StoryAgent {
                agent_name: agent_name.to_string(),
                status,
                worktree_info: None,
                session_id: None,
                tx: tx.clone(),
                task_handle: None,
                event_log: Arc::new(Mutex::new(Vec::new())),
                completion: Some(completion),
                project_root: Some(project_root),
                log_session_id: None,
                merge_failure_reported: false,
            },
        );
        tx
    }

    /// Inject a Running agent with a pre-built (possibly finished) task handle.
    /// Used by watchdog tests to simulate an orphaned agent.
- #[cfg(test)] - pub fn inject_test_agent_with_handle( - &self, - story_id: &str, - agent_name: &str, - status: AgentStatus, - task_handle: tokio::task::JoinHandle<()>, - ) -> broadcast::Sender { - let (tx, _) = broadcast::channel::(64); - let key = composite_key(story_id, agent_name); - let mut agents = self.agents.lock().unwrap(); - agents.insert( - key, - StoryAgent { - agent_name: agent_name.to_string(), - status, - worktree_info: None, - session_id: None, - tx: tx.clone(), - task_handle: Some(task_handle), - event_log: Arc::new(Mutex::new(Vec::new())), - completion: None, - project_root: None, - log_session_id: None, - merge_failure_reported: false, - }, - ); - tx - } - - /// Test helper: inject a child killer into the registry. - #[cfg(test)] - pub fn inject_child_killer(&self, key: &str, killer: Box) { - let mut killers = self.child_killers.lock().unwrap(); - killers.insert(key.to_string(), killer); - } - - /// Test helper: return the number of registered child killers. - #[cfg(test)] - pub fn child_killer_count(&self) -> usize { - self.child_killers.lock().unwrap().len() - } - - /// Run a single watchdog pass synchronously (test helper). - #[cfg(test)] - pub fn run_watchdog_once(&self) { - check_orphaned_agents(&self.agents); - } - - /// Spawn a background watchdog task that periodically checks for Running agents - /// whose underlying task has already finished (orphaned entries). Any such agent - /// is marked Failed and an Error event is emitted so that `wait_for_agent` unblocks. - /// - /// The watchdog runs every 30 seconds. It is a safety net for edge cases where the - /// PTY read loop exits without updating the agent status (e.g. a panic in the - /// spawn_blocking task, or an external SIGKILL that closes the PTY fd immediately). - /// - /// When orphaned agents are detected and a `project_root` is provided, auto-assign - /// is triggered so that free agents can pick up unassigned work. 
- pub fn spawn_watchdog(pool: Arc, project_root: Option) { - tokio::spawn(async move { - let mut interval = tokio::time::interval(std::time::Duration::from_secs(30)); - loop { - interval.tick().await; - let found = check_orphaned_agents(&pool.agents); - if found > 0 - && let Some(ref root) = project_root - { - slog!("[watchdog] {found} orphaned agent(s) detected; triggering auto-assign."); - pool.auto_assign_available_work(root).await; - } - } - }); - } - - /// Remove all agent entries for a given story_id from the pool. - /// - /// Called when a story is archived so that stale entries don't accumulate. - /// Returns the number of entries removed. - pub fn remove_agents_for_story(&self, story_id: &str) -> usize { - let mut agents = match self.agents.lock() { - Ok(a) => a, - Err(e) => { - slog_error!("[agents] Failed to lock pool for cleanup of '{story_id}': {e}"); - return 0; - } - }; - let prefix = format!("{story_id}:"); - let keys_to_remove: Vec = agents - .keys() - .filter(|k| k.starts_with(&prefix)) - .cloned() - .collect(); - let count = keys_to_remove.len(); - for key in &keys_to_remove { - agents.remove(key); - } - if count > 0 { - slog!("[agents] Removed {count} agent entries for archived story '{story_id}'"); - } - count - } -} - -/// Return the active pipeline stage directory name for `story_id`, or `None` if the -/// story is not in any active stage (`2_current/`, `3_qa/`, `4_merge/`). -fn find_active_story_stage(project_root: &Path, story_id: &str) -> Option<&'static str> { - const STAGES: [&str; 3] = ["2_current", "3_qa", "4_merge"]; - for stage in &STAGES { - let path = project_root - .join(".story_kit") - .join("work") - .join(stage) - .join(format!("{story_id}.md")); - if path.exists() { - return Some(stage); - } - } - None -} - -/// Scan a work pipeline stage directory and return story IDs, sorted alphabetically. -/// Returns an empty `Vec` if the directory does not exist. 
-/// Read the optional `agent:` field from the front matter of a story file. -/// -/// Returns `Some(agent_name)` if the front matter specifies an agent, or `None` -/// if the field is absent or the file cannot be read / parsed. -fn read_story_front_matter_agent( - project_root: &Path, - stage_dir: &str, - story_id: &str, -) -> Option { - use crate::io::story_metadata::parse_front_matter; - let path = project_root - .join(".story_kit") - .join("work") - .join(stage_dir) - .join(format!("{story_id}.md")); - let contents = std::fs::read_to_string(path).ok()?; - parse_front_matter(&contents).ok()?.agent -} - -/// Return `true` if the story file in the given stage has `review_hold: true` in its front matter. -fn has_review_hold(project_root: &Path, stage_dir: &str, story_id: &str) -> bool { - use crate::io::story_metadata::parse_front_matter; - let path = project_root - .join(".story_kit") - .join("work") - .join(stage_dir) - .join(format!("{story_id}.md")); - let contents = match std::fs::read_to_string(path) { - Ok(c) => c, - Err(_) => return false, - }; - parse_front_matter(&contents) - .ok() - .and_then(|m| m.review_hold) - .unwrap_or(false) -} - -/// Increment retry_count and block the story if it exceeds `max_retries`. -/// -/// Returns `true` if the story is now blocked (caller should NOT restart the agent). -/// Returns `false` if the story may be retried. -/// When `max_retries` is 0, retry limits are disabled. -fn should_block_story(story_path: &Path, max_retries: u32, story_id: &str, stage_label: &str) -> bool { - use crate::io::story_metadata::{increment_retry_count, write_blocked}; - - if max_retries == 0 { - // Retry limits disabled. - return false; - } - - match increment_retry_count(story_path) { - Ok(new_count) => { - if new_count >= max_retries { - slog_warn!( - "[pipeline] Story '{story_id}' reached retry limit ({new_count}/{max_retries}) \ - at {stage_label} stage. Marking as blocked." 
- ); - if let Err(e) = write_blocked(story_path) { - slog_error!("[pipeline] Failed to write blocked flag for '{story_id}': {e}"); - } - true - } else { - slog!( - "[pipeline] Story '{story_id}' retry {new_count}/{max_retries} at {stage_label} stage." - ); - false - } - } - Err(e) => { - slog_error!("[pipeline] Failed to increment retry_count for '{story_id}': {e}"); - false // Don't block on error — allow retry. - } - } -} - -/// Return `true` if the story file has `blocked: true` in its front matter. -fn is_story_blocked(project_root: &Path, stage_dir: &str, story_id: &str) -> bool { - use crate::io::story_metadata::parse_front_matter; - let path = project_root - .join(".story_kit") - .join("work") - .join(stage_dir) - .join(format!("{story_id}.md")); - let contents = match std::fs::read_to_string(path) { - Ok(c) => c, - Err(_) => return false, - }; - parse_front_matter(&contents) - .ok() - .and_then(|m| m.blocked) - .unwrap_or(false) -} - -/// Return `true` if the story file has a `merge_failure` field in its front matter. -fn has_merge_failure(project_root: &Path, stage_dir: &str, story_id: &str) -> bool { - use crate::io::story_metadata::parse_front_matter; - let path = project_root - .join(".story_kit") - .join("work") - .join(stage_dir) - .join(format!("{story_id}.md")); - let contents = match std::fs::read_to_string(path) { - Ok(c) => c, - Err(_) => return false, - }; - parse_front_matter(&contents) - .ok() - .and_then(|m| m.merge_failure) - .is_some() -} - -/// Return `true` if `agent_name` has no active (pending/running) entry in the pool. 
-fn is_agent_free(agents: &HashMap, agent_name: &str) -> bool { - !agents.values().any(|a| { - a.agent_name == agent_name - && matches!(a.status, AgentStatus::Running | AgentStatus::Pending) - }) -} - -fn scan_stage_items(project_root: &Path, stage_dir: &str) -> Vec { - let dir = project_root.join(".story_kit").join("work").join(stage_dir); - if !dir.is_dir() { - return Vec::new(); - } - let mut items = Vec::new(); - if let Ok(entries) = std::fs::read_dir(&dir) { - for entry in entries.flatten() { - let path = entry.path(); - if path.extension().and_then(|e| e.to_str()) == Some("md") - && let Some(stem) = path.file_stem().and_then(|s| s.to_str()) - { - items.push(stem.to_string()); - } - } - } - items.sort(); - items -} - -/// Return `true` if `story_id` has any active (pending/running) agent matching `stage`. -/// -/// Uses the explicit `stage` config field when the agent is found in `config`; -/// falls back to the legacy name-based heuristic for unlisted agents. -fn is_story_assigned_for_stage( - config: &ProjectConfig, - agents: &HashMap, - story_id: &str, - stage: &PipelineStage, -) -> bool { - agents.iter().any(|(key, agent)| { - // Composite key format: "{story_id}:{agent_name}" - let key_story_id = key.rsplit_once(':').map(|(sid, _)| sid).unwrap_or(key); - let agent_stage = config - .find_agent(&agent.agent_name) - .map(agent_config_stage) - .unwrap_or_else(|| pipeline_stage(&agent.agent_name)); - key_story_id == story_id - && agent_stage == *stage - && matches!(agent.status, AgentStatus::Running | AgentStatus::Pending) - }) -} - -/// Count active (pending/running) agents for a given pipeline stage. 
-fn count_active_agents_for_stage( - config: &ProjectConfig, - agents: &HashMap, - stage: &PipelineStage, -) -> usize { - agents - .values() - .filter(|a| { - matches!(a.status, AgentStatus::Running | AgentStatus::Pending) - && config - .find_agent(&a.agent_name) - .map(|cfg| agent_config_stage(cfg) == *stage) - .unwrap_or_else(|| pipeline_stage(&a.agent_name) == *stage) - }) - .count() -} - -/// Find the first configured agent for `stage` that has no active (pending/running) assignment. -/// Returns `None` if all agents for that stage are busy, none are configured, -/// or the `max_coders` limit has been reached (for the Coder stage). -/// -/// For the Coder stage, when `default_coder_model` is set, only considers agents whose -/// model matches the default. This ensures opus-class agents are reserved for explicit -/// front-matter requests. -fn find_free_agent_for_stage<'a>( - config: &'a ProjectConfig, - agents: &HashMap, - stage: &PipelineStage, -) -> Option<&'a str> { - // Enforce max_coders limit for the Coder stage. - if *stage == PipelineStage::Coder - && let Some(max) = config.max_coders - { - let active = count_active_agents_for_stage(config, agents, stage); - if active >= max { - return None; - } - } - - for agent_config in &config.agent { - if agent_config_stage(agent_config) != *stage { - continue; - } - // When default_coder_model is set, only auto-assign coder agents whose - // model matches. This keeps opus agents reserved for explicit requests. 
- if *stage == PipelineStage::Coder - && let Some(ref default_model) = config.default_coder_model - { - let agent_model = agent_config.model.as_deref().unwrap_or(""); - if agent_model != default_model { - continue; - } - } - let is_busy = agents.values().any(|a| { - a.agent_name == agent_config.name - && matches!(a.status, AgentStatus::Running | AgentStatus::Pending) - }); - if !is_busy { - return Some(&agent_config.name); - } - } - None -} - -/// Scan the agent pool for Running entries whose backing tokio task has already -/// finished and mark them as Failed. -/// -/// This handles the case where the PTY read loop or the spawned task exits -/// without updating the agent status — for example when the process is killed -/// externally and the PTY master fd returns EOF before our inactivity timeout -/// fires, but some other edge case prevents the normal cleanup path from running. -fn check_orphaned_agents(agents: &Mutex>) -> usize { - let mut lock = match agents.lock() { - Ok(l) => l, - Err(_) => return 0, - }; - - // Collect orphaned entries: Running or Pending agents whose task handle is finished. - // Pending agents can be orphaned if worktree creation panics before setting status. - let orphaned: Vec<(String, String, broadcast::Sender, AgentStatus)> = lock - .iter() - .filter_map(|(key, agent)| { - if matches!(agent.status, AgentStatus::Running | AgentStatus::Pending) - && let Some(handle) = &agent.task_handle - && handle.is_finished() - { - let story_id = key - .rsplit_once(':') - .map(|(s, _)| s.to_string()) - .unwrap_or_else(|| key.clone()); - return Some(( - key.clone(), - story_id, - agent.tx.clone(), - agent.status.clone(), - )); - } - None - }) - .collect(); - - let count = orphaned.len(); - for (key, story_id, tx, prev_status) in orphaned { - if let Some(agent) = lock.get_mut(&key) { - agent.status = AgentStatus::Failed; - slog!( - "[watchdog] Orphaned agent '{key}': task finished but status was {prev_status}. \ - Marking Failed." 
- ); - let _ = tx.send(AgentEvent::Error { - story_id, - agent_name: agent.agent_name.clone(), - message: "Agent process terminated unexpectedly (watchdog detected orphan)" - .to_string(), - }); - } - } - count -} - -/// Server-owned completion: runs acceptance gates when an agent process exits -/// normally, and advances the pipeline based on results. -/// -/// This is a **free function** (not a method on `AgentPool`) to break the -/// opaque type cycle that would otherwise arise: `start_agent` → spawned task -/// → server-owned completion → pipeline advance → `start_agent`. -/// -/// If the agent already has a completion report (e.g. from a legacy -/// `report_completion` call), this is a no-op to avoid double-running gates. -async fn run_server_owned_completion( - agents: &Arc>>, - port: u16, - story_id: &str, - agent_name: &str, - session_id: Option, - watcher_tx: broadcast::Sender, -) { - let key = composite_key(story_id, agent_name); - - // Guard: skip if completion was already recorded (legacy path). - { - let lock = match agents.lock() { - Ok(a) => a, - Err(_) => return, - }; - match lock.get(&key) { - Some(agent) if agent.completion.is_some() => { - slog!( - "[agents] Completion already recorded for '{story_id}:{agent_name}'; \ - skipping server-owned gates." - ); - return; - } - Some(_) => {} - None => return, - } - } - - // Get worktree path for running gates. - let worktree_path = { - let lock = match agents.lock() { - Ok(a) => a, - Err(_) => return, - }; - lock.get(&key) - .and_then(|a| a.worktree_info.as_ref().map(|wt| wt.path.clone())) - }; - - // Run acceptance gates. - let (gates_passed, gate_output) = if let Some(wt_path) = worktree_path { - let path = wt_path; - match tokio::task::spawn_blocking(move || { - super::gates::check_uncommitted_changes(&path)?; - // AC5: Fail early if the coder finished with no commits on the feature branch. - // This prevents empty-diff stories from advancing through QA to merge. 
- if !super::gates::worktree_has_committed_work(&path) { - return Ok(( - false, - "Agent exited with no commits on the feature branch. \ - The agent did not produce any code changes." - .to_string(), - )); - } - super::gates::run_acceptance_gates(&path) - }) - .await - { - Ok(Ok(result)) => result, - Ok(Err(e)) => (false, e), - Err(e) => (false, format!("Gate check task panicked: {e}")), - } - } else { - ( - false, - "No worktree path available to run acceptance gates".to_string(), - ) - }; - - slog!( - "[agents] Server-owned completion for '{story_id}:{agent_name}': gates_passed={gates_passed}" - ); - - let report = CompletionReport { - summary: "Agent process exited normally".to_string(), - gates_passed, - gate_output, - }; - - // Store completion report, extract data for pipeline advance, then - // remove the entry so completed agents never appear in list_agents. - let (tx, project_root_for_advance, wt_path_for_advance, merge_failure_reported_for_advance) = { - let mut lock = match agents.lock() { - Ok(a) => a, - Err(_) => return, - }; - let agent = match lock.get_mut(&key) { - Some(a) => a, - None => return, - }; - agent.completion = Some(report.clone()); - agent.session_id = session_id.clone(); - let tx = agent.tx.clone(); - let pr = agent.project_root.clone(); - let wt = agent.worktree_info.as_ref().map(|w| w.path.clone()); - let mfr = agent.merge_failure_reported; - lock.remove(&key); - (tx, pr, wt, mfr) - }; - - // Emit Done so wait_for_agent unblocks. - let _ = tx.send(AgentEvent::Done { - story_id: story_id.to_string(), - agent_name: agent_name.to_string(), - session_id, - }); - - // Notify WebSocket clients that the agent is gone. - AgentPool::notify_agent_state_changed(&watcher_tx); - - // Advance the pipeline state machine in a background task. 
- spawn_pipeline_advance( - Arc::clone(agents), - port, - story_id, - agent_name, - report, - project_root_for_advance, - wt_path_for_advance, - watcher_tx, - merge_failure_reported_for_advance, - ); -} - -/// Spawn pipeline advancement as a background task. -/// -/// This is a **non-async** function so it does not participate in the opaque -/// type cycle between `start_agent` and `run_server_owned_completion`. -#[allow(clippy::too_many_arguments)] -fn spawn_pipeline_advance( - agents: Arc>>, - port: u16, - story_id: &str, - agent_name: &str, - completion: CompletionReport, - project_root: Option, - worktree_path: Option, - watcher_tx: broadcast::Sender, - merge_failure_reported: bool, -) { - let sid = story_id.to_string(); - let aname = agent_name.to_string(); - tokio::spawn(async move { - let pool = AgentPool { - agents, - port, - child_killers: Arc::new(Mutex::new(HashMap::new())), - watcher_tx, - merge_jobs: Arc::new(Mutex::new(HashMap::new())), - }; - pool.run_pipeline_advance( - &sid, - &aname, - completion, - project_root, - worktree_path, - merge_failure_reported, - ) - .await; - }); -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::agents::merge::{MergeJob, MergeJobStatus}; - use crate::agents::{ - AgentEvent, AgentStatus, CompletionReport, PipelineStage, ReconciliationEvent, - lifecycle::move_story_to_archived, - }; - use crate::config::ProjectConfig; - use crate::io::watcher::WatcherEvent; - use portable_pty::{CommandBuilder, PtySize, native_pty_system}; - use std::collections::HashMap; - use std::path::PathBuf; - use std::process::Command; - use tokio::sync::broadcast; - - fn init_git_repo(repo: &std::path::Path) { - Command::new("git") - .args(["init"]) - .current_dir(repo) - .output() - .unwrap(); - Command::new("git") - .args(["config", "user.email", "test@test.com"]) - .current_dir(repo) - .output() - .unwrap(); - Command::new("git") - .args(["config", "user.name", "Test"]) - .current_dir(repo) - .output() - .unwrap(); - 
Command::new("git") - .args(["commit", "--allow-empty", "-m", "init"]) - .current_dir(repo) - .output() - .unwrap(); - } - - fn make_config(toml_str: &str) -> ProjectConfig { - ProjectConfig::parse(toml_str).unwrap() - } - - #[tokio::test] - async fn wait_for_agent_returns_immediately_if_completed() { - let pool = AgentPool::new_test(3001); - pool.inject_test_agent("s1", "bot", AgentStatus::Completed); - - let info = pool.wait_for_agent("s1", "bot", 1000).await.unwrap(); - assert_eq!(info.status, AgentStatus::Completed); - assert_eq!(info.story_id, "s1"); - assert_eq!(info.agent_name, "bot"); - } - - #[tokio::test] - async fn wait_for_agent_returns_immediately_if_failed() { - let pool = AgentPool::new_test(3001); - pool.inject_test_agent("s2", "bot", AgentStatus::Failed); - - let info = pool.wait_for_agent("s2", "bot", 1000).await.unwrap(); - assert_eq!(info.status, AgentStatus::Failed); - } - - #[tokio::test] - async fn wait_for_agent_completes_on_done_event() { - let pool = AgentPool::new_test(3001); - let tx = pool.inject_test_agent("s3", "bot", AgentStatus::Running); - - // Send Done event after a short delay - let tx_clone = tx.clone(); - tokio::spawn(async move { - tokio::time::sleep(std::time::Duration::from_millis(50)).await; - // Mark status via event; real code also updates the map, but for - // this unit test the map entry stays Running — we verify the - // wait loop reacts to the event. - let _ = tx_clone.send(AgentEvent::Done { - story_id: "s3".to_string(), - agent_name: "bot".to_string(), - session_id: Some("sess-abc".to_string()), - }); - }); - - let info = pool.wait_for_agent("s3", "bot", 2000).await.unwrap(); - // Status comes from the map entry (Running in this unit test) - // — the important thing is that wait_for_agent returned without timing out. 
- assert_eq!(info.story_id, "s3"); - } - - #[tokio::test] - async fn wait_for_agent_times_out() { - let pool = AgentPool::new_test(3001); - pool.inject_test_agent("s4", "bot", AgentStatus::Running); - - let result = pool.wait_for_agent("s4", "bot", 50).await; - assert!(result.is_err()); - let msg = result.unwrap_err(); - assert!(msg.contains("Timed out"), "unexpected message: {msg}"); - } - - #[tokio::test] - async fn wait_for_agent_errors_for_nonexistent() { - let pool = AgentPool::new_test(3001); - let result = pool.wait_for_agent("no_story", "no_bot", 100).await; - assert!(result.is_err()); - } - - #[tokio::test] - async fn wait_for_agent_completes_on_stopped_status_event() { - let pool = AgentPool::new_test(3001); - let tx = pool.inject_test_agent("s5", "bot", AgentStatus::Running); - - let tx_clone = tx.clone(); - tokio::spawn(async move { - tokio::time::sleep(std::time::Duration::from_millis(30)).await; - let _ = tx_clone.send(AgentEvent::Status { - story_id: "s5".to_string(), - agent_name: "bot".to_string(), - status: "stopped".to_string(), - }); - }); - - let info = pool.wait_for_agent("s5", "bot", 2000).await.unwrap(); - assert_eq!(info.story_id, "s5"); - } - - // ── report_completion tests ──────────────────────────────────── - - #[tokio::test] - async fn report_completion_rejects_nonexistent_agent() { - let pool = AgentPool::new_test(3001); - let result = pool.report_completion("no_story", "no_bot", "done").await; - assert!(result.is_err()); - let msg = result.unwrap_err(); - assert!(msg.contains("No agent"), "unexpected: {msg}"); - } - - #[tokio::test] - async fn report_completion_rejects_non_running_agent() { - let pool = AgentPool::new_test(3001); - pool.inject_test_agent("s6", "bot", AgentStatus::Completed); - - let result = pool.report_completion("s6", "bot", "done").await; - assert!(result.is_err()); - let msg = result.unwrap_err(); - assert!( - msg.contains("not running"), - "expected 'not running' in: {msg}" - ); - } - - #[tokio::test] - async fn 
report_completion_rejects_dirty_worktree() { - use std::fs; - use tempfile::tempdir; - - let tmp = tempdir().unwrap(); - let repo = tmp.path(); - - // Init a real git repo and make an initial commit - Command::new("git") - .args(["init"]) - .current_dir(repo) - .output() - .unwrap(); - Command::new("git") - .args(["commit", "--allow-empty", "-m", "init"]) - .current_dir(repo) - .output() - .unwrap(); - - // Write an uncommitted file - fs::write(repo.join("dirty.txt"), "not committed").unwrap(); - - let pool = AgentPool::new_test(3001); - pool.inject_test_agent_with_path("s7", "bot", AgentStatus::Running, repo.to_path_buf()); - - let result = pool.report_completion("s7", "bot", "done").await; - assert!(result.is_err()); - let msg = result.unwrap_err(); - assert!( - msg.contains("uncommitted"), - "expected 'uncommitted' in: {msg}" - ); - } - - // ── server-owned completion tests ─────────────────────────────────────────── - - #[tokio::test] - async fn server_owned_completion_skips_when_already_completed() { - let pool = AgentPool::new_test(3001); - let report = CompletionReport { - summary: "Already done".to_string(), - gates_passed: true, - gate_output: String::new(), - }; - pool.inject_test_agent_with_completion( - "s10", - "coder-1", - AgentStatus::Completed, - PathBuf::from("/tmp/nonexistent"), - report, - ); - - // Subscribe before calling so we can check if Done event was emitted. - let mut rx = pool.subscribe("s10", "coder-1").unwrap(); - - run_server_owned_completion( - &pool.agents, - pool.port, - "s10", - "coder-1", - Some("sess-1".to_string()), - pool.watcher_tx.clone(), - ) - .await; - - // Status should remain Completed (unchanged) — no gate re-run. - let agents = pool.agents.lock().unwrap(); - let key = composite_key("s10", "coder-1"); - let agent = agents.get(&key).unwrap(); - assert_eq!(agent.status, AgentStatus::Completed); - // Summary should still be the original, not overwritten. 
- assert_eq!(agent.completion.as_ref().unwrap().summary, "Already done"); - drop(agents); - - // No Done event should have been emitted. - assert!( - rx.try_recv().is_err(), - "should not emit Done when completion already exists" - ); - } - - #[tokio::test] - async fn server_owned_completion_runs_gates_on_clean_worktree() { - use tempfile::tempdir; - - let tmp = tempdir().unwrap(); - let repo = tmp.path(); - init_git_repo(repo); - - let pool = AgentPool::new_test(3001); - pool.inject_test_agent_with_path( - "s11", - "coder-1", - AgentStatus::Running, - repo.to_path_buf(), - ); - - let mut rx = pool.subscribe("s11", "coder-1").unwrap(); - - run_server_owned_completion( - &pool.agents, - pool.port, - "s11", - "coder-1", - Some("sess-2".to_string()), - pool.watcher_tx.clone(), - ) - .await; - - // Agent entry should be removed from the map after completion. - let agents = pool.agents.lock().unwrap(); - let key = composite_key("s11", "coder-1"); - assert!( - agents.get(&key).is_none(), - "agent should be removed from map after completion" - ); - drop(agents); - - // A Done event should have been emitted with the session_id. - let event = rx.try_recv().expect("should emit Done event"); - match &event { - AgentEvent::Done { session_id, .. } => { - assert_eq!(*session_id, Some("sess-2".to_string())); - } - other => panic!("expected Done event, got: {other:?}"), - } - } - - #[tokio::test] - async fn server_owned_completion_fails_on_dirty_worktree() { - use std::fs; - use tempfile::tempdir; - - let tmp = tempdir().unwrap(); - let repo = tmp.path(); - init_git_repo(repo); - // Create an uncommitted file. 
- fs::write(repo.join("dirty.txt"), "not committed").unwrap(); - - let pool = AgentPool::new_test(3001); - pool.inject_test_agent_with_path( - "s12", - "coder-1", - AgentStatus::Running, - repo.to_path_buf(), - ); - - let mut rx = pool.subscribe("s12", "coder-1").unwrap(); - - run_server_owned_completion( - &pool.agents, - pool.port, - "s12", - "coder-1", - None, - pool.watcher_tx.clone(), - ) - .await; - - // Agent entry should be removed from the map after completion (even on failure). - let agents = pool.agents.lock().unwrap(); - let key = composite_key("s12", "coder-1"); - assert!( - agents.get(&key).is_none(), - "agent should be removed from map after failed completion" - ); - drop(agents); - - // A Done event should have been emitted. - let event = rx.try_recv().expect("should emit Done event"); - assert!( - matches!(event, AgentEvent::Done { .. }), - "expected Done event, got: {event:?}" - ); - } - - #[tokio::test] - async fn server_owned_completion_nonexistent_agent_is_noop() { - let pool = AgentPool::new_test(3001); - // Should not panic or error — just silently return. 
- run_server_owned_completion( - &pool.agents, - pool.port, - "nonexistent", - "bot", - None, - pool.watcher_tx.clone(), - ) - .await; - } - - // ── pipeline advance tests ──────────────────────────────────────────────── - - #[tokio::test] - async fn pipeline_advance_coder_gates_pass_server_qa_moves_to_merge() { - use std::fs; - let tmp = tempfile::tempdir().unwrap(); - let root = tmp.path(); - - // Set up story in 2_current/ (no qa frontmatter → uses project default "server") - let current = root.join(".story_kit/work/2_current"); - fs::create_dir_all(¤t).unwrap(); - fs::write(current.join("50_story_test.md"), "test").unwrap(); - - let pool = AgentPool::new_test(3001); - pool.run_pipeline_advance( - "50_story_test", - "coder-1", - CompletionReport { - summary: "done".to_string(), - gates_passed: true, - gate_output: String::new(), - }, - Some(root.to_path_buf()), - None, - false, - ) - .await; - - // With default qa: server, story skips QA and goes straight to 4_merge/ - assert!( - root.join(".story_kit/work/4_merge/50_story_test.md") - .exists(), - "story should be in 4_merge/" - ); - assert!( - !current.join("50_story_test.md").exists(), - "story should not still be in 2_current/" - ); - } - - #[tokio::test] - async fn pipeline_advance_coder_gates_pass_agent_qa_moves_to_qa() { - use std::fs; - let tmp = tempfile::tempdir().unwrap(); - let root = tmp.path(); - - // Set up story in 2_current/ with qa: agent frontmatter - let current = root.join(".story_kit/work/2_current"); - fs::create_dir_all(¤t).unwrap(); - fs::write( - current.join("50_story_test.md"), - "---\nname: Test\nqa: agent\n---\ntest", - ) - .unwrap(); - - let pool = AgentPool::new_test(3001); - pool.run_pipeline_advance( - "50_story_test", - "coder-1", - CompletionReport { - summary: "done".to_string(), - gates_passed: true, - gate_output: String::new(), - }, - Some(root.to_path_buf()), - None, - false, - ) - .await; - - // With qa: agent, story should move to 3_qa/ - assert!( - 
root.join(".story_kit/work/3_qa/50_story_test.md").exists(), - "story should be in 3_qa/" - ); - assert!( - !current.join("50_story_test.md").exists(), - "story should not still be in 2_current/" - ); - } - - #[tokio::test] - async fn pipeline_advance_qa_gates_pass_moves_story_to_merge() { - use std::fs; - let tmp = tempfile::tempdir().unwrap(); - let root = tmp.path(); - - // Set up story in 3_qa/ - let qa_dir = root.join(".story_kit/work/3_qa"); - fs::create_dir_all(&qa_dir).unwrap(); - // qa: server so the story skips human review and goes straight to merge. - fs::write( - qa_dir.join("51_story_test.md"), - "---\nname: Test\nqa: server\n---\ntest", - ) - .unwrap(); - - let pool = AgentPool::new_test(3001); - pool.run_pipeline_advance( - "51_story_test", - "qa", - CompletionReport { - summary: "QA done".to_string(), - gates_passed: true, - gate_output: String::new(), - }, - Some(root.to_path_buf()), - None, - false, - ) - .await; - - // Story should have moved to 4_merge/ - assert!( - root.join(".story_kit/work/4_merge/51_story_test.md") - .exists(), - "story should be in 4_merge/" - ); - assert!( - !qa_dir.join("51_story_test.md").exists(), - "story should not still be in 3_qa/" - ); - } - - #[tokio::test] - async fn pipeline_advance_supervisor_does_not_advance() { - use std::fs; - let tmp = tempfile::tempdir().unwrap(); - let root = tmp.path(); - - let current = root.join(".story_kit/work/2_current"); - fs::create_dir_all(¤t).unwrap(); - fs::write(current.join("52_story_test.md"), "test").unwrap(); - - let pool = AgentPool::new_test(3001); - pool.run_pipeline_advance( - "52_story_test", - "supervisor", - CompletionReport { - summary: "supervised".to_string(), - gates_passed: true, - gate_output: String::new(), - }, - Some(root.to_path_buf()), - None, - false, - ) - .await; - - // Story should NOT have moved (supervisors don't advance pipeline) - assert!( - current.join("52_story_test.md").exists(), - "story should still be in 2_current/ for supervisor" - ); - } 
- - #[tokio::test] - async fn pipeline_advance_sends_agent_state_changed_to_watcher_tx() { - use std::fs; - - let tmp = tempfile::tempdir().unwrap(); - let root = tmp.path(); - - // Set up story in 2_current/ - let current = root.join(".story_kit/work/2_current"); - fs::create_dir_all(¤t).unwrap(); - fs::write(current.join("173_story_test.md"), "test").unwrap(); - // Ensure 3_qa/ exists for the move target - fs::create_dir_all(root.join(".story_kit/work/3_qa")).unwrap(); - // Ensure 1_backlog/ exists (start_agent calls move_story_to_current) - fs::create_dir_all(root.join(".story_kit/work/1_backlog")).unwrap(); - - // Write a project.toml with a qa agent so start_agent can resolve it. - fs::create_dir_all(root.join(".story_kit")).unwrap(); - fs::write( - root.join(".story_kit/project.toml"), - r#" -default_qa = "agent" - -[[agent]] -name = "coder-1" -role = "Coder" -command = "echo" -args = ["noop"] -prompt = "test" -stage = "coder" - -[[agent]] -name = "qa" -role = "QA" -command = "echo" -args = ["noop"] -prompt = "test" -stage = "qa" -"#, - ) - .unwrap(); - - let pool = AgentPool::new_test(3001); - // Subscribe to the watcher channel BEFORE the pipeline advance. - let mut rx = pool.watcher_tx.subscribe(); - - // Call pipeline advance directly. This will: - // 1. Move the story to 3_qa/ - // 2. Start the QA agent (which calls notify_agent_state_changed) - // Note: the actual agent process will fail (no real worktree), but the - // agent insertion and notification happen before the background spawn. - pool.run_pipeline_advance( - "173_story_test", - "coder-1", - CompletionReport { - summary: "done".to_string(), - gates_passed: true, - gate_output: String::new(), - }, - Some(root.to_path_buf()), - None, - false, - ) - .await; - - // The pipeline advance should have sent AgentStateChanged events via - // the pool's watcher_tx (not a dummy channel). Collect all events. 
- let mut got_agent_state_changed = false; - while let Ok(evt) = rx.try_recv() { - if matches!(evt, WatcherEvent::AgentStateChanged) { - got_agent_state_changed = true; - break; - } - } - - assert!( - got_agent_state_changed, - "pipeline advance should send AgentStateChanged through the real watcher_tx \ - (bug 173: lozenges must update when agents are assigned during pipeline advance)" - ); - } - - // ── auto-assign helper tests ─────────────────────────────────── - - #[test] - fn scan_stage_items_returns_empty_for_missing_dir() { - let tmp = tempfile::tempdir().unwrap(); - let items = scan_stage_items(tmp.path(), "2_current"); - assert!(items.is_empty()); - } - - #[test] - fn scan_stage_items_returns_sorted_story_ids() { - use std::fs; - let tmp = tempfile::tempdir().unwrap(); - let stage_dir = tmp.path().join(".story_kit").join("work").join("2_current"); - fs::create_dir_all(&stage_dir).unwrap(); - fs::write(stage_dir.join("42_story_foo.md"), "---\nname: foo\n---").unwrap(); - fs::write(stage_dir.join("10_story_bar.md"), "---\nname: bar\n---").unwrap(); - fs::write(stage_dir.join("5_story_baz.md"), "---\nname: baz\n---").unwrap(); - // non-md file should be ignored - fs::write(stage_dir.join("README.txt"), "ignore me").unwrap(); - - let items = scan_stage_items(tmp.path(), "2_current"); - assert_eq!(items, vec!["10_story_bar", "42_story_foo", "5_story_baz"]); - } - - #[test] - fn is_story_assigned_returns_true_for_running_coder() { - let config = ProjectConfig::default(); - let pool = AgentPool::new_test(3001); - pool.inject_test_agent("42_story_foo", "coder-1", AgentStatus::Running); - - let agents = pool.agents.lock().unwrap(); - assert!(is_story_assigned_for_stage( - &config, - &agents, - "42_story_foo", - &PipelineStage::Coder - )); - // Same story but wrong stage — should be false - assert!(!is_story_assigned_for_stage( - &config, - &agents, - "42_story_foo", - &PipelineStage::Qa - )); - // Different story — should be false - 
assert!(!is_story_assigned_for_stage( - &config, - &agents, - "99_story_other", - &PipelineStage::Coder - )); - } - - #[test] - fn is_story_assigned_returns_false_for_completed_agent() { - let config = ProjectConfig::default(); - let pool = AgentPool::new_test(3001); - pool.inject_test_agent("42_story_foo", "coder-1", AgentStatus::Completed); - - let agents = pool.agents.lock().unwrap(); - // Completed agents don't count as assigned - assert!(!is_story_assigned_for_stage( - &config, - &agents, - "42_story_foo", - &PipelineStage::Coder - )); - } - - #[test] - fn is_story_assigned_uses_config_stage_field_for_nonstandard_names() { - let config = ProjectConfig::parse( - r#" -[[agent]] -name = "qa-2" -stage = "qa" -"#, - ) - .unwrap(); - - let pool = AgentPool::new_test(3001); - pool.inject_test_agent("42_story_foo", "qa-2", AgentStatus::Running); - - let agents = pool.agents.lock().unwrap(); - // qa-2 with stage=qa should be recognised as a QA agent - assert!( - is_story_assigned_for_stage(&config, &agents, "42_story_foo", &PipelineStage::Qa), - "qa-2 should be detected as assigned to QA stage" - ); - // Should NOT appear as a coder - assert!( - !is_story_assigned_for_stage(&config, &agents, "42_story_foo", &PipelineStage::Coder), - "qa-2 should not be detected as a coder" - ); - } - - #[test] - fn find_free_agent_returns_none_when_all_busy() { - let config = ProjectConfig::parse( - r#" -[[agent]] -name = "coder-1" -[[agent]] -name = "coder-2" -"#, - ) - .unwrap(); - - let pool = AgentPool::new_test(3001); - pool.inject_test_agent("s1", "coder-1", AgentStatus::Running); - pool.inject_test_agent("s2", "coder-2", AgentStatus::Running); - - let agents = pool.agents.lock().unwrap(); - let free = find_free_agent_for_stage(&config, &agents, &PipelineStage::Coder); - assert!(free.is_none(), "no free coders should be available"); - } - - #[test] - fn find_free_agent_returns_first_free_coder() { - let config = ProjectConfig::parse( - r#" -[[agent]] -name = "coder-1" -[[agent]] 
-name = "coder-2" -[[agent]] -name = "coder-3" -"#, - ) - .unwrap(); - - let pool = AgentPool::new_test(3001); - // coder-1 is busy, coder-2 is free - pool.inject_test_agent("s1", "coder-1", AgentStatus::Running); - - let agents = pool.agents.lock().unwrap(); - let free = find_free_agent_for_stage(&config, &agents, &PipelineStage::Coder); - assert_eq!( - free, - Some("coder-2"), - "coder-2 should be the first free coder" - ); - } - - #[test] - fn find_free_agent_ignores_completed_agents() { - let config = ProjectConfig::parse( - r#" -[[agent]] -name = "coder-1" -"#, - ) - .unwrap(); - - let pool = AgentPool::new_test(3001); - // coder-1 completed its previous story — it's free for a new one - pool.inject_test_agent("s1", "coder-1", AgentStatus::Completed); - - let agents = pool.agents.lock().unwrap(); - let free = find_free_agent_for_stage(&config, &agents, &PipelineStage::Coder); - assert_eq!(free, Some("coder-1"), "completed coder-1 should be free"); - } - - #[test] - fn find_free_agent_returns_none_for_wrong_stage() { - let config = ProjectConfig::parse( - r#" -[[agent]] -name = "qa" -"#, - ) - .unwrap(); - - let agents: HashMap = HashMap::new(); - // Looking for a Coder but only QA is configured - let free = find_free_agent_for_stage(&config, &agents, &PipelineStage::Coder); - assert!(free.is_none()); - // Looking for QA should find it - let free_qa = find_free_agent_for_stage(&config, &agents, &PipelineStage::Qa); - assert_eq!(free_qa, Some("qa")); - } - - #[test] - fn find_free_agent_uses_config_stage_field_not_name() { - // Agents named "qa-2" and "coder-opus" don't match the legacy name heuristic - // but should be picked up via their explicit stage field. 
- let config = ProjectConfig::parse( - r#" -[[agent]] -name = "qa-2" -stage = "qa" - -[[agent]] -name = "coder-opus" -stage = "coder" -"#, - ) - .unwrap(); - - let agents: HashMap = HashMap::new(); - - // qa-2 should be found for PipelineStage::Qa via config stage field - let free_qa = find_free_agent_for_stage(&config, &agents, &PipelineStage::Qa); - assert_eq!(free_qa, Some("qa-2"), "qa-2 with stage=qa should be found"); - - // coder-opus should be found for PipelineStage::Coder via config stage field - let free_coder = find_free_agent_for_stage(&config, &agents, &PipelineStage::Coder); - assert_eq!( - free_coder, - Some("coder-opus"), - "coder-opus with stage=coder should be found" - ); - - // Neither should match the other stage - let free_merge = find_free_agent_for_stage(&config, &agents, &PipelineStage::Mergemaster); - assert!(free_merge.is_none()); - } - - // ── find_active_story_stage tests ───────────────────────────────────────── - - #[test] - fn find_active_story_stage_detects_current() { - use std::fs; - let tmp = tempfile::tempdir().unwrap(); - let root = tmp.path(); - let current = root.join(".story_kit/work/2_current"); - fs::create_dir_all(¤t).unwrap(); - fs::write(current.join("10_story_test.md"), "test").unwrap(); - - assert_eq!( - find_active_story_stage(root, "10_story_test"), - Some("2_current") - ); - } - - #[test] - fn find_active_story_stage_detects_qa() { - use std::fs; - let tmp = tempfile::tempdir().unwrap(); - let root = tmp.path(); - let qa = root.join(".story_kit/work/3_qa"); - fs::create_dir_all(&qa).unwrap(); - fs::write(qa.join("11_story_test.md"), "test").unwrap(); - - assert_eq!(find_active_story_stage(root, "11_story_test"), Some("3_qa")); - } - - #[test] - fn find_active_story_stage_detects_merge() { - use std::fs; - let tmp = tempfile::tempdir().unwrap(); - let root = tmp.path(); - let merge = root.join(".story_kit/work/4_merge"); - fs::create_dir_all(&merge).unwrap(); - fs::write(merge.join("12_story_test.md"), 
"test").unwrap(); - - assert_eq!( - find_active_story_stage(root, "12_story_test"), - Some("4_merge") - ); - } - - #[test] - fn find_active_story_stage_returns_none_for_unknown_story() { - let tmp = tempfile::tempdir().unwrap(); - assert_eq!(find_active_story_stage(tmp.path(), "99_nonexistent"), None); - } - - // ── check_orphaned_agents return value tests (bug 161) ────────────────── - - #[tokio::test] - async fn check_orphaned_agents_returns_count_of_orphaned_agents() { - let pool = AgentPool::new_test(3001); - - // Spawn two tasks that finish immediately. - let h1 = tokio::spawn(async {}); - let h2 = tokio::spawn(async {}); - tokio::time::sleep(std::time::Duration::from_millis(20)).await; - assert!(h1.is_finished()); - assert!(h2.is_finished()); - - pool.inject_test_agent_with_handle("story_a", "coder", AgentStatus::Running, h1); - pool.inject_test_agent_with_handle("story_b", "coder", AgentStatus::Running, h2); - - let found = check_orphaned_agents(&pool.agents); - assert_eq!(found, 2, "should detect both orphaned agents"); - } - - #[test] - fn check_orphaned_agents_returns_zero_when_no_orphans() { - let pool = AgentPool::new_test(3001); - // Inject agents in terminal states — not orphaned. 
- pool.inject_test_agent("story_a", "coder", AgentStatus::Completed); - pool.inject_test_agent("story_b", "qa", AgentStatus::Failed); - - let found = check_orphaned_agents(&pool.agents); - assert_eq!( - found, 0, - "no orphans should be detected for terminal agents" - ); - } - - #[tokio::test] - async fn watchdog_detects_orphaned_running_agent() { - let pool = AgentPool::new_test(3001); - - let handle = tokio::spawn(async {}); - tokio::time::sleep(std::time::Duration::from_millis(20)).await; - assert!( - handle.is_finished(), - "task should be finished before injection" - ); - - let tx = pool.inject_test_agent_with_handle( - "orphan_story", - "coder", - AgentStatus::Running, - handle, - ); - let mut rx = tx.subscribe(); - - pool.run_watchdog_once(); - - { - let agents = pool.agents.lock().unwrap(); - let key = composite_key("orphan_story", "coder"); - let agent = agents.get(&key).unwrap(); - assert_eq!( - agent.status, - AgentStatus::Failed, - "watchdog must mark an orphaned Running agent as Failed" - ); - } - - let event = rx.try_recv().expect("watchdog must emit an Error event"); - assert!( - matches!(event, AgentEvent::Error { .. }), - "expected AgentEvent::Error, got: {event:?}" - ); - } - - #[tokio::test] - async fn watchdog_orphan_detection_returns_nonzero_enabling_auto_assign() { - // This test verifies the contract that `check_orphaned_agents` returns - // a non-zero count when orphans exist, which the watchdog uses to - // decide whether to trigger auto-assign (bug 161). - let pool = AgentPool::new_test(3001); - - let handle = tokio::spawn(async {}); - tokio::time::sleep(std::time::Duration::from_millis(20)).await; - - pool.inject_test_agent_with_handle("orphan_story", "coder", AgentStatus::Running, handle); - - // Before watchdog: agent is Running. 
- { - let agents = pool.agents.lock().unwrap(); - let key = composite_key("orphan_story", "coder"); - assert_eq!(agents.get(&key).unwrap().status, AgentStatus::Running); - } - - // Run watchdog pass — should return 1 (orphan found). - let found = check_orphaned_agents(&pool.agents); - assert_eq!( - found, 1, - "watchdog must return 1 for a single orphaned agent" - ); - - // After watchdog: agent is Failed. - { - let agents = pool.agents.lock().unwrap(); - let key = composite_key("orphan_story", "coder"); - assert_eq!( - agents.get(&key).unwrap().status, - AgentStatus::Failed, - "orphaned agent must be marked Failed" - ); - } - } - - // ── remove_agents_for_story tests ──────────────────────────────────────── - - #[test] - fn remove_agents_for_story_removes_all_entries() { - let pool = AgentPool::new_test(3001); - pool.inject_test_agent("story_a", "coder-1", AgentStatus::Completed); - pool.inject_test_agent("story_a", "qa", AgentStatus::Failed); - pool.inject_test_agent("story_b", "coder-1", AgentStatus::Running); - - let removed = pool.remove_agents_for_story("story_a"); - assert_eq!(removed, 2, "should remove both agents for story_a"); - - let agents = pool.list_agents().unwrap(); - assert_eq!(agents.len(), 1, "only story_b agent should remain"); - assert_eq!(agents[0].story_id, "story_b"); - } - - #[test] - fn remove_agents_for_story_returns_zero_when_no_match() { - let pool = AgentPool::new_test(3001); - pool.inject_test_agent("story_a", "coder-1", AgentStatus::Running); - - let removed = pool.remove_agents_for_story("nonexistent"); - assert_eq!(removed, 0); - - let agents = pool.list_agents().unwrap(); - assert_eq!(agents.len(), 1, "existing agents should not be affected"); - } - - // ── archive + cleanup integration test ─────────────────────────────────── - - #[tokio::test] - async fn archiving_story_removes_agent_entries_from_pool() { - use std::fs; - - let tmp = tempfile::tempdir().unwrap(); - let root = tmp.path(); - - // Set up story in 2_current/ - let 
current = root.join(".story_kit/work/2_current"); - fs::create_dir_all(¤t).unwrap(); - fs::write(current.join("60_story_cleanup.md"), "test").unwrap(); - - let pool = AgentPool::new_test(3001); - pool.inject_test_agent("60_story_cleanup", "coder-1", AgentStatus::Completed); - pool.inject_test_agent("60_story_cleanup", "qa", AgentStatus::Completed); - pool.inject_test_agent("61_story_other", "coder-1", AgentStatus::Running); - - // Verify all 3 agents exist. - assert_eq!(pool.list_agents().unwrap().len(), 3); - - // Archive the story. - move_story_to_archived(root, "60_story_cleanup").unwrap(); - pool.remove_agents_for_story("60_story_cleanup"); - - // Agent entries for the archived story should be gone. - let remaining = pool.list_agents().unwrap(); - assert_eq!( - remaining.len(), - 1, - "only the other story's agent should remain" - ); - assert_eq!(remaining[0].story_id, "61_story_other"); - - // Story file should be in 5_done/ - assert!( - root.join(".story_kit/work/5_done/60_story_cleanup.md") - .exists() - ); - } - - // ── kill_all_children tests ──────────────────────────────────── - - /// Returns true if a process with the given PID is currently running. - fn process_is_running(pid: u32) -> bool { - std::process::Command::new("ps") - .arg("-p") - .arg(pid.to_string()) - .stdout(std::process::Stdio::null()) - .stderr(std::process::Stdio::null()) - .status() - .map(|s| s.success()) - .unwrap_or(false) - } - - #[test] - fn kill_all_children_is_safe_on_empty_pool() { - let pool = AgentPool::new_test(3001); - // Should not panic or deadlock on an empty registry. - pool.kill_all_children(); - assert_eq!(pool.child_killer_count(), 0); - } - - #[test] - fn kill_all_children_kills_real_process() { - // GIVEN: a real PTY child process (sleep 100) with its killer registered. 
- let pool = AgentPool::new_test(3001); - - let pty_system = native_pty_system(); - let pair = pty_system - .openpty(PtySize { - rows: 24, - cols: 80, - pixel_width: 0, - pixel_height: 0, - }) - .expect("failed to open pty"); - - let mut cmd = CommandBuilder::new("sleep"); - cmd.arg("100"); - let mut child = pair - .slave - .spawn_command(cmd) - .expect("failed to spawn sleep"); - let pid = child.process_id().expect("no pid"); - - pool.inject_child_killer("story:agent", child.clone_killer()); - - // Verify the process is alive before we kill it. - assert!( - process_is_running(pid), - "process {pid} should be running before kill_all_children" - ); - - // WHEN: kill_all_children() is called. - pool.kill_all_children(); - - // Collect the exit status (prevents zombie; also ensures signal was sent). - let _ = child.wait(); - - // THEN: the process should be dead. - assert!( - !process_is_running(pid), - "process {pid} should have been killed by kill_all_children" - ); - } - - #[test] - fn kill_all_children_clears_registry() { - // GIVEN: a pool with one registered killer. - let pool = AgentPool::new_test(3001); - - let pty_system = native_pty_system(); - let pair = pty_system - .openpty(PtySize { - rows: 24, - cols: 80, - pixel_width: 0, - pixel_height: 0, - }) - .expect("failed to open pty"); - - let mut cmd = CommandBuilder::new("sleep"); - cmd.arg("1"); - let mut child = pair - .slave - .spawn_command(cmd) - .expect("failed to spawn sleep"); - - pool.inject_child_killer("story:agent", child.clone_killer()); - assert_eq!(pool.child_killer_count(), 1); - - // WHEN: kill_all_children() is called. - pool.kill_all_children(); - let _ = child.wait(); - - // THEN: the registry is empty. 
- assert_eq!( - pool.child_killer_count(), - 0, - "child_killers should be cleared after kill_all_children" - ); - } - - // ── available_agents_for_stage tests (story 190) ────────────────────────── - - #[test] - fn available_agents_for_stage_returns_idle_agents() { - let config = make_config( - r#" -[[agent]] -name = "coder-1" -stage = "coder" - -[[agent]] -name = "coder-2" -stage = "coder" - -[[agent]] -name = "qa" -stage = "qa" -"#, - ); - let pool = AgentPool::new_test(3001); - // coder-1 is busy on story-1 - pool.inject_test_agent("story-1", "coder-1", AgentStatus::Running); - - let available = pool - .available_agents_for_stage(&config, &PipelineStage::Coder) - .unwrap(); - assert_eq!(available, vec!["coder-2"]); - - let available_qa = pool - .available_agents_for_stage(&config, &PipelineStage::Qa) - .unwrap(); - assert_eq!(available_qa, vec!["qa"]); - } - - #[test] - fn available_agents_for_stage_returns_empty_when_all_busy() { - let config = make_config( - r#" -[[agent]] -name = "coder-1" -stage = "coder" -"#, - ); - let pool = AgentPool::new_test(3001); - pool.inject_test_agent("story-1", "coder-1", AgentStatus::Running); - - let available = pool - .available_agents_for_stage(&config, &PipelineStage::Coder) - .unwrap(); - assert!(available.is_empty()); - } - - #[test] - fn available_agents_for_stage_ignores_completed_agents() { - let config = make_config( - r#" -[[agent]] -name = "coder-1" -stage = "coder" -"#, - ); - let pool = AgentPool::new_test(3001); - // Completed agents should not count as busy. 
- pool.inject_test_agent("story-1", "coder-1", AgentStatus::Completed); - - let available = pool - .available_agents_for_stage(&config, &PipelineStage::Coder) - .unwrap(); - assert_eq!(available, vec!["coder-1"]); - } - - #[tokio::test] - async fn start_agent_auto_selects_second_coder_when_first_busy() { - let tmp = tempfile::tempdir().unwrap(); - let sk = tmp.path().join(".story_kit"); - std::fs::create_dir_all(&sk).unwrap(); - std::fs::write( - sk.join("project.toml"), - r#" -[[agent]] -name = "supervisor" -stage = "other" - -[[agent]] -name = "coder-1" -stage = "coder" - -[[agent]] -name = "coder-2" -stage = "coder" -"#, - ) - .unwrap(); - - let pool = AgentPool::new_test(3001); - // coder-1 is busy on another story - pool.inject_test_agent("other-story", "coder-1", AgentStatus::Running); - - // Call start_agent without agent_name — should pick coder-2 - let result = pool - .start_agent(tmp.path(), "42_my_story", None, None) - .await; - // Will fail for infrastructure reasons (no git repo), but should NOT - // fail with "All coder agents are busy" — that would mean it didn't - // try coder-2. 
- match result { - Ok(info) => { - assert_eq!(info.agent_name, "coder-2"); - } - Err(err) => { - assert!( - !err.contains("All coder agents are busy"), - "should have selected coder-2 but got: {err}" - ); - assert!( - !err.contains("No coder agent configured"), - "should not fail on agent selection, got: {err}" - ); - } - } - } - - #[tokio::test] - async fn start_agent_returns_busy_when_all_coders_occupied() { - let tmp = tempfile::tempdir().unwrap(); - let sk = tmp.path().join(".story_kit"); - std::fs::create_dir_all(&sk).unwrap(); - std::fs::write( - sk.join("project.toml"), - r#" -[[agent]] -name = "coder-1" -stage = "coder" - -[[agent]] -name = "coder-2" -stage = "coder" -"#, - ) - .unwrap(); - - let pool = AgentPool::new_test(3001); - pool.inject_test_agent("story-1", "coder-1", AgentStatus::Running); - pool.inject_test_agent("story-2", "coder-2", AgentStatus::Pending); - - let result = pool.start_agent(tmp.path(), "story-3", None, None).await; - assert!(result.is_err()); - let err = result.unwrap_err(); - assert!( - err.contains("All coder agents are busy"), - "expected busy error, got: {err}" - ); - } - - /// Story 203: when all coders are busy the story file must be moved from - /// 1_backlog/ to 2_current/ so that auto_assign_available_work can pick - /// it up once a coder finishes. - #[tokio::test] - async fn start_agent_moves_story_to_current_when_coders_busy() { - let tmp = tempfile::tempdir().unwrap(); - let sk = tmp.path().join(".story_kit"); - let backlog = sk.join("work/1_backlog"); - std::fs::create_dir_all(&backlog).unwrap(); - std::fs::write( - sk.join("project.toml"), - r#" -[[agent]] -name = "coder-1" -stage = "coder" -"#, - ) - .unwrap(); - // Place the story in 1_backlog/. 
- std::fs::write(backlog.join("story-3.md"), "---\nname: Story 3\n---\n").unwrap(); - - let pool = AgentPool::new_test(3001); - pool.inject_test_agent("story-1", "coder-1", AgentStatus::Running); - - let result = pool.start_agent(tmp.path(), "story-3", None, None).await; - - // Should fail because all coders are busy. - assert!(result.is_err()); - let err = result.unwrap_err(); - assert!( - err.contains("All coder agents are busy"), - "expected busy error, got: {err}" - ); - assert!( - err.contains("queued in work/2_current/"), - "expected story-to-current message, got: {err}" - ); - - // Story must have been moved to 2_current/. - let current_path = sk.join("work/2_current/story-3.md"); - assert!( - current_path.exists(), - "story should be in 2_current/ after busy error, but was not" - ); - let backlog_path = backlog.join("story-3.md"); - assert!( - !backlog_path.exists(), - "story should no longer be in 1_backlog/" - ); - } - - /// Story 203: auto_assign_available_work must detect a story in 2_current/ - /// with no active agent and start an agent for it. - #[tokio::test] - async fn auto_assign_picks_up_story_queued_in_current() { - let tmp = tempfile::tempdir().unwrap(); - let sk = tmp.path().join(".story_kit"); - let current = sk.join("work/2_current"); - std::fs::create_dir_all(¤t).unwrap(); - std::fs::write( - sk.join("project.toml"), - "[[agent]]\nname = \"coder-1\"\nstage = \"coder\"\n", - ) - .unwrap(); - // Place the story in 2_current/ (simulating the "queued" state). - std::fs::write(current.join("story-3.md"), "---\nname: Story 3\n---\n").unwrap(); - - let pool = AgentPool::new_test(3001); - // No agents are running — coder-1 is free. - - // auto_assign will try to call start_agent, which will attempt to create - // a worktree (will fail without a git repo) — that is fine. We only need - // to verify the agent is registered as Pending before the background - // task eventually fails. 
- pool.auto_assign_available_work(tmp.path()).await; - - let agents = pool.agents.lock().unwrap(); - let has_pending = agents.values().any(|a| { - a.agent_name == "coder-1" - && matches!(a.status, AgentStatus::Pending | AgentStatus::Running) - }); - assert!( - has_pending, - "auto_assign should have started coder-1 for story-3, but pool is empty" - ); - } - - /// Story 203: if a story is already in 2_current/ or later, start_agent - /// must not fail — the move is a no-op. - #[tokio::test] - async fn start_agent_story_already_in_current_is_noop() { - let tmp = tempfile::tempdir().unwrap(); - let sk = tmp.path().join(".story_kit"); - let current = sk.join("work/2_current"); - std::fs::create_dir_all(¤t).unwrap(); - std::fs::write( - sk.join("project.toml"), - "[[agent]]\nname = \"coder-1\"\nstage = \"coder\"\n", - ) - .unwrap(); - // Place the story directly in 2_current/. - std::fs::write(current.join("story-5.md"), "---\nname: Story 5\n---\n").unwrap(); - - let pool = AgentPool::new_test(3001); - - // start_agent should attempt to assign coder-1 (no infra, so it will - // fail for git reasons), but must NOT fail due to the story already - // being in 2_current/. - let result = pool.start_agent(tmp.path(), "story-5", None, None).await; - match result { - Ok(_) => {} - Err(e) => { - assert!( - !e.contains("Failed to move"), - "should not fail on idempotent move, got: {e}" - ); - } - } - } - - #[tokio::test] - async fn start_agent_explicit_name_unchanged_when_busy() { - let tmp = tempfile::tempdir().unwrap(); - let sk = tmp.path().join(".story_kit"); - std::fs::create_dir_all(&sk).unwrap(); - std::fs::write( - sk.join("project.toml"), - r#" -[[agent]] -name = "coder-1" -stage = "coder" - -[[agent]] -name = "coder-2" -stage = "coder" -"#, - ) - .unwrap(); - - let pool = AgentPool::new_test(3001); - pool.inject_test_agent("story-1", "coder-1", AgentStatus::Running); - - // Explicit request for coder-1 (busy) should fail even though coder-2 is free. 
- let result = pool - .start_agent(tmp.path(), "story-2", Some("coder-1"), None) - .await; - assert!(result.is_err()); - let err = result.unwrap_err(); - assert!( - err.contains("coder-1") && err.contains("already running"), - "expected explicit busy error, got: {err}" - ); - } - - // ── start_agent single-instance concurrency tests ───────────────────────── - - /// Regression test for bug 97: the agent pool must reject a second concurrent - /// instance of the same agent name even if it would run on a different story. - #[tokio::test] - async fn start_agent_rejects_when_same_agent_already_running_on_another_story() { - use std::fs; - - let tmp = tempfile::tempdir().unwrap(); - let root = tmp.path(); - - // Write a minimal project.toml so ProjectConfig::load can find the "qa" agent. - let sk_dir = root.join(".story_kit"); - fs::create_dir_all(&sk_dir).unwrap(); - fs::write(sk_dir.join("project.toml"), "[[agent]]\nname = \"qa\"\n").unwrap(); - - let pool = AgentPool::new_test(3001); - // Simulate qa already running on story-a. - pool.inject_test_agent("story-a", "qa", AgentStatus::Running); - - // Attempt to start qa on story-b — must be rejected. - let result = pool.start_agent(root, "story-b", Some("qa"), None).await; - - assert!( - result.is_err(), - "start_agent should fail when qa is already running on another story" - ); - let err = result.unwrap_err(); - assert!( - err.contains("already running") || err.contains("becomes available"), - "error message should explain why: got '{err}'" - ); - } - - /// Verify that the concurrency guard does NOT block an agent that is merely - /// Completed (not Running/Pending) — completed agents are free for new work. 
- #[tokio::test] - async fn start_agent_allows_new_story_when_previous_run_is_completed() { - use std::fs; - - let tmp = tempfile::tempdir().unwrap(); - let root = tmp.path(); - - let sk_dir = root.join(".story_kit"); - fs::create_dir_all(&sk_dir).unwrap(); - fs::write(sk_dir.join("project.toml"), "[[agent]]\nname = \"qa\"\n").unwrap(); - - let pool = AgentPool::new_test(3001); - // Previous run completed — should NOT block a new story. - pool.inject_test_agent("story-a", "qa", AgentStatus::Completed); - - // The call will fail eventually (no real worktree / Claude CLI), but it must - // NOT fail at the concurrency check. We detect the difference by inspecting - // the error message: a concurrency rejection says "already running", while a - // later failure (missing story file, missing claude binary, etc.) says something else. - let result = pool.start_agent(root, "story-b", Some("qa"), None).await; - - if let Err(ref e) = result { - assert!( - !e.contains("already running") && !e.contains("becomes available"), - "completed agent must not trigger the concurrency guard: got '{e}'" - ); - } - // result may be Ok (unlikely in test env) or Err for infra reasons — both fine. - } - - // ── bug 118: pending entry cleanup on start_agent failure ──────────────── - - /// Regression test for bug 118: when worktree creation fails (e.g. because - /// there is no git repo), the Pending entry that was inserted into the agent - /// HashMap must not remain Pending — it must transition to Failed. This - /// prevents `find_free_agent_for_stage` / auto-assign from being permanently - /// blocked. - /// - /// With story 157 the worktree creation moved into the background spawn, so - /// `start_agent` returns `Ok(Pending)` immediately. We use `wait_for_agent` - /// to block until the background task resolves. 
- #[tokio::test] - async fn start_agent_cleans_up_pending_entry_on_failure() { - use std::fs; - - let tmp = tempfile::tempdir().unwrap(); - let root = tmp.path(); - - // Minimal project.toml with a coder agent (must match 2_current/ stage). - let sk_dir = root.join(".story_kit"); - fs::create_dir_all(&sk_dir).unwrap(); - fs::write( - sk_dir.join("project.toml"), - "[[agent]]\nname = \"coder-1\"\nstage = \"coder\"\n", - ) - .unwrap(); - - // Create the story in upcoming so `move_story_to_current` succeeds, - // but do NOT init a git repo — `create_worktree` will fail in the spawn. - let upcoming = root.join(".story_kit/work/1_backlog"); - fs::create_dir_all(&upcoming).unwrap(); - fs::write(upcoming.join("50_story_test.md"), "---\nname: Test\n---\n").unwrap(); - - let pool = AgentPool::new_test(3099); - - let result = pool - .start_agent(root, "50_story_test", Some("coder-1"), None) - .await; - - // With the non-blocking flow, start_agent returns Ok(Pending) immediately. - // Worktree creation failure happens asynchronously in the background. - assert!( - result.is_ok(), - "start_agent should return Ok(Pending) immediately: {:?}", - result.err() - ); - assert_eq!( - result.unwrap().status, - AgentStatus::Pending, - "initial status must be Pending" - ); - - // Wait for the background task to reach a terminal state. - // It must fail (no git repo → create_worktree returns an error). - let final_info = pool - .wait_for_agent("50_story_test", "coder-1", 5000) - .await - .expect("wait_for_agent should not time out"); - assert_eq!( - final_info.status, - AgentStatus::Failed, - "agent must transition to Failed after worktree creation error" - ); - - // The pool must retain a Failed entry (not disappear silently). 
- let agents = pool.agents.lock().unwrap(); - let failed_entry = agents - .values() - .find(|a| a.agent_name == "coder-1" && a.status == AgentStatus::Failed); - assert!( - failed_entry.is_some(), - "agent pool must retain a Failed entry so the UI can show the error state" - ); - drop(agents); - - // The AgentEvent::Error must be persisted in the event_log so late - // subscribers / polling clients can see the failure reason. - let events = pool - .drain_events("50_story_test", "coder-1") - .expect("drain_events should succeed"); - let has_error_event = events.iter().any(|e| matches!(e, AgentEvent::Error { .. })); - assert!( - has_error_event, - "event_log must contain AgentEvent::Error after worktree creation fails" - ); - } - - /// Verify that a successful start_agent keeps the Running entry (guard is - /// disarmed). We cannot truly spawn an agent in tests, but we verify that - /// the concurrency check still blocks a second concurrent start — which - /// proves the first entry survived the guard. - #[tokio::test] - async fn start_agent_guard_does_not_remove_running_entry() { - use std::fs; - - let tmp = tempfile::tempdir().unwrap(); - let root = tmp.path(); - - let sk_dir = root.join(".story_kit"); - fs::create_dir_all(&sk_dir).unwrap(); - fs::write(sk_dir.join("project.toml"), "[[agent]]\nname = \"qa\"\n").unwrap(); - - let pool = AgentPool::new_test(3099); - - // Manually inject a Running agent (simulates successful start). - pool.inject_test_agent("story-x", "qa", AgentStatus::Running); - - // Attempting to start the same agent on a different story must be - // rejected — the Running entry must still be there. 
- let result = pool.start_agent(root, "story-y", Some("qa"), None).await; - - assert!(result.is_err()); - let err = result.unwrap_err(); - assert!( - err.contains("already running") || err.contains("becomes available"), - "running entry must survive: got '{err}'" - ); - } - - // ── TOCTOU race-condition regression tests (story 132) ─────────────────── - - /// Verify that a Pending entry (not just Running) blocks a concurrent - /// start_agent for the same agent name on a different story. This proves - /// the check-and-insert is atomic: the Pending entry is visible to the - /// second caller because it was inserted while the lock was still held. - #[tokio::test] - async fn toctou_pending_entry_blocks_same_agent_on_different_story() { - use std::fs; - - let tmp = tempfile::tempdir().unwrap(); - let root = tmp.path(); - - let sk_dir = root.join(".story_kit"); - fs::create_dir_all(&sk_dir).unwrap(); - fs::write( - sk_dir.join("project.toml"), - "[[agent]]\nname = \"coder-1\"\n", - ) - .unwrap(); - - let pool = AgentPool::new_test(3099); - - // Simulate what the winning concurrent call would have done: insert a - // Pending entry for coder-1 on story-86. - pool.inject_test_agent("86_story_foo", "coder-1", AgentStatus::Pending); - - // Now attempt to start coder-1 on a *different* story — must be rejected. - let result = pool - .start_agent(root, "130_story_bar", Some("coder-1"), None) - .await; - - assert!(result.is_err(), "second start_agent must be rejected"); - let err = result.unwrap_err(); - assert!( - err.contains("already running") || err.contains("becomes available"), - "expected concurrency-rejection message, got: '{err}'" - ); - } - - /// Concurrent start_agent calls for the same agent name on different stories - /// must result in exactly one rejection due to the concurrency check (not - /// due to an unrelated failure such as missing git repo). 
- #[tokio::test(flavor = "multi_thread", worker_threads = 2)] - async fn toctou_concurrent_start_agent_same_agent_exactly_one_concurrency_rejection() { - use std::fs; - use std::sync::Arc; - - let tmp = tempfile::tempdir().unwrap(); - let root = tmp.path().to_path_buf(); - - let sk_dir = root.join(".story_kit"); - fs::create_dir_all(sk_dir.join("work/1_backlog")).unwrap(); - fs::write( - root.join(".story_kit/project.toml"), - "[[agent]]\nname = \"coder-1\"\n", - ) - .unwrap(); - // Both stories must exist in upcoming so move_story_to_current can run - // (only the winner reaches that point, but we set both up defensively). - fs::write( - root.join(".story_kit/work/1_backlog/86_story_foo.md"), - "---\nname: Foo\n---\n", - ) - .unwrap(); - fs::write( - root.join(".story_kit/work/1_backlog/130_story_bar.md"), - "---\nname: Bar\n---\n", - ) - .unwrap(); - - let pool = Arc::new(AgentPool::new_test(3099)); - - let pool1 = pool.clone(); - let root1 = root.clone(); - let t1 = tokio::spawn(async move { - pool1 - .start_agent(&root1, "86_story_foo", Some("coder-1"), None) - .await - }); - - let pool2 = pool.clone(); - let root2 = root.clone(); - let t2 = tokio::spawn(async move { - pool2 - .start_agent(&root2, "130_story_bar", Some("coder-1"), None) - .await - }); - - let (r1, r2) = tokio::join!(t1, t2); - let r1 = r1.unwrap(); - let r2 = r2.unwrap(); - - // The concurrency-rejection message always contains "already running" / - // "becomes available". Any other error (e.g., missing git repo) means - // that call *won* the atomic check-and-insert. 
- let concurrency_rejections = [&r1, &r2] - .iter() - .filter(|r| { - r.as_ref().is_err_and(|e| { - e.contains("already running") || e.contains("becomes available") - }) - }) - .count(); - - assert_eq!( - concurrency_rejections, 1, - "exactly one call must be rejected by the concurrency check; \ - got r1={r1:?} r2={r2:?}" - ); - } - - // ── story-230: prevent duplicate stage agents on same story ─────────────── - - /// start_agent must reject a second coder on a story that already has a - /// Running coder, even if they are *different* agent names. - #[tokio::test] - async fn start_agent_rejects_second_coder_stage_on_same_story() { - use std::fs; - - let tmp = tempfile::tempdir().unwrap(); - let root = tmp.path(); - - let sk_dir = root.join(".story_kit"); - fs::create_dir_all(&sk_dir).unwrap(); - fs::write( - sk_dir.join("project.toml"), - "[[agent]]\nname = \"coder-1\"\n\n[[agent]]\nname = \"coder-2\"\n", - ) - .unwrap(); - - let pool = AgentPool::new_test(3099); - // coder-1 is already running on the story. - pool.inject_test_agent("42_story_foo", "coder-1", AgentStatus::Running); - - // Attempt to start coder-2 on the *same* story — must be rejected. - let result = pool - .start_agent(root, "42_story_foo", Some("coder-2"), None) - .await; - - assert!( - result.is_err(), - "second coder on same story must be rejected" - ); - let err = result.unwrap_err(); - assert!( - err.contains("same pipeline stage"), - "error must mention same pipeline stage, got: '{err}'" - ); - assert!( - err.contains("coder-1") && err.contains("coder-2"), - "error must name both agents, got: '{err}'" - ); - } - - /// The stage-conflict check must also cover QA: a second QA agent on the - /// same story must be rejected. 
- #[tokio::test] - async fn start_agent_rejects_second_qa_stage_on_same_story() { - use std::fs; - - let tmp = tempfile::tempdir().unwrap(); - let root = tmp.path(); - - let sk_dir = root.join(".story_kit"); - fs::create_dir_all(&sk_dir).unwrap(); - // Two qa agents using the explicit stage field so name-based detection - // doesn't interfere. - fs::write( - sk_dir.join("project.toml"), - "[[agent]]\nname = \"qa-1\"\nstage = \"qa\"\n\n\ - [[agent]]\nname = \"qa-2\"\nstage = \"qa\"\n", - ) - .unwrap(); - - let pool = AgentPool::new_test(3099); - pool.inject_test_agent("55_story_bar", "qa-1", AgentStatus::Running); - - let result = pool - .start_agent(root, "55_story_bar", Some("qa-2"), None) - .await; - - assert!(result.is_err(), "second qa on same story must be rejected"); - let err = result.unwrap_err(); - assert!( - err.contains("same pipeline stage"), - "error must mention same pipeline stage, got: '{err}'" - ); - } - - /// Regression test (story 230): concurrent start_agent calls with two - /// different coder names on the same story — exactly one must succeed - /// (or fail for infrastructure reasons), and exactly one must be rejected - /// with a stage-conflict error. - /// - /// The story is pre-placed in `2_current/` so that both concurrent - /// `move_story_to_current` calls are no-ops, guaranteeing both reach the - /// lock where the stage-conflict check fires. - #[tokio::test(flavor = "multi_thread", worker_threads = 2)] - async fn start_agent_concurrent_two_coders_same_story_exactly_one_stage_rejection() { - use std::fs; - use std::sync::Arc; - - let tmp = tempfile::tempdir().unwrap(); - let root = tmp.path().to_path_buf(); - - let sk_dir = root.join(".story_kit"); - // Place story directly in 2_current/ so move_story_to_current is a - // no-op for both concurrent callers, letting both reach the lock. 
- fs::create_dir_all(sk_dir.join("work/2_current")).unwrap(); - fs::write( - root.join(".story_kit/project.toml"), - "[[agent]]\nname = \"coder-1\"\n\n[[agent]]\nname = \"coder-2\"\n", - ) - .unwrap(); - fs::write( - root.join(".story_kit/work/2_current/42_story_foo.md"), - "---\nname: Foo\n---\n", - ) - .unwrap(); - - let pool = Arc::new(AgentPool::new_test(3099)); - - let pool1 = pool.clone(); - let root1 = root.clone(); - let t1 = tokio::spawn(async move { - pool1 - .start_agent(&root1, "42_story_foo", Some("coder-1"), None) - .await - }); - - let pool2 = pool.clone(); - let root2 = root.clone(); - let t2 = tokio::spawn(async move { - pool2 - .start_agent(&root2, "42_story_foo", Some("coder-2"), None) - .await - }); - - let (r1, r2) = tokio::join!(t1, t2); - let r1 = r1.unwrap(); - let r2 = r2.unwrap(); - - // Exactly one call must be rejected with a stage-conflict error. - let stage_rejections = [&r1, &r2] - .iter() - .filter(|r| r.as_ref().is_err_and(|e| e.contains("same pipeline stage"))) - .count(); - - assert_eq!( - stage_rejections, 1, - "exactly one call must be rejected by the stage-conflict check; \ - got r1={r1:?} r2={r2:?}" - ); - } - - /// Regression test (story 230): two coders on *different* stories must - /// not trigger the stage-conflict guard — the guard is per-story. - #[tokio::test] - async fn start_agent_two_coders_different_stories_not_blocked_by_stage_check() { - use std::fs; - - let tmp = tempfile::tempdir().unwrap(); - let root = tmp.path(); - - let sk_dir = root.join(".story_kit"); - fs::create_dir_all(sk_dir.join("work/1_backlog")).unwrap(); - fs::write( - root.join(".story_kit/project.toml"), - "[[agent]]\nname = \"coder-1\"\n\n[[agent]]\nname = \"coder-2\"\n", - ) - .unwrap(); - fs::write( - root.join(".story_kit/work/1_backlog/99_story_baz.md"), - "---\nname: Baz\n---\n", - ) - .unwrap(); - - let pool = AgentPool::new_test(3099); - // coder-1 is running on a *different* story. 
- pool.inject_test_agent("42_story_foo", "coder-1", AgentStatus::Running); - - // Starting coder-2 on story-99 must NOT be rejected by the stage - // guard (it may fail for infrastructure reasons like missing git repo, - // but not because of the stage-conflict check). - let result = pool - .start_agent(root, "99_story_baz", Some("coder-2"), None) - .await; - - if let Err(ref e) = result { - assert!( - !e.contains("same pipeline stage"), - "stage-conflict guard must not fire for agents on different stories; \ - got: '{e}'" - ); - } - // result may be Ok (unlikely in test env) or Err for infra reasons — both fine. - } - - /// Two concurrent auto_assign_available_work calls must not assign the same - /// agent to two stories simultaneously. After both complete, at most one - /// Pending/Running entry must exist per agent name. - #[tokio::test(flavor = "multi_thread", worker_threads = 2)] - async fn toctou_concurrent_auto_assign_no_duplicate_agent_assignments() { - use std::fs; - use std::sync::Arc; - - let tmp = tempfile::tempdir().unwrap(); - let root = tmp.path().to_path_buf(); - - let sk_dir = root.join(".story_kit"); - // Two stories waiting in 2_current, one coder agent. - fs::create_dir_all(sk_dir.join("work/2_current")).unwrap(); - fs::write( - sk_dir.join("project.toml"), - "[[agent]]\nname = \"coder-1\"\n", - ) - .unwrap(); - fs::write( - sk_dir.join("work/2_current/86_story_foo.md"), - "---\nname: Foo\n---\n", - ) - .unwrap(); - fs::write( - sk_dir.join("work/2_current/130_story_bar.md"), - "---\nname: Bar\n---\n", - ) - .unwrap(); - - let pool = Arc::new(AgentPool::new_test(3099)); - - // Run two concurrent auto_assign calls. 
- let pool1 = pool.clone(); - let root1 = root.clone(); - let t1 = tokio::spawn(async move { pool1.auto_assign_available_work(&root1).await }); - - let pool2 = pool.clone(); - let root2 = root.clone(); - let t2 = tokio::spawn(async move { pool2.auto_assign_available_work(&root2).await }); - - let _ = tokio::join!(t1, t2); - - // At most one Pending/Running entry should exist for coder-1. - let agents = pool.agents.lock().unwrap(); - let active_coder_count = agents - .values() - .filter(|a| { - a.agent_name == "coder-1" - && matches!(a.status, AgentStatus::Pending | AgentStatus::Running) - }) - .count(); - - assert!( - active_coder_count <= 1, - "coder-1 must not be assigned to more than one story simultaneously; \ - found {active_coder_count} active entries" - ); - } - - // ── bug 312: stage-pipeline mismatch guard in start_agent ────────────── - - /// Bug 312: start_agent must reject a mergemaster on a story in 2_current/. - #[tokio::test] - async fn start_agent_rejects_mergemaster_on_coding_stage_story() { - use std::fs; - - let tmp = tempfile::tempdir().unwrap(); - let root = tmp.path(); - - let sk_dir = root.join(".story_kit"); - fs::create_dir_all(sk_dir.join("work/2_current")).unwrap(); - fs::write( - sk_dir.join("project.toml"), - "[[agent]]\nname = \"coder-1\"\nstage = \"coder\"\n\n\ - [[agent]]\nname = \"mergemaster\"\nstage = \"mergemaster\"\n", - ) - .unwrap(); - fs::write( - sk_dir.join("work/2_current/310_story_foo.md"), - "---\nname: Foo\n---\n", - ) - .unwrap(); - - let pool = AgentPool::new_test(3099); - let result = pool - .start_agent(root, "310_story_foo", Some("mergemaster"), None) - .await; - - assert!( - result.is_err(), - "mergemaster must not be assigned to a story in 2_current/" - ); - let err = result.unwrap_err(); - assert!( - err.contains("stage") && err.contains("2_current"), - "error must mention stage mismatch, got: '{err}'" - ); - } - - /// Bug 312: start_agent must reject a coder on a story in 3_qa/. 
- #[tokio::test] - async fn start_agent_rejects_coder_on_qa_stage_story() { - use std::fs; - - let tmp = tempfile::tempdir().unwrap(); - let root = tmp.path(); - - let sk_dir = root.join(".story_kit"); - fs::create_dir_all(sk_dir.join("work/3_qa")).unwrap(); - fs::write( - sk_dir.join("project.toml"), - "[[agent]]\nname = \"coder-1\"\nstage = \"coder\"\n\n\ - [[agent]]\nname = \"qa\"\nstage = \"qa\"\n", - ) - .unwrap(); - fs::write( - sk_dir.join("work/3_qa/42_story_bar.md"), - "---\nname: Bar\n---\n", - ) - .unwrap(); - - let pool = AgentPool::new_test(3099); - let result = pool - .start_agent(root, "42_story_bar", Some("coder-1"), None) - .await; - - assert!( - result.is_err(), - "coder must not be assigned to a story in 3_qa/" - ); - let err = result.unwrap_err(); - assert!( - err.contains("stage") && err.contains("3_qa"), - "error must mention stage mismatch, got: '{err}'" - ); - } - - /// Bug 312: start_agent must reject a QA agent on a story in 4_merge/. - #[tokio::test] - async fn start_agent_rejects_qa_on_merge_stage_story() { - use std::fs; - - let tmp = tempfile::tempdir().unwrap(); - let root = tmp.path(); - - let sk_dir = root.join(".story_kit"); - fs::create_dir_all(sk_dir.join("work/4_merge")).unwrap(); - fs::write( - sk_dir.join("project.toml"), - "[[agent]]\nname = \"qa\"\nstage = \"qa\"\n\n\ - [[agent]]\nname = \"mergemaster\"\nstage = \"mergemaster\"\n", - ) - .unwrap(); - fs::write( - sk_dir.join("work/4_merge/55_story_baz.md"), - "---\nname: Baz\n---\n", - ) - .unwrap(); - - let pool = AgentPool::new_test(3099); - let result = pool - .start_agent(root, "55_story_baz", Some("qa"), None) - .await; - - assert!( - result.is_err(), - "qa must not be assigned to a story in 4_merge/" - ); - let err = result.unwrap_err(); - assert!( - err.contains("stage") && err.contains("4_merge"), - "error must mention stage mismatch, got: '{err}'" - ); - } - - /// Bug 312: supervisor (stage=other) should be allowed on any pipeline stage. 
- #[tokio::test] - async fn start_agent_allows_supervisor_on_any_stage() { - use std::fs; - - let tmp = tempfile::tempdir().unwrap(); - let root = tmp.path(); - - let sk_dir = root.join(".story_kit"); - fs::create_dir_all(sk_dir.join("work/2_current")).unwrap(); - fs::write( - sk_dir.join("project.toml"), - "[[agent]]\nname = \"supervisor\"\nstage = \"other\"\n", - ) - .unwrap(); - fs::write( - sk_dir.join("work/2_current/77_story_sup.md"), - "---\nname: Sup\n---\n", - ) - .unwrap(); - - let pool = AgentPool::new_test(3099); - // start_agent will fail for git/worktree reasons, but NOT for stage - // mismatch. We just need to verify it doesn't fail with a stage error. - let result = pool - .start_agent(root, "77_story_sup", Some("supervisor"), None) - .await; - - match result { - Ok(_) => {} // Fine — no stage error. - Err(e) => { - assert!( - !e.contains("stage:") || !e.contains("cannot be assigned"), - "supervisor should not be rejected for stage mismatch, got: '{e}'" - ); - } - } - } - - /// Bug 312: correct stage agent should still be allowed. - #[tokio::test] - async fn start_agent_allows_correct_stage_agent() { - use std::fs; - - let tmp = tempfile::tempdir().unwrap(); - let root = tmp.path(); - - let sk_dir = root.join(".story_kit"); - fs::create_dir_all(sk_dir.join("work/4_merge")).unwrap(); - fs::write( - sk_dir.join("project.toml"), - "[[agent]]\nname = \"mergemaster\"\nstage = \"mergemaster\"\n", - ) - .unwrap(); - fs::write( - sk_dir.join("work/4_merge/88_story_ok.md"), - "---\nname: OK\n---\n", - ) - .unwrap(); - - let pool = AgentPool::new_test(3099); - let result = pool - .start_agent(root, "88_story_ok", Some("mergemaster"), None) - .await; - - match result { - Ok(_) => {} // Fine — correct stage. 
- Err(e) => { - assert!( - !e.contains("cannot be assigned"), - "mergemaster on 4_merge/ story should not fail stage check, got: '{e}'" - ); - } - } - } - - // ── merge_agent_work tests ──────────────────────────────────────────────── - - /// Helper: start a merge and poll until terminal state. - async fn run_merge_to_completion( - pool: &Arc, - repo: &std::path::Path, - story_id: &str, - ) -> MergeJob { - pool.start_merge_agent_work(repo, story_id).unwrap(); - loop { - tokio::time::sleep(std::time::Duration::from_millis(50)).await; - if let Some(job) = pool.get_merge_status(story_id) - && !matches!(job.status, MergeJobStatus::Running) - { - return job; - } - } - } - - #[tokio::test] - async fn merge_agent_work_returns_error_when_branch_not_found() { - use tempfile::tempdir; - - let tmp = tempdir().unwrap(); - let repo = tmp.path(); - init_git_repo(repo); - - let pool = Arc::new(AgentPool::new_test(3001)); - let job = run_merge_to_completion(&pool, repo, "99_nonexistent").await; - match &job.status { - MergeJobStatus::Completed(report) => { - assert!(!report.success, "should fail when branch missing"); - } - MergeJobStatus::Failed(_) => { - // Also acceptable — the pipeline errored out - } - MergeJobStatus::Running => { - panic!("should not still be running"); - } - } - } - - #[tokio::test] - async fn merge_agent_work_succeeds_on_clean_branch() { - use std::fs; - use tempfile::tempdir; - - let tmp = tempdir().unwrap(); - let repo = tmp.path(); - init_git_repo(repo); - - // Create a feature branch with a commit - Command::new("git") - .args(["checkout", "-b", "feature/story-23_test"]) - .current_dir(repo) - .output() - .unwrap(); - fs::write(repo.join("feature.txt"), "feature content").unwrap(); - Command::new("git") - .args(["add", "."]) - .current_dir(repo) - .output() - .unwrap(); - Command::new("git") - .args(["commit", "-m", "add feature"]) - .current_dir(repo) - .output() - .unwrap(); - - // Switch back to master (initial branch) - Command::new("git") - 
.args(["checkout", "master"]) - .current_dir(repo) - .output() - .unwrap(); - - // Create the story file in 4_merge/ so we can test archival - let merge_dir = repo.join(".story_kit/work/4_merge"); - fs::create_dir_all(&merge_dir).unwrap(); - let story_file = merge_dir.join("23_test.md"); - fs::write(&story_file, "---\nname: Test\n---\n").unwrap(); - Command::new("git") - .args(["add", "."]) - .current_dir(repo) - .output() - .unwrap(); - Command::new("git") - .args(["commit", "-m", "add story in merge"]) - .current_dir(repo) - .output() - .unwrap(); - - let pool = Arc::new(AgentPool::new_test(3001)); - let job = run_merge_to_completion(&pool, repo, "23_test").await; - - match &job.status { - MergeJobStatus::Completed(report) => { - assert!(!report.had_conflicts, "should have no conflicts"); - assert!( - report.success - || report.gate_output.contains("Failed to run") - || !report.gates_passed, - "report should be coherent: {report:?}" - ); - if report.story_archived { - let done = repo.join(".story_kit/work/5_done/23_test.md"); - assert!(done.exists(), "done file should exist"); - } - } - MergeJobStatus::Failed(e) => { - // Gate failures are acceptable in test env - assert!( - e.contains("Failed") || e.contains("failed"), - "unexpected failure: {e}" - ); - } - MergeJobStatus::Running => panic!("should not still be running"), - } - } - - // ── quality gate ordering test ──────────────────────────────── - - /// Regression test for bug 142: quality gates must run BEFORE the fast-forward - /// to master so that broken code never lands on master. - /// - /// Setup: a repo with a failing `script/test`, a feature branch with one commit. - /// When `run_squash_merge` is called, the gates must detect failure and abort the - /// fast-forward, leaving master HEAD unchanged. 
- #[cfg(unix)] - #[test] - fn quality_gates_run_before_fast_forward_to_master() { - use std::fs; - use std::os::unix::fs::PermissionsExt; - use tempfile::tempdir; - - let tmp = tempdir().unwrap(); - let repo = tmp.path(); - init_git_repo(repo); - - // Add a failing script/test so quality gates will fail. - let script_dir = repo.join("script"); - fs::create_dir_all(&script_dir).unwrap(); - let script_test = script_dir.join("test"); - fs::write(&script_test, "#!/usr/bin/env bash\nexit 1\n").unwrap(); - let mut perms = fs::metadata(&script_test).unwrap().permissions(); - perms.set_mode(0o755); - fs::set_permissions(&script_test, perms).unwrap(); - Command::new("git") - .args(["add", "."]) - .current_dir(repo) - .output() - .unwrap(); - Command::new("git") - .args(["commit", "-m", "add failing script/test"]) - .current_dir(repo) - .output() - .unwrap(); - - // Create a feature branch with a commit. - Command::new("git") - .args(["checkout", "-b", "feature/story-142_test"]) - .current_dir(repo) - .output() - .unwrap(); - fs::write(repo.join("change.txt"), "feature change").unwrap(); - Command::new("git") - .args(["add", "."]) - .current_dir(repo) - .output() - .unwrap(); - Command::new("git") - .args(["commit", "-m", "feature work"]) - .current_dir(repo) - .output() - .unwrap(); - - // Switch back to master and record its HEAD. - Command::new("git") - .args(["checkout", "master"]) - .current_dir(repo) - .output() - .unwrap(); - let head_before = String::from_utf8( - Command::new("git") - .args(["rev-parse", "HEAD"]) - .current_dir(repo) - .output() - .unwrap() - .stdout, - ) - .unwrap() - .trim() - .to_string(); - - // Run the squash-merge. The failing script/test makes quality gates - // fail → fast-forward must NOT happen. 
- let result = - crate::agents::merge::run_squash_merge(repo, "feature/story-142_test", "142_test") - .unwrap(); - - let head_after = String::from_utf8( - Command::new("git") - .args(["rev-parse", "HEAD"]) - .current_dir(repo) - .output() - .unwrap() - .stdout, - ) - .unwrap() - .trim() - .to_string(); - - // Gates must have failed (script/test exits 1) so master should be untouched. - assert!( - !result.success, - "run_squash_merge must report failure when gates fail" - ); - assert_eq!( - head_before, head_after, - "master HEAD must not advance when quality gates fail (bug 142)" - ); - } - - #[tokio::test] - async fn merge_agent_work_conflict_does_not_break_master() { - use std::fs; - use tempfile::tempdir; - - let tmp = tempdir().unwrap(); - let repo = tmp.path(); - init_git_repo(repo); - - // Create a file on master. - fs::write( - repo.join("code.rs"), - "fn main() {\n println!(\"hello\");\n}\n", - ) - .unwrap(); - Command::new("git") - .args(["add", "."]) - .current_dir(repo) - .output() - .unwrap(); - Command::new("git") - .args(["commit", "-m", "initial code"]) - .current_dir(repo) - .output() - .unwrap(); - - // Feature branch: modify the same line differently. - Command::new("git") - .args(["checkout", "-b", "feature/story-42_story_foo"]) - .current_dir(repo) - .output() - .unwrap(); - fs::write( - repo.join("code.rs"), - "fn main() {\n println!(\"hello\");\n feature_fn();\n}\n", - ) - .unwrap(); - Command::new("git") - .args(["add", "."]) - .current_dir(repo) - .output() - .unwrap(); - Command::new("git") - .args(["commit", "-m", "feature: add fn call"]) - .current_dir(repo) - .output() - .unwrap(); - - // Master: add different line at same location. 
- Command::new("git") - .args(["checkout", "master"]) - .current_dir(repo) - .output() - .unwrap(); - fs::write( - repo.join("code.rs"), - "fn main() {\n println!(\"hello\");\n master_fn();\n}\n", - ) - .unwrap(); - Command::new("git") - .args(["add", "."]) - .current_dir(repo) - .output() - .unwrap(); - Command::new("git") - .args(["commit", "-m", "master: add fn call"]) - .current_dir(repo) - .output() - .unwrap(); - - // Create story file in 4_merge. - let merge_dir = repo.join(".story_kit/work/4_merge"); - fs::create_dir_all(&merge_dir).unwrap(); - fs::write(merge_dir.join("42_story_foo.md"), "---\nname: Test\n---\n").unwrap(); - Command::new("git") - .args(["add", "."]) - .current_dir(repo) - .output() - .unwrap(); - Command::new("git") - .args(["commit", "-m", "add story"]) - .current_dir(repo) - .output() - .unwrap(); - - let pool = Arc::new(AgentPool::new_test(3001)); - let job = run_merge_to_completion(&pool, repo, "42_story_foo").await; - - // Master should NEVER have conflict markers, regardless of merge outcome. - let master_code = fs::read_to_string(repo.join("code.rs")).unwrap(); - assert!( - !master_code.contains("<<<<<<<"), - "master must never contain conflict markers:\n{master_code}" - ); - assert!( - !master_code.contains(">>>>>>>"), - "master must never contain conflict markers:\n{master_code}" - ); - - // The report should accurately reflect what happened. 
- match &job.status { - MergeJobStatus::Completed(report) => { - assert!(report.had_conflicts, "should report conflicts"); - } - MergeJobStatus::Failed(_) => { - // Acceptable — merge aborted due to conflicts - } - MergeJobStatus::Running => panic!("should not still be running"), - } - } - - // ── reconcile_on_startup tests ──────────────────────────────────────────── - - #[tokio::test] - async fn reconcile_on_startup_noop_when_no_worktrees() { - let tmp = tempfile::tempdir().unwrap(); - let pool = AgentPool::new_test(3001); - let (tx, _rx) = broadcast::channel(16); - // Should not panic; no worktrees to reconcile. - pool.reconcile_on_startup(tmp.path(), &tx).await; - } - - #[tokio::test] - async fn reconcile_on_startup_emits_done_event() { - let tmp = tempfile::tempdir().unwrap(); - let pool = AgentPool::new_test(3001); - let (tx, mut rx) = broadcast::channel::(16); - pool.reconcile_on_startup(tmp.path(), &tx).await; - - // Collect all events; the last must be "done". - let mut events: Vec = Vec::new(); - while let Ok(evt) = rx.try_recv() { - events.push(evt); - } - assert!( - events.iter().any(|e| e.status == "done"), - "reconcile_on_startup must emit a 'done' event; got: {:?}", - events.iter().map(|e| &e.status).collect::>() - ); - } - - #[tokio::test] - async fn reconcile_on_startup_skips_story_without_committed_work() { - use std::fs; - let tmp = tempfile::tempdir().unwrap(); - let root = tmp.path(); - - // Set up story in 2_current/. - let current = root.join(".story_kit/work/2_current"); - fs::create_dir_all(¤t).unwrap(); - fs::write(current.join("60_story_test.md"), "test").unwrap(); - - // Create a worktree directory that is a fresh git repo with no commits - // ahead of its own base branch (simulates a worktree where no work was done). 
- let wt_dir = root.join(".story_kit/worktrees/60_story_test"); - fs::create_dir_all(&wt_dir).unwrap(); - init_git_repo(&wt_dir); - - let pool = AgentPool::new_test(3001); - let (tx, _rx) = broadcast::channel(16); - pool.reconcile_on_startup(root, &tx).await; - - // Story should still be in 2_current/ — nothing was reconciled. - assert!( - current.join("60_story_test.md").exists(), - "story should stay in 2_current/ when worktree has no committed work" - ); - } - - #[tokio::test] - async fn reconcile_on_startup_runs_gates_on_worktree_with_committed_work() { - use std::fs; - let tmp = tempfile::tempdir().unwrap(); - let root = tmp.path(); - - // Set up a git repo for the project root. - init_git_repo(root); - - // Set up story in 2_current/ and commit it so the project root is clean. - let current = root.join(".story_kit/work/2_current"); - fs::create_dir_all(¤t).unwrap(); - fs::write(current.join("61_story_test.md"), "test").unwrap(); - Command::new("git") - .args(["add", "."]) - .current_dir(root) - .output() - .unwrap(); - Command::new("git") - .args([ - "-c", - "user.email=test@test.com", - "-c", - "user.name=Test", - "commit", - "-m", - "add story", - ]) - .current_dir(root) - .output() - .unwrap(); - - // Create a real git worktree for the story. - let wt_dir = root.join(".story_kit/worktrees/61_story_test"); - fs::create_dir_all(wt_dir.parent().unwrap()).unwrap(); - Command::new("git") - .args([ - "worktree", - "add", - &wt_dir.to_string_lossy(), - "-b", - "feature/story-61_story_test", - ]) - .current_dir(root) - .output() - .unwrap(); - - // Add a commit to the feature branch (simulates coder completing work). 
- fs::write(wt_dir.join("implementation.txt"), "done").unwrap(); - Command::new("git") - .args(["add", "."]) - .current_dir(&wt_dir) - .output() - .unwrap(); - Command::new("git") - .args([ - "-c", - "user.email=test@test.com", - "-c", - "user.name=Test", - "commit", - "-m", - "implement story", - ]) - .current_dir(&wt_dir) - .output() - .unwrap(); - - assert!( - crate::agents::gates::worktree_has_committed_work(&wt_dir), - "test setup: worktree should have committed work" - ); - - let pool = AgentPool::new_test(3001); - let (tx, _rx) = broadcast::channel(16); - pool.reconcile_on_startup(root, &tx).await; - - // In the test env, cargo clippy will fail (no Cargo.toml) so gates fail - // and the story stays in 2_current/. The important assertion is that - // reconcile ran without panicking and the story is in a consistent state. - let in_current = current.join("61_story_test.md").exists(); - let in_qa = root.join(".story_kit/work/3_qa/61_story_test.md").exists(); - assert!( - in_current || in_qa, - "story should be in 2_current/ or 3_qa/ after reconciliation" - ); - } - - #[test] - fn has_review_hold_returns_true_when_set() { - let tmp = tempfile::tempdir().unwrap(); - let qa_dir = tmp.path().join(".story_kit/work/3_qa"); - std::fs::create_dir_all(&qa_dir).unwrap(); - let spike_path = qa_dir.join("10_spike_research.md"); - std::fs::write( - &spike_path, - "---\nname: Research spike\nreview_hold: true\n---\n# Spike\n", - ) - .unwrap(); - assert!(has_review_hold(tmp.path(), "3_qa", "10_spike_research")); - } - - #[test] - fn has_review_hold_returns_false_when_not_set() { - let tmp = tempfile::tempdir().unwrap(); - let qa_dir = tmp.path().join(".story_kit/work/3_qa"); - std::fs::create_dir_all(&qa_dir).unwrap(); - let spike_path = qa_dir.join("10_spike_research.md"); - std::fs::write(&spike_path, "---\nname: Research spike\n---\n# Spike\n").unwrap(); - assert!(!has_review_hold(tmp.path(), "3_qa", "10_spike_research")); - } - - #[test] - fn 
has_review_hold_returns_false_when_file_missing() { - let tmp = tempfile::tempdir().unwrap(); - assert!(!has_review_hold(tmp.path(), "3_qa", "99_spike_missing")); - } - - /// Story 265: auto_assign_available_work must skip spikes in 3_qa/ that - /// have review_hold: true set in their front matter. - #[tokio::test] - async fn auto_assign_skips_spikes_with_review_hold() { - let tmp = tempfile::tempdir().unwrap(); - let root = tmp.path(); - - // Create project.toml with a QA agent. - let sk = root.join(".story_kit"); - std::fs::create_dir_all(&sk).unwrap(); - std::fs::write( - sk.join("project.toml"), - "[[agents]]\nname = \"qa\"\nrole = \"qa\"\nmodel = \"test\"\nprompt = \"test\"\n", - ) - .unwrap(); - - // Put a spike in 3_qa/ with review_hold: true. - let qa_dir = root.join(".story_kit/work/3_qa"); - std::fs::create_dir_all(&qa_dir).unwrap(); - std::fs::write( - qa_dir.join("20_spike_test.md"), - "---\nname: Test Spike\nreview_hold: true\n---\n# Spike\n", - ) - .unwrap(); - - let (watcher_tx, _) = broadcast::channel::(4); - let pool = AgentPool::new(3001, watcher_tx); - - pool.auto_assign_available_work(root).await; - - // No agent should have been started for the spike. - let agents = pool.agents.lock().unwrap(); - assert!( - agents.is_empty(), - "No agents should be assigned to a spike with review_hold" - ); - } - - // ── Story 279: auto-assign respects agent stage from front matter ────────── - - /// When a story in 3_qa/ has `agent: coder-1` in its front matter but - /// coder-1 is a coder-stage agent, auto-assign must NOT assign coder-1. - /// Instead it should fall back to a free QA-stage agent. 
- #[tokio::test] - async fn auto_assign_ignores_coder_preference_when_story_is_in_qa_stage() { - let tmp = tempfile::tempdir().unwrap(); - let sk = tmp.path().join(".story_kit"); - let qa_dir = sk.join("work/3_qa"); - std::fs::create_dir_all(&qa_dir).unwrap(); - std::fs::write( - sk.join("project.toml"), - "[[agent]]\nname = \"coder-1\"\nstage = \"coder\"\n\n\ - [[agent]]\nname = \"qa-1\"\nstage = \"qa\"\n", - ) - .unwrap(); - // Story in 3_qa/ with a preferred coder-stage agent. - std::fs::write( - qa_dir.join("story-qa1.md"), - "---\nname: QA Story\nagent: coder-1\n---\n", - ) - .unwrap(); - - let pool = AgentPool::new_test(3001); - - pool.auto_assign_available_work(tmp.path()).await; - - let agents = pool.agents.lock().unwrap(); - // coder-1 must NOT have been assigned (wrong stage for 3_qa/). - let coder_assigned = agents.values().any(|a| { - a.agent_name == "coder-1" - && matches!(a.status, AgentStatus::Pending | AgentStatus::Running) - }); - assert!( - !coder_assigned, - "coder-1 should not be assigned to a QA-stage story" - ); - // qa-1 should have been assigned instead. - let qa_assigned = agents.values().any(|a| { - a.agent_name == "qa-1" - && matches!(a.status, AgentStatus::Pending | AgentStatus::Running) - }); - assert!( - qa_assigned, - "qa-1 should be assigned as fallback for the QA-stage story" - ); - } - - /// When a story in 2_current/ has `agent: coder-1` in its front matter and - /// coder-1 is a coder-stage agent, auto-assign must respect the preference - /// and assign coder-1 (not fall back to some other coder). 
- #[tokio::test]
- async fn auto_assign_respects_coder_preference_when_story_is_in_current_stage() {
- let tmp = tempfile::tempdir().unwrap();
- let sk = tmp.path().join(".story_kit");
- let current_dir = sk.join("work/2_current");
- std::fs::create_dir_all(&current_dir).unwrap();
- std::fs::write(
- sk.join("project.toml"),
- "[[agent]]\nname = \"coder-1\"\nstage = \"coder\"\n\n\
- [[agent]]\nname = \"coder-2\"\nstage = \"coder\"\n",
- )
- .unwrap();
- // Story in 2_current/ with a preferred coder-1 agent.
- std::fs::write(
- current_dir.join("story-pref.md"),
- "---\nname: Coder Story\nagent: coder-1\n---\n",
- )
- .unwrap();
-
- let pool = AgentPool::new_test(3001);
-
- pool.auto_assign_available_work(tmp.path()).await;
-
- let agents = pool.agents.lock().unwrap();
- // coder-1 should have been picked (it matches the stage and is preferred).
- let coder1_assigned = agents.values().any(|a| {
- a.agent_name == "coder-1"
- && matches!(a.status, AgentStatus::Pending | AgentStatus::Running)
- });
- assert!(
- coder1_assigned,
- "coder-1 should be assigned when it matches the stage and is preferred"
- );
- // coder-2 must NOT be assigned (not preferred).
- let coder2_assigned = agents.values().any(|a| {
- a.agent_name == "coder-2"
- && matches!(a.status, AgentStatus::Pending | AgentStatus::Running)
- });
- assert!(
- !coder2_assigned,
- "coder-2 should not be assigned when coder-1 is explicitly preferred"
- );
- }
-
- /// When the preferred agent's stage mismatches and no other agent of the
- /// correct stage is available, auto-assign must not start any agent for that
- /// story (no panic, no error).
- #[tokio::test]
- async fn auto_assign_stage_mismatch_with_no_fallback_starts_no_agent() {
- let tmp = tempfile::tempdir().unwrap();
- let sk = tmp.path().join(".story_kit");
- let qa_dir = sk.join("work/3_qa");
- std::fs::create_dir_all(&qa_dir).unwrap();
- // Only a coder agent is configured — no QA agent exists. 
- std::fs::write( - sk.join("project.toml"), - "[[agent]]\nname = \"coder-1\"\nstage = \"coder\"\n", - ) - .unwrap(); - // Story in 3_qa/ requests coder-1 (wrong stage) and no QA agent exists. - std::fs::write( - qa_dir.join("story-noqa.md"), - "---\nname: QA Story No Agent\nagent: coder-1\n---\n", - ) - .unwrap(); - - let pool = AgentPool::new_test(3001); - - // Must not panic. - pool.auto_assign_available_work(tmp.path()).await; - - let agents = pool.agents.lock().unwrap(); - assert!( - agents.is_empty(), - "No agent should be started when no stage-appropriate agent is available" - ); - } - - /// Bug 295: when a coder completes and QA is busy on another story, - /// the newly QA-queued story must be picked up when `run_pipeline_advance` - /// finishes for the busy QA agent's story (because auto_assign is now - /// called unconditionally at the end of pipeline advance). - #[tokio::test] - async fn pipeline_advance_picks_up_waiting_qa_stories_after_completion() { - use std::fs; - let tmp = tempfile::tempdir().unwrap(); - let root = tmp.path(); - - let sk = root.join(".story_kit"); - let qa_dir = sk.join("work/3_qa"); - fs::create_dir_all(&qa_dir).unwrap(); - - // Configure a single QA agent. - fs::write( - sk.join("project.toml"), - r#" -[[agent]] -name = "qa" -stage = "qa" -"#, - ) - .unwrap(); - - // Story 292 is in QA with QA agent running (will "complete" via - // run_pipeline_advance below). Story 293 is in QA with NO agent — - // simulating the "stuck" state from bug 295. - fs::write( - qa_dir.join("292_story_first.md"), - "---\nname: First\nqa: human\n---\n", - ) - .unwrap(); - fs::write( - qa_dir.join("293_story_second.md"), - "---\nname: Second\nqa: human\n---\n", - ) - .unwrap(); - - let pool = AgentPool::new_test(3001); - // QA is currently running on story 292. - pool.inject_test_agent("292_story_first", "qa", AgentStatus::Running); - - // Verify that 293 cannot get a QA agent right now (QA is busy). 
- { - let agents = pool.agents.lock().unwrap(); - assert!( - !is_agent_free(&agents, "qa"), - "qa should be busy on story 292" - ); - } - - // Simulate QA completing on story 292: remove the agent from the pool - // (as run_server_owned_completion does) then run pipeline advance. - { - let mut agents = pool.agents.lock().unwrap(); - agents.remove(&composite_key("292_story_first", "qa")); - } - - // Pipeline advance for QA with gates_passed=true will: - // 1. Run coverage gate (will "pass" trivially in test — no script/test_coverage) - // 2. Set review_hold on 292 (qa: human) - // 3. Call auto_assign_available_work (the fix from bug 295) - // 4. auto_assign should find 293 in 3_qa/ with no agent and start qa on it - pool.run_pipeline_advance( - "292_story_first", - "qa", - CompletionReport { - summary: "QA done".to_string(), - gates_passed: true, - gate_output: String::new(), - }, - Some(root.to_path_buf()), - None, - false, - ) - .await; - - // After pipeline advance, auto_assign should have started QA on story 293. - let agents = pool.agents.lock().unwrap(); - let qa_on_293 = agents.values().any(|a| { - a.agent_name == "qa" - && matches!(a.status, AgentStatus::Pending | AgentStatus::Running) - }); - assert!( - qa_on_293, - "auto_assign should have started qa for story 293 after 292's QA completed, \ - but no qa agent is pending/running. 
Pool: {:?}",
- agents
- .iter()
- .map(|(k, a)| format!("{k}: {} ({})", a.agent_name, a.status))
- .collect::<Vec<_>>()
- );
- }
-
- // ── Helper to construct a test StoryAgent ──────────────────────────
-
- fn make_test_story_agent(agent_name: &str, status: AgentStatus) -> StoryAgent {
- StoryAgent {
- agent_name: agent_name.to_string(),
- status,
- worktree_info: None,
- session_id: None,
- tx: broadcast::channel(1).0,
- task_handle: None,
- event_log: Arc::new(Mutex::new(Vec::new())),
- completion: None,
- project_root: None,
- log_session_id: None,
- merge_failure_reported: false,
- }
- }
-
- // ── find_free_agent_for_stage: default_coder_model filtering ─────────
-
- #[test]
- fn find_free_agent_skips_opus_when_default_coder_model_set() {
- let config = make_config(
- r#"
-default_coder_model = "sonnet"
-
-[[agent]]
-name = "coder-1"
-stage = "coder"
-model = "sonnet"
-
-[[agent]]
-name = "coder-opus"
-stage = "coder"
-model = "opus"
-"#,
- );
-
- let agents = HashMap::new();
- let free = find_free_agent_for_stage(&config, &agents, &PipelineStage::Coder);
- assert_eq!(free, Some("coder-1"));
- }
-
- #[test]
- fn find_free_agent_returns_opus_when_no_default_coder_model() {
- let config = make_config(
- r#"
-[[agent]]
-name = "coder-opus"
-stage = "coder"
-model = "opus"
-"#,
- );
-
- let agents = HashMap::new();
- let free = find_free_agent_for_stage(&config, &agents, &PipelineStage::Coder);
- assert_eq!(free, Some("coder-opus"));
- }
-
- #[test]
- fn find_free_agent_returns_none_when_all_sonnet_coders_busy() {
- let config = make_config(
- r#"
-default_coder_model = "sonnet"
-
-[[agent]]
-name = "coder-1"
-stage = "coder"
-model = "sonnet"
-
-[[agent]]
-name = "coder-opus"
-stage = "coder"
-model = "opus"
-"#,
- );
-
- let mut agents = HashMap::new();
- agents.insert(
- "story1:coder-1".to_string(),
- make_test_story_agent("coder-1", AgentStatus::Running),
- );
-
- let free = find_free_agent_for_stage(&config, &agents, &PipelineStage::Coder);
- assert_eq!(free, None, 
"opus agent should not be auto-assigned"); - } - - // ── find_free_agent_for_stage: max_coders limit ───────────────────── - - #[test] - fn find_free_agent_respects_max_coders() { - let config = make_config( - r#" -max_coders = 1 - -[[agent]] -name = "coder-1" -stage = "coder" -model = "sonnet" - -[[agent]] -name = "coder-2" -stage = "coder" -model = "sonnet" -"#, - ); - - let mut agents = HashMap::new(); - agents.insert( - "story1:coder-1".to_string(), - make_test_story_agent("coder-1", AgentStatus::Running), - ); - - let free = find_free_agent_for_stage(&config, &agents, &PipelineStage::Coder); - assert_eq!(free, None, "max_coders=1 should block second coder"); - } - - #[test] - fn find_free_agent_allows_within_max_coders() { - let config = make_config( - r#" -max_coders = 2 - -[[agent]] -name = "coder-1" -stage = "coder" -model = "sonnet" - -[[agent]] -name = "coder-2" -stage = "coder" -model = "sonnet" -"#, - ); - - let mut agents = HashMap::new(); - agents.insert( - "story1:coder-1".to_string(), - make_test_story_agent("coder-1", AgentStatus::Running), - ); - - let free = find_free_agent_for_stage(&config, &agents, &PipelineStage::Coder); - assert_eq!(free, Some("coder-2")); - } - - #[test] - fn max_coders_does_not_affect_qa_stage() { - let config = make_config( - r#" -max_coders = 1 - -[[agent]] -name = "qa" -stage = "qa" -model = "sonnet" -"#, - ); - - let agents = HashMap::new(); - let free = find_free_agent_for_stage(&config, &agents, &PipelineStage::Qa); - assert_eq!(free, Some("qa")); - } - - // ── count_active_agents_for_stage ──────────────────────────────────── - - #[test] - fn count_active_agents_counts_running_and_pending() { - let config = make_config( - r#" -[[agent]] -name = "coder-1" -stage = "coder" - -[[agent]] -name = "coder-2" -stage = "coder" -"#, - ); - - let mut agents = HashMap::new(); - agents.insert( - "s1:coder-1".to_string(), - make_test_story_agent("coder-1", AgentStatus::Running), - ); - agents.insert( - "s2:coder-2".to_string(), - 
make_test_story_agent("coder-2", AgentStatus::Completed), - ); - - let count = count_active_agents_for_stage(&config, &agents, &PipelineStage::Coder); - assert_eq!(count, 1, "Only Running coder should be counted, not Completed"); - } -} diff --git a/server/src/agents/pool/auto_assign.rs b/server/src/agents/pool/auto_assign.rs new file mode 100644 index 0000000..6e676be --- /dev/null +++ b/server/src/agents/pool/auto_assign.rs @@ -0,0 +1,1813 @@ +//! Auto-assign logic: scanning pipeline stages for unassigned stories and +//! dispatching free agents, startup reconciliation, and the watchdog task. + +use crate::config::ProjectConfig; +use crate::slog; +use crate::slog_error; +use crate::slog_warn; +use crate::worktree; +use std::collections::HashMap; +use std::path::{Path, PathBuf}; +use std::sync::{Arc, Mutex}; +use tokio::sync::broadcast; + +use super::super::{ + AgentEvent, AgentStatus, PipelineStage, ReconciliationEvent, agent_config_stage, pipeline_stage, +}; +use super::{AgentPool, StoryAgent, find_active_story_stage}; + +impl AgentPool { + pub async fn auto_assign_available_work(&self, project_root: &Path) { + let config = match ProjectConfig::load(project_root) { + Ok(c) => c, + Err(e) => { + slog_warn!("[auto-assign] Failed to load project config: {e}"); + return; + } + }; + + // Process each active pipeline stage in order. + let stages: [(&str, PipelineStage); 3] = [ + ("2_current", PipelineStage::Coder), + ("3_qa", PipelineStage::Qa), + ("4_merge", PipelineStage::Mergemaster), + ]; + + for (stage_dir, stage) in &stages { + let items = scan_stage_items(project_root, stage_dir); + if items.is_empty() { + continue; + } + + for story_id in &items { + // Items marked with review_hold (e.g. spikes after QA passes) stay + // in their current stage for human review — don't auto-assign agents. + if has_review_hold(project_root, stage_dir, story_id) { + continue; + } + + // Skip blocked stories (retry limit exceeded). 
+ if is_story_blocked(project_root, stage_dir, story_id) { + continue; + } + + // Skip stories in 4_merge/ that already have a reported merge failure. + // These need human intervention — auto-assigning a new mergemaster + // would just waste tokens on the same broken merge. + if *stage == PipelineStage::Mergemaster + && has_merge_failure(project_root, stage_dir, story_id) + { + continue; + } + + // AC6: Detect empty-diff stories in 4_merge/ before starting a + // mergemaster. If the worktree has no commits on the feature branch, + // write a merge_failure and block the story immediately. + if *stage == PipelineStage::Mergemaster + && let Some(wt_path) = worktree::find_worktree_path(project_root, story_id) + && !super::super::gates::worktree_has_committed_work(&wt_path) + { + slog_warn!( + "[auto-assign] Story '{story_id}' in 4_merge/ has no commits \ + on feature branch. Writing merge_failure and blocking." + ); + let story_path = project_root + .join(".story_kit/work") + .join(stage_dir) + .join(format!("{story_id}.md")); + let _ = crate::io::story_metadata::write_merge_failure( + &story_path, + "Feature branch has no code changes — the coder agent \ + did not produce any commits.", + ); + let _ = crate::io::story_metadata::write_blocked(&story_path); + continue; + } + + // Re-acquire the lock on each iteration to see state changes + // from previous start_agent calls in the same pass. + let preferred_agent = + read_story_front_matter_agent(project_root, stage_dir, story_id); + + // Check max_coders limit for the Coder stage before agent selection. + // If the pool is full, all remaining items in this stage wait. 
+ if *stage == PipelineStage::Coder + && let Some(max) = config.max_coders + { + let agents_lock = match self.agents.lock() { + Ok(a) => a, + Err(e) => { + slog_error!("[auto-assign] Failed to lock agents: {e}"); + break; + } + }; + let active = count_active_agents_for_stage(&config, &agents_lock, stage); + if active >= max { + slog!( + "[auto-assign] Coder pool full ({active}/{max}); remaining items in {stage_dir}/ will wait." + ); + break; + } + } + + // Outcome: (already_assigned, chosen_agent, preferred_busy, stage_mismatch) + // preferred_busy=true means the story has a specific agent requested but it is + // currently occupied — the story should wait rather than fall back. + // stage_mismatch=true means the preferred agent's stage doesn't match the + // pipeline stage, so we fell back to a generic stage agent. + let (already_assigned, free_agent, preferred_busy, stage_mismatch) = { + let agents = match self.agents.lock() { + Ok(a) => a, + Err(e) => { + slog_error!("[auto-assign] Failed to lock agents: {e}"); + break; + } + }; + let assigned = is_story_assigned_for_stage(&config, &agents, story_id, stage); + if assigned { + (true, None, false, false) + } else if let Some(ref pref) = preferred_agent { + // Story has a front-matter agent preference. + // Verify the preferred agent's stage matches the current + // pipeline stage — a coder shouldn't be assigned to QA. + let pref_stage_matches = config + .find_agent(pref) + .map(|cfg| agent_config_stage(cfg) == *stage) + .unwrap_or(false); + if !pref_stage_matches { + // Stage mismatch — fall back to any free agent for this stage. 
+ let free = find_free_agent_for_stage(&config, &agents, stage) + .map(|s| s.to_string()); + (false, free, false, true) + } else if is_agent_free(&agents, pref) { + (false, Some(pref.clone()), false, false) + } else { + (false, None, true, false) + } + } else { + let free = find_free_agent_for_stage(&config, &agents, stage) + .map(|s| s.to_string()); + (false, free, false, false) + } + }; + + if already_assigned { + // Story already has an active agent — skip silently. + continue; + } + + if preferred_busy { + // The story requests a specific agent that is currently busy. + // Do not fall back to a different agent; let this story wait. + slog!( + "[auto-assign] Preferred agent '{}' busy for '{story_id}'; story will wait.", + preferred_agent.as_deref().unwrap_or("?") + ); + continue; + } + + if stage_mismatch { + slog!( + "[auto-assign] Preferred agent '{}' stage mismatch for '{story_id}' in {stage_dir}/; falling back to stage-appropriate agent.", + preferred_agent.as_deref().unwrap_or("?") + ); + } + + match free_agent { + Some(agent_name) => { + slog!( + "[auto-assign] Assigning '{agent_name}' to '{story_id}' in {stage_dir}/" + ); + if let Err(e) = self + .start_agent(project_root, story_id, Some(&agent_name), None) + .await + { + slog!( + "[auto-assign] Failed to start '{agent_name}' for '{story_id}': {e}" + ); + } + } + None => { + // No free agents of this type — stop scanning this stage. + slog!( + "[auto-assign] All {:?} agents busy; remaining items in {stage_dir}/ will wait.", + stage + ); + break; + } + } + } + } + } + + /// Reconcile stories whose agent work was committed while the server was offline. + /// + /// On server startup the in-memory agent pool is empty, so any story that an agent + /// completed during a previous session is stuck: the worktree has committed work but + /// the pipeline never advanced. 
This method detects those stories, re-runs the + /// acceptance gates, and advances the pipeline stage so that `auto_assign_available_work` + /// (called immediately after) picks up the right next-stage agents. + /// + /// Algorithm: + /// 1. List all worktree directories under `{project_root}/.story_kit/worktrees/`. + /// 2. For each worktree, check whether its feature branch has commits ahead of the + /// base branch (`master` / `main`). + /// 3. If committed work is found AND the story is in `2_current/` or `3_qa/`: + /// - Run acceptance gates (uncommitted-change check + clippy + tests). + /// - On pass + `2_current/`: move the story to `3_qa/`. + /// - On pass + `3_qa/`: run the coverage gate; if that also passes move to `4_merge/`. + /// - On failure: leave the story where it is so `auto_assign_available_work` can + /// start a fresh agent to retry. + /// 4. Stories in `4_merge/` are left for `auto_assign_available_work` to handle via a + /// fresh mergemaster (squash-merge must be re-executed by the mergemaster agent). + pub async fn reconcile_on_startup( + &self, + project_root: &Path, + progress_tx: &broadcast::Sender, + ) { + let worktrees = match worktree::list_worktrees(project_root) { + Ok(wt) => wt, + Err(e) => { + eprintln!("[startup:reconcile] Failed to list worktrees: {e}"); + let _ = progress_tx.send(ReconciliationEvent { + story_id: String::new(), + status: "done".to_string(), + message: format!("Reconciliation failed: {e}"), + }); + return; + } + }; + + for wt_entry in &worktrees { + let story_id = &wt_entry.story_id; + let wt_path = wt_entry.path.clone(); + + // Determine which active stage the story is in. + let stage_dir = match find_active_story_stage(project_root, story_id) { + Some(s) => s, + None => continue, // Not in any active stage (backlog/archived or unknown). + }; + + // 4_merge/ is left for auto_assign to handle with a fresh mergemaster. 
+ if stage_dir == "4_merge" { + continue; + } + + let _ = progress_tx.send(ReconciliationEvent { + story_id: story_id.clone(), + status: "checking".to_string(), + message: format!("Checking for committed work in {stage_dir}/"), + }); + + // Check whether the worktree has commits ahead of the base branch. + let wt_path_for_check = wt_path.clone(); + let has_work = tokio::task::spawn_blocking(move || { + super::super::gates::worktree_has_committed_work(&wt_path_for_check) + }) + .await + .unwrap_or(false); + + if !has_work { + eprintln!( + "[startup:reconcile] No committed work for '{story_id}' in {stage_dir}/; skipping." + ); + let _ = progress_tx.send(ReconciliationEvent { + story_id: story_id.clone(), + status: "skipped".to_string(), + message: "No committed work found; skipping.".to_string(), + }); + continue; + } + + eprintln!( + "[startup:reconcile] Found committed work for '{story_id}' in {stage_dir}/. Running acceptance gates." + ); + let _ = progress_tx.send(ReconciliationEvent { + story_id: story_id.clone(), + status: "gates_running".to_string(), + message: "Running acceptance gates…".to_string(), + }); + + // Run acceptance gates on the worktree. 
+ let wt_path_for_gates = wt_path.clone(); + let gates_result = tokio::task::spawn_blocking(move || { + super::super::gates::check_uncommitted_changes(&wt_path_for_gates)?; + super::super::gates::run_acceptance_gates(&wt_path_for_gates) + }) + .await; + + let (gates_passed, gate_output) = match gates_result { + Ok(Ok(pair)) => pair, + Ok(Err(e)) => { + eprintln!("[startup:reconcile] Gate check error for '{story_id}': {e}"); + let _ = progress_tx.send(ReconciliationEvent { + story_id: story_id.clone(), + status: "failed".to_string(), + message: format!("Gate error: {e}"), + }); + continue; + } + Err(e) => { + eprintln!("[startup:reconcile] Gate check task panicked for '{story_id}': {e}"); + let _ = progress_tx.send(ReconciliationEvent { + story_id: story_id.clone(), + status: "failed".to_string(), + message: format!("Gate task panicked: {e}"), + }); + continue; + } + }; + + if !gates_passed { + eprintln!( + "[startup:reconcile] Gates failed for '{story_id}': {gate_output}\n\ + Leaving in {stage_dir}/ for auto-assign to restart the agent." + ); + let _ = progress_tx.send(ReconciliationEvent { + story_id: story_id.clone(), + status: "failed".to_string(), + message: "Gates failed; will be retried by auto-assign.".to_string(), + }); + continue; + } + + eprintln!("[startup:reconcile] Gates passed for '{story_id}' (stage: {stage_dir}/)."); + + if stage_dir == "2_current" { + // Coder stage — determine qa mode to decide next step. 
+ let qa_mode = { + let item_type = super::super::lifecycle::item_type_from_id(story_id); + if item_type == "spike" { + crate::io::story_metadata::QaMode::Human + } else { + let default_qa = crate::config::ProjectConfig::load(project_root) + .unwrap_or_default() + .default_qa_mode(); + let story_path = project_root + .join(".story_kit/work/2_current") + .join(format!("{story_id}.md")); + crate::io::story_metadata::resolve_qa_mode(&story_path, default_qa) + } + }; + + match qa_mode { + crate::io::story_metadata::QaMode::Server => { + if let Err(e) = super::super::lifecycle::move_story_to_merge(project_root, story_id) { + eprintln!("[startup:reconcile] Failed to move '{story_id}' to 4_merge/: {e}"); + let _ = progress_tx.send(ReconciliationEvent { + story_id: story_id.clone(), + status: "failed".to_string(), + message: format!("Failed to advance to merge: {e}"), + }); + } else { + eprintln!("[startup:reconcile] Moved '{story_id}' → 4_merge/ (qa: server)."); + let _ = progress_tx.send(ReconciliationEvent { + story_id: story_id.clone(), + status: "advanced".to_string(), + message: "Gates passed — moved to merge (qa: server).".to_string(), + }); + } + } + crate::io::story_metadata::QaMode::Agent => { + if let Err(e) = super::super::lifecycle::move_story_to_qa(project_root, story_id) { + eprintln!("[startup:reconcile] Failed to move '{story_id}' to 3_qa/: {e}"); + let _ = progress_tx.send(ReconciliationEvent { + story_id: story_id.clone(), + status: "failed".to_string(), + message: format!("Failed to advance to QA: {e}"), + }); + } else { + eprintln!("[startup:reconcile] Moved '{story_id}' → 3_qa/."); + let _ = progress_tx.send(ReconciliationEvent { + story_id: story_id.clone(), + status: "advanced".to_string(), + message: "Gates passed — moved to QA.".to_string(), + }); + } + } + crate::io::story_metadata::QaMode::Human => { + if let Err(e) = super::super::lifecycle::move_story_to_qa(project_root, story_id) { + eprintln!("[startup:reconcile] Failed to move '{story_id}' 
to 3_qa/: {e}"); + let _ = progress_tx.send(ReconciliationEvent { + story_id: story_id.clone(), + status: "failed".to_string(), + message: format!("Failed to advance to QA: {e}"), + }); + } else { + let story_path = project_root + .join(".story_kit/work/3_qa") + .join(format!("{story_id}.md")); + if let Err(e) = crate::io::story_metadata::write_review_hold(&story_path) { + eprintln!( + "[startup:reconcile] Failed to set review_hold on '{story_id}': {e}" + ); + } + eprintln!("[startup:reconcile] Moved '{story_id}' → 3_qa/ (qa: human — holding for review)."); + let _ = progress_tx.send(ReconciliationEvent { + story_id: story_id.clone(), + status: "review_hold".to_string(), + message: "Gates passed — holding for human review.".to_string(), + }); + } + } + } + } else if stage_dir == "3_qa" { + // QA stage → run coverage gate before advancing to merge. + let wt_path_for_cov = wt_path.clone(); + let coverage_result = tokio::task::spawn_blocking(move || { + super::super::gates::run_coverage_gate(&wt_path_for_cov) + }) + .await; + + let (coverage_passed, coverage_output) = match coverage_result { + Ok(Ok(pair)) => pair, + Ok(Err(e)) => { + eprintln!("[startup:reconcile] Coverage gate error for '{story_id}': {e}"); + let _ = progress_tx.send(ReconciliationEvent { + story_id: story_id.clone(), + status: "failed".to_string(), + message: format!("Coverage gate error: {e}"), + }); + continue; + } + Err(e) => { + eprintln!( + "[startup:reconcile] Coverage gate panicked for '{story_id}': {e}" + ); + let _ = progress_tx.send(ReconciliationEvent { + story_id: story_id.clone(), + status: "failed".to_string(), + message: format!("Coverage gate panicked: {e}"), + }); + continue; + } + }; + + if coverage_passed { + // Check whether this item needs human review before merging. 
+ let needs_human_review = { + let item_type = super::super::lifecycle::item_type_from_id(story_id); + if item_type == "spike" { + true + } else { + let story_path = project_root + .join(".story_kit/work/3_qa") + .join(format!("{story_id}.md")); + let default_qa = crate::config::ProjectConfig::load(project_root) + .unwrap_or_default() + .default_qa_mode(); + matches!( + crate::io::story_metadata::resolve_qa_mode(&story_path, default_qa), + crate::io::story_metadata::QaMode::Human + ) + } + }; + + if needs_human_review { + let story_path = project_root + .join(".story_kit/work/3_qa") + .join(format!("{story_id}.md")); + if let Err(e) = crate::io::story_metadata::write_review_hold(&story_path) { + eprintln!( + "[startup:reconcile] Failed to set review_hold on '{story_id}': {e}" + ); + } + eprintln!( + "[startup:reconcile] '{story_id}' passed QA — holding for human review." + ); + let _ = progress_tx.send(ReconciliationEvent { + story_id: story_id.clone(), + status: "review_hold".to_string(), + message: "Passed QA — waiting for human review.".to_string(), + }); + } else if let Err(e) = + super::super::lifecycle::move_story_to_merge(project_root, story_id) + { + eprintln!( + "[startup:reconcile] Failed to move '{story_id}' to 4_merge/: {e}" + ); + let _ = progress_tx.send(ReconciliationEvent { + story_id: story_id.clone(), + status: "failed".to_string(), + message: format!("Failed to advance to merge: {e}"), + }); + } else { + eprintln!("[startup:reconcile] Moved '{story_id}' → 4_merge/."); + let _ = progress_tx.send(ReconciliationEvent { + story_id: story_id.clone(), + status: "advanced".to_string(), + message: "Gates passed — moved to merge.".to_string(), + }); + } + } else { + eprintln!( + "[startup:reconcile] Coverage gate failed for '{story_id}': {coverage_output}\n\ + Leaving in 3_qa/ for auto-assign to restart the QA agent." 
+ );
+ let _ = progress_tx.send(ReconciliationEvent {
+ story_id: story_id.clone(),
+ status: "failed".to_string(),
+ message: "Coverage gate failed; will be retried.".to_string(),
+ });
+ }
+ }
+ }
+
+ // Signal that reconciliation is complete.
+ let _ = progress_tx.send(ReconciliationEvent {
+ story_id: String::new(),
+ status: "done".to_string(),
+ message: "Startup reconciliation complete.".to_string(),
+ });
+ }
+
+ /// Run a single watchdog pass synchronously (test helper).
+ #[cfg(test)]
+ pub fn run_watchdog_once(&self) {
+ check_orphaned_agents(&self.agents);
+ }
+
+ /// Spawn a background watchdog task that periodically checks for Running agents
+ /// whose underlying task has already finished (orphaned entries). Any such agent
+ /// is marked Failed and an Error event is emitted so that `wait_for_agent` unblocks.
+ ///
+ /// The watchdog runs every 30 seconds. It is a safety net for edge cases where the
+ /// PTY read loop exits without updating the agent status (e.g. a panic in the
+ /// spawn_blocking task, or an external SIGKILL that closes the PTY fd immediately).
+ ///
+ /// When orphaned agents are detected and a `project_root` is provided, auto-assign
+ /// is triggered so that free agents can pick up unassigned work.
+ pub fn spawn_watchdog(pool: Arc<AgentPool>, project_root: Option<PathBuf>) {
+ tokio::spawn(async move {
+ let mut interval = tokio::time::interval(std::time::Duration::from_secs(30));
+ loop {
+ interval.tick().await;
+ let found = check_orphaned_agents(&pool.agents);
+ if found > 0
+ && let Some(ref root) = project_root
+ {
+ slog!("[watchdog] {found} orphaned agent(s) detected; triggering auto-assign.");
+ pool.auto_assign_available_work(root).await;
+ }
+ }
+ });
+ }
+}
+
+// ── Free helper functions ──────────────────────────────────────────────────
+
+/// Read the optional `agent:` field from the front matter of a story file. 
+///
+/// Returns `Some(agent_name)` if the front matter specifies an agent, or `None`
+/// if the field is absent or the file cannot be read / parsed.
+fn read_story_front_matter_agent(
+ project_root: &Path,
+ stage_dir: &str,
+ story_id: &str,
+) -> Option<String> {
+ use crate::io::story_metadata::parse_front_matter;
+ let path = project_root
+ .join(".story_kit")
+ .join("work")
+ .join(stage_dir)
+ .join(format!("{story_id}.md"));
+ let contents = std::fs::read_to_string(path).ok()?;
+ parse_front_matter(&contents).ok()?.agent
+}
+
+/// Return `true` if the story file in the given stage has `review_hold: true` in its front matter.
+fn has_review_hold(project_root: &Path, stage_dir: &str, story_id: &str) -> bool {
+ use crate::io::story_metadata::parse_front_matter;
+ let path = project_root
+ .join(".story_kit")
+ .join("work")
+ .join(stage_dir)
+ .join(format!("{story_id}.md"));
+ let contents = match std::fs::read_to_string(path) {
+ Ok(c) => c,
+ Err(_) => return false,
+ };
+ parse_front_matter(&contents)
+ .ok()
+ .and_then(|m| m.review_hold)
+ .unwrap_or(false)
+}
+
+/// Return `true` if the story file has `blocked: true` in its front matter.
+fn is_story_blocked(project_root: &Path, stage_dir: &str, story_id: &str) -> bool {
+ use crate::io::story_metadata::parse_front_matter;
+ let path = project_root
+ .join(".story_kit")
+ .join("work")
+ .join(stage_dir)
+ .join(format!("{story_id}.md"));
+ let contents = match std::fs::read_to_string(path) {
+ Ok(c) => c,
+ Err(_) => return false,
+ };
+ parse_front_matter(&contents)
+ .ok()
+ .and_then(|m| m.blocked)
+ .unwrap_or(false)
+}
+
+/// Return `true` if the story file has a `merge_failure` field in its front matter. 
+fn has_merge_failure(project_root: &Path, stage_dir: &str, story_id: &str) -> bool {
+ use crate::io::story_metadata::parse_front_matter;
+ let path = project_root
+ .join(".story_kit")
+ .join("work")
+ .join(stage_dir)
+ .join(format!("{story_id}.md"));
+ let contents = match std::fs::read_to_string(path) {
+ Ok(c) => c,
+ Err(_) => return false,
+ };
+ parse_front_matter(&contents)
+ .ok()
+ .and_then(|m| m.merge_failure)
+ .is_some()
+}
+
+/// Return `true` if `agent_name` has no active (pending/running) entry in the pool.
+pub(super) fn is_agent_free(agents: &HashMap<String, StoryAgent>, agent_name: &str) -> bool {
+ !agents.values().any(|a| {
+ a.agent_name == agent_name
+ && matches!(a.status, AgentStatus::Running | AgentStatus::Pending)
+ })
+}
+
+fn scan_stage_items(project_root: &Path, stage_dir: &str) -> Vec<String> {
+ let dir = project_root.join(".story_kit").join("work").join(stage_dir);
+ if !dir.is_dir() {
+ return Vec::new();
+ }
+ let mut items = Vec::new();
+ if let Ok(entries) = std::fs::read_dir(&dir) {
+ for entry in entries.flatten() {
+ let path = entry.path();
+ if path.extension().and_then(|e| e.to_str()) == Some("md")
+ && let Some(stem) = path.file_stem().and_then(|s| s.to_str())
+ {
+ items.push(stem.to_string());
+ }
+ }
+ }
+ items.sort();
+ items
+}
+
+/// Return `true` if `story_id` has any active (pending/running) agent matching `stage`.
+///
+/// Uses the explicit `stage` config field when the agent is found in `config`;
+/// falls back to the legacy name-based heuristic for unlisted agents. 
+fn is_story_assigned_for_stage(
+ config: &ProjectConfig,
+ agents: &HashMap<String, StoryAgent>,
+ story_id: &str,
+ stage: &PipelineStage,
+) -> bool {
+ agents.iter().any(|(key, agent)| {
+ // Composite key format: "{story_id}:{agent_name}"
+ let key_story_id = key.rsplit_once(':').map(|(sid, _)| sid).unwrap_or(key);
+ let agent_stage = config
+ .find_agent(&agent.agent_name)
+ .map(agent_config_stage)
+ .unwrap_or_else(|| pipeline_stage(&agent.agent_name));
+ key_story_id == story_id
+ && agent_stage == *stage
+ && matches!(agent.status, AgentStatus::Running | AgentStatus::Pending)
+ })
+}
+
+/// Count active (pending/running) agents for a given pipeline stage.
+fn count_active_agents_for_stage(
+ config: &ProjectConfig,
+ agents: &HashMap<String, StoryAgent>,
+ stage: &PipelineStage,
+) -> usize {
+ agents
+ .values()
+ .filter(|a| {
+ matches!(a.status, AgentStatus::Running | AgentStatus::Pending)
+ && config
+ .find_agent(&a.agent_name)
+ .map(|cfg| agent_config_stage(cfg) == *stage)
+ .unwrap_or_else(|| pipeline_stage(&a.agent_name) == *stage)
+ })
+ .count()
+}
+
+/// Find the first configured agent for `stage` that has no active (pending/running) assignment.
+/// Returns `None` if all agents for that stage are busy, none are configured,
+/// or the `max_coders` limit has been reached (for the Coder stage).
+///
+/// For the Coder stage, when `default_coder_model` is set, only considers agents whose
+/// model matches the default. This ensures opus-class agents are reserved for explicit
+/// front-matter requests.
+pub(super) fn find_free_agent_for_stage<'a>(
+ config: &'a ProjectConfig,
+ agents: &HashMap<String, StoryAgent>,
+ stage: &PipelineStage,
+) -> Option<&'a str> {
+ // Enforce max_coders limit for the Coder stage. 
+ if *stage == PipelineStage::Coder + && let Some(max) = config.max_coders + { + let active = count_active_agents_for_stage(config, agents, stage); + if active >= max { + return None; + } + } + + for agent_config in &config.agent { + if agent_config_stage(agent_config) != *stage { + continue; + } + // When default_coder_model is set, only auto-assign coder agents whose + // model matches. This keeps opus agents reserved for explicit requests. + if *stage == PipelineStage::Coder + && let Some(ref default_model) = config.default_coder_model + { + let agent_model = agent_config.model.as_deref().unwrap_or(""); + if agent_model != default_model { + continue; + } + } + let is_busy = agents.values().any(|a| { + a.agent_name == agent_config.name + && matches!(a.status, AgentStatus::Running | AgentStatus::Pending) + }); + if !is_busy { + return Some(&agent_config.name); + } + } + None +} + +/// Scan the agent pool for Running entries whose backing tokio task has already +/// finished and mark them as Failed. +/// +/// This handles the case where the PTY read loop or the spawned task exits +/// without updating the agent status — for example when the process is killed +/// externally and the PTY master fd returns EOF before our inactivity timeout +/// fires, but some other edge case prevents the normal cleanup path from running. +fn check_orphaned_agents(agents: &Mutex>) -> usize { + let mut lock = match agents.lock() { + Ok(l) => l, + Err(_) => return 0, + }; + + // Collect orphaned entries: Running or Pending agents whose task handle is finished. + // Pending agents can be orphaned if worktree creation panics before setting status. 
+ let orphaned: Vec<(String, String, broadcast::Sender, AgentStatus)> = lock + .iter() + .filter_map(|(key, agent)| { + if matches!(agent.status, AgentStatus::Running | AgentStatus::Pending) + && let Some(handle) = &agent.task_handle + && handle.is_finished() + { + let story_id = key + .rsplit_once(':') + .map(|(s, _)| s.to_string()) + .unwrap_or_else(|| key.clone()); + return Some(( + key.clone(), + story_id, + agent.tx.clone(), + agent.status.clone(), + )); + } + None + }) + .collect(); + + let count = orphaned.len(); + for (key, story_id, tx, prev_status) in orphaned { + if let Some(agent) = lock.get_mut(&key) { + agent.status = AgentStatus::Failed; + slog!( + "[watchdog] Orphaned agent '{key}': task finished but status was {prev_status}. \ + Marking Failed." + ); + let _ = tx.send(AgentEvent::Error { + story_id, + agent_name: agent.agent_name.clone(), + message: "Agent process terminated unexpectedly (watchdog detected orphan)" + .to_string(), + }); + } + } + count +} + +// ── Tests ────────────────────────────────────────────────────────────────── + +#[cfg(test)] +mod tests { + use super::*; + use crate::config::ProjectConfig; + use crate::io::watcher::WatcherEvent; + use std::process::Command; + + use super::super::{AgentPool, StoryAgent, composite_key}; + + fn make_config(toml_str: &str) -> ProjectConfig { + ProjectConfig::parse(toml_str).unwrap() + } + + fn init_git_repo(repo: &std::path::Path) { + Command::new("git") + .args(["init"]) + .current_dir(repo) + .output() + .unwrap(); + Command::new("git") + .args(["config", "user.email", "test@test.com"]) + .current_dir(repo) + .output() + .unwrap(); + Command::new("git") + .args(["config", "user.name", "Test"]) + .current_dir(repo) + .output() + .unwrap(); + // Create initial commit so master branch exists. 
+ std::fs::write(repo.join("README.md"), "# test\n").unwrap(); + Command::new("git") + .args(["add", "."]) + .current_dir(repo) + .output() + .unwrap(); + Command::new("git") + .args(["commit", "-m", "initial"]) + .current_dir(repo) + .output() + .unwrap(); + } + + fn make_test_story_agent(agent_name: &str, status: AgentStatus) -> StoryAgent { + StoryAgent { + agent_name: agent_name.to_string(), + status, + worktree_info: None, + session_id: None, + tx: broadcast::channel(1).0, + task_handle: None, + event_log: Arc::new(Mutex::new(Vec::new())), + completion: None, + project_root: None, + log_session_id: None, + merge_failure_reported: false, + } + } + + // ── auto-assign helper tests ─────────────────────────────────── + + #[test] + fn scan_stage_items_returns_empty_for_missing_dir() { + let tmp = tempfile::tempdir().unwrap(); + let items = scan_stage_items(tmp.path(), "2_current"); + assert!(items.is_empty()); + } + + #[test] + fn scan_stage_items_returns_sorted_story_ids() { + use std::fs; + let tmp = tempfile::tempdir().unwrap(); + let stage_dir = tmp.path().join(".story_kit").join("work").join("2_current"); + fs::create_dir_all(&stage_dir).unwrap(); + fs::write(stage_dir.join("42_story_foo.md"), "---\nname: foo\n---").unwrap(); + fs::write(stage_dir.join("10_story_bar.md"), "---\nname: bar\n---").unwrap(); + fs::write(stage_dir.join("5_story_baz.md"), "---\nname: baz\n---").unwrap(); + // non-md file should be ignored + fs::write(stage_dir.join("README.txt"), "ignore me").unwrap(); + + let items = scan_stage_items(tmp.path(), "2_current"); + assert_eq!(items, vec!["10_story_bar", "42_story_foo", "5_story_baz"]); + } + + #[test] + fn is_story_assigned_returns_true_for_running_coder() { + let config = ProjectConfig::default(); + let pool = AgentPool::new_test(3001); + pool.inject_test_agent("42_story_foo", "coder-1", AgentStatus::Running); + + let agents = pool.agents.lock().unwrap(); + assert!(is_story_assigned_for_stage( + &config, + &agents, + "42_story_foo", 
+ &PipelineStage::Coder + )); + // Same story but wrong stage — should be false + assert!(!is_story_assigned_for_stage( + &config, + &agents, + "42_story_foo", + &PipelineStage::Qa + )); + // Different story — should be false + assert!(!is_story_assigned_for_stage( + &config, + &agents, + "99_story_other", + &PipelineStage::Coder + )); + } + + #[test] + fn is_story_assigned_returns_false_for_completed_agent() { + let config = ProjectConfig::default(); + let pool = AgentPool::new_test(3001); + pool.inject_test_agent("42_story_foo", "coder-1", AgentStatus::Completed); + + let agents = pool.agents.lock().unwrap(); + // Completed agents don't count as assigned + assert!(!is_story_assigned_for_stage( + &config, + &agents, + "42_story_foo", + &PipelineStage::Coder + )); + } + + #[test] + fn is_story_assigned_uses_config_stage_field_for_nonstandard_names() { + let config = ProjectConfig::parse( + r#" +[[agent]] +name = "qa-2" +stage = "qa" +"#, + ) + .unwrap(); + + let pool = AgentPool::new_test(3001); + pool.inject_test_agent("42_story_foo", "qa-2", AgentStatus::Running); + + let agents = pool.agents.lock().unwrap(); + // qa-2 with stage=qa should be recognised as a QA agent + assert!( + is_story_assigned_for_stage(&config, &agents, "42_story_foo", &PipelineStage::Qa), + "qa-2 should be detected as assigned to QA stage" + ); + // Should NOT appear as a coder + assert!( + !is_story_assigned_for_stage(&config, &agents, "42_story_foo", &PipelineStage::Coder), + "qa-2 should not be detected as a coder" + ); + } + + #[test] + fn find_free_agent_returns_none_when_all_busy() { + let config = ProjectConfig::parse( + r#" +[[agent]] +name = "coder-1" +[[agent]] +name = "coder-2" +"#, + ) + .unwrap(); + + let pool = AgentPool::new_test(3001); + pool.inject_test_agent("s1", "coder-1", AgentStatus::Running); + pool.inject_test_agent("s2", "coder-2", AgentStatus::Running); + + let agents = pool.agents.lock().unwrap(); + let free = find_free_agent_for_stage(&config, &agents, 
&PipelineStage::Coder); + assert!(free.is_none(), "no free coders should be available"); + } + + #[test] + fn find_free_agent_returns_first_free_coder() { + let config = ProjectConfig::parse( + r#" +[[agent]] +name = "coder-1" +[[agent]] +name = "coder-2" +[[agent]] +name = "coder-3" +"#, + ) + .unwrap(); + + let pool = AgentPool::new_test(3001); + // coder-1 is busy, coder-2 is free + pool.inject_test_agent("s1", "coder-1", AgentStatus::Running); + + let agents = pool.agents.lock().unwrap(); + let free = find_free_agent_for_stage(&config, &agents, &PipelineStage::Coder); + assert_eq!( + free, + Some("coder-2"), + "coder-2 should be the first free coder" + ); + } + + #[test] + fn find_free_agent_ignores_completed_agents() { + let config = ProjectConfig::parse( + r#" +[[agent]] +name = "coder-1" +"#, + ) + .unwrap(); + + let pool = AgentPool::new_test(3001); + // coder-1 completed its previous story — it's free for a new one + pool.inject_test_agent("s1", "coder-1", AgentStatus::Completed); + + let agents = pool.agents.lock().unwrap(); + let free = find_free_agent_for_stage(&config, &agents, &PipelineStage::Coder); + assert_eq!(free, Some("coder-1"), "completed coder-1 should be free"); + } + + #[test] + fn find_free_agent_returns_none_for_wrong_stage() { + let config = ProjectConfig::parse( + r#" +[[agent]] +name = "qa" +"#, + ) + .unwrap(); + + let agents: HashMap = HashMap::new(); + // Looking for a Coder but only QA is configured + let free = find_free_agent_for_stage(&config, &agents, &PipelineStage::Coder); + assert!(free.is_none()); + // Looking for QA should find it + let free_qa = find_free_agent_for_stage(&config, &agents, &PipelineStage::Qa); + assert_eq!(free_qa, Some("qa")); + } + + #[test] + fn find_free_agent_uses_config_stage_field_not_name() { + // Agents named "qa-2" and "coder-opus" don't match the legacy name heuristic + // but should be picked up via their explicit stage field. 
+ let config = ProjectConfig::parse( + r#" +[[agent]] +name = "qa-2" +stage = "qa" + +[[agent]] +name = "coder-opus" +stage = "coder" +"#, + ) + .unwrap(); + + let agents: HashMap = HashMap::new(); + + // qa-2 should be found for PipelineStage::Qa via config stage field + let free_qa = find_free_agent_for_stage(&config, &agents, &PipelineStage::Qa); + assert_eq!(free_qa, Some("qa-2"), "qa-2 with stage=qa should be found"); + + // coder-opus should be found for PipelineStage::Coder via config stage field + let free_coder = find_free_agent_for_stage(&config, &agents, &PipelineStage::Coder); + assert_eq!( + free_coder, + Some("coder-opus"), + "coder-opus with stage=coder should be found" + ); + + // Neither should match the other stage + let free_merge = find_free_agent_for_stage(&config, &agents, &PipelineStage::Mergemaster); + assert!(free_merge.is_none()); + } + + // ── check_orphaned_agents return value tests (bug 161) ────────────────── + + #[tokio::test] + async fn check_orphaned_agents_returns_count_of_orphaned_agents() { + let pool = AgentPool::new_test(3001); + + // Spawn two tasks that finish immediately. + let h1 = tokio::spawn(async {}); + let h2 = tokio::spawn(async {}); + tokio::time::sleep(std::time::Duration::from_millis(20)).await; + assert!(h1.is_finished()); + assert!(h2.is_finished()); + + pool.inject_test_agent_with_handle("story_a", "coder", AgentStatus::Running, h1); + pool.inject_test_agent_with_handle("story_b", "coder", AgentStatus::Running, h2); + + let found = check_orphaned_agents(&pool.agents); + assert_eq!(found, 2, "should detect both orphaned agents"); + } + + #[test] + fn check_orphaned_agents_returns_zero_when_no_orphans() { + let pool = AgentPool::new_test(3001); + // Inject agents in terminal states — not orphaned. 
+ pool.inject_test_agent("story_a", "coder", AgentStatus::Completed); + pool.inject_test_agent("story_b", "qa", AgentStatus::Failed); + + let found = check_orphaned_agents(&pool.agents); + assert_eq!( + found, 0, + "no orphans should be detected for terminal agents" + ); + } + + #[tokio::test] + async fn watchdog_detects_orphaned_running_agent() { + let pool = AgentPool::new_test(3001); + + let handle = tokio::spawn(async {}); + tokio::time::sleep(std::time::Duration::from_millis(20)).await; + assert!( + handle.is_finished(), + "task should be finished before injection" + ); + + let tx = pool.inject_test_agent_with_handle( + "orphan_story", + "coder", + AgentStatus::Running, + handle, + ); + let mut rx = tx.subscribe(); + + pool.run_watchdog_once(); + + { + let agents = pool.agents.lock().unwrap(); + let key = composite_key("orphan_story", "coder"); + let agent = agents.get(&key).unwrap(); + assert_eq!( + agent.status, + AgentStatus::Failed, + "watchdog must mark an orphaned Running agent as Failed" + ); + } + + let event = rx.try_recv().expect("watchdog must emit an Error event"); + assert!( + matches!(event, AgentEvent::Error { .. }), + "expected AgentEvent::Error, got: {event:?}" + ); + } + + #[tokio::test] + async fn watchdog_orphan_detection_returns_nonzero_enabling_auto_assign() { + // This test verifies the contract that `check_orphaned_agents` returns + // a non-zero count when orphans exist, which the watchdog uses to + // decide whether to trigger auto-assign (bug 161). + let pool = AgentPool::new_test(3001); + + let handle = tokio::spawn(async {}); + tokio::time::sleep(std::time::Duration::from_millis(20)).await; + + pool.inject_test_agent_with_handle("orphan_story", "coder", AgentStatus::Running, handle); + + // Before watchdog: agent is Running. 
+ { + let agents = pool.agents.lock().unwrap(); + let key = composite_key("orphan_story", "coder"); + assert_eq!(agents.get(&key).unwrap().status, AgentStatus::Running); + } + + // Run watchdog pass — should return 1 (orphan found). + let found = check_orphaned_agents(&pool.agents); + assert_eq!( + found, 1, + "watchdog must return 1 for a single orphaned agent" + ); + + // After watchdog: agent is Failed. + { + let agents = pool.agents.lock().unwrap(); + let key = composite_key("orphan_story", "coder"); + assert_eq!( + agents.get(&key).unwrap().status, + AgentStatus::Failed, + "orphaned agent must be marked Failed" + ); + } + } + + // ── auto_assign_available_work tests ────────────────────────────────────── + + /// Story 203: auto_assign_available_work must detect a story in 2_current/ + /// with no active agent and start an agent for it. + #[tokio::test] + async fn auto_assign_picks_up_story_queued_in_current() { + let tmp = tempfile::tempdir().unwrap(); + let sk = tmp.path().join(".story_kit"); + let current = sk.join("work/2_current"); + std::fs::create_dir_all(¤t).unwrap(); + std::fs::write( + sk.join("project.toml"), + "[[agent]]\nname = \"coder-1\"\nstage = \"coder\"\n", + ) + .unwrap(); + // Place the story in 2_current/ (simulating the "queued" state). + std::fs::write(current.join("story-3.md"), "---\nname: Story 3\n---\n").unwrap(); + + let pool = AgentPool::new_test(3001); + // No agents are running — coder-1 is free. + + // auto_assign will try to call start_agent, which will attempt to create + // a worktree (will fail without a git repo) — that is fine. We only need + // to verify the agent is registered as Pending before the background + // task eventually fails. 
+        pool.auto_assign_available_work(tmp.path()).await;
+
+        let agents = pool.agents.lock().unwrap();
+        let has_pending = agents.values().any(|a| {
+            a.agent_name == "coder-1"
+                && matches!(a.status, AgentStatus::Pending | AgentStatus::Running)
+        });
+        assert!(
+            has_pending,
+            "auto_assign should have started coder-1 for story-3, but pool is empty"
+        );
+    }
+
+    /// Story 265: auto_assign_available_work must skip spikes in 3_qa/ that
+    /// have review_hold: true set in their front matter.
+    #[tokio::test]
+    async fn auto_assign_skips_spikes_with_review_hold() {
+        let tmp = tempfile::tempdir().unwrap();
+        let root = tmp.path();
+
+        // Create project.toml with a QA agent.
+        let sk = root.join(".story_kit");
+        std::fs::create_dir_all(&sk).unwrap();
+        std::fs::write(
+            sk.join("project.toml"),
+            // Fixed: previously wrote "[[agents]]" with unknown `role`/`prompt`
+            // keys, which does not match the `[[agent]]`/`stage` schema used by
+            // every other test in this module. With that string no QA agent was
+            // configured, so the `agents.is_empty()` assertion below passed
+            // vacuously instead of exercising the review_hold skip.
+            "[[agent]]\nname = \"qa\"\nstage = \"qa\"\nmodel = \"test\"\n",
+        )
+        .unwrap();
+
+        // Put a spike in 3_qa/ with review_hold: true.
+        let qa_dir = root.join(".story_kit/work/3_qa");
+        std::fs::create_dir_all(&qa_dir).unwrap();
+        std::fs::write(
+            qa_dir.join("20_spike_test.md"),
+            "---\nname: Test Spike\nreview_hold: true\n---\n# Spike\n",
+        )
+        .unwrap();
+
+        let (watcher_tx, _) = broadcast::channel::(4);
+        let pool = AgentPool::new(3001, watcher_tx);
+
+        pool.auto_assign_available_work(root).await;
+
+        // No agent should have been started for the spike: a free QA agent
+        // exists, so an empty pool proves the review_hold skip fired.
+        let agents = pool.agents.lock().unwrap();
+        assert!(
+            agents.is_empty(),
+            "No agents should be assigned to a spike with review_hold"
+        );
+    }
+
+    // ── Story 279: auto-assign respects agent stage from front matter ──────────
+
+    /// When a story in 3_qa/ has `agent: coder-1` in its front matter but
+    /// coder-1 is a coder-stage agent, auto-assign must NOT assign coder-1.
+    /// Instead it should fall back to a free QA-stage agent.
+ #[tokio::test] + async fn auto_assign_ignores_coder_preference_when_story_is_in_qa_stage() { + let tmp = tempfile::tempdir().unwrap(); + let sk = tmp.path().join(".story_kit"); + let qa_dir = sk.join("work/3_qa"); + std::fs::create_dir_all(&qa_dir).unwrap(); + std::fs::write( + sk.join("project.toml"), + "[[agent]]\nname = \"coder-1\"\nstage = \"coder\"\n\n\ + [[agent]]\nname = \"qa-1\"\nstage = \"qa\"\n", + ) + .unwrap(); + // Story in 3_qa/ with a preferred coder-stage agent. + std::fs::write( + qa_dir.join("story-qa1.md"), + "---\nname: QA Story\nagent: coder-1\n---\n", + ) + .unwrap(); + + let pool = AgentPool::new_test(3001); + + pool.auto_assign_available_work(tmp.path()).await; + + let agents = pool.agents.lock().unwrap(); + // coder-1 must NOT have been assigned (wrong stage for 3_qa/). + let coder_assigned = agents.values().any(|a| { + a.agent_name == "coder-1" + && matches!(a.status, AgentStatus::Pending | AgentStatus::Running) + }); + assert!( + !coder_assigned, + "coder-1 should not be assigned to a QA-stage story" + ); + // qa-1 should have been assigned instead. + let qa_assigned = agents.values().any(|a| { + a.agent_name == "qa-1" + && matches!(a.status, AgentStatus::Pending | AgentStatus::Running) + }); + assert!( + qa_assigned, + "qa-1 should be assigned as fallback for the QA-stage story" + ); + } + + /// When a story in 2_current/ has `agent: coder-1` in its front matter and + /// coder-1 is a coder-stage agent, auto-assign must respect the preference + /// and assign coder-1 (not fall back to some other coder). 
+ #[tokio::test] + async fn auto_assign_respects_coder_preference_when_story_is_in_current_stage() { + let tmp = tempfile::tempdir().unwrap(); + let sk = tmp.path().join(".story_kit"); + let current_dir = sk.join("work/2_current"); + std::fs::create_dir_all(¤t_dir).unwrap(); + std::fs::write( + sk.join("project.toml"), + "[[agent]]\nname = \"coder-1\"\nstage = \"coder\"\n\n\ + [[agent]]\nname = \"coder-2\"\nstage = \"coder\"\n", + ) + .unwrap(); + // Story in 2_current/ with a preferred coder-1 agent. + std::fs::write( + current_dir.join("story-pref.md"), + "---\nname: Coder Story\nagent: coder-1\n---\n", + ) + .unwrap(); + + let pool = AgentPool::new_test(3001); + + pool.auto_assign_available_work(tmp.path()).await; + + let agents = pool.agents.lock().unwrap(); + // coder-1 should have been picked (it matches the stage and is preferred). + let coder1_assigned = agents.values().any(|a| { + a.agent_name == "coder-1" + && matches!(a.status, AgentStatus::Pending | AgentStatus::Running) + }); + assert!( + coder1_assigned, + "coder-1 should be assigned when it matches the stage and is preferred" + ); + // coder-2 must NOT be assigned (not preferred). + let coder2_assigned = agents.values().any(|a| { + a.agent_name == "coder-2" + && matches!(a.status, AgentStatus::Pending | AgentStatus::Running) + }); + assert!( + !coder2_assigned, + "coder-2 should not be assigned when coder-1 is explicitly preferred" + ); + } + + /// When the preferred agent's stage mismatches and no other agent of the + /// correct stage is available, auto-assign must not start any agent for that + /// story (no panic, no error). + #[tokio::test] + async fn auto_assign_stage_mismatch_with_no_fallback_starts_no_agent() { + let tmp = tempfile::tempdir().unwrap(); + let sk = tmp.path().join(".story_kit"); + let qa_dir = sk.join("work/3_qa"); + std::fs::create_dir_all(&qa_dir).unwrap(); + // Only a coder agent is configured — no QA agent exists. 
+ std::fs::write( + sk.join("project.toml"), + "[[agent]]\nname = \"coder-1\"\nstage = \"coder\"\n", + ) + .unwrap(); + // Story in 3_qa/ requests coder-1 (wrong stage) and no QA agent exists. + std::fs::write( + qa_dir.join("story-noqa.md"), + "---\nname: QA Story No Agent\nagent: coder-1\n---\n", + ) + .unwrap(); + + let pool = AgentPool::new_test(3001); + + // Must not panic. + pool.auto_assign_available_work(tmp.path()).await; + + let agents = pool.agents.lock().unwrap(); + assert!( + agents.is_empty(), + "No agent should be started when no stage-appropriate agent is available" + ); + } + + /// Two concurrent auto_assign_available_work calls must not assign the same + /// agent to two stories simultaneously. After both complete, at most one + /// Pending/Running entry must exist per agent name. + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + async fn toctou_concurrent_auto_assign_no_duplicate_agent_assignments() { + use std::fs; + use std::sync::Arc; + + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path().to_path_buf(); + + let sk_dir = root.join(".story_kit"); + // Two stories waiting in 2_current, one coder agent. + fs::create_dir_all(sk_dir.join("work/2_current")).unwrap(); + fs::write( + sk_dir.join("project.toml"), + "[[agent]]\nname = \"coder-1\"\n", + ) + .unwrap(); + fs::write( + sk_dir.join("work/2_current/86_story_foo.md"), + "---\nname: Foo\n---\n", + ) + .unwrap(); + fs::write( + sk_dir.join("work/2_current/130_story_bar.md"), + "---\nname: Bar\n---\n", + ) + .unwrap(); + + let pool = Arc::new(AgentPool::new_test(3099)); + + // Run two concurrent auto_assign calls. 
+ let pool1 = pool.clone(); + let root1 = root.clone(); + let t1 = tokio::spawn(async move { pool1.auto_assign_available_work(&root1).await }); + + let pool2 = pool.clone(); + let root2 = root.clone(); + let t2 = tokio::spawn(async move { pool2.auto_assign_available_work(&root2).await }); + + let _ = tokio::join!(t1, t2); + + // At most one Pending/Running entry should exist for coder-1. + let agents = pool.agents.lock().unwrap(); + let active_coder_count = agents + .values() + .filter(|a| { + a.agent_name == "coder-1" + && matches!(a.status, AgentStatus::Pending | AgentStatus::Running) + }) + .count(); + + assert!( + active_coder_count <= 1, + "coder-1 must not be assigned to more than one story simultaneously; \ + found {active_coder_count} active entries" + ); + } + + // ── has_review_hold tests ──────────────────────────────────────────────── + + #[test] + fn has_review_hold_returns_true_when_set() { + let tmp = tempfile::tempdir().unwrap(); + let qa_dir = tmp.path().join(".story_kit/work/3_qa"); + std::fs::create_dir_all(&qa_dir).unwrap(); + let spike_path = qa_dir.join("10_spike_research.md"); + std::fs::write( + &spike_path, + "---\nname: Research spike\nreview_hold: true\n---\n# Spike\n", + ) + .unwrap(); + assert!(has_review_hold(tmp.path(), "3_qa", "10_spike_research")); + } + + #[test] + fn has_review_hold_returns_false_when_not_set() { + let tmp = tempfile::tempdir().unwrap(); + let qa_dir = tmp.path().join(".story_kit/work/3_qa"); + std::fs::create_dir_all(&qa_dir).unwrap(); + let spike_path = qa_dir.join("10_spike_research.md"); + std::fs::write(&spike_path, "---\nname: Research spike\n---\n# Spike\n").unwrap(); + assert!(!has_review_hold(tmp.path(), "3_qa", "10_spike_research")); + } + + #[test] + fn has_review_hold_returns_false_when_file_missing() { + let tmp = tempfile::tempdir().unwrap(); + assert!(!has_review_hold(tmp.path(), "3_qa", "99_spike_missing")); + } + + // ── find_free_agent_for_stage: default_coder_model filtering ───────── + + 
#[test] + fn find_free_agent_skips_opus_when_default_coder_model_set() { + let config = make_config( + r#" +default_coder_model = "sonnet" + +[[agent]] +name = "coder-1" +stage = "coder" +model = "sonnet" + +[[agent]] +name = "coder-opus" +stage = "coder" +model = "opus" +"#, + ); + + let agents = HashMap::new(); + let free = find_free_agent_for_stage(&config, &agents, &PipelineStage::Coder); + assert_eq!(free, Some("coder-1")); + } + + #[test] + fn find_free_agent_returns_opus_when_no_default_coder_model() { + let config = make_config( + r#" +[[agent]] +name = "coder-opus" +stage = "coder" +model = "opus" +"#, + ); + + let agents = HashMap::new(); + let free = find_free_agent_for_stage(&config, &agents, &PipelineStage::Coder); + assert_eq!(free, Some("coder-opus")); + } + + #[test] + fn find_free_agent_returns_none_when_all_sonnet_coders_busy() { + let config = make_config( + r#" +default_coder_model = "sonnet" + +[[agent]] +name = "coder-1" +stage = "coder" +model = "sonnet" + +[[agent]] +name = "coder-opus" +stage = "coder" +model = "opus" +"#, + ); + + let mut agents = HashMap::new(); + agents.insert( + "story1:coder-1".to_string(), + make_test_story_agent("coder-1", AgentStatus::Running), + ); + + let free = find_free_agent_for_stage(&config, &agents, &PipelineStage::Coder); + assert_eq!(free, None, "opus agent should not be auto-assigned"); + } + + // ── find_free_agent_for_stage: max_coders limit ───────────────────── + + #[test] + fn find_free_agent_respects_max_coders() { + let config = make_config( + r#" +max_coders = 1 + +[[agent]] +name = "coder-1" +stage = "coder" +model = "sonnet" + +[[agent]] +name = "coder-2" +stage = "coder" +model = "sonnet" +"#, + ); + + let mut agents = HashMap::new(); + agents.insert( + "story1:coder-1".to_string(), + make_test_story_agent("coder-1", AgentStatus::Running), + ); + + let free = find_free_agent_for_stage(&config, &agents, &PipelineStage::Coder); + assert_eq!(free, None, "max_coders=1 should block second coder"); + 
} + + #[test] + fn find_free_agent_allows_within_max_coders() { + let config = make_config( + r#" +max_coders = 2 + +[[agent]] +name = "coder-1" +stage = "coder" +model = "sonnet" + +[[agent]] +name = "coder-2" +stage = "coder" +model = "sonnet" +"#, + ); + + let mut agents = HashMap::new(); + agents.insert( + "story1:coder-1".to_string(), + make_test_story_agent("coder-1", AgentStatus::Running), + ); + + let free = find_free_agent_for_stage(&config, &agents, &PipelineStage::Coder); + assert_eq!(free, Some("coder-2")); + } + + #[test] + fn max_coders_does_not_affect_qa_stage() { + let config = make_config( + r#" +max_coders = 1 + +[[agent]] +name = "qa" +stage = "qa" +model = "sonnet" +"#, + ); + + let agents = HashMap::new(); + let free = find_free_agent_for_stage(&config, &agents, &PipelineStage::Qa); + assert_eq!(free, Some("qa")); + } + + // ── count_active_agents_for_stage ──────────────────────────────────── + + #[test] + fn count_active_agents_counts_running_and_pending() { + let config = make_config( + r#" +[[agent]] +name = "coder-1" +stage = "coder" + +[[agent]] +name = "coder-2" +stage = "coder" +"#, + ); + + let mut agents = HashMap::new(); + agents.insert( + "s1:coder-1".to_string(), + make_test_story_agent("coder-1", AgentStatus::Running), + ); + agents.insert( + "s2:coder-2".to_string(), + make_test_story_agent("coder-2", AgentStatus::Completed), + ); + + let count = count_active_agents_for_stage(&config, &agents, &PipelineStage::Coder); + assert_eq!(count, 1, "Only Running coder should be counted, not Completed"); + } + + // ── reconcile_on_startup tests ──────────────────────────────────────────── + + #[tokio::test] + async fn reconcile_on_startup_noop_when_no_worktrees() { + let tmp = tempfile::tempdir().unwrap(); + let pool = AgentPool::new_test(3001); + let (tx, _rx) = broadcast::channel(16); + // Should not panic; no worktrees to reconcile. 
+ pool.reconcile_on_startup(tmp.path(), &tx).await; + } + + #[tokio::test] + async fn reconcile_on_startup_emits_done_event() { + let tmp = tempfile::tempdir().unwrap(); + let pool = AgentPool::new_test(3001); + let (tx, mut rx) = broadcast::channel::(16); + pool.reconcile_on_startup(tmp.path(), &tx).await; + + // Collect all events; the last must be "done". + let mut events: Vec = Vec::new(); + while let Ok(evt) = rx.try_recv() { + events.push(evt); + } + assert!( + events.iter().any(|e| e.status == "done"), + "reconcile_on_startup must emit a 'done' event; got: {:?}", + events.iter().map(|e| &e.status).collect::>() + ); + } + + #[tokio::test] + async fn reconcile_on_startup_skips_story_without_committed_work() { + use std::fs; + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + + // Set up story in 2_current/. + let current = root.join(".story_kit/work/2_current"); + fs::create_dir_all(¤t).unwrap(); + fs::write(current.join("60_story_test.md"), "test").unwrap(); + + // Create a worktree directory that is a fresh git repo with no commits + // ahead of its own base branch (simulates a worktree where no work was done). + let wt_dir = root.join(".story_kit/worktrees/60_story_test"); + fs::create_dir_all(&wt_dir).unwrap(); + init_git_repo(&wt_dir); + + let pool = AgentPool::new_test(3001); + let (tx, _rx) = broadcast::channel(16); + pool.reconcile_on_startup(root, &tx).await; + + // Story should still be in 2_current/ — nothing was reconciled. + assert!( + current.join("60_story_test.md").exists(), + "story should stay in 2_current/ when worktree has no committed work" + ); + } + + #[tokio::test] + async fn reconcile_on_startup_runs_gates_on_worktree_with_committed_work() { + use std::fs; + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + + // Set up a git repo for the project root. + init_git_repo(root); + + // Set up story in 2_current/ and commit it so the project root is clean. 
+ let current = root.join(".story_kit/work/2_current"); + fs::create_dir_all(¤t).unwrap(); + fs::write(current.join("61_story_test.md"), "test").unwrap(); + Command::new("git") + .args(["add", "."]) + .current_dir(root) + .output() + .unwrap(); + Command::new("git") + .args([ + "-c", + "user.email=test@test.com", + "-c", + "user.name=Test", + "commit", + "-m", + "add story", + ]) + .current_dir(root) + .output() + .unwrap(); + + // Create a real git worktree for the story. + let wt_dir = root.join(".story_kit/worktrees/61_story_test"); + fs::create_dir_all(wt_dir.parent().unwrap()).unwrap(); + Command::new("git") + .args([ + "worktree", + "add", + &wt_dir.to_string_lossy(), + "-b", + "feature/story-61_story_test", + ]) + .current_dir(root) + .output() + .unwrap(); + + // Add a commit to the feature branch (simulates coder completing work). + fs::write(wt_dir.join("implementation.txt"), "done").unwrap(); + Command::new("git") + .args(["add", "."]) + .current_dir(&wt_dir) + .output() + .unwrap(); + Command::new("git") + .args([ + "-c", + "user.email=test@test.com", + "-c", + "user.name=Test", + "commit", + "-m", + "implement story", + ]) + .current_dir(&wt_dir) + .output() + .unwrap(); + + assert!( + crate::agents::gates::worktree_has_committed_work(&wt_dir), + "test setup: worktree should have committed work" + ); + + let pool = AgentPool::new_test(3001); + let (tx, _rx) = broadcast::channel(16); + pool.reconcile_on_startup(root, &tx).await; + + // In the test env, cargo clippy will fail (no Cargo.toml) so gates fail + // and the story stays in 2_current/. The important assertion is that + // reconcile ran without panicking and the story is in a consistent state. 
+ let in_current = current.join("61_story_test.md").exists(); + let in_qa = root.join(".story_kit/work/3_qa/61_story_test.md").exists(); + assert!( + in_current || in_qa, + "story should be in 2_current/ or 3_qa/ after reconciliation" + ); + } +} diff --git a/server/src/agents/pool/mod.rs b/server/src/agents/pool/mod.rs new file mode 100644 index 0000000..dce39cd --- /dev/null +++ b/server/src/agents/pool/mod.rs @@ -0,0 +1,2187 @@ +mod auto_assign; +mod pipeline; + +use crate::agent_log::AgentLogWriter; +use crate::config::ProjectConfig; +use crate::io::watcher::WatcherEvent; +use crate::slog; +use crate::slog_error; +use crate::worktree::{self, WorktreeInfo}; +use portable_pty::ChildKiller; +use std::collections::HashMap; +use std::path::{Path, PathBuf}; +use std::sync::{Arc, Mutex}; +use tokio::sync::broadcast; + +use super::{ + AgentEvent, AgentInfo, AgentStatus, CompletionReport, PipelineStage, agent_config_stage, + pipeline_stage, +}; + +/// Build the composite key used to track agents in the pool. +fn composite_key(story_id: &str, agent_name: &str) -> String { + format!("{story_id}:{agent_name}") +} + +/// RAII guard that removes a pending agent entry from the pool on drop. +/// +/// Created after inserting a `Pending` entry into the agent HashMap. +/// If `start_agent` succeeds (the agent process is spawned and status +/// transitions to `Running`), call [`disarm`](Self::disarm) to prevent +/// cleanup. If any intermediate step fails and the guard is dropped +/// without being disarmed, the pending entry is removed so it cannot +/// block future auto-assign dispatches. +struct PendingGuard { + agents: Arc>>, + key: String, + armed: bool, +} + +impl PendingGuard { + fn new(agents: Arc>>, key: String) -> Self { + Self { + agents, + key, + armed: true, + } + } + + /// Prevent the guard from cleaning up the entry (call after + /// successful spawn). 
+    fn disarm(&mut self) {
+        self.armed = false;
+    }
+}
+
+impl Drop for PendingGuard {
+    fn drop(&mut self) {
+        if self.armed
+            && let Ok(mut agents) = self.agents.lock()
+            && agents
+                .get(&self.key)
+                .is_some_and(|a| a.status == AgentStatus::Pending)
+        {
+            agents.remove(&self.key);
+            slog!(
+                "[agents] Cleaned up leaked Pending entry for '{}'",
+                self.key
+            );
+        }
+    }
+}
+
+struct StoryAgent {
+    agent_name: String,
+    status: AgentStatus,
+    worktree_info: Option<WorktreeInfo>,
+    session_id: Option<String>,
+    tx: broadcast::Sender<AgentEvent>,
+    task_handle: Option<tokio::task::JoinHandle<()>>,
+    /// Accumulated events for polling via get_agent_output.
+    event_log: Arc<Mutex<Vec<AgentEvent>>>,
+    /// Set when the agent calls report_completion.
+    completion: Option<CompletionReport>,
+    /// Project root, stored for pipeline advancement after completion.
+    project_root: Option<PathBuf>,
+    /// UUID identifying the log file for this session.
+    log_session_id: Option<String>,
+    /// Set to `true` when the agent calls `report_merge_failure`.
+    /// Prevents the pipeline from blindly advancing to `5_done/` after a
+    /// failed merge: the server-owned gate check runs in the feature-branch
+    /// worktree (which compiles fine) and returns `gates_passed=true` even
+    /// though the code was never squash-merged onto master.
+    merge_failure_reported: bool,
+}
+
+/// Build an `AgentInfo` snapshot from a `StoryAgent` map entry.
+fn agent_info_from_entry(story_id: &str, agent: &StoryAgent) -> AgentInfo {
+    AgentInfo {
+        story_id: story_id.to_string(),
+        agent_name: agent.agent_name.clone(),
+        status: agent.status.clone(),
+        session_id: agent.session_id.clone(),
+        worktree_path: agent
+            .worktree_info
+            .as_ref()
+            .map(|wt| wt.path.to_string_lossy().to_string()),
+        base_branch: agent
+            .worktree_info
+            .as_ref()
+            .map(|wt| wt.base_branch.clone()),
+        completion: agent.completion.clone(),
+        log_session_id: agent.log_session_id.clone(),
+    }
+}
+
+/// Manages concurrent story agents, each in its own worktree.
+pub struct AgentPool {
+    agents: Arc<Mutex<HashMap<String, StoryAgent>>>,
+    port: u16,
+    /// Registry of active PTY child process killers, keyed by "{story_id}:{agent_name}".
+    /// Used to terminate child processes on server shutdown or agent stop, preventing
+    /// orphaned Claude Code processes from running after the server exits.
+    child_killers: Arc<Mutex<HashMap<String, Box<dyn ChildKiller + Send + Sync>>>>,
+    /// Broadcast channel for notifying WebSocket clients of agent state changes.
+    /// When an agent transitions state (Pending, Running, Completed, Failed, Stopped),
+    /// an `AgentStateChanged` event is emitted so the frontend can refresh the
+    /// pipeline board without waiting for a filesystem event.
+    watcher_tx: broadcast::Sender<WatcherEvent>,
+    /// Tracks background merge jobs started by `merge_agent_work`, keyed by story_id.
+    /// The MCP tool returns immediately and the mergemaster agent polls
+    /// `get_merge_status` until the job reaches a terminal state.
+    merge_jobs: Arc<Mutex<HashMap<String, pipeline::MergeJob>>>,
+}
+
+impl AgentPool {
+    pub fn new(port: u16, watcher_tx: broadcast::Sender<WatcherEvent>) -> Self {
+        Self {
+            agents: Arc::new(Mutex::new(HashMap::new())),
+            port,
+            child_killers: Arc::new(Mutex::new(HashMap::new())),
+            watcher_tx,
+            merge_jobs: Arc::new(Mutex::new(HashMap::new())),
+        }
+    }
+
+    /// Create a pool with a dummy watcher channel for unit tests.
+    #[cfg(test)]
+    pub fn new_test(port: u16) -> Self {
+        let (watcher_tx, _) = broadcast::channel(16);
+        Self::new(port, watcher_tx)
+    }
+
+    /// Notify WebSocket clients that agent state has changed, so the pipeline
+    /// board and agent panel can refresh.
+    fn notify_agent_state_changed(watcher_tx: &broadcast::Sender<WatcherEvent>) {
+        let _ = watcher_tx.send(WatcherEvent::AgentStateChanged);
+    }
+
+    /// Kill all active PTY child processes.
+    ///
+    /// Called on server shutdown to prevent orphaned Claude Code processes from
+    /// continuing to run after the server exits. Each registered killer is called
+    /// once, then the registry is cleared.
+ pub fn kill_all_children(&self) { + if let Ok(mut killers) = self.child_killers.lock() { + for (key, killer) in killers.iter_mut() { + slog!("[agents] Killing child process for {key} on shutdown"); + let _ = killer.kill(); + } + killers.clear(); + } + } + + /// Kill and deregister the child process for a specific agent key. + /// + /// Used by `stop_agent` to ensure the PTY child is terminated even though + /// aborting a `spawn_blocking` task handle does not interrupt the blocking thread. + fn kill_child_for_key(&self, key: &str) { + if let Ok(mut killers) = self.child_killers.lock() + && let Some(mut killer) = killers.remove(key) + { + slog!("[agents] Killing child process for {key} on stop"); + let _ = killer.kill(); + } + } + + /// Start an agent for a story: load config, create worktree, spawn agent. + /// + /// When `agent_name` is `None`, automatically selects the first idle coder + /// agent (story 190). If all coders are busy the call fails with an error + /// indicating the story will be picked up when one becomes available. + /// + /// If `resume_context` is provided, it is appended to the rendered prompt + /// so the agent can pick up from a previous failed attempt. + pub async fn start_agent( + &self, + project_root: &Path, + story_id: &str, + agent_name: Option<&str>, + resume_context: Option<&str>, + ) -> Result { + let config = ProjectConfig::load(project_root)?; + + // Validate explicit agent name early (no lock needed). + if let Some(name) = agent_name { + config + .find_agent(name) + .ok_or_else(|| format!("No agent named '{name}' in config"))?; + } + + // Create name-independent shared resources before the lock so they are + // ready for the atomic check-and-insert (story 132). 
+ let (tx, _) = broadcast::channel::(1024); + let event_log: Arc>> = Arc::new(Mutex::new(Vec::new())); + let log_session_id = uuid::Uuid::new_v4().to_string(); + + // Move story from backlog/ to current/ before checking agent + // availability so that auto_assign_available_work can pick it up even + // when all coders are currently busy (story 203). This is idempotent: + // if the story is already in 2_current/ or a later stage, the call is + // a no-op. + super::lifecycle::move_story_to_current(project_root, story_id)?; + + // Validate that the agent's configured stage matches the story's + // pipeline stage. This prevents any caller (auto-assign, MCP tool, + // pipeline advance, supervisor) from starting a wrong-stage agent on + // a story — e.g. mergemaster on a coding-stage story (bug 312). + if let Some(name) = agent_name { + let agent_stage = config + .find_agent(name) + .map(agent_config_stage) + .unwrap_or_else(|| pipeline_stage(name)); + if agent_stage != PipelineStage::Other + && let Some(story_stage_dir) = find_active_story_stage(project_root, story_id) + { + let expected_stage = match story_stage_dir { + "2_current" => PipelineStage::Coder, + "3_qa" => PipelineStage::Qa, + "4_merge" => PipelineStage::Mergemaster, + _ => PipelineStage::Other, + }; + if expected_stage != PipelineStage::Other && expected_stage != agent_stage { + return Err(format!( + "Agent '{name}' (stage: {agent_stage:?}) cannot be assigned to \ + story '{story_id}' in {story_stage_dir}/ (requires stage: {expected_stage:?})" + )); + } + } + } + + // Atomically resolve agent name, check availability, and register as + // Pending. When `agent_name` is `None` the first idle coder is + // selected inside the lock so no TOCTOU race can occur between the + // availability check and the Pending insert (story 132, story 190). + // + // The `PendingGuard` ensures that if any step below fails the entry is + // removed from the pool so it does not permanently block auto-assign + // (bug 118). 
+ let resolved_name: String; + let key: String; + { + let mut agents = self.agents.lock().map_err(|e| e.to_string())?; + + resolved_name = match agent_name { + Some(name) => name.to_string(), + None => auto_assign::find_free_agent_for_stage(&config, &agents, &PipelineStage::Coder) + .map(|s| s.to_string()) + .ok_or_else(|| { + if config + .agent + .iter() + .any(|a| agent_config_stage(a) == PipelineStage::Coder) + { + format!( + "All coder agents are busy; story '{story_id}' has been \ + queued in work/2_current/ and will be auto-assigned when \ + one becomes available" + ) + } else { + "No coder agent configured. Specify an agent_name explicitly." + .to_string() + } + })?, + }; + + key = composite_key(story_id, &resolved_name); + + // Check for duplicate assignment (same story + same agent already active). + if let Some(agent) = agents.get(&key) + && (agent.status == AgentStatus::Running || agent.status == AgentStatus::Pending) + { + return Err(format!( + "Agent '{resolved_name}' for story '{story_id}' is already {}", + agent.status + )); + } + // Enforce single-stage concurrency: reject if there is already a + // Running/Pending agent at the same pipeline stage for this story. + // This prevents two coders (or two QA/mergemaster agents) from + // corrupting each other's work in the same worktree. + // Applies to both explicit and auto-selected agents; the Other + // stage (supervisors, unknown agents) is exempt. 
+ let resolved_stage = config + .find_agent(&resolved_name) + .map(agent_config_stage) + .unwrap_or_else(|| pipeline_stage(&resolved_name)); + if resolved_stage != PipelineStage::Other + && let Some(conflicting_name) = agents.iter().find_map(|(k, a)| { + let k_story = k.rsplit_once(':').map(|(s, _)| s).unwrap_or(k); + if k_story == story_id + && a.agent_name != resolved_name + && matches!(a.status, AgentStatus::Running | AgentStatus::Pending) + { + let a_stage = config + .find_agent(&a.agent_name) + .map(agent_config_stage) + .unwrap_or_else(|| pipeline_stage(&a.agent_name)); + if a_stage == resolved_stage { + Some(a.agent_name.clone()) + } else { + None + } + } else { + None + } + }) + { + return Err(format!( + "Cannot start '{resolved_name}' on story '{story_id}': \ + '{conflicting_name}' is already active at the same pipeline stage" + )); + } + // Enforce single-instance concurrency for explicitly-named agents: + // if this agent is already running on any other story, reject. + // Auto-selected agents are already guaranteed idle by + // find_free_agent_for_stage, so this check is only needed for + // explicit requests. 
+ if agent_name.is_some() + && let Some(busy_story) = agents.iter().find_map(|(k, a)| { + if a.agent_name == resolved_name + && matches!(a.status, AgentStatus::Running | AgentStatus::Pending) + { + Some( + k.rsplit_once(':') + .map(|(sid, _)| sid) + .unwrap_or(k) + .to_string(), + ) + } else { + None + } + }) + { + return Err(format!( + "Agent '{resolved_name}' is already running on story '{busy_story}'; \ + story '{story_id}' will be picked up when the agent becomes available" + )); + } + agents.insert( + key.clone(), + StoryAgent { + agent_name: resolved_name.clone(), + status: AgentStatus::Pending, + worktree_info: None, + session_id: None, + tx: tx.clone(), + task_handle: None, + event_log: event_log.clone(), + completion: None, + project_root: Some(project_root.to_path_buf()), + log_session_id: Some(log_session_id.clone()), + merge_failure_reported: false, + }, + ); + } + let mut pending_guard = PendingGuard::new(self.agents.clone(), key.clone()); + + // Create persistent log writer (needs resolved_name, so must be after + // the atomic resolution above). + let log_writer = + match AgentLogWriter::new(project_root, story_id, &resolved_name, &log_session_id) { + Ok(w) => Some(Arc::new(Mutex::new(w))), + Err(e) => { + eprintln!( + "[agents] Failed to create log writer for {story_id}:{resolved_name}: {e}" + ); + None + } + }; + + // Notify WebSocket clients that a new agent is pending. + Self::notify_agent_state_changed(&self.watcher_tx); + + let _ = tx.send(AgentEvent::Status { + story_id: story_id.to_string(), + agent_name: resolved_name.clone(), + status: "pending".to_string(), + }); + + // Extract inactivity timeout from the agent config before cloning config. + let inactivity_timeout_secs = config + .find_agent(&resolved_name) + .map(|a| a.inactivity_timeout_secs) + .unwrap_or(300); + + // Clone all values needed inside the background spawn. 
+ let project_root_clone = project_root.to_path_buf(); + let config_clone = config.clone(); + let resume_context_owned = resume_context.map(str::to_string); + let sid = story_id.to_string(); + let aname = resolved_name.clone(); + let tx_clone = tx.clone(); + let agents_ref = self.agents.clone(); + let key_clone = key.clone(); + let log_clone = event_log.clone(); + let port_for_task = self.port; + let log_writer_clone = log_writer.clone(); + let child_killers_clone = self.child_killers.clone(); + let watcher_tx_clone = self.watcher_tx.clone(); + + // Spawn the background task. Worktree creation and agent launch happen here + // so `start_agent` returns immediately after registering the agent as + // Pending — non-blocking by design (story 157). + let handle = tokio::spawn(async move { + // Step 1: create the worktree (slow — git checkout, pnpm install, etc.) + let wt_info = match worktree::create_worktree( + &project_root_clone, + &sid, + &config_clone, + port_for_task, + ) + .await + { + Ok(wt) => wt, + Err(e) => { + let error_msg = format!("Failed to create worktree: {e}"); + slog_error!("[agents] {error_msg}"); + let event = AgentEvent::Error { + story_id: sid.clone(), + agent_name: aname.clone(), + message: error_msg, + }; + if let Ok(mut log) = log_clone.lock() { + log.push(event.clone()); + } + let _ = tx_clone.send(event); + if let Ok(mut agents) = agents_ref.lock() + && let Some(agent) = agents.get_mut(&key_clone) + { + agent.status = AgentStatus::Failed; + } + Self::notify_agent_state_changed(&watcher_tx_clone); + return; + } + }; + + // Step 2: store worktree info and render agent command/args/prompt. 
+ let wt_path_str = wt_info.path.to_string_lossy().to_string(); + { + if let Ok(mut agents) = agents_ref.lock() + && let Some(agent) = agents.get_mut(&key_clone) + { + agent.worktree_info = Some(wt_info.clone()); + } + } + + let (command, args, mut prompt) = match config_clone.render_agent_args( + &wt_path_str, + &sid, + Some(&aname), + Some(&wt_info.base_branch), + ) { + Ok(result) => result, + Err(e) => { + let error_msg = format!("Failed to render agent args: {e}"); + slog_error!("[agents] {error_msg}"); + let event = AgentEvent::Error { + story_id: sid.clone(), + agent_name: aname.clone(), + message: error_msg, + }; + if let Ok(mut log) = log_clone.lock() { + log.push(event.clone()); + } + let _ = tx_clone.send(event); + if let Ok(mut agents) = agents_ref.lock() + && let Some(agent) = agents.get_mut(&key_clone) + { + agent.status = AgentStatus::Failed; + } + Self::notify_agent_state_changed(&watcher_tx_clone); + return; + } + }; + + // Append resume context if this is a restart with failure information. + if let Some(ctx) = resume_context_owned { + prompt.push_str(&ctx); + } + + // Step 3: transition to Running now that the worktree is ready. + { + if let Ok(mut agents) = agents_ref.lock() + && let Some(agent) = agents.get_mut(&key_clone) + { + agent.status = AgentStatus::Running; + } + } + let _ = tx_clone.send(AgentEvent::Status { + story_id: sid.clone(), + agent_name: aname.clone(), + status: "running".to_string(), + }); + Self::notify_agent_state_changed(&watcher_tx_clone); + + // Step 4: launch the agent process. + match super::pty::run_agent_pty_streaming( + &sid, + &aname, + &command, + &args, + &prompt, + &wt_path_str, + &tx_clone, + &log_clone, + log_writer_clone, + inactivity_timeout_secs, + child_killers_clone, + ) + .await + { + Ok(pty_result) => { + // Persist token usage if the agent reported it. 
+ if let Some(ref usage) = pty_result.token_usage + && let Ok(agents) = agents_ref.lock() + && let Some(agent) = agents.get(&key_clone) + && let Some(ref pr) = agent.project_root + { + let model = config_clone + .find_agent(&aname) + .and_then(|a| a.model.clone()); + let record = super::token_usage::build_record( + &sid, &aname, model, usage.clone(), + ); + if let Err(e) = super::token_usage::append_record(pr, &record) { + slog_error!( + "[agents] Failed to persist token usage for \ + {sid}:{aname}: {e}" + ); + } + } + + // Server-owned completion: run acceptance gates automatically + // when the agent process exits normally. + pipeline::run_server_owned_completion( + &agents_ref, + port_for_task, + &sid, + &aname, + pty_result.session_id, + watcher_tx_clone.clone(), + ) + .await; + Self::notify_agent_state_changed(&watcher_tx_clone); + } + Err(e) => { + slog_error!("[agents] Agent process error for {aname} on {sid}: {e}"); + let event = AgentEvent::Error { + story_id: sid.clone(), + agent_name: aname.clone(), + message: e, + }; + if let Ok(mut log) = log_clone.lock() { + log.push(event.clone()); + } + let _ = tx_clone.send(event); + if let Ok(mut agents) = agents_ref.lock() + && let Some(agent) = agents.get_mut(&key_clone) + { + agent.status = AgentStatus::Failed; + } + Self::notify_agent_state_changed(&watcher_tx_clone); + } + } + }); + + // Store the task handle while the agent is still Pending. + { + let mut agents = self.agents.lock().map_err(|e| e.to_string())?; + if let Some(agent) = agents.get_mut(&key) { + agent.task_handle = Some(handle); + } + } + + // Agent successfully spawned — prevent the guard from removing the entry. + pending_guard.disarm(); + + Ok(AgentInfo { + story_id: story_id.to_string(), + agent_name: resolved_name, + status: AgentStatus::Pending, + session_id: None, + worktree_path: None, + base_branch: None, + completion: None, + log_session_id: Some(log_session_id), + }) + } + + /// Stop a running agent. 
Worktree is preserved for inspection. + pub async fn stop_agent( + &self, + _project_root: &Path, + story_id: &str, + agent_name: &str, + ) -> Result<(), String> { + let key = composite_key(story_id, agent_name); + + let (worktree_info, task_handle, tx) = { + let mut agents = self.agents.lock().map_err(|e| e.to_string())?; + let agent = agents + .get_mut(&key) + .ok_or_else(|| format!("No agent '{agent_name}' for story '{story_id}'"))?; + + let wt = agent.worktree_info.clone(); + let handle = agent.task_handle.take(); + let tx = agent.tx.clone(); + agent.status = AgentStatus::Failed; + (wt, handle, tx) + }; + + // Abort the task and kill the PTY child process. + // Note: aborting a spawn_blocking task handle does not interrupt the blocking + // thread, so we must also kill the child process directly via the killer registry. + if let Some(handle) = task_handle { + handle.abort(); + let _ = handle.await; + } + self.kill_child_for_key(&key); + + // Preserve worktree for inspection — don't destroy agent's work on stop. + if let Some(ref wt) = worktree_info { + slog!( + "[agents] Worktree preserved for {story_id}:{agent_name}: {}", + wt.path.display() + ); + } + + let _ = tx.send(AgentEvent::Status { + story_id: story_id.to_string(), + agent_name: agent_name.to_string(), + status: "stopped".to_string(), + }); + + // Remove from map + { + let mut agents = self.agents.lock().map_err(|e| e.to_string())?; + agents.remove(&key); + } + + // Notify WebSocket clients so pipeline board and agent panel update. + Self::notify_agent_state_changed(&self.watcher_tx); + + Ok(()) + } + + /// Return the names of configured agents for `stage` that are not currently + /// running or pending. 
+ pub fn available_agents_for_stage( + &self, + config: &ProjectConfig, + stage: &PipelineStage, + ) -> Result, String> { + let agents = self.agents.lock().map_err(|e| e.to_string())?; + Ok(config + .agent + .iter() + .filter(|cfg| agent_config_stage(cfg) == *stage) + .filter(|cfg| { + !agents.values().any(|a| { + a.agent_name == cfg.name + && matches!(a.status, AgentStatus::Running | AgentStatus::Pending) + }) + }) + .map(|cfg| cfg.name.clone()) + .collect()) + } + + /// List all agents with their status. + pub fn list_agents(&self) -> Result, String> { + let agents = self.agents.lock().map_err(|e| e.to_string())?; + Ok(agents + .iter() + .map(|(key, agent)| { + // Extract story_id from composite key "story_id:agent_name" + let story_id = key + .rsplit_once(':') + .map(|(sid, _)| sid.to_string()) + .unwrap_or_else(|| key.clone()); + agent_info_from_entry(&story_id, agent) + }) + .collect()) + } + + /// Subscribe to events for a story agent. + pub fn subscribe( + &self, + story_id: &str, + agent_name: &str, + ) -> Result, String> { + let key = composite_key(story_id, agent_name); + let agents = self.agents.lock().map_err(|e| e.to_string())?; + let agent = agents + .get(&key) + .ok_or_else(|| format!("No agent '{agent_name}' for story '{story_id}'"))?; + Ok(agent.tx.subscribe()) + } + + /// Drain accumulated events for polling. Returns all events since the last drain. + pub fn drain_events( + &self, + story_id: &str, + agent_name: &str, + ) -> Result, String> { + let key = composite_key(story_id, agent_name); + let agents = self.agents.lock().map_err(|e| e.to_string())?; + let agent = agents + .get(&key) + .ok_or_else(|| format!("No agent '{agent_name}' for story '{story_id}'"))?; + let mut log = agent.event_log.lock().map_err(|e| e.to_string())?; + Ok(log.drain(..).collect()) + } + + /// Block until the agent reaches a terminal state (completed, failed, stopped). + /// Returns the agent's final `AgentInfo`. 
+ /// `timeout_ms` caps how long to wait; returns an error if the deadline passes. + pub async fn wait_for_agent( + &self, + story_id: &str, + agent_name: &str, + timeout_ms: u64, + ) -> Result { + // Subscribe before checking status so we don't miss the terminal event + // if the agent completes in the window between the two operations. + let mut rx = self.subscribe(story_id, agent_name)?; + + // Return immediately if already in a terminal state. + { + let agents = self.agents.lock().map_err(|e| e.to_string())?; + let key = composite_key(story_id, agent_name); + if let Some(agent) = agents.get(&key) + && matches!(agent.status, AgentStatus::Completed | AgentStatus::Failed) + { + return Ok(agent_info_from_entry(story_id, agent)); + } + } + + let deadline = tokio::time::Instant::now() + std::time::Duration::from_millis(timeout_ms); + + loop { + let remaining = deadline.saturating_duration_since(tokio::time::Instant::now()); + if remaining.is_zero() { + return Err(format!( + "Timed out after {timeout_ms}ms waiting for agent '{agent_name}' on story '{story_id}'" + )); + } + + match tokio::time::timeout(remaining, rx.recv()).await { + Ok(Ok(event)) => { + let is_terminal = match &event { + AgentEvent::Done { .. } | AgentEvent::Error { .. } => true, + AgentEvent::Status { status, .. } if status == "stopped" => true, + _ => false, + }; + if is_terminal { + let agents = self.agents.lock().map_err(|e| e.to_string())?; + let key = composite_key(story_id, agent_name); + return Ok(if let Some(agent) = agents.get(&key) { + agent_info_from_entry(story_id, agent) + } else { + // Agent was removed from map (e.g. stop_agent removes it after + // the "stopped" status event is sent). + let (status, session_id) = match event { + AgentEvent::Done { session_id, .. 
} => { + (AgentStatus::Completed, session_id) + } + _ => (AgentStatus::Failed, None), + }; + AgentInfo { + story_id: story_id.to_string(), + agent_name: agent_name.to_string(), + status, + session_id, + worktree_path: None, + base_branch: None, + completion: None, + log_session_id: None, + } + }); + } + } + Ok(Err(broadcast::error::RecvError::Lagged(_))) => { + // Missed some buffered events — check current status before resuming. + let agents = self.agents.lock().map_err(|e| e.to_string())?; + let key = composite_key(story_id, agent_name); + if let Some(agent) = agents.get(&key) + && matches!(agent.status, AgentStatus::Completed | AgentStatus::Failed) + { + return Ok(agent_info_from_entry(story_id, agent)); + } + // Still running — continue the loop. + } + Ok(Err(broadcast::error::RecvError::Closed)) => { + // Channel closed: no more events will arrive. Return current state. + let agents = self.agents.lock().map_err(|e| e.to_string())?; + let key = composite_key(story_id, agent_name); + if let Some(agent) = agents.get(&key) { + return Ok(agent_info_from_entry(story_id, agent)); + } + return Err(format!( + "Agent '{agent_name}' for story '{story_id}' channel closed unexpectedly" + )); + } + Err(_) => { + return Err(format!( + "Timed out after {timeout_ms}ms waiting for agent '{agent_name}' on story '{story_id}'" + )); + } + } + } + } + + /// Create a worktree for the given story using the server port (writes .mcp.json). + pub async fn create_worktree( + &self, + project_root: &Path, + story_id: &str, + ) -> Result { + let config = ProjectConfig::load(project_root)?; + worktree::create_worktree(project_root, story_id, &config, self.port).await + } + + /// Get project root helper. + pub fn get_project_root(&self, state: &crate::state::SessionState) -> Result { + state.get_project_root() + } + + /// Get the log session ID and project root for an agent, if available. + /// + /// Used by MCP tools to find the persistent log file for a completed agent. 
+ pub fn get_log_info(&self, story_id: &str, agent_name: &str) -> Option<(String, PathBuf)> { + let key = composite_key(story_id, agent_name); + let agents = self.agents.lock().ok()?; + let agent = agents.get(&key)?; + let session_id = agent.log_session_id.clone()?; + let project_root = agent.project_root.clone()?; + Some((session_id, project_root)) + } + + /// Remove all agent entries for a given story_id from the pool. + /// + /// Called when a story is archived so that stale entries don't accumulate. + /// Returns the number of entries removed. + pub fn remove_agents_for_story(&self, story_id: &str) -> usize { + let mut agents = match self.agents.lock() { + Ok(a) => a, + Err(e) => { + slog_error!("[agents] Failed to lock pool for cleanup of '{story_id}': {e}"); + return 0; + } + }; + let prefix = format!("{story_id}:"); + let keys_to_remove: Vec = agents + .keys() + .filter(|k| k.starts_with(&prefix)) + .cloned() + .collect(); + let count = keys_to_remove.len(); + for key in &keys_to_remove { + agents.remove(key); + } + if count > 0 { + slog!("[agents] Removed {count} agent entries for archived story '{story_id}'"); + } + count + } + + /// Test helper: inject a pre-built agent entry so unit tests can exercise + /// wait/subscribe logic without spawning a real process. 
+ #[cfg(test)] + pub fn inject_test_agent( + &self, + story_id: &str, + agent_name: &str, + status: AgentStatus, + ) -> broadcast::Sender { + let (tx, _) = broadcast::channel::(64); + let key = composite_key(story_id, agent_name); + let mut agents = self.agents.lock().unwrap(); + agents.insert( + key, + StoryAgent { + agent_name: agent_name.to_string(), + status, + worktree_info: None, + session_id: None, + tx: tx.clone(), + task_handle: None, + event_log: Arc::new(Mutex::new(Vec::new())), + completion: None, + project_root: None, + log_session_id: None, + merge_failure_reported: false, + }, + ); + tx + } + + /// Test helper: inject an agent with a specific worktree path for testing + /// gate-related logic. + #[cfg(test)] + pub fn inject_test_agent_with_path( + &self, + story_id: &str, + agent_name: &str, + status: AgentStatus, + worktree_path: PathBuf, + ) -> broadcast::Sender { + let (tx, _) = broadcast::channel::(64); + let key = composite_key(story_id, agent_name); + let mut agents = self.agents.lock().unwrap(); + agents.insert( + key, + StoryAgent { + agent_name: agent_name.to_string(), + status, + worktree_info: Some(WorktreeInfo { + path: worktree_path, + branch: format!("feature/story-{story_id}"), + base_branch: "master".to_string(), + }), + session_id: None, + tx: tx.clone(), + task_handle: None, + event_log: Arc::new(Mutex::new(Vec::new())), + completion: None, + project_root: None, + log_session_id: None, + merge_failure_reported: false, + }, + ); + tx + } + + /// Test helper: inject an agent with a completion report and project_root + /// for testing pipeline advance logic without spawning real agents. 
+ #[cfg(test)] + pub fn inject_test_agent_with_completion( + &self, + story_id: &str, + agent_name: &str, + status: AgentStatus, + project_root: PathBuf, + completion: CompletionReport, + ) -> broadcast::Sender { + let (tx, _) = broadcast::channel::(64); + let key = composite_key(story_id, agent_name); + let mut agents = self.agents.lock().unwrap(); + agents.insert( + key, + StoryAgent { + agent_name: agent_name.to_string(), + status, + worktree_info: None, + session_id: None, + tx: tx.clone(), + task_handle: None, + event_log: Arc::new(Mutex::new(Vec::new())), + completion: Some(completion), + project_root: Some(project_root), + log_session_id: None, + merge_failure_reported: false, + }, + ); + tx + } + + /// Inject a Running agent with a pre-built (possibly finished) task handle. + /// Used by watchdog tests to simulate an orphaned agent. + #[cfg(test)] + pub fn inject_test_agent_with_handle( + &self, + story_id: &str, + agent_name: &str, + status: AgentStatus, + task_handle: tokio::task::JoinHandle<()>, + ) -> broadcast::Sender { + let (tx, _) = broadcast::channel::(64); + let key = composite_key(story_id, agent_name); + let mut agents = self.agents.lock().unwrap(); + agents.insert( + key, + StoryAgent { + agent_name: agent_name.to_string(), + status, + worktree_info: None, + session_id: None, + tx: tx.clone(), + task_handle: Some(task_handle), + event_log: Arc::new(Mutex::new(Vec::new())), + completion: None, + project_root: None, + log_session_id: None, + merge_failure_reported: false, + }, + ); + tx + } + + /// Test helper: inject a child killer into the registry. + #[cfg(test)] + pub fn inject_child_killer(&self, key: &str, killer: Box) { + let mut killers = self.child_killers.lock().unwrap(); + killers.insert(key.to_string(), killer); + } + + /// Test helper: return the number of registered child killers. 
+ #[cfg(test)] + pub fn child_killer_count(&self) -> usize { + self.child_killers.lock().unwrap().len() + } +} + +/// Return the active pipeline stage directory name for `story_id`, or `None` if the +/// story is not in any active stage (`2_current/`, `3_qa/`, `4_merge/`). +fn find_active_story_stage(project_root: &Path, story_id: &str) -> Option<&'static str> { + const STAGES: [&str; 3] = ["2_current", "3_qa", "4_merge"]; + for stage in &STAGES { + let path = project_root + .join(".story_kit") + .join("work") + .join(stage) + .join(format!("{story_id}.md")); + if path.exists() { + return Some(stage); + } + } + None +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::agents::{AgentEvent, AgentStatus, PipelineStage}; + use crate::config::ProjectConfig; + use portable_pty::{CommandBuilder, PtySize, native_pty_system}; + + fn make_config(toml_str: &str) -> ProjectConfig { + ProjectConfig::parse(toml_str).unwrap() + } + + #[tokio::test] + async fn wait_for_agent_returns_immediately_if_completed() { + let pool = AgentPool::new_test(3001); + pool.inject_test_agent("s1", "bot", AgentStatus::Completed); + + let info = pool.wait_for_agent("s1", "bot", 1000).await.unwrap(); + assert_eq!(info.status, AgentStatus::Completed); + assert_eq!(info.story_id, "s1"); + assert_eq!(info.agent_name, "bot"); + } + + #[tokio::test] + async fn wait_for_agent_returns_immediately_if_failed() { + let pool = AgentPool::new_test(3001); + pool.inject_test_agent("s2", "bot", AgentStatus::Failed); + + let info = pool.wait_for_agent("s2", "bot", 1000).await.unwrap(); + assert_eq!(info.status, AgentStatus::Failed); + } + + #[tokio::test] + async fn wait_for_agent_completes_on_done_event() { + let pool = AgentPool::new_test(3001); + let tx = pool.inject_test_agent("s3", "bot", AgentStatus::Running); + + // Send Done event after a short delay + let tx_clone = tx.clone(); + tokio::spawn(async move { + tokio::time::sleep(std::time::Duration::from_millis(50)).await; + let _ = 
tx_clone.send(AgentEvent::Done { + story_id: "s3".to_string(), + agent_name: "bot".to_string(), + session_id: Some("sess-abc".to_string()), + }); + }); + + let info = pool.wait_for_agent("s3", "bot", 2000).await.unwrap(); + assert_eq!(info.story_id, "s3"); + } + + #[tokio::test] + async fn wait_for_agent_times_out() { + let pool = AgentPool::new_test(3001); + pool.inject_test_agent("s4", "bot", AgentStatus::Running); + + let result = pool.wait_for_agent("s4", "bot", 50).await; + assert!(result.is_err()); + let msg = result.unwrap_err(); + assert!(msg.contains("Timed out"), "unexpected message: {msg}"); + } + + #[tokio::test] + async fn wait_for_agent_errors_for_nonexistent() { + let pool = AgentPool::new_test(3001); + let result = pool.wait_for_agent("no_story", "no_bot", 100).await; + assert!(result.is_err()); + } + + #[tokio::test] + async fn wait_for_agent_completes_on_stopped_status_event() { + let pool = AgentPool::new_test(3001); + let tx = pool.inject_test_agent("s5", "bot", AgentStatus::Running); + + let tx_clone = tx.clone(); + tokio::spawn(async move { + tokio::time::sleep(std::time::Duration::from_millis(30)).await; + let _ = tx_clone.send(AgentEvent::Status { + story_id: "s5".to_string(), + agent_name: "bot".to_string(), + status: "stopped".to_string(), + }); + }); + + let info = pool.wait_for_agent("s5", "bot", 2000).await.unwrap(); + assert_eq!(info.story_id, "s5"); + } + + // ── kill_all_children tests ──────────────────────────────────── + + /// Returns true if a process with the given PID is currently running. 
+ fn process_is_running(pid: u32) -> bool { + std::process::Command::new("ps") + .arg("-p") + .arg(pid.to_string()) + .stdout(std::process::Stdio::null()) + .stderr(std::process::Stdio::null()) + .status() + .map(|s| s.success()) + .unwrap_or(false) + } + + #[test] + fn kill_all_children_is_safe_on_empty_pool() { + let pool = AgentPool::new_test(3001); + pool.kill_all_children(); + assert_eq!(pool.child_killer_count(), 0); + } + + #[test] + fn kill_all_children_kills_real_process() { + let pool = AgentPool::new_test(3001); + + let pty_system = native_pty_system(); + let pair = pty_system + .openpty(PtySize { + rows: 24, + cols: 80, + pixel_width: 0, + pixel_height: 0, + }) + .expect("failed to open pty"); + + let mut cmd = CommandBuilder::new("sleep"); + cmd.arg("100"); + let mut child = pair + .slave + .spawn_command(cmd) + .expect("failed to spawn sleep"); + let pid = child.process_id().expect("no pid"); + + pool.inject_child_killer("story:agent", child.clone_killer()); + + assert!( + process_is_running(pid), + "process {pid} should be running before kill_all_children" + ); + + pool.kill_all_children(); + let _ = child.wait(); + + assert!( + !process_is_running(pid), + "process {pid} should have been killed by kill_all_children" + ); + } + + #[test] + fn kill_all_children_clears_registry() { + let pool = AgentPool::new_test(3001); + + let pty_system = native_pty_system(); + let pair = pty_system + .openpty(PtySize { + rows: 24, + cols: 80, + pixel_width: 0, + pixel_height: 0, + }) + .expect("failed to open pty"); + + let mut cmd = CommandBuilder::new("sleep"); + cmd.arg("1"); + let mut child = pair + .slave + .spawn_command(cmd) + .expect("failed to spawn sleep"); + + pool.inject_child_killer("story:agent", child.clone_killer()); + assert_eq!(pool.child_killer_count(), 1); + + pool.kill_all_children(); + let _ = child.wait(); + + assert_eq!( + pool.child_killer_count(), + 0, + "child_killers should be cleared after kill_all_children" + ); + } + + // ── 
available_agents_for_stage tests (story 190) ────────────────────────── + + #[test] + fn available_agents_for_stage_returns_idle_agents() { + let config = make_config( + r#" +[[agent]] +name = "coder-1" +stage = "coder" + +[[agent]] +name = "coder-2" +stage = "coder" + +[[agent]] +name = "qa" +stage = "qa" +"#, + ); + let pool = AgentPool::new_test(3001); + pool.inject_test_agent("story-1", "coder-1", AgentStatus::Running); + + let available = pool + .available_agents_for_stage(&config, &PipelineStage::Coder) + .unwrap(); + assert_eq!(available, vec!["coder-2"]); + + let available_qa = pool + .available_agents_for_stage(&config, &PipelineStage::Qa) + .unwrap(); + assert_eq!(available_qa, vec!["qa"]); + } + + #[test] + fn available_agents_for_stage_returns_empty_when_all_busy() { + let config = make_config( + r#" +[[agent]] +name = "coder-1" +stage = "coder" +"#, + ); + let pool = AgentPool::new_test(3001); + pool.inject_test_agent("story-1", "coder-1", AgentStatus::Running); + + let available = pool + .available_agents_for_stage(&config, &PipelineStage::Coder) + .unwrap(); + assert!(available.is_empty()); + } + + #[test] + fn available_agents_for_stage_ignores_completed_agents() { + let config = make_config( + r#" +[[agent]] +name = "coder-1" +stage = "coder" +"#, + ); + let pool = AgentPool::new_test(3001); + pool.inject_test_agent("story-1", "coder-1", AgentStatus::Completed); + + let available = pool + .available_agents_for_stage(&config, &PipelineStage::Coder) + .unwrap(); + assert_eq!(available, vec!["coder-1"]); + } + + #[tokio::test] + async fn start_agent_auto_selects_second_coder_when_first_busy() { + let tmp = tempfile::tempdir().unwrap(); + let sk = tmp.path().join(".story_kit"); + std::fs::create_dir_all(&sk).unwrap(); + std::fs::write( + sk.join("project.toml"), + r#" +[[agent]] +name = "supervisor" +stage = "other" + +[[agent]] +name = "coder-1" +stage = "coder" + +[[agent]] +name = "coder-2" +stage = "coder" +"#, + ) + .unwrap(); + + let pool = 
AgentPool::new_test(3001); + pool.inject_test_agent("other-story", "coder-1", AgentStatus::Running); + + let result = pool + .start_agent(tmp.path(), "42_my_story", None, None) + .await; + match result { + Ok(info) => { + assert_eq!(info.agent_name, "coder-2"); + } + Err(err) => { + assert!( + !err.contains("All coder agents are busy"), + "should have selected coder-2 but got: {err}" + ); + assert!( + !err.contains("No coder agent configured"), + "should not fail on agent selection, got: {err}" + ); + } + } + } + + #[tokio::test] + async fn start_agent_returns_busy_when_all_coders_occupied() { + let tmp = tempfile::tempdir().unwrap(); + let sk = tmp.path().join(".story_kit"); + std::fs::create_dir_all(&sk).unwrap(); + std::fs::write( + sk.join("project.toml"), + r#" +[[agent]] +name = "coder-1" +stage = "coder" + +[[agent]] +name = "coder-2" +stage = "coder" +"#, + ) + .unwrap(); + + let pool = AgentPool::new_test(3001); + pool.inject_test_agent("story-1", "coder-1", AgentStatus::Running); + pool.inject_test_agent("story-2", "coder-2", AgentStatus::Pending); + + let result = pool.start_agent(tmp.path(), "story-3", None, None).await; + assert!(result.is_err()); + let err = result.unwrap_err(); + assert!( + err.contains("All coder agents are busy"), + "expected busy error, got: {err}" + ); + } + + #[tokio::test] + async fn start_agent_moves_story_to_current_when_coders_busy() { + let tmp = tempfile::tempdir().unwrap(); + let sk = tmp.path().join(".story_kit"); + let backlog = sk.join("work/1_backlog"); + std::fs::create_dir_all(&backlog).unwrap(); + std::fs::write( + sk.join("project.toml"), + r#" +[[agent]] +name = "coder-1" +stage = "coder" +"#, + ) + .unwrap(); + std::fs::write(backlog.join("story-3.md"), "---\nname: Story 3\n---\n").unwrap(); + + let pool = AgentPool::new_test(3001); + pool.inject_test_agent("story-1", "coder-1", AgentStatus::Running); + + let result = pool.start_agent(tmp.path(), "story-3", None, None).await; + + assert!(result.is_err()); + let 
err = result.unwrap_err();
+        assert!(
+            err.contains("All coder agents are busy"),
+            "expected busy error, got: {err}"
+        );
+        assert!(
+            err.contains("queued in work/2_current/"),
+            "expected story-to-current message, got: {err}"
+        );
+
+        let current_path = sk.join("work/2_current/story-3.md");
+        assert!(
+            current_path.exists(),
+            "story should be in 2_current/ after busy error, but was not"
+        );
+        let backlog_path = backlog.join("story-3.md");
+        assert!(
+            !backlog_path.exists(),
+            "story should no longer be in 1_backlog/"
+        );
+    }
+
+    #[tokio::test]
+    async fn start_agent_story_already_in_current_is_noop() {
+        let tmp = tempfile::tempdir().unwrap();
+        let sk = tmp.path().join(".story_kit");
+        let current = sk.join("work/2_current");
+        std::fs::create_dir_all(&current).unwrap();
+        std::fs::write(
+            sk.join("project.toml"),
+            "[[agent]]\nname = \"coder-1\"\nstage = \"coder\"\n",
+        )
+        .unwrap();
+        std::fs::write(current.join("story-5.md"), "---\nname: Story 5\n---\n").unwrap();
+
+        let pool = AgentPool::new_test(3001);
+
+        let result = pool.start_agent(tmp.path(), "story-5", None, None).await;
+        match result {
+            Ok(_) => {}
+            Err(e) => {
+                assert!(
+                    !e.contains("Failed to move"),
+                    "should not fail on idempotent move, got: {e}"
+                );
+            }
+        }
+    }
+
+    #[tokio::test]
+    async fn start_agent_explicit_name_unchanged_when_busy() {
+        let tmp = tempfile::tempdir().unwrap();
+        let sk = tmp.path().join(".story_kit");
+        std::fs::create_dir_all(&sk).unwrap();
+        std::fs::write(
+            sk.join("project.toml"),
+            r#"
+[[agent]]
+name = "coder-1"
+stage = "coder"
+
+[[agent]]
+name = "coder-2"
+stage = "coder"
+"#,
+        )
+        .unwrap();
+
+        let pool = AgentPool::new_test(3001);
+        pool.inject_test_agent("story-1", "coder-1", AgentStatus::Running);
+
+        let result = pool
+            .start_agent(tmp.path(), "story-2", Some("coder-1"), None)
+            .await;
+        assert!(result.is_err());
+        let err = result.unwrap_err();
+        assert!(
+            err.contains("coder-1") && err.contains("already running"),
+            "expected explicit busy 
error, got: {err}" + ); + } + + // ── start_agent single-instance concurrency tests ───────────────────────── + + #[tokio::test] + async fn start_agent_rejects_when_same_agent_already_running_on_another_story() { + use std::fs; + + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + + let sk_dir = root.join(".story_kit"); + fs::create_dir_all(&sk_dir).unwrap(); + fs::write(sk_dir.join("project.toml"), "[[agent]]\nname = \"qa\"\n").unwrap(); + + let pool = AgentPool::new_test(3001); + pool.inject_test_agent("story-a", "qa", AgentStatus::Running); + + let result = pool.start_agent(root, "story-b", Some("qa"), None).await; + + assert!( + result.is_err(), + "start_agent should fail when qa is already running on another story" + ); + let err = result.unwrap_err(); + assert!( + err.contains("already running") || err.contains("becomes available"), + "error message should explain why: got '{err}'" + ); + } + + #[tokio::test] + async fn start_agent_allows_new_story_when_previous_run_is_completed() { + use std::fs; + + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + + let sk_dir = root.join(".story_kit"); + fs::create_dir_all(&sk_dir).unwrap(); + fs::write(sk_dir.join("project.toml"), "[[agent]]\nname = \"qa\"\n").unwrap(); + + let pool = AgentPool::new_test(3001); + pool.inject_test_agent("story-a", "qa", AgentStatus::Completed); + + let result = pool.start_agent(root, "story-b", Some("qa"), None).await; + + if let Err(ref e) = result { + assert!( + !e.contains("already running") && !e.contains("becomes available"), + "completed agent must not trigger the concurrency guard: got '{e}'" + ); + } + } + + // ── bug 118: pending entry cleanup on start_agent failure ──────────────── + + #[tokio::test] + async fn start_agent_cleans_up_pending_entry_on_failure() { + use std::fs; + + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + + let sk_dir = root.join(".story_kit"); + fs::create_dir_all(&sk_dir).unwrap(); + fs::write( + 
sk_dir.join("project.toml"), + "[[agent]]\nname = \"coder-1\"\nstage = \"coder\"\n", + ) + .unwrap(); + + let upcoming = root.join(".story_kit/work/1_backlog"); + fs::create_dir_all(&upcoming).unwrap(); + fs::write(upcoming.join("50_story_test.md"), "---\nname: Test\n---\n").unwrap(); + + let pool = AgentPool::new_test(3099); + + let result = pool + .start_agent(root, "50_story_test", Some("coder-1"), None) + .await; + + assert!( + result.is_ok(), + "start_agent should return Ok(Pending) immediately: {:?}", + result.err() + ); + assert_eq!( + result.unwrap().status, + AgentStatus::Pending, + "initial status must be Pending" + ); + + let final_info = pool + .wait_for_agent("50_story_test", "coder-1", 5000) + .await + .expect("wait_for_agent should not time out"); + assert_eq!( + final_info.status, + AgentStatus::Failed, + "agent must transition to Failed after worktree creation error" + ); + + let agents = pool.agents.lock().unwrap(); + let failed_entry = agents + .values() + .find(|a| a.agent_name == "coder-1" && a.status == AgentStatus::Failed); + assert!( + failed_entry.is_some(), + "agent pool must retain a Failed entry so the UI can show the error state" + ); + drop(agents); + + let events = pool + .drain_events("50_story_test", "coder-1") + .expect("drain_events should succeed"); + let has_error_event = events.iter().any(|e| matches!(e, AgentEvent::Error { .. 
})); + assert!( + has_error_event, + "event_log must contain AgentEvent::Error after worktree creation fails" + ); + } + + #[tokio::test] + async fn start_agent_guard_does_not_remove_running_entry() { + use std::fs; + + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + + let sk_dir = root.join(".story_kit"); + fs::create_dir_all(&sk_dir).unwrap(); + fs::write(sk_dir.join("project.toml"), "[[agent]]\nname = \"qa\"\n").unwrap(); + + let pool = AgentPool::new_test(3099); + pool.inject_test_agent("story-x", "qa", AgentStatus::Running); + + let result = pool.start_agent(root, "story-y", Some("qa"), None).await; + + assert!(result.is_err()); + let err = result.unwrap_err(); + assert!( + err.contains("already running") || err.contains("becomes available"), + "running entry must survive: got '{err}'" + ); + } + + // ── TOCTOU race-condition regression tests (story 132) ─────────────────── + + #[tokio::test] + async fn toctou_pending_entry_blocks_same_agent_on_different_story() { + use std::fs; + + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + + let sk_dir = root.join(".story_kit"); + fs::create_dir_all(&sk_dir).unwrap(); + fs::write( + sk_dir.join("project.toml"), + "[[agent]]\nname = \"coder-1\"\n", + ) + .unwrap(); + + let pool = AgentPool::new_test(3099); + pool.inject_test_agent("86_story_foo", "coder-1", AgentStatus::Pending); + + let result = pool + .start_agent(root, "130_story_bar", Some("coder-1"), None) + .await; + + assert!(result.is_err(), "second start_agent must be rejected"); + let err = result.unwrap_err(); + assert!( + err.contains("already running") || err.contains("becomes available"), + "expected concurrency-rejection message, got: '{err}'" + ); + } + + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + async fn toctou_concurrent_start_agent_same_agent_exactly_one_concurrency_rejection() { + use std::fs; + use std::sync::Arc; + + let tmp = tempfile::tempdir().unwrap(); + let root = 
tmp.path().to_path_buf(); + + let sk_dir = root.join(".story_kit"); + fs::create_dir_all(sk_dir.join("work/1_backlog")).unwrap(); + fs::write( + root.join(".story_kit/project.toml"), + "[[agent]]\nname = \"coder-1\"\n", + ) + .unwrap(); + fs::write( + root.join(".story_kit/work/1_backlog/86_story_foo.md"), + "---\nname: Foo\n---\n", + ) + .unwrap(); + fs::write( + root.join(".story_kit/work/1_backlog/130_story_bar.md"), + "---\nname: Bar\n---\n", + ) + .unwrap(); + + let pool = Arc::new(AgentPool::new_test(3099)); + + let pool1 = pool.clone(); + let root1 = root.clone(); + let t1 = tokio::spawn(async move { + pool1 + .start_agent(&root1, "86_story_foo", Some("coder-1"), None) + .await + }); + + let pool2 = pool.clone(); + let root2 = root.clone(); + let t2 = tokio::spawn(async move { + pool2 + .start_agent(&root2, "130_story_bar", Some("coder-1"), None) + .await + }); + + let (r1, r2) = tokio::join!(t1, t2); + let r1 = r1.unwrap(); + let r2 = r2.unwrap(); + + let concurrency_rejections = [&r1, &r2] + .iter() + .filter(|r| { + r.as_ref().is_err_and(|e| { + e.contains("already running") || e.contains("becomes available") + }) + }) + .count(); + + assert_eq!( + concurrency_rejections, 1, + "exactly one call must be rejected by the concurrency check; \ + got r1={r1:?} r2={r2:?}" + ); + } + + // ── story-230: prevent duplicate stage agents on same story ─────────────── + + #[tokio::test] + async fn start_agent_rejects_second_coder_stage_on_same_story() { + use std::fs; + + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + + let sk_dir = root.join(".story_kit"); + fs::create_dir_all(&sk_dir).unwrap(); + fs::write( + sk_dir.join("project.toml"), + "[[agent]]\nname = \"coder-1\"\n\n[[agent]]\nname = \"coder-2\"\n", + ) + .unwrap(); + + let pool = AgentPool::new_test(3099); + pool.inject_test_agent("42_story_foo", "coder-1", AgentStatus::Running); + + let result = pool + .start_agent(root, "42_story_foo", Some("coder-2"), None) + .await; + + assert!( + 
result.is_err(), + "second coder on same story must be rejected" + ); + let err = result.unwrap_err(); + assert!( + err.contains("same pipeline stage"), + "error must mention same pipeline stage, got: '{err}'" + ); + assert!( + err.contains("coder-1") && err.contains("coder-2"), + "error must name both agents, got: '{err}'" + ); + } + + #[tokio::test] + async fn start_agent_rejects_second_qa_stage_on_same_story() { + use std::fs; + + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + + let sk_dir = root.join(".story_kit"); + fs::create_dir_all(&sk_dir).unwrap(); + fs::write( + sk_dir.join("project.toml"), + "[[agent]]\nname = \"qa-1\"\nstage = \"qa\"\n\n\ + [[agent]]\nname = \"qa-2\"\nstage = \"qa\"\n", + ) + .unwrap(); + + let pool = AgentPool::new_test(3099); + pool.inject_test_agent("55_story_bar", "qa-1", AgentStatus::Running); + + let result = pool + .start_agent(root, "55_story_bar", Some("qa-2"), None) + .await; + + assert!(result.is_err(), "second qa on same story must be rejected"); + let err = result.unwrap_err(); + assert!( + err.contains("same pipeline stage"), + "error must mention same pipeline stage, got: '{err}'" + ); + } + + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + async fn start_agent_concurrent_two_coders_same_story_exactly_one_stage_rejection() { + use std::fs; + use std::sync::Arc; + + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path().to_path_buf(); + + let sk_dir = root.join(".story_kit"); + fs::create_dir_all(sk_dir.join("work/2_current")).unwrap(); + fs::write( + root.join(".story_kit/project.toml"), + "[[agent]]\nname = \"coder-1\"\n\n[[agent]]\nname = \"coder-2\"\n", + ) + .unwrap(); + fs::write( + root.join(".story_kit/work/2_current/42_story_foo.md"), + "---\nname: Foo\n---\n", + ) + .unwrap(); + + let pool = Arc::new(AgentPool::new_test(3099)); + + let pool1 = pool.clone(); + let root1 = root.clone(); + let t1 = tokio::spawn(async move { + pool1 + .start_agent(&root1, "42_story_foo", 
Some("coder-1"), None) + .await + }); + + let pool2 = pool.clone(); + let root2 = root.clone(); + let t2 = tokio::spawn(async move { + pool2 + .start_agent(&root2, "42_story_foo", Some("coder-2"), None) + .await + }); + + let (r1, r2) = tokio::join!(t1, t2); + let r1 = r1.unwrap(); + let r2 = r2.unwrap(); + + let stage_rejections = [&r1, &r2] + .iter() + .filter(|r| r.as_ref().is_err_and(|e| e.contains("same pipeline stage"))) + .count(); + + assert_eq!( + stage_rejections, 1, + "exactly one call must be rejected by the stage-conflict check; \ + got r1={r1:?} r2={r2:?}" + ); + } + + #[tokio::test] + async fn start_agent_two_coders_different_stories_not_blocked_by_stage_check() { + use std::fs; + + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + + let sk_dir = root.join(".story_kit"); + fs::create_dir_all(sk_dir.join("work/1_backlog")).unwrap(); + fs::write( + root.join(".story_kit/project.toml"), + "[[agent]]\nname = \"coder-1\"\n\n[[agent]]\nname = \"coder-2\"\n", + ) + .unwrap(); + fs::write( + root.join(".story_kit/work/1_backlog/99_story_baz.md"), + "---\nname: Baz\n---\n", + ) + .unwrap(); + + let pool = AgentPool::new_test(3099); + pool.inject_test_agent("42_story_foo", "coder-1", AgentStatus::Running); + + let result = pool + .start_agent(root, "99_story_baz", Some("coder-2"), None) + .await; + + if let Err(ref e) = result { + assert!( + !e.contains("same pipeline stage"), + "stage-conflict guard must not fire for agents on different stories; \ + got: '{e}'" + ); + } + } + + // ── bug 312: stage-pipeline mismatch guard in start_agent ────────────── + + #[tokio::test] + async fn start_agent_rejects_mergemaster_on_coding_stage_story() { + use std::fs; + + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + + let sk_dir = root.join(".story_kit"); + fs::create_dir_all(sk_dir.join("work/2_current")).unwrap(); + fs::write( + sk_dir.join("project.toml"), + "[[agent]]\nname = \"coder-1\"\nstage = \"coder\"\n\n\ + [[agent]]\nname = 
\"mergemaster\"\nstage = \"mergemaster\"\n", + ) + .unwrap(); + fs::write( + sk_dir.join("work/2_current/310_story_foo.md"), + "---\nname: Foo\n---\n", + ) + .unwrap(); + + let pool = AgentPool::new_test(3099); + let result = pool + .start_agent(root, "310_story_foo", Some("mergemaster"), None) + .await; + + assert!( + result.is_err(), + "mergemaster must not be assigned to a story in 2_current/" + ); + let err = result.unwrap_err(); + assert!( + err.contains("stage") && err.contains("2_current"), + "error must mention stage mismatch, got: '{err}'" + ); + } + + #[tokio::test] + async fn start_agent_rejects_coder_on_qa_stage_story() { + use std::fs; + + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + + let sk_dir = root.join(".story_kit"); + fs::create_dir_all(sk_dir.join("work/3_qa")).unwrap(); + fs::write( + sk_dir.join("project.toml"), + "[[agent]]\nname = \"coder-1\"\nstage = \"coder\"\n\n\ + [[agent]]\nname = \"qa\"\nstage = \"qa\"\n", + ) + .unwrap(); + fs::write( + sk_dir.join("work/3_qa/42_story_bar.md"), + "---\nname: Bar\n---\n", + ) + .unwrap(); + + let pool = AgentPool::new_test(3099); + let result = pool + .start_agent(root, "42_story_bar", Some("coder-1"), None) + .await; + + assert!( + result.is_err(), + "coder must not be assigned to a story in 3_qa/" + ); + let err = result.unwrap_err(); + assert!( + err.contains("stage") && err.contains("3_qa"), + "error must mention stage mismatch, got: '{err}'" + ); + } + + #[tokio::test] + async fn start_agent_rejects_qa_on_merge_stage_story() { + use std::fs; + + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + + let sk_dir = root.join(".story_kit"); + fs::create_dir_all(sk_dir.join("work/4_merge")).unwrap(); + fs::write( + sk_dir.join("project.toml"), + "[[agent]]\nname = \"qa\"\nstage = \"qa\"\n\n\ + [[agent]]\nname = \"mergemaster\"\nstage = \"mergemaster\"\n", + ) + .unwrap(); + fs::write( + sk_dir.join("work/4_merge/55_story_baz.md"), + "---\nname: Baz\n---\n", + ) + 
.unwrap(); + + let pool = AgentPool::new_test(3099); + let result = pool + .start_agent(root, "55_story_baz", Some("qa"), None) + .await; + + assert!( + result.is_err(), + "qa must not be assigned to a story in 4_merge/" + ); + let err = result.unwrap_err(); + assert!( + err.contains("stage") && err.contains("4_merge"), + "error must mention stage mismatch, got: '{err}'" + ); + } + + #[tokio::test] + async fn start_agent_allows_supervisor_on_any_stage() { + use std::fs; + + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + + let sk_dir = root.join(".story_kit"); + fs::create_dir_all(sk_dir.join("work/2_current")).unwrap(); + fs::write( + sk_dir.join("project.toml"), + "[[agent]]\nname = \"supervisor\"\nstage = \"other\"\n", + ) + .unwrap(); + fs::write( + sk_dir.join("work/2_current/77_story_sup.md"), + "---\nname: Sup\n---\n", + ) + .unwrap(); + + let pool = AgentPool::new_test(3099); + let result = pool + .start_agent(root, "77_story_sup", Some("supervisor"), None) + .await; + + match result { + Ok(_) => {} + Err(e) => { + assert!( + !e.contains("stage:") || !e.contains("cannot be assigned"), + "supervisor should not be rejected for stage mismatch, got: '{e}'" + ); + } + } + } + + #[tokio::test] + async fn start_agent_allows_correct_stage_agent() { + use std::fs; + + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + + let sk_dir = root.join(".story_kit"); + fs::create_dir_all(sk_dir.join("work/4_merge")).unwrap(); + fs::write( + sk_dir.join("project.toml"), + "[[agent]]\nname = \"mergemaster\"\nstage = \"mergemaster\"\n", + ) + .unwrap(); + fs::write( + sk_dir.join("work/4_merge/88_story_ok.md"), + "---\nname: OK\n---\n", + ) + .unwrap(); + + let pool = AgentPool::new_test(3099); + let result = pool + .start_agent(root, "88_story_ok", Some("mergemaster"), None) + .await; + + match result { + Ok(_) => {} + Err(e) => { + assert!( + !e.contains("cannot be assigned"), + "mergemaster on 4_merge/ story should not fail stage check, got: 
'{e}'" + ); + } + } + } + + // ── find_active_story_stage tests ───────────────────────────────────────── + + #[test] + fn find_active_story_stage_detects_current() { + use std::fs; + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + let current = root.join(".story_kit/work/2_current"); + fs::create_dir_all(¤t).unwrap(); + fs::write(current.join("10_story_test.md"), "test").unwrap(); + + assert_eq!( + find_active_story_stage(root, "10_story_test"), + Some("2_current") + ); + } + + #[test] + fn find_active_story_stage_detects_qa() { + use std::fs; + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + let qa = root.join(".story_kit/work/3_qa"); + fs::create_dir_all(&qa).unwrap(); + fs::write(qa.join("11_story_test.md"), "test").unwrap(); + + assert_eq!(find_active_story_stage(root, "11_story_test"), Some("3_qa")); + } + + #[test] + fn find_active_story_stage_detects_merge() { + use std::fs; + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + let merge = root.join(".story_kit/work/4_merge"); + fs::create_dir_all(&merge).unwrap(); + fs::write(merge.join("12_story_test.md"), "test").unwrap(); + + assert_eq!( + find_active_story_stage(root, "12_story_test"), + Some("4_merge") + ); + } + + #[test] + fn find_active_story_stage_returns_none_for_unknown_story() { + let tmp = tempfile::tempdir().unwrap(); + assert_eq!(find_active_story_stage(tmp.path(), "99_nonexistent"), None); + } + + // ── remove_agents_for_story tests ──────────────────────────────────────── + + #[test] + fn remove_agents_for_story_removes_all_entries() { + let pool = AgentPool::new_test(3001); + pool.inject_test_agent("story_a", "coder-1", AgentStatus::Completed); + pool.inject_test_agent("story_a", "qa", AgentStatus::Failed); + pool.inject_test_agent("story_b", "coder-1", AgentStatus::Running); + + let removed = pool.remove_agents_for_story("story_a"); + assert_eq!(removed, 2, "should remove both agents for story_a"); + + let agents = 
pool.list_agents().unwrap();
+        assert_eq!(agents.len(), 1, "only story_b agent should remain");
+        assert_eq!(agents[0].story_id, "story_b");
+    }
+
+    #[test]
+    fn remove_agents_for_story_returns_zero_when_no_match() {
+        let pool = AgentPool::new_test(3001);
+        pool.inject_test_agent("story_a", "coder-1", AgentStatus::Running);
+
+        let removed = pool.remove_agents_for_story("nonexistent");
+        assert_eq!(removed, 0);
+
+        let agents = pool.list_agents().unwrap();
+        assert_eq!(agents.len(), 1, "existing agents should not be affected");
+    }
+
+    // ── archive + cleanup integration test ───────────────────────────────────
+
+    #[tokio::test]
+    async fn archiving_story_removes_agent_entries_from_pool() {
+        use crate::agents::lifecycle::move_story_to_archived;
+        use std::fs;
+
+        let tmp = tempfile::tempdir().unwrap();
+        let root = tmp.path();
+
+        let current = root.join(".story_kit/work/2_current");
+        fs::create_dir_all(&current).unwrap();
+        fs::write(current.join("60_story_cleanup.md"), "test").unwrap();
+
+        let pool = AgentPool::new_test(3001);
+        pool.inject_test_agent("60_story_cleanup", "coder-1", AgentStatus::Completed);
+        pool.inject_test_agent("60_story_cleanup", "qa", AgentStatus::Completed);
+        pool.inject_test_agent("61_story_other", "coder-1", AgentStatus::Running);
+
+        assert_eq!(pool.list_agents().unwrap().len(), 3);
+
+        move_story_to_archived(root, "60_story_cleanup").unwrap();
+        pool.remove_agents_for_story("60_story_cleanup");
+
+        let remaining = pool.list_agents().unwrap();
+        assert_eq!(
+            remaining.len(),
+            1,
+            "only the other story's agent should remain"
+        );
+        assert_eq!(remaining[0].story_id, "61_story_other");
+
+        assert!(
+            root.join(".story_kit/work/5_done/60_story_cleanup.md")
+                .exists()
+        );
+    }
+}
diff --git a/server/src/agents/pool/pipeline.rs b/server/src/agents/pool/pipeline.rs
new file mode 100644
index 0000000..fa77a19
--- /dev/null
+++ b/server/src/agents/pool/pipeline.rs
@@ -0,0 +1,1771 @@
+use crate::config::ProjectConfig;
+use crate::slog;
+use 
crate::slog_error;
+use crate::slog_warn;
+use crate::worktree;
+use crate::io::watcher::WatcherEvent;
+use std::collections::HashMap;
+use std::path::{Path, PathBuf};
+use std::sync::{Arc, Mutex};
+use tokio::sync::broadcast;
+
+use super::super::{
+    AgentEvent, AgentStatus, CompletionReport, PipelineStage,
+    agent_config_stage, pipeline_stage,
+};
+use super::{AgentPool, StoryAgent, composite_key};
+
+impl AgentPool {
+    /// Pipeline advancement: after an agent completes, move the story to
+    /// the next pipeline stage and start the appropriate agent.
+    pub(super) async fn run_pipeline_advance(
+        &self,
+        story_id: &str,
+        agent_name: &str,
+        completion: CompletionReport,
+        project_root: Option<PathBuf>,
+        worktree_path: Option<PathBuf>,
+        merge_failure_reported: bool,
+    ) {
+        let project_root = match project_root {
+            Some(p) => p,
+            None => {
+                slog_warn!("[pipeline] No project_root for '{story_id}:{agent_name}'");
+                return;
+            }
+        };
+
+        let config = ProjectConfig::load(&project_root).unwrap_or_default();
+        let stage = config
+            .find_agent(agent_name)
+            .map(agent_config_stage)
+            .unwrap_or_else(|| pipeline_stage(agent_name));
+
+        match stage {
+            PipelineStage::Other => {
+                // Supervisors and unknown agents do not advance the pipeline.
+            }
+            PipelineStage::Coder => {
+                if completion.gates_passed {
+                    // Determine effective QA mode for this story.
+                    let qa_mode = {
+                        let item_type = super::super::lifecycle::item_type_from_id(story_id);
+                        if item_type == "spike" {
+                            crate::io::story_metadata::QaMode::Human
+                        } else {
+                            let default_qa = config.default_qa_mode();
+                            // Story is in 2_current/ when a coder completes.
+                            let story_path = project_root
+                                .join(".story_kit/work/2_current")
+                                .join(format!("{story_id}.md"));
+                            crate::io::story_metadata::resolve_qa_mode(&story_path, default_qa)
+                        }
+                    };
+
+                    match qa_mode {
+                        crate::io::story_metadata::QaMode::Server => {
+                            slog!(
+                                "[pipeline] Coder '{agent_name}' passed gates for '{story_id}'. \
+                                 qa: server — moving directly to merge."
+ ); + if let Err(e) = + super::super::lifecycle::move_story_to_merge(&project_root, story_id) + { + slog_error!( + "[pipeline] Failed to move '{story_id}' to 4_merge/: {e}" + ); + } else if let Err(e) = self + .start_agent(&project_root, story_id, Some("mergemaster"), None) + .await + { + slog_error!( + "[pipeline] Failed to start mergemaster for '{story_id}': {e}" + ); + } + } + crate::io::story_metadata::QaMode::Agent => { + slog!( + "[pipeline] Coder '{agent_name}' passed gates for '{story_id}'. \ + qa: agent — moving to QA." + ); + if let Err(e) = super::super::lifecycle::move_story_to_qa(&project_root, story_id) { + slog_error!("[pipeline] Failed to move '{story_id}' to 3_qa/: {e}"); + } else if let Err(e) = self + .start_agent(&project_root, story_id, Some("qa"), None) + .await + { + slog_error!("[pipeline] Failed to start qa agent for '{story_id}': {e}"); + } + } + crate::io::story_metadata::QaMode::Human => { + slog!( + "[pipeline] Coder '{agent_name}' passed gates for '{story_id}'. \ + qa: human — holding for human review." + ); + if let Err(e) = super::super::lifecycle::move_story_to_qa(&project_root, story_id) { + slog_error!("[pipeline] Failed to move '{story_id}' to 3_qa/: {e}"); + } else { + let qa_dir = project_root.join(".story_kit/work/3_qa"); + let story_path = qa_dir.join(format!("{story_id}.md")); + if let Err(e) = + crate::io::story_metadata::write_review_hold(&story_path) + { + slog_error!( + "[pipeline] Failed to set review_hold on '{story_id}': {e}" + ); + } + } + } + } + } else { + // Increment retry count and check if blocked. + let story_path = project_root + .join(".story_kit/work/2_current") + .join(format!("{story_id}.md")); + if should_block_story(&story_path, config.max_retries, story_id, "coder") { + // Story has exceeded retry limit — do not restart. + } else { + slog!( + "[pipeline] Coder '{agent_name}' failed gates for '{story_id}'. Restarting." 
+ ); + let context = format!( + "\n\n---\n## Previous Attempt Failed\n\ + The acceptance gates failed with the following output:\n{}\n\n\ + Please review the failures above, fix the issues, and try again.", + completion.gate_output + ); + if let Err(e) = self + .start_agent(&project_root, story_id, Some(agent_name), Some(&context)) + .await + { + slog_error!( + "[pipeline] Failed to restart coder '{agent_name}' for '{story_id}': {e}" + ); + } + } + } + } + PipelineStage::Qa => { + if completion.gates_passed { + // Run coverage gate in the QA worktree before advancing to merge. + let coverage_path = worktree_path + .clone() + .unwrap_or_else(|| project_root.clone()); + let cp = coverage_path.clone(); + let coverage_result = + tokio::task::spawn_blocking(move || super::super::gates::run_coverage_gate(&cp)) + .await + .unwrap_or_else(|e| { + slog_warn!("[pipeline] Coverage gate task panicked: {e}"); + Ok((false, format!("Coverage gate task panicked: {e}"))) + }); + let (coverage_passed, coverage_output) = match coverage_result { + Ok(pair) => pair, + Err(e) => (false, e), + }; + + if coverage_passed { + // Check whether this item needs human review before merging. + let needs_human_review = { + let item_type = super::super::lifecycle::item_type_from_id(story_id); + if item_type == "spike" { + true // Spikes always need human review. + } else { + let qa_dir = project_root.join(".story_kit/work/3_qa"); + let story_path = qa_dir.join(format!("{story_id}.md")); + let default_qa = config.default_qa_mode(); + matches!( + crate::io::story_metadata::resolve_qa_mode(&story_path, default_qa), + crate::io::story_metadata::QaMode::Human + ) + } + }; + + if needs_human_review { + // Hold in 3_qa/ for human review. 
+ let qa_dir = project_root.join(".story_kit/work/3_qa"); + let story_path = qa_dir.join(format!("{story_id}.md")); + if let Err(e) = + crate::io::story_metadata::write_review_hold(&story_path) + { + slog_error!( + "[pipeline] Failed to set review_hold on '{story_id}': {e}" + ); + } + slog!( + "[pipeline] QA passed for '{story_id}'. \ + Holding for human review. \ + Worktree preserved at: {worktree_path:?}" + ); + } else { + slog!( + "[pipeline] QA passed gates and coverage for '{story_id}'. \ + Moving directly to merge." + ); + if let Err(e) = + super::super::lifecycle::move_story_to_merge(&project_root, story_id) + { + slog_error!( + "[pipeline] Failed to move '{story_id}' to 4_merge/: {e}" + ); + } else if let Err(e) = self + .start_agent(&project_root, story_id, Some("mergemaster"), None) + .await + { + slog_error!( + "[pipeline] Failed to start mergemaster for '{story_id}': {e}" + ); + } + } + } else { + let story_path = project_root + .join(".story_kit/work/3_qa") + .join(format!("{story_id}.md")); + if should_block_story(&story_path, config.max_retries, story_id, "qa-coverage") { + // Story has exceeded retry limit — do not restart. + } else { + slog!( + "[pipeline] QA coverage gate failed for '{story_id}'. Restarting QA." + ); + let context = format!( + "\n\n---\n## Coverage Gate Failed\n\ + The coverage gate (script/test_coverage) failed with the following output:\n{}\n\n\ + Please improve test coverage until the coverage gate passes.", + coverage_output + ); + if let Err(e) = self + .start_agent(&project_root, story_id, Some("qa"), Some(&context)) + .await + { + slog_error!("[pipeline] Failed to restart qa for '{story_id}': {e}"); + } + } + } + } else { + let story_path = project_root + .join(".story_kit/work/3_qa") + .join(format!("{story_id}.md")); + if should_block_story(&story_path, config.max_retries, story_id, "qa") { + // Story has exceeded retry limit — do not restart. + } else { + slog!("[pipeline] QA failed gates for '{story_id}'. 
Restarting."); + let context = format!( + "\n\n---\n## Previous QA Attempt Failed\n\ + The acceptance gates failed with the following output:\n{}\n\n\ + Please re-run and fix the issues.", + completion.gate_output + ); + if let Err(e) = self + .start_agent(&project_root, story_id, Some("qa"), Some(&context)) + .await + { + slog_error!("[pipeline] Failed to restart qa for '{story_id}': {e}"); + } + } + } + } + PipelineStage::Mergemaster => { + // Block advancement if the mergemaster explicitly reported a failure. + // The server-owned gate check runs in the feature-branch worktree (not + // master), so `gates_passed=true` is misleading when no code was merged. + if merge_failure_reported { + slog!( + "[pipeline] Pipeline advancement blocked for '{story_id}': \ + mergemaster explicitly reported a merge failure. \ + Story stays in 4_merge/ for human review." + ); + } else { + // Run script/test on master (project_root) as the post-merge verification. + slog!( + "[pipeline] Mergemaster completed for '{story_id}'. Running post-merge tests on master." + ); + let root = project_root.clone(); + let test_result = + tokio::task::spawn_blocking(move || super::super::gates::run_project_tests(&root)) + .await + .unwrap_or_else(|e| { + slog_warn!("[pipeline] Post-merge test task panicked: {e}"); + Ok((false, format!("Test task panicked: {e}"))) + }); + let (passed, output) = match test_result { + Ok(pair) => pair, + Err(e) => (false, e), + }; + + if passed { + slog!( + "[pipeline] Post-merge tests passed for '{story_id}'. Moving to done." + ); + if let Err(e) = + super::super::lifecycle::move_story_to_archived(&project_root, story_id) + { + slog_error!("[pipeline] Failed to move '{story_id}' to done: {e}"); + } + self.remove_agents_for_story(story_id); + // TODO: Re-enable worktree cleanup once we have persistent agent logs. + // Removing worktrees destroys evidence needed to debug empty-commit agents. 
+ // let config = + // crate::config::ProjectConfig::load(&project_root).unwrap_or_default(); + // if let Err(e) = + // worktree::remove_worktree_by_story_id(&project_root, story_id, &config) + // .await + // { + // slog!( + // "[pipeline] Failed to remove worktree for '{story_id}': {e}" + // ); + // } + slog!( + "[pipeline] Story '{story_id}' done. Worktree preserved for inspection." + ); + } else { + let story_path = project_root + .join(".story_kit/work/4_merge") + .join(format!("{story_id}.md")); + if should_block_story(&story_path, config.max_retries, story_id, "mergemaster") { + // Story has exceeded retry limit — do not restart. + } else { + slog!( + "[pipeline] Post-merge tests failed for '{story_id}'. Restarting mergemaster." + ); + let context = format!( + "\n\n---\n## Post-Merge Test Failed\n\ + The tests on master failed with the following output:\n{}\n\n\ + Please investigate and resolve the failures, then call merge_agent_work again.", + output + ); + if let Err(e) = self + .start_agent( + &project_root, + story_id, + Some("mergemaster"), + Some(&context), + ) + .await + { + slog_error!( + "[pipeline] Failed to restart mergemaster for '{story_id}': {e}" + ); + } + } + } + } + } + } + + // Always scan for unassigned work after any agent completes, regardless + // of the outcome (success, failure, restart). This ensures stories that + // failed agent assignment due to busy agents are retried when agents + // become available (bug 295). + self.auto_assign_available_work(&project_root).await; + } + + /// Internal: report that an agent has finished work on a story. + /// + /// **Note:** This is no longer exposed as an MCP tool. The server now + /// automatically runs completion gates when an agent process exits + /// (see `run_server_owned_completion`). This method is retained for + /// backwards compatibility and testing. + /// + /// - Rejects with an error if the worktree has uncommitted changes. 
+ /// - Runs acceptance gates (cargo clippy + cargo nextest run / cargo test). + /// - Stores the `CompletionReport` on the agent record. + /// - Transitions status to `Completed` (gates passed) or `Failed` (gates failed). + /// - Emits a `Done` event so `wait_for_agent` unblocks. + #[allow(dead_code)] + pub async fn report_completion( + &self, + story_id: &str, + agent_name: &str, + summary: &str, + ) -> Result { + let key = composite_key(story_id, agent_name); + + // Verify agent exists, is Running, and grab its worktree path. + let worktree_path = { + let agents = self.agents.lock().map_err(|e| e.to_string())?; + let agent = agents + .get(&key) + .ok_or_else(|| format!("No agent '{agent_name}' for story '{story_id}'"))?; + + if agent.status != AgentStatus::Running { + return Err(format!( + "Agent '{agent_name}' for story '{story_id}' is not running (status: {}). \ + report_completion can only be called by a running agent.", + agent.status + )); + } + + agent + .worktree_info + .as_ref() + .map(|wt| wt.path.clone()) + .ok_or_else(|| { + format!( + "Agent '{agent_name}' for story '{story_id}' has no worktree. \ + Cannot run acceptance gates." + ) + })? + }; + + let path = worktree_path.clone(); + + // Run gate checks in a blocking thread to avoid stalling the async runtime. + let (gates_passed, gate_output) = tokio::task::spawn_blocking(move || { + // Step 1: Reject if worktree is dirty. + super::super::gates::check_uncommitted_changes(&path)?; + // Step 2: Run clippy + tests and return (passed, output). + super::super::gates::run_acceptance_gates(&path) + }) + .await + .map_err(|e| format!("Gate check task panicked: {e}"))??; + + let report = CompletionReport { + summary: summary.to_string(), + gates_passed, + gate_output, + }; + + // Extract data for pipeline advance, then remove the entry so + // completed agents never appear in list_agents. 
+ let ( + tx, + session_id, + project_root_for_advance, + wt_path_for_advance, + merge_failure_reported_for_advance, + ) = { + let mut agents = self.agents.lock().map_err(|e| e.to_string())?; + let agent = agents.get_mut(&key).ok_or_else(|| { + format!("Agent '{agent_name}' for story '{story_id}' disappeared during gate check") + })?; + agent.completion = Some(report.clone()); + let tx = agent.tx.clone(); + let sid = agent.session_id.clone(); + let pr = agent.project_root.clone(); + let wt = agent.worktree_info.as_ref().map(|w| w.path.clone()); + let mfr = agent.merge_failure_reported; + agents.remove(&key); + (tx, sid, pr, wt, mfr) + }; + + // Emit Done so wait_for_agent unblocks. + let _ = tx.send(AgentEvent::Done { + story_id: story_id.to_string(), + agent_name: agent_name.to_string(), + session_id, + }); + + // Notify WebSocket clients that the agent is gone. + Self::notify_agent_state_changed(&self.watcher_tx); + + // Advance the pipeline state machine in a background task. + let pool_clone = Self { + agents: Arc::clone(&self.agents), + port: self.port, + child_killers: Arc::clone(&self.child_killers), + watcher_tx: self.watcher_tx.clone(), + merge_jobs: Arc::clone(&self.merge_jobs), + }; + let sid = story_id.to_string(); + let aname = agent_name.to_string(); + let report_for_advance = report.clone(); + tokio::spawn(async move { + pool_clone + .run_pipeline_advance( + &sid, + &aname, + report_for_advance, + project_root_for_advance, + wt_path_for_advance, + merge_failure_reported_for_advance, + ) + .await; + }); + + Ok(report) + } + + /// Start the merge pipeline as a background task. + /// + /// Returns immediately so the MCP tool call doesn't time out (the full + /// pipeline — squash merge + quality gates — takes well over 60 seconds, + /// exceeding Claude Code's MCP tool-call timeout). + /// + /// The mergemaster agent should poll [`get_merge_status`](Self::get_merge_status) + /// until the job reaches a terminal state. 
+ pub fn start_merge_agent_work( + self: &Arc, + project_root: &Path, + story_id: &str, + ) -> Result<(), String> { + // Guard against double-starts. + { + let jobs = self.merge_jobs.lock().map_err(|e| e.to_string())?; + if let Some(job) = jobs.get(story_id) + && matches!(job.status, super::super::merge::MergeJobStatus::Running) + { + return Err(format!( + "Merge already in progress for '{story_id}'. \ + Use get_merge_status to poll for completion." + )); + } + } + + // Insert Running job. + { + let mut jobs = self.merge_jobs.lock().map_err(|e| e.to_string())?; + jobs.insert( + story_id.to_string(), + super::super::merge::MergeJob { + story_id: story_id.to_string(), + status: super::super::merge::MergeJobStatus::Running, + }, + ); + } + + let pool = Arc::clone(self); + let root = project_root.to_path_buf(); + let sid = story_id.to_string(); + + tokio::spawn(async move { + let report = pool.run_merge_pipeline(&root, &sid).await; + let failed = report.is_err(); + let status = match report { + Ok(r) => super::super::merge::MergeJobStatus::Completed(r), + Err(e) => super::super::merge::MergeJobStatus::Failed(e), + }; + if let Ok(mut jobs) = pool.merge_jobs.lock() + && let Some(job) = jobs.get_mut(&sid) + { + job.status = status; + } + if failed { + pool.auto_assign_available_work(&root).await; + } + }); + + Ok(()) + } + + /// The actual merge pipeline, run inside a background task. 
+ async fn run_merge_pipeline( + self: &Arc, + project_root: &Path, + story_id: &str, + ) -> Result { + let branch = format!("feature/story-{story_id}"); + let wt_path = worktree::worktree_path(project_root, story_id); + let root = project_root.to_path_buf(); + let sid = story_id.to_string(); + let br = branch.clone(); + + let merge_result = + tokio::task::spawn_blocking(move || super::super::merge::run_squash_merge(&root, &br, &sid)) + .await + .map_err(|e| format!("Merge task panicked: {e}"))??; + + if !merge_result.success { + return Ok(super::super::merge::MergeReport { + story_id: story_id.to_string(), + success: false, + had_conflicts: merge_result.had_conflicts, + conflicts_resolved: merge_result.conflicts_resolved, + conflict_details: merge_result.conflict_details, + gates_passed: merge_result.gates_passed, + gate_output: merge_result.output, + worktree_cleaned_up: false, + story_archived: false, + }); + } + + let story_archived = + super::super::lifecycle::move_story_to_archived(project_root, story_id).is_ok(); + if story_archived { + self.remove_agents_for_story(story_id); + } + + let worktree_cleaned_up = if wt_path.exists() { + let config = crate::config::ProjectConfig::load(project_root).unwrap_or_default(); + worktree::remove_worktree_by_story_id(project_root, story_id, &config) + .await + .is_ok() + } else { + false + }; + + self.auto_assign_available_work(project_root).await; + + Ok(super::super::merge::MergeReport { + story_id: story_id.to_string(), + success: true, + had_conflicts: merge_result.had_conflicts, + conflicts_resolved: merge_result.conflicts_resolved, + conflict_details: merge_result.conflict_details, + gates_passed: true, + gate_output: merge_result.output, + worktree_cleaned_up, + story_archived, + }) + } + + /// Check the status of a background merge job. 
+ pub fn get_merge_status(&self, story_id: &str) -> Option { + self.merge_jobs + .lock() + .ok() + .and_then(|jobs| jobs.get(story_id).cloned()) + } + + /// Record that the mergemaster agent for `story_id` explicitly reported a + /// merge failure via the `report_merge_failure` MCP tool. + /// + /// Sets `merge_failure_reported = true` on the active mergemaster agent so + /// that `run_pipeline_advance` can block advancement to `5_done/` even when + /// the server-owned gate check returns `gates_passed=true` (those gates run + /// in the feature-branch worktree, not on master). + pub fn set_merge_failure_reported(&self, story_id: &str) { + match self.agents.lock() { + Ok(mut lock) => { + let found = lock.iter_mut().find(|(key, agent)| { + let key_story_id = key + .rsplit_once(':') + .map(|(sid, _)| sid) + .unwrap_or(key.as_str()); + key_story_id == story_id + && pipeline_stage(&agent.agent_name) == PipelineStage::Mergemaster + }); + match found { + Some((_, agent)) => { + agent.merge_failure_reported = true; + slog!( + "[pipeline] Merge failure flag set for '{story_id}:{}'", + agent.agent_name + ); + } + None => { + slog_warn!( + "[pipeline] set_merge_failure_reported: no running mergemaster found \ + for story '{story_id}' — flag not set" + ); + } + } + } + Err(e) => { + slog_error!("[pipeline] set_merge_failure_reported: could not lock agents: {e}"); + } + } + } +} + +/// Server-owned completion: runs acceptance gates when an agent process exits +/// normally, and advances the pipeline based on results. +/// +/// This is a **free function** (not a method on `AgentPool`) to break the +/// opaque type cycle that would otherwise arise: `start_agent` → spawned task +/// → server-owned completion → pipeline advance → `start_agent`. +/// +/// If the agent already has a completion report (e.g. from a legacy +/// `report_completion` call), this is a no-op to avoid double-running gates. 
+pub(super) async fn run_server_owned_completion( + agents: &Arc>>, + port: u16, + story_id: &str, + agent_name: &str, + session_id: Option, + watcher_tx: broadcast::Sender, +) { + let key = composite_key(story_id, agent_name); + + // Guard: skip if completion was already recorded (legacy path). + { + let lock = match agents.lock() { + Ok(a) => a, + Err(_) => return, + }; + match lock.get(&key) { + Some(agent) if agent.completion.is_some() => { + slog!( + "[agents] Completion already recorded for '{story_id}:{agent_name}'; \ + skipping server-owned gates." + ); + return; + } + Some(_) => {} + None => return, + } + } + + // Get worktree path for running gates. + let worktree_path = { + let lock = match agents.lock() { + Ok(a) => a, + Err(_) => return, + }; + lock.get(&key) + .and_then(|a| a.worktree_info.as_ref().map(|wt| wt.path.clone())) + }; + + // Run acceptance gates. + let (gates_passed, gate_output) = if let Some(wt_path) = worktree_path { + let path = wt_path; + match tokio::task::spawn_blocking(move || { + super::super::gates::check_uncommitted_changes(&path)?; + // AC5: Fail early if the coder finished with no commits on the feature branch. + // This prevents empty-diff stories from advancing through QA to merge. + if !super::super::gates::worktree_has_committed_work(&path) { + return Ok(( + false, + "Agent exited with no commits on the feature branch. \ + The agent did not produce any code changes." 
+ .to_string(), + )); + } + super::super::gates::run_acceptance_gates(&path) + }) + .await + { + Ok(Ok(result)) => result, + Ok(Err(e)) => (false, e), + Err(e) => (false, format!("Gate check task panicked: {e}")), + } + } else { + ( + false, + "No worktree path available to run acceptance gates".to_string(), + ) + }; + + slog!( + "[agents] Server-owned completion for '{story_id}:{agent_name}': gates_passed={gates_passed}" + ); + + let report = CompletionReport { + summary: "Agent process exited normally".to_string(), + gates_passed, + gate_output, + }; + + // Store completion report, extract data for pipeline advance, then + // remove the entry so completed agents never appear in list_agents. + let (tx, project_root_for_advance, wt_path_for_advance, merge_failure_reported_for_advance) = { + let mut lock = match agents.lock() { + Ok(a) => a, + Err(_) => return, + }; + let agent = match lock.get_mut(&key) { + Some(a) => a, + None => return, + }; + agent.completion = Some(report.clone()); + agent.session_id = session_id.clone(); + let tx = agent.tx.clone(); + let pr = agent.project_root.clone(); + let wt = agent.worktree_info.as_ref().map(|w| w.path.clone()); + let mfr = agent.merge_failure_reported; + lock.remove(&key); + (tx, pr, wt, mfr) + }; + + // Emit Done so wait_for_agent unblocks. + let _ = tx.send(AgentEvent::Done { + story_id: story_id.to_string(), + agent_name: agent_name.to_string(), + session_id, + }); + + // Notify WebSocket clients that the agent is gone. + AgentPool::notify_agent_state_changed(&watcher_tx); + + // Advance the pipeline state machine in a background task. + spawn_pipeline_advance( + Arc::clone(agents), + port, + story_id, + agent_name, + report, + project_root_for_advance, + wt_path_for_advance, + watcher_tx, + merge_failure_reported_for_advance, + ); +} + +/// Spawn pipeline advancement as a background task. 
+/// +/// This is a **non-async** function so it does not participate in the opaque +/// type cycle between `start_agent` and `run_server_owned_completion`. +#[allow(clippy::too_many_arguments)] +fn spawn_pipeline_advance( + agents: Arc>>, + port: u16, + story_id: &str, + agent_name: &str, + completion: CompletionReport, + project_root: Option, + worktree_path: Option, + watcher_tx: broadcast::Sender, + merge_failure_reported: bool, +) { + let sid = story_id.to_string(); + let aname = agent_name.to_string(); + tokio::spawn(async move { + let pool = AgentPool { + agents, + port, + child_killers: Arc::new(Mutex::new(HashMap::new())), + watcher_tx, + merge_jobs: Arc::new(Mutex::new(HashMap::new())), + }; + pool.run_pipeline_advance( + &sid, + &aname, + completion, + project_root, + worktree_path, + merge_failure_reported, + ) + .await; + }); +} + +/// Increment retry_count and block the story if it exceeds `max_retries`. +/// +/// Returns `true` if the story is now blocked (caller should NOT restart the agent). +/// Returns `false` if the story may be retried. +/// When `max_retries` is 0, retry limits are disabled. +fn should_block_story(story_path: &Path, max_retries: u32, story_id: &str, stage_label: &str) -> bool { + use crate::io::story_metadata::{increment_retry_count, write_blocked}; + + if max_retries == 0 { + // Retry limits disabled. + return false; + } + + match increment_retry_count(story_path) { + Ok(new_count) => { + if new_count >= max_retries { + slog_warn!( + "[pipeline] Story '{story_id}' reached retry limit ({new_count}/{max_retries}) \ + at {stage_label} stage. Marking as blocked." + ); + if let Err(e) = write_blocked(story_path) { + slog_error!("[pipeline] Failed to write blocked flag for '{story_id}': {e}"); + } + true + } else { + slog!( + "[pipeline] Story '{story_id}' retry {new_count}/{max_retries} at {stage_label} stage." 
+ ); + false + } + } + Err(e) => { + slog_error!("[pipeline] Failed to increment retry_count for '{story_id}': {e}"); + false // Don't block on error — allow retry. + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use super::super::AgentPool; + use crate::agents::merge::{MergeJob, MergeJobStatus}; + use crate::agents::{AgentEvent, AgentStatus, CompletionReport}; + use crate::io::watcher::WatcherEvent; + use std::path::PathBuf; + use std::process::Command; + + fn init_git_repo(repo: &std::path::Path) { + Command::new("git") + .args(["init"]) + .current_dir(repo) + .output() + .unwrap(); + Command::new("git") + .args(["config", "user.email", "test@test.com"]) + .current_dir(repo) + .output() + .unwrap(); + Command::new("git") + .args(["config", "user.name", "Test"]) + .current_dir(repo) + .output() + .unwrap(); + Command::new("git") + .args(["commit", "--allow-empty", "-m", "init"]) + .current_dir(repo) + .output() + .unwrap(); + } + + // ── report_completion tests ──────────────────────────────────── + + #[tokio::test] + async fn report_completion_rejects_nonexistent_agent() { + let pool = AgentPool::new_test(3001); + let result = pool.report_completion("no_story", "no_bot", "done").await; + assert!(result.is_err()); + let msg = result.unwrap_err(); + assert!(msg.contains("No agent"), "unexpected: {msg}"); + } + + #[tokio::test] + async fn report_completion_rejects_non_running_agent() { + let pool = AgentPool::new_test(3001); + pool.inject_test_agent("s6", "bot", AgentStatus::Completed); + + let result = pool.report_completion("s6", "bot", "done").await; + assert!(result.is_err()); + let msg = result.unwrap_err(); + assert!( + msg.contains("not running"), + "expected 'not running' in: {msg}" + ); + } + + #[tokio::test] + async fn report_completion_rejects_dirty_worktree() { + use std::fs; + use tempfile::tempdir; + + let tmp = tempdir().unwrap(); + let repo = tmp.path(); + + // Init a real git repo and make an initial commit + Command::new("git") + 
.args(["init"]) + .current_dir(repo) + .output() + .unwrap(); + Command::new("git") + .args(["commit", "--allow-empty", "-m", "init"]) + .current_dir(repo) + .output() + .unwrap(); + + // Write an uncommitted file + fs::write(repo.join("dirty.txt"), "not committed").unwrap(); + + let pool = AgentPool::new_test(3001); + pool.inject_test_agent_with_path("s7", "bot", AgentStatus::Running, repo.to_path_buf()); + + let result = pool.report_completion("s7", "bot", "done").await; + assert!(result.is_err()); + let msg = result.unwrap_err(); + assert!( + msg.contains("uncommitted"), + "expected 'uncommitted' in: {msg}" + ); + } + + // ── server-owned completion tests ─────────────────────────────────────────── + + #[tokio::test] + async fn server_owned_completion_skips_when_already_completed() { + let pool = AgentPool::new_test(3001); + let report = CompletionReport { + summary: "Already done".to_string(), + gates_passed: true, + gate_output: String::new(), + }; + pool.inject_test_agent_with_completion( + "s10", + "coder-1", + AgentStatus::Completed, + PathBuf::from("/tmp/nonexistent"), + report, + ); + + // Subscribe before calling so we can check if Done event was emitted. + let mut rx = pool.subscribe("s10", "coder-1").unwrap(); + + run_server_owned_completion( + &pool.agents, + pool.port, + "s10", + "coder-1", + Some("sess-1".to_string()), + pool.watcher_tx.clone(), + ) + .await; + + // Status should remain Completed (unchanged) — no gate re-run. + let agents = pool.agents.lock().unwrap(); + let key = composite_key("s10", "coder-1"); + let agent = agents.get(&key).unwrap(); + assert_eq!(agent.status, AgentStatus::Completed); + // Summary should still be the original, not overwritten. + assert_eq!(agent.completion.as_ref().unwrap().summary, "Already done"); + drop(agents); + + // No Done event should have been emitted. 
+ assert!( + rx.try_recv().is_err(), + "should not emit Done when completion already exists" + ); + } + + #[tokio::test] + async fn server_owned_completion_runs_gates_on_clean_worktree() { + use tempfile::tempdir; + + let tmp = tempdir().unwrap(); + let repo = tmp.path(); + init_git_repo(repo); + + let pool = AgentPool::new_test(3001); + pool.inject_test_agent_with_path( + "s11", + "coder-1", + AgentStatus::Running, + repo.to_path_buf(), + ); + + let mut rx = pool.subscribe("s11", "coder-1").unwrap(); + + run_server_owned_completion( + &pool.agents, + pool.port, + "s11", + "coder-1", + Some("sess-2".to_string()), + pool.watcher_tx.clone(), + ) + .await; + + // Agent entry should be removed from the map after completion. + let agents = pool.agents.lock().unwrap(); + let key = composite_key("s11", "coder-1"); + assert!( + agents.get(&key).is_none(), + "agent should be removed from map after completion" + ); + drop(agents); + + // A Done event should have been emitted with the session_id. + let event = rx.try_recv().expect("should emit Done event"); + match &event { + AgentEvent::Done { session_id, .. } => { + assert_eq!(*session_id, Some("sess-2".to_string())); + } + other => panic!("expected Done event, got: {other:?}"), + } + } + + #[tokio::test] + async fn server_owned_completion_fails_on_dirty_worktree() { + use std::fs; + use tempfile::tempdir; + + let tmp = tempdir().unwrap(); + let repo = tmp.path(); + init_git_repo(repo); + // Create an uncommitted file. + fs::write(repo.join("dirty.txt"), "not committed").unwrap(); + + let pool = AgentPool::new_test(3001); + pool.inject_test_agent_with_path( + "s12", + "coder-1", + AgentStatus::Running, + repo.to_path_buf(), + ); + + let mut rx = pool.subscribe("s12", "coder-1").unwrap(); + + run_server_owned_completion( + &pool.agents, + pool.port, + "s12", + "coder-1", + None, + pool.watcher_tx.clone(), + ) + .await; + + // Agent entry should be removed from the map after completion (even on failure). 
+ let agents = pool.agents.lock().unwrap(); + let key = composite_key("s12", "coder-1"); + assert!( + agents.get(&key).is_none(), + "agent should be removed from map after failed completion" + ); + drop(agents); + + // A Done event should have been emitted. + let event = rx.try_recv().expect("should emit Done event"); + assert!( + matches!(event, AgentEvent::Done { .. }), + "expected Done event, got: {event:?}" + ); + } + + #[tokio::test] + async fn server_owned_completion_nonexistent_agent_is_noop() { + let pool = AgentPool::new_test(3001); + // Should not panic or error — just silently return. + run_server_owned_completion( + &pool.agents, + pool.port, + "nonexistent", + "bot", + None, + pool.watcher_tx.clone(), + ) + .await; + } + + // ── pipeline advance tests ──────────────────────────────────────────────── + + #[tokio::test] + async fn pipeline_advance_coder_gates_pass_server_qa_moves_to_merge() { + use std::fs; + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + + // Set up story in 2_current/ (no qa frontmatter → uses project default "server") + let current = root.join(".story_kit/work/2_current"); + fs::create_dir_all(¤t).unwrap(); + fs::write(current.join("50_story_test.md"), "test").unwrap(); + + let pool = AgentPool::new_test(3001); + pool.run_pipeline_advance( + "50_story_test", + "coder-1", + CompletionReport { + summary: "done".to_string(), + gates_passed: true, + gate_output: String::new(), + }, + Some(root.to_path_buf()), + None, + false, + ) + .await; + + // With default qa: server, story skips QA and goes straight to 4_merge/ + assert!( + root.join(".story_kit/work/4_merge/50_story_test.md") + .exists(), + "story should be in 4_merge/" + ); + assert!( + !current.join("50_story_test.md").exists(), + "story should not still be in 2_current/" + ); + } + + #[tokio::test] + async fn pipeline_advance_coder_gates_pass_agent_qa_moves_to_qa() { + use std::fs; + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + + // Set up 
story in 2_current/ with qa: agent frontmatter + let current = root.join(".story_kit/work/2_current"); + fs::create_dir_all(¤t).unwrap(); + fs::write( + current.join("50_story_test.md"), + "---\nname: Test\nqa: agent\n---\ntest", + ) + .unwrap(); + + let pool = AgentPool::new_test(3001); + pool.run_pipeline_advance( + "50_story_test", + "coder-1", + CompletionReport { + summary: "done".to_string(), + gates_passed: true, + gate_output: String::new(), + }, + Some(root.to_path_buf()), + None, + false, + ) + .await; + + // With qa: agent, story should move to 3_qa/ + assert!( + root.join(".story_kit/work/3_qa/50_story_test.md").exists(), + "story should be in 3_qa/" + ); + assert!( + !current.join("50_story_test.md").exists(), + "story should not still be in 2_current/" + ); + } + + #[tokio::test] + async fn pipeline_advance_qa_gates_pass_moves_story_to_merge() { + use std::fs; + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + + // Set up story in 3_qa/ + let qa_dir = root.join(".story_kit/work/3_qa"); + fs::create_dir_all(&qa_dir).unwrap(); + // qa: server so the story skips human review and goes straight to merge. 
+ fs::write( + qa_dir.join("51_story_test.md"), + "---\nname: Test\nqa: server\n---\ntest", + ) + .unwrap(); + + let pool = AgentPool::new_test(3001); + pool.run_pipeline_advance( + "51_story_test", + "qa", + CompletionReport { + summary: "QA done".to_string(), + gates_passed: true, + gate_output: String::new(), + }, + Some(root.to_path_buf()), + None, + false, + ) + .await; + + // Story should have moved to 4_merge/ + assert!( + root.join(".story_kit/work/4_merge/51_story_test.md") + .exists(), + "story should be in 4_merge/" + ); + assert!( + !qa_dir.join("51_story_test.md").exists(), + "story should not still be in 3_qa/" + ); + } + + #[tokio::test] + async fn pipeline_advance_supervisor_does_not_advance() { + use std::fs; + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + + let current = root.join(".story_kit/work/2_current"); + fs::create_dir_all(¤t).unwrap(); + fs::write(current.join("52_story_test.md"), "test").unwrap(); + + let pool = AgentPool::new_test(3001); + pool.run_pipeline_advance( + "52_story_test", + "supervisor", + CompletionReport { + summary: "supervised".to_string(), + gates_passed: true, + gate_output: String::new(), + }, + Some(root.to_path_buf()), + None, + false, + ) + .await; + + // Story should NOT have moved (supervisors don't advance pipeline) + assert!( + current.join("52_story_test.md").exists(), + "story should still be in 2_current/ for supervisor" + ); + } + + #[tokio::test] + async fn pipeline_advance_sends_agent_state_changed_to_watcher_tx() { + use std::fs; + + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + + // Set up story in 2_current/ + let current = root.join(".story_kit/work/2_current"); + fs::create_dir_all(¤t).unwrap(); + fs::write(current.join("173_story_test.md"), "test").unwrap(); + // Ensure 3_qa/ exists for the move target + fs::create_dir_all(root.join(".story_kit/work/3_qa")).unwrap(); + // Ensure 1_backlog/ exists (start_agent calls move_story_to_current) + 
fs::create_dir_all(root.join(".story_kit/work/1_backlog")).unwrap(); + + // Write a project.toml with a qa agent so start_agent can resolve it. + fs::create_dir_all(root.join(".story_kit")).unwrap(); + fs::write( + root.join(".story_kit/project.toml"), + r#" +default_qa = "agent" + +[[agent]] +name = "coder-1" +role = "Coder" +command = "echo" +args = ["noop"] +prompt = "test" +stage = "coder" + +[[agent]] +name = "qa" +role = "QA" +command = "echo" +args = ["noop"] +prompt = "test" +stage = "qa" +"#, + ) + .unwrap(); + + let pool = AgentPool::new_test(3001); + // Subscribe to the watcher channel BEFORE the pipeline advance. + let mut rx = pool.watcher_tx.subscribe(); + + pool.run_pipeline_advance( + "173_story_test", + "coder-1", + CompletionReport { + summary: "done".to_string(), + gates_passed: true, + gate_output: String::new(), + }, + Some(root.to_path_buf()), + None, + false, + ) + .await; + + // The pipeline advance should have sent AgentStateChanged events via + // the pool's watcher_tx (not a dummy channel). Collect all events. + let mut got_agent_state_changed = false; + while let Ok(evt) = rx.try_recv() { + if matches!(evt, WatcherEvent::AgentStateChanged) { + got_agent_state_changed = true; + break; + } + } + + assert!( + got_agent_state_changed, + "pipeline advance should send AgentStateChanged through the real watcher_tx \ + (bug 173: lozenges must update when agents are assigned during pipeline advance)" + ); + } + + // ── merge_agent_work tests ──────────────────────────────────────────────── + + /// Helper: start a merge and poll until terminal state. 
    async fn run_merge_to_completion(
        pool: &Arc<AgentPool>,
fs::write(&story_file, "---\nname: Test\n---\n").unwrap(); + Command::new("git") + .args(["add", "."]) + .current_dir(repo) + .output() + .unwrap(); + Command::new("git") + .args(["commit", "-m", "add story in merge"]) + .current_dir(repo) + .output() + .unwrap(); + + let pool = Arc::new(AgentPool::new_test(3001)); + let job = run_merge_to_completion(&pool, repo, "23_test").await; + + match &job.status { + MergeJobStatus::Completed(report) => { + assert!(!report.had_conflicts, "should have no conflicts"); + assert!( + report.success + || report.gate_output.contains("Failed to run") + || !report.gates_passed, + "report should be coherent: {report:?}" + ); + if report.story_archived { + let done = repo.join(".story_kit/work/5_done/23_test.md"); + assert!(done.exists(), "done file should exist"); + } + } + MergeJobStatus::Failed(e) => { + // Gate failures are acceptable in test env + assert!( + e.contains("Failed") || e.contains("failed"), + "unexpected failure: {e}" + ); + } + MergeJobStatus::Running => panic!("should not still be running"), + } + } + + // ── quality gate ordering test ──────────────────────────────── + + /// Regression test for bug 142: quality gates must run BEFORE the fast-forward + /// to master so that broken code never lands on master. + #[cfg(unix)] + #[test] + fn quality_gates_run_before_fast_forward_to_master() { + use std::fs; + use std::os::unix::fs::PermissionsExt; + use tempfile::tempdir; + + let tmp = tempdir().unwrap(); + let repo = tmp.path(); + init_git_repo(repo); + + // Add a failing script/test so quality gates will fail. 
+ let script_dir = repo.join("script"); + fs::create_dir_all(&script_dir).unwrap(); + let script_test = script_dir.join("test"); + fs::write(&script_test, "#!/usr/bin/env bash\nexit 1\n").unwrap(); + let mut perms = fs::metadata(&script_test).unwrap().permissions(); + perms.set_mode(0o755); + fs::set_permissions(&script_test, perms).unwrap(); + Command::new("git") + .args(["add", "."]) + .current_dir(repo) + .output() + .unwrap(); + Command::new("git") + .args(["commit", "-m", "add failing script/test"]) + .current_dir(repo) + .output() + .unwrap(); + + // Create a feature branch with a commit. + Command::new("git") + .args(["checkout", "-b", "feature/story-142_test"]) + .current_dir(repo) + .output() + .unwrap(); + fs::write(repo.join("change.txt"), "feature change").unwrap(); + Command::new("git") + .args(["add", "."]) + .current_dir(repo) + .output() + .unwrap(); + Command::new("git") + .args(["commit", "-m", "feature work"]) + .current_dir(repo) + .output() + .unwrap(); + + // Switch back to master and record its HEAD. + Command::new("git") + .args(["checkout", "master"]) + .current_dir(repo) + .output() + .unwrap(); + let head_before = String::from_utf8( + Command::new("git") + .args(["rev-parse", "HEAD"]) + .current_dir(repo) + .output() + .unwrap() + .stdout, + ) + .unwrap() + .trim() + .to_string(); + + // Run the squash-merge. The failing script/test makes quality gates + // fail → fast-forward must NOT happen. + let result = + crate::agents::merge::run_squash_merge(repo, "feature/story-142_test", "142_test") + .unwrap(); + + let head_after = String::from_utf8( + Command::new("git") + .args(["rev-parse", "HEAD"]) + .current_dir(repo) + .output() + .unwrap() + .stdout, + ) + .unwrap() + .trim() + .to_string(); + + // Gates must have failed (script/test exits 1) so master should be untouched. 
+ assert!( + !result.success, + "run_squash_merge must report failure when gates fail" + ); + assert_eq!( + head_before, head_after, + "master HEAD must not advance when quality gates fail (bug 142)" + ); + } + + #[tokio::test] + async fn merge_agent_work_conflict_does_not_break_master() { + use std::fs; + use tempfile::tempdir; + + let tmp = tempdir().unwrap(); + let repo = tmp.path(); + init_git_repo(repo); + + // Create a file on master. + fs::write( + repo.join("code.rs"), + "fn main() {\n println!(\"hello\");\n}\n", + ) + .unwrap(); + Command::new("git") + .args(["add", "."]) + .current_dir(repo) + .output() + .unwrap(); + Command::new("git") + .args(["commit", "-m", "initial code"]) + .current_dir(repo) + .output() + .unwrap(); + + // Feature branch: modify the same line differently. + Command::new("git") + .args(["checkout", "-b", "feature/story-42_story_foo"]) + .current_dir(repo) + .output() + .unwrap(); + fs::write( + repo.join("code.rs"), + "fn main() {\n println!(\"hello\");\n feature_fn();\n}\n", + ) + .unwrap(); + Command::new("git") + .args(["add", "."]) + .current_dir(repo) + .output() + .unwrap(); + Command::new("git") + .args(["commit", "-m", "feature: add fn call"]) + .current_dir(repo) + .output() + .unwrap(); + + // Master: add different line at same location. + Command::new("git") + .args(["checkout", "master"]) + .current_dir(repo) + .output() + .unwrap(); + fs::write( + repo.join("code.rs"), + "fn main() {\n println!(\"hello\");\n master_fn();\n}\n", + ) + .unwrap(); + Command::new("git") + .args(["add", "."]) + .current_dir(repo) + .output() + .unwrap(); + Command::new("git") + .args(["commit", "-m", "master: add fn call"]) + .current_dir(repo) + .output() + .unwrap(); + + // Create story file in 4_merge. 
+ let merge_dir = repo.join(".story_kit/work/4_merge"); + fs::create_dir_all(&merge_dir).unwrap(); + fs::write(merge_dir.join("42_story_foo.md"), "---\nname: Test\n---\n").unwrap(); + Command::new("git") + .args(["add", "."]) + .current_dir(repo) + .output() + .unwrap(); + Command::new("git") + .args(["commit", "-m", "add story"]) + .current_dir(repo) + .output() + .unwrap(); + + let pool = Arc::new(AgentPool::new_test(3001)); + let job = run_merge_to_completion(&pool, repo, "42_story_foo").await; + + // Master should NEVER have conflict markers, regardless of merge outcome. + let master_code = fs::read_to_string(repo.join("code.rs")).unwrap(); + assert!( + !master_code.contains("<<<<<<<"), + "master must never contain conflict markers:\n{master_code}" + ); + assert!( + !master_code.contains(">>>>>>>"), + "master must never contain conflict markers:\n{master_code}" + ); + + // The report should accurately reflect what happened. + match &job.status { + MergeJobStatus::Completed(report) => { + assert!(report.had_conflicts, "should report conflicts"); + } + MergeJobStatus::Failed(_) => { + // Acceptable — merge aborted due to conflicts + } + MergeJobStatus::Running => panic!("should not still be running"), + } + } + + // ── bug 295: pipeline advance picks up waiting QA stories ────────── + + #[tokio::test] + async fn pipeline_advance_picks_up_waiting_qa_stories_after_completion() { + use std::fs; + use super::super::auto_assign::is_agent_free; + + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + + let sk = root.join(".story_kit"); + let qa_dir = sk.join("work/3_qa"); + fs::create_dir_all(&qa_dir).unwrap(); + + // Configure a single QA agent. + fs::write( + sk.join("project.toml"), + r#" +[[agent]] +name = "qa" +stage = "qa" +"#, + ) + .unwrap(); + + // Story 292 is in QA with QA agent running (will "complete" via + // run_pipeline_advance below). Story 293 is in QA with NO agent — + // simulating the "stuck" state from bug 295. 
                .map(|(k, a)| format!("{k}: {} ({})", a.agent_name, a.status))
                .collect::<Vec<_>>()