From 5af3c1752251b6f556d524167b407d02e7051f29 Mon Sep 17 00:00:00 2001 From: Dave Date: Thu, 19 Mar 2026 22:41:17 +0000 Subject: [PATCH] story-kit: merge 317_refactor_split_pool_rs_into_pipeline_auto_assign_and_agent_management_modules --- server/src/agents/pool.rs | 5852 ------------------------- server/src/agents/pool/auto_assign.rs | 1813 ++++++++ server/src/agents/pool/mod.rs | 2187 +++++++++ server/src/agents/pool/pipeline.rs | 1771 ++++++++ 4 files changed, 5771 insertions(+), 5852 deletions(-) delete mode 100644 server/src/agents/pool.rs create mode 100644 server/src/agents/pool/auto_assign.rs create mode 100644 server/src/agents/pool/mod.rs create mode 100644 server/src/agents/pool/pipeline.rs diff --git a/server/src/agents/pool.rs b/server/src/agents/pool.rs deleted file mode 100644 index 5148d66..0000000 --- a/server/src/agents/pool.rs +++ /dev/null @@ -1,5852 +0,0 @@ -use crate::agent_log::AgentLogWriter; -use crate::config::ProjectConfig; -use crate::io::watcher::WatcherEvent; -use crate::slog; -use crate::slog_error; -use crate::slog_warn; -use crate::worktree::{self, WorktreeInfo}; -use portable_pty::ChildKiller; -use std::collections::HashMap; -use std::path::{Path, PathBuf}; -use std::sync::{Arc, Mutex}; -use tokio::sync::broadcast; - -use super::{ - AgentEvent, AgentInfo, AgentStatus, CompletionReport, PipelineStage, ReconciliationEvent, - agent_config_stage, pipeline_stage, -}; - -/// Build the composite key used to track agents in the pool. -fn composite_key(story_id: &str, agent_name: &str) -> String { - format!("{story_id}:{agent_name}") -} - -/// RAII guard that removes a pending agent entry from the pool on drop. -/// -/// Created after inserting a `Pending` entry into the agent HashMap. -/// If `start_agent` succeeds (the agent process is spawned and status -/// transitions to `Running`), call [`disarm`](Self::disarm) to prevent -/// cleanup. 
If any intermediate step fails and the guard is dropped -/// without being disarmed, the pending entry is removed so it cannot -/// block future auto-assign dispatches. -struct PendingGuard { - agents: Arc>>, - key: String, - armed: bool, -} - -impl PendingGuard { - fn new(agents: Arc>>, key: String) -> Self { - Self { - agents, - key, - armed: true, - } - } - - /// Prevent the guard from cleaning up the entry (call after - /// successful spawn). - fn disarm(&mut self) { - self.armed = false; - } -} - -impl Drop for PendingGuard { - fn drop(&mut self) { - if self.armed - && let Ok(mut agents) = self.agents.lock() - && agents - .get(&self.key) - .is_some_and(|a| a.status == AgentStatus::Pending) - { - agents.remove(&self.key); - slog!( - "[agents] Cleaned up leaked Pending entry for '{}'", - self.key - ); - } - } -} - -struct StoryAgent { - agent_name: String, - status: AgentStatus, - worktree_info: Option, - session_id: Option, - tx: broadcast::Sender, - task_handle: Option>, - /// Accumulated events for polling via get_agent_output. - event_log: Arc>>, - /// Set when the agent calls report_completion. - completion: Option, - /// Project root, stored for pipeline advancement after completion. - project_root: Option, - /// UUID identifying the log file for this session. - log_session_id: Option, - /// Set to `true` when the agent calls `report_merge_failure`. - /// Prevents the pipeline from blindly advancing to `5_done/` after a - /// failed merge: the server-owned gate check runs in the feature-branch - /// worktree (which compiles fine) and returns `gates_passed=true` even - /// though the code was never squash-merged onto master. - merge_failure_reported: bool, -} - -/// Build an `AgentInfo` snapshot from a `StoryAgent` map entry. 
-fn agent_info_from_entry(story_id: &str, agent: &StoryAgent) -> AgentInfo { - AgentInfo { - story_id: story_id.to_string(), - agent_name: agent.agent_name.clone(), - status: agent.status.clone(), - session_id: agent.session_id.clone(), - worktree_path: agent - .worktree_info - .as_ref() - .map(|wt| wt.path.to_string_lossy().to_string()), - base_branch: agent - .worktree_info - .as_ref() - .map(|wt| wt.base_branch.clone()), - completion: agent.completion.clone(), - log_session_id: agent.log_session_id.clone(), - } -} - -/// Manages concurrent story agents, each in its own worktree. -pub struct AgentPool { - agents: Arc>>, - port: u16, - /// Registry of active PTY child process killers, keyed by "{story_id}:{agent_name}". - /// Used to terminate child processes on server shutdown or agent stop, preventing - /// orphaned Claude Code processes from running after the server exits. - child_killers: Arc>>>, - /// Broadcast channel for notifying WebSocket clients of agent state changes. - /// When an agent transitions state (Pending, Running, Completed, Failed, Stopped), - /// an `AgentStateChanged` event is emitted so the frontend can refresh the - /// pipeline board without waiting for a filesystem event. - watcher_tx: broadcast::Sender, - /// Tracks background merge jobs started by `merge_agent_work`, keyed by story_id. - /// The MCP tool returns immediately and the mergemaster agent polls - /// `get_merge_status` until the job reaches a terminal state. - merge_jobs: Arc>>, -} - -impl AgentPool { - pub fn new(port: u16, watcher_tx: broadcast::Sender) -> Self { - Self { - agents: Arc::new(Mutex::new(HashMap::new())), - port, - child_killers: Arc::new(Mutex::new(HashMap::new())), - watcher_tx, - merge_jobs: Arc::new(Mutex::new(HashMap::new())), - } - } - - /// Create a pool with a dummy watcher channel for unit tests. 
- #[cfg(test)] - pub fn new_test(port: u16) -> Self { - let (watcher_tx, _) = broadcast::channel(16); - Self::new(port, watcher_tx) - } - - /// Notify WebSocket clients that agent state has changed, so the pipeline - /// board and agent panel can refresh. - fn notify_agent_state_changed(watcher_tx: &broadcast::Sender) { - let _ = watcher_tx.send(WatcherEvent::AgentStateChanged); - } - - /// Kill all active PTY child processes. - /// - /// Called on server shutdown to prevent orphaned Claude Code processes from - /// continuing to run after the server exits. Each registered killer is called - /// once, then the registry is cleared. - pub fn kill_all_children(&self) { - if let Ok(mut killers) = self.child_killers.lock() { - for (key, killer) in killers.iter_mut() { - slog!("[agents] Killing child process for {key} on shutdown"); - let _ = killer.kill(); - } - killers.clear(); - } - } - - /// Kill and deregister the child process for a specific agent key. - /// - /// Used by `stop_agent` to ensure the PTY child is terminated even though - /// aborting a `spawn_blocking` task handle does not interrupt the blocking thread. - fn kill_child_for_key(&self, key: &str) { - if let Ok(mut killers) = self.child_killers.lock() - && let Some(mut killer) = killers.remove(key) - { - slog!("[agents] Killing child process for {key} on stop"); - let _ = killer.kill(); - } - } - - /// Start an agent for a story: load config, create worktree, spawn agent. - /// - /// When `agent_name` is `None`, automatically selects the first idle coder - /// agent (story 190). If all coders are busy the call fails with an error - /// indicating the story will be picked up when one becomes available. - /// - /// If `resume_context` is provided, it is appended to the rendered prompt - /// so the agent can pick up from a previous failed attempt. 
- pub async fn start_agent( - &self, - project_root: &Path, - story_id: &str, - agent_name: Option<&str>, - resume_context: Option<&str>, - ) -> Result { - let config = ProjectConfig::load(project_root)?; - - // Validate explicit agent name early (no lock needed). - if let Some(name) = agent_name { - config - .find_agent(name) - .ok_or_else(|| format!("No agent named '{name}' in config"))?; - } - - // Create name-independent shared resources before the lock so they are - // ready for the atomic check-and-insert (story 132). - let (tx, _) = broadcast::channel::(1024); - let event_log: Arc>> = Arc::new(Mutex::new(Vec::new())); - let log_session_id = uuid::Uuid::new_v4().to_string(); - - // Move story from backlog/ to current/ before checking agent - // availability so that auto_assign_available_work can pick it up even - // when all coders are currently busy (story 203). This is idempotent: - // if the story is already in 2_current/ or a later stage, the call is - // a no-op. - super::lifecycle::move_story_to_current(project_root, story_id)?; - - // Validate that the agent's configured stage matches the story's - // pipeline stage. This prevents any caller (auto-assign, MCP tool, - // pipeline advance, supervisor) from starting a wrong-stage agent on - // a story — e.g. mergemaster on a coding-stage story (bug 312). 
- if let Some(name) = agent_name { - let agent_stage = config - .find_agent(name) - .map(agent_config_stage) - .unwrap_or_else(|| pipeline_stage(name)); - if agent_stage != PipelineStage::Other - && let Some(story_stage_dir) = find_active_story_stage(project_root, story_id) - { - let expected_stage = match story_stage_dir { - "2_current" => PipelineStage::Coder, - "3_qa" => PipelineStage::Qa, - "4_merge" => PipelineStage::Mergemaster, - _ => PipelineStage::Other, - }; - if expected_stage != PipelineStage::Other && expected_stage != agent_stage { - return Err(format!( - "Agent '{name}' (stage: {agent_stage:?}) cannot be assigned to \ - story '{story_id}' in {story_stage_dir}/ (requires stage: {expected_stage:?})" - )); - } - } - } - - // Atomically resolve agent name, check availability, and register as - // Pending. When `agent_name` is `None` the first idle coder is - // selected inside the lock so no TOCTOU race can occur between the - // availability check and the Pending insert (story 132, story 190). - // - // The `PendingGuard` ensures that if any step below fails the entry is - // removed from the pool so it does not permanently block auto-assign - // (bug 118). - let resolved_name: String; - let key: String; - { - let mut agents = self.agents.lock().map_err(|e| e.to_string())?; - - resolved_name = match agent_name { - Some(name) => name.to_string(), - None => find_free_agent_for_stage(&config, &agents, &PipelineStage::Coder) - .map(|s| s.to_string()) - .ok_or_else(|| { - if config - .agent - .iter() - .any(|a| agent_config_stage(a) == PipelineStage::Coder) - { - format!( - "All coder agents are busy; story '{story_id}' has been \ - queued in work/2_current/ and will be auto-assigned when \ - one becomes available" - ) - } else { - "No coder agent configured. Specify an agent_name explicitly." - .to_string() - } - })?, - }; - - key = composite_key(story_id, &resolved_name); - - // Check for duplicate assignment (same story + same agent already active). 
- if let Some(agent) = agents.get(&key) - && (agent.status == AgentStatus::Running || agent.status == AgentStatus::Pending) - { - return Err(format!( - "Agent '{resolved_name}' for story '{story_id}' is already {}", - agent.status - )); - } - // Enforce single-stage concurrency: reject if there is already a - // Running/Pending agent at the same pipeline stage for this story. - // This prevents two coders (or two QA/mergemaster agents) from - // corrupting each other's work in the same worktree. - // Applies to both explicit and auto-selected agents; the Other - // stage (supervisors, unknown agents) is exempt. - let resolved_stage = config - .find_agent(&resolved_name) - .map(agent_config_stage) - .unwrap_or_else(|| pipeline_stage(&resolved_name)); - if resolved_stage != PipelineStage::Other - && let Some(conflicting_name) = agents.iter().find_map(|(k, a)| { - let k_story = k.rsplit_once(':').map(|(s, _)| s).unwrap_or(k); - if k_story == story_id - && a.agent_name != resolved_name - && matches!(a.status, AgentStatus::Running | AgentStatus::Pending) - { - let a_stage = config - .find_agent(&a.agent_name) - .map(agent_config_stage) - .unwrap_or_else(|| pipeline_stage(&a.agent_name)); - if a_stage == resolved_stage { - Some(a.agent_name.clone()) - } else { - None - } - } else { - None - } - }) - { - return Err(format!( - "Cannot start '{resolved_name}' on story '{story_id}': \ - '{conflicting_name}' is already active at the same pipeline stage" - )); - } - // Enforce single-instance concurrency for explicitly-named agents: - // if this agent is already running on any other story, reject. - // Auto-selected agents are already guaranteed idle by - // find_free_agent_for_stage, so this check is only needed for - // explicit requests. 
- if agent_name.is_some() - && let Some(busy_story) = agents.iter().find_map(|(k, a)| { - if a.agent_name == resolved_name - && matches!(a.status, AgentStatus::Running | AgentStatus::Pending) - { - Some( - k.rsplit_once(':') - .map(|(sid, _)| sid) - .unwrap_or(k) - .to_string(), - ) - } else { - None - } - }) - { - return Err(format!( - "Agent '{resolved_name}' is already running on story '{busy_story}'; \ - story '{story_id}' will be picked up when the agent becomes available" - )); - } - agents.insert( - key.clone(), - StoryAgent { - agent_name: resolved_name.clone(), - status: AgentStatus::Pending, - worktree_info: None, - session_id: None, - tx: tx.clone(), - task_handle: None, - event_log: event_log.clone(), - completion: None, - project_root: Some(project_root.to_path_buf()), - log_session_id: Some(log_session_id.clone()), - merge_failure_reported: false, - }, - ); - } - let mut pending_guard = PendingGuard::new(self.agents.clone(), key.clone()); - - // Create persistent log writer (needs resolved_name, so must be after - // the atomic resolution above). - let log_writer = - match AgentLogWriter::new(project_root, story_id, &resolved_name, &log_session_id) { - Ok(w) => Some(Arc::new(Mutex::new(w))), - Err(e) => { - eprintln!( - "[agents] Failed to create log writer for {story_id}:{resolved_name}: {e}" - ); - None - } - }; - - // Notify WebSocket clients that a new agent is pending. - Self::notify_agent_state_changed(&self.watcher_tx); - - let _ = tx.send(AgentEvent::Status { - story_id: story_id.to_string(), - agent_name: resolved_name.clone(), - status: "pending".to_string(), - }); - - // Extract inactivity timeout from the agent config before cloning config. - let inactivity_timeout_secs = config - .find_agent(&resolved_name) - .map(|a| a.inactivity_timeout_secs) - .unwrap_or(300); - - // Clone all values needed inside the background spawn. 
- let project_root_clone = project_root.to_path_buf(); - let config_clone = config.clone(); - let resume_context_owned = resume_context.map(str::to_string); - let sid = story_id.to_string(); - let aname = resolved_name.clone(); - let tx_clone = tx.clone(); - let agents_ref = self.agents.clone(); - let key_clone = key.clone(); - let log_clone = event_log.clone(); - let port_for_task = self.port; - let log_writer_clone = log_writer.clone(); - let child_killers_clone = self.child_killers.clone(); - let watcher_tx_clone = self.watcher_tx.clone(); - - // Spawn the background task. Worktree creation and agent launch happen here - // so `start_agent` returns immediately after registering the agent as - // Pending — non-blocking by design (story 157). - let handle = tokio::spawn(async move { - // Step 1: create the worktree (slow — git checkout, pnpm install, etc.) - let wt_info = match worktree::create_worktree( - &project_root_clone, - &sid, - &config_clone, - port_for_task, - ) - .await - { - Ok(wt) => wt, - Err(e) => { - let error_msg = format!("Failed to create worktree: {e}"); - slog_error!("[agents] {error_msg}"); - let event = AgentEvent::Error { - story_id: sid.clone(), - agent_name: aname.clone(), - message: error_msg, - }; - if let Ok(mut log) = log_clone.lock() { - log.push(event.clone()); - } - let _ = tx_clone.send(event); - if let Ok(mut agents) = agents_ref.lock() - && let Some(agent) = agents.get_mut(&key_clone) - { - agent.status = AgentStatus::Failed; - } - Self::notify_agent_state_changed(&watcher_tx_clone); - return; - } - }; - - // Step 2: store worktree info and render agent command/args/prompt. 
- let wt_path_str = wt_info.path.to_string_lossy().to_string(); - { - if let Ok(mut agents) = agents_ref.lock() - && let Some(agent) = agents.get_mut(&key_clone) - { - agent.worktree_info = Some(wt_info.clone()); - } - } - - let (command, args, mut prompt) = match config_clone.render_agent_args( - &wt_path_str, - &sid, - Some(&aname), - Some(&wt_info.base_branch), - ) { - Ok(result) => result, - Err(e) => { - let error_msg = format!("Failed to render agent args: {e}"); - slog_error!("[agents] {error_msg}"); - let event = AgentEvent::Error { - story_id: sid.clone(), - agent_name: aname.clone(), - message: error_msg, - }; - if let Ok(mut log) = log_clone.lock() { - log.push(event.clone()); - } - let _ = tx_clone.send(event); - if let Ok(mut agents) = agents_ref.lock() - && let Some(agent) = agents.get_mut(&key_clone) - { - agent.status = AgentStatus::Failed; - } - Self::notify_agent_state_changed(&watcher_tx_clone); - return; - } - }; - - // Append resume context if this is a restart with failure information. - if let Some(ctx) = resume_context_owned { - prompt.push_str(&ctx); - } - - // Step 3: transition to Running now that the worktree is ready. - { - if let Ok(mut agents) = agents_ref.lock() - && let Some(agent) = agents.get_mut(&key_clone) - { - agent.status = AgentStatus::Running; - } - } - let _ = tx_clone.send(AgentEvent::Status { - story_id: sid.clone(), - agent_name: aname.clone(), - status: "running".to_string(), - }); - Self::notify_agent_state_changed(&watcher_tx_clone); - - // Step 4: launch the agent process. - match super::pty::run_agent_pty_streaming( - &sid, - &aname, - &command, - &args, - &prompt, - &wt_path_str, - &tx_clone, - &log_clone, - log_writer_clone, - inactivity_timeout_secs, - child_killers_clone, - ) - .await - { - Ok(pty_result) => { - // Persist token usage if the agent reported it. 
- if let Some(ref usage) = pty_result.token_usage - && let Ok(agents) = agents_ref.lock() - && let Some(agent) = agents.get(&key_clone) - && let Some(ref pr) = agent.project_root - { - let model = config_clone - .find_agent(&aname) - .and_then(|a| a.model.clone()); - let record = super::token_usage::build_record( - &sid, &aname, model, usage.clone(), - ); - if let Err(e) = super::token_usage::append_record(pr, &record) { - slog_error!( - "[agents] Failed to persist token usage for \ - {sid}:{aname}: {e}" - ); - } - } - - // Server-owned completion: run acceptance gates automatically - // when the agent process exits normally. - run_server_owned_completion( - &agents_ref, - port_for_task, - &sid, - &aname, - pty_result.session_id, - watcher_tx_clone.clone(), - ) - .await; - Self::notify_agent_state_changed(&watcher_tx_clone); - } - Err(e) => { - slog_error!("[agents] Agent process error for {aname} on {sid}: {e}"); - let event = AgentEvent::Error { - story_id: sid.clone(), - agent_name: aname.clone(), - message: e, - }; - if let Ok(mut log) = log_clone.lock() { - log.push(event.clone()); - } - let _ = tx_clone.send(event); - if let Ok(mut agents) = agents_ref.lock() - && let Some(agent) = agents.get_mut(&key_clone) - { - agent.status = AgentStatus::Failed; - } - Self::notify_agent_state_changed(&watcher_tx_clone); - } - } - }); - - // Store the task handle while the agent is still Pending. - { - let mut agents = self.agents.lock().map_err(|e| e.to_string())?; - if let Some(agent) = agents.get_mut(&key) { - agent.task_handle = Some(handle); - } - } - - // Agent successfully spawned — prevent the guard from removing the entry. - pending_guard.disarm(); - - Ok(AgentInfo { - story_id: story_id.to_string(), - agent_name: resolved_name, - status: AgentStatus::Pending, - session_id: None, - worktree_path: None, - base_branch: None, - completion: None, - log_session_id: Some(log_session_id), - }) - } - - /// Stop a running agent. Worktree is preserved for inspection. 
- pub async fn stop_agent( - &self, - _project_root: &Path, - story_id: &str, - agent_name: &str, - ) -> Result<(), String> { - let key = composite_key(story_id, agent_name); - - let (worktree_info, task_handle, tx) = { - let mut agents = self.agents.lock().map_err(|e| e.to_string())?; - let agent = agents - .get_mut(&key) - .ok_or_else(|| format!("No agent '{agent_name}' for story '{story_id}'"))?; - - let wt = agent.worktree_info.clone(); - let handle = agent.task_handle.take(); - let tx = agent.tx.clone(); - agent.status = AgentStatus::Failed; - (wt, handle, tx) - }; - - // Abort the task and kill the PTY child process. - // Note: aborting a spawn_blocking task handle does not interrupt the blocking - // thread, so we must also kill the child process directly via the killer registry. - if let Some(handle) = task_handle { - handle.abort(); - let _ = handle.await; - } - self.kill_child_for_key(&key); - - // Preserve worktree for inspection — don't destroy agent's work on stop. - if let Some(ref wt) = worktree_info { - slog!( - "[agents] Worktree preserved for {story_id}:{agent_name}: {}", - wt.path.display() - ); - } - - let _ = tx.send(AgentEvent::Status { - story_id: story_id.to_string(), - agent_name: agent_name.to_string(), - status: "stopped".to_string(), - }); - - // Remove from map - { - let mut agents = self.agents.lock().map_err(|e| e.to_string())?; - agents.remove(&key); - } - - // Notify WebSocket clients so pipeline board and agent panel update. - Self::notify_agent_state_changed(&self.watcher_tx); - - Ok(()) - } - - /// Return the names of configured agents for `stage` that are not currently - /// running or pending. 
- pub fn available_agents_for_stage( - &self, - config: &ProjectConfig, - stage: &PipelineStage, - ) -> Result, String> { - let agents = self.agents.lock().map_err(|e| e.to_string())?; - Ok(config - .agent - .iter() - .filter(|cfg| agent_config_stage(cfg) == *stage) - .filter(|cfg| { - !agents.values().any(|a| { - a.agent_name == cfg.name - && matches!(a.status, AgentStatus::Running | AgentStatus::Pending) - }) - }) - .map(|cfg| cfg.name.clone()) - .collect()) - } - - /// List all agents with their status. - pub fn list_agents(&self) -> Result, String> { - let agents = self.agents.lock().map_err(|e| e.to_string())?; - Ok(agents - .iter() - .map(|(key, agent)| { - // Extract story_id from composite key "story_id:agent_name" - let story_id = key - .rsplit_once(':') - .map(|(sid, _)| sid.to_string()) - .unwrap_or_else(|| key.clone()); - agent_info_from_entry(&story_id, agent) - }) - .collect()) - } - - /// Subscribe to events for a story agent. - pub fn subscribe( - &self, - story_id: &str, - agent_name: &str, - ) -> Result, String> { - let key = composite_key(story_id, agent_name); - let agents = self.agents.lock().map_err(|e| e.to_string())?; - let agent = agents - .get(&key) - .ok_or_else(|| format!("No agent '{agent_name}' for story '{story_id}'"))?; - Ok(agent.tx.subscribe()) - } - - /// Drain accumulated events for polling. Returns all events since the last drain. - pub fn drain_events( - &self, - story_id: &str, - agent_name: &str, - ) -> Result, String> { - let key = composite_key(story_id, agent_name); - let agents = self.agents.lock().map_err(|e| e.to_string())?; - let agent = agents - .get(&key) - .ok_or_else(|| format!("No agent '{agent_name}' for story '{story_id}'"))?; - let mut log = agent.event_log.lock().map_err(|e| e.to_string())?; - Ok(log.drain(..).collect()) - } - - /// Block until the agent reaches a terminal state (completed, failed, stopped). - /// Returns the agent's final `AgentInfo`. 
- /// `timeout_ms` caps how long to wait; returns an error if the deadline passes. - pub async fn wait_for_agent( - &self, - story_id: &str, - agent_name: &str, - timeout_ms: u64, - ) -> Result { - // Subscribe before checking status so we don't miss the terminal event - // if the agent completes in the window between the two operations. - let mut rx = self.subscribe(story_id, agent_name)?; - - // Return immediately if already in a terminal state. - { - let agents = self.agents.lock().map_err(|e| e.to_string())?; - let key = composite_key(story_id, agent_name); - if let Some(agent) = agents.get(&key) - && matches!(agent.status, AgentStatus::Completed | AgentStatus::Failed) - { - return Ok(agent_info_from_entry(story_id, agent)); - } - } - - let deadline = tokio::time::Instant::now() + std::time::Duration::from_millis(timeout_ms); - - loop { - let remaining = deadline.saturating_duration_since(tokio::time::Instant::now()); - if remaining.is_zero() { - return Err(format!( - "Timed out after {timeout_ms}ms waiting for agent '{agent_name}' on story '{story_id}'" - )); - } - - match tokio::time::timeout(remaining, rx.recv()).await { - Ok(Ok(event)) => { - let is_terminal = match &event { - AgentEvent::Done { .. } | AgentEvent::Error { .. } => true, - AgentEvent::Status { status, .. } if status == "stopped" => true, - _ => false, - }; - if is_terminal { - let agents = self.agents.lock().map_err(|e| e.to_string())?; - let key = composite_key(story_id, agent_name); - return Ok(if let Some(agent) = agents.get(&key) { - agent_info_from_entry(story_id, agent) - } else { - // Agent was removed from map (e.g. stop_agent removes it after - // the "stopped" status event is sent). - let (status, session_id) = match event { - AgentEvent::Done { session_id, .. 
} => { - (AgentStatus::Completed, session_id) - } - _ => (AgentStatus::Failed, None), - }; - AgentInfo { - story_id: story_id.to_string(), - agent_name: agent_name.to_string(), - status, - session_id, - worktree_path: None, - base_branch: None, - completion: None, - log_session_id: None, - } - }); - } - } - Ok(Err(broadcast::error::RecvError::Lagged(_))) => { - // Missed some buffered events — check current status before resuming. - let agents = self.agents.lock().map_err(|e| e.to_string())?; - let key = composite_key(story_id, agent_name); - if let Some(agent) = agents.get(&key) - && matches!(agent.status, AgentStatus::Completed | AgentStatus::Failed) - { - return Ok(agent_info_from_entry(story_id, agent)); - } - // Still running — continue the loop. - } - Ok(Err(broadcast::error::RecvError::Closed)) => { - // Channel closed: no more events will arrive. Return current state. - let agents = self.agents.lock().map_err(|e| e.to_string())?; - let key = composite_key(story_id, agent_name); - if let Some(agent) = agents.get(&key) { - return Ok(agent_info_from_entry(story_id, agent)); - } - return Err(format!( - "Agent '{agent_name}' for story '{story_id}' channel closed unexpectedly" - )); - } - Err(_) => { - return Err(format!( - "Timed out after {timeout_ms}ms waiting for agent '{agent_name}' on story '{story_id}'" - )); - } - } - } - } - - /// Create a worktree for the given story using the server port (writes .mcp.json). - pub async fn create_worktree( - &self, - project_root: &Path, - story_id: &str, - ) -> Result { - let config = ProjectConfig::load(project_root)?; - worktree::create_worktree(project_root, story_id, &config, self.port).await - } - - /// Advance the pipeline after an agent completes. - /// - /// Called internally by `report_completion` as a background task. 
- /// Reads the stored completion report and project_root from the agent, - /// then drives the next pipeline stage based on the agent's role: - /// - /// - **Coder** + gates passed → move story to `work/3_qa/`, start `qa` agent. - /// - **Coder** + gates failed → restart the same coder agent with failure context. - /// - **QA** + gates passed + coverage passed → move story to `work/4_merge/`, start `mergemaster` agent. - /// - **QA** + gates passed + coverage failed → restart `qa` with coverage failure context. - /// - **QA** + gates failed → restart `qa` with failure context. - /// - **Mergemaster** → run `script/test` on master; if pass: archive + cleanup worktree; - /// if fail: restart `mergemaster` with failure context. - /// - **Other** (supervisor, unknown) → no automatic advancement. - async fn run_pipeline_advance( - &self, - story_id: &str, - agent_name: &str, - completion: CompletionReport, - project_root: Option, - worktree_path: Option, - merge_failure_reported: bool, - ) { - let project_root = match project_root { - Some(p) => p, - None => { - slog_warn!("[pipeline] No project_root for '{story_id}:{agent_name}'"); - return; - } - }; - - let config = ProjectConfig::load(&project_root).unwrap_or_default(); - let stage = config - .find_agent(agent_name) - .map(agent_config_stage) - .unwrap_or_else(|| pipeline_stage(agent_name)); - - match stage { - PipelineStage::Other => { - // Supervisors and unknown agents do not advance the pipeline. - } - PipelineStage::Coder => { - if completion.gates_passed { - // Determine effective QA mode for this story. - let qa_mode = { - let item_type = super::lifecycle::item_type_from_id(story_id); - if item_type == "spike" { - crate::io::story_metadata::QaMode::Human - } else { - let default_qa = config.default_qa_mode(); - // Story is in 2_current/ when a coder completes. 
- let story_path = project_root - .join(".story_kit/work/2_current") - .join(format!("{story_id}.md")); - crate::io::story_metadata::resolve_qa_mode(&story_path, default_qa) - } - }; - - match qa_mode { - crate::io::story_metadata::QaMode::Server => { - slog!( - "[pipeline] Coder '{agent_name}' passed gates for '{story_id}'. \ - qa: server — moving directly to merge." - ); - if let Err(e) = - super::lifecycle::move_story_to_merge(&project_root, story_id) - { - slog_error!( - "[pipeline] Failed to move '{story_id}' to 4_merge/: {e}" - ); - } else if let Err(e) = self - .start_agent(&project_root, story_id, Some("mergemaster"), None) - .await - { - slog_error!( - "[pipeline] Failed to start mergemaster for '{story_id}': {e}" - ); - } - } - crate::io::story_metadata::QaMode::Agent => { - slog!( - "[pipeline] Coder '{agent_name}' passed gates for '{story_id}'. \ - qa: agent — moving to QA." - ); - if let Err(e) = super::lifecycle::move_story_to_qa(&project_root, story_id) { - slog_error!("[pipeline] Failed to move '{story_id}' to 3_qa/: {e}"); - } else if let Err(e) = self - .start_agent(&project_root, story_id, Some("qa"), None) - .await - { - slog_error!("[pipeline] Failed to start qa agent for '{story_id}': {e}"); - } - } - crate::io::story_metadata::QaMode::Human => { - slog!( - "[pipeline] Coder '{agent_name}' passed gates for '{story_id}'. \ - qa: human — holding for human review." - ); - if let Err(e) = super::lifecycle::move_story_to_qa(&project_root, story_id) { - slog_error!("[pipeline] Failed to move '{story_id}' to 3_qa/: {e}"); - } else { - let qa_dir = project_root.join(".story_kit/work/3_qa"); - let story_path = qa_dir.join(format!("{story_id}.md")); - if let Err(e) = - crate::io::story_metadata::write_review_hold(&story_path) - { - slog_error!( - "[pipeline] Failed to set review_hold on '{story_id}': {e}" - ); - } - } - } - } - } else { - // Increment retry count and check if blocked. 
- let story_path = project_root - .join(".story_kit/work/2_current") - .join(format!("{story_id}.md")); - if should_block_story(&story_path, config.max_retries, story_id, "coder") { - // Story has exceeded retry limit — do not restart. - } else { - slog!( - "[pipeline] Coder '{agent_name}' failed gates for '{story_id}'. Restarting." - ); - let context = format!( - "\n\n---\n## Previous Attempt Failed\n\ - The acceptance gates failed with the following output:\n{}\n\n\ - Please review the failures above, fix the issues, and try again.", - completion.gate_output - ); - if let Err(e) = self - .start_agent(&project_root, story_id, Some(agent_name), Some(&context)) - .await - { - slog_error!( - "[pipeline] Failed to restart coder '{agent_name}' for '{story_id}': {e}" - ); - } - } - } - } - PipelineStage::Qa => { - if completion.gates_passed { - // Run coverage gate in the QA worktree before advancing to merge. - let coverage_path = worktree_path - .clone() - .unwrap_or_else(|| project_root.clone()); - let cp = coverage_path.clone(); - let coverage_result = - tokio::task::spawn_blocking(move || super::gates::run_coverage_gate(&cp)) - .await - .unwrap_or_else(|e| { - slog_warn!("[pipeline] Coverage gate task panicked: {e}"); - Ok((false, format!("Coverage gate task panicked: {e}"))) - }); - let (coverage_passed, coverage_output) = match coverage_result { - Ok(pair) => pair, - Err(e) => (false, e), - }; - - if coverage_passed { - // Check whether this item needs human review before merging. - let needs_human_review = { - let item_type = super::lifecycle::item_type_from_id(story_id); - if item_type == "spike" { - true // Spikes always need human review. 
- } else { - let qa_dir = project_root.join(".story_kit/work/3_qa"); - let story_path = qa_dir.join(format!("{story_id}.md")); - let default_qa = config.default_qa_mode(); - matches!( - crate::io::story_metadata::resolve_qa_mode(&story_path, default_qa), - crate::io::story_metadata::QaMode::Human - ) - } - }; - - if needs_human_review { - // Hold in 3_qa/ for human review. - let qa_dir = project_root.join(".story_kit/work/3_qa"); - let story_path = qa_dir.join(format!("{story_id}.md")); - if let Err(e) = - crate::io::story_metadata::write_review_hold(&story_path) - { - slog_error!( - "[pipeline] Failed to set review_hold on '{story_id}': {e}" - ); - } - slog!( - "[pipeline] QA passed for '{story_id}'. \ - Holding for human review. \ - Worktree preserved at: {worktree_path:?}" - ); - } else { - slog!( - "[pipeline] QA passed gates and coverage for '{story_id}'. \ - Moving directly to merge." - ); - if let Err(e) = - super::lifecycle::move_story_to_merge(&project_root, story_id) - { - slog_error!( - "[pipeline] Failed to move '{story_id}' to 4_merge/: {e}" - ); - } else if let Err(e) = self - .start_agent(&project_root, story_id, Some("mergemaster"), None) - .await - { - slog_error!( - "[pipeline] Failed to start mergemaster for '{story_id}': {e}" - ); - } - } - } else { - let story_path = project_root - .join(".story_kit/work/3_qa") - .join(format!("{story_id}.md")); - if should_block_story(&story_path, config.max_retries, story_id, "qa-coverage") { - // Story has exceeded retry limit — do not restart. - } else { - slog!( - "[pipeline] QA coverage gate failed for '{story_id}'. Restarting QA." 
- ); - let context = format!( - "\n\n---\n## Coverage Gate Failed\n\ - The coverage gate (script/test_coverage) failed with the following output:\n{}\n\n\ - Please improve test coverage until the coverage gate passes.", - coverage_output - ); - if let Err(e) = self - .start_agent(&project_root, story_id, Some("qa"), Some(&context)) - .await - { - slog_error!("[pipeline] Failed to restart qa for '{story_id}': {e}"); - } - } - } - } else { - let story_path = project_root - .join(".story_kit/work/3_qa") - .join(format!("{story_id}.md")); - if should_block_story(&story_path, config.max_retries, story_id, "qa") { - // Story has exceeded retry limit — do not restart. - } else { - slog!("[pipeline] QA failed gates for '{story_id}'. Restarting."); - let context = format!( - "\n\n---\n## Previous QA Attempt Failed\n\ - The acceptance gates failed with the following output:\n{}\n\n\ - Please re-run and fix the issues.", - completion.gate_output - ); - if let Err(e) = self - .start_agent(&project_root, story_id, Some("qa"), Some(&context)) - .await - { - slog_error!("[pipeline] Failed to restart qa for '{story_id}': {e}"); - } - } - } - } - PipelineStage::Mergemaster => { - // Block advancement if the mergemaster explicitly reported a failure. - // The server-owned gate check runs in the feature-branch worktree (not - // master), so `gates_passed=true` is misleading when no code was merged. - if merge_failure_reported { - slog!( - "[pipeline] Pipeline advancement blocked for '{story_id}': \ - mergemaster explicitly reported a merge failure. \ - Story stays in 4_merge/ for human review." - ); - } else { - // Run script/test on master (project_root) as the post-merge verification. - slog!( - "[pipeline] Mergemaster completed for '{story_id}'. Running post-merge tests on master." 
- ); - let root = project_root.clone(); - let test_result = - tokio::task::spawn_blocking(move || super::gates::run_project_tests(&root)) - .await - .unwrap_or_else(|e| { - slog_warn!("[pipeline] Post-merge test task panicked: {e}"); - Ok((false, format!("Test task panicked: {e}"))) - }); - let (passed, output) = match test_result { - Ok(pair) => pair, - Err(e) => (false, e), - }; - - if passed { - slog!( - "[pipeline] Post-merge tests passed for '{story_id}'. Moving to done." - ); - if let Err(e) = - super::lifecycle::move_story_to_archived(&project_root, story_id) - { - slog_error!("[pipeline] Failed to move '{story_id}' to done: {e}"); - } - self.remove_agents_for_story(story_id); - // TODO: Re-enable worktree cleanup once we have persistent agent logs. - // Removing worktrees destroys evidence needed to debug empty-commit agents. - // let config = - // crate::config::ProjectConfig::load(&project_root).unwrap_or_default(); - // if let Err(e) = - // worktree::remove_worktree_by_story_id(&project_root, story_id, &config) - // .await - // { - // slog!( - // "[pipeline] Failed to remove worktree for '{story_id}': {e}" - // ); - // } - slog!( - "[pipeline] Story '{story_id}' done. Worktree preserved for inspection." - ); - } else { - let story_path = project_root - .join(".story_kit/work/4_merge") - .join(format!("{story_id}.md")); - if should_block_story(&story_path, config.max_retries, story_id, "mergemaster") { - // Story has exceeded retry limit — do not restart. - } else { - slog!( - "[pipeline] Post-merge tests failed for '{story_id}'. Restarting mergemaster." 
                        );
                        let context = format!(
                            "\n\n---\n## Post-Merge Test Failed\n\
                            The tests on master failed with the following output:\n{}\n\n\
                            Please investigate and resolve the failures, then call merge_agent_work again.",
                            output
                        );
                        if let Err(e) = self
                            .start_agent(
                                &project_root,
                                story_id,
                                Some("mergemaster"),
                                Some(&context),
                            )
                            .await
                        {
                            slog_error!(
                                "[pipeline] Failed to restart mergemaster for '{story_id}': {e}"
                            );
                        }
                    }
                }
            }
        }

        // Always scan for unassigned work after any agent completes, regardless
        // of the outcome (success, failure, restart). This ensures stories that
        // failed agent assignment due to busy agents are retried when agents
        // become available (bug 295).
        self.auto_assign_available_work(&project_root).await;
    }

    /// Internal: report that an agent has finished work on a story.
    ///
    /// **Note:** This is no longer exposed as an MCP tool. The server now
    /// automatically runs completion gates when an agent process exits
    /// (see `run_server_owned_completion`). This method is retained for
    /// backwards compatibility and testing.
    ///
    /// - Rejects with an error if the worktree has uncommitted changes.
    /// - Runs acceptance gates (cargo clippy + cargo nextest run / cargo test).
    /// - Stores the `CompletionReport` on the agent record.
    /// - Transitions status to `Completed` (gates passed) or `Failed` (gates failed).
    /// - Emits a `Done` event so `wait_for_agent` unblocks.
    #[allow(dead_code)]
    pub async fn report_completion(
        &self,
        story_id: &str,
        agent_name: &str,
        summary: &str,
    ) -> Result<CompletionReport, String> {
        let key = composite_key(story_id, agent_name);

        // Verify agent exists, is Running, and grab its worktree path.
        // The lock is released at the end of this block — gates must NOT run
        // while holding it, since they take minutes.
        let worktree_path = {
            let agents = self.agents.lock().map_err(|e| e.to_string())?;
            let agent = agents
                .get(&key)
                .ok_or_else(|| format!("No agent '{agent_name}' for story '{story_id}'"))?;

            if agent.status != AgentStatus::Running {
                return Err(format!(
                    "Agent '{agent_name}' for story '{story_id}' is not running (status: {}). \
                    report_completion can only be called by a running agent.",
                    agent.status
                ));
            }

            agent
                .worktree_info
                .as_ref()
                .map(|wt| wt.path.clone())
                .ok_or_else(|| {
                    format!(
                        "Agent '{agent_name}' for story '{story_id}' has no worktree. \
                        Cannot run acceptance gates."
                    )
                })?
        };

        let path = worktree_path.clone();

        // Run gate checks in a blocking thread to avoid stalling the async runtime.
        let (gates_passed, gate_output) = tokio::task::spawn_blocking(move || {
            // Step 1: Reject if worktree is dirty.
            super::gates::check_uncommitted_changes(&path)?;
            // Step 2: Run clippy + tests and return (passed, output).
            super::gates::run_acceptance_gates(&path)
        })
        .await
        .map_err(|e| format!("Gate check task panicked: {e}"))??;

        let report = CompletionReport {
            summary: summary.to_string(),
            gates_passed,
            gate_output,
        };

        // Extract data for pipeline advance, then remove the entry so
        // completed agents never appear in list_agents.
        // NOTE: the lock was dropped while gates ran, so the entry may have
        // been removed concurrently — that case is surfaced as an error below.
        let (
            tx,
            session_id,
            project_root_for_advance,
            wt_path_for_advance,
            merge_failure_reported_for_advance,
        ) = {
            let mut agents = self.agents.lock().map_err(|e| e.to_string())?;
            let agent = agents.get_mut(&key).ok_or_else(|| {
                format!("Agent '{agent_name}' for story '{story_id}' disappeared during gate check")
            })?;
            agent.completion = Some(report.clone());
            let tx = agent.tx.clone();
            let sid = agent.session_id.clone();
            let pr = agent.project_root.clone();
            let wt = agent.worktree_info.as_ref().map(|w| w.path.clone());
            let mfr = agent.merge_failure_reported;
            agents.remove(&key);
            (tx, sid, pr, wt, mfr)
        };

        // Emit Done so wait_for_agent unblocks.
        let _ = tx.send(AgentEvent::Done {
            story_id: story_id.to_string(),
            agent_name: agent_name.to_string(),
            session_id,
        });

        // Notify WebSocket clients that the agent is gone.
        Self::notify_agent_state_changed(&self.watcher_tx);

        // Advance the pipeline state machine in a background task.
        // `pool_clone` is a shallow handle: every field is an Arc/Sender clone,
        // so it shares state with `self` rather than copying the pool.
        let pool_clone = Self {
            agents: Arc::clone(&self.agents),
            port: self.port,
            child_killers: Arc::clone(&self.child_killers),
            watcher_tx: self.watcher_tx.clone(),
            merge_jobs: Arc::clone(&self.merge_jobs),
        };
        let sid = story_id.to_string();
        let aname = agent_name.to_string();
        let report_for_advance = report.clone();
        tokio::spawn(async move {
            pool_clone
                .run_pipeline_advance(
                    &sid,
                    &aname,
                    report_for_advance,
                    project_root_for_advance,
                    wt_path_for_advance,
                    merge_failure_reported_for_advance,
                )
                .await;
        });

        Ok(report)
    }

    /// Run the full mergemaster pipeline for a completed story:
    ///
    /// 1. Squash-merge the story's feature branch into the current branch (master).
    /// 2. If conflicts are found: abort the merge and report them.
    /// 3. Quality gates run **inside the merge worktree** before master is touched.
    /// 4. If gates pass: cherry-pick the squash commit onto master and archive the story.
- /// - /// Returns a `MergeReport` with full details of what happened. - /// Start the merge pipeline as a background task. - /// - /// Returns immediately so the MCP tool call doesn't time out (the full - /// pipeline — squash merge + quality gates — takes well over 60 seconds, - /// exceeding Claude Code's MCP tool-call timeout). - /// - /// The mergemaster agent should poll [`get_merge_status`](Self::get_merge_status) - /// until the job reaches a terminal state. - pub fn start_merge_agent_work( - self: &Arc, - project_root: &Path, - story_id: &str, - ) -> Result<(), String> { - // Guard against double-starts. - { - let jobs = self.merge_jobs.lock().map_err(|e| e.to_string())?; - if let Some(job) = jobs.get(story_id) - && matches!(job.status, super::merge::MergeJobStatus::Running) - { - return Err(format!( - "Merge already in progress for '{story_id}'. \ - Use get_merge_status to poll for completion." - )); - } - } - - // Insert Running job. - { - let mut jobs = self.merge_jobs.lock().map_err(|e| e.to_string())?; - jobs.insert( - story_id.to_string(), - super::merge::MergeJob { - story_id: story_id.to_string(), - status: super::merge::MergeJobStatus::Running, - }, - ); - } - - let pool = Arc::clone(self); - let root = project_root.to_path_buf(); - let sid = story_id.to_string(); - - tokio::spawn(async move { - let report = pool.run_merge_pipeline(&root, &sid).await; - let failed = report.is_err(); - let status = match report { - Ok(r) => super::merge::MergeJobStatus::Completed(r), - Err(e) => super::merge::MergeJobStatus::Failed(e), - }; - if let Ok(mut jobs) = pool.merge_jobs.lock() - && let Some(job) = jobs.get_mut(&sid) - { - job.status = status; - } - if failed { - pool.auto_assign_available_work(&root).await; - } - }); - - Ok(()) - } - - /// The actual merge pipeline, run inside a background task. 
- async fn run_merge_pipeline( - self: &Arc, - project_root: &Path, - story_id: &str, - ) -> Result { - let branch = format!("feature/story-{story_id}"); - let wt_path = worktree::worktree_path(project_root, story_id); - let root = project_root.to_path_buf(); - let sid = story_id.to_string(); - let br = branch.clone(); - - let merge_result = - tokio::task::spawn_blocking(move || super::merge::run_squash_merge(&root, &br, &sid)) - .await - .map_err(|e| format!("Merge task panicked: {e}"))??; - - if !merge_result.success { - return Ok(super::merge::MergeReport { - story_id: story_id.to_string(), - success: false, - had_conflicts: merge_result.had_conflicts, - conflicts_resolved: merge_result.conflicts_resolved, - conflict_details: merge_result.conflict_details, - gates_passed: merge_result.gates_passed, - gate_output: merge_result.output, - worktree_cleaned_up: false, - story_archived: false, - }); - } - - let story_archived = - super::lifecycle::move_story_to_archived(project_root, story_id).is_ok(); - if story_archived { - self.remove_agents_for_story(story_id); - } - - let worktree_cleaned_up = if wt_path.exists() { - let config = crate::config::ProjectConfig::load(project_root).unwrap_or_default(); - worktree::remove_worktree_by_story_id(project_root, story_id, &config) - .await - .is_ok() - } else { - false - }; - - self.auto_assign_available_work(project_root).await; - - Ok(super::merge::MergeReport { - story_id: story_id.to_string(), - success: true, - had_conflicts: merge_result.had_conflicts, - conflicts_resolved: merge_result.conflicts_resolved, - conflict_details: merge_result.conflict_details, - gates_passed: true, - gate_output: merge_result.output, - worktree_cleaned_up, - story_archived, - }) - } - - /// Check the status of a background merge job. - pub fn get_merge_status(&self, story_id: &str) -> Option { - self.merge_jobs - .lock() - .ok() - .and_then(|jobs| jobs.get(story_id).cloned()) - } - - /// Get project root helper. 
- pub fn get_project_root(&self, state: &crate::state::SessionState) -> Result { - state.get_project_root() - } - - /// Get the log session ID and project root for an agent, if available. - /// - /// Used by MCP tools to find the persistent log file for a completed agent. - pub fn get_log_info(&self, story_id: &str, agent_name: &str) -> Option<(String, PathBuf)> { - let key = composite_key(story_id, agent_name); - let agents = self.agents.lock().ok()?; - let agent = agents.get(&key)?; - let session_id = agent.log_session_id.clone()?; - let project_root = agent.project_root.clone()?; - Some((session_id, project_root)) - } - - /// Record that the mergemaster agent for `story_id` explicitly reported a - /// merge failure via the `report_merge_failure` MCP tool. - /// - /// Sets `merge_failure_reported = true` on the active mergemaster agent so - /// that `run_pipeline_advance` can block advancement to `5_done/` even when - /// the server-owned gate check returns `gates_passed=true` (those gates run - /// in the feature-branch worktree, not on master). 
- pub fn set_merge_failure_reported(&self, story_id: &str) { - match self.agents.lock() { - Ok(mut lock) => { - let found = lock.iter_mut().find(|(key, agent)| { - let key_story_id = key - .rsplit_once(':') - .map(|(sid, _)| sid) - .unwrap_or(key.as_str()); - key_story_id == story_id - && pipeline_stage(&agent.agent_name) == PipelineStage::Mergemaster - }); - match found { - Some((_, agent)) => { - agent.merge_failure_reported = true; - slog!( - "[pipeline] Merge failure flag set for '{story_id}:{}'", - agent.agent_name - ); - } - None => { - slog_warn!( - "[pipeline] set_merge_failure_reported: no running mergemaster found \ - for story '{story_id}' — flag not set" - ); - } - } - } - Err(e) => { - slog_error!("[pipeline] set_merge_failure_reported: could not lock agents: {e}"); - } - } - } - - /// Test helper: inject a pre-built agent entry so unit tests can exercise - /// wait/subscribe logic without spawning a real process. - #[cfg(test)] - pub fn inject_test_agent( - &self, - story_id: &str, - agent_name: &str, - status: AgentStatus, - ) -> broadcast::Sender { - let (tx, _) = broadcast::channel::(64); - let key = composite_key(story_id, agent_name); - let mut agents = self.agents.lock().unwrap(); - agents.insert( - key, - StoryAgent { - agent_name: agent_name.to_string(), - status, - worktree_info: None, - session_id: None, - tx: tx.clone(), - task_handle: None, - event_log: Arc::new(Mutex::new(Vec::new())), - completion: None, - project_root: None, - log_session_id: None, - merge_failure_reported: false, - }, - ); - tx - } - - /// Test helper: inject an agent with a specific worktree path for testing - /// gate-related logic. 
    #[cfg(test)]
    pub fn inject_test_agent_with_path(
        &self,
        story_id: &str,
        agent_name: &str,
        status: AgentStatus,
        worktree_path: PathBuf,
    ) -> broadcast::Sender<AgentEvent> {
        let (tx, _) = broadcast::channel::<AgentEvent>(64);
        let key = composite_key(story_id, agent_name);
        let mut agents = self.agents.lock().unwrap();
        agents.insert(
            key,
            StoryAgent {
                agent_name: agent_name.to_string(),
                status,
                worktree_info: Some(WorktreeInfo {
                    path: worktree_path,
                    branch: format!("feature/story-{story_id}"),
                    base_branch: "master".to_string(),
                }),
                session_id: None,
                tx: tx.clone(),
                task_handle: None,
                event_log: Arc::new(Mutex::new(Vec::new())),
                completion: None,
                project_root: None,
                log_session_id: None,
                merge_failure_reported: false,
            },
        );
        tx
    }

    /// Automatically assign free agents to stories waiting in the active pipeline stages.
    ///
    /// Scans `work/2_current/`, `work/3_qa/`, and `work/4_merge/` for items that have no
    /// active agent and assigns the first free agent of the appropriate role. Items in
    /// `work/1_backlog/` are never auto-started.
    ///
    /// Respects the configured agent roster: the maximum number of concurrently active agents
    /// per role is bounded by the count of agents of that role defined in `project.toml`.
    pub async fn auto_assign_available_work(&self, project_root: &Path) {
        let config = match ProjectConfig::load(project_root) {
            Ok(c) => c,
            Err(e) => {
                slog_warn!("[auto-assign] Failed to load project config: {e}");
                return;
            }
        };

        // Process each active pipeline stage in order.
        let stages: [(&str, PipelineStage); 3] = [
            ("2_current", PipelineStage::Coder),
            ("3_qa", PipelineStage::Qa),
            ("4_merge", PipelineStage::Mergemaster),
        ];

        for (stage_dir, stage) in &stages {
            let items = scan_stage_items(project_root, stage_dir);
            if items.is_empty() {
                continue;
            }

            for story_id in &items {
                // Items marked with review_hold (e.g. spikes after QA passes) stay
                // in their current stage for human review — don't auto-assign agents.
                if has_review_hold(project_root, stage_dir, story_id) {
                    continue;
                }

                // Skip blocked stories (retry limit exceeded).
                if is_story_blocked(project_root, stage_dir, story_id) {
                    continue;
                }

                // Skip stories in 4_merge/ that already have a reported merge failure.
                // These need human intervention — auto-assigning a new mergemaster
                // would just waste tokens on the same broken merge.
                if *stage == PipelineStage::Mergemaster
                    && has_merge_failure(project_root, stage_dir, story_id)
                {
                    continue;
                }

                // AC6: Detect empty-diff stories in 4_merge/ before starting a
                // mergemaster. If the worktree has no commits on the feature branch,
                // write a merge_failure and block the story immediately.
                if *stage == PipelineStage::Mergemaster
                    && let Some(wt_path) = worktree::find_worktree_path(project_root, story_id)
                    && !super::gates::worktree_has_committed_work(&wt_path)
                {
                    slog_warn!(
                        "[auto-assign] Story '{story_id}' in 4_merge/ has no commits \
                        on feature branch. Writing merge_failure and blocking."
                    );
                    let story_path = project_root
                        .join(".story_kit/work")
                        .join(stage_dir)
                        .join(format!("{story_id}.md"));
                    let _ = crate::io::story_metadata::write_merge_failure(
                        &story_path,
                        "Feature branch has no code changes — the coder agent \
                        did not produce any commits.",
                    );
                    let _ = crate::io::story_metadata::write_blocked(&story_path);
                    continue;
                }

                // Re-acquire the lock on each iteration to see state changes
                // from previous start_agent calls in the same pass.
                let preferred_agent =
                    read_story_front_matter_agent(project_root, stage_dir, story_id);

                // Check max_coders limit for the Coder stage before agent selection.
                // If the pool is full, all remaining items in this stage wait
                // (hence `break`, not `continue`).
                if *stage == PipelineStage::Coder
                    && let Some(max) = config.max_coders
                {
                    let agents_lock = match self.agents.lock() {
                        Ok(a) => a,
                        Err(e) => {
                            slog_error!("[auto-assign] Failed to lock agents: {e}");
                            break;
                        }
                    };
                    let active = count_active_agents_for_stage(&config, &agents_lock, stage);
                    if active >= max {
                        slog!(
                            "[auto-assign] Coder pool full ({active}/{max}); remaining items in {stage_dir}/ will wait."
                        );
                        break;
                    }
                }

                // Outcome: (already_assigned, chosen_agent, preferred_busy, stage_mismatch)
                // preferred_busy=true means the story has a specific agent requested but it is
                // currently occupied — the story should wait rather than fall back.
                // stage_mismatch=true means the preferred agent's stage doesn't match the
                // pipeline stage, so we fell back to a generic stage agent.
                // The agents lock is held only inside this block so start_agent
                // (below) can take it again without deadlocking.
                let (already_assigned, free_agent, preferred_busy, stage_mismatch) = {
                    let agents = match self.agents.lock() {
                        Ok(a) => a,
                        Err(e) => {
                            slog_error!("[auto-assign] Failed to lock agents: {e}");
                            break;
                        }
                    };
                    let assigned = is_story_assigned_for_stage(&config, &agents, story_id, stage);
                    if assigned {
                        (true, None, false, false)
                    } else if let Some(ref pref) = preferred_agent {
                        // Story has a front-matter agent preference.
                        // Verify the preferred agent's stage matches the current
                        // pipeline stage — a coder shouldn't be assigned to QA.
                        let pref_stage_matches = config
                            .find_agent(pref)
                            .map(|cfg| agent_config_stage(cfg) == *stage)
                            .unwrap_or(false);
                        if !pref_stage_matches {
                            // Stage mismatch — fall back to any free agent for this stage.
                            let free = find_free_agent_for_stage(&config, &agents, stage)
                                .map(|s| s.to_string());
                            (false, free, false, true)
                        } else if is_agent_free(&agents, pref) {
                            (false, Some(pref.clone()), false, false)
                        } else {
                            (false, None, true, false)
                        }
                    } else {
                        let free = find_free_agent_for_stage(&config, &agents, stage)
                            .map(|s| s.to_string());
                        (false, free, false, false)
                    }
                };

                if already_assigned {
                    // Story already has an active agent — skip silently.
                    continue;
                }

                if preferred_busy {
                    // The story requests a specific agent that is currently busy.
                    // Do not fall back to a different agent; let this story wait.
                    slog!(
                        "[auto-assign] Preferred agent '{}' busy for '{story_id}'; story will wait.",
                        preferred_agent.as_deref().unwrap_or("?")
                    );
                    continue;
                }

                if stage_mismatch {
                    slog!(
                        "[auto-assign] Preferred agent '{}' stage mismatch for '{story_id}' in {stage_dir}/; falling back to stage-appropriate agent.",
                        preferred_agent.as_deref().unwrap_or("?")
                    );
                }

                match free_agent {
                    Some(agent_name) => {
                        slog!(
                            "[auto-assign] Assigning '{agent_name}' to '{story_id}' in {stage_dir}/"
                        );
                        if let Err(e) = self
                            .start_agent(project_root, story_id, Some(&agent_name), None)
                            .await
                        {
                            slog!(
                                "[auto-assign] Failed to start '{agent_name}' for '{story_id}': {e}"
                            );
                        }
                    }
                    None => {
                        // No free agents of this type — stop scanning this stage.
                        slog!(
                            "[auto-assign] All {:?} agents busy; remaining items in {stage_dir}/ will wait.",
                            stage
                        );
                        break;
                    }
                }
            }
        }
    }

    /// Reconcile stories whose agent work was committed while the server was offline.
    ///
    /// On server startup the in-memory agent pool is empty, so any story that an agent
    /// completed during a previous session is stuck: the worktree has committed work but
    /// the pipeline never advanced. This method detects those stories, re-runs the
    /// acceptance gates, and advances the pipeline stage so that `auto_assign_available_work`
    /// (called immediately after) picks up the right next-stage agents.
    ///
    /// Algorithm:
    /// 1. List all worktree directories under `{project_root}/.story_kit/worktrees/`.
    /// 2. For each worktree, check whether its feature branch has commits ahead of the
    ///    base branch (`master` / `main`).
    /// 3. If committed work is found AND the story is in `2_current/` or `3_qa/`:
    ///    - Run acceptance gates (uncommitted-change check + clippy + tests).
    ///    - On pass + `2_current/`: move the story to `3_qa/`.
    ///    - On pass + `3_qa/`: run the coverage gate; if that also passes move to `4_merge/`.
    ///    - On failure: leave the story where it is so `auto_assign_available_work` can
    ///      start a fresh agent to retry.
    /// 4. Stories in `4_merge/` are left for `auto_assign_available_work` to handle via a
    ///    fresh mergemaster (squash-merge must be re-executed by the mergemaster agent).
    pub async fn reconcile_on_startup(
        &self,
        project_root: &Path,
        progress_tx: &broadcast::Sender<ReconciliationEvent>,
    ) {
        let worktrees = match worktree::list_worktrees(project_root) {
            Ok(wt) => wt,
            Err(e) => {
                eprintln!("[startup:reconcile] Failed to list worktrees: {e}");
                let _ = progress_tx.send(ReconciliationEvent {
                    story_id: String::new(),
                    status: "done".to_string(),
                    message: format!("Reconciliation failed: {e}"),
                });
                return;
            }
        };

        for wt_entry in &worktrees {
            let story_id = &wt_entry.story_id;
            let wt_path = wt_entry.path.clone();

            // Determine which active stage the story is in.
            let stage_dir = match find_active_story_stage(project_root, story_id) {
                Some(s) => s,
                None => continue, // Not in any active stage (backlog/archived or unknown).
            };

            // 4_merge/ is left for auto_assign to handle with a fresh mergemaster.
            if stage_dir == "4_merge" {
                continue;
            }

            let _ = progress_tx.send(ReconciliationEvent {
                story_id: story_id.clone(),
                status: "checking".to_string(),
                message: format!("Checking for committed work in {stage_dir}/"),
            });

            // Check whether the worktree has commits ahead of the base branch.
            let wt_path_for_check = wt_path.clone();
            let has_work = tokio::task::spawn_blocking(move || {
                super::gates::worktree_has_committed_work(&wt_path_for_check)
            })
            .await
            .unwrap_or(false);

            if !has_work {
                eprintln!(
                    "[startup:reconcile] No committed work for '{story_id}' in {stage_dir}/; skipping."
                );
                let _ = progress_tx.send(ReconciliationEvent {
                    story_id: story_id.clone(),
                    status: "skipped".to_string(),
                    message: "No committed work found; skipping.".to_string(),
                });
                continue;
            }

            eprintln!(
                "[startup:reconcile] Found committed work for '{story_id}' in {stage_dir}/. Running acceptance gates."
            );
            let _ = progress_tx.send(ReconciliationEvent {
                story_id: story_id.clone(),
                status: "gates_running".to_string(),
                message: "Running acceptance gates…".to_string(),
            });

            // Run acceptance gates on the worktree.
            let wt_path_for_gates = wt_path.clone();
            let gates_result = tokio::task::spawn_blocking(move || {
                super::gates::check_uncommitted_changes(&wt_path_for_gates)?;
                super::gates::run_acceptance_gates(&wt_path_for_gates)
            })
            .await;

            let (gates_passed, gate_output) = match gates_result {
                Ok(Ok(pair)) => pair,
                Ok(Err(e)) => {
                    eprintln!("[startup:reconcile] Gate check error for '{story_id}': {e}");
                    let _ = progress_tx.send(ReconciliationEvent {
                        story_id: story_id.clone(),
                        status: "failed".to_string(),
                        message: format!("Gate error: {e}"),
                    });
                    continue;
                }
                Err(e) => {
                    eprintln!("[startup:reconcile] Gate check task panicked for '{story_id}': {e}");
                    let _ = progress_tx.send(ReconciliationEvent {
                        story_id: story_id.clone(),
                        status: "failed".to_string(),
                        message: format!("Gate task panicked: {e}"),
                    });
                    continue;
                }
            };

            if !gates_passed {
                eprintln!(
                    "[startup:reconcile] Gates failed for '{story_id}': {gate_output}\n\
                    Leaving in {stage_dir}/ for auto-assign to restart the agent."
                );
                let _ = progress_tx.send(ReconciliationEvent {
                    story_id: story_id.clone(),
                    status: "failed".to_string(),
                    message: "Gates failed; will be retried by auto-assign.".to_string(),
                });
                continue;
            }

            eprintln!("[startup:reconcile] Gates passed for '{story_id}' (stage: {stage_dir}/).");

            if stage_dir == "2_current" {
                // Coder stage — determine qa mode to decide next step.
                let qa_mode = {
                    let item_type = super::lifecycle::item_type_from_id(story_id);
                    if item_type == "spike" {
                        // Spikes always get human review.
                        crate::io::story_metadata::QaMode::Human
                    } else {
                        let default_qa = crate::config::ProjectConfig::load(project_root)
                            .unwrap_or_default()
                            .default_qa_mode();
                        let story_path = project_root
                            .join(".story_kit/work/2_current")
                            .join(format!("{story_id}.md"));
                        crate::io::story_metadata::resolve_qa_mode(&story_path, default_qa)
                    }
                };

                match qa_mode {
                    crate::io::story_metadata::QaMode::Server => {
                        if let Err(e) = super::lifecycle::move_story_to_merge(project_root, story_id) {
                            eprintln!("[startup:reconcile] Failed to move '{story_id}' to 4_merge/: {e}");
                            let _ = progress_tx.send(ReconciliationEvent {
                                story_id: story_id.clone(),
                                status: "failed".to_string(),
                                message: format!("Failed to advance to merge: {e}"),
                            });
                        } else {
                            eprintln!("[startup:reconcile] Moved '{story_id}' → 4_merge/ (qa: server).");
                            let _ = progress_tx.send(ReconciliationEvent {
                                story_id: story_id.clone(),
                                status: "advanced".to_string(),
                                message: "Gates passed — moved to merge (qa: server).".to_string(),
                            });
                        }
                    }
                    crate::io::story_metadata::QaMode::Agent => {
                        if let Err(e) = super::lifecycle::move_story_to_qa(project_root, story_id) {
                            eprintln!("[startup:reconcile] Failed to move '{story_id}' to 3_qa/: {e}");
                            let _ = progress_tx.send(ReconciliationEvent {
                                story_id: story_id.clone(),
                                status: "failed".to_string(),
                                message: format!("Failed to advance to QA: {e}"),
                            });
                        } else {
                            eprintln!("[startup:reconcile] Moved '{story_id}' → 3_qa/.");
                            let _ = progress_tx.send(ReconciliationEvent {
                                story_id: story_id.clone(),
                                status: "advanced".to_string(),
                                message: "Gates passed — moved to QA.".to_string(),
                            });
                        }
                    }
                    crate::io::story_metadata::QaMode::Human => {
                        if let Err(e) = super::lifecycle::move_story_to_qa(project_root, story_id) {
                            eprintln!("[startup:reconcile] Failed to move '{story_id}' to 3_qa/: {e}");
                            let _ = progress_tx.send(ReconciliationEvent {
                                story_id: story_id.clone(),
                                status: "failed".to_string(),
                                message: format!("Failed to advance to QA: {e}"),
                            });
                        } else {
                            let story_path = project_root
                                .join(".story_kit/work/3_qa")
                                .join(format!("{story_id}.md"));
                            if let Err(e) = crate::io::story_metadata::write_review_hold(&story_path) {
                                eprintln!(
                                    "[startup:reconcile] Failed to set review_hold on '{story_id}': {e}"
                                );
                            }
                            eprintln!("[startup:reconcile] Moved '{story_id}' → 3_qa/ (qa: human — holding for review).");
                            let _ = progress_tx.send(ReconciliationEvent {
                                story_id: story_id.clone(),
                                status: "review_hold".to_string(),
                                message: "Gates passed — holding for human review.".to_string(),
                            });
                        }
                    }
                }
            } else if stage_dir == "3_qa" {
                // QA stage → run coverage gate before advancing to merge.
                let wt_path_for_cov = wt_path.clone();
                let coverage_result = tokio::task::spawn_blocking(move || {
                    super::gates::run_coverage_gate(&wt_path_for_cov)
                })
                .await;

                let (coverage_passed, coverage_output) = match coverage_result {
                    Ok(Ok(pair)) => pair,
                    Ok(Err(e)) => {
                        eprintln!("[startup:reconcile] Coverage gate error for '{story_id}': {e}");
                        let _ = progress_tx.send(ReconciliationEvent {
                            story_id: story_id.clone(),
                            status: "failed".to_string(),
                            message: format!("Coverage gate error: {e}"),
                        });
                        continue;
                    }
                    Err(e) => {
                        eprintln!(
                            "[startup:reconcile] Coverage gate panicked for '{story_id}': {e}"
                        );
                        let _ = progress_tx.send(ReconciliationEvent {
                            story_id: story_id.clone(),
                            status: "failed".to_string(),
                            message: format!("Coverage gate panicked: {e}"),
                        });
                        continue;
                    }
                };

                if coverage_passed {
                    // Check whether this item needs human review before merging.
                    let needs_human_review = {
                        let item_type = super::lifecycle::item_type_from_id(story_id);
                        if item_type == "spike" {
                            true
                        } else {
                            let story_path = project_root
                                .join(".story_kit/work/3_qa")
                                .join(format!("{story_id}.md"));
                            let default_qa = crate::config::ProjectConfig::load(project_root)
                                .unwrap_or_default()
                                .default_qa_mode();
                            matches!(
                                crate::io::story_metadata::resolve_qa_mode(&story_path, default_qa),
                                crate::io::story_metadata::QaMode::Human
                            )
                        }
                    };

                    if needs_human_review {
                        let story_path = project_root
                            .join(".story_kit/work/3_qa")
                            .join(format!("{story_id}.md"));
                        if let Err(e) = crate::io::story_metadata::write_review_hold(&story_path) {
                            eprintln!(
                                "[startup:reconcile] Failed to set review_hold on '{story_id}': {e}"
                            );
                        }
                        eprintln!(
                            "[startup:reconcile] '{story_id}' passed QA — holding for human review."
                        );
                        let _ = progress_tx.send(ReconciliationEvent {
                            story_id: story_id.clone(),
                            status: "review_hold".to_string(),
                            message: "Passed QA — waiting for human review.".to_string(),
                        });
                    } else if let Err(e) =
                        super::lifecycle::move_story_to_merge(project_root, story_id)
                    {
                        eprintln!(
                            "[startup:reconcile] Failed to move '{story_id}' to 4_merge/: {e}"
                        );
                        let _ = progress_tx.send(ReconciliationEvent {
                            story_id: story_id.clone(),
                            status: "failed".to_string(),
                            message: format!("Failed to advance to merge: {e}"),
                        });
                    } else {
                        eprintln!("[startup:reconcile] Moved '{story_id}' → 4_merge/.");
                        let _ = progress_tx.send(ReconciliationEvent {
                            story_id: story_id.clone(),
                            status: "advanced".to_string(),
                            message: "Gates passed — moved to merge.".to_string(),
                        });
                    }
                } else {
                    eprintln!(
                        "[startup:reconcile] Coverage gate failed for '{story_id}': {coverage_output}\n\
                        Leaving in 3_qa/ for auto-assign to restart the QA agent."
                    );
                    let _ = progress_tx.send(ReconciliationEvent {
                        story_id: story_id.clone(),
                        status: "failed".to_string(),
                        message: "Coverage gate failed; will be retried.".to_string(),
                    });
                }
            }
        }

        // Signal that reconciliation is complete.
        let _ = progress_tx.send(ReconciliationEvent {
            story_id: String::new(),
            status: "done".to_string(),
            message: "Startup reconciliation complete.".to_string(),
        });
    }

    /// Test helper: inject an agent with a completion report and project_root
    /// for testing pipeline advance logic without spawning real agents.
    #[cfg(test)]
    pub fn inject_test_agent_with_completion(
        &self,
        story_id: &str,
        agent_name: &str,
        status: AgentStatus,
        project_root: PathBuf,
        completion: CompletionReport,
    ) -> broadcast::Sender<AgentEvent> {
        let (tx, _) = broadcast::channel::<AgentEvent>(64);
        let key = composite_key(story_id, agent_name);
        let mut agents = self.agents.lock().unwrap();
        agents.insert(
            key,
            StoryAgent {
                agent_name: agent_name.to_string(),
                status,
                worktree_info: None,
                session_id: None,
                tx: tx.clone(),
                task_handle: None,
                event_log: Arc::new(Mutex::new(Vec::new())),
                completion: Some(completion),
                project_root: Some(project_root),
                log_session_id: None,
                merge_failure_reported: false,
            },
        );
        tx
    }

    /// Inject a Running agent with a pre-built (possibly finished) task handle.
    /// Used by watchdog tests to simulate an orphaned agent.
- #[cfg(test)] - pub fn inject_test_agent_with_handle( - &self, - story_id: &str, - agent_name: &str, - status: AgentStatus, - task_handle: tokio::task::JoinHandle<()>, - ) -> broadcast::Sender { - let (tx, _) = broadcast::channel::(64); - let key = composite_key(story_id, agent_name); - let mut agents = self.agents.lock().unwrap(); - agents.insert( - key, - StoryAgent { - agent_name: agent_name.to_string(), - status, - worktree_info: None, - session_id: None, - tx: tx.clone(), - task_handle: Some(task_handle), - event_log: Arc::new(Mutex::new(Vec::new())), - completion: None, - project_root: None, - log_session_id: None, - merge_failure_reported: false, - }, - ); - tx - } - - /// Test helper: inject a child killer into the registry. - #[cfg(test)] - pub fn inject_child_killer(&self, key: &str, killer: Box) { - let mut killers = self.child_killers.lock().unwrap(); - killers.insert(key.to_string(), killer); - } - - /// Test helper: return the number of registered child killers. - #[cfg(test)] - pub fn child_killer_count(&self) -> usize { - self.child_killers.lock().unwrap().len() - } - - /// Run a single watchdog pass synchronously (test helper). - #[cfg(test)] - pub fn run_watchdog_once(&self) { - check_orphaned_agents(&self.agents); - } - - /// Spawn a background watchdog task that periodically checks for Running agents - /// whose underlying task has already finished (orphaned entries). Any such agent - /// is marked Failed and an Error event is emitted so that `wait_for_agent` unblocks. - /// - /// The watchdog runs every 30 seconds. It is a safety net for edge cases where the - /// PTY read loop exits without updating the agent status (e.g. a panic in the - /// spawn_blocking task, or an external SIGKILL that closes the PTY fd immediately). - /// - /// When orphaned agents are detected and a `project_root` is provided, auto-assign - /// is triggered so that free agents can pick up unassigned work. 
- pub fn spawn_watchdog(pool: Arc, project_root: Option) { - tokio::spawn(async move { - let mut interval = tokio::time::interval(std::time::Duration::from_secs(30)); - loop { - interval.tick().await; - let found = check_orphaned_agents(&pool.agents); - if found > 0 - && let Some(ref root) = project_root - { - slog!("[watchdog] {found} orphaned agent(s) detected; triggering auto-assign."); - pool.auto_assign_available_work(root).await; - } - } - }); - } - - /// Remove all agent entries for a given story_id from the pool. - /// - /// Called when a story is archived so that stale entries don't accumulate. - /// Returns the number of entries removed. - pub fn remove_agents_for_story(&self, story_id: &str) -> usize { - let mut agents = match self.agents.lock() { - Ok(a) => a, - Err(e) => { - slog_error!("[agents] Failed to lock pool for cleanup of '{story_id}': {e}"); - return 0; - } - }; - let prefix = format!("{story_id}:"); - let keys_to_remove: Vec = agents - .keys() - .filter(|k| k.starts_with(&prefix)) - .cloned() - .collect(); - let count = keys_to_remove.len(); - for key in &keys_to_remove { - agents.remove(key); - } - if count > 0 { - slog!("[agents] Removed {count} agent entries for archived story '{story_id}'"); - } - count - } -} - -/// Return the active pipeline stage directory name for `story_id`, or `None` if the -/// story is not in any active stage (`2_current/`, `3_qa/`, `4_merge/`). -fn find_active_story_stage(project_root: &Path, story_id: &str) -> Option<&'static str> { - const STAGES: [&str; 3] = ["2_current", "3_qa", "4_merge"]; - for stage in &STAGES { - let path = project_root - .join(".story_kit") - .join("work") - .join(stage) - .join(format!("{story_id}.md")); - if path.exists() { - return Some(stage); - } - } - None -} - -/// Scan a work pipeline stage directory and return story IDs, sorted alphabetically. -/// Returns an empty `Vec` if the directory does not exist. 
-/// Read the optional `agent:` field from the front matter of a story file. -/// -/// Returns `Some(agent_name)` if the front matter specifies an agent, or `None` -/// if the field is absent or the file cannot be read / parsed. -fn read_story_front_matter_agent( - project_root: &Path, - stage_dir: &str, - story_id: &str, -) -> Option { - use crate::io::story_metadata::parse_front_matter; - let path = project_root - .join(".story_kit") - .join("work") - .join(stage_dir) - .join(format!("{story_id}.md")); - let contents = std::fs::read_to_string(path).ok()?; - parse_front_matter(&contents).ok()?.agent -} - -/// Return `true` if the story file in the given stage has `review_hold: true` in its front matter. -fn has_review_hold(project_root: &Path, stage_dir: &str, story_id: &str) -> bool { - use crate::io::story_metadata::parse_front_matter; - let path = project_root - .join(".story_kit") - .join("work") - .join(stage_dir) - .join(format!("{story_id}.md")); - let contents = match std::fs::read_to_string(path) { - Ok(c) => c, - Err(_) => return false, - }; - parse_front_matter(&contents) - .ok() - .and_then(|m| m.review_hold) - .unwrap_or(false) -} - -/// Increment retry_count and block the story if it exceeds `max_retries`. -/// -/// Returns `true` if the story is now blocked (caller should NOT restart the agent). -/// Returns `false` if the story may be retried. -/// When `max_retries` is 0, retry limits are disabled. -fn should_block_story(story_path: &Path, max_retries: u32, story_id: &str, stage_label: &str) -> bool { - use crate::io::story_metadata::{increment_retry_count, write_blocked}; - - if max_retries == 0 { - // Retry limits disabled. - return false; - } - - match increment_retry_count(story_path) { - Ok(new_count) => { - if new_count >= max_retries { - slog_warn!( - "[pipeline] Story '{story_id}' reached retry limit ({new_count}/{max_retries}) \ - at {stage_label} stage. Marking as blocked." 
- ); - if let Err(e) = write_blocked(story_path) { - slog_error!("[pipeline] Failed to write blocked flag for '{story_id}': {e}"); - } - true - } else { - slog!( - "[pipeline] Story '{story_id}' retry {new_count}/{max_retries} at {stage_label} stage." - ); - false - } - } - Err(e) => { - slog_error!("[pipeline] Failed to increment retry_count for '{story_id}': {e}"); - false // Don't block on error — allow retry. - } - } -} - -/// Return `true` if the story file has `blocked: true` in its front matter. -fn is_story_blocked(project_root: &Path, stage_dir: &str, story_id: &str) -> bool { - use crate::io::story_metadata::parse_front_matter; - let path = project_root - .join(".story_kit") - .join("work") - .join(stage_dir) - .join(format!("{story_id}.md")); - let contents = match std::fs::read_to_string(path) { - Ok(c) => c, - Err(_) => return false, - }; - parse_front_matter(&contents) - .ok() - .and_then(|m| m.blocked) - .unwrap_or(false) -} - -/// Return `true` if the story file has a `merge_failure` field in its front matter. -fn has_merge_failure(project_root: &Path, stage_dir: &str, story_id: &str) -> bool { - use crate::io::story_metadata::parse_front_matter; - let path = project_root - .join(".story_kit") - .join("work") - .join(stage_dir) - .join(format!("{story_id}.md")); - let contents = match std::fs::read_to_string(path) { - Ok(c) => c, - Err(_) => return false, - }; - parse_front_matter(&contents) - .ok() - .and_then(|m| m.merge_failure) - .is_some() -} - -/// Return `true` if `agent_name` has no active (pending/running) entry in the pool. 
-fn is_agent_free(agents: &HashMap, agent_name: &str) -> bool { - !agents.values().any(|a| { - a.agent_name == agent_name - && matches!(a.status, AgentStatus::Running | AgentStatus::Pending) - }) -} - -fn scan_stage_items(project_root: &Path, stage_dir: &str) -> Vec { - let dir = project_root.join(".story_kit").join("work").join(stage_dir); - if !dir.is_dir() { - return Vec::new(); - } - let mut items = Vec::new(); - if let Ok(entries) = std::fs::read_dir(&dir) { - for entry in entries.flatten() { - let path = entry.path(); - if path.extension().and_then(|e| e.to_str()) == Some("md") - && let Some(stem) = path.file_stem().and_then(|s| s.to_str()) - { - items.push(stem.to_string()); - } - } - } - items.sort(); - items -} - -/// Return `true` if `story_id` has any active (pending/running) agent matching `stage`. -/// -/// Uses the explicit `stage` config field when the agent is found in `config`; -/// falls back to the legacy name-based heuristic for unlisted agents. -fn is_story_assigned_for_stage( - config: &ProjectConfig, - agents: &HashMap, - story_id: &str, - stage: &PipelineStage, -) -> bool { - agents.iter().any(|(key, agent)| { - // Composite key format: "{story_id}:{agent_name}" - let key_story_id = key.rsplit_once(':').map(|(sid, _)| sid).unwrap_or(key); - let agent_stage = config - .find_agent(&agent.agent_name) - .map(agent_config_stage) - .unwrap_or_else(|| pipeline_stage(&agent.agent_name)); - key_story_id == story_id - && agent_stage == *stage - && matches!(agent.status, AgentStatus::Running | AgentStatus::Pending) - }) -} - -/// Count active (pending/running) agents for a given pipeline stage. 
-fn count_active_agents_for_stage( - config: &ProjectConfig, - agents: &HashMap, - stage: &PipelineStage, -) -> usize { - agents - .values() - .filter(|a| { - matches!(a.status, AgentStatus::Running | AgentStatus::Pending) - && config - .find_agent(&a.agent_name) - .map(|cfg| agent_config_stage(cfg) == *stage) - .unwrap_or_else(|| pipeline_stage(&a.agent_name) == *stage) - }) - .count() -} - -/// Find the first configured agent for `stage` that has no active (pending/running) assignment. -/// Returns `None` if all agents for that stage are busy, none are configured, -/// or the `max_coders` limit has been reached (for the Coder stage). -/// -/// For the Coder stage, when `default_coder_model` is set, only considers agents whose -/// model matches the default. This ensures opus-class agents are reserved for explicit -/// front-matter requests. -fn find_free_agent_for_stage<'a>( - config: &'a ProjectConfig, - agents: &HashMap, - stage: &PipelineStage, -) -> Option<&'a str> { - // Enforce max_coders limit for the Coder stage. - if *stage == PipelineStage::Coder - && let Some(max) = config.max_coders - { - let active = count_active_agents_for_stage(config, agents, stage); - if active >= max { - return None; - } - } - - for agent_config in &config.agent { - if agent_config_stage(agent_config) != *stage { - continue; - } - // When default_coder_model is set, only auto-assign coder agents whose - // model matches. This keeps opus agents reserved for explicit requests. 
- if *stage == PipelineStage::Coder - && let Some(ref default_model) = config.default_coder_model - { - let agent_model = agent_config.model.as_deref().unwrap_or(""); - if agent_model != default_model { - continue; - } - } - let is_busy = agents.values().any(|a| { - a.agent_name == agent_config.name - && matches!(a.status, AgentStatus::Running | AgentStatus::Pending) - }); - if !is_busy { - return Some(&agent_config.name); - } - } - None -} - -/// Scan the agent pool for Running entries whose backing tokio task has already -/// finished and mark them as Failed. -/// -/// This handles the case where the PTY read loop or the spawned task exits -/// without updating the agent status — for example when the process is killed -/// externally and the PTY master fd returns EOF before our inactivity timeout -/// fires, but some other edge case prevents the normal cleanup path from running. -fn check_orphaned_agents(agents: &Mutex>) -> usize { - let mut lock = match agents.lock() { - Ok(l) => l, - Err(_) => return 0, - }; - - // Collect orphaned entries: Running or Pending agents whose task handle is finished. - // Pending agents can be orphaned if worktree creation panics before setting status. - let orphaned: Vec<(String, String, broadcast::Sender, AgentStatus)> = lock - .iter() - .filter_map(|(key, agent)| { - if matches!(agent.status, AgentStatus::Running | AgentStatus::Pending) - && let Some(handle) = &agent.task_handle - && handle.is_finished() - { - let story_id = key - .rsplit_once(':') - .map(|(s, _)| s.to_string()) - .unwrap_or_else(|| key.clone()); - return Some(( - key.clone(), - story_id, - agent.tx.clone(), - agent.status.clone(), - )); - } - None - }) - .collect(); - - let count = orphaned.len(); - for (key, story_id, tx, prev_status) in orphaned { - if let Some(agent) = lock.get_mut(&key) { - agent.status = AgentStatus::Failed; - slog!( - "[watchdog] Orphaned agent '{key}': task finished but status was {prev_status}. \ - Marking Failed." 
- ); - let _ = tx.send(AgentEvent::Error { - story_id, - agent_name: agent.agent_name.clone(), - message: "Agent process terminated unexpectedly (watchdog detected orphan)" - .to_string(), - }); - } - } - count -} - -/// Server-owned completion: runs acceptance gates when an agent process exits -/// normally, and advances the pipeline based on results. -/// -/// This is a **free function** (not a method on `AgentPool`) to break the -/// opaque type cycle that would otherwise arise: `start_agent` → spawned task -/// → server-owned completion → pipeline advance → `start_agent`. -/// -/// If the agent already has a completion report (e.g. from a legacy -/// `report_completion` call), this is a no-op to avoid double-running gates. -async fn run_server_owned_completion( - agents: &Arc>>, - port: u16, - story_id: &str, - agent_name: &str, - session_id: Option, - watcher_tx: broadcast::Sender, -) { - let key = composite_key(story_id, agent_name); - - // Guard: skip if completion was already recorded (legacy path). - { - let lock = match agents.lock() { - Ok(a) => a, - Err(_) => return, - }; - match lock.get(&key) { - Some(agent) if agent.completion.is_some() => { - slog!( - "[agents] Completion already recorded for '{story_id}:{agent_name}'; \ - skipping server-owned gates." - ); - return; - } - Some(_) => {} - None => return, - } - } - - // Get worktree path for running gates. - let worktree_path = { - let lock = match agents.lock() { - Ok(a) => a, - Err(_) => return, - }; - lock.get(&key) - .and_then(|a| a.worktree_info.as_ref().map(|wt| wt.path.clone())) - }; - - // Run acceptance gates. - let (gates_passed, gate_output) = if let Some(wt_path) = worktree_path { - let path = wt_path; - match tokio::task::spawn_blocking(move || { - super::gates::check_uncommitted_changes(&path)?; - // AC5: Fail early if the coder finished with no commits on the feature branch. - // This prevents empty-diff stories from advancing through QA to merge. 
- if !super::gates::worktree_has_committed_work(&path) { - return Ok(( - false, - "Agent exited with no commits on the feature branch. \ - The agent did not produce any code changes." - .to_string(), - )); - } - super::gates::run_acceptance_gates(&path) - }) - .await - { - Ok(Ok(result)) => result, - Ok(Err(e)) => (false, e), - Err(e) => (false, format!("Gate check task panicked: {e}")), - } - } else { - ( - false, - "No worktree path available to run acceptance gates".to_string(), - ) - }; - - slog!( - "[agents] Server-owned completion for '{story_id}:{agent_name}': gates_passed={gates_passed}" - ); - - let report = CompletionReport { - summary: "Agent process exited normally".to_string(), - gates_passed, - gate_output, - }; - - // Store completion report, extract data for pipeline advance, then - // remove the entry so completed agents never appear in list_agents. - let (tx, project_root_for_advance, wt_path_for_advance, merge_failure_reported_for_advance) = { - let mut lock = match agents.lock() { - Ok(a) => a, - Err(_) => return, - }; - let agent = match lock.get_mut(&key) { - Some(a) => a, - None => return, - }; - agent.completion = Some(report.clone()); - agent.session_id = session_id.clone(); - let tx = agent.tx.clone(); - let pr = agent.project_root.clone(); - let wt = agent.worktree_info.as_ref().map(|w| w.path.clone()); - let mfr = agent.merge_failure_reported; - lock.remove(&key); - (tx, pr, wt, mfr) - }; - - // Emit Done so wait_for_agent unblocks. - let _ = tx.send(AgentEvent::Done { - story_id: story_id.to_string(), - agent_name: agent_name.to_string(), - session_id, - }); - - // Notify WebSocket clients that the agent is gone. - AgentPool::notify_agent_state_changed(&watcher_tx); - - // Advance the pipeline state machine in a background task. 
- spawn_pipeline_advance( - Arc::clone(agents), - port, - story_id, - agent_name, - report, - project_root_for_advance, - wt_path_for_advance, - watcher_tx, - merge_failure_reported_for_advance, - ); -} - -/// Spawn pipeline advancement as a background task. -/// -/// This is a **non-async** function so it does not participate in the opaque -/// type cycle between `start_agent` and `run_server_owned_completion`. -#[allow(clippy::too_many_arguments)] -fn spawn_pipeline_advance( - agents: Arc>>, - port: u16, - story_id: &str, - agent_name: &str, - completion: CompletionReport, - project_root: Option, - worktree_path: Option, - watcher_tx: broadcast::Sender, - merge_failure_reported: bool, -) { - let sid = story_id.to_string(); - let aname = agent_name.to_string(); - tokio::spawn(async move { - let pool = AgentPool { - agents, - port, - child_killers: Arc::new(Mutex::new(HashMap::new())), - watcher_tx, - merge_jobs: Arc::new(Mutex::new(HashMap::new())), - }; - pool.run_pipeline_advance( - &sid, - &aname, - completion, - project_root, - worktree_path, - merge_failure_reported, - ) - .await; - }); -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::agents::merge::{MergeJob, MergeJobStatus}; - use crate::agents::{ - AgentEvent, AgentStatus, CompletionReport, PipelineStage, ReconciliationEvent, - lifecycle::move_story_to_archived, - }; - use crate::config::ProjectConfig; - use crate::io::watcher::WatcherEvent; - use portable_pty::{CommandBuilder, PtySize, native_pty_system}; - use std::collections::HashMap; - use std::path::PathBuf; - use std::process::Command; - use tokio::sync::broadcast; - - fn init_git_repo(repo: &std::path::Path) { - Command::new("git") - .args(["init"]) - .current_dir(repo) - .output() - .unwrap(); - Command::new("git") - .args(["config", "user.email", "test@test.com"]) - .current_dir(repo) - .output() - .unwrap(); - Command::new("git") - .args(["config", "user.name", "Test"]) - .current_dir(repo) - .output() - .unwrap(); - 
Command::new("git") - .args(["commit", "--allow-empty", "-m", "init"]) - .current_dir(repo) - .output() - .unwrap(); - } - - fn make_config(toml_str: &str) -> ProjectConfig { - ProjectConfig::parse(toml_str).unwrap() - } - - #[tokio::test] - async fn wait_for_agent_returns_immediately_if_completed() { - let pool = AgentPool::new_test(3001); - pool.inject_test_agent("s1", "bot", AgentStatus::Completed); - - let info = pool.wait_for_agent("s1", "bot", 1000).await.unwrap(); - assert_eq!(info.status, AgentStatus::Completed); - assert_eq!(info.story_id, "s1"); - assert_eq!(info.agent_name, "bot"); - } - - #[tokio::test] - async fn wait_for_agent_returns_immediately_if_failed() { - let pool = AgentPool::new_test(3001); - pool.inject_test_agent("s2", "bot", AgentStatus::Failed); - - let info = pool.wait_for_agent("s2", "bot", 1000).await.unwrap(); - assert_eq!(info.status, AgentStatus::Failed); - } - - #[tokio::test] - async fn wait_for_agent_completes_on_done_event() { - let pool = AgentPool::new_test(3001); - let tx = pool.inject_test_agent("s3", "bot", AgentStatus::Running); - - // Send Done event after a short delay - let tx_clone = tx.clone(); - tokio::spawn(async move { - tokio::time::sleep(std::time::Duration::from_millis(50)).await; - // Mark status via event; real code also updates the map, but for - // this unit test the map entry stays Running — we verify the - // wait loop reacts to the event. - let _ = tx_clone.send(AgentEvent::Done { - story_id: "s3".to_string(), - agent_name: "bot".to_string(), - session_id: Some("sess-abc".to_string()), - }); - }); - - let info = pool.wait_for_agent("s3", "bot", 2000).await.unwrap(); - // Status comes from the map entry (Running in this unit test) - // — the important thing is that wait_for_agent returned without timing out. 
- assert_eq!(info.story_id, "s3"); - } - - #[tokio::test] - async fn wait_for_agent_times_out() { - let pool = AgentPool::new_test(3001); - pool.inject_test_agent("s4", "bot", AgentStatus::Running); - - let result = pool.wait_for_agent("s4", "bot", 50).await; - assert!(result.is_err()); - let msg = result.unwrap_err(); - assert!(msg.contains("Timed out"), "unexpected message: {msg}"); - } - - #[tokio::test] - async fn wait_for_agent_errors_for_nonexistent() { - let pool = AgentPool::new_test(3001); - let result = pool.wait_for_agent("no_story", "no_bot", 100).await; - assert!(result.is_err()); - } - - #[tokio::test] - async fn wait_for_agent_completes_on_stopped_status_event() { - let pool = AgentPool::new_test(3001); - let tx = pool.inject_test_agent("s5", "bot", AgentStatus::Running); - - let tx_clone = tx.clone(); - tokio::spawn(async move { - tokio::time::sleep(std::time::Duration::from_millis(30)).await; - let _ = tx_clone.send(AgentEvent::Status { - story_id: "s5".to_string(), - agent_name: "bot".to_string(), - status: "stopped".to_string(), - }); - }); - - let info = pool.wait_for_agent("s5", "bot", 2000).await.unwrap(); - assert_eq!(info.story_id, "s5"); - } - - // ── report_completion tests ──────────────────────────────────── - - #[tokio::test] - async fn report_completion_rejects_nonexistent_agent() { - let pool = AgentPool::new_test(3001); - let result = pool.report_completion("no_story", "no_bot", "done").await; - assert!(result.is_err()); - let msg = result.unwrap_err(); - assert!(msg.contains("No agent"), "unexpected: {msg}"); - } - - #[tokio::test] - async fn report_completion_rejects_non_running_agent() { - let pool = AgentPool::new_test(3001); - pool.inject_test_agent("s6", "bot", AgentStatus::Completed); - - let result = pool.report_completion("s6", "bot", "done").await; - assert!(result.is_err()); - let msg = result.unwrap_err(); - assert!( - msg.contains("not running"), - "expected 'not running' in: {msg}" - ); - } - - #[tokio::test] - async fn 
report_completion_rejects_dirty_worktree() { - use std::fs; - use tempfile::tempdir; - - let tmp = tempdir().unwrap(); - let repo = tmp.path(); - - // Init a real git repo and make an initial commit - Command::new("git") - .args(["init"]) - .current_dir(repo) - .output() - .unwrap(); - Command::new("git") - .args(["commit", "--allow-empty", "-m", "init"]) - .current_dir(repo) - .output() - .unwrap(); - - // Write an uncommitted file - fs::write(repo.join("dirty.txt"), "not committed").unwrap(); - - let pool = AgentPool::new_test(3001); - pool.inject_test_agent_with_path("s7", "bot", AgentStatus::Running, repo.to_path_buf()); - - let result = pool.report_completion("s7", "bot", "done").await; - assert!(result.is_err()); - let msg = result.unwrap_err(); - assert!( - msg.contains("uncommitted"), - "expected 'uncommitted' in: {msg}" - ); - } - - // ── server-owned completion tests ─────────────────────────────────────────── - - #[tokio::test] - async fn server_owned_completion_skips_when_already_completed() { - let pool = AgentPool::new_test(3001); - let report = CompletionReport { - summary: "Already done".to_string(), - gates_passed: true, - gate_output: String::new(), - }; - pool.inject_test_agent_with_completion( - "s10", - "coder-1", - AgentStatus::Completed, - PathBuf::from("/tmp/nonexistent"), - report, - ); - - // Subscribe before calling so we can check if Done event was emitted. - let mut rx = pool.subscribe("s10", "coder-1").unwrap(); - - run_server_owned_completion( - &pool.agents, - pool.port, - "s10", - "coder-1", - Some("sess-1".to_string()), - pool.watcher_tx.clone(), - ) - .await; - - // Status should remain Completed (unchanged) — no gate re-run. - let agents = pool.agents.lock().unwrap(); - let key = composite_key("s10", "coder-1"); - let agent = agents.get(&key).unwrap(); - assert_eq!(agent.status, AgentStatus::Completed); - // Summary should still be the original, not overwritten. 
- assert_eq!(agent.completion.as_ref().unwrap().summary, "Already done"); - drop(agents); - - // No Done event should have been emitted. - assert!( - rx.try_recv().is_err(), - "should not emit Done when completion already exists" - ); - } - - #[tokio::test] - async fn server_owned_completion_runs_gates_on_clean_worktree() { - use tempfile::tempdir; - - let tmp = tempdir().unwrap(); - let repo = tmp.path(); - init_git_repo(repo); - - let pool = AgentPool::new_test(3001); - pool.inject_test_agent_with_path( - "s11", - "coder-1", - AgentStatus::Running, - repo.to_path_buf(), - ); - - let mut rx = pool.subscribe("s11", "coder-1").unwrap(); - - run_server_owned_completion( - &pool.agents, - pool.port, - "s11", - "coder-1", - Some("sess-2".to_string()), - pool.watcher_tx.clone(), - ) - .await; - - // Agent entry should be removed from the map after completion. - let agents = pool.agents.lock().unwrap(); - let key = composite_key("s11", "coder-1"); - assert!( - agents.get(&key).is_none(), - "agent should be removed from map after completion" - ); - drop(agents); - - // A Done event should have been emitted with the session_id. - let event = rx.try_recv().expect("should emit Done event"); - match &event { - AgentEvent::Done { session_id, .. } => { - assert_eq!(*session_id, Some("sess-2".to_string())); - } - other => panic!("expected Done event, got: {other:?}"), - } - } - - #[tokio::test] - async fn server_owned_completion_fails_on_dirty_worktree() { - use std::fs; - use tempfile::tempdir; - - let tmp = tempdir().unwrap(); - let repo = tmp.path(); - init_git_repo(repo); - // Create an uncommitted file. 
- fs::write(repo.join("dirty.txt"), "not committed").unwrap(); - - let pool = AgentPool::new_test(3001); - pool.inject_test_agent_with_path( - "s12", - "coder-1", - AgentStatus::Running, - repo.to_path_buf(), - ); - - let mut rx = pool.subscribe("s12", "coder-1").unwrap(); - - run_server_owned_completion( - &pool.agents, - pool.port, - "s12", - "coder-1", - None, - pool.watcher_tx.clone(), - ) - .await; - - // Agent entry should be removed from the map after completion (even on failure). - let agents = pool.agents.lock().unwrap(); - let key = composite_key("s12", "coder-1"); - assert!( - agents.get(&key).is_none(), - "agent should be removed from map after failed completion" - ); - drop(agents); - - // A Done event should have been emitted. - let event = rx.try_recv().expect("should emit Done event"); - assert!( - matches!(event, AgentEvent::Done { .. }), - "expected Done event, got: {event:?}" - ); - } - - #[tokio::test] - async fn server_owned_completion_nonexistent_agent_is_noop() { - let pool = AgentPool::new_test(3001); - // Should not panic or error — just silently return. 
- run_server_owned_completion( - &pool.agents, - pool.port, - "nonexistent", - "bot", - None, - pool.watcher_tx.clone(), - ) - .await; - } - - // ── pipeline advance tests ──────────────────────────────────────────────── - - #[tokio::test] - async fn pipeline_advance_coder_gates_pass_server_qa_moves_to_merge() { - use std::fs; - let tmp = tempfile::tempdir().unwrap(); - let root = tmp.path(); - - // Set up story in 2_current/ (no qa frontmatter → uses project default "server") - let current = root.join(".story_kit/work/2_current"); - fs::create_dir_all(¤t).unwrap(); - fs::write(current.join("50_story_test.md"), "test").unwrap(); - - let pool = AgentPool::new_test(3001); - pool.run_pipeline_advance( - "50_story_test", - "coder-1", - CompletionReport { - summary: "done".to_string(), - gates_passed: true, - gate_output: String::new(), - }, - Some(root.to_path_buf()), - None, - false, - ) - .await; - - // With default qa: server, story skips QA and goes straight to 4_merge/ - assert!( - root.join(".story_kit/work/4_merge/50_story_test.md") - .exists(), - "story should be in 4_merge/" - ); - assert!( - !current.join("50_story_test.md").exists(), - "story should not still be in 2_current/" - ); - } - - #[tokio::test] - async fn pipeline_advance_coder_gates_pass_agent_qa_moves_to_qa() { - use std::fs; - let tmp = tempfile::tempdir().unwrap(); - let root = tmp.path(); - - // Set up story in 2_current/ with qa: agent frontmatter - let current = root.join(".story_kit/work/2_current"); - fs::create_dir_all(¤t).unwrap(); - fs::write( - current.join("50_story_test.md"), - "---\nname: Test\nqa: agent\n---\ntest", - ) - .unwrap(); - - let pool = AgentPool::new_test(3001); - pool.run_pipeline_advance( - "50_story_test", - "coder-1", - CompletionReport { - summary: "done".to_string(), - gates_passed: true, - gate_output: String::new(), - }, - Some(root.to_path_buf()), - None, - false, - ) - .await; - - // With qa: agent, story should move to 3_qa/ - assert!( - 
root.join(".story_kit/work/3_qa/50_story_test.md").exists(), - "story should be in 3_qa/" - ); - assert!( - !current.join("50_story_test.md").exists(), - "story should not still be in 2_current/" - ); - } - - #[tokio::test] - async fn pipeline_advance_qa_gates_pass_moves_story_to_merge() { - use std::fs; - let tmp = tempfile::tempdir().unwrap(); - let root = tmp.path(); - - // Set up story in 3_qa/ - let qa_dir = root.join(".story_kit/work/3_qa"); - fs::create_dir_all(&qa_dir).unwrap(); - // qa: server so the story skips human review and goes straight to merge. - fs::write( - qa_dir.join("51_story_test.md"), - "---\nname: Test\nqa: server\n---\ntest", - ) - .unwrap(); - - let pool = AgentPool::new_test(3001); - pool.run_pipeline_advance( - "51_story_test", - "qa", - CompletionReport { - summary: "QA done".to_string(), - gates_passed: true, - gate_output: String::new(), - }, - Some(root.to_path_buf()), - None, - false, - ) - .await; - - // Story should have moved to 4_merge/ - assert!( - root.join(".story_kit/work/4_merge/51_story_test.md") - .exists(), - "story should be in 4_merge/" - ); - assert!( - !qa_dir.join("51_story_test.md").exists(), - "story should not still be in 3_qa/" - ); - } - - #[tokio::test] - async fn pipeline_advance_supervisor_does_not_advance() { - use std::fs; - let tmp = tempfile::tempdir().unwrap(); - let root = tmp.path(); - - let current = root.join(".story_kit/work/2_current"); - fs::create_dir_all(¤t).unwrap(); - fs::write(current.join("52_story_test.md"), "test").unwrap(); - - let pool = AgentPool::new_test(3001); - pool.run_pipeline_advance( - "52_story_test", - "supervisor", - CompletionReport { - summary: "supervised".to_string(), - gates_passed: true, - gate_output: String::new(), - }, - Some(root.to_path_buf()), - None, - false, - ) - .await; - - // Story should NOT have moved (supervisors don't advance pipeline) - assert!( - current.join("52_story_test.md").exists(), - "story should still be in 2_current/ for supervisor" - ); - } 
- - #[tokio::test] - async fn pipeline_advance_sends_agent_state_changed_to_watcher_tx() { - use std::fs; - - let tmp = tempfile::tempdir().unwrap(); - let root = tmp.path(); - - // Set up story in 2_current/ - let current = root.join(".story_kit/work/2_current"); - fs::create_dir_all(¤t).unwrap(); - fs::write(current.join("173_story_test.md"), "test").unwrap(); - // Ensure 3_qa/ exists for the move target - fs::create_dir_all(root.join(".story_kit/work/3_qa")).unwrap(); - // Ensure 1_backlog/ exists (start_agent calls move_story_to_current) - fs::create_dir_all(root.join(".story_kit/work/1_backlog")).unwrap(); - - // Write a project.toml with a qa agent so start_agent can resolve it. - fs::create_dir_all(root.join(".story_kit")).unwrap(); - fs::write( - root.join(".story_kit/project.toml"), - r#" -default_qa = "agent" - -[[agent]] -name = "coder-1" -role = "Coder" -command = "echo" -args = ["noop"] -prompt = "test" -stage = "coder" - -[[agent]] -name = "qa" -role = "QA" -command = "echo" -args = ["noop"] -prompt = "test" -stage = "qa" -"#, - ) - .unwrap(); - - let pool = AgentPool::new_test(3001); - // Subscribe to the watcher channel BEFORE the pipeline advance. - let mut rx = pool.watcher_tx.subscribe(); - - // Call pipeline advance directly. This will: - // 1. Move the story to 3_qa/ - // 2. Start the QA agent (which calls notify_agent_state_changed) - // Note: the actual agent process will fail (no real worktree), but the - // agent insertion and notification happen before the background spawn. - pool.run_pipeline_advance( - "173_story_test", - "coder-1", - CompletionReport { - summary: "done".to_string(), - gates_passed: true, - gate_output: String::new(), - }, - Some(root.to_path_buf()), - None, - false, - ) - .await; - - // The pipeline advance should have sent AgentStateChanged events via - // the pool's watcher_tx (not a dummy channel). Collect all events. 
- let mut got_agent_state_changed = false; - while let Ok(evt) = rx.try_recv() { - if matches!(evt, WatcherEvent::AgentStateChanged) { - got_agent_state_changed = true; - break; - } - } - - assert!( - got_agent_state_changed, - "pipeline advance should send AgentStateChanged through the real watcher_tx \ - (bug 173: lozenges must update when agents are assigned during pipeline advance)" - ); - } - - // ── auto-assign helper tests ─────────────────────────────────── - - #[test] - fn scan_stage_items_returns_empty_for_missing_dir() { - let tmp = tempfile::tempdir().unwrap(); - let items = scan_stage_items(tmp.path(), "2_current"); - assert!(items.is_empty()); - } - - #[test] - fn scan_stage_items_returns_sorted_story_ids() { - use std::fs; - let tmp = tempfile::tempdir().unwrap(); - let stage_dir = tmp.path().join(".story_kit").join("work").join("2_current"); - fs::create_dir_all(&stage_dir).unwrap(); - fs::write(stage_dir.join("42_story_foo.md"), "---\nname: foo\n---").unwrap(); - fs::write(stage_dir.join("10_story_bar.md"), "---\nname: bar\n---").unwrap(); - fs::write(stage_dir.join("5_story_baz.md"), "---\nname: baz\n---").unwrap(); - // non-md file should be ignored - fs::write(stage_dir.join("README.txt"), "ignore me").unwrap(); - - let items = scan_stage_items(tmp.path(), "2_current"); - assert_eq!(items, vec!["10_story_bar", "42_story_foo", "5_story_baz"]); - } - - #[test] - fn is_story_assigned_returns_true_for_running_coder() { - let config = ProjectConfig::default(); - let pool = AgentPool::new_test(3001); - pool.inject_test_agent("42_story_foo", "coder-1", AgentStatus::Running); - - let agents = pool.agents.lock().unwrap(); - assert!(is_story_assigned_for_stage( - &config, - &agents, - "42_story_foo", - &PipelineStage::Coder - )); - // Same story but wrong stage — should be false - assert!(!is_story_assigned_for_stage( - &config, - &agents, - "42_story_foo", - &PipelineStage::Qa - )); - // Different story — should be false - 
assert!(!is_story_assigned_for_stage( - &config, - &agents, - "99_story_other", - &PipelineStage::Coder - )); - } - - #[test] - fn is_story_assigned_returns_false_for_completed_agent() { - let config = ProjectConfig::default(); - let pool = AgentPool::new_test(3001); - pool.inject_test_agent("42_story_foo", "coder-1", AgentStatus::Completed); - - let agents = pool.agents.lock().unwrap(); - // Completed agents don't count as assigned - assert!(!is_story_assigned_for_stage( - &config, - &agents, - "42_story_foo", - &PipelineStage::Coder - )); - } - - #[test] - fn is_story_assigned_uses_config_stage_field_for_nonstandard_names() { - let config = ProjectConfig::parse( - r#" -[[agent]] -name = "qa-2" -stage = "qa" -"#, - ) - .unwrap(); - - let pool = AgentPool::new_test(3001); - pool.inject_test_agent("42_story_foo", "qa-2", AgentStatus::Running); - - let agents = pool.agents.lock().unwrap(); - // qa-2 with stage=qa should be recognised as a QA agent - assert!( - is_story_assigned_for_stage(&config, &agents, "42_story_foo", &PipelineStage::Qa), - "qa-2 should be detected as assigned to QA stage" - ); - // Should NOT appear as a coder - assert!( - !is_story_assigned_for_stage(&config, &agents, "42_story_foo", &PipelineStage::Coder), - "qa-2 should not be detected as a coder" - ); - } - - #[test] - fn find_free_agent_returns_none_when_all_busy() { - let config = ProjectConfig::parse( - r#" -[[agent]] -name = "coder-1" -[[agent]] -name = "coder-2" -"#, - ) - .unwrap(); - - let pool = AgentPool::new_test(3001); - pool.inject_test_agent("s1", "coder-1", AgentStatus::Running); - pool.inject_test_agent("s2", "coder-2", AgentStatus::Running); - - let agents = pool.agents.lock().unwrap(); - let free = find_free_agent_for_stage(&config, &agents, &PipelineStage::Coder); - assert!(free.is_none(), "no free coders should be available"); - } - - #[test] - fn find_free_agent_returns_first_free_coder() { - let config = ProjectConfig::parse( - r#" -[[agent]] -name = "coder-1" -[[agent]] 
-name = "coder-2" -[[agent]] -name = "coder-3" -"#, - ) - .unwrap(); - - let pool = AgentPool::new_test(3001); - // coder-1 is busy, coder-2 is free - pool.inject_test_agent("s1", "coder-1", AgentStatus::Running); - - let agents = pool.agents.lock().unwrap(); - let free = find_free_agent_for_stage(&config, &agents, &PipelineStage::Coder); - assert_eq!( - free, - Some("coder-2"), - "coder-2 should be the first free coder" - ); - } - - #[test] - fn find_free_agent_ignores_completed_agents() { - let config = ProjectConfig::parse( - r#" -[[agent]] -name = "coder-1" -"#, - ) - .unwrap(); - - let pool = AgentPool::new_test(3001); - // coder-1 completed its previous story — it's free for a new one - pool.inject_test_agent("s1", "coder-1", AgentStatus::Completed); - - let agents = pool.agents.lock().unwrap(); - let free = find_free_agent_for_stage(&config, &agents, &PipelineStage::Coder); - assert_eq!(free, Some("coder-1"), "completed coder-1 should be free"); - } - - #[test] - fn find_free_agent_returns_none_for_wrong_stage() { - let config = ProjectConfig::parse( - r#" -[[agent]] -name = "qa" -"#, - ) - .unwrap(); - - let agents: HashMap = HashMap::new(); - // Looking for a Coder but only QA is configured - let free = find_free_agent_for_stage(&config, &agents, &PipelineStage::Coder); - assert!(free.is_none()); - // Looking for QA should find it - let free_qa = find_free_agent_for_stage(&config, &agents, &PipelineStage::Qa); - assert_eq!(free_qa, Some("qa")); - } - - #[test] - fn find_free_agent_uses_config_stage_field_not_name() { - // Agents named "qa-2" and "coder-opus" don't match the legacy name heuristic - // but should be picked up via their explicit stage field. 
- let config = ProjectConfig::parse( - r#" -[[agent]] -name = "qa-2" -stage = "qa" - -[[agent]] -name = "coder-opus" -stage = "coder" -"#, - ) - .unwrap(); - - let agents: HashMap = HashMap::new(); - - // qa-2 should be found for PipelineStage::Qa via config stage field - let free_qa = find_free_agent_for_stage(&config, &agents, &PipelineStage::Qa); - assert_eq!(free_qa, Some("qa-2"), "qa-2 with stage=qa should be found"); - - // coder-opus should be found for PipelineStage::Coder via config stage field - let free_coder = find_free_agent_for_stage(&config, &agents, &PipelineStage::Coder); - assert_eq!( - free_coder, - Some("coder-opus"), - "coder-opus with stage=coder should be found" - ); - - // Neither should match the other stage - let free_merge = find_free_agent_for_stage(&config, &agents, &PipelineStage::Mergemaster); - assert!(free_merge.is_none()); - } - - // ── find_active_story_stage tests ───────────────────────────────────────── - - #[test] - fn find_active_story_stage_detects_current() { - use std::fs; - let tmp = tempfile::tempdir().unwrap(); - let root = tmp.path(); - let current = root.join(".story_kit/work/2_current"); - fs::create_dir_all(¤t).unwrap(); - fs::write(current.join("10_story_test.md"), "test").unwrap(); - - assert_eq!( - find_active_story_stage(root, "10_story_test"), - Some("2_current") - ); - } - - #[test] - fn find_active_story_stage_detects_qa() { - use std::fs; - let tmp = tempfile::tempdir().unwrap(); - let root = tmp.path(); - let qa = root.join(".story_kit/work/3_qa"); - fs::create_dir_all(&qa).unwrap(); - fs::write(qa.join("11_story_test.md"), "test").unwrap(); - - assert_eq!(find_active_story_stage(root, "11_story_test"), Some("3_qa")); - } - - #[test] - fn find_active_story_stage_detects_merge() { - use std::fs; - let tmp = tempfile::tempdir().unwrap(); - let root = tmp.path(); - let merge = root.join(".story_kit/work/4_merge"); - fs::create_dir_all(&merge).unwrap(); - fs::write(merge.join("12_story_test.md"), 
"test").unwrap(); - - assert_eq!( - find_active_story_stage(root, "12_story_test"), - Some("4_merge") - ); - } - - #[test] - fn find_active_story_stage_returns_none_for_unknown_story() { - let tmp = tempfile::tempdir().unwrap(); - assert_eq!(find_active_story_stage(tmp.path(), "99_nonexistent"), None); - } - - // ── check_orphaned_agents return value tests (bug 161) ────────────────── - - #[tokio::test] - async fn check_orphaned_agents_returns_count_of_orphaned_agents() { - let pool = AgentPool::new_test(3001); - - // Spawn two tasks that finish immediately. - let h1 = tokio::spawn(async {}); - let h2 = tokio::spawn(async {}); - tokio::time::sleep(std::time::Duration::from_millis(20)).await; - assert!(h1.is_finished()); - assert!(h2.is_finished()); - - pool.inject_test_agent_with_handle("story_a", "coder", AgentStatus::Running, h1); - pool.inject_test_agent_with_handle("story_b", "coder", AgentStatus::Running, h2); - - let found = check_orphaned_agents(&pool.agents); - assert_eq!(found, 2, "should detect both orphaned agents"); - } - - #[test] - fn check_orphaned_agents_returns_zero_when_no_orphans() { - let pool = AgentPool::new_test(3001); - // Inject agents in terminal states — not orphaned. 
- pool.inject_test_agent("story_a", "coder", AgentStatus::Completed); - pool.inject_test_agent("story_b", "qa", AgentStatus::Failed); - - let found = check_orphaned_agents(&pool.agents); - assert_eq!( - found, 0, - "no orphans should be detected for terminal agents" - ); - } - - #[tokio::test] - async fn watchdog_detects_orphaned_running_agent() { - let pool = AgentPool::new_test(3001); - - let handle = tokio::spawn(async {}); - tokio::time::sleep(std::time::Duration::from_millis(20)).await; - assert!( - handle.is_finished(), - "task should be finished before injection" - ); - - let tx = pool.inject_test_agent_with_handle( - "orphan_story", - "coder", - AgentStatus::Running, - handle, - ); - let mut rx = tx.subscribe(); - - pool.run_watchdog_once(); - - { - let agents = pool.agents.lock().unwrap(); - let key = composite_key("orphan_story", "coder"); - let agent = agents.get(&key).unwrap(); - assert_eq!( - agent.status, - AgentStatus::Failed, - "watchdog must mark an orphaned Running agent as Failed" - ); - } - - let event = rx.try_recv().expect("watchdog must emit an Error event"); - assert!( - matches!(event, AgentEvent::Error { .. }), - "expected AgentEvent::Error, got: {event:?}" - ); - } - - #[tokio::test] - async fn watchdog_orphan_detection_returns_nonzero_enabling_auto_assign() { - // This test verifies the contract that `check_orphaned_agents` returns - // a non-zero count when orphans exist, which the watchdog uses to - // decide whether to trigger auto-assign (bug 161). - let pool = AgentPool::new_test(3001); - - let handle = tokio::spawn(async {}); - tokio::time::sleep(std::time::Duration::from_millis(20)).await; - - pool.inject_test_agent_with_handle("orphan_story", "coder", AgentStatus::Running, handle); - - // Before watchdog: agent is Running. 
- { - let agents = pool.agents.lock().unwrap(); - let key = composite_key("orphan_story", "coder"); - assert_eq!(agents.get(&key).unwrap().status, AgentStatus::Running); - } - - // Run watchdog pass — should return 1 (orphan found). - let found = check_orphaned_agents(&pool.agents); - assert_eq!( - found, 1, - "watchdog must return 1 for a single orphaned agent" - ); - - // After watchdog: agent is Failed. - { - let agents = pool.agents.lock().unwrap(); - let key = composite_key("orphan_story", "coder"); - assert_eq!( - agents.get(&key).unwrap().status, - AgentStatus::Failed, - "orphaned agent must be marked Failed" - ); - } - } - - // ── remove_agents_for_story tests ──────────────────────────────────────── - - #[test] - fn remove_agents_for_story_removes_all_entries() { - let pool = AgentPool::new_test(3001); - pool.inject_test_agent("story_a", "coder-1", AgentStatus::Completed); - pool.inject_test_agent("story_a", "qa", AgentStatus::Failed); - pool.inject_test_agent("story_b", "coder-1", AgentStatus::Running); - - let removed = pool.remove_agents_for_story("story_a"); - assert_eq!(removed, 2, "should remove both agents for story_a"); - - let agents = pool.list_agents().unwrap(); - assert_eq!(agents.len(), 1, "only story_b agent should remain"); - assert_eq!(agents[0].story_id, "story_b"); - } - - #[test] - fn remove_agents_for_story_returns_zero_when_no_match() { - let pool = AgentPool::new_test(3001); - pool.inject_test_agent("story_a", "coder-1", AgentStatus::Running); - - let removed = pool.remove_agents_for_story("nonexistent"); - assert_eq!(removed, 0); - - let agents = pool.list_agents().unwrap(); - assert_eq!(agents.len(), 1, "existing agents should not be affected"); - } - - // ── archive + cleanup integration test ─────────────────────────────────── - - #[tokio::test] - async fn archiving_story_removes_agent_entries_from_pool() { - use std::fs; - - let tmp = tempfile::tempdir().unwrap(); - let root = tmp.path(); - - // Set up story in 2_current/ - let 
current = root.join(".story_kit/work/2_current"); - fs::create_dir_all(¤t).unwrap(); - fs::write(current.join("60_story_cleanup.md"), "test").unwrap(); - - let pool = AgentPool::new_test(3001); - pool.inject_test_agent("60_story_cleanup", "coder-1", AgentStatus::Completed); - pool.inject_test_agent("60_story_cleanup", "qa", AgentStatus::Completed); - pool.inject_test_agent("61_story_other", "coder-1", AgentStatus::Running); - - // Verify all 3 agents exist. - assert_eq!(pool.list_agents().unwrap().len(), 3); - - // Archive the story. - move_story_to_archived(root, "60_story_cleanup").unwrap(); - pool.remove_agents_for_story("60_story_cleanup"); - - // Agent entries for the archived story should be gone. - let remaining = pool.list_agents().unwrap(); - assert_eq!( - remaining.len(), - 1, - "only the other story's agent should remain" - ); - assert_eq!(remaining[0].story_id, "61_story_other"); - - // Story file should be in 5_done/ - assert!( - root.join(".story_kit/work/5_done/60_story_cleanup.md") - .exists() - ); - } - - // ── kill_all_children tests ──────────────────────────────────── - - /// Returns true if a process with the given PID is currently running. - fn process_is_running(pid: u32) -> bool { - std::process::Command::new("ps") - .arg("-p") - .arg(pid.to_string()) - .stdout(std::process::Stdio::null()) - .stderr(std::process::Stdio::null()) - .status() - .map(|s| s.success()) - .unwrap_or(false) - } - - #[test] - fn kill_all_children_is_safe_on_empty_pool() { - let pool = AgentPool::new_test(3001); - // Should not panic or deadlock on an empty registry. - pool.kill_all_children(); - assert_eq!(pool.child_killer_count(), 0); - } - - #[test] - fn kill_all_children_kills_real_process() { - // GIVEN: a real PTY child process (sleep 100) with its killer registered. 
- let pool = AgentPool::new_test(3001); - - let pty_system = native_pty_system(); - let pair = pty_system - .openpty(PtySize { - rows: 24, - cols: 80, - pixel_width: 0, - pixel_height: 0, - }) - .expect("failed to open pty"); - - let mut cmd = CommandBuilder::new("sleep"); - cmd.arg("100"); - let mut child = pair - .slave - .spawn_command(cmd) - .expect("failed to spawn sleep"); - let pid = child.process_id().expect("no pid"); - - pool.inject_child_killer("story:agent", child.clone_killer()); - - // Verify the process is alive before we kill it. - assert!( - process_is_running(pid), - "process {pid} should be running before kill_all_children" - ); - - // WHEN: kill_all_children() is called. - pool.kill_all_children(); - - // Collect the exit status (prevents zombie; also ensures signal was sent). - let _ = child.wait(); - - // THEN: the process should be dead. - assert!( - !process_is_running(pid), - "process {pid} should have been killed by kill_all_children" - ); - } - - #[test] - fn kill_all_children_clears_registry() { - // GIVEN: a pool with one registered killer. - let pool = AgentPool::new_test(3001); - - let pty_system = native_pty_system(); - let pair = pty_system - .openpty(PtySize { - rows: 24, - cols: 80, - pixel_width: 0, - pixel_height: 0, - }) - .expect("failed to open pty"); - - let mut cmd = CommandBuilder::new("sleep"); - cmd.arg("1"); - let mut child = pair - .slave - .spawn_command(cmd) - .expect("failed to spawn sleep"); - - pool.inject_child_killer("story:agent", child.clone_killer()); - assert_eq!(pool.child_killer_count(), 1); - - // WHEN: kill_all_children() is called. - pool.kill_all_children(); - let _ = child.wait(); - - // THEN: the registry is empty. 
- assert_eq!( - pool.child_killer_count(), - 0, - "child_killers should be cleared after kill_all_children" - ); - } - - // ── available_agents_for_stage tests (story 190) ────────────────────────── - - #[test] - fn available_agents_for_stage_returns_idle_agents() { - let config = make_config( - r#" -[[agent]] -name = "coder-1" -stage = "coder" - -[[agent]] -name = "coder-2" -stage = "coder" - -[[agent]] -name = "qa" -stage = "qa" -"#, - ); - let pool = AgentPool::new_test(3001); - // coder-1 is busy on story-1 - pool.inject_test_agent("story-1", "coder-1", AgentStatus::Running); - - let available = pool - .available_agents_for_stage(&config, &PipelineStage::Coder) - .unwrap(); - assert_eq!(available, vec!["coder-2"]); - - let available_qa = pool - .available_agents_for_stage(&config, &PipelineStage::Qa) - .unwrap(); - assert_eq!(available_qa, vec!["qa"]); - } - - #[test] - fn available_agents_for_stage_returns_empty_when_all_busy() { - let config = make_config( - r#" -[[agent]] -name = "coder-1" -stage = "coder" -"#, - ); - let pool = AgentPool::new_test(3001); - pool.inject_test_agent("story-1", "coder-1", AgentStatus::Running); - - let available = pool - .available_agents_for_stage(&config, &PipelineStage::Coder) - .unwrap(); - assert!(available.is_empty()); - } - - #[test] - fn available_agents_for_stage_ignores_completed_agents() { - let config = make_config( - r#" -[[agent]] -name = "coder-1" -stage = "coder" -"#, - ); - let pool = AgentPool::new_test(3001); - // Completed agents should not count as busy. 
- pool.inject_test_agent("story-1", "coder-1", AgentStatus::Completed); - - let available = pool - .available_agents_for_stage(&config, &PipelineStage::Coder) - .unwrap(); - assert_eq!(available, vec!["coder-1"]); - } - - #[tokio::test] - async fn start_agent_auto_selects_second_coder_when_first_busy() { - let tmp = tempfile::tempdir().unwrap(); - let sk = tmp.path().join(".story_kit"); - std::fs::create_dir_all(&sk).unwrap(); - std::fs::write( - sk.join("project.toml"), - r#" -[[agent]] -name = "supervisor" -stage = "other" - -[[agent]] -name = "coder-1" -stage = "coder" - -[[agent]] -name = "coder-2" -stage = "coder" -"#, - ) - .unwrap(); - - let pool = AgentPool::new_test(3001); - // coder-1 is busy on another story - pool.inject_test_agent("other-story", "coder-1", AgentStatus::Running); - - // Call start_agent without agent_name — should pick coder-2 - let result = pool - .start_agent(tmp.path(), "42_my_story", None, None) - .await; - // Will fail for infrastructure reasons (no git repo), but should NOT - // fail with "All coder agents are busy" — that would mean it didn't - // try coder-2. 
- match result { - Ok(info) => { - assert_eq!(info.agent_name, "coder-2"); - } - Err(err) => { - assert!( - !err.contains("All coder agents are busy"), - "should have selected coder-2 but got: {err}" - ); - assert!( - !err.contains("No coder agent configured"), - "should not fail on agent selection, got: {err}" - ); - } - } - } - - #[tokio::test] - async fn start_agent_returns_busy_when_all_coders_occupied() { - let tmp = tempfile::tempdir().unwrap(); - let sk = tmp.path().join(".story_kit"); - std::fs::create_dir_all(&sk).unwrap(); - std::fs::write( - sk.join("project.toml"), - r#" -[[agent]] -name = "coder-1" -stage = "coder" - -[[agent]] -name = "coder-2" -stage = "coder" -"#, - ) - .unwrap(); - - let pool = AgentPool::new_test(3001); - pool.inject_test_agent("story-1", "coder-1", AgentStatus::Running); - pool.inject_test_agent("story-2", "coder-2", AgentStatus::Pending); - - let result = pool.start_agent(tmp.path(), "story-3", None, None).await; - assert!(result.is_err()); - let err = result.unwrap_err(); - assert!( - err.contains("All coder agents are busy"), - "expected busy error, got: {err}" - ); - } - - /// Story 203: when all coders are busy the story file must be moved from - /// 1_backlog/ to 2_current/ so that auto_assign_available_work can pick - /// it up once a coder finishes. - #[tokio::test] - async fn start_agent_moves_story_to_current_when_coders_busy() { - let tmp = tempfile::tempdir().unwrap(); - let sk = tmp.path().join(".story_kit"); - let backlog = sk.join("work/1_backlog"); - std::fs::create_dir_all(&backlog).unwrap(); - std::fs::write( - sk.join("project.toml"), - r#" -[[agent]] -name = "coder-1" -stage = "coder" -"#, - ) - .unwrap(); - // Place the story in 1_backlog/. 
- std::fs::write(backlog.join("story-3.md"), "---\nname: Story 3\n---\n").unwrap(); - - let pool = AgentPool::new_test(3001); - pool.inject_test_agent("story-1", "coder-1", AgentStatus::Running); - - let result = pool.start_agent(tmp.path(), "story-3", None, None).await; - - // Should fail because all coders are busy. - assert!(result.is_err()); - let err = result.unwrap_err(); - assert!( - err.contains("All coder agents are busy"), - "expected busy error, got: {err}" - ); - assert!( - err.contains("queued in work/2_current/"), - "expected story-to-current message, got: {err}" - ); - - // Story must have been moved to 2_current/. - let current_path = sk.join("work/2_current/story-3.md"); - assert!( - current_path.exists(), - "story should be in 2_current/ after busy error, but was not" - ); - let backlog_path = backlog.join("story-3.md"); - assert!( - !backlog_path.exists(), - "story should no longer be in 1_backlog/" - ); - } - - /// Story 203: auto_assign_available_work must detect a story in 2_current/ - /// with no active agent and start an agent for it. - #[tokio::test] - async fn auto_assign_picks_up_story_queued_in_current() { - let tmp = tempfile::tempdir().unwrap(); - let sk = tmp.path().join(".story_kit"); - let current = sk.join("work/2_current"); - std::fs::create_dir_all(¤t).unwrap(); - std::fs::write( - sk.join("project.toml"), - "[[agent]]\nname = \"coder-1\"\nstage = \"coder\"\n", - ) - .unwrap(); - // Place the story in 2_current/ (simulating the "queued" state). - std::fs::write(current.join("story-3.md"), "---\nname: Story 3\n---\n").unwrap(); - - let pool = AgentPool::new_test(3001); - // No agents are running — coder-1 is free. - - // auto_assign will try to call start_agent, which will attempt to create - // a worktree (will fail without a git repo) — that is fine. We only need - // to verify the agent is registered as Pending before the background - // task eventually fails. 
- pool.auto_assign_available_work(tmp.path()).await; - - let agents = pool.agents.lock().unwrap(); - let has_pending = agents.values().any(|a| { - a.agent_name == "coder-1" - && matches!(a.status, AgentStatus::Pending | AgentStatus::Running) - }); - assert!( - has_pending, - "auto_assign should have started coder-1 for story-3, but pool is empty" - ); - } - - /// Story 203: if a story is already in 2_current/ or later, start_agent - /// must not fail — the move is a no-op. - #[tokio::test] - async fn start_agent_story_already_in_current_is_noop() { - let tmp = tempfile::tempdir().unwrap(); - let sk = tmp.path().join(".story_kit"); - let current = sk.join("work/2_current"); - std::fs::create_dir_all(¤t).unwrap(); - std::fs::write( - sk.join("project.toml"), - "[[agent]]\nname = \"coder-1\"\nstage = \"coder\"\n", - ) - .unwrap(); - // Place the story directly in 2_current/. - std::fs::write(current.join("story-5.md"), "---\nname: Story 5\n---\n").unwrap(); - - let pool = AgentPool::new_test(3001); - - // start_agent should attempt to assign coder-1 (no infra, so it will - // fail for git reasons), but must NOT fail due to the story already - // being in 2_current/. - let result = pool.start_agent(tmp.path(), "story-5", None, None).await; - match result { - Ok(_) => {} - Err(e) => { - assert!( - !e.contains("Failed to move"), - "should not fail on idempotent move, got: {e}" - ); - } - } - } - - #[tokio::test] - async fn start_agent_explicit_name_unchanged_when_busy() { - let tmp = tempfile::tempdir().unwrap(); - let sk = tmp.path().join(".story_kit"); - std::fs::create_dir_all(&sk).unwrap(); - std::fs::write( - sk.join("project.toml"), - r#" -[[agent]] -name = "coder-1" -stage = "coder" - -[[agent]] -name = "coder-2" -stage = "coder" -"#, - ) - .unwrap(); - - let pool = AgentPool::new_test(3001); - pool.inject_test_agent("story-1", "coder-1", AgentStatus::Running); - - // Explicit request for coder-1 (busy) should fail even though coder-2 is free. 
- let result = pool - .start_agent(tmp.path(), "story-2", Some("coder-1"), None) - .await; - assert!(result.is_err()); - let err = result.unwrap_err(); - assert!( - err.contains("coder-1") && err.contains("already running"), - "expected explicit busy error, got: {err}" - ); - } - - // ── start_agent single-instance concurrency tests ───────────────────────── - - /// Regression test for bug 97: the agent pool must reject a second concurrent - /// instance of the same agent name even if it would run on a different story. - #[tokio::test] - async fn start_agent_rejects_when_same_agent_already_running_on_another_story() { - use std::fs; - - let tmp = tempfile::tempdir().unwrap(); - let root = tmp.path(); - - // Write a minimal project.toml so ProjectConfig::load can find the "qa" agent. - let sk_dir = root.join(".story_kit"); - fs::create_dir_all(&sk_dir).unwrap(); - fs::write(sk_dir.join("project.toml"), "[[agent]]\nname = \"qa\"\n").unwrap(); - - let pool = AgentPool::new_test(3001); - // Simulate qa already running on story-a. - pool.inject_test_agent("story-a", "qa", AgentStatus::Running); - - // Attempt to start qa on story-b — must be rejected. - let result = pool.start_agent(root, "story-b", Some("qa"), None).await; - - assert!( - result.is_err(), - "start_agent should fail when qa is already running on another story" - ); - let err = result.unwrap_err(); - assert!( - err.contains("already running") || err.contains("becomes available"), - "error message should explain why: got '{err}'" - ); - } - - /// Verify that the concurrency guard does NOT block an agent that is merely - /// Completed (not Running/Pending) — completed agents are free for new work. 
- #[tokio::test] - async fn start_agent_allows_new_story_when_previous_run_is_completed() { - use std::fs; - - let tmp = tempfile::tempdir().unwrap(); - let root = tmp.path(); - - let sk_dir = root.join(".story_kit"); - fs::create_dir_all(&sk_dir).unwrap(); - fs::write(sk_dir.join("project.toml"), "[[agent]]\nname = \"qa\"\n").unwrap(); - - let pool = AgentPool::new_test(3001); - // Previous run completed — should NOT block a new story. - pool.inject_test_agent("story-a", "qa", AgentStatus::Completed); - - // The call will fail eventually (no real worktree / Claude CLI), but it must - // NOT fail at the concurrency check. We detect the difference by inspecting - // the error message: a concurrency rejection says "already running", while a - // later failure (missing story file, missing claude binary, etc.) says something else. - let result = pool.start_agent(root, "story-b", Some("qa"), None).await; - - if let Err(ref e) = result { - assert!( - !e.contains("already running") && !e.contains("becomes available"), - "completed agent must not trigger the concurrency guard: got '{e}'" - ); - } - // result may be Ok (unlikely in test env) or Err for infra reasons — both fine. - } - - // ── bug 118: pending entry cleanup on start_agent failure ──────────────── - - /// Regression test for bug 118: when worktree creation fails (e.g. because - /// there is no git repo), the Pending entry that was inserted into the agent - /// HashMap must not remain Pending — it must transition to Failed. This - /// prevents `find_free_agent_for_stage` / auto-assign from being permanently - /// blocked. - /// - /// With story 157 the worktree creation moved into the background spawn, so - /// `start_agent` returns `Ok(Pending)` immediately. We use `wait_for_agent` - /// to block until the background task resolves. 
- #[tokio::test] - async fn start_agent_cleans_up_pending_entry_on_failure() { - use std::fs; - - let tmp = tempfile::tempdir().unwrap(); - let root = tmp.path(); - - // Minimal project.toml with a coder agent (must match 2_current/ stage). - let sk_dir = root.join(".story_kit"); - fs::create_dir_all(&sk_dir).unwrap(); - fs::write( - sk_dir.join("project.toml"), - "[[agent]]\nname = \"coder-1\"\nstage = \"coder\"\n", - ) - .unwrap(); - - // Create the story in upcoming so `move_story_to_current` succeeds, - // but do NOT init a git repo — `create_worktree` will fail in the spawn. - let upcoming = root.join(".story_kit/work/1_backlog"); - fs::create_dir_all(&upcoming).unwrap(); - fs::write(upcoming.join("50_story_test.md"), "---\nname: Test\n---\n").unwrap(); - - let pool = AgentPool::new_test(3099); - - let result = pool - .start_agent(root, "50_story_test", Some("coder-1"), None) - .await; - - // With the non-blocking flow, start_agent returns Ok(Pending) immediately. - // Worktree creation failure happens asynchronously in the background. - assert!( - result.is_ok(), - "start_agent should return Ok(Pending) immediately: {:?}", - result.err() - ); - assert_eq!( - result.unwrap().status, - AgentStatus::Pending, - "initial status must be Pending" - ); - - // Wait for the background task to reach a terminal state. - // It must fail (no git repo → create_worktree returns an error). - let final_info = pool - .wait_for_agent("50_story_test", "coder-1", 5000) - .await - .expect("wait_for_agent should not time out"); - assert_eq!( - final_info.status, - AgentStatus::Failed, - "agent must transition to Failed after worktree creation error" - ); - - // The pool must retain a Failed entry (not disappear silently). 
- let agents = pool.agents.lock().unwrap(); - let failed_entry = agents - .values() - .find(|a| a.agent_name == "coder-1" && a.status == AgentStatus::Failed); - assert!( - failed_entry.is_some(), - "agent pool must retain a Failed entry so the UI can show the error state" - ); - drop(agents); - - // The AgentEvent::Error must be persisted in the event_log so late - // subscribers / polling clients can see the failure reason. - let events = pool - .drain_events("50_story_test", "coder-1") - .expect("drain_events should succeed"); - let has_error_event = events.iter().any(|e| matches!(e, AgentEvent::Error { .. })); - assert!( - has_error_event, - "event_log must contain AgentEvent::Error after worktree creation fails" - ); - } - - /// Verify that a successful start_agent keeps the Running entry (guard is - /// disarmed). We cannot truly spawn an agent in tests, but we verify that - /// the concurrency check still blocks a second concurrent start — which - /// proves the first entry survived the guard. - #[tokio::test] - async fn start_agent_guard_does_not_remove_running_entry() { - use std::fs; - - let tmp = tempfile::tempdir().unwrap(); - let root = tmp.path(); - - let sk_dir = root.join(".story_kit"); - fs::create_dir_all(&sk_dir).unwrap(); - fs::write(sk_dir.join("project.toml"), "[[agent]]\nname = \"qa\"\n").unwrap(); - - let pool = AgentPool::new_test(3099); - - // Manually inject a Running agent (simulates successful start). - pool.inject_test_agent("story-x", "qa", AgentStatus::Running); - - // Attempting to start the same agent on a different story must be - // rejected — the Running entry must still be there. 
- let result = pool.start_agent(root, "story-y", Some("qa"), None).await; - - assert!(result.is_err()); - let err = result.unwrap_err(); - assert!( - err.contains("already running") || err.contains("becomes available"), - "running entry must survive: got '{err}'" - ); - } - - // ── TOCTOU race-condition regression tests (story 132) ─────────────────── - - /// Verify that a Pending entry (not just Running) blocks a concurrent - /// start_agent for the same agent name on a different story. This proves - /// the check-and-insert is atomic: the Pending entry is visible to the - /// second caller because it was inserted while the lock was still held. - #[tokio::test] - async fn toctou_pending_entry_blocks_same_agent_on_different_story() { - use std::fs; - - let tmp = tempfile::tempdir().unwrap(); - let root = tmp.path(); - - let sk_dir = root.join(".story_kit"); - fs::create_dir_all(&sk_dir).unwrap(); - fs::write( - sk_dir.join("project.toml"), - "[[agent]]\nname = \"coder-1\"\n", - ) - .unwrap(); - - let pool = AgentPool::new_test(3099); - - // Simulate what the winning concurrent call would have done: insert a - // Pending entry for coder-1 on story-86. - pool.inject_test_agent("86_story_foo", "coder-1", AgentStatus::Pending); - - // Now attempt to start coder-1 on a *different* story — must be rejected. - let result = pool - .start_agent(root, "130_story_bar", Some("coder-1"), None) - .await; - - assert!(result.is_err(), "second start_agent must be rejected"); - let err = result.unwrap_err(); - assert!( - err.contains("already running") || err.contains("becomes available"), - "expected concurrency-rejection message, got: '{err}'" - ); - } - - /// Concurrent start_agent calls for the same agent name on different stories - /// must result in exactly one rejection due to the concurrency check (not - /// due to an unrelated failure such as missing git repo). 
- #[tokio::test(flavor = "multi_thread", worker_threads = 2)] - async fn toctou_concurrent_start_agent_same_agent_exactly_one_concurrency_rejection() { - use std::fs; - use std::sync::Arc; - - let tmp = tempfile::tempdir().unwrap(); - let root = tmp.path().to_path_buf(); - - let sk_dir = root.join(".story_kit"); - fs::create_dir_all(sk_dir.join("work/1_backlog")).unwrap(); - fs::write( - root.join(".story_kit/project.toml"), - "[[agent]]\nname = \"coder-1\"\n", - ) - .unwrap(); - // Both stories must exist in upcoming so move_story_to_current can run - // (only the winner reaches that point, but we set both up defensively). - fs::write( - root.join(".story_kit/work/1_backlog/86_story_foo.md"), - "---\nname: Foo\n---\n", - ) - .unwrap(); - fs::write( - root.join(".story_kit/work/1_backlog/130_story_bar.md"), - "---\nname: Bar\n---\n", - ) - .unwrap(); - - let pool = Arc::new(AgentPool::new_test(3099)); - - let pool1 = pool.clone(); - let root1 = root.clone(); - let t1 = tokio::spawn(async move { - pool1 - .start_agent(&root1, "86_story_foo", Some("coder-1"), None) - .await - }); - - let pool2 = pool.clone(); - let root2 = root.clone(); - let t2 = tokio::spawn(async move { - pool2 - .start_agent(&root2, "130_story_bar", Some("coder-1"), None) - .await - }); - - let (r1, r2) = tokio::join!(t1, t2); - let r1 = r1.unwrap(); - let r2 = r2.unwrap(); - - // The concurrency-rejection message always contains "already running" / - // "becomes available". Any other error (e.g., missing git repo) means - // that call *won* the atomic check-and-insert. 
- let concurrency_rejections = [&r1, &r2] - .iter() - .filter(|r| { - r.as_ref().is_err_and(|e| { - e.contains("already running") || e.contains("becomes available") - }) - }) - .count(); - - assert_eq!( - concurrency_rejections, 1, - "exactly one call must be rejected by the concurrency check; \ - got r1={r1:?} r2={r2:?}" - ); - } - - // ── story-230: prevent duplicate stage agents on same story ─────────────── - - /// start_agent must reject a second coder on a story that already has a - /// Running coder, even if they are *different* agent names. - #[tokio::test] - async fn start_agent_rejects_second_coder_stage_on_same_story() { - use std::fs; - - let tmp = tempfile::tempdir().unwrap(); - let root = tmp.path(); - - let sk_dir = root.join(".story_kit"); - fs::create_dir_all(&sk_dir).unwrap(); - fs::write( - sk_dir.join("project.toml"), - "[[agent]]\nname = \"coder-1\"\n\n[[agent]]\nname = \"coder-2\"\n", - ) - .unwrap(); - - let pool = AgentPool::new_test(3099); - // coder-1 is already running on the story. - pool.inject_test_agent("42_story_foo", "coder-1", AgentStatus::Running); - - // Attempt to start coder-2 on the *same* story — must be rejected. - let result = pool - .start_agent(root, "42_story_foo", Some("coder-2"), None) - .await; - - assert!( - result.is_err(), - "second coder on same story must be rejected" - ); - let err = result.unwrap_err(); - assert!( - err.contains("same pipeline stage"), - "error must mention same pipeline stage, got: '{err}'" - ); - assert!( - err.contains("coder-1") && err.contains("coder-2"), - "error must name both agents, got: '{err}'" - ); - } - - /// The stage-conflict check must also cover QA: a second QA agent on the - /// same story must be rejected. 
- #[tokio::test] - async fn start_agent_rejects_second_qa_stage_on_same_story() { - use std::fs; - - let tmp = tempfile::tempdir().unwrap(); - let root = tmp.path(); - - let sk_dir = root.join(".story_kit"); - fs::create_dir_all(&sk_dir).unwrap(); - // Two qa agents using the explicit stage field so name-based detection - // doesn't interfere. - fs::write( - sk_dir.join("project.toml"), - "[[agent]]\nname = \"qa-1\"\nstage = \"qa\"\n\n\ - [[agent]]\nname = \"qa-2\"\nstage = \"qa\"\n", - ) - .unwrap(); - - let pool = AgentPool::new_test(3099); - pool.inject_test_agent("55_story_bar", "qa-1", AgentStatus::Running); - - let result = pool - .start_agent(root, "55_story_bar", Some("qa-2"), None) - .await; - - assert!(result.is_err(), "second qa on same story must be rejected"); - let err = result.unwrap_err(); - assert!( - err.contains("same pipeline stage"), - "error must mention same pipeline stage, got: '{err}'" - ); - } - - /// Regression test (story 230): concurrent start_agent calls with two - /// different coder names on the same story — exactly one must succeed - /// (or fail for infrastructure reasons), and exactly one must be rejected - /// with a stage-conflict error. - /// - /// The story is pre-placed in `2_current/` so that both concurrent - /// `move_story_to_current` calls are no-ops, guaranteeing both reach the - /// lock where the stage-conflict check fires. - #[tokio::test(flavor = "multi_thread", worker_threads = 2)] - async fn start_agent_concurrent_two_coders_same_story_exactly_one_stage_rejection() { - use std::fs; - use std::sync::Arc; - - let tmp = tempfile::tempdir().unwrap(); - let root = tmp.path().to_path_buf(); - - let sk_dir = root.join(".story_kit"); - // Place story directly in 2_current/ so move_story_to_current is a - // no-op for both concurrent callers, letting both reach the lock. 
- fs::create_dir_all(sk_dir.join("work/2_current")).unwrap(); - fs::write( - root.join(".story_kit/project.toml"), - "[[agent]]\nname = \"coder-1\"\n\n[[agent]]\nname = \"coder-2\"\n", - ) - .unwrap(); - fs::write( - root.join(".story_kit/work/2_current/42_story_foo.md"), - "---\nname: Foo\n---\n", - ) - .unwrap(); - - let pool = Arc::new(AgentPool::new_test(3099)); - - let pool1 = pool.clone(); - let root1 = root.clone(); - let t1 = tokio::spawn(async move { - pool1 - .start_agent(&root1, "42_story_foo", Some("coder-1"), None) - .await - }); - - let pool2 = pool.clone(); - let root2 = root.clone(); - let t2 = tokio::spawn(async move { - pool2 - .start_agent(&root2, "42_story_foo", Some("coder-2"), None) - .await - }); - - let (r1, r2) = tokio::join!(t1, t2); - let r1 = r1.unwrap(); - let r2 = r2.unwrap(); - - // Exactly one call must be rejected with a stage-conflict error. - let stage_rejections = [&r1, &r2] - .iter() - .filter(|r| r.as_ref().is_err_and(|e| e.contains("same pipeline stage"))) - .count(); - - assert_eq!( - stage_rejections, 1, - "exactly one call must be rejected by the stage-conflict check; \ - got r1={r1:?} r2={r2:?}" - ); - } - - /// Regression test (story 230): two coders on *different* stories must - /// not trigger the stage-conflict guard — the guard is per-story. - #[tokio::test] - async fn start_agent_two_coders_different_stories_not_blocked_by_stage_check() { - use std::fs; - - let tmp = tempfile::tempdir().unwrap(); - let root = tmp.path(); - - let sk_dir = root.join(".story_kit"); - fs::create_dir_all(sk_dir.join("work/1_backlog")).unwrap(); - fs::write( - root.join(".story_kit/project.toml"), - "[[agent]]\nname = \"coder-1\"\n\n[[agent]]\nname = \"coder-2\"\n", - ) - .unwrap(); - fs::write( - root.join(".story_kit/work/1_backlog/99_story_baz.md"), - "---\nname: Baz\n---\n", - ) - .unwrap(); - - let pool = AgentPool::new_test(3099); - // coder-1 is running on a *different* story. 
- pool.inject_test_agent("42_story_foo", "coder-1", AgentStatus::Running); - - // Starting coder-2 on story-99 must NOT be rejected by the stage - // guard (it may fail for infrastructure reasons like missing git repo, - // but not because of the stage-conflict check). - let result = pool - .start_agent(root, "99_story_baz", Some("coder-2"), None) - .await; - - if let Err(ref e) = result { - assert!( - !e.contains("same pipeline stage"), - "stage-conflict guard must not fire for agents on different stories; \ - got: '{e}'" - ); - } - // result may be Ok (unlikely in test env) or Err for infra reasons — both fine. - } - - /// Two concurrent auto_assign_available_work calls must not assign the same - /// agent to two stories simultaneously. After both complete, at most one - /// Pending/Running entry must exist per agent name. - #[tokio::test(flavor = "multi_thread", worker_threads = 2)] - async fn toctou_concurrent_auto_assign_no_duplicate_agent_assignments() { - use std::fs; - use std::sync::Arc; - - let tmp = tempfile::tempdir().unwrap(); - let root = tmp.path().to_path_buf(); - - let sk_dir = root.join(".story_kit"); - // Two stories waiting in 2_current, one coder agent. - fs::create_dir_all(sk_dir.join("work/2_current")).unwrap(); - fs::write( - sk_dir.join("project.toml"), - "[[agent]]\nname = \"coder-1\"\n", - ) - .unwrap(); - fs::write( - sk_dir.join("work/2_current/86_story_foo.md"), - "---\nname: Foo\n---\n", - ) - .unwrap(); - fs::write( - sk_dir.join("work/2_current/130_story_bar.md"), - "---\nname: Bar\n---\n", - ) - .unwrap(); - - let pool = Arc::new(AgentPool::new_test(3099)); - - // Run two concurrent auto_assign calls. 
- let pool1 = pool.clone(); - let root1 = root.clone(); - let t1 = tokio::spawn(async move { pool1.auto_assign_available_work(&root1).await }); - - let pool2 = pool.clone(); - let root2 = root.clone(); - let t2 = tokio::spawn(async move { pool2.auto_assign_available_work(&root2).await }); - - let _ = tokio::join!(t1, t2); - - // At most one Pending/Running entry should exist for coder-1. - let agents = pool.agents.lock().unwrap(); - let active_coder_count = agents - .values() - .filter(|a| { - a.agent_name == "coder-1" - && matches!(a.status, AgentStatus::Pending | AgentStatus::Running) - }) - .count(); - - assert!( - active_coder_count <= 1, - "coder-1 must not be assigned to more than one story simultaneously; \ - found {active_coder_count} active entries" - ); - } - - // ── bug 312: stage-pipeline mismatch guard in start_agent ────────────── - - /// Bug 312: start_agent must reject a mergemaster on a story in 2_current/. - #[tokio::test] - async fn start_agent_rejects_mergemaster_on_coding_stage_story() { - use std::fs; - - let tmp = tempfile::tempdir().unwrap(); - let root = tmp.path(); - - let sk_dir = root.join(".story_kit"); - fs::create_dir_all(sk_dir.join("work/2_current")).unwrap(); - fs::write( - sk_dir.join("project.toml"), - "[[agent]]\nname = \"coder-1\"\nstage = \"coder\"\n\n\ - [[agent]]\nname = \"mergemaster\"\nstage = \"mergemaster\"\n", - ) - .unwrap(); - fs::write( - sk_dir.join("work/2_current/310_story_foo.md"), - "---\nname: Foo\n---\n", - ) - .unwrap(); - - let pool = AgentPool::new_test(3099); - let result = pool - .start_agent(root, "310_story_foo", Some("mergemaster"), None) - .await; - - assert!( - result.is_err(), - "mergemaster must not be assigned to a story in 2_current/" - ); - let err = result.unwrap_err(); - assert!( - err.contains("stage") && err.contains("2_current"), - "error must mention stage mismatch, got: '{err}'" - ); - } - - /// Bug 312: start_agent must reject a coder on a story in 3_qa/. 
- #[tokio::test] - async fn start_agent_rejects_coder_on_qa_stage_story() { - use std::fs; - - let tmp = tempfile::tempdir().unwrap(); - let root = tmp.path(); - - let sk_dir = root.join(".story_kit"); - fs::create_dir_all(sk_dir.join("work/3_qa")).unwrap(); - fs::write( - sk_dir.join("project.toml"), - "[[agent]]\nname = \"coder-1\"\nstage = \"coder\"\n\n\ - [[agent]]\nname = \"qa\"\nstage = \"qa\"\n", - ) - .unwrap(); - fs::write( - sk_dir.join("work/3_qa/42_story_bar.md"), - "---\nname: Bar\n---\n", - ) - .unwrap(); - - let pool = AgentPool::new_test(3099); - let result = pool - .start_agent(root, "42_story_bar", Some("coder-1"), None) - .await; - - assert!( - result.is_err(), - "coder must not be assigned to a story in 3_qa/" - ); - let err = result.unwrap_err(); - assert!( - err.contains("stage") && err.contains("3_qa"), - "error must mention stage mismatch, got: '{err}'" - ); - } - - /// Bug 312: start_agent must reject a QA agent on a story in 4_merge/. - #[tokio::test] - async fn start_agent_rejects_qa_on_merge_stage_story() { - use std::fs; - - let tmp = tempfile::tempdir().unwrap(); - let root = tmp.path(); - - let sk_dir = root.join(".story_kit"); - fs::create_dir_all(sk_dir.join("work/4_merge")).unwrap(); - fs::write( - sk_dir.join("project.toml"), - "[[agent]]\nname = \"qa\"\nstage = \"qa\"\n\n\ - [[agent]]\nname = \"mergemaster\"\nstage = \"mergemaster\"\n", - ) - .unwrap(); - fs::write( - sk_dir.join("work/4_merge/55_story_baz.md"), - "---\nname: Baz\n---\n", - ) - .unwrap(); - - let pool = AgentPool::new_test(3099); - let result = pool - .start_agent(root, "55_story_baz", Some("qa"), None) - .await; - - assert!( - result.is_err(), - "qa must not be assigned to a story in 4_merge/" - ); - let err = result.unwrap_err(); - assert!( - err.contains("stage") && err.contains("4_merge"), - "error must mention stage mismatch, got: '{err}'" - ); - } - - /// Bug 312: supervisor (stage=other) should be allowed on any pipeline stage. 
- #[tokio::test] - async fn start_agent_allows_supervisor_on_any_stage() { - use std::fs; - - let tmp = tempfile::tempdir().unwrap(); - let root = tmp.path(); - - let sk_dir = root.join(".story_kit"); - fs::create_dir_all(sk_dir.join("work/2_current")).unwrap(); - fs::write( - sk_dir.join("project.toml"), - "[[agent]]\nname = \"supervisor\"\nstage = \"other\"\n", - ) - .unwrap(); - fs::write( - sk_dir.join("work/2_current/77_story_sup.md"), - "---\nname: Sup\n---\n", - ) - .unwrap(); - - let pool = AgentPool::new_test(3099); - // start_agent will fail for git/worktree reasons, but NOT for stage - // mismatch. We just need to verify it doesn't fail with a stage error. - let result = pool - .start_agent(root, "77_story_sup", Some("supervisor"), None) - .await; - - match result { - Ok(_) => {} // Fine — no stage error. - Err(e) => { - assert!( - !e.contains("stage:") || !e.contains("cannot be assigned"), - "supervisor should not be rejected for stage mismatch, got: '{e}'" - ); - } - } - } - - /// Bug 312: correct stage agent should still be allowed. - #[tokio::test] - async fn start_agent_allows_correct_stage_agent() { - use std::fs; - - let tmp = tempfile::tempdir().unwrap(); - let root = tmp.path(); - - let sk_dir = root.join(".story_kit"); - fs::create_dir_all(sk_dir.join("work/4_merge")).unwrap(); - fs::write( - sk_dir.join("project.toml"), - "[[agent]]\nname = \"mergemaster\"\nstage = \"mergemaster\"\n", - ) - .unwrap(); - fs::write( - sk_dir.join("work/4_merge/88_story_ok.md"), - "---\nname: OK\n---\n", - ) - .unwrap(); - - let pool = AgentPool::new_test(3099); - let result = pool - .start_agent(root, "88_story_ok", Some("mergemaster"), None) - .await; - - match result { - Ok(_) => {} // Fine — correct stage. 
- Err(e) => { - assert!( - !e.contains("cannot be assigned"), - "mergemaster on 4_merge/ story should not fail stage check, got: '{e}'" - ); - } - } - } - - // ── merge_agent_work tests ──────────────────────────────────────────────── - - /// Helper: start a merge and poll until terminal state. - async fn run_merge_to_completion( - pool: &Arc, - repo: &std::path::Path, - story_id: &str, - ) -> MergeJob { - pool.start_merge_agent_work(repo, story_id).unwrap(); - loop { - tokio::time::sleep(std::time::Duration::from_millis(50)).await; - if let Some(job) = pool.get_merge_status(story_id) - && !matches!(job.status, MergeJobStatus::Running) - { - return job; - } - } - } - - #[tokio::test] - async fn merge_agent_work_returns_error_when_branch_not_found() { - use tempfile::tempdir; - - let tmp = tempdir().unwrap(); - let repo = tmp.path(); - init_git_repo(repo); - - let pool = Arc::new(AgentPool::new_test(3001)); - let job = run_merge_to_completion(&pool, repo, "99_nonexistent").await; - match &job.status { - MergeJobStatus::Completed(report) => { - assert!(!report.success, "should fail when branch missing"); - } - MergeJobStatus::Failed(_) => { - // Also acceptable — the pipeline errored out - } - MergeJobStatus::Running => { - panic!("should not still be running"); - } - } - } - - #[tokio::test] - async fn merge_agent_work_succeeds_on_clean_branch() { - use std::fs; - use tempfile::tempdir; - - let tmp = tempdir().unwrap(); - let repo = tmp.path(); - init_git_repo(repo); - - // Create a feature branch with a commit - Command::new("git") - .args(["checkout", "-b", "feature/story-23_test"]) - .current_dir(repo) - .output() - .unwrap(); - fs::write(repo.join("feature.txt"), "feature content").unwrap(); - Command::new("git") - .args(["add", "."]) - .current_dir(repo) - .output() - .unwrap(); - Command::new("git") - .args(["commit", "-m", "add feature"]) - .current_dir(repo) - .output() - .unwrap(); - - // Switch back to master (initial branch) - Command::new("git") - 
.args(["checkout", "master"]) - .current_dir(repo) - .output() - .unwrap(); - - // Create the story file in 4_merge/ so we can test archival - let merge_dir = repo.join(".story_kit/work/4_merge"); - fs::create_dir_all(&merge_dir).unwrap(); - let story_file = merge_dir.join("23_test.md"); - fs::write(&story_file, "---\nname: Test\n---\n").unwrap(); - Command::new("git") - .args(["add", "."]) - .current_dir(repo) - .output() - .unwrap(); - Command::new("git") - .args(["commit", "-m", "add story in merge"]) - .current_dir(repo) - .output() - .unwrap(); - - let pool = Arc::new(AgentPool::new_test(3001)); - let job = run_merge_to_completion(&pool, repo, "23_test").await; - - match &job.status { - MergeJobStatus::Completed(report) => { - assert!(!report.had_conflicts, "should have no conflicts"); - assert!( - report.success - || report.gate_output.contains("Failed to run") - || !report.gates_passed, - "report should be coherent: {report:?}" - ); - if report.story_archived { - let done = repo.join(".story_kit/work/5_done/23_test.md"); - assert!(done.exists(), "done file should exist"); - } - } - MergeJobStatus::Failed(e) => { - // Gate failures are acceptable in test env - assert!( - e.contains("Failed") || e.contains("failed"), - "unexpected failure: {e}" - ); - } - MergeJobStatus::Running => panic!("should not still be running"), - } - } - - // ── quality gate ordering test ──────────────────────────────── - - /// Regression test for bug 142: quality gates must run BEFORE the fast-forward - /// to master so that broken code never lands on master. - /// - /// Setup: a repo with a failing `script/test`, a feature branch with one commit. - /// When `run_squash_merge` is called, the gates must detect failure and abort the - /// fast-forward, leaving master HEAD unchanged. 
- #[cfg(unix)] - #[test] - fn quality_gates_run_before_fast_forward_to_master() { - use std::fs; - use std::os::unix::fs::PermissionsExt; - use tempfile::tempdir; - - let tmp = tempdir().unwrap(); - let repo = tmp.path(); - init_git_repo(repo); - - // Add a failing script/test so quality gates will fail. - let script_dir = repo.join("script"); - fs::create_dir_all(&script_dir).unwrap(); - let script_test = script_dir.join("test"); - fs::write(&script_test, "#!/usr/bin/env bash\nexit 1\n").unwrap(); - let mut perms = fs::metadata(&script_test).unwrap().permissions(); - perms.set_mode(0o755); - fs::set_permissions(&script_test, perms).unwrap(); - Command::new("git") - .args(["add", "."]) - .current_dir(repo) - .output() - .unwrap(); - Command::new("git") - .args(["commit", "-m", "add failing script/test"]) - .current_dir(repo) - .output() - .unwrap(); - - // Create a feature branch with a commit. - Command::new("git") - .args(["checkout", "-b", "feature/story-142_test"]) - .current_dir(repo) - .output() - .unwrap(); - fs::write(repo.join("change.txt"), "feature change").unwrap(); - Command::new("git") - .args(["add", "."]) - .current_dir(repo) - .output() - .unwrap(); - Command::new("git") - .args(["commit", "-m", "feature work"]) - .current_dir(repo) - .output() - .unwrap(); - - // Switch back to master and record its HEAD. - Command::new("git") - .args(["checkout", "master"]) - .current_dir(repo) - .output() - .unwrap(); - let head_before = String::from_utf8( - Command::new("git") - .args(["rev-parse", "HEAD"]) - .current_dir(repo) - .output() - .unwrap() - .stdout, - ) - .unwrap() - .trim() - .to_string(); - - // Run the squash-merge. The failing script/test makes quality gates - // fail → fast-forward must NOT happen. 
- let result = - crate::agents::merge::run_squash_merge(repo, "feature/story-142_test", "142_test") - .unwrap(); - - let head_after = String::from_utf8( - Command::new("git") - .args(["rev-parse", "HEAD"]) - .current_dir(repo) - .output() - .unwrap() - .stdout, - ) - .unwrap() - .trim() - .to_string(); - - // Gates must have failed (script/test exits 1) so master should be untouched. - assert!( - !result.success, - "run_squash_merge must report failure when gates fail" - ); - assert_eq!( - head_before, head_after, - "master HEAD must not advance when quality gates fail (bug 142)" - ); - } - - #[tokio::test] - async fn merge_agent_work_conflict_does_not_break_master() { - use std::fs; - use tempfile::tempdir; - - let tmp = tempdir().unwrap(); - let repo = tmp.path(); - init_git_repo(repo); - - // Create a file on master. - fs::write( - repo.join("code.rs"), - "fn main() {\n println!(\"hello\");\n}\n", - ) - .unwrap(); - Command::new("git") - .args(["add", "."]) - .current_dir(repo) - .output() - .unwrap(); - Command::new("git") - .args(["commit", "-m", "initial code"]) - .current_dir(repo) - .output() - .unwrap(); - - // Feature branch: modify the same line differently. - Command::new("git") - .args(["checkout", "-b", "feature/story-42_story_foo"]) - .current_dir(repo) - .output() - .unwrap(); - fs::write( - repo.join("code.rs"), - "fn main() {\n println!(\"hello\");\n feature_fn();\n}\n", - ) - .unwrap(); - Command::new("git") - .args(["add", "."]) - .current_dir(repo) - .output() - .unwrap(); - Command::new("git") - .args(["commit", "-m", "feature: add fn call"]) - .current_dir(repo) - .output() - .unwrap(); - - // Master: add different line at same location. 
- Command::new("git") - .args(["checkout", "master"]) - .current_dir(repo) - .output() - .unwrap(); - fs::write( - repo.join("code.rs"), - "fn main() {\n println!(\"hello\");\n master_fn();\n}\n", - ) - .unwrap(); - Command::new("git") - .args(["add", "."]) - .current_dir(repo) - .output() - .unwrap(); - Command::new("git") - .args(["commit", "-m", "master: add fn call"]) - .current_dir(repo) - .output() - .unwrap(); - - // Create story file in 4_merge. - let merge_dir = repo.join(".story_kit/work/4_merge"); - fs::create_dir_all(&merge_dir).unwrap(); - fs::write(merge_dir.join("42_story_foo.md"), "---\nname: Test\n---\n").unwrap(); - Command::new("git") - .args(["add", "."]) - .current_dir(repo) - .output() - .unwrap(); - Command::new("git") - .args(["commit", "-m", "add story"]) - .current_dir(repo) - .output() - .unwrap(); - - let pool = Arc::new(AgentPool::new_test(3001)); - let job = run_merge_to_completion(&pool, repo, "42_story_foo").await; - - // Master should NEVER have conflict markers, regardless of merge outcome. - let master_code = fs::read_to_string(repo.join("code.rs")).unwrap(); - assert!( - !master_code.contains("<<<<<<<"), - "master must never contain conflict markers:\n{master_code}" - ); - assert!( - !master_code.contains(">>>>>>>"), - "master must never contain conflict markers:\n{master_code}" - ); - - // The report should accurately reflect what happened. 
- match &job.status { - MergeJobStatus::Completed(report) => { - assert!(report.had_conflicts, "should report conflicts"); - } - MergeJobStatus::Failed(_) => { - // Acceptable — merge aborted due to conflicts - } - MergeJobStatus::Running => panic!("should not still be running"), - } - } - - // ── reconcile_on_startup tests ──────────────────────────────────────────── - - #[tokio::test] - async fn reconcile_on_startup_noop_when_no_worktrees() { - let tmp = tempfile::tempdir().unwrap(); - let pool = AgentPool::new_test(3001); - let (tx, _rx) = broadcast::channel(16); - // Should not panic; no worktrees to reconcile. - pool.reconcile_on_startup(tmp.path(), &tx).await; - } - - #[tokio::test] - async fn reconcile_on_startup_emits_done_event() { - let tmp = tempfile::tempdir().unwrap(); - let pool = AgentPool::new_test(3001); - let (tx, mut rx) = broadcast::channel::(16); - pool.reconcile_on_startup(tmp.path(), &tx).await; - - // Collect all events; the last must be "done". - let mut events: Vec = Vec::new(); - while let Ok(evt) = rx.try_recv() { - events.push(evt); - } - assert!( - events.iter().any(|e| e.status == "done"), - "reconcile_on_startup must emit a 'done' event; got: {:?}", - events.iter().map(|e| &e.status).collect::>() - ); - } - - #[tokio::test] - async fn reconcile_on_startup_skips_story_without_committed_work() { - use std::fs; - let tmp = tempfile::tempdir().unwrap(); - let root = tmp.path(); - - // Set up story in 2_current/. - let current = root.join(".story_kit/work/2_current"); - fs::create_dir_all(¤t).unwrap(); - fs::write(current.join("60_story_test.md"), "test").unwrap(); - - // Create a worktree directory that is a fresh git repo with no commits - // ahead of its own base branch (simulates a worktree where no work was done). 
- let wt_dir = root.join(".story_kit/worktrees/60_story_test"); - fs::create_dir_all(&wt_dir).unwrap(); - init_git_repo(&wt_dir); - - let pool = AgentPool::new_test(3001); - let (tx, _rx) = broadcast::channel(16); - pool.reconcile_on_startup(root, &tx).await; - - // Story should still be in 2_current/ — nothing was reconciled. - assert!( - current.join("60_story_test.md").exists(), - "story should stay in 2_current/ when worktree has no committed work" - ); - } - - #[tokio::test] - async fn reconcile_on_startup_runs_gates_on_worktree_with_committed_work() { - use std::fs; - let tmp = tempfile::tempdir().unwrap(); - let root = tmp.path(); - - // Set up a git repo for the project root. - init_git_repo(root); - - // Set up story in 2_current/ and commit it so the project root is clean. - let current = root.join(".story_kit/work/2_current"); - fs::create_dir_all(¤t).unwrap(); - fs::write(current.join("61_story_test.md"), "test").unwrap(); - Command::new("git") - .args(["add", "."]) - .current_dir(root) - .output() - .unwrap(); - Command::new("git") - .args([ - "-c", - "user.email=test@test.com", - "-c", - "user.name=Test", - "commit", - "-m", - "add story", - ]) - .current_dir(root) - .output() - .unwrap(); - - // Create a real git worktree for the story. - let wt_dir = root.join(".story_kit/worktrees/61_story_test"); - fs::create_dir_all(wt_dir.parent().unwrap()).unwrap(); - Command::new("git") - .args([ - "worktree", - "add", - &wt_dir.to_string_lossy(), - "-b", - "feature/story-61_story_test", - ]) - .current_dir(root) - .output() - .unwrap(); - - // Add a commit to the feature branch (simulates coder completing work). 
- fs::write(wt_dir.join("implementation.txt"), "done").unwrap(); - Command::new("git") - .args(["add", "."]) - .current_dir(&wt_dir) - .output() - .unwrap(); - Command::new("git") - .args([ - "-c", - "user.email=test@test.com", - "-c", - "user.name=Test", - "commit", - "-m", - "implement story", - ]) - .current_dir(&wt_dir) - .output() - .unwrap(); - - assert!( - crate::agents::gates::worktree_has_committed_work(&wt_dir), - "test setup: worktree should have committed work" - ); - - let pool = AgentPool::new_test(3001); - let (tx, _rx) = broadcast::channel(16); - pool.reconcile_on_startup(root, &tx).await; - - // In the test env, cargo clippy will fail (no Cargo.toml) so gates fail - // and the story stays in 2_current/. The important assertion is that - // reconcile ran without panicking and the story is in a consistent state. - let in_current = current.join("61_story_test.md").exists(); - let in_qa = root.join(".story_kit/work/3_qa/61_story_test.md").exists(); - assert!( - in_current || in_qa, - "story should be in 2_current/ or 3_qa/ after reconciliation" - ); - } - - #[test] - fn has_review_hold_returns_true_when_set() { - let tmp = tempfile::tempdir().unwrap(); - let qa_dir = tmp.path().join(".story_kit/work/3_qa"); - std::fs::create_dir_all(&qa_dir).unwrap(); - let spike_path = qa_dir.join("10_spike_research.md"); - std::fs::write( - &spike_path, - "---\nname: Research spike\nreview_hold: true\n---\n# Spike\n", - ) - .unwrap(); - assert!(has_review_hold(tmp.path(), "3_qa", "10_spike_research")); - } - - #[test] - fn has_review_hold_returns_false_when_not_set() { - let tmp = tempfile::tempdir().unwrap(); - let qa_dir = tmp.path().join(".story_kit/work/3_qa"); - std::fs::create_dir_all(&qa_dir).unwrap(); - let spike_path = qa_dir.join("10_spike_research.md"); - std::fs::write(&spike_path, "---\nname: Research spike\n---\n# Spike\n").unwrap(); - assert!(!has_review_hold(tmp.path(), "3_qa", "10_spike_research")); - } - - #[test] - fn 
has_review_hold_returns_false_when_file_missing() { - let tmp = tempfile::tempdir().unwrap(); - assert!(!has_review_hold(tmp.path(), "3_qa", "99_spike_missing")); - } - - /// Story 265: auto_assign_available_work must skip spikes in 3_qa/ that - /// have review_hold: true set in their front matter. - #[tokio::test] - async fn auto_assign_skips_spikes_with_review_hold() { - let tmp = tempfile::tempdir().unwrap(); - let root = tmp.path(); - - // Create project.toml with a QA agent. - let sk = root.join(".story_kit"); - std::fs::create_dir_all(&sk).unwrap(); - std::fs::write( - sk.join("project.toml"), - "[[agents]]\nname = \"qa\"\nrole = \"qa\"\nmodel = \"test\"\nprompt = \"test\"\n", - ) - .unwrap(); - - // Put a spike in 3_qa/ with review_hold: true. - let qa_dir = root.join(".story_kit/work/3_qa"); - std::fs::create_dir_all(&qa_dir).unwrap(); - std::fs::write( - qa_dir.join("20_spike_test.md"), - "---\nname: Test Spike\nreview_hold: true\n---\n# Spike\n", - ) - .unwrap(); - - let (watcher_tx, _) = broadcast::channel::(4); - let pool = AgentPool::new(3001, watcher_tx); - - pool.auto_assign_available_work(root).await; - - // No agent should have been started for the spike. - let agents = pool.agents.lock().unwrap(); - assert!( - agents.is_empty(), - "No agents should be assigned to a spike with review_hold" - ); - } - - // ── Story 279: auto-assign respects agent stage from front matter ────────── - - /// When a story in 3_qa/ has `agent: coder-1` in its front matter but - /// coder-1 is a coder-stage agent, auto-assign must NOT assign coder-1. - /// Instead it should fall back to a free QA-stage agent. 
- #[tokio::test] - async fn auto_assign_ignores_coder_preference_when_story_is_in_qa_stage() { - let tmp = tempfile::tempdir().unwrap(); - let sk = tmp.path().join(".story_kit"); - let qa_dir = sk.join("work/3_qa"); - std::fs::create_dir_all(&qa_dir).unwrap(); - std::fs::write( - sk.join("project.toml"), - "[[agent]]\nname = \"coder-1\"\nstage = \"coder\"\n\n\ - [[agent]]\nname = \"qa-1\"\nstage = \"qa\"\n", - ) - .unwrap(); - // Story in 3_qa/ with a preferred coder-stage agent. - std::fs::write( - qa_dir.join("story-qa1.md"), - "---\nname: QA Story\nagent: coder-1\n---\n", - ) - .unwrap(); - - let pool = AgentPool::new_test(3001); - - pool.auto_assign_available_work(tmp.path()).await; - - let agents = pool.agents.lock().unwrap(); - // coder-1 must NOT have been assigned (wrong stage for 3_qa/). - let coder_assigned = agents.values().any(|a| { - a.agent_name == "coder-1" - && matches!(a.status, AgentStatus::Pending | AgentStatus::Running) - }); - assert!( - !coder_assigned, - "coder-1 should not be assigned to a QA-stage story" - ); - // qa-1 should have been assigned instead. - let qa_assigned = agents.values().any(|a| { - a.agent_name == "qa-1" - && matches!(a.status, AgentStatus::Pending | AgentStatus::Running) - }); - assert!( - qa_assigned, - "qa-1 should be assigned as fallback for the QA-stage story" - ); - } - - /// When a story in 2_current/ has `agent: coder-1` in its front matter and - /// coder-1 is a coder-stage agent, auto-assign must respect the preference - /// and assign coder-1 (not fall back to some other coder). 
- #[tokio::test]
- async fn auto_assign_respects_coder_preference_when_story_is_in_current_stage() {
- let tmp = tempfile::tempdir().unwrap();
- let sk = tmp.path().join(".story_kit");
- let current_dir = sk.join("work/2_current");
- std::fs::create_dir_all(&current_dir).unwrap();
- std::fs::write(
- sk.join("project.toml"),
- "[[agent]]\nname = \"coder-1\"\nstage = \"coder\"\n\n\
- [[agent]]\nname = \"coder-2\"\nstage = \"coder\"\n",
- )
- .unwrap();
- // Story in 2_current/ with a preferred coder-1 agent.
- std::fs::write(
- current_dir.join("story-pref.md"),
- "---\nname: Coder Story\nagent: coder-1\n---\n",
- )
- .unwrap();
-
- let pool = AgentPool::new_test(3001);
-
- pool.auto_assign_available_work(tmp.path()).await;
-
- let agents = pool.agents.lock().unwrap();
- // coder-1 should have been picked (it matches the stage and is preferred).
- let coder1_assigned = agents.values().any(|a| {
- a.agent_name == "coder-1"
- && matches!(a.status, AgentStatus::Pending | AgentStatus::Running)
- });
- assert!(
- coder1_assigned,
- "coder-1 should be assigned when it matches the stage and is preferred"
- );
- // coder-2 must NOT be assigned (not preferred).
- let coder2_assigned = agents.values().any(|a| {
- a.agent_name == "coder-2"
- && matches!(a.status, AgentStatus::Pending | AgentStatus::Running)
- });
- assert!(
- !coder2_assigned,
- "coder-2 should not be assigned when coder-1 is explicitly preferred"
- );
- }
-
- /// When the preferred agent's stage mismatches and no other agent of the
- /// correct stage is available, auto-assign must not start any agent for that
- /// story (no panic, no error).
- #[tokio::test]
- async fn auto_assign_stage_mismatch_with_no_fallback_starts_no_agent() {
- let tmp = tempfile::tempdir().unwrap();
- let sk = tmp.path().join(".story_kit");
- let qa_dir = sk.join("work/3_qa");
- std::fs::create_dir_all(&qa_dir).unwrap();
- // Only a coder agent is configured — no QA agent exists. 
- std::fs::write( - sk.join("project.toml"), - "[[agent]]\nname = \"coder-1\"\nstage = \"coder\"\n", - ) - .unwrap(); - // Story in 3_qa/ requests coder-1 (wrong stage) and no QA agent exists. - std::fs::write( - qa_dir.join("story-noqa.md"), - "---\nname: QA Story No Agent\nagent: coder-1\n---\n", - ) - .unwrap(); - - let pool = AgentPool::new_test(3001); - - // Must not panic. - pool.auto_assign_available_work(tmp.path()).await; - - let agents = pool.agents.lock().unwrap(); - assert!( - agents.is_empty(), - "No agent should be started when no stage-appropriate agent is available" - ); - } - - /// Bug 295: when a coder completes and QA is busy on another story, - /// the newly QA-queued story must be picked up when `run_pipeline_advance` - /// finishes for the busy QA agent's story (because auto_assign is now - /// called unconditionally at the end of pipeline advance). - #[tokio::test] - async fn pipeline_advance_picks_up_waiting_qa_stories_after_completion() { - use std::fs; - let tmp = tempfile::tempdir().unwrap(); - let root = tmp.path(); - - let sk = root.join(".story_kit"); - let qa_dir = sk.join("work/3_qa"); - fs::create_dir_all(&qa_dir).unwrap(); - - // Configure a single QA agent. - fs::write( - sk.join("project.toml"), - r#" -[[agent]] -name = "qa" -stage = "qa" -"#, - ) - .unwrap(); - - // Story 292 is in QA with QA agent running (will "complete" via - // run_pipeline_advance below). Story 293 is in QA with NO agent — - // simulating the "stuck" state from bug 295. - fs::write( - qa_dir.join("292_story_first.md"), - "---\nname: First\nqa: human\n---\n", - ) - .unwrap(); - fs::write( - qa_dir.join("293_story_second.md"), - "---\nname: Second\nqa: human\n---\n", - ) - .unwrap(); - - let pool = AgentPool::new_test(3001); - // QA is currently running on story 292. - pool.inject_test_agent("292_story_first", "qa", AgentStatus::Running); - - // Verify that 293 cannot get a QA agent right now (QA is busy). 
- { - let agents = pool.agents.lock().unwrap(); - assert!( - !is_agent_free(&agents, "qa"), - "qa should be busy on story 292" - ); - } - - // Simulate QA completing on story 292: remove the agent from the pool - // (as run_server_owned_completion does) then run pipeline advance. - { - let mut agents = pool.agents.lock().unwrap(); - agents.remove(&composite_key("292_story_first", "qa")); - } - - // Pipeline advance for QA with gates_passed=true will: - // 1. Run coverage gate (will "pass" trivially in test — no script/test_coverage) - // 2. Set review_hold on 292 (qa: human) - // 3. Call auto_assign_available_work (the fix from bug 295) - // 4. auto_assign should find 293 in 3_qa/ with no agent and start qa on it - pool.run_pipeline_advance( - "292_story_first", - "qa", - CompletionReport { - summary: "QA done".to_string(), - gates_passed: true, - gate_output: String::new(), - }, - Some(root.to_path_buf()), - None, - false, - ) - .await; - - // After pipeline advance, auto_assign should have started QA on story 293. - let agents = pool.agents.lock().unwrap(); - let qa_on_293 = agents.values().any(|a| { - a.agent_name == "qa" - && matches!(a.status, AgentStatus::Pending | AgentStatus::Running) - }); - assert!( - qa_on_293, - "auto_assign should have started qa for story 293 after 292's QA completed, \ - but no qa agent is pending/running. 
Pool: {:?}",
- agents
- .iter()
- .map(|(k, a)| format!("{k}: {} ({})", a.agent_name, a.status))
- .collect::<Vec<_>>()
- );
- }
-
- // ── Helper to construct a test StoryAgent ──────────────────────────
-
- fn make_test_story_agent(agent_name: &str, status: AgentStatus) -> StoryAgent {
- StoryAgent {
- agent_name: agent_name.to_string(),
- status,
- worktree_info: None,
- session_id: None,
- tx: broadcast::channel(1).0,
- task_handle: None,
- event_log: Arc::new(Mutex::new(Vec::new())),
- completion: None,
- project_root: None,
- log_session_id: None,
- merge_failure_reported: false,
- }
- }
-
- // ── find_free_agent_for_stage: default_coder_model filtering ─────────
-
- #[test]
- fn find_free_agent_skips_opus_when_default_coder_model_set() {
- let config = make_config(
- r#"
-default_coder_model = "sonnet"
-
-[[agent]]
-name = "coder-1"
-stage = "coder"
-model = "sonnet"
-
-[[agent]]
-name = "coder-opus"
-stage = "coder"
-model = "opus"
-"#,
- );
-
- let agents = HashMap::new();
- let free = find_free_agent_for_stage(&config, &agents, &PipelineStage::Coder);
- assert_eq!(free, Some("coder-1"));
- }
-
- #[test]
- fn find_free_agent_returns_opus_when_no_default_coder_model() {
- let config = make_config(
- r#"
-[[agent]]
-name = "coder-opus"
-stage = "coder"
-model = "opus"
-"#,
- );
-
- let agents = HashMap::new();
- let free = find_free_agent_for_stage(&config, &agents, &PipelineStage::Coder);
- assert_eq!(free, Some("coder-opus"));
- }
-
- #[test]
- fn find_free_agent_returns_none_when_all_sonnet_coders_busy() {
- let config = make_config(
- r#"
-default_coder_model = "sonnet"
-
-[[agent]]
-name = "coder-1"
-stage = "coder"
-model = "sonnet"
-
-[[agent]]
-name = "coder-opus"
-stage = "coder"
-model = "opus"
-"#,
- );
-
- let mut agents = HashMap::new();
- agents.insert(
- "story1:coder-1".to_string(),
- make_test_story_agent("coder-1", AgentStatus::Running),
- );
-
- let free = find_free_agent_for_stage(&config, &agents, &PipelineStage::Coder);
- assert_eq!(free, None, 
"opus agent should not be auto-assigned"); - } - - // ── find_free_agent_for_stage: max_coders limit ───────────────────── - - #[test] - fn find_free_agent_respects_max_coders() { - let config = make_config( - r#" -max_coders = 1 - -[[agent]] -name = "coder-1" -stage = "coder" -model = "sonnet" - -[[agent]] -name = "coder-2" -stage = "coder" -model = "sonnet" -"#, - ); - - let mut agents = HashMap::new(); - agents.insert( - "story1:coder-1".to_string(), - make_test_story_agent("coder-1", AgentStatus::Running), - ); - - let free = find_free_agent_for_stage(&config, &agents, &PipelineStage::Coder); - assert_eq!(free, None, "max_coders=1 should block second coder"); - } - - #[test] - fn find_free_agent_allows_within_max_coders() { - let config = make_config( - r#" -max_coders = 2 - -[[agent]] -name = "coder-1" -stage = "coder" -model = "sonnet" - -[[agent]] -name = "coder-2" -stage = "coder" -model = "sonnet" -"#, - ); - - let mut agents = HashMap::new(); - agents.insert( - "story1:coder-1".to_string(), - make_test_story_agent("coder-1", AgentStatus::Running), - ); - - let free = find_free_agent_for_stage(&config, &agents, &PipelineStage::Coder); - assert_eq!(free, Some("coder-2")); - } - - #[test] - fn max_coders_does_not_affect_qa_stage() { - let config = make_config( - r#" -max_coders = 1 - -[[agent]] -name = "qa" -stage = "qa" -model = "sonnet" -"#, - ); - - let agents = HashMap::new(); - let free = find_free_agent_for_stage(&config, &agents, &PipelineStage::Qa); - assert_eq!(free, Some("qa")); - } - - // ── count_active_agents_for_stage ──────────────────────────────────── - - #[test] - fn count_active_agents_counts_running_and_pending() { - let config = make_config( - r#" -[[agent]] -name = "coder-1" -stage = "coder" - -[[agent]] -name = "coder-2" -stage = "coder" -"#, - ); - - let mut agents = HashMap::new(); - agents.insert( - "s1:coder-1".to_string(), - make_test_story_agent("coder-1", AgentStatus::Running), - ); - agents.insert( - "s2:coder-2".to_string(), - 
make_test_story_agent("coder-2", AgentStatus::Completed), - ); - - let count = count_active_agents_for_stage(&config, &agents, &PipelineStage::Coder); - assert_eq!(count, 1, "Only Running coder should be counted, not Completed"); - } -} diff --git a/server/src/agents/pool/auto_assign.rs b/server/src/agents/pool/auto_assign.rs new file mode 100644 index 0000000..6e676be --- /dev/null +++ b/server/src/agents/pool/auto_assign.rs @@ -0,0 +1,1813 @@ +//! Auto-assign logic: scanning pipeline stages for unassigned stories and +//! dispatching free agents, startup reconciliation, and the watchdog task. + +use crate::config::ProjectConfig; +use crate::slog; +use crate::slog_error; +use crate::slog_warn; +use crate::worktree; +use std::collections::HashMap; +use std::path::{Path, PathBuf}; +use std::sync::{Arc, Mutex}; +use tokio::sync::broadcast; + +use super::super::{ + AgentEvent, AgentStatus, PipelineStage, ReconciliationEvent, agent_config_stage, pipeline_stage, +}; +use super::{AgentPool, StoryAgent, find_active_story_stage}; + +impl AgentPool { + pub async fn auto_assign_available_work(&self, project_root: &Path) { + let config = match ProjectConfig::load(project_root) { + Ok(c) => c, + Err(e) => { + slog_warn!("[auto-assign] Failed to load project config: {e}"); + return; + } + }; + + // Process each active pipeline stage in order. + let stages: [(&str, PipelineStage); 3] = [ + ("2_current", PipelineStage::Coder), + ("3_qa", PipelineStage::Qa), + ("4_merge", PipelineStage::Mergemaster), + ]; + + for (stage_dir, stage) in &stages { + let items = scan_stage_items(project_root, stage_dir); + if items.is_empty() { + continue; + } + + for story_id in &items { + // Items marked with review_hold (e.g. spikes after QA passes) stay + // in their current stage for human review — don't auto-assign agents. + if has_review_hold(project_root, stage_dir, story_id) { + continue; + } + + // Skip blocked stories (retry limit exceeded). 
+ if is_story_blocked(project_root, stage_dir, story_id) { + continue; + } + + // Skip stories in 4_merge/ that already have a reported merge failure. + // These need human intervention — auto-assigning a new mergemaster + // would just waste tokens on the same broken merge. + if *stage == PipelineStage::Mergemaster + && has_merge_failure(project_root, stage_dir, story_id) + { + continue; + } + + // AC6: Detect empty-diff stories in 4_merge/ before starting a + // mergemaster. If the worktree has no commits on the feature branch, + // write a merge_failure and block the story immediately. + if *stage == PipelineStage::Mergemaster + && let Some(wt_path) = worktree::find_worktree_path(project_root, story_id) + && !super::super::gates::worktree_has_committed_work(&wt_path) + { + slog_warn!( + "[auto-assign] Story '{story_id}' in 4_merge/ has no commits \ + on feature branch. Writing merge_failure and blocking." + ); + let story_path = project_root + .join(".story_kit/work") + .join(stage_dir) + .join(format!("{story_id}.md")); + let _ = crate::io::story_metadata::write_merge_failure( + &story_path, + "Feature branch has no code changes — the coder agent \ + did not produce any commits.", + ); + let _ = crate::io::story_metadata::write_blocked(&story_path); + continue; + } + + // Re-acquire the lock on each iteration to see state changes + // from previous start_agent calls in the same pass. + let preferred_agent = + read_story_front_matter_agent(project_root, stage_dir, story_id); + + // Check max_coders limit for the Coder stage before agent selection. + // If the pool is full, all remaining items in this stage wait. 
+ if *stage == PipelineStage::Coder + && let Some(max) = config.max_coders + { + let agents_lock = match self.agents.lock() { + Ok(a) => a, + Err(e) => { + slog_error!("[auto-assign] Failed to lock agents: {e}"); + break; + } + }; + let active = count_active_agents_for_stage(&config, &agents_lock, stage); + if active >= max { + slog!( + "[auto-assign] Coder pool full ({active}/{max}); remaining items in {stage_dir}/ will wait." + ); + break; + } + } + + // Outcome: (already_assigned, chosen_agent, preferred_busy, stage_mismatch) + // preferred_busy=true means the story has a specific agent requested but it is + // currently occupied — the story should wait rather than fall back. + // stage_mismatch=true means the preferred agent's stage doesn't match the + // pipeline stage, so we fell back to a generic stage agent. + let (already_assigned, free_agent, preferred_busy, stage_mismatch) = { + let agents = match self.agents.lock() { + Ok(a) => a, + Err(e) => { + slog_error!("[auto-assign] Failed to lock agents: {e}"); + break; + } + }; + let assigned = is_story_assigned_for_stage(&config, &agents, story_id, stage); + if assigned { + (true, None, false, false) + } else if let Some(ref pref) = preferred_agent { + // Story has a front-matter agent preference. + // Verify the preferred agent's stage matches the current + // pipeline stage — a coder shouldn't be assigned to QA. + let pref_stage_matches = config + .find_agent(pref) + .map(|cfg| agent_config_stage(cfg) == *stage) + .unwrap_or(false); + if !pref_stage_matches { + // Stage mismatch — fall back to any free agent for this stage. 
+ let free = find_free_agent_for_stage(&config, &agents, stage) + .map(|s| s.to_string()); + (false, free, false, true) + } else if is_agent_free(&agents, pref) { + (false, Some(pref.clone()), false, false) + } else { + (false, None, true, false) + } + } else { + let free = find_free_agent_for_stage(&config, &agents, stage) + .map(|s| s.to_string()); + (false, free, false, false) + } + }; + + if already_assigned { + // Story already has an active agent — skip silently. + continue; + } + + if preferred_busy { + // The story requests a specific agent that is currently busy. + // Do not fall back to a different agent; let this story wait. + slog!( + "[auto-assign] Preferred agent '{}' busy for '{story_id}'; story will wait.", + preferred_agent.as_deref().unwrap_or("?") + ); + continue; + } + + if stage_mismatch { + slog!( + "[auto-assign] Preferred agent '{}' stage mismatch for '{story_id}' in {stage_dir}/; falling back to stage-appropriate agent.", + preferred_agent.as_deref().unwrap_or("?") + ); + } + + match free_agent { + Some(agent_name) => { + slog!( + "[auto-assign] Assigning '{agent_name}' to '{story_id}' in {stage_dir}/" + ); + if let Err(e) = self + .start_agent(project_root, story_id, Some(&agent_name), None) + .await + { + slog!( + "[auto-assign] Failed to start '{agent_name}' for '{story_id}': {e}" + ); + } + } + None => { + // No free agents of this type — stop scanning this stage. + slog!( + "[auto-assign] All {:?} agents busy; remaining items in {stage_dir}/ will wait.", + stage + ); + break; + } + } + } + } + } + + /// Reconcile stories whose agent work was committed while the server was offline. + /// + /// On server startup the in-memory agent pool is empty, so any story that an agent + /// completed during a previous session is stuck: the worktree has committed work but + /// the pipeline never advanced. 
This method detects those stories, re-runs the + /// acceptance gates, and advances the pipeline stage so that `auto_assign_available_work` + /// (called immediately after) picks up the right next-stage agents. + /// + /// Algorithm: + /// 1. List all worktree directories under `{project_root}/.story_kit/worktrees/`. + /// 2. For each worktree, check whether its feature branch has commits ahead of the + /// base branch (`master` / `main`). + /// 3. If committed work is found AND the story is in `2_current/` or `3_qa/`: + /// - Run acceptance gates (uncommitted-change check + clippy + tests). + /// - On pass + `2_current/`: move the story to `3_qa/`. + /// - On pass + `3_qa/`: run the coverage gate; if that also passes move to `4_merge/`. + /// - On failure: leave the story where it is so `auto_assign_available_work` can + /// start a fresh agent to retry. + /// 4. Stories in `4_merge/` are left for `auto_assign_available_work` to handle via a + /// fresh mergemaster (squash-merge must be re-executed by the mergemaster agent). + pub async fn reconcile_on_startup( + &self, + project_root: &Path, + progress_tx: &broadcast::Sender, + ) { + let worktrees = match worktree::list_worktrees(project_root) { + Ok(wt) => wt, + Err(e) => { + eprintln!("[startup:reconcile] Failed to list worktrees: {e}"); + let _ = progress_tx.send(ReconciliationEvent { + story_id: String::new(), + status: "done".to_string(), + message: format!("Reconciliation failed: {e}"), + }); + return; + } + }; + + for wt_entry in &worktrees { + let story_id = &wt_entry.story_id; + let wt_path = wt_entry.path.clone(); + + // Determine which active stage the story is in. + let stage_dir = match find_active_story_stage(project_root, story_id) { + Some(s) => s, + None => continue, // Not in any active stage (backlog/archived or unknown). + }; + + // 4_merge/ is left for auto_assign to handle with a fresh mergemaster. 
+ if stage_dir == "4_merge" { + continue; + } + + let _ = progress_tx.send(ReconciliationEvent { + story_id: story_id.clone(), + status: "checking".to_string(), + message: format!("Checking for committed work in {stage_dir}/"), + }); + + // Check whether the worktree has commits ahead of the base branch. + let wt_path_for_check = wt_path.clone(); + let has_work = tokio::task::spawn_blocking(move || { + super::super::gates::worktree_has_committed_work(&wt_path_for_check) + }) + .await + .unwrap_or(false); + + if !has_work { + eprintln!( + "[startup:reconcile] No committed work for '{story_id}' in {stage_dir}/; skipping." + ); + let _ = progress_tx.send(ReconciliationEvent { + story_id: story_id.clone(), + status: "skipped".to_string(), + message: "No committed work found; skipping.".to_string(), + }); + continue; + } + + eprintln!( + "[startup:reconcile] Found committed work for '{story_id}' in {stage_dir}/. Running acceptance gates." + ); + let _ = progress_tx.send(ReconciliationEvent { + story_id: story_id.clone(), + status: "gates_running".to_string(), + message: "Running acceptance gates…".to_string(), + }); + + // Run acceptance gates on the worktree. 
+ let wt_path_for_gates = wt_path.clone(); + let gates_result = tokio::task::spawn_blocking(move || { + super::super::gates::check_uncommitted_changes(&wt_path_for_gates)?; + super::super::gates::run_acceptance_gates(&wt_path_for_gates) + }) + .await; + + let (gates_passed, gate_output) = match gates_result { + Ok(Ok(pair)) => pair, + Ok(Err(e)) => { + eprintln!("[startup:reconcile] Gate check error for '{story_id}': {e}"); + let _ = progress_tx.send(ReconciliationEvent { + story_id: story_id.clone(), + status: "failed".to_string(), + message: format!("Gate error: {e}"), + }); + continue; + } + Err(e) => { + eprintln!("[startup:reconcile] Gate check task panicked for '{story_id}': {e}"); + let _ = progress_tx.send(ReconciliationEvent { + story_id: story_id.clone(), + status: "failed".to_string(), + message: format!("Gate task panicked: {e}"), + }); + continue; + } + }; + + if !gates_passed { + eprintln!( + "[startup:reconcile] Gates failed for '{story_id}': {gate_output}\n\ + Leaving in {stage_dir}/ for auto-assign to restart the agent." + ); + let _ = progress_tx.send(ReconciliationEvent { + story_id: story_id.clone(), + status: "failed".to_string(), + message: "Gates failed; will be retried by auto-assign.".to_string(), + }); + continue; + } + + eprintln!("[startup:reconcile] Gates passed for '{story_id}' (stage: {stage_dir}/)."); + + if stage_dir == "2_current" { + // Coder stage — determine qa mode to decide next step. 
+ let qa_mode = { + let item_type = super::super::lifecycle::item_type_from_id(story_id); + if item_type == "spike" { + crate::io::story_metadata::QaMode::Human + } else { + let default_qa = crate::config::ProjectConfig::load(project_root) + .unwrap_or_default() + .default_qa_mode(); + let story_path = project_root + .join(".story_kit/work/2_current") + .join(format!("{story_id}.md")); + crate::io::story_metadata::resolve_qa_mode(&story_path, default_qa) + } + }; + + match qa_mode { + crate::io::story_metadata::QaMode::Server => { + if let Err(e) = super::super::lifecycle::move_story_to_merge(project_root, story_id) { + eprintln!("[startup:reconcile] Failed to move '{story_id}' to 4_merge/: {e}"); + let _ = progress_tx.send(ReconciliationEvent { + story_id: story_id.clone(), + status: "failed".to_string(), + message: format!("Failed to advance to merge: {e}"), + }); + } else { + eprintln!("[startup:reconcile] Moved '{story_id}' → 4_merge/ (qa: server)."); + let _ = progress_tx.send(ReconciliationEvent { + story_id: story_id.clone(), + status: "advanced".to_string(), + message: "Gates passed — moved to merge (qa: server).".to_string(), + }); + } + } + crate::io::story_metadata::QaMode::Agent => { + if let Err(e) = super::super::lifecycle::move_story_to_qa(project_root, story_id) { + eprintln!("[startup:reconcile] Failed to move '{story_id}' to 3_qa/: {e}"); + let _ = progress_tx.send(ReconciliationEvent { + story_id: story_id.clone(), + status: "failed".to_string(), + message: format!("Failed to advance to QA: {e}"), + }); + } else { + eprintln!("[startup:reconcile] Moved '{story_id}' → 3_qa/."); + let _ = progress_tx.send(ReconciliationEvent { + story_id: story_id.clone(), + status: "advanced".to_string(), + message: "Gates passed — moved to QA.".to_string(), + }); + } + } + crate::io::story_metadata::QaMode::Human => { + if let Err(e) = super::super::lifecycle::move_story_to_qa(project_root, story_id) { + eprintln!("[startup:reconcile] Failed to move '{story_id}' 
to 3_qa/: {e}"); + let _ = progress_tx.send(ReconciliationEvent { + story_id: story_id.clone(), + status: "failed".to_string(), + message: format!("Failed to advance to QA: {e}"), + }); + } else { + let story_path = project_root + .join(".story_kit/work/3_qa") + .join(format!("{story_id}.md")); + if let Err(e) = crate::io::story_metadata::write_review_hold(&story_path) { + eprintln!( + "[startup:reconcile] Failed to set review_hold on '{story_id}': {e}" + ); + } + eprintln!("[startup:reconcile] Moved '{story_id}' → 3_qa/ (qa: human — holding for review)."); + let _ = progress_tx.send(ReconciliationEvent { + story_id: story_id.clone(), + status: "review_hold".to_string(), + message: "Gates passed — holding for human review.".to_string(), + }); + } + } + } + } else if stage_dir == "3_qa" { + // QA stage → run coverage gate before advancing to merge. + let wt_path_for_cov = wt_path.clone(); + let coverage_result = tokio::task::spawn_blocking(move || { + super::super::gates::run_coverage_gate(&wt_path_for_cov) + }) + .await; + + let (coverage_passed, coverage_output) = match coverage_result { + Ok(Ok(pair)) => pair, + Ok(Err(e)) => { + eprintln!("[startup:reconcile] Coverage gate error for '{story_id}': {e}"); + let _ = progress_tx.send(ReconciliationEvent { + story_id: story_id.clone(), + status: "failed".to_string(), + message: format!("Coverage gate error: {e}"), + }); + continue; + } + Err(e) => { + eprintln!( + "[startup:reconcile] Coverage gate panicked for '{story_id}': {e}" + ); + let _ = progress_tx.send(ReconciliationEvent { + story_id: story_id.clone(), + status: "failed".to_string(), + message: format!("Coverage gate panicked: {e}"), + }); + continue; + } + }; + + if coverage_passed { + // Check whether this item needs human review before merging. 
+ let needs_human_review = { + let item_type = super::super::lifecycle::item_type_from_id(story_id); + if item_type == "spike" { + true + } else { + let story_path = project_root + .join(".story_kit/work/3_qa") + .join(format!("{story_id}.md")); + let default_qa = crate::config::ProjectConfig::load(project_root) + .unwrap_or_default() + .default_qa_mode(); + matches!( + crate::io::story_metadata::resolve_qa_mode(&story_path, default_qa), + crate::io::story_metadata::QaMode::Human + ) + } + }; + + if needs_human_review { + let story_path = project_root + .join(".story_kit/work/3_qa") + .join(format!("{story_id}.md")); + if let Err(e) = crate::io::story_metadata::write_review_hold(&story_path) { + eprintln!( + "[startup:reconcile] Failed to set review_hold on '{story_id}': {e}" + ); + } + eprintln!( + "[startup:reconcile] '{story_id}' passed QA — holding for human review." + ); + let _ = progress_tx.send(ReconciliationEvent { + story_id: story_id.clone(), + status: "review_hold".to_string(), + message: "Passed QA — waiting for human review.".to_string(), + }); + } else if let Err(e) = + super::super::lifecycle::move_story_to_merge(project_root, story_id) + { + eprintln!( + "[startup:reconcile] Failed to move '{story_id}' to 4_merge/: {e}" + ); + let _ = progress_tx.send(ReconciliationEvent { + story_id: story_id.clone(), + status: "failed".to_string(), + message: format!("Failed to advance to merge: {e}"), + }); + } else { + eprintln!("[startup:reconcile] Moved '{story_id}' → 4_merge/."); + let _ = progress_tx.send(ReconciliationEvent { + story_id: story_id.clone(), + status: "advanced".to_string(), + message: "Gates passed — moved to merge.".to_string(), + }); + } + } else { + eprintln!( + "[startup:reconcile] Coverage gate failed for '{story_id}': {coverage_output}\n\ + Leaving in 3_qa/ for auto-assign to restart the QA agent." 
+ );
+ let _ = progress_tx.send(ReconciliationEvent {
+ story_id: story_id.clone(),
+ status: "failed".to_string(),
+ message: "Coverage gate failed; will be retried.".to_string(),
+ });
+ }
+ }
+ }
+
+ // Signal that reconciliation is complete.
+ let _ = progress_tx.send(ReconciliationEvent {
+ story_id: String::new(),
+ status: "done".to_string(),
+ message: "Startup reconciliation complete.".to_string(),
+ });
+ }
+
+ /// Run a single watchdog pass synchronously (test helper).
+ #[cfg(test)]
+ pub fn run_watchdog_once(&self) {
+ check_orphaned_agents(&self.agents);
+ }
+
+ /// Spawn a background watchdog task that periodically checks for Running agents
+ /// whose underlying task has already finished (orphaned entries). Any such agent
+ /// is marked Failed and an Error event is emitted so that `wait_for_agent` unblocks.
+ ///
+ /// The watchdog runs every 30 seconds. It is a safety net for edge cases where the
+ /// PTY read loop exits without updating the agent status (e.g. a panic in the
+ /// spawn_blocking task, or an external SIGKILL that closes the PTY fd immediately).
+ ///
+ /// When orphaned agents are detected and a `project_root` is provided, auto-assign
+ /// is triggered so that free agents can pick up unassigned work.
+ pub fn spawn_watchdog(pool: Arc<AgentPool>, project_root: Option<PathBuf>) {
+ tokio::spawn(async move {
+ let mut interval = tokio::time::interval(std::time::Duration::from_secs(30));
+ loop {
+ interval.tick().await;
+ let found = check_orphaned_agents(&pool.agents);
+ if found > 0
+ && let Some(ref root) = project_root
+ {
+ slog!("[watchdog] {found} orphaned agent(s) detected; triggering auto-assign.");
+ pool.auto_assign_available_work(root).await;
+ }
+ }
+ });
+ }
+}
+
+// ── Free helper functions ──────────────────────────────────────────────────
+
+/// Read the optional `agent:` field from the front matter of a story file. 
+///
+/// Returns `Some(agent_name)` if the front matter specifies an agent, or `None`
+/// if the field is absent or the file cannot be read / parsed.
+fn read_story_front_matter_agent(
+ project_root: &Path,
+ stage_dir: &str,
+ story_id: &str,
+) -> Option<String> {
+ use crate::io::story_metadata::parse_front_matter;
+ let path = project_root
+ .join(".story_kit")
+ .join("work")
+ .join(stage_dir)
+ .join(format!("{story_id}.md"));
+ let contents = std::fs::read_to_string(path).ok()?;
+ parse_front_matter(&contents).ok()?.agent
+}
+
+/// Return `true` if the story file in the given stage has `review_hold: true` in its front matter.
+fn has_review_hold(project_root: &Path, stage_dir: &str, story_id: &str) -> bool {
+ use crate::io::story_metadata::parse_front_matter;
+ let path = project_root
+ .join(".story_kit")
+ .join("work")
+ .join(stage_dir)
+ .join(format!("{story_id}.md"));
+ let contents = match std::fs::read_to_string(path) {
+ Ok(c) => c,
+ Err(_) => return false,
+ };
+ parse_front_matter(&contents)
+ .ok()
+ .and_then(|m| m.review_hold)
+ .unwrap_or(false)
+}
+
+/// Return `true` if the story file has `blocked: true` in its front matter.
+fn is_story_blocked(project_root: &Path, stage_dir: &str, story_id: &str) -> bool {
+ use crate::io::story_metadata::parse_front_matter;
+ let path = project_root
+ .join(".story_kit")
+ .join("work")
+ .join(stage_dir)
+ .join(format!("{story_id}.md"));
+ let contents = match std::fs::read_to_string(path) {
+ Ok(c) => c,
+ Err(_) => return false,
+ };
+ parse_front_matter(&contents)
+ .ok()
+ .and_then(|m| m.blocked)
+ .unwrap_or(false)
+}
+
+/// Return `true` if the story file has a `merge_failure` field in its front matter. 
+fn has_merge_failure(project_root: &Path, stage_dir: &str, story_id: &str) -> bool {
+ use crate::io::story_metadata::parse_front_matter;
+ let path = project_root
+ .join(".story_kit")
+ .join("work")
+ .join(stage_dir)
+ .join(format!("{story_id}.md"));
+ let contents = match std::fs::read_to_string(path) {
+ Ok(c) => c,
+ Err(_) => return false,
+ };
+ parse_front_matter(&contents)
+ .ok()
+ .and_then(|m| m.merge_failure)
+ .is_some()
+}
+
+/// Return `true` if `agent_name` has no active (pending/running) entry in the pool.
+pub(super) fn is_agent_free(agents: &HashMap<String, StoryAgent>, agent_name: &str) -> bool {
+ !agents.values().any(|a| {
+ a.agent_name == agent_name
+ && matches!(a.status, AgentStatus::Running | AgentStatus::Pending)
+ })
+}
+
+fn scan_stage_items(project_root: &Path, stage_dir: &str) -> Vec<String> {
+ let dir = project_root.join(".story_kit").join("work").join(stage_dir);
+ if !dir.is_dir() {
+ return Vec::new();
+ }
+ let mut items = Vec::new();
+ if let Ok(entries) = std::fs::read_dir(&dir) {
+ for entry in entries.flatten() {
+ let path = entry.path();
+ if path.extension().and_then(|e| e.to_str()) == Some("md")
+ && let Some(stem) = path.file_stem().and_then(|s| s.to_str())
+ {
+ items.push(stem.to_string());
+ }
+ }
+ }
+ items.sort();
+ items
+}
+
+/// Return `true` if `story_id` has any active (pending/running) agent matching `stage`.
+///
+/// Uses the explicit `stage` config field when the agent is found in `config`;
+/// falls back to the legacy name-based heuristic for unlisted agents. 
+fn is_story_assigned_for_stage(
+ config: &ProjectConfig,
+ agents: &HashMap<String, StoryAgent>,
+ story_id: &str,
+ stage: &PipelineStage,
+) -> bool {
+ agents.iter().any(|(key, agent)| {
+ // Composite key format: "{story_id}:{agent_name}"
+ let key_story_id = key.rsplit_once(':').map(|(sid, _)| sid).unwrap_or(key);
+ let agent_stage = config
+ .find_agent(&agent.agent_name)
+ .map(agent_config_stage)
+ .unwrap_or_else(|| pipeline_stage(&agent.agent_name));
+ key_story_id == story_id
+ && agent_stage == *stage
+ && matches!(agent.status, AgentStatus::Running | AgentStatus::Pending)
+ })
+}
+
+/// Count active (pending/running) agents for a given pipeline stage.
+fn count_active_agents_for_stage(
+ config: &ProjectConfig,
+ agents: &HashMap<String, StoryAgent>,
+ stage: &PipelineStage,
+) -> usize {
+ agents
+ .values()
+ .filter(|a| {
+ matches!(a.status, AgentStatus::Running | AgentStatus::Pending)
+ && config
+ .find_agent(&a.agent_name)
+ .map(|cfg| agent_config_stage(cfg) == *stage)
+ .unwrap_or_else(|| pipeline_stage(&a.agent_name) == *stage)
+ })
+ .count()
+}
+
+/// Find the first configured agent for `stage` that has no active (pending/running) assignment.
+/// Returns `None` if all agents for that stage are busy, none are configured,
+/// or the `max_coders` limit has been reached (for the Coder stage).
+///
+/// For the Coder stage, when `default_coder_model` is set, only considers agents whose
+/// model matches the default. This ensures opus-class agents are reserved for explicit
+/// front-matter requests.
+pub(super) fn find_free_agent_for_stage<'a>(
+ config: &'a ProjectConfig,
+ agents: &HashMap<String, StoryAgent>,
+ stage: &PipelineStage,
+) -> Option<&'a str> {
+ // Enforce max_coders limit for the Coder stage. 
+ if *stage == PipelineStage::Coder + && let Some(max) = config.max_coders + { + let active = count_active_agents_for_stage(config, agents, stage); + if active >= max { + return None; + } + } + + for agent_config in &config.agent { + if agent_config_stage(agent_config) != *stage { + continue; + } + // When default_coder_model is set, only auto-assign coder agents whose + // model matches. This keeps opus agents reserved for explicit requests. + if *stage == PipelineStage::Coder + && let Some(ref default_model) = config.default_coder_model + { + let agent_model = agent_config.model.as_deref().unwrap_or(""); + if agent_model != default_model { + continue; + } + } + let is_busy = agents.values().any(|a| { + a.agent_name == agent_config.name + && matches!(a.status, AgentStatus::Running | AgentStatus::Pending) + }); + if !is_busy { + return Some(&agent_config.name); + } + } + None +} + +/// Scan the agent pool for Running entries whose backing tokio task has already +/// finished and mark them as Failed. +/// +/// This handles the case where the PTY read loop or the spawned task exits +/// without updating the agent status — for example when the process is killed +/// externally and the PTY master fd returns EOF before our inactivity timeout +/// fires, but some other edge case prevents the normal cleanup path from running. +fn check_orphaned_agents(agents: &Mutex>) -> usize { + let mut lock = match agents.lock() { + Ok(l) => l, + Err(_) => return 0, + }; + + // Collect orphaned entries: Running or Pending agents whose task handle is finished. + // Pending agents can be orphaned if worktree creation panics before setting status. 
+ let orphaned: Vec<(String, String, broadcast::Sender, AgentStatus)> = lock + .iter() + .filter_map(|(key, agent)| { + if matches!(agent.status, AgentStatus::Running | AgentStatus::Pending) + && let Some(handle) = &agent.task_handle + && handle.is_finished() + { + let story_id = key + .rsplit_once(':') + .map(|(s, _)| s.to_string()) + .unwrap_or_else(|| key.clone()); + return Some(( + key.clone(), + story_id, + agent.tx.clone(), + agent.status.clone(), + )); + } + None + }) + .collect(); + + let count = orphaned.len(); + for (key, story_id, tx, prev_status) in orphaned { + if let Some(agent) = lock.get_mut(&key) { + agent.status = AgentStatus::Failed; + slog!( + "[watchdog] Orphaned agent '{key}': task finished but status was {prev_status}. \ + Marking Failed." + ); + let _ = tx.send(AgentEvent::Error { + story_id, + agent_name: agent.agent_name.clone(), + message: "Agent process terminated unexpectedly (watchdog detected orphan)" + .to_string(), + }); + } + } + count +} + +// ── Tests ────────────────────────────────────────────────────────────────── + +#[cfg(test)] +mod tests { + use super::*; + use crate::config::ProjectConfig; + use crate::io::watcher::WatcherEvent; + use std::process::Command; + + use super::super::{AgentPool, StoryAgent, composite_key}; + + fn make_config(toml_str: &str) -> ProjectConfig { + ProjectConfig::parse(toml_str).unwrap() + } + + fn init_git_repo(repo: &std::path::Path) { + Command::new("git") + .args(["init"]) + .current_dir(repo) + .output() + .unwrap(); + Command::new("git") + .args(["config", "user.email", "test@test.com"]) + .current_dir(repo) + .output() + .unwrap(); + Command::new("git") + .args(["config", "user.name", "Test"]) + .current_dir(repo) + .output() + .unwrap(); + // Create initial commit so master branch exists. 
+ std::fs::write(repo.join("README.md"), "# test\n").unwrap(); + Command::new("git") + .args(["add", "."]) + .current_dir(repo) + .output() + .unwrap(); + Command::new("git") + .args(["commit", "-m", "initial"]) + .current_dir(repo) + .output() + .unwrap(); + } + + fn make_test_story_agent(agent_name: &str, status: AgentStatus) -> StoryAgent { + StoryAgent { + agent_name: agent_name.to_string(), + status, + worktree_info: None, + session_id: None, + tx: broadcast::channel(1).0, + task_handle: None, + event_log: Arc::new(Mutex::new(Vec::new())), + completion: None, + project_root: None, + log_session_id: None, + merge_failure_reported: false, + } + } + + // ── auto-assign helper tests ─────────────────────────────────── + + #[test] + fn scan_stage_items_returns_empty_for_missing_dir() { + let tmp = tempfile::tempdir().unwrap(); + let items = scan_stage_items(tmp.path(), "2_current"); + assert!(items.is_empty()); + } + + #[test] + fn scan_stage_items_returns_sorted_story_ids() { + use std::fs; + let tmp = tempfile::tempdir().unwrap(); + let stage_dir = tmp.path().join(".story_kit").join("work").join("2_current"); + fs::create_dir_all(&stage_dir).unwrap(); + fs::write(stage_dir.join("42_story_foo.md"), "---\nname: foo\n---").unwrap(); + fs::write(stage_dir.join("10_story_bar.md"), "---\nname: bar\n---").unwrap(); + fs::write(stage_dir.join("5_story_baz.md"), "---\nname: baz\n---").unwrap(); + // non-md file should be ignored + fs::write(stage_dir.join("README.txt"), "ignore me").unwrap(); + + let items = scan_stage_items(tmp.path(), "2_current"); + assert_eq!(items, vec!["10_story_bar", "42_story_foo", "5_story_baz"]); + } + + #[test] + fn is_story_assigned_returns_true_for_running_coder() { + let config = ProjectConfig::default(); + let pool = AgentPool::new_test(3001); + pool.inject_test_agent("42_story_foo", "coder-1", AgentStatus::Running); + + let agents = pool.agents.lock().unwrap(); + assert!(is_story_assigned_for_stage( + &config, + &agents, + "42_story_foo", 
+ &PipelineStage::Coder + )); + // Same story but wrong stage — should be false + assert!(!is_story_assigned_for_stage( + &config, + &agents, + "42_story_foo", + &PipelineStage::Qa + )); + // Different story — should be false + assert!(!is_story_assigned_for_stage( + &config, + &agents, + "99_story_other", + &PipelineStage::Coder + )); + } + + #[test] + fn is_story_assigned_returns_false_for_completed_agent() { + let config = ProjectConfig::default(); + let pool = AgentPool::new_test(3001); + pool.inject_test_agent("42_story_foo", "coder-1", AgentStatus::Completed); + + let agents = pool.agents.lock().unwrap(); + // Completed agents don't count as assigned + assert!(!is_story_assigned_for_stage( + &config, + &agents, + "42_story_foo", + &PipelineStage::Coder + )); + } + + #[test] + fn is_story_assigned_uses_config_stage_field_for_nonstandard_names() { + let config = ProjectConfig::parse( + r#" +[[agent]] +name = "qa-2" +stage = "qa" +"#, + ) + .unwrap(); + + let pool = AgentPool::new_test(3001); + pool.inject_test_agent("42_story_foo", "qa-2", AgentStatus::Running); + + let agents = pool.agents.lock().unwrap(); + // qa-2 with stage=qa should be recognised as a QA agent + assert!( + is_story_assigned_for_stage(&config, &agents, "42_story_foo", &PipelineStage::Qa), + "qa-2 should be detected as assigned to QA stage" + ); + // Should NOT appear as a coder + assert!( + !is_story_assigned_for_stage(&config, &agents, "42_story_foo", &PipelineStage::Coder), + "qa-2 should not be detected as a coder" + ); + } + + #[test] + fn find_free_agent_returns_none_when_all_busy() { + let config = ProjectConfig::parse( + r#" +[[agent]] +name = "coder-1" +[[agent]] +name = "coder-2" +"#, + ) + .unwrap(); + + let pool = AgentPool::new_test(3001); + pool.inject_test_agent("s1", "coder-1", AgentStatus::Running); + pool.inject_test_agent("s2", "coder-2", AgentStatus::Running); + + let agents = pool.agents.lock().unwrap(); + let free = find_free_agent_for_stage(&config, &agents, 
&PipelineStage::Coder); + assert!(free.is_none(), "no free coders should be available"); + } + + #[test] + fn find_free_agent_returns_first_free_coder() { + let config = ProjectConfig::parse( + r#" +[[agent]] +name = "coder-1" +[[agent]] +name = "coder-2" +[[agent]] +name = "coder-3" +"#, + ) + .unwrap(); + + let pool = AgentPool::new_test(3001); + // coder-1 is busy, coder-2 is free + pool.inject_test_agent("s1", "coder-1", AgentStatus::Running); + + let agents = pool.agents.lock().unwrap(); + let free = find_free_agent_for_stage(&config, &agents, &PipelineStage::Coder); + assert_eq!( + free, + Some("coder-2"), + "coder-2 should be the first free coder" + ); + } + + #[test] + fn find_free_agent_ignores_completed_agents() { + let config = ProjectConfig::parse( + r#" +[[agent]] +name = "coder-1" +"#, + ) + .unwrap(); + + let pool = AgentPool::new_test(3001); + // coder-1 completed its previous story — it's free for a new one + pool.inject_test_agent("s1", "coder-1", AgentStatus::Completed); + + let agents = pool.agents.lock().unwrap(); + let free = find_free_agent_for_stage(&config, &agents, &PipelineStage::Coder); + assert_eq!(free, Some("coder-1"), "completed coder-1 should be free"); + } + + #[test] + fn find_free_agent_returns_none_for_wrong_stage() { + let config = ProjectConfig::parse( + r#" +[[agent]] +name = "qa" +"#, + ) + .unwrap(); + + let agents: HashMap = HashMap::new(); + // Looking for a Coder but only QA is configured + let free = find_free_agent_for_stage(&config, &agents, &PipelineStage::Coder); + assert!(free.is_none()); + // Looking for QA should find it + let free_qa = find_free_agent_for_stage(&config, &agents, &PipelineStage::Qa); + assert_eq!(free_qa, Some("qa")); + } + + #[test] + fn find_free_agent_uses_config_stage_field_not_name() { + // Agents named "qa-2" and "coder-opus" don't match the legacy name heuristic + // but should be picked up via their explicit stage field. 
+ let config = ProjectConfig::parse( + r#" +[[agent]] +name = "qa-2" +stage = "qa" + +[[agent]] +name = "coder-opus" +stage = "coder" +"#, + ) + .unwrap(); + + let agents: HashMap = HashMap::new(); + + // qa-2 should be found for PipelineStage::Qa via config stage field + let free_qa = find_free_agent_for_stage(&config, &agents, &PipelineStage::Qa); + assert_eq!(free_qa, Some("qa-2"), "qa-2 with stage=qa should be found"); + + // coder-opus should be found for PipelineStage::Coder via config stage field + let free_coder = find_free_agent_for_stage(&config, &agents, &PipelineStage::Coder); + assert_eq!( + free_coder, + Some("coder-opus"), + "coder-opus with stage=coder should be found" + ); + + // Neither should match the other stage + let free_merge = find_free_agent_for_stage(&config, &agents, &PipelineStage::Mergemaster); + assert!(free_merge.is_none()); + } + + // ── check_orphaned_agents return value tests (bug 161) ────────────────── + + #[tokio::test] + async fn check_orphaned_agents_returns_count_of_orphaned_agents() { + let pool = AgentPool::new_test(3001); + + // Spawn two tasks that finish immediately. + let h1 = tokio::spawn(async {}); + let h2 = tokio::spawn(async {}); + tokio::time::sleep(std::time::Duration::from_millis(20)).await; + assert!(h1.is_finished()); + assert!(h2.is_finished()); + + pool.inject_test_agent_with_handle("story_a", "coder", AgentStatus::Running, h1); + pool.inject_test_agent_with_handle("story_b", "coder", AgentStatus::Running, h2); + + let found = check_orphaned_agents(&pool.agents); + assert_eq!(found, 2, "should detect both orphaned agents"); + } + + #[test] + fn check_orphaned_agents_returns_zero_when_no_orphans() { + let pool = AgentPool::new_test(3001); + // Inject agents in terminal states — not orphaned. 
+ pool.inject_test_agent("story_a", "coder", AgentStatus::Completed); + pool.inject_test_agent("story_b", "qa", AgentStatus::Failed); + + let found = check_orphaned_agents(&pool.agents); + assert_eq!( + found, 0, + "no orphans should be detected for terminal agents" + ); + } + + #[tokio::test] + async fn watchdog_detects_orphaned_running_agent() { + let pool = AgentPool::new_test(3001); + + let handle = tokio::spawn(async {}); + tokio::time::sleep(std::time::Duration::from_millis(20)).await; + assert!( + handle.is_finished(), + "task should be finished before injection" + ); + + let tx = pool.inject_test_agent_with_handle( + "orphan_story", + "coder", + AgentStatus::Running, + handle, + ); + let mut rx = tx.subscribe(); + + pool.run_watchdog_once(); + + { + let agents = pool.agents.lock().unwrap(); + let key = composite_key("orphan_story", "coder"); + let agent = agents.get(&key).unwrap(); + assert_eq!( + agent.status, + AgentStatus::Failed, + "watchdog must mark an orphaned Running agent as Failed" + ); + } + + let event = rx.try_recv().expect("watchdog must emit an Error event"); + assert!( + matches!(event, AgentEvent::Error { .. }), + "expected AgentEvent::Error, got: {event:?}" + ); + } + + #[tokio::test] + async fn watchdog_orphan_detection_returns_nonzero_enabling_auto_assign() { + // This test verifies the contract that `check_orphaned_agents` returns + // a non-zero count when orphans exist, which the watchdog uses to + // decide whether to trigger auto-assign (bug 161). + let pool = AgentPool::new_test(3001); + + let handle = tokio::spawn(async {}); + tokio::time::sleep(std::time::Duration::from_millis(20)).await; + + pool.inject_test_agent_with_handle("orphan_story", "coder", AgentStatus::Running, handle); + + // Before watchdog: agent is Running. 
+ { + let agents = pool.agents.lock().unwrap(); + let key = composite_key("orphan_story", "coder"); + assert_eq!(agents.get(&key).unwrap().status, AgentStatus::Running); + } + + // Run watchdog pass — should return 1 (orphan found). + let found = check_orphaned_agents(&pool.agents); + assert_eq!( + found, 1, + "watchdog must return 1 for a single orphaned agent" + ); + + // After watchdog: agent is Failed. + { + let agents = pool.agents.lock().unwrap(); + let key = composite_key("orphan_story", "coder"); + assert_eq!( + agents.get(&key).unwrap().status, + AgentStatus::Failed, + "orphaned agent must be marked Failed" + ); + } + } + + // ── auto_assign_available_work tests ────────────────────────────────────── + + /// Story 203: auto_assign_available_work must detect a story in 2_current/ + /// with no active agent and start an agent for it. + #[tokio::test] + async fn auto_assign_picks_up_story_queued_in_current() { + let tmp = tempfile::tempdir().unwrap(); + let sk = tmp.path().join(".story_kit"); + let current = sk.join("work/2_current"); + std::fs::create_dir_all(¤t).unwrap(); + std::fs::write( + sk.join("project.toml"), + "[[agent]]\nname = \"coder-1\"\nstage = \"coder\"\n", + ) + .unwrap(); + // Place the story in 2_current/ (simulating the "queued" state). + std::fs::write(current.join("story-3.md"), "---\nname: Story 3\n---\n").unwrap(); + + let pool = AgentPool::new_test(3001); + // No agents are running — coder-1 is free. + + // auto_assign will try to call start_agent, which will attempt to create + // a worktree (will fail without a git repo) — that is fine. We only need + // to verify the agent is registered as Pending before the background + // task eventually fails. 
+        pool.auto_assign_available_work(tmp.path()).await;
+
+        let agents = pool.agents.lock().unwrap();
+        let has_pending = agents.values().any(|a| {
+            a.agent_name == "coder-1"
+                && matches!(a.status, AgentStatus::Pending | AgentStatus::Running)
+        });
+        assert!(
+            has_pending,
+            "auto_assign should have started coder-1 for story-3, but pool is empty"
+        );
+    }
+
+    /// Story 265: auto_assign_available_work must skip spikes in 3_qa/ that
+    /// have review_hold: true set in their front matter.
+    #[tokio::test]
+    async fn auto_assign_skips_spikes_with_review_hold() {
+        let tmp = tempfile::tempdir().unwrap();
+        let root = tmp.path();
+
+        // Create project.toml with a QA agent.
+        let sk = root.join(".story_kit");
+        std::fs::create_dir_all(&sk).unwrap();
+        std::fs::write(
+            sk.join("project.toml"),
+            // Fixed: previously wrote "[[agents]]" with unknown `role`/`prompt`
+            // keys, which does not match the `[[agent]]`/`stage` schema used by
+            // every other test in this module. With that string no QA agent was
+            // configured, so the `agents.is_empty()` assertion below passed
+            // vacuously instead of exercising the review_hold skip.
+            "[[agent]]\nname = \"qa\"\nstage = \"qa\"\nmodel = \"test\"\n",
+        )
+        .unwrap();
+
+        // Put a spike in 3_qa/ with review_hold: true.
+        let qa_dir = root.join(".story_kit/work/3_qa");
+        std::fs::create_dir_all(&qa_dir).unwrap();
+        std::fs::write(
+            qa_dir.join("20_spike_test.md"),
+            "---\nname: Test Spike\nreview_hold: true\n---\n# Spike\n",
+        )
+        .unwrap();
+
+        let (watcher_tx, _) = broadcast::channel::(4);
+        let pool = AgentPool::new(3001, watcher_tx);
+
+        pool.auto_assign_available_work(root).await;
+
+        // No agent should have been started for the spike: a free QA agent
+        // exists, so an empty pool proves the review_hold skip fired.
+        let agents = pool.agents.lock().unwrap();
+        assert!(
+            agents.is_empty(),
+            "No agents should be assigned to a spike with review_hold"
+        );
+    }
+
+    // ── Story 279: auto-assign respects agent stage from front matter ──────────
+
+    /// When a story in 3_qa/ has `agent: coder-1` in its front matter but
+    /// coder-1 is a coder-stage agent, auto-assign must NOT assign coder-1.
+    /// Instead it should fall back to a free QA-stage agent.
+ #[tokio::test] + async fn auto_assign_ignores_coder_preference_when_story_is_in_qa_stage() { + let tmp = tempfile::tempdir().unwrap(); + let sk = tmp.path().join(".story_kit"); + let qa_dir = sk.join("work/3_qa"); + std::fs::create_dir_all(&qa_dir).unwrap(); + std::fs::write( + sk.join("project.toml"), + "[[agent]]\nname = \"coder-1\"\nstage = \"coder\"\n\n\ + [[agent]]\nname = \"qa-1\"\nstage = \"qa\"\n", + ) + .unwrap(); + // Story in 3_qa/ with a preferred coder-stage agent. + std::fs::write( + qa_dir.join("story-qa1.md"), + "---\nname: QA Story\nagent: coder-1\n---\n", + ) + .unwrap(); + + let pool = AgentPool::new_test(3001); + + pool.auto_assign_available_work(tmp.path()).await; + + let agents = pool.agents.lock().unwrap(); + // coder-1 must NOT have been assigned (wrong stage for 3_qa/). + let coder_assigned = agents.values().any(|a| { + a.agent_name == "coder-1" + && matches!(a.status, AgentStatus::Pending | AgentStatus::Running) + }); + assert!( + !coder_assigned, + "coder-1 should not be assigned to a QA-stage story" + ); + // qa-1 should have been assigned instead. + let qa_assigned = agents.values().any(|a| { + a.agent_name == "qa-1" + && matches!(a.status, AgentStatus::Pending | AgentStatus::Running) + }); + assert!( + qa_assigned, + "qa-1 should be assigned as fallback for the QA-stage story" + ); + } + + /// When a story in 2_current/ has `agent: coder-1` in its front matter and + /// coder-1 is a coder-stage agent, auto-assign must respect the preference + /// and assign coder-1 (not fall back to some other coder). 
+ #[tokio::test] + async fn auto_assign_respects_coder_preference_when_story_is_in_current_stage() { + let tmp = tempfile::tempdir().unwrap(); + let sk = tmp.path().join(".story_kit"); + let current_dir = sk.join("work/2_current"); + std::fs::create_dir_all(¤t_dir).unwrap(); + std::fs::write( + sk.join("project.toml"), + "[[agent]]\nname = \"coder-1\"\nstage = \"coder\"\n\n\ + [[agent]]\nname = \"coder-2\"\nstage = \"coder\"\n", + ) + .unwrap(); + // Story in 2_current/ with a preferred coder-1 agent. + std::fs::write( + current_dir.join("story-pref.md"), + "---\nname: Coder Story\nagent: coder-1\n---\n", + ) + .unwrap(); + + let pool = AgentPool::new_test(3001); + + pool.auto_assign_available_work(tmp.path()).await; + + let agents = pool.agents.lock().unwrap(); + // coder-1 should have been picked (it matches the stage and is preferred). + let coder1_assigned = agents.values().any(|a| { + a.agent_name == "coder-1" + && matches!(a.status, AgentStatus::Pending | AgentStatus::Running) + }); + assert!( + coder1_assigned, + "coder-1 should be assigned when it matches the stage and is preferred" + ); + // coder-2 must NOT be assigned (not preferred). + let coder2_assigned = agents.values().any(|a| { + a.agent_name == "coder-2" + && matches!(a.status, AgentStatus::Pending | AgentStatus::Running) + }); + assert!( + !coder2_assigned, + "coder-2 should not be assigned when coder-1 is explicitly preferred" + ); + } + + /// When the preferred agent's stage mismatches and no other agent of the + /// correct stage is available, auto-assign must not start any agent for that + /// story (no panic, no error). + #[tokio::test] + async fn auto_assign_stage_mismatch_with_no_fallback_starts_no_agent() { + let tmp = tempfile::tempdir().unwrap(); + let sk = tmp.path().join(".story_kit"); + let qa_dir = sk.join("work/3_qa"); + std::fs::create_dir_all(&qa_dir).unwrap(); + // Only a coder agent is configured — no QA agent exists. 
+ std::fs::write( + sk.join("project.toml"), + "[[agent]]\nname = \"coder-1\"\nstage = \"coder\"\n", + ) + .unwrap(); + // Story in 3_qa/ requests coder-1 (wrong stage) and no QA agent exists. + std::fs::write( + qa_dir.join("story-noqa.md"), + "---\nname: QA Story No Agent\nagent: coder-1\n---\n", + ) + .unwrap(); + + let pool = AgentPool::new_test(3001); + + // Must not panic. + pool.auto_assign_available_work(tmp.path()).await; + + let agents = pool.agents.lock().unwrap(); + assert!( + agents.is_empty(), + "No agent should be started when no stage-appropriate agent is available" + ); + } + + /// Two concurrent auto_assign_available_work calls must not assign the same + /// agent to two stories simultaneously. After both complete, at most one + /// Pending/Running entry must exist per agent name. + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + async fn toctou_concurrent_auto_assign_no_duplicate_agent_assignments() { + use std::fs; + use std::sync::Arc; + + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path().to_path_buf(); + + let sk_dir = root.join(".story_kit"); + // Two stories waiting in 2_current, one coder agent. + fs::create_dir_all(sk_dir.join("work/2_current")).unwrap(); + fs::write( + sk_dir.join("project.toml"), + "[[agent]]\nname = \"coder-1\"\n", + ) + .unwrap(); + fs::write( + sk_dir.join("work/2_current/86_story_foo.md"), + "---\nname: Foo\n---\n", + ) + .unwrap(); + fs::write( + sk_dir.join("work/2_current/130_story_bar.md"), + "---\nname: Bar\n---\n", + ) + .unwrap(); + + let pool = Arc::new(AgentPool::new_test(3099)); + + // Run two concurrent auto_assign calls. 
+ let pool1 = pool.clone(); + let root1 = root.clone(); + let t1 = tokio::spawn(async move { pool1.auto_assign_available_work(&root1).await }); + + let pool2 = pool.clone(); + let root2 = root.clone(); + let t2 = tokio::spawn(async move { pool2.auto_assign_available_work(&root2).await }); + + let _ = tokio::join!(t1, t2); + + // At most one Pending/Running entry should exist for coder-1. + let agents = pool.agents.lock().unwrap(); + let active_coder_count = agents + .values() + .filter(|a| { + a.agent_name == "coder-1" + && matches!(a.status, AgentStatus::Pending | AgentStatus::Running) + }) + .count(); + + assert!( + active_coder_count <= 1, + "coder-1 must not be assigned to more than one story simultaneously; \ + found {active_coder_count} active entries" + ); + } + + // ── has_review_hold tests ──────────────────────────────────────────────── + + #[test] + fn has_review_hold_returns_true_when_set() { + let tmp = tempfile::tempdir().unwrap(); + let qa_dir = tmp.path().join(".story_kit/work/3_qa"); + std::fs::create_dir_all(&qa_dir).unwrap(); + let spike_path = qa_dir.join("10_spike_research.md"); + std::fs::write( + &spike_path, + "---\nname: Research spike\nreview_hold: true\n---\n# Spike\n", + ) + .unwrap(); + assert!(has_review_hold(tmp.path(), "3_qa", "10_spike_research")); + } + + #[test] + fn has_review_hold_returns_false_when_not_set() { + let tmp = tempfile::tempdir().unwrap(); + let qa_dir = tmp.path().join(".story_kit/work/3_qa"); + std::fs::create_dir_all(&qa_dir).unwrap(); + let spike_path = qa_dir.join("10_spike_research.md"); + std::fs::write(&spike_path, "---\nname: Research spike\n---\n# Spike\n").unwrap(); + assert!(!has_review_hold(tmp.path(), "3_qa", "10_spike_research")); + } + + #[test] + fn has_review_hold_returns_false_when_file_missing() { + let tmp = tempfile::tempdir().unwrap(); + assert!(!has_review_hold(tmp.path(), "3_qa", "99_spike_missing")); + } + + // ── find_free_agent_for_stage: default_coder_model filtering ───────── + + 
#[test] + fn find_free_agent_skips_opus_when_default_coder_model_set() { + let config = make_config( + r#" +default_coder_model = "sonnet" + +[[agent]] +name = "coder-1" +stage = "coder" +model = "sonnet" + +[[agent]] +name = "coder-opus" +stage = "coder" +model = "opus" +"#, + ); + + let agents = HashMap::new(); + let free = find_free_agent_for_stage(&config, &agents, &PipelineStage::Coder); + assert_eq!(free, Some("coder-1")); + } + + #[test] + fn find_free_agent_returns_opus_when_no_default_coder_model() { + let config = make_config( + r#" +[[agent]] +name = "coder-opus" +stage = "coder" +model = "opus" +"#, + ); + + let agents = HashMap::new(); + let free = find_free_agent_for_stage(&config, &agents, &PipelineStage::Coder); + assert_eq!(free, Some("coder-opus")); + } + + #[test] + fn find_free_agent_returns_none_when_all_sonnet_coders_busy() { + let config = make_config( + r#" +default_coder_model = "sonnet" + +[[agent]] +name = "coder-1" +stage = "coder" +model = "sonnet" + +[[agent]] +name = "coder-opus" +stage = "coder" +model = "opus" +"#, + ); + + let mut agents = HashMap::new(); + agents.insert( + "story1:coder-1".to_string(), + make_test_story_agent("coder-1", AgentStatus::Running), + ); + + let free = find_free_agent_for_stage(&config, &agents, &PipelineStage::Coder); + assert_eq!(free, None, "opus agent should not be auto-assigned"); + } + + // ── find_free_agent_for_stage: max_coders limit ───────────────────── + + #[test] + fn find_free_agent_respects_max_coders() { + let config = make_config( + r#" +max_coders = 1 + +[[agent]] +name = "coder-1" +stage = "coder" +model = "sonnet" + +[[agent]] +name = "coder-2" +stage = "coder" +model = "sonnet" +"#, + ); + + let mut agents = HashMap::new(); + agents.insert( + "story1:coder-1".to_string(), + make_test_story_agent("coder-1", AgentStatus::Running), + ); + + let free = find_free_agent_for_stage(&config, &agents, &PipelineStage::Coder); + assert_eq!(free, None, "max_coders=1 should block second coder"); + 
} + + #[test] + fn find_free_agent_allows_within_max_coders() { + let config = make_config( + r#" +max_coders = 2 + +[[agent]] +name = "coder-1" +stage = "coder" +model = "sonnet" + +[[agent]] +name = "coder-2" +stage = "coder" +model = "sonnet" +"#, + ); + + let mut agents = HashMap::new(); + agents.insert( + "story1:coder-1".to_string(), + make_test_story_agent("coder-1", AgentStatus::Running), + ); + + let free = find_free_agent_for_stage(&config, &agents, &PipelineStage::Coder); + assert_eq!(free, Some("coder-2")); + } + + #[test] + fn max_coders_does_not_affect_qa_stage() { + let config = make_config( + r#" +max_coders = 1 + +[[agent]] +name = "qa" +stage = "qa" +model = "sonnet" +"#, + ); + + let agents = HashMap::new(); + let free = find_free_agent_for_stage(&config, &agents, &PipelineStage::Qa); + assert_eq!(free, Some("qa")); + } + + // ── count_active_agents_for_stage ──────────────────────────────────── + + #[test] + fn count_active_agents_counts_running_and_pending() { + let config = make_config( + r#" +[[agent]] +name = "coder-1" +stage = "coder" + +[[agent]] +name = "coder-2" +stage = "coder" +"#, + ); + + let mut agents = HashMap::new(); + agents.insert( + "s1:coder-1".to_string(), + make_test_story_agent("coder-1", AgentStatus::Running), + ); + agents.insert( + "s2:coder-2".to_string(), + make_test_story_agent("coder-2", AgentStatus::Completed), + ); + + let count = count_active_agents_for_stage(&config, &agents, &PipelineStage::Coder); + assert_eq!(count, 1, "Only Running coder should be counted, not Completed"); + } + + // ── reconcile_on_startup tests ──────────────────────────────────────────── + + #[tokio::test] + async fn reconcile_on_startup_noop_when_no_worktrees() { + let tmp = tempfile::tempdir().unwrap(); + let pool = AgentPool::new_test(3001); + let (tx, _rx) = broadcast::channel(16); + // Should not panic; no worktrees to reconcile. 
+ pool.reconcile_on_startup(tmp.path(), &tx).await; + } + + #[tokio::test] + async fn reconcile_on_startup_emits_done_event() { + let tmp = tempfile::tempdir().unwrap(); + let pool = AgentPool::new_test(3001); + let (tx, mut rx) = broadcast::channel::(16); + pool.reconcile_on_startup(tmp.path(), &tx).await; + + // Collect all events; the last must be "done". + let mut events: Vec = Vec::new(); + while let Ok(evt) = rx.try_recv() { + events.push(evt); + } + assert!( + events.iter().any(|e| e.status == "done"), + "reconcile_on_startup must emit a 'done' event; got: {:?}", + events.iter().map(|e| &e.status).collect::>() + ); + } + + #[tokio::test] + async fn reconcile_on_startup_skips_story_without_committed_work() { + use std::fs; + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + + // Set up story in 2_current/. + let current = root.join(".story_kit/work/2_current"); + fs::create_dir_all(¤t).unwrap(); + fs::write(current.join("60_story_test.md"), "test").unwrap(); + + // Create a worktree directory that is a fresh git repo with no commits + // ahead of its own base branch (simulates a worktree where no work was done). + let wt_dir = root.join(".story_kit/worktrees/60_story_test"); + fs::create_dir_all(&wt_dir).unwrap(); + init_git_repo(&wt_dir); + + let pool = AgentPool::new_test(3001); + let (tx, _rx) = broadcast::channel(16); + pool.reconcile_on_startup(root, &tx).await; + + // Story should still be in 2_current/ — nothing was reconciled. + assert!( + current.join("60_story_test.md").exists(), + "story should stay in 2_current/ when worktree has no committed work" + ); + } + + #[tokio::test] + async fn reconcile_on_startup_runs_gates_on_worktree_with_committed_work() { + use std::fs; + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + + // Set up a git repo for the project root. + init_git_repo(root); + + // Set up story in 2_current/ and commit it so the project root is clean. 
+ let current = root.join(".story_kit/work/2_current"); + fs::create_dir_all(¤t).unwrap(); + fs::write(current.join("61_story_test.md"), "test").unwrap(); + Command::new("git") + .args(["add", "."]) + .current_dir(root) + .output() + .unwrap(); + Command::new("git") + .args([ + "-c", + "user.email=test@test.com", + "-c", + "user.name=Test", + "commit", + "-m", + "add story", + ]) + .current_dir(root) + .output() + .unwrap(); + + // Create a real git worktree for the story. + let wt_dir = root.join(".story_kit/worktrees/61_story_test"); + fs::create_dir_all(wt_dir.parent().unwrap()).unwrap(); + Command::new("git") + .args([ + "worktree", + "add", + &wt_dir.to_string_lossy(), + "-b", + "feature/story-61_story_test", + ]) + .current_dir(root) + .output() + .unwrap(); + + // Add a commit to the feature branch (simulates coder completing work). + fs::write(wt_dir.join("implementation.txt"), "done").unwrap(); + Command::new("git") + .args(["add", "."]) + .current_dir(&wt_dir) + .output() + .unwrap(); + Command::new("git") + .args([ + "-c", + "user.email=test@test.com", + "-c", + "user.name=Test", + "commit", + "-m", + "implement story", + ]) + .current_dir(&wt_dir) + .output() + .unwrap(); + + assert!( + crate::agents::gates::worktree_has_committed_work(&wt_dir), + "test setup: worktree should have committed work" + ); + + let pool = AgentPool::new_test(3001); + let (tx, _rx) = broadcast::channel(16); + pool.reconcile_on_startup(root, &tx).await; + + // In the test env, cargo clippy will fail (no Cargo.toml) so gates fail + // and the story stays in 2_current/. The important assertion is that + // reconcile ran without panicking and the story is in a consistent state. 
+ let in_current = current.join("61_story_test.md").exists(); + let in_qa = root.join(".story_kit/work/3_qa/61_story_test.md").exists(); + assert!( + in_current || in_qa, + "story should be in 2_current/ or 3_qa/ after reconciliation" + ); + } +} diff --git a/server/src/agents/pool/mod.rs b/server/src/agents/pool/mod.rs new file mode 100644 index 0000000..dce39cd --- /dev/null +++ b/server/src/agents/pool/mod.rs @@ -0,0 +1,2187 @@ +mod auto_assign; +mod pipeline; + +use crate::agent_log::AgentLogWriter; +use crate::config::ProjectConfig; +use crate::io::watcher::WatcherEvent; +use crate::slog; +use crate::slog_error; +use crate::worktree::{self, WorktreeInfo}; +use portable_pty::ChildKiller; +use std::collections::HashMap; +use std::path::{Path, PathBuf}; +use std::sync::{Arc, Mutex}; +use tokio::sync::broadcast; + +use super::{ + AgentEvent, AgentInfo, AgentStatus, CompletionReport, PipelineStage, agent_config_stage, + pipeline_stage, +}; + +/// Build the composite key used to track agents in the pool. +fn composite_key(story_id: &str, agent_name: &str) -> String { + format!("{story_id}:{agent_name}") +} + +/// RAII guard that removes a pending agent entry from the pool on drop. +/// +/// Created after inserting a `Pending` entry into the agent HashMap. +/// If `start_agent` succeeds (the agent process is spawned and status +/// transitions to `Running`), call [`disarm`](Self::disarm) to prevent +/// cleanup. If any intermediate step fails and the guard is dropped +/// without being disarmed, the pending entry is removed so it cannot +/// block future auto-assign dispatches. +struct PendingGuard { + agents: Arc>>, + key: String, + armed: bool, +} + +impl PendingGuard { + fn new(agents: Arc>>, key: String) -> Self { + Self { + agents, + key, + armed: true, + } + } + + /// Prevent the guard from cleaning up the entry (call after + /// successful spawn). 
+    fn disarm(&mut self) {
+        self.armed = false;
+    }
+}
+
+impl Drop for PendingGuard {
+    fn drop(&mut self) {
+        if self.armed
+            && let Ok(mut agents) = self.agents.lock()
+            && agents
+                .get(&self.key)
+                .is_some_and(|a| a.status == AgentStatus::Pending)
+        {
+            agents.remove(&self.key);
+            slog!(
+                "[agents] Cleaned up leaked Pending entry for '{}'",
+                self.key
+            );
+        }
+    }
+}
+
+struct StoryAgent {
+    agent_name: String,
+    status: AgentStatus,
+    worktree_info: Option<WorktreeInfo>,
+    session_id: Option<String>,
+    tx: broadcast::Sender<AgentEvent>,
+    task_handle: Option<tokio::task::JoinHandle<()>>,
+    /// Accumulated events for polling via get_agent_output.
+    event_log: Arc<Mutex<Vec<AgentEvent>>>,
+    /// Set when the agent calls report_completion.
+    completion: Option<CompletionReport>,
+    /// Project root, stored for pipeline advancement after completion.
+    project_root: Option<PathBuf>,
+    /// UUID identifying the log file for this session.
+    log_session_id: Option<String>,
+    /// Set to `true` when the agent calls `report_merge_failure`.
+    /// Prevents the pipeline from blindly advancing to `5_done/` after a
+    /// failed merge: the server-owned gate check runs in the feature-branch
+    /// worktree (which compiles fine) and returns `gates_passed=true` even
+    /// though the code was never squash-merged onto master.
+    merge_failure_reported: bool,
+}
+
+/// Build an `AgentInfo` snapshot from a `StoryAgent` map entry.
+fn agent_info_from_entry(story_id: &str, agent: &StoryAgent) -> AgentInfo {
+    AgentInfo {
+        story_id: story_id.to_string(),
+        agent_name: agent.agent_name.clone(),
+        status: agent.status.clone(),
+        session_id: agent.session_id.clone(),
+        worktree_path: agent
+            .worktree_info
+            .as_ref()
+            .map(|wt| wt.path.to_string_lossy().to_string()),
+        base_branch: agent
+            .worktree_info
+            .as_ref()
+            .map(|wt| wt.base_branch.clone()),
+        completion: agent.completion.clone(),
+        log_session_id: agent.log_session_id.clone(),
+    }
+}
+
+/// Manages concurrent story agents, each in its own worktree.
+pub struct AgentPool {
+    agents: Arc<Mutex<HashMap<String, StoryAgent>>>,
+    port: u16,
+    /// Registry of active PTY child process killers, keyed by "{story_id}:{agent_name}".
+    /// Used to terminate child processes on server shutdown or agent stop, preventing
+    /// orphaned Claude Code processes from running after the server exits.
+    child_killers: Arc<Mutex<HashMap<String, Box<dyn ChildKiller + Send + Sync>>>>,
+    /// Broadcast channel for notifying WebSocket clients of agent state changes.
+    /// When an agent transitions state (Pending, Running, Completed, Failed, Stopped),
+    /// an `AgentStateChanged` event is emitted so the frontend can refresh the
+    /// pipeline board without waiting for a filesystem event.
+    watcher_tx: broadcast::Sender<WatcherEvent>,
+    /// Tracks background merge jobs started by `merge_agent_work`, keyed by story_id.
+    /// The MCP tool returns immediately and the mergemaster agent polls
+    /// `get_merge_status` until the job reaches a terminal state.
+    merge_jobs: Arc<Mutex<HashMap<String, pipeline::MergeJob>>>,
+}
+
+impl AgentPool {
+    pub fn new(port: u16, watcher_tx: broadcast::Sender<WatcherEvent>) -> Self {
+        Self {
+            agents: Arc::new(Mutex::new(HashMap::new())),
+            port,
+            child_killers: Arc::new(Mutex::new(HashMap::new())),
+            watcher_tx,
+            merge_jobs: Arc::new(Mutex::new(HashMap::new())),
+        }
+    }
+
+    /// Create a pool with a dummy watcher channel for unit tests.
+    #[cfg(test)]
+    pub fn new_test(port: u16) -> Self {
+        let (watcher_tx, _) = broadcast::channel(16);
+        Self::new(port, watcher_tx)
+    }
+
+    /// Notify WebSocket clients that agent state has changed, so the pipeline
+    /// board and agent panel can refresh.
+    fn notify_agent_state_changed(watcher_tx: &broadcast::Sender<WatcherEvent>) {
+        let _ = watcher_tx.send(WatcherEvent::AgentStateChanged);
+    }
+
+    /// Kill all active PTY child processes.
+    ///
+    /// Called on server shutdown to prevent orphaned Claude Code processes from
+    /// continuing to run after the server exits. Each registered killer is called
+    /// once, then the registry is cleared.
+ pub fn kill_all_children(&self) { + if let Ok(mut killers) = self.child_killers.lock() { + for (key, killer) in killers.iter_mut() { + slog!("[agents] Killing child process for {key} on shutdown"); + let _ = killer.kill(); + } + killers.clear(); + } + } + + /// Kill and deregister the child process for a specific agent key. + /// + /// Used by `stop_agent` to ensure the PTY child is terminated even though + /// aborting a `spawn_blocking` task handle does not interrupt the blocking thread. + fn kill_child_for_key(&self, key: &str) { + if let Ok(mut killers) = self.child_killers.lock() + && let Some(mut killer) = killers.remove(key) + { + slog!("[agents] Killing child process for {key} on stop"); + let _ = killer.kill(); + } + } + + /// Start an agent for a story: load config, create worktree, spawn agent. + /// + /// When `agent_name` is `None`, automatically selects the first idle coder + /// agent (story 190). If all coders are busy the call fails with an error + /// indicating the story will be picked up when one becomes available. + /// + /// If `resume_context` is provided, it is appended to the rendered prompt + /// so the agent can pick up from a previous failed attempt. + pub async fn start_agent( + &self, + project_root: &Path, + story_id: &str, + agent_name: Option<&str>, + resume_context: Option<&str>, + ) -> Result { + let config = ProjectConfig::load(project_root)?; + + // Validate explicit agent name early (no lock needed). + if let Some(name) = agent_name { + config + .find_agent(name) + .ok_or_else(|| format!("No agent named '{name}' in config"))?; + } + + // Create name-independent shared resources before the lock so they are + // ready for the atomic check-and-insert (story 132). 
+ let (tx, _) = broadcast::channel::(1024); + let event_log: Arc>> = Arc::new(Mutex::new(Vec::new())); + let log_session_id = uuid::Uuid::new_v4().to_string(); + + // Move story from backlog/ to current/ before checking agent + // availability so that auto_assign_available_work can pick it up even + // when all coders are currently busy (story 203). This is idempotent: + // if the story is already in 2_current/ or a later stage, the call is + // a no-op. + super::lifecycle::move_story_to_current(project_root, story_id)?; + + // Validate that the agent's configured stage matches the story's + // pipeline stage. This prevents any caller (auto-assign, MCP tool, + // pipeline advance, supervisor) from starting a wrong-stage agent on + // a story — e.g. mergemaster on a coding-stage story (bug 312). + if let Some(name) = agent_name { + let agent_stage = config + .find_agent(name) + .map(agent_config_stage) + .unwrap_or_else(|| pipeline_stage(name)); + if agent_stage != PipelineStage::Other + && let Some(story_stage_dir) = find_active_story_stage(project_root, story_id) + { + let expected_stage = match story_stage_dir { + "2_current" => PipelineStage::Coder, + "3_qa" => PipelineStage::Qa, + "4_merge" => PipelineStage::Mergemaster, + _ => PipelineStage::Other, + }; + if expected_stage != PipelineStage::Other && expected_stage != agent_stage { + return Err(format!( + "Agent '{name}' (stage: {agent_stage:?}) cannot be assigned to \ + story '{story_id}' in {story_stage_dir}/ (requires stage: {expected_stage:?})" + )); + } + } + } + + // Atomically resolve agent name, check availability, and register as + // Pending. When `agent_name` is `None` the first idle coder is + // selected inside the lock so no TOCTOU race can occur between the + // availability check and the Pending insert (story 132, story 190). + // + // The `PendingGuard` ensures that if any step below fails the entry is + // removed from the pool so it does not permanently block auto-assign + // (bug 118). 
+ let resolved_name: String; + let key: String; + { + let mut agents = self.agents.lock().map_err(|e| e.to_string())?; + + resolved_name = match agent_name { + Some(name) => name.to_string(), + None => auto_assign::find_free_agent_for_stage(&config, &agents, &PipelineStage::Coder) + .map(|s| s.to_string()) + .ok_or_else(|| { + if config + .agent + .iter() + .any(|a| agent_config_stage(a) == PipelineStage::Coder) + { + format!( + "All coder agents are busy; story '{story_id}' has been \ + queued in work/2_current/ and will be auto-assigned when \ + one becomes available" + ) + } else { + "No coder agent configured. Specify an agent_name explicitly." + .to_string() + } + })?, + }; + + key = composite_key(story_id, &resolved_name); + + // Check for duplicate assignment (same story + same agent already active). + if let Some(agent) = agents.get(&key) + && (agent.status == AgentStatus::Running || agent.status == AgentStatus::Pending) + { + return Err(format!( + "Agent '{resolved_name}' for story '{story_id}' is already {}", + agent.status + )); + } + // Enforce single-stage concurrency: reject if there is already a + // Running/Pending agent at the same pipeline stage for this story. + // This prevents two coders (or two QA/mergemaster agents) from + // corrupting each other's work in the same worktree. + // Applies to both explicit and auto-selected agents; the Other + // stage (supervisors, unknown agents) is exempt. 
+ let resolved_stage = config + .find_agent(&resolved_name) + .map(agent_config_stage) + .unwrap_or_else(|| pipeline_stage(&resolved_name)); + if resolved_stage != PipelineStage::Other + && let Some(conflicting_name) = agents.iter().find_map(|(k, a)| { + let k_story = k.rsplit_once(':').map(|(s, _)| s).unwrap_or(k); + if k_story == story_id + && a.agent_name != resolved_name + && matches!(a.status, AgentStatus::Running | AgentStatus::Pending) + { + let a_stage = config + .find_agent(&a.agent_name) + .map(agent_config_stage) + .unwrap_or_else(|| pipeline_stage(&a.agent_name)); + if a_stage == resolved_stage { + Some(a.agent_name.clone()) + } else { + None + } + } else { + None + } + }) + { + return Err(format!( + "Cannot start '{resolved_name}' on story '{story_id}': \ + '{conflicting_name}' is already active at the same pipeline stage" + )); + } + // Enforce single-instance concurrency for explicitly-named agents: + // if this agent is already running on any other story, reject. + // Auto-selected agents are already guaranteed idle by + // find_free_agent_for_stage, so this check is only needed for + // explicit requests. 
+ if agent_name.is_some() + && let Some(busy_story) = agents.iter().find_map(|(k, a)| { + if a.agent_name == resolved_name + && matches!(a.status, AgentStatus::Running | AgentStatus::Pending) + { + Some( + k.rsplit_once(':') + .map(|(sid, _)| sid) + .unwrap_or(k) + .to_string(), + ) + } else { + None + } + }) + { + return Err(format!( + "Agent '{resolved_name}' is already running on story '{busy_story}'; \ + story '{story_id}' will be picked up when the agent becomes available" + )); + } + agents.insert( + key.clone(), + StoryAgent { + agent_name: resolved_name.clone(), + status: AgentStatus::Pending, + worktree_info: None, + session_id: None, + tx: tx.clone(), + task_handle: None, + event_log: event_log.clone(), + completion: None, + project_root: Some(project_root.to_path_buf()), + log_session_id: Some(log_session_id.clone()), + merge_failure_reported: false, + }, + ); + } + let mut pending_guard = PendingGuard::new(self.agents.clone(), key.clone()); + + // Create persistent log writer (needs resolved_name, so must be after + // the atomic resolution above). + let log_writer = + match AgentLogWriter::new(project_root, story_id, &resolved_name, &log_session_id) { + Ok(w) => Some(Arc::new(Mutex::new(w))), + Err(e) => { + eprintln!( + "[agents] Failed to create log writer for {story_id}:{resolved_name}: {e}" + ); + None + } + }; + + // Notify WebSocket clients that a new agent is pending. + Self::notify_agent_state_changed(&self.watcher_tx); + + let _ = tx.send(AgentEvent::Status { + story_id: story_id.to_string(), + agent_name: resolved_name.clone(), + status: "pending".to_string(), + }); + + // Extract inactivity timeout from the agent config before cloning config. + let inactivity_timeout_secs = config + .find_agent(&resolved_name) + .map(|a| a.inactivity_timeout_secs) + .unwrap_or(300); + + // Clone all values needed inside the background spawn. 
+ let project_root_clone = project_root.to_path_buf(); + let config_clone = config.clone(); + let resume_context_owned = resume_context.map(str::to_string); + let sid = story_id.to_string(); + let aname = resolved_name.clone(); + let tx_clone = tx.clone(); + let agents_ref = self.agents.clone(); + let key_clone = key.clone(); + let log_clone = event_log.clone(); + let port_for_task = self.port; + let log_writer_clone = log_writer.clone(); + let child_killers_clone = self.child_killers.clone(); + let watcher_tx_clone = self.watcher_tx.clone(); + + // Spawn the background task. Worktree creation and agent launch happen here + // so `start_agent` returns immediately after registering the agent as + // Pending — non-blocking by design (story 157). + let handle = tokio::spawn(async move { + // Step 1: create the worktree (slow — git checkout, pnpm install, etc.) + let wt_info = match worktree::create_worktree( + &project_root_clone, + &sid, + &config_clone, + port_for_task, + ) + .await + { + Ok(wt) => wt, + Err(e) => { + let error_msg = format!("Failed to create worktree: {e}"); + slog_error!("[agents] {error_msg}"); + let event = AgentEvent::Error { + story_id: sid.clone(), + agent_name: aname.clone(), + message: error_msg, + }; + if let Ok(mut log) = log_clone.lock() { + log.push(event.clone()); + } + let _ = tx_clone.send(event); + if let Ok(mut agents) = agents_ref.lock() + && let Some(agent) = agents.get_mut(&key_clone) + { + agent.status = AgentStatus::Failed; + } + Self::notify_agent_state_changed(&watcher_tx_clone); + return; + } + }; + + // Step 2: store worktree info and render agent command/args/prompt. 
+ let wt_path_str = wt_info.path.to_string_lossy().to_string(); + { + if let Ok(mut agents) = agents_ref.lock() + && let Some(agent) = agents.get_mut(&key_clone) + { + agent.worktree_info = Some(wt_info.clone()); + } + } + + let (command, args, mut prompt) = match config_clone.render_agent_args( + &wt_path_str, + &sid, + Some(&aname), + Some(&wt_info.base_branch), + ) { + Ok(result) => result, + Err(e) => { + let error_msg = format!("Failed to render agent args: {e}"); + slog_error!("[agents] {error_msg}"); + let event = AgentEvent::Error { + story_id: sid.clone(), + agent_name: aname.clone(), + message: error_msg, + }; + if let Ok(mut log) = log_clone.lock() { + log.push(event.clone()); + } + let _ = tx_clone.send(event); + if let Ok(mut agents) = agents_ref.lock() + && let Some(agent) = agents.get_mut(&key_clone) + { + agent.status = AgentStatus::Failed; + } + Self::notify_agent_state_changed(&watcher_tx_clone); + return; + } + }; + + // Append resume context if this is a restart with failure information. + if let Some(ctx) = resume_context_owned { + prompt.push_str(&ctx); + } + + // Step 3: transition to Running now that the worktree is ready. + { + if let Ok(mut agents) = agents_ref.lock() + && let Some(agent) = agents.get_mut(&key_clone) + { + agent.status = AgentStatus::Running; + } + } + let _ = tx_clone.send(AgentEvent::Status { + story_id: sid.clone(), + agent_name: aname.clone(), + status: "running".to_string(), + }); + Self::notify_agent_state_changed(&watcher_tx_clone); + + // Step 4: launch the agent process. + match super::pty::run_agent_pty_streaming( + &sid, + &aname, + &command, + &args, + &prompt, + &wt_path_str, + &tx_clone, + &log_clone, + log_writer_clone, + inactivity_timeout_secs, + child_killers_clone, + ) + .await + { + Ok(pty_result) => { + // Persist token usage if the agent reported it. 
+ if let Some(ref usage) = pty_result.token_usage + && let Ok(agents) = agents_ref.lock() + && let Some(agent) = agents.get(&key_clone) + && let Some(ref pr) = agent.project_root + { + let model = config_clone + .find_agent(&aname) + .and_then(|a| a.model.clone()); + let record = super::token_usage::build_record( + &sid, &aname, model, usage.clone(), + ); + if let Err(e) = super::token_usage::append_record(pr, &record) { + slog_error!( + "[agents] Failed to persist token usage for \ + {sid}:{aname}: {e}" + ); + } + } + + // Server-owned completion: run acceptance gates automatically + // when the agent process exits normally. + pipeline::run_server_owned_completion( + &agents_ref, + port_for_task, + &sid, + &aname, + pty_result.session_id, + watcher_tx_clone.clone(), + ) + .await; + Self::notify_agent_state_changed(&watcher_tx_clone); + } + Err(e) => { + slog_error!("[agents] Agent process error for {aname} on {sid}: {e}"); + let event = AgentEvent::Error { + story_id: sid.clone(), + agent_name: aname.clone(), + message: e, + }; + if let Ok(mut log) = log_clone.lock() { + log.push(event.clone()); + } + let _ = tx_clone.send(event); + if let Ok(mut agents) = agents_ref.lock() + && let Some(agent) = agents.get_mut(&key_clone) + { + agent.status = AgentStatus::Failed; + } + Self::notify_agent_state_changed(&watcher_tx_clone); + } + } + }); + + // Store the task handle while the agent is still Pending. + { + let mut agents = self.agents.lock().map_err(|e| e.to_string())?; + if let Some(agent) = agents.get_mut(&key) { + agent.task_handle = Some(handle); + } + } + + // Agent successfully spawned — prevent the guard from removing the entry. + pending_guard.disarm(); + + Ok(AgentInfo { + story_id: story_id.to_string(), + agent_name: resolved_name, + status: AgentStatus::Pending, + session_id: None, + worktree_path: None, + base_branch: None, + completion: None, + log_session_id: Some(log_session_id), + }) + } + + /// Stop a running agent. 
Worktree is preserved for inspection. + pub async fn stop_agent( + &self, + _project_root: &Path, + story_id: &str, + agent_name: &str, + ) -> Result<(), String> { + let key = composite_key(story_id, agent_name); + + let (worktree_info, task_handle, tx) = { + let mut agents = self.agents.lock().map_err(|e| e.to_string())?; + let agent = agents + .get_mut(&key) + .ok_or_else(|| format!("No agent '{agent_name}' for story '{story_id}'"))?; + + let wt = agent.worktree_info.clone(); + let handle = agent.task_handle.take(); + let tx = agent.tx.clone(); + agent.status = AgentStatus::Failed; + (wt, handle, tx) + }; + + // Abort the task and kill the PTY child process. + // Note: aborting a spawn_blocking task handle does not interrupt the blocking + // thread, so we must also kill the child process directly via the killer registry. + if let Some(handle) = task_handle { + handle.abort(); + let _ = handle.await; + } + self.kill_child_for_key(&key); + + // Preserve worktree for inspection — don't destroy agent's work on stop. + if let Some(ref wt) = worktree_info { + slog!( + "[agents] Worktree preserved for {story_id}:{agent_name}: {}", + wt.path.display() + ); + } + + let _ = tx.send(AgentEvent::Status { + story_id: story_id.to_string(), + agent_name: agent_name.to_string(), + status: "stopped".to_string(), + }); + + // Remove from map + { + let mut agents = self.agents.lock().map_err(|e| e.to_string())?; + agents.remove(&key); + } + + // Notify WebSocket clients so pipeline board and agent panel update. + Self::notify_agent_state_changed(&self.watcher_tx); + + Ok(()) + } + + /// Return the names of configured agents for `stage` that are not currently + /// running or pending. 
+ pub fn available_agents_for_stage( + &self, + config: &ProjectConfig, + stage: &PipelineStage, + ) -> Result, String> { + let agents = self.agents.lock().map_err(|e| e.to_string())?; + Ok(config + .agent + .iter() + .filter(|cfg| agent_config_stage(cfg) == *stage) + .filter(|cfg| { + !agents.values().any(|a| { + a.agent_name == cfg.name + && matches!(a.status, AgentStatus::Running | AgentStatus::Pending) + }) + }) + .map(|cfg| cfg.name.clone()) + .collect()) + } + + /// List all agents with their status. + pub fn list_agents(&self) -> Result, String> { + let agents = self.agents.lock().map_err(|e| e.to_string())?; + Ok(agents + .iter() + .map(|(key, agent)| { + // Extract story_id from composite key "story_id:agent_name" + let story_id = key + .rsplit_once(':') + .map(|(sid, _)| sid.to_string()) + .unwrap_or_else(|| key.clone()); + agent_info_from_entry(&story_id, agent) + }) + .collect()) + } + + /// Subscribe to events for a story agent. + pub fn subscribe( + &self, + story_id: &str, + agent_name: &str, + ) -> Result, String> { + let key = composite_key(story_id, agent_name); + let agents = self.agents.lock().map_err(|e| e.to_string())?; + let agent = agents + .get(&key) + .ok_or_else(|| format!("No agent '{agent_name}' for story '{story_id}'"))?; + Ok(agent.tx.subscribe()) + } + + /// Drain accumulated events for polling. Returns all events since the last drain. + pub fn drain_events( + &self, + story_id: &str, + agent_name: &str, + ) -> Result, String> { + let key = composite_key(story_id, agent_name); + let agents = self.agents.lock().map_err(|e| e.to_string())?; + let agent = agents + .get(&key) + .ok_or_else(|| format!("No agent '{agent_name}' for story '{story_id}'"))?; + let mut log = agent.event_log.lock().map_err(|e| e.to_string())?; + Ok(log.drain(..).collect()) + } + + /// Block until the agent reaches a terminal state (completed, failed, stopped). + /// Returns the agent's final `AgentInfo`. 
+ /// `timeout_ms` caps how long to wait; returns an error if the deadline passes. + pub async fn wait_for_agent( + &self, + story_id: &str, + agent_name: &str, + timeout_ms: u64, + ) -> Result { + // Subscribe before checking status so we don't miss the terminal event + // if the agent completes in the window between the two operations. + let mut rx = self.subscribe(story_id, agent_name)?; + + // Return immediately if already in a terminal state. + { + let agents = self.agents.lock().map_err(|e| e.to_string())?; + let key = composite_key(story_id, agent_name); + if let Some(agent) = agents.get(&key) + && matches!(agent.status, AgentStatus::Completed | AgentStatus::Failed) + { + return Ok(agent_info_from_entry(story_id, agent)); + } + } + + let deadline = tokio::time::Instant::now() + std::time::Duration::from_millis(timeout_ms); + + loop { + let remaining = deadline.saturating_duration_since(tokio::time::Instant::now()); + if remaining.is_zero() { + return Err(format!( + "Timed out after {timeout_ms}ms waiting for agent '{agent_name}' on story '{story_id}'" + )); + } + + match tokio::time::timeout(remaining, rx.recv()).await { + Ok(Ok(event)) => { + let is_terminal = match &event { + AgentEvent::Done { .. } | AgentEvent::Error { .. } => true, + AgentEvent::Status { status, .. } if status == "stopped" => true, + _ => false, + }; + if is_terminal { + let agents = self.agents.lock().map_err(|e| e.to_string())?; + let key = composite_key(story_id, agent_name); + return Ok(if let Some(agent) = agents.get(&key) { + agent_info_from_entry(story_id, agent) + } else { + // Agent was removed from map (e.g. stop_agent removes it after + // the "stopped" status event is sent). + let (status, session_id) = match event { + AgentEvent::Done { session_id, .. 
} => { + (AgentStatus::Completed, session_id) + } + _ => (AgentStatus::Failed, None), + }; + AgentInfo { + story_id: story_id.to_string(), + agent_name: agent_name.to_string(), + status, + session_id, + worktree_path: None, + base_branch: None, + completion: None, + log_session_id: None, + } + }); + } + } + Ok(Err(broadcast::error::RecvError::Lagged(_))) => { + // Missed some buffered events — check current status before resuming. + let agents = self.agents.lock().map_err(|e| e.to_string())?; + let key = composite_key(story_id, agent_name); + if let Some(agent) = agents.get(&key) + && matches!(agent.status, AgentStatus::Completed | AgentStatus::Failed) + { + return Ok(agent_info_from_entry(story_id, agent)); + } + // Still running — continue the loop. + } + Ok(Err(broadcast::error::RecvError::Closed)) => { + // Channel closed: no more events will arrive. Return current state. + let agents = self.agents.lock().map_err(|e| e.to_string())?; + let key = composite_key(story_id, agent_name); + if let Some(agent) = agents.get(&key) { + return Ok(agent_info_from_entry(story_id, agent)); + } + return Err(format!( + "Agent '{agent_name}' for story '{story_id}' channel closed unexpectedly" + )); + } + Err(_) => { + return Err(format!( + "Timed out after {timeout_ms}ms waiting for agent '{agent_name}' on story '{story_id}'" + )); + } + } + } + } + + /// Create a worktree for the given story using the server port (writes .mcp.json). + pub async fn create_worktree( + &self, + project_root: &Path, + story_id: &str, + ) -> Result { + let config = ProjectConfig::load(project_root)?; + worktree::create_worktree(project_root, story_id, &config, self.port).await + } + + /// Get project root helper. + pub fn get_project_root(&self, state: &crate::state::SessionState) -> Result { + state.get_project_root() + } + + /// Get the log session ID and project root for an agent, if available. + /// + /// Used by MCP tools to find the persistent log file for a completed agent. 
+ pub fn get_log_info(&self, story_id: &str, agent_name: &str) -> Option<(String, PathBuf)> { + let key = composite_key(story_id, agent_name); + let agents = self.agents.lock().ok()?; + let agent = agents.get(&key)?; + let session_id = agent.log_session_id.clone()?; + let project_root = agent.project_root.clone()?; + Some((session_id, project_root)) + } + + /// Remove all agent entries for a given story_id from the pool. + /// + /// Called when a story is archived so that stale entries don't accumulate. + /// Returns the number of entries removed. + pub fn remove_agents_for_story(&self, story_id: &str) -> usize { + let mut agents = match self.agents.lock() { + Ok(a) => a, + Err(e) => { + slog_error!("[agents] Failed to lock pool for cleanup of '{story_id}': {e}"); + return 0; + } + }; + let prefix = format!("{story_id}:"); + let keys_to_remove: Vec = agents + .keys() + .filter(|k| k.starts_with(&prefix)) + .cloned() + .collect(); + let count = keys_to_remove.len(); + for key in &keys_to_remove { + agents.remove(key); + } + if count > 0 { + slog!("[agents] Removed {count} agent entries for archived story '{story_id}'"); + } + count + } + + /// Test helper: inject a pre-built agent entry so unit tests can exercise + /// wait/subscribe logic without spawning a real process. 
+ #[cfg(test)] + pub fn inject_test_agent( + &self, + story_id: &str, + agent_name: &str, + status: AgentStatus, + ) -> broadcast::Sender { + let (tx, _) = broadcast::channel::(64); + let key = composite_key(story_id, agent_name); + let mut agents = self.agents.lock().unwrap(); + agents.insert( + key, + StoryAgent { + agent_name: agent_name.to_string(), + status, + worktree_info: None, + session_id: None, + tx: tx.clone(), + task_handle: None, + event_log: Arc::new(Mutex::new(Vec::new())), + completion: None, + project_root: None, + log_session_id: None, + merge_failure_reported: false, + }, + ); + tx + } + + /// Test helper: inject an agent with a specific worktree path for testing + /// gate-related logic. + #[cfg(test)] + pub fn inject_test_agent_with_path( + &self, + story_id: &str, + agent_name: &str, + status: AgentStatus, + worktree_path: PathBuf, + ) -> broadcast::Sender { + let (tx, _) = broadcast::channel::(64); + let key = composite_key(story_id, agent_name); + let mut agents = self.agents.lock().unwrap(); + agents.insert( + key, + StoryAgent { + agent_name: agent_name.to_string(), + status, + worktree_info: Some(WorktreeInfo { + path: worktree_path, + branch: format!("feature/story-{story_id}"), + base_branch: "master".to_string(), + }), + session_id: None, + tx: tx.clone(), + task_handle: None, + event_log: Arc::new(Mutex::new(Vec::new())), + completion: None, + project_root: None, + log_session_id: None, + merge_failure_reported: false, + }, + ); + tx + } + + /// Test helper: inject an agent with a completion report and project_root + /// for testing pipeline advance logic without spawning real agents. 
+ #[cfg(test)] + pub fn inject_test_agent_with_completion( + &self, + story_id: &str, + agent_name: &str, + status: AgentStatus, + project_root: PathBuf, + completion: CompletionReport, + ) -> broadcast::Sender { + let (tx, _) = broadcast::channel::(64); + let key = composite_key(story_id, agent_name); + let mut agents = self.agents.lock().unwrap(); + agents.insert( + key, + StoryAgent { + agent_name: agent_name.to_string(), + status, + worktree_info: None, + session_id: None, + tx: tx.clone(), + task_handle: None, + event_log: Arc::new(Mutex::new(Vec::new())), + completion: Some(completion), + project_root: Some(project_root), + log_session_id: None, + merge_failure_reported: false, + }, + ); + tx + } + + /// Inject a Running agent with a pre-built (possibly finished) task handle. + /// Used by watchdog tests to simulate an orphaned agent. + #[cfg(test)] + pub fn inject_test_agent_with_handle( + &self, + story_id: &str, + agent_name: &str, + status: AgentStatus, + task_handle: tokio::task::JoinHandle<()>, + ) -> broadcast::Sender { + let (tx, _) = broadcast::channel::(64); + let key = composite_key(story_id, agent_name); + let mut agents = self.agents.lock().unwrap(); + agents.insert( + key, + StoryAgent { + agent_name: agent_name.to_string(), + status, + worktree_info: None, + session_id: None, + tx: tx.clone(), + task_handle: Some(task_handle), + event_log: Arc::new(Mutex::new(Vec::new())), + completion: None, + project_root: None, + log_session_id: None, + merge_failure_reported: false, + }, + ); + tx + } + + /// Test helper: inject a child killer into the registry. + #[cfg(test)] + pub fn inject_child_killer(&self, key: &str, killer: Box) { + let mut killers = self.child_killers.lock().unwrap(); + killers.insert(key.to_string(), killer); + } + + /// Test helper: return the number of registered child killers. 
+ #[cfg(test)] + pub fn child_killer_count(&self) -> usize { + self.child_killers.lock().unwrap().len() + } +} + +/// Return the active pipeline stage directory name for `story_id`, or `None` if the +/// story is not in any active stage (`2_current/`, `3_qa/`, `4_merge/`). +fn find_active_story_stage(project_root: &Path, story_id: &str) -> Option<&'static str> { + const STAGES: [&str; 3] = ["2_current", "3_qa", "4_merge"]; + for stage in &STAGES { + let path = project_root + .join(".story_kit") + .join("work") + .join(stage) + .join(format!("{story_id}.md")); + if path.exists() { + return Some(stage); + } + } + None +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::agents::{AgentEvent, AgentStatus, PipelineStage}; + use crate::config::ProjectConfig; + use portable_pty::{CommandBuilder, PtySize, native_pty_system}; + + fn make_config(toml_str: &str) -> ProjectConfig { + ProjectConfig::parse(toml_str).unwrap() + } + + #[tokio::test] + async fn wait_for_agent_returns_immediately_if_completed() { + let pool = AgentPool::new_test(3001); + pool.inject_test_agent("s1", "bot", AgentStatus::Completed); + + let info = pool.wait_for_agent("s1", "bot", 1000).await.unwrap(); + assert_eq!(info.status, AgentStatus::Completed); + assert_eq!(info.story_id, "s1"); + assert_eq!(info.agent_name, "bot"); + } + + #[tokio::test] + async fn wait_for_agent_returns_immediately_if_failed() { + let pool = AgentPool::new_test(3001); + pool.inject_test_agent("s2", "bot", AgentStatus::Failed); + + let info = pool.wait_for_agent("s2", "bot", 1000).await.unwrap(); + assert_eq!(info.status, AgentStatus::Failed); + } + + #[tokio::test] + async fn wait_for_agent_completes_on_done_event() { + let pool = AgentPool::new_test(3001); + let tx = pool.inject_test_agent("s3", "bot", AgentStatus::Running); + + // Send Done event after a short delay + let tx_clone = tx.clone(); + tokio::spawn(async move { + tokio::time::sleep(std::time::Duration::from_millis(50)).await; + let _ = 
tx_clone.send(AgentEvent::Done { + story_id: "s3".to_string(), + agent_name: "bot".to_string(), + session_id: Some("sess-abc".to_string()), + }); + }); + + let info = pool.wait_for_agent("s3", "bot", 2000).await.unwrap(); + assert_eq!(info.story_id, "s3"); + } + + #[tokio::test] + async fn wait_for_agent_times_out() { + let pool = AgentPool::new_test(3001); + pool.inject_test_agent("s4", "bot", AgentStatus::Running); + + let result = pool.wait_for_agent("s4", "bot", 50).await; + assert!(result.is_err()); + let msg = result.unwrap_err(); + assert!(msg.contains("Timed out"), "unexpected message: {msg}"); + } + + #[tokio::test] + async fn wait_for_agent_errors_for_nonexistent() { + let pool = AgentPool::new_test(3001); + let result = pool.wait_for_agent("no_story", "no_bot", 100).await; + assert!(result.is_err()); + } + + #[tokio::test] + async fn wait_for_agent_completes_on_stopped_status_event() { + let pool = AgentPool::new_test(3001); + let tx = pool.inject_test_agent("s5", "bot", AgentStatus::Running); + + let tx_clone = tx.clone(); + tokio::spawn(async move { + tokio::time::sleep(std::time::Duration::from_millis(30)).await; + let _ = tx_clone.send(AgentEvent::Status { + story_id: "s5".to_string(), + agent_name: "bot".to_string(), + status: "stopped".to_string(), + }); + }); + + let info = pool.wait_for_agent("s5", "bot", 2000).await.unwrap(); + assert_eq!(info.story_id, "s5"); + } + + // ── kill_all_children tests ──────────────────────────────────── + + /// Returns true if a process with the given PID is currently running. 
+ fn process_is_running(pid: u32) -> bool { + std::process::Command::new("ps") + .arg("-p") + .arg(pid.to_string()) + .stdout(std::process::Stdio::null()) + .stderr(std::process::Stdio::null()) + .status() + .map(|s| s.success()) + .unwrap_or(false) + } + + #[test] + fn kill_all_children_is_safe_on_empty_pool() { + let pool = AgentPool::new_test(3001); + pool.kill_all_children(); + assert_eq!(pool.child_killer_count(), 0); + } + + #[test] + fn kill_all_children_kills_real_process() { + let pool = AgentPool::new_test(3001); + + let pty_system = native_pty_system(); + let pair = pty_system + .openpty(PtySize { + rows: 24, + cols: 80, + pixel_width: 0, + pixel_height: 0, + }) + .expect("failed to open pty"); + + let mut cmd = CommandBuilder::new("sleep"); + cmd.arg("100"); + let mut child = pair + .slave + .spawn_command(cmd) + .expect("failed to spawn sleep"); + let pid = child.process_id().expect("no pid"); + + pool.inject_child_killer("story:agent", child.clone_killer()); + + assert!( + process_is_running(pid), + "process {pid} should be running before kill_all_children" + ); + + pool.kill_all_children(); + let _ = child.wait(); + + assert!( + !process_is_running(pid), + "process {pid} should have been killed by kill_all_children" + ); + } + + #[test] + fn kill_all_children_clears_registry() { + let pool = AgentPool::new_test(3001); + + let pty_system = native_pty_system(); + let pair = pty_system + .openpty(PtySize { + rows: 24, + cols: 80, + pixel_width: 0, + pixel_height: 0, + }) + .expect("failed to open pty"); + + let mut cmd = CommandBuilder::new("sleep"); + cmd.arg("1"); + let mut child = pair + .slave + .spawn_command(cmd) + .expect("failed to spawn sleep"); + + pool.inject_child_killer("story:agent", child.clone_killer()); + assert_eq!(pool.child_killer_count(), 1); + + pool.kill_all_children(); + let _ = child.wait(); + + assert_eq!( + pool.child_killer_count(), + 0, + "child_killers should be cleared after kill_all_children" + ); + } + + // ── 
available_agents_for_stage tests (story 190) ────────────────────────── + + #[test] + fn available_agents_for_stage_returns_idle_agents() { + let config = make_config( + r#" +[[agent]] +name = "coder-1" +stage = "coder" + +[[agent]] +name = "coder-2" +stage = "coder" + +[[agent]] +name = "qa" +stage = "qa" +"#, + ); + let pool = AgentPool::new_test(3001); + pool.inject_test_agent("story-1", "coder-1", AgentStatus::Running); + + let available = pool + .available_agents_for_stage(&config, &PipelineStage::Coder) + .unwrap(); + assert_eq!(available, vec!["coder-2"]); + + let available_qa = pool + .available_agents_for_stage(&config, &PipelineStage::Qa) + .unwrap(); + assert_eq!(available_qa, vec!["qa"]); + } + + #[test] + fn available_agents_for_stage_returns_empty_when_all_busy() { + let config = make_config( + r#" +[[agent]] +name = "coder-1" +stage = "coder" +"#, + ); + let pool = AgentPool::new_test(3001); + pool.inject_test_agent("story-1", "coder-1", AgentStatus::Running); + + let available = pool + .available_agents_for_stage(&config, &PipelineStage::Coder) + .unwrap(); + assert!(available.is_empty()); + } + + #[test] + fn available_agents_for_stage_ignores_completed_agents() { + let config = make_config( + r#" +[[agent]] +name = "coder-1" +stage = "coder" +"#, + ); + let pool = AgentPool::new_test(3001); + pool.inject_test_agent("story-1", "coder-1", AgentStatus::Completed); + + let available = pool + .available_agents_for_stage(&config, &PipelineStage::Coder) + .unwrap(); + assert_eq!(available, vec!["coder-1"]); + } + + #[tokio::test] + async fn start_agent_auto_selects_second_coder_when_first_busy() { + let tmp = tempfile::tempdir().unwrap(); + let sk = tmp.path().join(".story_kit"); + std::fs::create_dir_all(&sk).unwrap(); + std::fs::write( + sk.join("project.toml"), + r#" +[[agent]] +name = "supervisor" +stage = "other" + +[[agent]] +name = "coder-1" +stage = "coder" + +[[agent]] +name = "coder-2" +stage = "coder" +"#, + ) + .unwrap(); + + let pool = 
AgentPool::new_test(3001); + pool.inject_test_agent("other-story", "coder-1", AgentStatus::Running); + + let result = pool + .start_agent(tmp.path(), "42_my_story", None, None) + .await; + match result { + Ok(info) => { + assert_eq!(info.agent_name, "coder-2"); + } + Err(err) => { + assert!( + !err.contains("All coder agents are busy"), + "should have selected coder-2 but got: {err}" + ); + assert!( + !err.contains("No coder agent configured"), + "should not fail on agent selection, got: {err}" + ); + } + } + } + + #[tokio::test] + async fn start_agent_returns_busy_when_all_coders_occupied() { + let tmp = tempfile::tempdir().unwrap(); + let sk = tmp.path().join(".story_kit"); + std::fs::create_dir_all(&sk).unwrap(); + std::fs::write( + sk.join("project.toml"), + r#" +[[agent]] +name = "coder-1" +stage = "coder" + +[[agent]] +name = "coder-2" +stage = "coder" +"#, + ) + .unwrap(); + + let pool = AgentPool::new_test(3001); + pool.inject_test_agent("story-1", "coder-1", AgentStatus::Running); + pool.inject_test_agent("story-2", "coder-2", AgentStatus::Pending); + + let result = pool.start_agent(tmp.path(), "story-3", None, None).await; + assert!(result.is_err()); + let err = result.unwrap_err(); + assert!( + err.contains("All coder agents are busy"), + "expected busy error, got: {err}" + ); + } + + #[tokio::test] + async fn start_agent_moves_story_to_current_when_coders_busy() { + let tmp = tempfile::tempdir().unwrap(); + let sk = tmp.path().join(".story_kit"); + let backlog = sk.join("work/1_backlog"); + std::fs::create_dir_all(&backlog).unwrap(); + std::fs::write( + sk.join("project.toml"), + r#" +[[agent]] +name = "coder-1" +stage = "coder" +"#, + ) + .unwrap(); + std::fs::write(backlog.join("story-3.md"), "---\nname: Story 3\n---\n").unwrap(); + + let pool = AgentPool::new_test(3001); + pool.inject_test_agent("story-1", "coder-1", AgentStatus::Running); + + let result = pool.start_agent(tmp.path(), "story-3", None, None).await; + + assert!(result.is_err()); + let 
err = result.unwrap_err();
+        assert!(
+            err.contains("All coder agents are busy"),
+            "expected busy error, got: {err}"
+        );
+        assert!(
+            err.contains("queued in work/2_current/"),
+            "expected story-to-current message, got: {err}"
+        );
+
+        let current_path = sk.join("work/2_current/story-3.md");
+        assert!(
+            current_path.exists(),
+            "story should be in 2_current/ after busy error, but was not"
+        );
+        let backlog_path = backlog.join("story-3.md");
+        assert!(
+            !backlog_path.exists(),
+            "story should no longer be in 1_backlog/"
+        );
+    }
+
+    #[tokio::test]
+    async fn start_agent_story_already_in_current_is_noop() {
+        let tmp = tempfile::tempdir().unwrap();
+        let sk = tmp.path().join(".story_kit");
+        let current = sk.join("work/2_current");
+        std::fs::create_dir_all(&current).unwrap();
+        std::fs::write(
+            sk.join("project.toml"),
+            "[[agent]]\nname = \"coder-1\"\nstage = \"coder\"\n",
+        )
+        .unwrap();
+        std::fs::write(current.join("story-5.md"), "---\nname: Story 5\n---\n").unwrap();
+
+        let pool = AgentPool::new_test(3001);
+
+        let result = pool.start_agent(tmp.path(), "story-5", None, None).await;
+        match result {
+            Ok(_) => {}
+            Err(e) => {
+                assert!(
+                    !e.contains("Failed to move"),
+                    "should not fail on idempotent move, got: {e}"
+                );
+            }
+        }
+    }
+
+    #[tokio::test]
+    async fn start_agent_explicit_name_unchanged_when_busy() {
+        let tmp = tempfile::tempdir().unwrap();
+        let sk = tmp.path().join(".story_kit");
+        std::fs::create_dir_all(&sk).unwrap();
+        std::fs::write(
+            sk.join("project.toml"),
+            r#"
+[[agent]]
+name = "coder-1"
+stage = "coder"
+
+[[agent]]
+name = "coder-2"
+stage = "coder"
+"#,
+        )
+        .unwrap();
+
+        let pool = AgentPool::new_test(3001);
+        pool.inject_test_agent("story-1", "coder-1", AgentStatus::Running);
+
+        let result = pool
+            .start_agent(tmp.path(), "story-2", Some("coder-1"), None)
+            .await;
+        assert!(result.is_err());
+        let err = result.unwrap_err();
+        assert!(
+            err.contains("coder-1") && err.contains("already running"),
+            "expected explicit busy 
error, got: {err}" + ); + } + + // ── start_agent single-instance concurrency tests ───────────────────────── + + #[tokio::test] + async fn start_agent_rejects_when_same_agent_already_running_on_another_story() { + use std::fs; + + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + + let sk_dir = root.join(".story_kit"); + fs::create_dir_all(&sk_dir).unwrap(); + fs::write(sk_dir.join("project.toml"), "[[agent]]\nname = \"qa\"\n").unwrap(); + + let pool = AgentPool::new_test(3001); + pool.inject_test_agent("story-a", "qa", AgentStatus::Running); + + let result = pool.start_agent(root, "story-b", Some("qa"), None).await; + + assert!( + result.is_err(), + "start_agent should fail when qa is already running on another story" + ); + let err = result.unwrap_err(); + assert!( + err.contains("already running") || err.contains("becomes available"), + "error message should explain why: got '{err}'" + ); + } + + #[tokio::test] + async fn start_agent_allows_new_story_when_previous_run_is_completed() { + use std::fs; + + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + + let sk_dir = root.join(".story_kit"); + fs::create_dir_all(&sk_dir).unwrap(); + fs::write(sk_dir.join("project.toml"), "[[agent]]\nname = \"qa\"\n").unwrap(); + + let pool = AgentPool::new_test(3001); + pool.inject_test_agent("story-a", "qa", AgentStatus::Completed); + + let result = pool.start_agent(root, "story-b", Some("qa"), None).await; + + if let Err(ref e) = result { + assert!( + !e.contains("already running") && !e.contains("becomes available"), + "completed agent must not trigger the concurrency guard: got '{e}'" + ); + } + } + + // ── bug 118: pending entry cleanup on start_agent failure ──────────────── + + #[tokio::test] + async fn start_agent_cleans_up_pending_entry_on_failure() { + use std::fs; + + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + + let sk_dir = root.join(".story_kit"); + fs::create_dir_all(&sk_dir).unwrap(); + fs::write( + 
sk_dir.join("project.toml"), + "[[agent]]\nname = \"coder-1\"\nstage = \"coder\"\n", + ) + .unwrap(); + + let upcoming = root.join(".story_kit/work/1_backlog"); + fs::create_dir_all(&upcoming).unwrap(); + fs::write(upcoming.join("50_story_test.md"), "---\nname: Test\n---\n").unwrap(); + + let pool = AgentPool::new_test(3099); + + let result = pool + .start_agent(root, "50_story_test", Some("coder-1"), None) + .await; + + assert!( + result.is_ok(), + "start_agent should return Ok(Pending) immediately: {:?}", + result.err() + ); + assert_eq!( + result.unwrap().status, + AgentStatus::Pending, + "initial status must be Pending" + ); + + let final_info = pool + .wait_for_agent("50_story_test", "coder-1", 5000) + .await + .expect("wait_for_agent should not time out"); + assert_eq!( + final_info.status, + AgentStatus::Failed, + "agent must transition to Failed after worktree creation error" + ); + + let agents = pool.agents.lock().unwrap(); + let failed_entry = agents + .values() + .find(|a| a.agent_name == "coder-1" && a.status == AgentStatus::Failed); + assert!( + failed_entry.is_some(), + "agent pool must retain a Failed entry so the UI can show the error state" + ); + drop(agents); + + let events = pool + .drain_events("50_story_test", "coder-1") + .expect("drain_events should succeed"); + let has_error_event = events.iter().any(|e| matches!(e, AgentEvent::Error { .. 
})); + assert!( + has_error_event, + "event_log must contain AgentEvent::Error after worktree creation fails" + ); + } + + #[tokio::test] + async fn start_agent_guard_does_not_remove_running_entry() { + use std::fs; + + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + + let sk_dir = root.join(".story_kit"); + fs::create_dir_all(&sk_dir).unwrap(); + fs::write(sk_dir.join("project.toml"), "[[agent]]\nname = \"qa\"\n").unwrap(); + + let pool = AgentPool::new_test(3099); + pool.inject_test_agent("story-x", "qa", AgentStatus::Running); + + let result = pool.start_agent(root, "story-y", Some("qa"), None).await; + + assert!(result.is_err()); + let err = result.unwrap_err(); + assert!( + err.contains("already running") || err.contains("becomes available"), + "running entry must survive: got '{err}'" + ); + } + + // ── TOCTOU race-condition regression tests (story 132) ─────────────────── + + #[tokio::test] + async fn toctou_pending_entry_blocks_same_agent_on_different_story() { + use std::fs; + + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + + let sk_dir = root.join(".story_kit"); + fs::create_dir_all(&sk_dir).unwrap(); + fs::write( + sk_dir.join("project.toml"), + "[[agent]]\nname = \"coder-1\"\n", + ) + .unwrap(); + + let pool = AgentPool::new_test(3099); + pool.inject_test_agent("86_story_foo", "coder-1", AgentStatus::Pending); + + let result = pool + .start_agent(root, "130_story_bar", Some("coder-1"), None) + .await; + + assert!(result.is_err(), "second start_agent must be rejected"); + let err = result.unwrap_err(); + assert!( + err.contains("already running") || err.contains("becomes available"), + "expected concurrency-rejection message, got: '{err}'" + ); + } + + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + async fn toctou_concurrent_start_agent_same_agent_exactly_one_concurrency_rejection() { + use std::fs; + use std::sync::Arc; + + let tmp = tempfile::tempdir().unwrap(); + let root = 
tmp.path().to_path_buf(); + + let sk_dir = root.join(".story_kit"); + fs::create_dir_all(sk_dir.join("work/1_backlog")).unwrap(); + fs::write( + root.join(".story_kit/project.toml"), + "[[agent]]\nname = \"coder-1\"\n", + ) + .unwrap(); + fs::write( + root.join(".story_kit/work/1_backlog/86_story_foo.md"), + "---\nname: Foo\n---\n", + ) + .unwrap(); + fs::write( + root.join(".story_kit/work/1_backlog/130_story_bar.md"), + "---\nname: Bar\n---\n", + ) + .unwrap(); + + let pool = Arc::new(AgentPool::new_test(3099)); + + let pool1 = pool.clone(); + let root1 = root.clone(); + let t1 = tokio::spawn(async move { + pool1 + .start_agent(&root1, "86_story_foo", Some("coder-1"), None) + .await + }); + + let pool2 = pool.clone(); + let root2 = root.clone(); + let t2 = tokio::spawn(async move { + pool2 + .start_agent(&root2, "130_story_bar", Some("coder-1"), None) + .await + }); + + let (r1, r2) = tokio::join!(t1, t2); + let r1 = r1.unwrap(); + let r2 = r2.unwrap(); + + let concurrency_rejections = [&r1, &r2] + .iter() + .filter(|r| { + r.as_ref().is_err_and(|e| { + e.contains("already running") || e.contains("becomes available") + }) + }) + .count(); + + assert_eq!( + concurrency_rejections, 1, + "exactly one call must be rejected by the concurrency check; \ + got r1={r1:?} r2={r2:?}" + ); + } + + // ── story-230: prevent duplicate stage agents on same story ─────────────── + + #[tokio::test] + async fn start_agent_rejects_second_coder_stage_on_same_story() { + use std::fs; + + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + + let sk_dir = root.join(".story_kit"); + fs::create_dir_all(&sk_dir).unwrap(); + fs::write( + sk_dir.join("project.toml"), + "[[agent]]\nname = \"coder-1\"\n\n[[agent]]\nname = \"coder-2\"\n", + ) + .unwrap(); + + let pool = AgentPool::new_test(3099); + pool.inject_test_agent("42_story_foo", "coder-1", AgentStatus::Running); + + let result = pool + .start_agent(root, "42_story_foo", Some("coder-2"), None) + .await; + + assert!( + 
result.is_err(), + "second coder on same story must be rejected" + ); + let err = result.unwrap_err(); + assert!( + err.contains("same pipeline stage"), + "error must mention same pipeline stage, got: '{err}'" + ); + assert!( + err.contains("coder-1") && err.contains("coder-2"), + "error must name both agents, got: '{err}'" + ); + } + + #[tokio::test] + async fn start_agent_rejects_second_qa_stage_on_same_story() { + use std::fs; + + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + + let sk_dir = root.join(".story_kit"); + fs::create_dir_all(&sk_dir).unwrap(); + fs::write( + sk_dir.join("project.toml"), + "[[agent]]\nname = \"qa-1\"\nstage = \"qa\"\n\n\ + [[agent]]\nname = \"qa-2\"\nstage = \"qa\"\n", + ) + .unwrap(); + + let pool = AgentPool::new_test(3099); + pool.inject_test_agent("55_story_bar", "qa-1", AgentStatus::Running); + + let result = pool + .start_agent(root, "55_story_bar", Some("qa-2"), None) + .await; + + assert!(result.is_err(), "second qa on same story must be rejected"); + let err = result.unwrap_err(); + assert!( + err.contains("same pipeline stage"), + "error must mention same pipeline stage, got: '{err}'" + ); + } + + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + async fn start_agent_concurrent_two_coders_same_story_exactly_one_stage_rejection() { + use std::fs; + use std::sync::Arc; + + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path().to_path_buf(); + + let sk_dir = root.join(".story_kit"); + fs::create_dir_all(sk_dir.join("work/2_current")).unwrap(); + fs::write( + root.join(".story_kit/project.toml"), + "[[agent]]\nname = \"coder-1\"\n\n[[agent]]\nname = \"coder-2\"\n", + ) + .unwrap(); + fs::write( + root.join(".story_kit/work/2_current/42_story_foo.md"), + "---\nname: Foo\n---\n", + ) + .unwrap(); + + let pool = Arc::new(AgentPool::new_test(3099)); + + let pool1 = pool.clone(); + let root1 = root.clone(); + let t1 = tokio::spawn(async move { + pool1 + .start_agent(&root1, "42_story_foo", 
Some("coder-1"), None) + .await + }); + + let pool2 = pool.clone(); + let root2 = root.clone(); + let t2 = tokio::spawn(async move { + pool2 + .start_agent(&root2, "42_story_foo", Some("coder-2"), None) + .await + }); + + let (r1, r2) = tokio::join!(t1, t2); + let r1 = r1.unwrap(); + let r2 = r2.unwrap(); + + let stage_rejections = [&r1, &r2] + .iter() + .filter(|r| r.as_ref().is_err_and(|e| e.contains("same pipeline stage"))) + .count(); + + assert_eq!( + stage_rejections, 1, + "exactly one call must be rejected by the stage-conflict check; \ + got r1={r1:?} r2={r2:?}" + ); + } + + #[tokio::test] + async fn start_agent_two_coders_different_stories_not_blocked_by_stage_check() { + use std::fs; + + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + + let sk_dir = root.join(".story_kit"); + fs::create_dir_all(sk_dir.join("work/1_backlog")).unwrap(); + fs::write( + root.join(".story_kit/project.toml"), + "[[agent]]\nname = \"coder-1\"\n\n[[agent]]\nname = \"coder-2\"\n", + ) + .unwrap(); + fs::write( + root.join(".story_kit/work/1_backlog/99_story_baz.md"), + "---\nname: Baz\n---\n", + ) + .unwrap(); + + let pool = AgentPool::new_test(3099); + pool.inject_test_agent("42_story_foo", "coder-1", AgentStatus::Running); + + let result = pool + .start_agent(root, "99_story_baz", Some("coder-2"), None) + .await; + + if let Err(ref e) = result { + assert!( + !e.contains("same pipeline stage"), + "stage-conflict guard must not fire for agents on different stories; \ + got: '{e}'" + ); + } + } + + // ── bug 312: stage-pipeline mismatch guard in start_agent ────────────── + + #[tokio::test] + async fn start_agent_rejects_mergemaster_on_coding_stage_story() { + use std::fs; + + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + + let sk_dir = root.join(".story_kit"); + fs::create_dir_all(sk_dir.join("work/2_current")).unwrap(); + fs::write( + sk_dir.join("project.toml"), + "[[agent]]\nname = \"coder-1\"\nstage = \"coder\"\n\n\ + [[agent]]\nname = 
\"mergemaster\"\nstage = \"mergemaster\"\n", + ) + .unwrap(); + fs::write( + sk_dir.join("work/2_current/310_story_foo.md"), + "---\nname: Foo\n---\n", + ) + .unwrap(); + + let pool = AgentPool::new_test(3099); + let result = pool + .start_agent(root, "310_story_foo", Some("mergemaster"), None) + .await; + + assert!( + result.is_err(), + "mergemaster must not be assigned to a story in 2_current/" + ); + let err = result.unwrap_err(); + assert!( + err.contains("stage") && err.contains("2_current"), + "error must mention stage mismatch, got: '{err}'" + ); + } + + #[tokio::test] + async fn start_agent_rejects_coder_on_qa_stage_story() { + use std::fs; + + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + + let sk_dir = root.join(".story_kit"); + fs::create_dir_all(sk_dir.join("work/3_qa")).unwrap(); + fs::write( + sk_dir.join("project.toml"), + "[[agent]]\nname = \"coder-1\"\nstage = \"coder\"\n\n\ + [[agent]]\nname = \"qa\"\nstage = \"qa\"\n", + ) + .unwrap(); + fs::write( + sk_dir.join("work/3_qa/42_story_bar.md"), + "---\nname: Bar\n---\n", + ) + .unwrap(); + + let pool = AgentPool::new_test(3099); + let result = pool + .start_agent(root, "42_story_bar", Some("coder-1"), None) + .await; + + assert!( + result.is_err(), + "coder must not be assigned to a story in 3_qa/" + ); + let err = result.unwrap_err(); + assert!( + err.contains("stage") && err.contains("3_qa"), + "error must mention stage mismatch, got: '{err}'" + ); + } + + #[tokio::test] + async fn start_agent_rejects_qa_on_merge_stage_story() { + use std::fs; + + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + + let sk_dir = root.join(".story_kit"); + fs::create_dir_all(sk_dir.join("work/4_merge")).unwrap(); + fs::write( + sk_dir.join("project.toml"), + "[[agent]]\nname = \"qa\"\nstage = \"qa\"\n\n\ + [[agent]]\nname = \"mergemaster\"\nstage = \"mergemaster\"\n", + ) + .unwrap(); + fs::write( + sk_dir.join("work/4_merge/55_story_baz.md"), + "---\nname: Baz\n---\n", + ) + 
.unwrap(); + + let pool = AgentPool::new_test(3099); + let result = pool + .start_agent(root, "55_story_baz", Some("qa"), None) + .await; + + assert!( + result.is_err(), + "qa must not be assigned to a story in 4_merge/" + ); + let err = result.unwrap_err(); + assert!( + err.contains("stage") && err.contains("4_merge"), + "error must mention stage mismatch, got: '{err}'" + ); + } + + #[tokio::test] + async fn start_agent_allows_supervisor_on_any_stage() { + use std::fs; + + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + + let sk_dir = root.join(".story_kit"); + fs::create_dir_all(sk_dir.join("work/2_current")).unwrap(); + fs::write( + sk_dir.join("project.toml"), + "[[agent]]\nname = \"supervisor\"\nstage = \"other\"\n", + ) + .unwrap(); + fs::write( + sk_dir.join("work/2_current/77_story_sup.md"), + "---\nname: Sup\n---\n", + ) + .unwrap(); + + let pool = AgentPool::new_test(3099); + let result = pool + .start_agent(root, "77_story_sup", Some("supervisor"), None) + .await; + + match result { + Ok(_) => {} + Err(e) => { + assert!( + !e.contains("stage:") || !e.contains("cannot be assigned"), + "supervisor should not be rejected for stage mismatch, got: '{e}'" + ); + } + } + } + + #[tokio::test] + async fn start_agent_allows_correct_stage_agent() { + use std::fs; + + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + + let sk_dir = root.join(".story_kit"); + fs::create_dir_all(sk_dir.join("work/4_merge")).unwrap(); + fs::write( + sk_dir.join("project.toml"), + "[[agent]]\nname = \"mergemaster\"\nstage = \"mergemaster\"\n", + ) + .unwrap(); + fs::write( + sk_dir.join("work/4_merge/88_story_ok.md"), + "---\nname: OK\n---\n", + ) + .unwrap(); + + let pool = AgentPool::new_test(3099); + let result = pool + .start_agent(root, "88_story_ok", Some("mergemaster"), None) + .await; + + match result { + Ok(_) => {} + Err(e) => { + assert!( + !e.contains("cannot be assigned"), + "mergemaster on 4_merge/ story should not fail stage check, got: 
'{e}'" + ); + } + } + } + + // ── find_active_story_stage tests ───────────────────────────────────────── + + #[test] + fn find_active_story_stage_detects_current() { + use std::fs; + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + let current = root.join(".story_kit/work/2_current"); + fs::create_dir_all(¤t).unwrap(); + fs::write(current.join("10_story_test.md"), "test").unwrap(); + + assert_eq!( + find_active_story_stage(root, "10_story_test"), + Some("2_current") + ); + } + + #[test] + fn find_active_story_stage_detects_qa() { + use std::fs; + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + let qa = root.join(".story_kit/work/3_qa"); + fs::create_dir_all(&qa).unwrap(); + fs::write(qa.join("11_story_test.md"), "test").unwrap(); + + assert_eq!(find_active_story_stage(root, "11_story_test"), Some("3_qa")); + } + + #[test] + fn find_active_story_stage_detects_merge() { + use std::fs; + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + let merge = root.join(".story_kit/work/4_merge"); + fs::create_dir_all(&merge).unwrap(); + fs::write(merge.join("12_story_test.md"), "test").unwrap(); + + assert_eq!( + find_active_story_stage(root, "12_story_test"), + Some("4_merge") + ); + } + + #[test] + fn find_active_story_stage_returns_none_for_unknown_story() { + let tmp = tempfile::tempdir().unwrap(); + assert_eq!(find_active_story_stage(tmp.path(), "99_nonexistent"), None); + } + + // ── remove_agents_for_story tests ──────────────────────────────────────── + + #[test] + fn remove_agents_for_story_removes_all_entries() { + let pool = AgentPool::new_test(3001); + pool.inject_test_agent("story_a", "coder-1", AgentStatus::Completed); + pool.inject_test_agent("story_a", "qa", AgentStatus::Failed); + pool.inject_test_agent("story_b", "coder-1", AgentStatus::Running); + + let removed = pool.remove_agents_for_story("story_a"); + assert_eq!(removed, 2, "should remove both agents for story_a"); + + let agents = 
pool.list_agents().unwrap();
+        assert_eq!(agents.len(), 1, "only story_b agent should remain");
+        assert_eq!(agents[0].story_id, "story_b");
+    }
+
+    #[test]
+    fn remove_agents_for_story_returns_zero_when_no_match() {
+        let pool = AgentPool::new_test(3001);
+        pool.inject_test_agent("story_a", "coder-1", AgentStatus::Running);
+
+        let removed = pool.remove_agents_for_story("nonexistent");
+        assert_eq!(removed, 0);
+
+        let agents = pool.list_agents().unwrap();
+        assert_eq!(agents.len(), 1, "existing agents should not be affected");
+    }
+
+    // ── archive + cleanup integration test ───────────────────────────────────
+
+    #[tokio::test]
+    async fn archiving_story_removes_agent_entries_from_pool() {
+        use crate::agents::lifecycle::move_story_to_archived;
+        use std::fs;
+
+        let tmp = tempfile::tempdir().unwrap();
+        let root = tmp.path();
+
+        let current = root.join(".story_kit/work/2_current");
+        fs::create_dir_all(&current).unwrap();
+        fs::write(current.join("60_story_cleanup.md"), "test").unwrap();
+
+        let pool = AgentPool::new_test(3001);
+        pool.inject_test_agent("60_story_cleanup", "coder-1", AgentStatus::Completed);
+        pool.inject_test_agent("60_story_cleanup", "qa", AgentStatus::Completed);
+        pool.inject_test_agent("61_story_other", "coder-1", AgentStatus::Running);
+
+        assert_eq!(pool.list_agents().unwrap().len(), 3);
+
+        move_story_to_archived(root, "60_story_cleanup").unwrap();
+        pool.remove_agents_for_story("60_story_cleanup");
+
+        let remaining = pool.list_agents().unwrap();
+        assert_eq!(
+            remaining.len(),
+            1,
+            "only the other story's agent should remain"
+        );
+        assert_eq!(remaining[0].story_id, "61_story_other");
+
+        assert!(
+            root.join(".story_kit/work/5_done/60_story_cleanup.md")
+                .exists()
+        );
+    }
+}
diff --git a/server/src/agents/pool/pipeline.rs b/server/src/agents/pool/pipeline.rs
new file mode 100644
index 0000000..fa77a19
--- /dev/null
+++ b/server/src/agents/pool/pipeline.rs
@@ -0,0 +1,1771 @@
+use crate::config::ProjectConfig;
+use crate::slog;
+use 
crate::slog_error;
+use crate::slog_warn;
+use crate::worktree;
+use crate::io::watcher::WatcherEvent;
+use std::collections::HashMap;
+use std::path::{Path, PathBuf};
+use std::sync::{Arc, Mutex};
+use tokio::sync::broadcast;
+
+use super::super::{
+    AgentEvent, AgentStatus, CompletionReport, PipelineStage,
+    agent_config_stage, pipeline_stage,
+};
+use super::{AgentPool, StoryAgent, composite_key};
+
+impl AgentPool {
+    /// Pipeline advancement: after an agent completes, move the story to
+    /// the next pipeline stage and start the appropriate agent.
+    pub(super) async fn run_pipeline_advance(
+        &self,
+        story_id: &str,
+        agent_name: &str,
+        completion: CompletionReport,
+        project_root: Option<PathBuf>,
+        worktree_path: Option<PathBuf>,
+        merge_failure_reported: bool,
+    ) {
+        let project_root = match project_root {
+            Some(p) => p,
+            None => {
+                slog_warn!("[pipeline] No project_root for '{story_id}:{agent_name}'");
+                return;
+            }
+        };
+
+        let config = ProjectConfig::load(&project_root).unwrap_or_default();
+        let stage = config
+            .find_agent(agent_name)
+            .map(agent_config_stage)
+            .unwrap_or_else(|| pipeline_stage(agent_name));
+
+        match stage {
+            PipelineStage::Other => {
+                // Supervisors and unknown agents do not advance the pipeline.
+            }
+            PipelineStage::Coder => {
+                if completion.gates_passed {
+                    // Determine effective QA mode for this story.
+                    let qa_mode = {
+                        let item_type = super::super::lifecycle::item_type_from_id(story_id);
+                        if item_type == "spike" {
+                            crate::io::story_metadata::QaMode::Human
+                        } else {
+                            let default_qa = config.default_qa_mode();
+                            // Story is in 2_current/ when a coder completes.
+                            let story_path = project_root
+                                .join(".story_kit/work/2_current")
+                                .join(format!("{story_id}.md"));
+                            crate::io::story_metadata::resolve_qa_mode(&story_path, default_qa)
+                        }
+                    };
+
+                    match qa_mode {
+                        crate::io::story_metadata::QaMode::Server => {
+                            slog!(
+                                "[pipeline] Coder '{agent_name}' passed gates for '{story_id}'. \
+                                 qa: server — moving directly to merge."
+ ); + if let Err(e) = + super::super::lifecycle::move_story_to_merge(&project_root, story_id) + { + slog_error!( + "[pipeline] Failed to move '{story_id}' to 4_merge/: {e}" + ); + } else if let Err(e) = self + .start_agent(&project_root, story_id, Some("mergemaster"), None) + .await + { + slog_error!( + "[pipeline] Failed to start mergemaster for '{story_id}': {e}" + ); + } + } + crate::io::story_metadata::QaMode::Agent => { + slog!( + "[pipeline] Coder '{agent_name}' passed gates for '{story_id}'. \ + qa: agent — moving to QA." + ); + if let Err(e) = super::super::lifecycle::move_story_to_qa(&project_root, story_id) { + slog_error!("[pipeline] Failed to move '{story_id}' to 3_qa/: {e}"); + } else if let Err(e) = self + .start_agent(&project_root, story_id, Some("qa"), None) + .await + { + slog_error!("[pipeline] Failed to start qa agent for '{story_id}': {e}"); + } + } + crate::io::story_metadata::QaMode::Human => { + slog!( + "[pipeline] Coder '{agent_name}' passed gates for '{story_id}'. \ + qa: human — holding for human review." + ); + if let Err(e) = super::super::lifecycle::move_story_to_qa(&project_root, story_id) { + slog_error!("[pipeline] Failed to move '{story_id}' to 3_qa/: {e}"); + } else { + let qa_dir = project_root.join(".story_kit/work/3_qa"); + let story_path = qa_dir.join(format!("{story_id}.md")); + if let Err(e) = + crate::io::story_metadata::write_review_hold(&story_path) + { + slog_error!( + "[pipeline] Failed to set review_hold on '{story_id}': {e}" + ); + } + } + } + } + } else { + // Increment retry count and check if blocked. + let story_path = project_root + .join(".story_kit/work/2_current") + .join(format!("{story_id}.md")); + if should_block_story(&story_path, config.max_retries, story_id, "coder") { + // Story has exceeded retry limit — do not restart. + } else { + slog!( + "[pipeline] Coder '{agent_name}' failed gates for '{story_id}'. Restarting." 
+ ); + let context = format!( + "\n\n---\n## Previous Attempt Failed\n\ + The acceptance gates failed with the following output:\n{}\n\n\ + Please review the failures above, fix the issues, and try again.", + completion.gate_output + ); + if let Err(e) = self + .start_agent(&project_root, story_id, Some(agent_name), Some(&context)) + .await + { + slog_error!( + "[pipeline] Failed to restart coder '{agent_name}' for '{story_id}': {e}" + ); + } + } + } + } + PipelineStage::Qa => { + if completion.gates_passed { + // Run coverage gate in the QA worktree before advancing to merge. + let coverage_path = worktree_path + .clone() + .unwrap_or_else(|| project_root.clone()); + let cp = coverage_path.clone(); + let coverage_result = + tokio::task::spawn_blocking(move || super::super::gates::run_coverage_gate(&cp)) + .await + .unwrap_or_else(|e| { + slog_warn!("[pipeline] Coverage gate task panicked: {e}"); + Ok((false, format!("Coverage gate task panicked: {e}"))) + }); + let (coverage_passed, coverage_output) = match coverage_result { + Ok(pair) => pair, + Err(e) => (false, e), + }; + + if coverage_passed { + // Check whether this item needs human review before merging. + let needs_human_review = { + let item_type = super::super::lifecycle::item_type_from_id(story_id); + if item_type == "spike" { + true // Spikes always need human review. + } else { + let qa_dir = project_root.join(".story_kit/work/3_qa"); + let story_path = qa_dir.join(format!("{story_id}.md")); + let default_qa = config.default_qa_mode(); + matches!( + crate::io::story_metadata::resolve_qa_mode(&story_path, default_qa), + crate::io::story_metadata::QaMode::Human + ) + } + }; + + if needs_human_review { + // Hold in 3_qa/ for human review. 
+ let qa_dir = project_root.join(".story_kit/work/3_qa"); + let story_path = qa_dir.join(format!("{story_id}.md")); + if let Err(e) = + crate::io::story_metadata::write_review_hold(&story_path) + { + slog_error!( + "[pipeline] Failed to set review_hold on '{story_id}': {e}" + ); + } + slog!( + "[pipeline] QA passed for '{story_id}'. \ + Holding for human review. \ + Worktree preserved at: {worktree_path:?}" + ); + } else { + slog!( + "[pipeline] QA passed gates and coverage for '{story_id}'. \ + Moving directly to merge." + ); + if let Err(e) = + super::super::lifecycle::move_story_to_merge(&project_root, story_id) + { + slog_error!( + "[pipeline] Failed to move '{story_id}' to 4_merge/: {e}" + ); + } else if let Err(e) = self + .start_agent(&project_root, story_id, Some("mergemaster"), None) + .await + { + slog_error!( + "[pipeline] Failed to start mergemaster for '{story_id}': {e}" + ); + } + } + } else { + let story_path = project_root + .join(".story_kit/work/3_qa") + .join(format!("{story_id}.md")); + if should_block_story(&story_path, config.max_retries, story_id, "qa-coverage") { + // Story has exceeded retry limit — do not restart. + } else { + slog!( + "[pipeline] QA coverage gate failed for '{story_id}'. Restarting QA." + ); + let context = format!( + "\n\n---\n## Coverage Gate Failed\n\ + The coverage gate (script/test_coverage) failed with the following output:\n{}\n\n\ + Please improve test coverage until the coverage gate passes.", + coverage_output + ); + if let Err(e) = self + .start_agent(&project_root, story_id, Some("qa"), Some(&context)) + .await + { + slog_error!("[pipeline] Failed to restart qa for '{story_id}': {e}"); + } + } + } + } else { + let story_path = project_root + .join(".story_kit/work/3_qa") + .join(format!("{story_id}.md")); + if should_block_story(&story_path, config.max_retries, story_id, "qa") { + // Story has exceeded retry limit — do not restart. + } else { + slog!("[pipeline] QA failed gates for '{story_id}'. 
Restarting."); + let context = format!( + "\n\n---\n## Previous QA Attempt Failed\n\ + The acceptance gates failed with the following output:\n{}\n\n\ + Please re-run and fix the issues.", + completion.gate_output + ); + if let Err(e) = self + .start_agent(&project_root, story_id, Some("qa"), Some(&context)) + .await + { + slog_error!("[pipeline] Failed to restart qa for '{story_id}': {e}"); + } + } + } + } + PipelineStage::Mergemaster => { + // Block advancement if the mergemaster explicitly reported a failure. + // The server-owned gate check runs in the feature-branch worktree (not + // master), so `gates_passed=true` is misleading when no code was merged. + if merge_failure_reported { + slog!( + "[pipeline] Pipeline advancement blocked for '{story_id}': \ + mergemaster explicitly reported a merge failure. \ + Story stays in 4_merge/ for human review." + ); + } else { + // Run script/test on master (project_root) as the post-merge verification. + slog!( + "[pipeline] Mergemaster completed for '{story_id}'. Running post-merge tests on master." + ); + let root = project_root.clone(); + let test_result = + tokio::task::spawn_blocking(move || super::super::gates::run_project_tests(&root)) + .await + .unwrap_or_else(|e| { + slog_warn!("[pipeline] Post-merge test task panicked: {e}"); + Ok((false, format!("Test task panicked: {e}"))) + }); + let (passed, output) = match test_result { + Ok(pair) => pair, + Err(e) => (false, e), + }; + + if passed { + slog!( + "[pipeline] Post-merge tests passed for '{story_id}'. Moving to done." + ); + if let Err(e) = + super::super::lifecycle::move_story_to_archived(&project_root, story_id) + { + slog_error!("[pipeline] Failed to move '{story_id}' to done: {e}"); + } + self.remove_agents_for_story(story_id); + // TODO: Re-enable worktree cleanup once we have persistent agent logs. + // Removing worktrees destroys evidence needed to debug empty-commit agents. 
+ // let config = + // crate::config::ProjectConfig::load(&project_root).unwrap_or_default(); + // if let Err(e) = + // worktree::remove_worktree_by_story_id(&project_root, story_id, &config) + // .await + // { + // slog!( + // "[pipeline] Failed to remove worktree for '{story_id}': {e}" + // ); + // } + slog!( + "[pipeline] Story '{story_id}' done. Worktree preserved for inspection." + ); + } else { + let story_path = project_root + .join(".story_kit/work/4_merge") + .join(format!("{story_id}.md")); + if should_block_story(&story_path, config.max_retries, story_id, "mergemaster") { + // Story has exceeded retry limit — do not restart. + } else { + slog!( + "[pipeline] Post-merge tests failed for '{story_id}'. Restarting mergemaster." + ); + let context = format!( + "\n\n---\n## Post-Merge Test Failed\n\ + The tests on master failed with the following output:\n{}\n\n\ + Please investigate and resolve the failures, then call merge_agent_work again.", + output + ); + if let Err(e) = self + .start_agent( + &project_root, + story_id, + Some("mergemaster"), + Some(&context), + ) + .await + { + slog_error!( + "[pipeline] Failed to restart mergemaster for '{story_id}': {e}" + ); + } + } + } + } + } + } + + // Always scan for unassigned work after any agent completes, regardless + // of the outcome (success, failure, restart). This ensures stories that + // failed agent assignment due to busy agents are retried when agents + // become available (bug 295). + self.auto_assign_available_work(&project_root).await; + } + + /// Internal: report that an agent has finished work on a story. + /// + /// **Note:** This is no longer exposed as an MCP tool. The server now + /// automatically runs completion gates when an agent process exits + /// (see `run_server_owned_completion`). This method is retained for + /// backwards compatibility and testing. + /// + /// - Rejects with an error if the worktree has uncommitted changes. 
+ /// - Runs acceptance gates (cargo clippy + cargo nextest run / cargo test). + /// - Stores the `CompletionReport` on the agent record. + /// - Transitions status to `Completed` (gates passed) or `Failed` (gates failed). + /// - Emits a `Done` event so `wait_for_agent` unblocks. + #[allow(dead_code)] + pub async fn report_completion( + &self, + story_id: &str, + agent_name: &str, + summary: &str, + ) -> Result { + let key = composite_key(story_id, agent_name); + + // Verify agent exists, is Running, and grab its worktree path. + let worktree_path = { + let agents = self.agents.lock().map_err(|e| e.to_string())?; + let agent = agents + .get(&key) + .ok_or_else(|| format!("No agent '{agent_name}' for story '{story_id}'"))?; + + if agent.status != AgentStatus::Running { + return Err(format!( + "Agent '{agent_name}' for story '{story_id}' is not running (status: {}). \ + report_completion can only be called by a running agent.", + agent.status + )); + } + + agent + .worktree_info + .as_ref() + .map(|wt| wt.path.clone()) + .ok_or_else(|| { + format!( + "Agent '{agent_name}' for story '{story_id}' has no worktree. \ + Cannot run acceptance gates." + ) + })? + }; + + let path = worktree_path.clone(); + + // Run gate checks in a blocking thread to avoid stalling the async runtime. + let (gates_passed, gate_output) = tokio::task::spawn_blocking(move || { + // Step 1: Reject if worktree is dirty. + super::super::gates::check_uncommitted_changes(&path)?; + // Step 2: Run clippy + tests and return (passed, output). + super::super::gates::run_acceptance_gates(&path) + }) + .await + .map_err(|e| format!("Gate check task panicked: {e}"))??; + + let report = CompletionReport { + summary: summary.to_string(), + gates_passed, + gate_output, + }; + + // Extract data for pipeline advance, then remove the entry so + // completed agents never appear in list_agents. 
+ let ( + tx, + session_id, + project_root_for_advance, + wt_path_for_advance, + merge_failure_reported_for_advance, + ) = { + let mut agents = self.agents.lock().map_err(|e| e.to_string())?; + let agent = agents.get_mut(&key).ok_or_else(|| { + format!("Agent '{agent_name}' for story '{story_id}' disappeared during gate check") + })?; + agent.completion = Some(report.clone()); + let tx = agent.tx.clone(); + let sid = agent.session_id.clone(); + let pr = agent.project_root.clone(); + let wt = agent.worktree_info.as_ref().map(|w| w.path.clone()); + let mfr = agent.merge_failure_reported; + agents.remove(&key); + (tx, sid, pr, wt, mfr) + }; + + // Emit Done so wait_for_agent unblocks. + let _ = tx.send(AgentEvent::Done { + story_id: story_id.to_string(), + agent_name: agent_name.to_string(), + session_id, + }); + + // Notify WebSocket clients that the agent is gone. + Self::notify_agent_state_changed(&self.watcher_tx); + + // Advance the pipeline state machine in a background task. + let pool_clone = Self { + agents: Arc::clone(&self.agents), + port: self.port, + child_killers: Arc::clone(&self.child_killers), + watcher_tx: self.watcher_tx.clone(), + merge_jobs: Arc::clone(&self.merge_jobs), + }; + let sid = story_id.to_string(); + let aname = agent_name.to_string(); + let report_for_advance = report.clone(); + tokio::spawn(async move { + pool_clone + .run_pipeline_advance( + &sid, + &aname, + report_for_advance, + project_root_for_advance, + wt_path_for_advance, + merge_failure_reported_for_advance, + ) + .await; + }); + + Ok(report) + } + + /// Start the merge pipeline as a background task. + /// + /// Returns immediately so the MCP tool call doesn't time out (the full + /// pipeline — squash merge + quality gates — takes well over 60 seconds, + /// exceeding Claude Code's MCP tool-call timeout). + /// + /// The mergemaster agent should poll [`get_merge_status`](Self::get_merge_status) + /// until the job reaches a terminal state. 
+ pub fn start_merge_agent_work( + self: &Arc, + project_root: &Path, + story_id: &str, + ) -> Result<(), String> { + // Guard against double-starts. + { + let jobs = self.merge_jobs.lock().map_err(|e| e.to_string())?; + if let Some(job) = jobs.get(story_id) + && matches!(job.status, super::super::merge::MergeJobStatus::Running) + { + return Err(format!( + "Merge already in progress for '{story_id}'. \ + Use get_merge_status to poll for completion." + )); + } + } + + // Insert Running job. + { + let mut jobs = self.merge_jobs.lock().map_err(|e| e.to_string())?; + jobs.insert( + story_id.to_string(), + super::super::merge::MergeJob { + story_id: story_id.to_string(), + status: super::super::merge::MergeJobStatus::Running, + }, + ); + } + + let pool = Arc::clone(self); + let root = project_root.to_path_buf(); + let sid = story_id.to_string(); + + tokio::spawn(async move { + let report = pool.run_merge_pipeline(&root, &sid).await; + let failed = report.is_err(); + let status = match report { + Ok(r) => super::super::merge::MergeJobStatus::Completed(r), + Err(e) => super::super::merge::MergeJobStatus::Failed(e), + }; + if let Ok(mut jobs) = pool.merge_jobs.lock() + && let Some(job) = jobs.get_mut(&sid) + { + job.status = status; + } + if failed { + pool.auto_assign_available_work(&root).await; + } + }); + + Ok(()) + } + + /// The actual merge pipeline, run inside a background task. 
+ async fn run_merge_pipeline( + self: &Arc, + project_root: &Path, + story_id: &str, + ) -> Result { + let branch = format!("feature/story-{story_id}"); + let wt_path = worktree::worktree_path(project_root, story_id); + let root = project_root.to_path_buf(); + let sid = story_id.to_string(); + let br = branch.clone(); + + let merge_result = + tokio::task::spawn_blocking(move || super::super::merge::run_squash_merge(&root, &br, &sid)) + .await + .map_err(|e| format!("Merge task panicked: {e}"))??; + + if !merge_result.success { + return Ok(super::super::merge::MergeReport { + story_id: story_id.to_string(), + success: false, + had_conflicts: merge_result.had_conflicts, + conflicts_resolved: merge_result.conflicts_resolved, + conflict_details: merge_result.conflict_details, + gates_passed: merge_result.gates_passed, + gate_output: merge_result.output, + worktree_cleaned_up: false, + story_archived: false, + }); + } + + let story_archived = + super::super::lifecycle::move_story_to_archived(project_root, story_id).is_ok(); + if story_archived { + self.remove_agents_for_story(story_id); + } + + let worktree_cleaned_up = if wt_path.exists() { + let config = crate::config::ProjectConfig::load(project_root).unwrap_or_default(); + worktree::remove_worktree_by_story_id(project_root, story_id, &config) + .await + .is_ok() + } else { + false + }; + + self.auto_assign_available_work(project_root).await; + + Ok(super::super::merge::MergeReport { + story_id: story_id.to_string(), + success: true, + had_conflicts: merge_result.had_conflicts, + conflicts_resolved: merge_result.conflicts_resolved, + conflict_details: merge_result.conflict_details, + gates_passed: true, + gate_output: merge_result.output, + worktree_cleaned_up, + story_archived, + }) + } + + /// Check the status of a background merge job. 
+ pub fn get_merge_status(&self, story_id: &str) -> Option { + self.merge_jobs + .lock() + .ok() + .and_then(|jobs| jobs.get(story_id).cloned()) + } + + /// Record that the mergemaster agent for `story_id` explicitly reported a + /// merge failure via the `report_merge_failure` MCP tool. + /// + /// Sets `merge_failure_reported = true` on the active mergemaster agent so + /// that `run_pipeline_advance` can block advancement to `5_done/` even when + /// the server-owned gate check returns `gates_passed=true` (those gates run + /// in the feature-branch worktree, not on master). + pub fn set_merge_failure_reported(&self, story_id: &str) { + match self.agents.lock() { + Ok(mut lock) => { + let found = lock.iter_mut().find(|(key, agent)| { + let key_story_id = key + .rsplit_once(':') + .map(|(sid, _)| sid) + .unwrap_or(key.as_str()); + key_story_id == story_id + && pipeline_stage(&agent.agent_name) == PipelineStage::Mergemaster + }); + match found { + Some((_, agent)) => { + agent.merge_failure_reported = true; + slog!( + "[pipeline] Merge failure flag set for '{story_id}:{}'", + agent.agent_name + ); + } + None => { + slog_warn!( + "[pipeline] set_merge_failure_reported: no running mergemaster found \ + for story '{story_id}' — flag not set" + ); + } + } + } + Err(e) => { + slog_error!("[pipeline] set_merge_failure_reported: could not lock agents: {e}"); + } + } + } +} + +/// Server-owned completion: runs acceptance gates when an agent process exits +/// normally, and advances the pipeline based on results. +/// +/// This is a **free function** (not a method on `AgentPool`) to break the +/// opaque type cycle that would otherwise arise: `start_agent` → spawned task +/// → server-owned completion → pipeline advance → `start_agent`. +/// +/// If the agent already has a completion report (e.g. from a legacy +/// `report_completion` call), this is a no-op to avoid double-running gates. 
+pub(super) async fn run_server_owned_completion( + agents: &Arc>>, + port: u16, + story_id: &str, + agent_name: &str, + session_id: Option, + watcher_tx: broadcast::Sender, +) { + let key = composite_key(story_id, agent_name); + + // Guard: skip if completion was already recorded (legacy path). + { + let lock = match agents.lock() { + Ok(a) => a, + Err(_) => return, + }; + match lock.get(&key) { + Some(agent) if agent.completion.is_some() => { + slog!( + "[agents] Completion already recorded for '{story_id}:{agent_name}'; \ + skipping server-owned gates." + ); + return; + } + Some(_) => {} + None => return, + } + } + + // Get worktree path for running gates. + let worktree_path = { + let lock = match agents.lock() { + Ok(a) => a, + Err(_) => return, + }; + lock.get(&key) + .and_then(|a| a.worktree_info.as_ref().map(|wt| wt.path.clone())) + }; + + // Run acceptance gates. + let (gates_passed, gate_output) = if let Some(wt_path) = worktree_path { + let path = wt_path; + match tokio::task::spawn_blocking(move || { + super::super::gates::check_uncommitted_changes(&path)?; + // AC5: Fail early if the coder finished with no commits on the feature branch. + // This prevents empty-diff stories from advancing through QA to merge. + if !super::super::gates::worktree_has_committed_work(&path) { + return Ok(( + false, + "Agent exited with no commits on the feature branch. \ + The agent did not produce any code changes." 
+ .to_string(), + )); + } + super::super::gates::run_acceptance_gates(&path) + }) + .await + { + Ok(Ok(result)) => result, + Ok(Err(e)) => (false, e), + Err(e) => (false, format!("Gate check task panicked: {e}")), + } + } else { + ( + false, + "No worktree path available to run acceptance gates".to_string(), + ) + }; + + slog!( + "[agents] Server-owned completion for '{story_id}:{agent_name}': gates_passed={gates_passed}" + ); + + let report = CompletionReport { + summary: "Agent process exited normally".to_string(), + gates_passed, + gate_output, + }; + + // Store completion report, extract data for pipeline advance, then + // remove the entry so completed agents never appear in list_agents. + let (tx, project_root_for_advance, wt_path_for_advance, merge_failure_reported_for_advance) = { + let mut lock = match agents.lock() { + Ok(a) => a, + Err(_) => return, + }; + let agent = match lock.get_mut(&key) { + Some(a) => a, + None => return, + }; + agent.completion = Some(report.clone()); + agent.session_id = session_id.clone(); + let tx = agent.tx.clone(); + let pr = agent.project_root.clone(); + let wt = agent.worktree_info.as_ref().map(|w| w.path.clone()); + let mfr = agent.merge_failure_reported; + lock.remove(&key); + (tx, pr, wt, mfr) + }; + + // Emit Done so wait_for_agent unblocks. + let _ = tx.send(AgentEvent::Done { + story_id: story_id.to_string(), + agent_name: agent_name.to_string(), + session_id, + }); + + // Notify WebSocket clients that the agent is gone. + AgentPool::notify_agent_state_changed(&watcher_tx); + + // Advance the pipeline state machine in a background task. + spawn_pipeline_advance( + Arc::clone(agents), + port, + story_id, + agent_name, + report, + project_root_for_advance, + wt_path_for_advance, + watcher_tx, + merge_failure_reported_for_advance, + ); +} + +/// Spawn pipeline advancement as a background task. 
+/// +/// This is a **non-async** function so it does not participate in the opaque +/// type cycle between `start_agent` and `run_server_owned_completion`. +#[allow(clippy::too_many_arguments)] +fn spawn_pipeline_advance( + agents: Arc>>, + port: u16, + story_id: &str, + agent_name: &str, + completion: CompletionReport, + project_root: Option, + worktree_path: Option, + watcher_tx: broadcast::Sender, + merge_failure_reported: bool, +) { + let sid = story_id.to_string(); + let aname = agent_name.to_string(); + tokio::spawn(async move { + let pool = AgentPool { + agents, + port, + child_killers: Arc::new(Mutex::new(HashMap::new())), + watcher_tx, + merge_jobs: Arc::new(Mutex::new(HashMap::new())), + }; + pool.run_pipeline_advance( + &sid, + &aname, + completion, + project_root, + worktree_path, + merge_failure_reported, + ) + .await; + }); +} + +/// Increment retry_count and block the story if it exceeds `max_retries`. +/// +/// Returns `true` if the story is now blocked (caller should NOT restart the agent). +/// Returns `false` if the story may be retried. +/// When `max_retries` is 0, retry limits are disabled. +fn should_block_story(story_path: &Path, max_retries: u32, story_id: &str, stage_label: &str) -> bool { + use crate::io::story_metadata::{increment_retry_count, write_blocked}; + + if max_retries == 0 { + // Retry limits disabled. + return false; + } + + match increment_retry_count(story_path) { + Ok(new_count) => { + if new_count >= max_retries { + slog_warn!( + "[pipeline] Story '{story_id}' reached retry limit ({new_count}/{max_retries}) \ + at {stage_label} stage. Marking as blocked." + ); + if let Err(e) = write_blocked(story_path) { + slog_error!("[pipeline] Failed to write blocked flag for '{story_id}': {e}"); + } + true + } else { + slog!( + "[pipeline] Story '{story_id}' retry {new_count}/{max_retries} at {stage_label} stage." 
+ ); + false + } + } + Err(e) => { + slog_error!("[pipeline] Failed to increment retry_count for '{story_id}': {e}"); + false // Don't block on error — allow retry. + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use super::super::AgentPool; + use crate::agents::merge::{MergeJob, MergeJobStatus}; + use crate::agents::{AgentEvent, AgentStatus, CompletionReport}; + use crate::io::watcher::WatcherEvent; + use std::path::PathBuf; + use std::process::Command; + + fn init_git_repo(repo: &std::path::Path) { + Command::new("git") + .args(["init"]) + .current_dir(repo) + .output() + .unwrap(); + Command::new("git") + .args(["config", "user.email", "test@test.com"]) + .current_dir(repo) + .output() + .unwrap(); + Command::new("git") + .args(["config", "user.name", "Test"]) + .current_dir(repo) + .output() + .unwrap(); + Command::new("git") + .args(["commit", "--allow-empty", "-m", "init"]) + .current_dir(repo) + .output() + .unwrap(); + } + + // ── report_completion tests ──────────────────────────────────── + + #[tokio::test] + async fn report_completion_rejects_nonexistent_agent() { + let pool = AgentPool::new_test(3001); + let result = pool.report_completion("no_story", "no_bot", "done").await; + assert!(result.is_err()); + let msg = result.unwrap_err(); + assert!(msg.contains("No agent"), "unexpected: {msg}"); + } + + #[tokio::test] + async fn report_completion_rejects_non_running_agent() { + let pool = AgentPool::new_test(3001); + pool.inject_test_agent("s6", "bot", AgentStatus::Completed); + + let result = pool.report_completion("s6", "bot", "done").await; + assert!(result.is_err()); + let msg = result.unwrap_err(); + assert!( + msg.contains("not running"), + "expected 'not running' in: {msg}" + ); + } + + #[tokio::test] + async fn report_completion_rejects_dirty_worktree() { + use std::fs; + use tempfile::tempdir; + + let tmp = tempdir().unwrap(); + let repo = tmp.path(); + + // Init a real git repo and make an initial commit + Command::new("git") + 
.args(["init"]) + .current_dir(repo) + .output() + .unwrap(); + Command::new("git") + .args(["commit", "--allow-empty", "-m", "init"]) + .current_dir(repo) + .output() + .unwrap(); + + // Write an uncommitted file + fs::write(repo.join("dirty.txt"), "not committed").unwrap(); + + let pool = AgentPool::new_test(3001); + pool.inject_test_agent_with_path("s7", "bot", AgentStatus::Running, repo.to_path_buf()); + + let result = pool.report_completion("s7", "bot", "done").await; + assert!(result.is_err()); + let msg = result.unwrap_err(); + assert!( + msg.contains("uncommitted"), + "expected 'uncommitted' in: {msg}" + ); + } + + // ── server-owned completion tests ─────────────────────────────────────────── + + #[tokio::test] + async fn server_owned_completion_skips_when_already_completed() { + let pool = AgentPool::new_test(3001); + let report = CompletionReport { + summary: "Already done".to_string(), + gates_passed: true, + gate_output: String::new(), + }; + pool.inject_test_agent_with_completion( + "s10", + "coder-1", + AgentStatus::Completed, + PathBuf::from("/tmp/nonexistent"), + report, + ); + + // Subscribe before calling so we can check if Done event was emitted. + let mut rx = pool.subscribe("s10", "coder-1").unwrap(); + + run_server_owned_completion( + &pool.agents, + pool.port, + "s10", + "coder-1", + Some("sess-1".to_string()), + pool.watcher_tx.clone(), + ) + .await; + + // Status should remain Completed (unchanged) — no gate re-run. + let agents = pool.agents.lock().unwrap(); + let key = composite_key("s10", "coder-1"); + let agent = agents.get(&key).unwrap(); + assert_eq!(agent.status, AgentStatus::Completed); + // Summary should still be the original, not overwritten. + assert_eq!(agent.completion.as_ref().unwrap().summary, "Already done"); + drop(agents); + + // No Done event should have been emitted. 
+ assert!( + rx.try_recv().is_err(), + "should not emit Done when completion already exists" + ); + } + + #[tokio::test] + async fn server_owned_completion_runs_gates_on_clean_worktree() { + use tempfile::tempdir; + + let tmp = tempdir().unwrap(); + let repo = tmp.path(); + init_git_repo(repo); + + let pool = AgentPool::new_test(3001); + pool.inject_test_agent_with_path( + "s11", + "coder-1", + AgentStatus::Running, + repo.to_path_buf(), + ); + + let mut rx = pool.subscribe("s11", "coder-1").unwrap(); + + run_server_owned_completion( + &pool.agents, + pool.port, + "s11", + "coder-1", + Some("sess-2".to_string()), + pool.watcher_tx.clone(), + ) + .await; + + // Agent entry should be removed from the map after completion. + let agents = pool.agents.lock().unwrap(); + let key = composite_key("s11", "coder-1"); + assert!( + agents.get(&key).is_none(), + "agent should be removed from map after completion" + ); + drop(agents); + + // A Done event should have been emitted with the session_id. + let event = rx.try_recv().expect("should emit Done event"); + match &event { + AgentEvent::Done { session_id, .. } => { + assert_eq!(*session_id, Some("sess-2".to_string())); + } + other => panic!("expected Done event, got: {other:?}"), + } + } + + #[tokio::test] + async fn server_owned_completion_fails_on_dirty_worktree() { + use std::fs; + use tempfile::tempdir; + + let tmp = tempdir().unwrap(); + let repo = tmp.path(); + init_git_repo(repo); + // Create an uncommitted file. + fs::write(repo.join("dirty.txt"), "not committed").unwrap(); + + let pool = AgentPool::new_test(3001); + pool.inject_test_agent_with_path( + "s12", + "coder-1", + AgentStatus::Running, + repo.to_path_buf(), + ); + + let mut rx = pool.subscribe("s12", "coder-1").unwrap(); + + run_server_owned_completion( + &pool.agents, + pool.port, + "s12", + "coder-1", + None, + pool.watcher_tx.clone(), + ) + .await; + + // Agent entry should be removed from the map after completion (even on failure). 
+ let agents = pool.agents.lock().unwrap(); + let key = composite_key("s12", "coder-1"); + assert!( + agents.get(&key).is_none(), + "agent should be removed from map after failed completion" + ); + drop(agents); + + // A Done event should have been emitted. + let event = rx.try_recv().expect("should emit Done event"); + assert!( + matches!(event, AgentEvent::Done { .. }), + "expected Done event, got: {event:?}" + ); + } + + #[tokio::test] + async fn server_owned_completion_nonexistent_agent_is_noop() { + let pool = AgentPool::new_test(3001); + // Should not panic or error — just silently return. + run_server_owned_completion( + &pool.agents, + pool.port, + "nonexistent", + "bot", + None, + pool.watcher_tx.clone(), + ) + .await; + } + + // ── pipeline advance tests ──────────────────────────────────────────────── + + #[tokio::test] + async fn pipeline_advance_coder_gates_pass_server_qa_moves_to_merge() { + use std::fs; + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + + // Set up story in 2_current/ (no qa frontmatter → uses project default "server") + let current = root.join(".story_kit/work/2_current"); + fs::create_dir_all(¤t).unwrap(); + fs::write(current.join("50_story_test.md"), "test").unwrap(); + + let pool = AgentPool::new_test(3001); + pool.run_pipeline_advance( + "50_story_test", + "coder-1", + CompletionReport { + summary: "done".to_string(), + gates_passed: true, + gate_output: String::new(), + }, + Some(root.to_path_buf()), + None, + false, + ) + .await; + + // With default qa: server, story skips QA and goes straight to 4_merge/ + assert!( + root.join(".story_kit/work/4_merge/50_story_test.md") + .exists(), + "story should be in 4_merge/" + ); + assert!( + !current.join("50_story_test.md").exists(), + "story should not still be in 2_current/" + ); + } + + #[tokio::test] + async fn pipeline_advance_coder_gates_pass_agent_qa_moves_to_qa() { + use std::fs; + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + + // Set up 
story in 2_current/ with qa: agent frontmatter + let current = root.join(".story_kit/work/2_current"); + fs::create_dir_all(¤t).unwrap(); + fs::write( + current.join("50_story_test.md"), + "---\nname: Test\nqa: agent\n---\ntest", + ) + .unwrap(); + + let pool = AgentPool::new_test(3001); + pool.run_pipeline_advance( + "50_story_test", + "coder-1", + CompletionReport { + summary: "done".to_string(), + gates_passed: true, + gate_output: String::new(), + }, + Some(root.to_path_buf()), + None, + false, + ) + .await; + + // With qa: agent, story should move to 3_qa/ + assert!( + root.join(".story_kit/work/3_qa/50_story_test.md").exists(), + "story should be in 3_qa/" + ); + assert!( + !current.join("50_story_test.md").exists(), + "story should not still be in 2_current/" + ); + } + + #[tokio::test] + async fn pipeline_advance_qa_gates_pass_moves_story_to_merge() { + use std::fs; + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + + // Set up story in 3_qa/ + let qa_dir = root.join(".story_kit/work/3_qa"); + fs::create_dir_all(&qa_dir).unwrap(); + // qa: server so the story skips human review and goes straight to merge. 
+ fs::write( + qa_dir.join("51_story_test.md"), + "---\nname: Test\nqa: server\n---\ntest", + ) + .unwrap(); + + let pool = AgentPool::new_test(3001); + pool.run_pipeline_advance( + "51_story_test", + "qa", + CompletionReport { + summary: "QA done".to_string(), + gates_passed: true, + gate_output: String::new(), + }, + Some(root.to_path_buf()), + None, + false, + ) + .await; + + // Story should have moved to 4_merge/ + assert!( + root.join(".story_kit/work/4_merge/51_story_test.md") + .exists(), + "story should be in 4_merge/" + ); + assert!( + !qa_dir.join("51_story_test.md").exists(), + "story should not still be in 3_qa/" + ); + } + + #[tokio::test] + async fn pipeline_advance_supervisor_does_not_advance() { + use std::fs; + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + + let current = root.join(".story_kit/work/2_current"); + fs::create_dir_all(¤t).unwrap(); + fs::write(current.join("52_story_test.md"), "test").unwrap(); + + let pool = AgentPool::new_test(3001); + pool.run_pipeline_advance( + "52_story_test", + "supervisor", + CompletionReport { + summary: "supervised".to_string(), + gates_passed: true, + gate_output: String::new(), + }, + Some(root.to_path_buf()), + None, + false, + ) + .await; + + // Story should NOT have moved (supervisors don't advance pipeline) + assert!( + current.join("52_story_test.md").exists(), + "story should still be in 2_current/ for supervisor" + ); + } + + #[tokio::test] + async fn pipeline_advance_sends_agent_state_changed_to_watcher_tx() { + use std::fs; + + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + + // Set up story in 2_current/ + let current = root.join(".story_kit/work/2_current"); + fs::create_dir_all(¤t).unwrap(); + fs::write(current.join("173_story_test.md"), "test").unwrap(); + // Ensure 3_qa/ exists for the move target + fs::create_dir_all(root.join(".story_kit/work/3_qa")).unwrap(); + // Ensure 1_backlog/ exists (start_agent calls move_story_to_current) + 
fs::create_dir_all(root.join(".story_kit/work/1_backlog")).unwrap(); + + // Write a project.toml with a qa agent so start_agent can resolve it. + fs::create_dir_all(root.join(".story_kit")).unwrap(); + fs::write( + root.join(".story_kit/project.toml"), + r#" +default_qa = "agent" + +[[agent]] +name = "coder-1" +role = "Coder" +command = "echo" +args = ["noop"] +prompt = "test" +stage = "coder" + +[[agent]] +name = "qa" +role = "QA" +command = "echo" +args = ["noop"] +prompt = "test" +stage = "qa" +"#, + ) + .unwrap(); + + let pool = AgentPool::new_test(3001); + // Subscribe to the watcher channel BEFORE the pipeline advance. + let mut rx = pool.watcher_tx.subscribe(); + + pool.run_pipeline_advance( + "173_story_test", + "coder-1", + CompletionReport { + summary: "done".to_string(), + gates_passed: true, + gate_output: String::new(), + }, + Some(root.to_path_buf()), + None, + false, + ) + .await; + + // The pipeline advance should have sent AgentStateChanged events via + // the pool's watcher_tx (not a dummy channel). Collect all events. + let mut got_agent_state_changed = false; + while let Ok(evt) = rx.try_recv() { + if matches!(evt, WatcherEvent::AgentStateChanged) { + got_agent_state_changed = true; + break; + } + } + + assert!( + got_agent_state_changed, + "pipeline advance should send AgentStateChanged through the real watcher_tx \ + (bug 173: lozenges must update when agents are assigned during pipeline advance)" + ); + } + + // ── merge_agent_work tests ──────────────────────────────────────────────── + + /// Helper: start a merge and poll until terminal state. 
    async fn run_merge_to_completion(
        pool: &Arc<AgentPool>,
fs::write(&story_file, "---\nname: Test\n---\n").unwrap(); + Command::new("git") + .args(["add", "."]) + .current_dir(repo) + .output() + .unwrap(); + Command::new("git") + .args(["commit", "-m", "add story in merge"]) + .current_dir(repo) + .output() + .unwrap(); + + let pool = Arc::new(AgentPool::new_test(3001)); + let job = run_merge_to_completion(&pool, repo, "23_test").await; + + match &job.status { + MergeJobStatus::Completed(report) => { + assert!(!report.had_conflicts, "should have no conflicts"); + assert!( + report.success + || report.gate_output.contains("Failed to run") + || !report.gates_passed, + "report should be coherent: {report:?}" + ); + if report.story_archived { + let done = repo.join(".story_kit/work/5_done/23_test.md"); + assert!(done.exists(), "done file should exist"); + } + } + MergeJobStatus::Failed(e) => { + // Gate failures are acceptable in test env + assert!( + e.contains("Failed") || e.contains("failed"), + "unexpected failure: {e}" + ); + } + MergeJobStatus::Running => panic!("should not still be running"), + } + } + + // ── quality gate ordering test ──────────────────────────────── + + /// Regression test for bug 142: quality gates must run BEFORE the fast-forward + /// to master so that broken code never lands on master. + #[cfg(unix)] + #[test] + fn quality_gates_run_before_fast_forward_to_master() { + use std::fs; + use std::os::unix::fs::PermissionsExt; + use tempfile::tempdir; + + let tmp = tempdir().unwrap(); + let repo = tmp.path(); + init_git_repo(repo); + + // Add a failing script/test so quality gates will fail. 
+ let script_dir = repo.join("script"); + fs::create_dir_all(&script_dir).unwrap(); + let script_test = script_dir.join("test"); + fs::write(&script_test, "#!/usr/bin/env bash\nexit 1\n").unwrap(); + let mut perms = fs::metadata(&script_test).unwrap().permissions(); + perms.set_mode(0o755); + fs::set_permissions(&script_test, perms).unwrap(); + Command::new("git") + .args(["add", "."]) + .current_dir(repo) + .output() + .unwrap(); + Command::new("git") + .args(["commit", "-m", "add failing script/test"]) + .current_dir(repo) + .output() + .unwrap(); + + // Create a feature branch with a commit. + Command::new("git") + .args(["checkout", "-b", "feature/story-142_test"]) + .current_dir(repo) + .output() + .unwrap(); + fs::write(repo.join("change.txt"), "feature change").unwrap(); + Command::new("git") + .args(["add", "."]) + .current_dir(repo) + .output() + .unwrap(); + Command::new("git") + .args(["commit", "-m", "feature work"]) + .current_dir(repo) + .output() + .unwrap(); + + // Switch back to master and record its HEAD. + Command::new("git") + .args(["checkout", "master"]) + .current_dir(repo) + .output() + .unwrap(); + let head_before = String::from_utf8( + Command::new("git") + .args(["rev-parse", "HEAD"]) + .current_dir(repo) + .output() + .unwrap() + .stdout, + ) + .unwrap() + .trim() + .to_string(); + + // Run the squash-merge. The failing script/test makes quality gates + // fail → fast-forward must NOT happen. + let result = + crate::agents::merge::run_squash_merge(repo, "feature/story-142_test", "142_test") + .unwrap(); + + let head_after = String::from_utf8( + Command::new("git") + .args(["rev-parse", "HEAD"]) + .current_dir(repo) + .output() + .unwrap() + .stdout, + ) + .unwrap() + .trim() + .to_string(); + + // Gates must have failed (script/test exits 1) so master should be untouched. 
+ assert!( + !result.success, + "run_squash_merge must report failure when gates fail" + ); + assert_eq!( + head_before, head_after, + "master HEAD must not advance when quality gates fail (bug 142)" + ); + } + + #[tokio::test] + async fn merge_agent_work_conflict_does_not_break_master() { + use std::fs; + use tempfile::tempdir; + + let tmp = tempdir().unwrap(); + let repo = tmp.path(); + init_git_repo(repo); + + // Create a file on master. + fs::write( + repo.join("code.rs"), + "fn main() {\n println!(\"hello\");\n}\n", + ) + .unwrap(); + Command::new("git") + .args(["add", "."]) + .current_dir(repo) + .output() + .unwrap(); + Command::new("git") + .args(["commit", "-m", "initial code"]) + .current_dir(repo) + .output() + .unwrap(); + + // Feature branch: modify the same line differently. + Command::new("git") + .args(["checkout", "-b", "feature/story-42_story_foo"]) + .current_dir(repo) + .output() + .unwrap(); + fs::write( + repo.join("code.rs"), + "fn main() {\n println!(\"hello\");\n feature_fn();\n}\n", + ) + .unwrap(); + Command::new("git") + .args(["add", "."]) + .current_dir(repo) + .output() + .unwrap(); + Command::new("git") + .args(["commit", "-m", "feature: add fn call"]) + .current_dir(repo) + .output() + .unwrap(); + + // Master: add different line at same location. + Command::new("git") + .args(["checkout", "master"]) + .current_dir(repo) + .output() + .unwrap(); + fs::write( + repo.join("code.rs"), + "fn main() {\n println!(\"hello\");\n master_fn();\n}\n", + ) + .unwrap(); + Command::new("git") + .args(["add", "."]) + .current_dir(repo) + .output() + .unwrap(); + Command::new("git") + .args(["commit", "-m", "master: add fn call"]) + .current_dir(repo) + .output() + .unwrap(); + + // Create story file in 4_merge. 
+ let merge_dir = repo.join(".story_kit/work/4_merge"); + fs::create_dir_all(&merge_dir).unwrap(); + fs::write(merge_dir.join("42_story_foo.md"), "---\nname: Test\n---\n").unwrap(); + Command::new("git") + .args(["add", "."]) + .current_dir(repo) + .output() + .unwrap(); + Command::new("git") + .args(["commit", "-m", "add story"]) + .current_dir(repo) + .output() + .unwrap(); + + let pool = Arc::new(AgentPool::new_test(3001)); + let job = run_merge_to_completion(&pool, repo, "42_story_foo").await; + + // Master should NEVER have conflict markers, regardless of merge outcome. + let master_code = fs::read_to_string(repo.join("code.rs")).unwrap(); + assert!( + !master_code.contains("<<<<<<<"), + "master must never contain conflict markers:\n{master_code}" + ); + assert!( + !master_code.contains(">>>>>>>"), + "master must never contain conflict markers:\n{master_code}" + ); + + // The report should accurately reflect what happened. + match &job.status { + MergeJobStatus::Completed(report) => { + assert!(report.had_conflicts, "should report conflicts"); + } + MergeJobStatus::Failed(_) => { + // Acceptable — merge aborted due to conflicts + } + MergeJobStatus::Running => panic!("should not still be running"), + } + } + + // ── bug 295: pipeline advance picks up waiting QA stories ────────── + + #[tokio::test] + async fn pipeline_advance_picks_up_waiting_qa_stories_after_completion() { + use std::fs; + use super::super::auto_assign::is_agent_free; + + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + + let sk = root.join(".story_kit"); + let qa_dir = sk.join("work/3_qa"); + fs::create_dir_all(&qa_dir).unwrap(); + + // Configure a single QA agent. + fs::write( + sk.join("project.toml"), + r#" +[[agent]] +name = "qa" +stage = "qa" +"#, + ) + .unwrap(); + + // Story 292 is in QA with QA agent running (will "complete" via + // run_pipeline_advance below). Story 293 is in QA with NO agent — + // simulating the "stuck" state from bug 295. 
                .map(|(k, a)| format!("{k}: {} ({})", a.agent_name, a.status))
                .collect::<Vec<_>>()