// Source file: storkit/server/src/agents/pool.rs
// (Scraped file-viewer chrome from the original paste — "Files", "5223 lines",
// "198 KiB", "Rust", "Raw Normal View History" — preserved here as a comment
// so the file remains valid Rust.)

use crate::agent_log::AgentLogWriter;
use crate::config::ProjectConfig;
use crate::io::watcher::WatcherEvent;
use crate::slog;
use crate::slog_error;
use crate::slog_warn;
use crate::worktree::{self, WorktreeInfo};
use portable_pty::ChildKiller;
use std::collections::HashMap;
use std::path::{Path, PathBuf};
use std::sync::{Arc, Mutex};
use tokio::sync::broadcast;
use super::{
AgentEvent, AgentInfo, AgentStatus, CompletionReport, PipelineStage, ReconciliationEvent,
agent_config_stage, pipeline_stage,
};
/// Build the composite key used to track agents in the pool.
///
/// Keys have the shape `"{story_id}:{agent_name}"`.
fn composite_key(story_id: &str, agent_name: &str) -> String {
    let mut key = String::with_capacity(story_id.len() + agent_name.len() + 1);
    key.push_str(story_id);
    key.push(':');
    key.push_str(agent_name);
    key
}
/// RAII guard that removes a pending agent entry from the pool on drop.
///
/// Created after inserting a `Pending` entry into the agent HashMap.
/// If `start_agent` succeeds (the agent process is spawned and status
/// transitions to `Running`), call [`disarm`](Self::disarm) to prevent
/// cleanup. If any intermediate step fails and the guard is dropped
/// without being disarmed, the pending entry is removed so it cannot
/// block future auto-assign dispatches.
struct PendingGuard {
    /// Shared handle to the pool's agent map that the guard cleans up.
    agents: Arc<Mutex<HashMap<String, StoryAgent>>>,
    /// Composite `"{story_id}:{agent_name}"` key of the Pending entry.
    key: String,
    /// While `true`, dropping the guard removes the entry; `disarm`
    /// flips this to `false` after a successful spawn.
    armed: bool,
}
impl PendingGuard {
fn new(agents: Arc<Mutex<HashMap<String, StoryAgent>>>, key: String) -> Self {
Self {
agents,
key,
armed: true,
}
}
/// Prevent the guard from cleaning up the entry (call after
/// successful spawn).
fn disarm(&mut self) {
self.armed = false;
}
}
impl Drop for PendingGuard {
    /// Remove the tracked entry if it is still `Pending` and the guard was
    /// never disarmed (i.e. the spawn path failed part-way through).
    fn drop(&mut self) {
        if !self.armed {
            return;
        }
        // A poisoned lock means another thread panicked; skip cleanup,
        // matching the original silent behavior.
        let Ok(mut agents) = self.agents.lock() else {
            return;
        };
        let still_pending = agents
            .get(&self.key)
            .is_some_and(|a| a.status == AgentStatus::Pending);
        if still_pending {
            agents.remove(&self.key);
            slog!(
                "[agents] Cleaned up leaked Pending entry for '{}'",
                self.key
            );
        }
    }
}
/// Per-(story, agent) bookkeeping entry stored in the pool's agent map.
struct StoryAgent {
    /// Name of the configured agent working this story.
    agent_name: String,
    /// Current lifecycle status (Pending/Running/Completed/Failed, ...).
    status: AgentStatus,
    /// Worktree the agent runs in; `None` until background creation completes.
    worktree_info: Option<WorktreeInfo>,
    /// Session identifier, if one was reported — presumably from the PTY
    /// run (see `pty_result.session_id` in `start_agent`); TODO confirm.
    session_id: Option<String>,
    /// Broadcast sender for streaming `AgentEvent`s to subscribers.
    tx: broadcast::Sender<AgentEvent>,
    /// Handle of the background task driving worktree creation and the
    /// agent process; taken (and aborted) by `stop_agent`.
    task_handle: Option<tokio::task::JoinHandle<()>>,
    /// Accumulated events for polling via get_agent_output.
    event_log: Arc<Mutex<Vec<AgentEvent>>>,
    /// Set when the agent calls report_completion.
    completion: Option<CompletionReport>,
    /// Project root, stored for pipeline advancement after completion.
    project_root: Option<PathBuf>,
    /// UUID identifying the log file for this session.
    log_session_id: Option<String>,
    /// Set to `true` when the agent calls `report_merge_failure`.
    /// Prevents the pipeline from blindly advancing to `5_done/` after a
    /// failed merge: the server-owned gate check runs in the feature-branch
    /// worktree (which compiles fine) and returns `gates_passed=true` even
    /// though the code was never squash-merged onto master.
    merge_failure_reported: bool,
}
/// Build an `AgentInfo` snapshot from a `StoryAgent` map entry.
fn agent_info_from_entry(story_id: &str, agent: &StoryAgent) -> AgentInfo {
    // Borrow the worktree info once; both derived fields come from it.
    let wt = agent.worktree_info.as_ref();
    AgentInfo {
        story_id: story_id.to_owned(),
        agent_name: agent.agent_name.clone(),
        status: agent.status.clone(),
        session_id: agent.session_id.clone(),
        worktree_path: wt.map(|info| info.path.to_string_lossy().into_owned()),
        base_branch: wt.map(|info| info.base_branch.clone()),
        completion: agent.completion.clone(),
        log_session_id: agent.log_session_id.clone(),
    }
}
/// Manages concurrent story agents, each in its own worktree.
pub struct AgentPool {
    /// Active agents keyed by composite `"{story_id}:{agent_name}"`.
    agents: Arc<Mutex<HashMap<String, StoryAgent>>>,
    /// Server port, forwarded into worktree creation so each worktree's
    /// `.mcp.json` points back at this server (see `create_worktree`).
    port: u16,
    /// Registry of active PTY child process killers, keyed by "{story_id}:{agent_name}".
    /// Used to terminate child processes on server shutdown or agent stop, preventing
    /// orphaned Claude Code processes from running after the server exits.
    child_killers: Arc<Mutex<HashMap<String, Box<dyn ChildKiller + Send + Sync>>>>,
    /// Broadcast channel for notifying WebSocket clients of agent state changes.
    /// When an agent transitions state (Pending, Running, Completed, Failed, Stopped),
    /// an `AgentStateChanged` event is emitted so the frontend can refresh the
    /// pipeline board without waiting for a filesystem event.
    watcher_tx: broadcast::Sender<WatcherEvent>,
    /// Tracks background merge jobs started by `merge_agent_work`, keyed by story_id.
    /// The MCP tool returns immediately and the mergemaster agent polls
    /// `get_merge_status` until the job reaches a terminal state.
    merge_jobs: Arc<Mutex<HashMap<String, super::merge::MergeJob>>>,
}
impl AgentPool {
/// Construct a pool with empty agent, killer, and merge-job registries.
pub fn new(port: u16, watcher_tx: broadcast::Sender<WatcherEvent>) -> Self {
    let agents = Arc::new(Mutex::new(HashMap::new()));
    let child_killers = Arc::new(Mutex::new(HashMap::new()));
    let merge_jobs = Arc::new(Mutex::new(HashMap::new()));
    Self {
        agents,
        port,
        child_killers,
        watcher_tx,
        merge_jobs,
    }
}
/// Create a pool with a dummy watcher channel for unit tests.
#[cfg(test)]
pub fn new_test(port: u16) -> Self {
    // The receiver half is dropped immediately; sends are simply ignored.
    let (watcher_tx, _rx) = broadcast::channel(16);
    Self::new(port, watcher_tx)
}
/// Notify WebSocket clients that agent state has changed, so the pipeline
/// board and agent panel can refresh.
///
/// Send errors are deliberately ignored (e.g. when no client is connected).
fn notify_agent_state_changed(watcher_tx: &broadcast::Sender<WatcherEvent>) {
    watcher_tx.send(WatcherEvent::AgentStateChanged).ok();
}
/// Kill all active PTY child processes.
///
/// Called on server shutdown to prevent orphaned Claude Code processes from
/// continuing to run after the server exits. Each registered killer is called
/// once, then the registry is left empty.
pub fn kill_all_children(&self) {
    // On a poisoned lock there is nothing safe to do; match the original
    // silent skip.
    let Ok(mut killers) = self.child_killers.lock() else {
        return;
    };
    // Draining empties the registry as each killer fires.
    for (key, mut killer) in killers.drain() {
        slog!("[agents] Killing child process for {key} on shutdown");
        let _ = killer.kill();
    }
}
/// Kill and deregister the child process for a specific agent key.
///
/// Used by `stop_agent` to ensure the PTY child is terminated even though
/// aborting a `spawn_blocking` task handle does not interrupt the blocking thread.
fn kill_child_for_key(&self, key: &str) {
    if let Ok(mut killers) = self.child_killers.lock() {
        if let Some(mut killer) = killers.remove(key) {
            slog!("[agents] Killing child process for {key} on stop");
            let _ = killer.kill();
        }
    }
}
/// Start an agent for a story: load config, create worktree, spawn agent.
///
/// When `agent_name` is `None`, automatically selects the first idle coder
/// agent (story 190). If all coders are busy the call fails with an error
/// indicating the story will be picked up when one becomes available.
///
/// If `resume_context` is provided, it is appended to the rendered prompt
/// so the agent can pick up from a previous failed attempt.
///
/// Returns a `Pending` snapshot immediately; worktree creation and the
/// agent process run in a background task (story 157).
pub async fn start_agent(
    &self,
    project_root: &Path,
    story_id: &str,
    agent_name: Option<&str>,
    resume_context: Option<&str>,
) -> Result<AgentInfo, String> {
    let config = ProjectConfig::load(project_root)?;
    // Validate explicit agent name early (no lock needed).
    if let Some(name) = agent_name {
        config
            .find_agent(name)
            .ok_or_else(|| format!("No agent named '{name}' in config"))?;
    }
    // Create name-independent shared resources before the lock so they are
    // ready for the atomic check-and-insert (story 132).
    let (tx, _) = broadcast::channel::<AgentEvent>(1024);
    let event_log: Arc<Mutex<Vec<AgentEvent>>> = Arc::new(Mutex::new(Vec::new()));
    let log_session_id = uuid::Uuid::new_v4().to_string();
    // Move story from backlog/ to current/ before checking agent
    // availability so that auto_assign_available_work can pick it up even
    // when all coders are currently busy (story 203). This is idempotent:
    // if the story is already in 2_current/ or a later stage, the call is
    // a no-op.
    super::lifecycle::move_story_to_current(project_root, story_id)?;
    // Atomically resolve agent name, check availability, and register as
    // Pending. When `agent_name` is `None` the first idle coder is
    // selected inside the lock so no TOCTOU race can occur between the
    // availability check and the Pending insert (story 132, story 190).
    //
    // The `PendingGuard` ensures that if any step below fails the entry is
    // removed from the pool so it does not permanently block auto-assign
    // (bug 118).
    let resolved_name: String;
    let key: String;
    {
        let mut agents = self.agents.lock().map_err(|e| e.to_string())?;
        resolved_name = match agent_name {
            Some(name) => name.to_string(),
            None => find_free_agent_for_stage(&config, &agents, &PipelineStage::Coder)
                .map(|s| s.to_string())
                .ok_or_else(|| {
                    if config
                        .agent
                        .iter()
                        .any(|a| agent_config_stage(a) == PipelineStage::Coder)
                    {
                        format!(
                            "All coder agents are busy; story '{story_id}' has been \
                             queued in work/2_current/ and will be auto-assigned when \
                             one becomes available"
                        )
                    } else {
                        "No coder agent configured. Specify an agent_name explicitly."
                            .to_string()
                    }
                })?,
        };
        key = composite_key(story_id, &resolved_name);
        // Check for duplicate assignment (same story + same agent already active).
        if let Some(agent) = agents.get(&key)
            && (agent.status == AgentStatus::Running || agent.status == AgentStatus::Pending)
        {
            return Err(format!(
                "Agent '{resolved_name}' for story '{story_id}' is already {}",
                agent.status
            ));
        }
        // Enforce single-stage concurrency: reject if there is already a
        // Running/Pending agent at the same pipeline stage for this story.
        // This prevents two coders (or two QA/mergemaster agents) from
        // corrupting each other's work in the same worktree.
        // Applies to both explicit and auto-selected agents; the Other
        // stage (supervisors, unknown agents) is exempt.
        let resolved_stage = config
            .find_agent(&resolved_name)
            .map(agent_config_stage)
            .unwrap_or_else(|| pipeline_stage(&resolved_name));
        if resolved_stage != PipelineStage::Other
            && let Some(conflicting_name) = agents.iter().find_map(|(k, a)| {
                let k_story = k.rsplit_once(':').map(|(s, _)| s).unwrap_or(k);
                if k_story == story_id
                    && a.agent_name != resolved_name
                    && matches!(a.status, AgentStatus::Running | AgentStatus::Pending)
                {
                    let a_stage = config
                        .find_agent(&a.agent_name)
                        .map(agent_config_stage)
                        .unwrap_or_else(|| pipeline_stage(&a.agent_name));
                    if a_stage == resolved_stage {
                        Some(a.agent_name.clone())
                    } else {
                        None
                    }
                } else {
                    None
                }
            })
        {
            return Err(format!(
                "Cannot start '{resolved_name}' on story '{story_id}': \
                 '{conflicting_name}' is already active at the same pipeline stage"
            ));
        }
        // Enforce single-instance concurrency for explicitly-named agents:
        // if this agent is already running on any other story, reject.
        // Auto-selected agents are already guaranteed idle by
        // find_free_agent_for_stage, so this check is only needed for
        // explicit requests.
        if agent_name.is_some()
            && let Some(busy_story) = agents.iter().find_map(|(k, a)| {
                if a.agent_name == resolved_name
                    && matches!(a.status, AgentStatus::Running | AgentStatus::Pending)
                {
                    Some(
                        k.rsplit_once(':')
                            .map(|(sid, _)| sid)
                            .unwrap_or(k)
                            .to_string(),
                    )
                } else {
                    None
                }
            })
        {
            return Err(format!(
                "Agent '{resolved_name}' is already running on story '{busy_story}'; \
                 story '{story_id}' will be picked up when the agent becomes available"
            ));
        }
        agents.insert(
            key.clone(),
            StoryAgent {
                agent_name: resolved_name.clone(),
                status: AgentStatus::Pending,
                worktree_info: None,
                session_id: None,
                tx: tx.clone(),
                task_handle: None,
                event_log: event_log.clone(),
                completion: None,
                project_root: Some(project_root.to_path_buf()),
                log_session_id: Some(log_session_id.clone()),
                merge_failure_reported: false,
            },
        );
    }
    let mut pending_guard = PendingGuard::new(self.agents.clone(), key.clone());
    // Create persistent log writer (needs resolved_name, so must be after
    // the atomic resolution above).
    let log_writer =
        match AgentLogWriter::new(project_root, story_id, &resolved_name, &log_session_id) {
            Ok(w) => Some(Arc::new(Mutex::new(w))),
            Err(e) => {
                // Log through the structured logger like every other error
                // path in this module (was: eprintln!, which bypassed the
                // server log). A missing log writer is non-fatal: the agent
                // still runs, only persistent logging is lost.
                slog_error!(
                    "[agents] Failed to create log writer for {story_id}:{resolved_name}: {e}"
                );
                None
            }
        };
    // Notify WebSocket clients that a new agent is pending.
    Self::notify_agent_state_changed(&self.watcher_tx);
    let _ = tx.send(AgentEvent::Status {
        story_id: story_id.to_string(),
        agent_name: resolved_name.clone(),
        status: "pending".to_string(),
    });
    // Extract inactivity timeout from the agent config before cloning config.
    let inactivity_timeout_secs = config
        .find_agent(&resolved_name)
        .map(|a| a.inactivity_timeout_secs)
        .unwrap_or(300);
    // Clone all values needed inside the background spawn.
    let project_root_clone = project_root.to_path_buf();
    let config_clone = config.clone();
    let resume_context_owned = resume_context.map(str::to_string);
    let sid = story_id.to_string();
    let aname = resolved_name.clone();
    let tx_clone = tx.clone();
    let agents_ref = self.agents.clone();
    let key_clone = key.clone();
    let log_clone = event_log.clone();
    let port_for_task = self.port;
    let log_writer_clone = log_writer.clone();
    let child_killers_clone = self.child_killers.clone();
    let watcher_tx_clone = self.watcher_tx.clone();
    // Spawn the background task. Worktree creation and agent launch happen here
    // so `start_agent` returns immediately after registering the agent as
    // Pending — non-blocking by design (story 157).
    let handle = tokio::spawn(async move {
        // Step 1: create the worktree (slow — git checkout, pnpm install, etc.)
        let wt_info = match worktree::create_worktree(
            &project_root_clone,
            &sid,
            &config_clone,
            port_for_task,
        )
        .await
        {
            Ok(wt) => wt,
            Err(e) => {
                let error_msg = format!("Failed to create worktree: {e}");
                slog_error!("[agents] {error_msg}");
                let event = AgentEvent::Error {
                    story_id: sid.clone(),
                    agent_name: aname.clone(),
                    message: error_msg,
                };
                if let Ok(mut log) = log_clone.lock() {
                    log.push(event.clone());
                }
                let _ = tx_clone.send(event);
                if let Ok(mut agents) = agents_ref.lock()
                    && let Some(agent) = agents.get_mut(&key_clone)
                {
                    agent.status = AgentStatus::Failed;
                }
                Self::notify_agent_state_changed(&watcher_tx_clone);
                return;
            }
        };
        // Step 2: store worktree info and render agent command/args/prompt.
        let wt_path_str = wt_info.path.to_string_lossy().to_string();
        {
            if let Ok(mut agents) = agents_ref.lock()
                && let Some(agent) = agents.get_mut(&key_clone)
            {
                agent.worktree_info = Some(wt_info.clone());
            }
        }
        let (command, args, mut prompt) = match config_clone.render_agent_args(
            &wt_path_str,
            &sid,
            Some(&aname),
            Some(&wt_info.base_branch),
        ) {
            Ok(result) => result,
            Err(e) => {
                let error_msg = format!("Failed to render agent args: {e}");
                slog_error!("[agents] {error_msg}");
                let event = AgentEvent::Error {
                    story_id: sid.clone(),
                    agent_name: aname.clone(),
                    message: error_msg,
                };
                if let Ok(mut log) = log_clone.lock() {
                    log.push(event.clone());
                }
                let _ = tx_clone.send(event);
                if let Ok(mut agents) = agents_ref.lock()
                    && let Some(agent) = agents.get_mut(&key_clone)
                {
                    agent.status = AgentStatus::Failed;
                }
                Self::notify_agent_state_changed(&watcher_tx_clone);
                return;
            }
        };
        // Append resume context if this is a restart with failure information.
        if let Some(ctx) = resume_context_owned {
            prompt.push_str(&ctx);
        }
        // Step 3: transition to Running now that the worktree is ready.
        {
            if let Ok(mut agents) = agents_ref.lock()
                && let Some(agent) = agents.get_mut(&key_clone)
            {
                agent.status = AgentStatus::Running;
            }
        }
        let _ = tx_clone.send(AgentEvent::Status {
            story_id: sid.clone(),
            agent_name: aname.clone(),
            status: "running".to_string(),
        });
        Self::notify_agent_state_changed(&watcher_tx_clone);
        // Step 4: launch the agent process.
        match super::pty::run_agent_pty_streaming(
            &sid,
            &aname,
            &command,
            &args,
            &prompt,
            &wt_path_str,
            &tx_clone,
            &log_clone,
            log_writer_clone,
            inactivity_timeout_secs,
            child_killers_clone,
        )
        .await
        {
            Ok(pty_result) => {
                // Persist token usage if the agent reported it.
                if let Some(ref usage) = pty_result.token_usage
                    && let Ok(agents) = agents_ref.lock()
                    && let Some(agent) = agents.get(&key_clone)
                    && let Some(ref pr) = agent.project_root
                {
                    let model = config_clone
                        .find_agent(&aname)
                        .and_then(|a| a.model.clone());
                    let record = super::token_usage::build_record(
                        &sid, &aname, model, usage.clone(),
                    );
                    if let Err(e) = super::token_usage::append_record(pr, &record) {
                        slog_error!(
                            "[agents] Failed to persist token usage for \
                             {sid}:{aname}: {e}"
                        );
                    }
                }
                // Server-owned completion: run acceptance gates automatically
                // when the agent process exits normally.
                run_server_owned_completion(
                    &agents_ref,
                    port_for_task,
                    &sid,
                    &aname,
                    pty_result.session_id,
                    watcher_tx_clone.clone(),
                )
                .await;
                Self::notify_agent_state_changed(&watcher_tx_clone);
            }
            Err(e) => {
                slog_error!("[agents] Agent process error for {aname} on {sid}: {e}");
                let event = AgentEvent::Error {
                    story_id: sid.clone(),
                    agent_name: aname.clone(),
                    message: e,
                };
                if let Ok(mut log) = log_clone.lock() {
                    log.push(event.clone());
                }
                let _ = tx_clone.send(event);
                if let Ok(mut agents) = agents_ref.lock()
                    && let Some(agent) = agents.get_mut(&key_clone)
                {
                    agent.status = AgentStatus::Failed;
                }
                Self::notify_agent_state_changed(&watcher_tx_clone);
            }
        }
    });
    // Store the task handle while the agent is still Pending.
    {
        let mut agents = self.agents.lock().map_err(|e| e.to_string())?;
        if let Some(agent) = agents.get_mut(&key) {
            agent.task_handle = Some(handle);
        }
    }
    // Agent successfully spawned — prevent the guard from removing the entry.
    pending_guard.disarm();
    Ok(AgentInfo {
        story_id: story_id.to_string(),
        agent_name: resolved_name,
        status: AgentStatus::Pending,
        session_id: None,
        worktree_path: None,
        base_branch: None,
        completion: None,
        log_session_id: Some(log_session_id),
    })
}
/// Stop a running agent. Worktree is preserved for inspection.
///
/// Sequence: snapshot state under the lock → abort the task and kill the
/// PTY child → broadcast a "stopped" status event → remove the entry.
/// NOTE(review): the entry is marked `Failed` (not a stopped-specific
/// status) before removal — confirm no reader needs to distinguish a
/// stop from a failure during this window.
pub async fn stop_agent(
    &self,
    _project_root: &Path,
    story_id: &str,
    agent_name: &str,
) -> Result<(), String> {
    let key = composite_key(story_id, agent_name);
    // Take everything needed out of the entry under the lock, then release
    // it before awaiting below.
    let (worktree_info, task_handle, tx) = {
        let mut agents = self.agents.lock().map_err(|e| e.to_string())?;
        let agent = agents
            .get_mut(&key)
            .ok_or_else(|| format!("No agent '{agent_name}' for story '{story_id}'"))?;
        let wt = agent.worktree_info.clone();
        let handle = agent.task_handle.take();
        let tx = agent.tx.clone();
        agent.status = AgentStatus::Failed;
        (wt, handle, tx)
    };
    // Abort the task and kill the PTY child process.
    // Note: aborting a spawn_blocking task handle does not interrupt the blocking
    // thread, so we must also kill the child process directly via the killer registry.
    if let Some(handle) = task_handle {
        handle.abort();
        // Await so the abort has fully settled before tearing down state.
        let _ = handle.await;
    }
    self.kill_child_for_key(&key);
    // Preserve worktree for inspection — don't destroy agent's work on stop.
    if let Some(ref wt) = worktree_info {
        slog!(
            "[agents] Worktree preserved for {story_id}:{agent_name}: {}",
            wt.path.display()
        );
    }
    // Broadcast the terminal event before removing the entry so waiters
    // (see wait_for_agent) observe the stop.
    let _ = tx.send(AgentEvent::Status {
        story_id: story_id.to_string(),
        agent_name: agent_name.to_string(),
        status: "stopped".to_string(),
    });
    // Remove from map
    {
        let mut agents = self.agents.lock().map_err(|e| e.to_string())?;
        agents.remove(&key);
    }
    // Notify WebSocket clients so pipeline board and agent panel update.
    Self::notify_agent_state_changed(&self.watcher_tx);
    Ok(())
}
/// Return the names of configured agents for `stage` that are not currently
/// running or pending.
pub fn available_agents_for_stage(
    &self,
    config: &ProjectConfig,
    stage: &PipelineStage,
) -> Result<Vec<String>, String> {
    let agents = self.agents.lock().map_err(|e| e.to_string())?;
    // An agent is busy if any pool entry with its name is Running/Pending.
    let is_busy = |name: &str| {
        agents.values().any(|a| {
            a.agent_name == name
                && matches!(a.status, AgentStatus::Running | AgentStatus::Pending)
        })
    };
    let mut free = Vec::new();
    for cfg in &config.agent {
        if agent_config_stage(cfg) == *stage && !is_busy(&cfg.name) {
            free.push(cfg.name.clone());
        }
    }
    Ok(free)
}
/// List all agents with their status.
pub fn list_agents(&self) -> Result<Vec<AgentInfo>, String> {
    let agents = self.agents.lock().map_err(|e| e.to_string())?;
    let mut infos = Vec::with_capacity(agents.len());
    for (key, agent) in agents.iter() {
        // The composite key is "story_id:agent_name"; recover the story id,
        // falling back to the whole key if there is no separator.
        let story_id = match key.rsplit_once(':') {
            Some((sid, _)) => sid.to_string(),
            None => key.clone(),
        };
        infos.push(agent_info_from_entry(&story_id, agent));
    }
    Ok(infos)
}
/// Subscribe to events for a story agent.
pub fn subscribe(
    &self,
    story_id: &str,
    agent_name: &str,
) -> Result<broadcast::Receiver<AgentEvent>, String> {
    let key = composite_key(story_id, agent_name);
    let agents = self.agents.lock().map_err(|e| e.to_string())?;
    match agents.get(&key) {
        Some(agent) => Ok(agent.tx.subscribe()),
        None => Err(format!("No agent '{agent_name}' for story '{story_id}'")),
    }
}
/// Drain accumulated events for polling. Returns all events since the last drain.
pub fn drain_events(
    &self,
    story_id: &str,
    agent_name: &str,
) -> Result<Vec<AgentEvent>, String> {
    let key = composite_key(story_id, agent_name);
    let agents = self.agents.lock().map_err(|e| e.to_string())?;
    let agent = agents
        .get(&key)
        .ok_or_else(|| format!("No agent '{agent_name}' for story '{story_id}'"))?;
    let mut log = agent.event_log.lock().map_err(|e| e.to_string())?;
    // Hand back the whole buffer, leaving an empty one in its place.
    Ok(std::mem::take(&mut *log))
}
/// Block until the agent reaches a terminal state (completed, failed, stopped).
/// Returns the agent's final `AgentInfo`.
/// `timeout_ms` caps how long to wait; returns an error if the deadline passes.
pub async fn wait_for_agent(
    &self,
    story_id: &str,
    agent_name: &str,
    timeout_ms: u64,
) -> Result<AgentInfo, String> {
    // Subscribe before checking status so we don't miss the terminal event
    // if the agent completes in the window between the two operations.
    let mut rx = self.subscribe(story_id, agent_name)?;
    // Return immediately if already in a terminal state.
    {
        let agents = self.agents.lock().map_err(|e| e.to_string())?;
        let key = composite_key(story_id, agent_name);
        if let Some(agent) = agents.get(&key)
            && matches!(agent.status, AgentStatus::Completed | AgentStatus::Failed)
        {
            return Ok(agent_info_from_entry(story_id, agent));
        }
    }
    // Single overall deadline; each recv() waits at most the remaining time.
    let deadline = tokio::time::Instant::now() + std::time::Duration::from_millis(timeout_ms);
    loop {
        let remaining = deadline.saturating_duration_since(tokio::time::Instant::now());
        if remaining.is_zero() {
            return Err(format!(
                "Timed out after {timeout_ms}ms waiting for agent '{agent_name}' on story '{story_id}'"
            ));
        }
        match tokio::time::timeout(remaining, rx.recv()).await {
            Ok(Ok(event)) => {
                // Done/Error are always terminal; a Status event only when
                // it carries "stopped".
                let is_terminal = match &event {
                    AgentEvent::Done { .. } | AgentEvent::Error { .. } => true,
                    AgentEvent::Status { status, .. } if status == "stopped" => true,
                    _ => false,
                };
                if is_terminal {
                    let agents = self.agents.lock().map_err(|e| e.to_string())?;
                    let key = composite_key(story_id, agent_name);
                    return Ok(if let Some(agent) = agents.get(&key) {
                        agent_info_from_entry(story_id, agent)
                    } else {
                        // Agent was removed from map (e.g. stop_agent removes it after
                        // the "stopped" status event is sent). Synthesize a snapshot
                        // from the terminal event itself.
                        let (status, session_id) = match event {
                            AgentEvent::Done { session_id, .. } => {
                                (AgentStatus::Completed, session_id)
                            }
                            _ => (AgentStatus::Failed, None),
                        };
                        AgentInfo {
                            story_id: story_id.to_string(),
                            agent_name: agent_name.to_string(),
                            status,
                            session_id,
                            worktree_path: None,
                            base_branch: None,
                            completion: None,
                            log_session_id: None,
                        }
                    });
                }
            }
            Ok(Err(broadcast::error::RecvError::Lagged(_))) => {
                // Missed some buffered events — check current status before resuming.
                let agents = self.agents.lock().map_err(|e| e.to_string())?;
                let key = composite_key(story_id, agent_name);
                if let Some(agent) = agents.get(&key)
                    && matches!(agent.status, AgentStatus::Completed | AgentStatus::Failed)
                {
                    return Ok(agent_info_from_entry(story_id, agent));
                }
                // Still running — continue the loop.
            }
            Ok(Err(broadcast::error::RecvError::Closed)) => {
                // Channel closed: no more events will arrive. Return current state.
                let agents = self.agents.lock().map_err(|e| e.to_string())?;
                let key = composite_key(story_id, agent_name);
                if let Some(agent) = agents.get(&key) {
                    return Ok(agent_info_from_entry(story_id, agent));
                }
                return Err(format!(
                    "Agent '{agent_name}' for story '{story_id}' channel closed unexpectedly"
                ));
            }
            Err(_) => {
                // tokio::time::timeout elapsed before another event arrived.
                return Err(format!(
                    "Timed out after {timeout_ms}ms waiting for agent '{agent_name}' on story '{story_id}'"
                ));
            }
        }
    }
}
/// Create a worktree for the given story using the server port (writes .mcp.json).
pub async fn create_worktree(
    &self,
    project_root: &Path,
    story_id: &str,
) -> Result<worktree::WorktreeInfo, String> {
    let config = ProjectConfig::load(project_root)?;
    let port = self.port;
    worktree::create_worktree(project_root, story_id, &config, port).await
}
/// Advance the pipeline after an agent completes.
///
/// Called internally by `report_completion` as a background task.
/// Reads the stored completion report and project_root from the agent,
/// then drives the next pipeline stage based on the agent's role:
///
/// - **Coder** + gates passed → move story to `work/3_qa/`, start `qa` agent.
/// - **Coder** + gates failed → restart the same coder agent with failure context.
/// - **QA** + gates passed + coverage passed → move story to `work/4_merge/`, start `mergemaster` agent.
/// - **QA** + gates passed + coverage failed → restart `qa` with coverage failure context.
/// - **QA** + gates failed → restart `qa` with failure context.
/// - **Mergemaster** → run `script/test` on master; if pass: archive the story
///   (worktree cleanup is currently disabled — see TODO in the body);
///   if fail: restart `mergemaster` with failure context.
/// - **Other** (supervisor, unknown) → no automatic advancement.
async fn run_pipeline_advance(
    &self,
    story_id: &str,
    agent_name: &str,
    completion: CompletionReport,
    project_root: Option<PathBuf>,
    worktree_path: Option<PathBuf>,
    merge_failure_reported: bool,
) {
    // Without a project root there is nothing to advance.
    let project_root = match project_root {
        Some(p) => p,
        None => {
            slog_warn!("[pipeline] No project_root for '{story_id}:{agent_name}'");
            return;
        }
    };
    let config = ProjectConfig::load(&project_root).unwrap_or_default();
    // Resolve the completing agent's stage: prefer the config entry, fall
    // back to name-based inference.
    let stage = config
        .find_agent(agent_name)
        .map(agent_config_stage)
        .unwrap_or_else(|| pipeline_stage(agent_name));
    match stage {
        PipelineStage::Other => {
            // Supervisors and unknown agents do not advance the pipeline.
        }
        PipelineStage::Coder => {
            if completion.gates_passed {
                // Determine effective QA mode for this story.
                let qa_mode = {
                    let item_type = super::lifecycle::item_type_from_id(story_id);
                    if item_type == "spike" {
                        // Spikes always go to a human reviewer.
                        crate::io::story_metadata::QaMode::Human
                    } else {
                        let default_qa = config.default_qa_mode();
                        // Story is in 2_current/ when a coder completes.
                        let story_path = project_root
                            .join(".story_kit/work/2_current")
                            .join(format!("{story_id}.md"));
                        crate::io::story_metadata::resolve_qa_mode(&story_path, default_qa)
                    }
                };
                match qa_mode {
                    crate::io::story_metadata::QaMode::Server => {
                        slog!(
                            "[pipeline] Coder '{agent_name}' passed gates for '{story_id}'. \
                             qa: server moving directly to merge."
                        );
                        if let Err(e) =
                            super::lifecycle::move_story_to_merge(&project_root, story_id)
                        {
                            slog_error!(
                                "[pipeline] Failed to move '{story_id}' to 4_merge/: {e}"
                            );
                        } else if let Err(e) = self
                            .start_agent(&project_root, story_id, Some("mergemaster"), None)
                            .await
                        {
                            slog_error!(
                                "[pipeline] Failed to start mergemaster for '{story_id}': {e}"
                            );
                        }
                    }
                    crate::io::story_metadata::QaMode::Agent => {
                        slog!(
                            "[pipeline] Coder '{agent_name}' passed gates for '{story_id}'. \
                             qa: agent moving to QA."
                        );
                        if let Err(e) = super::lifecycle::move_story_to_qa(&project_root, story_id) {
                            slog_error!("[pipeline] Failed to move '{story_id}' to 3_qa/: {e}");
                        } else if let Err(e) = self
                            .start_agent(&project_root, story_id, Some("qa"), None)
                            .await
                        {
                            slog_error!("[pipeline] Failed to start qa agent for '{story_id}': {e}");
                        }
                    }
                    crate::io::story_metadata::QaMode::Human => {
                        slog!(
                            "[pipeline] Coder '{agent_name}' passed gates for '{story_id}'. \
                             qa: human holding for human review."
                        );
                        if let Err(e) = super::lifecycle::move_story_to_qa(&project_root, story_id) {
                            slog_error!("[pipeline] Failed to move '{story_id}' to 3_qa/: {e}");
                        } else {
                            // Mark the story so the board shows it waiting on a human.
                            let qa_dir = project_root.join(".story_kit/work/3_qa");
                            let story_path = qa_dir.join(format!("{story_id}.md"));
                            if let Err(e) =
                                crate::io::story_metadata::write_review_hold(&story_path)
                            {
                                slog_error!(
                                    "[pipeline] Failed to set review_hold on '{story_id}': {e}"
                                );
                            }
                        }
                    }
                }
            } else {
                slog!(
                    "[pipeline] Coder '{agent_name}' failed gates for '{story_id}'. Restarting."
                );
                // Feed the gate output back into the next attempt's prompt.
                let context = format!(
                    "\n\n---\n## Previous Attempt Failed\n\
                     The acceptance gates failed with the following output:\n{}\n\n\
                     Please review the failures above, fix the issues, and try again.",
                    completion.gate_output
                );
                if let Err(e) = self
                    .start_agent(&project_root, story_id, Some(agent_name), Some(&context))
                    .await
                {
                    slog_error!(
                        "[pipeline] Failed to restart coder '{agent_name}' for '{story_id}': {e}"
                    );
                }
            }
        }
        PipelineStage::Qa => {
            if completion.gates_passed {
                // Run coverage gate in the QA worktree before advancing to merge.
                let coverage_path = worktree_path
                    .clone()
                    .unwrap_or_else(|| project_root.clone());
                let cp = coverage_path.clone();
                // The coverage gate is blocking work; keep it off the async runtime.
                let coverage_result =
                    tokio::task::spawn_blocking(move || super::gates::run_coverage_gate(&cp))
                        .await
                        .unwrap_or_else(|e| {
                            slog_warn!("[pipeline] Coverage gate task panicked: {e}");
                            Ok((false, format!("Coverage gate task panicked: {e}")))
                        });
                let (coverage_passed, coverage_output) = match coverage_result {
                    Ok(pair) => pair,
                    Err(e) => (false, e),
                };
                if coverage_passed {
                    // Check whether this item needs human review before merging.
                    let needs_human_review = {
                        let item_type = super::lifecycle::item_type_from_id(story_id);
                        if item_type == "spike" {
                            true // Spikes always need human review.
                        } else {
                            let qa_dir = project_root.join(".story_kit/work/3_qa");
                            let story_path = qa_dir.join(format!("{story_id}.md"));
                            let default_qa = config.default_qa_mode();
                            matches!(
                                crate::io::story_metadata::resolve_qa_mode(&story_path, default_qa),
                                crate::io::story_metadata::QaMode::Human
                            )
                        }
                    };
                    if needs_human_review {
                        // Hold in 3_qa/ for human review.
                        let qa_dir = project_root.join(".story_kit/work/3_qa");
                        let story_path = qa_dir.join(format!("{story_id}.md"));
                        if let Err(e) =
                            crate::io::story_metadata::write_review_hold(&story_path)
                        {
                            slog_error!(
                                "[pipeline] Failed to set review_hold on '{story_id}': {e}"
                            );
                        }
                        slog!(
                            "[pipeline] QA passed for '{story_id}'. \
                             Holding for human review. \
                             Worktree preserved at: {worktree_path:?}"
                        );
                    } else {
                        slog!(
                            "[pipeline] QA passed gates and coverage for '{story_id}'. \
                             Moving directly to merge."
                        );
                        if let Err(e) =
                            super::lifecycle::move_story_to_merge(&project_root, story_id)
                        {
                            slog_error!(
                                "[pipeline] Failed to move '{story_id}' to 4_merge/: {e}"
                            );
                        } else if let Err(e) = self
                            .start_agent(&project_root, story_id, Some("mergemaster"), None)
                            .await
                        {
                            slog_error!(
                                "[pipeline] Failed to start mergemaster for '{story_id}': {e}"
                            );
                        }
                    }
                } else {
                    slog!(
                        "[pipeline] QA coverage gate failed for '{story_id}'. Restarting QA."
                    );
                    let context = format!(
                        "\n\n---\n## Coverage Gate Failed\n\
                         The coverage gate (script/test_coverage) failed with the following output:\n{}\n\n\
                         Please improve test coverage until the coverage gate passes.",
                        coverage_output
                    );
                    if let Err(e) = self
                        .start_agent(&project_root, story_id, Some("qa"), Some(&context))
                        .await
                    {
                        slog_error!("[pipeline] Failed to restart qa for '{story_id}': {e}");
                    }
                }
            } else {
                slog!("[pipeline] QA failed gates for '{story_id}'. Restarting.");
                let context = format!(
                    "\n\n---\n## Previous QA Attempt Failed\n\
                     The acceptance gates failed with the following output:\n{}\n\n\
                     Please re-run and fix the issues.",
                    completion.gate_output
                );
                if let Err(e) = self
                    .start_agent(&project_root, story_id, Some("qa"), Some(&context))
                    .await
                {
                    slog_error!("[pipeline] Failed to restart qa for '{story_id}': {e}");
                }
            }
        }
        PipelineStage::Mergemaster => {
            // Block advancement if the mergemaster explicitly reported a failure.
            // The server-owned gate check runs in the feature-branch worktree (not
            // master), so `gates_passed=true` is misleading when no code was merged.
            if merge_failure_reported {
                slog!(
                    "[pipeline] Pipeline advancement blocked for '{story_id}': \
                     mergemaster explicitly reported a merge failure. \
                     Story stays in 4_merge/ for human review."
                );
            } else {
                // Run script/test on master (project_root) as the post-merge verification.
                slog!(
                    "[pipeline] Mergemaster completed for '{story_id}'. Running post-merge tests on master."
                );
                let root = project_root.clone();
                // Blocking test run; keep it off the async runtime.
                let test_result =
                    tokio::task::spawn_blocking(move || super::gates::run_project_tests(&root))
                        .await
                        .unwrap_or_else(|e| {
                            slog_warn!("[pipeline] Post-merge test task panicked: {e}");
                            Ok((false, format!("Test task panicked: {e}")))
                        });
                let (passed, output) = match test_result {
                    Ok(pair) => pair,
                    Err(e) => (false, e),
                };
                if passed {
                    slog!(
                        "[pipeline] Post-merge tests passed for '{story_id}'. Moving to done."
                    );
                    if let Err(e) =
                        super::lifecycle::move_story_to_archived(&project_root, story_id)
                    {
                        slog_error!("[pipeline] Failed to move '{story_id}' to done: {e}");
                    }
                    // Drop all pool entries for this story; it is finished.
                    self.remove_agents_for_story(story_id);
                    // TODO: Re-enable worktree cleanup once we have persistent agent logs.
                    // Removing worktrees destroys evidence needed to debug empty-commit agents.
                    // let config =
                    //     crate::config::ProjectConfig::load(&project_root).unwrap_or_default();
                    // if let Err(e) =
                    //     worktree::remove_worktree_by_story_id(&project_root, story_id, &config)
                    //         .await
                    // {
                    //     slog!(
                    //         "[pipeline] Failed to remove worktree for '{story_id}': {e}"
                    //     );
                    // }
                    slog!(
                        "[pipeline] Story '{story_id}' done. Worktree preserved for inspection."
                    );
                } else {
                    slog!(
                        "[pipeline] Post-merge tests failed for '{story_id}'. Restarting mergemaster."
                    );
                    let context = format!(
                        "\n\n---\n## Post-Merge Test Failed\n\
                         The tests on master failed with the following output:\n{}\n\n\
                         Please investigate and resolve the failures, then call merge_agent_work again.",
                        output
                    );
                    if let Err(e) = self
                        .start_agent(
                            &project_root,
                            story_id,
                            Some("mergemaster"),
                            Some(&context),
                        )
                        .await
                    {
                        slog_error!(
                            "[pipeline] Failed to restart mergemaster for '{story_id}': {e}"
                        );
                    }
                }
            }
        }
    }
    // Always scan for unassigned work after any agent completes, regardless
    // of the outcome (success, failure, restart). This ensures stories that
    // failed agent assignment due to busy agents are retried when agents
    // become available (bug 295).
    self.auto_assign_available_work(&project_root).await;
}
    /// Internal: report that an agent has finished work on a story.
    ///
    /// **Note:** This is no longer exposed as an MCP tool. The server now
    /// automatically runs completion gates when an agent process exits
    /// (see `run_server_owned_completion`). This method is retained for
    /// backwards compatibility and testing.
    ///
    /// - Rejects with an error if the worktree has uncommitted changes.
    /// - Runs acceptance gates (cargo clippy + cargo nextest run / cargo test).
    /// - Stores the `CompletionReport` on the agent record, then removes the
    ///   entry so completed agents never appear in `list_agents`.
    /// - Emits a `Done` event so `wait_for_agent` unblocks.
    /// - Spawns a background task that advances the pipeline state machine.
    #[allow(dead_code)]
    pub async fn report_completion(
        &self,
        story_id: &str,
        agent_name: &str,
        summary: &str,
    ) -> Result<CompletionReport, String> {
        let key = composite_key(story_id, agent_name);
        // Verify agent exists, is Running, and grab its worktree path.
        // Scoped block so the agents lock is released before the slow
        // gate check below — the lock is never held across an .await.
        let worktree_path = {
            let agents = self.agents.lock().map_err(|e| e.to_string())?;
            let agent = agents
                .get(&key)
                .ok_or_else(|| format!("No agent '{agent_name}' for story '{story_id}'"))?;
            if agent.status != AgentStatus::Running {
                return Err(format!(
                    "Agent '{agent_name}' for story '{story_id}' is not running (status: {}). \
                    report_completion can only be called by a running agent.",
                    agent.status
                ));
            }
            agent
                .worktree_info
                .as_ref()
                .map(|wt| wt.path.clone())
                .ok_or_else(|| {
                    format!(
                        "Agent '{agent_name}' for story '{story_id}' has no worktree. \
                        Cannot run acceptance gates."
                    )
                })?
        };
        let path = worktree_path.clone();
        // Run gate checks in a blocking thread to avoid stalling the async runtime.
        let (gates_passed, gate_output) = tokio::task::spawn_blocking(move || {
            // Step 1: Reject if worktree is dirty.
            super::gates::check_uncommitted_changes(&path)?;
            // Step 2: Run clippy + tests and return (passed, output).
            super::gates::run_acceptance_gates(&path)
        })
        .await
        .map_err(|e| format!("Gate check task panicked: {e}"))??;
        let report = CompletionReport {
            summary: summary.to_string(),
            gates_passed,
            gate_output,
        };
        // Extract data for pipeline advance, then remove the entry so
        // completed agents never appear in list_agents.
        let (
            tx,
            session_id,
            project_root_for_advance,
            wt_path_for_advance,
            merge_failure_reported_for_advance,
        ) = {
            let mut agents = self.agents.lock().map_err(|e| e.to_string())?;
            // The entry may have been removed while gates were running
            // (e.g. the story was archived concurrently) — surface that.
            let agent = agents.get_mut(&key).ok_or_else(|| {
                format!("Agent '{agent_name}' for story '{story_id}' disappeared during gate check")
            })?;
            agent.completion = Some(report.clone());
            let tx = agent.tx.clone();
            let sid = agent.session_id.clone();
            let pr = agent.project_root.clone();
            let wt = agent.worktree_info.as_ref().map(|w| w.path.clone());
            let mfr = agent.merge_failure_reported;
            agents.remove(&key);
            (tx, sid, pr, wt, mfr)
        };
        // Emit Done so wait_for_agent unblocks. A send error just means
        // there are no subscribers, which is fine.
        let _ = tx.send(AgentEvent::Done {
            story_id: story_id.to_string(),
            agent_name: agent_name.to_string(),
            session_id,
        });
        // Notify WebSocket clients that the agent is gone.
        Self::notify_agent_state_changed(&self.watcher_tx);
        // Advance the pipeline state machine in a background task.
        // A shallow copy of the pool (shared Arcs / cloned senders) keeps
        // the spawned task 'static without borrowing self.
        let pool_clone = Self {
            agents: Arc::clone(&self.agents),
            port: self.port,
            child_killers: Arc::clone(&self.child_killers),
            watcher_tx: self.watcher_tx.clone(),
            merge_jobs: Arc::clone(&self.merge_jobs),
        };
        let sid = story_id.to_string();
        let aname = agent_name.to_string();
        let report_for_advance = report.clone();
        tokio::spawn(async move {
            pool_clone
                .run_pipeline_advance(
                    &sid,
                    &aname,
                    report_for_advance,
                    project_root_for_advance,
                    wt_path_for_advance,
                    merge_failure_reported_for_advance,
                )
                .await;
        });
        Ok(report)
    }
/// Run the full mergemaster pipeline for a completed story:
///
/// 1. Squash-merge the story's feature branch into the current branch (master).
/// 2. If conflicts are found: abort the merge and report them.
/// 3. Quality gates run **inside the merge worktree** before master is touched.
/// 4. If gates pass: cherry-pick the squash commit onto master and archive the story.
///
/// Returns a `MergeReport` with full details of what happened.
/// Start the merge pipeline as a background task.
///
/// Returns immediately so the MCP tool call doesn't time out (the full
/// pipeline — squash merge + quality gates — takes well over 60 seconds,
/// exceeding Claude Code's MCP tool-call timeout).
///
/// The mergemaster agent should poll [`get_merge_status`](Self::get_merge_status)
/// until the job reaches a terminal state.
pub fn start_merge_agent_work(
self: &Arc<Self>,
project_root: &Path,
story_id: &str,
) -> Result<(), String> {
// Guard against double-starts.
{
let jobs = self.merge_jobs.lock().map_err(|e| e.to_string())?;
if let Some(job) = jobs.get(story_id)
&& matches!(job.status, super::merge::MergeJobStatus::Running)
{
return Err(format!(
"Merge already in progress for '{story_id}'. \
Use get_merge_status to poll for completion."
));
}
}
// Insert Running job.
{
let mut jobs = self.merge_jobs.lock().map_err(|e| e.to_string())?;
jobs.insert(
story_id.to_string(),
super::merge::MergeJob {
story_id: story_id.to_string(),
status: super::merge::MergeJobStatus::Running,
},
);
}
let pool = Arc::clone(self);
let root = project_root.to_path_buf();
let sid = story_id.to_string();
tokio::spawn(async move {
let report = pool.run_merge_pipeline(&root, &sid).await;
let failed = report.is_err();
let status = match report {
Ok(r) => super::merge::MergeJobStatus::Completed(r),
Err(e) => super::merge::MergeJobStatus::Failed(e),
};
if let Ok(mut jobs) = pool.merge_jobs.lock()
&& let Some(job) = jobs.get_mut(&sid)
{
job.status = status;
}
if failed {
pool.auto_assign_available_work(&root).await;
}
});
Ok(())
}
/// The actual merge pipeline, run inside a background task.
async fn run_merge_pipeline(
self: &Arc<Self>,
project_root: &Path,
story_id: &str,
) -> Result<super::merge::MergeReport, String> {
let branch = format!("feature/story-{story_id}");
let wt_path = worktree::worktree_path(project_root, story_id);
let root = project_root.to_path_buf();
let sid = story_id.to_string();
let br = branch.clone();
let merge_result =
tokio::task::spawn_blocking(move || super::merge::run_squash_merge(&root, &br, &sid))
.await
.map_err(|e| format!("Merge task panicked: {e}"))??;
if !merge_result.success {
return Ok(super::merge::MergeReport {
story_id: story_id.to_string(),
success: false,
had_conflicts: merge_result.had_conflicts,
conflicts_resolved: merge_result.conflicts_resolved,
conflict_details: merge_result.conflict_details,
gates_passed: merge_result.gates_passed,
gate_output: merge_result.output,
worktree_cleaned_up: false,
story_archived: false,
});
}
let story_archived =
super::lifecycle::move_story_to_archived(project_root, story_id).is_ok();
if story_archived {
self.remove_agents_for_story(story_id);
}
let worktree_cleaned_up = if wt_path.exists() {
let config = crate::config::ProjectConfig::load(project_root).unwrap_or_default();
worktree::remove_worktree_by_story_id(project_root, story_id, &config)
.await
.is_ok()
} else {
false
};
self.auto_assign_available_work(project_root).await;
Ok(super::merge::MergeReport {
story_id: story_id.to_string(),
success: true,
had_conflicts: merge_result.had_conflicts,
conflicts_resolved: merge_result.conflicts_resolved,
conflict_details: merge_result.conflict_details,
gates_passed: true,
gate_output: merge_result.output,
worktree_cleaned_up,
story_archived,
})
}
/// Check the status of a background merge job.
pub fn get_merge_status(&self, story_id: &str) -> Option<super::merge::MergeJob> {
self.merge_jobs
.lock()
.ok()
.and_then(|jobs| jobs.get(story_id).cloned())
}
    /// Get project root helper.
    ///
    /// Thin delegation to `SessionState::get_project_root`; carries no
    /// pool-specific logic.
    pub fn get_project_root(&self, state: &crate::state::SessionState) -> Result<PathBuf, String> {
        state.get_project_root()
    }
/// Get the log session ID and project root for an agent, if available.
///
/// Used by MCP tools to find the persistent log file for a completed agent.
pub fn get_log_info(&self, story_id: &str, agent_name: &str) -> Option<(String, PathBuf)> {
let key = composite_key(story_id, agent_name);
let agents = self.agents.lock().ok()?;
let agent = agents.get(&key)?;
let session_id = agent.log_session_id.clone()?;
let project_root = agent.project_root.clone()?;
Some((session_id, project_root))
}
/// Record that the mergemaster agent for `story_id` explicitly reported a
/// merge failure via the `report_merge_failure` MCP tool.
///
/// Sets `merge_failure_reported = true` on the active mergemaster agent so
/// that `run_pipeline_advance` can block advancement to `5_done/` even when
/// the server-owned gate check returns `gates_passed=true` (those gates run
/// in the feature-branch worktree, not on master).
pub fn set_merge_failure_reported(&self, story_id: &str) {
match self.agents.lock() {
Ok(mut lock) => {
let found = lock.iter_mut().find(|(key, agent)| {
let key_story_id = key
.rsplit_once(':')
.map(|(sid, _)| sid)
.unwrap_or(key.as_str());
key_story_id == story_id
&& pipeline_stage(&agent.agent_name) == PipelineStage::Mergemaster
});
match found {
Some((_, agent)) => {
agent.merge_failure_reported = true;
slog!(
"[pipeline] Merge failure flag set for '{story_id}:{}'",
agent.agent_name
);
}
None => {
slog_warn!(
"[pipeline] set_merge_failure_reported: no running mergemaster found \
for story '{story_id}' flag not set"
);
}
}
}
Err(e) => {
slog_error!("[pipeline] set_merge_failure_reported: could not lock agents: {e}");
}
}
}
/// Test helper: inject a pre-built agent entry so unit tests can exercise
/// wait/subscribe logic without spawning a real process.
#[cfg(test)]
pub fn inject_test_agent(
&self,
story_id: &str,
agent_name: &str,
status: AgentStatus,
) -> broadcast::Sender<AgentEvent> {
let (tx, _) = broadcast::channel::<AgentEvent>(64);
let key = composite_key(story_id, agent_name);
let mut agents = self.agents.lock().unwrap();
agents.insert(
key,
StoryAgent {
agent_name: agent_name.to_string(),
status,
worktree_info: None,
session_id: None,
tx: tx.clone(),
task_handle: None,
event_log: Arc::new(Mutex::new(Vec::new())),
completion: None,
project_root: None,
log_session_id: None,
merge_failure_reported: false,
},
);
tx
}
/// Test helper: inject an agent with a specific worktree path for testing
/// gate-related logic.
#[cfg(test)]
pub fn inject_test_agent_with_path(
&self,
story_id: &str,
agent_name: &str,
status: AgentStatus,
worktree_path: PathBuf,
) -> broadcast::Sender<AgentEvent> {
let (tx, _) = broadcast::channel::<AgentEvent>(64);
let key = composite_key(story_id, agent_name);
let mut agents = self.agents.lock().unwrap();
agents.insert(
key,
StoryAgent {
agent_name: agent_name.to_string(),
status,
worktree_info: Some(WorktreeInfo {
path: worktree_path,
branch: format!("feature/story-{story_id}"),
base_branch: "master".to_string(),
}),
session_id: None,
tx: tx.clone(),
task_handle: None,
event_log: Arc::new(Mutex::new(Vec::new())),
completion: None,
project_root: None,
log_session_id: None,
merge_failure_reported: false,
},
);
tx
}
    /// Automatically assign free agents to stories waiting in the active pipeline stages.
    ///
    /// Scans `work/2_current/`, `work/3_qa/`, and `work/4_merge/` for items that have no
    /// active agent and assigns the first free agent of the appropriate role. Items in
    /// `work/1_backlog/` are never auto-started.
    ///
    /// Respects the configured agent roster: the maximum number of concurrently active agents
    /// per role is bounded by the count of agents of that role defined in `project.toml`.
    pub async fn auto_assign_available_work(&self, project_root: &Path) {
        let config = match ProjectConfig::load(project_root) {
            Ok(c) => c,
            Err(e) => {
                // Without a config there is no agent roster to assign from.
                slog_warn!("[auto-assign] Failed to load project config: {e}");
                return;
            }
        };
        // Process each active pipeline stage in order.
        let stages: [(&str, PipelineStage); 3] = [
            ("2_current", PipelineStage::Coder),
            ("3_qa", PipelineStage::Qa),
            ("4_merge", PipelineStage::Mergemaster),
        ];
        for (stage_dir, stage) in &stages {
            let items = scan_stage_items(project_root, stage_dir);
            if items.is_empty() {
                continue;
            }
            for story_id in &items {
                // Items marked with review_hold (e.g. spikes after QA passes) stay
                // in their current stage for human review — don't auto-assign agents.
                if has_review_hold(project_root, stage_dir, story_id) {
                    continue;
                }
                // Re-acquire the lock on each iteration to see state changes
                // from previous start_agent calls in the same pass.
                let preferred_agent =
                    read_story_front_matter_agent(project_root, stage_dir, story_id);
                // Outcome: (already_assigned, chosen_agent, preferred_busy, stage_mismatch)
                // preferred_busy=true means the story has a specific agent requested but it is
                // currently occupied — the story should wait rather than fall back.
                // stage_mismatch=true means the preferred agent's stage doesn't match the
                // pipeline stage, so we fell back to a generic stage agent.
                let (already_assigned, free_agent, preferred_busy, stage_mismatch) = {
                    let agents = match self.agents.lock() {
                        Ok(a) => a,
                        Err(e) => {
                            // Poisoned lock: abandon this stage entirely.
                            slog_error!("[auto-assign] Failed to lock agents: {e}");
                            break;
                        }
                    };
                    let assigned = is_story_assigned_for_stage(&config, &agents, story_id, stage);
                    if assigned {
                        (true, None, false, false)
                    } else if let Some(ref pref) = preferred_agent {
                        // Story has a front-matter agent preference.
                        // Verify the preferred agent's stage matches the current
                        // pipeline stage — a coder shouldn't be assigned to QA.
                        let pref_stage_matches = config
                            .find_agent(pref)
                            .map(|cfg| agent_config_stage(cfg) == *stage)
                            .unwrap_or(false);
                        if !pref_stage_matches {
                            // Stage mismatch — fall back to any free agent for this stage.
                            let free = find_free_agent_for_stage(&config, &agents, stage)
                                .map(|s| s.to_string());
                            (false, free, false, true)
                        } else if is_agent_free(&agents, pref) {
                            (false, Some(pref.clone()), false, false)
                        } else {
                            (false, None, true, false)
                        }
                    } else {
                        // No preference: take any free agent for this stage.
                        let free = find_free_agent_for_stage(&config, &agents, stage)
                            .map(|s| s.to_string());
                        (false, free, false, false)
                    }
                };
                if already_assigned {
                    // Story already has an active agent — skip silently.
                    continue;
                }
                if preferred_busy {
                    // The story requests a specific agent that is currently busy.
                    // Do not fall back to a different agent; let this story wait.
                    slog!(
                        "[auto-assign] Preferred agent '{}' busy for '{story_id}'; story will wait.",
                        preferred_agent.as_deref().unwrap_or("?")
                    );
                    continue;
                }
                if stage_mismatch {
                    slog!(
                        "[auto-assign] Preferred agent '{}' stage mismatch for '{story_id}' in {stage_dir}/; falling back to stage-appropriate agent.",
                        preferred_agent.as_deref().unwrap_or("?")
                    );
                }
                match free_agent {
                    Some(agent_name) => {
                        slog!(
                            "[auto-assign] Assigning '{agent_name}' to '{story_id}' in {stage_dir}/"
                        );
                        if let Err(e) = self
                            .start_agent(project_root, story_id, Some(&agent_name), None)
                            .await
                        {
                            // Logged only: the next auto-assign pass retries.
                            slog!(
                                "[auto-assign] Failed to start '{agent_name}' for '{story_id}': {e}"
                            );
                        }
                    }
                    None => {
                        // No free agents of this type — stop scanning this stage.
                        slog!(
                            "[auto-assign] All {:?} agents busy; remaining items in {stage_dir}/ will wait.",
                            stage
                        );
                        break;
                    }
                }
            }
        }
    }
    /// Reconcile stories whose agent work was committed while the server was offline.
    ///
    /// On server startup the in-memory agent pool is empty, so any story that an agent
    /// completed during a previous session is stuck: the worktree has committed work but
    /// the pipeline never advanced. This method detects those stories, re-runs the
    /// acceptance gates, and advances the pipeline stage so that `auto_assign_available_work`
    /// (called immediately after) picks up the right next-stage agents.
    ///
    /// Progress is streamed to `progress_tx` as `ReconciliationEvent`s; a final event
    /// with `status: "done"` and an empty `story_id` marks completion.
    ///
    /// Algorithm:
    /// 1. List all worktree directories under `{project_root}/.story_kit/worktrees/`.
    /// 2. For each worktree, check whether its feature branch has commits ahead of the
    ///    base branch (`master` / `main`).
    /// 3. If committed work is found AND the story is in `2_current/` or `3_qa/`:
    ///    - Run acceptance gates (uncommitted-change check + clippy + tests).
    ///    - On pass + `2_current/`: advance according to the story's QA mode
    ///      (server → `4_merge/`; agent → `3_qa/`; human/spike → `3_qa/` with review hold).
    ///    - On pass + `3_qa/`: run the coverage gate; if that also passes move to `4_merge/`.
    ///    - On failure: leave the story where it is so `auto_assign_available_work` can
    ///      start a fresh agent to retry.
    /// 4. Stories in `4_merge/` are left for `auto_assign_available_work` to handle via a
    ///    fresh mergemaster (squash-merge must be re-executed by the mergemaster agent).
    pub async fn reconcile_on_startup(
        &self,
        project_root: &Path,
        progress_tx: &broadcast::Sender<ReconciliationEvent>,
    ) {
        let worktrees = match worktree::list_worktrees(project_root) {
            Ok(wt) => wt,
            Err(e) => {
                // Cannot enumerate worktrees — report and bail out early.
                eprintln!("[startup:reconcile] Failed to list worktrees: {e}");
                let _ = progress_tx.send(ReconciliationEvent {
                    story_id: String::new(),
                    status: "done".to_string(),
                    message: format!("Reconciliation failed: {e}"),
                });
                return;
            }
        };
        for wt_entry in &worktrees {
            let story_id = &wt_entry.story_id;
            let wt_path = wt_entry.path.clone();
            // Determine which active stage the story is in.
            let stage_dir = match find_active_story_stage(project_root, story_id) {
                Some(s) => s,
                None => continue, // Not in any active stage (backlog/archived or unknown).
            };
            // 4_merge/ is left for auto_assign to handle with a fresh mergemaster.
            if stage_dir == "4_merge" {
                continue;
            }
            let _ = progress_tx.send(ReconciliationEvent {
                story_id: story_id.clone(),
                status: "checking".to_string(),
                message: format!("Checking for committed work in {stage_dir}/"),
            });
            // Check whether the worktree has commits ahead of the base branch.
            // A panicked check task is treated as "no work" (skip the story).
            let wt_path_for_check = wt_path.clone();
            let has_work = tokio::task::spawn_blocking(move || {
                super::gates::worktree_has_committed_work(&wt_path_for_check)
            })
            .await
            .unwrap_or(false);
            if !has_work {
                eprintln!(
                    "[startup:reconcile] No committed work for '{story_id}' in {stage_dir}/; skipping."
                );
                let _ = progress_tx.send(ReconciliationEvent {
                    story_id: story_id.clone(),
                    status: "skipped".to_string(),
                    message: "No committed work found; skipping.".to_string(),
                });
                continue;
            }
            eprintln!(
                "[startup:reconcile] Found committed work for '{story_id}' in {stage_dir}/. Running acceptance gates."
            );
            let _ = progress_tx.send(ReconciliationEvent {
                story_id: story_id.clone(),
                status: "gates_running".to_string(),
                message: "Running acceptance gates…".to_string(),
            });
            // Run acceptance gates on the worktree.
            let wt_path_for_gates = wt_path.clone();
            let gates_result = tokio::task::spawn_blocking(move || {
                super::gates::check_uncommitted_changes(&wt_path_for_gates)?;
                super::gates::run_acceptance_gates(&wt_path_for_gates)
            })
            .await;
            // Three outcomes: gates ran (pair), gate error (Ok(Err)), task panic (Err).
            let (gates_passed, gate_output) = match gates_result {
                Ok(Ok(pair)) => pair,
                Ok(Err(e)) => {
                    eprintln!("[startup:reconcile] Gate check error for '{story_id}': {e}");
                    let _ = progress_tx.send(ReconciliationEvent {
                        story_id: story_id.clone(),
                        status: "failed".to_string(),
                        message: format!("Gate error: {e}"),
                    });
                    continue;
                }
                Err(e) => {
                    eprintln!("[startup:reconcile] Gate check task panicked for '{story_id}': {e}");
                    let _ = progress_tx.send(ReconciliationEvent {
                        story_id: story_id.clone(),
                        status: "failed".to_string(),
                        message: format!("Gate task panicked: {e}"),
                    });
                    continue;
                }
            };
            if !gates_passed {
                eprintln!(
                    "[startup:reconcile] Gates failed for '{story_id}': {gate_output}\n\
                     Leaving in {stage_dir}/ for auto-assign to restart the agent."
                );
                let _ = progress_tx.send(ReconciliationEvent {
                    story_id: story_id.clone(),
                    status: "failed".to_string(),
                    message: "Gates failed; will be retried by auto-assign.".to_string(),
                });
                continue;
            }
            eprintln!("[startup:reconcile] Gates passed for '{story_id}' (stage: {stage_dir}/).");
            if stage_dir == "2_current" {
                // Coder stage — determine qa mode to decide next step.
                // Spikes always require human QA; otherwise the story's front
                // matter (falling back to the project default) decides.
                let qa_mode = {
                    let item_type = super::lifecycle::item_type_from_id(story_id);
                    if item_type == "spike" {
                        crate::io::story_metadata::QaMode::Human
                    } else {
                        let default_qa = crate::config::ProjectConfig::load(project_root)
                            .unwrap_or_default()
                            .default_qa_mode();
                        let story_path = project_root
                            .join(".story_kit/work/2_current")
                            .join(format!("{story_id}.md"));
                        crate::io::story_metadata::resolve_qa_mode(&story_path, default_qa)
                    }
                };
                match qa_mode {
                    crate::io::story_metadata::QaMode::Server => {
                        // Server-side QA already ran (the gates above) — skip 3_qa/.
                        if let Err(e) = super::lifecycle::move_story_to_merge(project_root, story_id) {
                            eprintln!("[startup:reconcile] Failed to move '{story_id}' to 4_merge/: {e}");
                            let _ = progress_tx.send(ReconciliationEvent {
                                story_id: story_id.clone(),
                                status: "failed".to_string(),
                                message: format!("Failed to advance to merge: {e}"),
                            });
                        } else {
                            eprintln!("[startup:reconcile] Moved '{story_id}' → 4_merge/ (qa: server).");
                            let _ = progress_tx.send(ReconciliationEvent {
                                story_id: story_id.clone(),
                                status: "advanced".to_string(),
                                message: "Gates passed — moved to merge (qa: server).".to_string(),
                            });
                        }
                    }
                    crate::io::story_metadata::QaMode::Agent => {
                        if let Err(e) = super::lifecycle::move_story_to_qa(project_root, story_id) {
                            eprintln!("[startup:reconcile] Failed to move '{story_id}' to 3_qa/: {e}");
                            let _ = progress_tx.send(ReconciliationEvent {
                                story_id: story_id.clone(),
                                status: "failed".to_string(),
                                message: format!("Failed to advance to QA: {e}"),
                            });
                        } else {
                            eprintln!("[startup:reconcile] Moved '{story_id}' → 3_qa/.");
                            let _ = progress_tx.send(ReconciliationEvent {
                                story_id: story_id.clone(),
                                status: "advanced".to_string(),
                                message: "Gates passed — moved to QA.".to_string(),
                            });
                        }
                    }
                    crate::io::story_metadata::QaMode::Human => {
                        // Human QA: move to 3_qa/ and mark review_hold so
                        // auto-assign leaves it alone.
                        if let Err(e) = super::lifecycle::move_story_to_qa(project_root, story_id) {
                            eprintln!("[startup:reconcile] Failed to move '{story_id}' to 3_qa/: {e}");
                            let _ = progress_tx.send(ReconciliationEvent {
                                story_id: story_id.clone(),
                                status: "failed".to_string(),
                                message: format!("Failed to advance to QA: {e}"),
                            });
                        } else {
                            let story_path = project_root
                                .join(".story_kit/work/3_qa")
                                .join(format!("{story_id}.md"));
                            if let Err(e) = crate::io::story_metadata::write_review_hold(&story_path) {
                                eprintln!(
                                    "[startup:reconcile] Failed to set review_hold on '{story_id}': {e}"
                                );
                            }
                            eprintln!("[startup:reconcile] Moved '{story_id}' → 3_qa/ (qa: human — holding for review).");
                            let _ = progress_tx.send(ReconciliationEvent {
                                story_id: story_id.clone(),
                                status: "review_hold".to_string(),
                                message: "Gates passed — holding for human review.".to_string(),
                            });
                        }
                    }
                }
            } else if stage_dir == "3_qa" {
                // QA stage → run coverage gate before advancing to merge.
                let wt_path_for_cov = wt_path.clone();
                let coverage_result = tokio::task::spawn_blocking(move || {
                    super::gates::run_coverage_gate(&wt_path_for_cov)
                })
                .await;
                let (coverage_passed, coverage_output) = match coverage_result {
                    Ok(Ok(pair)) => pair,
                    Ok(Err(e)) => {
                        eprintln!("[startup:reconcile] Coverage gate error for '{story_id}': {e}");
                        let _ = progress_tx.send(ReconciliationEvent {
                            story_id: story_id.clone(),
                            status: "failed".to_string(),
                            message: format!("Coverage gate error: {e}"),
                        });
                        continue;
                    }
                    Err(e) => {
                        eprintln!(
                            "[startup:reconcile] Coverage gate panicked for '{story_id}': {e}"
                        );
                        let _ = progress_tx.send(ReconciliationEvent {
                            story_id: story_id.clone(),
                            status: "failed".to_string(),
                            message: format!("Coverage gate panicked: {e}"),
                        });
                        continue;
                    }
                };
                if coverage_passed {
                    // Check whether this item needs human review before merging.
                    let needs_human_review = {
                        let item_type = super::lifecycle::item_type_from_id(story_id);
                        if item_type == "spike" {
                            true
                        } else {
                            let story_path = project_root
                                .join(".story_kit/work/3_qa")
                                .join(format!("{story_id}.md"));
                            let default_qa = crate::config::ProjectConfig::load(project_root)
                                .unwrap_or_default()
                                .default_qa_mode();
                            matches!(
                                crate::io::story_metadata::resolve_qa_mode(&story_path, default_qa),
                                crate::io::story_metadata::QaMode::Human
                            )
                        }
                    };
                    if needs_human_review {
                        // Mark review_hold in place; the story stays in 3_qa/.
                        let story_path = project_root
                            .join(".story_kit/work/3_qa")
                            .join(format!("{story_id}.md"));
                        if let Err(e) = crate::io::story_metadata::write_review_hold(&story_path) {
                            eprintln!(
                                "[startup:reconcile] Failed to set review_hold on '{story_id}': {e}"
                            );
                        }
                        eprintln!(
                            "[startup:reconcile] '{story_id}' passed QA — holding for human review."
                        );
                        let _ = progress_tx.send(ReconciliationEvent {
                            story_id: story_id.clone(),
                            status: "review_hold".to_string(),
                            message: "Passed QA — waiting for human review.".to_string(),
                        });
                    } else if let Err(e) =
                        super::lifecycle::move_story_to_merge(project_root, story_id)
                    {
                        eprintln!(
                            "[startup:reconcile] Failed to move '{story_id}' to 4_merge/: {e}"
                        );
                        let _ = progress_tx.send(ReconciliationEvent {
                            story_id: story_id.clone(),
                            status: "failed".to_string(),
                            message: format!("Failed to advance to merge: {e}"),
                        });
                    } else {
                        eprintln!("[startup:reconcile] Moved '{story_id}' → 4_merge/.");
                        let _ = progress_tx.send(ReconciliationEvent {
                            story_id: story_id.clone(),
                            status: "advanced".to_string(),
                            message: "Gates passed — moved to merge.".to_string(),
                        });
                    }
                } else {
                    eprintln!(
                        "[startup:reconcile] Coverage gate failed for '{story_id}': {coverage_output}\n\
                         Leaving in 3_qa/ for auto-assign to restart the QA agent."
                    );
                    let _ = progress_tx.send(ReconciliationEvent {
                        story_id: story_id.clone(),
                        status: "failed".to_string(),
                        message: "Coverage gate failed; will be retried.".to_string(),
                    });
                }
            }
        }
        // Signal that reconciliation is complete.
        let _ = progress_tx.send(ReconciliationEvent {
            story_id: String::new(),
            status: "done".to_string(),
            message: "Startup reconciliation complete.".to_string(),
        });
    }
/// Test helper: inject an agent with a completion report and project_root
/// for testing pipeline advance logic without spawning real agents.
#[cfg(test)]
pub fn inject_test_agent_with_completion(
&self,
story_id: &str,
agent_name: &str,
status: AgentStatus,
project_root: PathBuf,
completion: CompletionReport,
) -> broadcast::Sender<AgentEvent> {
let (tx, _) = broadcast::channel::<AgentEvent>(64);
let key = composite_key(story_id, agent_name);
let mut agents = self.agents.lock().unwrap();
agents.insert(
key,
StoryAgent {
agent_name: agent_name.to_string(),
status,
worktree_info: None,
session_id: None,
tx: tx.clone(),
task_handle: None,
event_log: Arc::new(Mutex::new(Vec::new())),
completion: Some(completion),
project_root: Some(project_root),
log_session_id: None,
merge_failure_reported: false,
},
);
tx
}
/// Inject a Running agent with a pre-built (possibly finished) task handle.
/// Used by watchdog tests to simulate an orphaned agent.
#[cfg(test)]
pub fn inject_test_agent_with_handle(
&self,
story_id: &str,
agent_name: &str,
status: AgentStatus,
task_handle: tokio::task::JoinHandle<()>,
) -> broadcast::Sender<AgentEvent> {
let (tx, _) = broadcast::channel::<AgentEvent>(64);
let key = composite_key(story_id, agent_name);
let mut agents = self.agents.lock().unwrap();
agents.insert(
key,
StoryAgent {
agent_name: agent_name.to_string(),
status,
worktree_info: None,
session_id: None,
tx: tx.clone(),
task_handle: Some(task_handle),
event_log: Arc::new(Mutex::new(Vec::new())),
completion: None,
project_root: None,
log_session_id: None,
merge_failure_reported: false,
},
);
tx
}
/// Test helper: inject a child killer into the registry.
#[cfg(test)]
pub fn inject_child_killer(&self, key: &str, killer: Box<dyn ChildKiller + Send + Sync>) {
let mut killers = self.child_killers.lock().unwrap();
killers.insert(key.to_string(), killer);
}
/// Test helper: return the number of registered child killers.
#[cfg(test)]
pub fn child_killer_count(&self) -> usize {
self.child_killers.lock().unwrap().len()
}
/// Run a single watchdog pass synchronously (test helper).
#[cfg(test)]
pub fn run_watchdog_once(&self) {
check_orphaned_agents(&self.agents);
}
/// Spawn a background watchdog task that periodically checks for Running agents
/// whose underlying task has already finished (orphaned entries). Any such agent
/// is marked Failed and an Error event is emitted so that `wait_for_agent` unblocks.
///
/// The watchdog runs every 30 seconds. It is a safety net for edge cases where the
/// PTY read loop exits without updating the agent status (e.g. a panic in the
/// spawn_blocking task, or an external SIGKILL that closes the PTY fd immediately).
///
/// When orphaned agents are detected and a `project_root` is provided, auto-assign
/// is triggered so that free agents can pick up unassigned work.
pub fn spawn_watchdog(pool: Arc<AgentPool>, project_root: Option<PathBuf>) {
tokio::spawn(async move {
let mut interval = tokio::time::interval(std::time::Duration::from_secs(30));
loop {
interval.tick().await;
let found = check_orphaned_agents(&pool.agents);
if found > 0
&& let Some(ref root) = project_root
{
slog!("[watchdog] {found} orphaned agent(s) detected; triggering auto-assign.");
pool.auto_assign_available_work(root).await;
}
}
});
}
/// Remove all agent entries for a given story_id from the pool.
///
/// Called when a story is archived so that stale entries don't accumulate.
/// Returns the number of entries removed.
pub fn remove_agents_for_story(&self, story_id: &str) -> usize {
let mut agents = match self.agents.lock() {
Ok(a) => a,
Err(e) => {
slog_error!("[agents] Failed to lock pool for cleanup of '{story_id}': {e}");
return 0;
}
};
let prefix = format!("{story_id}:");
let keys_to_remove: Vec<String> = agents
.keys()
.filter(|k| k.starts_with(&prefix))
.cloned()
.collect();
let count = keys_to_remove.len();
for key in &keys_to_remove {
agents.remove(key);
}
if count > 0 {
slog!("[agents] Removed {count} agent entries for archived story '{story_id}'");
}
count
}
}
/// Return the active pipeline stage directory name for `story_id`, or `None` if the
/// story is not in any active stage (`2_current/`, `3_qa/`, `4_merge/`).
fn find_active_story_stage(project_root: &Path, story_id: &str) -> Option<&'static str> {
    // A story is "in" a stage when its markdown file exists under that stage dir.
    ["2_current", "3_qa", "4_merge"].into_iter().find(|stage| {
        project_root
            .join(".story_kit")
            .join("work")
            .join(stage)
            .join(format!("{story_id}.md"))
            .exists()
    })
}
/// Read the optional `agent:` field from the front matter of a story file.
///
/// Returns `Some(agent_name)` if the front matter specifies an agent, or `None`
/// if the field is absent or the file cannot be read / parsed.
///
/// (The rustdoc for `scan_stage_items` was previously stranded above this
/// function's own doc comment, so rustdoc attached the wrong description
/// here; it has been removed.)
fn read_story_front_matter_agent(
    project_root: &Path,
    stage_dir: &str,
    story_id: &str,
) -> Option<String> {
    use crate::io::story_metadata::parse_front_matter;
    // Story files live at .story_kit/work/<stage>/<story_id>.md.
    let path = project_root
        .join(".story_kit")
        .join("work")
        .join(stage_dir)
        .join(format!("{story_id}.md"));
    let contents = std::fs::read_to_string(path).ok()?;
    parse_front_matter(&contents).ok()?.agent
}
/// Return `true` if the story file in the given stage has `review_hold: true` in its front matter.
fn has_review_hold(project_root: &Path, stage_dir: &str, story_id: &str) -> bool {
    use crate::io::story_metadata::parse_front_matter;
    let path = project_root
        .join(".story_kit")
        .join("work")
        .join(stage_dir)
        .join(format!("{story_id}.md"));
    // An unreadable file, unparsable front matter, or an absent field all
    // count as "no hold".
    std::fs::read_to_string(path)
        .ok()
        .and_then(|contents| parse_front_matter(&contents).ok())
        .and_then(|meta| meta.review_hold)
        .unwrap_or(false)
}
/// Return `true` if `agent_name` has no active (pending/running) entry in the pool.
fn is_agent_free(agents: &HashMap<String, StoryAgent>, agent_name: &str) -> bool {
    // De Morgan form of "no entry is both this agent and active".
    agents.values().all(|entry| {
        entry.agent_name != agent_name
            || !matches!(entry.status, AgentStatus::Running | AgentStatus::Pending)
    })
}
/// Scan a work pipeline stage directory and return story IDs (file stems of
/// `*.md` files), sorted alphabetically.
///
/// Returns an empty `Vec` if the directory does not exist or cannot be read.
fn scan_stage_items(project_root: &Path, stage_dir: &str) -> Vec<String> {
    let dir = project_root.join(".story_kit").join("work").join(stage_dir);
    // A single read_dir covers "missing", "not a directory", and "unreadable";
    // the previous separate is_dir() pre-check was a redundant TOCTOU window.
    let Ok(entries) = std::fs::read_dir(&dir) else {
        return Vec::new();
    };
    let mut items: Vec<String> = entries
        .flatten()
        .filter_map(|entry| {
            let path = entry.path();
            if path.extension().and_then(|e| e.to_str()) == Some("md") {
                path.file_stem().and_then(|s| s.to_str()).map(String::from)
            } else {
                None
            }
        })
        .collect();
    items.sort();
    items
}
/// Return `true` if `story_id` has any active (pending/running) agent matching `stage`.
///
/// Uses the explicit `stage` config field when the agent is found in `config`;
/// falls back to the legacy name-based heuristic for unlisted agents.
fn is_story_assigned_for_stage(
    config: &ProjectConfig,
    agents: &HashMap<String, StoryAgent>,
    story_id: &str,
    stage: &PipelineStage,
) -> bool {
    for (key, agent) in agents.iter() {
        // Composite key format: "{story_id}:{agent_name}"
        let key_story_id = match key.rsplit_once(':') {
            Some((sid, _)) => sid,
            None => key.as_str(),
        };
        // Only active agents on this exact story are candidates.
        if key_story_id != story_id
            || !matches!(agent.status, AgentStatus::Running | AgentStatus::Pending)
        {
            continue;
        }
        let agent_stage = config
            .find_agent(&agent.agent_name)
            .map(agent_config_stage)
            .unwrap_or_else(|| pipeline_stage(&agent.agent_name));
        if agent_stage == *stage {
            return true;
        }
    }
    false
}
/// Find the first configured agent for `stage` that has no active (pending/running) assignment.
/// Returns `None` if all agents for that stage are busy or none are configured.
/// Uses the agent's explicit `stage` config field (preferred) or falls back to name-based detection.
fn find_free_agent_for_stage<'a>(
config: &'a ProjectConfig,
agents: &HashMap<String, StoryAgent>,
stage: &PipelineStage,
) -> Option<&'a str> {
for agent_config in &config.agent {
if agent_config_stage(agent_config) != *stage {
continue;
}
let is_busy = agents.values().any(|a| {
a.agent_name == agent_config.name
&& matches!(a.status, AgentStatus::Running | AgentStatus::Pending)
});
if !is_busy {
return Some(&agent_config.name);
}
}
None
}
/// Scan the agent pool for Running entries whose backing tokio task has already
/// finished and mark them as Failed.
///
/// Returns the number of agents transitioned to `Failed` by this pass.
///
/// This handles the case where the PTY read loop or the spawned task exits
/// without updating the agent status — for example when the process is killed
/// externally and the PTY master fd returns EOF before our inactivity timeout
/// fires, but some other edge case prevents the normal cleanup path from running.
fn check_orphaned_agents(agents: &Mutex<HashMap<String, StoryAgent>>) -> usize {
    let mut lock = match agents.lock() {
        Ok(l) => l,
        // Poisoned lock (another thread panicked): skip this watchdog pass.
        Err(_) => return 0,
    };
    // Collect orphaned entries: Running or Pending agents whose task handle is finished.
    // Pending agents can be orphaned if worktree creation panics before setting status.
    // Two-phase (collect, then mutate) so the map is not mutated while iterating it;
    // the lock is held across both phases so no entry can change in between.
    let orphaned: Vec<(String, String, broadcast::Sender<AgentEvent>, AgentStatus)> = lock
        .iter()
        .filter_map(|(key, agent)| {
            if matches!(agent.status, AgentStatus::Running | AgentStatus::Pending)
                && let Some(handle) = &agent.task_handle
                && handle.is_finished()
            {
                // Composite key format is "{story_id}:{agent_name}".
                let story_id = key
                    .rsplit_once(':')
                    .map(|(s, _)| s.to_string())
                    .unwrap_or_else(|| key.clone());
                return Some((
                    key.clone(),
                    story_id,
                    agent.tx.clone(),
                    agent.status.clone(),
                ));
            }
            None
        })
        .collect();
    let count = orphaned.len();
    for (key, story_id, tx, prev_status) in orphaned {
        if let Some(agent) = lock.get_mut(&key) {
            agent.status = AgentStatus::Failed;
            slog!(
                "[watchdog] Orphaned agent '{key}': task finished but status was {prev_status}. \
                Marking Failed."
            );
            // Broadcast send only fails when there are no subscribers, which
            // is fine here — the result is deliberately ignored.
            let _ = tx.send(AgentEvent::Error {
                story_id,
                agent_name: agent.agent_name.clone(),
                message: "Agent process terminated unexpectedly (watchdog detected orphan)"
                    .to_string(),
            });
        }
    }
    count
}
/// Server-owned completion: runs acceptance gates when an agent process exits
/// normally, and advances the pipeline based on results.
///
/// This is a **free function** (not a method on `AgentPool`) to break the
/// opaque type cycle that would otherwise arise: `start_agent` → spawned task
/// → server-owned completion → pipeline advance → `start_agent`.
///
/// If the agent already has a completion report (e.g. from a legacy
/// `report_completion` call), this is a no-op to avoid double-running gates.
async fn run_server_owned_completion(
    agents: &Arc<Mutex<HashMap<String, StoryAgent>>>,
    port: u16,
    story_id: &str,
    agent_name: &str,
    session_id: Option<String>,
    watcher_tx: broadcast::Sender<WatcherEvent>,
) {
    let key = composite_key(story_id, agent_name);
    // Guard: skip if completion was already recorded (legacy path).
    // NOTE(review): a poisoned lock silently aborts completion here and in the
    // scopes below — presumably intentional best-effort behavior; confirm.
    {
        let lock = match agents.lock() {
            Ok(a) => a,
            Err(_) => return,
        };
        match lock.get(&key) {
            Some(agent) if agent.completion.is_some() => {
                slog!(
                    "[agents] Completion already recorded for '{story_id}:{agent_name}'; \
                    skipping server-owned gates."
                );
                return;
            }
            Some(_) => {}
            // Unknown key: nothing to complete.
            None => return,
        }
    }
    // Get worktree path for running gates. Separate short lock scope: the
    // gates below run blocking work, so the map must not stay locked.
    let worktree_path = {
        let lock = match agents.lock() {
            Ok(a) => a,
            Err(_) => return,
        };
        lock.get(&key)
            .and_then(|a| a.worktree_info.as_ref().map(|wt| wt.path.clone()))
    };
    // Run acceptance gates. spawn_blocking keeps the subprocess/git work off
    // the async executor threads.
    let (gates_passed, gate_output) = if let Some(wt_path) = worktree_path {
        let path = wt_path;
        match tokio::task::spawn_blocking(move || {
            super::gates::check_uncommitted_changes(&path)?;
            super::gates::run_acceptance_gates(&path)
        })
        .await
        {
            Ok(Ok(result)) => result,
            // Gate-level failure: the Err string doubles as the gate output.
            Ok(Err(e)) => (false, e),
            // spawn_blocking panicked inside the closure.
            Err(e) => (false, format!("Gate check task panicked: {e}")),
        }
    } else {
        (
            false,
            "No worktree path available to run acceptance gates".to_string(),
        )
    };
    slog!(
        "[agents] Server-owned completion for '{story_id}:{agent_name}': gates_passed={gates_passed}"
    );
    let report = CompletionReport {
        summary: "Agent process exited normally".to_string(),
        gates_passed,
        gate_output,
    };
    // Store completion report, extract data for pipeline advance, then
    // remove the entry so completed agents never appear in list_agents.
    let (tx, project_root_for_advance, wt_path_for_advance, merge_failure_reported_for_advance) = {
        let mut lock = match agents.lock() {
            Ok(a) => a,
            Err(_) => return,
        };
        let agent = match lock.get_mut(&key) {
            Some(a) => a,
            None => return,
        };
        agent.completion = Some(report.clone());
        agent.session_id = session_id.clone();
        let tx = agent.tx.clone();
        let pr = agent.project_root.clone();
        let wt = agent.worktree_info.as_ref().map(|w| w.path.clone());
        let mfr = agent.merge_failure_reported;
        lock.remove(&key);
        (tx, pr, wt, mfr)
    };
    // Emit Done so wait_for_agent unblocks.
    let _ = tx.send(AgentEvent::Done {
        story_id: story_id.to_string(),
        agent_name: agent_name.to_string(),
        session_id,
    });
    // Notify WebSocket clients that the agent is gone.
    AgentPool::notify_agent_state_changed(&watcher_tx);
    // Advance the pipeline state machine in a background task.
    spawn_pipeline_advance(
        Arc::clone(agents),
        port,
        story_id,
        agent_name,
        report,
        project_root_for_advance,
        wt_path_for_advance,
        watcher_tx,
        merge_failure_reported_for_advance,
    );
}
/// Spawn pipeline advancement as a background task.
///
/// This is a **non-async** function so it does not participate in the opaque
/// type cycle between `start_agent` and `run_server_owned_completion`.
#[allow(clippy::too_many_arguments)]
fn spawn_pipeline_advance(
    agents: Arc<Mutex<HashMap<String, StoryAgent>>>,
    port: u16,
    story_id: &str,
    agent_name: &str,
    completion: CompletionReport,
    project_root: Option<PathBuf>,
    worktree_path: Option<PathBuf>,
    watcher_tx: broadcast::Sender<WatcherEvent>,
    merge_failure_reported: bool,
) {
    // Own the identifiers so the spawned task is 'static.
    let story = story_id.to_owned();
    let agent = agent_name.to_owned();
    tokio::spawn(async move {
        // Build a lightweight pool handle around the shared agent map; the
        // killer/merge-job maps start empty and are private to this task.
        let pool = AgentPool {
            agents,
            port,
            child_killers: Arc::new(Mutex::new(HashMap::new())),
            watcher_tx,
            merge_jobs: Arc::new(Mutex::new(HashMap::new())),
        };
        pool.run_pipeline_advance(
            &story,
            &agent,
            completion,
            project_root,
            worktree_path,
            merge_failure_reported,
        )
        .await;
    });
}
#[cfg(test)]
mod tests {
use super::*;
use crate::agents::merge::{MergeJob, MergeJobStatus};
use crate::agents::{
AgentEvent, AgentStatus, CompletionReport, PipelineStage, ReconciliationEvent,
lifecycle::move_story_to_archived,
};
use crate::config::ProjectConfig;
use crate::io::watcher::WatcherEvent;
use portable_pty::{CommandBuilder, PtySize, native_pty_system};
use std::collections::HashMap;
use std::path::PathBuf;
use std::process::Command;
use tokio::sync::broadcast;
/// Initialise an empty git repo (with a test identity) and create an
/// initial empty commit, panicking if any git invocation fails to spawn.
fn init_git_repo(repo: &std::path::Path) {
    let steps: [&[&str]; 4] = [
        &["init"],
        &["config", "user.email", "test@test.com"],
        &["config", "user.name", "Test"],
        &["commit", "--allow-empty", "-m", "init"],
    ];
    for args in steps {
        Command::new("git")
            .args(args)
            .current_dir(repo)
            .output()
            .unwrap();
    }
}
/// Parse an inline TOML string into a `ProjectConfig`, panicking on invalid input.
fn make_config(toml_str: &str) -> ProjectConfig {
    ProjectConfig::parse(toml_str).unwrap()
}
#[tokio::test]
async fn wait_for_agent_returns_immediately_if_completed() {
    // A terminal-state agent must be returned without waiting for events.
    let pool = AgentPool::new_test(3001);
    pool.inject_test_agent("s1", "bot", AgentStatus::Completed);
    let snapshot = pool.wait_for_agent("s1", "bot", 1000).await.unwrap();
    assert_eq!(snapshot.story_id, "s1");
    assert_eq!(snapshot.agent_name, "bot");
    assert_eq!(snapshot.status, AgentStatus::Completed);
}
#[tokio::test]
async fn wait_for_agent_returns_immediately_if_failed() {
    // Failed is also terminal: no waiting, status reported as-is.
    let pool = AgentPool::new_test(3001);
    pool.inject_test_agent("s2", "bot", AgentStatus::Failed);
    let snapshot = pool.wait_for_agent("s2", "bot", 1000).await.unwrap();
    assert_eq!(snapshot.status, AgentStatus::Failed);
}
#[tokio::test]
async fn wait_for_agent_completes_on_done_event() {
    // Scenario: a Running agent whose Done event arrives later on its channel.
    // wait_for_agent must unblock on the event rather than hitting the timeout.
    let pool = AgentPool::new_test(3001);
    let tx = pool.inject_test_agent("s3", "bot", AgentStatus::Running);
    // Send Done event after a short delay
    let tx_clone = tx.clone();
    tokio::spawn(async move {
        tokio::time::sleep(std::time::Duration::from_millis(50)).await;
        // Mark status via event; real code also updates the map, but for
        // this unit test the map entry stays Running — we verify the
        // wait loop reacts to the event.
        let _ = tx_clone.send(AgentEvent::Done {
            story_id: "s3".to_string(),
            agent_name: "bot".to_string(),
            session_id: Some("sess-abc".to_string()),
        });
    });
    let info = pool.wait_for_agent("s3", "bot", 2000).await.unwrap();
    // Status comes from the map entry (Running in this unit test)
    // — the important thing is that wait_for_agent returned without timing out.
    assert_eq!(info.story_id, "s3");
}
#[tokio::test]
async fn wait_for_agent_times_out() {
    // A Running agent with no events must hit the timeout path.
    let pool = AgentPool::new_test(3001);
    pool.inject_test_agent("s4", "bot", AgentStatus::Running);
    let outcome = pool.wait_for_agent("s4", "bot", 50).await;
    assert!(outcome.is_err());
    let msg = outcome.unwrap_err();
    assert!(msg.contains("Timed out"), "unexpected message: {msg}");
}
#[tokio::test]
async fn wait_for_agent_errors_for_nonexistent() {
    // Waiting on a story/agent pair that was never injected must fail.
    let pool = AgentPool::new_test(3001);
    assert!(pool.wait_for_agent("no_story", "no_bot", 100).await.is_err());
}
#[tokio::test]
async fn wait_for_agent_completes_on_stopped_status_event() {
    // A Status event carrying "stopped" must also unblock wait_for_agent.
    let pool = AgentPool::new_test(3001);
    let tx = pool.inject_test_agent("s5", "bot", AgentStatus::Running);
    let tx_clone = tx.clone();
    tokio::spawn(async move {
        tokio::time::sleep(std::time::Duration::from_millis(30)).await;
        let _ = tx_clone.send(AgentEvent::Status {
            story_id: "s5".to_string(),
            agent_name: "bot".to_string(),
            status: "stopped".to_string(),
        });
    });
    let info = pool.wait_for_agent("s5", "bot", 2000).await.unwrap();
    assert_eq!(info.story_id, "s5");
}
// ── report_completion tests ────────────────────────────────────
#[tokio::test]
async fn report_completion_rejects_nonexistent_agent() {
    // Completion for an unknown story/agent pair must be rejected.
    let pool = AgentPool::new_test(3001);
    let outcome = pool.report_completion("no_story", "no_bot", "done").await;
    let msg = match outcome {
        Ok(_) => panic!("expected error for nonexistent agent"),
        Err(m) => m,
    };
    assert!(msg.contains("No agent"), "unexpected: {msg}");
}
#[tokio::test]
async fn report_completion_rejects_non_running_agent() {
    // Only Running agents may report completion; a Completed one is rejected.
    let pool = AgentPool::new_test(3001);
    pool.inject_test_agent("s6", "bot", AgentStatus::Completed);
    let outcome = pool.report_completion("s6", "bot", "done").await;
    let msg = match outcome {
        Ok(_) => panic!("expected error for non-running agent"),
        Err(m) => m,
    };
    assert!(
        msg.contains("not running"),
        "expected 'not running' in: {msg}"
    );
}
#[tokio::test]
async fn report_completion_rejects_dirty_worktree() {
use std::fs;
use tempfile::tempdir;
let tmp = tempdir().unwrap();
let repo = tmp.path();
// Init a real git repo and make an initial commit
Command::new("git")
.args(["init"])
.current_dir(repo)
.output()
.unwrap();
Command::new("git")
.args(["commit", "--allow-empty", "-m", "init"])
.current_dir(repo)
.output()
.unwrap();
// Write an uncommitted file
fs::write(repo.join("dirty.txt"), "not committed").unwrap();
let pool = AgentPool::new_test(3001);
pool.inject_test_agent_with_path("s7", "bot", AgentStatus::Running, repo.to_path_buf());
let result = pool.report_completion("s7", "bot", "done").await;
assert!(result.is_err());
let msg = result.unwrap_err();
assert!(
msg.contains("uncommitted"),
"expected 'uncommitted' in: {msg}"
);
}
// ── server-owned completion tests ───────────────────────────────────────────
#[tokio::test]
async fn server_owned_completion_skips_when_already_completed() {
    // Scenario: completion was already recorded (legacy path); the
    // server-owned path must be a no-op — no gate re-run, no Done event.
    let pool = AgentPool::new_test(3001);
    let report = CompletionReport {
        summary: "Already done".to_string(),
        gates_passed: true,
        gate_output: String::new(),
    };
    pool.inject_test_agent_with_completion(
        "s10",
        "coder-1",
        AgentStatus::Completed,
        PathBuf::from("/tmp/nonexistent"),
        report,
    );
    // Subscribe before calling so we can check if Done event was emitted.
    let mut rx = pool.subscribe("s10", "coder-1").unwrap();
    run_server_owned_completion(
        &pool.agents,
        pool.port,
        "s10",
        "coder-1",
        Some("sess-1".to_string()),
        pool.watcher_tx.clone(),
    )
    .await;
    // Status should remain Completed (unchanged) — no gate re-run.
    let agents = pool.agents.lock().unwrap();
    let key = composite_key("s10", "coder-1");
    let agent = agents.get(&key).unwrap();
    assert_eq!(agent.status, AgentStatus::Completed);
    // Summary should still be the original, not overwritten.
    assert_eq!(agent.completion.as_ref().unwrap().summary, "Already done");
    drop(agents);
    // No Done event should have been emitted.
    assert!(
        rx.try_recv().is_err(),
        "should not emit Done when completion already exists"
    );
}
#[tokio::test]
async fn server_owned_completion_runs_gates_on_clean_worktree() {
    // Scenario: a clean repo — gates run, the entry is removed from the map,
    // and a Done event carries the session id through.
    use tempfile::tempdir;
    let tmp = tempdir().unwrap();
    let repo = tmp.path();
    init_git_repo(repo);
    let pool = AgentPool::new_test(3001);
    pool.inject_test_agent_with_path(
        "s11",
        "coder-1",
        AgentStatus::Running,
        repo.to_path_buf(),
    );
    let mut rx = pool.subscribe("s11", "coder-1").unwrap();
    run_server_owned_completion(
        &pool.agents,
        pool.port,
        "s11",
        "coder-1",
        Some("sess-2".to_string()),
        pool.watcher_tx.clone(),
    )
    .await;
    // Agent entry should be removed from the map after completion.
    let agents = pool.agents.lock().unwrap();
    let key = composite_key("s11", "coder-1");
    assert!(
        agents.get(&key).is_none(),
        "agent should be removed from map after completion"
    );
    drop(agents);
    // A Done event should have been emitted with the session_id.
    let event = rx.try_recv().expect("should emit Done event");
    match &event {
        AgentEvent::Done { session_id, .. } => {
            assert_eq!(*session_id, Some("sess-2".to_string()));
        }
        other => panic!("expected Done event, got: {other:?}"),
    }
}
#[tokio::test]
async fn server_owned_completion_fails_on_dirty_worktree() {
    // Scenario: uncommitted changes — gates fail, but the entry is still
    // removed and Done is still emitted so waiters unblock.
    use std::fs;
    use tempfile::tempdir;
    let tmp = tempdir().unwrap();
    let repo = tmp.path();
    init_git_repo(repo);
    // Create an uncommitted file.
    fs::write(repo.join("dirty.txt"), "not committed").unwrap();
    let pool = AgentPool::new_test(3001);
    pool.inject_test_agent_with_path(
        "s12",
        "coder-1",
        AgentStatus::Running,
        repo.to_path_buf(),
    );
    let mut rx = pool.subscribe("s12", "coder-1").unwrap();
    run_server_owned_completion(
        &pool.agents,
        pool.port,
        "s12",
        "coder-1",
        None,
        pool.watcher_tx.clone(),
    )
    .await;
    // Agent entry should be removed from the map after completion (even on failure).
    let agents = pool.agents.lock().unwrap();
    let key = composite_key("s12", "coder-1");
    assert!(
        agents.get(&key).is_none(),
        "agent should be removed from map after failed completion"
    );
    drop(agents);
    // A Done event should have been emitted.
    let event = rx.try_recv().expect("should emit Done event");
    assert!(
        matches!(event, AgentEvent::Done { .. }),
        "expected Done event, got: {event:?}"
    );
}
#[tokio::test]
async fn server_owned_completion_nonexistent_agent_is_noop() {
    // Completion for an unknown key must silently return — no panic, no error.
    let pool = AgentPool::new_test(3001);
    let watcher = pool.watcher_tx.clone();
    run_server_owned_completion(&pool.agents, pool.port, "nonexistent", "bot", None, watcher)
        .await;
}
// ── pipeline advance tests ────────────────────────────────────────────────
#[tokio::test]
async fn pipeline_advance_coder_gates_pass_server_qa_moves_to_merge() {
    // Scenario: coder passes gates and the story has no qa: front matter,
    // so the project default ("server") applies and QA is skipped entirely.
    use std::fs;
    let tmp = tempfile::tempdir().unwrap();
    let root = tmp.path();
    // Set up story in 2_current/ (no qa frontmatter → uses project default "server")
    let current = root.join(".story_kit/work/2_current");
    fs::create_dir_all(&current).unwrap();
    fs::write(current.join("50_story_test.md"), "test").unwrap();
    let pool = AgentPool::new_test(3001);
    pool.run_pipeline_advance(
        "50_story_test",
        "coder-1",
        CompletionReport {
            summary: "done".to_string(),
            gates_passed: true,
            gate_output: String::new(),
        },
        Some(root.to_path_buf()),
        None,
        false,
    )
    .await;
    // With default qa: server, story skips QA and goes straight to 4_merge/
    assert!(
        root.join(".story_kit/work/4_merge/50_story_test.md")
            .exists(),
        "story should be in 4_merge/"
    );
    assert!(
        !current.join("50_story_test.md").exists(),
        "story should not still be in 2_current/"
    );
}
#[tokio::test]
async fn pipeline_advance_coder_gates_pass_agent_qa_moves_to_qa() {
    // Scenario: coder passes gates and the story opts into agent QA via
    // front matter, so it must land in 3_qa/ instead of skipping ahead.
    use std::fs;
    let tmp = tempfile::tempdir().unwrap();
    let root = tmp.path();
    // Set up story in 2_current/ with qa: agent frontmatter
    let current = root.join(".story_kit/work/2_current");
    fs::create_dir_all(&current).unwrap();
    fs::write(
        current.join("50_story_test.md"),
        "---\nname: Test\nqa: agent\n---\ntest",
    )
    .unwrap();
    let pool = AgentPool::new_test(3001);
    pool.run_pipeline_advance(
        "50_story_test",
        "coder-1",
        CompletionReport {
            summary: "done".to_string(),
            gates_passed: true,
            gate_output: String::new(),
        },
        Some(root.to_path_buf()),
        None,
        false,
    )
    .await;
    // With qa: agent, story should move to 3_qa/
    assert!(
        root.join(".story_kit/work/3_qa/50_story_test.md").exists(),
        "story should be in 3_qa/"
    );
    assert!(
        !current.join("50_story_test.md").exists(),
        "story should not still be in 2_current/"
    );
}
#[tokio::test]
async fn pipeline_advance_qa_gates_pass_moves_story_to_merge() {
    // Scenario: the QA agent passes its gates on a story already in 3_qa/;
    // the pipeline must advance the story to 4_merge/.
    use std::fs;
    let tmp = tempfile::tempdir().unwrap();
    let root = tmp.path();
    // Set up story in 3_qa/
    let qa_dir = root.join(".story_kit/work/3_qa");
    fs::create_dir_all(&qa_dir).unwrap();
    // qa: server so the story skips human review and goes straight to merge.
    fs::write(
        qa_dir.join("51_story_test.md"),
        "---\nname: Test\nqa: server\n---\ntest",
    )
    .unwrap();
    let pool = AgentPool::new_test(3001);
    pool.run_pipeline_advance(
        "51_story_test",
        "qa",
        CompletionReport {
            summary: "QA done".to_string(),
            gates_passed: true,
            gate_output: String::new(),
        },
        Some(root.to_path_buf()),
        None,
        false,
    )
    .await;
    // Story should have moved to 4_merge/
    assert!(
        root.join(".story_kit/work/4_merge/51_story_test.md")
            .exists(),
        "story should be in 4_merge/"
    );
    assert!(
        !qa_dir.join("51_story_test.md").exists(),
        "story should not still be in 3_qa/"
    );
}
#[tokio::test]
async fn pipeline_advance_supervisor_does_not_advance() {
    // Scenario: a supervisor agent completes — supervisors are not part of
    // the coder/qa pipeline, so the story must stay where it is.
    use std::fs;
    let tmp = tempfile::tempdir().unwrap();
    let root = tmp.path();
    let current = root.join(".story_kit/work/2_current");
    fs::create_dir_all(&current).unwrap();
    fs::write(current.join("52_story_test.md"), "test").unwrap();
    let pool = AgentPool::new_test(3001);
    pool.run_pipeline_advance(
        "52_story_test",
        "supervisor",
        CompletionReport {
            summary: "supervised".to_string(),
            gates_passed: true,
            gate_output: String::new(),
        },
        Some(root.to_path_buf()),
        None,
        false,
    )
    .await;
    // Story should NOT have moved (supervisors don't advance pipeline)
    assert!(
        current.join("52_story_test.md").exists(),
        "story should still be in 2_current/ for supervisor"
    );
}
#[tokio::test]
async fn pipeline_advance_sends_agent_state_changed_to_watcher_tx() {
    // Regression test for bug 173: the pipeline advance must notify via the
    // pool's REAL watcher_tx so UI lozenges refresh when agents are assigned.
    use std::fs;
    let tmp = tempfile::tempdir().unwrap();
    let root = tmp.path();
    // Set up story in 2_current/
    let current = root.join(".story_kit/work/2_current");
    fs::create_dir_all(&current).unwrap();
    fs::write(current.join("173_story_test.md"), "test").unwrap();
    // Ensure 3_qa/ exists for the move target
    fs::create_dir_all(root.join(".story_kit/work/3_qa")).unwrap();
    // Ensure 1_backlog/ exists (start_agent calls move_story_to_current)
    fs::create_dir_all(root.join(".story_kit/work/1_backlog")).unwrap();
    // Write a project.toml with a qa agent so start_agent can resolve it.
    fs::create_dir_all(root.join(".story_kit")).unwrap();
    fs::write(
        root.join(".story_kit/project.toml"),
        r#"
default_qa = "agent"
[[agent]]
name = "coder-1"
role = "Coder"
command = "echo"
args = ["noop"]
prompt = "test"
stage = "coder"
[[agent]]
name = "qa"
role = "QA"
command = "echo"
args = ["noop"]
prompt = "test"
stage = "qa"
"#,
    )
    .unwrap();
    let pool = AgentPool::new_test(3001);
    // Subscribe to the watcher channel BEFORE the pipeline advance.
    let mut rx = pool.watcher_tx.subscribe();
    // Call pipeline advance directly. This will:
    // 1. Move the story to 3_qa/
    // 2. Start the QA agent (which calls notify_agent_state_changed)
    // Note: the actual agent process will fail (no real worktree), but the
    // agent insertion and notification happen before the background spawn.
    pool.run_pipeline_advance(
        "173_story_test",
        "coder-1",
        CompletionReport {
            summary: "done".to_string(),
            gates_passed: true,
            gate_output: String::new(),
        },
        Some(root.to_path_buf()),
        None,
        false,
    )
    .await;
    // The pipeline advance should have sent AgentStateChanged events via
    // the pool's watcher_tx (not a dummy channel). Collect all events.
    let mut got_agent_state_changed = false;
    while let Ok(evt) = rx.try_recv() {
        if matches!(evt, WatcherEvent::AgentStateChanged) {
            got_agent_state_changed = true;
            break;
        }
    }
    assert!(
        got_agent_state_changed,
        "pipeline advance should send AgentStateChanged through the real watcher_tx \
        (bug 173: lozenges must update when agents are assigned during pipeline advance)"
    );
}
// ── auto-assign helper tests ───────────────────────────────────
#[test]
fn scan_stage_items_returns_empty_for_missing_dir() {
    // A project root with no .story_kit/work tree yields no items.
    let tmp = tempfile::tempdir().unwrap();
    assert!(scan_stage_items(tmp.path(), "2_current").is_empty());
}
#[test]
fn scan_stage_items_returns_sorted_story_ids() {
    use std::fs;
    let tmp = tempfile::tempdir().unwrap();
    let stage_dir = tmp.path().join(".story_kit").join("work").join("2_current");
    fs::create_dir_all(&stage_dir).unwrap();
    // Write .md stories out of order, plus a non-md file that must be ignored.
    for (file_name, body) in [
        ("42_story_foo.md", "---\nname: foo\n---"),
        ("10_story_bar.md", "---\nname: bar\n---"),
        ("5_story_baz.md", "---\nname: baz\n---"),
        ("README.txt", "ignore me"),
    ] {
        fs::write(stage_dir.join(file_name), body).unwrap();
    }
    // Lexicographic order: "10" < "42" < "5".
    assert_eq!(
        scan_stage_items(tmp.path(), "2_current"),
        vec!["10_story_bar", "42_story_foo", "5_story_baz"]
    );
}
#[test]
fn is_story_assigned_returns_true_for_running_coder() {
    let config = ProjectConfig::default();
    let pool = AgentPool::new_test(3001);
    pool.inject_test_agent("42_story_foo", "coder-1", AgentStatus::Running);
    let agents = pool.agents.lock().unwrap();
    // Local shorthand over the shared config/agents for the three probes.
    let assigned = |story: &str, stage: &PipelineStage| {
        is_story_assigned_for_stage(&config, &agents, story, stage)
    };
    // Running coder on this story → assigned for the Coder stage.
    assert!(assigned("42_story_foo", &PipelineStage::Coder));
    // Same story but wrong stage — should be false
    assert!(!assigned("42_story_foo", &PipelineStage::Qa));
    // Different story — should be false
    assert!(!assigned("99_story_other", &PipelineStage::Coder));
}
#[test]
fn is_story_assigned_returns_false_for_completed_agent() {
    // Completed agents don't count as assigned
    let config = ProjectConfig::default();
    let pool = AgentPool::new_test(3001);
    pool.inject_test_agent("42_story_foo", "coder-1", AgentStatus::Completed);
    let agents = pool.agents.lock().unwrap();
    let assigned =
        is_story_assigned_for_stage(&config, &agents, "42_story_foo", &PipelineStage::Coder);
    assert!(!assigned);
}
#[test]
fn is_story_assigned_uses_config_stage_field_for_nonstandard_names() {
    // Consistency: use the shared make_config helper instead of inlining
    // ProjectConfig::parse(...).unwrap().
    let config = make_config(
        r#"
[[agent]]
name = "qa-2"
stage = "qa"
"#,
    );
    let pool = AgentPool::new_test(3001);
    pool.inject_test_agent("42_story_foo", "qa-2", AgentStatus::Running);
    let agents = pool.agents.lock().unwrap();
    // qa-2 with stage=qa should be recognised as a QA agent
    assert!(
        is_story_assigned_for_stage(&config, &agents, "42_story_foo", &PipelineStage::Qa),
        "qa-2 should be detected as assigned to QA stage"
    );
    // Should NOT appear as a coder
    assert!(
        !is_story_assigned_for_stage(&config, &agents, "42_story_foo", &PipelineStage::Coder),
        "qa-2 should not be detected as a coder"
    );
}
#[test]
fn find_free_agent_returns_none_when_all_busy() {
    // Consistency: use the shared make_config helper for inline TOML configs.
    let config = make_config(
        r#"
[[agent]]
name = "coder-1"
[[agent]]
name = "coder-2"
"#,
    );
    let pool = AgentPool::new_test(3001);
    pool.inject_test_agent("s1", "coder-1", AgentStatus::Running);
    pool.inject_test_agent("s2", "coder-2", AgentStatus::Running);
    let agents = pool.agents.lock().unwrap();
    let free = find_free_agent_for_stage(&config, &agents, &PipelineStage::Coder);
    assert!(free.is_none(), "no free coders should be available");
}
#[test]
fn find_free_agent_returns_first_free_coder() {
    // Consistency: use the shared make_config helper for inline TOML configs.
    let config = make_config(
        r#"
[[agent]]
name = "coder-1"
[[agent]]
name = "coder-2"
[[agent]]
name = "coder-3"
"#,
    );
    let pool = AgentPool::new_test(3001);
    // coder-1 is busy, coder-2 is free
    pool.inject_test_agent("s1", "coder-1", AgentStatus::Running);
    let agents = pool.agents.lock().unwrap();
    let free = find_free_agent_for_stage(&config, &agents, &PipelineStage::Coder);
    assert_eq!(
        free,
        Some("coder-2"),
        "coder-2 should be the first free coder"
    );
}
#[test]
fn find_free_agent_ignores_completed_agents() {
    // Consistency: use the shared make_config helper for inline TOML configs.
    let config = make_config(
        r#"
[[agent]]
name = "coder-1"
"#,
    );
    let pool = AgentPool::new_test(3001);
    // coder-1 completed its previous story — it's free for a new one
    pool.inject_test_agent("s1", "coder-1", AgentStatus::Completed);
    let agents = pool.agents.lock().unwrap();
    let free = find_free_agent_for_stage(&config, &agents, &PipelineStage::Coder);
    assert_eq!(free, Some("coder-1"), "completed coder-1 should be free");
}
#[test]
fn find_free_agent_returns_none_for_wrong_stage() {
    // Consistency: use the shared make_config helper for inline TOML configs.
    let config = make_config(
        r#"
[[agent]]
name = "qa"
"#,
    );
    let agents: HashMap<String, StoryAgent> = HashMap::new();
    // Looking for a Coder but only QA is configured
    let free = find_free_agent_for_stage(&config, &agents, &PipelineStage::Coder);
    assert!(free.is_none());
    // Looking for QA should find it
    let free_qa = find_free_agent_for_stage(&config, &agents, &PipelineStage::Qa);
    assert_eq!(free_qa, Some("qa"));
}
#[test]
fn find_free_agent_uses_config_stage_field_not_name() {
    // Agents named "qa-2" and "coder-opus" don't match the legacy name heuristic
    // but should be picked up via their explicit stage field.
    // Consistency: use the shared make_config helper for inline TOML configs.
    let config = make_config(
        r#"
[[agent]]
name = "qa-2"
stage = "qa"
[[agent]]
name = "coder-opus"
stage = "coder"
"#,
    );
    let agents: HashMap<String, StoryAgent> = HashMap::new();
    // qa-2 should be found for PipelineStage::Qa via config stage field
    let free_qa = find_free_agent_for_stage(&config, &agents, &PipelineStage::Qa);
    assert_eq!(free_qa, Some("qa-2"), "qa-2 with stage=qa should be found");
    // coder-opus should be found for PipelineStage::Coder via config stage field
    let free_coder = find_free_agent_for_stage(&config, &agents, &PipelineStage::Coder);
    assert_eq!(
        free_coder,
        Some("coder-opus"),
        "coder-opus with stage=coder should be found"
    );
    // Neither should match the other stage
    let free_merge = find_free_agent_for_stage(&config, &agents, &PipelineStage::Mergemaster);
    assert!(free_merge.is_none());
}
// ── find_active_story_stage tests ─────────────────────────────────────────
#[test]
fn find_active_story_stage_detects_current() {
    use std::fs;
    // A story file in 2_current/ must be reported as that stage.
    let tmp = tempfile::tempdir().unwrap();
    let root = tmp.path();
    let current = root.join(".story_kit/work/2_current");
    fs::create_dir_all(&current).unwrap();
    fs::write(current.join("10_story_test.md"), "test").unwrap();
    let stage = find_active_story_stage(root, "10_story_test");
    assert_eq!(stage, Some("2_current"));
}
#[test]
fn find_active_story_stage_detects_qa() {
    use std::fs;
    // A story file in 3_qa/ must be reported as that stage.
    let tmp = tempfile::tempdir().unwrap();
    let root = tmp.path();
    let qa = root.join(".story_kit/work/3_qa");
    fs::create_dir_all(&qa).unwrap();
    fs::write(qa.join("11_story_test.md"), "test").unwrap();
    let stage = find_active_story_stage(root, "11_story_test");
    assert_eq!(stage, Some("3_qa"));
}
#[test]
fn find_active_story_stage_detects_merge() {
    use std::fs;
    // A story file in 4_merge/ must be reported as that stage.
    let tmp = tempfile::tempdir().unwrap();
    let root = tmp.path();
    let merge = root.join(".story_kit/work/4_merge");
    fs::create_dir_all(&merge).unwrap();
    fs::write(merge.join("12_story_test.md"), "test").unwrap();
    let stage = find_active_story_stage(root, "12_story_test");
    assert_eq!(stage, Some("4_merge"));
}
#[test]
fn find_active_story_stage_returns_none_for_unknown_story() {
    // No stage directories exist at all → no active stage.
    let tmp = tempfile::tempdir().unwrap();
    let stage = find_active_story_stage(tmp.path(), "99_nonexistent");
    assert!(stage.is_none());
}
// ── check_orphaned_agents return value tests (bug 161) ──────────────────
#[tokio::test]
async fn check_orphaned_agents_returns_count_of_orphaned_agents() {
    // Two Running agents whose backing tasks already finished → both orphaned.
    let pool = AgentPool::new_test(3001);
    // Spawn two tasks that finish immediately.
    let h1 = tokio::spawn(async {});
    let h2 = tokio::spawn(async {});
    // Brief sleep so the runtime can retire both tasks before injection.
    tokio::time::sleep(std::time::Duration::from_millis(20)).await;
    assert!(h1.is_finished());
    assert!(h2.is_finished());
    pool.inject_test_agent_with_handle("story_a", "coder", AgentStatus::Running, h1);
    pool.inject_test_agent_with_handle("story_b", "coder", AgentStatus::Running, h2);
    let found = check_orphaned_agents(&pool.agents);
    assert_eq!(found, 2, "should detect both orphaned agents");
}
#[test]
fn check_orphaned_agents_returns_zero_when_no_orphans() {
    // Inject agents in terminal states — not orphaned.
    let pool = AgentPool::new_test(3001);
    pool.inject_test_agent("story_a", "coder", AgentStatus::Completed);
    pool.inject_test_agent("story_b", "qa", AgentStatus::Failed);
    assert_eq!(
        check_orphaned_agents(&pool.agents),
        0,
        "no orphans should be detected for terminal agents"
    );
}
#[tokio::test]
async fn watchdog_detects_orphaned_running_agent() {
    // A Running agent whose task handle is already finished must be flipped
    // to Failed by one watchdog pass, with an Error event on its channel.
    let pool = AgentPool::new_test(3001);
    let handle = tokio::spawn(async {});
    tokio::time::sleep(std::time::Duration::from_millis(20)).await;
    assert!(
        handle.is_finished(),
        "task should be finished before injection"
    );
    let tx = pool.inject_test_agent_with_handle(
        "orphan_story",
        "coder",
        AgentStatus::Running,
        handle,
    );
    // Subscribe before running the watchdog so the Error event is captured.
    let mut rx = tx.subscribe();
    pool.run_watchdog_once();
    {
        let agents = pool.agents.lock().unwrap();
        let key = composite_key("orphan_story", "coder");
        let agent = agents.get(&key).unwrap();
        assert_eq!(
            agent.status,
            AgentStatus::Failed,
            "watchdog must mark an orphaned Running agent as Failed"
        );
    }
    let event = rx.try_recv().expect("watchdog must emit an Error event");
    assert!(
        matches!(event, AgentEvent::Error { .. }),
        "expected AgentEvent::Error, got: {event:?}"
    );
}
#[tokio::test]
async fn watchdog_orphan_detection_returns_nonzero_enabling_auto_assign() {
    // This test verifies the contract that `check_orphaned_agents` returns
    // a non-zero count when orphans exist, which the watchdog uses to
    // decide whether to trigger auto-assign (bug 161).
    let pool = AgentPool::new_test(3001);
    let finished_task = tokio::spawn(async {});
    tokio::time::sleep(std::time::Duration::from_millis(20)).await;
    pool.inject_test_agent_with_handle("orphan_story", "coder", AgentStatus::Running, finished_task);
    let key = composite_key("orphan_story", "coder");
    // Sanity: the injected agent starts out Running.
    {
        let agents = pool.agents.lock().unwrap();
        assert_eq!(agents.get(&key).unwrap().status, AgentStatus::Running);
    }
    // A single detection pass must report exactly one orphan.
    let found = check_orphaned_agents(&pool.agents);
    assert_eq!(
        found, 1,
        "watchdog must return 1 for a single orphaned agent"
    );
    // The pass must also have flipped the agent to Failed.
    {
        let agents = pool.agents.lock().unwrap();
        assert_eq!(
            agents.get(&key).unwrap().status,
            AgentStatus::Failed,
            "orphaned agent must be marked Failed"
        );
    }
}
// ── remove_agents_for_story tests ────────────────────────────────────────
#[test]
fn remove_agents_for_story_removes_all_entries() {
    let pool = AgentPool::new_test(3001);
    // Two agents on story_a (any status) plus one on story_b.
    pool.inject_test_agent("story_a", "coder-1", AgentStatus::Completed);
    pool.inject_test_agent("story_a", "qa", AgentStatus::Failed);
    pool.inject_test_agent("story_b", "coder-1", AgentStatus::Running);
    assert_eq!(
        pool.remove_agents_for_story("story_a"),
        2,
        "should remove both agents for story_a"
    );
    let remaining = pool.list_agents().unwrap();
    assert_eq!(remaining.len(), 1, "only story_b agent should remain");
    assert_eq!(remaining[0].story_id, "story_b");
}
#[test]
fn remove_agents_for_story_returns_zero_when_no_match() {
    let pool = AgentPool::new_test(3001);
    pool.inject_test_agent("story_a", "coder-1", AgentStatus::Running);
    // Removing an unknown story is a no-op reporting zero removals.
    assert_eq!(pool.remove_agents_for_story("nonexistent"), 0);
    let remaining = pool.list_agents().unwrap();
    assert_eq!(remaining.len(), 1, "existing agents should not be affected");
}
// ── archive + cleanup integration test ───────────────────────────────────
#[tokio::test]
async fn archiving_story_removes_agent_entries_from_pool() {
    use std::fs;
    let tmp = tempfile::tempdir().unwrap();
    let root = tmp.path();
    // Story file lives in 2_current/ before archival.
    let current_dir = root.join(".story_kit/work/2_current");
    fs::create_dir_all(&current_dir).unwrap();
    fs::write(current_dir.join("60_story_cleanup.md"), "test").unwrap();
    let pool = AgentPool::new_test(3001);
    pool.inject_test_agent("60_story_cleanup", "coder-1", AgentStatus::Completed);
    pool.inject_test_agent("60_story_cleanup", "qa", AgentStatus::Completed);
    pool.inject_test_agent("61_story_other", "coder-1", AgentStatus::Running);
    // All three injected agents are visible.
    assert_eq!(pool.list_agents().unwrap().len(), 3);
    // Archive the story, then purge its agents from the pool.
    move_story_to_archived(root, "60_story_cleanup").unwrap();
    pool.remove_agents_for_story("60_story_cleanup");
    let remaining = pool.list_agents().unwrap();
    assert_eq!(
        remaining.len(),
        1,
        "only the other story's agent should remain"
    );
    assert_eq!(remaining[0].story_id, "61_story_other");
    // The story file itself must now live in 5_done/.
    assert!(
        root.join(".story_kit/work/5_done/60_story_cleanup.md")
            .exists()
    );
}
// ── kill_all_children tests ────────────────────────────────────
/// Returns true if a process with the given PID is currently running.
///
/// Shells out to `ps -p <pid>`; a zero exit status means the PID exists.
/// Any failure to run `ps` is treated as "not running".
fn process_is_running(pid: u32) -> bool {
    let status = std::process::Command::new("ps")
        .arg("-p")
        .arg(pid.to_string())
        .stdout(std::process::Stdio::null())
        .stderr(std::process::Stdio::null())
        .status();
    matches!(status, Ok(s) if s.success())
}
#[test]
fn kill_all_children_is_safe_on_empty_pool() {
    // Calling with nothing registered must neither panic nor deadlock,
    // and the (empty) registry stays empty.
    let pool = AgentPool::new_test(3001);
    pool.kill_all_children();
    assert_eq!(pool.child_killer_count(), 0);
}
#[test]
fn kill_all_children_kills_real_process() {
    // GIVEN: a real PTY child process (sleep 100) with its killer registered.
    let pool = AgentPool::new_test(3001);
    let pty = native_pty_system()
        .openpty(PtySize {
            rows: 24,
            cols: 80,
            pixel_width: 0,
            pixel_height: 0,
        })
        .expect("failed to open pty");
    let mut sleep_cmd = CommandBuilder::new("sleep");
    sleep_cmd.arg("100");
    let mut child = pty
        .slave
        .spawn_command(sleep_cmd)
        .expect("failed to spawn sleep");
    let pid = child.process_id().expect("no pid");
    pool.inject_child_killer("story:agent", child.clone_killer());
    // Sanity: the child is alive before we kill it.
    assert!(
        process_is_running(pid),
        "process {pid} should be running before kill_all_children"
    );
    // WHEN: kill_all_children() is called.
    pool.kill_all_children();
    // Reap the exit status (prevents zombie; also ensures signal was sent).
    let _ = child.wait();
    // THEN: the process should be dead.
    assert!(
        !process_is_running(pid),
        "process {pid} should have been killed by kill_all_children"
    );
}
#[test]
fn kill_all_children_clears_registry() {
    // GIVEN: a pool with one registered killer.
    let pool = AgentPool::new_test(3001);
    let pty = native_pty_system()
        .openpty(PtySize {
            rows: 24,
            cols: 80,
            pixel_width: 0,
            pixel_height: 0,
        })
        .expect("failed to open pty");
    let mut sleep_cmd = CommandBuilder::new("sleep");
    sleep_cmd.arg("1");
    let mut child = pty
        .slave
        .spawn_command(sleep_cmd)
        .expect("failed to spawn sleep");
    pool.inject_child_killer("story:agent", child.clone_killer());
    assert_eq!(pool.child_killer_count(), 1);
    // WHEN: kill_all_children() is called.
    pool.kill_all_children();
    // Reap the child to avoid a zombie.
    let _ = child.wait();
    // THEN: the registry is empty.
    assert_eq!(
        pool.child_killer_count(),
        0,
        "child_killers should be cleared after kill_all_children"
    );
}
// ── available_agents_for_stage tests (story 190) ──────────────────────────
#[test]
fn available_agents_for_stage_returns_idle_agents() {
    let config = make_config(
        r#"
[[agent]]
name = "coder-1"
stage = "coder"
[[agent]]
name = "coder-2"
stage = "coder"
[[agent]]
name = "qa"
stage = "qa"
"#,
    );
    let pool = AgentPool::new_test(3001);
    // coder-1 is busy on story-1, so only coder-2 is free for the coder stage.
    pool.inject_test_agent("story-1", "coder-1", AgentStatus::Running);
    let free_coders = pool
        .available_agents_for_stage(&config, &PipelineStage::Coder)
        .unwrap();
    assert_eq!(free_coders, vec!["coder-2"]);
    // The qa agent is untouched and therefore available.
    let free_qa = pool
        .available_agents_for_stage(&config, &PipelineStage::Qa)
        .unwrap();
    assert_eq!(free_qa, vec!["qa"]);
}
#[test]
fn available_agents_for_stage_returns_empty_when_all_busy() {
    let config = make_config(
        r#"
[[agent]]
name = "coder-1"
stage = "coder"
"#,
    );
    let pool = AgentPool::new_test(3001);
    // The only configured coder is occupied → nothing is available.
    pool.inject_test_agent("story-1", "coder-1", AgentStatus::Running);
    let free = pool
        .available_agents_for_stage(&config, &PipelineStage::Coder)
        .unwrap();
    assert!(free.is_empty());
}
#[test]
fn available_agents_for_stage_ignores_completed_agents() {
    let config = make_config(
        r#"
[[agent]]
name = "coder-1"
stage = "coder"
"#,
    );
    let pool = AgentPool::new_test(3001);
    // A Completed run does not make the agent busy — it is free again.
    pool.inject_test_agent("story-1", "coder-1", AgentStatus::Completed);
    let free = pool
        .available_agents_for_stage(&config, &PipelineStage::Coder)
        .unwrap();
    assert_eq!(free, vec!["coder-1"]);
}
#[tokio::test]
async fn start_agent_auto_selects_second_coder_when_first_busy() {
    let tmp = tempfile::tempdir().unwrap();
    let sk = tmp.path().join(".story_kit");
    std::fs::create_dir_all(&sk).unwrap();
    std::fs::write(
        sk.join("project.toml"),
        r#"
[[agent]]
name = "supervisor"
stage = "other"
[[agent]]
name = "coder-1"
stage = "coder"
[[agent]]
name = "coder-2"
stage = "coder"
"#,
    )
    .unwrap();
    let pool = AgentPool::new_test(3001);
    // coder-1 is busy on another story
    pool.inject_test_agent("other-story", "coder-1", AgentStatus::Running);
    // Call start_agent without agent_name — should pick coder-2
    let outcome = pool
        .start_agent(tmp.path(), "42_my_story", None, None)
        .await;
    // Will fail for infrastructure reasons (no git repo), but should NOT
    // fail with "All coder agents are busy" — that would mean it didn't
    // try coder-2.
    match outcome {
        Ok(info) => assert_eq!(info.agent_name, "coder-2"),
        Err(err) => {
            assert!(
                !err.contains("All coder agents are busy"),
                "should have selected coder-2 but got: {err}"
            );
            assert!(
                !err.contains("No coder agent configured"),
                "should not fail on agent selection, got: {err}"
            );
        }
    }
}
#[tokio::test]
async fn start_agent_returns_busy_when_all_coders_occupied() {
    let tmp = tempfile::tempdir().unwrap();
    let sk = tmp.path().join(".story_kit");
    std::fs::create_dir_all(&sk).unwrap();
    std::fs::write(
        sk.join("project.toml"),
        r#"
[[agent]]
name = "coder-1"
stage = "coder"
[[agent]]
name = "coder-2"
stage = "coder"
"#,
    )
    .unwrap();
    let pool = AgentPool::new_test(3001);
    // Both coders are occupied: one Running, one still Pending.
    pool.inject_test_agent("story-1", "coder-1", AgentStatus::Running);
    pool.inject_test_agent("story-2", "coder-2", AgentStatus::Pending);
    let result = pool.start_agent(tmp.path(), "story-3", None, None).await;
    assert!(result.is_err());
    let err = result.unwrap_err();
    assert!(
        err.contains("All coder agents are busy"),
        "expected busy error, got: {err}"
    );
}
/// Story 203: when all coders are busy the story file must be moved from
/// 1_backlog/ to 2_current/ so that auto_assign_available_work can pick
/// it up once a coder finishes.
#[tokio::test]
async fn start_agent_moves_story_to_current_when_coders_busy() {
    let tmp = tempfile::tempdir().unwrap();
    let sk = tmp.path().join(".story_kit");
    let backlog = sk.join("work/1_backlog");
    std::fs::create_dir_all(&backlog).unwrap();
    std::fs::write(
        sk.join("project.toml"),
        r#"
[[agent]]
name = "coder-1"
stage = "coder"
"#,
    )
    .unwrap();
    // The story starts out in 1_backlog/.
    std::fs::write(backlog.join("story-3.md"), "---\nname: Story 3\n---\n").unwrap();
    let pool = AgentPool::new_test(3001);
    // The only coder is busy.
    pool.inject_test_agent("story-1", "coder-1", AgentStatus::Running);
    let result = pool.start_agent(tmp.path(), "story-3", None, None).await;
    assert!(result.is_err());
    let err = result.unwrap_err();
    assert!(
        err.contains("All coder agents are busy"),
        "expected busy error, got: {err}"
    );
    assert!(
        err.contains("queued in work/2_current/"),
        "expected story-to-current message, got: {err}"
    );
    // Despite the error, the file must have migrated to 2_current/ …
    assert!(
        sk.join("work/2_current/story-3.md").exists(),
        "story should be in 2_current/ after busy error, but was not"
    );
    // … and must no longer be in the backlog.
    assert!(
        !backlog.join("story-3.md").exists(),
        "story should no longer be in 1_backlog/"
    );
}
/// Story 203: auto_assign_available_work must detect a story in 2_current/
/// with no active agent and start an agent for it.
#[tokio::test]
async fn auto_assign_picks_up_story_queued_in_current() {
    let tmp = tempfile::tempdir().unwrap();
    let sk = tmp.path().join(".story_kit");
    let current = sk.join("work/2_current");
    std::fs::create_dir_all(&current).unwrap();
    std::fs::write(
        sk.join("project.toml"),
        "[[agent]]\nname = \"coder-1\"\nstage = \"coder\"\n",
    )
    .unwrap();
    // A queued story with no agent attached.
    std::fs::write(current.join("story-3.md"), "---\nname: Story 3\n---\n").unwrap();
    let pool = AgentPool::new_test(3001);
    // coder-1 is idle, so auto-assign should dispatch it. The background
    // start will eventually fail (no git repo in the tempdir) — that is
    // fine; the agent only has to be registered as Pending (or Running)
    // before that failure.
    pool.auto_assign_available_work(tmp.path()).await;
    let agents = pool.agents.lock().unwrap();
    let dispatched = agents.values().any(|a| {
        a.agent_name == "coder-1"
            && matches!(a.status, AgentStatus::Pending | AgentStatus::Running)
    });
    assert!(
        dispatched,
        "auto_assign should have started coder-1 for story-3, but pool is empty"
    );
}
/// Story 203: if a story is already in 2_current/ or later, start_agent
/// must not fail — the move is a no-op.
#[tokio::test]
async fn start_agent_story_already_in_current_is_noop() {
    let tmp = tempfile::tempdir().unwrap();
    let sk = tmp.path().join(".story_kit");
    let current = sk.join("work/2_current");
    std::fs::create_dir_all(&current).unwrap();
    std::fs::write(
        sk.join("project.toml"),
        "[[agent]]\nname = \"coder-1\"\nstage = \"coder\"\n",
    )
    .unwrap();
    // The story is already in 2_current/ — the move must be idempotent.
    std::fs::write(current.join("story-5.md"), "---\nname: Story 5\n---\n").unwrap();
    let pool = AgentPool::new_test(3001);
    // Infrastructure failures (no git repo) are acceptable here; a
    // "Failed to move" error is not.
    if let Err(e) = pool.start_agent(tmp.path(), "story-5", None, None).await {
        assert!(
            !e.contains("Failed to move"),
            "should not fail on idempotent move, got: {e}"
        );
    }
}
#[tokio::test]
async fn start_agent_explicit_name_unchanged_when_busy() {
    let tmp = tempfile::tempdir().unwrap();
    let sk = tmp.path().join(".story_kit");
    std::fs::create_dir_all(&sk).unwrap();
    std::fs::write(
        sk.join("project.toml"),
        r#"
[[agent]]
name = "coder-1"
stage = "coder"
[[agent]]
name = "coder-2"
stage = "coder"
"#,
    )
    .unwrap();
    let pool = AgentPool::new_test(3001);
    pool.inject_test_agent("story-1", "coder-1", AgentStatus::Running);
    // Explicitly requesting busy coder-1 must fail — no silent fallback to
    // the free coder-2.
    let result = pool
        .start_agent(tmp.path(), "story-2", Some("coder-1"), None)
        .await;
    assert!(result.is_err());
    let err = result.unwrap_err();
    assert!(
        err.contains("coder-1") && err.contains("already running"),
        "expected explicit busy error, got: {err}"
    );
}
// ── start_agent single-instance concurrency tests ─────────────────────────
/// Regression test for bug 97: the agent pool must reject a second concurrent
/// instance of the same agent name even if it would run on a different story.
#[tokio::test]
async fn start_agent_rejects_when_same_agent_already_running_on_another_story() {
    use std::fs;
    let tmp = tempfile::tempdir().unwrap();
    let root = tmp.path();
    // Minimal project.toml so ProjectConfig::load can find the "qa" agent.
    let sk_dir = root.join(".story_kit");
    fs::create_dir_all(&sk_dir).unwrap();
    fs::write(sk_dir.join("project.toml"), "[[agent]]\nname = \"qa\"\n").unwrap();
    let pool = AgentPool::new_test(3001);
    // qa is already occupied with story-a.
    pool.inject_test_agent("story-a", "qa", AgentStatus::Running);
    // Starting qa on story-b must therefore be refused.
    let result = pool.start_agent(root, "story-b", Some("qa"), None).await;
    assert!(
        result.is_err(),
        "start_agent should fail when qa is already running on another story"
    );
    let err = result.unwrap_err();
    assert!(
        err.contains("already running") || err.contains("becomes available"),
        "error message should explain why: got '{err}'"
    );
}
/// Verify that the concurrency guard does NOT block an agent that is merely
/// Completed (not Running/Pending) — completed agents are free for new work.
#[tokio::test]
async fn start_agent_allows_new_story_when_previous_run_is_completed() {
use std::fs;
let tmp = tempfile::tempdir().unwrap();
let root = tmp.path();
let sk_dir = root.join(".story_kit");
fs::create_dir_all(&sk_dir).unwrap();
fs::write(sk_dir.join("project.toml"), "[[agent]]\nname = \"qa\"\n").unwrap();
let pool = AgentPool::new_test(3001);
// Previous run completed — should NOT block a new story.
pool.inject_test_agent("story-a", "qa", AgentStatus::Completed);
// The call will fail eventually (no real worktree / Claude CLI), but it must
// NOT fail at the concurrency check. We detect the difference by inspecting
// the error message: a concurrency rejection says "already running", while a
// later failure (missing story file, missing claude binary, etc.) says something else.
let result = pool.start_agent(root, "story-b", Some("qa"), None).await;
if let Err(ref e) = result {
assert!(
!e.contains("already running") && !e.contains("becomes available"),
"completed agent must not trigger the concurrency guard: got '{e}'"
);
}
// result may be Ok (unlikely in test env) or Err for infra reasons — both fine.
}
// ── bug 118: pending entry cleanup on start_agent failure ────────────────
/// Regression test for bug 118: when worktree creation fails (e.g. because
/// there is no git repo), the Pending entry that was inserted into the agent
/// HashMap must not remain Pending — it must transition to Failed. This
/// prevents `find_free_agent_for_stage` / auto-assign from being permanently
/// blocked.
///
/// With story 157 the worktree creation moved into the background spawn, so
/// `start_agent` returns `Ok(Pending)` immediately. We use `wait_for_agent`
/// to block until the background task resolves.
#[tokio::test]
async fn start_agent_cleans_up_pending_entry_on_failure() {
    use std::fs;
    let tmp = tempfile::tempdir().unwrap();
    let root = tmp.path();
    // Minimal project.toml with a "qa" agent.
    let sk_dir = root.join(".story_kit");
    fs::create_dir_all(&sk_dir).unwrap();
    fs::write(sk_dir.join("project.toml"), "[[agent]]\nname = \"qa\"\n").unwrap();
    // Create the story in upcoming so `move_story_to_current` succeeds,
    // but do NOT init a git repo — `create_worktree` will fail in the spawn.
    let upcoming = root.join(".story_kit/work/1_backlog");
    fs::create_dir_all(&upcoming).unwrap();
    fs::write(upcoming.join("50_story_test.md"), "---\nname: Test\n---\n").unwrap();
    let pool = AgentPool::new_test(3099);
    let result = pool
        .start_agent(root, "50_story_test", Some("qa"), None)
        .await;
    // With the non-blocking flow, start_agent returns Ok(Pending) immediately.
    // Worktree creation failure happens asynchronously in the background.
    assert!(
        result.is_ok(),
        "start_agent should return Ok(Pending) immediately: {:?}",
        result.err()
    );
    assert_eq!(
        result.unwrap().status,
        AgentStatus::Pending,
        "initial status must be Pending"
    );
    // Wait (up to 5 s) for the background task to reach a terminal state.
    // It must fail (no git repo → create_worktree returns an error).
    let final_info = pool
        .wait_for_agent("50_story_test", "qa", 5000)
        .await
        .expect("wait_for_agent should not time out");
    assert_eq!(
        final_info.status,
        AgentStatus::Failed,
        "agent must transition to Failed after worktree creation error"
    );
    // The pool must retain a Failed entry (not disappear silently).
    let agents = pool.agents.lock().unwrap();
    let failed_entry = agents
        .values()
        .find(|a| a.agent_name == "qa" && a.status == AgentStatus::Failed);
    assert!(
        failed_entry.is_some(),
        "agent pool must retain a Failed entry so the UI can show the error state"
    );
    // Explicitly release the agents lock before querying the pool again.
    drop(agents);
    // The AgentEvent::Error must be persisted in the event_log so late
    // subscribers / polling clients can see the failure reason.
    let events = pool
        .drain_events("50_story_test", "qa")
        .expect("drain_events should succeed");
    let has_error_event = events.iter().any(|e| matches!(e, AgentEvent::Error { .. }));
    assert!(
        has_error_event,
        "event_log must contain AgentEvent::Error after worktree creation fails"
    );
}
/// Verify that a successful start_agent keeps the Running entry (guard is
/// disarmed). We cannot truly spawn an agent in tests, but we verify that
/// the concurrency check still blocks a second concurrent start — which
/// proves the first entry survived the guard.
#[tokio::test]
async fn start_agent_guard_does_not_remove_running_entry() {
    use std::fs;
    let tmp = tempfile::tempdir().unwrap();
    let root = tmp.path();
    let sk_dir = root.join(".story_kit");
    fs::create_dir_all(&sk_dir).unwrap();
    fs::write(sk_dir.join("project.toml"), "[[agent]]\nname = \"qa\"\n").unwrap();
    let pool = AgentPool::new_test(3099);
    // Simulate a successful start by injecting a Running qa entry.
    pool.inject_test_agent("story-x", "qa", AgentStatus::Running);
    // A second start for the same agent must hit the concurrency guard,
    // proving the Running entry is still in the map.
    let result = pool.start_agent(root, "story-y", Some("qa"), None).await;
    assert!(result.is_err());
    let err = result.unwrap_err();
    assert!(
        err.contains("already running") || err.contains("becomes available"),
        "running entry must survive: got '{err}'"
    );
}
// ── TOCTOU race-condition regression tests (story 132) ───────────────────
/// Verify that a Pending entry (not just Running) blocks a concurrent
/// start_agent for the same agent name on a different story. This proves
/// the check-and-insert is atomic: the Pending entry is visible to the
/// second caller because it was inserted while the lock was still held.
#[tokio::test]
async fn toctou_pending_entry_blocks_same_agent_on_different_story() {
    use std::fs;
    let tmp = tempfile::tempdir().unwrap();
    let root = tmp.path();
    let sk_dir = root.join(".story_kit");
    fs::create_dir_all(&sk_dir).unwrap();
    fs::write(
        sk_dir.join("project.toml"),
        "[[agent]]\nname = \"coder-1\"\n",
    )
    .unwrap();
    let pool = AgentPool::new_test(3099);
    // Replicate what the winning concurrent call would have done: insert a
    // Pending entry for coder-1 on story-86.
    pool.inject_test_agent("86_story_foo", "coder-1", AgentStatus::Pending);
    // coder-1 on a *different* story must now be refused.
    let result = pool
        .start_agent(root, "130_story_bar", Some("coder-1"), None)
        .await;
    assert!(result.is_err(), "second start_agent must be rejected");
    let err = result.unwrap_err();
    assert!(
        err.contains("already running") || err.contains("becomes available"),
        "expected concurrency-rejection message, got: '{err}'"
    );
}
/// Concurrent start_agent calls for the same agent name on different stories
/// must result in exactly one rejection due to the concurrency check (not
/// due to an unrelated failure such as missing git repo).
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn toctou_concurrent_start_agent_same_agent_exactly_one_concurrency_rejection() {
    use std::fs;
    use std::sync::Arc;
    let tmp = tempfile::tempdir().unwrap();
    let root = tmp.path().to_path_buf();
    let sk_dir = root.join(".story_kit");
    fs::create_dir_all(sk_dir.join("work/1_backlog")).unwrap();
    // Single coder agent — the contested resource.
    fs::write(
        root.join(".story_kit/project.toml"),
        "[[agent]]\nname = \"coder-1\"\n",
    )
    .unwrap();
    // Both stories must exist in upcoming so move_story_to_current can run
    // (only the winner reaches that point, but we set both up defensively).
    fs::write(
        root.join(".story_kit/work/1_backlog/86_story_foo.md"),
        "---\nname: Foo\n---\n",
    )
    .unwrap();
    fs::write(
        root.join(".story_kit/work/1_backlog/130_story_bar.md"),
        "---\nname: Bar\n---\n",
    )
    .unwrap();
    let pool = Arc::new(AgentPool::new_test(3099));
    // Race the two calls on separate tasks (multi_thread flavor above
    // allows them to run truly in parallel).
    let pool1 = pool.clone();
    let root1 = root.clone();
    let t1 = tokio::spawn(async move {
        pool1
            .start_agent(&root1, "86_story_foo", Some("coder-1"), None)
            .await
    });
    let pool2 = pool.clone();
    let root2 = root.clone();
    let t2 = tokio::spawn(async move {
        pool2
            .start_agent(&root2, "130_story_bar", Some("coder-1"), None)
            .await
    });
    let (r1, r2) = tokio::join!(t1, t2);
    let r1 = r1.unwrap();
    let r2 = r2.unwrap();
    // The concurrency-rejection message always contains "already running" /
    // "becomes available". Any other error (e.g., missing git repo) means
    // that call *won* the atomic check-and-insert.
    let concurrency_rejections = [&r1, &r2]
        .iter()
        .filter(|r| {
            r.as_ref().is_err_and(|e| {
                e.contains("already running") || e.contains("becomes available")
            })
        })
        .count();
    assert_eq!(
        concurrency_rejections, 1,
        "exactly one call must be rejected by the concurrency check; \
        got r1={r1:?} r2={r2:?}"
    );
}
// ── story-230: prevent duplicate stage agents on same story ───────────────
/// start_agent must reject a second coder on a story that already has a
/// Running coder, even if they are *different* agent names.
#[tokio::test]
async fn start_agent_rejects_second_coder_stage_on_same_story() {
    use std::fs;
    let tmp = tempfile::tempdir().unwrap();
    let root = tmp.path();
    let sk_dir = root.join(".story_kit");
    fs::create_dir_all(&sk_dir).unwrap();
    fs::write(
        sk_dir.join("project.toml"),
        "[[agent]]\nname = \"coder-1\"\n\n[[agent]]\nname = \"coder-2\"\n",
    )
    .unwrap();
    let pool = AgentPool::new_test(3099);
    // coder-1 already occupies the coder stage on this story.
    pool.inject_test_agent("42_story_foo", "coder-1", AgentStatus::Running);
    // A different coder on the *same* story must be refused.
    let result = pool
        .start_agent(root, "42_story_foo", Some("coder-2"), None)
        .await;
    assert!(
        result.is_err(),
        "second coder on same story must be rejected"
    );
    let err = result.unwrap_err();
    assert!(
        err.contains("same pipeline stage"),
        "error must mention same pipeline stage, got: '{err}'"
    );
    assert!(
        err.contains("coder-1") && err.contains("coder-2"),
        "error must name both agents, got: '{err}'"
    );
}
/// The stage-conflict check must also cover QA: a second QA agent on the
/// same story must be rejected.
#[tokio::test]
async fn start_agent_rejects_second_qa_stage_on_same_story() {
    use std::fs;
    let tmp = tempfile::tempdir().unwrap();
    let root = tmp.path();
    let sk_dir = root.join(".story_kit");
    fs::create_dir_all(&sk_dir).unwrap();
    // Two qa agents using the explicit stage field so name-based detection
    // doesn't interfere.
    fs::write(
        sk_dir.join("project.toml"),
        "[[agent]]\nname = \"qa-1\"\nstage = \"qa\"\n\n\
        [[agent]]\nname = \"qa-2\"\nstage = \"qa\"\n",
    )
    .unwrap();
    let pool = AgentPool::new_test(3099);
    // qa-1 already holds the qa stage on this story.
    pool.inject_test_agent("55_story_bar", "qa-1", AgentStatus::Running);
    let result = pool
        .start_agent(root, "55_story_bar", Some("qa-2"), None)
        .await;
    assert!(result.is_err(), "second qa on same story must be rejected");
    let err = result.unwrap_err();
    assert!(
        err.contains("same pipeline stage"),
        "error must mention same pipeline stage, got: '{err}'"
    );
}
/// Regression test (story 230): concurrent start_agent calls with two
/// different coder names on the same story — exactly one must succeed
/// (or fail for infrastructure reasons), and exactly one must be rejected
/// with a stage-conflict error.
///
/// The story is pre-placed in `2_current/` so that both concurrent
/// `move_story_to_current` calls are no-ops, guaranteeing both reach the
/// lock where the stage-conflict check fires.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn start_agent_concurrent_two_coders_same_story_exactly_one_stage_rejection() {
    use std::fs;
    use std::sync::Arc;
    let tmp = tempfile::tempdir().unwrap();
    let root = tmp.path().to_path_buf();
    let sk_dir = root.join(".story_kit");
    // Place story directly in 2_current/ so move_story_to_current is a
    // no-op for both concurrent callers, letting both reach the lock.
    fs::create_dir_all(sk_dir.join("work/2_current")).unwrap();
    // Config declares both coder names so either is resolvable.
    fs::write(
        root.join(".story_kit/project.toml"),
        "[[agent]]\nname = \"coder-1\"\n\n[[agent]]\nname = \"coder-2\"\n",
    )
    .unwrap();
    fs::write(
        root.join(".story_kit/work/2_current/42_story_foo.md"),
        "---\nname: Foo\n---\n",
    )
    .unwrap();
    let pool = Arc::new(AgentPool::new_test(3099));
    // Race the two calls, each naming a different coder, on the same story.
    let pool1 = pool.clone();
    let root1 = root.clone();
    let t1 = tokio::spawn(async move {
        pool1
            .start_agent(&root1, "42_story_foo", Some("coder-1"), None)
            .await
    });
    let pool2 = pool.clone();
    let root2 = root.clone();
    let t2 = tokio::spawn(async move {
        pool2
            .start_agent(&root2, "42_story_foo", Some("coder-2"), None)
            .await
    });
    let (r1, r2) = tokio::join!(t1, t2);
    let r1 = r1.unwrap();
    let r2 = r2.unwrap();
    // Exactly one call must be rejected with a stage-conflict error.
    let stage_rejections = [&r1, &r2]
        .iter()
        .filter(|r| r.as_ref().is_err_and(|e| e.contains("same pipeline stage")))
        .count();
    assert_eq!(
        stage_rejections, 1,
        "exactly one call must be rejected by the stage-conflict check; \
        got r1={r1:?} r2={r2:?}"
    );
}
/// Regression test (story 230): two coders on *different* stories must
/// not trigger the stage-conflict guard — the guard is per-story.
#[tokio::test]
async fn start_agent_two_coders_different_stories_not_blocked_by_stage_check() {
use std::fs;
let tmp = tempfile::tempdir().unwrap();
let root = tmp.path();
let sk_dir = root.join(".story_kit");
fs::create_dir_all(sk_dir.join("work/1_backlog")).unwrap();
fs::write(
root.join(".story_kit/project.toml"),
"[[agent]]\nname = \"coder-1\"\n\n[[agent]]\nname = \"coder-2\"\n",
)
.unwrap();
fs::write(
root.join(".story_kit/work/1_backlog/99_story_baz.md"),
"---\nname: Baz\n---\n",
)
.unwrap();
let pool = AgentPool::new_test(3099);
// coder-1 is running on a *different* story.
pool.inject_test_agent("42_story_foo", "coder-1", AgentStatus::Running);
// Starting coder-2 on story-99 must NOT be rejected by the stage
// guard (it may fail for infrastructure reasons like missing git repo,
// but not because of the stage-conflict check).
let result = pool
.start_agent(root, "99_story_baz", Some("coder-2"), None)
.await;
if let Err(ref e) = result {
assert!(
!e.contains("same pipeline stage"),
"stage-conflict guard must not fire for agents on different stories; \
got: '{e}'"
);
}
// result may be Ok (unlikely in test env) or Err for infra reasons — both fine.
}
/// Two concurrent auto_assign_available_work calls must not assign the same
/// agent to two stories simultaneously. After both complete, at most one
/// Pending/Running entry must exist per agent name.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn toctou_concurrent_auto_assign_no_duplicate_agent_assignments() {
    use std::fs;
    use std::sync::Arc;
    let tmp = tempfile::tempdir().unwrap();
    let root = tmp.path().to_path_buf();
    let sk_dir = root.join(".story_kit");
    // Two stories waiting in 2_current, one coder agent.
    fs::create_dir_all(sk_dir.join("work/2_current")).unwrap();
    fs::write(
        sk_dir.join("project.toml"),
        "[[agent]]\nname = \"coder-1\"\n",
    )
    .unwrap();
    fs::write(
        sk_dir.join("work/2_current/86_story_foo.md"),
        "---\nname: Foo\n---\n",
    )
    .unwrap();
    fs::write(
        sk_dir.join("work/2_current/130_story_bar.md"),
        "---\nname: Bar\n---\n",
    )
    .unwrap();
    let pool = Arc::new(AgentPool::new_test(3099));
    // Run two concurrent auto_assign calls.
    let pool1 = pool.clone();
    let root1 = root.clone();
    let t1 = tokio::spawn(async move { pool1.auto_assign_available_work(&root1).await });
    let pool2 = pool.clone();
    let root2 = root.clone();
    let t2 = tokio::spawn(async move { pool2.auto_assign_available_work(&root2).await });
    // The calls' return values are irrelevant — only pool state matters.
    let _ = tokio::join!(t1, t2);
    // At most one Pending/Running entry should exist for coder-1.
    let agents = pool.agents.lock().unwrap();
    let active_coder_count = agents
        .values()
        .filter(|a| {
            a.agent_name == "coder-1"
                && matches!(a.status, AgentStatus::Pending | AgentStatus::Running)
        })
        .count();
    assert!(
        active_coder_count <= 1,
        "coder-1 must not be assigned to more than one story simultaneously; \
        found {active_coder_count} active entries"
    );
}
// ── merge_agent_work tests ────────────────────────────────────────────────
/// Helper: start a merge and poll until terminal state.
///
/// Kicks off `start_merge_agent_work` and then polls the job status every
/// 50 ms, returning the `MergeJob` once it is no longer `Running`.
async fn run_merge_to_completion(
    pool: &Arc<AgentPool>,
    repo: &std::path::Path,
    story_id: &str,
) -> MergeJob {
    pool.start_merge_agent_work(repo, story_id).unwrap();
    loop {
        tokio::time::sleep(std::time::Duration::from_millis(50)).await;
        match pool.get_merge_status(story_id) {
            Some(job) if !matches!(job.status, MergeJobStatus::Running) => return job,
            _ => {}
        }
    }
}
#[tokio::test]
async fn merge_agent_work_returns_error_when_branch_not_found() {
    use tempfile::tempdir;
    let tmp = tempdir().unwrap();
    let repo = tmp.path();
    init_git_repo(repo);
    let pool = Arc::new(AgentPool::new_test(3001));
    // No feature branch exists for this story id.
    let job = run_merge_to_completion(&pool, repo, "99_nonexistent").await;
    match &job.status {
        MergeJobStatus::Completed(report) => {
            assert!(!report.success, "should fail when branch missing");
        }
        // A pipeline-level error is equally acceptable.
        MergeJobStatus::Failed(_) => {}
        MergeJobStatus::Running => {
            panic!("should not still be running");
        }
    }
}
#[tokio::test]
async fn merge_agent_work_succeeds_on_clean_branch() {
    use std::fs;
    use tempfile::tempdir;
    // Local helper: run one git subcommand in the repo, discarding output.
    fn git(repo: &std::path::Path, args: &[&str]) {
        Command::new("git")
            .args(args)
            .current_dir(repo)
            .output()
            .unwrap();
    }
    let tmp = tempdir().unwrap();
    let repo = tmp.path();
    init_git_repo(repo);
    // Feature branch carrying one commit.
    git(repo, &["checkout", "-b", "feature/story-23_test"]);
    fs::write(repo.join("feature.txt"), "feature content").unwrap();
    git(repo, &["add", "."]);
    git(repo, &["commit", "-m", "add feature"]);
    // Switch back to master (initial branch).
    git(repo, &["checkout", "master"]);
    // Story file in 4_merge/ so archival can be exercised.
    let merge_dir = repo.join(".story_kit/work/4_merge");
    fs::create_dir_all(&merge_dir).unwrap();
    fs::write(merge_dir.join("23_test.md"), "---\nname: Test\n---\n").unwrap();
    git(repo, &["add", "."]);
    git(repo, &["commit", "-m", "add story in merge"]);
    let pool = Arc::new(AgentPool::new_test(3001));
    let job = run_merge_to_completion(&pool, repo, "23_test").await;
    match &job.status {
        MergeJobStatus::Completed(report) => {
            assert!(!report.had_conflicts, "should have no conflicts");
            assert!(
                report.success
                    || report.gate_output.contains("Failed to run")
                    || !report.gates_passed,
                "report should be coherent: {report:?}"
            );
            if report.story_archived {
                let done = repo.join(".story_kit/work/5_done/23_test.md");
                assert!(done.exists(), "done file should exist");
            }
        }
        MergeJobStatus::Failed(e) => {
            // Gate failures are acceptable in test env
            assert!(
                e.contains("Failed") || e.contains("failed"),
                "unexpected failure: {e}"
            );
        }
        MergeJobStatus::Running => panic!("should not still be running"),
    }
}
// ── quality gate ordering test ────────────────────────────────
/// Regression test for bug 142: quality gates must run BEFORE the fast-forward
/// to master so that broken code never lands on master.
///
/// Setup: a repo with a failing `script/test`, a feature branch with one commit.
/// When `run_squash_merge` is called, the gates must detect failure and abort the
/// fast-forward, leaving master HEAD unchanged.
#[cfg(unix)]
#[test]
fn quality_gates_run_before_fast_forward_to_master() {
    use std::fs;
    use std::os::unix::fs::PermissionsExt;
    use tempfile::tempdir;
    let tmp = tempdir().unwrap();
    let repo = tmp.path();
    init_git_repo(repo);
    // Run one git setup command in the repo; exit status intentionally ignored.
    let git = |args: &[&str]| {
        Command::new("git")
            .args(args)
            .current_dir(repo)
            .output()
            .unwrap();
    };
    // Resolve the current HEAD commit hash.
    let head = || {
        let out = Command::new("git")
            .args(["rev-parse", "HEAD"])
            .current_dir(repo)
            .output()
            .unwrap();
        String::from_utf8(out.stdout).unwrap().trim().to_string()
    };
    // Executable script/test that always exits 1, so quality gates must fail.
    let script_dir = repo.join("script");
    fs::create_dir_all(&script_dir).unwrap();
    let script_test = script_dir.join("test");
    fs::write(&script_test, "#!/usr/bin/env bash\nexit 1\n").unwrap();
    let mut perms = fs::metadata(&script_test).unwrap().permissions();
    perms.set_mode(0o755);
    fs::set_permissions(&script_test, perms).unwrap();
    git(&["add", "."]);
    git(&["commit", "-m", "add failing script/test"]);
    // Feature branch carrying one commit.
    git(&["checkout", "-b", "feature/story-142_test"]);
    fs::write(repo.join("change.txt"), "feature change").unwrap();
    git(&["add", "."]);
    git(&["commit", "-m", "feature work"]);
    // Back on master; remember its HEAD before the merge attempt.
    git(&["checkout", "master"]);
    let head_before = head();
    // Run the squash-merge. The failing script/test makes quality gates
    // fail → fast-forward must NOT happen.
    let result =
        crate::agents::merge::run_squash_merge(repo, "feature/story-142_test", "142_test")
            .unwrap();
    let head_after = head();
    // Gates must have failed (script/test exits 1) so master stays untouched.
    assert!(
        !result.success,
        "run_squash_merge must report failure when gates fail"
    );
    assert_eq!(
        head_before, head_after,
        "master HEAD must not advance when quality gates fail (bug 142)"
    );
}
#[tokio::test]
async fn merge_agent_work_conflict_does_not_break_master() {
    use std::fs;
    use tempfile::tempdir;
    let tmp = tempdir().unwrap();
    let repo = tmp.path();
    init_git_repo(repo);
    // Run one git setup command in the repo; exit status intentionally ignored.
    let git = |args: &[&str]| {
        Command::new("git")
            .args(args)
            .current_dir(repo)
            .output()
            .unwrap();
    };
    // Base file committed on master.
    fs::write(
        repo.join("code.rs"),
        "fn main() {\n println!(\"hello\");\n}\n",
    )
    .unwrap();
    git(&["add", "."]);
    git(&["commit", "-m", "initial code"]);
    // Feature branch: modify the same line one way…
    git(&["checkout", "-b", "feature/story-42_story_foo"]);
    fs::write(
        repo.join("code.rs"),
        "fn main() {\n println!(\"hello\");\n feature_fn();\n}\n",
    )
    .unwrap();
    git(&["add", "."]);
    git(&["commit", "-m", "feature: add fn call"]);
    // …master: modify the same location a different way → guaranteed conflict.
    git(&["checkout", "master"]);
    fs::write(
        repo.join("code.rs"),
        "fn main() {\n println!(\"hello\");\n master_fn();\n}\n",
    )
    .unwrap();
    git(&["add", "."]);
    git(&["commit", "-m", "master: add fn call"]);
    // Story file in 4_merge/.
    let merge_dir = repo.join(".story_kit/work/4_merge");
    fs::create_dir_all(&merge_dir).unwrap();
    fs::write(merge_dir.join("42_story_foo.md"), "---\nname: Test\n---\n").unwrap();
    git(&["add", "."]);
    git(&["commit", "-m", "add story"]);
    let pool = Arc::new(AgentPool::new_test(3001));
    let job = run_merge_to_completion(&pool, repo, "42_story_foo").await;
    // Master must NEVER contain conflict markers, whatever the merge outcome.
    let master_code = fs::read_to_string(repo.join("code.rs")).unwrap();
    for marker in ["<<<<<<<", ">>>>>>>"] {
        assert!(
            !master_code.contains(marker),
            "master must never contain conflict markers:\n{master_code}"
        );
    }
    // The report should accurately reflect what happened.
    match &job.status {
        MergeJobStatus::Running => panic!("should not still be running"),
        // Acceptable — merge aborted due to conflicts.
        MergeJobStatus::Failed(_) => {}
        MergeJobStatus::Completed(report) => {
            assert!(report.had_conflicts, "should report conflicts");
        }
    }
}
// ── reconcile_on_startup tests ────────────────────────────────────────────
#[tokio::test]
async fn reconcile_on_startup_noop_when_no_worktrees() {
    let dir = tempfile::tempdir().unwrap();
    let pool = AgentPool::new_test(3001);
    let (events_tx, _keep_alive) = broadcast::channel(16);
    // With no worktrees present, reconciliation must be a no-op and not panic.
    pool.reconcile_on_startup(dir.path(), &events_tx).await;
}
/// `reconcile_on_startup` must always broadcast a terminal "done"
/// `ReconciliationEvent`, even when there is nothing to reconcile.
#[tokio::test]
async fn reconcile_on_startup_emits_done_event() {
    let tmp = tempfile::tempdir().unwrap();
    let pool = AgentPool::new_test(3001);
    let (tx, mut rx) = broadcast::channel::<ReconciliationEvent>(16);
    pool.reconcile_on_startup(tmp.path(), &tx).await;
    // Drain everything already buffered on the channel. The assertion is
    // deliberately order-agnostic: we only require that a "done" event was
    // emitted somewhere in the stream, not that it arrived last.
    // (A previous comment claimed "the last must be 'done'", which the code
    // never checked — the comment was wrong, not the assertion.)
    let mut events: Vec<ReconciliationEvent> = Vec::new();
    while let Ok(evt) = rx.try_recv() {
        events.push(evt);
    }
    assert!(
        events.iter().any(|e| e.status == "done"),
        "reconcile_on_startup must emit a 'done' event; got: {:?}",
        events.iter().map(|e| &e.status).collect::<Vec<_>>()
    );
}
#[tokio::test]
async fn reconcile_on_startup_skips_story_without_committed_work() {
    use std::fs;
    let tmp = tempfile::tempdir().unwrap();
    let root = tmp.path();
    // A story sitting in 2_current/.
    let current_dir = root.join(".story_kit/work/2_current");
    fs::create_dir_all(&current_dir).unwrap();
    fs::write(current_dir.join("60_story_test.md"), "test").unwrap();
    // A "worktree" that is just a fresh git repo with no commits ahead of its
    // own base branch — i.e. a worktree where no work was ever done.
    let worktree = root.join(".story_kit/worktrees/60_story_test");
    fs::create_dir_all(&worktree).unwrap();
    init_git_repo(&worktree);
    let pool = AgentPool::new_test(3001);
    let (tx, _rx) = broadcast::channel(16);
    pool.reconcile_on_startup(root, &tx).await;
    // Nothing was reconciled, so the story must remain in 2_current/.
    assert!(
        current_dir.join("60_story_test.md").exists(),
        "story should stay in 2_current/ when worktree has no committed work"
    );
}
#[tokio::test]
async fn reconcile_on_startup_runs_gates_on_worktree_with_committed_work() {
    use std::fs;
    let tmp = tempfile::tempdir().unwrap();
    let root = tmp.path();
    // Project root is itself a git repo.
    init_git_repo(root);
    // Helper: stage everything in `dir`.
    let add_all = |dir: &std::path::Path| {
        Command::new("git")
            .args(["add", "."])
            .current_dir(dir)
            .output()
            .unwrap();
    };
    // Helper: commit in `dir` with a fixed throwaway identity.
    let commit = |dir: &std::path::Path, msg: &str| {
        Command::new("git")
            .args([
                "-c",
                "user.email=test@test.com",
                "-c",
                "user.name=Test",
                "commit",
                "-m",
                msg,
            ])
            .current_dir(dir)
            .output()
            .unwrap();
    };
    // Story in 2_current/, committed so the project root is clean.
    let current = root.join(".story_kit/work/2_current");
    fs::create_dir_all(&current).unwrap();
    fs::write(current.join("61_story_test.md"), "test").unwrap();
    add_all(root);
    commit(root, "add story");
    // A real git worktree for the story.
    let wt_dir = root.join(".story_kit/worktrees/61_story_test");
    fs::create_dir_all(wt_dir.parent().unwrap()).unwrap();
    Command::new("git")
        .args([
            "worktree",
            "add",
            &wt_dir.to_string_lossy(),
            "-b",
            "feature/story-61_story_test",
        ])
        .current_dir(root)
        .output()
        .unwrap();
    // One commit on the feature branch (simulates the coder finishing work).
    fs::write(wt_dir.join("implementation.txt"), "done").unwrap();
    add_all(&wt_dir);
    commit(&wt_dir, "implement story");
    assert!(
        crate::agents::gates::worktree_has_committed_work(&wt_dir),
        "test setup: worktree should have committed work"
    );
    let pool = AgentPool::new_test(3001);
    let (tx, _rx) = broadcast::channel(16);
    pool.reconcile_on_startup(root, &tx).await;
    // In this environment cargo clippy fails (no Cargo.toml) so gates fail and
    // the story stays in 2_current/. The key point: reconcile ran without
    // panicking and left the story in a consistent location.
    let in_current = current.join("61_story_test.md").exists();
    let in_qa = root.join(".story_kit/work/3_qa/61_story_test.md").exists();
    assert!(
        in_current || in_qa,
        "story should be in 2_current/ or 3_qa/ after reconciliation"
    );
}
#[test]
fn has_review_hold_returns_true_when_set() {
    let tmp = tempfile::tempdir().unwrap();
    let qa_dir = tmp.path().join(".story_kit/work/3_qa");
    std::fs::create_dir_all(&qa_dir).unwrap();
    // Front matter explicitly sets review_hold.
    let body = "---\nname: Research spike\nreview_hold: true\n---\n# Spike\n";
    std::fs::write(qa_dir.join("10_spike_research.md"), body).unwrap();
    assert!(has_review_hold(tmp.path(), "3_qa", "10_spike_research"));
}
#[test]
fn has_review_hold_returns_false_when_not_set() {
    let tmp = tempfile::tempdir().unwrap();
    let qa_dir = tmp.path().join(".story_kit/work/3_qa");
    std::fs::create_dir_all(&qa_dir).unwrap();
    // Front matter without any review_hold key.
    let body = "---\nname: Research spike\n---\n# Spike\n";
    std::fs::write(qa_dir.join("10_spike_research.md"), body).unwrap();
    assert!(!has_review_hold(tmp.path(), "3_qa", "10_spike_research"));
}
#[test]
fn has_review_hold_returns_false_when_file_missing() {
    // No work directory exists at all — must simply report false, not panic.
    let tmp = tempfile::tempdir().unwrap();
    assert!(!has_review_hold(tmp.path(), "3_qa", "99_spike_missing"));
}
/// Story 265: auto_assign_available_work must skip spikes in 3_qa/ that
/// have review_hold: true set in their front matter.
#[tokio::test]
async fn auto_assign_skips_spikes_with_review_hold() {
    let tmp = tempfile::tempdir().unwrap();
    let root = tmp.path();
    // project.toml declaring a single QA agent.
    // NOTE(review): this test uses the `[[agents]]`/`role` table format while
    // sibling tests use `[[agent]]`/`stage` — presumably both are accepted;
    // worth confirming against the config parser.
    let sk = root.join(".story_kit");
    std::fs::create_dir_all(&sk).unwrap();
    std::fs::write(
        sk.join("project.toml"),
        "[[agents]]\nname = \"qa\"\nrole = \"qa\"\nmodel = \"test\"\nprompt = \"test\"\n",
    )
    .unwrap();
    // A spike in 3_qa/ that is on review hold.
    let qa_dir = root.join(".story_kit/work/3_qa");
    std::fs::create_dir_all(&qa_dir).unwrap();
    std::fs::write(
        qa_dir.join("20_spike_test.md"),
        "---\nname: Test Spike\nreview_hold: true\n---\n# Spike\n",
    )
    .unwrap();
    let (watcher_tx, _) = broadcast::channel::<WatcherEvent>(4);
    let pool = AgentPool::new(3001, watcher_tx);
    pool.auto_assign_available_work(root).await;
    // The held spike must not have triggered any agent assignment.
    assert!(
        pool.agents.lock().unwrap().is_empty(),
        "No agents should be assigned to a spike with review_hold"
    );
}
// ── Story 279: auto-assign respects agent stage from front matter ──────────
/// When a story in 3_qa/ has `agent: coder-1` in its front matter but
/// coder-1 is a coder-stage agent, auto-assign must NOT assign coder-1.
/// Instead it should fall back to a free QA-stage agent.
#[tokio::test]
async fn auto_assign_ignores_coder_preference_when_story_is_in_qa_stage() {
    let tmp = tempfile::tempdir().unwrap();
    let sk = tmp.path().join(".story_kit");
    let qa_dir = sk.join("work/3_qa");
    std::fs::create_dir_all(&qa_dir).unwrap();
    // One coder-stage agent and one qa-stage agent.
    std::fs::write(
        sk.join("project.toml"),
        "[[agent]]\nname = \"coder-1\"\nstage = \"coder\"\n\n\
         [[agent]]\nname = \"qa-1\"\nstage = \"qa\"\n",
    )
    .unwrap();
    // Story in 3_qa/ that prefers the (wrong-stage) coder agent.
    std::fs::write(
        qa_dir.join("story-qa1.md"),
        "---\nname: QA Story\nagent: coder-1\n---\n",
    )
    .unwrap();
    let pool = AgentPool::new_test(3001);
    pool.auto_assign_available_work(tmp.path()).await;
    let agents = pool.agents.lock().unwrap();
    // Helper: is the named agent currently assigned (pending or running)?
    let active = |name: &str| {
        agents.values().any(|a| {
            a.agent_name == name
                && matches!(a.status, AgentStatus::Pending | AgentStatus::Running)
        })
    };
    assert!(
        !active("coder-1"),
        "coder-1 should not be assigned to a QA-stage story"
    );
    assert!(
        active("qa-1"),
        "qa-1 should be assigned as fallback for the QA-stage story"
    );
}
/// When a story in 2_current/ has `agent: coder-1` in its front matter and
/// coder-1 is a coder-stage agent, auto-assign must respect the preference
/// and assign coder-1 (not fall back to some other coder).
#[tokio::test]
async fn auto_assign_respects_coder_preference_when_story_is_in_current_stage() {
    let tmp = tempfile::tempdir().unwrap();
    let sk = tmp.path().join(".story_kit");
    let current_dir = sk.join("work/2_current");
    std::fs::create_dir_all(&current_dir).unwrap();
    // Two interchangeable coder-stage agents.
    std::fs::write(
        sk.join("project.toml"),
        "[[agent]]\nname = \"coder-1\"\nstage = \"coder\"\n\n\
         [[agent]]\nname = \"coder-2\"\nstage = \"coder\"\n",
    )
    .unwrap();
    // Story in 2_current/ explicitly preferring coder-1.
    std::fs::write(
        current_dir.join("story-pref.md"),
        "---\nname: Coder Story\nagent: coder-1\n---\n",
    )
    .unwrap();
    let pool = AgentPool::new_test(3001);
    pool.auto_assign_available_work(tmp.path()).await;
    let agents = pool.agents.lock().unwrap();
    // Helper: is the named agent currently assigned (pending or running)?
    let active = |name: &str| {
        agents.values().any(|a| {
            a.agent_name == name
                && matches!(a.status, AgentStatus::Pending | AgentStatus::Running)
        })
    };
    assert!(
        active("coder-1"),
        "coder-1 should be assigned when it matches the stage and is preferred"
    );
    assert!(
        !active("coder-2"),
        "coder-2 should not be assigned when coder-1 is explicitly preferred"
    );
}
/// When the preferred agent's stage mismatches and no other agent of the
/// correct stage is available, auto-assign must not start any agent for that
/// story (no panic, no error).
#[tokio::test]
async fn auto_assign_stage_mismatch_with_no_fallback_starts_no_agent() {
    let tmp = tempfile::tempdir().unwrap();
    let sk = tmp.path().join(".story_kit");
    let qa_dir = sk.join("work/3_qa");
    std::fs::create_dir_all(&qa_dir).unwrap();
    // Only a coder agent exists in the config — there is no QA agent at all.
    std::fs::write(
        sk.join("project.toml"),
        "[[agent]]\nname = \"coder-1\"\nstage = \"coder\"\n",
    )
    .unwrap();
    // The QA-stage story requests coder-1, whose stage does not match.
    std::fs::write(
        qa_dir.join("story-noqa.md"),
        "---\nname: QA Story No Agent\nagent: coder-1\n---\n",
    )
    .unwrap();
    let pool = AgentPool::new_test(3001);
    // Must complete without panicking…
    pool.auto_assign_available_work(tmp.path()).await;
    // …and without assigning anyone.
    assert!(
        pool.agents.lock().unwrap().is_empty(),
        "No agent should be started when no stage-appropriate agent is available"
    );
}
/// Bug 295: when a coder completes and QA is busy on another story,
/// the newly QA-queued story must be picked up when `run_pipeline_advance`
/// finishes for the busy QA agent's story (because auto_assign is now
/// called unconditionally at the end of pipeline advance).
#[tokio::test]
async fn pipeline_advance_picks_up_waiting_qa_stories_after_completion() {
    use std::fs;
    let tmp = tempfile::tempdir().unwrap();
    let root = tmp.path();
    let sk = root.join(".story_kit");
    let qa_dir = sk.join("work/3_qa");
    fs::create_dir_all(&qa_dir).unwrap();
    // Configure a single QA agent.
    fs::write(
        sk.join("project.toml"),
        "[[agent]]\nname = \"qa\"\nstage = \"qa\"\n",
    )
    .unwrap();
    // Story 292 is in QA with the QA agent running (it will "complete" via
    // run_pipeline_advance below). Story 293 is also in QA with NO agent —
    // simulating the "stuck" state from bug 295.
    fs::write(
        qa_dir.join("292_story_first.md"),
        "---\nname: First\nqa: human\n---\n",
    )
    .unwrap();
    fs::write(
        qa_dir.join("293_story_second.md"),
        "---\nname: Second\nqa: human\n---\n",
    )
    .unwrap();
    let pool = AgentPool::new_test(3001);
    // QA is currently running on story 292.
    pool.inject_test_agent("292_story_first", "qa", AgentStatus::Running);
    // Sanity check: 293 cannot get a QA agent right now (QA is busy).
    {
        let agents = pool.agents.lock().unwrap();
        assert!(
            !is_agent_free(&agents, "qa"),
            "qa should be busy on story 292"
        );
    }
    // Simulate QA completing on story 292: remove the agent from the pool
    // (as run_server_owned_completion does) then run pipeline advance.
    {
        let mut agents = pool.agents.lock().unwrap();
        agents.remove(&composite_key("292_story_first", "qa"));
    }
    // Pipeline advance for QA with gates_passed=true will:
    // 1. Run coverage gate (will "pass" trivially in test — no script/test_coverage)
    // 2. Set review_hold on 292 (qa: human)
    // 3. Call auto_assign_available_work (the fix from bug 295)
    // 4. auto_assign should find 293 in 3_qa/ with no agent and start qa on it
    pool.run_pipeline_advance(
        "292_story_first",
        "qa",
        CompletionReport {
            summary: "QA done".to_string(),
            gates_passed: true,
            gate_output: String::new(),
        },
        Some(root.to_path_buf()),
        None,
        false,
    )
    .await;
    // auto_assign must have started QA on story 293 SPECIFICALLY. The old
    // assertion only checked that *some* qa agent was pending/running, which
    // would also pass if qa were wrongly re-assigned to 292 — the exact
    // failure mode this test exists to catch.
    let agents = pool.agents.lock().unwrap();
    let qa_on_293 = agents
        .get(&composite_key("293_story_second", "qa"))
        .is_some_and(|a| matches!(a.status, AgentStatus::Pending | AgentStatus::Running));
    assert!(
        qa_on_293,
        "auto_assign should have started qa for story 293 after 292's QA completed, \
         but no qa agent is pending/running on 293. Pool: {:?}",
        agents
            .iter()
            .map(|(k, a)| format!("{k}: {} ({})", a.agent_name, a.status))
            .collect::<Vec<_>>()
    );
}
}