//! Agent stop — terminates a running agent while preserving its worktree. use crate::process_kill::{pids_matching, sigkill_pids_and_verify}; use crate::slog; use crate::slog_error; use crate::slog_warn; use std::path::Path; use super::super::{ AgentEvent, AgentStatus, PipelineStage, agent_config_stage, canonical_pipeline_stage, pipeline_stage, }; use super::AgentPool; use super::types::composite_key; impl AgentPool { /// Stop a running agent. Worktree is preserved for inspection. /// /// **Order of operations matters here.** The naive implementation set /// `status = Failed` before killing the process, which opened the same /// idempotency window that produced the 2026-05-15 watchdog /// double-spawn: the `start_agent` check whitelists Running/Pending, /// so flipping status away from Running while the underlying claude /// process was still alive let a fresh spawn race in alongside the /// surviving one. The fix is: /// /// 1. Read the worktree path (so we can find every process running /// in it) without mutating the agent record yet. /// 2. SIGKILL the process tree via [`crate::process_kill`] and BLOCK /// until verified gone. While this is in progress, status stays /// Running and `start_agent` continues to reject duplicate spawns. /// 3. Now that the process is gone, mutate the agent record (status, /// handle abort, removal). pub async fn stop_agent( &self, _project_root: &Path, story_id: &str, agent_name: &str, ) -> Result<(), String> { let key = composite_key(story_id, agent_name); // Step 1: snapshot the worktree path (no status mutation yet). let worktree_info = { let agents = self.agents.lock().map_err(|e| e.to_string())?; let agent = agents .get(&key) .ok_or_else(|| format!("No agent '{agent_name}' for story '{story_id}'"))?; agent.worktree_info.clone() }; // Step 2: SIGKILL every process running in the worktree, verify gone. // We do this BEFORE updating the agent record so the idempotency check // in `start_agent` keeps rejecting duplicate spawns until the slot is // legitimately free. Replaces the prior `kill_child_for_key` path, // which sent SIGHUP via portable_pty (ignored by claude-code). if let Some(wt) = worktree_info.as_ref() { let pids = pids_matching(&wt.path.display().to_string()); if !pids.is_empty() { match sigkill_pids_and_verify(&pids) { Ok(n) => slog!( "[stop_agent] SIGKILL'd {n} process(es) in worktree {} for '{key}'.", wt.path.display() ), Err(survivors) => slog_warn!( "[stop_agent] SIGKILL incomplete for '{key}': pids still alive: {survivors:?}. \ Proceeding with record cleanup anyway; concurrent spawn protection may be weakened." ), } } } else { slog_warn!( "[stop_agent] No worktree path recorded for '{key}'; cannot tree-kill, \ falling back to portable_pty SIGHUP (likely no-op for claude-code)." ); self.kill_child_for_key(&key); } // Step 3: now safe to mutate. Status flip and handle abort. let (task_handle, tx) = { let mut agents = self.agents.lock().map_err(|e| e.to_string())?; let agent = agents .get_mut(&key) .ok_or_else(|| format!("No agent '{agent_name}' for story '{story_id}'"))?; let handle = agent.task_handle.take(); let tx = agent.tx.clone(); agent.status = AgentStatus::Failed; (handle, tx) }; if let Some(handle) = task_handle { handle.abort(); let _ = handle.await; } // Preserve worktree for inspection — don't destroy agent's work on stop. if let Some(ref wt) = worktree_info { slog!( "[agents] Worktree preserved for {story_id}:{agent_name}: {}", wt.path.display() ); } let _ = tx.send(AgentEvent::Status { story_id: story_id.to_string(), agent_name: agent_name.to_string(), status: "stopped".to_string(), }); // Remove from map. { let mut agents = self.agents.lock().map_err(|e| e.to_string())?; agents.remove(&key); } // Notify WebSocket clients so pipeline board and agent panel update. Self::notify_agent_state_changed(&self.watcher_tx); Ok(()) } /// Stop LLM agents whose pipeline stage no longer matches the story's canonical stage. /// /// Called periodically by the tick loop (story 1100). For each Running or Pending /// LLM agent (Coder, Qa, or Mergemaster) whose stage does not match the canonical /// stage derived from the story's current CRDT state, the agent is stopped via the /// existing SIGKILL path. Idempotent: agents already at the correct stage are left /// untouched. Also stops LLM agents on stories that have no active pipeline stage /// (terminal, blocked, or frozen), since no LLM agent should run there. pub async fn reconcile_canonical_agents(&self, root: &std::path::Path) { use crate::config::ProjectConfig; let config = match ProjectConfig::load(root) { Ok(c) => c, Err(e) => { slog_warn!("[reconcile] Cannot load config for canonical reconcile: {e}"); return; } }; // Snapshot active LLM agents without holding the lock during async stops. let snapshot: Vec<(String, String, PipelineStage)> = { let Ok(agents) = self.agents.lock() else { return; }; agents .iter() .filter_map(|(key, a)| { if !matches!(a.status, AgentStatus::Running | AgentStatus::Pending) { return None; } let stage = config .find_agent(&a.agent_name) .map(agent_config_stage) .unwrap_or_else(|| pipeline_stage(&a.agent_name)); if !matches!( stage, PipelineStage::Coder | PipelineStage::Qa | PipelineStage::Mergemaster ) { return None; } let story_id = key .rsplit_once(':') .map(|(s, _)| s) .unwrap_or(key) .to_string(); Some((story_id, a.agent_name.clone(), stage)) }) .collect() }; for (story_id, agent_name, agent_stage) in snapshot { let canonical = crate::pipeline_state::read_typed(&story_id) .ok() .flatten() .and_then(|item| canonical_pipeline_stage(&item.stage)); let should_stop = match &canonical { None => true, Some(c) if *c != agent_stage => true, _ => false, }; if !should_stop { continue; } slog!( "[reconcile] stopping '{agent_name}' on '{story_id}': \ canonical={canonical:?} actual={agent_stage:?}" ); if let Err(e) = self.stop_agent(root, &story_id, &agent_name).await { slog_warn!("[reconcile] failed to stop '{agent_name}' on '{story_id}': {e}"); } } } /// Remove all agent entries for a given story_id from the pool. /// /// Called when a story is archived so that stale entries don't accumulate. /// Returns the number of entries removed. pub fn remove_agents_for_story(&self, story_id: &str) -> usize { let mut agents = match self.agents.lock() { Ok(a) => a, Err(e) => { slog_error!("[agents] Failed to lock pool for cleanup of '{story_id}': {e}"); return 0; } }; let prefix = format!("{story_id}:"); let keys_to_remove: Vec = agents .keys() .filter(|k| k.starts_with(&prefix)) .cloned() .collect(); let count = keys_to_remove.len(); for key in &keys_to_remove { agents.remove(key); } if count > 0 { slog!("[agents] Removed {count} agent entries for archived story '{story_id}'"); } count } } #[cfg(test)] mod tests { use super::super::AgentPool; use crate::agents::AgentStatus; // ── remove_agents_for_story tests ──────────────────────────────────────── #[test] fn remove_agents_for_story_removes_all_entries() { let pool = AgentPool::new_test(3001); pool.inject_test_agent("story_a", "coder-1", AgentStatus::Completed); pool.inject_test_agent("story_a", "qa", AgentStatus::Failed); pool.inject_test_agent("story_b", "coder-1", AgentStatus::Running); let removed = pool.remove_agents_for_story("story_a"); assert_eq!(removed, 2, "should remove both agents for story_a"); let agents = pool.list_agents().unwrap(); assert_eq!(agents.len(), 1, "only story_b agent should remain"); assert_eq!(agents[0].story_id, "story_b"); } #[test] fn remove_agents_for_story_returns_zero_when_no_match() { let pool = AgentPool::new_test(3001); pool.inject_test_agent("story_a", "coder-1", AgentStatus::Running); let removed = pool.remove_agents_for_story("nonexistent"); assert_eq!(removed, 0); let agents = pool.list_agents().unwrap(); assert_eq!(agents.len(), 1, "existing agents should not be affected"); } // ── archive + cleanup integration test ─────────────────────────────────── #[tokio::test] async fn archiving_story_removes_agent_entries_from_pool() { use crate::agents::lifecycle::move_story_to_done; use std::fs; let tmp = tempfile::tempdir().unwrap(); let root = tmp.path(); let current = root.join(".huskies/work/2_current"); fs::create_dir_all(¤t).unwrap(); let story_content = "test"; fs::write(current.join("60_story_cleanup.md"), story_content).unwrap(); crate::db::ensure_content_store(); crate::db::write_item_with_content( "60_story_cleanup", "2_current", story_content, crate::db::ItemMeta::named("Cleanup"), ); let pool = AgentPool::new_test(3001); pool.inject_test_agent("60_story_cleanup", "coder-1", AgentStatus::Completed); pool.inject_test_agent("60_story_cleanup", "qa", AgentStatus::Completed); pool.inject_test_agent("61_story_other", "coder-1", AgentStatus::Running); assert_eq!(pool.list_agents().unwrap().len(), 3); move_story_to_done("60_story_cleanup").unwrap(); pool.remove_agents_for_story("60_story_cleanup"); let remaining = pool.list_agents().unwrap(); assert_eq!( remaining.len(), 1, "only the other story's agent should remain" ); assert_eq!(remaining[0].story_id, "61_story_other"); // The lifecycle function updates the content store (not the filesystem), // so verify the move via the DB. let content = crate::db::read_content(crate::db::ContentKey::Story("60_story_cleanup")) .expect("60_story_cleanup should be in content store after move to done"); assert_eq!(content, "test", "content should be preserved after move"); } }