Files
huskies/server/src/agents/pool/stop.rs
T

306 lines
12 KiB
Rust

//! Agent stop — terminates a running agent while preserving its worktree.
use crate::process_kill::{pids_matching, sigkill_pids_and_verify};
use crate::slog;
use crate::slog_error;
use crate::slog_warn;
use std::path::Path;
use super::super::{
AgentEvent, AgentStatus, PipelineStage, agent_config_stage, canonical_pipeline_stage,
pipeline_stage,
};
use super::AgentPool;
use super::types::composite_key;
impl AgentPool {
/// Stop a running agent. Worktree is preserved for inspection.
///
/// **Order of operations matters here.** The naive implementation set
/// `status = Failed` before killing the process, which opened the same
/// idempotency window that produced the 2026-05-15 watchdog
/// double-spawn: the `start_agent` check whitelists Running/Pending,
/// so flipping status away from Running while the underlying claude
/// process was still alive let a fresh spawn race in alongside the
/// surviving one. The fix is:
///
/// 1. Read the worktree path (so we can find every process running
/// in it) without mutating the agent record yet.
/// 2. SIGKILL the process tree via [`crate::process_kill`] and BLOCK
/// until verified gone. While this is in progress, status stays
/// Running and `start_agent` continues to reject duplicate spawns.
/// 3. Now that the process is gone, mutate the agent record (status,
/// handle abort, removal).
pub async fn stop_agent(
&self,
_project_root: &Path,
story_id: &str,
agent_name: &str,
) -> Result<(), String> {
let key = composite_key(story_id, agent_name);
// Step 1: snapshot the worktree path (no status mutation yet).
let worktree_info = {
let agents = self.agents.lock().map_err(|e| e.to_string())?;
let agent = agents
.get(&key)
.ok_or_else(|| format!("No agent '{agent_name}' for story '{story_id}'"))?;
agent.worktree_info.clone()
};
// Step 2: SIGKILL every process running in the worktree, verify gone.
// We do this BEFORE updating the agent record so the idempotency check
// in `start_agent` keeps rejecting duplicate spawns until the slot is
// legitimately free. Replaces the prior `kill_child_for_key` path,
// which sent SIGHUP via portable_pty (ignored by claude-code).
if let Some(wt) = worktree_info.as_ref() {
let pids = pids_matching(&wt.path.display().to_string());
if !pids.is_empty() {
match sigkill_pids_and_verify(&pids) {
Ok(n) => slog!(
"[stop_agent] SIGKILL'd {n} process(es) in worktree {} for '{key}'.",
wt.path.display()
),
Err(survivors) => slog_warn!(
"[stop_agent] SIGKILL incomplete for '{key}': pids still alive: {survivors:?}. \
Proceeding with record cleanup anyway; concurrent spawn protection may be weakened."
),
}
}
} else {
slog_warn!(
"[stop_agent] No worktree path recorded for '{key}'; cannot tree-kill, \
falling back to portable_pty SIGHUP (likely no-op for claude-code)."
);
self.kill_child_for_key(&key);
}
// Step 3: now safe to mutate. Status flip and handle abort.
let (task_handle, tx) = {
let mut agents = self.agents.lock().map_err(|e| e.to_string())?;
let agent = agents
.get_mut(&key)
.ok_or_else(|| format!("No agent '{agent_name}' for story '{story_id}'"))?;
let handle = agent.task_handle.take();
let tx = agent.tx.clone();
agent.status = AgentStatus::Failed;
(handle, tx)
};
if let Some(handle) = task_handle {
handle.abort();
let _ = handle.await;
}
// Preserve worktree for inspection — don't destroy agent's work on stop.
if let Some(ref wt) = worktree_info {
slog!(
"[agents] Worktree preserved for {story_id}:{agent_name}: {}",
wt.path.display()
);
}
let _ = tx.send(AgentEvent::Status {
story_id: story_id.to_string(),
agent_name: agent_name.to_string(),
status: "stopped".to_string(),
});
// Remove from map.
{
let mut agents = self.agents.lock().map_err(|e| e.to_string())?;
agents.remove(&key);
}
// Notify WebSocket clients so pipeline board and agent panel update.
Self::notify_agent_state_changed(&self.watcher_tx);
Ok(())
}
/// Stop LLM agents whose pipeline stage no longer matches the story's canonical stage.
///
/// Called periodically by the tick loop (story 1100). For each Running or Pending
/// LLM agent (Coder, Qa, or Mergemaster) whose stage does not match the canonical
/// stage derived from the story's current CRDT state, the agent is stopped via the
/// existing SIGKILL path. Idempotent: agents already at the correct stage are left
/// untouched. Also stops LLM agents on stories that have no active pipeline stage
/// (terminal, blocked, or frozen), since no LLM agent should run there.
pub async fn reconcile_canonical_agents(&self, root: &std::path::Path) {
use crate::config::ProjectConfig;
let config = match ProjectConfig::load(root) {
Ok(c) => c,
Err(e) => {
slog_warn!("[reconcile] Cannot load config for canonical reconcile: {e}");
return;
}
};
// Snapshot active LLM agents without holding the lock during async stops.
let snapshot: Vec<(String, String, PipelineStage)> = {
let Ok(agents) = self.agents.lock() else {
return;
};
agents
.iter()
.filter_map(|(key, a)| {
if !matches!(a.status, AgentStatus::Running | AgentStatus::Pending) {
return None;
}
let stage = config
.find_agent(&a.agent_name)
.map(agent_config_stage)
.unwrap_or_else(|| pipeline_stage(&a.agent_name));
if !matches!(
stage,
PipelineStage::Coder | PipelineStage::Qa | PipelineStage::Mergemaster
) {
return None;
}
let story_id = key
.rsplit_once(':')
.map(|(s, _)| s)
.unwrap_or(key)
.to_string();
Some((story_id, a.agent_name.clone(), stage))
})
.collect()
};
for (story_id, agent_name, agent_stage) in snapshot {
let canonical = crate::pipeline_state::read_typed(&story_id)
.ok()
.flatten()
.and_then(|item| canonical_pipeline_stage(&item.stage));
let should_stop = match &canonical {
None => true,
Some(c) if *c != agent_stage => true,
_ => false,
};
if !should_stop {
continue;
}
slog!(
"[reconcile] stopping '{agent_name}' on '{story_id}': \
canonical={canonical:?} actual={agent_stage:?}"
);
if let Err(e) = self.stop_agent(root, &story_id, &agent_name).await {
slog_warn!("[reconcile] failed to stop '{agent_name}' on '{story_id}': {e}");
}
}
}
/// Remove all agent entries for a given story_id from the pool.
///
/// Called when a story is archived so that stale entries don't accumulate.
/// Returns the number of entries removed.
pub fn remove_agents_for_story(&self, story_id: &str) -> usize {
let mut agents = match self.agents.lock() {
Ok(a) => a,
Err(e) => {
slog_error!("[agents] Failed to lock pool for cleanup of '{story_id}': {e}");
return 0;
}
};
let prefix = format!("{story_id}:");
let keys_to_remove: Vec<String> = agents
.keys()
.filter(|k| k.starts_with(&prefix))
.cloned()
.collect();
let count = keys_to_remove.len();
for key in &keys_to_remove {
agents.remove(key);
}
if count > 0 {
slog!("[agents] Removed {count} agent entries for archived story '{story_id}'");
}
count
}
}
#[cfg(test)]
mod tests {
use super::super::AgentPool;
use crate::agents::AgentStatus;
// ── remove_agents_for_story tests ────────────────────────────────────────
#[test]
fn remove_agents_for_story_removes_all_entries() {
let pool = AgentPool::new_test(3001);
pool.inject_test_agent("story_a", "coder-1", AgentStatus::Completed);
pool.inject_test_agent("story_a", "qa", AgentStatus::Failed);
pool.inject_test_agent("story_b", "coder-1", AgentStatus::Running);
let removed = pool.remove_agents_for_story("story_a");
assert_eq!(removed, 2, "should remove both agents for story_a");
let agents = pool.list_agents().unwrap();
assert_eq!(agents.len(), 1, "only story_b agent should remain");
assert_eq!(agents[0].story_id, "story_b");
}
#[test]
fn remove_agents_for_story_returns_zero_when_no_match() {
let pool = AgentPool::new_test(3001);
pool.inject_test_agent("story_a", "coder-1", AgentStatus::Running);
let removed = pool.remove_agents_for_story("nonexistent");
assert_eq!(removed, 0);
let agents = pool.list_agents().unwrap();
assert_eq!(agents.len(), 1, "existing agents should not be affected");
}
// ── archive + cleanup integration test ───────────────────────────────────
#[tokio::test]
async fn archiving_story_removes_agent_entries_from_pool() {
use crate::agents::lifecycle::move_story_to_done;
use std::fs;
let tmp = tempfile::tempdir().unwrap();
let root = tmp.path();
let current = root.join(".huskies/work/2_current");
fs::create_dir_all(&current).unwrap();
let story_content = "test";
fs::write(current.join("60_story_cleanup.md"), story_content).unwrap();
crate::db::ensure_content_store();
crate::db::write_item_with_content(
"60_story_cleanup",
"2_current",
story_content,
crate::db::ItemMeta::named("Cleanup"),
);
let pool = AgentPool::new_test(3001);
pool.inject_test_agent("60_story_cleanup", "coder-1", AgentStatus::Completed);
pool.inject_test_agent("60_story_cleanup", "qa", AgentStatus::Completed);
pool.inject_test_agent("61_story_other", "coder-1", AgentStatus::Running);
assert_eq!(pool.list_agents().unwrap().len(), 3);
move_story_to_done("60_story_cleanup").unwrap();
pool.remove_agents_for_story("60_story_cleanup");
let remaining = pool.list_agents().unwrap();
assert_eq!(
remaining.len(),
1,
"only the other story's agent should remain"
);
assert_eq!(remaining[0].story_id, "61_story_other");
// The lifecycle function updates the content store (not the filesystem),
// so verify the move via the DB.
let content = crate::db::read_content(crate::db::ContentKey::Story("60_story_cleanup"))
.expect("60_story_cleanup should be in content store after move to done");
assert_eq!(content, "test", "content should be preserved after move");
}
}