From b7df5cbe4e73050967df4550798ea7a99ee7bf31 Mon Sep 17 00:00:00 2001 From: Timmy Date: Fri, 15 May 2026 10:46:02 +0100 Subject: [PATCH] fix(agents): kill-then-status reorder in stop_agent MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit stop_agent had the same order-of-operations bug fixed in the watchdog: status flipped to Failed before the claude process was verified gone, opening the idempotency window that allowed a duplicate spawn to race in alongside the surviving process. Now follows the three-step protocol: 1. Read worktree path under a read-only lock (no mutation). 2. SIGKILL the worktree's process tree via process_kill and block until verified gone — start_agent's Running/Pending whitelist continues to reject duplicate spawns throughout. 3. Only then mutate the agent record, abort the task handle, and drop the child_killers entry. Falls back to the old portable_pty SIGHUP path (with a warning) when no worktree was recorded, matching the watchdog's behaviour. Co-Authored-By: Claude Opus 4.7 (1M context) --- server/src/agents/pool/stop.rs | 71 +++++++++++++++++++++++++++++----- 1 file changed, 62 insertions(+), 9 deletions(-) diff --git a/server/src/agents/pool/stop.rs b/server/src/agents/pool/stop.rs index ce7a76b6..2ad1dfe1 100644 --- a/server/src/agents/pool/stop.rs +++ b/server/src/agents/pool/stop.rs @@ -1,6 +1,8 @@ //! Agent stop — terminates a running agent while preserving its worktree. +use crate::process_kill::{pids_matching, sigkill_pids_and_verify}; use crate::slog; use crate::slog_error; +use crate::slog_warn; use std::path::Path; use super::super::{AgentEvent, AgentStatus}; @@ -9,6 +11,22 @@ use super::types::composite_key; impl AgentPool { /// Stop a running agent. Worktree is preserved for inspection. + /// + /// **Order of operations matters here.** The naive implementation set + /// `status = Failed` before killing the process, which opened the same + /// idempotency window that produced the 2026-05-15 watchdog + /// double-spawn: the `start_agent` check whitelists Running/Pending, + /// so flipping status away from Running while the underlying claude + /// process was still alive let a fresh spawn race in alongside the + /// surviving one. The fix is: + /// + /// 1. Read the worktree path (so we can find every process running + /// in it) without mutating the agent record yet. + /// 2. SIGKILL the process tree via [`crate::process_kill`] and BLOCK + /// until verified gone. While this is in progress, status stays + /// Running and `start_agent` continues to reject duplicate spawns. + /// 3. Now that the process is gone, mutate the agent record (status, + /// handle abort, removal). pub async fn stop_agent( &self, _project_root: &Path, @@ -17,27 +35,62 @@ impl AgentPool { ) -> Result<(), String> { let key = composite_key(story_id, agent_name); - let (worktree_info, task_handle, tx) = { + // Step 1: snapshot the worktree path (no status mutation yet). + let worktree_info = { + let agents = self.agents.lock().map_err(|e| e.to_string())?; + let agent = agents + .get(&key) + .ok_or_else(|| format!("No agent '{agent_name}' for story '{story_id}'"))?; + agent.worktree_info.clone() + }; + + // Step 2: SIGKILL every process running in the worktree, verify gone. + // We do this BEFORE updating the agent record so the idempotency check + // in `start_agent` keeps rejecting duplicate spawns until the slot is + // legitimately free. Replaces the prior `kill_child_for_key` path, + // which sent SIGHUP via portable_pty (ignored by claude-code). + if let Some(wt) = worktree_info.as_ref() { + let pids = pids_matching(&wt.path.display().to_string()); + if !pids.is_empty() { + match sigkill_pids_and_verify(&pids) { + Ok(n) => slog!( + "[stop_agent] SIGKILL'd {n} process(es) in worktree {} for '{key}'.", + wt.path.display() + ), + Err(survivors) => slog_warn!( + "[stop_agent] SIGKILL incomplete for '{key}': pids still alive: {survivors:?}. \ + Proceeding with record cleanup anyway; concurrent spawn protection may be weakened." + ), + } + } + } else { + slog_warn!( + "[stop_agent] No worktree path recorded for '{key}'; cannot tree-kill, \ + falling back to portable_pty SIGHUP (likely no-op for claude-code)." + ); + self.kill_child_for_key(&key); + } + + // Step 3: now safe to mutate. Status flip, handle abort, drop the + // child_killers entry. + let (task_handle, tx) = { let mut agents = self.agents.lock().map_err(|e| e.to_string())?; let agent = agents .get_mut(&key) .ok_or_else(|| format!("No agent '{agent_name}' for story '{story_id}'"))?; - let wt = agent.worktree_info.clone(); let handle = agent.task_handle.take(); let tx = agent.tx.clone(); agent.status = AgentStatus::Failed; - (wt, handle, tx) + (handle, tx) }; - - // Abort the task and kill the PTY child process. - // Note: aborting a spawn_blocking task handle does not interrupt the blocking - // thread, so we must also kill the child process directly via the killer registry. if let Some(handle) = task_handle { handle.abort(); let _ = handle.await; } - self.kill_child_for_key(&key); + if let Ok(mut killers) = self.child_killers.lock() { + killers.remove(&key); + } // Preserve worktree for inspection — don't destroy agent's work on stop. if let Some(ref wt) = worktree_info { @@ -53,7 +106,7 @@ impl AgentPool { status: "stopped".to_string(), }); - // Remove from map + // Remove from map. { let mut agents = self.agents.lock().map_err(|e| e.to_string())?; agents.remove(&key);