fix(agents): kill-then-status reorder in stop_agent

stop_agent had the same order-of-operations bug fixed in the watchdog:
status flipped to Failed before the claude process was verified gone,
opening the idempotency window that allowed a duplicate spawn to race
in alongside the surviving process.

Now follows the three-step protocol:
1. Read worktree path under a read-only lock (no mutation).
2. SIGKILL the worktree's process tree via process_kill and block
   until verified gone — start_agent's Running/Pending whitelist
   continues to reject duplicate spawns throughout.
3. Only then mutate the agent record, abort the task handle, and
   drop the child_killers entry.

Falls back to the old portable_pty SIGHUP path (with a warning) when
no worktree was recorded, matching the watchdog's behaviour.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Timmy
2026-05-15 10:46:02 +01:00
parent fe9804b32c
commit b7df5cbe4e
+62 -9
View File
@@ -1,6 +1,8 @@
//! Agent stop — terminates a running agent while preserving its worktree. //! Agent stop — terminates a running agent while preserving its worktree.
use crate::process_kill::{pids_matching, sigkill_pids_and_verify};
use crate::slog; use crate::slog;
use crate::slog_error; use crate::slog_error;
use crate::slog_warn;
use std::path::Path; use std::path::Path;
use super::super::{AgentEvent, AgentStatus}; use super::super::{AgentEvent, AgentStatus};
@@ -9,6 +11,22 @@ use super::types::composite_key;
impl AgentPool { impl AgentPool {
/// Stop a running agent. Worktree is preserved for inspection. /// Stop a running agent. Worktree is preserved for inspection.
///
/// **Order of operations matters here.** The naive implementation set
/// `status = Failed` before killing the process, which opened the same
/// idempotency window that produced the 2026-05-15 watchdog
/// double-spawn: the `start_agent` check whitelists Running/Pending,
/// so flipping status away from Running while the underlying claude
/// process was still alive let a fresh spawn race in alongside the
/// surviving one. The fix is:
///
/// 1. Read the worktree path (so we can find every process running
/// in it) without mutating the agent record yet.
/// 2. SIGKILL the process tree via [`crate::process_kill`] and BLOCK
/// until verified gone. While this is in progress, status stays
/// Running and `start_agent` continues to reject duplicate spawns.
/// 3. Now that the process is gone, mutate the agent record (status,
/// handle abort, removal).
pub async fn stop_agent( pub async fn stop_agent(
&self, &self,
_project_root: &Path, _project_root: &Path,
@@ -17,27 +35,62 @@ impl AgentPool {
) -> Result<(), String> { ) -> Result<(), String> {
let key = composite_key(story_id, agent_name); let key = composite_key(story_id, agent_name);
let (worktree_info, task_handle, tx) = { // Step 1: snapshot the worktree path (no status mutation yet).
let worktree_info = {
let agents = self.agents.lock().map_err(|e| e.to_string())?;
let agent = agents
.get(&key)
.ok_or_else(|| format!("No agent '{agent_name}' for story '{story_id}'"))?;
agent.worktree_info.clone()
};
// Step 2: SIGKILL every process running in the worktree, verify gone.
// We do this BEFORE updating the agent record so the idempotency check
// in `start_agent` keeps rejecting duplicate spawns until the slot is
// legitimately free. Replaces the prior `kill_child_for_key` path,
// which sent SIGHUP via portable_pty (ignored by claude-code).
if let Some(wt) = worktree_info.as_ref() {
let pids = pids_matching(&wt.path.display().to_string());
if !pids.is_empty() {
match sigkill_pids_and_verify(&pids) {
Ok(n) => slog!(
"[stop_agent] SIGKILL'd {n} process(es) in worktree {} for '{key}'.",
wt.path.display()
),
Err(survivors) => slog_warn!(
"[stop_agent] SIGKILL incomplete for '{key}': pids still alive: {survivors:?}. \
Proceeding with record cleanup anyway; concurrent spawn protection may be weakened."
),
}
}
} else {
slog_warn!(
"[stop_agent] No worktree path recorded for '{key}'; cannot tree-kill, \
falling back to portable_pty SIGHUP (likely no-op for claude-code)."
);
self.kill_child_for_key(&key);
}
// Step 3: now safe to mutate. Status flip, handle abort, drop the
// child_killers entry.
let (task_handle, tx) = {
let mut agents = self.agents.lock().map_err(|e| e.to_string())?; let mut agents = self.agents.lock().map_err(|e| e.to_string())?;
let agent = agents let agent = agents
.get_mut(&key) .get_mut(&key)
.ok_or_else(|| format!("No agent '{agent_name}' for story '{story_id}'"))?; .ok_or_else(|| format!("No agent '{agent_name}' for story '{story_id}'"))?;
let wt = agent.worktree_info.clone();
let handle = agent.task_handle.take(); let handle = agent.task_handle.take();
let tx = agent.tx.clone(); let tx = agent.tx.clone();
agent.status = AgentStatus::Failed; agent.status = AgentStatus::Failed;
(wt, handle, tx) (handle, tx)
}; };
// Abort the task and kill the PTY child process.
// Note: aborting a spawn_blocking task handle does not interrupt the blocking
// thread, so we must also kill the child process directly via the killer registry.
if let Some(handle) = task_handle { if let Some(handle) = task_handle {
handle.abort(); handle.abort();
let _ = handle.await; let _ = handle.await;
} }
self.kill_child_for_key(&key); if let Ok(mut killers) = self.child_killers.lock() {
killers.remove(&key);
}
// Preserve worktree for inspection — don't destroy agent's work on stop. // Preserve worktree for inspection — don't destroy agent's work on stop.
if let Some(ref wt) = worktree_info { if let Some(ref wt) = worktree_info {
@@ -53,7 +106,7 @@ impl AgentPool {
status: "stopped".to_string(), status: "stopped".to_string(),
}); });
// Remove from map // Remove from map.
{ {
let mut agents = self.agents.lock().map_err(|e| e.to_string())?; let mut agents = self.agents.lock().map_err(|e| e.to_string())?;
agents.remove(&key); agents.remove(&key);