fix(agents): kill-then-status reorder in stop_agent
stop_agent had the same order-of-operations bug fixed in the watchdog: status flipped to Failed before the claude process was verified gone, opening the idempotency window that allowed a duplicate spawn to race in alongside the surviving process. Now follows the three-step protocol: 1. Read worktree path under a read-only lock (no mutation). 2. SIGKILL the worktree's process tree via process_kill and block until verified gone — start_agent's Running/Pending whitelist continues to reject duplicate spawns throughout. 3. Only then mutate the agent record, abort the task handle, and drop the child_killers entry. Falls back to the old portable_pty SIGHUP path (with a warning) when no worktree was recorded, matching the watchdog's behaviour. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1,6 +1,8 @@
|
||||
//! Agent stop — terminates a running agent while preserving its worktree.
|
||||
use crate::process_kill::{pids_matching, sigkill_pids_and_verify};
|
||||
use crate::slog;
|
||||
use crate::slog_error;
|
||||
use crate::slog_warn;
|
||||
use std::path::Path;
|
||||
|
||||
use super::super::{AgentEvent, AgentStatus};
|
||||
@@ -9,6 +11,22 @@ use super::types::composite_key;
|
||||
|
||||
impl AgentPool {
|
||||
/// Stop a running agent. Worktree is preserved for inspection.
|
||||
///
|
||||
/// **Order of operations matters here.** The naive implementation set
|
||||
/// `status = Failed` before killing the process, which opened the same
|
||||
/// idempotency window that produced the 2026-05-15 watchdog
|
||||
/// double-spawn: the `start_agent` check whitelists Running/Pending,
|
||||
/// so flipping status away from Running while the underlying claude
|
||||
/// process was still alive let a fresh spawn race in alongside the
|
||||
/// surviving one. The fix is:
|
||||
///
|
||||
/// 1. Read the worktree path (so we can find every process running
|
||||
/// in it) without mutating the agent record yet.
|
||||
/// 2. SIGKILL the process tree via [`crate::process_kill`] and BLOCK
|
||||
/// until verified gone. While this is in progress, status stays
|
||||
/// Running and `start_agent` continues to reject duplicate spawns.
|
||||
/// 3. Now that the process is gone, mutate the agent record (status,
|
||||
/// handle abort, removal).
|
||||
pub async fn stop_agent(
|
||||
&self,
|
||||
_project_root: &Path,
|
||||
@@ -17,27 +35,62 @@ impl AgentPool {
|
||||
) -> Result<(), String> {
|
||||
let key = composite_key(story_id, agent_name);
|
||||
|
||||
let (worktree_info, task_handle, tx) = {
|
||||
// Step 1: snapshot the worktree path (no status mutation yet).
|
||||
let worktree_info = {
|
||||
let agents = self.agents.lock().map_err(|e| e.to_string())?;
|
||||
let agent = agents
|
||||
.get(&key)
|
||||
.ok_or_else(|| format!("No agent '{agent_name}' for story '{story_id}'"))?;
|
||||
agent.worktree_info.clone()
|
||||
};
|
||||
|
||||
// Step 2: SIGKILL every process running in the worktree, verify gone.
|
||||
// We do this BEFORE updating the agent record so the idempotency check
|
||||
// in `start_agent` keeps rejecting duplicate spawns until the slot is
|
||||
// legitimately free. Replaces the prior `kill_child_for_key` path,
|
||||
// which sent SIGHUP via portable_pty (ignored by claude-code).
|
||||
if let Some(wt) = worktree_info.as_ref() {
|
||||
let pids = pids_matching(&wt.path.display().to_string());
|
||||
if !pids.is_empty() {
|
||||
match sigkill_pids_and_verify(&pids) {
|
||||
Ok(n) => slog!(
|
||||
"[stop_agent] SIGKILL'd {n} process(es) in worktree {} for '{key}'.",
|
||||
wt.path.display()
|
||||
),
|
||||
Err(survivors) => slog_warn!(
|
||||
"[stop_agent] SIGKILL incomplete for '{key}': pids still alive: {survivors:?}. \
|
||||
Proceeding with record cleanup anyway; concurrent spawn protection may be weakened."
|
||||
),
|
||||
}
|
||||
}
|
||||
} else {
|
||||
slog_warn!(
|
||||
"[stop_agent] No worktree path recorded for '{key}'; cannot tree-kill, \
|
||||
falling back to portable_pty SIGHUP (likely no-op for claude-code)."
|
||||
);
|
||||
self.kill_child_for_key(&key);
|
||||
}
|
||||
|
||||
// Step 3: now safe to mutate. Status flip, handle abort, drop the
|
||||
// child_killers entry.
|
||||
let (task_handle, tx) = {
|
||||
let mut agents = self.agents.lock().map_err(|e| e.to_string())?;
|
||||
let agent = agents
|
||||
.get_mut(&key)
|
||||
.ok_or_else(|| format!("No agent '{agent_name}' for story '{story_id}'"))?;
|
||||
|
||||
let wt = agent.worktree_info.clone();
|
||||
let handle = agent.task_handle.take();
|
||||
let tx = agent.tx.clone();
|
||||
agent.status = AgentStatus::Failed;
|
||||
(wt, handle, tx)
|
||||
(handle, tx)
|
||||
};
|
||||
|
||||
// Abort the task and kill the PTY child process.
|
||||
// Note: aborting a spawn_blocking task handle does not interrupt the blocking
|
||||
// thread, so we must also kill the child process directly via the killer registry.
|
||||
if let Some(handle) = task_handle {
|
||||
handle.abort();
|
||||
let _ = handle.await;
|
||||
}
|
||||
self.kill_child_for_key(&key);
|
||||
if let Ok(mut killers) = self.child_killers.lock() {
|
||||
killers.remove(&key);
|
||||
}
|
||||
|
||||
// Preserve worktree for inspection — don't destroy agent's work on stop.
|
||||
if let Some(ref wt) = worktree_info {
|
||||
@@ -53,7 +106,7 @@ impl AgentPool {
|
||||
status: "stopped".to_string(),
|
||||
});
|
||||
|
||||
// Remove from map
|
||||
// Remove from map.
|
||||
{
|
||||
let mut agents = self.agents.lock().map_err(|e| e.to_string())?;
|
||||
agents.remove(&key);
|
||||
|
||||
Reference in New Issue
Block a user