fix(agents): kill-then-status reorder in stop_agent
stop_agent had the same order-of-operations bug fixed in the watchdog: status flipped to Failed before the claude process was verified gone, opening the idempotency window that allowed a duplicate spawn to race in alongside the surviving process. Now follows the three-step protocol: 1. Read worktree path under a read-only lock (no mutation). 2. SIGKILL the worktree's process tree via process_kill and block until verified gone — start_agent's Running/Pending whitelist continues to reject duplicate spawns throughout. 3. Only then mutate the agent record, abort the task handle, and drop the child_killers entry. Falls back to the old portable_pty SIGHUP path (with a warning) when no worktree was recorded, matching the watchdog's behaviour. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1,6 +1,8 @@
|
|||||||
//! Agent stop — terminates a running agent while preserving its worktree.
|
//! Agent stop — terminates a running agent while preserving its worktree.
|
||||||
|
use crate::process_kill::{pids_matching, sigkill_pids_and_verify};
|
||||||
use crate::slog;
|
use crate::slog;
|
||||||
use crate::slog_error;
|
use crate::slog_error;
|
||||||
|
use crate::slog_warn;
|
||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
|
|
||||||
use super::super::{AgentEvent, AgentStatus};
|
use super::super::{AgentEvent, AgentStatus};
|
||||||
@@ -9,6 +11,22 @@ use super::types::composite_key;
|
|||||||
|
|
||||||
impl AgentPool {
|
impl AgentPool {
|
||||||
/// Stop a running agent. Worktree is preserved for inspection.
|
/// Stop a running agent. Worktree is preserved for inspection.
|
||||||
|
///
|
||||||
|
/// **Order of operations matters here.** The naive implementation set
|
||||||
|
/// `status = Failed` before killing the process, which opened the same
|
||||||
|
/// idempotency window that produced the 2026-05-15 watchdog
|
||||||
|
/// double-spawn: the `start_agent` check whitelists Running/Pending,
|
||||||
|
/// so flipping status away from Running while the underlying claude
|
||||||
|
/// process was still alive let a fresh spawn race in alongside the
|
||||||
|
/// surviving one. The fix is:
|
||||||
|
///
|
||||||
|
/// 1. Read the worktree path (so we can find every process running
|
||||||
|
/// in it) without mutating the agent record yet.
|
||||||
|
/// 2. SIGKILL the process tree via [`crate::process_kill`] and BLOCK
|
||||||
|
/// until verified gone. While this is in progress, status stays
|
||||||
|
/// Running and `start_agent` continues to reject duplicate spawns.
|
||||||
|
/// 3. Now that the process is gone, mutate the agent record (status,
|
||||||
|
/// handle abort, removal).
|
||||||
pub async fn stop_agent(
|
pub async fn stop_agent(
|
||||||
&self,
|
&self,
|
||||||
_project_root: &Path,
|
_project_root: &Path,
|
||||||
@@ -17,27 +35,62 @@ impl AgentPool {
|
|||||||
) -> Result<(), String> {
|
) -> Result<(), String> {
|
||||||
let key = composite_key(story_id, agent_name);
|
let key = composite_key(story_id, agent_name);
|
||||||
|
|
||||||
let (worktree_info, task_handle, tx) = {
|
// Step 1: snapshot the worktree path (no status mutation yet).
|
||||||
|
let worktree_info = {
|
||||||
|
let agents = self.agents.lock().map_err(|e| e.to_string())?;
|
||||||
|
let agent = agents
|
||||||
|
.get(&key)
|
||||||
|
.ok_or_else(|| format!("No agent '{agent_name}' for story '{story_id}'"))?;
|
||||||
|
agent.worktree_info.clone()
|
||||||
|
};
|
||||||
|
|
||||||
|
// Step 2: SIGKILL every process running in the worktree, verify gone.
|
||||||
|
// We do this BEFORE updating the agent record so the idempotency check
|
||||||
|
// in `start_agent` keeps rejecting duplicate spawns until the slot is
|
||||||
|
// legitimately free. Replaces the prior `kill_child_for_key` path,
|
||||||
|
// which sent SIGHUP via portable_pty (ignored by claude-code).
|
||||||
|
if let Some(wt) = worktree_info.as_ref() {
|
||||||
|
let pids = pids_matching(&wt.path.display().to_string());
|
||||||
|
if !pids.is_empty() {
|
||||||
|
match sigkill_pids_and_verify(&pids) {
|
||||||
|
Ok(n) => slog!(
|
||||||
|
"[stop_agent] SIGKILL'd {n} process(es) in worktree {} for '{key}'.",
|
||||||
|
wt.path.display()
|
||||||
|
),
|
||||||
|
Err(survivors) => slog_warn!(
|
||||||
|
"[stop_agent] SIGKILL incomplete for '{key}': pids still alive: {survivors:?}. \
|
||||||
|
Proceeding with record cleanup anyway; concurrent spawn protection may be weakened."
|
||||||
|
),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
slog_warn!(
|
||||||
|
"[stop_agent] No worktree path recorded for '{key}'; cannot tree-kill, \
|
||||||
|
falling back to portable_pty SIGHUP (likely no-op for claude-code)."
|
||||||
|
);
|
||||||
|
self.kill_child_for_key(&key);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Step 3: now safe to mutate. Status flip, handle abort, drop the
|
||||||
|
// child_killers entry.
|
||||||
|
let (task_handle, tx) = {
|
||||||
let mut agents = self.agents.lock().map_err(|e| e.to_string())?;
|
let mut agents = self.agents.lock().map_err(|e| e.to_string())?;
|
||||||
let agent = agents
|
let agent = agents
|
||||||
.get_mut(&key)
|
.get_mut(&key)
|
||||||
.ok_or_else(|| format!("No agent '{agent_name}' for story '{story_id}'"))?;
|
.ok_or_else(|| format!("No agent '{agent_name}' for story '{story_id}'"))?;
|
||||||
|
|
||||||
let wt = agent.worktree_info.clone();
|
|
||||||
let handle = agent.task_handle.take();
|
let handle = agent.task_handle.take();
|
||||||
let tx = agent.tx.clone();
|
let tx = agent.tx.clone();
|
||||||
agent.status = AgentStatus::Failed;
|
agent.status = AgentStatus::Failed;
|
||||||
(wt, handle, tx)
|
(handle, tx)
|
||||||
};
|
};
|
||||||
|
|
||||||
// Abort the task and kill the PTY child process.
|
|
||||||
// Note: aborting a spawn_blocking task handle does not interrupt the blocking
|
|
||||||
// thread, so we must also kill the child process directly via the killer registry.
|
|
||||||
if let Some(handle) = task_handle {
|
if let Some(handle) = task_handle {
|
||||||
handle.abort();
|
handle.abort();
|
||||||
let _ = handle.await;
|
let _ = handle.await;
|
||||||
}
|
}
|
||||||
self.kill_child_for_key(&key);
|
if let Ok(mut killers) = self.child_killers.lock() {
|
||||||
|
killers.remove(&key);
|
||||||
|
}
|
||||||
|
|
||||||
// Preserve worktree for inspection — don't destroy agent's work on stop.
|
// Preserve worktree for inspection — don't destroy agent's work on stop.
|
||||||
if let Some(ref wt) = worktree_info {
|
if let Some(ref wt) = worktree_info {
|
||||||
@@ -53,7 +106,7 @@ impl AgentPool {
|
|||||||
status: "stopped".to_string(),
|
status: "stopped".to_string(),
|
||||||
});
|
});
|
||||||
|
|
||||||
// Remove from map
|
// Remove from map.
|
||||||
{
|
{
|
||||||
let mut agents = self.agents.lock().map_err(|e| e.to_string())?;
|
let mut agents = self.agents.lock().map_err(|e| e.to_string())?;
|
||||||
agents.remove(&key);
|
agents.remove(&key);
|
||||||
|
|||||||
Reference in New Issue
Block a user