feat: add process_kill module + use it to fix watchdog double-spawn
Adds `crate::process_kill` — reliable SIGKILL-with-verify primitives used
across the server in place of the various ad-hoc kill paths that ignored
whether their kills actually took effect. The module exposes three pieces:
- `sigkill_pids_and_verify(pids)`: SIGKILL each pid and block (up to 2s)
until every pid is verified gone. Returns the list of surviving pids if
any remain after the timeout.
- `pids_matching(pattern)`: pgrep -f wrapper.
- `descendant_pids(root)`: recursive pgrep -P walker for process trees.
Wires the watchdog's limit-termination path through it, and reorders the
kill/status-update protocol to fix the duplicate-coder bug observed on
story 1086 (2026-05-15):
Before: check_agent_limits set status=Failed before the kill ran. The
kill itself was `portable_pty::ChildKiller::kill()`, which sends SIGHUP
on Unix — claude-code ignores SIGHUP, so the process kept running while
the agent record was already marked terminated. The idempotency check
in `start_agent` whitelists Running/Pending, so the next auto-assign
pass spawned a fresh agent alongside the still-alive prior one. Two
claude PIDs sharing one session_id, racing on the same worktree.
After: status update is moved OUT of check_agent_limits and into the
caller AFTER the kill is verified. The kill itself is now SIGKILL-the-
process-tree-in-the-worktree, with explicit verification that every pid
is gone. The idempotency window is closed.
The existing watchdog test suite (14 tests) still passes; 7 new tests
cover the process_kill primitives directly.
`agents/pool/process.rs`'s `kill_all_children` and `kill_child_for_key`
still use the old portable_pty SIGHUP path — they have the same bug but
in lower-impact code paths (shutdown, operator stop). They will be
migrated under a separate story to keep this commit focused.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -187,13 +187,14 @@ pub(super) fn check_agent_limits(
|
||||
),
|
||||
};
|
||||
|
||||
// Mark agent as Failed with termination reason.
|
||||
if let Ok(mut lock) = agents.lock()
|
||||
&& let Some(agent) = lock.get_mut(key)
|
||||
{
|
||||
agent.status = AgentStatus::Failed;
|
||||
agent.termination_reason = Some(reason.clone());
|
||||
}
|
||||
// NOTE: agent status is intentionally NOT updated here. Setting
|
||||
// `status = Failed` before the kill (the previous behaviour)
|
||||
// opened a window where the `start_agent` idempotency check
|
||||
// (which whitelists Running/Pending) would let a fresh spawn
|
||||
// through while the prior PTY child was still alive — directly
|
||||
// causing the concurrent-agents bug we hit on story 1086
|
||||
// (2026-05-15). The caller (`run_watchdog_pass`) is responsible
|
||||
// for: (1) verifying the kill, (2) THEN updating the agent record.
|
||||
|
||||
slog!("[watchdog] Terminating agent '{key}': {reason_str}.");
|
||||
|
||||
|
||||
@@ -9,8 +9,11 @@ mod tests;
|
||||
|
||||
use std::path::Path;
|
||||
|
||||
use crate::agents::AgentStatus;
|
||||
use crate::config::ProjectConfig;
|
||||
use crate::process_kill::{pids_matching, sigkill_pids_and_verify};
|
||||
use crate::slog;
|
||||
use crate::slog_warn;
|
||||
|
||||
use super::super::AgentPool;
|
||||
use limits::check_agent_limits;
|
||||
@@ -42,14 +45,70 @@ impl AgentPool {
|
||||
if let Some(root) = project_root {
|
||||
let terminated = check_agent_limits(&self.agents, root);
|
||||
let config = ProjectConfig::load(root).unwrap_or_default();
|
||||
for (key, _reason) in &terminated {
|
||||
// Kill the PTY child and abort the task, same as stop_agent.
|
||||
self.kill_child_for_key(key);
|
||||
for (key, reason) in &terminated {
|
||||
// Step 1: snapshot the agent's worktree path so we can find every
|
||||
// process running in it (claude + any subprocesses). This must
|
||||
// happen BEFORE we mutate the agent record so we can read the
|
||||
// worktree info safely.
|
||||
let worktree_path = self.agents.lock().ok().and_then(|lock| {
|
||||
lock.get(key)
|
||||
.and_then(|a| a.worktree_info.as_ref().map(|wt| wt.path.clone()))
|
||||
});
|
||||
|
||||
// Step 2: SIGKILL every process running in the worktree and
|
||||
// BLOCK until verified gone. The previous mechanism — portable_pty's
|
||||
// `ChildKiller::kill()` — sends SIGHUP, which claude-code
|
||||
// ignores, leaving the process alive while the agent record
|
||||
// was being marked terminated; that gap let a fresh spawn race
|
||||
// in alongside the surviving one. SIGKILL is uncatchable;
|
||||
// [`sigkill_pids_and_verify`] only returns once the kernel has
|
||||
// reaped each pid.
|
||||
if let Some(wt_path) = worktree_path.as_ref() {
|
||||
let pids = pids_matching(&wt_path.display().to_string());
|
||||
if pids.is_empty() {
|
||||
// Nothing in this worktree — agent likely already
|
||||
// exited on its own before the watchdog noticed.
|
||||
} else {
|
||||
match sigkill_pids_and_verify(&pids) {
|
||||
Ok(n) => slog!(
|
||||
"[watchdog] SIGKILL'd {n} process(es) in worktree {} for '{key}'.",
|
||||
wt_path.display()
|
||||
),
|
||||
Err(survivors) => slog_warn!(
|
||||
"[watchdog] SIGKILL incomplete for '{key}': pids still alive: {survivors:?}. \
|
||||
Proceeding with cleanup; concurrent spawn protection may be weakened."
|
||||
),
|
||||
}
|
||||
}
|
||||
} else {
|
||||
slog_warn!(
|
||||
"[watchdog] No worktree path recorded for '{key}'; cannot tree-kill, \
|
||||
falling back to portable_pty SIGHUP (likely no-op for claude-code)."
|
||||
);
|
||||
self.kill_child_for_key(key);
|
||||
}
|
||||
|
||||
// Step 3: NOW update the agent record. The process is verified
|
||||
// gone (or we logged that SIGKILL didn't take effect, which is
|
||||
// exceptional), so flipping status away from Running can no
|
||||
// longer open a window for a concurrent spawn.
|
||||
if let Ok(mut lock) = self.agents.lock()
|
||||
&& let Some(agent) = lock.get_mut(key)
|
||||
&& let Some(handle) = agent.task_handle.take()
|
||||
{
|
||||
handle.abort();
|
||||
agent.status = AgentStatus::Failed;
|
||||
agent.termination_reason = Some(reason.clone());
|
||||
if let Some(handle) = agent.task_handle.take() {
|
||||
// Best-effort abort of the outer tokio task. The PTY
|
||||
// blocking thread already returned (claude is dead),
|
||||
// so this is bookkeeping rather than load-bearing.
|
||||
handle.abort();
|
||||
}
|
||||
}
|
||||
|
||||
// Step 4: drop the (now-stale) child_killers entry — the
|
||||
// process it pointed at is gone.
|
||||
if let Ok(mut killers) = self.child_killers.lock() {
|
||||
killers.remove(key);
|
||||
}
|
||||
|
||||
// Use the retry mechanism: increment retry_count and only block
|
||||
|
||||
Reference in New Issue
Block a user