feat: add process_kill module + use it to fix watchdog double-spawn

Adds `crate::process_kill` — reliable SIGKILL-with-verify primitives used across the server in place of the various ad-hoc kill paths that ignored the return values indicating whether the kill actually took effect. The module exposes three pieces:

- `sigkill_pids_and_verify(pids)`: SIGKILL each pid and block (up to 2s) until every pid is verified gone. Returns the survivors if any remain.
- `pids_matching(pattern)`: `pgrep -f` wrapper.
- `descendant_pids(root)`: recursive `pgrep -P` walker for process trees.

Wires the watchdog's limit-termination path through it, and reorders the protocol to fix the duplicate-coder bug observed on story 1086 (2026-05-15):

Before: check_agent_limits set status=Failed before the kill ran. The kill itself was `portable_pty::ChildKiller::kill()`, which sends SIGHUP on Unix — claude-code ignores SIGHUP, so the process kept running while the agent record was already marked terminated. The idempotency check in `start_agent` whitelists Running/Pending, so the next auto-assign pass spawned a fresh agent alongside the still-alive prior one. Two claude PIDs sharing one session_id, racing on the same worktree.

After: status update is moved OUT of check_agent_limits and into the caller AFTER the kill is verified. The kill itself is now SIGKILL-the-process-tree-in-the-worktree, with explicit verification that every pid is gone. The idempotency window is closed.

The existing watchdog test suite (14 tests) still passes; 7 new tests cover the process_kill primitives directly.

`agents/pool/process.rs`'s `kill_all_children` and `kill_child_for_key` still use the old portable_pty SIGHUP path — they have the same bug but in lower-impact code paths (shutdown, operator stop). They will be migrated under a separate story to keep this commit focused.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-15 10:36:33 +01:00
parent 8446ab1c71
commit fe9804b32c
5 changed files with 403 additions and 12 deletions
@@ -187,13 +187,14 @@ pub(super) fn check_agent_limits(
                ),
            };

-            // Mark agent as Failed with termination reason.
-            if let Ok(mut lock) = agents.lock()
-                && let Some(agent) = lock.get_mut(key)
-            {
-                agent.status = AgentStatus::Failed;
-                agent.termination_reason = Some(reason.clone());
-            }
+            // NOTE: agent status is intentionally NOT updated here.  Setting
+            // `status = Failed` before the kill (the previous behaviour)
+            // opened a window where the `start_agent` idempotency check
+            // (which whitelists Running/Pending) would let a fresh spawn
+            // through while the prior PTY child was still alive — directly
+            // causing the concurrent-agents bug we hit on story 1086
+            // (2026-05-15).  The caller (`run_watchdog_pass`) is responsible
+            // for: (1) verifying the kill, (2) THEN updating the agent record.

            slog!("[watchdog] Terminating agent '{key}': {reason_str}.");

@@ -9,8 +9,11 @@ mod tests;

 use std::path::Path;

+use crate::agents::AgentStatus;
 use crate::config::ProjectConfig;
+use crate::process_kill::{pids_matching, sigkill_pids_and_verify};
 use crate::slog;
+use crate::slog_warn;

 use super::super::AgentPool;
 use limits::check_agent_limits;
@@ -42,14 +45,70 @@ impl AgentPool {
        if let Some(root) = project_root {
            let terminated = check_agent_limits(&self.agents, root);
            let config = ProjectConfig::load(root).unwrap_or_default();
-            for (key, _reason) in &terminated {
-                // Kill the PTY child and abort the task, same as stop_agent.
-                self.kill_child_for_key(key);
+            for (key, reason) in &terminated {
+                // Step 1: snapshot the agent's worktree path so we can find every
+                // process running in it (claude + any subprocesses).  This must
+                // happen BEFORE we mutate the agent record so we can read the
+                // worktree info safely.
+                let worktree_path = self.agents.lock().ok().and_then(|lock| {
+                    lock.get(key)
+                        .and_then(|a| a.worktree_info.as_ref().map(|wt| wt.path.clone()))
+                });
+
+                // Step 2: SIGKILL every process running in the worktree and
+                // BLOCK until verified gone.  The previous mechanism — portable_pty's
+                // `ChildKiller::kill()` — sends SIGHUP, which claude-code
+                // ignores, leaving the process alive while the agent record
+                // was being marked terminated; that gap let a fresh spawn race
+                // in alongside the surviving one.  SIGKILL is uncatchable;
+                // [`sigkill_pids_and_verify`] returns `Ok` once every pid is
+                // verified gone, or `Err(survivors)` if any remain (handled below).
+                if let Some(wt_path) = worktree_path.as_ref() {
+                    let pids = pids_matching(&wt_path.display().to_string());
+                    if pids.is_empty() {
+                        // Nothing in this worktree — agent likely already
+                        // exited on its own before the watchdog noticed.
+                    } else {
+                        match sigkill_pids_and_verify(&pids) {
+                            Ok(n) => slog!(
+                                "[watchdog] SIGKILL'd {n} process(es) in worktree {} for '{key}'.",
+                                wt_path.display()
+                            ),
+                            Err(survivors) => slog_warn!(
+                                "[watchdog] SIGKILL incomplete for '{key}': pids still alive: {survivors:?}. \
+                                 Proceeding with cleanup; concurrent spawn protection may be weakened."
+                            ),
+                        }
+                    }
+                } else {
+                    slog_warn!(
+                        "[watchdog] No worktree path recorded for '{key}'; cannot tree-kill, \
+                         falling back to portable_pty SIGHUP (likely no-op for claude-code)."
+                    );
+                    self.kill_child_for_key(key);
+                }
+
+                // Step 3: NOW update the agent record.  The process is verified
+                // gone (or we logged that SIGKILL didn't take effect, which is
+                // exceptional), so flipping status away from Running can no
+                // longer open a window for a concurrent spawn.
                if let Ok(mut lock) = self.agents.lock()
                    && let Some(agent) = lock.get_mut(key)
-                    && let Some(handle) = agent.task_handle.take()
                {
-                    handle.abort();
+                    agent.status = AgentStatus::Failed;
+                    agent.termination_reason = Some(reason.clone());
+                    if let Some(handle) = agent.task_handle.take() {
+                        // Best-effort abort of the outer tokio task.  When the
+                        // kill above succeeded the PTY blocking thread should
+                        // already have returned, so this is normally bookkeeping.
+                        handle.abort();
+                    }
+                }
+
+                // Step 4: drop the (now-stale) child_killers entry — the
+                // process it pointed at is gone.
+                if let Ok(mut killers) = self.child_killers.lock() {
+                    killers.remove(key);
                }

                // Use the retry mechanism: increment retry_count and only block