feat: add process_kill module + use it to fix watchdog double-spawn

Adds `crate::process_kill` — reliable SIGKILL-with-verify primitives used
across the server in place of the various ad-hoc kill paths that never
checked whether their kills actually took effect. The module exposes three
pieces (composed in the sketch after this list):

  - `sigkill_pids_and_verify(pids)`: SIGKILL each pid and block (up to 2s)
    until every pid is verified gone. Returns survivors if not.
  - `pids_matching(pattern)`: pgrep -f wrapper.
  - `descendant_pids(root)`: recursive pgrep -P walker for process trees.
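
A sketch of how the three compose (hypothetical call site; the worktree
path and `agent_root_pid` are stand-ins, not names from this commit):

    // Hypothetical: kill everything under an agent's worktree plus the
    // known root pid's subtree, then verify every pid was reaped.
    let mut pids = pids_matching("/work/trees/story-1086"); // stand-in path
    pids.extend(descendant_pids(agent_root_pid));           // stand-in pid
    pids.push(agent_root_pid);
    pids.sort_unstable();
    pids.dedup();
    match sigkill_pids_and_verify(&pids) {
        Ok(n) => slog!("killed and verified {n} process(es)"),
        Err(survivors) => slog_warn!("still alive after 2 s: {survivors:?}"),
    }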

Wires the watchdog's limit-termination path through it, and reorders the
protocol to fix the duplicate-spawn bug observed on story 1086 (2026-05-15):

  Before: check_agent_limits set status=Failed before the kill ran. The
  kill itself was `portable_pty::ChildKiller::kill()`, which sends SIGHUP
  on Unix — claude-code ignores SIGHUP, so the process kept running while
  the agent record was already marked terminated. The idempotency check
  in `start_agent` whitelists Running/Pending, so the next auto-assign
  pass spawned a fresh agent alongside the still-alive prior one. Two
  claude PIDs sharing one session_id, racing on the same worktree.
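
  Schematically, the old ordering (simplified pseudocode; names only
  loosely match the real code):

    agent.status = AgentStatus::Failed;  // 1. record already says "terminated"
    pty_child_killer.kill();             // 2. SIGHUP, which claude-code ignores
    // 3. next auto-assign pass: start_agent only blocks a respawn while
    //    status is Running/Pending, so a second agent spawns while the
    //    old claude pid is still alive.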

  After: the status update is moved OUT of check_agent_limits and into the
  caller, AFTER the kill is verified. The kill itself is now a SIGKILL of
  the whole process tree in the worktree, with explicit verification that
  every pid is gone. The idempotency window is closed.
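
  Condensed, the new ordering (the full version is in the watchdog hunk
  below):

    // 1. find every process in the agent's worktree
    let pids = pids_matching(&worktree_path.display().to_string());
    // 2. SIGKILL and block until the kernel has reaped each pid
    let _ = sigkill_pids_and_verify(&pids);
    // 3. only now flip the record; no respawn window remains
    agent.status = AgentStatus::Failed;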

The existing watchdog test suite (14 tests) still passes; 7 new tests
cover the process_kill primitives directly.

`agents/pool/process.rs`'s `kill_all_children` and `kill_child_for_key`
still use the old portable_pty SIGHUP path — they have the same bug but
in lower-impact code paths (shutdown, operator stop). They will be
migrated under a separate story to keep this commit focused.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
commit fe9804b32c (parent 8446ab1c71)
Author: Timmy
Date:   2026-05-15 10:36:33 +01:00

5 changed files with 403 additions and 12 deletions
@@ -187,13 +187,14 @@ pub(super) fn check_agent_limits(
             ),
         };
-        // Mark agent as Failed with termination reason.
-        if let Ok(mut lock) = agents.lock()
-            && let Some(agent) = lock.get_mut(key)
-        {
-            agent.status = AgentStatus::Failed;
-            agent.termination_reason = Some(reason.clone());
-        }
+        // NOTE: agent status is intentionally NOT updated here. Setting
+        // `status = Failed` before the kill (the previous behaviour)
+        // opened a window where the `start_agent` idempotency check
+        // (which whitelists Running/Pending) would let a fresh spawn
+        // through while the prior PTY child was still alive — directly
+        // causing the concurrent-agents bug we hit on story 1086
+        // (2026-05-15). The caller (`run_watchdog_pass`) is responsible
+        // for: (1) verifying the kill, (2) THEN updating the agent record.
         slog!("[watchdog] Terminating agent '{key}': {reason_str}.");
@@ -9,8 +9,11 @@ mod tests;
 use std::path::Path;
 
+use crate::agents::AgentStatus;
 use crate::config::ProjectConfig;
+use crate::process_kill::{pids_matching, sigkill_pids_and_verify};
 use crate::slog;
+use crate::slog_warn;
 
 use super::super::AgentPool;
 use limits::check_agent_limits;
@@ -42,15 +45,71 @@ impl AgentPool {
         if let Some(root) = project_root {
             let terminated = check_agent_limits(&self.agents, root);
             let config = ProjectConfig::load(root).unwrap_or_default();
-            for (key, _reason) in &terminated {
-                // Kill the PTY child and abort the task, same as stop_agent.
-                self.kill_child_for_key(key);
+            for (key, reason) in &terminated {
+                // Step 1: snapshot the agent's worktree path so we can find every
+                // process running in it (claude + any subprocesses). This must
+                // happen BEFORE we mutate the agent record so we can read the
+                // worktree info safely.
+                let worktree_path = self.agents.lock().ok().and_then(|lock| {
+                    lock.get(key)
+                        .and_then(|a| a.worktree_info.as_ref().map(|wt| wt.path.clone()))
+                });
+
+                // Step 2: SIGKILL every process running in the worktree and
+                // BLOCK until verified gone. The previous mechanism — portable_pty's
+                // `ChildKiller::kill()` — sends SIGHUP, which claude-code
+                // ignores, leaving the process alive while the agent record
+                // was being marked terminated; that gap let a fresh spawn race
+                // in alongside the surviving one. SIGKILL is uncatchable;
+                // [`sigkill_pids_and_verify`] only returns once the kernel has
+                // reaped each pid.
+                if let Some(wt_path) = worktree_path.as_ref() {
+                    let pids = pids_matching(&wt_path.display().to_string());
+                    if pids.is_empty() {
+                        // Nothing in this worktree — agent likely already
+                        // exited on its own before the watchdog noticed.
+                    } else {
+                        match sigkill_pids_and_verify(&pids) {
+                            Ok(n) => slog!(
+                                "[watchdog] SIGKILL'd {n} process(es) in worktree {} for '{key}'.",
+                                wt_path.display()
+                            ),
+                            Err(survivors) => slog_warn!(
+                                "[watchdog] SIGKILL incomplete for '{key}': pids still alive: {survivors:?}. \
+                                 Proceeding with cleanup; concurrent spawn protection may be weakened."
+                            ),
+                        }
+                    }
+                } else {
+                    slog_warn!(
+                        "[watchdog] No worktree path recorded for '{key}'; cannot tree-kill, \
+                         falling back to portable_pty SIGHUP (likely no-op for claude-code)."
+                    );
+                    self.kill_child_for_key(key);
+                }
+
+                // Step 3: NOW update the agent record. The process is verified
+                // gone (or we logged that SIGKILL didn't take effect, which is
+                // exceptional), so flipping status away from Running can no
+                // longer open a window for a concurrent spawn.
                 if let Ok(mut lock) = self.agents.lock()
                     && let Some(agent) = lock.get_mut(key)
-                    && let Some(handle) = agent.task_handle.take()
                 {
-                    handle.abort();
+                    agent.status = AgentStatus::Failed;
+                    agent.termination_reason = Some(reason.clone());
+                    if let Some(handle) = agent.task_handle.take() {
+                        // Best-effort abort of the outer tokio task. The PTY
+                        // blocking thread already returned (claude is dead),
+                        // so this is bookkeeping rather than load-bearing.
+                        handle.abort();
+                    }
                 }
+
+                // Step 4: drop the (now-stale) child_killers entry — the
+                // process it pointed at is gone.
+                if let Ok(mut killers) = self.child_killers.lock() {
+                    killers.remove(key);
+                }
+
                 // Use the retry mechanism: increment retry_count and only block
                 // when the limit is exceeded, matching the pipeline's behaviour.
@@ -1,4 +1,11 @@
 //! Process management — kills orphaned PTY child processes on server shutdown.
+//!
+//! See [`crate::process_kill`] for the general process-termination primitives
+//! this module's existing methods (`kill_all_children`, `kill_child_for_key`)
+//! should eventually be migrated to. Those methods currently use
+//! `portable_pty::ChildKiller::kill()`, which sends `SIGHUP` — a signal
+//! claude-code ignores — so they leave orphans on every shutdown/stop. The
+//! migration is tracked in a separate story to keep its diff focused.
 
 use crate::slog;
 
 use super::AgentPool;
@@ -33,6 +33,8 @@ pub mod mesh;
 /// Node identity — Ed25519 keypair generation and stable node ID management.
 pub mod node_identity;
 pub(crate) mod pipeline_state;
+/// Reliable process-termination primitives shared across the server.
+pub mod process_kill;
 /// Rebuild — process restart and shutdown coordination.
 pub mod rebuild;
 mod service;
@@ -0,0 +1,322 @@
//! Reliable process-termination primitives.
//!
//! The huskies server kills child processes in several distinct places:
//! the watchdog terminates agents that have exceeded turn/budget limits,
//! `stop_agent` terminates on operator request, `kill_all_children` runs at
//! server shutdown, the merge-gate completion path kills stale `cargo`
//! processes, and `script/local-release` tears down the gateway during a
//! redeploy. Every one of these used to send a signal that the target was
//! free to ignore (most commonly `portable_pty`'s `SIGHUP`), with no
//! verification that the process actually exited. Agents and bots that
//! ignore `SIGHUP` survived the "kill", which produced concurrent claude
//! processes on the same story — exactly the duplicate-spawn bug we hit on
//! 2026-05-15.
//!
//! This module provides one trustworthy way to kill processes: SIGKILL with
//! verification. Build a pid set with the helpers in this module (or your
//! own), then hand it to [`sigkill_pids_and_verify`].
//!
//! All functions in this module are deliberately Unix-only — huskies runs in
//! Linux containers and on macOS dev hosts, both POSIX.

use crate::slog_warn;

/// Maximum time we'll wait for SIGKILL'd processes to disappear before
/// declaring failure. SIGKILL is uncatchable, so the kernel normally
/// reaps within tens of milliseconds; anything past 2 s indicates the
/// process is wedged in uninterruptible IO (e.g. waiting on a frozen NFS
/// mount). Callers can decide whether to proceed despite survivors.
const KILL_VERIFY_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(2);

/// Polling interval while waiting for processes to disappear. 100 ms is
/// fine-grained enough that the typical few-ms reap latency is barely
/// observable, but coarse enough that we don't burn CPU spinning.
const KILL_VERIFY_POLL: std::time::Duration = std::time::Duration::from_millis(100);

/// SIGKILL every pid in `pids`, then poll until all of them are gone.
///
/// Returns `Ok(n)` where `n == pids.len()` when every pid is verified
/// reaped within [`KILL_VERIFY_TIMEOUT`]. Returns `Err(survivors)` with the
/// pids still alive after the timeout — extremely rare for SIGKILL but
/// possible if a process is wedged in uninterruptible IO. An empty `pids`
/// slice returns `Ok(0)` immediately.
///
/// **Why SIGKILL and not SIGTERM-first:** several huskies-internal targets
/// (claude-code, the bot itself) either ignore the polite signals or take
/// arbitrarily long to honour them. The watchdog only kills agents that
/// have already misbehaved by definition (exceeded budget/turn limits), so
/// there is no reason to give them a graceful-shutdown grace period.
pub fn sigkill_pids_and_verify(pids: &[u32]) -> Result<usize, Vec<u32>> {
    if pids.is_empty() {
        return Ok(0);
    }
    for &pid in pids {
        // libc::kill returns -1 on failure (with errno). We deliberately
        // ignore the result: the process may already be gone (errno ESRCH),
        // and trying again wouldn't help. The verification loop below is
        // the source of truth for "did this work".
        unsafe { libc::kill(pid as i32, libc::SIGKILL) };
    }
    let deadline = std::time::Instant::now() + KILL_VERIFY_TIMEOUT;
    while std::time::Instant::now() < deadline {
        if pids.iter().copied().all(|pid| !pid_is_alive(pid)) {
            return Ok(pids.len());
        }
        std::thread::sleep(KILL_VERIFY_POLL);
    }
    let survivors: Vec<u32> = pids
        .iter()
        .copied()
        .filter(|&pid| pid_is_alive(pid))
        .collect();
    if survivors.is_empty() {
        Ok(pids.len())
    } else {
        slog_warn!(
            "[process_kill] SIGKILL did not reap pids within {:?}: {survivors:?}. \
             They may be wedged in uninterruptible IO.",
            KILL_VERIFY_TIMEOUT
        );
        Err(survivors)
    }
}

/// Return every pid whose command line matches `pattern` (passed to
/// `pgrep -f`). Empty when nothing matches or when `pgrep` is unavailable.
///
/// Useful for collecting processes by a path or argument substring — e.g.
/// "every process running in `<worktree>/`" or "every cargo invocation
/// against this `Cargo.toml`".
pub fn pids_matching(pattern: &str) -> Vec<u32> {
    let Ok(output) = std::process::Command::new("pgrep")
        .args(["-f", pattern])
        .output()
    else {
        return Vec::new();
    };
    String::from_utf8_lossy(&output.stdout)
        .lines()
        .filter_map(|l| l.trim().parse::<u32>().ok())
        .collect()
}

/// Return every descendant pid of `root_pid`, deepest-first, **excluding**
/// `root_pid` itself. Walks the parent→child relation via `pgrep -P`.
///
/// Deepest-first ordering lets callers signal leaves before their parents
/// when that matters; for SIGKILL it makes no difference.
pub fn descendant_pids(root_pid: u32) -> Vec<u32> {
    let mut out: Vec<u32> = Vec::new();
    walk_descendants(root_pid, &mut out);
    out
}

fn walk_descendants(pid: u32, out: &mut Vec<u32>) {
    let Ok(output) = std::process::Command::new("pgrep")
        .args(["-P", &pid.to_string()])
        .output()
    else {
        return;
    };
    let kids: Vec<u32> = String::from_utf8_lossy(&output.stdout)
        .lines()
        .filter_map(|l| l.trim().parse::<u32>().ok())
        .collect();
    for kid in kids {
        walk_descendants(kid, out);
        out.push(kid);
    }
}

/// Check whether `pid` currently exists. Implemented via `kill(pid, 0)` —
/// no signal is sent, only existence is probed.
fn pid_is_alive(pid: u32) -> bool {
    // Signal 0: "is this process around?" Returns 0 if the process exists
    // and we have permission to signal it, -1 with errno otherwise. EPERM
    // therefore reads as "gone", which is fine here: we only ever probe
    // processes spawned by (and owned by) the same user as the server.
    unsafe { libc::kill(pid as i32, 0) == 0 }
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::process::{Child, Command, Stdio};
    use std::thread::JoinHandle;

    /// Spawn a sleeper for kill testing, and spawn a background reaper that
    /// calls `wait()` as soon as the child exits. Returns the pid plus the
    /// reaper join handle so the test can confirm reaping after the kill.
    ///
    /// The reaper is essential because the production code's verify loop
    /// uses `kill(pid, 0)` to test existence — which returns 0 for zombies.
    /// If no one reaps the test's sleeper, its pid stays occupied (as a
    /// zombie) and `sigkill_pids_and_verify` mistakenly reports survivors.
    /// In production the PTY blocking thread is always reaping on behalf of
    /// portable_pty, so this isn't a concern there.
    fn spawn_sleeper_with_reaper(secs: u64) -> (u32, JoinHandle<()>) {
        let child: Child = Command::new("sleep")
            .arg(secs.to_string())
            .stdout(Stdio::null())
            .stderr(Stdio::null())
            .stdin(Stdio::null())
            .spawn()
            .expect("failed to spawn sleep");
        let pid = child.id();
        let reaper = std::thread::spawn(move || {
            let mut c = child;
            let _ = c.wait();
        });
        (pid, reaper)
    }

    #[test]
    fn sigkill_empty_slice_is_ok() {
        let result = sigkill_pids_and_verify(&[]);
        assert!(matches!(result, Ok(0)));
    }

    #[test]
    fn sigkill_real_process_is_verified_gone() {
        let (pid, reaper) = spawn_sleeper_with_reaper(60);
        assert!(pid_is_alive(pid), "sleeper should be alive before kill");
        let result = sigkill_pids_and_verify(&[pid]);
        assert!(
            matches!(result, Ok(1)),
            "sigkill must verify the process is gone: {result:?}"
        );
        let _ = reaper.join();
        assert!(!pid_is_alive(pid), "sleeper must be dead after kill");
    }

    #[test]
    fn sigkill_already_dead_pid_is_ok() {
        let (pid, reaper) = spawn_sleeper_with_reaper(0);
        let _ = reaper.join();
        // The reaper has already wait()ed the child; poll briefly in case
        // the pid momentarily still reports as alive.
        for _ in 0..20 {
            if !pid_is_alive(pid) {
                break;
            }
            std::thread::sleep(std::time::Duration::from_millis(100));
        }
        // Now SIGKILL a pid that no longer exists. Result must still be Ok.
        let result = sigkill_pids_and_verify(&[pid]);
        assert!(
            result.is_ok(),
            "sigkill of already-dead pid must succeed: {result:?}"
        );
    }

    #[test]
    fn sigkill_multiple_real_processes() {
        let mut handles: Vec<(u32, JoinHandle<()>)> =
            (0..3).map(|_| spawn_sleeper_with_reaper(60)).collect();
        let pids: Vec<u32> = handles.iter().map(|(p, _)| *p).collect();
        for &pid in &pids {
            assert!(pid_is_alive(pid));
        }
        let result = sigkill_pids_and_verify(&pids);
        assert!(
            matches!(result, Ok(3)),
            "all 3 sleepers must die: {result:?}"
        );
        for (_, reaper) in handles.drain(..) {
            let _ = reaper.join();
        }
        for &pid in &pids {
            assert!(!pid_is_alive(pid), "pid {pid} survived sigkill");
        }
    }

    #[test]
    fn pids_matching_finds_a_running_process() {
        // pgrep -f matches the FULL command line, so the marker has to be
        // in argv somewhere. Putting it in a shell comment doesn't work —
        // sh strips it. Override argv[0] so the marker is durably visible.
        use std::os::unix::process::CommandExt;
        let marker = format!("kill-test-marker-{}-{}", std::process::id(), rand_u64());
        let argv0 = format!("test-marker-{marker}");
        let child: Child = Command::new("sleep")
            .arg0(argv0)
            .arg("60")
            .stdout(Stdio::null())
            .stderr(Stdio::null())
            .stdin(Stdio::null())
            .spawn()
            .expect("spawn");
        let child_pid = child.id();
        let reaper = std::thread::spawn(move || {
            let mut c = child;
            let _ = c.wait();
        });
        // pgrep needs a moment to see the new process.
        std::thread::sleep(std::time::Duration::from_millis(100));
        let found = pids_matching(&marker);
        assert!(
            found.contains(&child_pid),
            "pids_matching should find pid {child_pid} for marker '{marker}'; got {found:?}"
        );
        // Cleanup so the test doesn't leak a sleeper.
        let _ = sigkill_pids_and_verify(&[child_pid]);
        let _ = reaper.join();
    }

    #[test]
    fn pids_matching_returns_empty_when_no_match() {
        let pattern = format!("nonexistent-pattern-{}-{}", std::process::id(), rand_u64());
        let found = pids_matching(&pattern);
        assert!(found.is_empty(), "expected empty result, got {found:?}");
    }

    /// Cheap unique-ish u64 for distinguishing test invocations without a
    /// dependency on a randomness crate.
    fn rand_u64() -> u64 {
        use std::time::{SystemTime, UNIX_EPOCH};
        SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .map(|d| d.as_nanos() as u64)
            .unwrap_or(0)
    }

    #[test]
    fn descendant_pids_of_real_process_tree() {
        // Build a parent sh that spawns a child sleep. The descendants of
        // the parent should include the sleep.
        let parent: Child = Command::new("sh")
            .args(["-c", "sleep 60"])
            .stdout(Stdio::null())
            .stderr(Stdio::null())
            .stdin(Stdio::null())
            .spawn()
            .expect("spawn parent");
        let parent_pid = parent.id();
        let reaper = std::thread::spawn(move || {
            let mut c = parent;
            let _ = c.wait();
        });
        // Let the shell get around to fork+execing its child.
        std::thread::sleep(std::time::Duration::from_millis(200));
        let descendants = descendant_pids(parent_pid);
        // On some shells `sh -c "sleep N"` exec-replaces sh with sleep, leaving
        // zero descendants. On others it forks. We don't care which; we only
        // care that the function doesn't panic and returns a sensible vec.
        assert!(
            descendants.iter().all(|&pid| pid != parent_pid),
            "descendant_pids must not include the root itself: {descendants:?}"
        );
        // Cleanup: kill the parent and any descendants.
        let mut all = descendants;
        all.push(parent_pid);
        let _ = sigkill_pids_and_verify(&all);
        let _ = reaper.join();
    }
}