feat: add process_kill module + use it to fix watchdog double-spawn

Adds `crate::process_kill` — reliable SIGKILL-with-verify primitives used
across the server in place of the various ad-hoc kill paths that never
checked whether their kills actually took effect. The module exposes three
pieces (composed in the sketch after this list):

  - `sigkill_pids_and_verify(pids)`: SIGKILL each pid and block (up to 2s)
    until every pid is verified gone. Returns survivors if not.
  - `pids_matching(pattern)`: pgrep -f wrapper.
  - `descendant_pids(root)`: recursive pgrep -P walker for process trees.
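
A sketch of how the three compose (hypothetical call site; the worktree
path and `agent_root_pid` are stand-ins, not names from this commit):

    // Hypothetical: kill everything under an agent's worktree plus the
    // known root pid's subtree, then verify every pid was reaped.
    let mut pids = pids_matching("/work/trees/story-1086"); // stand-in path
    pids.extend(descendant_pids(agent_root_pid));           // stand-in pid
    pids.push(agent_root_pid);
    pids.sort_unstable();
    pids.dedup();
    match sigkill_pids_and_verify(&pids) {
        Ok(n) => slog!("killed and verified {n} process(es)"),
        Err(survivors) => slog_warn!("still alive after 2 s: {survivors:?}"),
    }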

Wires the watchdog's limit-termination path through it, and reorders the
protocol to fix the duplicate-spawn bug observed on story 1086 (2026-05-15):

  Before: check_agent_limits set status=Failed before the kill ran. The
  kill itself was `portable_pty::ChildKiller::kill()`, which sends SIGHUP
  on Unix — claude-code ignores SIGHUP, so the process kept running while
  the agent record was already marked terminated. The idempotency check
  in `start_agent` whitelists Running/Pending, so the next auto-assign
  pass spawned a fresh agent alongside the still-alive prior one. Two
  claude PIDs sharing one session_id, racing on the same worktree.
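
  Schematically, the old ordering (simplified pseudocode; names only
  loosely match the real code):

    agent.status = AgentStatus::Failed;  // 1. record already says "terminated"
    pty_child_killer.kill();             // 2. SIGHUP, which claude-code ignores
    // 3. next auto-assign pass: start_agent only blocks a respawn while
    //    status is Running/Pending, so a second agent spawns while the
    //    old claude pid is still alive.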

  After: the status update is moved OUT of check_agent_limits and into the
  caller, AFTER the kill is verified. The kill itself is now a SIGKILL of
  the whole process tree in the worktree, with explicit verification that
  every pid is gone. The idempotency window is closed.
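
  Condensed, the new ordering (the full version is in the watchdog hunk
  below):

    // 1. find every process in the agent's worktree
    let pids = pids_matching(&worktree_path.display().to_string());
    // 2. SIGKILL and block until the kernel has reaped each pid
    let _ = sigkill_pids_and_verify(&pids);
    // 3. only now flip the record; no respawn window remains
    agent.status = AgentStatus::Failed;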

The existing watchdog test suite (14 tests) still passes; 7 new tests
cover the process_kill primitives directly.

`agents/pool/process.rs`'s `kill_all_children` and `kill_child_for_key`
still use the old portable_pty SIGHUP path — they have the same bug but
in lower-impact code paths (shutdown, operator stop). They will be
migrated under a separate story to keep this commit focused.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
commit fe9804b32c (parent 8446ab1c71)
Author: Timmy
Date:   2026-05-15 10:36:33 +01:00

5 changed files with 403 additions and 12 deletions
@@ -187,13 +187,14 @@ pub(super) fn check_agent_limits(
             ),
         };
-        // Mark agent as Failed with termination reason.
-        if let Ok(mut lock) = agents.lock()
-            && let Some(agent) = lock.get_mut(key)
-        {
-            agent.status = AgentStatus::Failed;
-            agent.termination_reason = Some(reason.clone());
-        }
+        // NOTE: agent status is intentionally NOT updated here. Setting
+        // `status = Failed` before the kill (the previous behaviour)
+        // opened a window where the `start_agent` idempotency check
+        // (which whitelists Running/Pending) would let a fresh spawn
+        // through while the prior PTY child was still alive — directly
+        // causing the concurrent-agents bug we hit on story 1086
+        // (2026-05-15). The caller (`run_watchdog_pass`) is responsible
+        // for: (1) verifying the kill, (2) THEN updating the agent record.
         slog!("[watchdog] Terminating agent '{key}': {reason_str}.");
@@ -9,8 +9,11 @@ mod tests;
 use std::path::Path;
 
+use crate::agents::AgentStatus;
 use crate::config::ProjectConfig;
+use crate::process_kill::{pids_matching, sigkill_pids_and_verify};
 use crate::slog;
+use crate::slog_warn;
 
 use super::super::AgentPool;
 use limits::check_agent_limits;
@@ -42,15 +45,71 @@ impl AgentPool {
         if let Some(root) = project_root {
             let terminated = check_agent_limits(&self.agents, root);
             let config = ProjectConfig::load(root).unwrap_or_default();
-            for (key, _reason) in &terminated {
-                // Kill the PTY child and abort the task, same as stop_agent.
-                self.kill_child_for_key(key);
+            for (key, reason) in &terminated {
+                // Step 1: snapshot the agent's worktree path so we can find every
+                // process running in it (claude + any subprocesses). This must
+                // happen BEFORE we mutate the agent record so we can read the
+                // worktree info safely.
+                let worktree_path = self.agents.lock().ok().and_then(|lock| {
+                    lock.get(key)
+                        .and_then(|a| a.worktree_info.as_ref().map(|wt| wt.path.clone()))
+                });
+
+                // Step 2: SIGKILL every process running in the worktree and
+                // BLOCK until verified gone. The previous mechanism — portable_pty's
+                // `ChildKiller::kill()` — sends SIGHUP, which claude-code
+                // ignores, leaving the process alive while the agent record
+                // was being marked terminated; that gap let a fresh spawn race
+                // in alongside the surviving one. SIGKILL is uncatchable;
+                // [`sigkill_pids_and_verify`] only returns once the kernel has
+                // reaped each pid.
+                if let Some(wt_path) = worktree_path.as_ref() {
+                    let pids = pids_matching(&wt_path.display().to_string());
+                    if pids.is_empty() {
+                        // Nothing in this worktree — agent likely already
+                        // exited on its own before the watchdog noticed.
+                    } else {
+                        match sigkill_pids_and_verify(&pids) {
+                            Ok(n) => slog!(
+                                "[watchdog] SIGKILL'd {n} process(es) in worktree {} for '{key}'.",
+                                wt_path.display()
+                            ),
+                            Err(survivors) => slog_warn!(
+                                "[watchdog] SIGKILL incomplete for '{key}': pids still alive: {survivors:?}. \
+                                 Proceeding with cleanup; concurrent spawn protection may be weakened."
+                            ),
+                        }
+                    }
+                } else {
+                    slog_warn!(
+                        "[watchdog] No worktree path recorded for '{key}'; cannot tree-kill, \
+                         falling back to portable_pty SIGHUP (likely no-op for claude-code)."
+                    );
+                    self.kill_child_for_key(key);
+                }
+
+                // Step 3: NOW update the agent record. The process is verified
+                // gone (or we logged that SIGKILL didn't take effect, which is
+                // exceptional), so flipping status away from Running can no
+                // longer open a window for a concurrent spawn.
                 if let Ok(mut lock) = self.agents.lock()
                     && let Some(agent) = lock.get_mut(key)
-                    && let Some(handle) = agent.task_handle.take()
                 {
-                    handle.abort();
+                    agent.status = AgentStatus::Failed;
+                    agent.termination_reason = Some(reason.clone());
+                    if let Some(handle) = agent.task_handle.take() {
+                        // Best-effort abort of the outer tokio task. The PTY
+                        // blocking thread already returned (claude is dead),
+                        // so this is bookkeeping rather than load-bearing.
+                        handle.abort();
+                    }
                 }
+
+                // Step 4: drop the (now-stale) child_killers entry — the
+                // process it pointed at is gone.
+                if let Ok(mut killers) = self.child_killers.lock() {
+                    killers.remove(key);
+                }
+
                 // Use the retry mechanism: increment retry_count and only block
                 // when the limit is exceeded, matching the pipeline's behaviour.
@@ -1,4 +1,11 @@
 //! Process management — kills orphaned PTY child processes on server shutdown.
+//!
+//! See [`crate::process_kill`] for the general process-termination primitives
+//! this module's existing methods (`kill_all_children`, `kill_child_for_key`)
+//! should eventually be migrated to. Those methods currently use
+//! `portable_pty::ChildKiller::kill()`, which sends `SIGHUP` — a signal
+//! claude-code ignores — so they leave orphans on every shutdown/stop. The
+//! migration is tracked in a separate story to keep its diff focused.
 
 use crate::slog;
 
 use super::AgentPool;
@@ -33,6 +33,8 @@ pub mod mesh;
 /// Node identity — Ed25519 keypair generation and stable node ID management.
 pub mod node_identity;
 pub(crate) mod pipeline_state;
+/// Reliable process-termination primitives shared across the server.
+pub mod process_kill;
 /// Rebuild — process restart and shutdown coordination.
 pub mod rebuild;
 mod service;
@@ -0,0 +1,322 @@
//! Reliable process-termination primitives.
//!
//! The huskies server kills child processes in several distinct places:
//! the watchdog terminates agents that have exceeded turn/budget limits,
//! `stop_agent` terminates on operator request, `kill_all_children` runs at
//! server shutdown, the merge-gate completion path kills stale `cargo`
//! processes, and `script/local-release` tears down the gateway during a
//! redeploy. Every one of these used to send a signal that the target was
//! free to ignore (most commonly `portable_pty`'s `SIGHUP`), with no
//! verification that the process actually exited. Agents and bots that
//! ignore `SIGHUP` survived the "kill", which produced concurrent claude
//! processes on the same story — exactly the duplicate-spawn bug we hit on
//! 2026-05-15.
//!
//! This module provides one trustworthy way to kill processes: SIGKILL with
//! verification. Build a pid set with the helpers in this module (or your
//! own), then hand it to [`sigkill_pids_and_verify`].
//!
//! All functions in this module are deliberately Unix-only — huskies runs in
//! Linux containers and on macOS dev hosts, both POSIX.

use crate::slog_warn;

/// Maximum time we'll wait for SIGKILL'd processes to disappear before
/// declaring failure. SIGKILL is uncatchable, so the kernel normally
/// reaps within tens of milliseconds; anything past 2 s indicates the
/// process is wedged in uninterruptible IO (e.g. waiting on a frozen NFS
/// mount). Callers can decide whether to proceed despite survivors.
const KILL_VERIFY_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(2);

/// Polling interval while waiting for processes to disappear. 100 ms is
/// fine-grained enough that the typical few-ms reap latency is barely
/// observable, but coarse enough that we don't burn CPU spinning.
const KILL_VERIFY_POLL: std::time::Duration = std::time::Duration::from_millis(100);

/// SIGKILL every pid in `pids`, then poll until all of them are gone.
///
/// Returns `Ok(n)` where `n == pids.len()` when every pid is verified
/// reaped within [`KILL_VERIFY_TIMEOUT`]. Returns `Err(survivors)` with the
/// pids still alive after the timeout — extremely rare for SIGKILL but
/// possible if a process is wedged in uninterruptible IO. An empty `pids`
/// slice returns `Ok(0)` immediately.
///
/// **Why SIGKILL and not SIGTERM-first:** several huskies-internal targets
/// (claude-code, the bot itself) either ignore the polite signals or take
/// arbitrarily long to honour them. The watchdog only kills agents that
/// have already misbehaved by definition (exceeded budget/turn limits), so
/// there is no reason to give them a graceful-shutdown grace period.
pub fn sigkill_pids_and_verify(pids: &[u32]) -> Result<usize, Vec<u32>> {
    if pids.is_empty() {
        return Ok(0);
    }
    for &pid in pids {
        // libc::kill returns -1 on failure (with errno). We deliberately
        // ignore the result: the process may already be gone (errno ESRCH),
        // and trying again wouldn't help. The verification loop below is
        // the source of truth for "did this work".
        unsafe { libc::kill(pid as i32, libc::SIGKILL) };
    }
    let deadline = std::time::Instant::now() + KILL_VERIFY_TIMEOUT;
    while std::time::Instant::now() < deadline {
        if pids.iter().copied().all(|pid| !pid_is_alive(pid)) {
            return Ok(pids.len());
        }
        std::thread::sleep(KILL_VERIFY_POLL);
    }
    let survivors: Vec<u32> = pids
        .iter()
        .copied()
        .filter(|&pid| pid_is_alive(pid))
        .collect();
    if survivors.is_empty() {
        Ok(pids.len())
    } else {
        slog_warn!(
            "[process_kill] SIGKILL did not reap pids within {:?}: {survivors:?}. \
             They may be wedged in uninterruptible IO.",
            KILL_VERIFY_TIMEOUT
        );
        Err(survivors)
    }
}

/// Return every pid whose command line matches `pattern` (passed to
/// `pgrep -f`). Empty when nothing matches or when `pgrep` is unavailable.
///
/// Useful for collecting processes by a path or argument substring — e.g.
/// "every process running in `<worktree>/`" or "every cargo invocation
/// against this `Cargo.toml`".
pub fn pids_matching(pattern: &str) -> Vec<u32> {
    let Ok(output) = std::process::Command::new("pgrep")
        .args(["-f", pattern])
        .output()
    else {
        return Vec::new();
    };
    String::from_utf8_lossy(&output.stdout)
        .lines()
        .filter_map(|l| l.trim().parse::<u32>().ok())
        .collect()
}

/// Return every descendant pid of `root_pid`, deepest-first, **excluding**
/// `root_pid` itself. Walks the parent→child relation via `pgrep -P`.
///
/// Deepest-first ordering lets callers signal leaves before their parents
/// when that matters; for SIGKILL it makes no difference.
pub fn descendant_pids(root_pid: u32) -> Vec<u32> {
    let mut out: Vec<u32> = Vec::new();
    walk_descendants(root_pid, &mut out);
    out
}

fn walk_descendants(pid: u32, out: &mut Vec<u32>) {
    let Ok(output) = std::process::Command::new("pgrep")
        .args(["-P", &pid.to_string()])
        .output()
    else {
        return;
    };
    let kids: Vec<u32> = String::from_utf8_lossy(&output.stdout)
        .lines()
        .filter_map(|l| l.trim().parse::<u32>().ok())
        .collect();
    for kid in kids {
        walk_descendants(kid, out);
        out.push(kid);
    }
}

/// Check whether `pid` currently exists. Implemented via `kill(pid, 0)` —
/// no signal is sent, only existence is probed.
fn pid_is_alive(pid: u32) -> bool {
    // Signal 0: "is this process around?" Returns 0 if the process exists
    // and we have permission to signal it, -1 with errno otherwise. EPERM
    // therefore reads as "gone", which is fine here: we only ever probe
    // processes spawned by (and owned by) the same user as the server.
    unsafe { libc::kill(pid as i32, 0) == 0 }
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::process::{Child, Command, Stdio};
    use std::thread::JoinHandle;

    /// Spawn a sleeper for kill testing, and spawn a background reaper that
    /// calls `wait()` as soon as the child exits. Returns the pid plus the
    /// reaper join handle so the test can confirm reaping after the kill.
    ///
    /// The reaper is essential because the production code's verify loop
    /// uses `kill(pid, 0)` to test existence — which returns 0 for zombies.
    /// If no one reaps the test's sleeper, its pid stays occupied (as a
    /// zombie) and `sigkill_pids_and_verify` mistakenly reports survivors.
    /// In production the PTY blocking thread is always reaping on behalf of
    /// portable_pty, so this isn't a concern there.
    fn spawn_sleeper_with_reaper(secs: u64) -> (u32, JoinHandle<()>) {
        let child: Child = Command::new("sleep")
            .arg(secs.to_string())
            .stdout(Stdio::null())
            .stderr(Stdio::null())
            .stdin(Stdio::null())
            .spawn()
            .expect("failed to spawn sleep");
        let pid = child.id();
        let reaper = std::thread::spawn(move || {
            let mut c = child;
            let _ = c.wait();
        });
        (pid, reaper)
    }

    #[test]
    fn sigkill_empty_slice_is_ok() {
        let result = sigkill_pids_and_verify(&[]);
        assert!(matches!(result, Ok(0)));
    }

    #[test]
    fn sigkill_real_process_is_verified_gone() {
        let (pid, reaper) = spawn_sleeper_with_reaper(60);
        assert!(pid_is_alive(pid), "sleeper should be alive before kill");
        let result = sigkill_pids_and_verify(&[pid]);
        assert!(
            matches!(result, Ok(1)),
            "sigkill must verify the process is gone: {result:?}"
        );
        let _ = reaper.join();
        assert!(!pid_is_alive(pid), "sleeper must be dead after kill");
    }

    #[test]
    fn sigkill_already_dead_pid_is_ok() {
        let (pid, reaper) = spawn_sleeper_with_reaper(0);
        let _ = reaper.join();
        // The reaper has already wait()ed the child; poll briefly in case
        // the pid momentarily still reports as alive.
        for _ in 0..20 {
            if !pid_is_alive(pid) {
                break;
            }
            std::thread::sleep(std::time::Duration::from_millis(100));
        }
        // Now SIGKILL a pid that no longer exists. Result must still be Ok.
        let result = sigkill_pids_and_verify(&[pid]);
        assert!(
            result.is_ok(),
            "sigkill of already-dead pid must succeed: {result:?}"
        );
    }

    #[test]
    fn sigkill_multiple_real_processes() {
        let mut handles: Vec<(u32, JoinHandle<()>)> =
            (0..3).map(|_| spawn_sleeper_with_reaper(60)).collect();
        let pids: Vec<u32> = handles.iter().map(|(p, _)| *p).collect();
        for &pid in &pids {
            assert!(pid_is_alive(pid));
        }
        let result = sigkill_pids_and_verify(&pids);
        assert!(
            matches!(result, Ok(3)),
            "all 3 sleepers must die: {result:?}"
        );
        for (_, reaper) in handles.drain(..) {
            let _ = reaper.join();
        }
        for &pid in &pids {
            assert!(!pid_is_alive(pid), "pid {pid} survived sigkill");
        }
    }

    #[test]
    fn pids_matching_finds_a_running_process() {
        // pgrep -f matches the FULL command line, so the marker has to be
        // in argv somewhere. Putting it in a shell comment doesn't work —
        // sh strips it. Override argv[0] so the marker is durably visible.
        use std::os::unix::process::CommandExt;
        let marker = format!("kill-test-marker-{}-{}", std::process::id(), rand_u64());
        let argv0 = format!("test-marker-{marker}");
        let child: Child = Command::new("sleep")
            .arg0(argv0)
            .arg("60")
            .stdout(Stdio::null())
            .stderr(Stdio::null())
            .stdin(Stdio::null())
            .spawn()
            .expect("spawn");
        let child_pid = child.id();
        let reaper = std::thread::spawn(move || {
            let mut c = child;
            let _ = c.wait();
        });
        // pgrep needs a moment to see the new process.
        std::thread::sleep(std::time::Duration::from_millis(100));
        let found = pids_matching(&marker);
        assert!(
            found.contains(&child_pid),
            "pids_matching should find pid {child_pid} for marker '{marker}'; got {found:?}"
        );
        // Cleanup so the test doesn't leak a sleeper.
        let _ = sigkill_pids_and_verify(&[child_pid]);
        let _ = reaper.join();
    }

    #[test]
    fn pids_matching_returns_empty_when_no_match() {
        let pattern = format!("nonexistent-pattern-{}-{}", std::process::id(), rand_u64());
        let found = pids_matching(&pattern);
        assert!(found.is_empty(), "expected empty result, got {found:?}");
    }

    /// Cheap unique-ish u64 for distinguishing test invocations without a
    /// dependency on a randomness crate.
    fn rand_u64() -> u64 {
        use std::time::{SystemTime, UNIX_EPOCH};
        SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .map(|d| d.as_nanos() as u64)
            .unwrap_or(0)
    }

    #[test]
    fn descendant_pids_of_real_process_tree() {
        // Build a parent sh that spawns a child sleep. The descendants of
        // the parent should include the sleep.
        let parent: Child = Command::new("sh")
            .args(["-c", "sleep 60"])
            .stdout(Stdio::null())
            .stderr(Stdio::null())
            .stdin(Stdio::null())
            .spawn()
            .expect("spawn parent");
        let parent_pid = parent.id();
        let reaper = std::thread::spawn(move || {
            let mut c = parent;
            let _ = c.wait();
        });
        // Let the shell get around to fork+execing its child.
        std::thread::sleep(std::time::Duration::from_millis(200));
        let descendants = descendant_pids(parent_pid);
        // On some shells `sh -c "sleep N"` exec-replaces sh with sleep, leaving
        // zero descendants. On others it forks. We don't care which; we only
        // care that the function doesn't panic and returns a sensible vec.
        assert!(
            descendants.iter().all(|&pid| pid != parent_pid),
            "descendant_pids must not include the root itself: {descendants:?}"
        );
        // Cleanup: kill the parent and any descendants.
        let mut all = descendants;
        all.push(parent_pid);
        let _ = sigkill_pids_and_verify(&all);
        let _ = reaper.join();
    }
}