feat: add process_kill module + use it to fix watchdog double-spawn
Adds `crate::process_kill` — reliable SIGKILL-with-verify primitives used
across the server in place of the various ad-hoc kill paths that ignored
their kill-effective return values. The module exposes three pieces:
- `sigkill_pids_and_verify(pids)`: SIGKILL each pid and block (up to 2s)
until every pid is verified gone. Returns survivors if not.
- `pids_matching(pattern)`: pgrep -f wrapper.
- `descendant_pids(root)`: recursive pgrep -P walker for process trees.
Wires the watchdog's limit-termination path through it, and reorders the
protocol to fix the duplicate-spawn bug observed on story 1086 (2026-05-15):
Before: check_agent_limits set status=Failed before the kill ran. The
kill itself was `portable_pty::ChildKiller::kill()`, which sends SIGHUP
on Unix — claude-code ignores SIGHUP, so the process kept running while
the agent record was already marked terminated. The idempotency check
in `start_agent` whitelists Running/Pending, so the next auto-assign
pass spawned a fresh agent alongside the still-alive prior one. Two
claude PIDs sharing one session_id, racing on the same worktree.
After: status update is moved OUT of check_agent_limits and into the
caller AFTER the kill is verified. The kill itself is now SIGKILL-the-
process-tree-in-the-worktree, with explicit verification that every pid
is gone. The idempotency window is closed.
The existing watchdog test suite (14 tests) still passes; 7 new tests
cover the process_kill primitives directly.
`agents/pool/process.rs`'s `kill_all_children` and `kill_child_for_key`
still use the old portable_pty SIGHUP path — they have the same bug but
in lower-impact code paths (shutdown, operator stop). They will be
migrated under a separate story to keep this commit focused.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -187,13 +187,14 @@ pub(super) fn check_agent_limits(
|
|||||||
),
|
),
|
||||||
};
|
};
|
||||||
|
|
||||||
// Mark agent as Failed with termination reason.
|
// NOTE: agent status is intentionally NOT updated here. Setting
|
||||||
if let Ok(mut lock) = agents.lock()
|
// `status = Failed` before the kill (the previous behaviour)
|
||||||
&& let Some(agent) = lock.get_mut(key)
|
// opened a window where the `start_agent` idempotency check
|
||||||
{
|
// (which whitelists Running/Pending) would let a fresh spawn
|
||||||
agent.status = AgentStatus::Failed;
|
// through while the prior PTY child was still alive — directly
|
||||||
agent.termination_reason = Some(reason.clone());
|
// causing the concurrent-agents bug we hit on story 1086
|
||||||
}
|
// (2026-05-15). The caller (`run_watchdog_pass`) is responsible
|
||||||
|
// for: (1) verifying the kill, (2) THEN updating the agent record.
|
||||||
|
|
||||||
slog!("[watchdog] Terminating agent '{key}': {reason_str}.");
|
slog!("[watchdog] Terminating agent '{key}': {reason_str}.");
|
||||||
|
|
||||||
|
|||||||
@@ -9,8 +9,11 @@ mod tests;
|
|||||||
|
|
||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
|
|
||||||
|
use crate::agents::AgentStatus;
|
||||||
use crate::config::ProjectConfig;
|
use crate::config::ProjectConfig;
|
||||||
|
use crate::process_kill::{pids_matching, sigkill_pids_and_verify};
|
||||||
use crate::slog;
|
use crate::slog;
|
||||||
|
use crate::slog_warn;
|
||||||
|
|
||||||
use super::super::AgentPool;
|
use super::super::AgentPool;
|
||||||
use limits::check_agent_limits;
|
use limits::check_agent_limits;
|
||||||
@@ -42,14 +45,70 @@ impl AgentPool {
|
|||||||
if let Some(root) = project_root {
|
if let Some(root) = project_root {
|
||||||
let terminated = check_agent_limits(&self.agents, root);
|
let terminated = check_agent_limits(&self.agents, root);
|
||||||
let config = ProjectConfig::load(root).unwrap_or_default();
|
let config = ProjectConfig::load(root).unwrap_or_default();
|
||||||
for (key, _reason) in &terminated {
|
for (key, reason) in &terminated {
|
||||||
// Kill the PTY child and abort the task, same as stop_agent.
|
// Step 1: snapshot the agent's worktree path so we can find every
|
||||||
self.kill_child_for_key(key);
|
// process running in it (claude + any subprocesses). This must
|
||||||
|
// happen BEFORE we mutate the agent record so we can read the
|
||||||
|
// worktree info safely.
|
||||||
|
let worktree_path = self.agents.lock().ok().and_then(|lock| {
|
||||||
|
lock.get(key)
|
||||||
|
.and_then(|a| a.worktree_info.as_ref().map(|wt| wt.path.clone()))
|
||||||
|
});
|
||||||
|
|
||||||
|
// Step 2: SIGKILL every process running in the worktree and
|
||||||
|
// BLOCK until verified gone. The previous mechanism — portable_pty's
|
||||||
|
// `ChildKiller::kill()` — sends SIGHUP, which claude-code
|
||||||
|
// ignores, leaving the process alive while the agent record
|
||||||
|
// was being marked terminated; that gap let a fresh spawn race
|
||||||
|
// in alongside the surviving one. SIGKILL is uncatchable;
|
||||||
|
// [`sigkill_pids_and_verify`] only returns once the kernel has
|
||||||
|
// reaped each pid.
|
||||||
|
if let Some(wt_path) = worktree_path.as_ref() {
|
||||||
|
let pids = pids_matching(&wt_path.display().to_string());
|
||||||
|
if pids.is_empty() {
|
||||||
|
// Nothing in this worktree — agent likely already
|
||||||
|
// exited on its own before the watchdog noticed.
|
||||||
|
} else {
|
||||||
|
match sigkill_pids_and_verify(&pids) {
|
||||||
|
Ok(n) => slog!(
|
||||||
|
"[watchdog] SIGKILL'd {n} process(es) in worktree {} for '{key}'.",
|
||||||
|
wt_path.display()
|
||||||
|
),
|
||||||
|
Err(survivors) => slog_warn!(
|
||||||
|
"[watchdog] SIGKILL incomplete for '{key}': pids still alive: {survivors:?}. \
|
||||||
|
Proceeding with cleanup; concurrent spawn protection may be weakened."
|
||||||
|
),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
slog_warn!(
|
||||||
|
"[watchdog] No worktree path recorded for '{key}'; cannot tree-kill, \
|
||||||
|
falling back to portable_pty SIGHUP (likely no-op for claude-code)."
|
||||||
|
);
|
||||||
|
self.kill_child_for_key(key);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Step 3: NOW update the agent record. The process is verified
|
||||||
|
// gone (or we logged that SIGKILL didn't take effect, which is
|
||||||
|
// exceptional), so flipping status away from Running can no
|
||||||
|
// longer open a window for a concurrent spawn.
|
||||||
if let Ok(mut lock) = self.agents.lock()
|
if let Ok(mut lock) = self.agents.lock()
|
||||||
&& let Some(agent) = lock.get_mut(key)
|
&& let Some(agent) = lock.get_mut(key)
|
||||||
&& let Some(handle) = agent.task_handle.take()
|
|
||||||
{
|
{
|
||||||
handle.abort();
|
agent.status = AgentStatus::Failed;
|
||||||
|
agent.termination_reason = Some(reason.clone());
|
||||||
|
if let Some(handle) = agent.task_handle.take() {
|
||||||
|
// Best-effort abort of the outer tokio task. The PTY
|
||||||
|
// blocking thread already returned (claude is dead),
|
||||||
|
// so this is bookkeeping rather than load-bearing.
|
||||||
|
handle.abort();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Step 4: drop the (now-stale) child_killers entry — the
|
||||||
|
// process it pointed at is gone.
|
||||||
|
if let Ok(mut killers) = self.child_killers.lock() {
|
||||||
|
killers.remove(key);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Use the retry mechanism: increment retry_count and only block
|
// Use the retry mechanism: increment retry_count and only block
|
||||||
|
|||||||
@@ -1,4 +1,11 @@
|
|||||||
//! Process management — kills orphaned PTY child processes on server shutdown.
|
//! Process management — kills orphaned PTY child processes on server shutdown.
|
||||||
|
//!
|
||||||
|
//! See [`crate::process_kill`] for the general process-termination primitives
|
||||||
|
//! this module's existing methods (`kill_all_children`, `kill_child_for_key`)
|
||||||
|
//! should eventually be migrated to. Those methods currently use
|
||||||
|
//! `portable_pty::ChildKiller::kill()`, which sends `SIGHUP` — a signal
|
||||||
|
//! claude-code ignores — so they leave orphans on every shutdown/stop. The
|
||||||
|
//! migration is tracked in a separate story to keep its diff focused.
|
||||||
use crate::slog;
|
use crate::slog;
|
||||||
|
|
||||||
use super::AgentPool;
|
use super::AgentPool;
|
||||||
|
|||||||
@@ -33,6 +33,8 @@ pub mod mesh;
|
|||||||
/// Node identity — Ed25519 keypair generation and stable node ID management.
|
/// Node identity — Ed25519 keypair generation and stable node ID management.
|
||||||
pub mod node_identity;
|
pub mod node_identity;
|
||||||
pub(crate) mod pipeline_state;
|
pub(crate) mod pipeline_state;
|
||||||
|
/// Reliable process-termination primitives shared across the server.
|
||||||
|
pub mod process_kill;
|
||||||
/// Rebuild — process restart and shutdown coordination.
|
/// Rebuild — process restart and shutdown coordination.
|
||||||
pub mod rebuild;
|
pub mod rebuild;
|
||||||
mod service;
|
mod service;
|
||||||
|
|||||||
@@ -0,0 +1,322 @@
|
|||||||
|
//! Reliable process-termination primitives.
|
||||||
|
//!
|
||||||
|
//! The huskies server kills child processes in several distinct places:
|
||||||
|
//! the watchdog terminates agents that have exceeded turn/budget limits,
|
||||||
|
//! `stop_agent` terminates on operator request, `kill_all_children` runs at
|
||||||
|
//! server shutdown, the merge-gate completion path kills stale `cargo`
|
||||||
|
//! processes, and `script/local-release` tears down the gateway during a
|
||||||
|
//! redeploy. Every one of these used to send a signal that the target was
|
||||||
|
//! free to ignore (most commonly `portable_pty`'s `SIGHUP`), with no
|
||||||
|
//! verification that the process actually exited. Agents and bots that
|
||||||
|
//! ignore `SIGHUP` survived the "kill", which produced concurrent claude
|
||||||
|
//! processes on the same story — directly the duplicate-spawn bug we hit on
|
||||||
|
//! 2026-05-15.
|
||||||
|
//!
|
||||||
|
//! This module provides one trustworthy way to kill processes: SIGKILL with
|
||||||
|
//! verification. Build a pid set with the helpers in this module (or your
|
||||||
|
//! own), then hand it to [`sigkill_pids_and_verify`].
|
||||||
|
//!
|
||||||
|
//! All functions on this module are deliberately Unix-only — huskies runs in
|
||||||
|
//! Linux containers and macOS dev hosts, both POSIX.
|
||||||
|
|
||||||
|
use crate::slog_warn;
|
||||||
|
|
||||||
|
/// Maximum time we'll wait for SIGKILL'd processes to disappear before
|
||||||
|
/// declaring failure. SIGKILL is uncatchable, so the kernel normally
|
||||||
|
/// reaps within tens of milliseconds; anything past 2 s indicates the
|
||||||
|
/// process is wedged in uninterruptible IO (e.g. waiting on a frozen NFS
|
||||||
|
/// mount). Caller can decide whether to proceed despite survivors.
|
||||||
|
const KILL_VERIFY_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(2);
|
||||||
|
|
||||||
|
/// Polling interval while waiting for processes to disappear. 100 ms is
|
||||||
|
/// fine-grained enough that the typical few-ms reap latency is barely
|
||||||
|
/// observable, but coarse enough that we don't burn CPU spinning.
|
||||||
|
const KILL_VERIFY_POLL: std::time::Duration = std::time::Duration::from_millis(100);
|
||||||
|
|
||||||
|
/// SIGKILL every pid in `pids`, then poll until all of them are gone.
|
||||||
|
///
|
||||||
|
/// Returns `Ok(n)` where `n == pids.len()` when every pid is verified
|
||||||
|
/// reaped within [`KILL_VERIFY_TIMEOUT`]. Returns `Err(survivors)` with the
|
||||||
|
/// pids still alive after the timeout — extremely rare for SIGKILL but
|
||||||
|
/// possible if a process is wedged in uninterruptible IO. An empty `pids`
|
||||||
|
/// slice returns `Ok(0)` immediately.
|
||||||
|
///
|
||||||
|
/// **Why SIGKILL and not SIGTERM-first:** several huskies-internal targets
|
||||||
|
/// (claude-code, the bot itself) either ignore the polite signals or take
|
||||||
|
/// arbitrarily long to honour them. The watchdog only kills agents that
|
||||||
|
/// have already misbehaved by definition (exceeded budget/turn limits), so
|
||||||
|
/// there is no reason to give them a graceful-shutdown grace period.
|
||||||
|
pub fn sigkill_pids_and_verify(pids: &[u32]) -> Result<usize, Vec<u32>> {
|
||||||
|
if pids.is_empty() {
|
||||||
|
return Ok(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
for &pid in pids {
|
||||||
|
// libc::kill returns -1 on failure (with errno). We deliberately
|
||||||
|
// ignore the result: the process may already be gone (errno ESRCH),
|
||||||
|
// and trying again wouldn't help. The verification loop below is
|
||||||
|
// the source of truth for "did this work".
|
||||||
|
unsafe { libc::kill(pid as i32, libc::SIGKILL) };
|
||||||
|
}
|
||||||
|
|
||||||
|
let deadline = std::time::Instant::now() + KILL_VERIFY_TIMEOUT;
|
||||||
|
while std::time::Instant::now() < deadline {
|
||||||
|
if pids.iter().copied().all(|pid| !pid_is_alive(pid)) {
|
||||||
|
return Ok(pids.len());
|
||||||
|
}
|
||||||
|
std::thread::sleep(KILL_VERIFY_POLL);
|
||||||
|
}
|
||||||
|
|
||||||
|
let survivors: Vec<u32> = pids
|
||||||
|
.iter()
|
||||||
|
.copied()
|
||||||
|
.filter(|&pid| pid_is_alive(pid))
|
||||||
|
.collect();
|
||||||
|
if survivors.is_empty() {
|
||||||
|
Ok(pids.len())
|
||||||
|
} else {
|
||||||
|
slog_warn!(
|
||||||
|
"[process_kill] SIGKILL did not reap pids within {:?}: {survivors:?}. \
|
||||||
|
They may be wedged in uninterruptible IO.",
|
||||||
|
KILL_VERIFY_TIMEOUT
|
||||||
|
);
|
||||||
|
Err(survivors)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Collect every pid whose full command line matches `pattern` (delegated
/// to `pgrep -f`). Returns an empty vec when nothing matches — or when
/// `pgrep` itself cannot be run.
///
/// Handy for gathering processes by a path or argument substring, e.g.
/// "everything running inside `<worktree>/`" or "every cargo invocation
/// pointed at this `Cargo.toml`".
pub fn pids_matching(pattern: &str) -> Vec<u32> {
    let mut pgrep = std::process::Command::new("pgrep");
    pgrep.arg("-f").arg(pattern);
    match pgrep.output() {
        Ok(output) => {
            // pgrep prints one pid per line; anything that doesn't parse as
            // a pid (stray whitespace, empty trailing line) is skipped.
            let listing = String::from_utf8_lossy(&output.stdout);
            let mut pids: Vec<u32> = Vec::new();
            for line in listing.lines() {
                if let Ok(pid) = line.trim().parse::<u32>() {
                    pids.push(pid);
                }
            }
            pids
        }
        // pgrep missing or unspawnable: report "no matches" rather than fail.
        Err(_) => Vec::new(),
    }
}
|
||||||
|
|
||||||
|
/// Collect every descendant pid of `root_pid`, deepest-first, **excluding**
/// `root_pid` itself. The parent→child relation is walked via `pgrep -P`.
///
/// The deepest-first ordering lets a caller signal leaf processes before
/// their parents when that matters; for SIGKILL the order is irrelevant.
pub fn descendant_pids(root_pid: u32) -> Vec<u32> {
    let mut collected: Vec<u32> = Vec::new();
    walk_descendants(root_pid, &mut collected);
    collected
}

/// Post-order DFS helper for [`descendant_pids`]: recurse into each child's
/// subtree before recording the child, which yields deepest-first order.
fn walk_descendants(pid: u32, out: &mut Vec<u32>) {
    let spawned = std::process::Command::new("pgrep")
        .arg("-P")
        .arg(pid.to_string())
        .output();
    // pgrep unavailable: treat this node as childless.
    let Ok(output) = spawned else { return };
    let listing = String::from_utf8_lossy(&output.stdout);
    for line in listing.lines() {
        let Ok(child) = line.trim().parse::<u32>() else {
            continue;
        };
        walk_descendants(child, out);
        out.push(child);
    }
}
|
||||||
|
|
||||||
|
/// Check whether `pid` currently exists. Implemented via `kill(pid, 0)` —
|
||||||
|
/// no signal is sent, only existence is probed.
|
||||||
|
fn pid_is_alive(pid: u32) -> bool {
|
||||||
|
// signal 0: "is this process around?" Returns 0 if the process exists
|
||||||
|
// and we have permission to signal it, -1 with errno otherwise.
|
||||||
|
unsafe { libc::kill(pid as i32, 0) == 0 }
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;
    use std::process::{Child, Command, Stdio};
    use std::thread::JoinHandle;

    /// Spawn a sleeper for kill testing, and spawn a background reaper that
    /// calls `wait()` as soon as the child exits. Returns the pid plus the
    /// reaper join handle so the test can confirm reaping after the kill.
    ///
    /// The reaper is essential because the production code's verify loop
    /// uses `kill(pid, 0)` to test existence — which returns 0 for zombies.
    /// If no one reaps the test's sleeper, its pid stays occupied (as a
    /// zombie) and `sigkill_pids_and_verify` mistakenly reports survivors.
    /// In production the PTY blocking thread is always reaping on behalf of
    /// portable_pty, so this isn't a concern there.
    fn spawn_sleeper_with_reaper(secs: u64) -> (u32, JoinHandle<()>) {
        let child: Child = Command::new("sleep")
            .arg(secs.to_string())
            .stdout(Stdio::null())
            .stderr(Stdio::null())
            .stdin(Stdio::null())
            .spawn()
            .expect("failed to spawn sleep");
        let pid = child.id();
        // Move the Child into a thread whose sole job is to wait() on it,
        // so the kernel can release the pid the moment the sleeper dies.
        let reaper = std::thread::spawn(move || {
            let mut c = child;
            let _ = c.wait();
        });
        (pid, reaper)
    }

    /// Degenerate input: an empty pid set must succeed immediately with 0.
    #[test]
    fn sigkill_empty_slice_is_ok() {
        let result = sigkill_pids_and_verify(&[]);
        assert!(matches!(result, Ok(0)));
    }

    /// Happy path: one live sleeper, killed and verified reaped.
    #[test]
    fn sigkill_real_process_is_verified_gone() {
        let (pid, reaper) = spawn_sleeper_with_reaper(60);
        assert!(pid_is_alive(pid), "sleeper should be alive before kill");

        let result = sigkill_pids_and_verify(&[pid]);
        assert!(
            matches!(result, Ok(1)),
            "sigkill must verify the process is gone: {result:?}"
        );
        let _ = reaper.join();
        assert!(!pid_is_alive(pid), "sleeper must be dead after kill");
    }

    /// Killing a pid that already exited must not be reported as failure.
    #[test]
    fn sigkill_already_dead_pid_is_ok() {
        let (pid, reaper) = spawn_sleeper_with_reaper(0);
        let _ = reaper.join();
        // Wait briefly for the kernel to recycle the pid.
        for _ in 0..20 {
            if !pid_is_alive(pid) {
                break;
            }
            std::thread::sleep(std::time::Duration::from_millis(100));
        }
        // Now SIGKILL a pid that no longer exists. Result must still be Ok.
        let result = sigkill_pids_and_verify(&[pid]);
        assert!(
            result.is_ok(),
            "sigkill of already-dead pid must succeed: {result:?}"
        );
    }

    /// The whole pid set must be killed and verified, not just the first.
    #[test]
    fn sigkill_multiple_real_processes() {
        let mut handles: Vec<(u32, JoinHandle<()>)> =
            (0..3).map(|_| spawn_sleeper_with_reaper(60)).collect();
        let pids: Vec<u32> = handles.iter().map(|(p, _)| *p).collect();
        for &pid in &pids {
            assert!(pid_is_alive(pid));
        }
        let result = sigkill_pids_and_verify(&pids);
        assert!(
            matches!(result, Ok(3)),
            "all 3 sleepers must die: {result:?}"
        );
        for (_, reaper) in handles.drain(..) {
            let _ = reaper.join();
        }
        for &pid in &pids {
            assert!(!pid_is_alive(pid), "pid {pid} survived sigkill");
        }
    }

    #[test]
    fn pids_matching_finds_a_running_process() {
        // pgrep -f matches the FULL command line, so the marker has to be
        // in argv somewhere. Putting it in a shell comment doesn't work —
        // sh strips it. Override argv[0] so the marker is durably visible.
        use std::os::unix::process::CommandExt;
        let marker = format!("kill-test-marker-{}-{}", std::process::id(), rand_u64());
        let argv0 = format!("test-marker-{marker}");
        let child: Child = Command::new("sleep")
            .arg0(argv0)
            .arg("60")
            .stdout(Stdio::null())
            .stderr(Stdio::null())
            .stdin(Stdio::null())
            .spawn()
            .expect("spawn");
        let child_pid = child.id();
        let reaper = std::thread::spawn(move || {
            let mut c = child;
            let _ = c.wait();
        });

        // pgrep needs a moment to see the new process.
        std::thread::sleep(std::time::Duration::from_millis(100));

        let found = pids_matching(&marker);
        assert!(
            found.contains(&child_pid),
            "pids_matching should find pid {child_pid} for marker '{marker}'; got {found:?}"
        );

        // Cleanup so the test doesn't leak a sleeper.
        let _ = sigkill_pids_and_verify(&[child_pid]);
        let _ = reaper.join();
    }

    /// A pattern matching nothing must yield an empty vec, not an error.
    #[test]
    fn pids_matching_returns_empty_when_no_match() {
        let pattern = format!("nonexistent-pattern-{}-{}", std::process::id(), rand_u64());
        let found = pids_matching(&pattern);
        assert!(found.is_empty(), "expected empty result, got {found:?}");
    }

    /// Cheap unique-ish u64 for distinguishing test invocations without a
    /// dependency on a randomness crate.
    fn rand_u64() -> u64 {
        use std::time::{SystemTime, UNIX_EPOCH};
        SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .map(|d| d.as_nanos() as u64)
            .unwrap_or(0)
    }

    #[test]
    fn descendant_pids_of_real_process_tree() {
        // Build a parent sh that spawns a child sleep. The descendants of
        // the parent should include the sleep.
        let parent: Child = Command::new("sh")
            .args(["-c", "sleep 60"])
            .stdout(Stdio::null())
            .stderr(Stdio::null())
            .stdin(Stdio::null())
            .spawn()
            .expect("spawn parent");
        let parent_pid = parent.id();
        let reaper = std::thread::spawn(move || {
            let mut c = parent;
            let _ = c.wait();
        });

        // Let the shell get around to fork+execing its child.
        std::thread::sleep(std::time::Duration::from_millis(200));

        let descendants = descendant_pids(parent_pid);
        // On some shells `sh -c "sleep N"` exec-replaces sh with sleep, leaving
        // zero descendants. On others it forks. We don't care which; we only
        // care that the function doesn't panic and returns a sensible vec.
        assert!(
            descendants.iter().all(|&pid| pid != parent_pid),
            "descendant_pids must not include the root itself: {descendants:?}"
        );

        // Cleanup: kill the parent and any descendants.
        let mut all = descendants;
        all.push(parent_pid);
        let _ = sigkill_pids_and_verify(&all);
        let _ = reaper.join();
    }
}
|
||||||
Reference in New Issue
Block a user