// File: huskies/server/src/agents/pool/auto_assign/watchdog/mod.rs (Rust, 139 lines, 6.3 KiB)

//! Watchdog task: detects orphaned agents, enforces turn/budget limits, and
//! triggers auto-assign.
mod budget;
mod limits;
mod orphan;
#[cfg(test)]
mod tests;
use std::path::Path;
use crate::agents::AgentStatus;
use crate::config::ProjectConfig;
use crate::process_kill::{pids_matching, sigkill_pids_and_verify};
use crate::slog;
use crate::slog_warn;
use super::super::AgentPool;
use limits::check_agent_limits;
use orphan::check_orphaned_agents;
pub(crate) use budget::{compute_budget_from_logs, compute_budget_from_single_log};
pub(crate) use limits::{count_turns_in_log, resolve_session_log};
impl AgentPool {
    /// Run a single watchdog pass synchronously (test helper).
    ///
    /// Only the orphan check is performed; turn/budget limits are not
    /// evaluated by this helper.
    #[cfg(test)]
    pub fn run_watchdog_once(&self) {
        check_orphaned_agents(&self.agents);
    }

    /// Run one watchdog pass: detect orphans, enforce limits, kill offenders.
    ///
    /// Called by the unified background tick loop every 30 ticks.
    ///
    /// When a limit is exceeded the agent's processes are killed and the
    /// `should_block_story` retry mechanism is invoked. The story is marked
    /// `blocked: true` only when `retry_count >= max_retries`; otherwise
    /// `retry_count` is incremented and the story stays in `2_current/` for
    /// re-attempt. This prevents the original kill-respawn loop (bug 646)
    /// while restoring the `max_retries` semantic for turn/budget overruns.
    ///
    /// # Arguments
    ///
    /// * `project_root` - Project directory used for the limit checks and for
    ///   loading [`ProjectConfig`]. When `None`, only the orphan check runs.
    ///
    /// # Returns
    ///
    /// The value reported by the orphan check plus the number of agents
    /// terminated for exceeding a limit during this pass.
    pub fn run_watchdog_pass(&self, project_root: Option<&Path>) -> usize {
        let orphaned = check_orphaned_agents(&self.agents);

        // Limits can only be evaluated against a project root.
        let Some(root) = project_root else {
            return orphaned;
        };

        let terminated = check_agent_limits(&self.agents, root);
        if terminated.is_empty() {
            // Nothing was terminated, so there is nothing to clean up and no
            // state change to announce. Returning here also avoids loading the
            // project config on every pass when it would never be read.
            return orphaned;
        }

        // Only needed for `max_retries` in the retry accounting below.
        let config = ProjectConfig::load(root).unwrap_or_default();

        for (key, reason) in &terminated {
            // Step 1: snapshot the agent's worktree path so we can find every
            // process running in it (claude + any subprocesses). This must
            // happen BEFORE we mutate the agent record so we can read the
            // worktree info safely. The guard is moved into the closure and
            // dropped when it returns, so the lock is not held during the
            // kill below.
            let worktree_path = self.agents.lock().ok().and_then(|lock| {
                lock.get(key)
                    .and_then(|a| a.worktree_info.as_ref().map(|wt| wt.path.clone()))
            });

            // Step 2: SIGKILL every process running in the worktree and
            // BLOCK until verified gone. The previous mechanism — portable_pty's
            // `ChildKiller::kill()` — sends SIGHUP, which claude-code
            // ignores, leaving the process alive while the agent record
            // was being marked terminated; that gap let a fresh spawn race
            // in alongside the surviving one. SIGKILL is uncatchable;
            // [`sigkill_pids_and_verify`] only returns once the kernel has
            // reaped each pid.
            if let Some(wt_path) = worktree_path.as_ref() {
                // NOTE(review): if `pids_matching` does substring matching, a
                // worktree path that is a prefix of a sibling worktree's path
                // could also match that sibling's processes — confirm against
                // its implementation.
                let pids = pids_matching(&wt_path.display().to_string());

                // An empty list means nothing is running in this worktree —
                // the agent likely already exited on its own before the
                // watchdog noticed — so there is nothing to kill.
                if !pids.is_empty() {
                    match sigkill_pids_and_verify(&pids) {
                        Ok(n) => slog!(
                            "[watchdog] SIGKILL'd {n} process(es) in worktree {} for '{key}'.",
                            wt_path.display()
                        ),
                        Err(survivors) => slog_warn!(
                            "[watchdog] SIGKILL incomplete for '{key}': pids still alive: {survivors:?}. \
                             Proceeding with cleanup; concurrent spawn protection may be weakened."
                        ),
                    }
                }
            } else {
                // Reached when no worktree path could be read for this key
                // (no record, no worktree info, or the lock was unavailable).
                slog_warn!(
                    "[watchdog] No worktree path recorded for '{key}'; cannot tree-kill, \
                     falling back to portable_pty SIGHUP (likely no-op for claude-code)."
                );
                self.kill_child_for_key(key);
            }

            // Step 3: NOW update the agent record. The process is verified
            // gone (or we logged that SIGKILL didn't take effect, which is
            // exceptional), so flipping status away from Running can no
            // longer open a window for a concurrent spawn.
            if let Ok(mut lock) = self.agents.lock()
                && let Some(agent) = lock.get_mut(key)
            {
                agent.status = AgentStatus::Failed;
                agent.termination_reason = Some(reason.clone());
                if let Some(handle) = agent.task_handle.take() {
                    // Best-effort abort of the outer tokio task. The PTY
                    // blocking thread already returned (claude is dead),
                    // so this is bookkeeping rather than load-bearing.
                    handle.abort();
                }
            }

            // Step 4: use the retry mechanism: increment retry_count and only
            // block when the limit is exceeded, matching the pipeline's
            // behaviour. The story id is everything before the last ':' in
            // the key; a key without ':' is used as the story id unchanged.
            let story_id = key.rsplit_once(':').map(|(s, _)| s).unwrap_or(key);
            if let Some(block_reason) = super::super::pipeline::should_block_story(
                story_id,
                config.max_retries,
                "watchdog",
            ) {
                // A failed send is deliberately ignored (best-effort notification).
                let _ = self
                    .watcher_tx
                    .send(crate::io::watcher::WatcherEvent::StoryBlocked {
                        story_id: story_id.to_string(),
                        reason: block_reason,
                    });
                slog!("[watchdog] Story '{story_id}' blocked after exceeding retry limit.");
            } else {
                slog!(
                    "[watchdog] Story '{story_id}' retry incremented after limit \
                     termination; stays in 2_current/ for re-attempt."
                );
            }
        }

        // At least one agent was terminated (the empty case returned above),
        // so announce the state change once for the whole batch.
        Self::notify_agent_state_changed(&self.watcher_tx);

        orphaned + terminated.len()
    }
}