139 lines
6.3 KiB
Rust
139 lines
6.3 KiB
Rust
//! Watchdog task: detects orphaned agents, enforces turn/budget limits, and
|
|
//! triggers auto-assign.
|
|
|
|
mod budget;
|
|
mod limits;
|
|
mod orphan;
|
|
#[cfg(test)]
|
|
mod tests;
|
|
|
|
use std::path::Path;
|
|
|
|
use crate::agents::AgentStatus;
|
|
use crate::config::ProjectConfig;
|
|
use crate::process_kill::{pids_matching, sigkill_pids_and_verify};
|
|
use crate::slog;
|
|
use crate::slog_warn;
|
|
|
|
use super::super::AgentPool;
|
|
use limits::check_agent_limits;
|
|
use orphan::check_orphaned_agents;
|
|
|
|
pub(crate) use budget::{compute_budget_from_logs, compute_budget_from_single_log};
|
|
pub(crate) use limits::{count_turns_in_log, resolve_session_log};
|
|
|
|
impl AgentPool {
|
|
/// Run a single watchdog pass synchronously (test helper).
|
|
#[cfg(test)]
|
|
pub fn run_watchdog_once(&self) {
|
|
check_orphaned_agents(&self.agents);
|
|
}
|
|
|
|
/// Run one watchdog pass: detect orphans, enforce limits, kill offenders.
|
|
///
|
|
/// Called by the unified background tick loop every 30 ticks.
|
|
///
|
|
/// When a limit is exceeded the agent's PTY child is killed and the
|
|
/// `should_block_story` retry mechanism is invoked. The story is marked
|
|
/// `blocked: true` only when `retry_count >= max_retries`; otherwise
|
|
/// `retry_count` is incremented and the story stays in `2_current/` for
|
|
/// re-attempt. This prevents the original kill-respawn loop (bug 646)
|
|
/// while restoring the `max_retries` semantic for turn/budget overruns.
|
|
pub fn run_watchdog_pass(&self, project_root: Option<&Path>) -> usize {
|
|
let orphaned = check_orphaned_agents(&self.agents);
|
|
|
|
if let Some(root) = project_root {
|
|
let terminated = check_agent_limits(&self.agents, root);
|
|
let config = ProjectConfig::load(root).unwrap_or_default();
|
|
for (key, reason) in &terminated {
|
|
// Step 1: snapshot the agent's worktree path so we can find every
|
|
// process running in it (claude + any subprocesses). This must
|
|
// happen BEFORE we mutate the agent record so we can read the
|
|
// worktree info safely.
|
|
let worktree_path = self.agents.lock().ok().and_then(|lock| {
|
|
lock.get(key)
|
|
.and_then(|a| a.worktree_info.as_ref().map(|wt| wt.path.clone()))
|
|
});
|
|
|
|
// Step 2: SIGKILL every process running in the worktree and
|
|
// BLOCK until verified gone. The previous mechanism — portable_pty's
|
|
// `ChildKiller::kill()` — sends SIGHUP, which claude-code
|
|
// ignores, leaving the process alive while the agent record
|
|
// was being marked terminated; that gap let a fresh spawn race
|
|
// in alongside the surviving one. SIGKILL is uncatchable;
|
|
// [`sigkill_pids_and_verify`] only returns once the kernel has
|
|
// reaped each pid.
|
|
if let Some(wt_path) = worktree_path.as_ref() {
|
|
let pids = pids_matching(&wt_path.display().to_string());
|
|
if pids.is_empty() {
|
|
// Nothing in this worktree — agent likely already
|
|
// exited on its own before the watchdog noticed.
|
|
} else {
|
|
match sigkill_pids_and_verify(&pids) {
|
|
Ok(n) => slog!(
|
|
"[watchdog] SIGKILL'd {n} process(es) in worktree {} for '{key}'.",
|
|
wt_path.display()
|
|
),
|
|
Err(survivors) => slog_warn!(
|
|
"[watchdog] SIGKILL incomplete for '{key}': pids still alive: {survivors:?}. \
|
|
Proceeding with cleanup; concurrent spawn protection may be weakened."
|
|
),
|
|
}
|
|
}
|
|
} else {
|
|
slog_warn!(
|
|
"[watchdog] No worktree path recorded for '{key}'; cannot tree-kill, \
|
|
falling back to portable_pty SIGHUP (likely no-op for claude-code)."
|
|
);
|
|
self.kill_child_for_key(key);
|
|
}
|
|
|
|
// Step 3: NOW update the agent record. The process is verified
|
|
// gone (or we logged that SIGKILL didn't take effect, which is
|
|
// exceptional), so flipping status away from Running can no
|
|
// longer open a window for a concurrent spawn.
|
|
if let Ok(mut lock) = self.agents.lock()
|
|
&& let Some(agent) = lock.get_mut(key)
|
|
{
|
|
agent.status = AgentStatus::Failed;
|
|
agent.termination_reason = Some(reason.clone());
|
|
if let Some(handle) = agent.task_handle.take() {
|
|
// Best-effort abort of the outer tokio task. The PTY
|
|
// blocking thread already returned (claude is dead),
|
|
// so this is bookkeeping rather than load-bearing.
|
|
handle.abort();
|
|
}
|
|
}
|
|
|
|
// Use the retry mechanism: increment retry_count and only block
|
|
// when the limit is exceeded, matching the pipeline's behaviour.
|
|
let story_id = key.rsplit_once(':').map(|(s, _)| s).unwrap_or(key);
|
|
if let Some(block_reason) = super::super::pipeline::should_block_story(
|
|
story_id,
|
|
config.max_retries,
|
|
"watchdog",
|
|
) {
|
|
let _ = self
|
|
.watcher_tx
|
|
.send(crate::io::watcher::WatcherEvent::StoryBlocked {
|
|
story_id: story_id.to_string(),
|
|
reason: block_reason,
|
|
});
|
|
slog!("[watchdog] Story '{story_id}' blocked after exceeding retry limit.");
|
|
} else {
|
|
slog!(
|
|
"[watchdog] Story '{story_id}' retry incremented after limit \
|
|
termination; stays in 2_current/ for re-attempt."
|
|
);
|
|
}
|
|
}
|
|
if !terminated.is_empty() {
|
|
Self::notify_agent_state_changed(&self.watcher_tx);
|
|
}
|
|
return orphaned + terminated.len();
|
|
}
|
|
|
|
orphaned
|
|
}
|
|
}
|