//! Watchdog task: detects orphaned agents, enforces turn/budget limits, and //! triggers auto-assign. mod budget; mod limits; mod orphan; #[cfg(test)] mod tests; use std::path::Path; use crate::agents::AgentStatus; use crate::config::ProjectConfig; use crate::process_kill::{pids_matching, sigkill_pids_and_verify}; use crate::slog; use crate::slog_warn; use super::super::AgentPool; use limits::check_agent_limits; use orphan::check_orphaned_agents; pub(crate) use budget::{compute_budget_from_logs, compute_budget_from_single_log}; pub(crate) use limits::{count_turns_in_log, resolve_session_log}; impl AgentPool { /// Run a single watchdog pass synchronously (test helper). #[cfg(test)] pub fn run_watchdog_once(&self) { check_orphaned_agents(&self.agents); } /// Run one watchdog pass: detect orphans, enforce limits, kill offenders. /// /// Called by the unified background tick loop every 30 ticks. /// /// When a limit is exceeded the agent's PTY child is killed and the /// `should_block_story` retry mechanism is invoked. The story is marked /// `blocked: true` only when `retry_count >= max_retries`; otherwise /// `retry_count` is incremented and the story stays in `2_current/` for /// re-attempt. This prevents the original kill-respawn loop (bug 646) /// while restoring the `max_retries` semantic for turn/budget overruns. pub fn run_watchdog_pass(&self, project_root: Option<&Path>) -> usize { let orphaned = check_orphaned_agents(&self.agents); if let Some(root) = project_root { let terminated = check_agent_limits(&self.agents, root); let config = ProjectConfig::load(root).unwrap_or_default(); for (key, reason) in &terminated { // Step 1: snapshot the agent's worktree path so we can find every // process running in it (claude + any subprocesses). This must // happen BEFORE we mutate the agent record so we can read the // worktree info safely. let worktree_path = self.agents.lock().ok().and_then(|lock| { lock.get(key) .and_then(|a| a.worktree_info.as_ref().map(|wt| wt.path.clone())) }); // Step 2: SIGKILL every process running in the worktree and // BLOCK until verified gone. The previous mechanism — portable_pty's // `ChildKiller::kill()` — sends SIGHUP, which claude-code // ignores, leaving the process alive while the agent record // was being marked terminated; that gap let a fresh spawn race // in alongside the surviving one. SIGKILL is uncatchable; // [`sigkill_pids_and_verify`] only returns once the kernel has // reaped each pid. if let Some(wt_path) = worktree_path.as_ref() { let pids = pids_matching(&wt_path.display().to_string()); if pids.is_empty() { // Nothing in this worktree — agent likely already // exited on its own before the watchdog noticed. } else { match sigkill_pids_and_verify(&pids) { Ok(n) => slog!( "[watchdog] SIGKILL'd {n} process(es) in worktree {} for '{key}'.", wt_path.display() ), Err(survivors) => slog_warn!( "[watchdog] SIGKILL incomplete for '{key}': pids still alive: {survivors:?}. \ Proceeding with cleanup; concurrent spawn protection may be weakened." ), } } } else { slog_warn!( "[watchdog] No worktree path recorded for '{key}'; cannot tree-kill, \ falling back to portable_pty SIGHUP (likely no-op for claude-code)." ); self.kill_child_for_key(key); } // Step 3: NOW update the agent record. The process is verified // gone (or we logged that SIGKILL didn't take effect, which is // exceptional), so flipping status away from Running can no // longer open a window for a concurrent spawn. if let Ok(mut lock) = self.agents.lock() && let Some(agent) = lock.get_mut(key) { agent.status = AgentStatus::Failed; agent.termination_reason = Some(reason.clone()); if let Some(handle) = agent.task_handle.take() { // Best-effort abort of the outer tokio task. The PTY // blocking thread already returned (claude is dead), // so this is bookkeeping rather than load-bearing. handle.abort(); } } // Use the retry mechanism: increment retry_count and only block // when the limit is exceeded, matching the pipeline's behaviour. let story_id = key.rsplit_once(':').map(|(s, _)| s).unwrap_or(key); if let Some(block_reason) = super::super::pipeline::should_block_story( story_id, config.max_retries, "watchdog", ) { let _ = self .watcher_tx .send(crate::io::watcher::WatcherEvent::StoryBlocked { story_id: story_id.to_string(), reason: block_reason, }); slog!("[watchdog] Story '{story_id}' blocked after exceeding retry limit."); } else { slog!( "[watchdog] Story '{story_id}' retry incremented after limit \ termination; stays in 2_current/ for re-attempt." ); } } if !terminated.is_empty() { Self::notify_agent_state_changed(&self.watcher_tx); } return orphaned + terminated.len(); } orphaned } }