huskies/server/src/agents/pool/auto_assign/watchdog/mod.rs

//! Watchdog task: detects orphaned agents, enforces turn/budget limits, and
//! triggers auto-assign.

mod budget;
mod limits;
mod orphan;
#[cfg(test)]
mod tests;

use std::path::Path;

use crate::agents::AgentStatus;
use crate::config::ProjectConfig;
use crate::process_kill::{pids_matching, sigkill_pids_and_verify};
use crate::slog;
use crate::slog_warn;

use super::super::AgentPool;
use limits::check_agent_limits;
use orphan::check_orphaned_agents;

pub(crate) use budget::{compute_budget_from_logs, compute_budget_from_single_log};
pub(crate) use limits::{count_turns_in_log, resolve_session_log};

impl AgentPool {
    /// Run only the orphan-detection step of the watchdog synchronously (test helper).
    ///
    /// Limit enforcement is not performed here, and the value returned by
    /// `check_orphaned_agents` is discarded.
    #[cfg(test)]
    pub fn run_watchdog_once(&self) {
        check_orphaned_agents(&self.agents);
    }

    /// Run one watchdog pass: detect orphans, enforce limits, kill offenders.
    ///
    /// Called by the unified background tick loop every 30 ticks.
    ///
    /// When a limit is exceeded the agent's processes are killed and the
    /// `should_block_story` retry mechanism is invoked.  The story is marked
    /// `blocked: true` only when `retry_count >= max_retries`; otherwise
    /// `retry_count` is incremented and the story stays in `2_current/` for
    /// re-attempt.  This prevents the original kill-respawn loop (bug 646)
    /// while restoring the `max_retries` semantic for turn/budget overruns.
    ///
    /// `project_root` is the directory handed to the limit checks and used to
    /// load the project configuration.  When it is `None`, only orphan
    /// detection runs.
    ///
    /// Returns the value reported by `check_orphaned_agents` plus the number
    /// of agents terminated for exceeding a limit during this pass.
    pub fn run_watchdog_pass(&self, project_root: Option<&Path>) -> usize {
        let orphaned = check_orphaned_agents(&self.agents);

        // Limit enforcement is only possible when a project root was supplied.
        let Some(root) = project_root else {
            return orphaned;
        };

        let terminated = check_agent_limits(&self.agents, root);
        if terminated.is_empty() {
            // Nothing exceeded a limit: no config load and no state-change
            // notification are needed on this pass.
            return orphaned;
        }

        // Loaded only after we know at least one agent was terminated, so a
        // quiet pass does not load the project configuration for nothing.
        let config = ProjectConfig::load(root).unwrap_or_default();

        for (key, reason) in &terminated {
            // Step 1: snapshot the agent's worktree path so we can find every
            // process running in it (claude + any subprocesses).  This must
            // happen BEFORE we mutate the agent record so we can read the
            // worktree info safely.  The lock guard is dropped at the end of
            // this statement, so it is not held while processes are killed.
            let worktree_path = self.agents.lock().ok().and_then(|lock| {
                lock.get(key)
                    .and_then(|a| a.worktree_info.as_ref().map(|wt| wt.path.clone()))
            });

            // Step 2: SIGKILL every process running in the worktree and
            // BLOCK until verified gone.  The previous mechanism — portable_pty's
            // `ChildKiller::kill()` — sends SIGHUP, which claude-code
            // ignores, leaving the process alive while the agent record
            // was being marked terminated; that gap let a fresh spawn race
            // in alongside the surviving one.  SIGKILL is uncatchable;
            // `sigkill_pids_and_verify` only returns once the kernel has
            // reaped each pid.
            if let Some(wt_path) = worktree_path.as_ref() {
                let pids = pids_matching(&wt_path.display().to_string());
                // An empty list means nothing is running in this worktree —
                // the agent likely already exited on its own before the
                // watchdog noticed — so there is nothing to kill.
                if !pids.is_empty() {
                    match sigkill_pids_and_verify(&pids) {
                        Ok(n) => slog!(
                            "[watchdog] SIGKILL'd {n} process(es) in worktree {} for '{key}'.",
                            wt_path.display()
                        ),
                        Err(survivors) => slog_warn!(
                            "[watchdog] SIGKILL incomplete for '{key}': pids still alive: {survivors:?}. \
                             Proceeding with cleanup; concurrent spawn protection may be weakened."
                        ),
                    }
                }
            } else {
                slog_warn!(
                    "[watchdog] No worktree path recorded for '{key}'; cannot tree-kill, \
                     falling back to portable_pty SIGHUP (likely no-op for claude-code)."
                );
                self.kill_child_for_key(key);
            }

            // Step 3: NOW update the agent record.  The process is verified
            // gone (or we logged that SIGKILL didn't take effect, which is
            // exceptional), so flipping status away from Running can no
            // longer open a window for a concurrent spawn.
            if let Ok(mut lock) = self.agents.lock()
                && let Some(agent) = lock.get_mut(key)
            {
                agent.status = AgentStatus::Failed;
                agent.termination_reason = Some(reason.clone());
                if let Some(handle) = agent.task_handle.take() {
                    // Best-effort abort of the outer tokio task.  The PTY
                    // blocking thread already returned (claude is dead),
                    // so this is bookkeeping rather than load-bearing.
                    handle.abort();
                }
            }

            // Use the retry mechanism: increment retry_count and only block
            // when the limit is exceeded, matching the pipeline's behaviour.
            // The story id is everything before the last ':' in the key; a
            // key without ':' is used as the story id unchanged.
            let story_id = key.rsplit_once(':').map(|(s, _)| s).unwrap_or(key);
            if let Some(block_reason) = super::super::pipeline::should_block_story(
                story_id,
                config.max_retries,
                "watchdog",
            ) {
                // A failed send is deliberately ignored.
                let _ = self
                    .watcher_tx
                    .send(crate::io::watcher::WatcherEvent::StoryBlocked {
                        story_id: story_id.to_string(),
                        reason: block_reason,
                    });
                slog!("[watchdog] Story '{story_id}' blocked after exceeding retry limit.");
            } else {
                slog!(
                    "[watchdog] Story '{story_id}' retry incremented after limit \
                     termination; stays in 2_current/ for re-attempt."
                );
            }
        }

        // At least one agent was terminated, so announce the state change.
        Self::notify_agent_state_changed(&self.watcher_tx);
        orphaned + terminated.len()
    }
}