86 lines
3.3 KiB
Rust
86 lines
3.3 KiB
Rust
|
|
//! Watchdog task: detects orphaned agents, enforces turn/budget limits, and
|
||
|
|
//! triggers auto-assign.
|
||
|
|
|
||
|
|
mod budget;
|
||
|
|
mod limits;
|
||
|
|
mod orphan;
|
||
|
|
#[cfg(test)]
|
||
|
|
mod tests;
|
||
|
|
|
||
|
|
use std::path::Path;
|
||
|
|
|
||
|
|
use crate::config::ProjectConfig;
|
||
|
|
use crate::slog;
|
||
|
|
|
||
|
|
use super::super::AgentPool;
|
||
|
|
use limits::check_agent_limits;
|
||
|
|
use orphan::check_orphaned_agents;
|
||
|
|
|
||
|
|
pub(crate) use budget::{compute_budget_from_logs, compute_budget_from_single_log};
|
||
|
|
pub(crate) use limits::{count_turns_in_log, resolve_session_log};
|
||
|
|
|
||
|
|
impl AgentPool {
|
||
|
|
/// Run a single watchdog pass synchronously (test helper).
|
||
|
|
#[cfg(test)]
|
||
|
|
pub fn run_watchdog_once(&self) {
|
||
|
|
check_orphaned_agents(&self.agents);
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Run one watchdog pass: detect orphans, enforce limits, kill offenders.
|
||
|
|
///
|
||
|
|
/// Called by the unified background tick loop every 30 ticks.
|
||
|
|
///
|
||
|
|
/// When a limit is exceeded the agent's PTY child is killed and the
|
||
|
|
/// `should_block_story` retry mechanism is invoked. The story is marked
|
||
|
|
/// `blocked: true` only when `retry_count >= max_retries`; otherwise
|
||
|
|
/// `retry_count` is incremented and the story stays in `2_current/` for
|
||
|
|
/// re-attempt. This prevents the original kill-respawn loop (bug 646)
|
||
|
|
/// while restoring the `max_retries` semantic for turn/budget overruns.
|
||
|
|
pub fn run_watchdog_pass(&self, project_root: Option<&Path>) -> usize {
|
||
|
|
let orphaned = check_orphaned_agents(&self.agents);
|
||
|
|
|
||
|
|
if let Some(root) = project_root {
|
||
|
|
let terminated = check_agent_limits(&self.agents, root);
|
||
|
|
let config = ProjectConfig::load(root).unwrap_or_default();
|
||
|
|
for (key, _reason) in &terminated {
|
||
|
|
// Kill the PTY child and abort the task, same as stop_agent.
|
||
|
|
self.kill_child_for_key(key);
|
||
|
|
if let Ok(mut lock) = self.agents.lock()
|
||
|
|
&& let Some(agent) = lock.get_mut(key)
|
||
|
|
&& let Some(handle) = agent.task_handle.take()
|
||
|
|
{
|
||
|
|
handle.abort();
|
||
|
|
}
|
||
|
|
|
||
|
|
// Use the retry mechanism: increment retry_count and only block
|
||
|
|
// when the limit is exceeded, matching the pipeline's behaviour.
|
||
|
|
let story_id = key.rsplit_once(':').map(|(s, _)| s).unwrap_or(key);
|
||
|
|
if let Some(block_reason) = super::super::pipeline::should_block_story(
|
||
|
|
story_id,
|
||
|
|
config.max_retries,
|
||
|
|
"watchdog",
|
||
|
|
) {
|
||
|
|
let _ = self
|
||
|
|
.watcher_tx
|
||
|
|
.send(crate::io::watcher::WatcherEvent::StoryBlocked {
|
||
|
|
story_id: story_id.to_string(),
|
||
|
|
reason: block_reason,
|
||
|
|
});
|
||
|
|
slog!("[watchdog] Story '{story_id}' blocked after exceeding retry limit.");
|
||
|
|
} else {
|
||
|
|
slog!(
|
||
|
|
"[watchdog] Story '{story_id}' retry incremented after limit \
|
||
|
|
termination; stays in 2_current/ for re-attempt."
|
||
|
|
);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
if !terminated.is_empty() {
|
||
|
|
Self::notify_agent_state_changed(&self.watcher_tx);
|
||
|
|
}
|
||
|
|
return orphaned + terminated.len();
|
||
|
|
}
|
||
|
|
|
||
|
|
orphaned
|
||
|
|
}
|
||
|
|
}
|