2026-04-28 11:13:02 +00:00
|
|
|
|
//! Limit enforcement: checks agent turn/budget limits and terminates offenders.
|
|
|
|
|
|
|
|
|
|
|
|
use std::collections::HashMap;
|
|
|
|
|
|
use std::path::Path;
|
|
|
|
|
|
use std::sync::Mutex;
|
|
|
|
|
|
use tokio::sync::broadcast;
|
|
|
|
|
|
|
|
|
|
|
|
use crate::agents::pool::StoryAgent;
|
|
|
|
|
|
use crate::agents::{AgentEvent, AgentStatus, TerminationReason};
|
|
|
|
|
|
use crate::config::ProjectConfig;
|
|
|
|
|
|
use crate::slog;
|
|
|
|
|
|
|
|
|
|
|
|
use super::budget::compute_budget_from_single_log;
|
|
|
|
|
|
|
|
|
|
|
|
/// Snapshot of a running agent for limit checking: (composite_key, story_id, agent_name, event_tx, log_session_id).
|
|
|
|
|
|
type RunningAgentSnapshot = (
|
|
|
|
|
|
String,
|
|
|
|
|
|
String,
|
|
|
|
|
|
String,
|
|
|
|
|
|
broadcast::Sender<AgentEvent>,
|
|
|
|
|
|
Option<String>,
|
|
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
|
|
/// Resolve the current session's log file for an agent.
|
|
|
|
|
|
///
|
|
|
|
|
|
/// Uses `log_session_id` to construct the exact path when available;
|
|
|
|
|
|
/// otherwise falls back to the most recently modified log file for the
|
|
|
|
|
|
/// agent.
|
|
|
|
|
|
pub(crate) fn resolve_session_log(
|
|
|
|
|
|
project_root: &Path,
|
|
|
|
|
|
story_id: &str,
|
|
|
|
|
|
agent_name: &str,
|
|
|
|
|
|
log_session_id: &Option<String>,
|
|
|
|
|
|
) -> Option<std::path::PathBuf> {
|
|
|
|
|
|
if let Some(sid) = log_session_id {
|
|
|
|
|
|
let path = crate::agent_log::log_file_path(project_root, story_id, agent_name, sid);
|
|
|
|
|
|
if path.exists() {
|
|
|
|
|
|
return Some(path);
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
crate::agent_log::find_latest_log(project_root, story_id, agent_name)
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2026-05-12 17:25:11 +01:00
|
|
|
|
/// Count **tool-using** assistant turns in a single log file (story 923).
|
|
|
|
|
|
///
|
|
|
|
|
|
/// A turn counts only if its assistant message contains at least one
|
|
|
|
|
|
/// `content` block with `type == "tool_use"`. Narration-only turns
|
|
|
|
|
|
/// (text-only assistant messages such as "I'll read X next" preambles)
|
|
|
|
|
|
/// don't count against the watchdog limit — they make no progress, only
|
|
|
|
|
|
/// burn budget, and Sonnet emits them at roughly 30–55% of all turns.
|
2026-04-28 11:13:02 +00:00
|
|
|
|
pub(crate) fn count_turns_in_log(path: &Path) -> u64 {
|
|
|
|
|
|
let entries = match crate::agent_log::read_log(path) {
|
|
|
|
|
|
Ok(e) => e,
|
|
|
|
|
|
Err(_) => return 0,
|
|
|
|
|
|
};
|
|
|
|
|
|
entries
|
|
|
|
|
|
.iter()
|
|
|
|
|
|
.filter(|entry| {
|
2026-05-12 17:25:11 +01:00
|
|
|
|
if entry.event.get("type").and_then(|v| v.as_str()) != Some("agent_json") {
|
|
|
|
|
|
return false;
|
|
|
|
|
|
}
|
|
|
|
|
|
let data = match entry.event.get("data") {
|
|
|
|
|
|
Some(d) => d,
|
|
|
|
|
|
None => return false,
|
|
|
|
|
|
};
|
|
|
|
|
|
if data.get("type").and_then(|v| v.as_str()) != Some("assistant") {
|
|
|
|
|
|
return false;
|
|
|
|
|
|
}
|
|
|
|
|
|
// Require at least one tool_use content block.
|
|
|
|
|
|
data.pointer("/message/content")
|
|
|
|
|
|
.and_then(|c| c.as_array())
|
|
|
|
|
|
.map(|arr| {
|
|
|
|
|
|
arr.iter()
|
|
|
|
|
|
.any(|item| item.get("type").and_then(|v| v.as_str()) == Some("tool_use"))
|
|
|
|
|
|
})
|
|
|
|
|
|
.unwrap_or(false)
|
2026-04-28 11:13:02 +00:00
|
|
|
|
})
|
|
|
|
|
|
.count() as u64
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/// Scan running agents for turn/budget limit violations and terminate offenders.
|
|
|
|
|
|
///
|
|
|
|
|
|
/// Turns and budget are counted from the **current session's** log file
|
|
|
|
|
|
/// only — prior sessions are excluded so that restart counts from earlier
|
|
|
|
|
|
/// runs do not accumulate against the limits.
|
|
|
|
|
|
pub(super) fn check_agent_limits(
|
|
|
|
|
|
agents: &Mutex<HashMap<String, StoryAgent>>,
|
|
|
|
|
|
project_root: &Path,
|
|
|
|
|
|
) -> Vec<(String, TerminationReason)> {
|
|
|
|
|
|
let config = match ProjectConfig::load(project_root) {
|
|
|
|
|
|
Ok(c) => c,
|
|
|
|
|
|
Err(_) => return Vec::new(),
|
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
// Snapshot running agents: (key, story_id, agent_name, tx, log_session_id).
|
|
|
|
|
|
let running: Vec<RunningAgentSnapshot> = {
|
|
|
|
|
|
let lock = match agents.lock() {
|
|
|
|
|
|
Ok(l) => l,
|
|
|
|
|
|
Err(_) => return Vec::new(),
|
|
|
|
|
|
};
|
|
|
|
|
|
lock.iter()
|
|
|
|
|
|
.filter(|(_, agent)| agent.status == AgentStatus::Running)
|
|
|
|
|
|
.map(|(key, agent)| {
|
|
|
|
|
|
let story_id = key
|
|
|
|
|
|
.rsplit_once(':')
|
|
|
|
|
|
.map(|(s, _)| s.to_string())
|
|
|
|
|
|
.unwrap_or_else(|| key.clone());
|
|
|
|
|
|
(
|
|
|
|
|
|
key.clone(),
|
|
|
|
|
|
story_id,
|
|
|
|
|
|
agent.agent_name.clone(),
|
|
|
|
|
|
agent.tx.clone(),
|
|
|
|
|
|
agent.log_session_id.clone(),
|
|
|
|
|
|
)
|
|
|
|
|
|
})
|
|
|
|
|
|
.collect()
|
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
let mut terminated = Vec::new();
|
|
|
|
|
|
|
|
|
|
|
|
for (key, story_id, agent_name, tx, log_session_id) in &running {
|
|
|
|
|
|
let agent_config = config.agent.iter().find(|a| a.name == *agent_name);
|
2026-05-12 17:25:11 +01:00
|
|
|
|
// The watchdog gates on max_tool_turns (counts only tool-using
|
|
|
|
|
|
// assistant turns) when set; otherwise falls back to max_turns for
|
|
|
|
|
|
// backwards compatibility with configs that haven't migrated yet.
|
|
|
|
|
|
let max_turns = agent_config.and_then(|a| a.max_tool_turns.or(a.max_turns));
|
2026-04-28 11:13:02 +00:00
|
|
|
|
let max_budget_usd = agent_config.and_then(|a| a.max_budget_usd);
|
|
|
|
|
|
|
|
|
|
|
|
// Skip agents with no limits configured.
|
|
|
|
|
|
if max_turns.is_none() && max_budget_usd.is_none() {
|
|
|
|
|
|
continue;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Resolve the current session's log file.
|
|
|
|
|
|
let session_log = resolve_session_log(project_root, story_id, agent_name, log_session_id);
|
|
|
|
|
|
|
|
|
|
|
|
// Count turns from the current session's log file only.
|
|
|
|
|
|
let turns_used: u64 = if max_turns.is_some() {
|
|
|
|
|
|
session_log
|
|
|
|
|
|
.as_ref()
|
|
|
|
|
|
.map(|p| count_turns_in_log(p))
|
|
|
|
|
|
.unwrap_or(0)
|
|
|
|
|
|
} else {
|
|
|
|
|
|
0
|
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
// Compute budget from the current session's log file only.
|
|
|
|
|
|
// Completed-session records from token_usage.jsonl are NOT included
|
|
|
|
|
|
// in the watchdog's enforcement — they belong to prior sessions.
|
|
|
|
|
|
let budget_used_usd: f64 = if max_budget_usd.is_some() {
|
|
|
|
|
|
session_log
|
|
|
|
|
|
.as_ref()
|
|
|
|
|
|
.map(|p| compute_budget_from_single_log(p))
|
|
|
|
|
|
.unwrap_or(0.0)
|
|
|
|
|
|
} else {
|
|
|
|
|
|
0.0
|
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
// Determine if a limit is exceeded.
|
|
|
|
|
|
let reason = if let Some(max) = max_turns {
|
|
|
|
|
|
if turns_used >= max as u64 {
|
|
|
|
|
|
Some(TerminationReason::TurnLimit)
|
|
|
|
|
|
} else {
|
|
|
|
|
|
None
|
|
|
|
|
|
}
|
|
|
|
|
|
} else {
|
|
|
|
|
|
None
|
|
|
|
|
|
}
|
|
|
|
|
|
.or(if let Some(max) = max_budget_usd {
|
|
|
|
|
|
if budget_used_usd >= max {
|
|
|
|
|
|
Some(TerminationReason::BudgetLimit)
|
|
|
|
|
|
} else {
|
|
|
|
|
|
None
|
|
|
|
|
|
}
|
|
|
|
|
|
} else {
|
|
|
|
|
|
None
|
|
|
|
|
|
});
|
|
|
|
|
|
|
|
|
|
|
|
if let Some(reason) = reason {
|
|
|
|
|
|
let reason_str = match &reason {
|
|
|
|
|
|
TerminationReason::TurnLimit => {
|
|
|
|
|
|
format!("turn limit exceeded ({turns_used}/{})", max_turns.unwrap())
|
|
|
|
|
|
}
|
|
|
|
|
|
TerminationReason::BudgetLimit => format!(
|
|
|
|
|
|
"budget limit exceeded (${budget_used_usd:.2}/${:.2})",
|
|
|
|
|
|
max_budget_usd.unwrap()
|
|
|
|
|
|
),
|
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
// Mark agent as Failed with termination reason.
|
|
|
|
|
|
if let Ok(mut lock) = agents.lock()
|
|
|
|
|
|
&& let Some(agent) = lock.get_mut(key)
|
|
|
|
|
|
{
|
|
|
|
|
|
agent.status = AgentStatus::Failed;
|
|
|
|
|
|
agent.termination_reason = Some(reason.clone());
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
slog!("[watchdog] Terminating agent '{key}': {reason_str}.");
|
|
|
|
|
|
|
|
|
|
|
|
let _ = tx.send(AgentEvent::Error {
|
|
|
|
|
|
story_id: story_id.clone(),
|
|
|
|
|
|
agent_name: agent_name.clone(),
|
|
|
|
|
|
message: format!("Agent terminated by watchdog: {reason_str}"),
|
|
|
|
|
|
});
|
|
|
|
|
|
|
|
|
|
|
|
terminated.push((key.clone(), reason));
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
terminated
|
|
|
|
|
|
}
|