//! Limit enforcement: checks agent turn/budget limits and terminates offenders.
|
||
|
|
|
||
|
|
use std::collections::HashMap;
|
||
|
|
use std::path::Path;
|
||
|
|
use std::sync::Mutex;
|
||
|
|
use tokio::sync::broadcast;
|
||
|
|
|
||
|
|
use crate::agents::pool::StoryAgent;
|
||
|
|
use crate::agents::{AgentEvent, AgentStatus, TerminationReason};
|
||
|
|
use crate::config::ProjectConfig;
|
||
|
|
use crate::slog;
|
||
|
|
|
||
|
|
use super::budget::compute_budget_from_single_log;
|
||
|
|
|
||
|
|
/// Snapshot of a running agent for limit checking: (composite_key, story_id, agent_name, event_tx, log_session_id).
|
||
|
|
type RunningAgentSnapshot = (
|
||
|
|
String,
|
||
|
|
String,
|
||
|
|
String,
|
||
|
|
broadcast::Sender<AgentEvent>,
|
||
|
|
Option<String>,
|
||
|
|
);
|
||
|
|
|
||
|
|
/// Resolve the current session's log file for an agent.
|
||
|
|
///
|
||
|
|
/// Uses `log_session_id` to construct the exact path when available;
|
||
|
|
/// otherwise falls back to the most recently modified log file for the
|
||
|
|
/// agent.
|
||
|
|
pub(crate) fn resolve_session_log(
|
||
|
|
project_root: &Path,
|
||
|
|
story_id: &str,
|
||
|
|
agent_name: &str,
|
||
|
|
log_session_id: &Option<String>,
|
||
|
|
) -> Option<std::path::PathBuf> {
|
||
|
|
if let Some(sid) = log_session_id {
|
||
|
|
let path = crate::agent_log::log_file_path(project_root, story_id, agent_name, sid);
|
||
|
|
if path.exists() {
|
||
|
|
return Some(path);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
crate::agent_log::find_latest_log(project_root, story_id, agent_name)
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Count `assistant` events in a single log file.
|
||
|
|
pub(crate) fn count_turns_in_log(path: &Path) -> u64 {
|
||
|
|
let entries = match crate::agent_log::read_log(path) {
|
||
|
|
Ok(e) => e,
|
||
|
|
Err(_) => return 0,
|
||
|
|
};
|
||
|
|
entries
|
||
|
|
.iter()
|
||
|
|
.filter(|entry| {
|
||
|
|
entry.event.get("type").and_then(|v| v.as_str()) == Some("agent_json")
|
||
|
|
&& entry
|
||
|
|
.event
|
||
|
|
.get("data")
|
||
|
|
.and_then(|d| d.get("type"))
|
||
|
|
.and_then(|v| v.as_str())
|
||
|
|
== Some("assistant")
|
||
|
|
})
|
||
|
|
.count() as u64
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Scan running agents for turn/budget limit violations and terminate offenders.
|
||
|
|
///
|
||
|
|
/// Turns and budget are counted from the **current session's** log file
|
||
|
|
/// only — prior sessions are excluded so that restart counts from earlier
|
||
|
|
/// runs do not accumulate against the limits.
|
||
|
|
pub(super) fn check_agent_limits(
|
||
|
|
agents: &Mutex<HashMap<String, StoryAgent>>,
|
||
|
|
project_root: &Path,
|
||
|
|
) -> Vec<(String, TerminationReason)> {
|
||
|
|
let config = match ProjectConfig::load(project_root) {
|
||
|
|
Ok(c) => c,
|
||
|
|
Err(_) => return Vec::new(),
|
||
|
|
};
|
||
|
|
|
||
|
|
// Snapshot running agents: (key, story_id, agent_name, tx, log_session_id).
|
||
|
|
let running: Vec<RunningAgentSnapshot> = {
|
||
|
|
let lock = match agents.lock() {
|
||
|
|
Ok(l) => l,
|
||
|
|
Err(_) => return Vec::new(),
|
||
|
|
};
|
||
|
|
lock.iter()
|
||
|
|
.filter(|(_, agent)| agent.status == AgentStatus::Running)
|
||
|
|
.map(|(key, agent)| {
|
||
|
|
let story_id = key
|
||
|
|
.rsplit_once(':')
|
||
|
|
.map(|(s, _)| s.to_string())
|
||
|
|
.unwrap_or_else(|| key.clone());
|
||
|
|
(
|
||
|
|
key.clone(),
|
||
|
|
story_id,
|
||
|
|
agent.agent_name.clone(),
|
||
|
|
agent.tx.clone(),
|
||
|
|
agent.log_session_id.clone(),
|
||
|
|
)
|
||
|
|
})
|
||
|
|
.collect()
|
||
|
|
};
|
||
|
|
|
||
|
|
let mut terminated = Vec::new();
|
||
|
|
|
||
|
|
for (key, story_id, agent_name, tx, log_session_id) in &running {
|
||
|
|
let agent_config = config.agent.iter().find(|a| a.name == *agent_name);
|
||
|
|
let max_turns = agent_config.and_then(|a| a.max_turns);
|
||
|
|
let max_budget_usd = agent_config.and_then(|a| a.max_budget_usd);
|
||
|
|
|
||
|
|
// Skip agents with no limits configured.
|
||
|
|
if max_turns.is_none() && max_budget_usd.is_none() {
|
||
|
|
continue;
|
||
|
|
}
|
||
|
|
|
||
|
|
// Resolve the current session's log file.
|
||
|
|
let session_log = resolve_session_log(project_root, story_id, agent_name, log_session_id);
|
||
|
|
|
||
|
|
// Count turns from the current session's log file only.
|
||
|
|
let turns_used: u64 = if max_turns.is_some() {
|
||
|
|
session_log
|
||
|
|
.as_ref()
|
||
|
|
.map(|p| count_turns_in_log(p))
|
||
|
|
.unwrap_or(0)
|
||
|
|
} else {
|
||
|
|
0
|
||
|
|
};
|
||
|
|
|
||
|
|
// Compute budget from the current session's log file only.
|
||
|
|
// Completed-session records from token_usage.jsonl are NOT included
|
||
|
|
// in the watchdog's enforcement — they belong to prior sessions.
|
||
|
|
let budget_used_usd: f64 = if max_budget_usd.is_some() {
|
||
|
|
session_log
|
||
|
|
.as_ref()
|
||
|
|
.map(|p| compute_budget_from_single_log(p))
|
||
|
|
.unwrap_or(0.0)
|
||
|
|
} else {
|
||
|
|
0.0
|
||
|
|
};
|
||
|
|
|
||
|
|
// Determine if a limit is exceeded.
|
||
|
|
let reason = if let Some(max) = max_turns {
|
||
|
|
if turns_used >= max as u64 {
|
||
|
|
Some(TerminationReason::TurnLimit)
|
||
|
|
} else {
|
||
|
|
None
|
||
|
|
}
|
||
|
|
} else {
|
||
|
|
None
|
||
|
|
}
|
||
|
|
.or(if let Some(max) = max_budget_usd {
|
||
|
|
if budget_used_usd >= max {
|
||
|
|
Some(TerminationReason::BudgetLimit)
|
||
|
|
} else {
|
||
|
|
None
|
||
|
|
}
|
||
|
|
} else {
|
||
|
|
None
|
||
|
|
});
|
||
|
|
|
||
|
|
if let Some(reason) = reason {
|
||
|
|
let reason_str = match &reason {
|
||
|
|
TerminationReason::TurnLimit => {
|
||
|
|
format!("turn limit exceeded ({turns_used}/{})", max_turns.unwrap())
|
||
|
|
}
|
||
|
|
TerminationReason::BudgetLimit => format!(
|
||
|
|
"budget limit exceeded (${budget_used_usd:.2}/${:.2})",
|
||
|
|
max_budget_usd.unwrap()
|
||
|
|
),
|
||
|
|
};
|
||
|
|
|
||
|
|
// Mark agent as Failed with termination reason.
|
||
|
|
if let Ok(mut lock) = agents.lock()
|
||
|
|
&& let Some(agent) = lock.get_mut(key)
|
||
|
|
{
|
||
|
|
agent.status = AgentStatus::Failed;
|
||
|
|
agent.termination_reason = Some(reason.clone());
|
||
|
|
}
|
||
|
|
|
||
|
|
slog!("[watchdog] Terminating agent '{key}': {reason_str}.");
|
||
|
|
|
||
|
|
let _ = tx.send(AgentEvent::Error {
|
||
|
|
story_id: story_id.clone(),
|
||
|
|
agent_name: agent_name.clone(),
|
||
|
|
message: format!("Agent terminated by watchdog: {reason_str}"),
|
||
|
|
});
|
||
|
|
|
||
|
|
terminated.push((key.clone(), reason));
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
terminated
|
||
|
|
}
|