|
|
|
@@ -68,11 +68,88 @@ pub(super) fn check_orphaned_agents(agents: &Mutex<HashMap<String, StoryAgent>>)
|
|
|
|
|
count
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Compute in-flight budget from per-message usage data in agent log files.
|
|
|
|
|
///
|
|
|
|
|
/// For completed sessions (containing a `result` event), uses the accurate
|
|
|
|
|
/// `total_cost_usd` from Claude Code. For running sessions (no `result`
|
|
|
|
|
/// event yet), estimates cost from per-turn token counts in `assistant`
|
|
|
|
|
/// events.
|
|
|
|
|
pub(crate) fn compute_budget_from_logs(
|
|
|
|
|
project_root: &Path,
|
|
|
|
|
story_id: &str,
|
|
|
|
|
agent_name: &str,
|
|
|
|
|
) -> f64 {
|
|
|
|
|
use crate::agents::TokenUsage;
|
|
|
|
|
|
|
|
|
|
let log_files =
|
|
|
|
|
crate::agent_log::list_story_log_files(project_root, story_id, Some(agent_name));
|
|
|
|
|
let mut total_cost = 0.0;
|
|
|
|
|
|
|
|
|
|
for path in &log_files {
|
|
|
|
|
let entries = match crate::agent_log::read_log(path) {
|
|
|
|
|
Ok(e) => e,
|
|
|
|
|
Err(_) => continue,
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
// Check for a result event with accurate total_cost_usd (completed session).
|
|
|
|
|
let result_cost = entries.iter().rev().find_map(|entry| {
|
|
|
|
|
if entry.event.get("type").and_then(|v| v.as_str()) == Some("agent_json")
|
|
|
|
|
&& let Some(data) = entry.event.get("data")
|
|
|
|
|
&& data.get("type").and_then(|v| v.as_str()) == Some("result")
|
|
|
|
|
{
|
|
|
|
|
data.get("total_cost_usd").and_then(|v| v.as_f64())
|
|
|
|
|
} else {
|
|
|
|
|
None
|
|
|
|
|
}
|
|
|
|
|
});
|
|
|
|
|
|
|
|
|
|
if let Some(cost) = result_cost {
|
|
|
|
|
// Completed session — use accurate cost.
|
|
|
|
|
total_cost += cost;
|
|
|
|
|
} else {
|
|
|
|
|
// Running session — estimate from per-message token usage.
|
|
|
|
|
for entry in &entries {
|
|
|
|
|
if entry.event.get("type").and_then(|v| v.as_str()) == Some("agent_json")
|
|
|
|
|
&& let Some(data) = entry.event.get("data")
|
|
|
|
|
&& data.get("type").and_then(|v| v.as_str()) == Some("assistant")
|
|
|
|
|
&& let Some(message) = data.get("message")
|
|
|
|
|
&& let Some(usage) = message.get("usage")
|
|
|
|
|
{
|
|
|
|
|
let model = message.get("model").and_then(|v| v.as_str());
|
|
|
|
|
let token_usage = TokenUsage {
|
|
|
|
|
input_tokens: usage
|
|
|
|
|
.get("input_tokens")
|
|
|
|
|
.and_then(|v| v.as_u64())
|
|
|
|
|
.unwrap_or(0),
|
|
|
|
|
output_tokens: usage
|
|
|
|
|
.get("output_tokens")
|
|
|
|
|
.and_then(|v| v.as_u64())
|
|
|
|
|
.unwrap_or(0),
|
|
|
|
|
cache_creation_input_tokens: usage
|
|
|
|
|
.get("cache_creation_input_tokens")
|
|
|
|
|
.and_then(|v| v.as_u64())
|
|
|
|
|
.unwrap_or(0),
|
|
|
|
|
cache_read_input_tokens: usage
|
|
|
|
|
.get("cache_read_input_tokens")
|
|
|
|
|
.and_then(|v| v.as_u64())
|
|
|
|
|
.unwrap_or(0),
|
|
|
|
|
total_cost_usd: 0.0,
|
|
|
|
|
};
|
|
|
|
|
total_cost += token_usage.estimate_cost_usd(model);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
total_cost
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Scan running agents for turn/budget limit violations and terminate offenders.
|
|
|
|
|
///
|
|
|
|
|
/// Uses the same accessor path as `tool_get_agent_remaining_turns_and_budget`:
|
|
|
|
|
/// log files are counted via `agent_log::list_story_log_files` + `read_log`,
|
|
|
|
|
/// and budget via `token_usage::read_all`.
|
|
|
|
|
/// Turns are counted from `assistant` events in the log files. Budget is
|
|
|
|
|
/// computed from per-message token usage in the logs (for running sessions)
|
|
|
|
|
/// or from the `result` event's `total_cost_usd` (for completed sessions),
|
|
|
|
|
/// with `token_usage.jsonl` records as a fallback floor.
|
|
|
|
|
fn check_agent_limits(
|
|
|
|
|
agents: &Mutex<HashMap<String, StoryAgent>>,
|
|
|
|
|
project_root: &Path,
|
|
|
|
@@ -119,7 +196,7 @@ fn check_agent_limits(
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Count turns — same path as tool_get_agent_remaining_turns_and_budget.
|
|
|
|
|
// Count turns from log files.
|
|
|
|
|
let mut turns_used: u64 = 0;
|
|
|
|
|
if max_turns.is_some() {
|
|
|
|
|
let log_files =
|
|
|
|
@@ -138,13 +215,18 @@ fn check_agent_limits(
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Compute budget — same path as tool_get_agent_remaining_turns_and_budget.
|
|
|
|
|
// Compute budget from log-based per-message estimates (works for
|
|
|
|
|
// running agents) and completed-session records from token_usage.jsonl.
|
|
|
|
|
// Use whichever is higher — log estimates cover in-flight sessions,
|
|
|
|
|
// while token_usage.jsonl records are accurate for completed ones.
|
|
|
|
|
let budget_used_usd: f64 = if max_budget_usd.is_some() {
|
|
|
|
|
all_token_records
|
|
|
|
|
let log_cost = compute_budget_from_logs(project_root, story_id, agent_name);
|
|
|
|
|
let record_cost: f64 = all_token_records
|
|
|
|
|
.iter()
|
|
|
|
|
.filter(|r| r.story_id == *story_id && r.agent_name == *agent_name)
|
|
|
|
|
.map(|r| r.usage.total_cost_usd)
|
|
|
|
|
.sum()
|
|
|
|
|
.sum();
|
|
|
|
|
log_cost.max(record_cost)
|
|
|
|
|
} else {
|
|
|
|
|
0.0
|
|
|
|
|
};
|
|
|
|
@@ -213,6 +295,13 @@ impl AgentPool {
|
|
|
|
|
/// Run one watchdog pass: detect orphans, enforce limits, kill offenders.
|
|
|
|
|
///
|
|
|
|
|
/// Called by the unified background tick loop every 30 ticks.
|
|
|
|
|
///
|
|
|
|
|
/// When a limit is exceeded, the agent's PTY child is killed AND the story is
|
|
|
|
|
/// marked `blocked: true` in CRDT state. Marking blocked is essential — without
|
|
|
|
|
/// it, `auto_assign_available_work` immediately re-spawns the agent on the next
|
|
|
|
|
/// tick, the new session inherits the prior session's logs (so `turns_used`
|
|
|
|
|
/// keeps growing across sessions), and the agent burns through hundreds of
|
|
|
|
|
/// turns in a kill-respawn loop before any operator intervention.
|
|
|
|
|
pub fn run_watchdog_pass(&self, project_root: Option<&Path>) -> usize {
|
|
|
|
|
let orphaned = check_orphaned_agents(&self.agents);
|
|
|
|
|
|
|
|
|
@@ -227,6 +316,24 @@ impl AgentPool {
|
|
|
|
|
{
|
|
|
|
|
handle.abort();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Mark the story `blocked: true` so auto-assign skips it on the
|
|
|
|
|
// next tick. Without this, the kill-respawn loop described above
|
|
|
|
|
// burns the agent's budget across many short sessions.
|
|
|
|
|
let story_id = key.rsplit_once(':').map(|(s, _)| s).unwrap_or(key);
|
|
|
|
|
if let Some(contents) = crate::db::read_content(story_id) {
|
|
|
|
|
let blocked = crate::io::story_metadata::write_blocked_in_content(&contents);
|
|
|
|
|
crate::db::write_content(story_id, &blocked);
|
|
|
|
|
let stage = crate::pipeline_state::read_typed(story_id)
|
|
|
|
|
.ok()
|
|
|
|
|
.flatten()
|
|
|
|
|
.map(|i| i.stage.dir_name().to_string())
|
|
|
|
|
.unwrap_or_else(|| "2_current".to_string());
|
|
|
|
|
crate::db::write_item_with_content(story_id, &stage, &blocked);
|
|
|
|
|
slog!(
|
|
|
|
|
"[watchdog] Marked story '{story_id}' as blocked after limit termination."
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if !terminated.is_empty() {
|
|
|
|
|
Self::notify_agent_state_changed(&self.watcher_tx);
|
|
|
|
@@ -591,4 +698,69 @@ max_budget_usd = 5.00
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// ── Kill-respawn loop fix (bug 646) ──────────────────────────────────────
|
|
|
|
|
|
|
|
|
|
/// When the watchdog terminates an agent for limit-exceeded, it must mark
|
|
|
|
|
/// the story `blocked: true` in CRDT state so `auto_assign_available_work`
|
|
|
|
|
/// won't immediately re-spawn the agent on the next tick.
|
|
|
|
|
///
|
|
|
|
|
/// Without this, the kill-respawn loop produces the symptom observed in
|
|
|
|
|
/// production: the new session inherits the prior session's logs, so
|
|
|
|
|
/// `turns_used` keeps growing across sessions (e.g. 991/50 turns observed).
|
|
|
|
|
#[test]
|
|
|
|
|
fn watchdog_marks_story_blocked_after_limit_termination() {
|
|
|
|
|
crate::db::ensure_content_store();
|
|
|
|
|
|
|
|
|
|
let tmp = tempfile::tempdir().unwrap();
|
|
|
|
|
let root = tmp.path();
|
|
|
|
|
|
|
|
|
|
write_project_config(
|
|
|
|
|
root,
|
|
|
|
|
r#"
|
|
|
|
|
[[agent]]
|
|
|
|
|
name = "coder-1"
|
|
|
|
|
runtime = "claude-code"
|
|
|
|
|
max_turns = 10
|
|
|
|
|
"#,
|
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
// Write story content into the CRDT-backed content store so the
|
|
|
|
|
// watchdog's mark-blocked path has something to read+update.
|
|
|
|
|
let story_id = "42_story_runaway";
|
|
|
|
|
let initial = "---\nname: Runaway Story\n---\n# Runaway Story\n";
|
|
|
|
|
crate::db::write_content(story_id, initial);
|
|
|
|
|
|
|
|
|
|
// 12 turns exceeds the configured max of 10.
|
|
|
|
|
write_fake_log(root, story_id, "coder-1", 12);
|
|
|
|
|
|
|
|
|
|
let pool = AgentPool::new_test(3001);
|
|
|
|
|
let _tx = pool.inject_test_agent(story_id, "coder-1", AgentStatus::Running);
|
|
|
|
|
|
|
|
|
|
let found = pool.run_watchdog_pass(Some(root));
|
|
|
|
|
assert!(found >= 1, "watchdog should detect the over-limit agent");
|
|
|
|
|
|
|
|
|
|
// CORE INVARIANT: after termination, the story must be marked blocked.
|
|
|
|
|
// Without this, auto_assign_available_work would immediately re-spawn
|
|
|
|
|
// the agent on its next tick.
|
|
|
|
|
let updated = crate::db::read_content(story_id)
|
|
|
|
|
.expect("story content must still exist after watchdog termination");
|
|
|
|
|
assert!(
|
|
|
|
|
updated.contains("blocked: true"),
|
|
|
|
|
"story must be marked `blocked: true` after limit termination — got:\n{updated}"
|
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
// Sanity: the agent itself is also Failed with the right reason.
|
|
|
|
|
{
|
|
|
|
|
let agents = pool.agents.lock().unwrap();
|
|
|
|
|
let key = composite_key(story_id, "coder-1");
|
|
|
|
|
let agent = agents.get(&key).unwrap();
|
|
|
|
|
assert_eq!(agent.status, AgentStatus::Failed);
|
|
|
|
|
assert_eq!(
|
|
|
|
|
agent.termination_reason,
|
|
|
|
|
Some(TerminationReason::TurnLimit),
|
|
|
|
|
"termination reason must be TurnLimit"
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|