From 6feb68f3e335b97f4f1be6def56e9d97bff97c5c Mon Sep 17 00:00:00 2001 From: Timmy Date: Tue, 12 May 2026 17:25:11 +0100 Subject: [PATCH] fix(923): watchdog counts only tool-using turns; narration-only turns no longer burn budget MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Observed: stories 917, 918, 920, 910 all turn-limit-killed despite producing real commits. Tally across their session logs shows 30–55% of assistant turns were pure narration ("I'll read X next", "Now let me check Y") with no tool_use. At 80 max_turns the effective work budget was ~44 tool calls, not enough for a typical bug fix's edit + test + check_criterion cycle. Changes: - New optional AgentConfig field max_tool_turns. When set the watchdog uses it instead of max_turns; only assistant messages whose data.message.content has at least one tool_use block count. - count_turns_in_log in agents/pool/auto_assign/watchdog/limits.rs filters on tool_use. Existing test helper write_fake_session_log now emits tool_use blocks; added write_fake_mixed_session_log for the narration regression test. - agents.toml: coders/coder-opus get max_turns=200 (claude-code's own --max-turns cap, sized to never bite before the watchdog) and max_tool_turns=80. qa: 120 / 40. mergemaster: 250 / 100. Budgets unchanged — the dollar cap remains the runaway-loop backstop, with ~$3-5 worst-case waste if an agent narrates indefinitely. - Two new regression tests: * watchdog_does_not_count_narration_only_turns: 5 tool + 30 narration under max_tool_turns=10 stays Running. * watchdog_max_tool_turns_overrides_max_turns: 4 tool turns at max_tool_turns=3 / max_turns=200 still terminates with TurnLimit. Co-Authored-By: Claude Opus 4.7 (1M context) --- .huskies/agents.toml | 21 +++-- .../pool/auto_assign/watchdog/limits.rs | 38 ++++++-- .../watchdog/tests/limits_tests.rs | 94 ++++++++++++++++++- .../pool/auto_assign/watchdog/tests/mod.rs | 56 ++++++++++- server/src/config/mod.rs | 10 ++ 5 files changed, 197 insertions(+), 22 deletions(-) diff --git a/.huskies/agents.toml b/.huskies/agents.toml index 5b63d052..051985ef 100644 --- a/.huskies/agents.toml +++ b/.huskies/agents.toml @@ -3,7 +3,8 @@ name = "coder-1" stage = "coder" role = "Full-stack engineer. Implements features across all components." model = "sonnet" -max_turns = 80 +max_turns = 200 +max_tool_turns = 80 max_budget_usd = 5.00 disallowed_tools = ["ScheduleWakeup"] prompt ="You are working in a git worktree on story {{story_id}}. Read CLAUDE.md first, then .huskies/README.md for the dev process, .huskies/specs/00_CONTEXT.md for what this project does, and .huskies/specs/tech/STACK.md for the tech stack and source map. The story details are in your prompt above. The worktree and feature branch already exist - do not create them.\n\n## Your workflow\n1. Read the story and understand the acceptance criteria.\n2. Implement the changes.\n3. As you complete each acceptance criterion, call check_criterion MCP tool to mark it done.\n4. Run the run_tests MCP tool. It blocks server-side until tests finish (up to 20 minutes) and returns the full result. Do NOT call get_test_result — run_tests already gives you the pass/fail outcome.\n5. If tests fail, fix the failures and run run_tests again. Do not commit until tests pass.\n6. Once tests pass, commit your work with a descriptive message and exit.\n\nDo NOT accept stories, move them between stages, or merge to master. The server handles all of that after you exit.\n\n## Bug Workflow: Trust the Story, Act Fast\nWhen working on bugs:\n1. READ THE STORY DESCRIPTION FIRST. If it specifies exact files, functions, and line numbers — go directly there and make the fix.\n2. If the story does NOT specify the exact location, investigate with targeted grep.\n3. Fix with a surgical, minimal change.\n4. Run tests, fix failures, commit and exit.\n5. Write commit messages that explain what broke and why." @@ -14,7 +15,8 @@ name = "coder-2" stage = "coder" role = "Full-stack engineer. Implements features across all components." model = "sonnet" -max_turns = 80 +max_turns = 200 +max_tool_turns = 80 max_budget_usd = 5.00 disallowed_tools = ["ScheduleWakeup"] prompt ="You are working in a git worktree on story {{story_id}}. Read CLAUDE.md first, then .huskies/README.md for the dev process, .huskies/specs/00_CONTEXT.md for what this project does, and .huskies/specs/tech/STACK.md for the tech stack and source map. The story details are in your prompt above. The worktree and feature branch already exist - do not create them.\n\n## Your workflow\n1. Read the story and understand the acceptance criteria.\n2. Implement the changes.\n3. As you complete each acceptance criterion, call check_criterion MCP tool to mark it done.\n4. Run the run_tests MCP tool. It blocks server-side until tests finish (up to 20 minutes) and returns the full result. Do NOT call get_test_result — run_tests already gives you the pass/fail outcome.\n5. If tests fail, fix the failures and run run_tests again. Do not commit until tests pass.\n6. Once tests pass, commit your work with a descriptive message and exit.\n\nDo NOT accept stories, move them between stages, or merge to master. The server handles all of that after you exit.\n\n## Bug Workflow: Trust the Story, Act Fast\nWhen working on bugs:\n1. READ THE STORY DESCRIPTION FIRST. If it specifies exact files, functions, and line numbers — go directly there and make the fix.\n2. If the story does NOT specify the exact location, investigate with targeted grep.\n3. Fix with a surgical, minimal change.\n4. Run tests, fix failures, commit and exit.\n5. Write commit messages that explain what broke and why." @@ -25,7 +27,8 @@ name = "coder-3" stage = "coder" role = "Full-stack engineer. Implements features across all components." model = "sonnet" -max_turns = 80 +max_turns = 200 +max_tool_turns = 80 max_budget_usd = 5.00 disallowed_tools = ["ScheduleWakeup"] prompt ="You are working in a git worktree on story {{story_id}}. Read CLAUDE.md first, then .huskies/README.md for the dev process, .huskies/specs/00_CONTEXT.md for what this project does, and .huskies/specs/tech/STACK.md for the tech stack and source map. The story details are in your prompt above. The worktree and feature branch already exist - do not create them.\n\n## Your workflow\n1. Read the story and understand the acceptance criteria.\n2. Implement the changes.\n3. As you complete each acceptance criterion, call check_criterion MCP tool to mark it done.\n4. Run the run_tests MCP tool. It blocks server-side until tests finish (up to 20 minutes) and returns the full result. Do NOT call get_test_result — run_tests already gives you the pass/fail outcome.\n5. If tests fail, fix the failures and run run_tests again. Do not commit until tests pass.\n6. Once tests pass, commit your work with a descriptive message and exit.\n\nDo NOT accept stories, move them between stages, or merge to master. The server handles all of that after you exit.\n\n## Bug Workflow: Trust the Story, Act Fast\nWhen working on bugs:\n1. READ THE STORY DESCRIPTION FIRST. If it specifies exact files, functions, and line numbers — go directly there and make the fix.\n2. If the story does NOT specify the exact location, investigate with targeted grep.\n3. Fix with a surgical, minimal change.\n4. Run tests, fix failures, commit and exit.\n5. Write commit messages that explain what broke and why." @@ -36,7 +39,8 @@ name = "qa-2" stage = "qa" role = "Reviews coder work in worktrees: runs quality gates, verifies acceptance criteria, and reports findings." model = "sonnet" -max_turns = 40 +max_turns = 120 +max_tool_turns = 40 max_budget_usd = 4.00 prompt = """You are the QA agent for story {{story_id}}. Your job is to verify the coder's work satisfies the story's acceptance criteria and produce a structured QA report. @@ -127,7 +131,8 @@ name = "coder-opus" stage = "coder" role = "Senior full-stack engineer for complex tasks. Implements features across all components." model = "opus" -max_turns = 80 +max_turns = 200 +max_tool_turns = 80 max_budget_usd = 20.00 disallowed_tools = ["ScheduleWakeup"] prompt ="You are working in a git worktree on story {{story_id}}. Read CLAUDE.md first, then .huskies/README.md for the dev process, .huskies/specs/00_CONTEXT.md for what this project does, and .huskies/specs/tech/STACK.md for the tech stack and source map. The story details are in your prompt above. The worktree and feature branch already exist - do not create them.\n\n## Your workflow\n1. Read the story and understand the acceptance criteria.\n2. Implement the changes.\n3. As you complete each acceptance criterion, call check_criterion MCP tool to mark it done.\n4. Run the run_tests MCP tool. It blocks server-side until tests finish (up to 20 minutes) and returns the full result. Do NOT call get_test_result — run_tests already gives you the pass/fail outcome.\n5. If tests fail, fix the failures and run run_tests again. Do not commit until tests pass.\n6. Once tests pass, commit your work with a descriptive message and exit.\n\nDo NOT accept stories, move them between stages, or merge to master. The server handles all of that after you exit.\n\n## Bug Workflow: Trust the Story, Act Fast\nWhen working on bugs:\n1. READ THE STORY DESCRIPTION FIRST. If it specifies exact files, functions, and line numbers — go directly there and make the fix.\n2. If the story does NOT specify the exact location, investigate with targeted grep.\n3. Fix with a surgical, minimal change.\n4. Run tests, fix failures, commit and exit.\n5. Write commit messages that explain what broke and why." @@ -138,7 +143,8 @@ name = "qa" stage = "qa" role = "Reviews coder work in worktrees: runs quality gates, verifies acceptance criteria, and reports findings." model = "sonnet" -max_turns = 40 +max_turns = 120 +max_tool_turns = 40 max_budget_usd = 4.00 prompt = """You are the QA agent for story {{story_id}}. Your job is to verify the coder's work satisfies the story's acceptance criteria and produce a structured QA report. @@ -229,7 +235,8 @@ name = "mergemaster" stage = "mergemaster" role = "Merges completed coder work into master, runs quality gates, archives stories, and cleans up worktrees." model = "opus" -max_turns = 100 +max_turns = 250 +max_tool_turns = 100 max_budget_usd = 25.00 inactivity_timeout_secs = 900 prompt = """You are the mergemaster agent for story {{story_id}}. Your job is to merge the completed coder work into master. diff --git a/server/src/agents/pool/auto_assign/watchdog/limits.rs b/server/src/agents/pool/auto_assign/watchdog/limits.rs index 6124ef94..3de925dc 100644 --- a/server/src/agents/pool/auto_assign/watchdog/limits.rs +++ b/server/src/agents/pool/auto_assign/watchdog/limits.rs @@ -41,7 +41,13 @@ pub(crate) fn resolve_session_log( crate::agent_log::find_latest_log(project_root, story_id, agent_name) } -/// Count `assistant` events in a single log file. +/// Count **tool-using** assistant turns in a single log file (story 923). +/// +/// A turn counts only if its assistant message contains at least one +/// `content` block with `type == "tool_use"`. Narration-only turns +/// (text-only assistant messages such as "I'll read X next" preambles) +/// don't count against the watchdog limit — they make no progress, only +/// burn budget, and Sonnet emits them at roughly 30–55% of all turns. pub(crate) fn count_turns_in_log(path: &Path) -> u64 { let entries = match crate::agent_log::read_log(path) { Ok(e) => e, @@ -50,13 +56,24 @@ pub(crate) fn count_turns_in_log(path: &Path) -> u64 { entries .iter() .filter(|entry| { - entry.event.get("type").and_then(|v| v.as_str()) == Some("agent_json") - && entry - .event - .get("data") - .and_then(|d| d.get("type")) - .and_then(|v| v.as_str()) - == Some("assistant") + if entry.event.get("type").and_then(|v| v.as_str()) != Some("agent_json") { + return false; + } + let data = match entry.event.get("data") { + Some(d) => d, + None => return false, + }; + if data.get("type").and_then(|v| v.as_str()) != Some("assistant") { + return false; + } + // Require at least one tool_use content block. + data.pointer("/message/content") + .and_then(|c| c.as_array()) + .map(|arr| { + arr.iter() + .any(|item| item.get("type").and_then(|v| v.as_str()) == Some("tool_use")) + }) + .unwrap_or(false) }) .count() as u64 } @@ -103,7 +120,10 @@ pub(super) fn check_agent_limits( for (key, story_id, agent_name, tx, log_session_id) in &running { let agent_config = config.agent.iter().find(|a| a.name == *agent_name); - let max_turns = agent_config.and_then(|a| a.max_turns); + // The watchdog gates on max_tool_turns (counts only tool-using + // assistant turns) when set; otherwise falls back to max_turns for + // backwards compatibility with configs that haven't migrated yet. + let max_turns = agent_config.and_then(|a| a.max_tool_turns.or(a.max_turns)); let max_budget_usd = agent_config.and_then(|a| a.max_budget_usd); // Skip agents with no limits configured. diff --git a/server/src/agents/pool/auto_assign/watchdog/tests/limits_tests.rs b/server/src/agents/pool/auto_assign/watchdog/tests/limits_tests.rs index b3aa0092..f68ab556 100644 --- a/server/src/agents/pool/auto_assign/watchdog/tests/limits_tests.rs +++ b/server/src/agents/pool/auto_assign/watchdog/tests/limits_tests.rs @@ -2,7 +2,10 @@ //! for the watchdog (bugs 624, 646, 650). use super::super::super::super::{AgentPool, composite_key}; -use super::{write_fake_budget_session_log, write_fake_session_log, write_project_config}; +use super::{ + write_fake_budget_session_log, write_fake_mixed_session_log, write_fake_session_log, + write_project_config, +}; use crate::agents::{AgentEvent, AgentStatus, TerminationReason}; // ── Limit enforcement integration tests (bug 624) ──────────────────────── @@ -521,3 +524,92 @@ max_turns = 10 ); } } + +// ── Narration filter (story 923) ──────────────────────────────────────── + +/// Story 923: narration-only assistant turns (text-only, no `tool_use`) +/// must not count against the watchdog's turn budget. A session log with +/// 5 tool turns and 30 narration turns reports turns_used == 5, so an +/// agent with max_tool_turns = 10 stays Running. +#[test] +fn watchdog_does_not_count_narration_only_turns() { + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + + write_project_config( + root, + r#" +[[agent]] +name = "coder-1" +runtime = "claude-code" +max_tool_turns = 10 +max_turns = 200 +"#, + ); + + // 5 tool-using turns + 30 narration-only turns. + // Without the filter, total turns = 35 > max_tool_turns = 10 → kill. + // With the filter, tool turns = 5 ≤ 10 → stay Running. + write_fake_mixed_session_log(root, "story_923", "coder-1", "sess-narr", 5, 30); + + let pool = AgentPool::new_test(3001); + pool.inject_test_agent_with_session("story_923", "coder-1", AgentStatus::Running, "sess-narr"); + + let found = pool.run_watchdog_pass(Some(root)); + assert_eq!( + found, 0, + "agent must not be terminated: only 5 tool turns of a 10-turn budget" + ); + + let agents = pool.agents.lock().unwrap(); + let key = composite_key("story_923", "coder-1"); + let agent = agents.get(&key).unwrap(); + assert_eq!(agent.status, AgentStatus::Running); + assert!(agent.termination_reason.is_none()); +} + +/// Story 923: max_tool_turns takes precedence over max_turns when both are +/// set. With max_tool_turns = 3 and max_turns = 200, an agent that has 4 +/// tool turns is killed even though total turns (4) is far below max_turns. +#[test] +fn watchdog_max_tool_turns_overrides_max_turns() { + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + + write_project_config( + root, + r#" +[[agent]] +name = "coder-1" +runtime = "claude-code" +max_tool_turns = 3 +max_turns = 200 +"#, + ); + + write_fake_mixed_session_log(root, "story_923b", "coder-1", "sess-over", 4, 0); + + let pool = AgentPool::new_test(3001); + let tx = pool.inject_test_agent_with_session( + "story_923b", + "coder-1", + AgentStatus::Running, + "sess-over", + ); + let mut rx = tx.subscribe(); + + let found = pool.run_watchdog_pass(Some(root)); + assert!( + found >= 1, + "watchdog must terminate when tool turns exceed max_tool_turns" + ); + + let agents = pool.agents.lock().unwrap(); + let key = composite_key("story_923b", "coder-1"); + let agent = agents.get(&key).unwrap(); + assert_eq!(agent.status, AgentStatus::Failed); + assert_eq!(agent.termination_reason, Some(TerminationReason::TurnLimit)); + + let event = rx.try_recv().expect("watchdog must emit an Error event"); + assert!(matches!(event, AgentEvent::Error { .. })); +} diff --git a/server/src/agents/pool/auto_assign/watchdog/tests/mod.rs b/server/src/agents/pool/auto_assign/watchdog/tests/mod.rs index 318c55b8..326c2127 100644 --- a/server/src/agents/pool/auto_assign/watchdog/tests/mod.rs +++ b/server/src/agents/pool/auto_assign/watchdog/tests/mod.rs @@ -5,29 +5,75 @@ use std::path::Path; mod limits_tests; mod orphan_tests; -/// Write a fake session log file with `n` assistant turn entries. +/// Write a fake session log file with `n` tool-using assistant turn entries. /// -/// The file is named `{agent_name}-{session_id}.log` to match the -/// real naming convention used by `AgentLogWriter`. +/// Each turn includes a single `tool_use` content block so it counts under +/// the watchdog's tool-only turn filter (story 923). The file is named +/// `{agent_name}-{session_id}.log` to match the real naming convention +/// used by `AgentLogWriter`. pub(super) fn write_fake_session_log( project_root: &Path, story_id: &str, agent_name: &str, session_id: &str, n_turns: u64, +) { + write_fake_mixed_session_log(project_root, story_id, agent_name, session_id, n_turns, 0); +} + +/// Write a fake session log with `n_tool` tool-using assistant turns and +/// `n_narration` text-only narration turns (story 923 regression). +/// +/// Tool turns contain a `tool_use` content block; narration turns contain +/// only a `text` block. The watchdog's `count_turns_in_log` must report +/// `n_tool` only — narration must not count. +pub(super) fn write_fake_mixed_session_log( + project_root: &Path, + story_id: &str, + agent_name: &str, + session_id: &str, + n_tool: u64, + n_narration: u64, ) { let log_dir = project_root.join(".huskies").join("logs").join(story_id); std::fs::create_dir_all(&log_dir).unwrap(); let log_path = log_dir.join(format!("{agent_name}-{session_id}.log")); let mut content = String::new(); - for _ in 0..n_turns { + for _ in 0..n_tool { content.push_str( &serde_json::to_string(&serde_json::json!({ "timestamp": "2026-04-25T00:00:00Z", "type": "agent_json", "story_id": story_id, "agent_name": agent_name, - "data": { "type": "assistant", "message": {} } + "data": { + "type": "assistant", + "message": { + "content": [ + { "type": "tool_use", "name": "Read", "input": {} } + ] + } + } + })) + .unwrap(), + ); + content.push('\n'); + } + for _ in 0..n_narration { + content.push_str( + &serde_json::to_string(&serde_json::json!({ + "timestamp": "2026-04-25T00:00:00Z", + "type": "agent_json", + "story_id": story_id, + "agent_name": agent_name, + "data": { + "type": "assistant", + "message": { + "content": [ + { "type": "text", "text": "Now let me read the next file." } + ] + } + } })) .unwrap(), ); diff --git a/server/src/config/mod.rs b/server/src/config/mod.rs index 706df709..6f2d582c 100644 --- a/server/src/config/mod.rs +++ b/server/src/config/mod.rs @@ -242,6 +242,15 @@ pub struct AgentConfig { pub disallowed_tools: Option>, #[serde(default)] pub max_turns: Option, + /// Watchdog cap on **tool-using** assistant turns (story 923). + /// + /// Narration-only turns (assistant messages with no `tool_use` block) + /// don't count against this. When unset, the watchdog falls back to + /// `max_turns`. Set this lower than `max_turns` so claude-code's own + /// `--max-turns` cap (which counts narration) doesn't fire before the + /// watchdog. + #[serde(default)] + pub max_tool_turns: Option, #[serde(default)] pub max_budget_usd: Option, #[serde(default)] @@ -325,6 +334,7 @@ impl Default for ProjectConfig { allowed_tools: None, disallowed_tools: None, max_turns: None, + max_tool_turns: None, max_budget_usd: None, system_prompt: None, stage: None,