fix(923): watchdog counts only tool-using turns; narration-only turns no longer burn budget

Observed: stories 917, 918, 920, 910 all turn-limit-killed despite producing real commits. Tally across their session logs shows 30–55% of assistant turns were pure narration ("I'll read X next", "Now let me check Y") with no tool_use. At 80 max_turns the effective work budget was ~44 tool calls, not enough for a typical bug fix's edit + test + check_criterion cycle. Changes: - New optional AgentConfig field max_tool_turns. When set the watchdog uses it instead of max_turns; only assistant messages whose data.message.content has at least one tool_use block count. - count_turns_in_log in agents/pool/auto_assign/watchdog/limits.rs filters on tool_use. Existing test helper write_fake_session_log now emits tool_use blocks; added write_fake_mixed_session_log for the narration regression test. - agents.toml: coders/coder-opus get max_turns=200 (claude-code's own --max-turns cap, sized to never bite before the watchdog) and max_tool_turns=80. qa: 120 / 40. mergemaster: 250 / 100. Budgets unchanged — the dollar cap remains the runaway-loop backstop, with ~$3-5 worst-case waste if an agent narrates indefinitely. - Two new regression tests: * watchdog_does_not_count_narration_only_turns: 5 tool + 30 narration under max_tool_turns=10 stays Running. * watchdog_max_tool_turns_overrides_max_turns: 4 tool turns at max_tool_turns=3 / max_turns=200 still terminates with TurnLimit. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-12 17:25:11 +01:00
parent ce07c4d7b7
commit 6feb68f3e3
5 changed files with 197 additions and 22 deletions
@@ -41,7 +41,13 @@ pub(crate) fn resolve_session_log(
    crate::agent_log::find_latest_log(project_root, story_id, agent_name)
 }

-/// Count `assistant` events in a single log file.
+/// Count **tool-using** assistant turns in a single log file (story 923).
+///
+/// A turn counts only if its assistant message contains at least one
+/// `content` block with `type == "tool_use"`. Narration-only turns
+/// (text-only assistant messages such as "I'll read X next" preambles)
+/// don't count against the watchdog limit — they make no progress, only
+/// burn budget, and Sonnet emits them at roughly 30–55% of all turns.
 pub(crate) fn count_turns_in_log(path: &Path) -> u64 {
    let entries = match crate::agent_log::read_log(path) {
        Ok(e) => e,
@@ -50,13 +56,24 @@ pub(crate) fn count_turns_in_log(path: &Path) -> u64 {
    entries
        .iter()
        .filter(|entry| {
-            entry.event.get("type").and_then(|v| v.as_str()) == Some("agent_json")
-                && entry
-                    .event
-                    .get("data")
-                    .and_then(|d| d.get("type"))
-                    .and_then(|v| v.as_str())
-                    == Some("assistant")
+            if entry.event.get("type").and_then(|v| v.as_str()) != Some("agent_json") {
+                return false;
+            }
+            let data = match entry.event.get("data") {
+                Some(d) => d,
+                None => return false,
+            };
+            if data.get("type").and_then(|v| v.as_str()) != Some("assistant") {
+                return false;
+            }
+            // Require at least one tool_use content block.
+            data.pointer("/message/content")
+                .and_then(|c| c.as_array())
+                .map(|arr| {
+                    arr.iter()
+                        .any(|item| item.get("type").and_then(|v| v.as_str()) == Some("tool_use"))
+                })
+                .unwrap_or(false)
        })
        .count() as u64
 }
@@ -103,7 +120,10 @@ pub(super) fn check_agent_limits(

    for (key, story_id, agent_name, tx, log_session_id) in &running {
        let agent_config = config.agent.iter().find(|a| a.name == *agent_name);
-        let max_turns = agent_config.and_then(|a| a.max_turns);
+        // The watchdog gates on max_tool_turns (counts only tool-using
+        // assistant turns) when set; otherwise falls back to max_turns for
+        // backwards compatibility with configs that haven't migrated yet.
+        let max_turns = agent_config.and_then(|a| a.max_tool_turns.or(a.max_turns));
        let max_budget_usd = agent_config.and_then(|a| a.max_budget_usd);

        // Skip agents with no limits configured.
@@ -2,7 +2,10 @@
 //! for the watchdog (bugs 624, 646, 650).

 use super::super::super::super::{AgentPool, composite_key};
-use super::{write_fake_budget_session_log, write_fake_session_log, write_project_config};
+use super::{
+    write_fake_budget_session_log, write_fake_mixed_session_log, write_fake_session_log,
+    write_project_config,
+};
 use crate::agents::{AgentEvent, AgentStatus, TerminationReason};

 // ── Limit enforcement integration tests (bug 624) ────────────────────────
@@ -521,3 +524,92 @@ max_turns = 10
        );
    }
 }
+
+// ── Narration filter (story 923) ────────────────────────────────────────
+
+/// Story 923: narration-only assistant turns (text-only, no `tool_use`)
+/// must not count against the watchdog's turn budget. A session log with
+/// 5 tool turns and 30 narration turns reports turns_used == 5, so an
+/// agent with max_tool_turns = 10 stays Running.
+#[test]
+fn watchdog_does_not_count_narration_only_turns() {
+    let tmp = tempfile::tempdir().unwrap();
+    let root = tmp.path();
+
+    write_project_config(
+        root,
+        r#"
+[[agent]]
+name = "coder-1"
+runtime = "claude-code"
+max_tool_turns = 10
+max_turns = 200
+"#,
+    );
+
+    // 5 tool-using turns + 30 narration-only turns.
+    // Without the filter, total turns = 35 > max_tool_turns = 10 → kill.
+    // With the filter, tool turns = 5 ≤ 10 → stay Running.
+    write_fake_mixed_session_log(root, "story_923", "coder-1", "sess-narr", 5, 30);
+
+    let pool = AgentPool::new_test(3001);
+    pool.inject_test_agent_with_session("story_923", "coder-1", AgentStatus::Running, "sess-narr");
+
+    let found = pool.run_watchdog_pass(Some(root));
+    assert_eq!(
+        found, 0,
+        "agent must not be terminated: only 5 tool turns of a 10-turn budget"
+    );
+
+    let agents = pool.agents.lock().unwrap();
+    let key = composite_key("story_923", "coder-1");
+    let agent = agents.get(&key).unwrap();
+    assert_eq!(agent.status, AgentStatus::Running);
+    assert!(agent.termination_reason.is_none());
+}
+
+/// Story 923: max_tool_turns takes precedence over max_turns when both are
+/// set. With max_tool_turns = 3 and max_turns = 200, an agent that has 4
+/// tool turns is killed even though total turns (4) is far below max_turns.
+#[test]
+fn watchdog_max_tool_turns_overrides_max_turns() {
+    let tmp = tempfile::tempdir().unwrap();
+    let root = tmp.path();
+
+    write_project_config(
+        root,
+        r#"
+[[agent]]
+name = "coder-1"
+runtime = "claude-code"
+max_tool_turns = 3
+max_turns = 200
+"#,
+    );
+
+    write_fake_mixed_session_log(root, "story_923b", "coder-1", "sess-over", 4, 0);
+
+    let pool = AgentPool::new_test(3001);
+    let tx = pool.inject_test_agent_with_session(
+        "story_923b",
+        "coder-1",
+        AgentStatus::Running,
+        "sess-over",
+    );
+    let mut rx = tx.subscribe();
+
+    let found = pool.run_watchdog_pass(Some(root));
+    assert!(
+        found >= 1,
+        "watchdog must terminate when tool turns exceed max_tool_turns"
+    );
+
+    let agents = pool.agents.lock().unwrap();
+    let key = composite_key("story_923b", "coder-1");
+    let agent = agents.get(&key).unwrap();
+    assert_eq!(agent.status, AgentStatus::Failed);
+    assert_eq!(agent.termination_reason, Some(TerminationReason::TurnLimit));
+
+    let event = rx.try_recv().expect("watchdog must emit an Error event");
+    assert!(matches!(event, AgentEvent::Error { .. }));
+}
@@ -5,29 +5,75 @@ use std::path::Path;
 mod limits_tests;
 mod orphan_tests;

-/// Write a fake session log file with `n` assistant turn entries.
+/// Write a fake session log file with `n` tool-using assistant turn entries.
 ///
-/// The file is named `{agent_name}-{session_id}.log` to match the
-/// real naming convention used by `AgentLogWriter`.
+/// Each turn includes a single `tool_use` content block so it counts under
+/// the watchdog's tool-only turn filter (story 923). The file is named
+/// `{agent_name}-{session_id}.log` to match the real naming convention
+/// used by `AgentLogWriter`.
 pub(super) fn write_fake_session_log(
    project_root: &Path,
    story_id: &str,
    agent_name: &str,
    session_id: &str,
    n_turns: u64,
+) {
+    write_fake_mixed_session_log(project_root, story_id, agent_name, session_id, n_turns, 0);
+}
+
+/// Write a fake session log with `n_tool` tool-using assistant turns and
+/// `n_narration` text-only narration turns (story 923 regression).
+///
+/// Tool turns contain a `tool_use` content block; narration turns contain
+/// only a `text` block. The watchdog's `count_turns_in_log` must report
+/// `n_tool` only — narration must not count.
+pub(super) fn write_fake_mixed_session_log(
+    project_root: &Path,
+    story_id: &str,
+    agent_name: &str,
+    session_id: &str,
+    n_tool: u64,
+    n_narration: u64,
 ) {
    let log_dir = project_root.join(".huskies").join("logs").join(story_id);
    std::fs::create_dir_all(&log_dir).unwrap();
    let log_path = log_dir.join(format!("{agent_name}-{session_id}.log"));
    let mut content = String::new();
-    for _ in 0..n_turns {
+    for _ in 0..n_tool {
        content.push_str(
            &serde_json::to_string(&serde_json::json!({
                "timestamp": "2026-04-25T00:00:00Z",
                "type": "agent_json",
                "story_id": story_id,
                "agent_name": agent_name,
-                "data": { "type": "assistant", "message": {} }
+                "data": {
+                    "type": "assistant",
+                    "message": {
+                        "content": [
+                            { "type": "tool_use", "name": "Read", "input": {} }
+                        ]
+                    }
+                }
+            }))
+            .unwrap(),
+        );
+        content.push('\n');
+    }
+    for _ in 0..n_narration {
+        content.push_str(
+            &serde_json::to_string(&serde_json::json!({
+                "timestamp": "2026-04-25T00:00:00Z",
+                "type": "agent_json",
+                "story_id": story_id,
+                "agent_name": agent_name,
+                "data": {
+                    "type": "assistant",
+                    "message": {
+                        "content": [
+                            { "type": "text", "text": "Now let me read the next file." }
+                        ]
+                    }
+                }
            }))
            .unwrap(),
        );
@@ -242,6 +242,15 @@ pub struct AgentConfig {
    pub disallowed_tools: Option<Vec<String>>,
    #[serde(default)]
    pub max_turns: Option<u32>,
+    /// Watchdog cap on **tool-using** assistant turns (story 923).
+    ///
+    /// Narration-only turns (assistant messages with no `tool_use` block)
+    /// don't count against this. When unset, the watchdog falls back to
+    /// `max_turns`. Set this lower than `max_turns` so claude-code's own
+    /// `--max-turns` cap (which counts narration) doesn't fire before the
+    /// watchdog.
+    #[serde(default)]
+    pub max_tool_turns: Option<u32>,
    #[serde(default)]
    pub max_budget_usd: Option<f64>,
    #[serde(default)]
@@ -325,6 +334,7 @@ impl Default for ProjectConfig {
                allowed_tools: None,
                disallowed_tools: None,
                max_turns: None,
+                max_tool_turns: None,
                max_budget_usd: None,
                system_prompt: None,
                stage: None,