huskies: merge 1089 bug Stuck-agent detector blocks stories on legitimate exploration / debugging — uses too narrow a "progress" signal

2026-05-15 11:34:50 +00:00
parent fb1311cdae
commit c4010854a5
8 changed files with 217 additions and 28 deletions
@@ -78,21 +78,34 @@ impl AgentPool {
                    // The coder exited with uncommitted content but no commits
                    // (typical "claude-code session boundary mid-sweep" pattern).
                    // Use a PROGRESS-AWARE retry cap: the agent gets unlimited
-                    // respawns as long as file edits keep growing between
-                    // attempts; only when the worktree diff is byte-identical
-                    // to the previous attempt do we count it as "no progress".
-                    // After NO_PROGRESS_CAP consecutive no-progress respawns,
-                    // block for human attention.
+                    // respawns as long as progress is being made between attempts.
+                    // Progress is satisfied if EITHER (a) the worktree diff grew,
+                    // OR (b) the set of files the agent read grew.  Raw tool-call
+                    // count does NOT count — a looping agent can produce many calls.
+                    // Only self-exited sessions with no file or read progress count
+                    // toward the cap; forced exits (API error, network, budget
+                    // exhaustion) are excluded (story 1089).
+                    // After NO_PROGRESS_CAP consecutive qualifying no-progress
+                    // respawns, block for human attention.
                    //
                    // TOTAL_ATTEMPTS_CAP is the OUTER bound: even if the agent
                    // keeps making file-edit progress every session, after this
-                    // many total respawns without a commit we escalate — caught
-                    // the "agent flaps between different edits but never
-                    // commits" pattern that the progress-aware counter would
-                    // never trigger.
+                    // many total respawns without a commit we escalate — catches
+                    // the "agent flaps between different edits but never commits"
+                    // pattern that the progress-aware counter would never trigger.
                    const NO_PROGRESS_CAP: u32 = 3;
                    const TOTAL_ATTEMPTS_CAP: u32 = 8;

+                    // AC1: consume the forced-exit flag written by spawn.rs when
+                    // the agent process exited with a non-zero code.
+                    let forced_exit = crate::db::read_content(
+                        crate::db::ContentKey::CommitRecoveryForcedExit(story_id),
+                    )
+                    .is_some();
+                    crate::db::delete_content(crate::db::ContentKey::CommitRecoveryForcedExit(
+                        story_id,
+                    ));
+
                    let current_fingerprint = worktree_path.as_deref().and_then(|p| {
                        std::process::Command::new("git")
                            .args(["diff", "master"])
@@ -104,18 +117,31 @@ impl AgentPool {
                    let stored_fingerprint = crate::db::read_content(
                        crate::db::ContentKey::CommitRecoveryDiffFingerprint(story_id),
                    );
-                    let made_progress = current_fingerprint.is_some()
+                    let diff_progress = current_fingerprint.is_some()
                        && stored_fingerprint.as_ref() != current_fingerprint.as_ref();
-                    let no_progress_count = if made_progress || stored_fingerprint.is_none() {
+
+                    // AC2: check read-file set progress as an additional signal.
+                    let read_progress = previous_session_id.as_deref().is_some_and(|session_id| {
+                        collect_read_progress(&project_root, story_id, agent_name, session_id)
+                    });
+
+                    let made_progress = diff_progress || read_progress;
+
+                    let prev_no_progress_count = crate::db::read_content(
+                        crate::db::ContentKey::CommitRecoveryPending(story_id),
+                    )
+                    .and_then(|s| s.trim().parse::<u32>().ok())
+                    .unwrap_or(0);
+
+                    // AC1: forced exits do not increment the stuck-respawn counter.
+                    let no_progress_count = if forced_exit {
+                        prev_no_progress_count
+                    } else if made_progress || stored_fingerprint.is_none() {
                        1
                    } else {
-                        crate::db::read_content(crate::db::ContentKey::CommitRecoveryPending(
-                            story_id,
-                        ))
-                        .and_then(|s| s.trim().parse::<u32>().ok())
-                        .unwrap_or(0)
-                            + 1
+                        prev_no_progress_count + 1
                    };
+
                    let total_attempts = crate::db::read_content(
                        crate::db::ContentKey::CommitRecoveryTotalAttempts(story_id),
                    )
@@ -136,13 +162,17 @@ impl AgentPool {
                        crate::db::delete_content(
                            crate::db::ContentKey::CommitRecoveryTotalAttempts(story_id),
                        );
+                        crate::db::delete_content(crate::db::ContentKey::CommitRecoveryReadSet(
+                            story_id,
+                        ));
                        slog!(
                            "[pipeline] Coder '{agent_name}' for '{story_id}' hit total \
                             commit-recovery cap ({total_attempts}/{TOTAL_ATTEMPTS_CAP}) \
                             without a commit. Blocking story."
                        );
                        let reason = format!(
-                            "agent flapped — {total_attempts} respawns without ever committing"
+                            "commit absent after {total_attempts} respawns — \
+                             agent kept making edits but never committed"
                        );
                        if let Err(e) =
                            crate::agents::lifecycle::transition_to_blocked(story_id, &reason)
@@ -167,14 +197,18 @@ impl AgentPool {
                        crate::db::delete_content(
                            crate::db::ContentKey::CommitRecoveryTotalAttempts(story_id),
                        );
+                        crate::db::delete_content(crate::db::ContentKey::CommitRecoveryReadSet(
+                            story_id,
+                        ));
                        slog!(
                            "[pipeline] Coder '{agent_name}' for '{story_id}' made no \
-                             file-edit progress over {no_progress_count} consecutive \
-                             commit-recovery respawns. Blocking story."
+                             file or read progress over {no_progress_count} consecutive \
+                             self-exit commit-recovery respawns. Blocking story."
                        );
+                        // AC4: block message names the specific cause.
                        let reason = format!(
-                            "agent stuck — {no_progress_count} respawns without commits or \
-                             new file edits"
+                            "stuck-respawn cap reached: {NO_PROGRESS_CAP} consecutive \
+                             self-exits with no file or read progress"
                        );
                        if let Err(e) =
                            crate::agents::lifecycle::transition_to_blocked(story_id, &reason)
@@ -206,7 +240,8 @@ impl AgentPool {
                            "[pipeline] Coder '{agent_name}' exited with uncommitted work \
                             for '{story_id}' (no-progress {no_progress_count}/\
                             {NO_PROGRESS_CAP}, total {total_attempts}/\
-                             {TOTAL_ATTEMPTS_CAP}; progress_made={made_progress}). \
+                             {TOTAL_ATTEMPTS_CAP}; diff_progress={diff_progress}, \
+                             read_progress={read_progress}, forced_exit={forced_exit}). \
                             Issuing commit-only respawn."
                        );
                        let addendum = "\n\nYou have uncommitted work in this worktree. \
@@ -302,10 +337,13 @@ impl AgentPool {
                        });
                    }
                } else if completion.gates_passed {
-                    // Clear any stale recovery key when the coder succeeds normally.
+                    // Clear any stale recovery keys when the coder succeeds normally.
                    crate::db::delete_content(crate::db::ContentKey::CommitRecoveryPending(
                        story_id,
                    ));
+                    crate::db::delete_content(crate::db::ContentKey::CommitRecoveryReadSet(
+                        story_id,
+                    ));
                    // Determine effective QA mode for this story.
                    let qa_mode = {
                        let item_type = crate::agents::lifecycle::item_type_from_id(story_id);
@@ -361,11 +399,14 @@ impl AgentPool {
                        }
                    }
                } else {
-                    // Clear any stale recovery key when gates fail normally (agent committed
+                    // Clear any stale recovery keys when gates fail normally (agent committed
                    // but the build is broken — treat as a standard retry, not a recovery).
                    crate::db::delete_content(crate::db::ContentKey::CommitRecoveryPending(
                        story_id,
                    ));
+                    crate::db::delete_content(crate::db::ContentKey::CommitRecoveryReadSet(
+                        story_id,
+                    ));
                    // Bug 645 / 668: Before retry/block, check if the agent left committed
                    // work AND the agent had a passing run_tests result captured during its
                    // session.  An agent may crash mid-output (e.g. Claude Code CLI PTY write
@@ -724,6 +765,109 @@ mod helpers;
 use helpers::{resolve_qa_mode_from_store, write_review_hold_to_store};
 pub(crate) use helpers::{should_block_story, spawn_pipeline_advance};

+/// Parse a huskies agent log and return the set of file paths passed to the
+/// Read tool in that session.  Returns an empty set if the log cannot be read.
+///
+/// Used by [`collect_read_progress`] to detect read-exploration progress even
+/// when the worktree diff did not grow (story 1089, AC2).
+fn collect_read_files_from_log(
+    project_root: &std::path::Path,
+    story_id: &str,
+    agent_name: &str,
+    session_id: &str,
+) -> std::collections::HashSet<String> {
+    let log_path = crate::agent_log::log_file_path(project_root, story_id, agent_name, session_id);
+    let mut files = std::collections::HashSet::new();
+
+    let log_text = match std::fs::read_to_string(&log_path) {
+        Ok(t) => t,
+        Err(_) => return files,
+    };
+
+    for line in log_text.lines() {
+        let trimmed = line.trim();
+        if trimmed.is_empty() {
+            continue;
+        }
+        let entry: serde_json::Value = match serde_json::from_str(trimmed) {
+            Ok(v) => v,
+            Err(_) => continue,
+        };
+        // Only look at agent_json events where data.type == "assistant".
+        if entry.get("type").and_then(|t| t.as_str()) != Some("agent_json") {
+            continue;
+        }
+        let data = match entry.get("data") {
+            Some(d) => d,
+            None => continue,
+        };
+        if data.get("type").and_then(|t| t.as_str()) != Some("assistant") {
+            continue;
+        }
+        let content = match data.pointer("/message/content").and_then(|c| c.as_array()) {
+            Some(c) => c,
+            None => continue,
+        };
+        for item in content {
+            if item.get("type").and_then(|t| t.as_str()) != Some("tool_use") {
+                continue;
+            }
+            if item.get("name").and_then(|n| n.as_str()) != Some("Read") {
+                continue;
+            }
+            if let Some(path) = item.pointer("/input/file_path").and_then(|p| p.as_str()) {
+                files.insert(path.to_string());
+            }
+        }
+    }
+
+    files
+}
+
+/// Return `true` if the agent read any files in `session_id` that were not in
+/// the cumulative read set for `story_id`.  Updates the stored cumulative set
+/// when new files are found (story 1089, AC2).
+fn collect_read_progress(
+    project_root: &std::path::Path,
+    story_id: &str,
+    agent_name: &str,
+    session_id: &str,
+) -> bool {
+    let session_files = collect_read_files_from_log(project_root, story_id, agent_name, session_id);
+    if session_files.is_empty() {
+        return false;
+    }
+
+    let stored_set: std::collections::HashSet<String> =
+        crate::db::read_content(crate::db::ContentKey::CommitRecoveryReadSet(story_id))
+            .map(|s| {
+                s.lines()
+                    .filter(|l| !l.is_empty())
+                    .map(str::to_string)
+                    .collect()
+            })
+            .unwrap_or_default();
+
+    let union: std::collections::HashSet<String> =
+        stored_set.union(&session_files).cloned().collect();
+
+    if union.len() > stored_set.len() {
+        let mut sorted: Vec<&String> = union.iter().collect();
+        sorted.sort();
+        crate::db::write_content(
+            crate::db::ContentKey::CommitRecoveryReadSet(story_id),
+            &sorted
+                .into_iter()
+                .map(String::as_str)
+                .collect::<Vec<_>>()
+                .join("\n"),
+        );
+        true
+    } else {
+        false
+    }
+}
+
 #[cfg(test)]
 mod tests;
 #[cfg(test)]
@@ -1077,7 +1077,7 @@ stage = "coder"
        "Story must be blocked after NO_PROGRESS_CAP consecutive no-progress respawns"
    );
    assert!(
-        block_reason.contains("without commits or new file edits"),
+        block_reason.contains("self-exits with no file or read progress"),
        "Block reason should describe the no-progress condition, got: {block_reason}"
    );

@@ -1193,7 +1193,7 @@ stage = "coder"
        "Story must be blocked once total commit-recovery attempts hits the outer cap"
    );
    assert!(
-        block_reason.contains("flapped") && block_reason.contains("without ever committing"),
+        block_reason.contains("commit absent") && block_reason.contains("never committed"),
        "Block reason should describe the flapping pattern, got: {block_reason}"
    );

@@ -808,6 +808,7 @@ pub(super) async fn run_agent_spawn(
                    crate::db::delete_content(crate::db::ContentKey::CommitRecoveryTotalAttempts(
                        &sid,
                    ));
+                    crate::db::delete_content(crate::db::ContentKey::CommitRecoveryReadSet(&sid));

                    // Remove agent from the pool and unblock any wait_for_agent callers.
                    let tx_done = {
@@ -873,6 +874,17 @@ pub(super) async fn run_agent_spawn(
                    return;
                }

+                // AC1 (story 1089): mark forced exits so the commit-recovery
+                // stuck counter is not incremented for API errors, network
+                // failures, or Claude-API budget exhaustion.  A non-zero exit
+                // code means the CLI was forced out, not that it chose to stop.
+                if !result.exit_ok {
+                    crate::db::write_content(
+                        crate::db::ContentKey::CommitRecoveryForcedExit(&sid),
+                        "1",
+                    );
+                }
+
                // Server-owned completion: run acceptance gates automatically
                // when the agent process exits normally.
                super::super::pipeline::run_server_owned_completion(
@@ -1246,12 +1258,13 @@ mod tests {
                "abc123",
            );

-            // Rate-limit exit handler: reset all three counters (the fix).
+            // Rate-limit exit handler: reset all counters (the fix).
            crate::db::delete_content(crate::db::ContentKey::CommitRecoveryPending(story_id));
            crate::db::delete_content(crate::db::ContentKey::CommitRecoveryDiffFingerprint(
                story_id,
            ));
            crate::db::delete_content(crate::db::ContentKey::CommitRecoveryTotalAttempts(story_id));
+            crate::db::delete_content(crate::db::ContentKey::CommitRecoveryReadSet(story_id));

            // CommitRecoveryPending must be cleared after each rate-limit exit.
            assert!(
@@ -59,6 +59,7 @@ impl AgentRuntime for ClaudeCodeRuntime {
                // Abort+no-session: CLI crashed (e.g. SIGABRT) before emitting its
                // first "system" event.  Detected by: non-zero exit AND no session.
                aborted_signal: !result.exit_ok && result.session_id.is_none(),
+                exit_ok: result.exit_ok,
                session_id: result.session_id,
                token_usage: result.token_usage,
                rate_limit_exit: result.rate_limit_exit,
@@ -92,6 +93,7 @@ impl AgentRuntime for ClaudeCodeRuntime {
                Ok(RuntimeResult {
                    aborted_signal: !fallback_result.exit_ok
                        && fallback_result.session_id.is_none(),
+                    exit_ok: fallback_result.exit_ok,
                    session_id: fallback_result.session_id,
                    token_usage: fallback_result.token_usage,
                    rate_limit_exit: fallback_result.rate_limit_exit,
@@ -135,6 +135,7 @@ impl AgentRuntime for GeminiRuntime {
                return Ok(RuntimeResult {
                    session_id: None,
                    token_usage: Some(total_usage),
+                    exit_ok: true,
                    aborted_signal: false,
                    rate_limit_exit: false,
                    rate_limit_reset_at: None,
@@ -151,6 +152,7 @@ impl AgentRuntime for GeminiRuntime {
                return Ok(RuntimeResult {
                    session_id: None,
                    token_usage: Some(total_usage),
+                    exit_ok: true,
                    aborted_signal: false,
                    rate_limit_exit: false,
                    rate_limit_reset_at: None,
@@ -254,6 +256,7 @@ impl AgentRuntime for GeminiRuntime {
                return Ok(RuntimeResult {
                    session_id: None,
                    token_usage: Some(total_usage),
+                    exit_ok: true,
                    aborted_signal: false,
                    rate_limit_exit: false,
                    rate_limit_reset_at: None,
@@ -339,6 +342,7 @@ impl AgentRuntime for GeminiRuntime {
        Ok(RuntimeResult {
            session_id: None,
            token_usage: Some(total_usage),
+            exit_ok: true,
            aborted_signal: false,
            rate_limit_exit: false,
            rate_limit_reset_at: None,
@@ -55,6 +55,12 @@ pub struct RuntimeContext {
 pub struct RuntimeResult {
    pub session_id: Option<String>,
    pub token_usage: Option<TokenUsage>,
+    /// `true` when the process exited with exit code 0; `false` for non-zero exits
+    /// (API errors, network failures, or Claude-API-level budget exhaustion).  Always
+    /// `true` for API-based runtimes (OpenAI, Gemini) which have no exit-code concept.
+    /// Used by the commit-recovery path to skip the stuck-respawn counter for forced
+    /// exits (story 1089, AC1).
+    pub exit_ok: bool,
    /// `true` when the process exited with a failure AND no session was established.
    ///
    /// This indicates the Claude Code CLI crashed (e.g. SIGABRT from an assertion
@@ -169,6 +175,7 @@ mod tests {
                cache_read_input_tokens: 0,
                total_cost_usd: 0.01,
            }),
+            exit_ok: true,
            aborted_signal: false,
            rate_limit_exit: false,
            rate_limit_reset_at: None,
@@ -186,6 +193,7 @@ mod tests {
        let result = RuntimeResult {
            session_id: None,
            token_usage: None,
+            exit_ok: true,
            aborted_signal: false,
            rate_limit_exit: false,
            rate_limit_reset_at: None,
@@ -122,6 +122,7 @@ impl AgentRuntime for OpenAiRuntime {
                return Ok(RuntimeResult {
                    session_id: None,
                    token_usage: Some(total_usage),
+                    exit_ok: true,
                    aborted_signal: false,
                    rate_limit_exit: false,
                    rate_limit_reset_at: None,
@@ -138,6 +139,7 @@ impl AgentRuntime for OpenAiRuntime {
                return Ok(RuntimeResult {
                    session_id: None,
                    token_usage: Some(total_usage),
+                    exit_ok: true,
                    aborted_signal: false,
                    rate_limit_exit: false,
                    rate_limit_reset_at: None,
@@ -224,6 +226,7 @@ impl AgentRuntime for OpenAiRuntime {
                return Ok(RuntimeResult {
                    session_id: None,
                    token_usage: Some(total_usage),
+                    exit_ok: true,
                    aborted_signal: false,
                    rate_limit_exit: false,
                    rate_limit_reset_at: None,