huskies: merge 1053

dave
2026-05-14 18:32:43 +00:00
parent bb5abcd042
commit 96e227d8d4
7 changed files with 215 additions and 0 deletions
+161
@@ -798,6 +798,89 @@ pub(super) async fn run_agent_spawn(
});
}
} else {
// Rate-limit exit: reset commit-recovery counters and respawn
// without consuming a retry slot (bug 1053).
if result.rate_limit_exit {
slog!(
"[agents] Rate-limit exit for '{sid}:{aname}': resetting \
commit-recovery counters and respawning without consuming \
a retry slot."
);
// Deleting these counters prevents the no-progress logic from
// treating this exit as an agent-stuck event.
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryPending(&sid));
crate::db::delete_content(
crate::db::ContentKey::CommitRecoveryDiffFingerprint(&sid),
);
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryTotalAttempts(
&sid,
));
// Remove agent from the pool and unblock any wait_for_agent callers.
let tx_done = {
let mut lock = match agents_ref.lock() {
Ok(a) => a,
Err(_) => return,
};
if let Some(agent) = lock.remove(&key_clone) {
agent.tx
} else {
tx_clone.clone()
}
};
let _ = tx_done.send(AgentEvent::Done {
story_id: sid.clone(),
agent_name: aname.clone(),
session_id: result.session_id.clone(),
});
AgentPool::notify_agent_state_changed(&watcher_tx_clone);
// Honour the backoff window (AC3): delay inside a background
// task so the current task returns immediately.
let reset_at = result.rate_limit_reset_at;
let session_for_resume = result.session_id.clone();
let agents_for_respawn = Arc::clone(&agents_ref);
let watcher_for_respawn = watcher_tx_clone.clone();
let sid_r = sid.clone();
let aname_r = aname.clone();
let root_r = project_root_clone.clone();
let port_r = port_for_task;
tokio::spawn(async move {
if let Some(reset_at) = reset_at {
let wait = (reset_at - chrono::Utc::now())
.to_std()
.unwrap_or(std::time::Duration::ZERO);
if !wait.is_zero() {
slog!(
"[agents] Rate-limit backoff for '{sid_r}': \
waiting {}s before respawn.",
wait.as_secs()
);
tokio::time::sleep(wait).await;
}
}
let pool = AgentPool {
agents: agents_for_respawn,
port: port_r,
child_killers: Arc::new(Mutex::new(HashMap::new())),
watcher_tx: watcher_for_respawn,
status_broadcaster: Arc::new(
crate::service::status::StatusBroadcaster::new(),
),
};
if let Err(e) = pool
.start_agent(&root_r, &sid_r, Some(&aname_r), None, session_for_resume)
.await
{
slog_error!(
"[agents] Failed to respawn '{aname_r}' for '{sid_r}' \
after rate-limit exit: {e}"
);
}
});
return;
}
// Server-owned completion: run acceptance gates automatically
// when the agent process exits normally.
super::super::pipeline::run_server_owned_completion(
@@ -1131,4 +1214,82 @@ mod tests {
"a count of {final_count} triggers blocking (>= {CAP})"
);
}
/// Bug 1053: 3 consecutive rate-limit exits must NOT block the story.
///
/// Each rate-limit exit resets CommitRecoveryPending, CommitRecoveryTotalAttempts,
/// and CommitRecoveryDiffFingerprint without incrementing retry_count. After 3
/// simulated rate-limit exits the story must remain in 2_current/ — not blocked.
#[test]
fn three_consecutive_rate_limit_exits_do_not_block() {
crate::crdt_state::init_for_test();
crate::db::ensure_content_store();
let story_id = "9953_rate_limit_no_block_1053";
crate::db::write_item_with_content(
story_id,
"2_current",
"---\nname: Rate Limit Test\n---\n",
crate::db::ItemMeta::named("Rate Limit Test"),
);
// Without the fix, each commit-recovery respawn would write
// CommitRecoveryPending=N and CommitRecoveryTotalAttempts=N, and after
// NO_PROGRESS_CAP (3) or TOTAL_ATTEMPTS_CAP (8) the story would be blocked.
// With the fix, the rate-limit exit handler deletes all three counters before
// the pipeline advance runs, so they never accumulate.
const RATE_LIMIT_EXITS: u32 = 3;
for cycle in 1..=RATE_LIMIT_EXITS {
// Simulate: pipeline advance would have written these counts.
crate::db::write_content(
crate::db::ContentKey::CommitRecoveryPending(story_id),
&cycle.to_string(),
);
crate::db::write_content(
crate::db::ContentKey::CommitRecoveryTotalAttempts(story_id),
&cycle.to_string(),
);
crate::db::write_content(
crate::db::ContentKey::CommitRecoveryDiffFingerprint(story_id),
"abc123",
);
// Rate-limit exit handler: reset all three counters (the fix).
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryPending(story_id));
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryDiffFingerprint(
story_id,
));
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryTotalAttempts(story_id));
// CommitRecoveryPending must be cleared after each rate-limit exit.
assert!(
crate::db::read_content(crate::db::ContentKey::CommitRecoveryPending(story_id))
.is_none(),
"CommitRecoveryPending must be None after rate-limit exit #{cycle}"
);
// retry_count must remain 0 — the rate-limit path never calls
// bump_retry_count.
let retry_count = crate::crdt_state::read_item(story_id)
.map(|item| item.retry_count())
.unwrap_or(0);
assert_eq!(
retry_count, 0,
"retry_count must not be incremented by rate-limit exits \
(got {retry_count} on cycle {cycle})"
);
}
// After RATE_LIMIT_EXITS consecutive rate-limit exits the story must NOT
// be blocked — it stays in 2_current/ for the next respawn attempt.
let item = crate::crdt_state::read_item(story_id)
.expect("story must be in CRDT after rate-limit exits");
assert_ne!(
item.stage().dir_name(),
"blocked",
"story must NOT be blocked after {RATE_LIMIT_EXITS} consecutive rate-limit exits; \
got stage: {}",
item.stage().dir_name()
);
}
}
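The gate those counters feed lives in the pipeline advance, which this commit does not touch. A minimal sketch of that gate, under the caps named in the test comment (NO_PROGRESS_CAP = 3, TOTAL_ATTEMPTS_CAP = 8); the function is hypothetical and only illustrates why deleting the keys keeps the story out of blocked:

// Hypothetical sketch — not part of this commit. The real check sits in the
// pipeline advance; the keys match the ones deleted by the rate-limit handler.
const NO_PROGRESS_CAP: u32 = 3;
const TOTAL_ATTEMPTS_CAP: u32 = 8;

fn commit_recovery_should_block(story_id: &str) -> bool {
    let pending = crate::db::read_content(crate::db::ContentKey::CommitRecoveryPending(story_id))
        .and_then(|v| v.parse::<u32>().ok())
        .unwrap_or(0);
    let total = crate::db::read_content(crate::db::ContentKey::CommitRecoveryTotalAttempts(story_id))
        .and_then(|v| v.parse::<u32>().ok())
        .unwrap_or(0);
    // After a rate-limit exit both keys are deleted, so both reads come back 0
    // and the story stays in 2_current/ for the next respawn attempt.
    pending >= NO_PROGRESS_CAP || total >= TOTAL_ATTEMPTS_CAP
}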
+10
@@ -261,6 +261,10 @@ fn run_agent_pty_blocking(
// earlier-than-expected emit), the extension implicitly drops back to 0
// and the base inactivity timeout resumes (story 916).
let mut block_until: Option<chrono::DateTime<chrono::Utc>> = None;
// Track whether a rate-limit hard block was seen so the completion handler
// can distinguish a rate-limit exit from a genuine no-progress exit (bug 1053).
let mut rate_limit_hard_block_seen = false;
let mut rate_limit_reset_at_captured: Option<chrono::DateTime<chrono::Utc>> = None;
loop {
let effective_timeout = base_timeout.map(|base| {
@@ -388,6 +392,10 @@ fn run_agent_pty_blocking(
default
}
};
// Mark that a hard block was seen so the completion handler
// can distinguish this exit from a genuine no-progress exit.
rate_limit_hard_block_seen = true;
rate_limit_reset_at_captured = Some(reset_at);
// Pause the inactivity clock until the rate limit resets
// (story 916). Without this the watchdog kills the agent
// mid-wait — mergemaster especially, whose 15-minute
@@ -480,5 +488,7 @@ fn run_agent_pty_blocking(
session_id,
token_usage,
exit_ok,
rate_limit_exit: rate_limit_hard_block_seen,
rate_limit_reset_at: rate_limit_reset_at_captured,
})
}
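The event parsing that feeds these two flags sits just above the second hunk and is not shown in full. A minimal sketch of what that detection might look like, assuming the CLI emits the event as a single JSON line with type, status, and reset_at fields — only the rate_limit_event name, the allowed_warning status, and reset_at are confirmed by the PtyResult docs below; the exact JSON shape and the helper itself are assumptions:

// Hypothetical sketch — not part of this commit. Returns Some(reset_at) when
// `line` is a rate_limit_event whose status is not allowed_warning (a hard
// block); the inner Option is None when the event carries no reset_at field.
fn detect_rate_limit_hard_block(
    line: &str,
) -> Option<Option<chrono::DateTime<chrono::Utc>>> {
    let event: serde_json::Value = serde_json::from_str(line).ok()?;
    if event.get("type").and_then(|v| v.as_str()) != Some("rate_limit_event") {
        return None;
    }
    if event.get("status").and_then(|v| v.as_str()) == Some("allowed_warning") {
        return None;
    }
    let reset_at = event
        .get("reset_at")
        .and_then(|v| v.as_str())
        .and_then(|s| chrono::DateTime::parse_from_rfc3339(s).ok())
        .map(|dt| dt.with_timezone(&chrono::Utc));
    Some(reset_at)
}

On a Some(..) result the read loop would set rate_limit_hard_block_seen = true and copy the reset time into rate_limit_reset_at_captured, which is what the added lines above do once the existing parsing has produced reset_at.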
+9
@@ -13,6 +13,15 @@ pub(in crate::agents) struct PtyResult {
/// `true` if the child process exited with a zero exit code (normal completion).
/// `false` if the process was killed by a signal or exited with a non-zero code.
pub exit_ok: bool,
/// `true` when a rate-limit hard block event was received during this session.
///
/// Detected by seeing a `rate_limit_event` with a non-`allowed_warning` status in
/// the PTY output. The completion handler uses this flag to distinguish a
/// rate-limit-forced exit from a genuine no-progress exit (bug 1053).
pub rate_limit_exit: bool,
/// When the API rate limit is scheduled to reset; `None` when no hard-block
/// event was seen or when the `reset_at` field was absent from the event.
pub rate_limit_reset_at: Option<chrono::DateTime<chrono::Utc>>,
}
pub(super) fn composite_key(story_id: &str, agent_name: &str) -> String {
+4
@@ -71,6 +71,8 @@ impl AgentRuntime for ClaudeCodeRuntime {
aborted_signal: !result.exit_ok && result.session_id.is_none(),
session_id: result.session_id,
token_usage: result.token_usage,
rate_limit_exit: result.rate_limit_exit,
rate_limit_reset_at: result.rate_limit_reset_at,
}),
Err(e) if ctx.session_id_to_resume.is_some() && ctx.fresh_prompt.is_some() => {
// Resume failed — fall back to a fresh session without --resume.
@@ -103,6 +105,8 @@ impl AgentRuntime for ClaudeCodeRuntime {
&& fallback_result.session_id.is_none(),
session_id: fallback_result.session_id,
token_usage: fallback_result.token_usage,
rate_limit_exit: fallback_result.rate_limit_exit,
rate_limit_reset_at: fallback_result.rate_limit_reset_at,
})
}
Err(e) => Err(e),
+8
@@ -136,6 +136,8 @@ impl AgentRuntime for GeminiRuntime {
session_id: None,
token_usage: Some(total_usage),
aborted_signal: false,
rate_limit_exit: false,
rate_limit_reset_at: None,
});
}
@@ -150,6 +152,8 @@ impl AgentRuntime for GeminiRuntime {
session_id: None,
token_usage: Some(total_usage),
aborted_signal: false,
rate_limit_exit: false,
rate_limit_reset_at: None,
});
}
@@ -251,6 +255,8 @@ impl AgentRuntime for GeminiRuntime {
session_id: None,
token_usage: Some(total_usage),
aborted_signal: false,
rate_limit_exit: false,
rate_limit_reset_at: None,
});
}
@@ -334,6 +340,8 @@ impl AgentRuntime for GeminiRuntime {
session_id: None,
token_usage: Some(total_usage),
aborted_signal: false,
rate_limit_exit: false,
rate_limit_reset_at: None,
})
}
+17
@@ -63,6 +63,19 @@ pub struct RuntimeResult {
/// uses this flag to skip acceptance gates and respawn without consuming a
/// retry slot. Always `false` for API-based runtimes (Gemini, OpenAI).
pub aborted_signal: bool,
/// `true` when the Claude Code CLI received a rate-limit hard block event
/// before exiting (bug 1053).
///
/// The completion handler uses this to distinguish a rate-limit-forced exit
/// from a genuine no-progress exit, so commit-recovery counters are not
/// incremented. Always `false` for API-based runtimes (Gemini, OpenAI).
pub rate_limit_exit: bool,
/// When the API rate limit is scheduled to reset.
///
/// Populated from the `reset_at` field of the `rate_limit_event` JSON when
/// `rate_limit_exit` is `true`. The completion handler honours this window
/// before re-attempting the agent spawn. `None` for API-based runtimes.
pub rate_limit_reset_at: Option<chrono::DateTime<chrono::Utc>>,
}
/// Runtime status reported by the backend.
@@ -157,6 +170,8 @@ mod tests {
total_cost_usd: 0.01,
}),
aborted_signal: false,
rate_limit_exit: false,
rate_limit_reset_at: None,
};
assert_eq!(result.session_id, Some("sess-123".to_string()));
assert!(result.token_usage.is_some());
@@ -172,6 +187,8 @@ mod tests {
session_id: None,
token_usage: None,
aborted_signal: false,
rate_limit_exit: false,
rate_limit_reset_at: None,
};
assert!(result.session_id.is_none());
assert!(result.token_usage.is_none());
+6
@@ -123,6 +123,8 @@ impl AgentRuntime for OpenAiRuntime {
session_id: None,
token_usage: Some(total_usage),
aborted_signal: false,
rate_limit_exit: false,
rate_limit_reset_at: None,
});
}
@@ -137,6 +139,8 @@ impl AgentRuntime for OpenAiRuntime {
session_id: None,
token_usage: Some(total_usage),
aborted_signal: false,
rate_limit_exit: false,
rate_limit_reset_at: None,
});
}
@@ -221,6 +225,8 @@ impl AgentRuntime for OpenAiRuntime {
session_id: None,
token_usage: Some(total_usage),
aborted_signal: false,
rate_limit_exit: false,
rate_limit_reset_at: None,
});
}