huskies: merge 1053

2026-05-14 18:32:43 +00:00
parent bb5abcd042
commit 96e227d8d4
7 changed files with 215 additions and 0 deletions
@@ -798,6 +798,89 @@ pub(super) async fn run_agent_spawn(
                    });
                }
            } else {
+                // Rate-limit exit: reset commit-recovery counters and respawn
+                // without consuming a retry slot (bug 1053).
+                if result.rate_limit_exit {
+                    slog!(
+                        "[agents] Rate-limit exit for '{sid}:{aname}': resetting \
+                         commit-recovery counters and respawning without consuming \
+                         a retry slot."
+                    );
+                    // Deleting these counters prevents the no-progress logic from
+                    // treating this exit as an agent-stuck event.
+                    crate::db::delete_content(crate::db::ContentKey::CommitRecoveryPending(&sid));
+                    crate::db::delete_content(
+                        crate::db::ContentKey::CommitRecoveryDiffFingerprint(&sid),
+                    );
+                    crate::db::delete_content(crate::db::ContentKey::CommitRecoveryTotalAttempts(
+                        &sid,
+                    ));
+
+                    // Remove agent from the pool and unblock any wait_for_agent callers.
+                    let tx_done = {
+                        let mut lock = match agents_ref.lock() {
+                            Ok(a) => a,
+                            Err(_) => return,
+                        };
+                        if let Some(agent) = lock.remove(&key_clone) {
+                            agent.tx
+                        } else {
+                            tx_clone.clone()
+                        }
+                    };
+                    let _ = tx_done.send(AgentEvent::Done {
+                        story_id: sid.clone(),
+                        agent_name: aname.clone(),
+                        session_id: result.session_id.clone(),
+                    });
+                    AgentPool::notify_agent_state_changed(&watcher_tx_clone);
+
+                    // Honour the backoff window (AC3): delay inside a background
+                    // task so the current task returns immediately.
+                    let reset_at = result.rate_limit_reset_at;
+                    let session_for_resume = result.session_id.clone();
+                    let agents_for_respawn = Arc::clone(&agents_ref);
+                    let watcher_for_respawn = watcher_tx_clone.clone();
+                    let sid_r = sid.clone();
+                    let aname_r = aname.clone();
+                    let root_r = project_root_clone.clone();
+                    let port_r = port_for_task;
+                    tokio::spawn(async move {
+                        if let Some(reset_at) = reset_at {
+                            let wait = (reset_at - chrono::Utc::now())
+                                .to_std()
+                                .unwrap_or(std::time::Duration::ZERO);
+                            if !wait.is_zero() {
+                                slog!(
+                                    "[agents] Rate-limit backoff for '{sid_r}': \
+                                     waiting {}s before respawn.",
+                                    wait.as_secs()
+                                );
+                                tokio::time::sleep(wait).await;
+                            }
+                        }
+                        let pool = AgentPool {
+                            agents: agents_for_respawn,
+                            port: port_r,
+                            child_killers: Arc::new(Mutex::new(HashMap::new())),
+                            watcher_tx: watcher_for_respawn,
+                            status_broadcaster: Arc::new(
+                                crate::service::status::StatusBroadcaster::new(),
+                            ),
+                        };
+                        if let Err(e) = pool
+                            .start_agent(&root_r, &sid_r, Some(&aname_r), None, session_for_resume)
+                            .await
+                        {
+                            slog_error!(
+                                "[agents] Failed to respawn '{aname_r}' for '{sid_r}' \
+                                 after rate-limit exit: {e}"
+                            );
+                        }
+                    });
+                    return;
+                }
+
                // Server-owned completion: run acceptance gates automatically
                // when the agent process exits normally.
                super::super::pipeline::run_server_owned_completion(
@@ -1131,4 +1214,82 @@ mod tests {
            "a count of {final_count} triggers blocking (>= {CAP})"
        );
    }
+
+    /// Bug 1053: three consecutive rate-limit exits must NOT block the story.
+    ///
+    /// Simulates the rate-limit exit handler in-test: each cycle writes the three
+    /// commit-recovery keys, deletes them again, then checks that CommitRecoveryPending
+    /// is gone and retry_count is still 0.  Finally asserts the stage is not "blocked".
+    #[test]
+    fn three_consecutive_rate_limit_exits_do_not_block() {
+        crate::crdt_state::init_for_test();
+        crate::db::ensure_content_store();
+
+        let story_id = "9953_rate_limit_no_block_1053";
+        crate::db::write_item_with_content(
+            story_id,
+            "2_current",
+            "---\nname: Rate Limit Test\n---\n",
+            crate::db::ItemMeta::named("Rate Limit Test"),
+        );
+
+        // Background: without the fix, each commit-recovery respawn would write
+        // CommitRecoveryPending=N and CommitRecoveryTotalAttempts=N, and after
+        // NO_PROGRESS_CAP (3) or TOTAL_ATTEMPTS_CAP (8) the story would be blocked
+        // (cap values are not visible from this test; confirm against their definitions).
+        // With the fix, the handler deletes all three counters so they never accumulate.
+        const RATE_LIMIT_EXITS: u32 = 3;
+        for cycle in 1..=RATE_LIMIT_EXITS {
+            // Simulate the state a pipeline advance would have left behind for this cycle.
+            crate::db::write_content(
+                crate::db::ContentKey::CommitRecoveryPending(story_id),
+                &cycle.to_string(),
+            );
+            crate::db::write_content(
+                crate::db::ContentKey::CommitRecoveryTotalAttempts(story_id),
+                &cycle.to_string(),
+            );
+            crate::db::write_content(
+                crate::db::ContentKey::CommitRecoveryDiffFingerprint(story_id),
+                "abc123",
+            );
+
+            // Stand-in for the real handler (not invoked here): delete all three keys.
+            crate::db::delete_content(crate::db::ContentKey::CommitRecoveryPending(story_id));
+            crate::db::delete_content(crate::db::ContentKey::CommitRecoveryDiffFingerprint(
+                story_id,
+            ));
+            crate::db::delete_content(crate::db::ContentKey::CommitRecoveryTotalAttempts(story_id));
+
+            // Only CommitRecoveryPending is read back; the other two keys are not asserted.
+            assert!(
+                crate::db::read_content(crate::db::ContentKey::CommitRecoveryPending(story_id))
+                    .is_none(),
+                "CommitRecoveryPending must be None after rate-limit exit #{cycle}"
+            );
+
+            // retry_count must remain 0; nothing in this loop bumps it.  A missing item
+            // also reads as 0 here because of the unwrap_or(0) fallback.
+            let retry_count = crate::crdt_state::read_item(story_id)
+                .map(|item| item.retry_count())
+                .unwrap_or(0);
+            assert_eq!(
+                retry_count, 0,
+                "retry_count must not be incremented by rate-limit exits \
+                 (got {retry_count} on cycle {cycle})"
+            );
+        }
+
+        // After the simulated exits the item must still exist and its stage must not
+        // be "blocked" (the assertion does not pin it to 2_current specifically).
+        let item = crate::crdt_state::read_item(story_id)
+            .expect("story must be in CRDT after rate-limit exits");
+        assert_ne!(
+            item.stage().dir_name(),
+            "blocked",
+            "story must NOT be blocked after {RATE_LIMIT_EXITS} consecutive rate-limit exits; \
+             got stage: {}",
+            item.stage().dir_name()
+        );
+    }
 }