diff --git a/server/src/agents/pool/start/spawn.rs b/server/src/agents/pool/start/spawn.rs index 01a2aa5a..6017df32 100644 --- a/server/src/agents/pool/start/spawn.rs +++ b/server/src/agents/pool/start/spawn.rs @@ -798,6 +798,89 @@ pub(super) async fn run_agent_spawn( }); } } else { + // Rate-limit exit: reset commit-recovery counters and respawn + // without consuming a retry slot (bug 1053). + if result.rate_limit_exit { + slog!( + "[agents] Rate-limit exit for '{sid}:{aname}': resetting \ + commit-recovery counters and respawning without consuming \ + a retry slot." + ); + // Deleting these counters prevents the no-progress logic from + // treating this exit as an agent-stuck event. + crate::db::delete_content(crate::db::ContentKey::CommitRecoveryPending(&sid)); + crate::db::delete_content( + crate::db::ContentKey::CommitRecoveryDiffFingerprint(&sid), + ); + crate::db::delete_content(crate::db::ContentKey::CommitRecoveryTotalAttempts( + &sid, + )); + + // Remove agent from the pool and unblock any wait_for_agent callers. + let tx_done = { + let mut lock = match agents_ref.lock() { + Ok(a) => a, + Err(_) => return, + }; + if let Some(agent) = lock.remove(&key_clone) { + agent.tx + } else { + tx_clone.clone() + } + }; + let _ = tx_done.send(AgentEvent::Done { + story_id: sid.clone(), + agent_name: aname.clone(), + session_id: result.session_id.clone(), + }); + AgentPool::notify_agent_state_changed(&watcher_tx_clone); + + // Honour the backoff window (AC3): delay inside a background + // task so the current task returns immediately. 
+ let reset_at = result.rate_limit_reset_at; + let session_for_resume = result.session_id.clone(); + let agents_for_respawn = Arc::clone(&agents_ref); + let watcher_for_respawn = watcher_tx_clone.clone(); + let sid_r = sid.clone(); + let aname_r = aname.clone(); + let root_r = project_root_clone.clone(); + let port_r = port_for_task; + tokio::spawn(async move { + if let Some(reset_at) = reset_at { + let wait = (reset_at - chrono::Utc::now()) + .to_std() + .unwrap_or(std::time::Duration::ZERO); + if !wait.is_zero() { + slog!( + "[agents] Rate-limit backoff for '{sid_r}': \ + waiting {}s before respawn.", + wait.as_secs() + ); + tokio::time::sleep(wait).await; + } + } + let pool = AgentPool { + agents: agents_for_respawn, + port: port_r, + child_killers: Arc::new(Mutex::new(HashMap::new())), + watcher_tx: watcher_for_respawn, + status_broadcaster: Arc::new( + crate::service::status::StatusBroadcaster::new(), + ), + }; + if let Err(e) = pool + .start_agent(&root_r, &sid_r, Some(&aname_r), None, session_for_resume) + .await + { + slog_error!( + "[agents] Failed to respawn '{aname_r}' for '{sid_r}' \ + after rate-limit exit: {e}" + ); + } + }); + return; + } + // Server-owned completion: run acceptance gates automatically // when the agent process exits normally. super::super::pipeline::run_server_owned_completion( @@ -1131,4 +1214,82 @@ mod tests { "a count of {final_count} triggers blocking (>= {CAP})" ); } + + /// Bug 1053: 3 consecutive rate-limit exits must NOT block the story. + /// + /// Each rate-limit exit resets CommitRecoveryPending, CommitRecoveryTotalAttempts, + /// and CommitRecoveryDiffFingerprint without incrementing retry_count. After 3 + /// simulated rate-limit exits the story must remain in 2_current/ — not blocked. 
+ #[test] + fn three_consecutive_rate_limit_exits_do_not_block() { + crate::crdt_state::init_for_test(); + crate::db::ensure_content_store(); + + let story_id = "9953_rate_limit_no_block_1053"; + crate::db::write_item_with_content( + story_id, + "2_current", + "---\nname: Rate Limit Test\n---\n", + crate::db::ItemMeta::named("Rate Limit Test"), + ); + + // Without the fix, each commit-recovery respawn would write + // CommitRecoveryPending=N and CommitRecoveryTotalAttempts=N, and after + // NO_PROGRESS_CAP (3) or TOTAL_ATTEMPTS_CAP (8) the story would be blocked. + // With the fix, the rate-limit exit handler deletes all three counters before + // the pipeline advance runs, so they never accumulate. + const RATE_LIMIT_EXITS: u32 = 3; + for cycle in 1..=RATE_LIMIT_EXITS { + // Simulate: pipeline advance would have written these counts. + crate::db::write_content( + crate::db::ContentKey::CommitRecoveryPending(story_id), + &cycle.to_string(), + ); + crate::db::write_content( + crate::db::ContentKey::CommitRecoveryTotalAttempts(story_id), + &cycle.to_string(), + ); + crate::db::write_content( + crate::db::ContentKey::CommitRecoveryDiffFingerprint(story_id), + "abc123", + ); + + // Rate-limit exit handler: reset all three counters (the fix). + crate::db::delete_content(crate::db::ContentKey::CommitRecoveryPending(story_id)); + crate::db::delete_content(crate::db::ContentKey::CommitRecoveryDiffFingerprint( + story_id, + )); + crate::db::delete_content(crate::db::ContentKey::CommitRecoveryTotalAttempts(story_id)); + + // CommitRecoveryPending must be cleared after each rate-limit exit. + assert!( + crate::db::read_content(crate::db::ContentKey::CommitRecoveryPending(story_id)) + .is_none(), + "CommitRecoveryPending must be None after rate-limit exit #{cycle}" + ); + + // retry_count must remain 0 — the rate-limit path never calls + // bump_retry_count. 
+            let retry_count = crate::crdt_state::read_item(story_id)
+                .map(|item| item.retry_count())
+                .unwrap_or(0);
+            assert_eq!(
+                retry_count, 0,
+                "retry_count must not be incremented by rate-limit exits \
+                 (got {retry_count} on cycle {cycle})"
+            );
+        }
+
+        // After RATE_LIMIT_EXITS consecutive rate-limit exits the story must NOT
+        // be blocked — it stays in 2_current/ for the next respawn attempt.
+        let item = crate::crdt_state::read_item(story_id)
+            .expect("story must be in CRDT after rate-limit exits");
+        assert_ne!(
+            item.stage().dir_name(),
+            "blocked",
+            "story must NOT be blocked after {RATE_LIMIT_EXITS} consecutive rate-limit exits; \
+             got stage: {}",
+            item.stage().dir_name()
+        );
+    }
 }
diff --git a/server/src/agents/pty/runner.rs b/server/src/agents/pty/runner.rs
index cdefa4b3..b23091b0 100644
--- a/server/src/agents/pty/runner.rs
+++ b/server/src/agents/pty/runner.rs
@@ -261,6 +261,10 @@ fn run_agent_pty_blocking(
     // earlier-than-expected emit), the extension implicitly drops back to 0
     // and the base inactivity timeout resumes (story 916).
     let mut block_until: Option<chrono::DateTime<chrono::Utc>> = None;
+    // Track whether a rate-limit hard block was seen so the completion handler
+    // can distinguish a rate-limit exit from a genuine no-progress exit (bug 1053).
+    let mut rate_limit_hard_block_seen = false;
+    let mut rate_limit_reset_at_captured: Option<chrono::DateTime<chrono::Utc>> = None;
 
     loop {
         let effective_timeout = base_timeout.map(|base| {
@@ -388,6 +392,10 @@ fn run_agent_pty_blocking(
                             default
                         }
                     };
+                    // Mark that a hard block was seen so the completion handler
+                    // can distinguish this exit from a genuine no-progress exit.
+                    rate_limit_hard_block_seen = true;
+                    rate_limit_reset_at_captured = Some(reset_at);
 
                     // Pause the inactivity clock until the rate limit resets
                     // (story 916).
                    // Without this the watchdog kills the agent
                    // mid-wait — mergemaster especially, whose 15-minute
@@ -480,5 +488,7 @@ fn run_agent_pty_blocking(
         session_id,
         token_usage,
         exit_ok,
+        rate_limit_exit: rate_limit_hard_block_seen,
+        rate_limit_reset_at: rate_limit_reset_at_captured,
     })
 }
diff --git a/server/src/agents/pty/types.rs b/server/src/agents/pty/types.rs
index 6f8a7cf3..fb384b90 100644
--- a/server/src/agents/pty/types.rs
+++ b/server/src/agents/pty/types.rs
@@ -13,6 +13,15 @@ pub(in crate::agents) struct PtyResult {
     /// `true` if the child process exited with a zero exit code (normal completion).
     /// `false` if the process was killed by a signal or exited with a non-zero code.
     pub exit_ok: bool,
+    /// `true` when a rate-limit hard block event was received during this session.
+    ///
+    /// Detected by seeing a `rate_limit_event` with a non-`allowed_warning` status in
+    /// the PTY output. The completion handler uses this flag to distinguish a
+    /// rate-limit-forced exit from a genuine no-progress exit (bug 1053).
+    pub rate_limit_exit: bool,
+    /// When the API rate limit is scheduled to reset; `None` when no hard-block
+    /// event was seen or when the `reset_at` field was absent from the event.
+    pub rate_limit_reset_at: Option<chrono::DateTime<chrono::Utc>>,
 }
 
 pub(super) fn composite_key(story_id: &str, agent_name: &str) -> String {
diff --git a/server/src/agents/runtime/claude_code.rs b/server/src/agents/runtime/claude_code.rs
index ea619d30..45bff128 100644
--- a/server/src/agents/runtime/claude_code.rs
+++ b/server/src/agents/runtime/claude_code.rs
@@ -71,6 +71,8 @@ impl AgentRuntime for ClaudeCodeRuntime {
                 aborted_signal: !result.exit_ok && result.session_id.is_none(),
                 session_id: result.session_id,
                 token_usage: result.token_usage,
+                rate_limit_exit: result.rate_limit_exit,
+                rate_limit_reset_at: result.rate_limit_reset_at,
             }),
             Err(e) if ctx.session_id_to_resume.is_some() && ctx.fresh_prompt.is_some() => {
                 // Resume failed — fall back to a fresh session without --resume.
@@ -103,6 +105,8 @@ impl AgentRuntime for ClaudeCodeRuntime { && fallback_result.session_id.is_none(), session_id: fallback_result.session_id, token_usage: fallback_result.token_usage, + rate_limit_exit: fallback_result.rate_limit_exit, + rate_limit_reset_at: fallback_result.rate_limit_reset_at, }) } Err(e) => Err(e), diff --git a/server/src/agents/runtime/gemini/mod.rs b/server/src/agents/runtime/gemini/mod.rs index b1b1f587..d7841556 100644 --- a/server/src/agents/runtime/gemini/mod.rs +++ b/server/src/agents/runtime/gemini/mod.rs @@ -136,6 +136,8 @@ impl AgentRuntime for GeminiRuntime { session_id: None, token_usage: Some(total_usage), aborted_signal: false, + rate_limit_exit: false, + rate_limit_reset_at: None, }); } @@ -150,6 +152,8 @@ impl AgentRuntime for GeminiRuntime { session_id: None, token_usage: Some(total_usage), aborted_signal: false, + rate_limit_exit: false, + rate_limit_reset_at: None, }); } @@ -251,6 +255,8 @@ impl AgentRuntime for GeminiRuntime { session_id: None, token_usage: Some(total_usage), aborted_signal: false, + rate_limit_exit: false, + rate_limit_reset_at: None, }); } @@ -334,6 +340,8 @@ impl AgentRuntime for GeminiRuntime { session_id: None, token_usage: Some(total_usage), aborted_signal: false, + rate_limit_exit: false, + rate_limit_reset_at: None, }) } diff --git a/server/src/agents/runtime/mod.rs b/server/src/agents/runtime/mod.rs index ebbda696..c65ddfbc 100644 --- a/server/src/agents/runtime/mod.rs +++ b/server/src/agents/runtime/mod.rs @@ -63,6 +63,19 @@ pub struct RuntimeResult { /// uses this flag to skip acceptance gates and respawn without consuming a /// retry slot. Always `false` for API-based runtimes (Gemini, OpenAI). pub aborted_signal: bool, + /// `true` when the Claude Code CLI received a rate-limit hard block event + /// before exiting (bug 1053). 
+    ///
+    /// The completion handler uses this to distinguish a rate-limit-forced exit
+    /// from a genuine no-progress exit, so commit-recovery counters are not
+    /// incremented. Always `false` for API-based runtimes (Gemini, OpenAI).
+    pub rate_limit_exit: bool,
+    /// When the API rate limit is scheduled to reset.
+    ///
+    /// Populated from the `reset_at` field of the `rate_limit_event` JSON when
+    /// `rate_limit_exit` is `true`. The completion handler honours this window
+    /// before re-attempting the agent spawn. `None` for API-based runtimes.
+    pub rate_limit_reset_at: Option<chrono::DateTime<chrono::Utc>>,
 }
 
 /// Runtime status reported by the backend.
@@ -157,6 +170,8 @@ mod tests {
                 total_cost_usd: 0.01,
             }),
             aborted_signal: false,
+            rate_limit_exit: false,
+            rate_limit_reset_at: None,
         };
         assert_eq!(result.session_id, Some("sess-123".to_string()));
         assert!(result.token_usage.is_some());
@@ -172,6 +187,8 @@ mod tests {
             session_id: None,
             token_usage: None,
             aborted_signal: false,
+            rate_limit_exit: false,
+            rate_limit_reset_at: None,
         };
         assert!(result.session_id.is_none());
         assert!(result.token_usage.is_none());
diff --git a/server/src/agents/runtime/openai.rs b/server/src/agents/runtime/openai.rs
index 800fe584..7a54dbf2 100644
--- a/server/src/agents/runtime/openai.rs
+++ b/server/src/agents/runtime/openai.rs
@@ -123,6 +123,8 @@ impl AgentRuntime for OpenAiRuntime {
             session_id: None,
             token_usage: Some(total_usage),
             aborted_signal: false,
+            rate_limit_exit: false,
+            rate_limit_reset_at: None,
         });
     }
 
@@ -137,6 +139,8 @@ impl AgentRuntime for OpenAiRuntime {
             session_id: None,
             token_usage: Some(total_usage),
             aborted_signal: false,
+            rate_limit_exit: false,
+            rate_limit_reset_at: None,
         });
     }
 
@@ -221,6 +225,8 @@ impl AgentRuntime for OpenAiRuntime {
             session_id: None,
             token_usage: Some(total_usage),
             aborted_signal: false,
+            rate_limit_exit: false,
+            rate_limit_reset_at: None,
         });
     }