From 90b31fc84f95d938ec5217c4071034abc92f6134 Mon Sep 17 00:00:00 2001 From: Timmy Date: Tue, 12 May 2026 16:22:21 +0100 Subject: [PATCH] fix(916): rate-limit hard block extends inactivity deadline so the watchdog doesn't kill mid-wait MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When claude-code emits a rate_limit_event with status != allowed_warning, the subprocess waits internally for the limit to clear before retrying. No PTY output flows during that window, so the inactivity timeout in the PTY runner would fire and kill the agent — mergemaster especially, whose 15-minute inactivity window is shorter than typical rate-limit backoffs. Track `block_until = Some(reset_at)` on hard-block events and add the remaining time-until-reset to the per-iteration recv timeout. Once reset_at passes (or an earlier emit arrives), the extension implicitly drops to 0 and the base inactivity timeout resumes. Turn/budget counts aren't affected — they come from the session log and only advance when API calls actually complete, so a stalled retry doesn't burn either. Regression test in agents/pty/mod.rs spawns a script that emits a hard-block with reset_at = now+3s, sleeps 3s, then exits, with inactivity_timeout_secs = 1. Without the fix the runner kills the script at 1s; with the fix the deadline is bumped past the sleep and the run completes cleanly. Co-Authored-By: Claude Opus 4.7 (1M context) --- server/src/agents/pty/mod.rs | 51 +++++++++++++++++++++++++++++++++ server/src/agents/pty/runner.rs | 25 ++++++++++++++-- 2 files changed, 74 insertions(+), 2 deletions(-) diff --git a/server/src/agents/pty/mod.rs b/server/src/agents/pty/mod.rs index 2f56477b..a672b496 100644 --- a/server/src/agents/pty/mod.rs +++ b/server/src/agents/pty/mod.rs @@ -204,6 +204,57 @@ mod tests { } } + /// Story 916: a rate-limit hard block must extend the inactivity deadline + /// by the backoff duration so the watchdog doesn't kill the agent while + /// it's legitimately waiting for the limit to clear. + /// + /// Script emits a hard-block event with reset_at = now + 3s, then sleeps + /// 3s, then exits. With inactivity_timeout_secs = 1, the run would fail + /// without the extension; with it, the deadline is bumped to ~4s and the + /// script gets to complete cleanly. + #[tokio::test] + async fn rate_limit_hard_block_extends_inactivity_deadline() { + use std::os::unix::fs::PermissionsExt; + + let tmp = tempfile::tempdir().unwrap(); + let script = tmp.path().join("emit_then_wait.sh"); + let reset_at = chrono::Utc::now() + chrono::Duration::seconds(3); + let reset_at_str = reset_at.to_rfc3339_opts(chrono::SecondsFormat::Secs, true); + let body = format!( + "#!/bin/sh\nprintf '%s\\n' '{{\"type\":\"rate_limit_event\",\"rate_limit_info\":{{\"status\":\"hard_block\",\"reset_at\":\"{reset_at_str}\"}}}}'\nsleep 3\n" + ); + std::fs::write(&script, body).unwrap(); + std::fs::set_permissions(&script, std::fs::Permissions::from_mode(0o755)).unwrap(); + + let (tx, _rx) = broadcast::channel::(64); + let (watcher_tx, _watcher_rx) = broadcast::channel::(16); + let event_log = Arc::new(Mutex::new(Vec::new())); + let child_killers = Arc::new(Mutex::new(HashMap::new())); + + let result = run_agent_pty_streaming( + "916_story_rate_limit_extension", + "mergemaster", + "sh", + &[script.to_string_lossy().to_string()], + "--", + "/tmp", + &tx, + &event_log, + None, + 1, // inactivity_timeout_secs = 1s; would expire before the 3s sleep without the extension + child_killers, + watcher_tx, + None, + ) + .await; + + assert!( + result.is_ok(), + "PTY run should not be killed by inactivity timeout during rate-limit block: {:?}", + result.err() + ); + } + #[test] fn test_emit_event_writes_to_log_writer() { let tmp = tempfile::tempdir().unwrap(); diff --git a/server/src/agents/pty/runner.rs b/server/src/agents/pty/runner.rs index 3c000edf..44929cbe 100644 --- a/server/src/agents/pty/runner.rs +++ b/server/src/agents/pty/runner.rs @@ -232,7 +232,7 @@ fn run_agent_pty_blocking( slog!("[agent:{sid_for_reader}:{aname_for_reader}] Reader thread exiting"); }); - let timeout_dur = if inactivity_timeout_secs > 0 { + let base_timeout = if inactivity_timeout_secs > 0 { Some(std::time::Duration::from_secs(inactivity_timeout_secs)) } else { None @@ -240,9 +240,24 @@ fn run_agent_pty_blocking( let mut session_id: Option = None; let mut token_usage: Option = None; + // When the agent receives a rate-limit hard block, claude-code waits + // internally for the limit to clear and emits no PTY output during that + // window. Extend each recv's deadline by `block_until - now` so the + // inactivity timeout doesn't fire while we're legitimately waiting on + // Anthropic. Once `block_until` passes (or the block is cleared by an + // earlier-than-expected emit), the extension implicitly drops back to 0 + // and the base inactivity timeout resumes (story 916). + let mut block_until: Option> = None; loop { - let recv_result = match timeout_dur { + let effective_timeout = base_timeout.map(|base| { + let extra = block_until + .and_then(|t| (t - chrono::Utc::now()).to_std().ok()) + .unwrap_or(std::time::Duration::ZERO); + base + extra + }); + + let recv_result = match effective_timeout { Some(dur) => line_rx.recv_timeout(dur), None => line_rx .recv() @@ -351,6 +366,12 @@ fn run_agent_pty_blocking( default } }; + // Pause the inactivity clock until the rate limit resets + // (story 916). Without this the watchdog kills the agent + // mid-wait — mergemaster especially, whose 15-minute + // inactivity window is shorter than typical rate-limit + // backoffs. + block_until = Some(reset_at); let _ = watcher_tx.send(WatcherEvent::RateLimitHardBlock { story_id: story_id.to_string(), agent_name: agent_name.to_string(),