fix(916): rate-limit hard block extends inactivity deadline so the watchdog doesn't kill mid-wait

When claude-code emits a rate_limit_event with status != allowed_warning,
the subprocess waits internally for the limit to clear before retrying. No
PTY output flows during that window, so the inactivity timeout in the PTY
runner would fire and kill the agent — mergemaster especially, whose
15-minute inactivity window is shorter than typical rate-limit backoffs.

Track `block_until = Some(reset_at)` on hard-block events and add the
remaining time-until-reset to the per-iteration recv timeout. Once reset_at
passes (or an earlier emit arrives), the extension implicitly drops to 0
and the base inactivity timeout resumes. Turn/budget counts aren't affected
— they come from the session log and only advance when API calls actually
complete, so a stalled retry doesn't burn either.

Regression test in agents/pty/mod.rs spawns a script that emits a hard-block
with reset_at = now+3s, sleeps 3s, then exits, with inactivity_timeout_secs
= 1. Without the fix the runner kills the script at 1s; with the fix the
deadline is bumped past the sleep and the run completes cleanly.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Timmy
2026-05-12 16:22:21 +01:00
parent 8421104645
commit 90b31fc84f
2 changed files with 74 additions and 2 deletions
+51
View File
@@ -204,6 +204,57 @@ mod tests {
}
}
/// Story 916: a rate-limit hard block must extend the inactivity deadline
/// by the backoff duration so the watchdog doesn't kill the agent while
/// it's legitimately waiting for the limit to clear.
///
/// Script emits a hard-block event with reset_at = now + 3s, then sleeps
/// 3s, then exits. With inactivity_timeout_secs = 1, the run would fail
/// without the extension; with it, the deadline is bumped to ~4s and the
/// script gets to complete cleanly.
#[tokio::test]
async fn rate_limit_hard_block_extends_inactivity_deadline() {
use std::os::unix::fs::PermissionsExt;
let tmp = tempfile::tempdir().unwrap();
let script = tmp.path().join("emit_then_wait.sh");
let reset_at = chrono::Utc::now() + chrono::Duration::seconds(3);
let reset_at_str = reset_at.to_rfc3339_opts(chrono::SecondsFormat::Secs, true);
let body = format!(
"#!/bin/sh\nprintf '%s\\n' '{{\"type\":\"rate_limit_event\",\"rate_limit_info\":{{\"status\":\"hard_block\",\"reset_at\":\"{reset_at_str}\"}}}}'\nsleep 3\n"
);
std::fs::write(&script, body).unwrap();
std::fs::set_permissions(&script, std::fs::Permissions::from_mode(0o755)).unwrap();
let (tx, _rx) = broadcast::channel::<AgentEvent>(64);
let (watcher_tx, _watcher_rx) = broadcast::channel::<WatcherEvent>(16);
let event_log = Arc::new(Mutex::new(Vec::new()));
let child_killers = Arc::new(Mutex::new(HashMap::new()));
let result = run_agent_pty_streaming(
"916_story_rate_limit_extension",
"mergemaster",
"sh",
&[script.to_string_lossy().to_string()],
"--",
"/tmp",
&tx,
&event_log,
None,
1, // inactivity_timeout_secs = 1s; would expire before the 3s sleep without the extension
child_killers,
watcher_tx,
None,
)
.await;
assert!(
result.is_ok(),
"PTY run should not be killed by inactivity timeout during rate-limit block: {:?}",
result.err()
);
}
#[test]
fn test_emit_event_writes_to_log_writer() {
let tmp = tempfile::tempdir().unwrap();
+23 -2
View File
@@ -232,7 +232,7 @@ fn run_agent_pty_blocking(
slog!("[agent:{sid_for_reader}:{aname_for_reader}] Reader thread exiting");
});
let timeout_dur = if inactivity_timeout_secs > 0 {
let base_timeout = if inactivity_timeout_secs > 0 {
Some(std::time::Duration::from_secs(inactivity_timeout_secs))
} else {
None
@@ -240,9 +240,24 @@ fn run_agent_pty_blocking(
let mut session_id: Option<String> = None;
let mut token_usage: Option<TokenUsage> = None;
// When the agent receives a rate-limit hard block, claude-code waits
// internally for the limit to clear and emits no PTY output during that
// window. Extend each recv's deadline by `block_until - now` so the
// inactivity timeout doesn't fire while we're legitimately waiting on
// Anthropic. Once `block_until` passes (or the block is cleared by an
// earlier-than-expected emit), the extension implicitly drops back to 0
// and the base inactivity timeout resumes (story 916).
let mut block_until: Option<chrono::DateTime<chrono::Utc>> = None;
loop {
let recv_result = match timeout_dur {
let effective_timeout = base_timeout.map(|base| {
let extra = block_until
.and_then(|t| (t - chrono::Utc::now()).to_std().ok())
.unwrap_or(std::time::Duration::ZERO);
base + extra
});
let recv_result = match effective_timeout {
Some(dur) => line_rx.recv_timeout(dur),
None => line_rx
.recv()
@@ -351,6 +366,12 @@ fn run_agent_pty_blocking(
default
}
};
// Pause the inactivity clock until the rate limit resets
// (story 916). Without this the watchdog kills the agent
// mid-wait — mergemaster especially, whose 15-minute
// inactivity window is shorter than typical rate-limit
// backoffs.
block_until = Some(reset_at);
let _ = watcher_tx.send(WatcherEvent::RateLimitHardBlock {
story_id: story_id.to_string(),
agent_name: agent_name.to_string(),