huskies: merge 1053

dave
2026-05-14 18:32:43 +00:00
parent bb5abcd042
commit 96e227d8d4
7 changed files with 215 additions and 0 deletions
+161
@@ -798,6 +798,89 @@ pub(super) async fn run_agent_spawn(
});
}
} else {
// Rate-limit exit: reset commit-recovery counters and respawn
// without consuming a retry slot (bug 1053).
if result.rate_limit_exit {
slog!(
"[agents] Rate-limit exit for '{sid}:{aname}': resetting \
commit-recovery counters and respawning without consuming \
a retry slot."
);
// Deleting these counters prevents the no-progress logic from
// treating this exit as an agent-stuck event.
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryPending(&sid));
crate::db::delete_content(
crate::db::ContentKey::CommitRecoveryDiffFingerprint(&sid),
);
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryTotalAttempts(
&sid,
));
// Remove agent from the pool and unblock any wait_for_agent callers.
let tx_done = {
let mut lock = match agents_ref.lock() {
Ok(a) => a,
Err(_) => return,
};
if let Some(agent) = lock.remove(&key_clone) {
agent.tx
} else {
tx_clone.clone()
}
};
let _ = tx_done.send(AgentEvent::Done {
story_id: sid.clone(),
agent_name: aname.clone(),
session_id: result.session_id.clone(),
});
AgentPool::notify_agent_state_changed(&watcher_tx_clone);
// Honour the backoff window (AC3): delay inside a background
// task so the current task returns immediately.
let reset_at = result.rate_limit_reset_at;
let session_for_resume = result.session_id.clone();
let agents_for_respawn = Arc::clone(&agents_ref);
let watcher_for_respawn = watcher_tx_clone.clone();
let sid_r = sid.clone();
let aname_r = aname.clone();
let root_r = project_root_clone.clone();
let port_r = port_for_task;
tokio::spawn(async move {
if let Some(reset_at) = reset_at {
let wait = (reset_at - chrono::Utc::now())
.to_std()
.unwrap_or(std::time::Duration::ZERO);
if !wait.is_zero() {
slog!(
"[agents] Rate-limit backoff for '{sid_r}': \
waiting {}s before respawn.",
wait.as_secs()
);
tokio::time::sleep(wait).await;
}
}
let pool = AgentPool {
agents: agents_for_respawn,
port: port_r,
child_killers: Arc::new(Mutex::new(HashMap::new())),
watcher_tx: watcher_for_respawn,
status_broadcaster: Arc::new(
crate::service::status::StatusBroadcaster::new(),
),
};
if let Err(e) = pool
.start_agent(&root_r, &sid_r, Some(&aname_r), None, session_for_resume)
.await
{
slog_error!(
"[agents] Failed to respawn '{aname_r}' for '{sid_r}' \
after rate-limit exit: {e}"
);
}
});
return;
}
// Server-owned completion: run acceptance gates automatically
// when the agent process exits normally.
super::super::pipeline::run_server_owned_completion(
@@ -1131,4 +1214,82 @@ mod tests {
"a count of {final_count} triggers blocking (>= {CAP})"
);
}
/// Bug 1053: 3 consecutive rate-limit exits must NOT block the story.
///
/// Each rate-limit exit resets CommitRecoveryPending, CommitRecoveryTotalAttempts,
/// and CommitRecoveryDiffFingerprint without incrementing retry_count. After 3
/// simulated rate-limit exits the story must remain in 2_current/ — not blocked.
#[test]
fn three_consecutive_rate_limit_exits_do_not_block() {
crate::crdt_state::init_for_test();
crate::db::ensure_content_store();
let story_id = "9953_rate_limit_no_block_1053";
crate::db::write_item_with_content(
story_id,
"2_current",
"---\nname: Rate Limit Test\n---\n",
crate::db::ItemMeta::named("Rate Limit Test"),
);
// Without the fix, each commit-recovery respawn would write
// CommitRecoveryPending=N and CommitRecoveryTotalAttempts=N, and after
// NO_PROGRESS_CAP (3) or TOTAL_ATTEMPTS_CAP (8) the story would be blocked.
// With the fix, the rate-limit exit handler deletes all three counters before
// the pipeline advance runs, so they never accumulate.
const RATE_LIMIT_EXITS: u32 = 3;
for cycle in 1..=RATE_LIMIT_EXITS {
// Simulate: pipeline advance would have written these counts.
crate::db::write_content(
crate::db::ContentKey::CommitRecoveryPending(story_id),
&cycle.to_string(),
);
crate::db::write_content(
crate::db::ContentKey::CommitRecoveryTotalAttempts(story_id),
&cycle.to_string(),
);
crate::db::write_content(
crate::db::ContentKey::CommitRecoveryDiffFingerprint(story_id),
"abc123",
);
// Rate-limit exit handler: reset all three counters (the fix).
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryPending(story_id));
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryDiffFingerprint(
story_id,
));
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryTotalAttempts(story_id));
// CommitRecoveryPending must be cleared after each rate-limit exit.
assert!(
crate::db::read_content(crate::db::ContentKey::CommitRecoveryPending(story_id))
.is_none(),
"CommitRecoveryPending must be None after rate-limit exit #{cycle}"
);
// retry_count must remain 0 — the rate-limit path never calls
// bump_retry_count.
let retry_count = crate::crdt_state::read_item(story_id)
.map(|item| item.retry_count())
.unwrap_or(0);
assert_eq!(
retry_count, 0,
"retry_count must not be incremented by rate-limit exits \
(got {retry_count} on cycle {cycle})"
);
}
// After RATE_LIMIT_EXITS consecutive rate-limit exits the story must NOT
// be blocked — it stays in 2_current/ for the next respawn attempt.
let item = crate::crdt_state::read_item(story_id)
.expect("story must be in CRDT after rate-limit exits");
assert_ne!(
item.stage().dir_name(),
"blocked",
"story must NOT be blocked after {RATE_LIMIT_EXITS} consecutive rate-limit exits; \
got stage: {}",
item.stage().dir_name()
);
}
}
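The gate those counters feed lives in the pipeline advance, which this commit does not touch. A minimal sketch of that gate, under the caps named in the test comment (NO_PROGRESS_CAP = 3, TOTAL_ATTEMPTS_CAP = 8); the function is hypothetical and only illustrates why deleting the keys keeps the story out of blocked:

// Hypothetical sketch — not part of this commit. The real check sits in the
// pipeline advance; the keys match the ones deleted by the rate-limit handler.
const NO_PROGRESS_CAP: u32 = 3;
const TOTAL_ATTEMPTS_CAP: u32 = 8;

fn commit_recovery_should_block(story_id: &str) -> bool {
    let pending = crate::db::read_content(crate::db::ContentKey::CommitRecoveryPending(story_id))
        .and_then(|v| v.parse::<u32>().ok())
        .unwrap_or(0);
    let total = crate::db::read_content(crate::db::ContentKey::CommitRecoveryTotalAttempts(story_id))
        .and_then(|v| v.parse::<u32>().ok())
        .unwrap_or(0);
    // After a rate-limit exit both keys are deleted, so both reads come back 0
    // and the story stays in 2_current/ for the next respawn attempt.
    pending >= NO_PROGRESS_CAP || total >= TOTAL_ATTEMPTS_CAP
}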
+10
@@ -261,6 +261,10 @@ fn run_agent_pty_blocking(
// earlier-than-expected emit), the extension implicitly drops back to 0
// and the base inactivity timeout resumes (story 916).
let mut block_until: Option<chrono::DateTime<chrono::Utc>> = None;
// Track whether a rate-limit hard block was seen so the completion handler
// can distinguish a rate-limit exit from a genuine no-progress exit (bug 1053).
let mut rate_limit_hard_block_seen = false;
let mut rate_limit_reset_at_captured: Option<chrono::DateTime<chrono::Utc>> = None;
loop {
let effective_timeout = base_timeout.map(|base| {
@@ -388,6 +392,10 @@ fn run_agent_pty_blocking(
default
}
};
// Mark that a hard block was seen so the completion handler
// can distinguish this exit from a genuine no-progress exit.
rate_limit_hard_block_seen = true;
rate_limit_reset_at_captured = Some(reset_at);
// Pause the inactivity clock until the rate limit resets
// (story 916). Without this the watchdog kills the agent
// mid-wait — mergemaster especially, whose 15-minute
@@ -480,5 +488,7 @@ fn run_agent_pty_blocking(
session_id,
token_usage,
exit_ok,
rate_limit_exit: rate_limit_hard_block_seen,
rate_limit_reset_at: rate_limit_reset_at_captured,
})
}
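The event parsing that feeds these two flags sits just above the second hunk and is not shown in full. A minimal sketch of what that detection might look like, assuming the CLI emits the event as a single JSON line with type, status, and reset_at fields — only the rate_limit_event name, the allowed_warning status, and reset_at are confirmed by the PtyResult docs below; the exact JSON shape and the helper itself are assumptions:

// Hypothetical sketch — not part of this commit. Returns Some(reset_at) when
// `line` is a rate_limit_event whose status is not allowed_warning (a hard
// block); the inner Option is None when the event carries no reset_at field.
fn detect_rate_limit_hard_block(
    line: &str,
) -> Option<Option<chrono::DateTime<chrono::Utc>>> {
    let event: serde_json::Value = serde_json::from_str(line).ok()?;
    if event.get("type").and_then(|v| v.as_str()) != Some("rate_limit_event") {
        return None;
    }
    if event.get("status").and_then(|v| v.as_str()) == Some("allowed_warning") {
        return None;
    }
    let reset_at = event
        .get("reset_at")
        .and_then(|v| v.as_str())
        .and_then(|s| chrono::DateTime::parse_from_rfc3339(s).ok())
        .map(|dt| dt.with_timezone(&chrono::Utc));
    Some(reset_at)
}

On a Some(..) result the read loop would set rate_limit_hard_block_seen = true and copy the reset time into rate_limit_reset_at_captured, which is what the added lines above do once the existing parsing has produced reset_at.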
+9
@@ -13,6 +13,15 @@ pub(in crate::agents) struct PtyResult {
/// `true` if the child process exited with a zero exit code (normal completion).
/// `false` if the process was killed by a signal or exited with a non-zero code.
pub exit_ok: bool,
/// `true` when a rate-limit hard block event was received during this session.
///
/// Detected by seeing a `rate_limit_event` with a non-`allowed_warning` status in
/// the PTY output. The completion handler uses this flag to distinguish a
/// rate-limit-forced exit from a genuine no-progress exit (bug 1053).
pub rate_limit_exit: bool,
/// When the API rate limit is scheduled to reset; `None` when no hard-block
/// event was seen or when the `reset_at` field was absent from the event.
pub rate_limit_reset_at: Option<chrono::DateTime<chrono::Utc>>,
}
pub(super) fn composite_key(story_id: &str, agent_name: &str) -> String {
+4
@@ -71,6 +71,8 @@ impl AgentRuntime for ClaudeCodeRuntime {
aborted_signal: !result.exit_ok && result.session_id.is_none(),
session_id: result.session_id,
token_usage: result.token_usage,
rate_limit_exit: result.rate_limit_exit,
rate_limit_reset_at: result.rate_limit_reset_at,
}),
Err(e) if ctx.session_id_to_resume.is_some() && ctx.fresh_prompt.is_some() => {
// Resume failed — fall back to a fresh session without --resume.
@@ -103,6 +105,8 @@ impl AgentRuntime for ClaudeCodeRuntime {
&& fallback_result.session_id.is_none(),
session_id: fallback_result.session_id,
token_usage: fallback_result.token_usage,
rate_limit_exit: fallback_result.rate_limit_exit,
rate_limit_reset_at: fallback_result.rate_limit_reset_at,
})
}
Err(e) => Err(e),
+8
@@ -136,6 +136,8 @@ impl AgentRuntime for GeminiRuntime {
session_id: None,
token_usage: Some(total_usage),
aborted_signal: false,
rate_limit_exit: false,
rate_limit_reset_at: None,
});
}
@@ -150,6 +152,8 @@ impl AgentRuntime for GeminiRuntime {
session_id: None,
token_usage: Some(total_usage),
aborted_signal: false,
rate_limit_exit: false,
rate_limit_reset_at: None,
});
}
@@ -251,6 +255,8 @@ impl AgentRuntime for GeminiRuntime {
session_id: None,
token_usage: Some(total_usage),
aborted_signal: false,
rate_limit_exit: false,
rate_limit_reset_at: None,
});
}
@@ -334,6 +340,8 @@ impl AgentRuntime for GeminiRuntime {
session_id: None,
token_usage: Some(total_usage),
aborted_signal: false,
rate_limit_exit: false,
rate_limit_reset_at: None,
})
}
+17
@@ -63,6 +63,19 @@ pub struct RuntimeResult {
/// uses this flag to skip acceptance gates and respawn without consuming a
/// retry slot. Always `false` for API-based runtimes (Gemini, OpenAI).
pub aborted_signal: bool,
/// `true` when the Claude Code CLI received a rate-limit hard block event
/// before exiting (bug 1053).
///
/// The completion handler uses this to distinguish a rate-limit-forced exit
/// from a genuine no-progress exit, so commit-recovery counters are not
/// incremented. Always `false` for API-based runtimes (Gemini, OpenAI).
pub rate_limit_exit: bool,
/// When the API rate limit is scheduled to reset.
///
/// Populated from the `reset_at` field of the `rate_limit_event` JSON when
/// `rate_limit_exit` is `true`. The completion handler honours this window
/// before re-attempting the agent spawn. `None` for API-based runtimes.
pub rate_limit_reset_at: Option<chrono::DateTime<chrono::Utc>>,
}
/// Runtime status reported by the backend.
@@ -157,6 +170,8 @@ mod tests {
total_cost_usd: 0.01,
}),
aborted_signal: false,
rate_limit_exit: false,
rate_limit_reset_at: None,
};
assert_eq!(result.session_id, Some("sess-123".to_string()));
assert!(result.token_usage.is_some());
@@ -172,6 +187,8 @@ mod tests {
session_id: None,
token_usage: None,
aborted_signal: false,
rate_limit_exit: false,
rate_limit_reset_at: None,
};
assert!(result.session_id.is_none());
assert!(result.token_usage.is_none());
+6
@@ -123,6 +123,8 @@ impl AgentRuntime for OpenAiRuntime {
session_id: None,
token_usage: Some(total_usage),
aborted_signal: false,
rate_limit_exit: false,
rate_limit_reset_at: None,
});
}
@@ -137,6 +139,8 @@ impl AgentRuntime for OpenAiRuntime {
session_id: None,
token_usage: Some(total_usage),
aborted_signal: false,
rate_limit_exit: false,
rate_limit_reset_at: None,
});
}
@@ -221,6 +225,8 @@ impl AgentRuntime for OpenAiRuntime {
session_id: None,
token_usage: Some(total_usage),
aborted_signal: false,
rate_limit_exit: false,
rate_limit_reset_at: None,
});
}