huskies: merge 1053
This commit is contained in:
@@ -798,6 +798,89 @@ pub(super) async fn run_agent_spawn(
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
// Rate-limit exit: reset commit-recovery counters and respawn
|
||||||
|
// without consuming a retry slot (bug 1053).
|
||||||
|
if result.rate_limit_exit {
|
||||||
|
slog!(
|
||||||
|
"[agents] Rate-limit exit for '{sid}:{aname}': resetting \
|
||||||
|
commit-recovery counters and respawning without consuming \
|
||||||
|
a retry slot."
|
||||||
|
);
|
||||||
|
// Deleting these counters prevents the no-progress logic from
|
||||||
|
// treating this exit as an agent-stuck event.
|
||||||
|
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryPending(&sid));
|
||||||
|
crate::db::delete_content(
|
||||||
|
crate::db::ContentKey::CommitRecoveryDiffFingerprint(&sid),
|
||||||
|
);
|
||||||
|
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryTotalAttempts(
|
||||||
|
&sid,
|
||||||
|
));
|
||||||
|
|
||||||
|
// Remove agent from the pool and unblock any wait_for_agent callers.
|
||||||
|
let tx_done = {
|
||||||
|
let mut lock = match agents_ref.lock() {
|
||||||
|
Ok(a) => a,
|
||||||
|
Err(_) => return,
|
||||||
|
};
|
||||||
|
if let Some(agent) = lock.remove(&key_clone) {
|
||||||
|
agent.tx
|
||||||
|
} else {
|
||||||
|
tx_clone.clone()
|
||||||
|
}
|
||||||
|
};
|
||||||
|
let _ = tx_done.send(AgentEvent::Done {
|
||||||
|
story_id: sid.clone(),
|
||||||
|
agent_name: aname.clone(),
|
||||||
|
session_id: result.session_id.clone(),
|
||||||
|
});
|
||||||
|
AgentPool::notify_agent_state_changed(&watcher_tx_clone);
|
||||||
|
|
||||||
|
// Honour the backoff window (AC3): delay inside a background
|
||||||
|
// task so the current task returns immediately.
|
||||||
|
let reset_at = result.rate_limit_reset_at;
|
||||||
|
let session_for_resume = result.session_id.clone();
|
||||||
|
let agents_for_respawn = Arc::clone(&agents_ref);
|
||||||
|
let watcher_for_respawn = watcher_tx_clone.clone();
|
||||||
|
let sid_r = sid.clone();
|
||||||
|
let aname_r = aname.clone();
|
||||||
|
let root_r = project_root_clone.clone();
|
||||||
|
let port_r = port_for_task;
|
||||||
|
tokio::spawn(async move {
|
||||||
|
if let Some(reset_at) = reset_at {
|
||||||
|
let wait = (reset_at - chrono::Utc::now())
|
||||||
|
.to_std()
|
||||||
|
.unwrap_or(std::time::Duration::ZERO);
|
||||||
|
if !wait.is_zero() {
|
||||||
|
slog!(
|
||||||
|
"[agents] Rate-limit backoff for '{sid_r}': \
|
||||||
|
waiting {}s before respawn.",
|
||||||
|
wait.as_secs()
|
||||||
|
);
|
||||||
|
tokio::time::sleep(wait).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
let pool = AgentPool {
|
||||||
|
agents: agents_for_respawn,
|
||||||
|
port: port_r,
|
||||||
|
child_killers: Arc::new(Mutex::new(HashMap::new())),
|
||||||
|
watcher_tx: watcher_for_respawn,
|
||||||
|
status_broadcaster: Arc::new(
|
||||||
|
crate::service::status::StatusBroadcaster::new(),
|
||||||
|
),
|
||||||
|
};
|
||||||
|
if let Err(e) = pool
|
||||||
|
.start_agent(&root_r, &sid_r, Some(&aname_r), None, session_for_resume)
|
||||||
|
.await
|
||||||
|
{
|
||||||
|
slog_error!(
|
||||||
|
"[agents] Failed to respawn '{aname_r}' for '{sid_r}' \
|
||||||
|
after rate-limit exit: {e}"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
// Server-owned completion: run acceptance gates automatically
|
// Server-owned completion: run acceptance gates automatically
|
||||||
// when the agent process exits normally.
|
// when the agent process exits normally.
|
||||||
super::super::pipeline::run_server_owned_completion(
|
super::super::pipeline::run_server_owned_completion(
|
||||||
@@ -1131,4 +1214,82 @@ mod tests {
|
|||||||
"a count of {final_count} triggers blocking (>= {CAP})"
|
"a count of {final_count} triggers blocking (>= {CAP})"
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Bug 1053: 3 consecutive rate-limit exits must NOT block the story.
|
||||||
|
///
|
||||||
|
/// Each rate-limit exit resets CommitRecoveryPending, CommitRecoveryTotalAttempts,
|
||||||
|
/// and CommitRecoveryDiffFingerprint without incrementing retry_count. After 3
|
||||||
|
/// simulated rate-limit exits the story must remain in 2_current/ — not blocked.
|
||||||
|
#[test]
|
||||||
|
fn three_consecutive_rate_limit_exits_do_not_block() {
|
||||||
|
crate::crdt_state::init_for_test();
|
||||||
|
crate::db::ensure_content_store();
|
||||||
|
|
||||||
|
let story_id = "9953_rate_limit_no_block_1053";
|
||||||
|
crate::db::write_item_with_content(
|
||||||
|
story_id,
|
||||||
|
"2_current",
|
||||||
|
"---\nname: Rate Limit Test\n---\n",
|
||||||
|
crate::db::ItemMeta::named("Rate Limit Test"),
|
||||||
|
);
|
||||||
|
|
||||||
|
// Without the fix, each commit-recovery respawn would write
|
||||||
|
// CommitRecoveryPending=N and CommitRecoveryTotalAttempts=N, and after
|
||||||
|
// NO_PROGRESS_CAP (3) or TOTAL_ATTEMPTS_CAP (8) the story would be blocked.
|
||||||
|
// With the fix, the rate-limit exit handler deletes all three counters before
|
||||||
|
// the pipeline advance runs, so they never accumulate.
|
||||||
|
const RATE_LIMIT_EXITS: u32 = 3;
|
||||||
|
for cycle in 1..=RATE_LIMIT_EXITS {
|
||||||
|
// Simulate: pipeline advance would have written these counts.
|
||||||
|
crate::db::write_content(
|
||||||
|
crate::db::ContentKey::CommitRecoveryPending(story_id),
|
||||||
|
&cycle.to_string(),
|
||||||
|
);
|
||||||
|
crate::db::write_content(
|
||||||
|
crate::db::ContentKey::CommitRecoveryTotalAttempts(story_id),
|
||||||
|
&cycle.to_string(),
|
||||||
|
);
|
||||||
|
crate::db::write_content(
|
||||||
|
crate::db::ContentKey::CommitRecoveryDiffFingerprint(story_id),
|
||||||
|
"abc123",
|
||||||
|
);
|
||||||
|
|
||||||
|
// Rate-limit exit handler: reset all three counters (the fix).
|
||||||
|
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryPending(story_id));
|
||||||
|
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryDiffFingerprint(
|
||||||
|
story_id,
|
||||||
|
));
|
||||||
|
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryTotalAttempts(story_id));
|
||||||
|
|
||||||
|
// CommitRecoveryPending must be cleared after each rate-limit exit.
|
||||||
|
assert!(
|
||||||
|
crate::db::read_content(crate::db::ContentKey::CommitRecoveryPending(story_id))
|
||||||
|
.is_none(),
|
||||||
|
"CommitRecoveryPending must be None after rate-limit exit #{cycle}"
|
||||||
|
);
|
||||||
|
|
||||||
|
// retry_count must remain 0 — the rate-limit path never calls
|
||||||
|
// bump_retry_count.
|
||||||
|
let retry_count = crate::crdt_state::read_item(story_id)
|
||||||
|
.map(|item| item.retry_count())
|
||||||
|
.unwrap_or(0);
|
||||||
|
assert_eq!(
|
||||||
|
retry_count, 0,
|
||||||
|
"retry_count must not be incremented by rate-limit exits \
|
||||||
|
(got {retry_count} on cycle {cycle})"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
// After RATE_LIMIT_EXITS consecutive rate-limit exits the story must NOT
|
||||||
|
// be blocked — it stays in 2_current/ for the next respawn attempt.
|
||||||
|
let item = crate::crdt_state::read_item(story_id)
|
||||||
|
.expect("story must be in CRDT after rate-limit exits");
|
||||||
|
assert_ne!(
|
||||||
|
item.stage().dir_name(),
|
||||||
|
"blocked",
|
||||||
|
"story must NOT be blocked after {RATE_LIMIT_EXITS} consecutive rate-limit exits; \
|
||||||
|
got stage: {}",
|
||||||
|
item.stage().dir_name()
|
||||||
|
);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -261,6 +261,10 @@ fn run_agent_pty_blocking(
|
|||||||
// earlier-than-expected emit), the extension implicitly drops back to 0
|
// earlier-than-expected emit), the extension implicitly drops back to 0
|
||||||
// and the base inactivity timeout resumes (story 916).
|
// and the base inactivity timeout resumes (story 916).
|
||||||
let mut block_until: Option<chrono::DateTime<chrono::Utc>> = None;
|
let mut block_until: Option<chrono::DateTime<chrono::Utc>> = None;
|
||||||
|
// Track whether a rate-limit hard block was seen so the completion handler
|
||||||
|
// can distinguish a rate-limit exit from a genuine no-progress exit (bug 1053).
|
||||||
|
let mut rate_limit_hard_block_seen = false;
|
||||||
|
let mut rate_limit_reset_at_captured: Option<chrono::DateTime<chrono::Utc>> = None;
|
||||||
|
|
||||||
loop {
|
loop {
|
||||||
let effective_timeout = base_timeout.map(|base| {
|
let effective_timeout = base_timeout.map(|base| {
|
||||||
@@ -388,6 +392,10 @@ fn run_agent_pty_blocking(
|
|||||||
default
|
default
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
// Mark that a hard block was seen so the completion handler
|
||||||
|
// can distinguish this exit from a genuine no-progress exit.
|
||||||
|
rate_limit_hard_block_seen = true;
|
||||||
|
rate_limit_reset_at_captured = Some(reset_at);
|
||||||
// Pause the inactivity clock until the rate limit resets
|
// Pause the inactivity clock until the rate limit resets
|
||||||
// (story 916). Without this the watchdog kills the agent
|
// (story 916). Without this the watchdog kills the agent
|
||||||
// mid-wait — mergemaster especially, whose 15-minute
|
// mid-wait — mergemaster especially, whose 15-minute
|
||||||
@@ -480,5 +488,7 @@ fn run_agent_pty_blocking(
|
|||||||
session_id,
|
session_id,
|
||||||
token_usage,
|
token_usage,
|
||||||
exit_ok,
|
exit_ok,
|
||||||
|
rate_limit_exit: rate_limit_hard_block_seen,
|
||||||
|
rate_limit_reset_at: rate_limit_reset_at_captured,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -13,6 +13,15 @@ pub(in crate::agents) struct PtyResult {
|
|||||||
/// `true` if the child process exited with a zero exit code (normal completion).
|
/// `true` if the child process exited with a zero exit code (normal completion).
|
||||||
/// `false` if the process was killed by a signal or exited with a non-zero code.
|
/// `false` if the process was killed by a signal or exited with a non-zero code.
|
||||||
pub exit_ok: bool,
|
pub exit_ok: bool,
|
||||||
|
/// `true` when a rate-limit hard block event was received during this session.
|
||||||
|
///
|
||||||
|
/// Detected by seeing a `rate_limit_event` with a non-`allowed_warning` status in
|
||||||
|
/// the PTY output. The completion handler uses this flag to distinguish a
|
||||||
|
/// rate-limit-forced exit from a genuine no-progress exit (bug 1053).
|
||||||
|
pub rate_limit_exit: bool,
|
||||||
|
/// When the API rate limit is scheduled to reset; `None` when no hard-block
|
||||||
|
/// event was seen or when the `reset_at` field was absent from the event.
|
||||||
|
pub rate_limit_reset_at: Option<chrono::DateTime<chrono::Utc>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(super) fn composite_key(story_id: &str, agent_name: &str) -> String {
|
pub(super) fn composite_key(story_id: &str, agent_name: &str) -> String {
|
||||||
|
|||||||
@@ -71,6 +71,8 @@ impl AgentRuntime for ClaudeCodeRuntime {
|
|||||||
aborted_signal: !result.exit_ok && result.session_id.is_none(),
|
aborted_signal: !result.exit_ok && result.session_id.is_none(),
|
||||||
session_id: result.session_id,
|
session_id: result.session_id,
|
||||||
token_usage: result.token_usage,
|
token_usage: result.token_usage,
|
||||||
|
rate_limit_exit: result.rate_limit_exit,
|
||||||
|
rate_limit_reset_at: result.rate_limit_reset_at,
|
||||||
}),
|
}),
|
||||||
Err(e) if ctx.session_id_to_resume.is_some() && ctx.fresh_prompt.is_some() => {
|
Err(e) if ctx.session_id_to_resume.is_some() && ctx.fresh_prompt.is_some() => {
|
||||||
// Resume failed — fall back to a fresh session without --resume.
|
// Resume failed — fall back to a fresh session without --resume.
|
||||||
@@ -103,6 +105,8 @@ impl AgentRuntime for ClaudeCodeRuntime {
|
|||||||
&& fallback_result.session_id.is_none(),
|
&& fallback_result.session_id.is_none(),
|
||||||
session_id: fallback_result.session_id,
|
session_id: fallback_result.session_id,
|
||||||
token_usage: fallback_result.token_usage,
|
token_usage: fallback_result.token_usage,
|
||||||
|
rate_limit_exit: fallback_result.rate_limit_exit,
|
||||||
|
rate_limit_reset_at: fallback_result.rate_limit_reset_at,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
Err(e) => Err(e),
|
Err(e) => Err(e),
|
||||||
|
|||||||
@@ -136,6 +136,8 @@ impl AgentRuntime for GeminiRuntime {
|
|||||||
session_id: None,
|
session_id: None,
|
||||||
token_usage: Some(total_usage),
|
token_usage: Some(total_usage),
|
||||||
aborted_signal: false,
|
aborted_signal: false,
|
||||||
|
rate_limit_exit: false,
|
||||||
|
rate_limit_reset_at: None,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -150,6 +152,8 @@ impl AgentRuntime for GeminiRuntime {
|
|||||||
session_id: None,
|
session_id: None,
|
||||||
token_usage: Some(total_usage),
|
token_usage: Some(total_usage),
|
||||||
aborted_signal: false,
|
aborted_signal: false,
|
||||||
|
rate_limit_exit: false,
|
||||||
|
rate_limit_reset_at: None,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -251,6 +255,8 @@ impl AgentRuntime for GeminiRuntime {
|
|||||||
session_id: None,
|
session_id: None,
|
||||||
token_usage: Some(total_usage),
|
token_usage: Some(total_usage),
|
||||||
aborted_signal: false,
|
aborted_signal: false,
|
||||||
|
rate_limit_exit: false,
|
||||||
|
rate_limit_reset_at: None,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -334,6 +340,8 @@ impl AgentRuntime for GeminiRuntime {
|
|||||||
session_id: None,
|
session_id: None,
|
||||||
token_usage: Some(total_usage),
|
token_usage: Some(total_usage),
|
||||||
aborted_signal: false,
|
aborted_signal: false,
|
||||||
|
rate_limit_exit: false,
|
||||||
|
rate_limit_reset_at: None,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -63,6 +63,19 @@ pub struct RuntimeResult {
|
|||||||
/// uses this flag to skip acceptance gates and respawn without consuming a
|
/// uses this flag to skip acceptance gates and respawn without consuming a
|
||||||
/// retry slot. Always `false` for API-based runtimes (Gemini, OpenAI).
|
/// retry slot. Always `false` for API-based runtimes (Gemini, OpenAI).
|
||||||
pub aborted_signal: bool,
|
pub aborted_signal: bool,
|
||||||
|
/// `true` when the Claude Code CLI received a rate-limit hard block event
|
||||||
|
/// before exiting (bug 1053).
|
||||||
|
///
|
||||||
|
/// The completion handler uses this to distinguish a rate-limit-forced exit
|
||||||
|
/// from a genuine no-progress exit, so commit-recovery counters are not
|
||||||
|
/// incremented. Always `false` for API-based runtimes (Gemini, OpenAI).
|
||||||
|
pub rate_limit_exit: bool,
|
||||||
|
/// When the API rate limit is scheduled to reset.
|
||||||
|
///
|
||||||
|
/// Populated from the `reset_at` field of the `rate_limit_event` JSON when
|
||||||
|
/// `rate_limit_exit` is `true`. The completion handler honours this window
|
||||||
|
/// before re-attempting the agent spawn. `None` for API-based runtimes.
|
||||||
|
pub rate_limit_reset_at: Option<chrono::DateTime<chrono::Utc>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Runtime status reported by the backend.
|
/// Runtime status reported by the backend.
|
||||||
@@ -157,6 +170,8 @@ mod tests {
|
|||||||
total_cost_usd: 0.01,
|
total_cost_usd: 0.01,
|
||||||
}),
|
}),
|
||||||
aborted_signal: false,
|
aborted_signal: false,
|
||||||
|
rate_limit_exit: false,
|
||||||
|
rate_limit_reset_at: None,
|
||||||
};
|
};
|
||||||
assert_eq!(result.session_id, Some("sess-123".to_string()));
|
assert_eq!(result.session_id, Some("sess-123".to_string()));
|
||||||
assert!(result.token_usage.is_some());
|
assert!(result.token_usage.is_some());
|
||||||
@@ -172,6 +187,8 @@ mod tests {
|
|||||||
session_id: None,
|
session_id: None,
|
||||||
token_usage: None,
|
token_usage: None,
|
||||||
aborted_signal: false,
|
aborted_signal: false,
|
||||||
|
rate_limit_exit: false,
|
||||||
|
rate_limit_reset_at: None,
|
||||||
};
|
};
|
||||||
assert!(result.session_id.is_none());
|
assert!(result.session_id.is_none());
|
||||||
assert!(result.token_usage.is_none());
|
assert!(result.token_usage.is_none());
|
||||||
|
|||||||
@@ -123,6 +123,8 @@ impl AgentRuntime for OpenAiRuntime {
|
|||||||
session_id: None,
|
session_id: None,
|
||||||
token_usage: Some(total_usage),
|
token_usage: Some(total_usage),
|
||||||
aborted_signal: false,
|
aborted_signal: false,
|
||||||
|
rate_limit_exit: false,
|
||||||
|
rate_limit_reset_at: None,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -137,6 +139,8 @@ impl AgentRuntime for OpenAiRuntime {
|
|||||||
session_id: None,
|
session_id: None,
|
||||||
token_usage: Some(total_usage),
|
token_usage: Some(total_usage),
|
||||||
aborted_signal: false,
|
aborted_signal: false,
|
||||||
|
rate_limit_exit: false,
|
||||||
|
rate_limit_reset_at: None,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -221,6 +225,8 @@ impl AgentRuntime for OpenAiRuntime {
|
|||||||
session_id: None,
|
session_id: None,
|
||||||
token_usage: Some(total_usage),
|
token_usage: Some(total_usage),
|
||||||
aborted_signal: false,
|
aborted_signal: false,
|
||||||
|
rate_limit_exit: false,
|
||||||
|
rate_limit_reset_at: None,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user