huskies: merge 1053

This commit is contained in:
dave
2026-05-14 18:32:43 +00:00
parent bb5abcd042
commit 96e227d8d4
7 changed files with 215 additions and 0 deletions
+161
View File
@@ -798,6 +798,89 @@ pub(super) async fn run_agent_spawn(
});
}
} else {
// Rate-limit exit: reset commit-recovery counters and respawn
// without consuming a retry slot (bug 1053).
if result.rate_limit_exit {
slog!(
"[agents] Rate-limit exit for '{sid}:{aname}': resetting \
commit-recovery counters and respawning without consuming \
a retry slot."
);
// Deleting these counters prevents the no-progress logic from
// treating this exit as an agent-stuck event.
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryPending(&sid));
crate::db::delete_content(
crate::db::ContentKey::CommitRecoveryDiffFingerprint(&sid),
);
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryTotalAttempts(
&sid,
));
// Remove agent from the pool and unblock any wait_for_agent callers.
let tx_done = {
let mut lock = match agents_ref.lock() {
Ok(a) => a,
Err(_) => return,
};
if let Some(agent) = lock.remove(&key_clone) {
agent.tx
} else {
tx_clone.clone()
}
};
let _ = tx_done.send(AgentEvent::Done {
story_id: sid.clone(),
agent_name: aname.clone(),
session_id: result.session_id.clone(),
});
AgentPool::notify_agent_state_changed(&watcher_tx_clone);
// Honour the backoff window (AC3): delay inside a background
// task so the current task returns immediately.
let reset_at = result.rate_limit_reset_at;
let session_for_resume = result.session_id.clone();
let agents_for_respawn = Arc::clone(&agents_ref);
let watcher_for_respawn = watcher_tx_clone.clone();
let sid_r = sid.clone();
let aname_r = aname.clone();
let root_r = project_root_clone.clone();
let port_r = port_for_task;
tokio::spawn(async move {
if let Some(reset_at) = reset_at {
let wait = (reset_at - chrono::Utc::now())
.to_std()
.unwrap_or(std::time::Duration::ZERO);
if !wait.is_zero() {
slog!(
"[agents] Rate-limit backoff for '{sid_r}': \
waiting {}s before respawn.",
wait.as_secs()
);
tokio::time::sleep(wait).await;
}
}
let pool = AgentPool {
agents: agents_for_respawn,
port: port_r,
child_killers: Arc::new(Mutex::new(HashMap::new())),
watcher_tx: watcher_for_respawn,
status_broadcaster: Arc::new(
crate::service::status::StatusBroadcaster::new(),
),
};
if let Err(e) = pool
.start_agent(&root_r, &sid_r, Some(&aname_r), None, session_for_resume)
.await
{
slog_error!(
"[agents] Failed to respawn '{aname_r}' for '{sid_r}' \
after rate-limit exit: {e}"
);
}
});
return;
}
// Server-owned completion: run acceptance gates automatically
// when the agent process exits normally.
super::super::pipeline::run_server_owned_completion(
@@ -1131,4 +1214,82 @@ mod tests {
"a count of {final_count} triggers blocking (>= {CAP})"
);
}
/// Bug 1053: 3 consecutive rate-limit exits must NOT block the story.
///
/// Each rate-limit exit resets `CommitRecoveryPending`,
/// `CommitRecoveryTotalAttempts`, and `CommitRecoveryDiffFingerprint` without
/// incrementing `retry_count`. After 3 simulated rate-limit exits the story
/// must not have been moved to the `blocked` stage.
///
/// NOTE: this test mirrors the handler's three `delete_content` calls rather
/// than driving the agent-spawn task itself, so it pins the counter-reset
/// contract (every counter is gone after every exit), not the respawn path.
#[test]
fn three_consecutive_rate_limit_exits_do_not_block() {
    use crate::db::ContentKey;

    crate::crdt_state::init_for_test();
    crate::db::ensure_content_store();
    let story_id = "9953_rate_limit_no_block_1053";
    crate::db::write_item_with_content(
        story_id,
        "2_current",
        "---\nname: Rate Limit Test\n---\n",
        crate::db::ItemMeta::named("Rate Limit Test"),
    );

    // Without the fix, each commit-recovery respawn would write
    // CommitRecoveryPending=N and CommitRecoveryTotalAttempts=N, and after
    // NO_PROGRESS_CAP (3) or TOTAL_ATTEMPTS_CAP (8) the story would be blocked.
    // With the fix, the rate-limit exit handler deletes all three counters
    // before the pipeline advance runs, so they never accumulate.
    const RATE_LIMIT_EXITS: u32 = 3;
    for cycle in 1..=RATE_LIMIT_EXITS {
        // Simulate: pipeline advance would have written these counts.
        let count = cycle.to_string();
        crate::db::write_content(ContentKey::CommitRecoveryPending(story_id), &count);
        crate::db::write_content(ContentKey::CommitRecoveryTotalAttempts(story_id), &count);
        crate::db::write_content(
            ContentKey::CommitRecoveryDiffFingerprint(story_id),
            "abc123",
        );

        // Rate-limit exit handler: reset all three counters (the fix).
        crate::db::delete_content(ContentKey::CommitRecoveryPending(story_id));
        crate::db::delete_content(ContentKey::CommitRecoveryDiffFingerprint(story_id));
        crate::db::delete_content(ContentKey::CommitRecoveryTotalAttempts(story_id));

        // Every counter must be cleared after each rate-limit exit. Checking
        // only one of them would let a leftover total-attempts count or a
        // stale diff fingerprint slip through unnoticed.
        assert!(
            crate::db::read_content(ContentKey::CommitRecoveryPending(story_id)).is_none(),
            "CommitRecoveryPending must be None after rate-limit exit #{cycle}"
        );
        assert!(
            crate::db::read_content(ContentKey::CommitRecoveryTotalAttempts(story_id)).is_none(),
            "CommitRecoveryTotalAttempts must be None after rate-limit exit #{cycle}"
        );
        assert!(
            crate::db::read_content(ContentKey::CommitRecoveryDiffFingerprint(story_id))
                .is_none(),
            "CommitRecoveryDiffFingerprint must be None after rate-limit exit #{cycle}"
        );

        // retry_count must remain 0 — the rate-limit path never calls
        // bump_retry_count.
        let retry_count = crate::crdt_state::read_item(story_id)
            .map(|item| item.retry_count())
            .unwrap_or(0);
        assert_eq!(
            retry_count, 0,
            "retry_count must not be incremented by rate-limit exits \
             (got {retry_count} on cycle {cycle})"
        );
    }

    // After RATE_LIMIT_EXITS consecutive rate-limit exits the story must NOT
    // be blocked — it is still available for the next respawn attempt.
    let item = crate::crdt_state::read_item(story_id)
        .expect("story must be in CRDT after rate-limit exits");
    assert_ne!(
        item.stage().dir_name(),
        "blocked",
        "story must NOT be blocked after {RATE_LIMIT_EXITS} consecutive rate-limit exits; \
         got stage: {}",
        item.stage().dir_name()
    );
}
}