huskies: merge 882

This commit is contained in:
dave
2026-04-30 00:31:08 +00:00
parent a796bd933f
commit b0de86767a
7 changed files with 173 additions and 3 deletions
+143 -1
View File
@@ -15,7 +15,7 @@ use crate::agent_log::AgentLogWriter;
use crate::config::ProjectConfig;
use crate::http::context::AppContext;
use crate::io::watcher::WatcherEvent;
use crate::slog_error;
use crate::{slog, slog_error};
use super::super::super::runtime::{
AgentRuntime, ClaudeCodeRuntime, GeminiRuntime, OpenAiRuntime, RuntimeContext,
@@ -413,6 +413,95 @@ pub(super) async fn run_agent_spawn(
.find_agent(&aname)
.map(agent_config_stage)
.unwrap_or_else(|| pipeline_stage(&aname));
// AC1/AC2 (bug 882): CLI crashed (SIGABRT) before establishing a
// session. Respawn immediately without running gates or incrementing
// retry_count. Cap consecutive crash-respawns at 5 to avoid
// infinite loops; after the cap, block the story with a clear reason.
if result.aborted_signal && stage != PipelineStage::Mergemaster {
const ABORT_RESPAWN_CAP: u32 = 5;
let db_key = format!("{sid}:abort_respawn_count");
let count = crate::db::read_content(&db_key)
.and_then(|s| s.trim().parse::<u32>().ok())
.unwrap_or(0)
+ 1;
crate::db::write_content(&db_key, &count.to_string());
// Remove the agent entry from the pool and emit Done so that
// any caller blocked on wait_for_agent is unblocked.
let tx_done = {
let mut lock = match agents_ref.lock() {
Ok(a) => a,
Err(_) => return,
};
if let Some(agent) = lock.remove(&key_clone) {
agent.tx
} else {
tx_clone.clone()
}
};
let _ = tx_done.send(AgentEvent::Done {
story_id: sid.clone(),
agent_name: aname.clone(),
session_id: None,
});
AgentPool::notify_agent_state_changed(&watcher_tx_clone);
if count >= ABORT_RESPAWN_CAP {
let reason = format!(
"CLI crashed before establishing a session (signal=Aborted, no session) \
{count} times in a row. Stopping to avoid an infinite respawn loop."
);
slog_error!(
"[agents] Story '{sid}' blocked after {count} consecutive CLI crashes."
);
if let Err(e) = crate::agents::lifecycle::transition_to_blocked(&sid, &reason) {
slog_error!("[agents] Failed to block '{sid}' after abort cap: {e}");
}
let _ = watcher_tx_clone.send(WatcherEvent::StoryBlocked {
story_id: sid.clone(),
reason,
});
} else {
slog!(
"[agents] CLI crashed before session for '{sid}:{aname}' \
(abort respawn {count}/{ABORT_RESPAWN_CAP}). \
Respawning without consuming a retry slot."
);
let agents_for_respawn = Arc::clone(&agents_ref);
let watcher_for_respawn = watcher_tx_clone.clone();
let sid_r = sid.clone();
let aname_r = aname.clone();
let root_r = project_root_clone.clone();
let port_r = port_for_task;
tokio::spawn(async move {
let pool = AgentPool {
agents: agents_for_respawn,
port: port_r,
child_killers: Arc::new(Mutex::new(HashMap::new())),
watcher_tx: watcher_for_respawn,
status_broadcaster: Arc::new(
crate::service::status::StatusBroadcaster::new(),
),
};
if let Err(e) = pool
.start_agent(&root_r, &sid_r, Some(&aname_r), None, None)
.await
{
slog_error!(
"[agents] Failed to respawn '{aname_r}' for '{sid_r}' \
after CLI crash: {e}"
);
}
});
}
return;
}
// Reached by every exit that did not take the abort-respawn path above
// (a non-aborted exit, or an aborted Mergemaster run). Clear the counter so
// the consecutive-crash history starts over, whether or not this run succeeded.
crate::db::delete_content(&format!("{sid}:abort_respawn_count"));
if stage == PipelineStage::Mergemaster {
let (tx_done, done_session_id) = {
let mut lock = match agents_ref.lock() {
@@ -582,4 +671,57 @@ mod tests {
"gate_output must appear in value"
);
}
/// AC3 (bug 882): replays the abort-respawn bookkeeping by hand and checks two
/// things: the per-story `abort_respawn_count` entry grows by exactly one per
/// simulated crash until it reaches the cap of 5, and the story's `retry_count`
/// reads as 0 on every cycle.
///
/// The counter arithmetic is duplicated here rather than driven through
/// `run_agent_spawn`, so this guards the bookkeeping scheme rather than the
/// spawn path itself.
#[test]
fn abort_respawn_leaves_retry_count_unchanged_and_caps_at_five() {
    const CAP: u32 = 5;

    crate::crdt_state::init_for_test();
    crate::db::ensure_content_store();

    let story_id = "9962_story_abort_respawn_882";
    crate::db::write_item_with_content(story_id, "2_current", "---\nname: Test\n---\n");
    let db_key = format!("{story_id}:abort_respawn_count");

    // Current persisted abort counter; a missing or unparsable entry reads as 0.
    let stored_abort_count = || {
        crate::db::read_content(&db_key)
            .and_then(|raw| raw.trim().parse::<u32>().ok())
            .unwrap_or(0)
    };
    // Current retry_count of the story item; a missing item or field reads as 0.
    let stored_retry_count = || {
        crate::crdt_state::read_item(story_id)
            .and_then(|entry| entry.retry_count)
            .map(|n| n as u32)
            .unwrap_or(0)
    };

    // Play through CAP back-to-back abort-before-session exits.
    for expected_count in 1..=CAP {
        // Same read-increment-write step the abort path performs.
        let bumped = stored_abort_count() + 1;
        crate::db::write_content(&db_key, &bumped.to_string());
        assert_eq!(
            bumped, expected_count,
            "abort counter must increment by 1 each time"
        );

        // Nothing in this cycle touches retry_count, so it has to stay at 0.
        let retry_count = stored_retry_count();
        assert_eq!(
            retry_count, 0,
            "retry_count must not be incremented by the abort-respawn path \
             (got {retry_count} on cycle {expected_count})"
        );
    }

    // Once CAP cycles have run, the persisted counter sits at the cap, which is
    // the threshold the blocking check compares against.
    let final_count = stored_abort_count();
    assert_eq!(
        final_count, CAP,
        "counter must equal {CAP} after {CAP} abort cycles"
    );
    assert!(
        final_count >= CAP,
        "a count of {final_count} triggers blocking (>= {CAP})"
    );
}
}