huskies: merge 882
This commit is contained in:
@@ -15,7 +15,7 @@ use crate::agent_log::AgentLogWriter;
|
||||
use crate::config::ProjectConfig;
|
||||
use crate::http::context::AppContext;
|
||||
use crate::io::watcher::WatcherEvent;
|
||||
use crate::slog_error;
|
||||
use crate::{slog, slog_error};
|
||||
|
||||
use super::super::super::runtime::{
|
||||
AgentRuntime, ClaudeCodeRuntime, GeminiRuntime, OpenAiRuntime, RuntimeContext,
|
||||
@@ -413,6 +413,95 @@ pub(super) async fn run_agent_spawn(
|
||||
.find_agent(&aname)
|
||||
.map(agent_config_stage)
|
||||
.unwrap_or_else(|| pipeline_stage(&aname));
|
||||
|
||||
// AC1/AC2 (bug 882): CLI crashed (SIGABRT) before establishing a
|
||||
// session. Respawn immediately without running gates or incrementing
|
||||
// retry_count. Cap consecutive crash-respawns at 5 to avoid
|
||||
// infinite loops; after the cap, block the story with a clear reason.
|
||||
if result.aborted_signal && stage != PipelineStage::Mergemaster {
|
||||
const ABORT_RESPAWN_CAP: u32 = 5;
|
||||
let db_key = format!("{sid}:abort_respawn_count");
|
||||
let count = crate::db::read_content(&db_key)
|
||||
.and_then(|s| s.trim().parse::<u32>().ok())
|
||||
.unwrap_or(0)
|
||||
+ 1;
|
||||
crate::db::write_content(&db_key, &count.to_string());
|
||||
|
||||
// Remove the agent entry from the pool and emit Done so that
|
||||
// any caller blocked on wait_for_agent is unblocked.
|
||||
let tx_done = {
|
||||
let mut lock = match agents_ref.lock() {
|
||||
Ok(a) => a,
|
||||
Err(_) => return,
|
||||
};
|
||||
if let Some(agent) = lock.remove(&key_clone) {
|
||||
agent.tx
|
||||
} else {
|
||||
tx_clone.clone()
|
||||
}
|
||||
};
|
||||
let _ = tx_done.send(AgentEvent::Done {
|
||||
story_id: sid.clone(),
|
||||
agent_name: aname.clone(),
|
||||
session_id: None,
|
||||
});
|
||||
AgentPool::notify_agent_state_changed(&watcher_tx_clone);
|
||||
|
||||
if count >= ABORT_RESPAWN_CAP {
|
||||
let reason = format!(
|
||||
"CLI crashed before establishing a session (signal=Aborted, no session) \
|
||||
{count} times in a row. Stopping to avoid an infinite respawn loop."
|
||||
);
|
||||
slog_error!(
|
||||
"[agents] Story '{sid}' blocked after {count} consecutive CLI crashes."
|
||||
);
|
||||
if let Err(e) = crate::agents::lifecycle::transition_to_blocked(&sid, &reason) {
|
||||
slog_error!("[agents] Failed to block '{sid}' after abort cap: {e}");
|
||||
}
|
||||
let _ = watcher_tx_clone.send(WatcherEvent::StoryBlocked {
|
||||
story_id: sid.clone(),
|
||||
reason,
|
||||
});
|
||||
} else {
|
||||
slog!(
|
||||
"[agents] CLI crashed before session for '{sid}:{aname}' \
|
||||
(abort respawn {count}/{ABORT_RESPAWN_CAP}). \
|
||||
Respawning without consuming a retry slot."
|
||||
);
|
||||
let agents_for_respawn = Arc::clone(&agents_ref);
|
||||
let watcher_for_respawn = watcher_tx_clone.clone();
|
||||
let sid_r = sid.clone();
|
||||
let aname_r = aname.clone();
|
||||
let root_r = project_root_clone.clone();
|
||||
let port_r = port_for_task;
|
||||
tokio::spawn(async move {
|
||||
let pool = AgentPool {
|
||||
agents: agents_for_respawn,
|
||||
port: port_r,
|
||||
child_killers: Arc::new(Mutex::new(HashMap::new())),
|
||||
watcher_tx: watcher_for_respawn,
|
||||
status_broadcaster: Arc::new(
|
||||
crate::service::status::StatusBroadcaster::new(),
|
||||
),
|
||||
};
|
||||
if let Err(e) = pool
|
||||
.start_agent(&root_r, &sid_r, Some(&aname_r), None, None)
|
||||
.await
|
||||
{
|
||||
slog_error!(
|
||||
"[agents] Failed to respawn '{aname_r}' for '{sid_r}' \
|
||||
after CLI crash: {e}"
|
||||
);
|
||||
}
|
||||
});
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
// Reset the abort-respawn counter on any non-aborted exit so that
|
||||
// a single successful run clears the consecutive-crash history.
|
||||
crate::db::delete_content(&format!("{sid}:abort_respawn_count"));
|
||||
|
||||
if stage == PipelineStage::Mergemaster {
|
||||
let (tx_done, done_session_id) = {
|
||||
let mut lock = match agents_ref.lock() {
|
||||
@@ -582,4 +671,57 @@ mod tests {
|
||||
"gate_output must appear in value"
|
||||
);
|
||||
}
|
||||
|
||||
/// AC3 (bug 882): replays the abort-respawn counter bookkeeping by hand and
/// checks two things across consecutive aborted, session-less exits:
/// the story's `retry_count` stays at 0, and the abort counter climbs by one
/// per cycle until it reaches the cap (5), at which point blocking would apply.
#[test]
fn abort_respawn_leaves_retry_count_unchanged_and_caps_at_five() {
    const CAP: u32 = 5;

    crate::crdt_state::init_for_test();
    crate::db::ensure_content_store();

    let story_id = "9962_story_abort_respawn_882";
    crate::db::write_item_with_content(story_id, "2_current", "---\nname: Test\n---\n");

    let db_key = format!("{story_id}:abort_respawn_count");

    // Reads the persisted abort counter; a missing or unparsable value counts as 0.
    let stored_abort_count = || -> u32 {
        crate::db::read_content(&db_key)
            .and_then(|raw| raw.trim().parse::<u32>().ok())
            .unwrap_or(0)
    };

    // Simulate CAP consecutive abort-before-session exits.
    for expected_count in 1u32..=CAP {
        // Same read / add one / write-back sequence that run_agent_spawn's
        // abort path performs on this key.
        let bumped = stored_abort_count() + 1;
        crate::db::write_content(&db_key, &bumped.to_string());
        assert_eq!(
            bumped, expected_count,
            "abort counter must increment by 1 each time"
        );

        // Nothing in this cycle touches retry_count, so it must still read as 0
        // (an absent item or an unset field is treated as 0 as well).
        let retry_count = crate::crdt_state::read_item(story_id)
            .and_then(|entry| entry.retry_count)
            .map_or(0, |raw| raw as u32);
        assert_eq!(
            retry_count, 0,
            "retry_count must not be incremented by the abort-respawn path \
             (got {retry_count} on cycle {expected_count})"
        );
    }

    // After CAP cycles the counter equals the cap — the story would be blocked.
    let final_count: u32 = stored_abort_count();
    assert_eq!(
        final_count, CAP,
        "counter must equal {CAP} after {CAP} abort cycles"
    );
    assert!(
        final_count >= CAP,
        "a count of {final_count} triggers blocking (>= {CAP})"
    );
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user