huskies: merge 882
This commit is contained in:
@@ -15,7 +15,7 @@ use crate::agent_log::AgentLogWriter;
|
||||
use crate::config::ProjectConfig;
|
||||
use crate::http::context::AppContext;
|
||||
use crate::io::watcher::WatcherEvent;
|
||||
use crate::slog_error;
|
||||
use crate::{slog, slog_error};
|
||||
|
||||
use super::super::super::runtime::{
|
||||
AgentRuntime, ClaudeCodeRuntime, GeminiRuntime, OpenAiRuntime, RuntimeContext,
|
||||
@@ -413,6 +413,95 @@ pub(super) async fn run_agent_spawn(
|
||||
.find_agent(&aname)
|
||||
.map(agent_config_stage)
|
||||
.unwrap_or_else(|| pipeline_stage(&aname));
|
||||
|
||||
// AC1/AC2 (bug 882): CLI crashed (SIGABRT) before establishing a
|
||||
// session. Respawn immediately without running gates or incrementing
|
||||
// retry_count. Cap consecutive crash-respawns at 5 to avoid
|
||||
// infinite loops; after the cap, block the story with a clear reason.
|
||||
if result.aborted_signal && stage != PipelineStage::Mergemaster {
|
||||
const ABORT_RESPAWN_CAP: u32 = 5;
|
||||
let db_key = format!("{sid}:abort_respawn_count");
|
||||
let count = crate::db::read_content(&db_key)
|
||||
.and_then(|s| s.trim().parse::<u32>().ok())
|
||||
.unwrap_or(0)
|
||||
+ 1;
|
||||
crate::db::write_content(&db_key, &count.to_string());
|
||||
|
||||
// Remove the agent entry from the pool and emit Done so that
|
||||
// any caller blocked on wait_for_agent is unblocked.
|
||||
let tx_done = {
|
||||
let mut lock = match agents_ref.lock() {
|
||||
Ok(a) => a,
|
||||
Err(_) => return,
|
||||
};
|
||||
if let Some(agent) = lock.remove(&key_clone) {
|
||||
agent.tx
|
||||
} else {
|
||||
tx_clone.clone()
|
||||
}
|
||||
};
|
||||
let _ = tx_done.send(AgentEvent::Done {
|
||||
story_id: sid.clone(),
|
||||
agent_name: aname.clone(),
|
||||
session_id: None,
|
||||
});
|
||||
AgentPool::notify_agent_state_changed(&watcher_tx_clone);
|
||||
|
||||
if count >= ABORT_RESPAWN_CAP {
|
||||
let reason = format!(
|
||||
"CLI crashed before establishing a session (signal=Aborted, no session) \
|
||||
{count} times in a row. Stopping to avoid an infinite respawn loop."
|
||||
);
|
||||
slog_error!(
|
||||
"[agents] Story '{sid}' blocked after {count} consecutive CLI crashes."
|
||||
);
|
||||
if let Err(e) = crate::agents::lifecycle::transition_to_blocked(&sid, &reason) {
|
||||
slog_error!("[agents] Failed to block '{sid}' after abort cap: {e}");
|
||||
}
|
||||
let _ = watcher_tx_clone.send(WatcherEvent::StoryBlocked {
|
||||
story_id: sid.clone(),
|
||||
reason,
|
||||
});
|
||||
} else {
|
||||
slog!(
|
||||
"[agents] CLI crashed before session for '{sid}:{aname}' \
|
||||
(abort respawn {count}/{ABORT_RESPAWN_CAP}). \
|
||||
Respawning without consuming a retry slot."
|
||||
);
|
||||
let agents_for_respawn = Arc::clone(&agents_ref);
|
||||
let watcher_for_respawn = watcher_tx_clone.clone();
|
||||
let sid_r = sid.clone();
|
||||
let aname_r = aname.clone();
|
||||
let root_r = project_root_clone.clone();
|
||||
let port_r = port_for_task;
|
||||
tokio::spawn(async move {
|
||||
let pool = AgentPool {
|
||||
agents: agents_for_respawn,
|
||||
port: port_r,
|
||||
child_killers: Arc::new(Mutex::new(HashMap::new())),
|
||||
watcher_tx: watcher_for_respawn,
|
||||
status_broadcaster: Arc::new(
|
||||
crate::service::status::StatusBroadcaster::new(),
|
||||
),
|
||||
};
|
||||
if let Err(e) = pool
|
||||
.start_agent(&root_r, &sid_r, Some(&aname_r), None, None)
|
||||
.await
|
||||
{
|
||||
slog_error!(
|
||||
"[agents] Failed to respawn '{aname_r}' for '{sid_r}' \
|
||||
after CLI crash: {e}"
|
||||
);
|
||||
}
|
||||
});
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
// Reset the abort-respawn counter on any non-aborted exit so that
|
||||
// a single successful run clears the consecutive-crash history.
|
||||
crate::db::delete_content(&format!("{sid}:abort_respawn_count"));
|
||||
|
||||
if stage == PipelineStage::Mergemaster {
|
||||
let (tx_done, done_session_id) = {
|
||||
let mut lock = match agents_ref.lock() {
|
||||
@@ -582,4 +671,57 @@ mod tests {
|
||||
"gate_output must appear in value"
|
||||
);
|
||||
}
|
||||
|
||||
/// AC3 (bug 882): simulates the abort-respawn counter mechanism to verify that
|
||||
/// retry_count is never bumped during consecutive aborted+no-session exits and
|
||||
/// that the abort counter reaches the cap (5) before blocking.
|
||||
#[test]
|
||||
fn abort_respawn_leaves_retry_count_unchanged_and_caps_at_five() {
|
||||
crate::crdt_state::init_for_test();
|
||||
crate::db::ensure_content_store();
|
||||
|
||||
let story_id = "9962_story_abort_respawn_882";
|
||||
crate::db::write_item_with_content(story_id, "2_current", "---\nname: Test\n---\n");
|
||||
|
||||
let db_key = format!("{story_id}:abort_respawn_count");
|
||||
const CAP: u32 = 5;
|
||||
|
||||
// Simulate CAP consecutive abort-before-session exits.
|
||||
for expected_count in 1u32..=CAP {
|
||||
// This is exactly the counter logic in run_agent_spawn's abort path.
|
||||
let count = crate::db::read_content(&db_key)
|
||||
.and_then(|s| s.trim().parse::<u32>().ok())
|
||||
.unwrap_or(0)
|
||||
+ 1;
|
||||
crate::db::write_content(&db_key, &count.to_string());
|
||||
assert_eq!(
|
||||
count, expected_count,
|
||||
"abort counter must increment by 1 each time"
|
||||
);
|
||||
|
||||
// retry_count must remain 0 — the abort path never calls bump_retry_count.
|
||||
let retry_count = crate::crdt_state::read_item(story_id)
|
||||
.and_then(|item| item.retry_count)
|
||||
.map(|r| r as u32)
|
||||
.unwrap_or(0);
|
||||
assert_eq!(
|
||||
retry_count, 0,
|
||||
"retry_count must not be incremented by the abort-respawn path \
|
||||
(got {retry_count} on cycle {expected_count})"
|
||||
);
|
||||
}
|
||||
|
||||
// After CAP cycles the counter equals the cap — the story would be blocked.
|
||||
let final_count: u32 = crate::db::read_content(&db_key)
|
||||
.and_then(|s| s.trim().parse().ok())
|
||||
.unwrap_or(0);
|
||||
assert_eq!(
|
||||
final_count, CAP,
|
||||
"counter must equal {CAP} after {CAP} abort cycles"
|
||||
);
|
||||
assert!(
|
||||
final_count >= CAP,
|
||||
"a count of {final_count} triggers blocking (>= {CAP})"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -399,14 +399,16 @@ fn run_agent_pty_blocking(
|
||||
|
||||
let _ = child.kill();
|
||||
let wait_result = child.wait();
|
||||
match &wait_result {
|
||||
let exit_ok = match &wait_result {
|
||||
Ok(status) => {
|
||||
slog!("[agent:{story_id}:{agent_name}] Child exited: {status:?}");
|
||||
status.success()
|
||||
}
|
||||
Err(e) => {
|
||||
slog!("[agent:{story_id}:{agent_name}] Child wait error: {e}");
|
||||
false
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Wait for the reader thread to finish so it releases the cloned PTY
|
||||
// master fd before we return. Without this, the next PTY spawn for the
|
||||
@@ -434,5 +436,6 @@ fn run_agent_pty_blocking(
|
||||
Ok(PtyResult {
|
||||
session_id,
|
||||
token_usage,
|
||||
exit_ok,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -10,6 +10,9 @@ use crate::agents::TokenUsage;
|
||||
pub(in crate::agents) struct PtyResult {
|
||||
pub session_id: Option<String>,
|
||||
pub token_usage: Option<TokenUsage>,
|
||||
/// `true` if the child process exited with a zero exit code (normal completion).
|
||||
/// `false` if the process was killed by a signal or exited with a non-zero code.
|
||||
pub exit_ok: bool,
|
||||
}
|
||||
|
||||
pub(super) fn composite_key(story_id: &str, agent_name: &str) -> String {
|
||||
|
||||
@@ -61,6 +61,9 @@ impl AgentRuntime for ClaudeCodeRuntime {
|
||||
|
||||
match pty_result {
|
||||
Ok(result) => Ok(RuntimeResult {
|
||||
// Abort+no-session: CLI crashed (e.g. SIGABRT) before emitting its
|
||||
// first "system" event. Detected by: non-zero exit AND no session.
|
||||
aborted_signal: !result.exit_ok && result.session_id.is_none(),
|
||||
session_id: result.session_id,
|
||||
token_usage: result.token_usage,
|
||||
}),
|
||||
@@ -90,6 +93,8 @@ impl AgentRuntime for ClaudeCodeRuntime {
|
||||
)
|
||||
.await?;
|
||||
Ok(RuntimeResult {
|
||||
aborted_signal: !fallback_result.exit_ok
|
||||
&& fallback_result.session_id.is_none(),
|
||||
session_id: fallback_result.session_id,
|
||||
token_usage: fallback_result.token_usage,
|
||||
})
|
||||
|
||||
@@ -135,6 +135,7 @@ impl AgentRuntime for GeminiRuntime {
|
||||
return Ok(RuntimeResult {
|
||||
session_id: None,
|
||||
token_usage: Some(total_usage),
|
||||
aborted_signal: false,
|
||||
});
|
||||
}
|
||||
|
||||
@@ -148,6 +149,7 @@ impl AgentRuntime for GeminiRuntime {
|
||||
return Ok(RuntimeResult {
|
||||
session_id: None,
|
||||
token_usage: Some(total_usage),
|
||||
aborted_signal: false,
|
||||
});
|
||||
}
|
||||
|
||||
@@ -248,6 +250,7 @@ impl AgentRuntime for GeminiRuntime {
|
||||
return Ok(RuntimeResult {
|
||||
session_id: None,
|
||||
token_usage: Some(total_usage),
|
||||
aborted_signal: false,
|
||||
});
|
||||
}
|
||||
|
||||
@@ -330,6 +333,7 @@ impl AgentRuntime for GeminiRuntime {
|
||||
Ok(RuntimeResult {
|
||||
session_id: None,
|
||||
token_usage: Some(total_usage),
|
||||
aborted_signal: false,
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
@@ -47,6 +47,14 @@ pub struct RuntimeContext {
|
||||
pub struct RuntimeResult {
|
||||
pub session_id: Option<String>,
|
||||
pub token_usage: Option<TokenUsage>,
|
||||
/// `true` when the process exited with a failure AND no session was established.
|
||||
///
|
||||
/// This indicates the Claude Code CLI crashed (e.g. SIGABRT from an assertion
|
||||
/// failure) before it could emit its first "system" event — the classic
|
||||
/// "signal=Aborted, Session: None" case (bug 882). The completion handler
|
||||
/// uses this flag to skip acceptance gates and respawn without consuming a
|
||||
/// retry slot. Always `false` for API-based runtimes (Gemini, OpenAI).
|
||||
pub aborted_signal: bool,
|
||||
}
|
||||
|
||||
/// Runtime status reported by the backend.
|
||||
@@ -138,6 +146,7 @@ mod tests {
|
||||
cache_read_input_tokens: 0,
|
||||
total_cost_usd: 0.01,
|
||||
}),
|
||||
aborted_signal: false,
|
||||
};
|
||||
assert_eq!(result.session_id, Some("sess-123".to_string()));
|
||||
assert!(result.token_usage.is_some());
|
||||
@@ -152,6 +161,7 @@ mod tests {
|
||||
let result = RuntimeResult {
|
||||
session_id: None,
|
||||
token_usage: None,
|
||||
aborted_signal: false,
|
||||
};
|
||||
assert!(result.session_id.is_none());
|
||||
assert!(result.token_usage.is_none());
|
||||
|
||||
@@ -122,6 +122,7 @@ impl AgentRuntime for OpenAiRuntime {
|
||||
return Ok(RuntimeResult {
|
||||
session_id: None,
|
||||
token_usage: Some(total_usage),
|
||||
aborted_signal: false,
|
||||
});
|
||||
}
|
||||
|
||||
@@ -135,6 +136,7 @@ impl AgentRuntime for OpenAiRuntime {
|
||||
return Ok(RuntimeResult {
|
||||
session_id: None,
|
||||
token_usage: Some(total_usage),
|
||||
aborted_signal: false,
|
||||
});
|
||||
}
|
||||
|
||||
@@ -218,6 +220,7 @@ impl AgentRuntime for OpenAiRuntime {
|
||||
return Ok(RuntimeResult {
|
||||
session_id: None,
|
||||
token_usage: Some(total_usage),
|
||||
aborted_signal: false,
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user