huskies: merge 882

This commit is contained in:
dave
2026-04-30 00:31:08 +00:00
parent a796bd933f
commit b0de86767a
7 changed files with 173 additions and 3 deletions
+143 -1
View File
@@ -15,7 +15,7 @@ use crate::agent_log::AgentLogWriter;
use crate::config::ProjectConfig;
use crate::http::context::AppContext;
use crate::io::watcher::WatcherEvent;
use crate::slog_error;
use crate::{slog, slog_error};
use super::super::super::runtime::{
AgentRuntime, ClaudeCodeRuntime, GeminiRuntime, OpenAiRuntime, RuntimeContext,
@@ -413,6 +413,95 @@ pub(super) async fn run_agent_spawn(
.find_agent(&aname)
.map(agent_config_stage)
.unwrap_or_else(|| pipeline_stage(&aname));
// AC1/AC2 (bug 882): CLI crashed (SIGABRT) before establishing a
// session. Respawn immediately without running gates or incrementing
// retry_count. Cap consecutive crash-respawns at 5 to avoid
// infinite loops; after the cap, block the story with a clear reason.
if result.aborted_signal && stage != PipelineStage::Mergemaster {
const ABORT_RESPAWN_CAP: u32 = 5;
let db_key = format!("{sid}:abort_respawn_count");
let count = crate::db::read_content(&db_key)
.and_then(|s| s.trim().parse::<u32>().ok())
.unwrap_or(0)
+ 1;
crate::db::write_content(&db_key, &count.to_string());
// Remove the agent entry from the pool and emit Done so that
// any caller blocked on wait_for_agent is unblocked.
let tx_done = {
let mut lock = match agents_ref.lock() {
Ok(a) => a,
Err(_) => return,
};
if let Some(agent) = lock.remove(&key_clone) {
agent.tx
} else {
tx_clone.clone()
}
};
let _ = tx_done.send(AgentEvent::Done {
story_id: sid.clone(),
agent_name: aname.clone(),
session_id: None,
});
AgentPool::notify_agent_state_changed(&watcher_tx_clone);
if count >= ABORT_RESPAWN_CAP {
let reason = format!(
"CLI crashed before establishing a session (signal=Aborted, no session) \
{count} times in a row. Stopping to avoid an infinite respawn loop."
);
slog_error!(
"[agents] Story '{sid}' blocked after {count} consecutive CLI crashes."
);
if let Err(e) = crate::agents::lifecycle::transition_to_blocked(&sid, &reason) {
slog_error!("[agents] Failed to block '{sid}' after abort cap: {e}");
}
let _ = watcher_tx_clone.send(WatcherEvent::StoryBlocked {
story_id: sid.clone(),
reason,
});
} else {
slog!(
"[agents] CLI crashed before session for '{sid}:{aname}' \
(abort respawn {count}/{ABORT_RESPAWN_CAP}). \
Respawning without consuming a retry slot."
);
let agents_for_respawn = Arc::clone(&agents_ref);
let watcher_for_respawn = watcher_tx_clone.clone();
let sid_r = sid.clone();
let aname_r = aname.clone();
let root_r = project_root_clone.clone();
let port_r = port_for_task;
tokio::spawn(async move {
let pool = AgentPool {
agents: agents_for_respawn,
port: port_r,
child_killers: Arc::new(Mutex::new(HashMap::new())),
watcher_tx: watcher_for_respawn,
status_broadcaster: Arc::new(
crate::service::status::StatusBroadcaster::new(),
),
};
if let Err(e) = pool
.start_agent(&root_r, &sid_r, Some(&aname_r), None, None)
.await
{
slog_error!(
"[agents] Failed to respawn '{aname_r}' for '{sid_r}' \
after CLI crash: {e}"
);
}
});
}
return;
}
// Reset the abort-respawn counter on any non-aborted exit so that
// a single successful run clears the consecutive-crash history.
crate::db::delete_content(&format!("{sid}:abort_respawn_count"));
if stage == PipelineStage::Mergemaster {
let (tx_done, done_session_id) = {
let mut lock = match agents_ref.lock() {
@@ -582,4 +671,57 @@ mod tests {
"gate_output must appear in value"
);
}
/// AC3 (bug 882): simulates the abort-respawn counter mechanism to verify that
/// retry_count is never bumped during consecutive aborted+no-session exits and
/// that the abort counter reaches the cap (5) before blocking.
#[test]
fn abort_respawn_leaves_retry_count_unchanged_and_caps_at_five() {
crate::crdt_state::init_for_test();
crate::db::ensure_content_store();
let story_id = "9962_story_abort_respawn_882";
crate::db::write_item_with_content(story_id, "2_current", "---\nname: Test\n---\n");
let db_key = format!("{story_id}:abort_respawn_count");
const CAP: u32 = 5;
// Simulate CAP consecutive abort-before-session exits.
for expected_count in 1u32..=CAP {
// This is exactly the counter logic in run_agent_spawn's abort path.
let count = crate::db::read_content(&db_key)
.and_then(|s| s.trim().parse::<u32>().ok())
.unwrap_or(0)
+ 1;
crate::db::write_content(&db_key, &count.to_string());
assert_eq!(
count, expected_count,
"abort counter must increment by 1 each time"
);
// retry_count must remain 0 — the abort path never calls bump_retry_count.
let retry_count = crate::crdt_state::read_item(story_id)
.and_then(|item| item.retry_count)
.map(|r| r as u32)
.unwrap_or(0);
assert_eq!(
retry_count, 0,
"retry_count must not be incremented by the abort-respawn path \
(got {retry_count} on cycle {expected_count})"
);
}
// After CAP cycles the counter equals the cap — the story would be blocked.
let final_count: u32 = crate::db::read_content(&db_key)
.and_then(|s| s.trim().parse().ok())
.unwrap_or(0);
assert_eq!(
final_count, CAP,
"counter must equal {CAP} after {CAP} abort cycles"
);
assert!(
final_count >= CAP,
"a count of {final_count} triggers blocking (>= {CAP})"
);
}
}
+5 -2
View File
@@ -399,14 +399,16 @@ fn run_agent_pty_blocking(
let _ = child.kill();
let wait_result = child.wait();
match &wait_result {
let exit_ok = match &wait_result {
Ok(status) => {
slog!("[agent:{story_id}:{agent_name}] Child exited: {status:?}");
status.success()
}
Err(e) => {
slog!("[agent:{story_id}:{agent_name}] Child wait error: {e}");
false
}
}
};
// Wait for the reader thread to finish so it releases the cloned PTY
// master fd before we return. Without this, the next PTY spawn for the
@@ -434,5 +436,6 @@ fn run_agent_pty_blocking(
Ok(PtyResult {
session_id,
token_usage,
exit_ok,
})
}
+3
View File
@@ -10,6 +10,9 @@ use crate::agents::TokenUsage;
pub(in crate::agents) struct PtyResult {
pub session_id: Option<String>,
pub token_usage: Option<TokenUsage>,
/// `true` if the child process exited with a zero exit code (normal completion).
/// `false` if the process was killed by a signal or exited with a non-zero code.
pub exit_ok: bool,
}
pub(super) fn composite_key(story_id: &str, agent_name: &str) -> String {
+5
View File
@@ -61,6 +61,9 @@ impl AgentRuntime for ClaudeCodeRuntime {
match pty_result {
Ok(result) => Ok(RuntimeResult {
// Abort+no-session: CLI crashed (e.g. SIGABRT) before emitting its
// first "system" event. Detected by: non-zero exit AND no session.
aborted_signal: !result.exit_ok && result.session_id.is_none(),
session_id: result.session_id,
token_usage: result.token_usage,
}),
@@ -90,6 +93,8 @@ impl AgentRuntime for ClaudeCodeRuntime {
)
.await?;
Ok(RuntimeResult {
aborted_signal: !fallback_result.exit_ok
&& fallback_result.session_id.is_none(),
session_id: fallback_result.session_id,
token_usage: fallback_result.token_usage,
})
+4
View File
@@ -135,6 +135,7 @@ impl AgentRuntime for GeminiRuntime {
return Ok(RuntimeResult {
session_id: None,
token_usage: Some(total_usage),
aborted_signal: false,
});
}
@@ -148,6 +149,7 @@ impl AgentRuntime for GeminiRuntime {
return Ok(RuntimeResult {
session_id: None,
token_usage: Some(total_usage),
aborted_signal: false,
});
}
@@ -248,6 +250,7 @@ impl AgentRuntime for GeminiRuntime {
return Ok(RuntimeResult {
session_id: None,
token_usage: Some(total_usage),
aborted_signal: false,
});
}
@@ -330,6 +333,7 @@ impl AgentRuntime for GeminiRuntime {
Ok(RuntimeResult {
session_id: None,
token_usage: Some(total_usage),
aborted_signal: false,
})
}
+10
View File
@@ -47,6 +47,14 @@ pub struct RuntimeContext {
pub struct RuntimeResult {
pub session_id: Option<String>,
pub token_usage: Option<TokenUsage>,
/// `true` when the process exited with a failure AND no session was established.
///
/// This indicates the Claude Code CLI crashed (e.g. SIGABRT from an assertion
/// failure) before it could emit its first "system" event — the classic
/// "signal=Aborted, Session: None" case (bug 882). The completion handler
/// uses this flag to skip acceptance gates and respawn without consuming a
/// retry slot. Always `false` for API-based runtimes (Gemini, OpenAI).
pub aborted_signal: bool,
}
/// Runtime status reported by the backend.
@@ -138,6 +146,7 @@ mod tests {
cache_read_input_tokens: 0,
total_cost_usd: 0.01,
}),
aborted_signal: false,
};
assert_eq!(result.session_id, Some("sess-123".to_string()));
assert!(result.token_usage.is_some());
@@ -152,6 +161,7 @@ mod tests {
let result = RuntimeResult {
session_id: None,
token_usage: None,
aborted_signal: false,
};
assert!(result.session_id.is_none());
assert!(result.token_usage.is_none());
+3
View File
@@ -122,6 +122,7 @@ impl AgentRuntime for OpenAiRuntime {
return Ok(RuntimeResult {
session_id: None,
token_usage: Some(total_usage),
aborted_signal: false,
});
}
@@ -135,6 +136,7 @@ impl AgentRuntime for OpenAiRuntime {
return Ok(RuntimeResult {
session_id: None,
token_usage: Some(total_usage),
aborted_signal: false,
});
}
@@ -218,6 +220,7 @@ impl AgentRuntime for OpenAiRuntime {
return Ok(RuntimeResult {
session_id: None,
token_usage: Some(total_usage),
aborted_signal: false,
});
}