huskies: merge 1008

This commit is contained in:
dave
2026-05-14 08:41:49 +00:00
parent 761b6934f1
commit ebf58ef224
9 changed files with 170 additions and 2 deletions
@@ -163,6 +163,7 @@ mod tests {
project_root: None, project_root: None,
log_session_id: None, log_session_id: None,
merge_failure_reported: false, merge_failure_reported: false,
merge_success_reported: false,
throttled: None, throttled: None,
termination_reason: None, termination_reason: None,
status_buffer: None, status_buffer: None,
@@ -1227,3 +1227,70 @@ async fn coder_completion_with_test_evidence_and_zero_commits_does_not_advance()
"run_tests evidence must be cleared after pipeline advance consumes it" "run_tests evidence must be cleared after pipeline advance consumes it"
); );
} }
// ── bug 1008: successful mergemaster exit must not re-spawn or block ──────────
/// AC4 regression (bug 1008): when `merge_agent_work` returns success with
/// `story_archived: true`, the spawn.rs exit handler must:
/// (a) not re-spawn the mergemaster,
/// (b) transition the story to done (already done by the merge runner before
/// writing `ContentKey::MergeSuccess` — verified via CRDT stage), and
/// (c) clear `MergeMasterSpawnCount` so a future re-entry starts fresh.
///
/// This test simulates the exit handler path: it seeds `ContentKey::MergeSuccess`
/// (as the merge runner would), seeds the story as Done in the CRDT (as
/// `move_story_to_done` would), then exercises the spawn.rs logic directly.
#[test]
fn successful_mergemaster_exit_does_not_respawn_or_block() {
crate::crdt_state::init_for_test();
crate::db::ensure_content_store();
let story_id = "9908_story_merge_success_1008";
// Seed the story as Done in the CRDT (as move_story_to_done would have done).
crate::db::write_item_with_content(
story_id,
"5_done",
"---\nname: Merge Success Test\n---\n",
crate::db::ItemMeta::named("Merge Success Test"),
);
// Simulate the merge runner writing MergeSuccess BEFORE the agent exited.
crate::db::write_content(crate::db::ContentKey::MergeSuccess(story_id), "1");
// Simulate a pre-existing spawn count (e.g. a previous transient exit).
crate::db::write_content(crate::db::ContentKey::MergeMasterSpawnCount(story_id), "1");
// Simulate the exit handler: read the DB key (as spawn.rs does).
let merge_succeeded =
crate::db::read_content(crate::db::ContentKey::MergeSuccess(story_id)).is_some();
assert!(
merge_succeeded,
"MergeSuccess key must be present before the exit handler runs"
);
// Simulate what spawn.rs does on merge_succeeded=true.
crate::db::delete_content(crate::db::ContentKey::MergeSuccess(story_id));
crate::db::delete_content(crate::db::ContentKey::MergeMasterSpawnCount(story_id));
// (a) No re-spawn: MergeSuccess key is gone (no reassign triggered).
assert!(
crate::db::read_content(crate::db::ContentKey::MergeSuccess(story_id)).is_none(),
"(a) MergeSuccess key must be cleared after exit handler runs"
);
// (b) Story is still Done (not moved to blocked).
if let Ok(Some(item)) = crate::pipeline_state::read_typed(story_id) {
assert_eq!(
item.stage.dir_name(),
"done",
"(b) Story must remain in done after successful mergemaster exit"
);
}
// (c) Spawn count cleared so future re-entry starts fresh.
assert!(
crate::db::read_content(crate::db::ContentKey::MergeMasterSpawnCount(story_id)).is_none(),
"(c) MergeMasterSpawnCount must be cleared after successful merge exit"
);
}
@@ -27,6 +27,47 @@ impl AgentPool {
} }
} }
/// Record that the squash merge for `story_id` succeeded with the story
/// archived. Sets `merge_success_reported = true` on the active
/// mergemaster agent so the exit handler in `spawn.rs` can distinguish
/// a clean successful exit from a transient crash (bug 1008).
///
/// If the agent was already removed from the pool (race: `remove_agents_for_story`
/// ran first) this is a no-op; the `ContentKey::MergeSuccess` DB key written
/// by the caller acts as the authoritative fallback in that case.
pub fn set_merge_success_reported(&self, story_id: &str) {
match self.agents.lock() {
Ok(mut lock) => {
let found = lock.iter_mut().find(|(key, agent)| {
let key_story_id = key
.rsplit_once(':')
.map(|(sid, _)| sid)
.unwrap_or(key.as_str());
key_story_id == story_id
&& pipeline_stage(&agent.agent_name) == PipelineStage::Mergemaster
});
match found {
Some((_, agent)) => {
agent.merge_success_reported = true;
slog!(
"[pipeline] Merge success flag set for '{story_id}:{}'",
agent.agent_name
);
}
None => {
slog!(
"[pipeline] set_merge_success_reported: no running mergemaster \
for '{story_id}' — DB key is the authoritative fallback"
);
}
}
}
Err(e) => {
slog_error!("[pipeline] set_merge_success_reported: could not lock agents: {e}");
}
}
}
/// Record that the mergemaster agent for `story_id` explicitly reported a /// Record that the mergemaster agent for `story_id` explicitly reported a
/// merge failure via the `report_merge_failure` MCP tool. /// merge failure via the `report_merge_failure` MCP tool.
/// ///
@@ -268,6 +268,20 @@ impl AgentPool {
} }
} }
// AC1 (bug 1008): Before writing "completed" to the CRDT (which
// unblocks the mergemaster agent's get_merge_status poll), record
// that the merge succeeded. The exit handler in spawn.rs reads
// this flag/key to skip the transient-respawn logic for a clean
// successful exit. Must happen BEFORE the CRDT write below so the
// flag is always present when the agent sees "completed" and exits.
if success
&& let Ok(ref r) = report
&& r.story_archived
{
pool.set_merge_success_reported(&sid);
crate::db::write_content(crate::db::ContentKey::MergeSuccess(&sid), "1");
}
// Update CRDT with terminal status. // Update CRDT with terminal status.
match &report { match &report {
Ok(r) => { Ok(r) => {
+1
View File
@@ -322,6 +322,7 @@ impl AgentPool {
project_root: Some(project_root.to_path_buf()), project_root: Some(project_root.to_path_buf()),
log_session_id: Some(log_session_id.clone()), log_session_id: Some(log_session_id.clone()),
merge_failure_reported: false, merge_failure_reported: false,
merge_success_reported: false,
throttled: None, throttled: None,
termination_reason: None, termination_reason: None,
status_buffer: Some(status_buffer), status_buffer: Some(status_buffer),
+28 -2
View File
@@ -608,7 +608,7 @@ pub(super) async fn run_agent_spawn(
crate::db::delete_content(crate::db::ContentKey::AbortRespawnCount(&sid)); crate::db::delete_content(crate::db::ContentKey::AbortRespawnCount(&sid));
if stage == PipelineStage::Mergemaster { if stage == PipelineStage::Mergemaster {
let (tx_done, done_session_id, merge_failure_reported) = { let (tx_done, done_session_id, merge_failure_reported, merge_success_reported) = {
let mut lock = match agents_ref.lock() { let mut lock = match agents_ref.lock() {
Ok(a) => a, Ok(a) => a,
Err(_) => return, Err(_) => return,
@@ -618,11 +618,37 @@ pub(super) async fn run_agent_spawn(
agent.tx, agent.tx,
agent.session_id.or(result.session_id), agent.session_id.or(result.session_id),
agent.merge_failure_reported, agent.merge_failure_reported,
agent.merge_success_reported,
) )
} else { } else {
(tx_clone.clone(), result.session_id, false) (tx_clone.clone(), result.session_id, false, false)
} }
}; };
// AC1 (bug 1008): Check for a successful merge exit BEFORE the
// transient/failure path. The merge runner sets the flag on the
// agent record and writes a DB fallback key (for the race where
// remove_agents_for_story ran first) before marking the job
// "completed". A clean exit after success → no re-spawn, no
// blocked transition, spawn count reset.
let merge_succeeded = merge_success_reported
|| crate::db::read_content(crate::db::ContentKey::MergeSuccess(&sid)).is_some();
if merge_succeeded {
crate::db::delete_content(crate::db::ContentKey::MergeSuccess(&sid));
// AC3: reset spawn count so future re-entries start fresh.
crate::db::delete_content(crate::db::ContentKey::MergeMasterSpawnCount(&sid));
slog!(
"[agents] Mergemaster '{aname}' for '{sid}' exited after \
successful merge — no re-spawn."
);
let _ = tx_done.send(AgentEvent::Done {
story_id: sid.clone(),
agent_name: aname.clone(),
session_id: done_session_id,
});
AgentPool::notify_agent_state_changed(&watcher_tx_clone);
// Do NOT send WorkItem/reassign — story is already Done.
return;
}
// Clear any stale Running merge job so the next mergemaster // Clear any stale Running merge job so the next mergemaster
// can call start_merge_agent_work without hitting "Merge // can call start_merge_agent_work without hitting "Merge
// already in progress" (bug 498). // already in progress" (bug 498).
+7
View File
@@ -35,6 +35,7 @@ impl AgentPool {
project_root: None, project_root: None,
log_session_id: None, log_session_id: None,
merge_failure_reported: false, merge_failure_reported: false,
merge_success_reported: false,
throttled: None, throttled: None,
termination_reason: None, termination_reason: None,
status_buffer: None, status_buffer: None,
@@ -73,6 +74,7 @@ impl AgentPool {
project_root: None, project_root: None,
log_session_id: None, log_session_id: None,
merge_failure_reported: false, merge_failure_reported: false,
merge_success_reported: false,
throttled: None, throttled: None,
termination_reason: None, termination_reason: None,
status_buffer: None, status_buffer: None,
@@ -108,6 +110,7 @@ impl AgentPool {
project_root: Some(project_root), project_root: Some(project_root),
log_session_id: None, log_session_id: None,
merge_failure_reported: false, merge_failure_reported: false,
merge_success_reported: false,
throttled: None, throttled: None,
termination_reason: None, termination_reason: None,
status_buffer: None, status_buffer: None,
@@ -142,6 +145,7 @@ impl AgentPool {
project_root: None, project_root: None,
log_session_id: Some(log_session_id.to_string()), log_session_id: Some(log_session_id.to_string()),
merge_failure_reported: false, merge_failure_reported: false,
merge_success_reported: false,
throttled: None, throttled: None,
termination_reason: None, termination_reason: None,
status_buffer: None, status_buffer: None,
@@ -176,6 +180,7 @@ impl AgentPool {
project_root: None, project_root: None,
log_session_id: None, log_session_id: None,
merge_failure_reported: false, merge_failure_reported: false,
merge_success_reported: false,
throttled: None, throttled: None,
termination_reason: None, termination_reason: None,
status_buffer: Some(StatusEventBuffer::new(&self.status_broadcaster)), status_buffer: Some(StatusEventBuffer::new(&self.status_broadcaster)),
@@ -233,6 +238,7 @@ impl AgentPool {
project_root: Some(project_root), project_root: Some(project_root),
log_session_id: None, log_session_id: None,
merge_failure_reported: false, merge_failure_reported: false,
merge_success_reported: false,
throttled: None, throttled: None,
termination_reason: None, termination_reason: None,
status_buffer: None, status_buffer: None,
@@ -267,6 +273,7 @@ impl AgentPool {
project_root: None, project_root: None,
log_session_id: None, log_session_id: None,
merge_failure_reported: false, merge_failure_reported: false,
merge_success_reported: false,
throttled: None, throttled: None,
termination_reason: None, termination_reason: None,
status_buffer: None, status_buffer: None,
+5
View File
@@ -81,6 +81,11 @@ pub(super) struct StoryAgent {
/// worktree (which compiles fine) and returns `gates_passed=true` even /// worktree (which compiles fine) and returns `gates_passed=true` even
/// though the code was never squash-merged onto master. /// though the code was never squash-merged onto master.
pub(super) merge_failure_reported: bool, pub(super) merge_failure_reported: bool,
/// Set to `true` by the merge runner when the squash merge succeeds and
/// `story_archived: true` (bug 1008). Lets the exit handler in
/// `spawn.rs` recognise a clean successful exit and skip the
/// transient-respawn logic entirely.
pub(super) merge_success_reported: bool,
/// Set when a rate-limit event was received for this agent. /// Set when a rate-limit event was received for this agent.
/// Carries the expiry time so the scheduler can decide when to retry. /// Carries the expiry time so the scheduler can decide when to retry.
pub(super) throttled: Option<crate::agents::AgentExecution>, pub(super) throttled: Option<crate::agents::AgentExecution>,
+6
View File
@@ -37,6 +37,11 @@ pub enum ContentKey<'a> {
/// CRDT projection layer can reconstruct the typed kind on server restart /// CRDT projection layer can reconstruct the typed kind on server restart
/// without substring-scanning the gate output string (story 986). /// without substring-scanning the gate output string (story 986).
MergeFailureKind(&'a str), MergeFailureKind(&'a str),
/// Flag set by the merge runner when a squash merge succeeds with
/// `story_archived: true`. Written before the CRDT job status is set to
/// "completed" so the mergemaster agent exit handler in `spawn.rs` can
/// distinguish a clean success from a transient crash (bug 1008).
MergeSuccess(&'a str),
} }
impl<'a> ContentKey<'a> { impl<'a> ContentKey<'a> {
@@ -54,6 +59,7 @@ impl<'a> ContentKey<'a> {
ContentKey::CommitRecoveryPending(id) => format!("{id}:commit_recovery_pending"), ContentKey::CommitRecoveryPending(id) => format!("{id}:commit_recovery_pending"),
ContentKey::MergeFixupPending(id) => format!("{id}:merge_fixup_pending"), ContentKey::MergeFixupPending(id) => format!("{id}:merge_fixup_pending"),
ContentKey::MergeFailureKind(id) => format!("{id}:merge_failure_kind"), ContentKey::MergeFailureKind(id) => format!("{id}:merge_failure_kind"),
ContentKey::MergeSuccess(id) => format!("{id}:merge_success"),
} }
} }
} }