huskies: merge 1008
This commit is contained in:
@@ -163,6 +163,7 @@ mod tests {
|
||||
project_root: None,
|
||||
log_session_id: None,
|
||||
merge_failure_reported: false,
|
||||
merge_success_reported: false,
|
||||
throttled: None,
|
||||
termination_reason: None,
|
||||
status_buffer: None,
|
||||
|
||||
@@ -1227,3 +1227,70 @@ async fn coder_completion_with_test_evidence_and_zero_commits_does_not_advance()
|
||||
"run_tests evidence must be cleared after pipeline advance consumes it"
|
||||
);
|
||||
}
|
||||
|
||||
// ── bug 1008: successful mergemaster exit must not re-spawn or block ──────────
|
||||
|
||||
/// AC4 regression (bug 1008): when `merge_agent_work` returns success with
|
||||
/// `story_archived: true`, the spawn.rs exit handler must:
|
||||
/// (a) not re-spawn the mergemaster,
|
||||
/// (b) transition the story to done (already done by the merge runner before
|
||||
/// writing `ContentKey::MergeSuccess` — verified via CRDT stage), and
|
||||
/// (c) clear `MergeMasterSpawnCount` so a future re-entry starts fresh.
|
||||
///
|
||||
/// This test simulates the exit handler path: it seeds `ContentKey::MergeSuccess`
|
||||
/// (as the merge runner would), seeds the story as Done in the CRDT (as
|
||||
/// `move_story_to_done` would), then exercises the spawn.rs logic directly.
|
||||
#[test]
|
||||
fn successful_mergemaster_exit_does_not_respawn_or_block() {
|
||||
crate::crdt_state::init_for_test();
|
||||
crate::db::ensure_content_store();
|
||||
|
||||
let story_id = "9908_story_merge_success_1008";
|
||||
|
||||
// Seed the story as Done in the CRDT (as move_story_to_done would have done).
|
||||
crate::db::write_item_with_content(
|
||||
story_id,
|
||||
"5_done",
|
||||
"---\nname: Merge Success Test\n---\n",
|
||||
crate::db::ItemMeta::named("Merge Success Test"),
|
||||
);
|
||||
|
||||
// Simulate the merge runner writing MergeSuccess BEFORE the agent exited.
|
||||
crate::db::write_content(crate::db::ContentKey::MergeSuccess(story_id), "1");
|
||||
|
||||
// Simulate a pre-existing spawn count (e.g. a previous transient exit).
|
||||
crate::db::write_content(crate::db::ContentKey::MergeMasterSpawnCount(story_id), "1");
|
||||
|
||||
// Simulate the exit handler: read the DB key (as spawn.rs does).
|
||||
let merge_succeeded =
|
||||
crate::db::read_content(crate::db::ContentKey::MergeSuccess(story_id)).is_some();
|
||||
assert!(
|
||||
merge_succeeded,
|
||||
"MergeSuccess key must be present before the exit handler runs"
|
||||
);
|
||||
|
||||
// Simulate what spawn.rs does on merge_succeeded=true.
|
||||
crate::db::delete_content(crate::db::ContentKey::MergeSuccess(story_id));
|
||||
crate::db::delete_content(crate::db::ContentKey::MergeMasterSpawnCount(story_id));
|
||||
|
||||
// (a) No re-spawn: MergeSuccess key is gone (no reassign triggered).
|
||||
assert!(
|
||||
crate::db::read_content(crate::db::ContentKey::MergeSuccess(story_id)).is_none(),
|
||||
"(a) MergeSuccess key must be cleared after exit handler runs"
|
||||
);
|
||||
|
||||
// (b) Story is still Done (not moved to blocked).
|
||||
if let Ok(Some(item)) = crate::pipeline_state::read_typed(story_id) {
|
||||
assert_eq!(
|
||||
item.stage.dir_name(),
|
||||
"done",
|
||||
"(b) Story must remain in done after successful mergemaster exit"
|
||||
);
|
||||
}
|
||||
|
||||
// (c) Spawn count cleared so future re-entry starts fresh.
|
||||
assert!(
|
||||
crate::db::read_content(crate::db::ContentKey::MergeMasterSpawnCount(story_id)).is_none(),
|
||||
"(c) MergeMasterSpawnCount must be cleared after successful merge exit"
|
||||
);
|
||||
}
|
||||
|
||||
@@ -27,6 +27,47 @@ impl AgentPool {
|
||||
}
|
||||
}
|
||||
|
||||
/// Record that the squash merge for `story_id` succeeded with the story
|
||||
/// archived. Sets `merge_success_reported = true` on the active
|
||||
/// mergemaster agent so the exit handler in `spawn.rs` can distinguish
|
||||
/// a clean successful exit from a transient crash (bug 1008).
|
||||
///
|
||||
/// If the agent was already removed from the pool (race: `remove_agents_for_story`
|
||||
/// ran first) this is a no-op; the `ContentKey::MergeSuccess` DB key written
|
||||
/// by the caller acts as the authoritative fallback in that case.
|
||||
pub fn set_merge_success_reported(&self, story_id: &str) {
|
||||
match self.agents.lock() {
|
||||
Ok(mut lock) => {
|
||||
let found = lock.iter_mut().find(|(key, agent)| {
|
||||
let key_story_id = key
|
||||
.rsplit_once(':')
|
||||
.map(|(sid, _)| sid)
|
||||
.unwrap_or(key.as_str());
|
||||
key_story_id == story_id
|
||||
&& pipeline_stage(&agent.agent_name) == PipelineStage::Mergemaster
|
||||
});
|
||||
match found {
|
||||
Some((_, agent)) => {
|
||||
agent.merge_success_reported = true;
|
||||
slog!(
|
||||
"[pipeline] Merge success flag set for '{story_id}:{}'",
|
||||
agent.agent_name
|
||||
);
|
||||
}
|
||||
None => {
|
||||
slog!(
|
||||
"[pipeline] set_merge_success_reported: no running mergemaster \
|
||||
for '{story_id}' — DB key is the authoritative fallback"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
slog_error!("[pipeline] set_merge_success_reported: could not lock agents: {e}");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Record that the mergemaster agent for `story_id` explicitly reported a
|
||||
/// merge failure via the `report_merge_failure` MCP tool.
|
||||
///
|
||||
|
||||
@@ -268,6 +268,20 @@ impl AgentPool {
|
||||
}
|
||||
}
|
||||
|
||||
// AC1 (bug 1008): Before writing "completed" to the CRDT (which
|
||||
// unblocks the mergemaster agent's get_merge_status poll), record
|
||||
// that the merge succeeded. The exit handler in spawn.rs reads
|
||||
// this flag/key to skip the transient-respawn logic for a clean
|
||||
// successful exit. Must happen BEFORE the CRDT write below so the
|
||||
// flag is always present when the agent sees "completed" and exits.
|
||||
if success
|
||||
&& let Ok(ref r) = report
|
||||
&& r.story_archived
|
||||
{
|
||||
pool.set_merge_success_reported(&sid);
|
||||
crate::db::write_content(crate::db::ContentKey::MergeSuccess(&sid), "1");
|
||||
}
|
||||
|
||||
// Update CRDT with terminal status.
|
||||
match &report {
|
||||
Ok(r) => {
|
||||
|
||||
@@ -322,6 +322,7 @@ impl AgentPool {
|
||||
project_root: Some(project_root.to_path_buf()),
|
||||
log_session_id: Some(log_session_id.clone()),
|
||||
merge_failure_reported: false,
|
||||
merge_success_reported: false,
|
||||
throttled: None,
|
||||
termination_reason: None,
|
||||
status_buffer: Some(status_buffer),
|
||||
|
||||
@@ -608,7 +608,7 @@ pub(super) async fn run_agent_spawn(
|
||||
crate::db::delete_content(crate::db::ContentKey::AbortRespawnCount(&sid));
|
||||
|
||||
if stage == PipelineStage::Mergemaster {
|
||||
let (tx_done, done_session_id, merge_failure_reported) = {
|
||||
let (tx_done, done_session_id, merge_failure_reported, merge_success_reported) = {
|
||||
let mut lock = match agents_ref.lock() {
|
||||
Ok(a) => a,
|
||||
Err(_) => return,
|
||||
@@ -618,11 +618,37 @@ pub(super) async fn run_agent_spawn(
|
||||
agent.tx,
|
||||
agent.session_id.or(result.session_id),
|
||||
agent.merge_failure_reported,
|
||||
agent.merge_success_reported,
|
||||
)
|
||||
} else {
|
||||
(tx_clone.clone(), result.session_id, false)
|
||||
(tx_clone.clone(), result.session_id, false, false)
|
||||
}
|
||||
};
|
||||
// AC1 (bug 1008): Check for a successful merge exit BEFORE the
|
||||
// transient/failure path. The merge runner sets the flag on the
|
||||
// agent record and writes a DB fallback key (for the race where
|
||||
// remove_agents_for_story ran first) before marking the job
|
||||
// "completed". A clean exit after success → no re-spawn, no
|
||||
// blocked transition, spawn count reset.
|
||||
let merge_succeeded = merge_success_reported
|
||||
|| crate::db::read_content(crate::db::ContentKey::MergeSuccess(&sid)).is_some();
|
||||
if merge_succeeded {
|
||||
crate::db::delete_content(crate::db::ContentKey::MergeSuccess(&sid));
|
||||
// AC3: reset spawn count so future re-entries start fresh.
|
||||
crate::db::delete_content(crate::db::ContentKey::MergeMasterSpawnCount(&sid));
|
||||
slog!(
|
||||
"[agents] Mergemaster '{aname}' for '{sid}' exited after \
|
||||
successful merge — no re-spawn."
|
||||
);
|
||||
let _ = tx_done.send(AgentEvent::Done {
|
||||
story_id: sid.clone(),
|
||||
agent_name: aname.clone(),
|
||||
session_id: done_session_id,
|
||||
});
|
||||
AgentPool::notify_agent_state_changed(&watcher_tx_clone);
|
||||
// Do NOT send WorkItem/reassign — story is already Done.
|
||||
return;
|
||||
}
|
||||
// Clear any stale Running merge job so the next mergemaster
|
||||
// can call start_merge_agent_work without hitting "Merge
|
||||
// already in progress" (bug 498).
|
||||
|
||||
@@ -35,6 +35,7 @@ impl AgentPool {
|
||||
project_root: None,
|
||||
log_session_id: None,
|
||||
merge_failure_reported: false,
|
||||
merge_success_reported: false,
|
||||
throttled: None,
|
||||
termination_reason: None,
|
||||
status_buffer: None,
|
||||
@@ -73,6 +74,7 @@ impl AgentPool {
|
||||
project_root: None,
|
||||
log_session_id: None,
|
||||
merge_failure_reported: false,
|
||||
merge_success_reported: false,
|
||||
throttled: None,
|
||||
termination_reason: None,
|
||||
status_buffer: None,
|
||||
@@ -108,6 +110,7 @@ impl AgentPool {
|
||||
project_root: Some(project_root),
|
||||
log_session_id: None,
|
||||
merge_failure_reported: false,
|
||||
merge_success_reported: false,
|
||||
throttled: None,
|
||||
termination_reason: None,
|
||||
status_buffer: None,
|
||||
@@ -142,6 +145,7 @@ impl AgentPool {
|
||||
project_root: None,
|
||||
log_session_id: Some(log_session_id.to_string()),
|
||||
merge_failure_reported: false,
|
||||
merge_success_reported: false,
|
||||
throttled: None,
|
||||
termination_reason: None,
|
||||
status_buffer: None,
|
||||
@@ -176,6 +180,7 @@ impl AgentPool {
|
||||
project_root: None,
|
||||
log_session_id: None,
|
||||
merge_failure_reported: false,
|
||||
merge_success_reported: false,
|
||||
throttled: None,
|
||||
termination_reason: None,
|
||||
status_buffer: Some(StatusEventBuffer::new(&self.status_broadcaster)),
|
||||
@@ -233,6 +238,7 @@ impl AgentPool {
|
||||
project_root: Some(project_root),
|
||||
log_session_id: None,
|
||||
merge_failure_reported: false,
|
||||
merge_success_reported: false,
|
||||
throttled: None,
|
||||
termination_reason: None,
|
||||
status_buffer: None,
|
||||
@@ -267,6 +273,7 @@ impl AgentPool {
|
||||
project_root: None,
|
||||
log_session_id: None,
|
||||
merge_failure_reported: false,
|
||||
merge_success_reported: false,
|
||||
throttled: None,
|
||||
termination_reason: None,
|
||||
status_buffer: None,
|
||||
|
||||
@@ -81,6 +81,11 @@ pub(super) struct StoryAgent {
|
||||
/// worktree (which compiles fine) and returns `gates_passed=true` even
|
||||
/// though the code was never squash-merged onto master.
|
||||
pub(super) merge_failure_reported: bool,
|
||||
/// Set to `true` by the merge runner when the squash merge succeeds and
|
||||
/// `story_archived: true` (bug 1008). Lets the exit handler in
|
||||
/// `spawn.rs` recognise a clean successful exit and skip the
|
||||
/// transient-respawn logic entirely.
|
||||
pub(super) merge_success_reported: bool,
|
||||
/// Set when a rate-limit event was received for this agent.
|
||||
/// Carries the expiry time so the scheduler can decide when to retry.
|
||||
pub(super) throttled: Option<crate::agents::AgentExecution>,
|
||||
|
||||
@@ -37,6 +37,11 @@ pub enum ContentKey<'a> {
|
||||
/// CRDT projection layer can reconstruct the typed kind on server restart
|
||||
/// without substring-scanning the gate output string (story 986).
|
||||
MergeFailureKind(&'a str),
|
||||
/// Flag set by the merge runner when a squash merge succeeds with
|
||||
/// `story_archived: true`. Written before the CRDT job status is set to
|
||||
/// "completed" so the mergemaster agent exit handler in `spawn.rs` can
|
||||
/// distinguish a clean success from a transient crash (bug 1008).
|
||||
MergeSuccess(&'a str),
|
||||
}
|
||||
|
||||
impl<'a> ContentKey<'a> {
|
||||
@@ -54,6 +59,7 @@ impl<'a> ContentKey<'a> {
|
||||
ContentKey::CommitRecoveryPending(id) => format!("{id}:commit_recovery_pending"),
|
||||
ContentKey::MergeFixupPending(id) => format!("{id}:merge_fixup_pending"),
|
||||
ContentKey::MergeFailureKind(id) => format!("{id}:merge_failure_kind"),
|
||||
ContentKey::MergeSuccess(id) => format!("{id}:merge_success"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user