huskies: merge 920

dave
2026-05-12 16:36:15 +00:00
parent 19b7edb60c
commit 86e8f2441f
3 changed files with 131 additions and 4 deletions
@@ -657,6 +657,92 @@ mod tests {
);
}
// ── Story 920: transient vs genuine mergemaster termination ──────────────
/// AC4 (transient): a mergemaster that was killed transiently (no
/// report_merge_failure, spawn count below cap) must be re-spawned by the
/// next auto-assign pass — `mergemaster_attempted` stays false.
#[tokio::test]
async fn transient_mergemaster_exit_allows_respawn() {
let tmp = tempfile::tempdir().unwrap();
let sk = tmp.path().join(".huskies");
std::fs::create_dir_all(&sk).unwrap();
std::fs::write(
sk.join("project.toml"),
"[[agent]]\nname = \"mergemaster\"\nstage = \"mergemaster\"\n",
)
.unwrap();
crate::crdt_state::init_for_test();
crate::db::ensure_content_store();
crate::db::write_item_with_content(
"920_story_transient",
"4_merge_failure",
"---\nname: Transient\nmerge_failure: \"CONFLICT (content): foo.rs\"\n---\n",
crate::db::ItemMeta::from_yaml(
"---\nname: Transient\nmerge_failure: \"CONFLICT (content): foo.rs\"\n---\n",
),
);
// Simulate two previous transient exits (below cap of 3) recorded in DB.
crate::db::write_content("920_story_transient:mergemaster_spawn_count", "2");
// mergemaster_attempted must still be false (transient exits don't set it).
let pool = AgentPool::new_test(3001);
pool.auto_assign_available_work(tmp.path()).await;
let agents = pool.agents.lock().unwrap();
let respawned = agents.iter().any(|(key, a)| {
key.contains("920_story_transient")
&& a.agent_name == "mergemaster"
&& matches!(a.status, AgentStatus::Pending | AgentStatus::Running)
});
assert!(
respawned,
"mergemaster must re-spawn after transient terminations while below cap"
);
}
/// AC4 (genuine): after report_merge_failure, mergemaster_attempted is set
/// to true and auto-assign must not trigger another re-spawn.
#[tokio::test]
async fn genuine_mergemaster_exit_no_respawn() {
let tmp = tempfile::tempdir().unwrap();
let sk = tmp.path().join(".huskies");
std::fs::create_dir_all(&sk).unwrap();
std::fs::write(
sk.join("project.toml"),
"[[agent]]\nname = \"mergemaster\"\nstage = \"mergemaster\"\n",
)
.unwrap();
crate::crdt_state::init_for_test();
crate::db::ensure_content_store();
// mergemaster_attempted is set by the exit path when a genuine give-up occurs.
crate::db::write_item_with_content(
"920_story_genuine",
"4_merge_failure",
"---\nname: Genuine\nmerge_failure: \"CONFLICT (content): bar.rs\"\n---\n",
crate::db::ItemMeta::from_yaml(
"---\nname: Genuine\nmerge_failure: \"CONFLICT (content): bar.rs\"\n---\n",
),
);
// The CRDT register is the sole authority; set it explicitly as the
// spawn exit path would after report_merge_failure.
crate::crdt_state::set_mergemaster_attempted("920_story_genuine", true);
let pool = AgentPool::new_test(3001);
pool.auto_assign_available_work(tmp.path()).await;
let agents = pool.agents.lock().unwrap();
let spawned = agents.iter().any(|(key, a)| {
key.contains("920_story_genuine")
&& a.agent_name == "mergemaster"
&& matches!(a.status, AgentStatus::Pending | AgentStatus::Running)
});
assert!(
!spawned,
"mergemaster must not re-spawn after genuine give-up (mergemaster_attempted=true)"
);
}
/// Two concurrent auto_assign_available_work calls must not assign the same
/// agent to two stories simultaneously. After both complete, at most one
/// Pending/Running entry must exist per agent name.
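The body of that pre-existing concurrency test sits outside this hunk; a minimal sketch of what it could look like, reusing the fixtures from the tests above (the test name, story IDs, and the tokio::join! pairing are illustrative assumptions, not taken from the commit):
#[tokio::test]
async fn concurrent_auto_assign_single_assignment_per_agent() {
    let tmp = tempfile::tempdir().unwrap();
    let sk = tmp.path().join(".huskies");
    std::fs::create_dir_all(&sk).unwrap();
    std::fs::write(
        sk.join("project.toml"),
        "[[agent]]\nname = \"mergemaster\"\nstage = \"mergemaster\"\n",
    )
    .unwrap();
    crate::crdt_state::init_for_test();
    crate::db::ensure_content_store();
    // Two stories competing for the same single mergemaster agent.
    for sid in ["920_story_a", "920_story_b"] {
        let yaml = "---\nname: Race\nmerge_failure: \"CONFLICT (content): baz.rs\"\n---\n";
        crate::db::write_item_with_content(
            sid,
            "4_merge_failure",
            yaml,
            crate::db::ItemMeta::from_yaml(yaml),
        );
    }
    let pool = AgentPool::new_test(3001);
    // Race two auto-assign passes against each other.
    tokio::join!(
        pool.auto_assign_available_work(tmp.path()),
        pool.auto_assign_available_work(tmp.path()),
    );
    let agents = pool.agents.lock().unwrap();
    // At most one Pending/Running entry may exist per agent name.
    let live = agents
        .iter()
        .filter(|(_, a)| {
            a.agent_name == "mergemaster"
                && matches!(a.status, AgentStatus::Pending | AgentStatus::Running)
        })
        .count();
    assert!(
        live <= 1,
        "mergemaster was assigned to {live} stories concurrently"
    );
}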
@@ -146,7 +146,6 @@ impl AgentPool {
"[auto-assign] Content conflict on '{story_id}'; \
auto-spawning mergemaster '{agent_name}'."
);
crate::crdt_state::set_mergemaster_attempted(story_id, true);
if let Err(e) = self
.start_agent(project_root, story_id, Some(&agent_name), None, None)
.await
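With the eager set_mergemaster_attempted removed from this spawn site, the skip decision is implied to live behind a read of the CRDT register during auto-assign; a sketch of that gate, assuming crate::crdt_state exposes a get_mergemaster_attempted reader (the reader and this helper are hypothetical, not part of this commit):
/// Hypothetical helper, not in this commit: whether auto-assign may
/// (re-)spawn a mergemaster for a merge-failure story.
fn may_spawn_mergemaster(story_id: &str) -> bool {
    // `get_mergemaster_attempted` is an assumed reader for the CRDT
    // register whose setter (`set_mergemaster_attempted`) appears in
    // the exit-path hunk below. Once a genuine give-up is recorded,
    // never re-spawn; transient exits stay eligible because the exit
    // path only sets the flag after report_merge_failure was called
    // or the respawn budget is exhausted.
    !crate::crdt_state::get_mergemaster_attempted(story_id)
}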
@@ -512,15 +512,19 @@ pub(super) async fn run_agent_spawn(
crate::db::delete_content(&format!("{sid}:abort_respawn_count"));
if stage == PipelineStage::Mergemaster {
let (tx_done, done_session_id) = {
let (tx_done, done_session_id, merge_failure_reported) = {
let mut lock = match agents_ref.lock() {
Ok(a) => a,
Err(_) => return,
};
if let Some(agent) = lock.remove(&key_clone) {
(agent.tx, agent.session_id.or(result.session_id))
(
agent.tx,
agent.session_id.or(result.session_id),
agent.merge_failure_reported,
)
} else {
(tx_clone.clone(), result.session_id)
(tx_clone.clone(), result.session_id, false)
}
};
// Clear any stale Running merge job so the next mergemaster
@@ -531,6 +535,44 @@ pub(super) async fn run_agent_spawn(
{
crate::crdt_state::delete_merge_job(&sid);
}
// Classify termination: genuine (report_merge_failure called, or
// the transient-respawn budget is exhausted) vs transient
// (watchdog / rate-limit / crash without an explicit give-up call).
// Only mark mergemaster_attempted on a genuine give-up so that
// transient exits can be re-spawned up to the cap (story 920).
const MERGEMASTER_RESPAWN_CAP: u32 = 3;
let spawn_count_key = format!("{sid}:mergemaster_spawn_count");
let is_genuine = if merge_failure_reported {
slog!(
"[agents] Mergemaster '{aname}' for '{sid}' gave up genuinely \
(report_merge_failure called)."
);
true
} else {
let count = crate::db::read_content(&spawn_count_key)
.and_then(|s| s.trim().parse::<u32>().ok())
.unwrap_or(0)
+ 1;
crate::db::write_content(&spawn_count_key, &count.to_string());
if count >= MERGEMASTER_RESPAWN_CAP {
slog!(
"[agents] Mergemaster '{aname}' for '{sid}' exhausted \
respawn budget ({count}/{MERGEMASTER_RESPAWN_CAP}); \
marking as permanently blocked."
);
true
} else {
slog!(
"[agents] Mergemaster '{aname}' for '{sid}' terminated \
transiently (spawn {count}/{MERGEMASTER_RESPAWN_CAP}); \
will re-spawn."
);
false
}
};
if is_genuine {
crate::crdt_state::set_mergemaster_attempted(&sid, true);
}
let _ = tx_done.send(AgentEvent::Done {
story_id: sid.clone(),
agent_name: aname.clone(),