huskies: merge 773

This commit is contained in:
dave
2026-04-28 10:19:43 +00:00
parent 83f7e41932
commit 7faacb6664
9 changed files with 167 additions and 133 deletions
+2 -2
View File
@@ -1,6 +1,6 @@
//! Merge operations — rebases agent work onto master and runs post-merge validation. //! Merge operations — rebases agent work onto master and runs post-merge validation.
use serde::Serialize; use serde::{Deserialize, Serialize};
mod squash; mod squash;
@@ -28,7 +28,7 @@ pub struct MergeJob {
} }
/// Result of a mergemaster merge operation. /// Result of a mergemaster merge operation.
#[derive(Debug, Serialize, Clone)] #[derive(Debug, Serialize, Deserialize, Clone)]
pub struct MergeReport { pub struct MergeReport {
pub story_id: String, pub story_id: String,
pub success: bool, pub success: bool,
@@ -311,14 +311,8 @@ impl AgentPool {
// Skip if a merge job is already running for this story (e.g. triggered // Skip if a merge job is already running for this story (e.g. triggered
// by a previous auto-assign pass or by pipeline advancement). // by a previous auto-assign pass or by pipeline advancement).
let already_running = self let already_running = crate::crdt_state::read_merge_job(story_id.as_str())
.merge_jobs .is_some_and(|job| job.status == "running");
.lock()
.ok()
.and_then(|jobs| jobs.get(story_id.as_str()).cloned())
.is_some_and(|job| {
matches!(job.status, crate::agents::merge::MergeJobStatus::Running)
});
if already_running { if already_running {
continue; continue;
} }
-5
View File
@@ -37,10 +37,6 @@ pub struct AgentPool {
/// an `AgentStateChanged` event is emitted so the frontend can refresh the /// an `AgentStateChanged` event is emitted so the frontend can refresh the
/// pipeline board without waiting for a filesystem event. /// pipeline board without waiting for a filesystem event.
watcher_tx: broadcast::Sender<WatcherEvent>, watcher_tx: broadcast::Sender<WatcherEvent>,
/// Tracks background merge jobs started by `merge_agent_work`, keyed by story_id.
/// The MCP tool returns immediately and the mergemaster agent polls
/// `get_merge_status` until the job reaches a terminal state.
merge_jobs: Arc<Mutex<HashMap<String, super::merge::MergeJob>>>,
/// Project-scoped status broadcaster. Each agent session creates a /// Project-scoped status broadcaster. Each agent session creates a
/// [`crate::service::status::buffer::StatusEventBuffer`] subscribed to this /// [`crate::service::status::buffer::StatusEventBuffer`] subscribed to this
/// broadcaster so it passively accumulates pipeline events without side effects. /// broadcaster so it passively accumulates pipeline events without side effects.
@@ -58,7 +54,6 @@ impl AgentPool {
port, port,
child_killers: Arc::new(Mutex::new(HashMap::new())), child_killers: Arc::new(Mutex::new(HashMap::new())),
watcher_tx: watcher_tx.clone(), watcher_tx: watcher_tx.clone(),
merge_jobs: Arc::new(Mutex::new(HashMap::new())),
status_broadcaster: Arc::new(StatusBroadcaster::new()), status_broadcaster: Arc::new(StatusBroadcaster::new()),
}; };
@@ -35,7 +35,6 @@ pub(crate) fn spawn_pipeline_advance(
port, port,
child_killers: Arc::new(Mutex::new(HashMap::new())), child_killers: Arc::new(Mutex::new(HashMap::new())),
watcher_tx, watcher_tx,
merge_jobs: Arc::new(Mutex::new(HashMap::new())),
status_broadcaster: Arc::new(crate::service::status::StatusBroadcaster::new()), status_broadcaster: Arc::new(crate::service::status::StatusBroadcaster::new()),
}; };
pool.run_pipeline_advance( pool.run_pipeline_advance(
@@ -119,7 +119,6 @@ impl AgentPool {
port: self.port, port: self.port,
child_killers: Arc::clone(&self.child_killers), child_killers: Arc::clone(&self.child_killers),
watcher_tx: self.watcher_tx.clone(), watcher_tx: self.watcher_tx.clone(),
merge_jobs: Arc::clone(&self.merge_jobs),
status_broadcaster: Arc::clone(&self.status_broadcaster), status_broadcaster: Arc::clone(&self.status_broadcaster),
}; };
let sid = story_id.to_string(); let sid = story_id.to_string();
+156 -104
View File
@@ -27,6 +27,28 @@ fn is_process_alive(_pid: u32) -> bool {
true true
} }
/// Encode a `pid` into the CRDT `error` field for a Running merge job.
fn encode_pid(pid: u32) -> String {
format!("{{\"pid\":{pid}}}")
}
/// Decode a `pid` from the CRDT `error` field of a Running merge job.
fn decode_pid(error: Option<&str>) -> u32 {
error
.and_then(|e| serde_json::from_str::<serde_json::Value>(e).ok())
.and_then(|v| v["pid"].as_u64())
.map(|p| p as u32)
.unwrap_or(0)
}
/// Current Unix timestamp in seconds as `f64`.
fn unix_now() -> f64 {
std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.unwrap_or_default()
.as_secs_f64()
}
impl AgentPool { impl AgentPool {
/// Start the merge pipeline as a background task. /// Start the merge pipeline as a background task.
/// ///
@@ -45,60 +67,50 @@ impl AgentPool {
// applying the double-start guard. This handles the case where the // applying the double-start guard. This handles the case where the
// server crashed mid-merge: the next attempt finds a Running entry // server crashed mid-merge: the next attempt finds a Running entry
// whose owning process is gone and clears it automatically. // whose owning process is gone and clears it automatically.
{ if let Some(jobs) = crate::crdt_state::read_all_merge_jobs() {
let mut jobs = self.merge_jobs.lock().map_err(|e| e.to_string())?; for job in jobs {
let stale_ids: Vec<String> = jobs if job.status == "running" {
.iter() let pid = decode_pid(job.error.as_deref());
.filter_map(|(sid, job)| { if pid > 0 && !is_process_alive(pid) {
if matches!(job.status, crate::agents::merge::MergeJobStatus::Running) slog!(
&& !is_process_alive(job.pid) "[merge] Cleared stale Running merge job for '{}' (dead pid {})",
{ job.story_id,
Some(sid.clone()) pid
} else { );
None crate::crdt_state::delete_merge_job(&job.story_id);
}
})
.collect();
for sid in stale_ids {
let dead_pid = jobs[&sid].pid;
jobs.remove(&sid);
slog!("[merge] Cleared stale Running merge job for '{sid}' (dead pid {dead_pid})");
}
}
// Guard against double-starts; clear any completed/failed entry so the
// caller can retry without needing to call a separate cleanup step.
{
let mut jobs = self.merge_jobs.lock().map_err(|e| e.to_string())?;
if let Some(job) = jobs.get(story_id) {
match &job.status {
crate::agents::merge::MergeJobStatus::Running => {
return Err(format!(
"Merge already in progress for '{story_id}'. \
Use get_merge_status to poll for completion."
));
}
// Completed or Failed: clear stale entry so we can start fresh.
_ => {
jobs.remove(story_id);
} }
} }
} }
} }
// Insert Running job. // Guard against double-starts; clear any completed/failed entry so the
{ // caller can retry without needing to call a separate cleanup step.
let mut jobs = self.merge_jobs.lock().map_err(|e| e.to_string())?; if let Some(job) = crate::crdt_state::read_merge_job(story_id) {
jobs.insert( match job.status.as_str() {
story_id.to_string(), "running" => {
crate::agents::merge::MergeJob { return Err(format!(
story_id: story_id.to_string(), "Merge already in progress for '{story_id}'. \
status: crate::agents::merge::MergeJobStatus::Running, Use get_merge_status to poll for completion."
pid: std::process::id(), ));
}, }
); // Completed or Failed: clear stale entry so we can start fresh.
_ => {
crate::crdt_state::delete_merge_job(story_id);
}
}
} }
// Insert Running job into CRDT.
let started_at = unix_now();
let pid = std::process::id();
crate::crdt_state::write_merge_job(
story_id,
"running",
started_at,
None,
Some(&encode_pid(pid)),
);
let pool = Arc::clone(self); let pool = Arc::clone(self);
let root = project_root.to_path_buf(); let root = project_root.to_path_buf();
let sid = story_id.to_string(); let sid = story_id.to_string();
@@ -107,6 +119,8 @@ impl AgentPool {
let report = pool.run_merge_pipeline(&root, &sid).await; let report = pool.run_merge_pipeline(&root, &sid).await;
let success = matches!(&report, Ok(r) if r.success); let success = matches!(&report, Ok(r) if r.success);
let finished_at = unix_now();
// On any failure: record merge_failure in CRDT and emit notification. // On any failure: record merge_failure in CRDT and emit notification.
if !success { if !success {
let reason = match &report { let reason = match &report {
@@ -154,15 +168,29 @@ impl AgentPool {
} }
} }
let status = match report { // Update CRDT with terminal status.
Ok(r) => crate::agents::merge::MergeJobStatus::Completed(r), match &report {
Err(e) => crate::agents::merge::MergeJobStatus::Failed(e), Ok(r) => {
}; let report_json = serde_json::to_string(r).unwrap_or_else(|_| String::new());
if let Ok(mut jobs) = pool.merge_jobs.lock() crate::crdt_state::write_merge_job(
&& let Some(job) = jobs.get_mut(&sid) &sid,
{ "completed",
job.status = status; started_at,
Some(finished_at),
Some(&report_json),
);
}
Err(e) => {
crate::crdt_state::write_merge_job(
&sid,
"failed",
started_at,
Some(finished_at),
Some(e),
);
}
} }
if !success { if !success {
pool.auto_assign_available_work(&root).await; pool.auto_assign_available_work(&root).await;
} }
@@ -233,11 +261,46 @@ impl AgentPool {
} }
/// Check the status of a background merge job. /// Check the status of a background merge job.
///
/// Reads from the CRDT `merge_jobs` collection and reconstructs the full
/// [`MergeJob`] struct. The CRDT `error` field encodes the `pid` for
/// Running jobs (as `{"pid":N}`) and the serialised [`MergeReport`] for
/// Completed jobs.
pub fn get_merge_status(&self, story_id: &str) -> Option<crate::agents::merge::MergeJob> { pub fn get_merge_status(&self, story_id: &str) -> Option<crate::agents::merge::MergeJob> {
self.merge_jobs let view = crate::crdt_state::read_merge_job(story_id)?;
.lock() let (status, pid) = match view.status.as_str() {
.ok() "running" => {
.and_then(|jobs| jobs.get(story_id).cloned()) let pid = decode_pid(view.error.as_deref());
(crate::agents::merge::MergeJobStatus::Running, pid)
}
"completed" => {
let report = view
.error
.as_deref()
.and_then(|e| serde_json::from_str::<crate::agents::merge::MergeReport>(e).ok())
.unwrap_or_else(|| crate::agents::merge::MergeReport {
story_id: story_id.to_string(),
success: false,
had_conflicts: false,
conflicts_resolved: false,
conflict_details: None,
gates_passed: false,
gate_output: String::new(),
worktree_cleaned_up: false,
story_archived: false,
});
(crate::agents::merge::MergeJobStatus::Completed(report), 0)
}
_ => {
let err = view.error.unwrap_or_else(|| "Unknown error".to_string());
(crate::agents::merge::MergeJobStatus::Failed(err), 0)
}
};
Some(crate::agents::merge::MergeJob {
story_id: story_id.to_string(),
status,
pid,
})
} }
/// Trigger a deterministic server-side merge for `story_id` without spawning /// Trigger a deterministic server-side merge for `story_id` without spawning
@@ -253,7 +316,6 @@ impl AgentPool {
port: self.port, port: self.port,
child_killers: Arc::clone(&self.child_killers), child_killers: Arc::clone(&self.child_killers),
watcher_tx: self.watcher_tx.clone(), watcher_tx: self.watcher_tx.clone(),
merge_jobs: Arc::clone(&self.merge_jobs),
status_broadcaster: Arc::clone(&self.status_broadcaster), status_broadcaster: Arc::clone(&self.status_broadcaster),
}); });
if let Err(e) = pool.start_merge_agent_work(project_root, story_id) { if let Err(e) = pool.start_merge_agent_work(project_root, story_id) {
@@ -345,27 +407,25 @@ mod tests {
async fn stale_running_merge_job_is_cleared_and_retry_succeeds() { async fn stale_running_merge_job_is_cleared_and_retry_succeeds() {
use tempfile::tempdir; use tempfile::tempdir;
crate::crdt_state::init_for_test();
let tmp = tempdir().unwrap(); let tmp = tempdir().unwrap();
let repo = tmp.path(); let repo = tmp.path();
init_git_repo(repo); init_git_repo(repo);
let pool = Arc::new(AgentPool::new_test(3001)); let pool = Arc::new(AgentPool::new_test(3001));
// Inject a stale Running entry, simulating a mergemaster that died // Inject a stale Running entry via CRDT, simulating a mergemaster that
// before the merge pipeline completed. Use the current process PID so // died before the merge pipeline completed. Use the current process PID
// the stale-lock sweep (which checks whether the PID is alive) does NOT // so the stale-lock sweep does NOT auto-remove it — this test verifies
// auto-remove it — this test verifies the double-start guard path. // the double-start guard path.
{ crate::crdt_state::write_merge_job(
let mut jobs = pool.merge_jobs.lock().unwrap(); "77_story_stale",
jobs.insert( "running",
"77_story_stale".to_string(), 1.0,
MergeJob { None,
story_id: "77_story_stale".to_string(), Some(&encode_pid(std::process::id())),
status: MergeJobStatus::Running, );
pid: std::process::id(),
},
);
}
// With a stale Running entry, start_merge_agent_work must be blocked. // With a stale Running entry, start_merge_agent_work must be blocked.
let blocked = pool.start_merge_agent_work(repo, "77_story_stale"); let blocked = pool.start_merge_agent_work(repo, "77_story_stale");
@@ -380,14 +440,7 @@ mod tests {
); );
// Simulate the mergemaster exit path: clear the stale Running entry. // Simulate the mergemaster exit path: clear the stale Running entry.
{ crate::crdt_state::delete_merge_job("77_story_stale");
let mut jobs = pool.merge_jobs.lock().unwrap();
if let Some(job) = jobs.get("77_story_stale")
&& matches!(job.status, MergeJobStatus::Running)
{
jobs.remove("77_story_stale");
}
}
// After clearing, start_merge_agent_work must succeed (it will fail // After clearing, start_merge_agent_work must succeed (it will fail
// the pipeline because there's no feature branch, but it must not be // the pipeline because there's no feature branch, but it must not be
@@ -410,6 +463,8 @@ mod tests {
async fn stale_merge_job_with_dead_pid_is_swept_on_new_merge_attempt() { async fn stale_merge_job_with_dead_pid_is_swept_on_new_merge_attempt() {
use tempfile::tempdir; use tempfile::tempdir;
crate::crdt_state::init_for_test();
let tmp = tempdir().unwrap(); let tmp = tempdir().unwrap();
let repo = tmp.path(); let repo = tmp.path();
init_git_repo(repo); init_git_repo(repo);
@@ -425,25 +480,18 @@ mod tests {
pid pid
}; };
// Seed merge_jobs with a Running entry whose PID is dead. // Seed CRDT merge_jobs with a Running entry whose PID is dead.
{ crate::crdt_state::write_merge_job(
let mut jobs = pool.merge_jobs.lock().unwrap(); "719_stale_other",
jobs.insert( "running",
"719_stale_other".to_string(), 1.0,
MergeJob { None,
story_id: "719_stale_other".to_string(), Some(&encode_pid(dead_pid)),
status: MergeJobStatus::Running, );
pid: dead_pid,
},
);
}
// Verify the entry is present before the sweep. // Verify the entry is present before the sweep.
assert!( assert!(
pool.merge_jobs crate::crdt_state::read_merge_job("719_stale_other").is_some(),
.lock()
.unwrap()
.contains_key("719_stale_other"),
"stale entry should exist before new merge attempt" "stale entry should exist before new merge attempt"
); );
@@ -453,11 +501,7 @@ mod tests {
// The stale entry must have been cleared. // The stale entry must have been cleared.
assert!( assert!(
!pool crate::crdt_state::read_merge_job("719_stale_other").is_none(),
.merge_jobs
.lock()
.unwrap()
.contains_key("719_stale_other"),
"stale entry with dead pid must be removed when a new merge attempt starts" "stale entry with dead pid must be removed when a new merge attempt starts"
); );
} }
@@ -485,6 +529,7 @@ mod tests {
async fn merge_agent_work_returns_error_when_branch_not_found() { async fn merge_agent_work_returns_error_when_branch_not_found() {
use tempfile::tempdir; use tempfile::tempdir;
crate::crdt_state::init_for_test();
let tmp = tempdir().unwrap(); let tmp = tempdir().unwrap();
let repo = tmp.path(); let repo = tmp.path();
init_git_repo(repo); init_git_repo(repo);
@@ -509,6 +554,7 @@ mod tests {
use std::fs; use std::fs;
use tempfile::tempdir; use tempfile::tempdir;
crate::crdt_state::init_for_test();
let tmp = tempdir().unwrap(); let tmp = tempdir().unwrap();
let repo = tmp.path(); let repo = tmp.path();
init_git_repo(repo); init_git_repo(repo);
@@ -686,6 +732,7 @@ mod tests {
use std::fs; use std::fs;
use tempfile::tempdir; use tempfile::tempdir;
crate::crdt_state::init_for_test();
let tmp = tempdir().unwrap(); let tmp = tempdir().unwrap();
let repo = tmp.path(); let repo = tmp.path();
init_git_repo(repo); init_git_repo(repo);
@@ -802,6 +849,7 @@ mod tests {
use std::fs; use std::fs;
use tempfile::tempdir; use tempfile::tempdir;
crate::crdt_state::init_for_test();
let tmp = tempdir().unwrap(); let tmp = tempdir().unwrap();
let repo = tmp.path(); let repo = tmp.path();
init_git_repo(repo); init_git_repo(repo);
@@ -884,6 +932,7 @@ mod tests {
use std::fs; use std::fs;
use tempfile::tempdir; use tempfile::tempdir;
crate::crdt_state::init_for_test();
let tmp = tempdir().unwrap(); let tmp = tempdir().unwrap();
let repo = tmp.path(); let repo = tmp.path();
init_git_repo(repo); init_git_repo(repo);
@@ -993,6 +1042,7 @@ mod tests {
use std::fs; use std::fs;
use tempfile::tempdir; use tempfile::tempdir;
crate::crdt_state::init_for_test();
let tmp = tempdir().unwrap(); let tmp = tempdir().unwrap();
let repo = tmp.path(); let repo = tmp.path();
init_git_repo(repo); init_git_repo(repo);
@@ -1111,6 +1161,7 @@ mod tests {
use std::os::unix::fs::PermissionsExt; use std::os::unix::fs::PermissionsExt;
use tempfile::tempdir; use tempfile::tempdir;
crate::crdt_state::init_for_test();
let tmp = tempdir().unwrap(); let tmp = tempdir().unwrap();
let repo = tmp.path(); let repo = tmp.path();
init_git_repo(repo); init_git_repo(repo);
@@ -1226,6 +1277,7 @@ mod tests {
use std::fs; use std::fs;
use tempfile::tempdir; use tempfile::tempdir;
crate::crdt_state::init_for_test();
let tmp = tempdir().unwrap(); let tmp = tempdir().unwrap();
let repo = tmp.path(); let repo = tmp.path();
init_git_repo(repo); init_git_repo(repo);
+4 -4
View File
@@ -35,11 +35,11 @@ impl AgentPool {
/// Used by `list_agents` and `get_pipeline_status` to surface in-flight /// Used by `list_agents` and `get_pipeline_status` to surface in-flight
/// deterministic merges that hold the merge lock but have no agent entry. /// deterministic merges that hold the merge lock but have no agent entry.
pub fn list_running_merges(&self) -> Result<Vec<String>, String> { pub fn list_running_merges(&self) -> Result<Vec<String>, String> {
let jobs = self.merge_jobs.lock().map_err(|e| e.to_string())?; let jobs = crate::crdt_state::read_all_merge_jobs().unwrap_or_default();
Ok(jobs Ok(jobs
.values() .into_iter()
.filter(|job| matches!(job.status, crate::agents::merge::MergeJobStatus::Running)) .filter(|job| job.status == "running")
.map(|job| job.story_id.clone()) .map(|job| job.story_id)
.collect()) .collect())
} }
-1
View File
@@ -350,7 +350,6 @@ impl AgentPool {
log_writer.clone(), log_writer.clone(),
self.child_killers.clone(), self.child_killers.clone(),
self.watcher_tx.clone(), self.watcher_tx.clone(),
Arc::clone(&self.merge_jobs),
inactivity_timeout_secs, inactivity_timeout_secs,
prior_events, prior_events,
)); ));
+3 -7
View File
@@ -16,7 +16,6 @@ use crate::config::ProjectConfig;
use crate::io::watcher::WatcherEvent; use crate::io::watcher::WatcherEvent;
use crate::slog_error; use crate::slog_error;
use super::super::super::merge::MergeJob;
use super::super::super::runtime::{ use super::super::super::runtime::{
AgentRuntime, ClaudeCodeRuntime, GeminiRuntime, OpenAiRuntime, RuntimeContext, AgentRuntime, ClaudeCodeRuntime, GeminiRuntime, OpenAiRuntime, RuntimeContext,
}; };
@@ -46,7 +45,6 @@ pub(super) async fn run_agent_spawn(
log_writer: Option<Arc<Mutex<AgentLogWriter>>>, log_writer: Option<Arc<Mutex<AgentLogWriter>>>,
child_killers: Arc<Mutex<HashMap<String, Box<dyn ChildKiller + Send + Sync>>>>, child_killers: Arc<Mutex<HashMap<String, Box<dyn ChildKiller + Send + Sync>>>>,
watcher_tx: broadcast::Sender<WatcherEvent>, watcher_tx: broadcast::Sender<WatcherEvent>,
merge_jobs: Arc<Mutex<HashMap<String, MergeJob>>>,
inactivity_timeout_secs: u64, inactivity_timeout_secs: u64,
// Formatted `<recent-events>` block drained from the previous session's // Formatted `<recent-events>` block drained from the previous session's
// buffer. Prepended to the first agent turn so the agent sees what // buffer. Prepended to the first agent turn so the agent sees what
@@ -70,7 +68,6 @@ pub(super) async fn run_agent_spawn(
let log_writer_clone = log_writer; let log_writer_clone = log_writer;
let child_killers_clone = child_killers; let child_killers_clone = child_killers;
let watcher_tx_clone = watcher_tx; let watcher_tx_clone = watcher_tx;
let merge_jobs_clone = merge_jobs;
let _ = inactivity_timeout_secs; // currently unused inside the closure body let _ = inactivity_timeout_secs; // currently unused inside the closure body
// Step 1: create the worktree (slow — git checkout, pnpm install, etc.) // Step 1: create the worktree (slow — git checkout, pnpm install, etc.)
@@ -328,11 +325,10 @@ pub(super) async fn run_agent_spawn(
// Clear any stale Running merge job so the next mergemaster // Clear any stale Running merge job so the next mergemaster
// can call start_merge_agent_work without hitting "Merge // can call start_merge_agent_work without hitting "Merge
// already in progress" (bug 498). // already in progress" (bug 498).
if let Ok(mut jobs) = merge_jobs_clone.lock() if crate::crdt_state::read_merge_job(&sid)
&& let Some(job) = jobs.get(&sid) .is_some_and(|job| job.status == "running")
&& matches!(job.status, crate::agents::merge::MergeJobStatus::Running)
{ {
jobs.remove(&sid); crate::crdt_state::delete_merge_job(&sid);
} }
let _ = tx_done.send(AgentEvent::Done { let _ = tx_done.send(AgentEvent::Done {
story_id: sid.clone(), story_id: sid.clone(),