huskies: merge 773
This commit is contained in:
@@ -27,6 +27,28 @@ fn is_process_alive(_pid: u32) -> bool {
|
||||
true
|
||||
}
|
||||
|
||||
/// Encode a `pid` into the CRDT `error` field for a Running merge job.
|
||||
fn encode_pid(pid: u32) -> String {
|
||||
format!("{{\"pid\":{pid}}}")
|
||||
}
|
||||
|
||||
/// Decode a `pid` from the CRDT `error` field of a Running merge job.
|
||||
fn decode_pid(error: Option<&str>) -> u32 {
|
||||
error
|
||||
.and_then(|e| serde_json::from_str::<serde_json::Value>(e).ok())
|
||||
.and_then(|v| v["pid"].as_u64())
|
||||
.map(|p| p as u32)
|
||||
.unwrap_or(0)
|
||||
}
|
||||
|
||||
/// Current Unix timestamp in seconds as `f64`.
|
||||
fn unix_now() -> f64 {
|
||||
std::time::SystemTime::now()
|
||||
.duration_since(std::time::UNIX_EPOCH)
|
||||
.unwrap_or_default()
|
||||
.as_secs_f64()
|
||||
}
|
||||
|
||||
impl AgentPool {
|
||||
/// Start the merge pipeline as a background task.
|
||||
///
|
||||
@@ -45,60 +67,50 @@ impl AgentPool {
|
||||
// applying the double-start guard. This handles the case where the
|
||||
// server crashed mid-merge: the next attempt finds a Running entry
|
||||
// whose owning process is gone and clears it automatically.
|
||||
{
|
||||
let mut jobs = self.merge_jobs.lock().map_err(|e| e.to_string())?;
|
||||
let stale_ids: Vec<String> = jobs
|
||||
.iter()
|
||||
.filter_map(|(sid, job)| {
|
||||
if matches!(job.status, crate::agents::merge::MergeJobStatus::Running)
|
||||
&& !is_process_alive(job.pid)
|
||||
{
|
||||
Some(sid.clone())
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
for sid in stale_ids {
|
||||
let dead_pid = jobs[&sid].pid;
|
||||
jobs.remove(&sid);
|
||||
slog!("[merge] Cleared stale Running merge job for '{sid}' (dead pid {dead_pid})");
|
||||
}
|
||||
}
|
||||
|
||||
// Guard against double-starts; clear any completed/failed entry so the
|
||||
// caller can retry without needing to call a separate cleanup step.
|
||||
{
|
||||
let mut jobs = self.merge_jobs.lock().map_err(|e| e.to_string())?;
|
||||
if let Some(job) = jobs.get(story_id) {
|
||||
match &job.status {
|
||||
crate::agents::merge::MergeJobStatus::Running => {
|
||||
return Err(format!(
|
||||
"Merge already in progress for '{story_id}'. \
|
||||
Use get_merge_status to poll for completion."
|
||||
));
|
||||
}
|
||||
// Completed or Failed: clear stale entry so we can start fresh.
|
||||
_ => {
|
||||
jobs.remove(story_id);
|
||||
if let Some(jobs) = crate::crdt_state::read_all_merge_jobs() {
|
||||
for job in jobs {
|
||||
if job.status == "running" {
|
||||
let pid = decode_pid(job.error.as_deref());
|
||||
if pid > 0 && !is_process_alive(pid) {
|
||||
slog!(
|
||||
"[merge] Cleared stale Running merge job for '{}' (dead pid {})",
|
||||
job.story_id,
|
||||
pid
|
||||
);
|
||||
crate::crdt_state::delete_merge_job(&job.story_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Insert Running job.
|
||||
{
|
||||
let mut jobs = self.merge_jobs.lock().map_err(|e| e.to_string())?;
|
||||
jobs.insert(
|
||||
story_id.to_string(),
|
||||
crate::agents::merge::MergeJob {
|
||||
story_id: story_id.to_string(),
|
||||
status: crate::agents::merge::MergeJobStatus::Running,
|
||||
pid: std::process::id(),
|
||||
},
|
||||
);
|
||||
// Guard against double-starts; clear any completed/failed entry so the
|
||||
// caller can retry without needing to call a separate cleanup step.
|
||||
if let Some(job) = crate::crdt_state::read_merge_job(story_id) {
|
||||
match job.status.as_str() {
|
||||
"running" => {
|
||||
return Err(format!(
|
||||
"Merge already in progress for '{story_id}'. \
|
||||
Use get_merge_status to poll for completion."
|
||||
));
|
||||
}
|
||||
// Completed or Failed: clear stale entry so we can start fresh.
|
||||
_ => {
|
||||
crate::crdt_state::delete_merge_job(story_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Insert Running job into CRDT.
|
||||
let started_at = unix_now();
|
||||
let pid = std::process::id();
|
||||
crate::crdt_state::write_merge_job(
|
||||
story_id,
|
||||
"running",
|
||||
started_at,
|
||||
None,
|
||||
Some(&encode_pid(pid)),
|
||||
);
|
||||
|
||||
let pool = Arc::clone(self);
|
||||
let root = project_root.to_path_buf();
|
||||
let sid = story_id.to_string();
|
||||
@@ -107,6 +119,8 @@ impl AgentPool {
|
||||
let report = pool.run_merge_pipeline(&root, &sid).await;
|
||||
let success = matches!(&report, Ok(r) if r.success);
|
||||
|
||||
let finished_at = unix_now();
|
||||
|
||||
// On any failure: record merge_failure in CRDT and emit notification.
|
||||
if !success {
|
||||
let reason = match &report {
|
||||
@@ -154,15 +168,29 @@ impl AgentPool {
|
||||
}
|
||||
}
|
||||
|
||||
let status = match report {
|
||||
Ok(r) => crate::agents::merge::MergeJobStatus::Completed(r),
|
||||
Err(e) => crate::agents::merge::MergeJobStatus::Failed(e),
|
||||
};
|
||||
if let Ok(mut jobs) = pool.merge_jobs.lock()
|
||||
&& let Some(job) = jobs.get_mut(&sid)
|
||||
{
|
||||
job.status = status;
|
||||
// Update CRDT with terminal status.
|
||||
match &report {
|
||||
Ok(r) => {
|
||||
let report_json = serde_json::to_string(r).unwrap_or_else(|_| String::new());
|
||||
crate::crdt_state::write_merge_job(
|
||||
&sid,
|
||||
"completed",
|
||||
started_at,
|
||||
Some(finished_at),
|
||||
Some(&report_json),
|
||||
);
|
||||
}
|
||||
Err(e) => {
|
||||
crate::crdt_state::write_merge_job(
|
||||
&sid,
|
||||
"failed",
|
||||
started_at,
|
||||
Some(finished_at),
|
||||
Some(e),
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
if !success {
|
||||
pool.auto_assign_available_work(&root).await;
|
||||
}
|
||||
@@ -233,11 +261,46 @@ impl AgentPool {
|
||||
}
|
||||
|
||||
/// Check the status of a background merge job.
|
||||
///
|
||||
/// Reads from the CRDT `merge_jobs` collection and reconstructs the full
|
||||
/// [`MergeJob`] struct. The CRDT `error` field encodes the `pid` for
|
||||
/// Running jobs (as `{"pid":N}`) and the serialised [`MergeReport`] for
|
||||
/// Completed jobs.
|
||||
pub fn get_merge_status(&self, story_id: &str) -> Option<crate::agents::merge::MergeJob> {
|
||||
self.merge_jobs
|
||||
.lock()
|
||||
.ok()
|
||||
.and_then(|jobs| jobs.get(story_id).cloned())
|
||||
let view = crate::crdt_state::read_merge_job(story_id)?;
|
||||
let (status, pid) = match view.status.as_str() {
|
||||
"running" => {
|
||||
let pid = decode_pid(view.error.as_deref());
|
||||
(crate::agents::merge::MergeJobStatus::Running, pid)
|
||||
}
|
||||
"completed" => {
|
||||
let report = view
|
||||
.error
|
||||
.as_deref()
|
||||
.and_then(|e| serde_json::from_str::<crate::agents::merge::MergeReport>(e).ok())
|
||||
.unwrap_or_else(|| crate::agents::merge::MergeReport {
|
||||
story_id: story_id.to_string(),
|
||||
success: false,
|
||||
had_conflicts: false,
|
||||
conflicts_resolved: false,
|
||||
conflict_details: None,
|
||||
gates_passed: false,
|
||||
gate_output: String::new(),
|
||||
worktree_cleaned_up: false,
|
||||
story_archived: false,
|
||||
});
|
||||
(crate::agents::merge::MergeJobStatus::Completed(report), 0)
|
||||
}
|
||||
_ => {
|
||||
let err = view.error.unwrap_or_else(|| "Unknown error".to_string());
|
||||
(crate::agents::merge::MergeJobStatus::Failed(err), 0)
|
||||
}
|
||||
};
|
||||
Some(crate::agents::merge::MergeJob {
|
||||
story_id: story_id.to_string(),
|
||||
status,
|
||||
pid,
|
||||
})
|
||||
}
|
||||
|
||||
/// Trigger a deterministic server-side merge for `story_id` without spawning
|
||||
@@ -253,7 +316,6 @@ impl AgentPool {
|
||||
port: self.port,
|
||||
child_killers: Arc::clone(&self.child_killers),
|
||||
watcher_tx: self.watcher_tx.clone(),
|
||||
merge_jobs: Arc::clone(&self.merge_jobs),
|
||||
status_broadcaster: Arc::clone(&self.status_broadcaster),
|
||||
});
|
||||
if let Err(e) = pool.start_merge_agent_work(project_root, story_id) {
|
||||
@@ -345,27 +407,25 @@ mod tests {
|
||||
async fn stale_running_merge_job_is_cleared_and_retry_succeeds() {
|
||||
use tempfile::tempdir;
|
||||
|
||||
crate::crdt_state::init_for_test();
|
||||
|
||||
let tmp = tempdir().unwrap();
|
||||
let repo = tmp.path();
|
||||
init_git_repo(repo);
|
||||
|
||||
let pool = Arc::new(AgentPool::new_test(3001));
|
||||
|
||||
// Inject a stale Running entry, simulating a mergemaster that died
|
||||
// before the merge pipeline completed. Use the current process PID so
|
||||
// the stale-lock sweep (which checks whether the PID is alive) does NOT
|
||||
// auto-remove it — this test verifies the double-start guard path.
|
||||
{
|
||||
let mut jobs = pool.merge_jobs.lock().unwrap();
|
||||
jobs.insert(
|
||||
"77_story_stale".to_string(),
|
||||
MergeJob {
|
||||
story_id: "77_story_stale".to_string(),
|
||||
status: MergeJobStatus::Running,
|
||||
pid: std::process::id(),
|
||||
},
|
||||
);
|
||||
}
|
||||
// Inject a stale Running entry via CRDT, simulating a mergemaster that
|
||||
// died before the merge pipeline completed. Use the current process PID
|
||||
// so the stale-lock sweep does NOT auto-remove it — this test verifies
|
||||
// the double-start guard path.
|
||||
crate::crdt_state::write_merge_job(
|
||||
"77_story_stale",
|
||||
"running",
|
||||
1.0,
|
||||
None,
|
||||
Some(&encode_pid(std::process::id())),
|
||||
);
|
||||
|
||||
// With a stale Running entry, start_merge_agent_work must be blocked.
|
||||
let blocked = pool.start_merge_agent_work(repo, "77_story_stale");
|
||||
@@ -380,14 +440,7 @@ mod tests {
|
||||
);
|
||||
|
||||
// Simulate the mergemaster exit path: clear the stale Running entry.
|
||||
{
|
||||
let mut jobs = pool.merge_jobs.lock().unwrap();
|
||||
if let Some(job) = jobs.get("77_story_stale")
|
||||
&& matches!(job.status, MergeJobStatus::Running)
|
||||
{
|
||||
jobs.remove("77_story_stale");
|
||||
}
|
||||
}
|
||||
crate::crdt_state::delete_merge_job("77_story_stale");
|
||||
|
||||
// After clearing, start_merge_agent_work must succeed (it will fail
|
||||
// the pipeline because there's no feature branch, but it must not be
|
||||
@@ -410,6 +463,8 @@ mod tests {
|
||||
async fn stale_merge_job_with_dead_pid_is_swept_on_new_merge_attempt() {
|
||||
use tempfile::tempdir;
|
||||
|
||||
crate::crdt_state::init_for_test();
|
||||
|
||||
let tmp = tempdir().unwrap();
|
||||
let repo = tmp.path();
|
||||
init_git_repo(repo);
|
||||
@@ -425,25 +480,18 @@ mod tests {
|
||||
pid
|
||||
};
|
||||
|
||||
// Seed merge_jobs with a Running entry whose PID is dead.
|
||||
{
|
||||
let mut jobs = pool.merge_jobs.lock().unwrap();
|
||||
jobs.insert(
|
||||
"719_stale_other".to_string(),
|
||||
MergeJob {
|
||||
story_id: "719_stale_other".to_string(),
|
||||
status: MergeJobStatus::Running,
|
||||
pid: dead_pid,
|
||||
},
|
||||
);
|
||||
}
|
||||
// Seed CRDT merge_jobs with a Running entry whose PID is dead.
|
||||
crate::crdt_state::write_merge_job(
|
||||
"719_stale_other",
|
||||
"running",
|
||||
1.0,
|
||||
None,
|
||||
Some(&encode_pid(dead_pid)),
|
||||
);
|
||||
|
||||
// Verify the entry is present before the sweep.
|
||||
assert!(
|
||||
pool.merge_jobs
|
||||
.lock()
|
||||
.unwrap()
|
||||
.contains_key("719_stale_other"),
|
||||
crate::crdt_state::read_merge_job("719_stale_other").is_some(),
|
||||
"stale entry should exist before new merge attempt"
|
||||
);
|
||||
|
||||
@@ -453,11 +501,7 @@ mod tests {
|
||||
|
||||
// The stale entry must have been cleared.
|
||||
assert!(
|
||||
!pool
|
||||
.merge_jobs
|
||||
.lock()
|
||||
.unwrap()
|
||||
.contains_key("719_stale_other"),
|
||||
crate::crdt_state::read_merge_job("719_stale_other").is_none(),
|
||||
"stale entry with dead pid must be removed when a new merge attempt starts"
|
||||
);
|
||||
}
|
||||
@@ -485,6 +529,7 @@ mod tests {
|
||||
async fn merge_agent_work_returns_error_when_branch_not_found() {
|
||||
use tempfile::tempdir;
|
||||
|
||||
crate::crdt_state::init_for_test();
|
||||
let tmp = tempdir().unwrap();
|
||||
let repo = tmp.path();
|
||||
init_git_repo(repo);
|
||||
@@ -509,6 +554,7 @@ mod tests {
|
||||
use std::fs;
|
||||
use tempfile::tempdir;
|
||||
|
||||
crate::crdt_state::init_for_test();
|
||||
let tmp = tempdir().unwrap();
|
||||
let repo = tmp.path();
|
||||
init_git_repo(repo);
|
||||
@@ -686,6 +732,7 @@ mod tests {
|
||||
use std::fs;
|
||||
use tempfile::tempdir;
|
||||
|
||||
crate::crdt_state::init_for_test();
|
||||
let tmp = tempdir().unwrap();
|
||||
let repo = tmp.path();
|
||||
init_git_repo(repo);
|
||||
@@ -802,6 +849,7 @@ mod tests {
|
||||
use std::fs;
|
||||
use tempfile::tempdir;
|
||||
|
||||
crate::crdt_state::init_for_test();
|
||||
let tmp = tempdir().unwrap();
|
||||
let repo = tmp.path();
|
||||
init_git_repo(repo);
|
||||
@@ -884,6 +932,7 @@ mod tests {
|
||||
use std::fs;
|
||||
use tempfile::tempdir;
|
||||
|
||||
crate::crdt_state::init_for_test();
|
||||
let tmp = tempdir().unwrap();
|
||||
let repo = tmp.path();
|
||||
init_git_repo(repo);
|
||||
@@ -993,6 +1042,7 @@ mod tests {
|
||||
use std::fs;
|
||||
use tempfile::tempdir;
|
||||
|
||||
crate::crdt_state::init_for_test();
|
||||
let tmp = tempdir().unwrap();
|
||||
let repo = tmp.path();
|
||||
init_git_repo(repo);
|
||||
@@ -1111,6 +1161,7 @@ mod tests {
|
||||
use std::os::unix::fs::PermissionsExt;
|
||||
use tempfile::tempdir;
|
||||
|
||||
crate::crdt_state::init_for_test();
|
||||
let tmp = tempdir().unwrap();
|
||||
let repo = tmp.path();
|
||||
init_git_repo(repo);
|
||||
@@ -1226,6 +1277,7 @@ mod tests {
|
||||
use std::fs;
|
||||
use tempfile::tempdir;
|
||||
|
||||
crate::crdt_state::init_for_test();
|
||||
let tmp = tempdir().unwrap();
|
||||
let repo = tmp.path();
|
||||
init_git_repo(repo);
|
||||
|
||||
Reference in New Issue
Block a user