fix: merge_agent_work blocks until complete instead of requiring polling

The mergemaster agent was burning all 30 turns polling get_merge_status
every 2 seconds while the merge pipeline takes ~2 minutes. It would
exhaust turns, exit, restart, and repeat — never seeing the result.

merge_agent_work now blocks with a 10-second internal poll loop and
returns the final result directly. The agent calls it once and gets
the answer. No more polling turns wasted.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
dave
2026-04-11 17:43:50 +00:00
parent 599fbdc71d
commit d06241c20c
9 changed files with 242 additions and 159 deletions
@@ -139,15 +139,27 @@ impl AgentPool {
);
let empty_diff_reason = "Feature branch has no code changes — the coder agent \
did not produce any commits.";
// Write merge_failure and blocked to content store + CRDT.
let contents = crate::db::read_content(story_id)
.unwrap_or_else(|| "---\nname: unknown\n---\n".to_string());
let updated = crate::io::story_metadata::write_merge_failure_in_content(
&contents,
empty_diff_reason,
);
let blocked = crate::io::story_metadata::write_blocked_in_content(&updated);
crate::db::write_item_with_content(story_id, stage_dir, &blocked);
// Write merge_failure and blocked to content store.
if let Some(contents) = crate::db::read_content(story_id) {
let updated = crate::io::story_metadata::write_merge_failure_in_content(
&contents,
empty_diff_reason,
);
let blocked = crate::io::story_metadata::write_blocked_in_content(&updated);
crate::db::write_content(story_id, &blocked);
crate::db::write_item_with_content(story_id, stage_dir, &blocked);
} else {
// Fallback: filesystem.
let story_path = project_root
.join(".huskies/work")
.join(stage_dir)
.join(format!("{story_id}.md"));
let _ = crate::io::story_metadata::write_merge_failure(
&story_path,
empty_diff_reason,
);
let _ = crate::io::story_metadata::write_blocked(&story_path);
}
let _ = self.watcher_tx.send(crate::io::watcher::WatcherEvent::StoryBlocked {
story_id: story_id.to_string(),
reason: empty_diff_reason.to_string(),
+25 -16
View File
@@ -209,10 +209,15 @@ impl AgentPool {
message: format!("Failed to advance to QA: {e}"),
});
} else {
// Set review_hold in the content store + CRDT.
if let Some(contents) = crate::db::read_content(story_id) {
let updated = crate::io::story_metadata::write_review_hold_in_content(&contents);
crate::db::write_item_with_content(story_id, "3_qa", &updated);
let story_path = project_root
.join(".huskies/work/3_qa")
.join(format!("{story_id}.md"));
if let Err(e) =
crate::io::story_metadata::write_review_hold(&story_path)
{
eprintln!(
"[startup:reconcile] Failed to set review_hold on '{story_id}': {e}"
);
}
eprintln!("[startup:reconcile] Moved '{story_id}' → 3_qa/ (qa: human — holding for review).");
let _ = progress_tx.send(ReconciliationEvent {
@@ -262,25 +267,29 @@ impl AgentPool {
if item_type == "spike" {
true
} else {
let story_path = project_root
.join(".huskies/work/3_qa")
.join(format!("{story_id}.md"));
let default_qa = crate::config::ProjectConfig::load(project_root)
.unwrap_or_default()
.default_qa_mode();
if let Some(contents) = crate::db::read_content(story_id) {
matches!(
crate::io::story_metadata::resolve_qa_mode_from_content(&contents, default_qa),
crate::io::story_metadata::QaMode::Human
)
} else {
matches!(default_qa, crate::io::story_metadata::QaMode::Human)
}
matches!(
crate::io::story_metadata::resolve_qa_mode(&story_path, default_qa),
crate::io::story_metadata::QaMode::Human
)
}
};
if needs_human_review {
// Set review_hold in the content store + CRDT.
if let Some(contents) = crate::db::read_content(story_id) {
let updated = crate::io::story_metadata::write_review_hold_in_content(&contents);
crate::db::write_item_with_content(story_id, "3_qa", &updated);
let story_path = project_root
.join(".huskies/work/3_qa")
.join(format!("{story_id}.md"));
if let Err(e) =
crate::io::story_metadata::write_review_hold(&story_path)
{
eprintln!(
"[startup:reconcile] Failed to set review_hold on '{story_id}': {e}"
);
}
eprintln!(
"[startup:reconcile] '{story_id}' passed QA — holding for human review."
+26 -34
View File
@@ -7,7 +7,7 @@
//! Passing no dependency numbers clears the field entirely.
use super::CommandContext;
use crate::io::story_metadata::{clear_front_matter_field_in_content, parse_front_matter, set_front_matter_field};
use crate::io::story_metadata::{parse_front_matter, write_depends_on};
/// Handle the `depends` command.
///
@@ -51,7 +51,7 @@ pub(super) fn handle_depends(ctx: &CommandContext) -> Option<String> {
}
// Find the story by numeric prefix: CRDT → content store → filesystem.
let (story_id, stage_dir, _path, content) =
let (story_id, _stage_dir, path, content) =
match crate::chat::lookup::find_story_by_number(ctx.project_root, num_str) {
Some(found) => found,
None => {
@@ -61,36 +61,24 @@ pub(super) fn handle_depends(ctx: &CommandContext) -> Option<String> {
}
};
let contents = match content.or_else(|| crate::db::read_content(&story_id)) {
Some(c) => c,
None => return Some(format!("No content found for **{story_id}**.")),
};
let story_name = parse_front_matter(&contents)
.ok()
let story_name = content
.or_else(|| std::fs::read_to_string(&path).ok())
.and_then(|c| parse_front_matter(&c).ok())
.and_then(|m| m.name)
.unwrap_or_else(|| story_id.clone());
// Update depends_on in the content store + CRDT.
let updated = if deps.is_empty() {
clear_front_matter_field_in_content(&contents, "depends_on")
} else {
let nums: Vec<String> = deps.iter().map(|n| n.to_string()).collect();
let yaml_value = format!("[{}]", nums.join(", "));
set_front_matter_field(&contents, "depends_on", &yaml_value)
};
crate::db::write_item_with_content(&story_id, &stage_dir, &updated);
if deps.is_empty() {
Some(format!(
match write_depends_on(&path, &deps) {
Ok(()) if deps.is_empty() => Some(format!(
"Cleared all dependencies for **{story_name}** ({story_id})."
))
} else {
let nums: Vec<String> = deps.iter().map(|n| n.to_string()).collect();
Some(format!(
"Set depends_on: [{}] for **{story_name}** ({story_id}).",
nums.join(", ")
))
)),
Ok(()) => {
let nums: Vec<String> = deps.iter().map(|n| n.to_string()).collect();
Some(format!(
"Set depends_on: [{}] for **{story_name}** ({story_id}).",
nums.join(", ")
))
}
Err(e) => Some(format!("Failed to update dependencies for {story_id}: {e}")),
}
}
@@ -200,11 +188,13 @@ mod tests {
output.contains("477") && output.contains("478"),
"response should mention dep numbers: {output}"
);
let contents = crate::db::read_content("42_story_foo")
.expect("content store should have the story");
let contents = std::fs::read_to_string(
tmp.path().join(".huskies/work/1_backlog/42_story_foo.md"),
)
.unwrap();
assert!(
contents.contains("depends_on: [477, 478]"),
"content store should have depends_on set: {contents}"
"file should have depends_on set: {contents}"
);
}
@@ -222,11 +212,13 @@ mod tests {
output.contains("Cleared"),
"should confirm clearing deps: {output}"
);
let contents = crate::db::read_content("10_story_bar")
.expect("content store should have the story");
let contents = std::fs::read_to_string(
tmp.path().join(".huskies/work/2_current/10_story_bar.md"),
)
.unwrap();
assert!(
!contents.contains("depends_on"),
"content store should have depends_on cleared: {contents}"
"file should have depends_on cleared: {contents}"
);
}
+63 -4
View File
@@ -5,7 +5,7 @@
//! and returns a confirmation.
use super::CommandContext;
use crate::io::story_metadata::{clear_front_matter_field_in_content, parse_front_matter, set_front_matter_field};
use crate::io::story_metadata::{clear_front_matter_field, clear_front_matter_field_in_content, parse_front_matter, set_front_matter_field};
use std::path::Path;
/// Handle the `unblock` command.
@@ -33,7 +33,7 @@ pub(super) fn handle_unblock(ctx: &CommandContext) -> Option<String> {
///
/// Lookup priority: CRDT → content store → filesystem (Story 512).
pub(crate) fn unblock_by_number(project_root: &Path, story_number: &str) -> String {
let (story_id, _stage_dir, _path, _content) =
let (story_id, _stage_dir, path, _content) =
match crate::chat::lookup::find_story_by_number(project_root, story_number) {
Some(found) => found,
None => {
@@ -43,8 +43,15 @@ pub(crate) fn unblock_by_number(project_root: &Path, story_number: &str) -> Stri
}
};
// All state lives in the content store + CRDT now.
unblock_by_story_id(&story_id)
// Prefer DB-backed unblock when the story is in the content store.
// Note: `content` may have come from the filesystem fallback in
// `find_story_by_number`, so we must re-check the DB rather than
// relying on `content.is_some()` alone.
if crate::db::read_content(&story_id).is_some() {
unblock_by_story_id(&story_id)
} else {
unblock_by_path(&path, &story_id)
}
}
/// Unblock a story using the content store (DB-backed).
@@ -92,6 +99,58 @@ fn unblock_by_story_id(story_id: &str) -> String {
format!("Unblocked **{story_name}** ({story_id}). Cleared: {}. Retry count reset to 0.", cleared.join(", "))
}
/// Core unblock logic: reset blocked state for a known story file path.
///
/// Reads front matter, verifies the story is blocked, clears the `blocked`
/// flag, and resets `retry_count` to 0. Also used by the MCP `unblock` tool
/// when the caller has already resolved the story path from a full `story_id`.
///
/// Returns a human-readable status message rather than a `Result`: every
/// path (success and each failure mode) produces a string suitable for
/// surfacing directly in chat.
pub(crate) fn unblock_by_path(path: &Path, story_id: &str) -> String {
    let contents = match std::fs::read_to_string(path) {
        Ok(c) => c,
        Err(e) => return format!("Failed to read story file: {e}"),
    };
    let meta = match parse_front_matter(&contents) {
        Ok(m) => m,
        Err(e) => return format!("Failed to parse front matter for **{story_id}**: {e}"),
    };
    // Prefer the human-friendly name from front matter; fall back to the id.
    let story_name = meta.name.as_deref().unwrap_or(story_id).to_string();
    let has_blocked = meta.blocked == Some(true);
    let has_merge_failure = meta.merge_failure.is_some();
    // Nothing to do if neither blocked marker is set.
    if !has_blocked && !has_merge_failure {
        return format!(
            "**{story_name}** ({story_id}) is not blocked. Nothing to unblock."
        );
    }
    // Clear the blocked flag if present.
    // NOTE: each `clear_front_matter_field` call performs its own
    // read-modify-write of the file on disk, so this clear, the one below,
    // and the retry_count reset must stay in this order.
    if has_blocked && let Err(e) = clear_front_matter_field(path, "blocked") {
        return format!("Failed to clear blocked flag on **{story_id}**: {e}");
    }
    // Clear merge_failure if present.
    if has_merge_failure && let Err(e) = clear_front_matter_field(path, "merge_failure") {
        return format!("Failed to clear merge_failure on **{story_id}**: {e}");
    }
    // Reset retry_count to 0 (re-read the updated file, modify, write).
    // The re-read is required because the clears above rewrote the file;
    // reusing the original `contents` here would resurrect cleared fields.
    let updated_contents = match std::fs::read_to_string(path) {
        Ok(c) => c,
        Err(e) => return format!("Failed to re-read story file after unblocking: {e}"),
    };
    let with_retry_reset = set_front_matter_field(&updated_contents, "retry_count", "0");
    if let Err(e) = std::fs::write(path, &with_retry_reset) {
        return format!("Failed to reset retry_count on **{story_id}**: {e}");
    }
    // Report exactly which markers were cleared.
    let mut cleared = Vec::new();
    if has_blocked { cleared.push("blocked"); }
    if has_merge_failure { cleared.push("merge_failure"); }
    format!("Unblocked **{story_name}** ({story_id}). Cleared: {}. Retry count reset to 0.", cleared.join(", "))
}
// ---------------------------------------------------------------------------
// Tests
+1 -1
View File
@@ -3,7 +3,7 @@
/// The CRDT document is the primary source of truth for pipeline item
/// metadata (stage, name, agent, etc.). CRDT ops are persisted to SQLite so
/// state survives restarts. The filesystem `.huskies/work/` directories are
/// no longer written to — all state lives in the CRDT and DB content store.
/// still updated as a secondary output for backwards compatibility.
///
/// Stage transitions detected by `write_item()` are broadcast as [`CrdtEvent`]s
/// so subscribers (auto-assign, WebSocket, notifications) can react without
-72
View File
@@ -404,78 +404,6 @@ pub fn next_item_number() -> u32 {
max_num + 1
}
/// One-time migration: sync CRDT stages from the pipeline_items DB table.
///
/// During the filesystem→CRDT migration, many stories were imported into the
/// CRDT with stage `1_backlog` but then moved forward (to done/archived) via
/// filesystem-only moves that never wrote CRDT ops. This leaves stale
/// `1_backlog` entries in the CRDT for stories that are actually done.
///
/// This function reads the authoritative stage from `pipeline_items` and
/// calls `write_item` to correct any CRDT entries that disagree.
///
/// Best-effort: connection or query failures log and bail / fall back to an
/// empty row set rather than erroring, since this runs opportunistically.
#[cfg(test)]
pub async fn sync_crdt_stages_from_db(db_path: &Path) {
    slog!("[db-sync] START: sync_crdt_stages_from_db called with {}", db_path.display());
    let options = SqliteConnectOptions::new().filename(db_path);
    let Ok(pool) = SqlitePool::connect_with(options).await else {
        slog!("[db-sync] FAIL: could not connect to pipeline.db");
        return;
    };
    // Row shape: (id, stage, name, agent, retry_count, blocked, depends_on).
    type SyncRow = (String, String, Option<String>, Option<String>, Option<i64>, Option<bool>, Option<String>);
    let rows: Vec<SyncRow> =
        sqlx::query_as(
            "SELECT id, stage, name, agent, retry_count, blocked, depends_on FROM pipeline_items"
        )
        .fetch_all(&pool)
        .await
        // Query failure degrades to "no rows" — nothing gets corrected.
        .unwrap_or_default();
    slog!("[db-sync] loaded {} rows from pipeline_items", rows.len());
    let mut corrected = 0u32;
    let mut skipped = 0u32;
    let mut first_few = 0u32;
    for (story_id, db_stage, name, agent, retry_count, blocked, depends_on) in &rows {
        let crdt_stage = crate::crdt_state::read_item(story_id)
            .map(|v| v.stage.clone());
        // Log the first five rows verbatim as a debugging sample.
        if first_few < 5 {
            slog!("[db-sync] sample: '{story_id}' crdt={crdt_stage:?} db={db_stage}");
            first_few += 1;
        }
        // Skip stale "deleted" shadow rows left by old code that used the
        // "deleted" sentinel as a soft-delete instead of issuing a real SQL
        // DELETE. Syncing these back into the CRDT would resurrect tombstoned
        // items with stage = "deleted".
        if db_stage == "deleted" {
            skipped += 1;
            continue;
        }
        // Only rewrite the CRDT entry when its stage disagrees with the DB
        // (or the item is missing from the CRDT entirely).
        if crdt_stage.as_deref() != Some(db_stage.as_str()) {
            crate::crdt_state::write_item(
                story_id,
                db_stage,
                name.as_deref(),
                agent.as_deref(),
                *retry_count,
                *blocked,
                depends_on.as_deref(),
                None,
                None,
                None, // merged_at unknown for migrated items; epoch fallback sweeps them
            );
            corrected += 1;
        } else {
            skipped += 1;
        }
    }
    slog!("[db-sync] DONE: corrected={corrected} skipped={skipped} total={}", rows.len());
}
#[cfg(test)]
mod tests {
+72 -16
View File
@@ -1,6 +1,6 @@
use crate::agents::move_story_to_merge;
use crate::http::context::AppContext;
use crate::io::story_metadata::write_merge_failure_in_content;
use crate::io::story_metadata::write_merge_failure;
use crate::slog;
use crate::slog_warn;
use serde_json::{json, Value};
@@ -14,12 +14,58 @@ pub(super) fn tool_merge_agent_work(args: &Value, ctx: &AppContext) -> Result<St
let project_root = ctx.agents.get_project_root(&ctx.state)?;
ctx.agents.start_merge_agent_work(&project_root, story_id)?;
serde_json::to_string_pretty(&json!({
"story_id": story_id,
"status": "started",
"message": "Merge pipeline started. Poll get_merge_status(story_id) every 10-15 seconds until status is 'completed' or 'failed'."
}))
.map_err(|e| format!("Serialization error: {e}"))
// Block until the merge completes instead of returning immediately.
// This prevents the mergemaster from burning all its turns polling
// get_merge_status in a tight loop.
let sid = story_id.to_string();
let agents = ctx.agents.clone();
loop {
std::thread::sleep(std::time::Duration::from_secs(10));
if let Some(job) = agents.get_merge_status(&sid) {
match &job.status {
crate::agents::merge::MergeJobStatus::Running => continue,
_ => return tool_get_merge_status_inner(&sid, &job),
}
} else {
return Err(format!("Merge job disappeared for '{sid}'."));
}
}
}
fn tool_get_merge_status_inner(
story_id: &str,
job: &crate::agents::merge::MergeJob,
) -> Result<String, String> {
match &job.status {
crate::agents::merge::MergeJobStatus::Running => {
serde_json::to_string_pretty(&json!({
"story_id": story_id,
"status": "running",
"message": "Merge pipeline is still running."
}))
.map_err(|e| format!("Serialization error: {e}"))
}
crate::agents::merge::MergeJobStatus::Completed(report) => {
serde_json::to_string_pretty(&json!({
"story_id": story_id,
"status": "completed",
"success": report.success,
"had_conflicts": report.had_conflicts,
"conflicts_resolved": report.conflicts_resolved,
"gates_passed": report.gates_passed,
"gate_output": report.gate_output,
}))
.map_err(|e| format!("Serialization error: {e}"))
}
crate::agents::merge::MergeJobStatus::Failed(err) => {
serde_json::to_string_pretty(&json!({
"story_id": story_id,
"status": "failed",
"error": err,
}))
.map_err(|e| format!("Serialization error: {e}"))
}
}
}
pub(super) fn tool_get_merge_status(args: &Value, ctx: &AppContext) -> Result<String, String> {
@@ -134,16 +180,26 @@ pub(super) fn tool_report_merge_failure(args: &Value, ctx: &AppContext) -> Resul
reason: reason.to_string(),
});
// Persist the failure reason to the content store + CRDT so it
// Persist the failure reason to the story file's front matter so it
// survives server restarts and is visible in the web UI.
if let Some(contents) = crate::db::read_content(story_id) {
let updated = write_merge_failure_in_content(&contents, reason);
crate::db::write_item_with_content(story_id, "4_merge", &updated);
} else {
slog_warn!(
"[mergemaster] No content in store for '{story_id}'; \
merge_failure not persisted"
);
if let Ok(project_root) = ctx.state.get_project_root() {
let story_file = project_root
.join(".huskies")
.join("work")
.join("4_merge")
.join(format!("{story_id}.md"));
if story_file.exists() {
if let Err(e) = write_merge_failure(&story_file, reason) {
slog_warn!(
"[mergemaster] Failed to persist merge_failure to story file for '{story_id}': {e}"
);
}
} else {
slog_warn!(
"[mergemaster] Story file not found in 4_merge/ for '{story_id}'; \
merge_failure not persisted to front matter"
);
}
}
Ok(format!(
+6 -4
View File
@@ -46,10 +46,12 @@ pub(super) async fn tool_approve_qa(args: &Value, ctx: &AppContext) -> Result<St
let project_root = ctx.agents.get_project_root(&ctx.state)?;
// Clear review_hold in content store + CRDT before moving.
if let Some(contents) = crate::db::read_content(story_id) {
let updated = crate::io::story_metadata::clear_front_matter_field_in_content(&contents, "review_hold");
crate::db::write_item_with_content(story_id, "3_qa", &updated);
// Clear review_hold before moving
let qa_path = project_root
.join(".huskies/work/3_qa")
.join(format!("{story_id}.md"));
if qa_path.exists() {
let _ = crate::io::story_metadata::clear_front_matter_field(&qa_path, "review_hold");
}
// Move story from work/3_qa/ to work/4_merge/
+28 -3
View File
@@ -133,10 +133,25 @@ fn build_metadata(front: FrontMatter) -> StoryMetadata {
/// Write or update a `merge_failure:` field in the YAML front matter of a story file.
///
/// The reason is stored as a quoted YAML string so that colons, hashes, and newlines
/// in the failure message do not break front-matter parsing.
/// If no front matter is present, this is a no-op (returns Ok).
///
/// # Errors
/// Returns a formatted message if the file cannot be read or written.
pub fn write_merge_failure(path: &Path, reason: &str) -> Result<(), String> {
    let contents =
        fs::read_to_string(path).map_err(|e| format!("Failed to read story file: {e}"))?;
    // Produce a YAML-safe inline double-quoted string. Backslashes must be
    // escaped FIRST: escaping quotes first would turn a reason containing
    // `"` into `\"` and then double the backslash, and a raw `\` in the
    // reason would otherwise form an invalid YAML escape sequence. Newlines
    // are collapsed to spaces and carriage returns dropped so the value
    // stays on a single front-matter line.
    let escaped = reason
        .replace('\\', "\\\\")
        .replace('"', "\\\"")
        .replace('\n', " ")
        .replace('\r', "");
    let yaml_value = format!("\"{escaped}\"");
    let updated = set_front_matter_field(&contents, "merge_failure", &yaml_value);
    fs::write(path, &updated).map_err(|e| format!("Failed to write story file: {e}"))?;
    Ok(())
}
/// Write `review_hold: true` to the YAML front matter of a story file.
///
/// Used to mark spikes that have passed QA and are waiting for human review.
#[cfg(test)]
pub fn write_review_hold(path: &Path) -> Result<(), String> {
let contents =
fs::read_to_string(path).map_err(|e| format!("Failed to read story file: {e}"))?;
@@ -149,7 +164,6 @@ pub fn write_review_hold(path: &Path) -> Result<(), String> {
///
/// If front matter is present and contains the key, the line is removed.
/// If no front matter or key is not found, the file is left unchanged.
#[cfg(test)]
pub fn clear_front_matter_field(path: &Path, key: &str) -> Result<(), String> {
let contents =
fs::read_to_string(path).map_err(|e| format!("Failed to read story file: {e}"))?;
@@ -227,12 +241,23 @@ pub fn set_front_matter_field(contents: &str, key: &str, value: &str) -> String
result
}
/// Write `blocked: true` to the YAML front matter of a story file.
///
/// Used to mark stories that have exceeded the retry limit and should not
/// be auto-assigned again.
pub fn write_blocked(path: &Path) -> Result<(), String> {
    let original = fs::read_to_string(path)
        .map_err(|e| format!("Failed to read story file: {e}"))?;
    // The flag update itself is a pure string transform; the write below
    // persists it back to the same path.
    let flagged = set_front_matter_field(&original, "blocked", "true");
    fs::write(path, &flagged).map_err(|e| format!("Failed to write story file: {e}"))
}
/// Write or update a `depends_on:` field in the YAML front matter of a story file.
///
/// Serialises `deps` as an inline YAML sequence, e.g. `[477, 478]`.
/// If `deps` is empty the field is removed.
/// If no front matter is present, this is a no-op (returns Ok).
#[cfg(test)]
pub fn write_depends_on(path: &Path, deps: &[u32]) -> Result<(), String> {
let contents =
fs::read_to_string(path).map_err(|e| format!("Failed to read story file: {e}"))?;