huskies: merge 954

This commit is contained in:
dave
2026-05-13 09:30:44 +00:00
parent c228ae1640
commit 765d54fc4b
8 changed files with 290 additions and 9 deletions
+5
View File
@@ -157,6 +157,11 @@ pub struct CompletionReport {
pub summary: String,
pub gates_passed: bool,
pub gate_output: String,
/// True when the coder exited with no commits but left uncommitted content in
/// the worktree. The pipeline advance will issue a commit-only recovery respawn
/// rather than a normal retry, and will NOT consume a `retry_count` slot.
#[serde(default)]
pub needs_commit_recovery: bool,
}
/// Token usage from a Claude Code session's `result` event.
@@ -528,6 +528,12 @@ mod tests {
"CONFLICT (content): server/src/lib.rs",
crate::db::ItemMeta::named("Conflict"),
);
// After master c228ae16, has_content_conflict_failure reads from
// {story_id}:gate_output (not the story description), so seed it there.
crate::db::write_content(
"9860_story_conflict:gate_output",
"CONFLICT (content): server/src/lib.rs",
);
let pool = AgentPool::new_test(3001);
pool.auto_assign_available_work(tmp.path()).await;
@@ -681,6 +687,12 @@ mod tests {
"CONFLICT (content): foo.rs",
crate::db::ItemMeta::named("Transient"),
);
// After master c228ae16, has_content_conflict_failure reads from
// {story_id}:gate_output (not the story description), so seed it there.
crate::db::write_content(
"920_story_transient:gate_output",
"CONFLICT (content): foo.rs",
);
// Simulate two previous transient exits (below cap of 3) recorded in DB.
crate::db::write_content("920_story_transient:mergemaster_spawn_count", "2");
+55 -1
View File
@@ -74,7 +74,58 @@ impl AgentPool {
// Supervisors and unknown agents do not advance the pipeline.
}
PipelineStage::Coder => {
if completion.gates_passed {
if completion.needs_commit_recovery {
// The coder exited with uncommitted content but no commits.
// Check if this is already a second recovery attempt (the
// first recovery respawn also produced no commits).
let recovery_key = format!("{story_id}:commit_recovery_pending");
if crate::db::read_content(&recovery_key).is_some() {
// Second attempt still produced no commits → block.
crate::db::delete_content(&recovery_key);
slog!(
"[pipeline] Coder '{agent_name}' (commit-recovery respawn) \
still produced no commits for '{story_id}'. Blocking story."
);
let reason = "agent declined to commit recoverable work".to_string();
if let Err(e) =
crate::agents::lifecycle::transition_to_blocked(story_id, &reason)
{
slog_error!("[pipeline] Failed to block '{story_id}': {e}");
}
let _ = self.watcher_tx.send(WatcherEvent::StoryBlocked {
story_id: story_id.to_string(),
reason,
});
} else {
// First occurrence: issue a commit-only recovery respawn.
// This does NOT consume a retry_count slot.
crate::db::write_content(&recovery_key, "1");
slog!(
"[pipeline] Coder '{agent_name}' exited with uncommitted work \
for '{story_id}'. Issuing commit-only recovery respawn."
);
let addendum = "\n\nYou have uncommitted work in this worktree. \
Your only task this session is run_tests → git_add → git_commit. \
Do not explore further.";
if let Err(e) = self
.start_agent(
&project_root,
story_id,
Some(agent_name),
Some(addendum),
previous_session_id,
)
.await
{
slog_error!(
"[pipeline] Failed to start commit-recovery respawn \
for '{story_id}': {e}"
);
}
}
} else if completion.gates_passed {
// Clear any stale recovery key when the coder succeeds normally.
crate::db::delete_content(&format!("{story_id}:commit_recovery_pending"));
// Determine effective QA mode for this story.
let qa_mode = {
let item_type = crate::agents::lifecycle::item_type_from_id(story_id);
@@ -130,6 +181,9 @@ impl AgentPool {
}
}
} else {
// Clear any stale recovery key when gates fail normally (agent committed
// but the build is broken — treat as a standard retry, not a recovery).
crate::db::delete_content(&format!("{story_id}:commit_recovery_pending"));
// Bug 645 / 668: Before retry/block, check if the agent left committed
// work AND the agent had a passing run_tests result captured during its
// session. An agent may crash mid-output (e.g. Claude Code CLI PTY write
@@ -27,6 +27,7 @@ async fn pipeline_advance_coder_gates_pass_server_qa_moves_to_merge() {
summary: "done".to_string(),
gates_passed: true,
gate_output: String::new(),
needs_commit_recovery: false,
},
Some(root.to_path_buf()),
None,
@@ -72,6 +73,7 @@ async fn pipeline_advance_coder_gates_pass_agent_qa_moves_to_qa() {
summary: "done".to_string(),
gates_passed: true,
gate_output: String::new(),
needs_commit_recovery: false,
},
Some(root.to_path_buf()),
None,
@@ -114,6 +116,7 @@ async fn pipeline_advance_qa_gates_pass_moves_story_to_merge() {
summary: "QA done".to_string(),
gates_passed: true,
gate_output: String::new(),
needs_commit_recovery: false,
},
Some(root.to_path_buf()),
None,
@@ -148,6 +151,7 @@ async fn pipeline_advance_supervisor_does_not_advance() {
summary: "supervised".to_string(),
gates_passed: true,
gate_output: String::new(),
needs_commit_recovery: false,
},
Some(root.to_path_buf()),
None,
@@ -216,6 +220,7 @@ stage = "qa"
summary: "done".to_string(),
gates_passed: true,
gate_output: String::new(),
needs_commit_recovery: false,
},
Some(root.to_path_buf()),
None,
@@ -65,6 +65,7 @@ async fn mergemaster_blocks_and_sends_story_blocked_when_no_commits_ahead() {
summary: "done".to_string(),
gates_passed: true,
gate_output: String::new(),
needs_commit_recovery: false,
},
Some(root.to_path_buf()),
None,
@@ -182,6 +183,7 @@ stage = "qa"
summary: "QA done".to_string(),
gates_passed: true,
gate_output: String::new(),
needs_commit_recovery: false,
},
Some(root.to_path_buf()),
None,
@@ -266,6 +268,7 @@ async fn stale_mergemaster_advance_for_done_story_is_noop() {
summary: "stale advance".to_string(),
gates_passed: true,
gate_output: String::new(),
needs_commit_recovery: false,
},
Some(root.to_path_buf()),
None,
@@ -406,6 +409,7 @@ async fn work_survived_advances_to_qa_instead_of_blocking() {
summary: "Agent crashed".to_string(),
gates_passed: false,
gate_output: "Worktree has uncommitted changes".to_string(),
needs_commit_recovery: false,
},
Some(root.to_path_buf()),
Some(wt_path),
@@ -505,6 +509,7 @@ async fn no_committed_work_still_retries_and_blocks() {
summary: "Agent crashed".to_string(),
gates_passed: false,
gate_output: "Tests failed".to_string(),
needs_commit_recovery: false,
},
Some(root.to_path_buf()),
Some(wt_path),
@@ -635,6 +640,7 @@ async fn gates_failed_no_test_evidence_does_not_advance() {
summary: "Gates failed".to_string(),
gates_passed: false,
gate_output: "Tests failed".to_string(),
needs_commit_recovery: false,
},
Some(root.to_path_buf()),
Some(wt_path),
@@ -758,6 +764,7 @@ async fn gates_failed_with_test_evidence_and_committed_work_advances() {
summary: "Agent crashed".to_string(),
gates_passed: false,
gate_output: "PTY write assertion failed".to_string(),
needs_commit_recovery: false,
},
Some(root.to_path_buf()),
Some(wt_path),
@@ -839,6 +846,7 @@ stage = "coder"
summary: "Tests failed".to_string(),
gates_passed: false,
gate_output: "error[E0308]: mismatched types\n --> src/lib.rs:5:10".to_string(),
needs_commit_recovery: false,
},
Some(root.to_path_buf()),
None,
@@ -873,6 +881,191 @@ stage = "coder"
);
}
// ── story 954: commit-only recovery respawn ───────────────────────────────
/// AC1+AC2: when a coder exits with `needs_commit_recovery=true` (uncommitted
/// work, zero commits), the pipeline issues a commit-only recovery respawn
/// WITHOUT consuming a retry_count slot.
#[tokio::test]
async fn commit_recovery_respawn_does_not_consume_retry_count() {
use std::fs;
let tmp = tempfile::tempdir().unwrap();
let root = tmp.path();
fs::create_dir_all(root.join(".huskies")).unwrap();
fs::write(
root.join(".huskies/project.toml"),
r#"
max_retries = 3
[[agent]]
name = "coder-1"
role = "Coder"
command = "echo"
args = ["noop"]
prompt = "test"
stage = "coder"
"#,
)
.unwrap();
crate::crdt_state::init_for_test();
crate::db::ensure_content_store();
crate::db::write_item_with_content(
"9954_story_recovery",
"2_current",
"---\nname: Recovery Test\n---\n",
crate::db::ItemMeta::named("Recovery Test"),
);
// Ensure no stale recovery key exists.
crate::db::delete_content("9954_story_recovery:commit_recovery_pending");
let pool = AgentPool::new_test(3001);
pool.run_pipeline_advance(
"9954_story_recovery",
"coder-1",
CompletionReport {
summary: "exited".to_string(),
gates_passed: false,
gate_output: "Worktree has uncommitted changes".to_string(),
needs_commit_recovery: true,
},
Some(root.to_path_buf()),
None,
false,
None,
)
.await;
// The recovery respawn must have been issued — coder-1 should be Pending/Running.
let agents = pool.agents.lock().unwrap();
let coder_restarted = agents.values().any(|a| {
a.agent_name == "coder-1" && matches!(a.status, AgentStatus::Pending | AgentStatus::Running)
});
assert!(
coder_restarted,
"Commit-recovery respawn must be issued when needs_commit_recovery=true. \
Pool: {:?}",
agents
.iter()
.map(|(k, a)| format!("{k}: {} ({})", a.agent_name, a.status))
.collect::<Vec<_>>()
);
drop(agents);
// retry_count must NOT have been incremented (AC 2).
let item = crate::crdt_state::read_item("9954_story_recovery").expect("story must be in CRDT");
assert_eq!(
item.retry_count(),
0,
"retry_count must NOT be incremented for a commit-recovery respawn (AC 2): got {}",
item.retry_count()
);
// The recovery key must be set so a second failure triggers a block.
assert!(
crate::db::read_content("9954_story_recovery:commit_recovery_pending").is_some(),
"commit_recovery_pending key must be set after issuing recovery respawn"
);
}
/// AC3: when the commit-recovery respawn also exits with `needs_commit_recovery=true`,
/// the story moves to `blocked` with reason "agent declined to commit recoverable work".
#[tokio::test]
async fn second_commit_recovery_failure_blocks_story() {
use std::fs;
let tmp = tempfile::tempdir().unwrap();
let root = tmp.path();
fs::create_dir_all(root.join(".huskies")).unwrap();
fs::write(
root.join(".huskies/project.toml"),
r#"
max_retries = 3
[[agent]]
name = "coder-1"
role = "Coder"
command = "echo"
args = ["noop"]
prompt = "test"
stage = "coder"
"#,
)
.unwrap();
crate::crdt_state::init_for_test();
crate::db::ensure_content_store();
crate::db::write_item_with_content(
"9955_story_recovery2",
"2_current",
"---\nname: Recovery2 Test\n---\n",
crate::db::ItemMeta::named("Recovery2 Test"),
);
// Simulate the recovery key already being set (first recovery respawn was
// issued previously).
crate::db::write_content("9955_story_recovery2:commit_recovery_pending", "1");
let pool = AgentPool::new_test(3001);
let mut rx = pool.watcher_tx.subscribe();
pool.run_pipeline_advance(
"9955_story_recovery2",
"coder-1",
CompletionReport {
summary: "exited again".to_string(),
gates_passed: false,
gate_output: "Worktree has uncommitted changes".to_string(),
needs_commit_recovery: true,
},
Some(root.to_path_buf()),
None,
false,
None,
)
.await;
// The story must be blocked (not retried again).
let mut got_blocked = false;
let mut block_reason = String::new();
while let Ok(evt) = rx.try_recv() {
if let WatcherEvent::StoryBlocked { story_id, reason } = evt
&& story_id == "9955_story_recovery2"
{
got_blocked = true;
block_reason = reason;
break;
}
}
assert!(
got_blocked,
"Story must be blocked when commit-recovery respawn also produces no commits (AC 3)"
);
assert_eq!(
block_reason, "agent declined to commit recoverable work",
"Block reason must match AC 3 spec"
);
// The recovery key must be cleared after blocking.
assert!(
crate::db::read_content("9955_story_recovery2:commit_recovery_pending").is_none(),
"commit_recovery_pending key must be cleared after blocking the story"
);
// retry_count must NOT have been incremented (AC 2: recovery never consumes a slot).
let item = crate::crdt_state::read_item("9955_story_recovery2").expect("story must be in CRDT");
assert_eq!(
item.retry_count(),
0,
"retry_count must NOT be incremented during commit-recovery path: got {}",
item.retry_count()
);
}
// ── bug 953: bug-645 path must not advance when feature branch has zero commits ──
/// Regression test for bug 953: when a coder agent exits with gates_passed=false
@@ -957,6 +1150,7 @@ async fn coder_completion_with_test_evidence_and_zero_commits_does_not_advance()
summary: "Agent crashed mid-output".to_string(),
gates_passed: false,
gate_output: "PTY write assertion failed".to_string(),
needs_commit_recovery: false,
},
Some(root.to_path_buf()),
Some(wt_path),
@@ -69,6 +69,7 @@ impl AgentPool {
summary: summary.to_string(),
gates_passed,
gate_output,
needs_commit_recovery: false,
};
// Extract data for pipeline advance, then remove the entry so
@@ -95,10 +95,13 @@ pub(in crate::agents::pool) async fn run_server_owned_completion(
}
}
// Run acceptance gates.
let (gates_passed, gate_output) = if let Some(wt_path) = worktree_path {
// Run acceptance gates. Third element of the tuple is `needs_commit_recovery`:
// true when the coder exited with uncommitted content but zero commits — the
// pipeline advance will issue a commit-only recovery respawn rather than a
// normal retry, and will NOT consume a `retry_count` slot.
let (gates_passed, gate_output, needs_commit_recovery) = if let Some(wt_path) = worktree_path {
let path = wt_path;
match tokio::task::spawn_blocking(move || {
match tokio::task::spawn_blocking(move || -> Result<(bool, String, bool), String> {
// If the worktree is dirty, check whether committed work survived.
// An agent crash (e.g. Claude Code CLI's `output.write(&bytes).is_ok()`
// assertion — bug 645) can leave uncommitted files behind even though
@@ -133,7 +136,10 @@ pub(in crate::agents::pool) async fn run_server_owned_completion(
})
.unwrap_or(false)
} else {
return Ok((false, dirty_msg));
// Dirty worktree AND no committed work: the coder exited
// without committing. Signal a commit-only recovery respawn
// rather than consuming a normal retry slot.
return Ok((false, dirty_msg, true));
}
} else {
false
@@ -152,9 +158,10 @@ pub(in crate::agents::pool) async fn run_server_owned_completion(
"Agent exited with no commits on the feature branch. \
The agent did not produce any code changes."
.to_string(),
false,
));
}
let result = crate::agents::gates::run_acceptance_gates(&path);
let (passed, output) = crate::agents::gates::run_acceptance_gates(&path)?;
// Restore stashed uncommitted changes.
if stashed {
let _ = std::process::Command::new("git")
@@ -162,18 +169,19 @@ pub(in crate::agents::pool) async fn run_server_owned_completion(
.current_dir(&path)
.output();
}
result
Ok((passed, output, false))
})
.await
{
Ok(Ok(result)) => result,
Ok(Err(e)) => (false, e),
Err(e) => (false, format!("Gate check task panicked: {e}")),
Ok(Err(e)) => (false, e, false),
Err(e) => (false, format!("Gate check task panicked: {e}"), false),
}
} else {
(
false,
"No worktree path available to run acceptance gates".to_string(),
false,
)
};
@@ -192,6 +200,7 @@ pub(in crate::agents::pool) async fn run_server_owned_completion(
summary: "Agent process exited normally".to_string(),
gates_passed,
gate_output,
needs_commit_recovery,
};
// Store completion report, extract data for pipeline advance, then
@@ -97,6 +97,7 @@ async fn server_owned_completion_skips_when_already_completed() {
summary: "Already done".to_string(),
gates_passed: true,
gate_output: String::new(),
needs_commit_recovery: false,
};
pool.inject_test_agent_with_completion(
"s10",