huskies: merge 954
This commit is contained in:
@@ -74,7 +74,58 @@ impl AgentPool {
|
||||
// Supervisors and unknown agents do not advance the pipeline.
|
||||
}
|
||||
PipelineStage::Coder => {
|
||||
if completion.gates_passed {
|
||||
if completion.needs_commit_recovery {
|
||||
// The coder exited with uncommitted content but no commits.
|
||||
// Check if this is already a second recovery attempt (the
|
||||
// first recovery respawn also produced no commits).
|
||||
let recovery_key = format!("{story_id}:commit_recovery_pending");
|
||||
if crate::db::read_content(&recovery_key).is_some() {
|
||||
// Second attempt still produced no commits → block.
|
||||
crate::db::delete_content(&recovery_key);
|
||||
slog!(
|
||||
"[pipeline] Coder '{agent_name}' (commit-recovery respawn) \
|
||||
still produced no commits for '{story_id}'. Blocking story."
|
||||
);
|
||||
let reason = "agent declined to commit recoverable work".to_string();
|
||||
if let Err(e) =
|
||||
crate::agents::lifecycle::transition_to_blocked(story_id, &reason)
|
||||
{
|
||||
slog_error!("[pipeline] Failed to block '{story_id}': {e}");
|
||||
}
|
||||
let _ = self.watcher_tx.send(WatcherEvent::StoryBlocked {
|
||||
story_id: story_id.to_string(),
|
||||
reason,
|
||||
});
|
||||
} else {
|
||||
// First occurrence: issue a commit-only recovery respawn.
|
||||
// This does NOT consume a retry_count slot.
|
||||
crate::db::write_content(&recovery_key, "1");
|
||||
slog!(
|
||||
"[pipeline] Coder '{agent_name}' exited with uncommitted work \
|
||||
for '{story_id}'. Issuing commit-only recovery respawn."
|
||||
);
|
||||
let addendum = "\n\nYou have uncommitted work in this worktree. \
|
||||
Your only task this session is run_tests → git_add → git_commit. \
|
||||
Do not explore further.";
|
||||
if let Err(e) = self
|
||||
.start_agent(
|
||||
&project_root,
|
||||
story_id,
|
||||
Some(agent_name),
|
||||
Some(addendum),
|
||||
previous_session_id,
|
||||
)
|
||||
.await
|
||||
{
|
||||
slog_error!(
|
||||
"[pipeline] Failed to start commit-recovery respawn \
|
||||
for '{story_id}': {e}"
|
||||
);
|
||||
}
|
||||
}
|
||||
} else if completion.gates_passed {
|
||||
// Clear any stale recovery key when the coder succeeds normally.
|
||||
crate::db::delete_content(&format!("{story_id}:commit_recovery_pending"));
|
||||
// Determine effective QA mode for this story.
|
||||
let qa_mode = {
|
||||
let item_type = crate::agents::lifecycle::item_type_from_id(story_id);
|
||||
@@ -130,6 +181,9 @@ impl AgentPool {
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Clear any stale recovery key when gates fail normally (agent committed
|
||||
// but the build is broken — treat as a standard retry, not a recovery).
|
||||
crate::db::delete_content(&format!("{story_id}:commit_recovery_pending"));
|
||||
// Bug 645 / 668: Before retry/block, check if the agent left committed
|
||||
// work AND the agent had a passing run_tests result captured during its
|
||||
// session. An agent may crash mid-output (e.g. Claude Code CLI PTY write
|
||||
|
||||
@@ -27,6 +27,7 @@ async fn pipeline_advance_coder_gates_pass_server_qa_moves_to_merge() {
|
||||
summary: "done".to_string(),
|
||||
gates_passed: true,
|
||||
gate_output: String::new(),
|
||||
needs_commit_recovery: false,
|
||||
},
|
||||
Some(root.to_path_buf()),
|
||||
None,
|
||||
@@ -72,6 +73,7 @@ async fn pipeline_advance_coder_gates_pass_agent_qa_moves_to_qa() {
|
||||
summary: "done".to_string(),
|
||||
gates_passed: true,
|
||||
gate_output: String::new(),
|
||||
needs_commit_recovery: false,
|
||||
},
|
||||
Some(root.to_path_buf()),
|
||||
None,
|
||||
@@ -114,6 +116,7 @@ async fn pipeline_advance_qa_gates_pass_moves_story_to_merge() {
|
||||
summary: "QA done".to_string(),
|
||||
gates_passed: true,
|
||||
gate_output: String::new(),
|
||||
needs_commit_recovery: false,
|
||||
},
|
||||
Some(root.to_path_buf()),
|
||||
None,
|
||||
@@ -148,6 +151,7 @@ async fn pipeline_advance_supervisor_does_not_advance() {
|
||||
summary: "supervised".to_string(),
|
||||
gates_passed: true,
|
||||
gate_output: String::new(),
|
||||
needs_commit_recovery: false,
|
||||
},
|
||||
Some(root.to_path_buf()),
|
||||
None,
|
||||
@@ -216,6 +220,7 @@ stage = "qa"
|
||||
summary: "done".to_string(),
|
||||
gates_passed: true,
|
||||
gate_output: String::new(),
|
||||
needs_commit_recovery: false,
|
||||
},
|
||||
Some(root.to_path_buf()),
|
||||
None,
|
||||
|
||||
@@ -65,6 +65,7 @@ async fn mergemaster_blocks_and_sends_story_blocked_when_no_commits_ahead() {
|
||||
summary: "done".to_string(),
|
||||
gates_passed: true,
|
||||
gate_output: String::new(),
|
||||
needs_commit_recovery: false,
|
||||
},
|
||||
Some(root.to_path_buf()),
|
||||
None,
|
||||
@@ -182,6 +183,7 @@ stage = "qa"
|
||||
summary: "QA done".to_string(),
|
||||
gates_passed: true,
|
||||
gate_output: String::new(),
|
||||
needs_commit_recovery: false,
|
||||
},
|
||||
Some(root.to_path_buf()),
|
||||
None,
|
||||
@@ -266,6 +268,7 @@ async fn stale_mergemaster_advance_for_done_story_is_noop() {
|
||||
summary: "stale advance".to_string(),
|
||||
gates_passed: true,
|
||||
gate_output: String::new(),
|
||||
needs_commit_recovery: false,
|
||||
},
|
||||
Some(root.to_path_buf()),
|
||||
None,
|
||||
@@ -406,6 +409,7 @@ async fn work_survived_advances_to_qa_instead_of_blocking() {
|
||||
summary: "Agent crashed".to_string(),
|
||||
gates_passed: false,
|
||||
gate_output: "Worktree has uncommitted changes".to_string(),
|
||||
needs_commit_recovery: false,
|
||||
},
|
||||
Some(root.to_path_buf()),
|
||||
Some(wt_path),
|
||||
@@ -505,6 +509,7 @@ async fn no_committed_work_still_retries_and_blocks() {
|
||||
summary: "Agent crashed".to_string(),
|
||||
gates_passed: false,
|
||||
gate_output: "Tests failed".to_string(),
|
||||
needs_commit_recovery: false,
|
||||
},
|
||||
Some(root.to_path_buf()),
|
||||
Some(wt_path),
|
||||
@@ -635,6 +640,7 @@ async fn gates_failed_no_test_evidence_does_not_advance() {
|
||||
summary: "Gates failed".to_string(),
|
||||
gates_passed: false,
|
||||
gate_output: "Tests failed".to_string(),
|
||||
needs_commit_recovery: false,
|
||||
},
|
||||
Some(root.to_path_buf()),
|
||||
Some(wt_path),
|
||||
@@ -758,6 +764,7 @@ async fn gates_failed_with_test_evidence_and_committed_work_advances() {
|
||||
summary: "Agent crashed".to_string(),
|
||||
gates_passed: false,
|
||||
gate_output: "PTY write assertion failed".to_string(),
|
||||
needs_commit_recovery: false,
|
||||
},
|
||||
Some(root.to_path_buf()),
|
||||
Some(wt_path),
|
||||
@@ -839,6 +846,7 @@ stage = "coder"
|
||||
summary: "Tests failed".to_string(),
|
||||
gates_passed: false,
|
||||
gate_output: "error[E0308]: mismatched types\n --> src/lib.rs:5:10".to_string(),
|
||||
needs_commit_recovery: false,
|
||||
},
|
||||
Some(root.to_path_buf()),
|
||||
None,
|
||||
@@ -873,6 +881,191 @@ stage = "coder"
|
||||
);
|
||||
}
|
||||
|
||||
// ── story 954: commit-only recovery respawn ───────────────────────────────
|
||||
|
||||
/// AC1+AC2: when a coder exits with `needs_commit_recovery=true` (uncommitted
|
||||
/// work, zero commits), the pipeline issues a commit-only recovery respawn
|
||||
/// WITHOUT consuming a retry_count slot.
|
||||
#[tokio::test]
|
||||
async fn commit_recovery_respawn_does_not_consume_retry_count() {
|
||||
use std::fs;
|
||||
|
||||
let tmp = tempfile::tempdir().unwrap();
|
||||
let root = tmp.path();
|
||||
|
||||
fs::create_dir_all(root.join(".huskies")).unwrap();
|
||||
fs::write(
|
||||
root.join(".huskies/project.toml"),
|
||||
r#"
|
||||
max_retries = 3
|
||||
|
||||
[[agent]]
|
||||
name = "coder-1"
|
||||
role = "Coder"
|
||||
command = "echo"
|
||||
args = ["noop"]
|
||||
prompt = "test"
|
||||
stage = "coder"
|
||||
"#,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
crate::crdt_state::init_for_test();
|
||||
crate::db::ensure_content_store();
|
||||
crate::db::write_item_with_content(
|
||||
"9954_story_recovery",
|
||||
"2_current",
|
||||
"---\nname: Recovery Test\n---\n",
|
||||
crate::db::ItemMeta::named("Recovery Test"),
|
||||
);
|
||||
// Ensure no stale recovery key exists.
|
||||
crate::db::delete_content("9954_story_recovery:commit_recovery_pending");
|
||||
|
||||
let pool = AgentPool::new_test(3001);
|
||||
|
||||
pool.run_pipeline_advance(
|
||||
"9954_story_recovery",
|
||||
"coder-1",
|
||||
CompletionReport {
|
||||
summary: "exited".to_string(),
|
||||
gates_passed: false,
|
||||
gate_output: "Worktree has uncommitted changes".to_string(),
|
||||
needs_commit_recovery: true,
|
||||
},
|
||||
Some(root.to_path_buf()),
|
||||
None,
|
||||
false,
|
||||
None,
|
||||
)
|
||||
.await;
|
||||
|
||||
// The recovery respawn must have been issued — coder-1 should be Pending/Running.
|
||||
let agents = pool.agents.lock().unwrap();
|
||||
let coder_restarted = agents.values().any(|a| {
|
||||
a.agent_name == "coder-1" && matches!(a.status, AgentStatus::Pending | AgentStatus::Running)
|
||||
});
|
||||
assert!(
|
||||
coder_restarted,
|
||||
"Commit-recovery respawn must be issued when needs_commit_recovery=true. \
|
||||
Pool: {:?}",
|
||||
agents
|
||||
.iter()
|
||||
.map(|(k, a)| format!("{k}: {} ({})", a.agent_name, a.status))
|
||||
.collect::<Vec<_>>()
|
||||
);
|
||||
drop(agents);
|
||||
|
||||
// retry_count must NOT have been incremented (AC 2).
|
||||
let item = crate::crdt_state::read_item("9954_story_recovery").expect("story must be in CRDT");
|
||||
assert_eq!(
|
||||
item.retry_count(),
|
||||
0,
|
||||
"retry_count must NOT be incremented for a commit-recovery respawn (AC 2): got {}",
|
||||
item.retry_count()
|
||||
);
|
||||
|
||||
// The recovery key must be set so a second failure triggers a block.
|
||||
assert!(
|
||||
crate::db::read_content("9954_story_recovery:commit_recovery_pending").is_some(),
|
||||
"commit_recovery_pending key must be set after issuing recovery respawn"
|
||||
);
|
||||
}
|
||||
|
||||
/// AC3: when the commit-recovery respawn also exits with `needs_commit_recovery=true`,
|
||||
/// the story moves to `blocked` with reason "agent declined to commit recoverable work".
|
||||
#[tokio::test]
|
||||
async fn second_commit_recovery_failure_blocks_story() {
|
||||
use std::fs;
|
||||
|
||||
let tmp = tempfile::tempdir().unwrap();
|
||||
let root = tmp.path();
|
||||
|
||||
fs::create_dir_all(root.join(".huskies")).unwrap();
|
||||
fs::write(
|
||||
root.join(".huskies/project.toml"),
|
||||
r#"
|
||||
max_retries = 3
|
||||
|
||||
[[agent]]
|
||||
name = "coder-1"
|
||||
role = "Coder"
|
||||
command = "echo"
|
||||
args = ["noop"]
|
||||
prompt = "test"
|
||||
stage = "coder"
|
||||
"#,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
crate::crdt_state::init_for_test();
|
||||
crate::db::ensure_content_store();
|
||||
crate::db::write_item_with_content(
|
||||
"9955_story_recovery2",
|
||||
"2_current",
|
||||
"---\nname: Recovery2 Test\n---\n",
|
||||
crate::db::ItemMeta::named("Recovery2 Test"),
|
||||
);
|
||||
|
||||
// Simulate the recovery key already being set (first recovery respawn was
|
||||
// issued previously).
|
||||
crate::db::write_content("9955_story_recovery2:commit_recovery_pending", "1");
|
||||
|
||||
let pool = AgentPool::new_test(3001);
|
||||
let mut rx = pool.watcher_tx.subscribe();
|
||||
|
||||
pool.run_pipeline_advance(
|
||||
"9955_story_recovery2",
|
||||
"coder-1",
|
||||
CompletionReport {
|
||||
summary: "exited again".to_string(),
|
||||
gates_passed: false,
|
||||
gate_output: "Worktree has uncommitted changes".to_string(),
|
||||
needs_commit_recovery: true,
|
||||
},
|
||||
Some(root.to_path_buf()),
|
||||
None,
|
||||
false,
|
||||
None,
|
||||
)
|
||||
.await;
|
||||
|
||||
// The story must be blocked (not retried again).
|
||||
let mut got_blocked = false;
|
||||
let mut block_reason = String::new();
|
||||
while let Ok(evt) = rx.try_recv() {
|
||||
if let WatcherEvent::StoryBlocked { story_id, reason } = evt
|
||||
&& story_id == "9955_story_recovery2"
|
||||
{
|
||||
got_blocked = true;
|
||||
block_reason = reason;
|
||||
break;
|
||||
}
|
||||
}
|
||||
assert!(
|
||||
got_blocked,
|
||||
"Story must be blocked when commit-recovery respawn also produces no commits (AC 3)"
|
||||
);
|
||||
assert_eq!(
|
||||
block_reason, "agent declined to commit recoverable work",
|
||||
"Block reason must match AC 3 spec"
|
||||
);
|
||||
|
||||
// The recovery key must be cleared after blocking.
|
||||
assert!(
|
||||
crate::db::read_content("9955_story_recovery2:commit_recovery_pending").is_none(),
|
||||
"commit_recovery_pending key must be cleared after blocking the story"
|
||||
);
|
||||
|
||||
// retry_count must NOT have been incremented (AC 2: recovery never consumes a slot).
|
||||
let item = crate::crdt_state::read_item("9955_story_recovery2").expect("story must be in CRDT");
|
||||
assert_eq!(
|
||||
item.retry_count(),
|
||||
0,
|
||||
"retry_count must NOT be incremented during commit-recovery path: got {}",
|
||||
item.retry_count()
|
||||
);
|
||||
}
|
||||
|
||||
// ── bug 953: bug-645 path must not advance when feature branch has zero commits ──
|
||||
|
||||
/// Regression test for bug 953: when a coder agent exits with gates_passed=false
|
||||
@@ -957,6 +1150,7 @@ async fn coder_completion_with_test_evidence_and_zero_commits_does_not_advance()
|
||||
summary: "Agent crashed mid-output".to_string(),
|
||||
gates_passed: false,
|
||||
gate_output: "PTY write assertion failed".to_string(),
|
||||
needs_commit_recovery: false,
|
||||
},
|
||||
Some(root.to_path_buf()),
|
||||
Some(wt_path),
|
||||
|
||||
@@ -69,6 +69,7 @@ impl AgentPool {
|
||||
summary: summary.to_string(),
|
||||
gates_passed,
|
||||
gate_output,
|
||||
needs_commit_recovery: false,
|
||||
};
|
||||
|
||||
// Extract data for pipeline advance, then remove the entry so
|
||||
|
||||
@@ -95,10 +95,13 @@ pub(in crate::agents::pool) async fn run_server_owned_completion(
|
||||
}
|
||||
}
|
||||
|
||||
// Run acceptance gates.
|
||||
let (gates_passed, gate_output) = if let Some(wt_path) = worktree_path {
|
||||
// Run acceptance gates. Third element of the tuple is `needs_commit_recovery`:
|
||||
// true when the coder exited with uncommitted content but zero commits — the
|
||||
// pipeline advance will issue a commit-only recovery respawn rather than a
|
||||
// normal retry, and will NOT consume a `retry_count` slot.
|
||||
let (gates_passed, gate_output, needs_commit_recovery) = if let Some(wt_path) = worktree_path {
|
||||
let path = wt_path;
|
||||
match tokio::task::spawn_blocking(move || {
|
||||
match tokio::task::spawn_blocking(move || -> Result<(bool, String, bool), String> {
|
||||
// If the worktree is dirty, check whether committed work survived.
|
||||
// An agent crash (e.g. Claude Code CLI's `output.write(&bytes).is_ok()`
|
||||
// assertion — bug 645) can leave uncommitted files behind even though
|
||||
@@ -133,7 +136,10 @@ pub(in crate::agents::pool) async fn run_server_owned_completion(
|
||||
})
|
||||
.unwrap_or(false)
|
||||
} else {
|
||||
return Ok((false, dirty_msg));
|
||||
// Dirty worktree AND no committed work: the coder exited
|
||||
// without committing. Signal a commit-only recovery respawn
|
||||
// rather than consuming a normal retry slot.
|
||||
return Ok((false, dirty_msg, true));
|
||||
}
|
||||
} else {
|
||||
false
|
||||
@@ -152,9 +158,10 @@ pub(in crate::agents::pool) async fn run_server_owned_completion(
|
||||
"Agent exited with no commits on the feature branch. \
|
||||
The agent did not produce any code changes."
|
||||
.to_string(),
|
||||
false,
|
||||
));
|
||||
}
|
||||
let result = crate::agents::gates::run_acceptance_gates(&path);
|
||||
let (passed, output) = crate::agents::gates::run_acceptance_gates(&path)?;
|
||||
// Restore stashed uncommitted changes.
|
||||
if stashed {
|
||||
let _ = std::process::Command::new("git")
|
||||
@@ -162,18 +169,19 @@ pub(in crate::agents::pool) async fn run_server_owned_completion(
|
||||
.current_dir(&path)
|
||||
.output();
|
||||
}
|
||||
result
|
||||
Ok((passed, output, false))
|
||||
})
|
||||
.await
|
||||
{
|
||||
Ok(Ok(result)) => result,
|
||||
Ok(Err(e)) => (false, e),
|
||||
Err(e) => (false, format!("Gate check task panicked: {e}")),
|
||||
Ok(Err(e)) => (false, e, false),
|
||||
Err(e) => (false, format!("Gate check task panicked: {e}"), false),
|
||||
}
|
||||
} else {
|
||||
(
|
||||
false,
|
||||
"No worktree path available to run acceptance gates".to_string(),
|
||||
false,
|
||||
)
|
||||
};
|
||||
|
||||
@@ -192,6 +200,7 @@ pub(in crate::agents::pool) async fn run_server_owned_completion(
|
||||
summary: "Agent process exited normally".to_string(),
|
||||
gates_passed,
|
||||
gate_output,
|
||||
needs_commit_recovery,
|
||||
};
|
||||
|
||||
// Store completion report, extract data for pipeline advance, then
|
||||
|
||||
@@ -97,6 +97,7 @@ async fn server_owned_completion_skips_when_already_completed() {
|
||||
summary: "Already done".to_string(),
|
||||
gates_passed: true,
|
||||
gate_output: String::new(),
|
||||
needs_commit_recovery: false,
|
||||
};
|
||||
pool.inject_test_agent_with_completion(
|
||||
"s10",
|
||||
|
||||
Reference in New Issue
Block a user