huskies: merge 645_bug_agent_runtime_panics_with_output_write_bytes_is_ok_assertion_marking_stories_falsely_blocked

This commit is contained in:
dave
2026-04-26 10:50:40 +00:00
parent d8f9be5b23
commit f88bb5f486
5 changed files with 526 additions and 2 deletions
+285 -1
View File
@@ -115,8 +115,78 @@ impl AgentPool {
}
}
} else {
// Bug 645: Before retry/block, check if the agent left committed
// work that compiles. An agent may crash mid-output (e.g. Claude
// Code CLI PTY write assertion) after having already committed valid
// code. When committed work survives and `cargo check` passes,
// advance to QA instead of wasting retries.
let work_survived = worktree_path.as_ref().is_some_and(|wt_path| {
crate::agents::gates::worktree_has_committed_work(wt_path)
&& crate::agents::gates::cargo_check_in_worktree(wt_path)
});
if work_survived {
slog!(
"[pipeline] Coder '{agent_name}' failed gates for '{story_id}' but \
committed work survives and compiles. Advancing to QA instead of \
retrying (bug 645)."
);
let qa_mode = {
let item_type = crate::agents::lifecycle::item_type_from_id(story_id);
if item_type == "spike" {
crate::io::story_metadata::QaMode::Human
} else {
let default_qa = config.default_qa_mode();
resolve_qa_mode_from_store(&project_root, story_id, default_qa)
}
};
match qa_mode {
crate::io::story_metadata::QaMode::Server => {
if let Err(e) = crate::agents::lifecycle::move_story_to_merge(
&project_root,
story_id,
) {
slog_error!(
"[pipeline] Failed to move '{story_id}' to 4_merge/: {e}"
);
} else {
self.start_mergemaster_or_block(&project_root, story_id)
.await;
}
}
crate::io::story_metadata::QaMode::Agent => {
if let Err(e) = crate::agents::lifecycle::move_story_to_qa(
&project_root,
story_id,
) {
slog_error!(
"[pipeline] Failed to move '{story_id}' to 3_qa/: {e}"
);
} else if let Err(e) = self
.start_agent(&project_root, story_id, Some("qa"), None, None)
.await
{
slog_error!(
"[pipeline] Failed to start qa for '{story_id}': {e}"
);
}
}
crate::io::story_metadata::QaMode::Human => {
if let Err(e) = crate::agents::lifecycle::move_story_to_qa(
&project_root,
story_id,
) {
slog_error!(
"[pipeline] Failed to move '{story_id}' to 3_qa/: {e}"
);
} else {
write_review_hold_to_store(story_id);
}
}
}
} else
// Increment retry count and check if blocked.
if let Some(reason) = should_block_story(story_id, config.max_retries, "coder")
if let Some(reason) =
should_block_story(story_id, config.max_retries, "coder")
{
// Story has exceeded retry limit — do not restart.
let _ = self.watcher_tx.send(WatcherEvent::StoryBlocked {
@@ -1062,4 +1132,218 @@ stage = "qa"
);
}
}
// ── bug 645: work-survived check advances to QA instead of blocking ──
/// Regression test for bug 645: a coder that fails its gates *after* having
/// committed compiling work must be neither blocked nor charged a retry.
///
/// The test builds a throwaway git repository holding a minimal Cargo
/// project, adds a worktree on a feature branch carrying one extra commit,
/// and then reports a failed coder run against that worktree — mimicking an
/// agent that committed its work and died mid-output.
#[tokio::test]
async fn work_survived_advances_to_qa_instead_of_blocking() {
    use std::fs;

    /// Runs `git <args>` inside `dir`. Panics if the process cannot be run;
    /// the exit status itself is not inspected.
    fn git(dir: &std::path::Path, args: &[&str]) {
        use std::process::Command;
        Command::new("git")
            .args(args)
            .current_dir(dir)
            .output()
            .unwrap();
    }

    const STORY_ID: &str = "9945_story_survived";
    const STORY_BODY: &str = "---\nname: Survived Test\n---\n";

    let tmp = tempfile::tempdir().unwrap();
    let root = tmp.path();

    // Base repository: git identity first, then a minimal Cargo library
    // committed as the initial commit.
    git(root, &["init"]);
    git(root, &["config", "user.email", "test@test.com"]);
    git(root, &["config", "user.name", "Test"]);
    fs::write(
        root.join("Cargo.toml"),
        "[package]\nname = \"test_proj\"\nversion = \"0.1.0\"\nedition = \"2021\"\n",
    )
    .unwrap();
    fs::create_dir_all(root.join("src")).unwrap();
    fs::write(root.join("src/lib.rs"), "// empty\n").unwrap();
    git(root, &["add", "."]);
    git(root, &["commit", "-m", "init"]);

    // Feature-branch worktree that lives next to the repository contents.
    let wt_path = tmp.path().join("wt");
    let wt_arg = wt_path.to_string_lossy();
    git(
        root,
        &[
            "worktree",
            "add",
            &*wt_arg,
            "-b",
            "feature/story-9945_story_survived",
        ],
    );

    // The "surviving" work: one valid commit on the feature branch.
    fs::write(wt_path.join("src/lib.rs"), "pub fn survived() {}\n").unwrap();
    git(&wt_path, &["add", "."]);
    git(&wt_path, &["commit", "-m", "add survived fn"]);

    // Register the story in the content store at the `2_current` stage.
    crate::db::ensure_content_store();
    crate::db::write_content(STORY_ID, STORY_BODY);
    crate::db::write_item_with_content(STORY_ID, "2_current", STORY_BODY);

    let pool = AgentPool::new_test(3001);

    // Report a gate failure for the coder, as if the agent had crashed and
    // left the worktree dirty.
    let crash_report = CompletionReport {
        summary: "Agent crashed".to_string(),
        gates_passed: false,
        gate_output: "Worktree has uncommitted changes".to_string(),
    };
    pool.run_pipeline_advance(
        STORY_ID,
        "coder-1",
        crash_report,
        Some(root.to_path_buf()),
        Some(wt_path),
        false,
        None,
    )
    .await;

    // The work-survived path is expected to hand the story on (QA, or merge
    // under server QA mode) instead of touching the retry/block bookkeeping,
    // so neither marker may appear in the stored content.
    let content =
        crate::db::read_content(STORY_ID).expect("story should exist in content store");
    assert!(
        !content.contains("blocked"),
        "story should NOT be blocked when committed work survives: {content}"
    );
    assert!(
        !content.contains("retry_count"),
        "story should NOT have retry_count when work survived: {content}"
    );
}
/// Backwards-compatibility test: a coder that dies WITHOUT leaving any
/// committed work on its feature branch keeps getting the pre-existing
/// retry/block treatment.
///
/// With `max_retries = 1` in the project config, a single failed run is
/// expected to emit a `WatcherEvent::StoryBlocked` for the story.
#[tokio::test]
async fn no_committed_work_still_retries_and_blocks() {
    use std::fs;

    /// Runs `git <args>` inside `dir`. Panics if the process cannot be run;
    /// the exit status itself is not inspected.
    fn git(dir: &std::path::Path, args: &[&str]) {
        use std::process::Command;
        Command::new("git")
            .args(args)
            .current_dir(dir)
            .output()
            .unwrap();
    }

    const STORY_ID: &str = "9946_story_nowork";
    const STORY_BODY: &str = "---\nname: No Work Test\n---\n";

    let tmp = tempfile::tempdir().unwrap();
    let root = tmp.path();

    // Bare-bones repository with a single empty commit. No Cargo project is
    // created here, so there is nothing for a compile check to accept.
    git(root, &["init"]);
    git(root, &["config", "user.email", "test@test.com"]);
    git(root, &["config", "user.name", "Test"]);
    git(root, &["commit", "--allow-empty", "-m", "init"]);

    // Worktree on a fresh feature branch that receives no commits of its own.
    let wt_path = root.join("wt");
    let wt_arg = wt_path.to_string_lossy();
    git(
        root,
        &[
            "worktree",
            "add",
            &*wt_arg,
            "-b",
            "feature/story-9946_story_nowork",
        ],
    );

    // Register the story in the content store at the `2_current` stage.
    crate::db::ensure_content_store();
    crate::db::write_content(STORY_ID, STORY_BODY);
    crate::db::write_item_with_content(STORY_ID, "2_current", STORY_BODY);

    // Project config with max_retries = 1 so the first failure already
    // exhausts the retry budget.
    let config_dir = root.join(".huskies");
    fs::create_dir_all(&config_dir).unwrap();
    fs::write(
        config_dir.join("project.toml"),
        "max_retries = 1\n\n[[agent]]\nname = \"coder-1\"\nstage = \"coder\"\n",
    )
    .unwrap();

    let pool = AgentPool::new_test(3001);
    // Subscribe before advancing so events sent during the call are observed.
    let mut rx = pool.watcher_tx.subscribe();

    // Report a gate failure for a coder whose worktree holds no committed work.
    let crash_report = CompletionReport {
        summary: "Agent crashed".to_string(),
        gates_passed: false,
        gate_output: "Tests failed".to_string(),
    };
    pool.run_pipeline_advance(
        STORY_ID,
        "coder-1",
        crash_report,
        Some(root.to_path_buf()),
        Some(wt_path),
        false,
        None,
    )
    .await;

    // Drain the events that are already queued: stop at the first receive
    // error, or as soon as a StoryBlocked event for this story shows up.
    let got_blocked = std::iter::from_fn(|| rx.try_recv().ok()).any(|evt| {
        matches!(
            &evt,
            WatcherEvent::StoryBlocked { story_id, .. } if story_id == "9946_story_nowork"
        )
    });
    assert!(
        got_blocked,
        "Story with no committed work should be blocked after exceeding retry limit"
    );
}
}