huskies: merge 645_bug_agent_runtime_panics_with_output_write_bytes_is_ok_assertion_marking_stories_falsely_blocked

This commit is contained in:
dave
2026-04-26 10:50:40 +00:00
parent d8f9be5b23
commit f88bb5f486
5 changed files with 526 additions and 2 deletions
+285 -1
View File
@@ -115,8 +115,78 @@ impl AgentPool {
}
}
} else {
// Bug 645: Before retry/block, check if the agent left committed
// work that compiles. An agent may crash mid-output (e.g. Claude
// Code CLI PTY write assertion) after having already committed valid
// code. When committed work survives and `cargo check` passes,
// advance to QA instead of wasting retries.
let work_survived = worktree_path.as_ref().is_some_and(|wt_path| {
crate::agents::gates::worktree_has_committed_work(wt_path)
&& crate::agents::gates::cargo_check_in_worktree(wt_path)
});
if work_survived {
slog!(
"[pipeline] Coder '{agent_name}' failed gates for '{story_id}' but \
committed work survives and compiles. Advancing to QA instead of \
retrying (bug 645)."
);
let qa_mode = {
let item_type = crate::agents::lifecycle::item_type_from_id(story_id);
if item_type == "spike" {
crate::io::story_metadata::QaMode::Human
} else {
let default_qa = config.default_qa_mode();
resolve_qa_mode_from_store(&project_root, story_id, default_qa)
}
};
match qa_mode {
crate::io::story_metadata::QaMode::Server => {
if let Err(e) = crate::agents::lifecycle::move_story_to_merge(
&project_root,
story_id,
) {
slog_error!(
"[pipeline] Failed to move '{story_id}' to 4_merge/: {e}"
);
} else {
self.start_mergemaster_or_block(&project_root, story_id)
.await;
}
}
crate::io::story_metadata::QaMode::Agent => {
if let Err(e) = crate::agents::lifecycle::move_story_to_qa(
&project_root,
story_id,
) {
slog_error!(
"[pipeline] Failed to move '{story_id}' to 3_qa/: {e}"
);
} else if let Err(e) = self
.start_agent(&project_root, story_id, Some("qa"), None, None)
.await
{
slog_error!(
"[pipeline] Failed to start qa for '{story_id}': {e}"
);
}
}
crate::io::story_metadata::QaMode::Human => {
if let Err(e) = crate::agents::lifecycle::move_story_to_qa(
&project_root,
story_id,
) {
slog_error!(
"[pipeline] Failed to move '{story_id}' to 3_qa/: {e}"
);
} else {
write_review_hold_to_store(story_id);
}
}
}
} else
// Increment retry count and check if blocked.
if let Some(reason) = should_block_story(story_id, config.max_retries, "coder")
if let Some(reason) =
should_block_story(story_id, config.max_retries, "coder")
{
// Story has exceeded retry limit — do not restart.
let _ = self.watcher_tx.send(WatcherEvent::StoryBlocked {
@@ -1062,4 +1132,218 @@ stage = "qa"
);
}
}
// ── bug 645: work-survived check advances to QA instead of blocking ──
/// Integration test: when a coder agent fails gates but committed work
/// survives and compiles, the story advances to QA (not retry/block).
/// Simulates an agent that commits work and then dies mid-output.
#[tokio::test]
async fn work_survived_advances_to_qa_instead_of_blocking() {
use std::fs;
use std::process::Command;
let tmp = tempfile::tempdir().unwrap();
let root = tmp.path();
// Init a git repo with a minimal Cargo project.
Command::new("git")
.args(["init"])
.current_dir(root)
.output()
.unwrap();
Command::new("git")
.args(["config", "user.email", "test@test.com"])
.current_dir(root)
.output()
.unwrap();
Command::new("git")
.args(["config", "user.name", "Test"])
.current_dir(root)
.output()
.unwrap();
fs::write(
root.join("Cargo.toml"),
"[package]\nname = \"test_proj\"\nversion = \"0.1.0\"\nedition = \"2021\"\n",
)
.unwrap();
fs::create_dir_all(root.join("src")).unwrap();
fs::write(root.join("src/lib.rs"), "// empty\n").unwrap();
Command::new("git")
.args(["add", "."])
.current_dir(root)
.output()
.unwrap();
Command::new("git")
.args(["commit", "-m", "init"])
.current_dir(root)
.output()
.unwrap();
// Create a worktree on a feature branch.
let wt_path = tmp.path().join("wt");
Command::new("git")
.args([
"worktree",
"add",
&wt_path.to_string_lossy(),
"-b",
"feature/story-9945_story_survived",
])
.current_dir(root)
.output()
.unwrap();
// Commit valid code on the feature branch.
fs::write(wt_path.join("src/lib.rs"), "pub fn survived() {}\n").unwrap();
Command::new("git")
.args(["add", "."])
.current_dir(&wt_path)
.output()
.unwrap();
Command::new("git")
.args(["commit", "-m", "add survived fn"])
.current_dir(&wt_path)
.output()
.unwrap();
// Set up the story in the content store.
crate::db::ensure_content_store();
crate::db::write_content("9945_story_survived", "---\nname: Survived Test\n---\n");
crate::db::write_item_with_content(
"9945_story_survived",
"2_current",
"---\nname: Survived Test\n---\n",
);
let pool = AgentPool::new_test(3001);
// Simulate coder failing gates (e.g. agent crashed, dirty worktree).
pool.run_pipeline_advance(
"9945_story_survived",
"coder-1",
CompletionReport {
summary: "Agent crashed".to_string(),
gates_passed: false,
gate_output: "Worktree has uncommitted changes".to_string(),
},
Some(root.to_path_buf()),
Some(wt_path),
false,
None,
)
.await;
// Story should have advanced — content store should reflect the move.
// The work-survived check should have moved it to QA (or merge for
// server qa mode), NOT incremented retry_count.
let content = crate::db::read_content("9945_story_survived")
.expect("story should exist in content store");
assert!(
!content.contains("blocked"),
"story should NOT be blocked when committed work survives: {content}"
);
assert!(
!content.contains("retry_count"),
"story should NOT have retry_count when work survived: {content}"
);
}
/// Backwards-compat: agents that die WITHOUT committed work still get
/// the existing retry/block treatment.
#[tokio::test]
async fn no_committed_work_still_retries_and_blocks() {
use std::fs;
use std::process::Command;
let tmp = tempfile::tempdir().unwrap();
let root = tmp.path();
// Init a git repo (no Cargo project needed — cargo check will fail).
Command::new("git")
.args(["init"])
.current_dir(root)
.output()
.unwrap();
Command::new("git")
.args(["config", "user.email", "test@test.com"])
.current_dir(root)
.output()
.unwrap();
Command::new("git")
.args(["config", "user.name", "Test"])
.current_dir(root)
.output()
.unwrap();
Command::new("git")
.args(["commit", "--allow-empty", "-m", "init"])
.current_dir(root)
.output()
.unwrap();
// Create a worktree with NO commits on the feature branch.
let wt_path = tmp.path().join("wt");
Command::new("git")
.args([
"worktree",
"add",
&wt_path.to_string_lossy(),
"-b",
"feature/story-9946_story_nowork",
])
.current_dir(root)
.output()
.unwrap();
// Set up the story with max_retries=1 so it blocks immediately.
crate::db::ensure_content_store();
crate::db::write_content("9946_story_nowork", "---\nname: No Work Test\n---\n");
crate::db::write_item_with_content(
"9946_story_nowork",
"2_current",
"---\nname: No Work Test\n---\n",
);
// Write a project.toml with max_retries = 1.
fs::create_dir_all(root.join(".huskies")).unwrap();
fs::write(
root.join(".huskies/project.toml"),
"max_retries = 1\n\n[[agent]]\nname = \"coder-1\"\nstage = \"coder\"\n",
)
.unwrap();
let pool = AgentPool::new_test(3001);
let mut rx = pool.watcher_tx.subscribe();
// Simulate coder failing gates with NO committed work on the worktree.
pool.run_pipeline_advance(
"9946_story_nowork",
"coder-1",
CompletionReport {
summary: "Agent crashed".to_string(),
gates_passed: false,
gate_output: "Tests failed".to_string(),
},
Some(root.to_path_buf()),
Some(wt_path),
false,
None,
)
.await;
// With no committed work and max_retries=1, the story should be blocked.
let mut got_blocked = false;
while let Ok(evt) = rx.try_recv() {
if let WatcherEvent::StoryBlocked { story_id, .. } = &evt
&& story_id == "9946_story_nowork"
{
got_blocked = true;
break;
}
}
assert!(
got_blocked,
"Story with no committed work should be blocked after exceeding retry limit"
);
}
}
+103 -1
View File
@@ -232,7 +232,31 @@ pub(in crate::agents::pool) async fn run_server_owned_completion(
let (gates_passed, gate_output) = if let Some(wt_path) = worktree_path {
let path = wt_path;
match tokio::task::spawn_blocking(move || {
crate::agents::gates::check_uncommitted_changes(&path)?;
// If the worktree is dirty, check whether committed work survived.
// An agent crash (e.g. Claude Code CLI's `output.write(&bytes).is_ok()`
// assertion — bug 645) can leave uncommitted files behind even though
// the agent already committed valid work. In that case, clean up the
// dirty files and proceed with gates on the committed code.
if let Err(dirty_msg) = crate::agents::gates::check_uncommitted_changes(&path) {
if crate::agents::gates::worktree_has_committed_work(&path) {
crate::slog!(
"[agents] Worktree dirty but committed work exists — \
resetting uncommitted changes and proceeding with gates. \
Dirty state: {dirty_msg}"
);
// Reset dirty files so gates run against committed code only.
let _ = std::process::Command::new("git")
.args(["checkout", "."])
.current_dir(&path)
.output();
let _ = std::process::Command::new("git")
.args(["clean", "-fd"])
.current_dir(&path)
.output();
} else {
return Ok((false, dirty_msg));
}
}
// AC5: Fail early if the coder finished with no commits on the feature branch.
// This prevents empty-diff stories from advancing through QA to merge.
if !crate::agents::gates::worktree_has_committed_work(&path) {
@@ -642,4 +666,82 @@ mod tests {
"Agent must remain in pool — run_server_owned_completion is a no-op for mergemaster"
);
}
/// Bug 645: when an agent crashes leaving dirty files but committed work,
/// server-owned completion should clean up the dirty files and run gates
/// on the committed code instead of failing immediately.
#[tokio::test]
async fn server_owned_completion_cleans_dirty_worktree_with_committed_work() {
use std::fs;
use tempfile::tempdir;
let tmp = tempdir().unwrap();
let project_root = tmp.path().join("project");
fs::create_dir_all(&project_root).unwrap();
init_git_repo(&project_root);
// Create a worktree on a feature branch with committed code.
let wt_path = tmp.path().join("wt");
Command::new("git")
.args([
"worktree",
"add",
&wt_path.to_string_lossy(),
"-b",
"feature/story-645_test",
])
.current_dir(&project_root)
.output()
.unwrap();
// Commit a valid file.
fs::write(wt_path.join("work.txt"), "done").unwrap();
Command::new("git")
.args(["add", "."])
.current_dir(&wt_path)
.output()
.unwrap();
Command::new("git")
.args(["commit", "-m", "coder: add work"])
.current_dir(&wt_path)
.output()
.unwrap();
// Now simulate crash leaving dirty files.
fs::write(wt_path.join("dirty.txt"), "crash residue").unwrap();
let pool = AgentPool::new_test(3001);
pool.inject_test_agent_with_path(
"645_test",
"coder-1",
AgentStatus::Running,
wt_path.clone(),
);
let mut rx = pool.subscribe("645_test", "coder-1").unwrap();
run_server_owned_completion(
&pool.agents,
pool.port,
"645_test",
"coder-1",
Some("sess-645".to_string()),
pool.watcher_tx.clone(),
)
.await;
// The dirty file should have been cleaned up.
assert!(
!wt_path.join("dirty.txt").exists(),
"dirty file should be cleaned up after server-owned completion"
);
// A Done event should have been emitted (completion ran, didn't fail
// on dirty worktree).
let event = rx.try_recv().expect("should emit Done event");
assert!(
matches!(event, AgentEvent::Done { .. }),
"expected Done event, got: {event:?}"
);
}
}