huskies: merge 645_bug_agent_runtime_panics_with_output_write_bytes_is_ok_assertion_marking_stories_falsely_blocked
This commit is contained in:
@@ -181,6 +181,25 @@ The mergemaster agent only runs against stories in `4_merge/`. It:
|
||||
4. On failure beyond the retry budget, writes `merge_failure` and blocks the
|
||||
story (auto-assign then skips it).
|
||||
|
||||
### Agent terminated with committed work (bug 645 recovery path)
|
||||
|
||||
When a coder agent terminates abnormally (e.g. the Claude Code CLI's
|
||||
`output.write(&bytes).is_ok()` PTY write assertion fires mid-session), the
|
||||
server-owned completion path detects the crash and checks for surviving work:
|
||||
|
||||
1. If the worktree is dirty but has commits ahead of master, reset the
|
||||
uncommitted files (`git checkout . && git clean -fd`) and run gates
|
||||
against the committed code.
|
||||
2. If gates still fail but `git log master..HEAD` shows commits and
|
||||
`cargo check` passes, **advance to QA** instead of entering the
|
||||
retry/block path. This is the "work survived" check, implemented in
|
||||
`server/src/agents/pool/pipeline/advance.rs`.
|
||||
3. Agents that die WITHOUT committed work (no commits ahead of master)
|
||||
still follow the existing retry → block path unchanged.
|
||||
|
||||
This prevents false-positive blocking of stories where the agent completed
|
||||
meaningful work before crashing.
|
||||
|
||||
### Watchdog (current production)
|
||||
|
||||
The "watchdog" at `server/src/agents/pool/auto_assign/watchdog.rs` runs every
|
||||
|
||||
@@ -41,6 +41,26 @@ pub(crate) fn worktree_has_committed_work(wt_path: &Path) -> bool {
|
||||
}
|
||||
}
|
||||
|
||||
/// Run `cargo check` in the given worktree directory to verify committed code compiles.
|
||||
///
|
||||
/// Resets any dirty (uncommitted) files first via `git checkout .` so that only
|
||||
/// the committed state is evaluated. Used as part of the "work survived" check
|
||||
/// when an agent crashes mid-output (bug 645).
|
||||
pub(crate) fn cargo_check_in_worktree(wt_path: &Path) -> bool {
|
||||
// Reset uncommitted changes so cargo check evaluates only committed code.
|
||||
let _ = Command::new("git")
|
||||
.args(["checkout", "."])
|
||||
.current_dir(wt_path)
|
||||
.output();
|
||||
|
||||
Command::new("cargo")
|
||||
.args(["check"])
|
||||
.current_dir(wt_path)
|
||||
.output()
|
||||
.map(|o| o.status.success())
|
||||
.unwrap_or(false)
|
||||
}
|
||||
|
||||
/// Check whether the given directory has any uncommitted git changes.
|
||||
/// Returns `Err` with a descriptive message if there are any.
|
||||
pub(crate) fn check_uncommitted_changes(path: &Path) -> Result<(), String> {
|
||||
@@ -434,4 +454,87 @@ mod tests {
|
||||
// Now the feature branch is ahead of the base branch.
|
||||
assert!(worktree_has_committed_work(&wt_path));
|
||||
}
|
||||
|
||||
// ── cargo_check_in_worktree tests ────────────────────────────────────────
|
||||
|
||||
/// Bug 645: cargo_check_in_worktree resets dirty files before checking.
|
||||
#[test]
|
||||
fn cargo_check_in_worktree_resets_dirty_files() {
|
||||
use std::fs;
|
||||
let tmp = tempfile::tempdir().unwrap();
|
||||
let project_root = tmp.path().join("project");
|
||||
fs::create_dir_all(&project_root).unwrap();
|
||||
init_git_repo(&project_root);
|
||||
|
||||
// Create a minimal Cargo project so cargo check works.
|
||||
fs::write(
|
||||
project_root.join("Cargo.toml"),
|
||||
"[package]\nname = \"test_proj\"\nversion = \"0.1.0\"\nedition = \"2021\"\n",
|
||||
)
|
||||
.unwrap();
|
||||
fs::create_dir_all(project_root.join("src")).unwrap();
|
||||
fs::write(project_root.join("src/lib.rs"), "// empty lib\n").unwrap();
|
||||
Command::new("git")
|
||||
.args(["add", "."])
|
||||
.current_dir(&project_root)
|
||||
.output()
|
||||
.unwrap();
|
||||
Command::new("git")
|
||||
.args([
|
||||
"-c",
|
||||
"user.email=test@test.com",
|
||||
"-c",
|
||||
"user.name=Test",
|
||||
"commit",
|
||||
"-m",
|
||||
"add cargo project",
|
||||
])
|
||||
.current_dir(&project_root)
|
||||
.output()
|
||||
.unwrap();
|
||||
|
||||
// Create a worktree on a feature branch.
|
||||
let wt_path = tmp.path().join("wt");
|
||||
Command::new("git")
|
||||
.args([
|
||||
"worktree",
|
||||
"add",
|
||||
&wt_path.to_string_lossy(),
|
||||
"-b",
|
||||
"feature/story-645_test",
|
||||
])
|
||||
.current_dir(&project_root)
|
||||
.output()
|
||||
.unwrap();
|
||||
|
||||
// Commit valid code.
|
||||
fs::write(wt_path.join("src/lib.rs"), "pub fn hello() {}\n").unwrap();
|
||||
Command::new("git")
|
||||
.args(["add", "."])
|
||||
.current_dir(&wt_path)
|
||||
.output()
|
||||
.unwrap();
|
||||
Command::new("git")
|
||||
.args([
|
||||
"-c",
|
||||
"user.email=test@test.com",
|
||||
"-c",
|
||||
"user.name=Test",
|
||||
"commit",
|
||||
"-m",
|
||||
"add hello fn",
|
||||
])
|
||||
.current_dir(&wt_path)
|
||||
.output()
|
||||
.unwrap();
|
||||
|
||||
// Now simulate a crash leaving dirty files (broken syntax).
|
||||
fs::write(wt_path.join("src/lib.rs"), "THIS IS BROKEN SYNTAX!!!\n").unwrap();
|
||||
|
||||
// cargo_check_in_worktree should reset dirty files and check committed code.
|
||||
assert!(
|
||||
cargo_check_in_worktree(&wt_path),
|
||||
"cargo check should pass on committed code after resetting dirty files"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -115,8 +115,78 @@ impl AgentPool {
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Bug 645: Before retry/block, check if the agent left committed
|
||||
// work that compiles. An agent may crash mid-output (e.g. Claude
|
||||
// Code CLI PTY write assertion) after having already committed valid
|
||||
// code. When committed work survives and `cargo check` passes,
|
||||
// advance to QA instead of wasting retries.
|
||||
let work_survived = worktree_path.as_ref().is_some_and(|wt_path| {
|
||||
crate::agents::gates::worktree_has_committed_work(wt_path)
|
||||
&& crate::agents::gates::cargo_check_in_worktree(wt_path)
|
||||
});
|
||||
if work_survived {
|
||||
slog!(
|
||||
"[pipeline] Coder '{agent_name}' failed gates for '{story_id}' but \
|
||||
committed work survives and compiles. Advancing to QA instead of \
|
||||
retrying (bug 645)."
|
||||
);
|
||||
let qa_mode = {
|
||||
let item_type = crate::agents::lifecycle::item_type_from_id(story_id);
|
||||
if item_type == "spike" {
|
||||
crate::io::story_metadata::QaMode::Human
|
||||
} else {
|
||||
let default_qa = config.default_qa_mode();
|
||||
resolve_qa_mode_from_store(&project_root, story_id, default_qa)
|
||||
}
|
||||
};
|
||||
match qa_mode {
|
||||
crate::io::story_metadata::QaMode::Server => {
|
||||
if let Err(e) = crate::agents::lifecycle::move_story_to_merge(
|
||||
&project_root,
|
||||
story_id,
|
||||
) {
|
||||
slog_error!(
|
||||
"[pipeline] Failed to move '{story_id}' to 4_merge/: {e}"
|
||||
);
|
||||
} else {
|
||||
self.start_mergemaster_or_block(&project_root, story_id)
|
||||
.await;
|
||||
}
|
||||
}
|
||||
crate::io::story_metadata::QaMode::Agent => {
|
||||
if let Err(e) = crate::agents::lifecycle::move_story_to_qa(
|
||||
&project_root,
|
||||
story_id,
|
||||
) {
|
||||
slog_error!(
|
||||
"[pipeline] Failed to move '{story_id}' to 3_qa/: {e}"
|
||||
);
|
||||
} else if let Err(e) = self
|
||||
.start_agent(&project_root, story_id, Some("qa"), None, None)
|
||||
.await
|
||||
{
|
||||
slog_error!(
|
||||
"[pipeline] Failed to start qa for '{story_id}': {e}"
|
||||
);
|
||||
}
|
||||
}
|
||||
crate::io::story_metadata::QaMode::Human => {
|
||||
if let Err(e) = crate::agents::lifecycle::move_story_to_qa(
|
||||
&project_root,
|
||||
story_id,
|
||||
) {
|
||||
slog_error!(
|
||||
"[pipeline] Failed to move '{story_id}' to 3_qa/: {e}"
|
||||
);
|
||||
} else {
|
||||
write_review_hold_to_store(story_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
} else
|
||||
// Increment retry count and check if blocked.
|
||||
if let Some(reason) = should_block_story(story_id, config.max_retries, "coder")
|
||||
if let Some(reason) =
|
||||
should_block_story(story_id, config.max_retries, "coder")
|
||||
{
|
||||
// Story has exceeded retry limit — do not restart.
|
||||
let _ = self.watcher_tx.send(WatcherEvent::StoryBlocked {
|
||||
@@ -1062,4 +1132,218 @@ stage = "qa"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// ── bug 645: work-survived check advances to QA instead of blocking ──
|
||||
|
||||
/// Integration test: when a coder agent fails gates but committed work
|
||||
/// survives and compiles, the story advances to QA (not retry/block).
|
||||
/// Simulates an agent that commits work and then dies mid-output.
|
||||
#[tokio::test]
|
||||
async fn work_survived_advances_to_qa_instead_of_blocking() {
|
||||
use std::fs;
|
||||
use std::process::Command;
|
||||
|
||||
let tmp = tempfile::tempdir().unwrap();
|
||||
let root = tmp.path();
|
||||
|
||||
// Init a git repo with a minimal Cargo project.
|
||||
Command::new("git")
|
||||
.args(["init"])
|
||||
.current_dir(root)
|
||||
.output()
|
||||
.unwrap();
|
||||
Command::new("git")
|
||||
.args(["config", "user.email", "test@test.com"])
|
||||
.current_dir(root)
|
||||
.output()
|
||||
.unwrap();
|
||||
Command::new("git")
|
||||
.args(["config", "user.name", "Test"])
|
||||
.current_dir(root)
|
||||
.output()
|
||||
.unwrap();
|
||||
fs::write(
|
||||
root.join("Cargo.toml"),
|
||||
"[package]\nname = \"test_proj\"\nversion = \"0.1.0\"\nedition = \"2021\"\n",
|
||||
)
|
||||
.unwrap();
|
||||
fs::create_dir_all(root.join("src")).unwrap();
|
||||
fs::write(root.join("src/lib.rs"), "// empty\n").unwrap();
|
||||
Command::new("git")
|
||||
.args(["add", "."])
|
||||
.current_dir(root)
|
||||
.output()
|
||||
.unwrap();
|
||||
Command::new("git")
|
||||
.args(["commit", "-m", "init"])
|
||||
.current_dir(root)
|
||||
.output()
|
||||
.unwrap();
|
||||
|
||||
// Create a worktree on a feature branch.
|
||||
let wt_path = tmp.path().join("wt");
|
||||
Command::new("git")
|
||||
.args([
|
||||
"worktree",
|
||||
"add",
|
||||
&wt_path.to_string_lossy(),
|
||||
"-b",
|
||||
"feature/story-9945_story_survived",
|
||||
])
|
||||
.current_dir(root)
|
||||
.output()
|
||||
.unwrap();
|
||||
|
||||
// Commit valid code on the feature branch.
|
||||
fs::write(wt_path.join("src/lib.rs"), "pub fn survived() {}\n").unwrap();
|
||||
Command::new("git")
|
||||
.args(["add", "."])
|
||||
.current_dir(&wt_path)
|
||||
.output()
|
||||
.unwrap();
|
||||
Command::new("git")
|
||||
.args(["commit", "-m", "add survived fn"])
|
||||
.current_dir(&wt_path)
|
||||
.output()
|
||||
.unwrap();
|
||||
|
||||
// Set up the story in the content store.
|
||||
crate::db::ensure_content_store();
|
||||
crate::db::write_content("9945_story_survived", "---\nname: Survived Test\n---\n");
|
||||
crate::db::write_item_with_content(
|
||||
"9945_story_survived",
|
||||
"2_current",
|
||||
"---\nname: Survived Test\n---\n",
|
||||
);
|
||||
|
||||
let pool = AgentPool::new_test(3001);
|
||||
|
||||
// Simulate coder failing gates (e.g. agent crashed, dirty worktree).
|
||||
pool.run_pipeline_advance(
|
||||
"9945_story_survived",
|
||||
"coder-1",
|
||||
CompletionReport {
|
||||
summary: "Agent crashed".to_string(),
|
||||
gates_passed: false,
|
||||
gate_output: "Worktree has uncommitted changes".to_string(),
|
||||
},
|
||||
Some(root.to_path_buf()),
|
||||
Some(wt_path),
|
||||
false,
|
||||
None,
|
||||
)
|
||||
.await;
|
||||
|
||||
// Story should have advanced — content store should reflect the move.
|
||||
// The work-survived check should have moved it to QA (or merge for
|
||||
// server qa mode), NOT incremented retry_count.
|
||||
let content = crate::db::read_content("9945_story_survived")
|
||||
.expect("story should exist in content store");
|
||||
assert!(
|
||||
!content.contains("blocked"),
|
||||
"story should NOT be blocked when committed work survives: {content}"
|
||||
);
|
||||
assert!(
|
||||
!content.contains("retry_count"),
|
||||
"story should NOT have retry_count when work survived: {content}"
|
||||
);
|
||||
}
|
||||
|
||||
/// Backwards-compat: agents that die WITHOUT committed work still get
|
||||
/// the existing retry/block treatment.
|
||||
#[tokio::test]
|
||||
async fn no_committed_work_still_retries_and_blocks() {
|
||||
use std::fs;
|
||||
use std::process::Command;
|
||||
|
||||
let tmp = tempfile::tempdir().unwrap();
|
||||
let root = tmp.path();
|
||||
|
||||
// Init a git repo (no Cargo project needed — cargo check will fail).
|
||||
Command::new("git")
|
||||
.args(["init"])
|
||||
.current_dir(root)
|
||||
.output()
|
||||
.unwrap();
|
||||
Command::new("git")
|
||||
.args(["config", "user.email", "test@test.com"])
|
||||
.current_dir(root)
|
||||
.output()
|
||||
.unwrap();
|
||||
Command::new("git")
|
||||
.args(["config", "user.name", "Test"])
|
||||
.current_dir(root)
|
||||
.output()
|
||||
.unwrap();
|
||||
Command::new("git")
|
||||
.args(["commit", "--allow-empty", "-m", "init"])
|
||||
.current_dir(root)
|
||||
.output()
|
||||
.unwrap();
|
||||
|
||||
// Create a worktree with NO commits on the feature branch.
|
||||
let wt_path = tmp.path().join("wt");
|
||||
Command::new("git")
|
||||
.args([
|
||||
"worktree",
|
||||
"add",
|
||||
&wt_path.to_string_lossy(),
|
||||
"-b",
|
||||
"feature/story-9946_story_nowork",
|
||||
])
|
||||
.current_dir(root)
|
||||
.output()
|
||||
.unwrap();
|
||||
|
||||
// Set up the story with max_retries=1 so it blocks immediately.
|
||||
crate::db::ensure_content_store();
|
||||
crate::db::write_content("9946_story_nowork", "---\nname: No Work Test\n---\n");
|
||||
crate::db::write_item_with_content(
|
||||
"9946_story_nowork",
|
||||
"2_current",
|
||||
"---\nname: No Work Test\n---\n",
|
||||
);
|
||||
|
||||
// Write a project.toml with max_retries = 1.
|
||||
fs::create_dir_all(root.join(".huskies")).unwrap();
|
||||
fs::write(
|
||||
root.join(".huskies/project.toml"),
|
||||
"max_retries = 1\n\n[[agent]]\nname = \"coder-1\"\nstage = \"coder\"\n",
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let pool = AgentPool::new_test(3001);
|
||||
let mut rx = pool.watcher_tx.subscribe();
|
||||
|
||||
// Simulate coder failing gates with NO committed work on the worktree.
|
||||
pool.run_pipeline_advance(
|
||||
"9946_story_nowork",
|
||||
"coder-1",
|
||||
CompletionReport {
|
||||
summary: "Agent crashed".to_string(),
|
||||
gates_passed: false,
|
||||
gate_output: "Tests failed".to_string(),
|
||||
},
|
||||
Some(root.to_path_buf()),
|
||||
Some(wt_path),
|
||||
false,
|
||||
None,
|
||||
)
|
||||
.await;
|
||||
|
||||
// With no committed work and max_retries=1, the story should be blocked.
|
||||
let mut got_blocked = false;
|
||||
while let Ok(evt) = rx.try_recv() {
|
||||
if let WatcherEvent::StoryBlocked { story_id, .. } = &evt
|
||||
&& story_id == "9946_story_nowork"
|
||||
{
|
||||
got_blocked = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
assert!(
|
||||
got_blocked,
|
||||
"Story with no committed work should be blocked after exceeding retry limit"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -232,7 +232,31 @@ pub(in crate::agents::pool) async fn run_server_owned_completion(
|
||||
let (gates_passed, gate_output) = if let Some(wt_path) = worktree_path {
|
||||
let path = wt_path;
|
||||
match tokio::task::spawn_blocking(move || {
|
||||
crate::agents::gates::check_uncommitted_changes(&path)?;
|
||||
// If the worktree is dirty, check whether committed work survived.
|
||||
// An agent crash (e.g. Claude Code CLI's `output.write(&bytes).is_ok()`
|
||||
// assertion — bug 645) can leave uncommitted files behind even though
|
||||
// the agent already committed valid work. In that case, clean up the
|
||||
// dirty files and proceed with gates on the committed code.
|
||||
if let Err(dirty_msg) = crate::agents::gates::check_uncommitted_changes(&path) {
|
||||
if crate::agents::gates::worktree_has_committed_work(&path) {
|
||||
crate::slog!(
|
||||
"[agents] Worktree dirty but committed work exists — \
|
||||
resetting uncommitted changes and proceeding with gates. \
|
||||
Dirty state: {dirty_msg}"
|
||||
);
|
||||
// Reset dirty files so gates run against committed code only.
|
||||
let _ = std::process::Command::new("git")
|
||||
.args(["checkout", "."])
|
||||
.current_dir(&path)
|
||||
.output();
|
||||
let _ = std::process::Command::new("git")
|
||||
.args(["clean", "-fd"])
|
||||
.current_dir(&path)
|
||||
.output();
|
||||
} else {
|
||||
return Ok((false, dirty_msg));
|
||||
}
|
||||
}
|
||||
// AC5: Fail early if the coder finished with no commits on the feature branch.
|
||||
// This prevents empty-diff stories from advancing through QA to merge.
|
||||
if !crate::agents::gates::worktree_has_committed_work(&path) {
|
||||
@@ -642,4 +666,82 @@ mod tests {
|
||||
"Agent must remain in pool — run_server_owned_completion is a no-op for mergemaster"
|
||||
);
|
||||
}
|
||||
|
||||
/// Bug 645: when an agent crashes leaving dirty files but committed work,
|
||||
/// server-owned completion should clean up the dirty files and run gates
|
||||
/// on the committed code instead of failing immediately.
|
||||
#[tokio::test]
|
||||
async fn server_owned_completion_cleans_dirty_worktree_with_committed_work() {
|
||||
use std::fs;
|
||||
use tempfile::tempdir;
|
||||
|
||||
let tmp = tempdir().unwrap();
|
||||
let project_root = tmp.path().join("project");
|
||||
fs::create_dir_all(&project_root).unwrap();
|
||||
init_git_repo(&project_root);
|
||||
|
||||
// Create a worktree on a feature branch with committed code.
|
||||
let wt_path = tmp.path().join("wt");
|
||||
Command::new("git")
|
||||
.args([
|
||||
"worktree",
|
||||
"add",
|
||||
&wt_path.to_string_lossy(),
|
||||
"-b",
|
||||
"feature/story-645_test",
|
||||
])
|
||||
.current_dir(&project_root)
|
||||
.output()
|
||||
.unwrap();
|
||||
|
||||
// Commit a valid file.
|
||||
fs::write(wt_path.join("work.txt"), "done").unwrap();
|
||||
Command::new("git")
|
||||
.args(["add", "."])
|
||||
.current_dir(&wt_path)
|
||||
.output()
|
||||
.unwrap();
|
||||
Command::new("git")
|
||||
.args(["commit", "-m", "coder: add work"])
|
||||
.current_dir(&wt_path)
|
||||
.output()
|
||||
.unwrap();
|
||||
|
||||
// Now simulate crash leaving dirty files.
|
||||
fs::write(wt_path.join("dirty.txt"), "crash residue").unwrap();
|
||||
|
||||
let pool = AgentPool::new_test(3001);
|
||||
pool.inject_test_agent_with_path(
|
||||
"645_test",
|
||||
"coder-1",
|
||||
AgentStatus::Running,
|
||||
wt_path.clone(),
|
||||
);
|
||||
|
||||
let mut rx = pool.subscribe("645_test", "coder-1").unwrap();
|
||||
|
||||
run_server_owned_completion(
|
||||
&pool.agents,
|
||||
pool.port,
|
||||
"645_test",
|
||||
"coder-1",
|
||||
Some("sess-645".to_string()),
|
||||
pool.watcher_tx.clone(),
|
||||
)
|
||||
.await;
|
||||
|
||||
// The dirty file should have been cleaned up.
|
||||
assert!(
|
||||
!wt_path.join("dirty.txt").exists(),
|
||||
"dirty file should be cleaned up after server-owned completion"
|
||||
);
|
||||
|
||||
// A Done event should have been emitted (completion ran, didn't fail
|
||||
// on dirty worktree).
|
||||
let event = rx.try_recv().expect("should emit Done event");
|
||||
assert!(
|
||||
matches!(event, AgentEvent::Done { .. }),
|
||||
"expected Done event, got: {event:?}"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -36,6 +36,22 @@ impl Drop for ChildKillerGuard {
|
||||
}
|
||||
|
||||
/// Spawn claude agent in a PTY and stream events through the broadcast channel.
|
||||
///
|
||||
/// ## Bug 645: `output.write(&bytes).is_ok()` assertion in Claude Code CLI
|
||||
///
|
||||
/// The Claude Code CLI can panic with an `output.write(&bytes).is_ok()` assertion
|
||||
/// when writing to its stdout (the PTY slave end). This occurs inside the child
|
||||
/// process — not in this server code — when the PTY pipe breaks or fills. The
|
||||
/// `output` in the assertion is the CLI's stdout writer, and the write fails when
|
||||
/// the PTY master side is closed or the kernel pipe buffer is exhausted.
|
||||
///
|
||||
/// When this happens, the child process dies, the PTY reader thread in this
|
||||
/// function receives EOF, and `run_agent_pty_blocking` returns `Ok(PtyResult)`.
|
||||
/// The server then runs completion gates via `run_server_owned_completion`.
|
||||
///
|
||||
/// If the agent committed valid work before crashing, the "work survived" check
|
||||
/// in `pipeline::advance` detects the committed code and advances the story to
|
||||
/// QA instead of entering the retry/block path.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub(in crate::agents) async fn run_agent_pty_streaming(
|
||||
story_id: &str,
|
||||
|
||||
Reference in New Issue
Block a user