diff --git a/.huskies/specs/tech/STATE_MACHINE.md b/.huskies/specs/tech/STATE_MACHINE.md index 18c30055..e3b31c31 100644 --- a/.huskies/specs/tech/STATE_MACHINE.md +++ b/.huskies/specs/tech/STATE_MACHINE.md @@ -181,6 +181,25 @@ The mergemaster agent only runs against stories in `4_merge/`. It: 4. On failure beyond the retry budget, writes `merge_failure` and blocks the story (auto-assign then skips it). +### Agent terminated with committed work (bug 645 recovery path) + +When a coder agent terminates abnormally (e.g. the Claude Code CLI's +`output.write(&bytes).is_ok()` PTY write assertion fires mid-session), the +server-owned completion path detects the crash and checks for surviving work: + +1. If the worktree is dirty but has commits ahead of master, reset the + uncommitted files (`git checkout . && git clean -fd`) and run gates + against the committed code. +2. If gates still fail but `git log master..HEAD` shows commits and + `cargo check` passes, **advance to QA** instead of entering the + retry/block path. This is the "work survived" check, implemented in + `server/src/agents/pool/pipeline/advance.rs`. +3. Agents that die WITHOUT committed work (no commits ahead of master) + still follow the existing retry → block path unchanged. + +This prevents false-positive blocking of stories where the agent completed +meaningful work before crashing. + ### Watchdog (current production) The "watchdog" at `server/src/agents/pool/auto_assign/watchdog.rs` runs every diff --git a/server/src/agents/gates.rs b/server/src/agents/gates.rs index 074daeaf..d587a3dc 100644 --- a/server/src/agents/gates.rs +++ b/server/src/agents/gates.rs @@ -41,6 +41,26 @@ pub(crate) fn worktree_has_committed_work(wt_path: &Path) -> bool { } } +/// Run `cargo check` in the given worktree directory to verify committed code compiles. +/// +/// Resets any dirty (uncommitted) files first via `git checkout .` so that only +/// the committed state is evaluated. Used as part of the "work survived" check +/// when an agent crashes mid-output (bug 645). +pub(crate) fn cargo_check_in_worktree(wt_path: &Path) -> bool { + // Reset uncommitted changes so cargo check evaluates only committed code. + let _ = Command::new("git") + .args(["checkout", "."]) + .current_dir(wt_path) + .output(); + + Command::new("cargo") + .args(["check"]) + .current_dir(wt_path) + .output() + .map(|o| o.status.success()) + .unwrap_or(false) +} + /// Check whether the given directory has any uncommitted git changes. /// Returns `Err` with a descriptive message if there are any. pub(crate) fn check_uncommitted_changes(path: &Path) -> Result<(), String> { @@ -434,4 +454,87 @@ mod tests { // Now the feature branch is ahead of the base branch. assert!(worktree_has_committed_work(&wt_path)); } + + // ── cargo_check_in_worktree tests ──────────────────────────────────────── + + /// Bug 645: cargo_check_in_worktree resets dirty files before checking. + #[test] + fn cargo_check_in_worktree_resets_dirty_files() { + use std::fs; + let tmp = tempfile::tempdir().unwrap(); + let project_root = tmp.path().join("project"); + fs::create_dir_all(&project_root).unwrap(); + init_git_repo(&project_root); + + // Create a minimal Cargo project so cargo check works. + fs::write( + project_root.join("Cargo.toml"), + "[package]\nname = \"test_proj\"\nversion = \"0.1.0\"\nedition = \"2021\"\n", + ) + .unwrap(); + fs::create_dir_all(project_root.join("src")).unwrap(); + fs::write(project_root.join("src/lib.rs"), "// empty lib\n").unwrap(); + Command::new("git") + .args(["add", "."]) + .current_dir(&project_root) + .output() + .unwrap(); + Command::new("git") + .args([ + "-c", + "user.email=test@test.com", + "-c", + "user.name=Test", + "commit", + "-m", + "add cargo project", + ]) + .current_dir(&project_root) + .output() + .unwrap(); + + // Create a worktree on a feature branch. + let wt_path = tmp.path().join("wt"); + Command::new("git") + .args([ + "worktree", + "add", + &wt_path.to_string_lossy(), + "-b", + "feature/story-645_test", + ]) + .current_dir(&project_root) + .output() + .unwrap(); + + // Commit valid code. + fs::write(wt_path.join("src/lib.rs"), "pub fn hello() {}\n").unwrap(); + Command::new("git") + .args(["add", "."]) + .current_dir(&wt_path) + .output() + .unwrap(); + Command::new("git") + .args([ + "-c", + "user.email=test@test.com", + "-c", + "user.name=Test", + "commit", + "-m", + "add hello fn", + ]) + .current_dir(&wt_path) + .output() + .unwrap(); + + // Now simulate a crash leaving dirty files (broken syntax). + fs::write(wt_path.join("src/lib.rs"), "THIS IS BROKEN SYNTAX!!!\n").unwrap(); + + // cargo_check_in_worktree should reset dirty files and check committed code. + assert!( + cargo_check_in_worktree(&wt_path), + "cargo check should pass on committed code after resetting dirty files" + ); + } } diff --git a/server/src/agents/pool/pipeline/advance.rs b/server/src/agents/pool/pipeline/advance.rs index 1103229b..2a976f9f 100644 --- a/server/src/agents/pool/pipeline/advance.rs +++ b/server/src/agents/pool/pipeline/advance.rs @@ -115,8 +115,78 @@ impl AgentPool { } } } else { + // Bug 645: Before retry/block, check if the agent left committed + // work that compiles. An agent may crash mid-output (e.g. Claude + // Code CLI PTY write assertion) after having already committed valid + // code. When committed work survives and `cargo check` passes, + // advance to QA instead of wasting retries. + let work_survived = worktree_path.as_ref().is_some_and(|wt_path| { + crate::agents::gates::worktree_has_committed_work(wt_path) + && crate::agents::gates::cargo_check_in_worktree(wt_path) + }); + if work_survived { + slog!( + "[pipeline] Coder '{agent_name}' failed gates for '{story_id}' but \ + committed work survives and compiles. Advancing to QA instead of \ + retrying (bug 645)." + ); + let qa_mode = { + let item_type = crate::agents::lifecycle::item_type_from_id(story_id); + if item_type == "spike" { + crate::io::story_metadata::QaMode::Human + } else { + let default_qa = config.default_qa_mode(); + resolve_qa_mode_from_store(&project_root, story_id, default_qa) + } + }; + match qa_mode { + crate::io::story_metadata::QaMode::Server => { + if let Err(e) = crate::agents::lifecycle::move_story_to_merge( + &project_root, + story_id, + ) { + slog_error!( + "[pipeline] Failed to move '{story_id}' to 4_merge/: {e}" + ); + } else { + self.start_mergemaster_or_block(&project_root, story_id) + .await; + } + } + crate::io::story_metadata::QaMode::Agent => { + if let Err(e) = crate::agents::lifecycle::move_story_to_qa( + &project_root, + story_id, + ) { + slog_error!( + "[pipeline] Failed to move '{story_id}' to 3_qa/: {e}" + ); + } else if let Err(e) = self + .start_agent(&project_root, story_id, Some("qa"), None, None) + .await + { + slog_error!( + "[pipeline] Failed to start qa for '{story_id}': {e}" + ); + } + } + crate::io::story_metadata::QaMode::Human => { + if let Err(e) = crate::agents::lifecycle::move_story_to_qa( + &project_root, + story_id, + ) { + slog_error!( + "[pipeline] Failed to move '{story_id}' to 3_qa/: {e}" + ); + } else { + write_review_hold_to_store(story_id); + } + } + } + } else // Increment retry count and check if blocked. - if let Some(reason) = should_block_story(story_id, config.max_retries, "coder") + if let Some(reason) = + should_block_story(story_id, config.max_retries, "coder") { // Story has exceeded retry limit — do not restart. let _ = self.watcher_tx.send(WatcherEvent::StoryBlocked { @@ -1062,4 +1132,218 @@ stage = "qa" ); } } + + // ── bug 645: work-survived check advances to QA instead of blocking ── + + /// Integration test: when a coder agent fails gates but committed work + /// survives and compiles, the story advances to QA (not retry/block). + /// Simulates an agent that commits work and then dies mid-output. + #[tokio::test] + async fn work_survived_advances_to_qa_instead_of_blocking() { + use std::fs; + use std::process::Command; + + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + + // Init a git repo with a minimal Cargo project. + Command::new("git") + .args(["init"]) + .current_dir(root) + .output() + .unwrap(); + Command::new("git") + .args(["config", "user.email", "test@test.com"]) + .current_dir(root) + .output() + .unwrap(); + Command::new("git") + .args(["config", "user.name", "Test"]) + .current_dir(root) + .output() + .unwrap(); + fs::write( + root.join("Cargo.toml"), + "[package]\nname = \"test_proj\"\nversion = \"0.1.0\"\nedition = \"2021\"\n", + ) + .unwrap(); + fs::create_dir_all(root.join("src")).unwrap(); + fs::write(root.join("src/lib.rs"), "// empty\n").unwrap(); + Command::new("git") + .args(["add", "."]) + .current_dir(root) + .output() + .unwrap(); + Command::new("git") + .args(["commit", "-m", "init"]) + .current_dir(root) + .output() + .unwrap(); + + // Create a worktree on a feature branch. + let wt_path = tmp.path().join("wt"); + Command::new("git") + .args([ + "worktree", + "add", + &wt_path.to_string_lossy(), + "-b", + "feature/story-9945_story_survived", + ]) + .current_dir(root) + .output() + .unwrap(); + + // Commit valid code on the feature branch. + fs::write(wt_path.join("src/lib.rs"), "pub fn survived() {}\n").unwrap(); + Command::new("git") + .args(["add", "."]) + .current_dir(&wt_path) + .output() + .unwrap(); + Command::new("git") + .args(["commit", "-m", "add survived fn"]) + .current_dir(&wt_path) + .output() + .unwrap(); + + // Set up the story in the content store. + crate::db::ensure_content_store(); + crate::db::write_content("9945_story_survived", "---\nname: Survived Test\n---\n"); + crate::db::write_item_with_content( + "9945_story_survived", + "2_current", + "---\nname: Survived Test\n---\n", + ); + + let pool = AgentPool::new_test(3001); + + // Simulate coder failing gates (e.g. agent crashed, dirty worktree). + pool.run_pipeline_advance( + "9945_story_survived", + "coder-1", + CompletionReport { + summary: "Agent crashed".to_string(), + gates_passed: false, + gate_output: "Worktree has uncommitted changes".to_string(), + }, + Some(root.to_path_buf()), + Some(wt_path), + false, + None, + ) + .await; + + // Story should have advanced — content store should reflect the move. + // The work-survived check should have moved it to QA (or merge for + // server qa mode), NOT incremented retry_count. + let content = crate::db::read_content("9945_story_survived") + .expect("story should exist in content store"); + assert!( + !content.contains("blocked"), + "story should NOT be blocked when committed work survives: {content}" + ); + assert!( + !content.contains("retry_count"), + "story should NOT have retry_count when work survived: {content}" + ); + } + + /// Backwards-compat: agents that die WITHOUT committed work still get + /// the existing retry/block treatment. + #[tokio::test] + async fn no_committed_work_still_retries_and_blocks() { + use std::fs; + use std::process::Command; + + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + + // Init a git repo (no Cargo project needed — cargo check will fail). + Command::new("git") + .args(["init"]) + .current_dir(root) + .output() + .unwrap(); + Command::new("git") + .args(["config", "user.email", "test@test.com"]) + .current_dir(root) + .output() + .unwrap(); + Command::new("git") + .args(["config", "user.name", "Test"]) + .current_dir(root) + .output() + .unwrap(); + Command::new("git") + .args(["commit", "--allow-empty", "-m", "init"]) + .current_dir(root) + .output() + .unwrap(); + + // Create a worktree with NO commits on the feature branch. + let wt_path = tmp.path().join("wt"); + Command::new("git") + .args([ + "worktree", + "add", + &wt_path.to_string_lossy(), + "-b", + "feature/story-9946_story_nowork", + ]) + .current_dir(root) + .output() + .unwrap(); + + // Set up the story with max_retries=1 so it blocks immediately. + crate::db::ensure_content_store(); + crate::db::write_content("9946_story_nowork", "---\nname: No Work Test\n---\n"); + crate::db::write_item_with_content( + "9946_story_nowork", + "2_current", + "---\nname: No Work Test\n---\n", + ); + + // Write a project.toml with max_retries = 1. + fs::create_dir_all(root.join(".huskies")).unwrap(); + fs::write( + root.join(".huskies/project.toml"), + "max_retries = 1\n\n[[agent]]\nname = \"coder-1\"\nstage = \"coder\"\n", + ) + .unwrap(); + + let pool = AgentPool::new_test(3001); + let mut rx = pool.watcher_tx.subscribe(); + + // Simulate coder failing gates with NO committed work on the worktree. + pool.run_pipeline_advance( + "9946_story_nowork", + "coder-1", + CompletionReport { + summary: "Agent crashed".to_string(), + gates_passed: false, + gate_output: "Tests failed".to_string(), + }, + Some(root.to_path_buf()), + Some(wt_path), + false, + None, + ) + .await; + + // With no committed work and max_retries=1, the story should be blocked. + let mut got_blocked = false; + while let Ok(evt) = rx.try_recv() { + if let WatcherEvent::StoryBlocked { story_id, .. } = &evt + && story_id == "9946_story_nowork" + { + got_blocked = true; + break; + } + } + assert!( + got_blocked, + "Story with no committed work should be blocked after exceeding retry limit" + ); + } } diff --git a/server/src/agents/pool/pipeline/completion.rs b/server/src/agents/pool/pipeline/completion.rs index 2beedcd0..9d53a7f9 100644 --- a/server/src/agents/pool/pipeline/completion.rs +++ b/server/src/agents/pool/pipeline/completion.rs @@ -232,7 +232,31 @@ pub(in crate::agents::pool) async fn run_server_owned_completion( let (gates_passed, gate_output) = if let Some(wt_path) = worktree_path { let path = wt_path; match tokio::task::spawn_blocking(move || { - crate::agents::gates::check_uncommitted_changes(&path)?; + // If the worktree is dirty, check whether committed work survived. + // An agent crash (e.g. Claude Code CLI's `output.write(&bytes).is_ok()` + // assertion — bug 645) can leave uncommitted files behind even though + // the agent already committed valid work. In that case, clean up the + // dirty files and proceed with gates on the committed code. + if let Err(dirty_msg) = crate::agents::gates::check_uncommitted_changes(&path) { + if crate::agents::gates::worktree_has_committed_work(&path) { + crate::slog!( + "[agents] Worktree dirty but committed work exists — \ + resetting uncommitted changes and proceeding with gates. \ + Dirty state: {dirty_msg}" + ); + // Reset dirty files so gates run against committed code only. + let _ = std::process::Command::new("git") + .args(["checkout", "."]) + .current_dir(&path) + .output(); + let _ = std::process::Command::new("git") + .args(["clean", "-fd"]) + .current_dir(&path) + .output(); + } else { + return Ok((false, dirty_msg)); + } + } // AC5: Fail early if the coder finished with no commits on the feature branch. // This prevents empty-diff stories from advancing through QA to merge. if !crate::agents::gates::worktree_has_committed_work(&path) { @@ -642,4 +666,82 @@ mod tests { "Agent must remain in pool — run_server_owned_completion is a no-op for mergemaster" ); } + + /// Bug 645: when an agent crashes leaving dirty files but committed work, + /// server-owned completion should clean up the dirty files and run gates + /// on the committed code instead of failing immediately. + #[tokio::test] + async fn server_owned_completion_cleans_dirty_worktree_with_committed_work() { + use std::fs; + use tempfile::tempdir; + + let tmp = tempdir().unwrap(); + let project_root = tmp.path().join("project"); + fs::create_dir_all(&project_root).unwrap(); + init_git_repo(&project_root); + + // Create a worktree on a feature branch with committed code. + let wt_path = tmp.path().join("wt"); + Command::new("git") + .args([ + "worktree", + "add", + &wt_path.to_string_lossy(), + "-b", + "feature/story-645_test", + ]) + .current_dir(&project_root) + .output() + .unwrap(); + + // Commit a valid file. + fs::write(wt_path.join("work.txt"), "done").unwrap(); + Command::new("git") + .args(["add", "."]) + .current_dir(&wt_path) + .output() + .unwrap(); + Command::new("git") + .args(["commit", "-m", "coder: add work"]) + .current_dir(&wt_path) + .output() + .unwrap(); + + // Now simulate crash leaving dirty files. + fs::write(wt_path.join("dirty.txt"), "crash residue").unwrap(); + + let pool = AgentPool::new_test(3001); + pool.inject_test_agent_with_path( + "645_test", + "coder-1", + AgentStatus::Running, + wt_path.clone(), + ); + + let mut rx = pool.subscribe("645_test", "coder-1").unwrap(); + + run_server_owned_completion( + &pool.agents, + pool.port, + "645_test", + "coder-1", + Some("sess-645".to_string()), + pool.watcher_tx.clone(), + ) + .await; + + // The dirty file should have been cleaned up. + assert!( + !wt_path.join("dirty.txt").exists(), + "dirty file should be cleaned up after server-owned completion" + ); + + // A Done event should have been emitted (completion ran, didn't fail + // on dirty worktree). + let event = rx.try_recv().expect("should emit Done event"); + assert!( + matches!(event, AgentEvent::Done { .. }), + "expected Done event, got: {event:?}" + ); + } } diff --git a/server/src/agents/pty.rs b/server/src/agents/pty.rs index 1107f402..322df415 100644 --- a/server/src/agents/pty.rs +++ b/server/src/agents/pty.rs @@ -36,6 +36,22 @@ impl Drop for ChildKillerGuard { } /// Spawn claude agent in a PTY and stream events through the broadcast channel. +/// +/// ## Bug 645: `output.write(&bytes).is_ok()` assertion in Claude Code CLI +/// +/// The Claude Code CLI can panic with an `output.write(&bytes).is_ok()` assertion +/// when writing to its stdout (the PTY slave end). This occurs inside the child +/// process — not in this server code — when the PTY pipe breaks or fills. The +/// `output` in the assertion is the CLI's stdout writer, and the write fails when +/// the PTY master side is closed or the kernel pipe buffer is exhausted. +/// +/// When this happens, the child process dies, the PTY reader thread in this +/// function receives EOF, and `run_agent_pty_blocking` returns `Ok(PtyResult)`. +/// The server then runs completion gates via `run_server_owned_completion`. +/// +/// If the agent committed valid work before crashing, the "work survived" check +/// in `pipeline::advance` detects the committed code and advances the story to +/// QA instead of entering the retry/block path. #[allow(clippy::too_many_arguments)] pub(in crate::agents) async fn run_agent_pty_streaming( story_id: &str,