diff --git a/server/src/agents/pool.rs b/server/src/agents/pool.rs index b13c8cc..057391b 100644 --- a/server/src/agents/pool.rs +++ b/server/src/agents/pool.rs @@ -348,20 +348,16 @@ impl AgentPool { // Create persistent log writer (needs resolved_name, so must be after // the atomic resolution above). - let log_writer = match AgentLogWriter::new( - project_root, - story_id, - &resolved_name, - &log_session_id, - ) { - Ok(w) => Some(Arc::new(Mutex::new(w))), - Err(e) => { - eprintln!( - "[agents] Failed to create log writer for {story_id}:{resolved_name}: {e}" - ); - None - } - }; + let log_writer = + match AgentLogWriter::new(project_root, story_id, &resolved_name, &log_session_id) { + Ok(w) => Some(Arc::new(Mutex::new(w))), + Err(e) => { + eprintln!( + "[agents] Failed to create log writer for {story_id}:{resolved_name}: {e}" + ); + None + } + }; // Notify WebSocket clients that a new agent is pending. Self::notify_agent_state_changed(&self.watcher_tx); @@ -420,9 +416,10 @@ impl AgentPool { } let _ = tx_clone.send(event); if let Ok(mut agents) = agents_ref.lock() - && let Some(agent) = agents.get_mut(&key_clone) { - agent.status = AgentStatus::Failed; - } + && let Some(agent) = agents.get_mut(&key_clone) + { + agent.status = AgentStatus::Failed; + } Self::notify_agent_state_changed(&watcher_tx_clone); return; } @@ -458,9 +455,10 @@ impl AgentPool { } let _ = tx_clone.send(event); if let Ok(mut agents) = agents_ref.lock() - && let Some(agent) = agents.get_mut(&key_clone) { - agent.status = AgentStatus::Failed; - } + && let Some(agent) = agents.get_mut(&key_clone) + { + agent.status = AgentStatus::Failed; + } Self::notify_agent_state_changed(&watcher_tx_clone); return; } @@ -528,9 +526,10 @@ impl AgentPool { } let _ = tx_clone.send(event); if let Ok(mut agents) = agents_ref.lock() - && let Some(agent) = agents.get_mut(&key_clone) { - agent.status = AgentStatus::Failed; - } + && let Some(agent) = agents.get_mut(&key_clone) + { + agent.status = AgentStatus::Failed; + 
} Self::notify_agent_state_changed(&watcher_tx_clone); } } @@ -707,8 +706,7 @@ impl AgentPool { } } - let deadline = - tokio::time::Instant::now() + std::time::Duration::from_millis(timeout_ms); + let deadline = tokio::time::Instant::now() + std::time::Duration::from_millis(timeout_ms); loop { let remaining = deadline.saturating_duration_since(tokio::time::Instant::now()); @@ -841,16 +839,12 @@ impl AgentPool { ); if let Err(e) = super::lifecycle::move_story_to_qa(&project_root, story_id) { slog_error!("[pipeline] Failed to move '{story_id}' to 3_qa/: {e}"); - return; - } - if let Err(e) = self + } else if let Err(e) = self .start_agent(&project_root, story_id, Some("qa"), None) .await { slog_error!("[pipeline] Failed to start qa agent for '{story_id}': {e}"); } - // Coder slot is now free — pick up any other unassigned work in 2_current/. - self.auto_assign_available_work(&project_root).await; } else { slog!( "[pipeline] Coder '{agent_name}' failed gates for '{story_id}'. Restarting." @@ -874,7 +868,9 @@ impl AgentPool { PipelineStage::Qa => { if completion.gates_passed { // Run coverage gate in the QA worktree before advancing to merge. - let coverage_path = worktree_path.clone().unwrap_or_else(|| project_root.clone()); + let coverage_path = worktree_path + .clone() + .unwrap_or_else(|| project_root.clone()); let cp = coverage_path.clone(); let coverage_result = tokio::task::spawn_blocking(move || super::gates::run_coverage_gate(&cp)) @@ -906,33 +902,37 @@ impl AgentPool { // Hold in 3_qa/ for human review. 
let qa_dir = project_root.join(".story_kit/work/3_qa"); let story_path = qa_dir.join(format!("{story_id}.md")); - if let Err(e) = crate::io::story_metadata::write_review_hold(&story_path) { - slog_error!("[pipeline] Failed to set review_hold on '{story_id}': {e}"); + if let Err(e) = + crate::io::story_metadata::write_review_hold(&story_path) + { + slog_error!( + "[pipeline] Failed to set review_hold on '{story_id}': {e}" + ); } slog!( "[pipeline] QA passed for '{story_id}'. \ Holding for human review. \ Worktree preserved at: {worktree_path:?}" ); - // Free up the QA slot without advancing. - self.auto_assign_available_work(&project_root).await; } else { slog!( "[pipeline] QA passed gates and coverage for '{story_id}'. \ manual_qa: false — moving directly to merge." ); - if let Err(e) = super::lifecycle::move_story_to_merge(&project_root, story_id) { - slog_error!("[pipeline] Failed to move '{story_id}' to 4_merge/: {e}"); - return; - } - if let Err(e) = self + if let Err(e) = + super::lifecycle::move_story_to_merge(&project_root, story_id) + { + slog_error!( + "[pipeline] Failed to move '{story_id}' to 4_merge/: {e}" + ); + } else if let Err(e) = self .start_agent(&project_root, story_id, Some("mergemaster"), None) .await { - slog_error!("[pipeline] Failed to start mergemaster for '{story_id}': {e}"); + slog_error!( + "[pipeline] Failed to start mergemaster for '{story_id}': {e}" + ); } - // QA slot is now free — pick up any other unassigned work in 3_qa/. - self.auto_assign_available_work(&project_root).await; } } else { slog!( @@ -952,9 +952,7 @@ impl AgentPool { } } } else { - slog!( - "[pipeline] QA failed gates for '{story_id}'. Restarting." - ); + slog!("[pipeline] QA failed gates for '{story_id}'. Restarting."); let context = format!( "\n\n---\n## Previous QA Attempt Failed\n\ The acceptance gates failed with the following output:\n{}\n\n\ @@ -979,71 +977,82 @@ impl AgentPool { mergemaster explicitly reported a merge failure. 
\ Story stays in 4_merge/ for human review." ); - return; - } - - // Run script/test on master (project_root) as the post-merge verification. - slog!( - "[pipeline] Mergemaster completed for '{story_id}'. Running post-merge tests on master." - ); - let root = project_root.clone(); - let test_result = tokio::task::spawn_blocking(move || super::gates::run_project_tests(&root)) - .await - .unwrap_or_else(|e| { - slog_warn!("[pipeline] Post-merge test task panicked: {e}"); - Ok((false, format!("Test task panicked: {e}"))) - }); - let (passed, output) = match test_result { - Ok(pair) => pair, - Err(e) => (false, e), - }; - - if passed { - slog!( - "[pipeline] Post-merge tests passed for '{story_id}'. Moving to done." - ); - if let Err(e) = super::lifecycle::move_story_to_archived(&project_root, story_id) { - slog_error!("[pipeline] Failed to move '{story_id}' to done: {e}"); - } - self.remove_agents_for_story(story_id); - // Mergemaster slot is now free — pick up any other items in 4_merge/. - self.auto_assign_available_work(&project_root).await; - // TODO: Re-enable worktree cleanup once we have persistent agent logs. - // Removing worktrees destroys evidence needed to debug empty-commit agents. - // let config = - // crate::config::ProjectConfig::load(&project_root).unwrap_or_default(); - // if let Err(e) = - // worktree::remove_worktree_by_story_id(&project_root, story_id, &config) - // .await - // { - // slog!( - // "[pipeline] Failed to remove worktree for '{story_id}': {e}" - // ); - // } - slog!( - "[pipeline] Story '{story_id}' done. Worktree preserved for inspection." - ); } else { + // Run script/test on master (project_root) as the post-merge verification. slog!( - "[pipeline] Post-merge tests failed for '{story_id}'. Restarting mergemaster." + "[pipeline] Mergemaster completed for '{story_id}'. Running post-merge tests on master." 
); - let context = format!( - "\n\n---\n## Post-Merge Test Failed\n\ + let root = project_root.clone(); + let test_result = + tokio::task::spawn_blocking(move || super::gates::run_project_tests(&root)) + .await + .unwrap_or_else(|e| { + slog_warn!("[pipeline] Post-merge test task panicked: {e}"); + Ok((false, format!("Test task panicked: {e}"))) + }); + let (passed, output) = match test_result { + Ok(pair) => pair, + Err(e) => (false, e), + }; + + if passed { + slog!( + "[pipeline] Post-merge tests passed for '{story_id}'. Moving to done." + ); + if let Err(e) = + super::lifecycle::move_story_to_archived(&project_root, story_id) + { + slog_error!("[pipeline] Failed to move '{story_id}' to done: {e}"); + } + self.remove_agents_for_story(story_id); + // TODO: Re-enable worktree cleanup once we have persistent agent logs. + // Removing worktrees destroys evidence needed to debug empty-commit agents. + // let config = + // crate::config::ProjectConfig::load(&project_root).unwrap_or_default(); + // if let Err(e) = + // worktree::remove_worktree_by_story_id(&project_root, story_id, &config) + // .await + // { + // slog!( + // "[pipeline] Failed to remove worktree for '{story_id}': {e}" + // ); + // } + slog!( + "[pipeline] Story '{story_id}' done. Worktree preserved for inspection." + ); + } else { + slog!( + "[pipeline] Post-merge tests failed for '{story_id}'. Restarting mergemaster." 
+ ); + let context = format!( + "\n\n---\n## Post-Merge Test Failed\n\ The tests on master failed with the following output:\n{}\n\n\ Please investigate and resolve the failures, then call merge_agent_work again.", - output - ); - if let Err(e) = self - .start_agent(&project_root, story_id, Some("mergemaster"), Some(&context)) - .await - { - slog_error!( - "[pipeline] Failed to restart mergemaster for '{story_id}': {e}" + output ); + if let Err(e) = self + .start_agent( + &project_root, + story_id, + Some("mergemaster"), + Some(&context), + ) + .await + { + slog_error!( + "[pipeline] Failed to restart mergemaster for '{story_id}': {e}" + ); + } } } } } + + // Always scan for unassigned work after any agent completes, regardless + // of the outcome (success, failure, restart). This ensures stories that + // failed agent assignment due to busy agents are retried when agents + // become available (bug 295). + self.auto_assign_available_work(&project_root).await; } /// Internal: report that an agent has finished work on a story. @@ -1114,7 +1123,13 @@ impl AgentPool { // Extract data for pipeline advance, then remove the entry so // completed agents never appear in list_agents. 
- let (tx, session_id, project_root_for_advance, wt_path_for_advance, merge_failure_reported_for_advance) = { + let ( + tx, + session_id, + project_root_for_advance, + wt_path_for_advance, + merge_failure_reported_for_advance, + ) = { let mut agents = self.agents.lock().map_err(|e| e.to_string())?; let agent = agents.get_mut(&key).ok_or_else(|| { format!("Agent '{agent_name}' for story '{story_id}' disappeared during gate check") @@ -1267,14 +1282,14 @@ impl AgentPool { }); } - let story_archived = super::lifecycle::move_story_to_archived(project_root, story_id).is_ok(); + let story_archived = + super::lifecycle::move_story_to_archived(project_root, story_id).is_ok(); if story_archived { self.remove_agents_for_story(story_id); } let worktree_cleaned_up = if wt_path.exists() { - let config = crate::config::ProjectConfig::load(project_root) - .unwrap_or_default(); + let config = crate::config::ProjectConfig::load(project_root).unwrap_or_default(); worktree::remove_worktree_by_story_id(project_root, story_id, &config) .await .is_ok() @@ -1306,21 +1321,14 @@ impl AgentPool { } /// Get project root helper. - pub fn get_project_root( - &self, - state: &crate::state::SessionState, - ) -> Result { + pub fn get_project_root(&self, state: &crate::state::SessionState) -> Result { state.get_project_root() } /// Get the log session ID and project root for an agent, if available. /// /// Used by MCP tools to find the persistent log file for a completed agent. 
- pub fn get_log_info( - &self, - story_id: &str, - agent_name: &str, - ) -> Option<(String, PathBuf)> { + pub fn get_log_info(&self, story_id: &str, agent_name: &str) -> Option<(String, PathBuf)> { let key = composite_key(story_id, agent_name); let agents = self.agents.lock().ok()?; let agent = agents.get(&key)?; @@ -1364,9 +1372,7 @@ impl AgentPool { } } Err(e) => { - slog_error!( - "[pipeline] set_merge_failure_reported: could not lock agents: {e}" - ); + slog_error!("[pipeline] set_merge_failure_reported: could not lock agents: {e}"); } } } @@ -1678,9 +1684,7 @@ impl AgentPool { continue; } Err(e) => { - eprintln!( - "[startup:reconcile] Gate check task panicked for '{story_id}': {e}" - ); + eprintln!("[startup:reconcile] Gate check task panicked for '{story_id}': {e}"); let _ = progress_tx.send(ReconciliationEvent { story_id: story_id.clone(), status: "failed".to_string(), @@ -1703,9 +1707,7 @@ impl AgentPool { continue; } - eprintln!( - "[startup:reconcile] Gates passed for '{story_id}' (stage: {stage_dir}/)." - ); + eprintln!("[startup:reconcile] Gates passed for '{story_id}' (stage: {stage_dir}/)."); if stage_dir == "2_current" { // Coder stage → advance to QA. @@ -1727,16 +1729,15 @@ impl AgentPool { } else if stage_dir == "3_qa" { // QA stage → run coverage gate before advancing to merge. 
let wt_path_for_cov = wt_path.clone(); - let coverage_result = - tokio::task::spawn_blocking(move || super::gates::run_coverage_gate(&wt_path_for_cov)) - .await; + let coverage_result = tokio::task::spawn_blocking(move || { + super::gates::run_coverage_gate(&wt_path_for_cov) + }) + .await; let (coverage_passed, coverage_output) = match coverage_result { Ok(Ok(pair)) => pair, Ok(Err(e)) => { - eprintln!( - "[startup:reconcile] Coverage gate error for '{story_id}': {e}" - ); + eprintln!("[startup:reconcile] Coverage gate error for '{story_id}': {e}"); let _ = progress_tx.send(ReconciliationEvent { story_id: story_id.clone(), status: "failed".to_string(), @@ -1788,7 +1789,9 @@ impl AgentPool { status: "review_hold".to_string(), message: "Passed QA — waiting for human review.".to_string(), }); - } else if let Err(e) = super::lifecycle::move_story_to_merge(project_root, story_id) { + } else if let Err(e) = + super::lifecycle::move_story_to_merge(project_root, story_id) + { eprintln!( "[startup:reconcile] Failed to move '{story_id}' to 4_merge/: {e}" ); @@ -1923,17 +1926,14 @@ impl AgentPool { /// is triggered so that free agents can pick up unassigned work. pub fn spawn_watchdog(pool: Arc, project_root: Option) { tokio::spawn(async move { - let mut interval = - tokio::time::interval(std::time::Duration::from_secs(30)); + let mut interval = tokio::time::interval(std::time::Duration::from_secs(30)); loop { interval.tick().await; let found = check_orphaned_agents(&pool.agents); if found > 0 && let Some(ref root) = project_root { - slog!( - "[watchdog] {found} orphaned agent(s) detected; triggering auto-assign." 
- ); + slog!("[watchdog] {found} orphaned agent(s) detected; triggering auto-assign."); pool.auto_assign_available_work(root).await; } } @@ -1992,7 +1992,11 @@ fn find_active_story_stage(project_root: &Path, story_id: &str) -> Option<&'stat /// /// Returns `Some(agent_name)` if the front matter specifies an agent, or `None` /// if the field is absent or the file cannot be read / parsed. -fn read_story_front_matter_agent(project_root: &Path, stage_dir: &str, story_id: &str) -> Option { +fn read_story_front_matter_agent( + project_root: &Path, + stage_dir: &str, + story_id: &str, +) -> Option { use crate::io::story_metadata::parse_front_matter; let path = project_root .join(".story_kit") @@ -2030,10 +2034,7 @@ fn is_agent_free(agents: &HashMap, agent_name: &str) -> bool } fn scan_stage_items(project_root: &Path, stage_dir: &str) -> Vec { - let dir = project_root - .join(".story_kit") - .join("work") - .join(stage_dir); + let dir = project_root.join(".story_kit").join("work").join(stage_dir); if !dir.is_dir() { return Vec::new(); } @@ -2124,7 +2125,12 @@ fn check_orphaned_agents(agents: &Mutex>) -> usize { .rsplit_once(':') .map(|(s, _)| s.to_string()) .unwrap_or_else(|| key.clone()); - return Some((key.clone(), story_id, agent.tx.clone(), agent.status.clone())); + return Some(( + key.clone(), + story_id, + agent.tx.clone(), + agent.status.clone(), + )); } None }) @@ -2440,9 +2446,7 @@ mod tests { #[tokio::test] async fn report_completion_rejects_nonexistent_agent() { let pool = AgentPool::new_test(3001); - let result = pool - .report_completion("no_story", "no_bot", "done") - .await; + let result = pool.report_completion("no_story", "no_bot", "done").await; assert!(result.is_err()); let msg = result.unwrap_err(); assert!(msg.contains("No agent"), "unexpected: {msg}"); @@ -2518,8 +2522,15 @@ mod tests { // Subscribe before calling so we can check if Done event was emitted. 
let mut rx = pool.subscribe("s10", "coder-1").unwrap(); - run_server_owned_completion(&pool.agents, pool.port, "s10", "coder-1", Some("sess-1".to_string()), pool.watcher_tx.clone()) - .await; + run_server_owned_completion( + &pool.agents, + pool.port, + "s10", + "coder-1", + Some("sess-1".to_string()), + pool.watcher_tx.clone(), + ) + .await; // Status should remain Completed (unchanged) — no gate re-run. let agents = pool.agents.lock().unwrap(); @@ -2527,10 +2538,7 @@ mod tests { let agent = agents.get(&key).unwrap(); assert_eq!(agent.status, AgentStatus::Completed); // Summary should still be the original, not overwritten. - assert_eq!( - agent.completion.as_ref().unwrap().summary, - "Already done" - ); + assert_eq!(agent.completion.as_ref().unwrap().summary, "Already done"); drop(agents); // No Done event should have been emitted. @@ -2558,8 +2566,15 @@ mod tests { let mut rx = pool.subscribe("s11", "coder-1").unwrap(); - run_server_owned_completion(&pool.agents, pool.port, "s11", "coder-1", Some("sess-2".to_string()), pool.watcher_tx.clone()) - .await; + run_server_owned_completion( + &pool.agents, + pool.port, + "s11", + "coder-1", + Some("sess-2".to_string()), + pool.watcher_tx.clone(), + ) + .await; // Agent entry should be removed from the map after completion. let agents = pool.agents.lock().unwrap(); @@ -2601,8 +2616,15 @@ mod tests { let mut rx = pool.subscribe("s12", "coder-1").unwrap(); - run_server_owned_completion(&pool.agents, pool.port, "s12", "coder-1", None, pool.watcher_tx.clone()) - .await; + run_server_owned_completion( + &pool.agents, + pool.port, + "s12", + "coder-1", + None, + pool.watcher_tx.clone(), + ) + .await; // Agent entry should be removed from the map after completion (even on failure). let agents = pool.agents.lock().unwrap(); @@ -2625,8 +2647,15 @@ mod tests { async fn server_owned_completion_nonexistent_agent_is_noop() { let pool = AgentPool::new_test(3001); // Should not panic or error — just silently return. 
- run_server_owned_completion(&pool.agents, pool.port, "nonexistent", "bot", None, pool.watcher_tx.clone()) - .await; + run_server_owned_completion( + &pool.agents, + pool.port, + "nonexistent", + "bot", + None, + pool.watcher_tx.clone(), + ) + .await; } // ── pipeline advance tests ──────────────────────────────────────────────── @@ -2656,7 +2685,7 @@ mod tests { None, false, ) - .await; + .await; // Story should have moved to 3_qa/ (start_agent for qa will fail in tests but // the file move happens before that). @@ -2699,11 +2728,12 @@ mod tests { None, false, ) - .await; + .await; // Story should have moved to 4_merge/ assert!( - root.join(".story_kit/work/4_merge/51_story_test.md").exists(), + root.join(".story_kit/work/4_merge/51_story_test.md") + .exists(), "story should be in 4_merge/" ); assert!( @@ -2735,7 +2765,7 @@ mod tests { None, false, ) - .await; + .await; // Story should NOT have moved (supervisors don't advance pipeline) assert!( @@ -2805,7 +2835,7 @@ stage = "qa" None, false, ) - .await; + .await; // The pipeline advance should have sent AgentStateChanged events via // the pool's watcher_tx (not a dummy channel). Collect all events. 
@@ -2916,12 +2946,7 @@ stage = "qa" ); // Should NOT appear as a coder assert!( - !is_story_assigned_for_stage( - &config, - &agents, - "42_story_foo", - &PipelineStage::Coder - ), + !is_story_assigned_for_stage(&config, &agents, "42_story_foo", &PipelineStage::Coder), "qa-2 should not be detected as a coder" ); } @@ -2967,7 +2992,11 @@ name = "coder-3" let agents = pool.agents.lock().unwrap(); let free = find_free_agent_for_stage(&config, &agents, &PipelineStage::Coder); - assert_eq!(free, Some("coder-2"), "coder-2 should be the first free coder"); + assert_eq!( + free, + Some("coder-2"), + "coder-2 should be the first free coder" + ); } #[test] @@ -3070,10 +3099,7 @@ stage = "coder" fs::create_dir_all(&qa).unwrap(); fs::write(qa.join("11_story_test.md"), "test").unwrap(); - assert_eq!( - find_active_story_stage(root, "11_story_test"), - Some("3_qa") - ); + assert_eq!(find_active_story_stage(root, "11_story_test"), Some("3_qa")); } #[test] @@ -3125,7 +3151,10 @@ stage = "coder" pool.inject_test_agent("story_b", "qa", AgentStatus::Failed); let found = check_orphaned_agents(&pool.agents); - assert_eq!(found, 0, "no orphans should be detected for terminal agents"); + assert_eq!( + found, 0, + "no orphans should be detected for terminal agents" + ); } #[tokio::test] @@ -3134,10 +3163,17 @@ stage = "coder" let handle = tokio::spawn(async {}); tokio::time::sleep(std::time::Duration::from_millis(20)).await; - assert!(handle.is_finished(), "task should be finished before injection"); + assert!( + handle.is_finished(), + "task should be finished before injection" + ); - let tx = - pool.inject_test_agent_with_handle("orphan_story", "coder", AgentStatus::Running, handle); + let tx = pool.inject_test_agent_with_handle( + "orphan_story", + "coder", + AgentStatus::Running, + handle, + ); let mut rx = tx.subscribe(); pool.run_watchdog_once(); @@ -3170,12 +3206,7 @@ stage = "coder" let handle = tokio::spawn(async {}); 
tokio::time::sleep(std::time::Duration::from_millis(20)).await; - pool.inject_test_agent_with_handle( - "orphan_story", - "coder", - AgentStatus::Running, - handle, - ); + pool.inject_test_agent_with_handle("orphan_story", "coder", AgentStatus::Running, handle); // Before watchdog: agent is Running. { @@ -3260,11 +3291,18 @@ stage = "coder" // Agent entries for the archived story should be gone. let remaining = pool.list_agents().unwrap(); - assert_eq!(remaining.len(), 1, "only the other story's agent should remain"); + assert_eq!( + remaining.len(), + 1, + "only the other story's agent should remain" + ); assert_eq!(remaining[0].story_id, "61_story_other"); // Story file should be in 5_done/ - assert!(root.join(".story_kit/work/5_done/60_story_cleanup.md").exists()); + assert!( + root.join(".story_kit/work/5_done/60_story_cleanup.md") + .exists() + ); } // ── kill_all_children tests ──────────────────────────────────── @@ -3515,9 +3553,7 @@ stage = "coder" pool.inject_test_agent("story-1", "coder-1", AgentStatus::Running); pool.inject_test_agent("story-2", "coder-2", AgentStatus::Pending); - let result = pool - .start_agent(tmp.path(), "story-3", None, None) - .await; + let result = pool.start_agent(tmp.path(), "story-3", None, None).await; assert!(result.is_err()); let err = result.unwrap_err(); assert!( @@ -3545,18 +3581,12 @@ stage = "coder" ) .unwrap(); // Place the story in 1_backlog/. - std::fs::write( - backlog.join("story-3.md"), - "---\nname: Story 3\n---\n", - ) - .unwrap(); + std::fs::write(backlog.join("story-3.md"), "---\nname: Story 3\n---\n").unwrap(); let pool = AgentPool::new_test(3001); pool.inject_test_agent("story-1", "coder-1", AgentStatus::Running); - let result = pool - .start_agent(tmp.path(), "story-3", None, None) - .await; + let result = pool.start_agent(tmp.path(), "story-3", None, None).await; // Should fail because all coders are busy. 
assert!(result.is_err()); @@ -3597,11 +3627,7 @@ stage = "coder" ) .unwrap(); // Place the story in 2_current/ (simulating the "queued" state). - std::fs::write( - current.join("story-3.md"), - "---\nname: Story 3\n---\n", - ) - .unwrap(); + std::fs::write(current.join("story-3.md"), "---\nname: Story 3\n---\n").unwrap(); let pool = AgentPool::new_test(3001); // No agents are running — coder-1 is free. @@ -3637,20 +3663,14 @@ stage = "coder" ) .unwrap(); // Place the story directly in 2_current/. - std::fs::write( - current.join("story-5.md"), - "---\nname: Story 5\n---\n", - ) - .unwrap(); + std::fs::write(current.join("story-5.md"), "---\nname: Story 5\n---\n").unwrap(); let pool = AgentPool::new_test(3001); // start_agent should attempt to assign coder-1 (no infra, so it will // fail for git reasons), but must NOT fail due to the story already // being in 2_current/. - let result = pool - .start_agent(tmp.path(), "story-5", None, None) - .await; + let result = pool.start_agent(tmp.path(), "story-5", None, None).await; match result { Ok(_) => {} Err(e) => { @@ -3710,20 +3730,14 @@ stage = "coder" // Write a minimal project.toml so ProjectConfig::load can find the "qa" agent. let sk_dir = root.join(".story_kit"); fs::create_dir_all(&sk_dir).unwrap(); - fs::write( - sk_dir.join("project.toml"), - "[[agent]]\nname = \"qa\"\n", - ) - .unwrap(); + fs::write(sk_dir.join("project.toml"), "[[agent]]\nname = \"qa\"\n").unwrap(); let pool = AgentPool::new_test(3001); // Simulate qa already running on story-a. pool.inject_test_agent("story-a", "qa", AgentStatus::Running); // Attempt to start qa on story-b — must be rejected. 
- let result = pool - .start_agent(root, "story-b", Some("qa"), None) - .await; + let result = pool.start_agent(root, "story-b", Some("qa"), None).await; assert!( result.is_err(), @@ -3747,11 +3761,7 @@ stage = "coder" let sk_dir = root.join(".story_kit"); fs::create_dir_all(&sk_dir).unwrap(); - fs::write( - sk_dir.join("project.toml"), - "[[agent]]\nname = \"qa\"\n", - ) - .unwrap(); + fs::write(sk_dir.join("project.toml"), "[[agent]]\nname = \"qa\"\n").unwrap(); let pool = AgentPool::new_test(3001); // Previous run completed — should NOT block a new story. @@ -3761,9 +3771,7 @@ stage = "coder" // NOT fail at the concurrency check. We detect the difference by inspecting // the error message: a concurrency rejection says "already running", while a // later failure (missing story file, missing claude binary, etc.) says something else. - let result = pool - .start_agent(root, "story-b", Some("qa"), None) - .await; + let result = pool.start_agent(root, "story-b", Some("qa"), None).await; if let Err(ref e) = result { assert!( @@ -3795,21 +3803,13 @@ stage = "coder" // Minimal project.toml with a "qa" agent. let sk_dir = root.join(".story_kit"); fs::create_dir_all(&sk_dir).unwrap(); - fs::write( - sk_dir.join("project.toml"), - "[[agent]]\nname = \"qa\"\n", - ) - .unwrap(); + fs::write(sk_dir.join("project.toml"), "[[agent]]\nname = \"qa\"\n").unwrap(); // Create the story in upcoming so `move_story_to_current` succeeds, // but do NOT init a git repo — `create_worktree` will fail in the spawn. 
let upcoming = root.join(".story_kit/work/1_backlog"); fs::create_dir_all(&upcoming).unwrap(); - fs::write( - upcoming.join("50_story_test.md"), - "---\nname: Test\n---\n", - ) - .unwrap(); + fs::write(upcoming.join("50_story_test.md"), "---\nname: Test\n---\n").unwrap(); let pool = AgentPool::new_test(3099); @@ -3858,9 +3858,7 @@ stage = "coder" let events = pool .drain_events("50_story_test", "qa") .expect("drain_events should succeed"); - let has_error_event = events - .iter() - .any(|e| matches!(e, AgentEvent::Error { .. })); + let has_error_event = events.iter().any(|e| matches!(e, AgentEvent::Error { .. })); assert!( has_error_event, "event_log must contain AgentEvent::Error after worktree creation fails" @@ -3880,11 +3878,7 @@ stage = "coder" let sk_dir = root.join(".story_kit"); fs::create_dir_all(&sk_dir).unwrap(); - fs::write( - sk_dir.join("project.toml"), - "[[agent]]\nname = \"qa\"\n", - ) - .unwrap(); + fs::write(sk_dir.join("project.toml"), "[[agent]]\nname = \"qa\"\n").unwrap(); let pool = AgentPool::new_test(3099); @@ -3893,9 +3887,7 @@ stage = "coder" // Attempting to start the same agent on a different story must be // rejected — the Running entry must still be there. 
- let result = pool - .start_agent(root, "story-y", Some("qa"), None) - .await; + let result = pool.start_agent(root, "story-y", Some("qa"), None).await; assert!(result.is_err()); let err = result.unwrap_err(); @@ -3920,7 +3912,11 @@ stage = "coder" let sk_dir = root.join(".story_kit"); fs::create_dir_all(&sk_dir).unwrap(); - fs::write(sk_dir.join("project.toml"), "[[agent]]\nname = \"coder-1\"\n").unwrap(); + fs::write( + sk_dir.join("project.toml"), + "[[agent]]\nname = \"coder-1\"\n", + ) + .unwrap(); let pool = AgentPool::new_test(3099); @@ -4041,7 +4037,10 @@ stage = "coder" .start_agent(root, "42_story_foo", Some("coder-2"), None) .await; - assert!(result.is_err(), "second coder on same story must be rejected"); + assert!( + result.is_err(), + "second coder on same story must be rejected" + ); let err = result.unwrap_err(); assert!( err.contains("same pipeline stage"), @@ -4144,10 +4143,7 @@ stage = "coder" // Exactly one call must be rejected with a stage-conflict error. let stage_rejections = [&r1, &r2] .iter() - .filter(|r| { - r.as_ref() - .is_err_and(|e| e.contains("same pipeline stage")) - }) + .filter(|r| r.as_ref().is_err_and(|e| e.contains("same pipeline stage"))) .count(); assert_eq!( @@ -4359,7 +4355,9 @@ stage = "coder" MergeJobStatus::Completed(report) => { assert!(!report.had_conflicts, "should have no conflicts"); assert!( - report.success || report.gate_output.contains("Failed to run") || !report.gates_passed, + report.success + || report.gate_output.contains("Failed to run") + || !report.gates_passed, "report should be coherent: {report:?}" ); if report.story_archived { @@ -4454,7 +4452,9 @@ stage = "coder" // Run the squash-merge. The failing script/test makes quality gates // fail → fast-forward must NOT happen. 
- let result = crate::agents::merge::run_squash_merge(repo, "feature/story-142_test", "142_test").unwrap(); + let result = + crate::agents::merge::run_squash_merge(repo, "feature/story-142_test", "142_test") + .unwrap(); let head_after = String::from_utf8( Command::new("git") @@ -4489,7 +4489,11 @@ stage = "coder" init_git_repo(repo); // Create a file on master. - fs::write(repo.join("code.rs"), "fn main() {\n println!(\"hello\");\n}\n").unwrap(); + fs::write( + repo.join("code.rs"), + "fn main() {\n println!(\"hello\");\n}\n", + ) + .unwrap(); Command::new("git") .args(["add", "."]) .current_dir(repo) @@ -4507,7 +4511,11 @@ stage = "coder" .current_dir(repo) .output() .unwrap(); - fs::write(repo.join("code.rs"), "fn main() {\n println!(\"hello\");\n feature_fn();\n}\n").unwrap(); + fs::write( + repo.join("code.rs"), + "fn main() {\n println!(\"hello\");\n feature_fn();\n}\n", + ) + .unwrap(); Command::new("git") .args(["add", "."]) .current_dir(repo) @@ -4525,7 +4533,11 @@ stage = "coder" .current_dir(repo) .output() .unwrap(); - fs::write(repo.join("code.rs"), "fn main() {\n println!(\"hello\");\n master_fn();\n}\n").unwrap(); + fs::write( + repo.join("code.rs"), + "fn main() {\n println!(\"hello\");\n master_fn();\n}\n", + ) + .unwrap(); Command::new("git") .args(["add", "."]) .current_dir(repo) @@ -4717,9 +4729,7 @@ stage = "coder" // and the story stays in 2_current/. The important assertion is that // reconcile ran without panicking and the story is in a consistent state. 
let in_current = current.join("61_story_test.md").exists(); - let in_qa = root - .join(".story_kit/work/3_qa/61_story_test.md") - .exists(); + let in_qa = root.join(".story_kit/work/3_qa/61_story_test.md").exists(); assert!( in_current || in_qa, "story should be in 2_current/ or 3_qa/ after reconciliation" @@ -4746,11 +4756,7 @@ stage = "coder" let qa_dir = tmp.path().join(".story_kit/work/3_qa"); std::fs::create_dir_all(&qa_dir).unwrap(); let spike_path = qa_dir.join("10_spike_research.md"); - std::fs::write( - &spike_path, - "---\nname: Research spike\n---\n# Spike\n", - ) - .unwrap(); + std::fs::write(&spike_path, "---\nname: Research spike\n---\n# Spike\n").unwrap(); assert!(!has_review_hold(tmp.path(), "3_qa", "10_spike_research")); } @@ -4828,17 +4834,19 @@ stage = "coder" let agents = pool.agents.lock().unwrap(); // coder-1 must NOT have been assigned (wrong stage for 3_qa/). - let coder_assigned = agents - .values() - .any(|a| a.agent_name == "coder-1" && matches!(a.status, AgentStatus::Pending | AgentStatus::Running)); + let coder_assigned = agents.values().any(|a| { + a.agent_name == "coder-1" + && matches!(a.status, AgentStatus::Pending | AgentStatus::Running) + }); assert!( !coder_assigned, "coder-1 should not be assigned to a QA-stage story" ); // qa-1 should have been assigned instead. - let qa_assigned = agents - .values() - .any(|a| a.agent_name == "qa-1" && matches!(a.status, AgentStatus::Pending | AgentStatus::Running)); + let qa_assigned = agents.values().any(|a| { + a.agent_name == "qa-1" + && matches!(a.status, AgentStatus::Pending | AgentStatus::Running) + }); assert!( qa_assigned, "qa-1 should be assigned as fallback for the QA-stage story" @@ -4873,17 +4881,19 @@ stage = "coder" let agents = pool.agents.lock().unwrap(); // coder-1 should have been picked (it matches the stage and is preferred). 
- let coder1_assigned = agents - .values() - .any(|a| a.agent_name == "coder-1" && matches!(a.status, AgentStatus::Pending | AgentStatus::Running)); + let coder1_assigned = agents.values().any(|a| { + a.agent_name == "coder-1" + && matches!(a.status, AgentStatus::Pending | AgentStatus::Running) + }); assert!( coder1_assigned, "coder-1 should be assigned when it matches the stage and is preferred" ); // coder-2 must NOT be assigned (not preferred). - let coder2_assigned = agents - .values() - .any(|a| a.agent_name == "coder-2" && matches!(a.status, AgentStatus::Pending | AgentStatus::Running)); + let coder2_assigned = agents.values().any(|a| { + a.agent_name == "coder-2" + && matches!(a.status, AgentStatus::Pending | AgentStatus::Running) + }); assert!( !coder2_assigned, "coder-2 should not be assigned when coder-1 is explicitly preferred" @@ -4923,4 +4933,99 @@ stage = "coder" "No agent should be started when no stage-appropriate agent is available" ); } + + /// Bug 295: when a coder completes and QA is busy on another story, + /// the newly QA-queued story must be picked up when `run_pipeline_advance` + /// finishes for the busy QA agent's story (because auto_assign is now + /// called unconditionally at the end of pipeline advance). + #[tokio::test] + async fn pipeline_advance_picks_up_waiting_qa_stories_after_completion() { + use std::fs; + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + + let sk = root.join(".story_kit"); + let qa_dir = sk.join("work/3_qa"); + fs::create_dir_all(&qa_dir).unwrap(); + + // Configure a single QA agent. + fs::write( + sk.join("project.toml"), + r#" +[[agent]] +name = "qa" +stage = "qa" +"#, + ) + .unwrap(); + + // Story 292 is in QA with QA agent running (will "complete" via + // run_pipeline_advance below). Story 293 is in QA with NO agent — + // simulating the "stuck" state from bug 295. 
+ fs::write(
+ qa_dir.join("292_story_first.md"),
+ "---\nname: First\nmanual_qa: true\n---\n",
+ )
+ .unwrap();
+ fs::write(
+ qa_dir.join("293_story_second.md"),
+ "---\nname: Second\nmanual_qa: true\n---\n",
+ )
+ .unwrap();
+
+ let pool = AgentPool::new_test(3001);
+ // QA is currently running on story 292.
+ pool.inject_test_agent("292_story_first", "qa", AgentStatus::Running);
+
+ // Verify that 293 cannot get a QA agent right now (QA is busy).
+ {
+ let agents = pool.agents.lock().unwrap();
+ assert!(
+ !is_agent_free(&agents, "qa"),
+ "qa should be busy on story 292"
+ );
+ }
+
+ // Simulate QA completing on story 292: remove the agent from the pool
+ // (as run_server_owned_completion does) then run pipeline advance.
+ {
+ let mut agents = pool.agents.lock().unwrap();
+ agents.remove(&composite_key("292_story_first", "qa"));
+ }
+
+ // Pipeline advance for QA with gates_passed=true will:
+ // 1. Run coverage gate (will "pass" trivially in test — no script/test_coverage)
+ // 2. Set review_hold on 292 (manual_qa: true)
+ // 3. Call auto_assign_available_work (the fix from bug 295)
+ // 4. auto_assign should find 293 in 3_qa/ with no agent and start qa on it
+ pool.run_pipeline_advance(
+ "292_story_first",
+ "qa",
+ CompletionReport {
+ summary: "QA done".to_string(),
+ gates_passed: true,
+ gate_output: String::new(),
+ },
+ Some(root.to_path_buf()),
+ None,
+ false,
+ )
+ .await;
+
+ // After pipeline advance, auto_assign should have started QA on story 293.
+ let agents = pool.agents.lock().unwrap();
+ let qa_on_293 = agents.values().any(|a| {
+ a.agent_name == "qa"
+ && matches!(a.status, AgentStatus::Pending | AgentStatus::Running)
+ });
+ assert!(
+ qa_on_293,
+ "auto_assign should have started qa for story 293 after 292's QA completed, \
+ but no qa agent is pending/running. Pool: {:?}",
+ agents
+ .iter()
+ .map(|(k, a)| format!("{k}: {} ({})", a.agent_name, a.status))
+ .collect::<Vec<_>>()
+ );
+ }
 }