fix: call auto_assign_available_work after every pipeline advance (bug 295)

Stories could get stuck in QA/merge when all eligible agents were busy at
assignment time. This change consolidates the scattered auto_assign calls
into a single unconditional call at the end of run_pipeline_advance, so that
whenever any agent completes — regardless of outcome — the system immediately
rescans for pending work and assigns any free agents.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Dave
2026-03-19 09:53:41 +00:00
parent 28b29b55a8
commit 6c413e1fc7

View File

@@ -348,12 +348,8 @@ impl AgentPool {
// Create persistent log writer (needs resolved_name, so must be after
// the atomic resolution above).
let log_writer = match AgentLogWriter::new(
project_root,
story_id,
&resolved_name,
&log_session_id,
) {
let log_writer =
match AgentLogWriter::new(project_root, story_id, &resolved_name, &log_session_id) {
Ok(w) => Some(Arc::new(Mutex::new(w))),
Err(e) => {
eprintln!(
@@ -420,7 +416,8 @@ impl AgentPool {
}
let _ = tx_clone.send(event);
if let Ok(mut agents) = agents_ref.lock()
&& let Some(agent) = agents.get_mut(&key_clone) {
&& let Some(agent) = agents.get_mut(&key_clone)
{
agent.status = AgentStatus::Failed;
}
Self::notify_agent_state_changed(&watcher_tx_clone);
@@ -458,7 +455,8 @@ impl AgentPool {
}
let _ = tx_clone.send(event);
if let Ok(mut agents) = agents_ref.lock()
&& let Some(agent) = agents.get_mut(&key_clone) {
&& let Some(agent) = agents.get_mut(&key_clone)
{
agent.status = AgentStatus::Failed;
}
Self::notify_agent_state_changed(&watcher_tx_clone);
@@ -528,7 +526,8 @@ impl AgentPool {
}
let _ = tx_clone.send(event);
if let Ok(mut agents) = agents_ref.lock()
&& let Some(agent) = agents.get_mut(&key_clone) {
&& let Some(agent) = agents.get_mut(&key_clone)
{
agent.status = AgentStatus::Failed;
}
Self::notify_agent_state_changed(&watcher_tx_clone);
@@ -707,8 +706,7 @@ impl AgentPool {
}
}
let deadline =
tokio::time::Instant::now() + std::time::Duration::from_millis(timeout_ms);
let deadline = tokio::time::Instant::now() + std::time::Duration::from_millis(timeout_ms);
loop {
let remaining = deadline.saturating_duration_since(tokio::time::Instant::now());
@@ -841,16 +839,12 @@ impl AgentPool {
);
if let Err(e) = super::lifecycle::move_story_to_qa(&project_root, story_id) {
slog_error!("[pipeline] Failed to move '{story_id}' to 3_qa/: {e}");
return;
}
if let Err(e) = self
} else if let Err(e) = self
.start_agent(&project_root, story_id, Some("qa"), None)
.await
{
slog_error!("[pipeline] Failed to start qa agent for '{story_id}': {e}");
}
// Coder slot is now free — pick up any other unassigned work in 2_current/.
self.auto_assign_available_work(&project_root).await;
} else {
slog!(
"[pipeline] Coder '{agent_name}' failed gates for '{story_id}'. Restarting."
@@ -874,7 +868,9 @@ impl AgentPool {
PipelineStage::Qa => {
if completion.gates_passed {
// Run coverage gate in the QA worktree before advancing to merge.
let coverage_path = worktree_path.clone().unwrap_or_else(|| project_root.clone());
let coverage_path = worktree_path
.clone()
.unwrap_or_else(|| project_root.clone());
let cp = coverage_path.clone();
let coverage_result =
tokio::task::spawn_blocking(move || super::gates::run_coverage_gate(&cp))
@@ -906,33 +902,37 @@ impl AgentPool {
// Hold in 3_qa/ for human review.
let qa_dir = project_root.join(".story_kit/work/3_qa");
let story_path = qa_dir.join(format!("{story_id}.md"));
if let Err(e) = crate::io::story_metadata::write_review_hold(&story_path) {
slog_error!("[pipeline] Failed to set review_hold on '{story_id}': {e}");
if let Err(e) =
crate::io::story_metadata::write_review_hold(&story_path)
{
slog_error!(
"[pipeline] Failed to set review_hold on '{story_id}': {e}"
);
}
slog!(
"[pipeline] QA passed for '{story_id}'. \
Holding for human review. \
Worktree preserved at: {worktree_path:?}"
);
// Free up the QA slot without advancing.
self.auto_assign_available_work(&project_root).await;
} else {
slog!(
"[pipeline] QA passed gates and coverage for '{story_id}'. \
manual_qa: false — moving directly to merge."
);
if let Err(e) = super::lifecycle::move_story_to_merge(&project_root, story_id) {
slog_error!("[pipeline] Failed to move '{story_id}' to 4_merge/: {e}");
return;
}
if let Err(e) = self
if let Err(e) =
super::lifecycle::move_story_to_merge(&project_root, story_id)
{
slog_error!(
"[pipeline] Failed to move '{story_id}' to 4_merge/: {e}"
);
} else if let Err(e) = self
.start_agent(&project_root, story_id, Some("mergemaster"), None)
.await
{
slog_error!("[pipeline] Failed to start mergemaster for '{story_id}': {e}");
slog_error!(
"[pipeline] Failed to start mergemaster for '{story_id}': {e}"
);
}
// QA slot is now free — pick up any other unassigned work in 3_qa/.
self.auto_assign_available_work(&project_root).await;
}
} else {
slog!(
@@ -952,9 +952,7 @@ impl AgentPool {
}
}
} else {
slog!(
"[pipeline] QA failed gates for '{story_id}'. Restarting."
);
slog!("[pipeline] QA failed gates for '{story_id}'. Restarting.");
let context = format!(
"\n\n---\n## Previous QA Attempt Failed\n\
The acceptance gates failed with the following output:\n{}\n\n\
@@ -979,15 +977,14 @@ impl AgentPool {
mergemaster explicitly reported a merge failure. \
Story stays in 4_merge/ for human review."
);
return;
}
} else {
// Run script/test on master (project_root) as the post-merge verification.
slog!(
"[pipeline] Mergemaster completed for '{story_id}'. Running post-merge tests on master."
);
let root = project_root.clone();
let test_result = tokio::task::spawn_blocking(move || super::gates::run_project_tests(&root))
let test_result =
tokio::task::spawn_blocking(move || super::gates::run_project_tests(&root))
.await
.unwrap_or_else(|e| {
slog_warn!("[pipeline] Post-merge test task panicked: {e}");
@@ -1002,12 +999,12 @@ impl AgentPool {
slog!(
"[pipeline] Post-merge tests passed for '{story_id}'. Moving to done."
);
if let Err(e) = super::lifecycle::move_story_to_archived(&project_root, story_id) {
if let Err(e) =
super::lifecycle::move_story_to_archived(&project_root, story_id)
{
slog_error!("[pipeline] Failed to move '{story_id}' to done: {e}");
}
self.remove_agents_for_story(story_id);
// Mergemaster slot is now free — pick up any other items in 4_merge/.
self.auto_assign_available_work(&project_root).await;
// TODO: Re-enable worktree cleanup once we have persistent agent logs.
// Removing worktrees destroys evidence needed to debug empty-commit agents.
// let config =
@@ -1034,7 +1031,12 @@ impl AgentPool {
output
);
if let Err(e) = self
.start_agent(&project_root, story_id, Some("mergemaster"), Some(&context))
.start_agent(
&project_root,
story_id,
Some("mergemaster"),
Some(&context),
)
.await
{
slog_error!(
@@ -1046,6 +1048,13 @@ impl AgentPool {
}
}
// Always scan for unassigned work after any agent completes, regardless
// of the outcome (success, failure, restart). This ensures stories that
// failed agent assignment due to busy agents are retried when agents
// become available (bug 295).
self.auto_assign_available_work(&project_root).await;
}
/// Internal: report that an agent has finished work on a story.
///
/// **Note:** This is no longer exposed as an MCP tool. The server now
@@ -1114,7 +1123,13 @@ impl AgentPool {
// Extract data for pipeline advance, then remove the entry so
// completed agents never appear in list_agents.
let (tx, session_id, project_root_for_advance, wt_path_for_advance, merge_failure_reported_for_advance) = {
let (
tx,
session_id,
project_root_for_advance,
wt_path_for_advance,
merge_failure_reported_for_advance,
) = {
let mut agents = self.agents.lock().map_err(|e| e.to_string())?;
let agent = agents.get_mut(&key).ok_or_else(|| {
format!("Agent '{agent_name}' for story '{story_id}' disappeared during gate check")
@@ -1267,14 +1282,14 @@ impl AgentPool {
});
}
let story_archived = super::lifecycle::move_story_to_archived(project_root, story_id).is_ok();
let story_archived =
super::lifecycle::move_story_to_archived(project_root, story_id).is_ok();
if story_archived {
self.remove_agents_for_story(story_id);
}
let worktree_cleaned_up = if wt_path.exists() {
let config = crate::config::ProjectConfig::load(project_root)
.unwrap_or_default();
let config = crate::config::ProjectConfig::load(project_root).unwrap_or_default();
worktree::remove_worktree_by_story_id(project_root, story_id, &config)
.await
.is_ok()
@@ -1306,21 +1321,14 @@ impl AgentPool {
}
/// Get project root helper.
pub fn get_project_root(
&self,
state: &crate::state::SessionState,
) -> Result<PathBuf, String> {
pub fn get_project_root(&self, state: &crate::state::SessionState) -> Result<PathBuf, String> {
state.get_project_root()
}
/// Get the log session ID and project root for an agent, if available.
///
/// Used by MCP tools to find the persistent log file for a completed agent.
pub fn get_log_info(
&self,
story_id: &str,
agent_name: &str,
) -> Option<(String, PathBuf)> {
pub fn get_log_info(&self, story_id: &str, agent_name: &str) -> Option<(String, PathBuf)> {
let key = composite_key(story_id, agent_name);
let agents = self.agents.lock().ok()?;
let agent = agents.get(&key)?;
@@ -1364,9 +1372,7 @@ impl AgentPool {
}
}
Err(e) => {
slog_error!(
"[pipeline] set_merge_failure_reported: could not lock agents: {e}"
);
slog_error!("[pipeline] set_merge_failure_reported: could not lock agents: {e}");
}
}
}
@@ -1678,9 +1684,7 @@ impl AgentPool {
continue;
}
Err(e) => {
eprintln!(
"[startup:reconcile] Gate check task panicked for '{story_id}': {e}"
);
eprintln!("[startup:reconcile] Gate check task panicked for '{story_id}': {e}");
let _ = progress_tx.send(ReconciliationEvent {
story_id: story_id.clone(),
status: "failed".to_string(),
@@ -1703,9 +1707,7 @@ impl AgentPool {
continue;
}
eprintln!(
"[startup:reconcile] Gates passed for '{story_id}' (stage: {stage_dir}/)."
);
eprintln!("[startup:reconcile] Gates passed for '{story_id}' (stage: {stage_dir}/).");
if stage_dir == "2_current" {
// Coder stage → advance to QA.
@@ -1727,16 +1729,15 @@ impl AgentPool {
} else if stage_dir == "3_qa" {
// QA stage → run coverage gate before advancing to merge.
let wt_path_for_cov = wt_path.clone();
let coverage_result =
tokio::task::spawn_blocking(move || super::gates::run_coverage_gate(&wt_path_for_cov))
let coverage_result = tokio::task::spawn_blocking(move || {
super::gates::run_coverage_gate(&wt_path_for_cov)
})
.await;
let (coverage_passed, coverage_output) = match coverage_result {
Ok(Ok(pair)) => pair,
Ok(Err(e)) => {
eprintln!(
"[startup:reconcile] Coverage gate error for '{story_id}': {e}"
);
eprintln!("[startup:reconcile] Coverage gate error for '{story_id}': {e}");
let _ = progress_tx.send(ReconciliationEvent {
story_id: story_id.clone(),
status: "failed".to_string(),
@@ -1788,7 +1789,9 @@ impl AgentPool {
status: "review_hold".to_string(),
message: "Passed QA — waiting for human review.".to_string(),
});
} else if let Err(e) = super::lifecycle::move_story_to_merge(project_root, story_id) {
} else if let Err(e) =
super::lifecycle::move_story_to_merge(project_root, story_id)
{
eprintln!(
"[startup:reconcile] Failed to move '{story_id}' to 4_merge/: {e}"
);
@@ -1923,17 +1926,14 @@ impl AgentPool {
/// is triggered so that free agents can pick up unassigned work.
pub fn spawn_watchdog(pool: Arc<AgentPool>, project_root: Option<PathBuf>) {
tokio::spawn(async move {
let mut interval =
tokio::time::interval(std::time::Duration::from_secs(30));
let mut interval = tokio::time::interval(std::time::Duration::from_secs(30));
loop {
interval.tick().await;
let found = check_orphaned_agents(&pool.agents);
if found > 0
&& let Some(ref root) = project_root
{
slog!(
"[watchdog] {found} orphaned agent(s) detected; triggering auto-assign."
);
slog!("[watchdog] {found} orphaned agent(s) detected; triggering auto-assign.");
pool.auto_assign_available_work(root).await;
}
}
@@ -1992,7 +1992,11 @@ fn find_active_story_stage(project_root: &Path, story_id: &str) -> Option<&'stat
///
/// Returns `Some(agent_name)` if the front matter specifies an agent, or `None`
/// if the field is absent or the file cannot be read / parsed.
fn read_story_front_matter_agent(project_root: &Path, stage_dir: &str, story_id: &str) -> Option<String> {
fn read_story_front_matter_agent(
project_root: &Path,
stage_dir: &str,
story_id: &str,
) -> Option<String> {
use crate::io::story_metadata::parse_front_matter;
let path = project_root
.join(".story_kit")
@@ -2030,10 +2034,7 @@ fn is_agent_free(agents: &HashMap<String, StoryAgent>, agent_name: &str) -> bool
}
fn scan_stage_items(project_root: &Path, stage_dir: &str) -> Vec<String> {
let dir = project_root
.join(".story_kit")
.join("work")
.join(stage_dir);
let dir = project_root.join(".story_kit").join("work").join(stage_dir);
if !dir.is_dir() {
return Vec::new();
}
@@ -2124,7 +2125,12 @@ fn check_orphaned_agents(agents: &Mutex<HashMap<String, StoryAgent>>) -> usize {
.rsplit_once(':')
.map(|(s, _)| s.to_string())
.unwrap_or_else(|| key.clone());
return Some((key.clone(), story_id, agent.tx.clone(), agent.status.clone()));
return Some((
key.clone(),
story_id,
agent.tx.clone(),
agent.status.clone(),
));
}
None
})
@@ -2440,9 +2446,7 @@ mod tests {
#[tokio::test]
async fn report_completion_rejects_nonexistent_agent() {
let pool = AgentPool::new_test(3001);
let result = pool
.report_completion("no_story", "no_bot", "done")
.await;
let result = pool.report_completion("no_story", "no_bot", "done").await;
assert!(result.is_err());
let msg = result.unwrap_err();
assert!(msg.contains("No agent"), "unexpected: {msg}");
@@ -2518,7 +2522,14 @@ mod tests {
// Subscribe before calling so we can check if Done event was emitted.
let mut rx = pool.subscribe("s10", "coder-1").unwrap();
run_server_owned_completion(&pool.agents, pool.port, "s10", "coder-1", Some("sess-1".to_string()), pool.watcher_tx.clone())
run_server_owned_completion(
&pool.agents,
pool.port,
"s10",
"coder-1",
Some("sess-1".to_string()),
pool.watcher_tx.clone(),
)
.await;
// Status should remain Completed (unchanged) — no gate re-run.
@@ -2527,10 +2538,7 @@ mod tests {
let agent = agents.get(&key).unwrap();
assert_eq!(agent.status, AgentStatus::Completed);
// Summary should still be the original, not overwritten.
assert_eq!(
agent.completion.as_ref().unwrap().summary,
"Already done"
);
assert_eq!(agent.completion.as_ref().unwrap().summary, "Already done");
drop(agents);
// No Done event should have been emitted.
@@ -2558,7 +2566,14 @@ mod tests {
let mut rx = pool.subscribe("s11", "coder-1").unwrap();
run_server_owned_completion(&pool.agents, pool.port, "s11", "coder-1", Some("sess-2".to_string()), pool.watcher_tx.clone())
run_server_owned_completion(
&pool.agents,
pool.port,
"s11",
"coder-1",
Some("sess-2".to_string()),
pool.watcher_tx.clone(),
)
.await;
// Agent entry should be removed from the map after completion.
@@ -2601,7 +2616,14 @@ mod tests {
let mut rx = pool.subscribe("s12", "coder-1").unwrap();
run_server_owned_completion(&pool.agents, pool.port, "s12", "coder-1", None, pool.watcher_tx.clone())
run_server_owned_completion(
&pool.agents,
pool.port,
"s12",
"coder-1",
None,
pool.watcher_tx.clone(),
)
.await;
// Agent entry should be removed from the map after completion (even on failure).
@@ -2625,7 +2647,14 @@ mod tests {
async fn server_owned_completion_nonexistent_agent_is_noop() {
let pool = AgentPool::new_test(3001);
// Should not panic or error — just silently return.
run_server_owned_completion(&pool.agents, pool.port, "nonexistent", "bot", None, pool.watcher_tx.clone())
run_server_owned_completion(
&pool.agents,
pool.port,
"nonexistent",
"bot",
None,
pool.watcher_tx.clone(),
)
.await;
}
@@ -2703,7 +2732,8 @@ mod tests {
// Story should have moved to 4_merge/
assert!(
root.join(".story_kit/work/4_merge/51_story_test.md").exists(),
root.join(".story_kit/work/4_merge/51_story_test.md")
.exists(),
"story should be in 4_merge/"
);
assert!(
@@ -2916,12 +2946,7 @@ stage = "qa"
);
// Should NOT appear as a coder
assert!(
!is_story_assigned_for_stage(
&config,
&agents,
"42_story_foo",
&PipelineStage::Coder
),
!is_story_assigned_for_stage(&config, &agents, "42_story_foo", &PipelineStage::Coder),
"qa-2 should not be detected as a coder"
);
}
@@ -2967,7 +2992,11 @@ name = "coder-3"
let agents = pool.agents.lock().unwrap();
let free = find_free_agent_for_stage(&config, &agents, &PipelineStage::Coder);
assert_eq!(free, Some("coder-2"), "coder-2 should be the first free coder");
assert_eq!(
free,
Some("coder-2"),
"coder-2 should be the first free coder"
);
}
#[test]
@@ -3070,10 +3099,7 @@ stage = "coder"
fs::create_dir_all(&qa).unwrap();
fs::write(qa.join("11_story_test.md"), "test").unwrap();
assert_eq!(
find_active_story_stage(root, "11_story_test"),
Some("3_qa")
);
assert_eq!(find_active_story_stage(root, "11_story_test"), Some("3_qa"));
}
#[test]
@@ -3125,7 +3151,10 @@ stage = "coder"
pool.inject_test_agent("story_b", "qa", AgentStatus::Failed);
let found = check_orphaned_agents(&pool.agents);
assert_eq!(found, 0, "no orphans should be detected for terminal agents");
assert_eq!(
found, 0,
"no orphans should be detected for terminal agents"
);
}
#[tokio::test]
@@ -3134,10 +3163,17 @@ stage = "coder"
let handle = tokio::spawn(async {});
tokio::time::sleep(std::time::Duration::from_millis(20)).await;
assert!(handle.is_finished(), "task should be finished before injection");
assert!(
handle.is_finished(),
"task should be finished before injection"
);
let tx =
pool.inject_test_agent_with_handle("orphan_story", "coder", AgentStatus::Running, handle);
let tx = pool.inject_test_agent_with_handle(
"orphan_story",
"coder",
AgentStatus::Running,
handle,
);
let mut rx = tx.subscribe();
pool.run_watchdog_once();
@@ -3170,12 +3206,7 @@ stage = "coder"
let handle = tokio::spawn(async {});
tokio::time::sleep(std::time::Duration::from_millis(20)).await;
pool.inject_test_agent_with_handle(
"orphan_story",
"coder",
AgentStatus::Running,
handle,
);
pool.inject_test_agent_with_handle("orphan_story", "coder", AgentStatus::Running, handle);
// Before watchdog: agent is Running.
{
@@ -3260,11 +3291,18 @@ stage = "coder"
// Agent entries for the archived story should be gone.
let remaining = pool.list_agents().unwrap();
assert_eq!(remaining.len(), 1, "only the other story's agent should remain");
assert_eq!(
remaining.len(),
1,
"only the other story's agent should remain"
);
assert_eq!(remaining[0].story_id, "61_story_other");
// Story file should be in 5_done/
assert!(root.join(".story_kit/work/5_done/60_story_cleanup.md").exists());
assert!(
root.join(".story_kit/work/5_done/60_story_cleanup.md")
.exists()
);
}
// ── kill_all_children tests ────────────────────────────────────
@@ -3515,9 +3553,7 @@ stage = "coder"
pool.inject_test_agent("story-1", "coder-1", AgentStatus::Running);
pool.inject_test_agent("story-2", "coder-2", AgentStatus::Pending);
let result = pool
.start_agent(tmp.path(), "story-3", None, None)
.await;
let result = pool.start_agent(tmp.path(), "story-3", None, None).await;
assert!(result.is_err());
let err = result.unwrap_err();
assert!(
@@ -3545,18 +3581,12 @@ stage = "coder"
)
.unwrap();
// Place the story in 1_backlog/.
std::fs::write(
backlog.join("story-3.md"),
"---\nname: Story 3\n---\n",
)
.unwrap();
std::fs::write(backlog.join("story-3.md"), "---\nname: Story 3\n---\n").unwrap();
let pool = AgentPool::new_test(3001);
pool.inject_test_agent("story-1", "coder-1", AgentStatus::Running);
let result = pool
.start_agent(tmp.path(), "story-3", None, None)
.await;
let result = pool.start_agent(tmp.path(), "story-3", None, None).await;
// Should fail because all coders are busy.
assert!(result.is_err());
@@ -3597,11 +3627,7 @@ stage = "coder"
)
.unwrap();
// Place the story in 2_current/ (simulating the "queued" state).
std::fs::write(
current.join("story-3.md"),
"---\nname: Story 3\n---\n",
)
.unwrap();
std::fs::write(current.join("story-3.md"), "---\nname: Story 3\n---\n").unwrap();
let pool = AgentPool::new_test(3001);
// No agents are running — coder-1 is free.
@@ -3637,20 +3663,14 @@ stage = "coder"
)
.unwrap();
// Place the story directly in 2_current/.
std::fs::write(
current.join("story-5.md"),
"---\nname: Story 5\n---\n",
)
.unwrap();
std::fs::write(current.join("story-5.md"), "---\nname: Story 5\n---\n").unwrap();
let pool = AgentPool::new_test(3001);
// start_agent should attempt to assign coder-1 (no infra, so it will
// fail for git reasons), but must NOT fail due to the story already
// being in 2_current/.
let result = pool
.start_agent(tmp.path(), "story-5", None, None)
.await;
let result = pool.start_agent(tmp.path(), "story-5", None, None).await;
match result {
Ok(_) => {}
Err(e) => {
@@ -3710,20 +3730,14 @@ stage = "coder"
// Write a minimal project.toml so ProjectConfig::load can find the "qa" agent.
let sk_dir = root.join(".story_kit");
fs::create_dir_all(&sk_dir).unwrap();
fs::write(
sk_dir.join("project.toml"),
"[[agent]]\nname = \"qa\"\n",
)
.unwrap();
fs::write(sk_dir.join("project.toml"), "[[agent]]\nname = \"qa\"\n").unwrap();
let pool = AgentPool::new_test(3001);
// Simulate qa already running on story-a.
pool.inject_test_agent("story-a", "qa", AgentStatus::Running);
// Attempt to start qa on story-b — must be rejected.
let result = pool
.start_agent(root, "story-b", Some("qa"), None)
.await;
let result = pool.start_agent(root, "story-b", Some("qa"), None).await;
assert!(
result.is_err(),
@@ -3747,11 +3761,7 @@ stage = "coder"
let sk_dir = root.join(".story_kit");
fs::create_dir_all(&sk_dir).unwrap();
fs::write(
sk_dir.join("project.toml"),
"[[agent]]\nname = \"qa\"\n",
)
.unwrap();
fs::write(sk_dir.join("project.toml"), "[[agent]]\nname = \"qa\"\n").unwrap();
let pool = AgentPool::new_test(3001);
// Previous run completed — should NOT block a new story.
@@ -3761,9 +3771,7 @@ stage = "coder"
// NOT fail at the concurrency check. We detect the difference by inspecting
// the error message: a concurrency rejection says "already running", while a
// later failure (missing story file, missing claude binary, etc.) says something else.
let result = pool
.start_agent(root, "story-b", Some("qa"), None)
.await;
let result = pool.start_agent(root, "story-b", Some("qa"), None).await;
if let Err(ref e) = result {
assert!(
@@ -3795,21 +3803,13 @@ stage = "coder"
// Minimal project.toml with a "qa" agent.
let sk_dir = root.join(".story_kit");
fs::create_dir_all(&sk_dir).unwrap();
fs::write(
sk_dir.join("project.toml"),
"[[agent]]\nname = \"qa\"\n",
)
.unwrap();
fs::write(sk_dir.join("project.toml"), "[[agent]]\nname = \"qa\"\n").unwrap();
// Create the story in upcoming so `move_story_to_current` succeeds,
// but do NOT init a git repo — `create_worktree` will fail in the spawn.
let upcoming = root.join(".story_kit/work/1_backlog");
fs::create_dir_all(&upcoming).unwrap();
fs::write(
upcoming.join("50_story_test.md"),
"---\nname: Test\n---\n",
)
.unwrap();
fs::write(upcoming.join("50_story_test.md"), "---\nname: Test\n---\n").unwrap();
let pool = AgentPool::new_test(3099);
@@ -3858,9 +3858,7 @@ stage = "coder"
let events = pool
.drain_events("50_story_test", "qa")
.expect("drain_events should succeed");
let has_error_event = events
.iter()
.any(|e| matches!(e, AgentEvent::Error { .. }));
let has_error_event = events.iter().any(|e| matches!(e, AgentEvent::Error { .. }));
assert!(
has_error_event,
"event_log must contain AgentEvent::Error after worktree creation fails"
@@ -3880,11 +3878,7 @@ stage = "coder"
let sk_dir = root.join(".story_kit");
fs::create_dir_all(&sk_dir).unwrap();
fs::write(
sk_dir.join("project.toml"),
"[[agent]]\nname = \"qa\"\n",
)
.unwrap();
fs::write(sk_dir.join("project.toml"), "[[agent]]\nname = \"qa\"\n").unwrap();
let pool = AgentPool::new_test(3099);
@@ -3893,9 +3887,7 @@ stage = "coder"
// Attempting to start the same agent on a different story must be
// rejected — the Running entry must still be there.
let result = pool
.start_agent(root, "story-y", Some("qa"), None)
.await;
let result = pool.start_agent(root, "story-y", Some("qa"), None).await;
assert!(result.is_err());
let err = result.unwrap_err();
@@ -3920,7 +3912,11 @@ stage = "coder"
let sk_dir = root.join(".story_kit");
fs::create_dir_all(&sk_dir).unwrap();
fs::write(sk_dir.join("project.toml"), "[[agent]]\nname = \"coder-1\"\n").unwrap();
fs::write(
sk_dir.join("project.toml"),
"[[agent]]\nname = \"coder-1\"\n",
)
.unwrap();
let pool = AgentPool::new_test(3099);
@@ -4041,7 +4037,10 @@ stage = "coder"
.start_agent(root, "42_story_foo", Some("coder-2"), None)
.await;
assert!(result.is_err(), "second coder on same story must be rejected");
assert!(
result.is_err(),
"second coder on same story must be rejected"
);
let err = result.unwrap_err();
assert!(
err.contains("same pipeline stage"),
@@ -4144,10 +4143,7 @@ stage = "coder"
// Exactly one call must be rejected with a stage-conflict error.
let stage_rejections = [&r1, &r2]
.iter()
.filter(|r| {
r.as_ref()
.is_err_and(|e| e.contains("same pipeline stage"))
})
.filter(|r| r.as_ref().is_err_and(|e| e.contains("same pipeline stage")))
.count();
assert_eq!(
@@ -4359,7 +4355,9 @@ stage = "coder"
MergeJobStatus::Completed(report) => {
assert!(!report.had_conflicts, "should have no conflicts");
assert!(
report.success || report.gate_output.contains("Failed to run") || !report.gates_passed,
report.success
|| report.gate_output.contains("Failed to run")
|| !report.gates_passed,
"report should be coherent: {report:?}"
);
if report.story_archived {
@@ -4454,7 +4452,9 @@ stage = "coder"
// Run the squash-merge. The failing script/test makes quality gates
// fail → fast-forward must NOT happen.
let result = crate::agents::merge::run_squash_merge(repo, "feature/story-142_test", "142_test").unwrap();
let result =
crate::agents::merge::run_squash_merge(repo, "feature/story-142_test", "142_test")
.unwrap();
let head_after = String::from_utf8(
Command::new("git")
@@ -4489,7 +4489,11 @@ stage = "coder"
init_git_repo(repo);
// Create a file on master.
fs::write(repo.join("code.rs"), "fn main() {\n println!(\"hello\");\n}\n").unwrap();
fs::write(
repo.join("code.rs"),
"fn main() {\n println!(\"hello\");\n}\n",
)
.unwrap();
Command::new("git")
.args(["add", "."])
.current_dir(repo)
@@ -4507,7 +4511,11 @@ stage = "coder"
.current_dir(repo)
.output()
.unwrap();
fs::write(repo.join("code.rs"), "fn main() {\n println!(\"hello\");\n feature_fn();\n}\n").unwrap();
fs::write(
repo.join("code.rs"),
"fn main() {\n println!(\"hello\");\n feature_fn();\n}\n",
)
.unwrap();
Command::new("git")
.args(["add", "."])
.current_dir(repo)
@@ -4525,7 +4533,11 @@ stage = "coder"
.current_dir(repo)
.output()
.unwrap();
fs::write(repo.join("code.rs"), "fn main() {\n println!(\"hello\");\n master_fn();\n}\n").unwrap();
fs::write(
repo.join("code.rs"),
"fn main() {\n println!(\"hello\");\n master_fn();\n}\n",
)
.unwrap();
Command::new("git")
.args(["add", "."])
.current_dir(repo)
@@ -4717,9 +4729,7 @@ stage = "coder"
// and the story stays in 2_current/. The important assertion is that
// reconcile ran without panicking and the story is in a consistent state.
let in_current = current.join("61_story_test.md").exists();
let in_qa = root
.join(".story_kit/work/3_qa/61_story_test.md")
.exists();
let in_qa = root.join(".story_kit/work/3_qa/61_story_test.md").exists();
assert!(
in_current || in_qa,
"story should be in 2_current/ or 3_qa/ after reconciliation"
@@ -4746,11 +4756,7 @@ stage = "coder"
let qa_dir = tmp.path().join(".story_kit/work/3_qa");
std::fs::create_dir_all(&qa_dir).unwrap();
let spike_path = qa_dir.join("10_spike_research.md");
std::fs::write(
&spike_path,
"---\nname: Research spike\n---\n# Spike\n",
)
.unwrap();
std::fs::write(&spike_path, "---\nname: Research spike\n---\n# Spike\n").unwrap();
assert!(!has_review_hold(tmp.path(), "3_qa", "10_spike_research"));
}
@@ -4828,17 +4834,19 @@ stage = "coder"
let agents = pool.agents.lock().unwrap();
// coder-1 must NOT have been assigned (wrong stage for 3_qa/).
let coder_assigned = agents
.values()
.any(|a| a.agent_name == "coder-1" && matches!(a.status, AgentStatus::Pending | AgentStatus::Running));
let coder_assigned = agents.values().any(|a| {
a.agent_name == "coder-1"
&& matches!(a.status, AgentStatus::Pending | AgentStatus::Running)
});
assert!(
!coder_assigned,
"coder-1 should not be assigned to a QA-stage story"
);
// qa-1 should have been assigned instead.
let qa_assigned = agents
.values()
.any(|a| a.agent_name == "qa-1" && matches!(a.status, AgentStatus::Pending | AgentStatus::Running));
let qa_assigned = agents.values().any(|a| {
a.agent_name == "qa-1"
&& matches!(a.status, AgentStatus::Pending | AgentStatus::Running)
});
assert!(
qa_assigned,
"qa-1 should be assigned as fallback for the QA-stage story"
@@ -4873,17 +4881,19 @@ stage = "coder"
let agents = pool.agents.lock().unwrap();
// coder-1 should have been picked (it matches the stage and is preferred).
let coder1_assigned = agents
.values()
.any(|a| a.agent_name == "coder-1" && matches!(a.status, AgentStatus::Pending | AgentStatus::Running));
let coder1_assigned = agents.values().any(|a| {
a.agent_name == "coder-1"
&& matches!(a.status, AgentStatus::Pending | AgentStatus::Running)
});
assert!(
coder1_assigned,
"coder-1 should be assigned when it matches the stage and is preferred"
);
// coder-2 must NOT be assigned (not preferred).
let coder2_assigned = agents
.values()
.any(|a| a.agent_name == "coder-2" && matches!(a.status, AgentStatus::Pending | AgentStatus::Running));
let coder2_assigned = agents.values().any(|a| {
a.agent_name == "coder-2"
&& matches!(a.status, AgentStatus::Pending | AgentStatus::Running)
});
assert!(
!coder2_assigned,
"coder-2 should not be assigned when coder-1 is explicitly preferred"
@@ -4923,4 +4933,99 @@ stage = "coder"
"No agent should be started when no stage-appropriate agent is available"
);
}
/// Bug 295: when a coder completes and QA is busy on another story,
/// the newly QA-queued story must be picked up when `run_pipeline_advance`
/// finishes for the busy QA agent's story (because auto_assign is now
/// called unconditionally at the end of pipeline advance).
#[tokio::test]
async fn pipeline_advance_picks_up_waiting_qa_stories_after_completion() {
    use std::fs;
    let dir = tempfile::tempdir().unwrap();
    let project_root = dir.path();
    let story_kit = project_root.join(".story_kit");
    let qa_stage = story_kit.join("work/3_qa");
    fs::create_dir_all(&qa_stage).unwrap();
    // Project config declares exactly one QA agent, so "qa" is the only
    // candidate for any story sitting in 3_qa/.
    fs::write(
        story_kit.join("project.toml"),
        r#"
[[agent]]
name = "qa"
stage = "qa"
"#,
    )
    .unwrap();
    // Two stories in 3_qa/: 292 has the QA agent actively running on it
    // (and will "complete" via run_pipeline_advance below); 293 has no
    // agent at all — the stuck state reported in bug 295.
    let write_story = |file: &str, body: &str| fs::write(qa_stage.join(file), body).unwrap();
    write_story(
        "292_story_first.md",
        "---\nname: First\nmanual_qa: true\n---\n",
    );
    write_story(
        "293_story_second.md",
        "---\nname: Second\nmanual_qa: true\n---\n",
    );
    let pool = AgentPool::new_test(3001);
    // The lone QA agent is occupied by story 292 right now.
    pool.inject_test_agent("292_story_first", "qa", AgentStatus::Running);
    // Sanity check: while 292 holds the agent, 293 cannot be assigned.
    let agents = pool.agents.lock().unwrap();
    assert!(
        !is_agent_free(&agents, "qa"),
        "qa should be busy on story 292"
    );
    drop(agents);
    // Emulate the QA agent finishing on 292: server-owned completion
    // removes the pool entry before pipeline advance runs.
    let mut agents = pool.agents.lock().unwrap();
    agents.remove(&composite_key("292_story_first", "qa"));
    drop(agents);
    // With gates_passed=true, pipeline advance for the QA stage will:
    //   1. run the coverage gate (trivially passes here — no script/test_coverage)
    //   2. place 292 on review hold (manual_qa: true)
    //   3. invoke auto_assign_available_work (the bug-295 fix)
    //   4. auto_assign finds agentless 293 in 3_qa/ and starts qa on it
    pool.run_pipeline_advance(
        "292_story_first",
        "qa",
        CompletionReport {
            summary: "QA done".to_string(),
            gates_passed: true,
            gate_output: String::new(),
        },
        Some(project_root.to_path_buf()),
        None,
        false,
    )
    .await;
    // The freed QA agent must now be attached to story 293.
    let agents = pool.agents.lock().unwrap();
    let qa_assigned = agents.values().any(|a| {
        a.agent_name == "qa"
            && matches!(a.status, AgentStatus::Pending | AgentStatus::Running)
    });
    assert!(
        qa_assigned,
        "auto_assign should have started qa for story 293 after 292's QA completed, \
         but no qa agent is pending/running. Pool: {:?}",
        agents
            .iter()
            .map(|(k, a)| format!("{k}: {} ({})", a.agent_name, a.status))
            .collect::<Vec<_>>()
    );
}
}