508 lines
25 KiB
Rust
508 lines
25 KiB
Rust
//! Pipeline advance — moves stories forward through pipeline stages after agent completion.
|
|
#![allow(unused_imports, dead_code)]
|
|
use crate::config::ProjectConfig;
|
|
use crate::io::watcher::WatcherEvent;
|
|
use crate::slog;
|
|
use crate::slog_error;
|
|
use crate::slog_warn;
|
|
use std::collections::HashMap;
|
|
use std::path::{Path, PathBuf};
|
|
use std::sync::{Arc, Mutex};
|
|
use tokio::sync::broadcast;
|
|
|
|
use super::super::super::{CompletionReport, PipelineStage, agent_config_stage, pipeline_stage};
|
|
use super::super::{AgentPool, StoryAgent};
|
|
|
|
impl AgentPool {
|
|
/// Pipeline advancement: after an agent completes, move the story to
|
|
/// the next pipeline stage and start the appropriate agent.
|
|
#[allow(clippy::too_many_arguments)]
|
|
pub(super) async fn run_pipeline_advance(
|
|
&self,
|
|
story_id: &str,
|
|
agent_name: &str,
|
|
completion: CompletionReport,
|
|
project_root: Option<PathBuf>,
|
|
worktree_path: Option<PathBuf>,
|
|
merge_failure_reported: bool,
|
|
previous_session_id: Option<String>,
|
|
) {
|
|
let project_root = match project_root {
|
|
Some(p) => p,
|
|
None => {
|
|
slog_warn!("[pipeline] No project_root for '{story_id}:{agent_name}'");
|
|
return;
|
|
}
|
|
};
|
|
|
|
let config = ProjectConfig::load(&project_root).unwrap_or_default();
|
|
let stage = config
|
|
.find_agent(agent_name)
|
|
.map(agent_config_stage)
|
|
.unwrap_or_else(|| pipeline_stage(agent_name));
|
|
|
|
// If the story is frozen, do not advance the pipeline. The agent's work
|
|
// is done but the story stays at its current stage.
|
|
if crate::io::story_metadata::is_story_frozen_in_store(story_id) {
|
|
slog!("[pipeline] Story '{story_id}' is frozen; pipeline advancement suppressed.");
|
|
return;
|
|
}
|
|
|
|
match stage {
|
|
PipelineStage::Other => {
|
|
// Supervisors and unknown agents do not advance the pipeline.
|
|
}
|
|
PipelineStage::Coder => {
|
|
if completion.gates_passed {
|
|
// Determine effective QA mode for this story.
|
|
let qa_mode = {
|
|
let item_type = crate::agents::lifecycle::item_type_from_id(story_id);
|
|
if item_type == "spike" {
|
|
crate::io::story_metadata::QaMode::Human
|
|
} else {
|
|
let default_qa = config.default_qa_mode();
|
|
resolve_qa_mode_from_store(&project_root, story_id, default_qa)
|
|
}
|
|
};
|
|
|
|
match qa_mode {
|
|
crate::io::story_metadata::QaMode::Server => {
|
|
slog!(
|
|
"[pipeline] Coder '{agent_name}' passed gates for '{story_id}'. \
|
|
qa: server — moving directly to merge."
|
|
);
|
|
if let Err(e) = crate::agents::lifecycle::move_story_to_merge(
|
|
&project_root,
|
|
story_id,
|
|
) {
|
|
slog_error!(
|
|
"[pipeline] Failed to move '{story_id}' to 4_merge/: {e}"
|
|
);
|
|
} else {
|
|
self.start_mergemaster_or_block(&project_root, story_id)
|
|
.await;
|
|
}
|
|
}
|
|
crate::io::story_metadata::QaMode::Agent => {
|
|
slog!(
|
|
"[pipeline] Coder '{agent_name}' passed gates for '{story_id}'. \
|
|
qa: agent — moving to QA."
|
|
);
|
|
if let Err(e) =
|
|
crate::agents::lifecycle::move_story_to_qa(&project_root, story_id)
|
|
{
|
|
slog_error!("[pipeline] Failed to move '{story_id}' to 3_qa/: {e}");
|
|
} else if let Err(e) = self
|
|
.start_agent(&project_root, story_id, Some("qa"), None, None)
|
|
.await
|
|
{
|
|
slog_error!(
|
|
"[pipeline] Failed to start qa agent for '{story_id}': {e}"
|
|
);
|
|
}
|
|
}
|
|
crate::io::story_metadata::QaMode::Human => {
|
|
slog!(
|
|
"[pipeline] Coder '{agent_name}' passed gates for '{story_id}'. \
|
|
qa: human — holding for human review."
|
|
);
|
|
if let Err(e) =
|
|
crate::agents::lifecycle::move_story_to_qa(&project_root, story_id)
|
|
{
|
|
slog_error!("[pipeline] Failed to move '{story_id}' to 3_qa/: {e}");
|
|
} else {
|
|
write_review_hold_to_store(story_id);
|
|
}
|
|
}
|
|
}
|
|
} else {
|
|
// Bug 645 / 668: Before retry/block, check if the agent left committed
|
|
// work AND the agent had a passing run_tests result captured during its
|
|
// session. An agent may crash mid-output (e.g. Claude Code CLI PTY write
|
|
// assertion) after having already committed valid code and run tests.
|
|
// We require positive test evidence (not just cargo check) so that only
|
|
// stories with genuinely passing test suites are salvaged.
|
|
//
|
|
// The `run_tests` MCP tool writes `{story_id}:run_tests_ok` to the DB
|
|
// whenever script/test exits 0 inside a story worktree. Consume the
|
|
// evidence here so it does not persist to the next agent session.
|
|
let has_test_evidence =
|
|
crate::db::read_content(&format!("{story_id}:run_tests_ok")).is_some();
|
|
crate::db::delete_content(&format!("{story_id}:run_tests_ok"));
|
|
let work_survived = has_test_evidence
|
|
&& worktree_path.as_ref().is_some_and(|wt_path| {
|
|
crate::agents::gates::worktree_has_committed_work(wt_path)
|
|
});
|
|
if work_survived {
|
|
slog!(
|
|
"[pipeline] Coder '{agent_name}' failed gates for '{story_id}' but \
|
|
committed work survives with captured passing tests. Advancing to QA \
|
|
instead of retrying (bug 645)."
|
|
);
|
|
let qa_mode = {
|
|
let item_type = crate::agents::lifecycle::item_type_from_id(story_id);
|
|
if item_type == "spike" {
|
|
crate::io::story_metadata::QaMode::Human
|
|
} else {
|
|
let default_qa = config.default_qa_mode();
|
|
resolve_qa_mode_from_store(&project_root, story_id, default_qa)
|
|
}
|
|
};
|
|
match qa_mode {
|
|
crate::io::story_metadata::QaMode::Server => {
|
|
if let Err(e) = crate::agents::lifecycle::move_story_to_merge(
|
|
&project_root,
|
|
story_id,
|
|
) {
|
|
slog_error!(
|
|
"[pipeline] Failed to move '{story_id}' to 4_merge/: {e}"
|
|
);
|
|
} else {
|
|
self.start_mergemaster_or_block(&project_root, story_id)
|
|
.await;
|
|
}
|
|
}
|
|
crate::io::story_metadata::QaMode::Agent => {
|
|
if let Err(e) = crate::agents::lifecycle::move_story_to_qa(
|
|
&project_root,
|
|
story_id,
|
|
) {
|
|
slog_error!(
|
|
"[pipeline] Failed to move '{story_id}' to 3_qa/: {e}"
|
|
);
|
|
} else if let Err(e) = self
|
|
.start_agent(&project_root, story_id, Some("qa"), None, None)
|
|
.await
|
|
{
|
|
slog_error!(
|
|
"[pipeline] Failed to start qa for '{story_id}': {e}"
|
|
);
|
|
}
|
|
}
|
|
crate::io::story_metadata::QaMode::Human => {
|
|
if let Err(e) = crate::agents::lifecycle::move_story_to_qa(
|
|
&project_root,
|
|
story_id,
|
|
) {
|
|
slog_error!(
|
|
"[pipeline] Failed to move '{story_id}' to 3_qa/: {e}"
|
|
);
|
|
} else {
|
|
write_review_hold_to_store(story_id);
|
|
}
|
|
}
|
|
}
|
|
} else
|
|
// Increment retry count and check if blocked.
|
|
if let Some(reason) =
|
|
should_block_story(story_id, config.max_retries, "coder")
|
|
{
|
|
// Story has exceeded retry limit — do not restart.
|
|
let _ = self.watcher_tx.send(WatcherEvent::StoryBlocked {
|
|
story_id: story_id.to_string(),
|
|
reason,
|
|
});
|
|
} else {
|
|
slog!(
|
|
"[pipeline] Coder '{agent_name}' failed gates for '{story_id}'. Restarting."
|
|
);
|
|
let context = format!(
|
|
"\n\n---\n## Previous Attempt Failed\n\
|
|
The acceptance gates failed with the following output:\n{}\n\n\
|
|
Please review the failures above, fix the issues, and try again.",
|
|
completion.gate_output
|
|
);
|
|
if let Err(e) = self
|
|
.start_agent(
|
|
&project_root,
|
|
story_id,
|
|
Some(agent_name),
|
|
Some(&context),
|
|
previous_session_id,
|
|
)
|
|
.await
|
|
{
|
|
slog_error!(
|
|
"[pipeline] Failed to restart coder '{agent_name}' for '{story_id}': {e}"
|
|
);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
PipelineStage::Qa => {
|
|
if completion.gates_passed {
|
|
// Run coverage gate in the QA worktree before advancing to merge.
|
|
let coverage_path = worktree_path
|
|
.clone()
|
|
.unwrap_or_else(|| project_root.clone());
|
|
let cp = coverage_path.clone();
|
|
let coverage_result = tokio::task::spawn_blocking(move || {
|
|
crate::agents::gates::run_coverage_gate(&cp)
|
|
})
|
|
.await
|
|
.unwrap_or_else(|e| {
|
|
slog_warn!("[pipeline] Coverage gate task panicked: {e}");
|
|
Ok((false, format!("Coverage gate task panicked: {e}")))
|
|
});
|
|
let (coverage_passed, coverage_output) = match coverage_result {
|
|
Ok(pair) => pair,
|
|
Err(e) => (false, e),
|
|
};
|
|
|
|
if coverage_passed {
|
|
// Check whether this item needs human review before merging.
|
|
let needs_human_review = {
|
|
let item_type = crate::agents::lifecycle::item_type_from_id(story_id);
|
|
if item_type == "spike" {
|
|
true // Spikes always need human review.
|
|
} else {
|
|
let default_qa = config.default_qa_mode();
|
|
matches!(
|
|
resolve_qa_mode_from_store(&project_root, story_id, default_qa),
|
|
crate::io::story_metadata::QaMode::Human
|
|
)
|
|
}
|
|
};
|
|
|
|
if needs_human_review {
|
|
// Hold in 3_qa/ for human review.
|
|
write_review_hold_to_store(story_id);
|
|
slog!(
|
|
"[pipeline] QA passed for '{story_id}'. \
|
|
Holding for human review. \
|
|
Worktree preserved at: {worktree_path:?}"
|
|
);
|
|
} else {
|
|
slog!(
|
|
"[pipeline] QA passed gates and coverage for '{story_id}'. \
|
|
Moving directly to merge."
|
|
);
|
|
if let Err(e) = crate::agents::lifecycle::move_story_to_merge(
|
|
&project_root,
|
|
story_id,
|
|
) {
|
|
slog_error!(
|
|
"[pipeline] Failed to move '{story_id}' to 4_merge/: {e}"
|
|
);
|
|
} else {
|
|
self.start_mergemaster_or_block(&project_root, story_id)
|
|
.await;
|
|
}
|
|
}
|
|
} else if let Some(reason) =
|
|
should_block_story(story_id, config.max_retries, "qa-coverage")
|
|
{
|
|
// Story has exceeded retry limit — do not restart.
|
|
let _ = self.watcher_tx.send(WatcherEvent::StoryBlocked {
|
|
story_id: story_id.to_string(),
|
|
reason,
|
|
});
|
|
} else {
|
|
slog!(
|
|
"[pipeline] QA coverage gate failed for '{story_id}'. Restarting QA."
|
|
);
|
|
let context = format!(
|
|
"\n\n---\n## Coverage Gate Failed\n\
|
|
The coverage gate (script/test_coverage) failed with the following output:\n{}\n\n\
|
|
Please improve test coverage until the coverage gate passes.",
|
|
coverage_output
|
|
);
|
|
if let Err(e) = self
|
|
.start_agent(&project_root, story_id, Some("qa"), Some(&context), None)
|
|
.await
|
|
{
|
|
slog_error!("[pipeline] Failed to restart qa for '{story_id}': {e}");
|
|
}
|
|
}
|
|
} else if let Some(reason) = should_block_story(story_id, config.max_retries, "qa")
|
|
{
|
|
// Story has exceeded retry limit — do not restart.
|
|
let _ = self.watcher_tx.send(WatcherEvent::StoryBlocked {
|
|
story_id: story_id.to_string(),
|
|
reason,
|
|
});
|
|
} else {
|
|
slog!("[pipeline] QA failed gates for '{story_id}'. Restarting.");
|
|
let context = format!(
|
|
"\n\n---\n## Previous QA Attempt Failed\n\
|
|
The acceptance gates failed with the following output:\n{}\n\n\
|
|
Please re-run and fix the issues.",
|
|
completion.gate_output
|
|
);
|
|
if let Err(e) = self
|
|
.start_agent(&project_root, story_id, Some("qa"), Some(&context), None)
|
|
.await
|
|
{
|
|
slog_error!("[pipeline] Failed to restart qa for '{story_id}': {e}");
|
|
}
|
|
}
|
|
}
|
|
PipelineStage::Mergemaster => {
|
|
// Bug 529: Guard against stale mergemaster advances. If the story
|
|
// has already reached done or archived (e.g. a previous mergemaster
|
|
// succeeded), this advance is a zombie — skip it entirely to avoid
|
|
// phantom notifications and redundant post-merge test runs.
|
|
if let Ok(Some(typed_item)) = crate::pipeline_state::read_typed(story_id)
|
|
&& matches!(
|
|
typed_item.stage,
|
|
crate::pipeline_state::Stage::Done { .. }
|
|
| crate::pipeline_state::Stage::Archived { .. }
|
|
)
|
|
{
|
|
let current_dir = typed_item.stage.dir_name();
|
|
slog!(
|
|
"[pipeline] Skipping stale mergemaster advance for '{story_id}': \
|
|
story is already in work/{current_dir}/"
|
|
);
|
|
// Skip pipeline advancement — do not run post-merge tests,
|
|
// do not emit notifications, do not restart agents.
|
|
return;
|
|
}
|
|
|
|
// Block advancement if the mergemaster explicitly reported a failure.
|
|
// The server-owned gate check runs in the feature-branch worktree (not
|
|
// master), so `gates_passed=true` is misleading when no code was merged.
|
|
if merge_failure_reported {
|
|
slog!(
|
|
"[pipeline] Pipeline advancement blocked for '{story_id}': \
|
|
mergemaster explicitly reported a merge failure. \
|
|
Story stays in 4_merge/ for human review."
|
|
);
|
|
} else {
|
|
// Run script/test on master (project_root) as the post-merge verification.
|
|
slog!(
|
|
"[pipeline] Mergemaster completed for '{story_id}'. Running post-merge tests on master."
|
|
);
|
|
let root = project_root.clone();
|
|
let test_result = tokio::task::spawn_blocking(move || {
|
|
crate::agents::gates::run_project_tests(&root)
|
|
})
|
|
.await
|
|
.unwrap_or_else(|e| {
|
|
slog_warn!("[pipeline] Post-merge test task panicked: {e}");
|
|
Ok((false, format!("Test task panicked: {e}")))
|
|
});
|
|
let (passed, output) = match test_result {
|
|
Ok(pair) => pair,
|
|
Err(e) => (false, e),
|
|
};
|
|
|
|
if passed {
|
|
slog!(
|
|
"[pipeline] Post-merge tests passed for '{story_id}'. Moving to done."
|
|
);
|
|
if let Err(e) =
|
|
crate::agents::lifecycle::move_story_to_done(&project_root, story_id)
|
|
{
|
|
slog_error!("[pipeline] Failed to move '{story_id}' to done: {e}");
|
|
}
|
|
self.remove_agents_for_story(story_id);
|
|
// TODO: Re-enable worktree cleanup once we have persistent agent logs.
|
|
// Removing worktrees destroys evidence needed to debug empty-commit agents.
|
|
// let config =
|
|
// crate::config::ProjectConfig::load(&project_root).unwrap_or_default();
|
|
// if let Err(e) =
|
|
// worktree::remove_worktree_by_story_id(&project_root, story_id, &config)
|
|
// .await
|
|
// {
|
|
// slog!(
|
|
// "[pipeline] Failed to remove worktree for '{story_id}': {e}"
|
|
// );
|
|
// }
|
|
slog!(
|
|
"[pipeline] Story '{story_id}' done. Worktree preserved for inspection."
|
|
);
|
|
} else if let Some(reason) =
|
|
should_block_story(story_id, config.max_retries, "mergemaster")
|
|
{
|
|
// Story has exceeded retry limit — do not restart.
|
|
let _ = self.watcher_tx.send(WatcherEvent::StoryBlocked {
|
|
story_id: story_id.to_string(),
|
|
reason,
|
|
});
|
|
} else {
|
|
slog!(
|
|
"[pipeline] Post-merge tests failed for '{story_id}'. Restarting mergemaster."
|
|
);
|
|
let context = format!(
|
|
"\n\n---\n## Post-Merge Test Failed\n\
|
|
The tests on master failed with the following output:\n{}\n\n\
|
|
Please investigate and resolve the failures, then call merge_agent_work again.",
|
|
output
|
|
);
|
|
if let Err(e) = self
|
|
.start_agent(
|
|
&project_root,
|
|
story_id,
|
|
Some("mergemaster"),
|
|
Some(&context),
|
|
None,
|
|
)
|
|
.await
|
|
{
|
|
slog_error!(
|
|
"[pipeline] Failed to restart mergemaster for '{story_id}': {e}"
|
|
);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Always scan for unassigned work after any agent completes, regardless
|
|
// of the outcome (success, failure, restart). This ensures stories that
|
|
// failed agent assignment due to busy agents are retried when agents
|
|
// become available (bug 295).
|
|
self.auto_assign_available_work(&project_root).await;
|
|
}
|
|
|
|
/// Start the mergemaster agent for `story_id`, but only if the feature
|
|
/// branch has commits that are not yet on master.
|
|
///
|
|
/// If the branch has zero commits ahead of master, this logs an error and
|
|
/// sends a [`WatcherEvent::StoryBlocked`] instead of spawning a Claude
|
|
/// session. A no-op merge session was observed spending $0.82 in the
|
|
/// 2026-04-09 incident (story 519).
|
|
async fn start_mergemaster_or_block(&self, project_root: &Path, story_id: &str) {
|
|
let branch = format!("feature/story-{story_id}");
|
|
if !crate::agents::lifecycle::feature_branch_has_unmerged_changes(project_root, story_id) {
|
|
slog_error!(
|
|
"[mergemaster] Branch '{branch}' has no commits ahead of master — \
|
|
refusing to spawn merge session. \
|
|
Likely cause: the worktree was reset to master after the feature \
|
|
branch's commits were created. Investigate the worktree's git state \
|
|
before retrying. Story '{story_id}' stays in 4_merge/ for human review."
|
|
);
|
|
let _ = self.watcher_tx.send(WatcherEvent::StoryBlocked {
|
|
story_id: story_id.to_string(),
|
|
reason: format!(
|
|
"Feature branch '{branch}' has no commits ahead of master — nothing to merge. \
|
|
The worktree may have been reset to master. \
|
|
Check the worktree's git state and retry manually."
|
|
),
|
|
});
|
|
return;
|
|
}
|
|
if let Err(e) = self
|
|
.start_agent(project_root, story_id, Some("mergemaster"), None, None)
|
|
.await
|
|
{
|
|
slog_error!("[pipeline] Failed to start mergemaster for '{story_id}': {e}");
|
|
}
|
|
}
|
|
}
|
|
|
|
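/// Helpers used by pipeline advancement: QA-mode resolution, review-hold
/// marking, the retry/block check (`should_block_story`), and
/// `spawn_pipeline_advance`, which spawns pipeline advancement as a
/// background task.
///
/// `spawn_pipeline_advance` is a **non-async** function so it does not
/// participate in the opaque type cycle between `start_agent` and
/// `run_server_owned_completion`.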
|
|
mod helpers;
|
|
|
|
use helpers::{resolve_qa_mode_from_store, write_review_hold_to_store};
|
|
pub(crate) use helpers::{should_block_story, spawn_pipeline_advance};
|
|
|
|
#[cfg(test)]
|
|
mod tests;
|
|
#[cfg(test)]
|
|
mod tests_regression;
|