Files
huskies/server/src/agents/pool/pipeline/advance/mod.rs
T

875 lines
43 KiB
Rust
Raw Normal View History

//! Pipeline advance — moves stories forward through pipeline stages after agent completion.
#![allow(unused_imports, dead_code)]
use crate::config::ProjectConfig;
use crate::io::watcher::WatcherEvent;
use crate::slog;
use crate::slog_error;
use crate::slog_warn;
use std::collections::HashMap;
use std::path::{Path, PathBuf};
use std::sync::{Arc, Mutex};
use tokio::sync::broadcast;
use super::super::super::{CompletionReport, PipelineStage, agent_config_stage, pipeline_stage};
use super::super::{AgentPool, StoryAgent};
2026-04-28 23:06:40 +00:00
/// Maximum number of bytes of gate output to include in the failure context
/// injected into the resumed session. Keeps the injected message focused —
/// the tail of the output (where errors appear) is always preserved.
const MAX_GATE_OUTPUT_BYTES: usize = 8_000;
/// Truncate gate output to [`MAX_GATE_OUTPUT_BYTES`], keeping the **tail**
/// (where compiler errors and test failures are reported).
2026-05-12 17:49:44 +00:00
#[allow(clippy::string_slice)] // adjusted is walked forward to a char boundary before slicing
2026-04-28 23:06:40 +00:00
fn truncate_gate_output(output: &str) -> &str {
    // Short enough already: hand the caller back exactly what it gave us.
    let total_len = output.len();
    if total_len <= MAX_GATE_OUTPUT_BYTES {
        return output;
    }

    // Earliest byte offset that would leave at most MAX_GATE_OUTPUT_BYTES of tail.
    let earliest = total_len - MAX_GATE_OUTPUT_BYTES;

    // That offset may land inside a multi-byte character, so scan forward to
    // the first char boundary. `total_len` itself is always a boundary, which
    // guarantees the search succeeds.
    let cut = (earliest..=total_len)
        .find(|&idx| output.is_char_boundary(idx))
        .unwrap_or(total_len);

    &output[cut..]
}
impl AgentPool {
/// Pipeline advancement: after an agent completes, move the story to
/// the next pipeline stage and start the appropriate agent.
#[allow(clippy::too_many_arguments)]
pub(super) async fn run_pipeline_advance(
&self,
story_id: &str,
agent_name: &str,
completion: CompletionReport,
project_root: Option<PathBuf>,
worktree_path: Option<PathBuf>,
merge_failure_reported: bool,
previous_session_id: Option<String>,
) {
let project_root = match project_root {
Some(p) => p,
None => {
slog_warn!("[pipeline] No project_root for '{story_id}:{agent_name}'");
return;
}
};
let config = ProjectConfig::load(&project_root).unwrap_or_default();
let stage = config
.find_agent(agent_name)
.map(agent_config_stage)
.unwrap_or_else(|| pipeline_stage(agent_name));
// If the story is frozen, do not advance the pipeline. The agent's work
// is done but the story stays at its current stage.
if crate::io::story_metadata::is_story_frozen_in_store(story_id) {
slog!("[pipeline] Story '{story_id}' is frozen; pipeline advancement suppressed.");
return;
}
match stage {
PipelineStage::Other => {
// Supervisors and unknown agents do not advance the pipeline.
}
PipelineStage::Coder => {
2026-05-13 09:30:44 +00:00
if completion.needs_commit_recovery {
// The coder exited with uncommitted content but no commits
// (typical "claude-code session boundary mid-sweep" pattern).
// Use a PROGRESS-AWARE retry cap: the agent gets unlimited
// respawns as long as progress is being made between attempts.
// Progress is satisfied if EITHER (a) the worktree diff grew,
// OR (b) the set of files the agent read grew. Raw tool-call
// count does NOT count — a looping agent can produce many calls.
// Only self-exited sessions with no file or read progress count
// toward the cap; forced exits (API error, network, budget
// exhaustion) are excluded (story 1089).
// After NO_PROGRESS_CAP consecutive qualifying no-progress
// respawns, block for human attention.
//
// TOTAL_ATTEMPTS_CAP is the OUTER bound: even if the agent
// keeps making file-edit progress every session, after this
// many total respawns without a commit we escalate — catches
// the "agent flaps between different edits but never commits"
// pattern that the progress-aware counter would never trigger.
const NO_PROGRESS_CAP: u32 = 3;
const TOTAL_ATTEMPTS_CAP: u32 = 8;
// AC1: consume the forced-exit flag written by spawn.rs when
// the agent process exited with a non-zero code.
let forced_exit = crate::db::read_content(
crate::db::ContentKey::CommitRecoveryForcedExit(story_id),
)
.is_some();
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryForcedExit(
story_id,
));
let current_fingerprint = worktree_path.as_deref().and_then(|p| {
std::process::Command::new("git")
.args(["diff", "master"])
.current_dir(p)
.output()
.ok()
.map(|out| out.stdout.len().to_string())
});
let stored_fingerprint = crate::db::read_content(
crate::db::ContentKey::CommitRecoveryDiffFingerprint(story_id),
);
let diff_progress = current_fingerprint.is_some()
&& stored_fingerprint.as_ref() != current_fingerprint.as_ref();
// AC2: check read-file set progress as an additional signal.
let read_progress = previous_session_id.as_deref().is_some_and(|session_id| {
collect_read_progress(&project_root, story_id, agent_name, session_id)
});
let made_progress = diff_progress || read_progress;
let prev_no_progress_count = crate::db::read_content(
crate::db::ContentKey::CommitRecoveryPending(story_id),
)
.and_then(|s| s.trim().parse::<u32>().ok())
.unwrap_or(0);
// AC1: forced exits do not increment the stuck-respawn counter.
let no_progress_count = if forced_exit {
prev_no_progress_count
} else if made_progress || stored_fingerprint.is_none() {
1
} else {
prev_no_progress_count + 1
};
let total_attempts = crate::db::read_content(
crate::db::ContentKey::CommitRecoveryTotalAttempts(story_id),
)
.and_then(|s| s.trim().parse::<u32>().ok())
.unwrap_or(0)
+ 1;
if total_attempts >= TOTAL_ATTEMPTS_CAP {
// Outer cap reached: agent has been respawned too many
// times without ever committing. Block regardless of
// whether file-edit progress is still happening.
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryPending(
story_id,
));
crate::db::delete_content(
crate::db::ContentKey::CommitRecoveryDiffFingerprint(story_id),
);
crate::db::delete_content(
crate::db::ContentKey::CommitRecoveryTotalAttempts(story_id),
);
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryReadSet(
story_id,
));
slog!(
"[pipeline] Coder '{agent_name}' for '{story_id}' hit total \
commit-recovery cap ({total_attempts}/{TOTAL_ATTEMPTS_CAP}) \
without a commit. Blocking story."
);
let reason = format!(
"commit absent after {total_attempts} respawns — \
agent kept making edits but never committed"
);
if let Err(e) =
crate::agents::lifecycle::transition_to_blocked(story_id, &reason)
{
slog_error!("[pipeline] Failed to block '{story_id}': {e}");
}
let _ = self.watcher_tx.send(WatcherEvent::StoryBlocked {
story_id: story_id.to_string(),
reason,
});
return;
}
if no_progress_count >= NO_PROGRESS_CAP {
// Cap reached → block for human attention.
2026-05-13 11:22:57 +00:00
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryPending(
story_id,
));
crate::db::delete_content(
crate::db::ContentKey::CommitRecoveryDiffFingerprint(story_id),
);
crate::db::delete_content(
crate::db::ContentKey::CommitRecoveryTotalAttempts(story_id),
);
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryReadSet(
story_id,
));
2026-05-13 09:30:44 +00:00
slog!(
"[pipeline] Coder '{agent_name}' for '{story_id}' made no \
file or read progress over {no_progress_count} consecutive \
self-exit commit-recovery respawns. Blocking story."
);
// AC4: block message names the specific cause.
let reason = format!(
"stuck-respawn cap reached: {NO_PROGRESS_CAP} consecutive \
self-exits with no file or read progress"
2026-05-13 09:30:44 +00:00
);
if let Err(e) =
crate::agents::lifecycle::transition_to_blocked(story_id, &reason)
{
slog_error!("[pipeline] Failed to block '{story_id}': {e}");
}
let _ = self.watcher_tx.send(WatcherEvent::StoryBlocked {
story_id: story_id.to_string(),
reason,
});
} else {
// Below cap: respawn with commit-only prompt. Does NOT
// consume a retry_count slot.
2026-05-13 11:22:57 +00:00
crate::db::write_content(
crate::db::ContentKey::CommitRecoveryPending(story_id),
&no_progress_count.to_string(),
2026-05-13 11:22:57 +00:00
);
if let Some(ref fp) = current_fingerprint {
crate::db::write_content(
crate::db::ContentKey::CommitRecoveryDiffFingerprint(story_id),
fp,
);
}
crate::db::write_content(
crate::db::ContentKey::CommitRecoveryTotalAttempts(story_id),
&total_attempts.to_string(),
);
2026-05-13 09:30:44 +00:00
slog!(
"[pipeline] Coder '{agent_name}' exited with uncommitted work \
for '{story_id}' (no-progress {no_progress_count}/\
{NO_PROGRESS_CAP}, total {total_attempts}/\
{TOTAL_ATTEMPTS_CAP}; diff_progress={diff_progress}, \
read_progress={read_progress}, forced_exit={forced_exit}). \
Issuing commit-only respawn."
2026-05-13 09:30:44 +00:00
);
let addendum = "\n\nYou have uncommitted work in this worktree. \
Your only task this session is run_tests → git_add → git_commit. \
Do not explore further.";
if let Err(e) = self
.start_agent(
&project_root,
story_id,
Some(agent_name),
Some(addendum),
previous_session_id,
)
.await
{
slog_error!(
"[pipeline] Failed to start commit-recovery respawn \
for '{story_id}': {e}"
);
}
}
2026-05-13 13:54:27 +00:00
} else if crate::db::read_content(crate::db::ContentKey::MergeFixupPending(
story_id,
))
.is_some()
{
// Merge gate fixup coder completed (story 981).
// Route back to merge on success, or to MergeFailure on failure.
// Neither path counts against retry_count (AC4).
crate::db::delete_content(crate::db::ContentKey::MergeFixupPending(story_id));
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryPending(
story_id,
));
// The FixupRequested transition set retry_count=1 so the gate output
// was injected into the spawn. Reset to 0 now so the fixup does not
// consume a retry slot (AC4).
crate::crdt_state::set_retry_count(story_id, 0);
if completion.gates_passed {
slog!(
"[pipeline] Merge fixup coder '{agent_name}' passed gates for \
'{story_id}'. Re-triggering merge."
);
if let Err(e) = crate::agents::lifecycle::move_story_to_merge(story_id) {
slog_error!(
"[pipeline] Failed to move '{story_id}' to 4_merge/ after \
fixup: {e}"
);
} else {
self.trigger_server_side_merge(&project_root, story_id);
}
} else {
slog!(
"[pipeline] Merge fixup coder '{agent_name}' failed gates for \
'{story_id}'. Transitioning back to MergeFailure."
);
// Two-step: Coding → Merge → MergeFailure.
// feature_branch follows the project convention; commits_ahead=1
// is a safe approximation (the actual count doesn't matter here —
// it is only used to reconstruct Merge via Unblock if a human
// later retries).
let branch =
crate::pipeline_state::BranchName(format!("feature/story-{story_id}"));
let commits_ahead = std::num::NonZeroU32::new(1).unwrap();
let qa_skip = crate::pipeline_state::PipelineEvent::QaSkipped {
feature_branch: branch,
commits_ahead,
};
if let Err(e) =
crate::pipeline_state::apply_transition_str(story_id, qa_skip, None)
{
slog_error!(
"[pipeline] Failed to move '{story_id}' Coding→Merge for \
fixup failure: {e}"
);
}
2026-05-13 15:30:03 +00:00
let kind = crate::pipeline_state::MergeFailureKind::GatesFailed(format!(
2026-05-13 13:54:27 +00:00
"Merge fixup coder could not resolve gate failures: {}",
truncate_gate_output(&completion.gate_output)
2026-05-13 15:30:03 +00:00
));
let display = kind.display_reason();
2026-05-13 13:54:27 +00:00
if let Err(e) =
2026-05-13 15:30:03 +00:00
crate::agents::lifecycle::transition_to_merge_failure(story_id, kind)
2026-05-13 13:54:27 +00:00
{
slog_error!(
"[pipeline] Failed to transition '{story_id}' to MergeFailure \
after fixup failure: {e}"
);
}
let _ = self.watcher_tx.send(WatcherEvent::MergeFailure {
story_id: story_id.to_string(),
2026-05-13 15:30:03 +00:00
reason: display,
2026-05-13 13:54:27 +00:00
});
}
2026-05-13 09:30:44 +00:00
} else if completion.gates_passed {
// Clear any stale recovery keys when the coder succeeds normally.
2026-05-13 11:22:57 +00:00
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryPending(
story_id,
));
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryReadSet(
story_id,
));
// Determine effective QA mode for this story.
let qa_mode = {
let item_type = crate::agents::lifecycle::item_type_from_id(story_id);
if item_type == "spike" {
crate::io::story_metadata::QaMode::Human
} else {
let default_qa = config.default_qa_mode();
resolve_qa_mode_from_store(&project_root, story_id, default_qa)
}
};
match qa_mode {
crate::io::story_metadata::QaMode::Server => {
slog!(
"[pipeline] Coder '{agent_name}' passed gates for '{story_id}'. \
qa: server — moving directly to merge."
);
if let Err(e) = crate::agents::lifecycle::move_story_to_merge(story_id)
{
slog_error!(
"[pipeline] Failed to move '{story_id}' to 4_merge/: {e}"
);
} else {
2026-04-27 23:31:57 +00:00
self.trigger_server_side_merge(&project_root, story_id);
}
}
crate::io::story_metadata::QaMode::Agent => {
slog!(
"[pipeline] Coder '{agent_name}' passed gates for '{story_id}'. \
qa: agent — moving to QA."
);
if let Err(e) = crate::agents::lifecycle::move_story_to_qa(story_id) {
slog_error!("[pipeline] Failed to move '{story_id}' to 3_qa/: {e}");
} else if let Err(e) = self
.start_agent(&project_root, story_id, Some("qa"), None, None)
.await
{
slog_error!(
"[pipeline] Failed to start qa agent for '{story_id}': {e}"
);
}
}
crate::io::story_metadata::QaMode::Human => {
slog!(
"[pipeline] Coder '{agent_name}' passed gates for '{story_id}'. \
qa: human — holding for human review."
);
if let Err(e) = crate::agents::lifecycle::move_story_to_qa(story_id) {
slog_error!("[pipeline] Failed to move '{story_id}' to 3_qa/: {e}");
} else {
write_review_hold_to_store(story_id);
}
}
}
} else {
// Clear any stale recovery keys when gates fail normally (agent committed
2026-05-13 09:30:44 +00:00
// but the build is broken — treat as a standard retry, not a recovery).
2026-05-13 11:22:57 +00:00
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryPending(
story_id,
));
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryReadSet(
story_id,
));
// Bug 645 / 668: Before retry/block, check if the agent left committed
// work AND the agent had a passing run_tests result captured during its
// session. An agent may crash mid-output (e.g. Claude Code CLI PTY write
// assertion) after having already committed valid code and run tests.
// We require positive test evidence (not just cargo check) so that only
// stories with genuinely passing test suites are salvaged.
//
// The `run_tests` MCP tool writes `{story_id}:run_tests_ok` to the DB
// whenever script/test exits 0 inside a story worktree. Consume the
// evidence here so it does not persist to the next agent session.
let has_test_evidence =
2026-05-13 11:22:57 +00:00
crate::db::read_content(crate::db::ContentKey::RunTestsOk(story_id))
.is_some();
crate::db::delete_content(crate::db::ContentKey::RunTestsOk(story_id));
let work_survived = has_test_evidence
&& worktree_path.as_ref().is_some_and(|wt_path| {
crate::agents::gates::worktree_has_committed_work(wt_path)
});
if work_survived {
slog!(
"[pipeline] Coder '{agent_name}' failed gates for '{story_id}' but \
committed work survives with captured passing tests. Advancing to QA \
instead of retrying (bug 645)."
);
let qa_mode = {
let item_type = crate::agents::lifecycle::item_type_from_id(story_id);
if item_type == "spike" {
crate::io::story_metadata::QaMode::Human
} else {
let default_qa = config.default_qa_mode();
resolve_qa_mode_from_store(&project_root, story_id, default_qa)
}
};
match qa_mode {
crate::io::story_metadata::QaMode::Server => {
if let Err(e) =
crate::agents::lifecycle::move_story_to_merge(story_id)
{
slog_error!(
"[pipeline] Failed to move '{story_id}' to 4_merge/: {e}"
);
} else {
2026-04-27 23:31:57 +00:00
self.trigger_server_side_merge(&project_root, story_id);
}
}
crate::io::story_metadata::QaMode::Agent => {
if let Err(e) = crate::agents::lifecycle::move_story_to_qa(story_id)
{
slog_error!(
"[pipeline] Failed to move '{story_id}' to 3_qa/: {e}"
);
} else if let Err(e) = self
.start_agent(&project_root, story_id, Some("qa"), None, None)
.await
{
slog_error!(
"[pipeline] Failed to start qa for '{story_id}': {e}"
);
}
}
crate::io::story_metadata::QaMode::Human => {
if let Err(e) = crate::agents::lifecycle::move_story_to_qa(story_id)
{
slog_error!(
"[pipeline] Failed to move '{story_id}' to 3_qa/: {e}"
);
} else {
write_review_hold_to_store(story_id);
}
}
}
} else {
// Persist gate_output so the retry spawn can inject it into
// --append-system-prompt (story 881).
crate::db::write_content(
2026-05-13 11:22:57 +00:00
crate::db::ContentKey::GateOutput(story_id),
&completion.gate_output,
);
// Increment retry count and check if blocked.
if let Some(reason) =
should_block_story(story_id, config.max_retries, "coder")
{
// Story has exceeded retry limit — do not restart.
let _ = self.watcher_tx.send(WatcherEvent::StoryBlocked {
story_id: story_id.to_string(),
reason,
});
} else {
slog!(
"[pipeline] Coder '{agent_name}' failed gates for '{story_id}'. Restarting."
);
let context = format!(
"\n\n---\n## Previous Attempt Failed\n\
The acceptance gates failed with the following output:\n{}\n\n\
Please review the failures above, fix the issues, and try again.",
truncate_gate_output(&completion.gate_output)
);
if let Err(e) = self
.start_agent(
&project_root,
story_id,
Some(agent_name),
Some(&context),
previous_session_id,
)
.await
{
slog_error!(
"[pipeline] Failed to restart coder '{agent_name}' for '{story_id}': {e}"
);
}
}
}
}
}
PipelineStage::Qa => {
if completion.gates_passed {
// Run coverage gate in the QA worktree before advancing to merge.
let coverage_path = worktree_path
.clone()
.unwrap_or_else(|| project_root.clone());
let cp = coverage_path.clone();
let coverage_result = tokio::task::spawn_blocking(move || {
crate::agents::gates::run_coverage_gate(&cp)
})
.await
.unwrap_or_else(|e| {
slog_warn!("[pipeline] Coverage gate task panicked: {e}");
Ok((false, format!("Coverage gate task panicked: {e}")))
});
let (coverage_passed, coverage_output) = match coverage_result {
Ok(pair) => pair,
Err(e) => (false, e),
};
if coverage_passed {
// Check whether this item needs human review before merging.
let needs_human_review = {
let item_type = crate::agents::lifecycle::item_type_from_id(story_id);
if item_type == "spike" {
true // Spikes always need human review.
} else {
let default_qa = config.default_qa_mode();
matches!(
resolve_qa_mode_from_store(&project_root, story_id, default_qa),
crate::io::story_metadata::QaMode::Human
)
}
};
if needs_human_review {
// Hold in 3_qa/ for human review.
write_review_hold_to_store(story_id);
slog!(
"[pipeline] QA passed for '{story_id}'. \
Holding for human review. \
Worktree preserved at: {worktree_path:?}"
);
} else {
slog!(
"[pipeline] QA passed gates and coverage for '{story_id}'. \
Moving directly to merge."
);
if let Err(e) = crate::agents::lifecycle::move_story_to_merge(story_id)
{
slog_error!(
"[pipeline] Failed to move '{story_id}' to 4_merge/: {e}"
);
} else {
2026-04-27 23:31:57 +00:00
self.trigger_server_side_merge(&project_root, story_id);
}
}
} else if let Some(reason) =
should_block_story(story_id, config.max_retries, "qa-coverage")
{
// Story has exceeded retry limit — do not restart.
let _ = self.watcher_tx.send(WatcherEvent::StoryBlocked {
story_id: story_id.to_string(),
reason,
});
} else {
slog!(
"[pipeline] QA coverage gate failed for '{story_id}'. Restarting QA."
);
let context = format!(
"\n\n---\n## Coverage Gate Failed\n\
The coverage gate (script/test_coverage) failed with the following output:\n{}\n\n\
Please improve test coverage until the coverage gate passes.",
coverage_output
);
if let Err(e) = self
.start_agent(&project_root, story_id, Some("qa"), Some(&context), None)
.await
{
slog_error!("[pipeline] Failed to restart qa for '{story_id}': {e}");
}
}
} else {
// Persist gate_output so the retry spawn can inject it into
// --append-system-prompt (story 881).
crate::db::write_content(
2026-05-13 11:22:57 +00:00
crate::db::ContentKey::GateOutput(story_id),
&completion.gate_output,
);
if let Some(reason) = should_block_story(story_id, config.max_retries, "qa") {
// Story has exceeded retry limit — do not restart.
let _ = self.watcher_tx.send(WatcherEvent::StoryBlocked {
story_id: story_id.to_string(),
reason,
});
} else {
slog!("[pipeline] QA failed gates for '{story_id}'. Restarting.");
let context = format!(
"\n\n---\n## Previous QA Attempt Failed\n\
The acceptance gates failed with the following output:\n{}\n\n\
Please re-run and fix the issues.",
completion.gate_output
);
if let Err(e) = self
.start_agent(&project_root, story_id, Some("qa"), Some(&context), None)
.await
{
slog_error!("[pipeline] Failed to restart qa for '{story_id}': {e}");
}
}
}
}
PipelineStage::Mergemaster => {
// Bug 529: Guard against stale mergemaster advances. If the story
// has already reached done or archived (e.g. a previous mergemaster
// succeeded), this advance is a zombie — skip it entirely to avoid
// phantom notifications and redundant post-merge test runs.
if let Ok(Some(typed_item)) = crate::pipeline_state::read_typed(story_id)
&& matches!(
typed_item.stage,
crate::pipeline_state::Stage::Done { .. }
| crate::pipeline_state::Stage::Archived { .. }
2026-05-13 16:43:19 +00:00
| crate::pipeline_state::Stage::Abandoned { .. }
| crate::pipeline_state::Stage::Superseded { .. }
| crate::pipeline_state::Stage::Rejected { .. }
)
{
let current_dir = typed_item.stage.dir_name();
slog!(
"[pipeline] Skipping stale mergemaster advance for '{story_id}': \
story is already in work/{current_dir}/"
);
// Skip pipeline advancement — do not run post-merge tests,
// do not emit notifications, do not restart agents.
return;
}
// Block advancement if the mergemaster explicitly reported a failure.
// The server-owned gate check runs in the feature-branch worktree (not
// master), so `gates_passed=true` is misleading when no code was merged.
if merge_failure_reported {
slog!(
"[pipeline] Pipeline advancement blocked for '{story_id}': \
mergemaster explicitly reported a merge failure. \
Story stays in 4_merge/ for human review."
);
} else {
// Run script/test on master (project_root) as the post-merge verification.
slog!(
"[pipeline] Mergemaster completed for '{story_id}'. Running post-merge tests on master."
);
let root = project_root.clone();
let test_result = tokio::task::spawn_blocking(move || {
crate::agents::gates::run_project_tests(&root)
})
.await
.unwrap_or_else(|e| {
slog_warn!("[pipeline] Post-merge test task panicked: {e}");
Ok((false, format!("Test task panicked: {e}")))
});
let (passed, output) = match test_result {
Ok(pair) => pair,
Err(e) => (false, e),
};
if passed {
slog!(
"[pipeline] Post-merge tests passed for '{story_id}'. Moving to done."
);
if let Err(e) = crate::agents::lifecycle::move_story_to_done(story_id) {
slog_error!("[pipeline] Failed to move '{story_id}' to done: {e}");
}
self.remove_agents_for_story(story_id);
2026-05-13 08:01:08 +00:00
crate::crdt_state::delete_merge_job(story_id);
// TODO: Re-enable worktree cleanup once we have persistent agent logs.
// Removing worktrees destroys evidence needed to debug empty-commit agents.
// let config =
// crate::config::ProjectConfig::load(&project_root).unwrap_or_default();
// if let Err(e) =
// worktree::remove_worktree_by_story_id(&project_root, story_id, &config)
// .await
// {
// slog!(
// "[pipeline] Failed to remove worktree for '{story_id}': {e}"
// );
// }
slog!(
"[pipeline] Story '{story_id}' done. Worktree preserved for inspection."
);
} else if let Some(reason) =
should_block_story(story_id, config.max_retries, "mergemaster")
{
// Story has exceeded retry limit — do not restart.
let _ = self.watcher_tx.send(WatcherEvent::StoryBlocked {
story_id: story_id.to_string(),
reason,
});
} else {
slog!(
"[pipeline] Post-merge tests failed for '{story_id}'. Restarting mergemaster."
);
let context = format!(
"\n\n---\n## Post-Merge Test Failed\n\
The tests on master failed with the following output:\n{}\n\n\
Please investigate and resolve the failures, then call merge_agent_work again.",
output
);
if let Err(e) = self
.start_agent(
&project_root,
story_id,
Some("mergemaster"),
Some(&context),
None,
)
.await
{
slog_error!(
"[pipeline] Failed to restart mergemaster for '{story_id}': {e}"
);
}
}
}
}
}
// Always scan for unassigned work after any agent completes, regardless
// of the outcome (success, failure, restart). This ensures stories that
// failed agent assignment due to busy agents are retried when agents
// become available (bug 295).
self.auto_assign_available_work(&project_root).await;
}
}
/// Helper functions for pipeline advancement.
///
/// `spawn_pipeline_advance` (re-exported below) spawns pipeline advancement as
/// a background task. It is a **non-async** function so it does not participate
/// in the opaque type cycle between `start_agent` and `run_server_owned_completion`.
mod helpers;
use helpers::{resolve_qa_mode_from_store, write_review_hold_to_store};
pub(crate) use helpers::{should_block_story, spawn_pipeline_advance};
/// Scan one agent session log and gather every `file_path` that was handed to
/// the `Read` tool. An unreadable log yields an empty set.
///
/// The log is treated as one JSON document per line. Lines that are blank or
/// fail to parse are ignored, as are events that are not `agent_json` entries
/// whose `data.type` is `"assistant"`.
///
/// [`collect_read_progress`] relies on this to notice read-exploration
/// progress when the worktree diff did not change (story 1089, AC2).
fn collect_read_files_from_log(
    project_root: &std::path::Path,
    story_id: &str,
    agent_name: &str,
    session_id: &str,
) -> std::collections::HashSet<String> {
    let mut read_paths = std::collections::HashSet::new();

    let log_path = crate::agent_log::log_file_path(project_root, story_id, agent_name, session_id);
    let Ok(log_text) = std::fs::read_to_string(&log_path) else {
        return read_paths;
    };

    let candidate_lines = log_text
        .lines()
        .map(str::trim)
        .filter(|raw| !raw.is_empty());

    for raw in candidate_lines {
        let Ok(event) = serde_json::from_str::<serde_json::Value>(raw) else {
            continue;
        };

        // Only agent_json events carrying an assistant message are relevant.
        let is_agent_json = event.get("type").and_then(|t| t.as_str()) == Some("agent_json");
        if !is_agent_json {
            continue;
        }
        let Some(payload) = event.get("data") else {
            continue;
        };
        let is_assistant = payload.get("type").and_then(|t| t.as_str()) == Some("assistant");
        if !is_assistant {
            continue;
        }
        let Some(blocks) = payload
            .pointer("/message/content")
            .and_then(|c| c.as_array())
        else {
            continue;
        };

        // Keep the path argument of every `Read` tool invocation in this message.
        read_paths.extend(
            blocks
                .iter()
                .filter(|block| block.get("type").and_then(|t| t.as_str()) == Some("tool_use"))
                .filter(|block| block.get("name").and_then(|n| n.as_str()) == Some("Read"))
                .filter_map(|block| block.pointer("/input/file_path").and_then(|p| p.as_str()))
                .map(str::to_string),
        );
    }

    read_paths
}
/// Report whether `session_id` read at least one file that is missing from the
/// cumulative read set stored for `story_id`.
///
/// When new files are found, the stored set is rewritten as the sorted,
/// newline-separated union of the old set and this session's files, and `true`
/// is returned. Otherwise nothing is written and the result is `false`
/// (story 1089, AC2).
fn collect_read_progress(
    project_root: &std::path::Path,
    story_id: &str,
    agent_name: &str,
    session_id: &str,
) -> bool {
    let session_files = collect_read_files_from_log(project_root, story_id, agent_name, session_id);
    if session_files.is_empty() {
        return false;
    }

    // Previously recorded paths, one per line; a missing entry means none yet.
    let known: std::collections::HashSet<String> =
        match crate::db::read_content(crate::db::ContentKey::CommitRecoveryReadSet(story_id)) {
            Some(text) => text
                .lines()
                .filter(|entry| !entry.is_empty())
                .map(str::to_string)
                .collect(),
            None => std::collections::HashSet::new(),
        };

    // Nothing new was read: leave the stored set untouched.
    if session_files.is_subset(&known) {
        return false;
    }

    // An ordered set gives the same lexicographic order as sorting the union.
    let merged: std::collections::BTreeSet<&str> = known
        .union(&session_files)
        .map(String::as_str)
        .collect();
    let serialized = merged.into_iter().collect::<Vec<_>>().join("\n");
    crate::db::write_content(
        crate::db::ContentKey::CommitRecoveryReadSet(story_id),
        &serialized,
    );
    true
}
#[cfg(test)]
mod tests;
#[cfg(test)]
mod tests_regression;