story-kit: merge 311_story_server_enforced_retry_limits_for_failed_merge_and_empty_diff_stories

This commit is contained in:
Dave
2026-03-19 16:34:11 +00:00
parent 662e00f94a
commit 3b887e3085
9 changed files with 346 additions and 65 deletions

View File

@@ -925,22 +925,30 @@ impl AgentPool {
}
}
} else {
slog!(
"[pipeline] Coder '{agent_name}' failed gates for '{story_id}'. Restarting."
);
let context = format!(
"\n\n---\n## Previous Attempt Failed\n\
The acceptance gates failed with the following output:\n{}\n\n\
Please review the failures above, fix the issues, and try again.",
completion.gate_output
);
if let Err(e) = self
.start_agent(&project_root, story_id, Some(agent_name), Some(&context))
.await
{
slog_error!(
"[pipeline] Failed to restart coder '{agent_name}' for '{story_id}': {e}"
// Increment retry count and check if blocked.
let story_path = project_root
.join(".story_kit/work/2_current")
.join(format!("{story_id}.md"));
if should_block_story(&story_path, config.max_retries, story_id, "coder") {
// Story has exceeded retry limit — do not restart.
} else {
slog!(
"[pipeline] Coder '{agent_name}' failed gates for '{story_id}'. Restarting."
);
let context = format!(
"\n\n---\n## Previous Attempt Failed\n\
The acceptance gates failed with the following output:\n{}\n\n\
Please review the failures above, fix the issues, and try again.",
completion.gate_output
);
if let Err(e) = self
.start_agent(&project_root, story_id, Some(agent_name), Some(&context))
.await
{
slog_error!(
"[pipeline] Failed to restart coder '{agent_name}' for '{story_id}': {e}"
);
}
}
}
}
@@ -1017,14 +1025,42 @@ impl AgentPool {
}
}
} else {
slog!(
"[pipeline] QA coverage gate failed for '{story_id}'. Restarting QA."
);
let story_path = project_root
.join(".story_kit/work/3_qa")
.join(format!("{story_id}.md"));
if should_block_story(&story_path, config.max_retries, story_id, "qa-coverage") {
// Story has exceeded retry limit — do not restart.
} else {
slog!(
"[pipeline] QA coverage gate failed for '{story_id}'. Restarting QA."
);
let context = format!(
"\n\n---\n## Coverage Gate Failed\n\
The coverage gate (script/test_coverage) failed with the following output:\n{}\n\n\
Please improve test coverage until the coverage gate passes.",
coverage_output
);
if let Err(e) = self
.start_agent(&project_root, story_id, Some("qa"), Some(&context))
.await
{
slog_error!("[pipeline] Failed to restart qa for '{story_id}': {e}");
}
}
}
} else {
let story_path = project_root
.join(".story_kit/work/3_qa")
.join(format!("{story_id}.md"));
if should_block_story(&story_path, config.max_retries, story_id, "qa") {
// Story has exceeded retry limit — do not restart.
} else {
slog!("[pipeline] QA failed gates for '{story_id}'. Restarting.");
let context = format!(
"\n\n---\n## Coverage Gate Failed\n\
The coverage gate (script/test_coverage) failed with the following output:\n{}\n\n\
Please improve test coverage until the coverage gate passes.",
coverage_output
"\n\n---\n## Previous QA Attempt Failed\n\
The acceptance gates failed with the following output:\n{}\n\n\
Please re-run and fix the issues.",
completion.gate_output
);
if let Err(e) = self
.start_agent(&project_root, story_id, Some("qa"), Some(&context))
@@ -1033,20 +1069,6 @@ impl AgentPool {
slog_error!("[pipeline] Failed to restart qa for '{story_id}': {e}");
}
}
} else {
slog!("[pipeline] QA failed gates for '{story_id}'. Restarting.");
let context = format!(
"\n\n---\n## Previous QA Attempt Failed\n\
The acceptance gates failed with the following output:\n{}\n\n\
Please re-run and fix the issues.",
completion.gate_output
);
if let Err(e) = self
.start_agent(&project_root, story_id, Some("qa"), Some(&context))
.await
{
slog_error!("[pipeline] Failed to restart qa for '{story_id}': {e}");
}
}
}
PipelineStage::Mergemaster => {
@@ -1103,27 +1125,34 @@ impl AgentPool {
"[pipeline] Story '{story_id}' done. Worktree preserved for inspection."
);
} else {
slog!(
"[pipeline] Post-merge tests failed for '{story_id}'. Restarting mergemaster."
);
let context = format!(
"\n\n---\n## Post-Merge Test Failed\n\
The tests on master failed with the following output:\n{}\n\n\
Please investigate and resolve the failures, then call merge_agent_work again.",
output
);
if let Err(e) = self
.start_agent(
&project_root,
story_id,
Some("mergemaster"),
Some(&context),
)
.await
{
slog_error!(
"[pipeline] Failed to restart mergemaster for '{story_id}': {e}"
let story_path = project_root
.join(".story_kit/work/4_merge")
.join(format!("{story_id}.md"));
if should_block_story(&story_path, config.max_retries, story_id, "mergemaster") {
// Story has exceeded retry limit — do not restart.
} else {
slog!(
"[pipeline] Post-merge tests failed for '{story_id}'. Restarting mergemaster."
);
let context = format!(
"\n\n---\n## Post-Merge Test Failed\n\
The tests on master failed with the following output:\n{}\n\n\
Please investigate and resolve the failures, then call merge_agent_work again.",
output
);
if let Err(e) = self
.start_agent(
&project_root,
story_id,
Some("mergemaster"),
Some(&context),
)
.await
{
slog_error!(
"[pipeline] Failed to restart mergemaster for '{story_id}': {e}"
);
}
}
}
}
@@ -1563,6 +1592,44 @@ impl AgentPool {
continue;
}
// Skip blocked stories (retry limit exceeded).
if is_story_blocked(project_root, stage_dir, story_id) {
continue;
}
// Skip stories in 4_merge/ that already have a reported merge failure.
// These need human intervention — auto-assigning a new mergemaster
// would just waste tokens on the same broken merge.
if *stage == PipelineStage::Mergemaster
&& has_merge_failure(project_root, stage_dir, story_id)
{
continue;
}
// AC6: Detect empty-diff stories in 4_merge/ before starting a
// mergemaster. If the worktree has no commits on the feature branch,
// write a merge_failure and block the story immediately.
if *stage == PipelineStage::Mergemaster
&& let Some(wt_path) = worktree::find_worktree_path(project_root, story_id)
&& !super::gates::worktree_has_committed_work(&wt_path)
{
slog_warn!(
"[auto-assign] Story '{story_id}' in 4_merge/ has no commits \
on feature branch. Writing merge_failure and blocking."
);
let story_path = project_root
.join(".story_kit/work")
.join(stage_dir)
.join(format!("{story_id}.md"));
let _ = crate::io::story_metadata::write_merge_failure(
&story_path,
"Feature branch has no code changes — the coder agent \
did not produce any commits.",
);
let _ = crate::io::story_metadata::write_blocked(&story_path);
continue;
}
// Re-acquire the lock on each iteration to see state changes
// from previous start_agent calls in the same pass.
let preferred_agent =
@@ -2195,6 +2262,80 @@ fn has_review_hold(project_root: &Path, stage_dir: &str, story_id: &str) -> bool
.unwrap_or(false)
}
/// Increment retry_count and block the story if it exceeds `max_retries`.
///
/// Returns `true` if the story is now blocked (caller should NOT restart the agent).
/// Returns `false` if the story may be retried.
/// When `max_retries` is 0, retry limits are disabled.
fn should_block_story(story_path: &Path, max_retries: u32, story_id: &str, stage_label: &str) -> bool {
use crate::io::story_metadata::{increment_retry_count, write_blocked};
if max_retries == 0 {
// Retry limits disabled.
return false;
}
match increment_retry_count(story_path) {
Ok(new_count) => {
if new_count >= max_retries {
slog_warn!(
"[pipeline] Story '{story_id}' reached retry limit ({new_count}/{max_retries}) \
at {stage_label} stage. Marking as blocked."
);
if let Err(e) = write_blocked(story_path) {
slog_error!("[pipeline] Failed to write blocked flag for '{story_id}': {e}");
}
true
} else {
slog!(
"[pipeline] Story '{story_id}' retry {new_count}/{max_retries} at {stage_label} stage."
);
false
}
}
Err(e) => {
slog_error!("[pipeline] Failed to increment retry_count for '{story_id}': {e}");
false // Don't block on error — allow retry.
}
}
}
/// Return `true` if the story file has `blocked: true` in its front matter.
fn is_story_blocked(project_root: &Path, stage_dir: &str, story_id: &str) -> bool {
use crate::io::story_metadata::parse_front_matter;
let path = project_root
.join(".story_kit")
.join("work")
.join(stage_dir)
.join(format!("{story_id}.md"));
let contents = match std::fs::read_to_string(path) {
Ok(c) => c,
Err(_) => return false,
};
parse_front_matter(&contents)
.ok()
.and_then(|m| m.blocked)
.unwrap_or(false)
}
/// Return `true` if the story file has a `merge_failure` field in its front matter.
fn has_merge_failure(project_root: &Path, stage_dir: &str, story_id: &str) -> bool {
use crate::io::story_metadata::parse_front_matter;
let path = project_root
.join(".story_kit")
.join("work")
.join(stage_dir)
.join(format!("{story_id}.md"));
let contents = match std::fs::read_to_string(path) {
Ok(c) => c,
Err(_) => return false,
};
parse_front_matter(&contents)
.ok()
.and_then(|m| m.merge_failure)
.is_some()
}
/// Return `true` if `agent_name` has no active (pending/running) entry in the pool.
fn is_agent_free(agents: &HashMap<String, StoryAgent>, agent_name: &str) -> bool {
!agents.values().any(|a| {
@@ -2420,6 +2561,16 @@ async fn run_server_owned_completion(
let path = wt_path;
match tokio::task::spawn_blocking(move || {
super::gates::check_uncommitted_changes(&path)?;
// AC5: Fail early if the coder finished with no commits on the feature branch.
// This prevents empty-diff stories from advancing through QA to merge.
if !super::gates::worktree_has_committed_work(&path) {
return Ok((
false,
"Agent exited with no commits on the feature branch. \
The agent did not produce any code changes."
.to_string(),
));
}
super::gates::run_acceptance_gates(&path)
})
.await