huskies: merge 543_story_resume_failed_coder_agents_with_resume_instead_of_starting_fresh_sessions

This commit is contained in:
dave
2026-04-12 12:52:46 +00:00
parent c80931c15c
commit 5f01631e6a
16 changed files with 135 additions and 52 deletions
@@ -262,7 +262,7 @@ impl AgentPool {
"[auto-assign] Assigning '{agent_name}' to '{story_id}' in {stage_dir}/"
);
if let Err(e) = self
.start_agent(project_root, story_id, Some(&agent_name), None)
.start_agent(project_root, story_id, Some(&agent_name), None, None)
.await
{
slog!(
+24 -5
View File
@@ -17,6 +17,7 @@ use super::super::{AgentPool, StoryAgent};
impl AgentPool {
/// Pipeline advancement: after an agent completes, move the story to
/// the next pipeline stage and start the appropriate agent.
#[allow(clippy::too_many_arguments)]
pub(super) async fn run_pipeline_advance(
&self,
story_id: &str,
@@ -25,6 +26,7 @@ impl AgentPool {
project_root: Option<PathBuf>,
worktree_path: Option<PathBuf>,
merge_failure_reported: bool,
previous_session_id: Option<String>,
) {
let project_root = match project_root {
Some(p) => p,
@@ -81,7 +83,7 @@ impl AgentPool {
if let Err(e) = crate::agents::lifecycle::move_story_to_qa(&project_root, story_id) {
slog_error!("[pipeline] Failed to move '{story_id}' to 3_qa/: {e}");
} else if let Err(e) = self
.start_agent(&project_root, story_id, Some("qa"), None)
.start_agent(&project_root, story_id, Some("qa"), None, None)
.await
{
slog_error!("[pipeline] Failed to start qa agent for '{story_id}': {e}");
@@ -118,7 +120,13 @@ impl AgentPool {
completion.gate_output
);
if let Err(e) = self
.start_agent(&project_root, story_id, Some(agent_name), Some(&context))
.start_agent(
&project_root,
story_id,
Some(agent_name),
Some(&context),
previous_session_id,
)
.await
{
slog_error!(
@@ -202,7 +210,7 @@ impl AgentPool {
coverage_output
);
if let Err(e) = self
.start_agent(&project_root, story_id, Some("qa"), Some(&context))
.start_agent(&project_root, story_id, Some("qa"), Some(&context), None)
.await
{
slog_error!("[pipeline] Failed to restart qa for '{story_id}': {e}");
@@ -223,7 +231,7 @@ impl AgentPool {
completion.gate_output
);
if let Err(e) = self
.start_agent(&project_root, story_id, Some("qa"), Some(&context))
.start_agent(&project_root, story_id, Some("qa"), Some(&context), None)
.await
{
slog_error!("[pipeline] Failed to restart qa for '{story_id}': {e}");
@@ -322,6 +330,7 @@ impl AgentPool {
story_id,
Some("mergemaster"),
Some(&context),
None,
)
.await
{
@@ -369,7 +378,7 @@ impl AgentPool {
return;
}
if let Err(e) = self
.start_agent(project_root, story_id, Some("mergemaster"), None)
.start_agent(project_root, story_id, Some("mergemaster"), None, None)
.await
{
slog_error!("[pipeline] Failed to start mergemaster for '{story_id}': {e}");
@@ -392,6 +401,7 @@ pub(super) fn spawn_pipeline_advance(
worktree_path: Option<PathBuf>,
watcher_tx: broadcast::Sender<WatcherEvent>,
merge_failure_reported: bool,
previous_session_id: Option<String>,
) {
let sid = story_id.to_string();
let aname = agent_name.to_string();
@@ -410,6 +420,7 @@ pub(super) fn spawn_pipeline_advance(
project_root,
worktree_path,
merge_failure_reported,
previous_session_id,
)
.await;
});
@@ -524,6 +535,7 @@ mod tests {
Some(root.to_path_buf()),
None,
false,
None,
)
.await;
@@ -565,6 +577,7 @@ mod tests {
Some(root.to_path_buf()),
None,
false,
None,
)
.await;
@@ -606,6 +619,7 @@ mod tests {
Some(root.to_path_buf()),
None,
false,
None,
)
.await;
@@ -639,6 +653,7 @@ mod tests {
Some(root.to_path_buf()),
None,
false,
None,
)
.await;
@@ -706,6 +721,7 @@ stage = "qa"
Some(root.to_path_buf()),
None,
false,
None,
)
.await;
@@ -777,6 +793,7 @@ stage = "qa"
Some(root.to_path_buf()),
None,
false,
None,
)
.await;
@@ -883,6 +900,7 @@ stage = "qa"
Some(root.to_path_buf()),
None,
false,
None,
)
.await;
@@ -950,6 +968,7 @@ stage = "qa"
Some(root.to_path_buf()),
None,
false,
None,
)
.await;
@@ -83,6 +83,7 @@ impl AgentPool {
project_root_for_advance,
wt_path_for_advance,
merge_failure_reported_for_advance,
session_id_for_advance,
) = {
let mut agents = self.agents.lock().map_err(|e| e.to_string())?;
let agent = agents.get_mut(&key).ok_or_else(|| {
@@ -94,8 +95,9 @@ impl AgentPool {
let pr = agent.project_root.clone();
let wt = agent.worktree_info.as_ref().map(|w| w.path.clone());
let mfr = agent.merge_failure_reported;
let sid_advance = agent.session_id.clone();
agents.remove(&key);
(tx, sid, pr, wt, mfr)
(tx, sid, pr, wt, mfr, sid_advance)
};
// Emit Done so wait_for_agent unblocks.
@@ -128,6 +130,7 @@ impl AgentPool {
project_root_for_advance,
wt_path_for_advance,
merge_failure_reported_for_advance,
session_id_for_advance,
)
.await;
});
@@ -277,6 +280,8 @@ pub(in crate::agents::pool) async fn run_server_owned_completion(
lock.remove(&key);
(tx, pr, wt, mfr)
};
// The completed session's ID is used to resume if gates fail.
let previous_session_id = session_id.clone();
// Emit Done so wait_for_agent unblocks.
let _ = tx.send(AgentEvent::Done {
@@ -299,6 +304,7 @@ pub(in crate::agents::pool) async fn run_server_owned_completion(
wt_path_for_advance,
watcher_tx,
merge_failure_reported_for_advance,
previous_session_id,
);
}
+57 -34
View File
@@ -21,14 +21,22 @@ impl AgentPool {
/// agent (story 190). If all coders are busy the call fails with an error
/// indicating the story will be picked up when one becomes available.
///
/// If `resume_context` is provided, it is appended to the rendered prompt
/// so the agent can pick up from a previous failed attempt.
/// If `resume_context` is provided and `session_id_to_resume` is `None`,
/// the context is appended to the rendered prompt so the agent can pick up
/// from a previous failed attempt.
///
/// If `session_id_to_resume` is provided, the agent is launched with
/// `--resume <session_id>` instead of `-p <full_prompt>`. Only
/// `resume_context` (if any) is sent as the new message. This lets
/// the agent re-enter the previous conversation without re-reading
/// CLAUDE.md and README, satisfying story 543.
pub async fn start_agent(
&self,
project_root: &Path,
story_id: &str,
agent_name: Option<&str>,
resume_context: Option<&str>,
session_id_to_resume: Option<String>,
) -> Result<AgentInfo, String> {
let config = ProjectConfig::load(project_root)?;
@@ -310,6 +318,7 @@ impl AgentPool {
let project_root_clone = project_root.to_path_buf();
let config_clone = config.clone();
let resume_context_owned = resume_context.map(str::to_string);
let session_id_to_resume_owned = session_id_to_resume;
let sid = story_id.to_string();
let aname = resolved_name.clone();
let tx_clone = tx.clone();
@@ -397,10 +406,21 @@ impl AgentPool {
}
};
// Append resume context if this is a restart with failure information.
if let Some(ctx) = resume_context_owned {
prompt.push_str(&ctx);
}
// Build the effective prompt; its content depends on whether a previous
// session is being resumed.
//
// When resuming a previous session, discard the full rendered prompt
// (which would make the agent re-read CLAUDE.md and README) and send only
// the gate failure context as the new message; if no context was supplied,
// that message is empty. On a fresh start, append the failure context to
// the original prompt as before.
let effective_prompt = match &session_id_to_resume_owned {
Some(_) => resume_context_owned.unwrap_or_default(),
None => {
if let Some(ctx) = resume_context_owned {
prompt.push_str(&ctx);
}
prompt
}
};
// Step 3: transition to Running now that the worktree is ready.
{
@@ -431,10 +451,11 @@ impl AgentPool {
agent_name: aname.clone(),
command,
args,
prompt,
prompt: effective_prompt,
cwd: wt_path_str,
inactivity_timeout_secs,
mcp_port: port_for_task,
session_id_to_resume: session_id_to_resume_owned.clone(),
};
runtime
.start(ctx, tx_clone.clone(), log_clone.clone(), log_writer_clone)
@@ -447,10 +468,11 @@ impl AgentPool {
agent_name: aname.clone(),
command,
args,
prompt,
prompt: effective_prompt,
cwd: wt_path_str,
inactivity_timeout_secs,
mcp_port: port_for_task,
session_id_to_resume: session_id_to_resume_owned.clone(),
};
runtime
.start(ctx, tx_clone.clone(), log_clone.clone(), log_writer_clone)
@@ -463,10 +485,11 @@ impl AgentPool {
agent_name: aname.clone(),
command,
args,
prompt,
prompt: effective_prompt,
cwd: wt_path_str,
inactivity_timeout_secs,
mcp_port: port_for_task,
session_id_to_resume: session_id_to_resume_owned,
};
runtime
.start(ctx, tx_clone.clone(), log_clone.clone(), log_writer_clone)
@@ -646,7 +669,7 @@ stage = "coder"
pool.inject_test_agent("other-story", "coder-1", AgentStatus::Running);
let result = pool
.start_agent(tmp.path(), "42_my_story", None, None)
.start_agent(tmp.path(), "42_my_story", None, None, None)
.await;
match result {
Ok(info) => {
@@ -688,7 +711,7 @@ stage = "coder"
pool.inject_test_agent("story-1", "coder-1", AgentStatus::Running);
pool.inject_test_agent("story-2", "coder-2", AgentStatus::Pending);
let result = pool.start_agent(tmp.path(), "story-3", None, None).await;
let result = pool.start_agent(tmp.path(), "story-3", None, None, None).await;
assert!(result.is_err());
let err = result.unwrap_err();
assert!(
@@ -720,7 +743,7 @@ stage = "coder"
let pool = AgentPool::new_test(3001);
pool.inject_test_agent("story-1", "coder-1", AgentStatus::Running);
let result = pool.start_agent(tmp.path(), "story-3", None, None).await;
let result = pool.start_agent(tmp.path(), "story-3", None, None, None).await;
assert!(result.is_err());
let err = result.unwrap_err();
@@ -758,7 +781,7 @@ stage = "coder"
let pool = AgentPool::new_test(3001);
let result = pool.start_agent(tmp.path(), "story-5", None, None).await;
let result = pool.start_agent(tmp.path(), "story-5", None, None, None).await;
match result {
Ok(_) => {}
Err(e) => {
@@ -793,7 +816,7 @@ stage = "coder"
pool.inject_test_agent("story-1", "coder-1", AgentStatus::Running);
let result = pool
.start_agent(tmp.path(), "story-2", Some("coder-1"), None)
.start_agent(tmp.path(), "story-2", Some("coder-1"), None, None)
.await;
assert!(result.is_err());
let err = result.unwrap_err();
@@ -819,7 +842,7 @@ stage = "coder"
let pool = AgentPool::new_test(3001);
pool.inject_test_agent("story-a", "qa", AgentStatus::Running);
let result = pool.start_agent(root, "story-b", Some("qa"), None).await;
let result = pool.start_agent(root, "story-b", Some("qa"), None, None).await;
assert!(
result.is_err(),
@@ -846,7 +869,7 @@ stage = "coder"
let pool = AgentPool::new_test(3001);
pool.inject_test_agent("story-a", "qa", AgentStatus::Completed);
let result = pool.start_agent(root, "story-b", Some("qa"), None).await;
let result = pool.start_agent(root, "story-b", Some("qa"), None, None).await;
if let Err(ref e) = result {
assert!(
@@ -880,7 +903,7 @@ stage = "coder"
let pool = AgentPool::new_test(3099);
let result = pool
.start_agent(root, "50_story_test", Some("coder-1"), None)
.start_agent(root, "50_story_test", Some("coder-1"), None, None)
.await;
assert!(
@@ -938,7 +961,7 @@ stage = "coder"
let pool = AgentPool::new_test(3099);
pool.inject_test_agent("story-x", "qa", AgentStatus::Running);
let result = pool.start_agent(root, "story-y", Some("qa"), None).await;
let result = pool.start_agent(root, "story-y", Some("qa"), None, None).await;
assert!(result.is_err());
let err = result.unwrap_err();
@@ -969,7 +992,7 @@ stage = "coder"
pool.inject_test_agent("86_story_foo", "coder-1", AgentStatus::Pending);
let result = pool
.start_agent(root, "130_story_bar", Some("coder-1"), None)
.start_agent(root, "130_story_bar", Some("coder-1"), None, None)
.await;
assert!(result.is_err(), "second start_agent must be rejected");
@@ -1012,7 +1035,7 @@ stage = "coder"
let root1 = root.clone();
let t1 = tokio::spawn(async move {
pool1
.start_agent(&root1, "86_story_foo", Some("coder-1"), None)
.start_agent(&root1, "86_story_foo", Some("coder-1"), None, None)
.await
});
@@ -1020,7 +1043,7 @@ stage = "coder"
let root2 = root.clone();
let t2 = tokio::spawn(async move {
pool2
.start_agent(&root2, "130_story_bar", Some("coder-1"), None)
.start_agent(&root2, "130_story_bar", Some("coder-1"), None, None)
.await
});
@@ -1065,7 +1088,7 @@ stage = "coder"
pool.inject_test_agent("42_story_foo", "coder-1", AgentStatus::Running);
let result = pool
.start_agent(root, "42_story_foo", Some("coder-2"), None)
.start_agent(root, "42_story_foo", Some("coder-2"), None, None)
.await;
assert!(
@@ -1103,7 +1126,7 @@ stage = "coder"
pool.inject_test_agent("55_story_bar", "qa-1", AgentStatus::Running);
let result = pool
.start_agent(root, "55_story_bar", Some("qa-2"), None)
.start_agent(root, "55_story_bar", Some("qa-2"), None, None)
.await;
assert!(result.is_err(), "second qa on same story must be rejected");
@@ -1141,7 +1164,7 @@ stage = "coder"
let root1 = root.clone();
let t1 = tokio::spawn(async move {
pool1
.start_agent(&root1, "42_story_foo", Some("coder-1"), None)
.start_agent(&root1, "42_story_foo", Some("coder-1"), None, None)
.await
});
@@ -1149,7 +1172,7 @@ stage = "coder"
let root2 = root.clone();
let t2 = tokio::spawn(async move {
pool2
.start_agent(&root2, "42_story_foo", Some("coder-2"), None)
.start_agent(&root2, "42_story_foo", Some("coder-2"), None, None)
.await
});
@@ -1193,7 +1216,7 @@ stage = "coder"
pool.inject_test_agent("42_story_foo", "coder-1", AgentStatus::Running);
let result = pool
.start_agent(root, "99_story_baz", Some("coder-2"), None)
.start_agent(root, "99_story_baz", Some("coder-2"), None, None)
.await;
if let Err(ref e) = result {
@@ -1231,7 +1254,7 @@ stage = "coder"
let pool = AgentPool::new_test(3099);
let result = pool
.start_agent(root, "310_story_foo", Some("mergemaster"), None)
.start_agent(root, "310_story_foo", Some("mergemaster"), None, None)
.await;
assert!(
@@ -1269,7 +1292,7 @@ stage = "coder"
let pool = AgentPool::new_test(3099);
let result = pool
.start_agent(root, "8842_story_qa_guard", Some("coder-1"), None)
.start_agent(root, "8842_story_qa_guard", Some("coder-1"), None, None)
.await;
assert!(
@@ -1307,7 +1330,7 @@ stage = "coder"
let pool = AgentPool::new_test(3099);
let result = pool
.start_agent(root, "55_story_baz", Some("qa"), None)
.start_agent(root, "55_story_baz", Some("qa"), None, None)
.await;
assert!(
@@ -1343,7 +1366,7 @@ stage = "coder"
let pool = AgentPool::new_test(3099);
let result = pool
.start_agent(root, "77_story_sup", Some("supervisor"), None)
.start_agent(root, "77_story_sup", Some("supervisor"), None, None)
.await;
match result {
@@ -1379,7 +1402,7 @@ stage = "coder"
let pool = AgentPool::new_test(3099);
let result = pool
.start_agent(root, "88_story_ok", Some("mergemaster"), None)
.start_agent(root, "88_story_ok", Some("mergemaster"), None, None)
.await;
match result {
@@ -1435,7 +1458,7 @@ stage = "coder"
let pool = AgentPool::new_test(3098);
let result = pool
.start_agent(root, "502_story_split_brain", Some("mergemaster"), None)
.start_agent(root, "502_story_split_brain", Some("mergemaster"), None, None)
.await;
// Stage check must not reject mergemaster.
@@ -1494,7 +1517,7 @@ stage = "coder"
pool.inject_test_agent("other-story", "coder-sonnet", AgentStatus::Running);
let result = pool
.start_agent(tmp.path(), "368_story_test", None, None)
.start_agent(tmp.path(), "368_story_test", None, None, None)
.await;
match result {
Ok(info) => {
@@ -1557,7 +1580,7 @@ stage = "coder"
pool.inject_test_agent("other-story", "coder-opus", AgentStatus::Running);
let result = pool
.start_agent(tmp.path(), "368_story_test", None, None)
.start_agent(tmp.path(), "368_story_test", None, None, None)
.await;
assert!(result.is_err(), "expected error when preferred agent is busy");
let err = result.unwrap_err();