//! Agent start — spawns a new agent process in a worktree for a given story.

use crate::agent_log::AgentLogWriter;
use crate::config::ProjectConfig;
use crate::slog_error;
use std::path::Path;
use std::sync::{Arc, Mutex};
use tokio::sync::broadcast;

use super::super::runtime::{
    AgentRuntime, ClaudeCodeRuntime, GeminiRuntime, OpenAiRuntime, RuntimeContext,
};
use super::super::{
    AgentEvent, AgentInfo, AgentStatus, PipelineStage, agent_config_stage, pipeline_stage,
};
use super::types::{PendingGuard, StoryAgent, composite_key};
use super::worktree::find_active_story_stage;
use super::{AgentPool, auto_assign};

impl AgentPool {
    /// Start an agent for a story: load config, create worktree, spawn agent.
    ///
    /// When `agent_name` is `None`, automatically selects the first idle coder
    /// agent (story 190). If all coders are busy the call fails with an error
    /// indicating the story will be picked up when one becomes available.
    ///
    /// If `resume_context` is provided and `session_id_to_resume` is `None`,
    /// the context is appended to the rendered prompt so the agent can pick up
    /// from a previous failed attempt.
    ///
    /// If `session_id_to_resume` is provided, the agent is launched with
    /// `--resume <session_id>` instead of `-p <prompt>`. Only
    /// `resume_context` (if any) is sent as the new message. This lets
    /// the agent re-enter the previous conversation without re-reading
    /// CLAUDE.md and README, satisfying story 543.
    pub async fn start_agent(
        &self,
        project_root: &Path,
        story_id: &str,
        agent_name: Option<&str>,
        resume_context: Option<&str>,
        session_id_to_resume: Option<String>,
    ) -> Result<AgentInfo, String> {
        let config = ProjectConfig::load(project_root)?;

        // Validate explicit agent name early (no lock needed).
        if let Some(name) = agent_name {
            config
                .find_agent(name)
                .ok_or_else(|| format!("No agent named '{name}' in config"))?;
        }

        // Create name-independent shared resources before the lock so they are
        // ready for the atomic check-and-insert (story 132).
        let (tx, _) = broadcast::channel::<AgentEvent>(1024);
        let event_log: Arc<Mutex<Vec<AgentEvent>>> = Arc::new(Mutex::new(Vec::new()));
        let log_session_id = uuid::Uuid::new_v4().to_string();

        // Move story from backlog/ to current/ before checking agent
        // availability so that auto_assign_available_work can pick it up even
        // when all coders are currently busy (story 203). Only do this for
        // Coder-stage agents — QA and Mergemaster must attach to the story
        // at its existing stage (3_qa or 4_merge) and must NOT be demoted
        // back to 2_current/ on attach (bug 502). When `agent_name` is None
        // we are auto-selecting an idle coder, so still move.
        let starting_a_coder = agent_name
            .and_then(|n| config.find_agent(n).map(agent_config_stage))
            .map(|s| s == PipelineStage::Coder)
            .unwrap_or(true);
        if starting_a_coder {
            crate::agents::lifecycle::move_story_to_current(project_root, story_id)?;
        }
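        // For illustration (story id and agent names hypothetical):
        // `start_agent(root, "42_story", None, ..)` auto-selects an idle coder
        // and first moves the story into work/2_current/, while
        // `start_agent(root, "42_story", Some("qa"), ..)` attaches wherever
        // the story already sits (3_qa/ or 4_merge/) without moving it.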
        // Validate that the agent's configured stage matches the story's
        // pipeline stage. This prevents any caller (auto-assign, MCP tool,
        // pipeline advance, supervisor) from starting a wrong-stage agent on
        // a story — e.g. mergemaster on a coding-stage story (bug 312).
        if let Some(name) = agent_name {
            let agent_stage = config
                .find_agent(name)
                .map(agent_config_stage)
                .unwrap_or_else(|| pipeline_stage(name));
            if agent_stage != PipelineStage::Other
                && let Some(story_stage_dir) = find_active_story_stage(project_root, story_id)
            {
                let expected_stage = match story_stage_dir {
                    "2_current" => PipelineStage::Coder,
                    "3_qa" => PipelineStage::Qa,
                    "4_merge" => PipelineStage::Mergemaster,
                    _ => PipelineStage::Other,
                };
                if expected_stage != PipelineStage::Other && expected_stage != agent_stage {
                    return Err(format!(
                        "Agent '{name}' (stage: {agent_stage:?}) cannot be assigned to \
                         story '{story_id}' in {story_stage_dir}/ (requires stage: {expected_stage:?})"
                    ));
                }
            }
        }

        // Read the preferred agent from the story's front matter before acquiring
        // the lock. When no explicit agent_name is given, this lets start_agent
        // honour `agent: coder-opus` written by the `assign` command — mirroring
        // the auto_assign path (bug 379).
        let front_matter_agent: Option<String> = if agent_name.is_none() {
            crate::db::read_content(story_id).and_then(|contents| {
                crate::io::story_metadata::parse_front_matter(&contents)
                    .ok()?
                    .agent
            })
        } else {
            None
        };

        // Atomically resolve agent name, check availability, and register as
        // Pending. When `agent_name` is `None` the first idle coder is
        // selected inside the lock so no TOCTOU race can occur between the
        // availability check and the Pending insert (story 132, story 190).
        //
        // The `PendingGuard` ensures that if any step below fails the entry is
        // removed from the pool so it does not permanently block auto-assign
        // (bug 118).
        let resolved_name: String;
        let key: String;
        {
            let mut agents = self.agents.lock().map_err(|e| e.to_string())?;
            resolved_name = match agent_name {
                Some(name) => name.to_string(),
                None => {
                    // Honour the `agent:` field in the story's front matter so that
                    // `start 368` after `assign 368 opus` picks the right agent
                    // (bug 379). Mirrors the auto_assign selection logic.
                    if let Some(ref pref) = front_matter_agent {
                        let stage_matches = config
                            .find_agent(pref)
                            .map(|cfg| agent_config_stage(cfg) == PipelineStage::Coder)
                            .unwrap_or(false);
                        if stage_matches {
                            if auto_assign::is_agent_free(&agents, pref) {
                                pref.clone()
                            } else {
                                return Err(format!(
                                    "Preferred agent '{pref}' from story front matter is busy; \
                                     story '{story_id}' has been queued in work/2_current/ and will \
                                     be auto-assigned when it becomes available"
                                ));
                            }
                        } else {
                            // Stage mismatch — fall back to any free coder.
                            auto_assign::find_free_agent_for_stage(
                                &config,
                                &agents,
                                &PipelineStage::Coder,
                            )
                            .map(|s| s.to_string())
                            .ok_or_else(|| {
                                if config
                                    .agent
                                    .iter()
                                    .any(|a| agent_config_stage(a) == PipelineStage::Coder)
                                {
                                    format!(
                                        "All coder agents are busy; story '{story_id}' has been \
                                         queued in work/2_current/ and will be auto-assigned when \
                                         one becomes available"
                                    )
                                } else {
                                    "No coder agent configured. Specify an agent_name explicitly."
                                        .to_string()
                                }
                            })?
                        }
                    } else {
                        auto_assign::find_free_agent_for_stage(
                            &config,
                            &agents,
                            &PipelineStage::Coder,
                        )
                        .map(|s| s.to_string())
                        .ok_or_else(|| {
                            if config
                                .agent
                                .iter()
                                .any(|a| agent_config_stage(a) == PipelineStage::Coder)
                            {
                                format!(
                                    "All coder agents are busy; story '{story_id}' has been \
                                     queued in work/2_current/ and will be auto-assigned when \
                                     one becomes available"
                                )
                            } else {
                                "No coder agent configured. Specify an agent_name explicitly."
                                    .to_string()
                            }
                        })?
                    }
                }
            };
            key = composite_key(story_id, &resolved_name);
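            // `composite_key` evidently produces "story_id:agent_name"; the
            // `rsplit_once(':')` calls below rely on that shape to recover the
            // story id from a pool key.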
            // Check for duplicate assignment (same story + same agent already active).
            if let Some(agent) = agents.get(&key)
                && (agent.status == AgentStatus::Running || agent.status == AgentStatus::Pending)
            {
                return Err(format!(
                    "Agent '{resolved_name}' for story '{story_id}' is already {}",
                    agent.status
                ));
            }

            // Enforce single-stage concurrency: reject if there is already a
            // Running/Pending agent at the same pipeline stage for this story.
            // This prevents two coders (or two QA/mergemaster agents) from
            // corrupting each other's work in the same worktree.
            // Applies to both explicit and auto-selected agents; the Other
            // stage (supervisors, unknown agents) is exempt.
            let resolved_stage = config
                .find_agent(&resolved_name)
                .map(agent_config_stage)
                .unwrap_or_else(|| pipeline_stage(&resolved_name));
            if resolved_stage != PipelineStage::Other
                && let Some(conflicting_name) = agents.iter().find_map(|(k, a)| {
                    let k_story = k.rsplit_once(':').map(|(s, _)| s).unwrap_or(k);
                    if k_story == story_id
                        && a.agent_name != resolved_name
                        && matches!(a.status, AgentStatus::Running | AgentStatus::Pending)
                    {
                        let a_stage = config
                            .find_agent(&a.agent_name)
                            .map(agent_config_stage)
                            .unwrap_or_else(|| pipeline_stage(&a.agent_name));
                        if a_stage == resolved_stage {
                            Some(a.agent_name.clone())
                        } else {
                            None
                        }
                    } else {
                        None
                    }
                })
            {
                return Err(format!(
                    "Cannot start '{resolved_name}' on story '{story_id}': \
                     '{conflicting_name}' is already active at the same pipeline stage"
                ));
            }

            // Enforce single-instance concurrency for explicitly-named agents:
            // if this agent is already running on any other story, reject.
            // Auto-selected agents are already guaranteed idle by
            // find_free_agent_for_stage, so this check is only needed for
            // explicit requests.
            if agent_name.is_some()
                && let Some(busy_story) = agents.iter().find_map(|(k, a)| {
                    if a.agent_name == resolved_name
                        && matches!(a.status, AgentStatus::Running | AgentStatus::Pending)
                    {
                        Some(
                            k.rsplit_once(':')
                                .map(|(sid, _)| sid)
                                .unwrap_or(k)
                                .to_string(),
                        )
                    } else {
                        None
                    }
                })
            {
                return Err(format!(
                    "Agent '{resolved_name}' is already running on story '{busy_story}'; \
                     story '{story_id}' will be picked up when the agent becomes available"
                ));
            }

            agents.insert(
                key.clone(),
                StoryAgent {
                    agent_name: resolved_name.clone(),
                    status: AgentStatus::Pending,
                    worktree_info: None,
                    session_id: None,
                    tx: tx.clone(),
                    task_handle: None,
                    event_log: event_log.clone(),
                    completion: None,
                    project_root: Some(project_root.to_path_buf()),
                    log_session_id: Some(log_session_id.clone()),
                    merge_failure_reported: false,
                    throttled: false,
                },
            );
        }
        let mut pending_guard = PendingGuard::new(self.agents.clone(), key.clone());

        // Create persistent log writer (needs resolved_name, so must be after
        // the atomic resolution above).
        let log_writer =
            match AgentLogWriter::new(project_root, story_id, &resolved_name, &log_session_id) {
                Ok(w) => Some(Arc::new(Mutex::new(w))),
                Err(e) => {
                    eprintln!(
                        "[agents] Failed to create log writer for {story_id}:{resolved_name}: {e}"
                    );
                    None
                }
            };

        // Notify WebSocket clients that a new agent is pending.
        Self::notify_agent_state_changed(&self.watcher_tx);
        let _ = tx.send(AgentEvent::Status {
            story_id: story_id.to_string(),
            agent_name: resolved_name.clone(),
            status: "pending".to_string(),
        });

        // Extract inactivity timeout from the agent config before cloning config.
        let inactivity_timeout_secs = config
            .find_agent(&resolved_name)
            .map(|a| a.inactivity_timeout_secs)
            .unwrap_or(300);
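        // `tokio::spawn` requires a `'static` future, so the task below cannot
        // borrow from this stack frame; every captured value is cloned into an
        // owned copy first.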
        // Clone all values needed inside the background spawn.
        let project_root_clone = project_root.to_path_buf();
        let config_clone = config.clone();
        let resume_context_owned = resume_context.map(str::to_string);
        let session_id_to_resume_owned = session_id_to_resume;
        let sid = story_id.to_string();
        let aname = resolved_name.clone();
        let tx_clone = tx.clone();
        let agents_ref = self.agents.clone();
        let key_clone = key.clone();
        let log_clone = event_log.clone();
        let port_for_task = self.port;
        let log_writer_clone = log_writer.clone();
        let child_killers_clone = self.child_killers.clone();
        let watcher_tx_clone = self.watcher_tx.clone();
        let merge_jobs_clone = Arc::clone(&self.merge_jobs);

        // Spawn the background task. Worktree creation and agent launch happen here
        // so `start_agent` returns immediately after registering the agent as
        // Pending — non-blocking by design (story 157).
        let handle = tokio::spawn(async move {
            // Step 1: create the worktree (slow — git checkout, pnpm install, etc.)
            let wt_info = match crate::worktree::create_worktree(
                &project_root_clone,
                &sid,
                &config_clone,
                port_for_task,
            )
            .await
            {
                Ok(wt) => wt,
                Err(e) => {
                    let error_msg = format!("Failed to create worktree: {e}");
                    slog_error!("[agents] {error_msg}");
                    let event = AgentEvent::Error {
                        story_id: sid.clone(),
                        agent_name: aname.clone(),
                        message: error_msg,
                    };
                    if let Ok(mut log) = log_clone.lock() {
                        log.push(event.clone());
                    }
                    let _ = tx_clone.send(event);
                    if let Ok(mut agents) = agents_ref.lock()
                        && let Some(agent) = agents.get_mut(&key_clone)
                    {
                        agent.status = AgentStatus::Failed;
                    }
                    AgentPool::notify_agent_state_changed(&watcher_tx_clone);
                    return;
                }
            };

            // Step 2: store worktree info and render agent command/args/prompt.
            let wt_path_str = wt_info.path.to_string_lossy().to_string();
            {
                if let Ok(mut agents) = agents_ref.lock()
                    && let Some(agent) = agents.get_mut(&key_clone)
                {
                    agent.worktree_info = Some(wt_info.clone());
                }
            }
            let (command, args, mut prompt) = match config_clone.render_agent_args(
                &wt_path_str,
                &sid,
                Some(&aname),
                Some(&wt_info.base_branch),
            ) {
                Ok(result) => result,
                Err(e) => {
                    let error_msg = format!("Failed to render agent args: {e}");
                    slog_error!("[agents] {error_msg}");
                    let event = AgentEvent::Error {
                        story_id: sid.clone(),
                        agent_name: aname.clone(),
                        message: error_msg,
                    };
                    if let Ok(mut log) = log_clone.lock() {
                        log.push(event.clone());
                    }
                    let _ = tx_clone.send(event);
                    if let Ok(mut agents) = agents_ref.lock()
                        && let Some(agent) = agents.get_mut(&key_clone)
                    {
                        agent.status = AgentStatus::Failed;
                    }
                    AgentPool::notify_agent_state_changed(&watcher_tx_clone);
                    return;
                }
            };

            // Build the effective prompt and determine resume session.
            //
            // When resuming a previous session, discard the full rendered prompt
            // (which would re-read CLAUDE.md and README) and send only the gate
            // failure context as a new message. On a fresh start, append the
            // failure context to the original prompt as before.
            let effective_prompt = match &session_id_to_resume_owned {
                Some(_) => resume_context_owned.unwrap_or_default(),
                None => {
                    if let Some(ctx) = resume_context_owned {
                        prompt.push_str(&ctx);
                    }
                    prompt
                }
            };

            // Step 3: transition to Running now that the worktree is ready.
            {
                if let Ok(mut agents) = agents_ref.lock()
                    && let Some(agent) = agents.get_mut(&key_clone)
                {
                    agent.status = AgentStatus::Running;
                }
            }
            let _ = tx_clone.send(AgentEvent::Status {
                story_id: sid.clone(),
                agent_name: aname.clone(),
                status: "running".to_string(),
            });
            AgentPool::notify_agent_state_changed(&watcher_tx_clone);
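            // A hypothetical project.toml entry selecting a non-default
            // runtime for the dispatch in Step 4 below (the `runtime` field
            // falls back to "claude-code" when omitted):
            //
            //   [[agent]]
            //   name = "coder-1"
            //   stage = "coder"
            //   runtime = "gemini"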
            // Step 4: launch the agent process via the configured runtime.
            let runtime_name = config_clone
                .find_agent(&aname)
                .and_then(|a| a.runtime.as_deref())
                .unwrap_or("claude-code");
            let run_result = match runtime_name {
                "claude-code" => {
                    let runtime = ClaudeCodeRuntime::new(
                        child_killers_clone.clone(),
                        watcher_tx_clone.clone(),
                    );
                    let ctx = RuntimeContext {
                        story_id: sid.clone(),
                        agent_name: aname.clone(),
                        command,
                        args,
                        prompt: effective_prompt,
                        cwd: wt_path_str,
                        inactivity_timeout_secs,
                        mcp_port: port_for_task,
                        session_id_to_resume: session_id_to_resume_owned.clone(),
                    };
                    runtime
                        .start(ctx, tx_clone.clone(), log_clone.clone(), log_writer_clone)
                        .await
                }
                "gemini" => {
                    let runtime = GeminiRuntime::new();
                    let ctx = RuntimeContext {
                        story_id: sid.clone(),
                        agent_name: aname.clone(),
                        command,
                        args,
                        prompt: effective_prompt,
                        cwd: wt_path_str,
                        inactivity_timeout_secs,
                        mcp_port: port_for_task,
                        session_id_to_resume: session_id_to_resume_owned.clone(),
                    };
                    runtime
                        .start(ctx, tx_clone.clone(), log_clone.clone(), log_writer_clone)
                        .await
                }
                "openai" => {
                    let runtime = OpenAiRuntime::new();
                    let ctx = RuntimeContext {
                        story_id: sid.clone(),
                        agent_name: aname.clone(),
                        command,
                        args,
                        prompt: effective_prompt,
                        cwd: wt_path_str,
                        inactivity_timeout_secs,
                        mcp_port: port_for_task,
                        session_id_to_resume: session_id_to_resume_owned,
                    };
                    runtime
                        .start(ctx, tx_clone.clone(), log_clone.clone(), log_writer_clone)
                        .await
                }
                other => Err(format!(
                    "Unknown agent runtime '{other}'; check the 'runtime' field in project.toml. \
                     Supported: 'claude-code', 'gemini', 'openai'"
                )),
            };

            match run_result {
                Ok(result) => {
                    // Persist token usage if the agent reported it.
                    if let Some(ref usage) = result.token_usage
                        && let Ok(agents) = agents_ref.lock()
                        && let Some(agent) = agents.get(&key_clone)
                        && let Some(ref pr) = agent.project_root
                    {
                        let model = config_clone
                            .find_agent(&aname)
                            .and_then(|a| a.model.clone());
                        let record = crate::agents::token_usage::build_record(
                            &sid,
                            &aname,
                            model,
                            usage.clone(),
                        );
                        if let Err(e) = crate::agents::token_usage::append_record(pr, &record) {
                            slog_error!(
                                "[agents] Failed to persist token usage for \
                                 {sid}:{aname}: {e}"
                            );
                        }
                    }

                    // Mergemaster agents have their own completion path via
                    // start_merge_agent_work / run_merge_pipeline and must NOT go
                    // through server-owned gates. When a mergemaster exits early
                    // (e.g. rate-limited before calling start_merge_agent_work) the
                    // feature-branch worktree compiles fine and post-merge tests on
                    // master pass (nothing changed), which would wrongly advance the
                    // story to 5_done/ without any squash merge having occurred.
                    // Instead: just remove the agent from the pool and let
                    // auto-assign restart a new mergemaster for the story.
                    let stage = config_clone
                        .find_agent(&aname)
                        .map(agent_config_stage)
                        .unwrap_or_else(|| pipeline_stage(&aname));
                    if stage == PipelineStage::Mergemaster {
                        let (tx_done, done_session_id) = {
                            let mut lock = match agents_ref.lock() {
                                Ok(a) => a,
                                Err(_) => return,
                            };
                            if let Some(agent) = lock.remove(&key_clone) {
                                (agent.tx, agent.session_id.or(result.session_id))
                            } else {
                                (tx_clone.clone(), result.session_id)
                            }
                        };
                        // Clear any stale Running merge job so the next mergemaster
                        // can call start_merge_agent_work without hitting "Merge
                        // already in progress" (bug 498).
                        if let Ok(mut jobs) = merge_jobs_clone.lock()
                            && let Some(job) = jobs.get(&sid)
                            && matches!(job.status, crate::agents::merge::MergeJobStatus::Running)
                        {
                            jobs.remove(&sid);
                        }

                        let _ = tx_done.send(AgentEvent::Done {
                            story_id: sid.clone(),
                            agent_name: aname.clone(),
                            session_id: done_session_id,
                        });
                        AgentPool::notify_agent_state_changed(&watcher_tx_clone);

                        // Send a WorkItem event so the auto-assign watcher loop
                        // re-dispatches a new mergemaster if the story still needs
                        // merging. This avoids an async call to start_agent inside
                        // a tokio::spawn (which would require Send).
                        let _ = watcher_tx_clone.send(crate::io::watcher::WatcherEvent::WorkItem {
                            stage: "4_merge".to_string(),
                            item_id: sid.clone(),
                            action: "reassign".to_string(),
                            commit_msg: String::new(),
                            from_stage: None,
                        });
                    } else {
                        // Server-owned completion: run acceptance gates automatically
                        // when the agent process exits normally.
                        super::pipeline::run_server_owned_completion(
                            &agents_ref,
                            port_for_task,
                            &sid,
                            &aname,
                            result.session_id,
                            watcher_tx_clone.clone(),
                        )
                        .await;
                        AgentPool::notify_agent_state_changed(&watcher_tx_clone);
                    }
                }
                Err(e) => {
                    slog_error!("[agents] Agent process error for {aname} on {sid}: {e}");
                    let event = AgentEvent::Error {
                        story_id: sid.clone(),
                        agent_name: aname.clone(),
                        message: e,
                    };
                    if let Ok(mut log) = log_clone.lock() {
                        log.push(event.clone());
                    }
                    let _ = tx_clone.send(event);
                    if let Ok(mut agents) = agents_ref.lock()
                        && let Some(agent) = agents.get_mut(&key_clone)
                    {
                        agent.status = AgentStatus::Failed;
                    }
                    AgentPool::notify_agent_state_changed(&watcher_tx_clone);
                }
            }
        });

        // Store the task handle while the agent is still Pending.
        {
            let mut agents = self.agents.lock().map_err(|e| e.to_string())?;
            if let Some(agent) = agents.get_mut(&key) {
                agent.task_handle = Some(handle);
            }
        }
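        // (Presumably the stored handle lets stop/cleanup paths abort or await
        // the background task later; nothing in this function uses it again.)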
        // Agent successfully spawned — prevent the guard from removing the entry.
        pending_guard.disarm();

        Ok(AgentInfo {
            story_id: story_id.to_string(),
            agent_name: resolved_name,
            status: AgentStatus::Pending,
            session_id: None,
            worktree_path: None,
            base_branch: None,
            completion: None,
            log_session_id: Some(log_session_id),
            throttled: false,
        })
    }
}

#[cfg(test)]
mod tests {
    use super::super::AgentPool;
    use crate::agents::{AgentEvent, AgentStatus};

    #[tokio::test]
    async fn start_agent_auto_selects_second_coder_when_first_busy() {
        let tmp = tempfile::tempdir().unwrap();
        let sk = tmp.path().join(".huskies");
        std::fs::create_dir_all(&sk).unwrap();
        std::fs::write(
            sk.join("project.toml"),
            r#"
[[agent]]
name = "supervisor"
stage = "other"

[[agent]]
name = "coder-1"
stage = "coder"

[[agent]]
name = "coder-2"
stage = "coder"
"#,
        )
        .unwrap();

        let pool = AgentPool::new_test(3001);
        pool.inject_test_agent("other-story", "coder-1", AgentStatus::Running);

        let result = pool
            .start_agent(tmp.path(), "42_my_story", None, None, None)
            .await;

        match result {
            Ok(info) => {
                assert_eq!(info.agent_name, "coder-2");
            }
            Err(err) => {
                assert!(
                    !err.contains("All coder agents are busy"),
                    "should have selected coder-2 but got: {err}"
                );
                assert!(
                    !err.contains("No coder agent configured"),
                    "should not fail on agent selection, got: {err}"
                );
            }
        }
    }

    #[tokio::test]
    async fn start_agent_returns_busy_when_all_coders_occupied() {
        let tmp = tempfile::tempdir().unwrap();
        let sk = tmp.path().join(".huskies");
        std::fs::create_dir_all(&sk).unwrap();
        std::fs::write(
            sk.join("project.toml"),
            r#"
[[agent]]
name = "coder-1"
stage = "coder"

[[agent]]
name = "coder-2"
stage = "coder"
"#,
        )
        .unwrap();

        let pool = AgentPool::new_test(3001);
        pool.inject_test_agent("story-1", "coder-1", AgentStatus::Running);
        pool.inject_test_agent("story-2", "coder-2", AgentStatus::Pending);

        let result = pool
            .start_agent(tmp.path(), "story-3", None, None, None)
            .await;

        assert!(result.is_err());
        let err = result.unwrap_err();
        assert!(
            err.contains("All coder agents are busy"),
            "expected busy error, got: {err}"
        );
    }

    #[tokio::test]
    async fn start_agent_moves_story_to_current_when_coders_busy() {
        let tmp = tempfile::tempdir().unwrap();
        let sk = tmp.path().join(".huskies");
        let backlog = sk.join("work/1_backlog");
        std::fs::create_dir_all(&backlog).unwrap();
        std::fs::write(
            sk.join("project.toml"),
            r#"
[[agent]]
name = "coder-1"
stage = "coder"
"#,
        )
        .unwrap();
        let story_content = "---\nname: Story 3\n---\n";
        std::fs::write(backlog.join("story-3.md"), story_content).unwrap();
        crate::db::ensure_content_store();
        crate::db::write_content("story-3", story_content);

        let pool = AgentPool::new_test(3001);
        pool.inject_test_agent("story-1", "coder-1", AgentStatus::Running);

        let result = pool
            .start_agent(tmp.path(), "story-3", None, None, None)
            .await;

        assert!(result.is_err());
        let err = result.unwrap_err();
        assert!(
            err.contains("All coder agents are busy"),
            "expected busy error, got: {err}"
        );
        assert!(
            err.contains("queued in work/2_current/"),
            "expected story-to-current message, got: {err}"
        );
        // The lifecycle function updates the content store (not the filesystem),
        // so verify the move via the DB.
        let content = crate::db::read_content("story-3")
            .expect("story-3 should be in content store after move to current");
        assert!(
            content.contains("name: Story 3"),
            "story-3 content should be preserved after move"
        );
    }

    #[tokio::test]
    async fn start_agent_story_already_in_current_is_noop() {
        let tmp = tempfile::tempdir().unwrap();
        let sk = tmp.path().join(".huskies");
        let current = sk.join("work/2_current");
        std::fs::create_dir_all(&current).unwrap();
        std::fs::write(
            sk.join("project.toml"),
            "[[agent]]\nname = \"coder-1\"\nstage = \"coder\"\n",
        )
        .unwrap();
        std::fs::write(current.join("story-5.md"), "---\nname: Story 5\n---\n").unwrap();

        let pool = AgentPool::new_test(3001);
        let result = pool
            .start_agent(tmp.path(), "story-5", None, None, None)
            .await;

        match result {
            Ok(_) => {}
            Err(e) => {
                assert!(
                    !e.contains("Failed to move"),
                    "should not fail on idempotent move, got: {e}"
                );
            }
        }
    }

    #[tokio::test]
    async fn start_agent_explicit_name_unchanged_when_busy() {
        let tmp = tempfile::tempdir().unwrap();
        let sk = tmp.path().join(".huskies");
        std::fs::create_dir_all(&sk).unwrap();
        std::fs::write(
            sk.join("project.toml"),
            r#"
[[agent]]
name = "coder-1"
stage = "coder"

[[agent]]
name = "coder-2"
stage = "coder"
"#,
        )
        .unwrap();

        let pool = AgentPool::new_test(3001);
        pool.inject_test_agent("story-1", "coder-1", AgentStatus::Running);

        let result = pool
            .start_agent(tmp.path(), "story-2", Some("coder-1"), None, None)
            .await;

        assert!(result.is_err());
        let err = result.unwrap_err();
        assert!(
            err.contains("coder-1") && err.contains("already running"),
            "expected explicit busy error, got: {err}"
        );
    }

    // ── start_agent single-instance concurrency tests ─────────────────────────

    #[tokio::test]
    async fn start_agent_rejects_when_same_agent_already_running_on_another_story() {
        use std::fs;
        let tmp = tempfile::tempdir().unwrap();
        let root = tmp.path();
        let sk_dir = root.join(".huskies");
        fs::create_dir_all(&sk_dir).unwrap();
        fs::write(sk_dir.join("project.toml"), "[[agent]]\nname = \"qa\"\n").unwrap();

        let pool = AgentPool::new_test(3001);
        pool.inject_test_agent("story-a", "qa", AgentStatus::Running);

        let result = pool
            .start_agent(root, "story-b", Some("qa"), None, None)
            .await;

        assert!(
            result.is_err(),
            "start_agent should fail when qa is already running on another story"
        );
        let err = result.unwrap_err();
        assert!(
            err.contains("already running") || err.contains("becomes available"),
            "error message should explain why: got '{err}'"
        );
    }

    #[tokio::test]
    async fn start_agent_allows_new_story_when_previous_run_is_completed() {
        use std::fs;
        let tmp = tempfile::tempdir().unwrap();
        let root = tmp.path();
        let sk_dir = root.join(".huskies");
        fs::create_dir_all(&sk_dir).unwrap();
        fs::write(sk_dir.join("project.toml"), "[[agent]]\nname = \"qa\"\n").unwrap();

        let pool = AgentPool::new_test(3001);
        pool.inject_test_agent("story-a", "qa", AgentStatus::Completed);

        let result = pool
            .start_agent(root, "story-b", Some("qa"), None, None)
            .await;

        if let Err(ref e) = result {
            assert!(
                !e.contains("already running") && !e.contains("becomes available"),
                "completed agent must not trigger the concurrency guard: got '{e}'"
            );
        }
    }

    // ── bug 118: pending entry cleanup on start_agent failure ────────────────

    #[tokio::test]
    async fn start_agent_cleans_up_pending_entry_on_failure() {
        use std::fs;
        let tmp = tempfile::tempdir().unwrap();
        let root = tmp.path();
        let sk_dir = root.join(".huskies");
        fs::create_dir_all(&sk_dir).unwrap();
        fs::write(
            sk_dir.join("project.toml"),
            "[[agent]]\nname = \"coder-1\"\nstage = \"coder\"\n",
        )
        .unwrap();
        let upcoming = root.join(".huskies/work/1_backlog");
        fs::create_dir_all(&upcoming).unwrap();
        fs::write(upcoming.join("50_story_test.md"), "---\nname: Test\n---\n").unwrap();

        let pool = AgentPool::new_test(3099);
        let result = pool
            .start_agent(root, "50_story_test", Some("coder-1"), None, None)
            .await;
        assert!(
            result.is_ok(),
            "start_agent should return Ok(Pending) immediately: {:?}",
            result.err()
        );
        assert_eq!(
            result.unwrap().status,
            AgentStatus::Pending,
            "initial status must be Pending"
        );

        let final_info = pool
            .wait_for_agent("50_story_test", "coder-1", 5000)
            .await
            .expect("wait_for_agent should not time out");
        assert_eq!(
            final_info.status,
            AgentStatus::Failed,
            "agent must transition to Failed after worktree creation error"
        );

        let agents = pool.agents.lock().unwrap();
        let failed_entry = agents
            .values()
            .find(|a| a.agent_name == "coder-1" && a.status == AgentStatus::Failed);
        assert!(
            failed_entry.is_some(),
            "agent pool must retain a Failed entry so the UI can show the error state"
        );
        drop(agents);

        let events = pool
            .drain_events("50_story_test", "coder-1")
            .expect("drain_events should succeed");
        let has_error_event = events.iter().any(|e| matches!(e, AgentEvent::Error { .. }));
        assert!(
            has_error_event,
            "event_log must contain AgentEvent::Error after worktree creation fails"
        );
    }

    #[tokio::test]
    async fn start_agent_guard_does_not_remove_running_entry() {
        use std::fs;
        let tmp = tempfile::tempdir().unwrap();
        let root = tmp.path();
        let sk_dir = root.join(".huskies");
        fs::create_dir_all(&sk_dir).unwrap();
        fs::write(sk_dir.join("project.toml"), "[[agent]]\nname = \"qa\"\n").unwrap();

        let pool = AgentPool::new_test(3099);
        pool.inject_test_agent("story-x", "qa", AgentStatus::Running);

        let result = pool
            .start_agent(root, "story-y", Some("qa"), None, None)
            .await;

        assert!(result.is_err());
        let err = result.unwrap_err();
        assert!(
            err.contains("already running") || err.contains("becomes available"),
            "running entry must survive: got '{err}'"
        );
    }

    // ── TOCTOU race-condition regression tests (story 132) ───────────────────

    #[tokio::test]
    async fn toctou_pending_entry_blocks_same_agent_on_different_story() {
        use std::fs;
        let tmp = tempfile::tempdir().unwrap();
        let root = tmp.path();
        let sk_dir = root.join(".huskies");
        fs::create_dir_all(&sk_dir).unwrap();
        fs::write(
            sk_dir.join("project.toml"),
            "[[agent]]\nname = \"coder-1\"\n",
        )
        .unwrap();

        let pool = AgentPool::new_test(3099);
        pool.inject_test_agent("86_story_foo", "coder-1", AgentStatus::Pending);

        let result = pool
            .start_agent(root, "130_story_bar", Some("coder-1"), None, None)
            .await;

        assert!(result.is_err(), "second start_agent must be rejected");
        let err = result.unwrap_err();
        assert!(
            err.contains("already running") || err.contains("becomes available"),
            "expected concurrency-rejection message, got: '{err}'"
        );
    }

    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
    async fn toctou_concurrent_start_agent_same_agent_exactly_one_concurrency_rejection() {
        use std::fs;
        use std::sync::Arc;
        let tmp = tempfile::tempdir().unwrap();
        let root = tmp.path().to_path_buf();
        let sk_dir = root.join(".huskies");
        fs::create_dir_all(sk_dir.join("work/1_backlog")).unwrap();
        fs::write(
            root.join(".huskies/project.toml"),
            "[[agent]]\nname = \"coder-1\"\n",
        )
        .unwrap();
        fs::write(
            root.join(".huskies/work/1_backlog/86_story_foo.md"),
            "---\nname: Foo\n---\n",
        )
        .unwrap();
        fs::write(
            root.join(".huskies/work/1_backlog/130_story_bar.md"),
            "---\nname: Bar\n---\n",
        )
        .unwrap();

        let pool = Arc::new(AgentPool::new_test(3099));

        let pool1 = pool.clone();
        let root1 = root.clone();
        let t1 = tokio::spawn(async move {
            pool1
                .start_agent(&root1, "86_story_foo", Some("coder-1"), None, None)
                .await
        });
        let pool2 = pool.clone();
        let root2 = root.clone();
        let t2 = tokio::spawn(async move {
            pool2
                .start_agent(&root2, "130_story_bar", Some("coder-1"), None, None)
                .await
        });

        let (r1, r2) = tokio::join!(t1, t2);
        let r1 = r1.unwrap();
        let r2 = r2.unwrap();

        let concurrency_rejections = [&r1, &r2]
            .iter()
            .filter(|r| {
                r.as_ref().is_err_and(|e| {
                    e.contains("already running") || e.contains("becomes available")
                })
            })
            .count();
        assert_eq!(
            concurrency_rejections, 1,
            "exactly one call must be rejected by the concurrency check; \
             got r1={r1:?} r2={r2:?}"
        );
    }

    // ── story-230: prevent duplicate stage agents on same story ───────────────

    #[tokio::test]
    async fn start_agent_rejects_second_coder_stage_on_same_story() {
        use std::fs;
        let tmp = tempfile::tempdir().unwrap();
        let root = tmp.path();
        let sk_dir = root.join(".huskies");
        fs::create_dir_all(&sk_dir).unwrap();
        fs::write(
            sk_dir.join("project.toml"),
            "[[agent]]\nname = \"coder-1\"\n\n[[agent]]\nname = \"coder-2\"\n",
        )
        .unwrap();

        let pool = AgentPool::new_test(3099);
        pool.inject_test_agent("42_story_foo", "coder-1", AgentStatus::Running);

        let result = pool
            .start_agent(root, "42_story_foo", Some("coder-2"), None, None)
            .await;

        assert!(
            result.is_err(),
            "second coder on same story must be rejected"
        );
        let err = result.unwrap_err();
        assert!(
            err.contains("same pipeline stage"),
            "error must mention same pipeline stage, got: '{err}'"
        );
        assert!(
            err.contains("coder-1") && err.contains("coder-2"),
            "error must name both agents, got: '{err}'"
        );
    }

    #[tokio::test]
    async fn start_agent_rejects_second_qa_stage_on_same_story() {
        use std::fs;
        let tmp = tempfile::tempdir().unwrap();
        let root = tmp.path();
        let sk_dir = root.join(".huskies");
        fs::create_dir_all(&sk_dir).unwrap();
        fs::write(
            sk_dir.join("project.toml"),
            "[[agent]]\nname = \"qa-1\"\nstage = \"qa\"\n\n\
             [[agent]]\nname = \"qa-2\"\nstage = \"qa\"\n",
        )
        .unwrap();

        let pool = AgentPool::new_test(3099);
        pool.inject_test_agent("55_story_bar", "qa-1", AgentStatus::Running);

        let result = pool
            .start_agent(root, "55_story_bar", Some("qa-2"), None, None)
            .await;

        assert!(result.is_err(), "second qa on same story must be rejected");
        let err = result.unwrap_err();
        assert!(
            err.contains("same pipeline stage"),
            "error must mention same pipeline stage, got: '{err}'"
        );
    }

    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
    async fn start_agent_concurrent_two_coders_same_story_exactly_one_stage_rejection() {
        use std::fs;
        use std::sync::Arc;
        let tmp = tempfile::tempdir().unwrap();
        let root = tmp.path().to_path_buf();
        let sk_dir = root.join(".huskies");
        fs::create_dir_all(sk_dir.join("work/2_current")).unwrap();
        fs::write(
            root.join(".huskies/project.toml"),
            "[[agent]]\nname = \"coder-1\"\n\n[[agent]]\nname = \"coder-2\"\n",
        )
        .unwrap();
        fs::write(
            root.join(".huskies/work/2_current/42_story_foo.md"),
            "---\nname: Foo\n---\n",
        )
        .unwrap();

        let pool = Arc::new(AgentPool::new_test(3099));

        let pool1 = pool.clone();
        let root1 = root.clone();
        let t1 = tokio::spawn(async move {
            pool1
                .start_agent(&root1, "42_story_foo", Some("coder-1"), None, None)
                .await
        });
        let pool2 = pool.clone();
        let root2 = root.clone();
        let t2 = tokio::spawn(async move {
            pool2
                .start_agent(&root2, "42_story_foo", Some("coder-2"), None, None)
                .await
        });

        let (r1, r2) = tokio::join!(t1, t2);
        let r1 = r1.unwrap();
        let r2 = r2.unwrap();

        let stage_rejections = [&r1, &r2]
            .iter()
            .filter(|r| r.as_ref().is_err_and(|e| e.contains("same pipeline stage")))
            .count();
        assert_eq!(
            stage_rejections, 1,
            "exactly one call must be rejected by the stage-conflict check; \
             got r1={r1:?} r2={r2:?}"
        );
    }

    #[tokio::test]
    async fn start_agent_two_coders_different_stories_not_blocked_by_stage_check() {
        use std::fs;
        let tmp = tempfile::tempdir().unwrap();
        let root = tmp.path();
        let sk_dir = root.join(".huskies");
        fs::create_dir_all(sk_dir.join("work/1_backlog")).unwrap();
        fs::write(
            root.join(".huskies/project.toml"),
            "[[agent]]\nname = \"coder-1\"\n\n[[agent]]\nname = \"coder-2\"\n",
        )
        .unwrap();
        fs::write(
            root.join(".huskies/work/1_backlog/99_story_baz.md"),
            "---\nname: Baz\n---\n",
        )
        .unwrap();

        let pool = AgentPool::new_test(3099);
        pool.inject_test_agent("42_story_foo", "coder-1", AgentStatus::Running);

        let result = pool
            .start_agent(root, "99_story_baz", Some("coder-2"), None, None)
            .await;

        if let Err(ref e) = result {
            assert!(
                !e.contains("same pipeline stage"),
                "stage-conflict guard must not fire for agents on different stories; \
                 got: '{e}'"
            );
        }
    }

    // ── bug 312: stage-pipeline mismatch guard in start_agent ──────────────

    #[tokio::test]
    async fn start_agent_rejects_mergemaster_on_coding_stage_story() {
        use std::fs;
        let tmp = tempfile::tempdir().unwrap();
        let root = tmp.path();
        let sk_dir = root.join(".huskies");
        fs::create_dir_all(sk_dir.join("work/2_current")).unwrap();
        fs::write(
            sk_dir.join("project.toml"),
            "[[agent]]\nname = \"coder-1\"\nstage = \"coder\"\n\n\
             [[agent]]\nname = \"mergemaster\"\nstage = \"mergemaster\"\n",
        )
        .unwrap();
        crate::db::ensure_content_store();
        crate::db::write_item_with_content("310_story_foo", "2_current", "---\nname: Foo\n---\n");

        let pool = AgentPool::new_test(3099);
        let result = pool
            .start_agent(root, "310_story_foo", Some("mergemaster"), None, None)
            .await;

        assert!(
            result.is_err(),
            "mergemaster must not be assigned to a story in 2_current/"
        );
        let err = result.unwrap_err();
        assert!(
            err.contains("stage") && err.contains("2_current"),
            "error must mention stage mismatch, got: '{err}'"
        );
    }

    #[tokio::test]
    async fn start_agent_rejects_coder_on_qa_stage_story() {
        use std::fs;
        let tmp = tempfile::tempdir().unwrap();
        let root = tmp.path();
        let sk_dir = root.join(".huskies");
        fs::create_dir_all(&sk_dir).unwrap();
        fs::write(
            sk_dir.join("project.toml"),
            "[[agent]]\nname = \"coder-1\"\nstage = \"coder\"\n\n\
             [[agent]]\nname = \"qa\"\nstage = \"qa\"\n",
        )
        .unwrap();
        crate::db::ensure_content_store();
        crate::db::write_item_with_content(
            "8842_story_qa_guard",
            "3_qa",
            "---\nname: QA Guard\n---\n",
        );

        let pool = AgentPool::new_test(3099);
        let result = pool
            .start_agent(root, "8842_story_qa_guard", Some("coder-1"), None, None)
            .await;

        assert!(
            result.is_err(),
            "coder must not be assigned to a story in 3_qa/"
        );
        let err = result.unwrap_err();
        assert!(
            err.contains("stage") && err.contains("3_qa"),
            "error must mention stage mismatch, got: '{err}'"
        );
    }

    #[tokio::test]
    async fn start_agent_rejects_qa_on_merge_stage_story() {
        use std::fs;
        let tmp = tempfile::tempdir().unwrap();
        let root = tmp.path();
        let sk_dir = root.join(".huskies");
        fs::create_dir_all(&sk_dir).unwrap();
        fs::write(
            sk_dir.join("project.toml"),
            "[[agent]]\nname = \"qa\"\nstage = \"qa\"\n\n\
             [[agent]]\nname = \"mergemaster\"\nstage = \"mergemaster\"\n",
        )
        .unwrap();
        crate::db::ensure_content_store();
        crate::db::write_item_with_content("55_story_baz", "4_merge", "---\nname: Baz\n---\n");

        let pool = AgentPool::new_test(3099);
        let result = pool
            .start_agent(root, "55_story_baz", Some("qa"), None, None)
            .await;

        assert!(
            result.is_err(),
            "qa must not be assigned to a story in 4_merge/"
        );
        let err = result.unwrap_err();
        assert!(
            err.contains("stage") && err.contains("4_merge"),
            "error must mention stage mismatch, got: '{err}'"
        );
    }

    #[tokio::test]
    async fn start_agent_allows_supervisor_on_any_stage() {
        use std::fs;
        let tmp = tempfile::tempdir().unwrap();
        let root = tmp.path();
        let sk_dir = root.join(".huskies");
        fs::create_dir_all(sk_dir.join("work/2_current")).unwrap();
        fs::write(
            sk_dir.join("project.toml"),
            "[[agent]]\nname = \"supervisor\"\nstage = \"other\"\n",
        )
        .unwrap();
        fs::write(
            sk_dir.join("work/2_current/77_story_sup.md"),
            "---\nname: Sup\n---\n",
        )
        .unwrap();

        let pool = AgentPool::new_test(3099);
        let result = pool
            .start_agent(root, "77_story_sup", Some("supervisor"), None, None)
            .await;

        match result {
            Ok(_) => {}
            Err(e) => {
                assert!(
                    !e.contains("stage:") || !e.contains("cannot be assigned"),
                    "supervisor should not be rejected for stage mismatch, got: '{e}'"
                );
            }
        }
    }

    #[tokio::test]
    async fn start_agent_allows_correct_stage_agent() {
        use std::fs;
        let tmp = tempfile::tempdir().unwrap();
        let root = tmp.path();
        let sk_dir = root.join(".huskies");
        fs::create_dir_all(sk_dir.join("work/4_merge")).unwrap();
        fs::write(
            sk_dir.join("project.toml"),
            "[[agent]]\nname = \"mergemaster\"\nstage = \"mergemaster\"\n",
        )
        .unwrap();
        fs::write(
            sk_dir.join("work/4_merge/88_story_ok.md"),
            "---\nname: OK\n---\n",
        )
        .unwrap();

        let pool = AgentPool::new_test(3099);
        let result = pool
            .start_agent(root, "88_story_ok", Some("mergemaster"), None, None)
            .await;

        match result {
            Ok(_) => {}
            Err(e) => {
                assert!(
                    !e.contains("cannot be assigned"),
                    "mergemaster on 4_merge/ story should not fail stage check, got: '{e}'"
                );
            }
        }
    }

    /// Bug 502: when start_agent is called for a non-Coder agent (mergemaster
    /// or qa) on a story that's in 4_merge/, the unconditional
    /// move_story_to_current at the top of start_agent must NOT fire — even
    /// when a stale split-brain shadow of the story exists in 1_backlog/.
    ///
    /// Pre-fix behaviour: move_story_to_current would find the 1_backlog
    /// shadow and move it to 2_current/. find_active_story_stage would then
    /// report 2_current/, the stage check would expect a Coder-stage agent,
    /// and mergemaster would be rejected — leaving the story in 2_current/
    /// to be picked up by the next auto-assign tick as a coder. Infinite loop.
    /// Observed live on 2026-04-09 against story 478.
    #[tokio::test]
    async fn start_agent_does_not_demote_merge_stage_story_with_backlog_shadow() {
        use std::fs;
        let tmp = tempfile::tempdir().unwrap();
        let root = tmp.path();
        let sk_dir = root.join(".huskies");
        fs::create_dir_all(sk_dir.join("work/1_backlog")).unwrap();
        fs::create_dir_all(sk_dir.join("work/4_merge")).unwrap();
        fs::write(
            sk_dir.join("project.toml"),
            "[[agent]]\nname = \"mergemaster\"\nstage = \"mergemaster\"\n",
        )
        .unwrap();
        // Real copy in 4_merge/ (where the story actually is per the DB).
        fs::write(
            sk_dir.join("work/4_merge/502_story_split_brain.md"),
            "---\nname: Split Brain\n---\n",
        )
        .unwrap();
        // Stale split-brain shadow in 1_backlog/ (post-491/492 migration
        // artifact — the filesystem shadow that bit us in production).
        fs::write(
            sk_dir.join("work/1_backlog/502_story_split_brain.md"),
            "---\nname: Split Brain\n---\n",
        )
        .unwrap();

        let pool = AgentPool::new_test(3098);
        let result = pool
            .start_agent(
                root,
                "502_story_split_brain",
                Some("mergemaster"),
                None,
                None,
            )
            .await;

        // Stage check must not reject mergemaster.
        if let Err(ref e) = result {
            assert!(
                !e.contains("cannot be assigned"),
                "mergemaster on 4_merge/ story must not fail stage check even \
                 when a 1_backlog shadow exists, got: '{e}'"
            );
        }

        // Critical: the story must still be in 4_merge/ after the call.
        // Before the fix, line 53 of start.rs would have demoted it to
        // 2_current/ via move_story_to_current finding the 1_backlog shadow.
        assert!(
            sk_dir
                .join("work/4_merge/502_story_split_brain.md")
                .exists(),
            "story must still be in 4_merge/ after start_agent(mergemaster, ...)"
        );
        assert!(
            !sk_dir
                .join("work/2_current/502_story_split_brain.md")
                .exists(),
            "story must NOT have been demoted to 2_current/ — that's bug 502"
        );
    }

    // ── front matter agent preference (bug 379) ──────────────────────────────

    #[tokio::test]
    async fn start_agent_honours_front_matter_agent_when_idle() {
        let tmp = tempfile::tempdir().unwrap();
        let sk = tmp.path().join(".huskies");
        let backlog = sk.join("work/1_backlog");
        std::fs::create_dir_all(&backlog).unwrap();
        std::fs::write(
            sk.join("project.toml"),
            r#"
[[agent]]
name = "coder-sonnet"
stage = "coder"

[[agent]]
name = "coder-opus"
stage = "coder"
"#,
        )
        .unwrap();
        // Story file with agent preference in front matter.
        std::fs::write(
            backlog.join("368_story_test.md"),
            "---\nname: Test Story\nagent: coder-opus\n---\n# Story 368\n",
        )
        .unwrap();

        let pool = AgentPool::new_test(3010);
        // coder-sonnet is busy so without front matter the auto-selection
        // would skip coder-opus and try something else.
        pool.inject_test_agent("other-story", "coder-sonnet", AgentStatus::Running);

        let result = pool
            .start_agent(tmp.path(), "368_story_test", None, None, None)
            .await;

        match result {
            Ok(info) => {
                assert_eq!(
                    info.agent_name, "coder-opus",
                    "should pick the front-matter preferred agent"
                );
            }
            Err(err) => {
                // Allowed to fail for infrastructure reasons (no git repo),
                // but NOT due to agent selection ignoring the preference.
                assert!(
                    !err.contains("All coder agents are busy"),
                    "should not report busy when coder-opus is idle: {err}"
                );
                assert!(
                    !err.contains("coder-sonnet"),
                    "should not have picked coder-sonnet: {err}"
                );
            }
        }
    }

    #[tokio::test]
    async fn start_agent_returns_error_when_front_matter_agent_busy() {
        let tmp = tempfile::tempdir().unwrap();
        let sk = tmp.path().join(".huskies");
        let backlog = sk.join("work/1_backlog");
        let current = sk.join("work/2_current");
        std::fs::create_dir_all(&backlog).unwrap();
        std::fs::create_dir_all(&current).unwrap();
        std::fs::write(
            sk.join("project.toml"),
            r#"
[[agent]]
name = "coder-sonnet"
stage = "coder"

[[agent]]
name = "coder-opus"
stage = "coder"
"#,
        )
        .unwrap();
        let story_content = "---\nname: Test Story\nagent: coder-opus\n---\n# Story 368\n";
        std::fs::write(backlog.join("368_story_test.md"), story_content).unwrap();
        // Also write to the filesystem current dir and content store so that
        // start_agent reads the correct front matter even when another test has
        // left a stale entry for "368_story_test" in the global CRDT.
        std::fs::write(current.join("368_story_test.md"), story_content).unwrap();
        crate::db::ensure_content_store();
        crate::db::write_content("368_story_test", story_content);

        let pool = AgentPool::new_test(3011);
        // Preferred agent is busy — should NOT fall back to coder-sonnet.
        pool.inject_test_agent("other-story", "coder-opus", AgentStatus::Running);

        let result = pool
            .start_agent(tmp.path(), "368_story_test", None, None, None)
            .await;

        assert!(
            result.is_err(),
            "expected error when preferred agent is busy"
        );
        let err = result.unwrap_err();
        assert!(
            err.contains("coder-opus"),
            "error should mention the preferred agent: {err}"
        );
        assert!(
            err.contains("busy") || err.contains("queued"),
            "error should say agent is busy or story is queued: {err}"
        );
    }
}