4757 lines
178 KiB
Rust
4757 lines
178 KiB
Rust
use crate::agent_log::AgentLogWriter;
|
|
use crate::config::ProjectConfig;
|
|
use crate::io::watcher::WatcherEvent;
|
|
use crate::slog;
|
|
use crate::slog_error;
|
|
use crate::slog_warn;
|
|
use crate::worktree::{self, WorktreeInfo};
|
|
use portable_pty::ChildKiller;
|
|
use std::collections::HashMap;
|
|
use std::path::{Path, PathBuf};
|
|
use std::sync::{Arc, Mutex};
|
|
use tokio::sync::broadcast;
|
|
|
|
use super::{
|
|
AgentEvent, AgentInfo, AgentStatus, CompletionReport, PipelineStage, ReconciliationEvent,
|
|
agent_config_stage, pipeline_stage,
|
|
};
|
|
|
|
/// Compose the lookup key under which an agent is tracked in the pool.
///
/// The key has the form `"{story_id}:{agent_name}"`.
fn composite_key(story_id: &str, agent_name: &str) -> String {
    let mut key = String::with_capacity(story_id.len() + agent_name.len() + 1);
    key.push_str(story_id);
    key.push(':');
    key.push_str(agent_name);
    key
}
|
|
|
|
/// RAII guard that removes a pending agent entry from the pool on drop.
|
|
///
|
|
/// Created after inserting a `Pending` entry into the agent HashMap.
|
|
/// If `start_agent` succeeds (the agent process is spawned and status
|
|
/// transitions to `Running`), call [`disarm`](Self::disarm) to prevent
|
|
/// cleanup. If any intermediate step fails and the guard is dropped
|
|
/// without being disarmed, the pending entry is removed so it cannot
|
|
/// block future auto-assign dispatches.
|
|
struct PendingGuard {
|
|
agents: Arc<Mutex<HashMap<String, StoryAgent>>>,
|
|
key: String,
|
|
armed: bool,
|
|
}
|
|
|
|
impl PendingGuard {
|
|
fn new(agents: Arc<Mutex<HashMap<String, StoryAgent>>>, key: String) -> Self {
|
|
Self {
|
|
agents,
|
|
key,
|
|
armed: true,
|
|
}
|
|
}
|
|
|
|
/// Prevent the guard from cleaning up the entry (call after
|
|
/// successful spawn).
|
|
fn disarm(&mut self) {
|
|
self.armed = false;
|
|
}
|
|
}
|
|
|
|
impl Drop for PendingGuard {
|
|
fn drop(&mut self) {
|
|
if self.armed
|
|
&& let Ok(mut agents) = self.agents.lock()
|
|
&& agents
|
|
.get(&self.key)
|
|
.is_some_and(|a| a.status == AgentStatus::Pending)
|
|
{
|
|
agents.remove(&self.key);
|
|
slog!(
|
|
"[agents] Cleaned up leaked Pending entry for '{}'",
|
|
self.key
|
|
);
|
|
}
|
|
}
|
|
}
|
|
|
|
struct StoryAgent {
|
|
agent_name: String,
|
|
status: AgentStatus,
|
|
worktree_info: Option<WorktreeInfo>,
|
|
session_id: Option<String>,
|
|
tx: broadcast::Sender<AgentEvent>,
|
|
task_handle: Option<tokio::task::JoinHandle<()>>,
|
|
/// Accumulated events for polling via get_agent_output.
|
|
event_log: Arc<Mutex<Vec<AgentEvent>>>,
|
|
/// Set when the agent calls report_completion.
|
|
completion: Option<CompletionReport>,
|
|
/// Project root, stored for pipeline advancement after completion.
|
|
project_root: Option<PathBuf>,
|
|
/// UUID identifying the log file for this session.
|
|
log_session_id: Option<String>,
|
|
/// Set to `true` when the agent calls `report_merge_failure`.
|
|
/// Prevents the pipeline from blindly advancing to `5_done/` after a
|
|
/// failed merge: the server-owned gate check runs in the feature-branch
|
|
/// worktree (which compiles fine) and returns `gates_passed=true` even
|
|
/// though the code was never squash-merged onto master.
|
|
merge_failure_reported: bool,
|
|
}
|
|
|
|
/// Build an `AgentInfo` snapshot from a `StoryAgent` map entry.
|
|
fn agent_info_from_entry(story_id: &str, agent: &StoryAgent) -> AgentInfo {
|
|
AgentInfo {
|
|
story_id: story_id.to_string(),
|
|
agent_name: agent.agent_name.clone(),
|
|
status: agent.status.clone(),
|
|
session_id: agent.session_id.clone(),
|
|
worktree_path: agent
|
|
.worktree_info
|
|
.as_ref()
|
|
.map(|wt| wt.path.to_string_lossy().to_string()),
|
|
base_branch: agent
|
|
.worktree_info
|
|
.as_ref()
|
|
.map(|wt| wt.base_branch.clone()),
|
|
completion: agent.completion.clone(),
|
|
log_session_id: agent.log_session_id.clone(),
|
|
}
|
|
}
|
|
|
|
/// Manages concurrent story agents, each in its own worktree.
|
|
pub struct AgentPool {
|
|
agents: Arc<Mutex<HashMap<String, StoryAgent>>>,
|
|
port: u16,
|
|
/// Registry of active PTY child process killers, keyed by "{story_id}:{agent_name}".
|
|
/// Used to terminate child processes on server shutdown or agent stop, preventing
|
|
/// orphaned Claude Code processes from running after the server exits.
|
|
child_killers: Arc<Mutex<HashMap<String, Box<dyn ChildKiller + Send + Sync>>>>,
|
|
/// Broadcast channel for notifying WebSocket clients of agent state changes.
|
|
/// When an agent transitions state (Pending, Running, Completed, Failed, Stopped),
|
|
/// an `AgentStateChanged` event is emitted so the frontend can refresh the
|
|
/// pipeline board without waiting for a filesystem event.
|
|
watcher_tx: broadcast::Sender<WatcherEvent>,
|
|
/// Tracks background merge jobs started by `merge_agent_work`, keyed by story_id.
|
|
/// The MCP tool returns immediately and the mergemaster agent polls
|
|
/// `get_merge_status` until the job reaches a terminal state.
|
|
merge_jobs: Arc<Mutex<HashMap<String, super::merge::MergeJob>>>,
|
|
}
|
|
|
|
impl AgentPool {
|
|
pub fn new(port: u16, watcher_tx: broadcast::Sender<WatcherEvent>) -> Self {
|
|
Self {
|
|
agents: Arc::new(Mutex::new(HashMap::new())),
|
|
port,
|
|
child_killers: Arc::new(Mutex::new(HashMap::new())),
|
|
watcher_tx,
|
|
merge_jobs: Arc::new(Mutex::new(HashMap::new())),
|
|
}
|
|
}
|
|
|
|
/// Test-only constructor: wires the pool to a throwaway watcher channel
/// whose receiver is dropped immediately.
#[cfg(test)]
pub fn new_test(port: u16) -> Self {
    let watcher_tx = broadcast::channel(16).0;
    Self::new(port, watcher_tx)
}
|
|
|
|
/// Notify WebSocket clients that agent state has changed, so the pipeline
|
|
/// board and agent panel can refresh.
|
|
fn notify_agent_state_changed(watcher_tx: &broadcast::Sender<WatcherEvent>) {
|
|
let _ = watcher_tx.send(WatcherEvent::AgentStateChanged);
|
|
}
|
|
|
|
/// Kill all active PTY child processes.
|
|
///
|
|
/// Called on server shutdown to prevent orphaned Claude Code processes from
|
|
/// continuing to run after the server exits. Each registered killer is called
|
|
/// once, then the registry is cleared.
|
|
pub fn kill_all_children(&self) {
|
|
if let Ok(mut killers) = self.child_killers.lock() {
|
|
for (key, killer) in killers.iter_mut() {
|
|
slog!("[agents] Killing child process for {key} on shutdown");
|
|
let _ = killer.kill();
|
|
}
|
|
killers.clear();
|
|
}
|
|
}
|
|
|
|
/// Kill and deregister the child process for a specific agent key.
|
|
///
|
|
/// Used by `stop_agent` to ensure the PTY child is terminated even though
|
|
/// aborting a `spawn_blocking` task handle does not interrupt the blocking thread.
|
|
fn kill_child_for_key(&self, key: &str) {
|
|
if let Ok(mut killers) = self.child_killers.lock()
|
|
&& let Some(mut killer) = killers.remove(key)
|
|
{
|
|
slog!("[agents] Killing child process for {key} on stop");
|
|
let _ = killer.kill();
|
|
}
|
|
}
|
|
|
|
/// Start an agent for a story: load config, create worktree, spawn agent.
|
|
///
|
|
/// When `agent_name` is `None`, automatically selects the first idle coder
|
|
/// agent (story 190). If all coders are busy the call fails with an error
|
|
/// indicating the story will be picked up when one becomes available.
|
|
///
|
|
/// If `resume_context` is provided, it is appended to the rendered prompt
|
|
/// so the agent can pick up from a previous failed attempt.
|
|
pub async fn start_agent(
|
|
&self,
|
|
project_root: &Path,
|
|
story_id: &str,
|
|
agent_name: Option<&str>,
|
|
resume_context: Option<&str>,
|
|
) -> Result<AgentInfo, String> {
|
|
let config = ProjectConfig::load(project_root)?;
|
|
|
|
// Validate explicit agent name early (no lock needed).
|
|
if let Some(name) = agent_name {
|
|
config
|
|
.find_agent(name)
|
|
.ok_or_else(|| format!("No agent named '{name}' in config"))?;
|
|
}
|
|
|
|
// Create name-independent shared resources before the lock so they are
|
|
// ready for the atomic check-and-insert (story 132).
|
|
let (tx, _) = broadcast::channel::<AgentEvent>(1024);
|
|
let event_log: Arc<Mutex<Vec<AgentEvent>>> = Arc::new(Mutex::new(Vec::new()));
|
|
let log_session_id = uuid::Uuid::new_v4().to_string();
|
|
|
|
// Move story from upcoming/ to current/ before checking agent
|
|
// availability so that auto_assign_available_work can pick it up even
|
|
// when all coders are currently busy (story 203). This is idempotent:
|
|
// if the story is already in 2_current/ or a later stage, the call is
|
|
// a no-op.
|
|
super::lifecycle::move_story_to_current(project_root, story_id)?;
|
|
|
|
// Atomically resolve agent name, check availability, and register as
|
|
// Pending. When `agent_name` is `None` the first idle coder is
|
|
// selected inside the lock so no TOCTOU race can occur between the
|
|
// availability check and the Pending insert (story 132, story 190).
|
|
//
|
|
// The `PendingGuard` ensures that if any step below fails the entry is
|
|
// removed from the pool so it does not permanently block auto-assign
|
|
// (bug 118).
|
|
let resolved_name: String;
|
|
let key: String;
|
|
{
|
|
let mut agents = self.agents.lock().map_err(|e| e.to_string())?;
|
|
|
|
resolved_name = match agent_name {
|
|
Some(name) => name.to_string(),
|
|
None => find_free_agent_for_stage(&config, &agents, &PipelineStage::Coder)
|
|
.map(|s| s.to_string())
|
|
.ok_or_else(|| {
|
|
if config
|
|
.agent
|
|
.iter()
|
|
.any(|a| agent_config_stage(a) == PipelineStage::Coder)
|
|
{
|
|
format!(
|
|
"All coder agents are busy; story '{story_id}' has been \
|
|
queued in work/2_current/ and will be auto-assigned when \
|
|
one becomes available"
|
|
)
|
|
} else {
|
|
"No coder agent configured. Specify an agent_name explicitly."
|
|
.to_string()
|
|
}
|
|
})?,
|
|
};
|
|
|
|
key = composite_key(story_id, &resolved_name);
|
|
|
|
// Check for duplicate assignment (same story + same agent already active).
|
|
if let Some(agent) = agents.get(&key)
|
|
&& (agent.status == AgentStatus::Running || agent.status == AgentStatus::Pending)
|
|
{
|
|
return Err(format!(
|
|
"Agent '{resolved_name}' for story '{story_id}' is already {}",
|
|
agent.status
|
|
));
|
|
}
|
|
// Enforce single-stage concurrency: reject if there is already a
|
|
// Running/Pending agent at the same pipeline stage for this story.
|
|
// This prevents two coders (or two QA/mergemaster agents) from
|
|
// corrupting each other's work in the same worktree.
|
|
// Applies to both explicit and auto-selected agents; the Other
|
|
// stage (supervisors, unknown agents) is exempt.
|
|
let resolved_stage = config
|
|
.find_agent(&resolved_name)
|
|
.map(agent_config_stage)
|
|
.unwrap_or_else(|| pipeline_stage(&resolved_name));
|
|
if resolved_stage != PipelineStage::Other
|
|
&& let Some(conflicting_name) = agents.iter().find_map(|(k, a)| {
|
|
let k_story = k.rsplit_once(':').map(|(s, _)| s).unwrap_or(k);
|
|
if k_story == story_id
|
|
&& a.agent_name != resolved_name
|
|
&& matches!(a.status, AgentStatus::Running | AgentStatus::Pending)
|
|
{
|
|
let a_stage = config
|
|
.find_agent(&a.agent_name)
|
|
.map(agent_config_stage)
|
|
.unwrap_or_else(|| pipeline_stage(&a.agent_name));
|
|
if a_stage == resolved_stage {
|
|
Some(a.agent_name.clone())
|
|
} else {
|
|
None
|
|
}
|
|
} else {
|
|
None
|
|
}
|
|
})
|
|
{
|
|
return Err(format!(
|
|
"Cannot start '{resolved_name}' on story '{story_id}': \
|
|
'{conflicting_name}' is already active at the same pipeline stage"
|
|
));
|
|
}
|
|
// Enforce single-instance concurrency for explicitly-named agents:
|
|
// if this agent is already running on any other story, reject.
|
|
// Auto-selected agents are already guaranteed idle by
|
|
// find_free_agent_for_stage, so this check is only needed for
|
|
// explicit requests.
|
|
if agent_name.is_some()
|
|
&& let Some(busy_story) = agents.iter().find_map(|(k, a)| {
|
|
if a.agent_name == resolved_name
|
|
&& matches!(a.status, AgentStatus::Running | AgentStatus::Pending)
|
|
{
|
|
Some(
|
|
k.rsplit_once(':')
|
|
.map(|(sid, _)| sid)
|
|
.unwrap_or(k)
|
|
.to_string(),
|
|
)
|
|
} else {
|
|
None
|
|
}
|
|
})
|
|
{
|
|
return Err(format!(
|
|
"Agent '{resolved_name}' is already running on story '{busy_story}'; \
|
|
story '{story_id}' will be picked up when the agent becomes available"
|
|
));
|
|
}
|
|
agents.insert(
|
|
key.clone(),
|
|
StoryAgent {
|
|
agent_name: resolved_name.clone(),
|
|
status: AgentStatus::Pending,
|
|
worktree_info: None,
|
|
session_id: None,
|
|
tx: tx.clone(),
|
|
task_handle: None,
|
|
event_log: event_log.clone(),
|
|
completion: None,
|
|
project_root: Some(project_root.to_path_buf()),
|
|
log_session_id: Some(log_session_id.clone()),
|
|
merge_failure_reported: false,
|
|
},
|
|
);
|
|
}
|
|
let mut pending_guard = PendingGuard::new(self.agents.clone(), key.clone());
|
|
|
|
// Create persistent log writer (needs resolved_name, so must be after
|
|
// the atomic resolution above).
|
|
let log_writer = match AgentLogWriter::new(
|
|
project_root,
|
|
story_id,
|
|
&resolved_name,
|
|
&log_session_id,
|
|
) {
|
|
Ok(w) => Some(Arc::new(Mutex::new(w))),
|
|
Err(e) => {
|
|
eprintln!(
|
|
"[agents] Failed to create log writer for {story_id}:{resolved_name}: {e}"
|
|
);
|
|
None
|
|
}
|
|
};
|
|
|
|
// Notify WebSocket clients that a new agent is pending.
|
|
Self::notify_agent_state_changed(&self.watcher_tx);
|
|
|
|
let _ = tx.send(AgentEvent::Status {
|
|
story_id: story_id.to_string(),
|
|
agent_name: resolved_name.clone(),
|
|
status: "pending".to_string(),
|
|
});
|
|
|
|
// Extract inactivity timeout from the agent config before cloning config.
|
|
let inactivity_timeout_secs = config
|
|
.find_agent(&resolved_name)
|
|
.map(|a| a.inactivity_timeout_secs)
|
|
.unwrap_or(300);
|
|
|
|
// Clone all values needed inside the background spawn.
|
|
let project_root_clone = project_root.to_path_buf();
|
|
let config_clone = config.clone();
|
|
let resume_context_owned = resume_context.map(str::to_string);
|
|
let sid = story_id.to_string();
|
|
let aname = resolved_name.clone();
|
|
let tx_clone = tx.clone();
|
|
let agents_ref = self.agents.clone();
|
|
let key_clone = key.clone();
|
|
let log_clone = event_log.clone();
|
|
let port_for_task = self.port;
|
|
let log_writer_clone = log_writer.clone();
|
|
let child_killers_clone = self.child_killers.clone();
|
|
let watcher_tx_clone = self.watcher_tx.clone();
|
|
|
|
// Spawn the background task. Worktree creation and agent launch happen here
|
|
// so `start_agent` returns immediately after registering the agent as
|
|
// Pending — non-blocking by design (story 157).
|
|
let handle = tokio::spawn(async move {
|
|
// Step 1: create the worktree (slow — git checkout, pnpm install, etc.)
|
|
let wt_info = match worktree::create_worktree(
|
|
&project_root_clone,
|
|
&sid,
|
|
&config_clone,
|
|
port_for_task,
|
|
)
|
|
.await
|
|
{
|
|
Ok(wt) => wt,
|
|
Err(e) => {
|
|
let error_msg = format!("Failed to create worktree: {e}");
|
|
slog_error!("[agents] {error_msg}");
|
|
let event = AgentEvent::Error {
|
|
story_id: sid.clone(),
|
|
agent_name: aname.clone(),
|
|
message: error_msg,
|
|
};
|
|
if let Ok(mut log) = log_clone.lock() {
|
|
log.push(event.clone());
|
|
}
|
|
let _ = tx_clone.send(event);
|
|
if let Ok(mut agents) = agents_ref.lock()
|
|
&& let Some(agent) = agents.get_mut(&key_clone) {
|
|
agent.status = AgentStatus::Failed;
|
|
}
|
|
Self::notify_agent_state_changed(&watcher_tx_clone);
|
|
return;
|
|
}
|
|
};
|
|
|
|
// Step 2: store worktree info and render agent command/args/prompt.
|
|
let wt_path_str = wt_info.path.to_string_lossy().to_string();
|
|
{
|
|
if let Ok(mut agents) = agents_ref.lock()
|
|
&& let Some(agent) = agents.get_mut(&key_clone)
|
|
{
|
|
agent.worktree_info = Some(wt_info.clone());
|
|
}
|
|
}
|
|
|
|
let (command, args, mut prompt) = match config_clone.render_agent_args(
|
|
&wt_path_str,
|
|
&sid,
|
|
Some(&aname),
|
|
Some(&wt_info.base_branch),
|
|
) {
|
|
Ok(result) => result,
|
|
Err(e) => {
|
|
let error_msg = format!("Failed to render agent args: {e}");
|
|
slog_error!("[agents] {error_msg}");
|
|
let event = AgentEvent::Error {
|
|
story_id: sid.clone(),
|
|
agent_name: aname.clone(),
|
|
message: error_msg,
|
|
};
|
|
if let Ok(mut log) = log_clone.lock() {
|
|
log.push(event.clone());
|
|
}
|
|
let _ = tx_clone.send(event);
|
|
if let Ok(mut agents) = agents_ref.lock()
|
|
&& let Some(agent) = agents.get_mut(&key_clone) {
|
|
agent.status = AgentStatus::Failed;
|
|
}
|
|
Self::notify_agent_state_changed(&watcher_tx_clone);
|
|
return;
|
|
}
|
|
};
|
|
|
|
// Append resume context if this is a restart with failure information.
|
|
if let Some(ctx) = resume_context_owned {
|
|
prompt.push_str(&ctx);
|
|
}
|
|
|
|
// Step 3: transition to Running now that the worktree is ready.
|
|
{
|
|
if let Ok(mut agents) = agents_ref.lock()
|
|
&& let Some(agent) = agents.get_mut(&key_clone)
|
|
{
|
|
agent.status = AgentStatus::Running;
|
|
}
|
|
}
|
|
let _ = tx_clone.send(AgentEvent::Status {
|
|
story_id: sid.clone(),
|
|
agent_name: aname.clone(),
|
|
status: "running".to_string(),
|
|
});
|
|
Self::notify_agent_state_changed(&watcher_tx_clone);
|
|
|
|
// Step 4: launch the agent process.
|
|
match super::pty::run_agent_pty_streaming(
|
|
&sid,
|
|
&aname,
|
|
&command,
|
|
&args,
|
|
&prompt,
|
|
&wt_path_str,
|
|
&tx_clone,
|
|
&log_clone,
|
|
log_writer_clone,
|
|
inactivity_timeout_secs,
|
|
child_killers_clone,
|
|
)
|
|
.await
|
|
{
|
|
Ok(session_id) => {
|
|
// Server-owned completion: run acceptance gates automatically
|
|
// when the agent process exits normally.
|
|
run_server_owned_completion(
|
|
&agents_ref,
|
|
port_for_task,
|
|
&sid,
|
|
&aname,
|
|
session_id,
|
|
watcher_tx_clone.clone(),
|
|
)
|
|
.await;
|
|
Self::notify_agent_state_changed(&watcher_tx_clone);
|
|
}
|
|
Err(e) => {
|
|
slog_error!("[agents] Agent process error for {aname} on {sid}: {e}");
|
|
let event = AgentEvent::Error {
|
|
story_id: sid.clone(),
|
|
agent_name: aname.clone(),
|
|
message: e,
|
|
};
|
|
if let Ok(mut log) = log_clone.lock() {
|
|
log.push(event.clone());
|
|
}
|
|
let _ = tx_clone.send(event);
|
|
if let Ok(mut agents) = agents_ref.lock()
|
|
&& let Some(agent) = agents.get_mut(&key_clone) {
|
|
agent.status = AgentStatus::Failed;
|
|
}
|
|
Self::notify_agent_state_changed(&watcher_tx_clone);
|
|
}
|
|
}
|
|
});
|
|
|
|
// Store the task handle while the agent is still Pending.
|
|
{
|
|
let mut agents = self.agents.lock().map_err(|e| e.to_string())?;
|
|
if let Some(agent) = agents.get_mut(&key) {
|
|
agent.task_handle = Some(handle);
|
|
}
|
|
}
|
|
|
|
// Agent successfully spawned — prevent the guard from removing the entry.
|
|
pending_guard.disarm();
|
|
|
|
Ok(AgentInfo {
|
|
story_id: story_id.to_string(),
|
|
agent_name: resolved_name,
|
|
status: AgentStatus::Pending,
|
|
session_id: None,
|
|
worktree_path: None,
|
|
base_branch: None,
|
|
completion: None,
|
|
log_session_id: Some(log_session_id),
|
|
})
|
|
}
|
|
|
|
/// Stop a running agent. Worktree is preserved for inspection.
|
|
pub async fn stop_agent(
|
|
&self,
|
|
_project_root: &Path,
|
|
story_id: &str,
|
|
agent_name: &str,
|
|
) -> Result<(), String> {
|
|
let key = composite_key(story_id, agent_name);
|
|
|
|
let (worktree_info, task_handle, tx) = {
|
|
let mut agents = self.agents.lock().map_err(|e| e.to_string())?;
|
|
let agent = agents
|
|
.get_mut(&key)
|
|
.ok_or_else(|| format!("No agent '{agent_name}' for story '{story_id}'"))?;
|
|
|
|
let wt = agent.worktree_info.clone();
|
|
let handle = agent.task_handle.take();
|
|
let tx = agent.tx.clone();
|
|
agent.status = AgentStatus::Failed;
|
|
(wt, handle, tx)
|
|
};
|
|
|
|
// Abort the task and kill the PTY child process.
|
|
// Note: aborting a spawn_blocking task handle does not interrupt the blocking
|
|
// thread, so we must also kill the child process directly via the killer registry.
|
|
if let Some(handle) = task_handle {
|
|
handle.abort();
|
|
let _ = handle.await;
|
|
}
|
|
self.kill_child_for_key(&key);
|
|
|
|
// Preserve worktree for inspection — don't destroy agent's work on stop.
|
|
if let Some(ref wt) = worktree_info {
|
|
slog!(
|
|
"[agents] Worktree preserved for {story_id}:{agent_name}: {}",
|
|
wt.path.display()
|
|
);
|
|
}
|
|
|
|
let _ = tx.send(AgentEvent::Status {
|
|
story_id: story_id.to_string(),
|
|
agent_name: agent_name.to_string(),
|
|
status: "stopped".to_string(),
|
|
});
|
|
|
|
// Remove from map
|
|
{
|
|
let mut agents = self.agents.lock().map_err(|e| e.to_string())?;
|
|
agents.remove(&key);
|
|
}
|
|
|
|
// Notify WebSocket clients so pipeline board and agent panel update.
|
|
Self::notify_agent_state_changed(&self.watcher_tx);
|
|
|
|
Ok(())
|
|
}
|
|
|
|
/// Return the names of configured agents for `stage` that are not currently
|
|
/// running or pending.
|
|
pub fn available_agents_for_stage(
|
|
&self,
|
|
config: &ProjectConfig,
|
|
stage: &PipelineStage,
|
|
) -> Result<Vec<String>, String> {
|
|
let agents = self.agents.lock().map_err(|e| e.to_string())?;
|
|
Ok(config
|
|
.agent
|
|
.iter()
|
|
.filter(|cfg| agent_config_stage(cfg) == *stage)
|
|
.filter(|cfg| {
|
|
!agents.values().any(|a| {
|
|
a.agent_name == cfg.name
|
|
&& matches!(a.status, AgentStatus::Running | AgentStatus::Pending)
|
|
})
|
|
})
|
|
.map(|cfg| cfg.name.clone())
|
|
.collect())
|
|
}
|
|
|
|
/// List all agents with their status.
|
|
pub fn list_agents(&self) -> Result<Vec<AgentInfo>, String> {
|
|
let agents = self.agents.lock().map_err(|e| e.to_string())?;
|
|
Ok(agents
|
|
.iter()
|
|
.map(|(key, agent)| {
|
|
// Extract story_id from composite key "story_id:agent_name"
|
|
let story_id = key
|
|
.rsplit_once(':')
|
|
.map(|(sid, _)| sid.to_string())
|
|
.unwrap_or_else(|| key.clone());
|
|
agent_info_from_entry(&story_id, agent)
|
|
})
|
|
.collect())
|
|
}
|
|
|
|
/// Subscribe to events for a story agent.
|
|
pub fn subscribe(
|
|
&self,
|
|
story_id: &str,
|
|
agent_name: &str,
|
|
) -> Result<broadcast::Receiver<AgentEvent>, String> {
|
|
let key = composite_key(story_id, agent_name);
|
|
let agents = self.agents.lock().map_err(|e| e.to_string())?;
|
|
let agent = agents
|
|
.get(&key)
|
|
.ok_or_else(|| format!("No agent '{agent_name}' for story '{story_id}'"))?;
|
|
Ok(agent.tx.subscribe())
|
|
}
|
|
|
|
/// Drain accumulated events for polling. Returns all events since the last drain.
|
|
pub fn drain_events(
|
|
&self,
|
|
story_id: &str,
|
|
agent_name: &str,
|
|
) -> Result<Vec<AgentEvent>, String> {
|
|
let key = composite_key(story_id, agent_name);
|
|
let agents = self.agents.lock().map_err(|e| e.to_string())?;
|
|
let agent = agents
|
|
.get(&key)
|
|
.ok_or_else(|| format!("No agent '{agent_name}' for story '{story_id}'"))?;
|
|
let mut log = agent.event_log.lock().map_err(|e| e.to_string())?;
|
|
Ok(log.drain(..).collect())
|
|
}
|
|
|
|
/// Block until the agent reaches a terminal state (completed, failed, stopped).
|
|
/// Returns the agent's final `AgentInfo`.
|
|
/// `timeout_ms` caps how long to wait; returns an error if the deadline passes.
|
|
pub async fn wait_for_agent(
|
|
&self,
|
|
story_id: &str,
|
|
agent_name: &str,
|
|
timeout_ms: u64,
|
|
) -> Result<AgentInfo, String> {
|
|
// Subscribe before checking status so we don't miss the terminal event
|
|
// if the agent completes in the window between the two operations.
|
|
let mut rx = self.subscribe(story_id, agent_name)?;
|
|
|
|
// Return immediately if already in a terminal state.
|
|
{
|
|
let agents = self.agents.lock().map_err(|e| e.to_string())?;
|
|
let key = composite_key(story_id, agent_name);
|
|
if let Some(agent) = agents.get(&key)
|
|
&& matches!(agent.status, AgentStatus::Completed | AgentStatus::Failed)
|
|
{
|
|
return Ok(agent_info_from_entry(story_id, agent));
|
|
}
|
|
}
|
|
|
|
let deadline =
|
|
tokio::time::Instant::now() + std::time::Duration::from_millis(timeout_ms);
|
|
|
|
loop {
|
|
let remaining = deadline.saturating_duration_since(tokio::time::Instant::now());
|
|
if remaining.is_zero() {
|
|
return Err(format!(
|
|
"Timed out after {timeout_ms}ms waiting for agent '{agent_name}' on story '{story_id}'"
|
|
));
|
|
}
|
|
|
|
match tokio::time::timeout(remaining, rx.recv()).await {
|
|
Ok(Ok(event)) => {
|
|
let is_terminal = match &event {
|
|
AgentEvent::Done { .. } | AgentEvent::Error { .. } => true,
|
|
AgentEvent::Status { status, .. } if status == "stopped" => true,
|
|
_ => false,
|
|
};
|
|
if is_terminal {
|
|
let agents = self.agents.lock().map_err(|e| e.to_string())?;
|
|
let key = composite_key(story_id, agent_name);
|
|
return Ok(if let Some(agent) = agents.get(&key) {
|
|
agent_info_from_entry(story_id, agent)
|
|
} else {
|
|
// Agent was removed from map (e.g. stop_agent removes it after
|
|
// the "stopped" status event is sent).
|
|
let (status, session_id) = match event {
|
|
AgentEvent::Done { session_id, .. } => {
|
|
(AgentStatus::Completed, session_id)
|
|
}
|
|
_ => (AgentStatus::Failed, None),
|
|
};
|
|
AgentInfo {
|
|
story_id: story_id.to_string(),
|
|
agent_name: agent_name.to_string(),
|
|
status,
|
|
session_id,
|
|
worktree_path: None,
|
|
base_branch: None,
|
|
completion: None,
|
|
log_session_id: None,
|
|
}
|
|
});
|
|
}
|
|
}
|
|
Ok(Err(broadcast::error::RecvError::Lagged(_))) => {
|
|
// Missed some buffered events — check current status before resuming.
|
|
let agents = self.agents.lock().map_err(|e| e.to_string())?;
|
|
let key = composite_key(story_id, agent_name);
|
|
if let Some(agent) = agents.get(&key)
|
|
&& matches!(agent.status, AgentStatus::Completed | AgentStatus::Failed)
|
|
{
|
|
return Ok(agent_info_from_entry(story_id, agent));
|
|
}
|
|
// Still running — continue the loop.
|
|
}
|
|
Ok(Err(broadcast::error::RecvError::Closed)) => {
|
|
// Channel closed: no more events will arrive. Return current state.
|
|
let agents = self.agents.lock().map_err(|e| e.to_string())?;
|
|
let key = composite_key(story_id, agent_name);
|
|
if let Some(agent) = agents.get(&key) {
|
|
return Ok(agent_info_from_entry(story_id, agent));
|
|
}
|
|
return Err(format!(
|
|
"Agent '{agent_name}' for story '{story_id}' channel closed unexpectedly"
|
|
));
|
|
}
|
|
Err(_) => {
|
|
return Err(format!(
|
|
"Timed out after {timeout_ms}ms waiting for agent '{agent_name}' on story '{story_id}'"
|
|
));
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Create a worktree for the given story using the server port (writes .mcp.json).
|
|
pub async fn create_worktree(
|
|
&self,
|
|
project_root: &Path,
|
|
story_id: &str,
|
|
) -> Result<worktree::WorktreeInfo, String> {
|
|
let config = ProjectConfig::load(project_root)?;
|
|
worktree::create_worktree(project_root, story_id, &config, self.port).await
|
|
}
|
|
|
|
/// Advance the pipeline after an agent completes.
|
|
///
|
|
/// Called internally by `report_completion` as a background task.
|
|
/// Reads the stored completion report and project_root from the agent,
|
|
/// then drives the next pipeline stage based on the agent's role:
|
|
///
|
|
/// - **Coder** + gates passed → move story to `work/3_qa/`, start `qa` agent.
|
|
/// - **Coder** + gates failed → restart the same coder agent with failure context.
|
|
/// - **QA** + gates passed + coverage passed → move story to `work/4_merge/`, start `mergemaster` agent.
|
|
/// - **QA** + gates passed + coverage failed → restart `qa` with coverage failure context.
|
|
/// - **QA** + gates failed → restart `qa` with failure context.
|
|
/// - **Mergemaster** → run `script/test` on master; if pass: archive + cleanup worktree;
|
|
/// if fail: restart `mergemaster` with failure context.
|
|
/// - **Other** (supervisor, unknown) → no automatic advancement.
|
|
async fn run_pipeline_advance(
|
|
&self,
|
|
story_id: &str,
|
|
agent_name: &str,
|
|
completion: CompletionReport,
|
|
project_root: Option<PathBuf>,
|
|
worktree_path: Option<PathBuf>,
|
|
merge_failure_reported: bool,
|
|
) {
|
|
let project_root = match project_root {
|
|
Some(p) => p,
|
|
None => {
|
|
slog_warn!("[pipeline] No project_root for '{story_id}:{agent_name}'");
|
|
return;
|
|
}
|
|
};
|
|
|
|
let config = ProjectConfig::load(&project_root).unwrap_or_default();
|
|
let stage = config
|
|
.find_agent(agent_name)
|
|
.map(agent_config_stage)
|
|
.unwrap_or_else(|| pipeline_stage(agent_name));
|
|
|
|
match stage {
|
|
PipelineStage::Other => {
|
|
// Supervisors and unknown agents do not advance the pipeline.
|
|
}
|
|
PipelineStage::Coder => {
|
|
if completion.gates_passed {
|
|
slog!(
|
|
"[pipeline] Coder '{agent_name}' passed gates for '{story_id}'. Moving to QA."
|
|
);
|
|
if let Err(e) = super::lifecycle::move_story_to_qa(&project_root, story_id) {
|
|
slog_error!("[pipeline] Failed to move '{story_id}' to 3_qa/: {e}");
|
|
return;
|
|
}
|
|
if let Err(e) = self
|
|
.start_agent(&project_root, story_id, Some("qa"), None)
|
|
.await
|
|
{
|
|
slog_error!("[pipeline] Failed to start qa agent for '{story_id}': {e}");
|
|
}
|
|
// Coder slot is now free — pick up any other unassigned work in 2_current/.
|
|
self.auto_assign_available_work(&project_root).await;
|
|
} else {
|
|
slog!(
|
|
"[pipeline] Coder '{agent_name}' failed gates for '{story_id}'. Restarting."
|
|
);
|
|
let context = format!(
|
|
"\n\n---\n## Previous Attempt Failed\n\
|
|
The acceptance gates failed with the following output:\n{}\n\n\
|
|
Please review the failures above, fix the issues, and try again.",
|
|
completion.gate_output
|
|
);
|
|
if let Err(e) = self
|
|
.start_agent(&project_root, story_id, Some(agent_name), Some(&context))
|
|
.await
|
|
{
|
|
slog_error!(
|
|
"[pipeline] Failed to restart coder '{agent_name}' for '{story_id}': {e}"
|
|
);
|
|
}
|
|
}
|
|
}
|
|
PipelineStage::Qa => {
|
|
if completion.gates_passed {
|
|
// Run coverage gate in the QA worktree before advancing to merge.
|
|
let coverage_path = worktree_path.clone().unwrap_or_else(|| project_root.clone());
|
|
let cp = coverage_path.clone();
|
|
let coverage_result =
|
|
tokio::task::spawn_blocking(move || super::gates::run_coverage_gate(&cp))
|
|
.await
|
|
.unwrap_or_else(|e| {
|
|
slog_warn!("[pipeline] Coverage gate task panicked: {e}");
|
|
Ok((false, format!("Coverage gate task panicked: {e}")))
|
|
});
|
|
let (coverage_passed, coverage_output) = match coverage_result {
|
|
Ok(pair) => pair,
|
|
Err(e) => (false, e),
|
|
};
|
|
|
|
if coverage_passed {
|
|
// Spikes skip merge — they stay in 3_qa/ for human review.
|
|
if super::lifecycle::item_type_from_id(story_id) == "spike" {
|
|
// Mark the spike as held for review so auto-assign won't
|
|
// restart QA on it.
|
|
let qa_dir = project_root.join(".story_kit/work/3_qa");
|
|
let spike_path = qa_dir.join(format!("{story_id}.md"));
|
|
if let Err(e) = crate::io::story_metadata::write_review_hold(&spike_path) {
|
|
slog_error!("[pipeline] Failed to set review_hold on '{story_id}': {e}");
|
|
}
|
|
slog!(
|
|
"[pipeline] QA passed for spike '{story_id}'. \
|
|
Stopping for human review (skipping merge). \
|
|
Worktree preserved at: {worktree_path:?}"
|
|
);
|
|
// Free up the QA slot without advancing the spike.
|
|
self.auto_assign_available_work(&project_root).await;
|
|
} else {
|
|
slog!(
|
|
"[pipeline] QA passed gates and coverage for '{story_id}'. Moving to merge."
|
|
);
|
|
if let Err(e) = super::lifecycle::move_story_to_merge(&project_root, story_id) {
|
|
slog_error!("[pipeline] Failed to move '{story_id}' to 4_merge/: {e}");
|
|
return;
|
|
}
|
|
if let Err(e) = self
|
|
.start_agent(&project_root, story_id, Some("mergemaster"), None)
|
|
.await
|
|
{
|
|
slog_error!("[pipeline] Failed to start mergemaster for '{story_id}': {e}");
|
|
}
|
|
// QA slot is now free — pick up any other unassigned work in 3_qa/.
|
|
self.auto_assign_available_work(&project_root).await;
|
|
}
|
|
} else {
|
|
slog!(
|
|
"[pipeline] QA coverage gate failed for '{story_id}'. Restarting QA."
|
|
);
|
|
let context = format!(
|
|
"\n\n---\n## Coverage Gate Failed\n\
|
|
The coverage gate (script/test_coverage) failed with the following output:\n{}\n\n\
|
|
Please improve test coverage until the coverage gate passes.",
|
|
coverage_output
|
|
);
|
|
if let Err(e) = self
|
|
.start_agent(&project_root, story_id, Some("qa"), Some(&context))
|
|
.await
|
|
{
|
|
slog_error!("[pipeline] Failed to restart qa for '{story_id}': {e}");
|
|
}
|
|
}
|
|
} else {
|
|
slog!(
|
|
"[pipeline] QA failed gates for '{story_id}'. Restarting."
|
|
);
|
|
let context = format!(
|
|
"\n\n---\n## Previous QA Attempt Failed\n\
|
|
The acceptance gates failed with the following output:\n{}\n\n\
|
|
Please re-run and fix the issues.",
|
|
completion.gate_output
|
|
);
|
|
if let Err(e) = self
|
|
.start_agent(&project_root, story_id, Some("qa"), Some(&context))
|
|
.await
|
|
{
|
|
slog_error!("[pipeline] Failed to restart qa for '{story_id}': {e}");
|
|
}
|
|
}
|
|
}
|
|
PipelineStage::Mergemaster => {
|
|
// Block advancement if the mergemaster explicitly reported a failure.
|
|
// The server-owned gate check runs in the feature-branch worktree (not
|
|
// master), so `gates_passed=true` is misleading when no code was merged.
|
|
if merge_failure_reported {
|
|
slog!(
|
|
"[pipeline] Pipeline advancement blocked for '{story_id}': \
|
|
mergemaster explicitly reported a merge failure. \
|
|
Story stays in 4_merge/ for human review."
|
|
);
|
|
return;
|
|
}
|
|
|
|
// Run script/test on master (project_root) as the post-merge verification.
|
|
slog!(
|
|
"[pipeline] Mergemaster completed for '{story_id}'. Running post-merge tests on master."
|
|
);
|
|
let root = project_root.clone();
|
|
let test_result = tokio::task::spawn_blocking(move || super::gates::run_project_tests(&root))
|
|
.await
|
|
.unwrap_or_else(|e| {
|
|
slog_warn!("[pipeline] Post-merge test task panicked: {e}");
|
|
Ok((false, format!("Test task panicked: {e}")))
|
|
});
|
|
let (passed, output) = match test_result {
|
|
Ok(pair) => pair,
|
|
Err(e) => (false, e),
|
|
};
|
|
|
|
if passed {
|
|
slog!(
|
|
"[pipeline] Post-merge tests passed for '{story_id}'. Moving to done."
|
|
);
|
|
if let Err(e) = super::lifecycle::move_story_to_archived(&project_root, story_id) {
|
|
slog_error!("[pipeline] Failed to move '{story_id}' to done: {e}");
|
|
}
|
|
self.remove_agents_for_story(story_id);
|
|
// Mergemaster slot is now free — pick up any other items in 4_merge/.
|
|
self.auto_assign_available_work(&project_root).await;
|
|
// TODO: Re-enable worktree cleanup once we have persistent agent logs.
|
|
// Removing worktrees destroys evidence needed to debug empty-commit agents.
|
|
// let config =
|
|
// crate::config::ProjectConfig::load(&project_root).unwrap_or_default();
|
|
// if let Err(e) =
|
|
// worktree::remove_worktree_by_story_id(&project_root, story_id, &config)
|
|
// .await
|
|
// {
|
|
// slog!(
|
|
// "[pipeline] Failed to remove worktree for '{story_id}': {e}"
|
|
// );
|
|
// }
|
|
slog!(
|
|
"[pipeline] Story '{story_id}' done. Worktree preserved for inspection."
|
|
);
|
|
} else {
|
|
slog!(
|
|
"[pipeline] Post-merge tests failed for '{story_id}'. Restarting mergemaster."
|
|
);
|
|
let context = format!(
|
|
"\n\n---\n## Post-Merge Test Failed\n\
|
|
The tests on master failed with the following output:\n{}\n\n\
|
|
Please investigate and resolve the failures, then call merge_agent_work again.",
|
|
output
|
|
);
|
|
if let Err(e) = self
|
|
.start_agent(&project_root, story_id, Some("mergemaster"), Some(&context))
|
|
.await
|
|
{
|
|
slog_error!(
|
|
"[pipeline] Failed to restart mergemaster for '{story_id}': {e}"
|
|
);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Internal: report that an agent has finished work on a story.
|
|
///
|
|
/// **Note:** This is no longer exposed as an MCP tool. The server now
|
|
/// automatically runs completion gates when an agent process exits
|
|
/// (see `run_server_owned_completion`). This method is retained for
|
|
/// backwards compatibility and testing.
|
|
///
|
|
/// - Rejects with an error if the worktree has uncommitted changes.
|
|
/// - Runs acceptance gates (cargo clippy + cargo nextest run / cargo test).
|
|
/// - Stores the `CompletionReport` on the agent record.
|
|
/// - Transitions status to `Completed` (gates passed) or `Failed` (gates failed).
|
|
/// - Emits a `Done` event so `wait_for_agent` unblocks.
|
|
#[allow(dead_code)]
|
|
pub async fn report_completion(
|
|
&self,
|
|
story_id: &str,
|
|
agent_name: &str,
|
|
summary: &str,
|
|
) -> Result<CompletionReport, String> {
|
|
let key = composite_key(story_id, agent_name);
|
|
|
|
// Verify agent exists, is Running, and grab its worktree path.
|
|
let worktree_path = {
|
|
let agents = self.agents.lock().map_err(|e| e.to_string())?;
|
|
let agent = agents
|
|
.get(&key)
|
|
.ok_or_else(|| format!("No agent '{agent_name}' for story '{story_id}'"))?;
|
|
|
|
if agent.status != AgentStatus::Running {
|
|
return Err(format!(
|
|
"Agent '{agent_name}' for story '{story_id}' is not running (status: {}). \
|
|
report_completion can only be called by a running agent.",
|
|
agent.status
|
|
));
|
|
}
|
|
|
|
agent
|
|
.worktree_info
|
|
.as_ref()
|
|
.map(|wt| wt.path.clone())
|
|
.ok_or_else(|| {
|
|
format!(
|
|
"Agent '{agent_name}' for story '{story_id}' has no worktree. \
|
|
Cannot run acceptance gates."
|
|
)
|
|
})?
|
|
};
|
|
|
|
let path = worktree_path.clone();
|
|
|
|
// Run gate checks in a blocking thread to avoid stalling the async runtime.
|
|
let (gates_passed, gate_output) = tokio::task::spawn_blocking(move || {
|
|
// Step 1: Reject if worktree is dirty.
|
|
super::gates::check_uncommitted_changes(&path)?;
|
|
// Step 2: Run clippy + tests and return (passed, output).
|
|
super::gates::run_acceptance_gates(&path)
|
|
})
|
|
.await
|
|
.map_err(|e| format!("Gate check task panicked: {e}"))??;
|
|
|
|
let report = CompletionReport {
|
|
summary: summary.to_string(),
|
|
gates_passed,
|
|
gate_output,
|
|
};
|
|
|
|
// Extract data for pipeline advance, then remove the entry so
|
|
// completed agents never appear in list_agents.
|
|
let (tx, session_id, project_root_for_advance, wt_path_for_advance, merge_failure_reported_for_advance) = {
|
|
let mut agents = self.agents.lock().map_err(|e| e.to_string())?;
|
|
let agent = agents.get_mut(&key).ok_or_else(|| {
|
|
format!("Agent '{agent_name}' for story '{story_id}' disappeared during gate check")
|
|
})?;
|
|
agent.completion = Some(report.clone());
|
|
let tx = agent.tx.clone();
|
|
let sid = agent.session_id.clone();
|
|
let pr = agent.project_root.clone();
|
|
let wt = agent.worktree_info.as_ref().map(|w| w.path.clone());
|
|
let mfr = agent.merge_failure_reported;
|
|
agents.remove(&key);
|
|
(tx, sid, pr, wt, mfr)
|
|
};
|
|
|
|
// Emit Done so wait_for_agent unblocks.
|
|
let _ = tx.send(AgentEvent::Done {
|
|
story_id: story_id.to_string(),
|
|
agent_name: agent_name.to_string(),
|
|
session_id,
|
|
});
|
|
|
|
// Notify WebSocket clients that the agent is gone.
|
|
Self::notify_agent_state_changed(&self.watcher_tx);
|
|
|
|
// Advance the pipeline state machine in a background task.
|
|
let pool_clone = Self {
|
|
agents: Arc::clone(&self.agents),
|
|
port: self.port,
|
|
child_killers: Arc::clone(&self.child_killers),
|
|
watcher_tx: self.watcher_tx.clone(),
|
|
merge_jobs: Arc::clone(&self.merge_jobs),
|
|
};
|
|
let sid = story_id.to_string();
|
|
let aname = agent_name.to_string();
|
|
let report_for_advance = report.clone();
|
|
tokio::spawn(async move {
|
|
pool_clone
|
|
.run_pipeline_advance(
|
|
&sid,
|
|
&aname,
|
|
report_for_advance,
|
|
project_root_for_advance,
|
|
wt_path_for_advance,
|
|
merge_failure_reported_for_advance,
|
|
)
|
|
.await;
|
|
});
|
|
|
|
Ok(report)
|
|
}
|
|
|
|
/// Run the full mergemaster pipeline for a completed story:
|
|
///
|
|
/// 1. Squash-merge the story's feature branch into the current branch (master).
|
|
/// 2. If conflicts are found: abort the merge and report them.
|
|
/// 3. Quality gates run **inside the merge worktree** before master is touched.
|
|
/// 4. If gates pass: cherry-pick the squash commit onto master and archive the story.
|
|
///
|
|
/// Returns a `MergeReport` with full details of what happened.
|
|
/// Start the merge pipeline as a background task.
|
|
///
|
|
/// Returns immediately so the MCP tool call doesn't time out (the full
|
|
/// pipeline — squash merge + quality gates — takes well over 60 seconds,
|
|
/// exceeding Claude Code's MCP tool-call timeout).
|
|
///
|
|
/// The mergemaster agent should poll [`get_merge_status`](Self::get_merge_status)
|
|
/// until the job reaches a terminal state.
|
|
pub fn start_merge_agent_work(
|
|
self: &Arc<Self>,
|
|
project_root: &Path,
|
|
story_id: &str,
|
|
) -> Result<(), String> {
|
|
// Guard against double-starts.
|
|
{
|
|
let jobs = self.merge_jobs.lock().map_err(|e| e.to_string())?;
|
|
if let Some(job) = jobs.get(story_id)
|
|
&& matches!(job.status, super::merge::MergeJobStatus::Running)
|
|
{
|
|
return Err(format!(
|
|
"Merge already in progress for '{story_id}'. \
|
|
Use get_merge_status to poll for completion."
|
|
));
|
|
}
|
|
}
|
|
|
|
// Insert Running job.
|
|
{
|
|
let mut jobs = self.merge_jobs.lock().map_err(|e| e.to_string())?;
|
|
jobs.insert(
|
|
story_id.to_string(),
|
|
super::merge::MergeJob {
|
|
story_id: story_id.to_string(),
|
|
status: super::merge::MergeJobStatus::Running,
|
|
},
|
|
);
|
|
}
|
|
|
|
let pool = Arc::clone(self);
|
|
let root = project_root.to_path_buf();
|
|
let sid = story_id.to_string();
|
|
|
|
tokio::spawn(async move {
|
|
let report = pool.run_merge_pipeline(&root, &sid).await;
|
|
let failed = report.is_err();
|
|
let status = match report {
|
|
Ok(r) => super::merge::MergeJobStatus::Completed(r),
|
|
Err(e) => super::merge::MergeJobStatus::Failed(e),
|
|
};
|
|
if let Ok(mut jobs) = pool.merge_jobs.lock()
|
|
&& let Some(job) = jobs.get_mut(&sid)
|
|
{
|
|
job.status = status;
|
|
}
|
|
if failed {
|
|
pool.auto_assign_available_work(&root).await;
|
|
}
|
|
});
|
|
|
|
Ok(())
|
|
}
|
|
|
|
/// The actual merge pipeline, run inside a background task.
|
|
async fn run_merge_pipeline(
|
|
self: &Arc<Self>,
|
|
project_root: &Path,
|
|
story_id: &str,
|
|
) -> Result<super::merge::MergeReport, String> {
|
|
let branch = format!("feature/story-{story_id}");
|
|
let wt_path = worktree::worktree_path(project_root, story_id);
|
|
let root = project_root.to_path_buf();
|
|
let sid = story_id.to_string();
|
|
let br = branch.clone();
|
|
|
|
let merge_result =
|
|
tokio::task::spawn_blocking(move || super::merge::run_squash_merge(&root, &br, &sid))
|
|
.await
|
|
.map_err(|e| format!("Merge task panicked: {e}"))??;
|
|
|
|
if !merge_result.success {
|
|
return Ok(super::merge::MergeReport {
|
|
story_id: story_id.to_string(),
|
|
success: false,
|
|
had_conflicts: merge_result.had_conflicts,
|
|
conflicts_resolved: merge_result.conflicts_resolved,
|
|
conflict_details: merge_result.conflict_details,
|
|
gates_passed: merge_result.gates_passed,
|
|
gate_output: merge_result.output,
|
|
worktree_cleaned_up: false,
|
|
story_archived: false,
|
|
});
|
|
}
|
|
|
|
let story_archived = super::lifecycle::move_story_to_archived(project_root, story_id).is_ok();
|
|
if story_archived {
|
|
self.remove_agents_for_story(story_id);
|
|
}
|
|
|
|
let worktree_cleaned_up = if wt_path.exists() {
|
|
let config = crate::config::ProjectConfig::load(project_root)
|
|
.unwrap_or_default();
|
|
worktree::remove_worktree_by_story_id(project_root, story_id, &config)
|
|
.await
|
|
.is_ok()
|
|
} else {
|
|
false
|
|
};
|
|
|
|
self.auto_assign_available_work(project_root).await;
|
|
|
|
Ok(super::merge::MergeReport {
|
|
story_id: story_id.to_string(),
|
|
success: true,
|
|
had_conflicts: merge_result.had_conflicts,
|
|
conflicts_resolved: merge_result.conflicts_resolved,
|
|
conflict_details: merge_result.conflict_details,
|
|
gates_passed: true,
|
|
gate_output: merge_result.output,
|
|
worktree_cleaned_up,
|
|
story_archived,
|
|
})
|
|
}
|
|
|
|
/// Check the status of a background merge job.
|
|
pub fn get_merge_status(&self, story_id: &str) -> Option<super::merge::MergeJob> {
|
|
self.merge_jobs
|
|
.lock()
|
|
.ok()
|
|
.and_then(|jobs| jobs.get(story_id).cloned())
|
|
}
|
|
|
|
/// Return the port this server is running on.
|
|
pub fn port(&self) -> u16 {
|
|
self.port
|
|
}
|
|
|
|
/// Get project root helper.
|
|
pub fn get_project_root(
|
|
&self,
|
|
state: &crate::state::SessionState,
|
|
) -> Result<PathBuf, String> {
|
|
state.get_project_root()
|
|
}
|
|
|
|
/// Get the log session ID and project root for an agent, if available.
|
|
///
|
|
/// Used by MCP tools to find the persistent log file for a completed agent.
|
|
pub fn get_log_info(
|
|
&self,
|
|
story_id: &str,
|
|
agent_name: &str,
|
|
) -> Option<(String, PathBuf)> {
|
|
let key = composite_key(story_id, agent_name);
|
|
let agents = self.agents.lock().ok()?;
|
|
let agent = agents.get(&key)?;
|
|
let session_id = agent.log_session_id.clone()?;
|
|
let project_root = agent.project_root.clone()?;
|
|
Some((session_id, project_root))
|
|
}
|
|
|
|
/// Record that the mergemaster agent for `story_id` explicitly reported a
|
|
/// merge failure via the `report_merge_failure` MCP tool.
|
|
///
|
|
/// Sets `merge_failure_reported = true` on the active mergemaster agent so
|
|
/// that `run_pipeline_advance` can block advancement to `5_done/` even when
|
|
/// the server-owned gate check returns `gates_passed=true` (those gates run
|
|
/// in the feature-branch worktree, not on master).
|
|
pub fn set_merge_failure_reported(&self, story_id: &str) {
|
|
match self.agents.lock() {
|
|
Ok(mut lock) => {
|
|
let found = lock.iter_mut().find(|(key, agent)| {
|
|
let key_story_id = key
|
|
.rsplit_once(':')
|
|
.map(|(sid, _)| sid)
|
|
.unwrap_or(key.as_str());
|
|
key_story_id == story_id
|
|
&& pipeline_stage(&agent.agent_name) == PipelineStage::Mergemaster
|
|
});
|
|
match found {
|
|
Some((_, agent)) => {
|
|
agent.merge_failure_reported = true;
|
|
slog!(
|
|
"[pipeline] Merge failure flag set for '{story_id}:{}'",
|
|
agent.agent_name
|
|
);
|
|
}
|
|
None => {
|
|
slog_warn!(
|
|
"[pipeline] set_merge_failure_reported: no running mergemaster found \
|
|
for story '{story_id}' — flag not set"
|
|
);
|
|
}
|
|
}
|
|
}
|
|
Err(e) => {
|
|
slog_error!(
|
|
"[pipeline] set_merge_failure_reported: could not lock agents: {e}"
|
|
);
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Test helper: put a hand-built agent entry into the pool so unit tests
/// can exercise wait/subscribe logic without spawning a real process.
///
/// The entry has the given `status` and no worktree, session, task handle,
/// completion report, project root or log session. Returns the sender
/// half of the entry's event channel.
#[cfg(test)]
pub fn inject_test_agent(
    &self,
    story_id: &str,
    agent_name: &str,
    status: AgentStatus,
) -> broadcast::Sender<AgentEvent> {
    let (tx, _) = broadcast::channel::<AgentEvent>(64);
    let entry = StoryAgent {
        agent_name: agent_name.to_string(),
        status,
        worktree_info: None,
        session_id: None,
        tx: tx.clone(),
        task_handle: None,
        event_log: Arc::new(Mutex::new(Vec::new())),
        completion: None,
        project_root: None,
        log_session_id: None,
        merge_failure_reported: false,
    };
    self.agents
        .lock()
        .unwrap()
        .insert(composite_key(story_id, agent_name), entry);
    tx
}
|
|
|
|
/// Test helper: put an agent entry with a specific worktree path into the
/// pool, for testing gate-related logic.
///
/// The worktree is recorded with branch `feature/story-{story_id}` and
/// base branch `master`; every other optional field is left empty.
/// Returns the sender half of the entry's event channel.
#[cfg(test)]
pub fn inject_test_agent_with_path(
    &self,
    story_id: &str,
    agent_name: &str,
    status: AgentStatus,
    worktree_path: PathBuf,
) -> broadcast::Sender<AgentEvent> {
    let (tx, _) = broadcast::channel::<AgentEvent>(64);
    let worktree = WorktreeInfo {
        path: worktree_path,
        branch: format!("feature/story-{story_id}"),
        base_branch: "master".to_string(),
    };
    let entry = StoryAgent {
        agent_name: agent_name.to_string(),
        status,
        worktree_info: Some(worktree),
        session_id: None,
        tx: tx.clone(),
        task_handle: None,
        event_log: Arc::new(Mutex::new(Vec::new())),
        completion: None,
        project_root: None,
        log_session_id: None,
        merge_failure_reported: false,
    };
    self.agents
        .lock()
        .unwrap()
        .insert(composite_key(story_id, agent_name), entry);
    tx
}
|
|
|
|
/// Automatically assign free agents to stories waiting in the active pipeline stages.
|
|
///
|
|
/// Scans `work/2_current/`, `work/3_qa/`, and `work/4_merge/` for items that have no
|
|
/// active agent and assigns the first free agent of the appropriate role. Items in
|
|
/// `work/1_upcoming/` are never auto-started.
|
|
///
|
|
/// Respects the configured agent roster: the maximum number of concurrently active agents
|
|
/// per role is bounded by the count of agents of that role defined in `project.toml`.
|
|
pub async fn auto_assign_available_work(&self, project_root: &Path) {
|
|
let config = match ProjectConfig::load(project_root) {
|
|
Ok(c) => c,
|
|
Err(e) => {
|
|
slog_warn!("[auto-assign] Failed to load project config: {e}");
|
|
return;
|
|
}
|
|
};
|
|
|
|
// Process each active pipeline stage in order.
|
|
let stages: [(&str, PipelineStage); 3] = [
|
|
("2_current", PipelineStage::Coder),
|
|
("3_qa", PipelineStage::Qa),
|
|
("4_merge", PipelineStage::Mergemaster),
|
|
];
|
|
|
|
for (stage_dir, stage) in &stages {
|
|
let items = scan_stage_items(project_root, stage_dir);
|
|
if items.is_empty() {
|
|
continue;
|
|
}
|
|
|
|
for story_id in &items {
|
|
// Items marked with review_hold (e.g. spikes after QA passes) stay
|
|
// in their current stage for human review — don't auto-assign agents.
|
|
if has_review_hold(project_root, stage_dir, story_id) {
|
|
continue;
|
|
}
|
|
|
|
// Re-acquire the lock on each iteration to see state changes
|
|
// from previous start_agent calls in the same pass.
|
|
let preferred_agent =
|
|
read_story_front_matter_agent(project_root, stage_dir, story_id);
|
|
|
|
// Outcome: (already_assigned, chosen_agent, preferred_busy)
|
|
// preferred_busy=true means the story has a specific agent requested but it is
|
|
// currently occupied — the story should wait rather than fall back.
|
|
let (already_assigned, free_agent, preferred_busy) = {
|
|
let agents = match self.agents.lock() {
|
|
Ok(a) => a,
|
|
Err(e) => {
|
|
slog_error!("[auto-assign] Failed to lock agents: {e}");
|
|
break;
|
|
}
|
|
};
|
|
let assigned = is_story_assigned_for_stage(&config, &agents, story_id, stage);
|
|
if assigned {
|
|
(true, None, false)
|
|
} else if let Some(ref pref) = preferred_agent {
|
|
// Story has a front-matter agent preference.
|
|
if is_agent_free(&agents, pref) {
|
|
(false, Some(pref.clone()), false)
|
|
} else {
|
|
(false, None, true)
|
|
}
|
|
} else {
|
|
let free = find_free_agent_for_stage(&config, &agents, stage)
|
|
.map(|s| s.to_string());
|
|
(false, free, false)
|
|
}
|
|
};
|
|
|
|
if already_assigned {
|
|
// Story already has an active agent — skip silently.
|
|
continue;
|
|
}
|
|
|
|
if preferred_busy {
|
|
// The story requests a specific agent that is currently busy.
|
|
// Do not fall back to a different agent; let this story wait.
|
|
slog!(
|
|
"[auto-assign] Preferred agent '{}' busy for '{story_id}'; story will wait.",
|
|
preferred_agent.as_deref().unwrap_or("?")
|
|
);
|
|
continue;
|
|
}
|
|
|
|
match free_agent {
|
|
Some(agent_name) => {
|
|
slog!(
|
|
"[auto-assign] Assigning '{agent_name}' to '{story_id}' in {stage_dir}/"
|
|
);
|
|
if let Err(e) = self
|
|
.start_agent(project_root, story_id, Some(&agent_name), None)
|
|
.await
|
|
{
|
|
slog!(
|
|
"[auto-assign] Failed to start '{agent_name}' for '{story_id}': {e}"
|
|
);
|
|
}
|
|
}
|
|
None => {
|
|
// No free agents of this type — stop scanning this stage.
|
|
slog!(
|
|
"[auto-assign] All {:?} agents busy; remaining items in {stage_dir}/ will wait.",
|
|
stage
|
|
);
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Reconcile stories whose agent work was committed while the server was offline.
|
|
///
|
|
/// On server startup the in-memory agent pool is empty, so any story that an agent
|
|
/// completed during a previous session is stuck: the worktree has committed work but
|
|
/// the pipeline never advanced. This method detects those stories, re-runs the
|
|
/// acceptance gates, and advances the pipeline stage so that `auto_assign_available_work`
|
|
/// (called immediately after) picks up the right next-stage agents.
|
|
///
|
|
/// Algorithm:
|
|
/// 1. List all worktree directories under `{project_root}/.story_kit/worktrees/`.
|
|
/// 2. For each worktree, check whether its feature branch has commits ahead of the
|
|
/// base branch (`master` / `main`).
|
|
/// 3. If committed work is found AND the story is in `2_current/` or `3_qa/`:
|
|
/// - Run acceptance gates (uncommitted-change check + clippy + tests).
|
|
/// - On pass + `2_current/`: move the story to `3_qa/`.
|
|
/// - On pass + `3_qa/`: run the coverage gate; if that also passes move to `4_merge/`.
|
|
/// - On failure: leave the story where it is so `auto_assign_available_work` can
|
|
/// start a fresh agent to retry.
|
|
/// 4. Stories in `4_merge/` are left for `auto_assign_available_work` to handle via a
|
|
/// fresh mergemaster (squash-merge must be re-executed by the mergemaster agent).
|
|
pub async fn reconcile_on_startup(
|
|
&self,
|
|
project_root: &Path,
|
|
progress_tx: &broadcast::Sender<ReconciliationEvent>,
|
|
) {
|
|
let worktrees = match worktree::list_worktrees(project_root) {
|
|
Ok(wt) => wt,
|
|
Err(e) => {
|
|
eprintln!("[startup:reconcile] Failed to list worktrees: {e}");
|
|
let _ = progress_tx.send(ReconciliationEvent {
|
|
story_id: String::new(),
|
|
status: "done".to_string(),
|
|
message: format!("Reconciliation failed: {e}"),
|
|
});
|
|
return;
|
|
}
|
|
};
|
|
|
|
for wt_entry in &worktrees {
|
|
let story_id = &wt_entry.story_id;
|
|
let wt_path = wt_entry.path.clone();
|
|
|
|
// Determine which active stage the story is in.
|
|
let stage_dir = match find_active_story_stage(project_root, story_id) {
|
|
Some(s) => s,
|
|
None => continue, // Not in any active stage (upcoming/archived or unknown).
|
|
};
|
|
|
|
// 4_merge/ is left for auto_assign to handle with a fresh mergemaster.
|
|
if stage_dir == "4_merge" {
|
|
continue;
|
|
}
|
|
|
|
let _ = progress_tx.send(ReconciliationEvent {
|
|
story_id: story_id.clone(),
|
|
status: "checking".to_string(),
|
|
message: format!("Checking for committed work in {stage_dir}/"),
|
|
});
|
|
|
|
// Check whether the worktree has commits ahead of the base branch.
|
|
let wt_path_for_check = wt_path.clone();
|
|
let has_work = tokio::task::spawn_blocking(move || {
|
|
super::gates::worktree_has_committed_work(&wt_path_for_check)
|
|
})
|
|
.await
|
|
.unwrap_or(false);
|
|
|
|
if !has_work {
|
|
eprintln!(
|
|
"[startup:reconcile] No committed work for '{story_id}' in {stage_dir}/; skipping."
|
|
);
|
|
let _ = progress_tx.send(ReconciliationEvent {
|
|
story_id: story_id.clone(),
|
|
status: "skipped".to_string(),
|
|
message: "No committed work found; skipping.".to_string(),
|
|
});
|
|
continue;
|
|
}
|
|
|
|
eprintln!(
|
|
"[startup:reconcile] Found committed work for '{story_id}' in {stage_dir}/. Running acceptance gates."
|
|
);
|
|
let _ = progress_tx.send(ReconciliationEvent {
|
|
story_id: story_id.clone(),
|
|
status: "gates_running".to_string(),
|
|
message: "Running acceptance gates…".to_string(),
|
|
});
|
|
|
|
// Run acceptance gates on the worktree.
|
|
let wt_path_for_gates = wt_path.clone();
|
|
let gates_result = tokio::task::spawn_blocking(move || {
|
|
super::gates::check_uncommitted_changes(&wt_path_for_gates)?;
|
|
super::gates::run_acceptance_gates(&wt_path_for_gates)
|
|
})
|
|
.await;
|
|
|
|
let (gates_passed, gate_output) = match gates_result {
|
|
Ok(Ok(pair)) => pair,
|
|
Ok(Err(e)) => {
|
|
eprintln!("[startup:reconcile] Gate check error for '{story_id}': {e}");
|
|
let _ = progress_tx.send(ReconciliationEvent {
|
|
story_id: story_id.clone(),
|
|
status: "failed".to_string(),
|
|
message: format!("Gate error: {e}"),
|
|
});
|
|
continue;
|
|
}
|
|
Err(e) => {
|
|
eprintln!(
|
|
"[startup:reconcile] Gate check task panicked for '{story_id}': {e}"
|
|
);
|
|
let _ = progress_tx.send(ReconciliationEvent {
|
|
story_id: story_id.clone(),
|
|
status: "failed".to_string(),
|
|
message: format!("Gate task panicked: {e}"),
|
|
});
|
|
continue;
|
|
}
|
|
};
|
|
|
|
if !gates_passed {
|
|
eprintln!(
|
|
"[startup:reconcile] Gates failed for '{story_id}': {gate_output}\n\
|
|
Leaving in {stage_dir}/ for auto-assign to restart the agent."
|
|
);
|
|
let _ = progress_tx.send(ReconciliationEvent {
|
|
story_id: story_id.clone(),
|
|
status: "failed".to_string(),
|
|
message: "Gates failed; will be retried by auto-assign.".to_string(),
|
|
});
|
|
continue;
|
|
}
|
|
|
|
eprintln!(
|
|
"[startup:reconcile] Gates passed for '{story_id}' (stage: {stage_dir}/)."
|
|
);
|
|
|
|
if stage_dir == "2_current" {
|
|
// Coder stage → advance to QA.
|
|
if let Err(e) = super::lifecycle::move_story_to_qa(project_root, story_id) {
|
|
eprintln!("[startup:reconcile] Failed to move '{story_id}' to 3_qa/: {e}");
|
|
let _ = progress_tx.send(ReconciliationEvent {
|
|
story_id: story_id.clone(),
|
|
status: "failed".to_string(),
|
|
message: format!("Failed to advance to QA: {e}"),
|
|
});
|
|
} else {
|
|
eprintln!("[startup:reconcile] Moved '{story_id}' → 3_qa/.");
|
|
let _ = progress_tx.send(ReconciliationEvent {
|
|
story_id: story_id.clone(),
|
|
status: "advanced".to_string(),
|
|
message: "Gates passed — moved to QA.".to_string(),
|
|
});
|
|
}
|
|
} else if stage_dir == "3_qa" {
|
|
// QA stage → run coverage gate before advancing to merge.
|
|
let wt_path_for_cov = wt_path.clone();
|
|
let coverage_result =
|
|
tokio::task::spawn_blocking(move || super::gates::run_coverage_gate(&wt_path_for_cov))
|
|
.await;
|
|
|
|
let (coverage_passed, coverage_output) = match coverage_result {
|
|
Ok(Ok(pair)) => pair,
|
|
Ok(Err(e)) => {
|
|
eprintln!(
|
|
"[startup:reconcile] Coverage gate error for '{story_id}': {e}"
|
|
);
|
|
let _ = progress_tx.send(ReconciliationEvent {
|
|
story_id: story_id.clone(),
|
|
status: "failed".to_string(),
|
|
message: format!("Coverage gate error: {e}"),
|
|
});
|
|
continue;
|
|
}
|
|
Err(e) => {
|
|
eprintln!(
|
|
"[startup:reconcile] Coverage gate panicked for '{story_id}': {e}"
|
|
);
|
|
let _ = progress_tx.send(ReconciliationEvent {
|
|
story_id: story_id.clone(),
|
|
status: "failed".to_string(),
|
|
message: format!("Coverage gate panicked: {e}"),
|
|
});
|
|
continue;
|
|
}
|
|
};
|
|
|
|
if coverage_passed {
|
|
// Spikes skip the merge stage — stay in 3_qa/ for human review.
|
|
if super::lifecycle::item_type_from_id(story_id) == "spike" {
|
|
let spike_path = project_root
|
|
.join(".story_kit/work/3_qa")
|
|
.join(format!("{story_id}.md"));
|
|
if let Err(e) = crate::io::story_metadata::write_review_hold(&spike_path) {
|
|
eprintln!(
|
|
"[startup:reconcile] Failed to set review_hold on spike '{story_id}': {e}"
|
|
);
|
|
}
|
|
eprintln!(
|
|
"[startup:reconcile] Spike '{story_id}' passed QA — holding for human review."
|
|
);
|
|
let _ = progress_tx.send(ReconciliationEvent {
|
|
story_id: story_id.clone(),
|
|
status: "review_hold".to_string(),
|
|
message: "Spike passed QA — waiting for human review.".to_string(),
|
|
});
|
|
} else if let Err(e) = super::lifecycle::move_story_to_merge(project_root, story_id) {
|
|
eprintln!(
|
|
"[startup:reconcile] Failed to move '{story_id}' to 4_merge/: {e}"
|
|
);
|
|
let _ = progress_tx.send(ReconciliationEvent {
|
|
story_id: story_id.clone(),
|
|
status: "failed".to_string(),
|
|
message: format!("Failed to advance to merge: {e}"),
|
|
});
|
|
} else {
|
|
eprintln!("[startup:reconcile] Moved '{story_id}' → 4_merge/.");
|
|
let _ = progress_tx.send(ReconciliationEvent {
|
|
story_id: story_id.clone(),
|
|
status: "advanced".to_string(),
|
|
message: "Gates passed — moved to merge.".to_string(),
|
|
});
|
|
}
|
|
} else {
|
|
eprintln!(
|
|
"[startup:reconcile] Coverage gate failed for '{story_id}': {coverage_output}\n\
|
|
Leaving in 3_qa/ for auto-assign to restart the QA agent."
|
|
);
|
|
let _ = progress_tx.send(ReconciliationEvent {
|
|
story_id: story_id.clone(),
|
|
status: "failed".to_string(),
|
|
message: "Coverage gate failed; will be retried.".to_string(),
|
|
});
|
|
}
|
|
}
|
|
}
|
|
|
|
// Signal that reconciliation is complete.
|
|
let _ = progress_tx.send(ReconciliationEvent {
|
|
story_id: String::new(),
|
|
status: "done".to_string(),
|
|
message: "Startup reconciliation complete.".to_string(),
|
|
});
|
|
}
|
|
|
|
/// Test helper: put an agent entry that already carries a completion report
/// and a project root into the pool, for testing pipeline advance logic
/// without spawning real agents.
///
/// Returns the sender half of the entry's event channel.
#[cfg(test)]
pub fn inject_test_agent_with_completion(
    &self,
    story_id: &str,
    agent_name: &str,
    status: AgentStatus,
    project_root: PathBuf,
    completion: CompletionReport,
) -> broadcast::Sender<AgentEvent> {
    let (tx, _) = broadcast::channel::<AgentEvent>(64);
    let entry = StoryAgent {
        agent_name: agent_name.to_string(),
        status,
        worktree_info: None,
        session_id: None,
        tx: tx.clone(),
        task_handle: None,
        event_log: Arc::new(Mutex::new(Vec::new())),
        completion: Some(completion),
        project_root: Some(project_root),
        log_session_id: None,
        merge_failure_reported: false,
    };
    self.agents
        .lock()
        .unwrap()
        .insert(composite_key(story_id, agent_name), entry);
    tx
}
|
|
|
|
/// Test helper: put an agent entry with a pre-built (possibly already
/// finished) task handle into the pool. Used by watchdog tests to simulate
/// an orphaned agent.
///
/// The entry gets whatever `status` the caller passes. Returns the sender
/// half of the entry's event channel.
#[cfg(test)]
pub fn inject_test_agent_with_handle(
    &self,
    story_id: &str,
    agent_name: &str,
    status: AgentStatus,
    task_handle: tokio::task::JoinHandle<()>,
) -> broadcast::Sender<AgentEvent> {
    let (tx, _) = broadcast::channel::<AgentEvent>(64);
    let entry = StoryAgent {
        agent_name: agent_name.to_string(),
        status,
        worktree_info: None,
        session_id: None,
        tx: tx.clone(),
        task_handle: Some(task_handle),
        event_log: Arc::new(Mutex::new(Vec::new())),
        completion: None,
        project_root: None,
        log_session_id: None,
        merge_failure_reported: false,
    };
    self.agents
        .lock()
        .unwrap()
        .insert(composite_key(story_id, agent_name), entry);
    tx
}
|
|
|
|
/// Test helper: register `killer` in the child-killer registry under `key`,
/// replacing any killer already stored for that key.
#[cfg(test)]
pub fn inject_child_killer(&self, key: &str, killer: Box<dyn ChildKiller + Send + Sync>) {
    self.child_killers
        .lock()
        .unwrap()
        .insert(key.to_string(), killer);
}
|
|
|
|
/// Test helper: how many child killers are currently registered.
#[cfg(test)]
pub fn child_killer_count(&self) -> usize {
    let killers = self.child_killers.lock().unwrap();
    killers.len()
}
|
|
|
|
/// Test helper: perform exactly one orphaned-agent check, synchronously.
/// The result of the check is discarded.
#[cfg(test)]
pub fn run_watchdog_once(&self) {
    let _ = check_orphaned_agents(&self.agents);
}
|
|
|
|
/// Spawn a background watchdog task that periodically checks for Running agents
|
|
/// whose underlying task has already finished (orphaned entries). Any such agent
|
|
/// is marked Failed and an Error event is emitted so that `wait_for_agent` unblocks.
|
|
///
|
|
/// The watchdog runs every 30 seconds. It is a safety net for edge cases where the
|
|
/// PTY read loop exits without updating the agent status (e.g. a panic in the
|
|
/// spawn_blocking task, or an external SIGKILL that closes the PTY fd immediately).
|
|
///
|
|
/// When orphaned agents are detected and a `project_root` is provided, auto-assign
|
|
/// is triggered so that free agents can pick up unassigned work.
|
|
pub fn spawn_watchdog(pool: Arc<AgentPool>, project_root: Option<PathBuf>) {
|
|
tokio::spawn(async move {
|
|
let mut interval =
|
|
tokio::time::interval(std::time::Duration::from_secs(30));
|
|
loop {
|
|
interval.tick().await;
|
|
let found = check_orphaned_agents(&pool.agents);
|
|
if found > 0
|
|
&& let Some(ref root) = project_root
|
|
{
|
|
slog!(
|
|
"[watchdog] {found} orphaned agent(s) detected; triggering auto-assign."
|
|
);
|
|
pool.auto_assign_available_work(root).await;
|
|
}
|
|
}
|
|
});
|
|
}
|
|
|
|
/// Remove all agent entries for a given story_id from the pool.
|
|
///
|
|
/// Called when a story is archived so that stale entries don't accumulate.
|
|
/// Returns the number of entries removed.
|
|
pub fn remove_agents_for_story(&self, story_id: &str) -> usize {
|
|
let mut agents = match self.agents.lock() {
|
|
Ok(a) => a,
|
|
Err(e) => {
|
|
slog_error!("[agents] Failed to lock pool for cleanup of '{story_id}': {e}");
|
|
return 0;
|
|
}
|
|
};
|
|
let prefix = format!("{story_id}:");
|
|
let keys_to_remove: Vec<String> = agents
|
|
.keys()
|
|
.filter(|k| k.starts_with(&prefix))
|
|
.cloned()
|
|
.collect();
|
|
let count = keys_to_remove.len();
|
|
for key in &keys_to_remove {
|
|
agents.remove(key);
|
|
}
|
|
if count > 0 {
|
|
slog!("[agents] Removed {count} agent entries for archived story '{story_id}'");
|
|
}
|
|
count
|
|
}
|
|
}
|
|
|
|
/// Locate the active pipeline stage that currently holds `story_id`.
///
/// Checks `2_current`, `3_qa` and `4_merge` (in that order) under
/// `<project_root>/.story_kit/work/` for a file named `<story_id>.md` and
/// returns the directory name of the first stage that has it, or `None` when
/// the story is in none of them.
fn find_active_story_stage(project_root: &Path, story_id: &str) -> Option<&'static str> {
    const STAGES: [&str; 3] = ["2_current", "3_qa", "4_merge"];

    let work_dir = project_root.join(".story_kit").join("work");
    let file_name = format!("{story_id}.md");

    STAGES
        .iter()
        .copied()
        .find(|stage| work_dir.join(stage).join(&file_name).exists())
}
|
|
|
|
/// Scan a work pipeline stage directory and return story IDs, sorted alphabetically.
|
|
/// Returns an empty `Vec` if the directory does not exist.
|
|
/// Read the optional `agent:` field from the front matter of a story file.
|
|
///
|
|
/// Returns `Some(agent_name)` if the front matter specifies an agent, or `None`
|
|
/// if the field is absent or the file cannot be read / parsed.
|
|
fn read_story_front_matter_agent(project_root: &Path, stage_dir: &str, story_id: &str) -> Option<String> {
|
|
use crate::io::story_metadata::parse_front_matter;
|
|
let path = project_root
|
|
.join(".story_kit")
|
|
.join("work")
|
|
.join(stage_dir)
|
|
.join(format!("{story_id}.md"));
|
|
let contents = std::fs::read_to_string(path).ok()?;
|
|
parse_front_matter(&contents).ok()?.agent
|
|
}
|
|
|
|
/// Return `true` if the story file in the given stage has `review_hold: true` in its front matter.
|
|
fn has_review_hold(project_root: &Path, stage_dir: &str, story_id: &str) -> bool {
|
|
use crate::io::story_metadata::parse_front_matter;
|
|
let path = project_root
|
|
.join(".story_kit")
|
|
.join("work")
|
|
.join(stage_dir)
|
|
.join(format!("{story_id}.md"));
|
|
let contents = match std::fs::read_to_string(path) {
|
|
Ok(c) => c,
|
|
Err(_) => return false,
|
|
};
|
|
parse_front_matter(&contents)
|
|
.ok()
|
|
.and_then(|m| m.review_hold)
|
|
.unwrap_or(false)
|
|
}
|
|
|
|
/// Return `true` if `agent_name` has no active (pending/running) entry in the pool.
|
|
fn is_agent_free(agents: &HashMap<String, StoryAgent>, agent_name: &str) -> bool {
|
|
!agents.values().any(|a| {
|
|
a.agent_name == agent_name
|
|
&& matches!(a.status, AgentStatus::Running | AgentStatus::Pending)
|
|
})
|
|
}
|
|
|
|
/// Scan a work pipeline stage directory and return story IDs, sorted
/// lexicographically (plain string order, so `10_…` sorts before `5_…`).
///
/// The directory scanned is `<project_root>/.story_kit/work/<stage_dir>`. A
/// story ID is the file stem of each `*.md` entry; other files are ignored.
///
/// Returns an empty `Vec` if the directory does not exist or cannot be read.
fn scan_stage_items(project_root: &Path, stage_dir: &str) -> Vec<String> {
    let dir = project_root
        .join(".story_kit")
        .join("work")
        .join(stage_dir);

    // `read_dir` already fails for a missing or non-directory path, so no
    // separate `is_dir()` probe is needed.
    let Ok(entries) = std::fs::read_dir(&dir) else {
        return Vec::new();
    };

    let mut items: Vec<String> = entries
        .flatten()
        .filter_map(|entry| {
            let path = entry.path();
            if path.extension().and_then(|ext| ext.to_str()) != Some("md") {
                return None;
            }
            path.file_stem()
                .and_then(|stem| stem.to_str())
                .map(str::to_string)
        })
        .collect();

    items.sort_unstable();
    items
}
|
|
|
|
/// Return `true` if `story_id` has any active (pending/running) agent matching `stage`.
|
|
///
|
|
/// Uses the explicit `stage` config field when the agent is found in `config`;
|
|
/// falls back to the legacy name-based heuristic for unlisted agents.
|
|
fn is_story_assigned_for_stage(
|
|
config: &ProjectConfig,
|
|
agents: &HashMap<String, StoryAgent>,
|
|
story_id: &str,
|
|
stage: &PipelineStage,
|
|
) -> bool {
|
|
agents.iter().any(|(key, agent)| {
|
|
// Composite key format: "{story_id}:{agent_name}"
|
|
let key_story_id = key.rsplit_once(':').map(|(sid, _)| sid).unwrap_or(key);
|
|
let agent_stage = config
|
|
.find_agent(&agent.agent_name)
|
|
.map(agent_config_stage)
|
|
.unwrap_or_else(|| pipeline_stage(&agent.agent_name));
|
|
key_story_id == story_id
|
|
&& agent_stage == *stage
|
|
&& matches!(agent.status, AgentStatus::Running | AgentStatus::Pending)
|
|
})
|
|
}
|
|
|
|
/// Find the first configured agent for `stage` that has no active (pending/running) assignment.
|
|
/// Returns `None` if all agents for that stage are busy or none are configured.
|
|
/// Uses the agent's explicit `stage` config field (preferred) or falls back to name-based detection.
|
|
fn find_free_agent_for_stage<'a>(
|
|
config: &'a ProjectConfig,
|
|
agents: &HashMap<String, StoryAgent>,
|
|
stage: &PipelineStage,
|
|
) -> Option<&'a str> {
|
|
for agent_config in &config.agent {
|
|
if agent_config_stage(agent_config) != *stage {
|
|
continue;
|
|
}
|
|
let is_busy = agents.values().any(|a| {
|
|
a.agent_name == agent_config.name
|
|
&& matches!(a.status, AgentStatus::Running | AgentStatus::Pending)
|
|
});
|
|
if !is_busy {
|
|
return Some(&agent_config.name);
|
|
}
|
|
}
|
|
None
|
|
}
|
|
|
|
/// Scan the agent pool for Running entries whose backing tokio task has already
|
|
/// finished and mark them as Failed.
|
|
///
|
|
/// This handles the case where the PTY read loop or the spawned task exits
|
|
/// without updating the agent status — for example when the process is killed
|
|
/// externally and the PTY master fd returns EOF before our inactivity timeout
|
|
/// fires, but some other edge case prevents the normal cleanup path from running.
|
|
fn check_orphaned_agents(agents: &Mutex<HashMap<String, StoryAgent>>) -> usize {
|
|
let mut lock = match agents.lock() {
|
|
Ok(l) => l,
|
|
Err(_) => return 0,
|
|
};
|
|
|
|
// Collect orphaned entries: Running or Pending agents whose task handle is finished.
|
|
// Pending agents can be orphaned if worktree creation panics before setting status.
|
|
let orphaned: Vec<(String, String, broadcast::Sender<AgentEvent>, AgentStatus)> = lock
|
|
.iter()
|
|
.filter_map(|(key, agent)| {
|
|
if matches!(agent.status, AgentStatus::Running | AgentStatus::Pending)
|
|
&& let Some(handle) = &agent.task_handle
|
|
&& handle.is_finished()
|
|
{
|
|
let story_id = key
|
|
.rsplit_once(':')
|
|
.map(|(s, _)| s.to_string())
|
|
.unwrap_or_else(|| key.clone());
|
|
return Some((key.clone(), story_id, agent.tx.clone(), agent.status.clone()));
|
|
}
|
|
None
|
|
})
|
|
.collect();
|
|
|
|
let count = orphaned.len();
|
|
for (key, story_id, tx, prev_status) in orphaned {
|
|
if let Some(agent) = lock.get_mut(&key) {
|
|
agent.status = AgentStatus::Failed;
|
|
slog!(
|
|
"[watchdog] Orphaned agent '{key}': task finished but status was {prev_status}. \
|
|
Marking Failed."
|
|
);
|
|
let _ = tx.send(AgentEvent::Error {
|
|
story_id,
|
|
agent_name: agent.agent_name.clone(),
|
|
message: "Agent process terminated unexpectedly (watchdog detected orphan)"
|
|
.to_string(),
|
|
});
|
|
}
|
|
}
|
|
count
|
|
}
|
|
|
|
/// Server-owned completion: runs acceptance gates when an agent process exits
|
|
/// normally, and advances the pipeline based on results.
|
|
///
|
|
/// This is a **free function** (not a method on `AgentPool`) to break the
|
|
/// opaque type cycle that would otherwise arise: `start_agent` → spawned task
|
|
/// → server-owned completion → pipeline advance → `start_agent`.
|
|
///
|
|
/// If the agent already has a completion report (e.g. from a legacy
|
|
/// `report_completion` call), this is a no-op to avoid double-running gates.
|
|
async fn run_server_owned_completion(
|
|
agents: &Arc<Mutex<HashMap<String, StoryAgent>>>,
|
|
port: u16,
|
|
story_id: &str,
|
|
agent_name: &str,
|
|
session_id: Option<String>,
|
|
watcher_tx: broadcast::Sender<WatcherEvent>,
|
|
) {
|
|
let key = composite_key(story_id, agent_name);
|
|
|
|
// Guard: skip if completion was already recorded (legacy path).
|
|
{
|
|
let lock = match agents.lock() {
|
|
Ok(a) => a,
|
|
Err(_) => return,
|
|
};
|
|
match lock.get(&key) {
|
|
Some(agent) if agent.completion.is_some() => {
|
|
slog!(
|
|
"[agents] Completion already recorded for '{story_id}:{agent_name}'; \
|
|
skipping server-owned gates."
|
|
);
|
|
return;
|
|
}
|
|
Some(_) => {}
|
|
None => return,
|
|
}
|
|
}
|
|
|
|
// Get worktree path for running gates.
|
|
let worktree_path = {
|
|
let lock = match agents.lock() {
|
|
Ok(a) => a,
|
|
Err(_) => return,
|
|
};
|
|
lock.get(&key)
|
|
.and_then(|a| a.worktree_info.as_ref().map(|wt| wt.path.clone()))
|
|
};
|
|
|
|
// Run acceptance gates.
|
|
let (gates_passed, gate_output) = if let Some(wt_path) = worktree_path {
|
|
let path = wt_path;
|
|
match tokio::task::spawn_blocking(move || {
|
|
super::gates::check_uncommitted_changes(&path)?;
|
|
super::gates::run_acceptance_gates(&path)
|
|
})
|
|
.await
|
|
{
|
|
Ok(Ok(result)) => result,
|
|
Ok(Err(e)) => (false, e),
|
|
Err(e) => (false, format!("Gate check task panicked: {e}")),
|
|
}
|
|
} else {
|
|
(
|
|
false,
|
|
"No worktree path available to run acceptance gates".to_string(),
|
|
)
|
|
};
|
|
|
|
slog!(
|
|
"[agents] Server-owned completion for '{story_id}:{agent_name}': gates_passed={gates_passed}"
|
|
);
|
|
|
|
let report = CompletionReport {
|
|
summary: "Agent process exited normally".to_string(),
|
|
gates_passed,
|
|
gate_output,
|
|
};
|
|
|
|
// Store completion report, extract data for pipeline advance, then
|
|
// remove the entry so completed agents never appear in list_agents.
|
|
let (tx, project_root_for_advance, wt_path_for_advance, merge_failure_reported_for_advance) = {
|
|
let mut lock = match agents.lock() {
|
|
Ok(a) => a,
|
|
Err(_) => return,
|
|
};
|
|
let agent = match lock.get_mut(&key) {
|
|
Some(a) => a,
|
|
None => return,
|
|
};
|
|
agent.completion = Some(report.clone());
|
|
agent.session_id = session_id.clone();
|
|
let tx = agent.tx.clone();
|
|
let pr = agent.project_root.clone();
|
|
let wt = agent.worktree_info.as_ref().map(|w| w.path.clone());
|
|
let mfr = agent.merge_failure_reported;
|
|
lock.remove(&key);
|
|
(tx, pr, wt, mfr)
|
|
};
|
|
|
|
// Emit Done so wait_for_agent unblocks.
|
|
let _ = tx.send(AgentEvent::Done {
|
|
story_id: story_id.to_string(),
|
|
agent_name: agent_name.to_string(),
|
|
session_id,
|
|
});
|
|
|
|
// Notify WebSocket clients that the agent is gone.
|
|
AgentPool::notify_agent_state_changed(&watcher_tx);
|
|
|
|
// Advance the pipeline state machine in a background task.
|
|
spawn_pipeline_advance(
|
|
Arc::clone(agents),
|
|
port,
|
|
story_id,
|
|
agent_name,
|
|
report,
|
|
project_root_for_advance,
|
|
wt_path_for_advance,
|
|
watcher_tx,
|
|
merge_failure_reported_for_advance,
|
|
);
|
|
}
|
|
|
|
/// Spawn pipeline advancement as a background task.
|
|
///
|
|
/// This is a **non-async** function so it does not participate in the opaque
|
|
/// type cycle between `start_agent` and `run_server_owned_completion`.
|
|
#[allow(clippy::too_many_arguments)]
|
|
fn spawn_pipeline_advance(
|
|
agents: Arc<Mutex<HashMap<String, StoryAgent>>>,
|
|
port: u16,
|
|
story_id: &str,
|
|
agent_name: &str,
|
|
completion: CompletionReport,
|
|
project_root: Option<PathBuf>,
|
|
worktree_path: Option<PathBuf>,
|
|
watcher_tx: broadcast::Sender<WatcherEvent>,
|
|
merge_failure_reported: bool,
|
|
) {
|
|
let sid = story_id.to_string();
|
|
let aname = agent_name.to_string();
|
|
tokio::spawn(async move {
|
|
let pool = AgentPool {
|
|
agents,
|
|
port,
|
|
child_killers: Arc::new(Mutex::new(HashMap::new())),
|
|
watcher_tx,
|
|
merge_jobs: Arc::new(Mutex::new(HashMap::new())),
|
|
};
|
|
pool.run_pipeline_advance(
|
|
&sid,
|
|
&aname,
|
|
completion,
|
|
project_root,
|
|
worktree_path,
|
|
merge_failure_reported,
|
|
)
|
|
.await;
|
|
});
|
|
}
|
|
|
|
#[cfg(test)]
|
|
mod tests {
|
|
use super::*;
|
|
use crate::agents::merge::{MergeJob, MergeJobStatus};
|
|
use crate::agents::{
|
|
AgentEvent, AgentStatus, CompletionReport, PipelineStage, ReconciliationEvent,
|
|
lifecycle::move_story_to_archived,
|
|
};
|
|
use crate::config::ProjectConfig;
|
|
use crate::io::watcher::WatcherEvent;
|
|
use portable_pty::{CommandBuilder, PtySize, native_pty_system};
|
|
use std::collections::HashMap;
|
|
use std::path::PathBuf;
|
|
use std::process::Command;
|
|
use tokio::sync::broadcast;
|
|
|
|
/// Turn `repo` into a git repository with a test identity configured and a
/// single empty "init" commit.
///
/// Panics if `git` cannot be launched; the exit status of each command is not
/// inspected.
fn init_git_repo(repo: &std::path::Path) {
    let steps: [&[&str]; 4] = [
        &["init"],
        &["config", "user.email", "test@test.com"],
        &["config", "user.name", "Test"],
        &["commit", "--allow-empty", "-m", "init"],
    ];
    for args in steps {
        Command::new("git")
            .args(args)
            .current_dir(repo)
            .output()
            .unwrap();
    }
}
|
|
|
|
fn make_config(toml_str: &str) -> ProjectConfig {
|
|
ProjectConfig::parse(toml_str).unwrap()
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn wait_for_agent_returns_immediately_if_completed() {
|
|
let pool = AgentPool::new_test(3001);
|
|
pool.inject_test_agent("s1", "bot", AgentStatus::Completed);
|
|
|
|
let info = pool.wait_for_agent("s1", "bot", 1000).await.unwrap();
|
|
assert_eq!(info.status, AgentStatus::Completed);
|
|
assert_eq!(info.story_id, "s1");
|
|
assert_eq!(info.agent_name, "bot");
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn wait_for_agent_returns_immediately_if_failed() {
|
|
let pool = AgentPool::new_test(3001);
|
|
pool.inject_test_agent("s2", "bot", AgentStatus::Failed);
|
|
|
|
let info = pool.wait_for_agent("s2", "bot", 1000).await.unwrap();
|
|
assert_eq!(info.status, AgentStatus::Failed);
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn wait_for_agent_completes_on_done_event() {
|
|
let pool = AgentPool::new_test(3001);
|
|
let tx = pool.inject_test_agent("s3", "bot", AgentStatus::Running);
|
|
|
|
// Send Done event after a short delay
|
|
let tx_clone = tx.clone();
|
|
tokio::spawn(async move {
|
|
tokio::time::sleep(std::time::Duration::from_millis(50)).await;
|
|
// Mark status via event; real code also updates the map, but for
|
|
// this unit test the map entry stays Running — we verify the
|
|
// wait loop reacts to the event.
|
|
let _ = tx_clone.send(AgentEvent::Done {
|
|
story_id: "s3".to_string(),
|
|
agent_name: "bot".to_string(),
|
|
session_id: Some("sess-abc".to_string()),
|
|
});
|
|
});
|
|
|
|
let info = pool.wait_for_agent("s3", "bot", 2000).await.unwrap();
|
|
// Status comes from the map entry (Running in this unit test)
|
|
// — the important thing is that wait_for_agent returned without timing out.
|
|
assert_eq!(info.story_id, "s3");
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn wait_for_agent_times_out() {
|
|
let pool = AgentPool::new_test(3001);
|
|
pool.inject_test_agent("s4", "bot", AgentStatus::Running);
|
|
|
|
let result = pool.wait_for_agent("s4", "bot", 50).await;
|
|
assert!(result.is_err());
|
|
let msg = result.unwrap_err();
|
|
assert!(msg.contains("Timed out"), "unexpected message: {msg}");
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn wait_for_agent_errors_for_nonexistent() {
|
|
let pool = AgentPool::new_test(3001);
|
|
let result = pool.wait_for_agent("no_story", "no_bot", 100).await;
|
|
assert!(result.is_err());
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn wait_for_agent_completes_on_stopped_status_event() {
|
|
let pool = AgentPool::new_test(3001);
|
|
let tx = pool.inject_test_agent("s5", "bot", AgentStatus::Running);
|
|
|
|
let tx_clone = tx.clone();
|
|
tokio::spawn(async move {
|
|
tokio::time::sleep(std::time::Duration::from_millis(30)).await;
|
|
let _ = tx_clone.send(AgentEvent::Status {
|
|
story_id: "s5".to_string(),
|
|
agent_name: "bot".to_string(),
|
|
status: "stopped".to_string(),
|
|
});
|
|
});
|
|
|
|
let info = pool.wait_for_agent("s5", "bot", 2000).await.unwrap();
|
|
assert_eq!(info.story_id, "s5");
|
|
}
|
|
|
|
// ── report_completion tests ────────────────────────────────────
|
|
|
|
#[tokio::test]
|
|
async fn report_completion_rejects_nonexistent_agent() {
|
|
let pool = AgentPool::new_test(3001);
|
|
let result = pool
|
|
.report_completion("no_story", "no_bot", "done")
|
|
.await;
|
|
assert!(result.is_err());
|
|
let msg = result.unwrap_err();
|
|
assert!(msg.contains("No agent"), "unexpected: {msg}");
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn report_completion_rejects_non_running_agent() {
|
|
let pool = AgentPool::new_test(3001);
|
|
pool.inject_test_agent("s6", "bot", AgentStatus::Completed);
|
|
|
|
let result = pool.report_completion("s6", "bot", "done").await;
|
|
assert!(result.is_err());
|
|
let msg = result.unwrap_err();
|
|
assert!(
|
|
msg.contains("not running"),
|
|
"expected 'not running' in: {msg}"
|
|
);
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn report_completion_rejects_dirty_worktree() {
|
|
use std::fs;
|
|
use tempfile::tempdir;
|
|
|
|
let tmp = tempdir().unwrap();
|
|
let repo = tmp.path();
|
|
|
|
// Init a real git repo and make an initial commit
|
|
Command::new("git")
|
|
.args(["init"])
|
|
.current_dir(repo)
|
|
.output()
|
|
.unwrap();
|
|
Command::new("git")
|
|
.args(["commit", "--allow-empty", "-m", "init"])
|
|
.current_dir(repo)
|
|
.output()
|
|
.unwrap();
|
|
|
|
// Write an uncommitted file
|
|
fs::write(repo.join("dirty.txt"), "not committed").unwrap();
|
|
|
|
let pool = AgentPool::new_test(3001);
|
|
pool.inject_test_agent_with_path("s7", "bot", AgentStatus::Running, repo.to_path_buf());
|
|
|
|
let result = pool.report_completion("s7", "bot", "done").await;
|
|
assert!(result.is_err());
|
|
let msg = result.unwrap_err();
|
|
assert!(
|
|
msg.contains("uncommitted"),
|
|
"expected 'uncommitted' in: {msg}"
|
|
);
|
|
}
|
|
|
|
// ── server-owned completion tests ───────────────────────────────────────────
|
|
|
|
#[tokio::test]
|
|
async fn server_owned_completion_skips_when_already_completed() {
|
|
let pool = AgentPool::new_test(3001);
|
|
let report = CompletionReport {
|
|
summary: "Already done".to_string(),
|
|
gates_passed: true,
|
|
gate_output: String::new(),
|
|
};
|
|
pool.inject_test_agent_with_completion(
|
|
"s10",
|
|
"coder-1",
|
|
AgentStatus::Completed,
|
|
PathBuf::from("/tmp/nonexistent"),
|
|
report,
|
|
);
|
|
|
|
// Subscribe before calling so we can check if Done event was emitted.
|
|
let mut rx = pool.subscribe("s10", "coder-1").unwrap();
|
|
|
|
run_server_owned_completion(&pool.agents, pool.port, "s10", "coder-1", Some("sess-1".to_string()), pool.watcher_tx.clone())
|
|
.await;
|
|
|
|
// Status should remain Completed (unchanged) — no gate re-run.
|
|
let agents = pool.agents.lock().unwrap();
|
|
let key = composite_key("s10", "coder-1");
|
|
let agent = agents.get(&key).unwrap();
|
|
assert_eq!(agent.status, AgentStatus::Completed);
|
|
// Summary should still be the original, not overwritten.
|
|
assert_eq!(
|
|
agent.completion.as_ref().unwrap().summary,
|
|
"Already done"
|
|
);
|
|
drop(agents);
|
|
|
|
// No Done event should have been emitted.
|
|
assert!(
|
|
rx.try_recv().is_err(),
|
|
"should not emit Done when completion already exists"
|
|
);
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn server_owned_completion_runs_gates_on_clean_worktree() {
|
|
use tempfile::tempdir;
|
|
|
|
let tmp = tempdir().unwrap();
|
|
let repo = tmp.path();
|
|
init_git_repo(repo);
|
|
|
|
let pool = AgentPool::new_test(3001);
|
|
pool.inject_test_agent_with_path(
|
|
"s11",
|
|
"coder-1",
|
|
AgentStatus::Running,
|
|
repo.to_path_buf(),
|
|
);
|
|
|
|
let mut rx = pool.subscribe("s11", "coder-1").unwrap();
|
|
|
|
run_server_owned_completion(&pool.agents, pool.port, "s11", "coder-1", Some("sess-2".to_string()), pool.watcher_tx.clone())
|
|
.await;
|
|
|
|
// Agent entry should be removed from the map after completion.
|
|
let agents = pool.agents.lock().unwrap();
|
|
let key = composite_key("s11", "coder-1");
|
|
assert!(
|
|
agents.get(&key).is_none(),
|
|
"agent should be removed from map after completion"
|
|
);
|
|
drop(agents);
|
|
|
|
// A Done event should have been emitted with the session_id.
|
|
let event = rx.try_recv().expect("should emit Done event");
|
|
match &event {
|
|
AgentEvent::Done { session_id, .. } => {
|
|
assert_eq!(*session_id, Some("sess-2".to_string()));
|
|
}
|
|
other => panic!("expected Done event, got: {other:?}"),
|
|
}
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn server_owned_completion_fails_on_dirty_worktree() {
|
|
use std::fs;
|
|
use tempfile::tempdir;
|
|
|
|
let tmp = tempdir().unwrap();
|
|
let repo = tmp.path();
|
|
init_git_repo(repo);
|
|
// Create an uncommitted file.
|
|
fs::write(repo.join("dirty.txt"), "not committed").unwrap();
|
|
|
|
let pool = AgentPool::new_test(3001);
|
|
pool.inject_test_agent_with_path(
|
|
"s12",
|
|
"coder-1",
|
|
AgentStatus::Running,
|
|
repo.to_path_buf(),
|
|
);
|
|
|
|
let mut rx = pool.subscribe("s12", "coder-1").unwrap();
|
|
|
|
run_server_owned_completion(&pool.agents, pool.port, "s12", "coder-1", None, pool.watcher_tx.clone())
|
|
.await;
|
|
|
|
// Agent entry should be removed from the map after completion (even on failure).
|
|
let agents = pool.agents.lock().unwrap();
|
|
let key = composite_key("s12", "coder-1");
|
|
assert!(
|
|
agents.get(&key).is_none(),
|
|
"agent should be removed from map after failed completion"
|
|
);
|
|
drop(agents);
|
|
|
|
// A Done event should have been emitted.
|
|
let event = rx.try_recv().expect("should emit Done event");
|
|
assert!(
|
|
matches!(event, AgentEvent::Done { .. }),
|
|
"expected Done event, got: {event:?}"
|
|
);
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn server_owned_completion_nonexistent_agent_is_noop() {
|
|
let pool = AgentPool::new_test(3001);
|
|
// Should not panic or error — just silently return.
|
|
run_server_owned_completion(&pool.agents, pool.port, "nonexistent", "bot", None, pool.watcher_tx.clone())
|
|
.await;
|
|
}
|
|
|
|
// ── pipeline advance tests ────────────────────────────────────────────────
|
|
|
|
#[tokio::test]
|
|
async fn pipeline_advance_coder_gates_pass_moves_story_to_qa() {
|
|
use std::fs;
|
|
let tmp = tempfile::tempdir().unwrap();
|
|
let root = tmp.path();
|
|
|
|
// Set up story in 2_current/
|
|
let current = root.join(".story_kit/work/2_current");
|
|
fs::create_dir_all(¤t).unwrap();
|
|
fs::write(current.join("50_story_test.md"), "test").unwrap();
|
|
|
|
let pool = AgentPool::new_test(3001);
|
|
// Call pipeline advance directly with completion data.
|
|
pool.run_pipeline_advance(
|
|
"50_story_test",
|
|
"coder-1",
|
|
CompletionReport {
|
|
summary: "done".to_string(),
|
|
gates_passed: true,
|
|
gate_output: String::new(),
|
|
},
|
|
Some(root.to_path_buf()),
|
|
None,
|
|
false,
|
|
)
|
|
.await;
|
|
|
|
// Story should have moved to 3_qa/ (start_agent for qa will fail in tests but
|
|
// the file move happens before that).
|
|
assert!(
|
|
root.join(".story_kit/work/3_qa/50_story_test.md").exists(),
|
|
"story should be in 3_qa/"
|
|
);
|
|
assert!(
|
|
!current.join("50_story_test.md").exists(),
|
|
"story should not still be in 2_current/"
|
|
);
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn pipeline_advance_qa_gates_pass_moves_story_to_merge() {
|
|
use std::fs;
|
|
let tmp = tempfile::tempdir().unwrap();
|
|
let root = tmp.path();
|
|
|
|
// Set up story in 3_qa/
|
|
let qa_dir = root.join(".story_kit/work/3_qa");
|
|
fs::create_dir_all(&qa_dir).unwrap();
|
|
fs::write(qa_dir.join("51_story_test.md"), "test").unwrap();
|
|
|
|
let pool = AgentPool::new_test(3001);
|
|
pool.run_pipeline_advance(
|
|
"51_story_test",
|
|
"qa",
|
|
CompletionReport {
|
|
summary: "QA done".to_string(),
|
|
gates_passed: true,
|
|
gate_output: String::new(),
|
|
},
|
|
Some(root.to_path_buf()),
|
|
None,
|
|
false,
|
|
)
|
|
.await;
|
|
|
|
// Story should have moved to 4_merge/
|
|
assert!(
|
|
root.join(".story_kit/work/4_merge/51_story_test.md").exists(),
|
|
"story should be in 4_merge/"
|
|
);
|
|
assert!(
|
|
!qa_dir.join("51_story_test.md").exists(),
|
|
"story should not still be in 3_qa/"
|
|
);
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn pipeline_advance_supervisor_does_not_advance() {
|
|
use std::fs;
|
|
let tmp = tempfile::tempdir().unwrap();
|
|
let root = tmp.path();
|
|
|
|
let current = root.join(".story_kit/work/2_current");
|
|
fs::create_dir_all(¤t).unwrap();
|
|
fs::write(current.join("52_story_test.md"), "test").unwrap();
|
|
|
|
let pool = AgentPool::new_test(3001);
|
|
pool.run_pipeline_advance(
|
|
"52_story_test",
|
|
"supervisor",
|
|
CompletionReport {
|
|
summary: "supervised".to_string(),
|
|
gates_passed: true,
|
|
gate_output: String::new(),
|
|
},
|
|
Some(root.to_path_buf()),
|
|
None,
|
|
false,
|
|
)
|
|
.await;
|
|
|
|
// Story should NOT have moved (supervisors don't advance pipeline)
|
|
assert!(
|
|
current.join("52_story_test.md").exists(),
|
|
"story should still be in 2_current/ for supervisor"
|
|
);
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn pipeline_advance_sends_agent_state_changed_to_watcher_tx() {
|
|
use std::fs;
|
|
|
|
let tmp = tempfile::tempdir().unwrap();
|
|
let root = tmp.path();
|
|
|
|
// Set up story in 2_current/
|
|
let current = root.join(".story_kit/work/2_current");
|
|
fs::create_dir_all(¤t).unwrap();
|
|
fs::write(current.join("173_story_test.md"), "test").unwrap();
|
|
// Ensure 3_qa/ exists for the move target
|
|
fs::create_dir_all(root.join(".story_kit/work/3_qa")).unwrap();
|
|
// Ensure 1_upcoming/ exists (start_agent calls move_story_to_current)
|
|
fs::create_dir_all(root.join(".story_kit/work/1_upcoming")).unwrap();
|
|
|
|
// Write a project.toml with a qa agent so start_agent can resolve it.
|
|
fs::create_dir_all(root.join(".story_kit")).unwrap();
|
|
fs::write(
|
|
root.join(".story_kit/project.toml"),
|
|
r#"
|
|
[[agent]]
|
|
name = "coder-1"
|
|
role = "Coder"
|
|
command = "echo"
|
|
args = ["noop"]
|
|
prompt = "test"
|
|
stage = "coder"
|
|
|
|
[[agent]]
|
|
name = "qa"
|
|
role = "QA"
|
|
command = "echo"
|
|
args = ["noop"]
|
|
prompt = "test"
|
|
stage = "qa"
|
|
"#,
|
|
)
|
|
.unwrap();
|
|
|
|
let pool = AgentPool::new_test(3001);
|
|
// Subscribe to the watcher channel BEFORE the pipeline advance.
|
|
let mut rx = pool.watcher_tx.subscribe();
|
|
|
|
// Call pipeline advance directly. This will:
|
|
// 1. Move the story to 3_qa/
|
|
// 2. Start the QA agent (which calls notify_agent_state_changed)
|
|
// Note: the actual agent process will fail (no real worktree), but the
|
|
// agent insertion and notification happen before the background spawn.
|
|
pool.run_pipeline_advance(
|
|
"173_story_test",
|
|
"coder-1",
|
|
CompletionReport {
|
|
summary: "done".to_string(),
|
|
gates_passed: true,
|
|
gate_output: String::new(),
|
|
},
|
|
Some(root.to_path_buf()),
|
|
None,
|
|
false,
|
|
)
|
|
.await;
|
|
|
|
// The pipeline advance should have sent AgentStateChanged events via
|
|
// the pool's watcher_tx (not a dummy channel). Collect all events.
|
|
let mut got_agent_state_changed = false;
|
|
while let Ok(evt) = rx.try_recv() {
|
|
if matches!(evt, WatcherEvent::AgentStateChanged) {
|
|
got_agent_state_changed = true;
|
|
break;
|
|
}
|
|
}
|
|
|
|
assert!(
|
|
got_agent_state_changed,
|
|
"pipeline advance should send AgentStateChanged through the real watcher_tx \
|
|
(bug 173: lozenges must update when agents are assigned during pipeline advance)"
|
|
);
|
|
}
|
|
|
|
// ── auto-assign helper tests ───────────────────────────────────
|
|
|
|
#[test]
fn scan_stage_items_returns_empty_for_missing_dir() {
    // A fresh temp dir has no .story_kit/work/2_current directory at all.
    let tmp = tempfile::tempdir().unwrap();
    assert!(scan_stage_items(tmp.path(), "2_current").is_empty());
}
|
|
|
|
#[test]
fn scan_stage_items_returns_sorted_story_ids() {
    use std::fs;

    let tmp = tempfile::tempdir().unwrap();
    let stage_dir = tmp.path().join(".story_kit").join("work").join("2_current");
    fs::create_dir_all(&stage_dir).unwrap();

    let stories = [
        ("42_story_foo.md", "---\nname: foo\n---"),
        ("10_story_bar.md", "---\nname: bar\n---"),
        ("5_story_baz.md", "---\nname: baz\n---"),
    ];
    for (file_name, contents) in stories {
        fs::write(stage_dir.join(file_name), contents).unwrap();
    }
    // A non-markdown file must not show up in the result.
    fs::write(stage_dir.join("README.txt"), "ignore me").unwrap();

    let items = scan_stage_items(tmp.path(), "2_current");
    assert_eq!(items, vec!["10_story_bar", "42_story_foo", "5_story_baz"]);
}
|
|
|
|
#[test]
fn is_story_assigned_returns_true_for_running_coder() {
    let config = ProjectConfig::default();
    let pool = AgentPool::new_test(3001);
    pool.inject_test_agent("42_story_foo", "coder-1", AgentStatus::Running);

    let agents = pool.agents.lock().unwrap();
    let assigned = |story: &str, stage: PipelineStage| {
        is_story_assigned_for_stage(&config, &agents, story, &stage)
    };

    // Matching story and stage.
    assert!(assigned("42_story_foo", PipelineStage::Coder));
    // Same story but wrong stage — should be false
    assert!(!assigned("42_story_foo", PipelineStage::Qa));
    // Different story — should be false
    assert!(!assigned("99_story_other", PipelineStage::Coder));
}
|
|
|
|
#[test]
fn is_story_assigned_returns_false_for_completed_agent() {
    let config = ProjectConfig::default();
    let pool = AgentPool::new_test(3001);
    pool.inject_test_agent("42_story_foo", "coder-1", AgentStatus::Completed);

    let agents = pool.agents.lock().unwrap();
    // A Completed agent no longer counts as an assignment.
    let assigned =
        is_story_assigned_for_stage(&config, &agents, "42_story_foo", &PipelineStage::Coder);
    assert!(!assigned);
}
|
|
|
|
#[test]
fn is_story_assigned_uses_config_stage_field_for_nonstandard_names() {
    let config = ProjectConfig::parse(
        r#"
[[agent]]
name = "qa-2"
stage = "qa"
"#,
    )
    .unwrap();

    let pool = AgentPool::new_test(3001);
    pool.inject_test_agent("42_story_foo", "qa-2", AgentStatus::Running);

    let agents = pool.agents.lock().unwrap();

    // The explicit stage = "qa" makes qa-2 a QA agent...
    let seen_as_qa =
        is_story_assigned_for_stage(&config, &agents, "42_story_foo", &PipelineStage::Qa);
    assert!(
        seen_as_qa,
        "qa-2 should be detected as assigned to QA stage"
    );

    // ...and it must not be counted as a coder.
    let seen_as_coder =
        is_story_assigned_for_stage(&config, &agents, "42_story_foo", &PipelineStage::Coder);
    assert!(
        !seen_as_coder,
        "qa-2 should not be detected as a coder"
    );
}
|
|
|
|
#[test]
fn find_free_agent_returns_none_when_all_busy() {
    let config = ProjectConfig::parse(
        r#"
[[agent]]
name = "coder-1"
[[agent]]
name = "coder-2"
"#,
    )
    .unwrap();

    // Both configured coders are Running on some story.
    let pool = AgentPool::new_test(3001);
    pool.inject_test_agent("s1", "coder-1", AgentStatus::Running);
    pool.inject_test_agent("s2", "coder-2", AgentStatus::Running);

    let agents = pool.agents.lock().unwrap();
    let free = find_free_agent_for_stage(&config, &agents, &PipelineStage::Coder);
    assert!(free.is_none(), "no free coders should be available");
}
|
|
|
|
#[test]
fn find_free_agent_returns_first_free_coder() {
    let config = ProjectConfig::parse(
        r#"
[[agent]]
name = "coder-1"
[[agent]]
name = "coder-2"
[[agent]]
name = "coder-3"
"#,
    )
    .unwrap();

    let pool = AgentPool::new_test(3001);
    // Only coder-1 is busy; coder-2 is the first idle one in config order.
    pool.inject_test_agent("s1", "coder-1", AgentStatus::Running);

    let agents = pool.agents.lock().unwrap();
    let free = find_free_agent_for_stage(&config, &agents, &PipelineStage::Coder);
    assert_eq!(free, Some("coder-2"), "coder-2 should be the first free coder");
}
|
|
|
|
/// A coder whose only pool entry is `Completed` must be reported as free.
#[test]
fn find_free_agent_ignores_completed_agents() {
    let toml = r#"
[[agent]]
name = "coder-1"
"#;
    let config = ProjectConfig::parse(toml).unwrap();

    let pool = AgentPool::new_test(3001);
    // coder-1 already finished story s1, so it can take new work.
    pool.inject_test_agent("s1", "coder-1", AgentStatus::Completed);

    let agents = pool.agents.lock().unwrap();
    let candidate = find_free_agent_for_stage(&config, &agents, &PipelineStage::Coder);
    assert_eq!(candidate, Some("coder-1"), "completed coder-1 should be free");
}
|
|
|
|
/// With only a "qa" agent configured, a Coder lookup yields nothing while a
/// Qa lookup yields that agent.
#[test]
fn find_free_agent_returns_none_for_wrong_stage() {
    let toml = r#"
[[agent]]
name = "qa"
"#;
    let config = ProjectConfig::parse(toml).unwrap();

    // Empty pool: nobody is busy.
    let agents = HashMap::<String, StoryAgent>::new();

    // No coder is configured at all.
    let coder = find_free_agent_for_stage(&config, &agents, &PipelineStage::Coder);
    assert!(coder.is_none());

    // The single configured agent is available for QA.
    let qa = find_free_agent_for_stage(&config, &agents, &PipelineStage::Qa);
    assert_eq!(qa, Some("qa"));
}
|
|
|
|
/// Agents named "qa-2" and "coder-opus" must be selected through their
/// explicit `stage` config field, and neither may match Mergemaster.
#[test]
fn find_free_agent_uses_config_stage_field_not_name() {
    let toml = r#"
[[agent]]
name = "qa-2"
stage = "qa"

[[agent]]
name = "coder-opus"
stage = "coder"
"#;
    let config = ProjectConfig::parse(toml).unwrap();

    // Empty pool: every configured agent is idle.
    let agents = HashMap::<String, StoryAgent>::new();
    let lookup = |stage: &PipelineStage| find_free_agent_for_stage(&config, &agents, stage);

    // stage = "qa" routes qa-2 to the QA stage.
    assert_eq!(
        lookup(&PipelineStage::Qa),
        Some("qa-2"),
        "qa-2 with stage=qa should be found"
    );

    // stage = "coder" routes coder-opus to the Coder stage.
    assert_eq!(
        lookup(&PipelineStage::Coder),
        Some("coder-opus"),
        "coder-opus with stage=coder should be found"
    );

    // Neither agent belongs to the Mergemaster stage.
    assert!(lookup(&PipelineStage::Mergemaster).is_none());
}
|
|
|
|
// ── find_active_story_stage tests ─────────────────────────────────────────
|
|
|
|
/// A story file placed in `.story_kit/work/2_current/` must be reported with
/// stage `"2_current"`.
#[test]
fn find_active_story_stage_detects_current() {
    use std::fs;

    let tmp = tempfile::tempdir().unwrap();
    let root = tmp.path();

    // Create the stage directory and drop the story file into it.
    let current = root.join(".story_kit/work/2_current");
    fs::create_dir_all(&current).unwrap();
    fs::write(current.join("10_story_test.md"), "test").unwrap();

    assert_eq!(
        find_active_story_stage(root, "10_story_test"),
        Some("2_current")
    );
}
|
|
|
|
/// A story file placed in `.story_kit/work/3_qa/` must be reported with
/// stage `"3_qa"`.
#[test]
fn find_active_story_stage_detects_qa() {
    use std::fs;

    let tmp = tempfile::tempdir().unwrap();
    let project_root = tmp.path();

    let qa_dir = project_root.join(".story_kit/work/3_qa");
    fs::create_dir_all(&qa_dir).unwrap();
    fs::write(qa_dir.join("11_story_test.md"), "test").unwrap();

    let stage = find_active_story_stage(project_root, "11_story_test");
    assert_eq!(stage, Some("3_qa"));
}
|
|
|
|
/// A story file placed in `.story_kit/work/4_merge/` must be reported with
/// stage `"4_merge"`.
#[test]
fn find_active_story_stage_detects_merge() {
    use std::fs;

    let tmp = tempfile::tempdir().unwrap();
    let project_root = tmp.path();

    let merge_dir = project_root.join(".story_kit/work/4_merge");
    fs::create_dir_all(&merge_dir).unwrap();
    fs::write(merge_dir.join("12_story_test.md"), "test").unwrap();

    let stage = find_active_story_stage(project_root, "12_story_test");
    assert_eq!(stage, Some("4_merge"));
}
|
|
|
|
/// In an empty temp directory no stage can be found for any story id.
#[test]
fn find_active_story_stage_returns_none_for_unknown_story() {
    let empty_project = tempfile::tempdir().unwrap();
    let stage = find_active_story_stage(empty_project.path(), "99_nonexistent");
    assert_eq!(stage, None);
}
|
|
|
|
// ── check_orphaned_agents return value tests (bug 161) ──────────────────
|
|
|
|
#[tokio::test]
|
|
async fn check_orphaned_agents_returns_count_of_orphaned_agents() {
|
|
let pool = AgentPool::new_test(3001);
|
|
|
|
// Spawn two tasks that finish immediately.
|
|
let h1 = tokio::spawn(async {});
|
|
let h2 = tokio::spawn(async {});
|
|
tokio::time::sleep(std::time::Duration::from_millis(20)).await;
|
|
assert!(h1.is_finished());
|
|
assert!(h2.is_finished());
|
|
|
|
pool.inject_test_agent_with_handle("story_a", "coder", AgentStatus::Running, h1);
|
|
pool.inject_test_agent_with_handle("story_b", "coder", AgentStatus::Running, h2);
|
|
|
|
let found = check_orphaned_agents(&pool.agents);
|
|
assert_eq!(found, 2, "should detect both orphaned agents");
|
|
}
|
|
|
|
/// Entries in `Completed` or `Failed` state are not reported as orphans.
#[test]
fn check_orphaned_agents_returns_zero_when_no_orphans() {
    let pool = AgentPool::new_test(3001);

    // Both entries are already in a terminal state.
    pool.inject_test_agent("story_a", "coder", AgentStatus::Completed);
    pool.inject_test_agent("story_b", "qa", AgentStatus::Failed);

    let orphan_count = check_orphaned_agents(&pool.agents);
    assert_eq!(
        orphan_count, 0,
        "no orphans should be detected for terminal agents"
    );
}
|
|
|
|
#[tokio::test]
|
|
async fn watchdog_detects_orphaned_running_agent() {
|
|
let pool = AgentPool::new_test(3001);
|
|
|
|
let handle = tokio::spawn(async {});
|
|
tokio::time::sleep(std::time::Duration::from_millis(20)).await;
|
|
assert!(handle.is_finished(), "task should be finished before injection");
|
|
|
|
let tx =
|
|
pool.inject_test_agent_with_handle("orphan_story", "coder", AgentStatus::Running, handle);
|
|
let mut rx = tx.subscribe();
|
|
|
|
pool.run_watchdog_once();
|
|
|
|
{
|
|
let agents = pool.agents.lock().unwrap();
|
|
let key = composite_key("orphan_story", "coder");
|
|
let agent = agents.get(&key).unwrap();
|
|
assert_eq!(
|
|
agent.status,
|
|
AgentStatus::Failed,
|
|
"watchdog must mark an orphaned Running agent as Failed"
|
|
);
|
|
}
|
|
|
|
let event = rx.try_recv().expect("watchdog must emit an Error event");
|
|
assert!(
|
|
matches!(event, AgentEvent::Error { .. }),
|
|
"expected AgentEvent::Error, got: {event:?}"
|
|
);
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn watchdog_orphan_detection_returns_nonzero_enabling_auto_assign() {
|
|
// This test verifies the contract that `check_orphaned_agents` returns
|
|
// a non-zero count when orphans exist, which the watchdog uses to
|
|
// decide whether to trigger auto-assign (bug 161).
|
|
let pool = AgentPool::new_test(3001);
|
|
|
|
let handle = tokio::spawn(async {});
|
|
tokio::time::sleep(std::time::Duration::from_millis(20)).await;
|
|
|
|
pool.inject_test_agent_with_handle(
|
|
"orphan_story",
|
|
"coder",
|
|
AgentStatus::Running,
|
|
handle,
|
|
);
|
|
|
|
// Before watchdog: agent is Running.
|
|
{
|
|
let agents = pool.agents.lock().unwrap();
|
|
let key = composite_key("orphan_story", "coder");
|
|
assert_eq!(agents.get(&key).unwrap().status, AgentStatus::Running);
|
|
}
|
|
|
|
// Run watchdog pass — should return 1 (orphan found).
|
|
let found = check_orphaned_agents(&pool.agents);
|
|
assert_eq!(
|
|
found, 1,
|
|
"watchdog must return 1 for a single orphaned agent"
|
|
);
|
|
|
|
// After watchdog: agent is Failed.
|
|
{
|
|
let agents = pool.agents.lock().unwrap();
|
|
let key = composite_key("orphan_story", "coder");
|
|
assert_eq!(
|
|
agents.get(&key).unwrap().status,
|
|
AgentStatus::Failed,
|
|
"orphaned agent must be marked Failed"
|
|
);
|
|
}
|
|
}
|
|
|
|
// ── remove_agents_for_story tests ────────────────────────────────────────
|
|
|
|
/// Removing agents for `story_a` drops both of its entries and leaves the
/// `story_b` entry in place.
#[test]
fn remove_agents_for_story_removes_all_entries() {
    let pool = AgentPool::new_test(3001);
    let seeded = [
        ("story_a", "coder-1", AgentStatus::Completed),
        ("story_a", "qa", AgentStatus::Failed),
        ("story_b", "coder-1", AgentStatus::Running),
    ];
    for (story, agent, status) in seeded {
        pool.inject_test_agent(story, agent, status);
    }

    let removed = pool.remove_agents_for_story("story_a");
    assert_eq!(removed, 2, "should remove both agents for story_a");

    let survivors = pool.list_agents().unwrap();
    assert_eq!(survivors.len(), 1, "only story_b agent should remain");
    assert_eq!(survivors[0].story_id, "story_b");
}
|
|
|
|
/// Removing agents for an unknown story id reports zero removals and leaves
/// the existing entry untouched.
#[test]
fn remove_agents_for_story_returns_zero_when_no_match() {
    let pool = AgentPool::new_test(3001);
    pool.inject_test_agent("story_a", "coder-1", AgentStatus::Running);

    let removed = pool.remove_agents_for_story("nonexistent");
    assert_eq!(removed, 0);

    let survivors = pool.list_agents().unwrap();
    assert_eq!(
        survivors.len(),
        1,
        "existing agents should not be affected"
    );
}
|
|
|
|
// ── archive + cleanup integration test ───────────────────────────────────
|
|
|
|
#[tokio::test]
|
|
async fn archiving_story_removes_agent_entries_from_pool() {
|
|
use std::fs;
|
|
|
|
let tmp = tempfile::tempdir().unwrap();
|
|
let root = tmp.path();
|
|
|
|
// Set up story in 2_current/
|
|
let current = root.join(".story_kit/work/2_current");
|
|
fs::create_dir_all(¤t).unwrap();
|
|
fs::write(current.join("60_story_cleanup.md"), "test").unwrap();
|
|
|
|
let pool = AgentPool::new_test(3001);
|
|
pool.inject_test_agent("60_story_cleanup", "coder-1", AgentStatus::Completed);
|
|
pool.inject_test_agent("60_story_cleanup", "qa", AgentStatus::Completed);
|
|
pool.inject_test_agent("61_story_other", "coder-1", AgentStatus::Running);
|
|
|
|
// Verify all 3 agents exist.
|
|
assert_eq!(pool.list_agents().unwrap().len(), 3);
|
|
|
|
// Archive the story.
|
|
move_story_to_archived(root, "60_story_cleanup").unwrap();
|
|
pool.remove_agents_for_story("60_story_cleanup");
|
|
|
|
// Agent entries for the archived story should be gone.
|
|
let remaining = pool.list_agents().unwrap();
|
|
assert_eq!(remaining.len(), 1, "only the other story's agent should remain");
|
|
assert_eq!(remaining[0].story_id, "61_story_other");
|
|
|
|
// Story file should be in 5_done/
|
|
assert!(root.join(".story_kit/work/5_done/60_story_cleanup.md").exists());
|
|
}
|
|
|
|
// ── kill_all_children tests ────────────────────────────────────
|
|
|
|
/// Returns `true` if `ps -p <pid>` runs and exits successfully.
///
/// Output of `ps` is discarded. If the command cannot be spawned at all the
/// function returns `false`.
fn process_is_running(pid: u32) -> bool {
    std::process::Command::new("ps")
        .arg("-p")
        .arg(pid.to_string())
        .stdout(std::process::Stdio::null())
        .stderr(std::process::Stdio::null())
        .status()
        // A spawn error and a non-zero exit status both count as "not running".
        .is_ok_and(|status| status.success())
}
|
|
|
|
/// Calling `kill_all_children` on a fresh pool must return normally and
/// leave the killer registry empty.
#[test]
fn kill_all_children_is_safe_on_empty_pool() {
    let empty_pool = AgentPool::new_test(3001);

    // Nothing is registered; the call must neither panic nor deadlock.
    empty_pool.kill_all_children();

    let registered = empty_pool.child_killer_count();
    assert_eq!(registered, 0);
}
|
|
|
|
/// Spawns `sleep 100` on a PTY, registers its killer with the pool, and
/// checks via `ps` that `kill_all_children` terminates the process.
#[test]
fn kill_all_children_kills_real_process() {
    // GIVEN: a real PTY child process whose killer is registered in the pool.
    let pool = AgentPool::new_test(3001);

    let pty_system = native_pty_system();
    let size = PtySize {
        rows: 24,
        cols: 80,
        pixel_width: 0,
        pixel_height: 0,
    };
    let pair = pty_system.openpty(size).expect("failed to open pty");

    let mut sleep_cmd = CommandBuilder::new("sleep");
    sleep_cmd.arg("100");
    let mut child = pair
        .slave
        .spawn_command(sleep_cmd)
        .expect("failed to spawn sleep");
    let pid = child.process_id().expect("no pid");

    pool.inject_child_killer("story:agent", child.clone_killer());

    // Sanity check: the child is alive before the kill.
    let alive_before = process_is_running(pid);
    assert!(
        alive_before,
        "process {pid} should be running before kill_all_children"
    );

    // WHEN: every registered child is killed.
    pool.kill_all_children();

    // Reap the child so it does not linger as a zombie.
    let _ = child.wait();

    // THEN: the process is no longer listed.
    let alive_after = process_is_running(pid);
    assert!(
        !alive_after,
        "process {pid} should have been killed by kill_all_children"
    );
}
|
|
|
|
/// After `kill_all_children`, the pool's child-killer registry must be empty.
#[test]
fn kill_all_children_clears_registry() {
    // GIVEN: a pool holding exactly one registered killer.
    let pool = AgentPool::new_test(3001);

    let pty_system = native_pty_system();
    let size = PtySize {
        rows: 24,
        cols: 80,
        pixel_width: 0,
        pixel_height: 0,
    };
    let pair = pty_system.openpty(size).expect("failed to open pty");

    let mut sleep_cmd = CommandBuilder::new("sleep");
    sleep_cmd.arg("1");
    let mut child = pair
        .slave
        .spawn_command(sleep_cmd)
        .expect("failed to spawn sleep");

    pool.inject_child_killer("story:agent", child.clone_killer());
    assert_eq!(pool.child_killer_count(), 1);

    // WHEN: all children are killed and the child is reaped.
    pool.kill_all_children();
    let _ = child.wait();

    // THEN: no killers remain registered.
    let remaining = pool.child_killer_count();
    assert_eq!(
        remaining, 0,
        "child_killers should be cleared after kill_all_children"
    );
}
|
|
|
|
// ── available_agents_for_stage tests (story 190) ──────────────────────────
|
|
|
|
/// With coder-1 running, the Coder stage lists only coder-2 and the Qa stage
/// lists the single qa agent.
#[test]
fn available_agents_for_stage_returns_idle_agents() {
    let toml = r#"
[[agent]]
name = "coder-1"
stage = "coder"

[[agent]]
name = "coder-2"
stage = "coder"

[[agent]]
name = "qa"
stage = "qa"
"#;
    let config = make_config(toml);

    let pool = AgentPool::new_test(3001);
    // coder-1 is occupied with story-1.
    pool.inject_test_agent("story-1", "coder-1", AgentStatus::Running);

    let idle_coders = pool
        .available_agents_for_stage(&config, &PipelineStage::Coder)
        .unwrap();
    assert_eq!(idle_coders, vec!["coder-2"]);

    let idle_qa = pool
        .available_agents_for_stage(&config, &PipelineStage::Qa)
        .unwrap();
    assert_eq!(idle_qa, vec!["qa"]);
}
|
|
|
|
/// With the only configured coder running, the Coder stage has no available
/// agents.
#[test]
fn available_agents_for_stage_returns_empty_when_all_busy() {
    let toml = r#"
[[agent]]
name = "coder-1"
stage = "coder"
"#;
    let config = make_config(toml);

    let pool = AgentPool::new_test(3001);
    pool.inject_test_agent("story-1", "coder-1", AgentStatus::Running);

    let idle_coders = pool
        .available_agents_for_stage(&config, &PipelineStage::Coder)
        .unwrap();
    assert!(idle_coders.is_empty());
}
|
|
|
|
/// A coder whose only entry is `Completed` must still be listed as available.
#[test]
fn available_agents_for_stage_ignores_completed_agents() {
    let toml = r#"
[[agent]]
name = "coder-1"
stage = "coder"
"#;
    let config = make_config(toml);

    let pool = AgentPool::new_test(3001);
    // A finished run must not make the agent look busy.
    pool.inject_test_agent("story-1", "coder-1", AgentStatus::Completed);

    let idle_coders = pool
        .available_agents_for_stage(&config, &PipelineStage::Coder)
        .unwrap();
    assert_eq!(idle_coders, vec!["coder-1"]);
}
|
|
|
|
#[tokio::test]
|
|
async fn start_agent_auto_selects_second_coder_when_first_busy() {
|
|
let tmp = tempfile::tempdir().unwrap();
|
|
let sk = tmp.path().join(".story_kit");
|
|
std::fs::create_dir_all(&sk).unwrap();
|
|
std::fs::write(
|
|
sk.join("project.toml"),
|
|
r#"
|
|
[[agent]]
|
|
name = "supervisor"
|
|
stage = "other"
|
|
|
|
[[agent]]
|
|
name = "coder-1"
|
|
stage = "coder"
|
|
|
|
[[agent]]
|
|
name = "coder-2"
|
|
stage = "coder"
|
|
"#,
|
|
)
|
|
.unwrap();
|
|
|
|
let pool = AgentPool::new_test(3001);
|
|
// coder-1 is busy on another story
|
|
pool.inject_test_agent("other-story", "coder-1", AgentStatus::Running);
|
|
|
|
// Call start_agent without agent_name — should pick coder-2
|
|
let result = pool
|
|
.start_agent(tmp.path(), "42_my_story", None, None)
|
|
.await;
|
|
// Will fail for infrastructure reasons (no git repo), but should NOT
|
|
// fail with "All coder agents are busy" — that would mean it didn't
|
|
// try coder-2.
|
|
match result {
|
|
Ok(info) => {
|
|
assert_eq!(info.agent_name, "coder-2");
|
|
}
|
|
Err(err) => {
|
|
assert!(
|
|
!err.contains("All coder agents are busy"),
|
|
"should have selected coder-2 but got: {err}"
|
|
);
|
|
assert!(
|
|
!err.contains("No coder agent configured"),
|
|
"should not fail on agent selection, got: {err}"
|
|
);
|
|
}
|
|
}
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn start_agent_returns_busy_when_all_coders_occupied() {
|
|
let tmp = tempfile::tempdir().unwrap();
|
|
let sk = tmp.path().join(".story_kit");
|
|
std::fs::create_dir_all(&sk).unwrap();
|
|
std::fs::write(
|
|
sk.join("project.toml"),
|
|
r#"
|
|
[[agent]]
|
|
name = "coder-1"
|
|
stage = "coder"
|
|
|
|
[[agent]]
|
|
name = "coder-2"
|
|
stage = "coder"
|
|
"#,
|
|
)
|
|
.unwrap();
|
|
|
|
let pool = AgentPool::new_test(3001);
|
|
pool.inject_test_agent("story-1", "coder-1", AgentStatus::Running);
|
|
pool.inject_test_agent("story-2", "coder-2", AgentStatus::Pending);
|
|
|
|
let result = pool
|
|
.start_agent(tmp.path(), "story-3", None, None)
|
|
.await;
|
|
assert!(result.is_err());
|
|
let err = result.unwrap_err();
|
|
assert!(
|
|
err.contains("All coder agents are busy"),
|
|
"expected busy error, got: {err}"
|
|
);
|
|
}
|
|
|
|
/// Story 203: when all coders are busy the story file must be moved from
|
|
/// 1_upcoming/ to 2_current/ so that auto_assign_available_work can pick
|
|
/// it up once a coder finishes.
|
|
#[tokio::test]
|
|
async fn start_agent_moves_story_to_current_when_coders_busy() {
|
|
let tmp = tempfile::tempdir().unwrap();
|
|
let sk = tmp.path().join(".story_kit");
|
|
let upcoming = sk.join("work/1_upcoming");
|
|
std::fs::create_dir_all(&upcoming).unwrap();
|
|
std::fs::write(
|
|
sk.join("project.toml"),
|
|
r#"
|
|
[[agent]]
|
|
name = "coder-1"
|
|
stage = "coder"
|
|
"#,
|
|
)
|
|
.unwrap();
|
|
// Place the story in 1_upcoming/.
|
|
std::fs::write(
|
|
upcoming.join("story-3.md"),
|
|
"---\nname: Story 3\n---\n",
|
|
)
|
|
.unwrap();
|
|
|
|
let pool = AgentPool::new_test(3001);
|
|
pool.inject_test_agent("story-1", "coder-1", AgentStatus::Running);
|
|
|
|
let result = pool
|
|
.start_agent(tmp.path(), "story-3", None, None)
|
|
.await;
|
|
|
|
// Should fail because all coders are busy.
|
|
assert!(result.is_err());
|
|
let err = result.unwrap_err();
|
|
assert!(
|
|
err.contains("All coder agents are busy"),
|
|
"expected busy error, got: {err}"
|
|
);
|
|
assert!(
|
|
err.contains("queued in work/2_current/"),
|
|
"expected story-to-current message, got: {err}"
|
|
);
|
|
|
|
// Story must have been moved to 2_current/.
|
|
let current_path = sk.join("work/2_current/story-3.md");
|
|
assert!(
|
|
current_path.exists(),
|
|
"story should be in 2_current/ after busy error, but was not"
|
|
);
|
|
let upcoming_path = upcoming.join("story-3.md");
|
|
assert!(
|
|
!upcoming_path.exists(),
|
|
"story should no longer be in 1_upcoming/"
|
|
);
|
|
}
|
|
|
|
/// Story 203: auto_assign_available_work must detect a story in 2_current/
|
|
/// with no active agent and start an agent for it.
|
|
#[tokio::test]
|
|
async fn auto_assign_picks_up_story_queued_in_current() {
|
|
let tmp = tempfile::tempdir().unwrap();
|
|
let sk = tmp.path().join(".story_kit");
|
|
let current = sk.join("work/2_current");
|
|
std::fs::create_dir_all(¤t).unwrap();
|
|
std::fs::write(
|
|
sk.join("project.toml"),
|
|
"[[agent]]\nname = \"coder-1\"\nstage = \"coder\"\n",
|
|
)
|
|
.unwrap();
|
|
// Place the story in 2_current/ (simulating the "queued" state).
|
|
std::fs::write(
|
|
current.join("story-3.md"),
|
|
"---\nname: Story 3\n---\n",
|
|
)
|
|
.unwrap();
|
|
|
|
let pool = AgentPool::new_test(3001);
|
|
// No agents are running — coder-1 is free.
|
|
|
|
// auto_assign will try to call start_agent, which will attempt to create
|
|
// a worktree (will fail without a git repo) — that is fine. We only need
|
|
// to verify the agent is registered as Pending before the background
|
|
// task eventually fails.
|
|
pool.auto_assign_available_work(tmp.path()).await;
|
|
|
|
let agents = pool.agents.lock().unwrap();
|
|
let has_pending = agents.values().any(|a| {
|
|
a.agent_name == "coder-1"
|
|
&& matches!(a.status, AgentStatus::Pending | AgentStatus::Running)
|
|
});
|
|
assert!(
|
|
has_pending,
|
|
"auto_assign should have started coder-1 for story-3, but pool is empty"
|
|
);
|
|
}
|
|
|
|
/// Story 203: if a story is already in 2_current/ or later, start_agent
|
|
/// must not fail — the move is a no-op.
|
|
#[tokio::test]
|
|
async fn start_agent_story_already_in_current_is_noop() {
|
|
let tmp = tempfile::tempdir().unwrap();
|
|
let sk = tmp.path().join(".story_kit");
|
|
let current = sk.join("work/2_current");
|
|
std::fs::create_dir_all(¤t).unwrap();
|
|
std::fs::write(
|
|
sk.join("project.toml"),
|
|
"[[agent]]\nname = \"coder-1\"\nstage = \"coder\"\n",
|
|
)
|
|
.unwrap();
|
|
// Place the story directly in 2_current/.
|
|
std::fs::write(
|
|
current.join("story-5.md"),
|
|
"---\nname: Story 5\n---\n",
|
|
)
|
|
.unwrap();
|
|
|
|
let pool = AgentPool::new_test(3001);
|
|
|
|
// start_agent should attempt to assign coder-1 (no infra, so it will
|
|
// fail for git reasons), but must NOT fail due to the story already
|
|
// being in 2_current/.
|
|
let result = pool
|
|
.start_agent(tmp.path(), "story-5", None, None)
|
|
.await;
|
|
match result {
|
|
Ok(_) => {}
|
|
Err(e) => {
|
|
assert!(
|
|
!e.contains("Failed to move"),
|
|
"should not fail on idempotent move, got: {e}"
|
|
);
|
|
}
|
|
}
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn start_agent_explicit_name_unchanged_when_busy() {
|
|
let tmp = tempfile::tempdir().unwrap();
|
|
let sk = tmp.path().join(".story_kit");
|
|
std::fs::create_dir_all(&sk).unwrap();
|
|
std::fs::write(
|
|
sk.join("project.toml"),
|
|
r#"
|
|
[[agent]]
|
|
name = "coder-1"
|
|
stage = "coder"
|
|
|
|
[[agent]]
|
|
name = "coder-2"
|
|
stage = "coder"
|
|
"#,
|
|
)
|
|
.unwrap();
|
|
|
|
let pool = AgentPool::new_test(3001);
|
|
pool.inject_test_agent("story-1", "coder-1", AgentStatus::Running);
|
|
|
|
// Explicit request for coder-1 (busy) should fail even though coder-2 is free.
|
|
let result = pool
|
|
.start_agent(tmp.path(), "story-2", Some("coder-1"), None)
|
|
.await;
|
|
assert!(result.is_err());
|
|
let err = result.unwrap_err();
|
|
assert!(
|
|
err.contains("coder-1") && err.contains("already running"),
|
|
"expected explicit busy error, got: {err}"
|
|
);
|
|
}
|
|
|
|
// ── start_agent single-instance concurrency tests ─────────────────────────
|
|
|
|
/// Regression test for bug 97: the agent pool must reject a second concurrent
|
|
/// instance of the same agent name even if it would run on a different story.
|
|
#[tokio::test]
|
|
async fn start_agent_rejects_when_same_agent_already_running_on_another_story() {
|
|
use std::fs;
|
|
|
|
let tmp = tempfile::tempdir().unwrap();
|
|
let root = tmp.path();
|
|
|
|
// Write a minimal project.toml so ProjectConfig::load can find the "qa" agent.
|
|
let sk_dir = root.join(".story_kit");
|
|
fs::create_dir_all(&sk_dir).unwrap();
|
|
fs::write(
|
|
sk_dir.join("project.toml"),
|
|
"[[agent]]\nname = \"qa\"\n",
|
|
)
|
|
.unwrap();
|
|
|
|
let pool = AgentPool::new_test(3001);
|
|
// Simulate qa already running on story-a.
|
|
pool.inject_test_agent("story-a", "qa", AgentStatus::Running);
|
|
|
|
// Attempt to start qa on story-b — must be rejected.
|
|
let result = pool
|
|
.start_agent(root, "story-b", Some("qa"), None)
|
|
.await;
|
|
|
|
assert!(
|
|
result.is_err(),
|
|
"start_agent should fail when qa is already running on another story"
|
|
);
|
|
let err = result.unwrap_err();
|
|
assert!(
|
|
err.contains("already running") || err.contains("becomes available"),
|
|
"error message should explain why: got '{err}'"
|
|
);
|
|
}
|
|
|
|
/// Verify that the concurrency guard does NOT block an agent that is merely
|
|
/// Completed (not Running/Pending) — completed agents are free for new work.
|
|
#[tokio::test]
|
|
async fn start_agent_allows_new_story_when_previous_run_is_completed() {
|
|
use std::fs;
|
|
|
|
let tmp = tempfile::tempdir().unwrap();
|
|
let root = tmp.path();
|
|
|
|
let sk_dir = root.join(".story_kit");
|
|
fs::create_dir_all(&sk_dir).unwrap();
|
|
fs::write(
|
|
sk_dir.join("project.toml"),
|
|
"[[agent]]\nname = \"qa\"\n",
|
|
)
|
|
.unwrap();
|
|
|
|
let pool = AgentPool::new_test(3001);
|
|
// Previous run completed — should NOT block a new story.
|
|
pool.inject_test_agent("story-a", "qa", AgentStatus::Completed);
|
|
|
|
// The call will fail eventually (no real worktree / Claude CLI), but it must
|
|
// NOT fail at the concurrency check. We detect the difference by inspecting
|
|
// the error message: a concurrency rejection says "already running", while a
|
|
// later failure (missing story file, missing claude binary, etc.) says something else.
|
|
let result = pool
|
|
.start_agent(root, "story-b", Some("qa"), None)
|
|
.await;
|
|
|
|
if let Err(ref e) = result {
|
|
assert!(
|
|
!e.contains("already running") && !e.contains("becomes available"),
|
|
"completed agent must not trigger the concurrency guard: got '{e}'"
|
|
);
|
|
}
|
|
// result may be Ok (unlikely in test env) or Err for infra reasons — both fine.
|
|
}
|
|
|
|
// ── bug 118: pending entry cleanup on start_agent failure ────────────────
|
|
|
|
/// Regression test for bug 118: when worktree creation fails (e.g. because
|
|
/// there is no git repo), the Pending entry that was inserted into the agent
|
|
/// HashMap must not remain Pending — it must transition to Failed. This
|
|
/// prevents `find_free_agent_for_stage` / auto-assign from being permanently
|
|
/// blocked.
|
|
///
|
|
/// With story 157 the worktree creation moved into the background spawn, so
|
|
/// `start_agent` returns `Ok(Pending)` immediately. We use `wait_for_agent`
|
|
/// to block until the background task resolves.
|
|
#[tokio::test]
|
|
async fn start_agent_cleans_up_pending_entry_on_failure() {
|
|
use std::fs;
|
|
|
|
let tmp = tempfile::tempdir().unwrap();
|
|
let root = tmp.path();
|
|
|
|
// Minimal project.toml with a "qa" agent.
|
|
let sk_dir = root.join(".story_kit");
|
|
fs::create_dir_all(&sk_dir).unwrap();
|
|
fs::write(
|
|
sk_dir.join("project.toml"),
|
|
"[[agent]]\nname = \"qa\"\n",
|
|
)
|
|
.unwrap();
|
|
|
|
// Create the story in upcoming so `move_story_to_current` succeeds,
|
|
// but do NOT init a git repo — `create_worktree` will fail in the spawn.
|
|
let upcoming = root.join(".story_kit/work/1_upcoming");
|
|
fs::create_dir_all(&upcoming).unwrap();
|
|
fs::write(
|
|
upcoming.join("50_story_test.md"),
|
|
"---\nname: Test\n---\n",
|
|
)
|
|
.unwrap();
|
|
|
|
let pool = AgentPool::new_test(3099);
|
|
|
|
let result = pool
|
|
.start_agent(root, "50_story_test", Some("qa"), None)
|
|
.await;
|
|
|
|
// With the non-blocking flow, start_agent returns Ok(Pending) immediately.
|
|
// Worktree creation failure happens asynchronously in the background.
|
|
assert!(
|
|
result.is_ok(),
|
|
"start_agent should return Ok(Pending) immediately: {:?}",
|
|
result.err()
|
|
);
|
|
assert_eq!(
|
|
result.unwrap().status,
|
|
AgentStatus::Pending,
|
|
"initial status must be Pending"
|
|
);
|
|
|
|
// Wait for the background task to reach a terminal state.
|
|
// It must fail (no git repo → create_worktree returns an error).
|
|
let final_info = pool
|
|
.wait_for_agent("50_story_test", "qa", 5000)
|
|
.await
|
|
.expect("wait_for_agent should not time out");
|
|
assert_eq!(
|
|
final_info.status,
|
|
AgentStatus::Failed,
|
|
"agent must transition to Failed after worktree creation error"
|
|
);
|
|
|
|
// The pool must retain a Failed entry (not disappear silently).
|
|
let agents = pool.agents.lock().unwrap();
|
|
let failed_entry = agents
|
|
.values()
|
|
.find(|a| a.agent_name == "qa" && a.status == AgentStatus::Failed);
|
|
assert!(
|
|
failed_entry.is_some(),
|
|
"agent pool must retain a Failed entry so the UI can show the error state"
|
|
);
|
|
drop(agents);
|
|
|
|
// The AgentEvent::Error must be persisted in the event_log so late
|
|
// subscribers / polling clients can see the failure reason.
|
|
let events = pool
|
|
.drain_events("50_story_test", "qa")
|
|
.expect("drain_events should succeed");
|
|
let has_error_event = events
|
|
.iter()
|
|
.any(|e| matches!(e, AgentEvent::Error { .. }));
|
|
assert!(
|
|
has_error_event,
|
|
"event_log must contain AgentEvent::Error after worktree creation fails"
|
|
);
|
|
}
|
|
|
|
/// Verify that a successful start_agent keeps the Running entry (guard is
|
|
/// disarmed). We cannot truly spawn an agent in tests, but we verify that
|
|
/// the concurrency check still blocks a second concurrent start — which
|
|
/// proves the first entry survived the guard.
|
|
#[tokio::test]
|
|
async fn start_agent_guard_does_not_remove_running_entry() {
|
|
use std::fs;
|
|
|
|
let tmp = tempfile::tempdir().unwrap();
|
|
let root = tmp.path();
|
|
|
|
let sk_dir = root.join(".story_kit");
|
|
fs::create_dir_all(&sk_dir).unwrap();
|
|
fs::write(
|
|
sk_dir.join("project.toml"),
|
|
"[[agent]]\nname = \"qa\"\n",
|
|
)
|
|
.unwrap();
|
|
|
|
let pool = AgentPool::new_test(3099);
|
|
|
|
// Manually inject a Running agent (simulates successful start).
|
|
pool.inject_test_agent("story-x", "qa", AgentStatus::Running);
|
|
|
|
// Attempting to start the same agent on a different story must be
|
|
// rejected — the Running entry must still be there.
|
|
let result = pool
|
|
.start_agent(root, "story-y", Some("qa"), None)
|
|
.await;
|
|
|
|
assert!(result.is_err());
|
|
let err = result.unwrap_err();
|
|
assert!(
|
|
err.contains("already running") || err.contains("becomes available"),
|
|
"running entry must survive: got '{err}'"
|
|
);
|
|
}
|
|
|
|
// ── TOCTOU race-condition regression tests (story 132) ───────────────────
|
|
|
|
/// Verify that a Pending entry (not just Running) blocks a concurrent
|
|
/// start_agent for the same agent name on a different story. This proves
|
|
/// the check-and-insert is atomic: the Pending entry is visible to the
|
|
/// second caller because it was inserted while the lock was still held.
|
|
#[tokio::test]
|
|
async fn toctou_pending_entry_blocks_same_agent_on_different_story() {
|
|
use std::fs;
|
|
|
|
let tmp = tempfile::tempdir().unwrap();
|
|
let root = tmp.path();
|
|
|
|
let sk_dir = root.join(".story_kit");
|
|
fs::create_dir_all(&sk_dir).unwrap();
|
|
fs::write(sk_dir.join("project.toml"), "[[agent]]\nname = \"coder-1\"\n").unwrap();
|
|
|
|
let pool = AgentPool::new_test(3099);
|
|
|
|
// Simulate what the winning concurrent call would have done: insert a
|
|
// Pending entry for coder-1 on story-86.
|
|
pool.inject_test_agent("86_story_foo", "coder-1", AgentStatus::Pending);
|
|
|
|
// Now attempt to start coder-1 on a *different* story — must be rejected.
|
|
let result = pool
|
|
.start_agent(root, "130_story_bar", Some("coder-1"), None)
|
|
.await;
|
|
|
|
assert!(result.is_err(), "second start_agent must be rejected");
|
|
let err = result.unwrap_err();
|
|
assert!(
|
|
err.contains("already running") || err.contains("becomes available"),
|
|
"expected concurrency-rejection message, got: '{err}'"
|
|
);
|
|
}
|
|
|
|
/// Concurrent start_agent calls for the same agent name on different stories
|
|
/// must result in exactly one rejection due to the concurrency check (not
|
|
/// due to an unrelated failure such as missing git repo).
|
|
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
|
|
async fn toctou_concurrent_start_agent_same_agent_exactly_one_concurrency_rejection() {
|
|
use std::fs;
|
|
use std::sync::Arc;
|
|
|
|
let tmp = tempfile::tempdir().unwrap();
|
|
let root = tmp.path().to_path_buf();
|
|
|
|
let sk_dir = root.join(".story_kit");
|
|
fs::create_dir_all(sk_dir.join("work/1_upcoming")).unwrap();
|
|
fs::write(
|
|
root.join(".story_kit/project.toml"),
|
|
"[[agent]]\nname = \"coder-1\"\n",
|
|
)
|
|
.unwrap();
|
|
// Both stories must exist in upcoming so move_story_to_current can run
|
|
// (only the winner reaches that point, but we set both up defensively).
|
|
fs::write(
|
|
root.join(".story_kit/work/1_upcoming/86_story_foo.md"),
|
|
"---\nname: Foo\n---\n",
|
|
)
|
|
.unwrap();
|
|
fs::write(
|
|
root.join(".story_kit/work/1_upcoming/130_story_bar.md"),
|
|
"---\nname: Bar\n---\n",
|
|
)
|
|
.unwrap();
|
|
|
|
let pool = Arc::new(AgentPool::new_test(3099));
|
|
|
|
let pool1 = pool.clone();
|
|
let root1 = root.clone();
|
|
let t1 = tokio::spawn(async move {
|
|
pool1
|
|
.start_agent(&root1, "86_story_foo", Some("coder-1"), None)
|
|
.await
|
|
});
|
|
|
|
let pool2 = pool.clone();
|
|
let root2 = root.clone();
|
|
let t2 = tokio::spawn(async move {
|
|
pool2
|
|
.start_agent(&root2, "130_story_bar", Some("coder-1"), None)
|
|
.await
|
|
});
|
|
|
|
let (r1, r2) = tokio::join!(t1, t2);
|
|
let r1 = r1.unwrap();
|
|
let r2 = r2.unwrap();
|
|
|
|
// The concurrency-rejection message always contains "already running" /
|
|
// "becomes available". Any other error (e.g., missing git repo) means
|
|
// that call *won* the atomic check-and-insert.
|
|
let concurrency_rejections = [&r1, &r2]
|
|
.iter()
|
|
.filter(|r| {
|
|
r.as_ref().is_err_and(|e| {
|
|
e.contains("already running") || e.contains("becomes available")
|
|
})
|
|
})
|
|
.count();
|
|
|
|
assert_eq!(
|
|
concurrency_rejections, 1,
|
|
"exactly one call must be rejected by the concurrency check; \
|
|
got r1={r1:?} r2={r2:?}"
|
|
);
|
|
}
|
|
|
|
// ── story-230: prevent duplicate stage agents on same story ───────────────
|
|
|
|
/// start_agent must reject a second coder on a story that already has a
|
|
/// Running coder, even if they are *different* agent names.
|
|
#[tokio::test]
|
|
async fn start_agent_rejects_second_coder_stage_on_same_story() {
|
|
use std::fs;
|
|
|
|
let tmp = tempfile::tempdir().unwrap();
|
|
let root = tmp.path();
|
|
|
|
let sk_dir = root.join(".story_kit");
|
|
fs::create_dir_all(&sk_dir).unwrap();
|
|
fs::write(
|
|
sk_dir.join("project.toml"),
|
|
"[[agent]]\nname = \"coder-1\"\n\n[[agent]]\nname = \"coder-2\"\n",
|
|
)
|
|
.unwrap();
|
|
|
|
let pool = AgentPool::new_test(3099);
|
|
// coder-1 is already running on the story.
|
|
pool.inject_test_agent("42_story_foo", "coder-1", AgentStatus::Running);
|
|
|
|
// Attempt to start coder-2 on the *same* story — must be rejected.
|
|
let result = pool
|
|
.start_agent(root, "42_story_foo", Some("coder-2"), None)
|
|
.await;
|
|
|
|
assert!(result.is_err(), "second coder on same story must be rejected");
|
|
let err = result.unwrap_err();
|
|
assert!(
|
|
err.contains("same pipeline stage"),
|
|
"error must mention same pipeline stage, got: '{err}'"
|
|
);
|
|
assert!(
|
|
err.contains("coder-1") && err.contains("coder-2"),
|
|
"error must name both agents, got: '{err}'"
|
|
);
|
|
}
|
|
|
|
/// The stage-conflict check must also cover QA: a second QA agent on the
|
|
/// same story must be rejected.
|
|
#[tokio::test]
|
|
async fn start_agent_rejects_second_qa_stage_on_same_story() {
|
|
use std::fs;
|
|
|
|
let tmp = tempfile::tempdir().unwrap();
|
|
let root = tmp.path();
|
|
|
|
let sk_dir = root.join(".story_kit");
|
|
fs::create_dir_all(&sk_dir).unwrap();
|
|
// Two qa agents using the explicit stage field so name-based detection
|
|
// doesn't interfere.
|
|
fs::write(
|
|
sk_dir.join("project.toml"),
|
|
"[[agent]]\nname = \"qa-1\"\nstage = \"qa\"\n\n\
|
|
[[agent]]\nname = \"qa-2\"\nstage = \"qa\"\n",
|
|
)
|
|
.unwrap();
|
|
|
|
let pool = AgentPool::new_test(3099);
|
|
pool.inject_test_agent("55_story_bar", "qa-1", AgentStatus::Running);
|
|
|
|
let result = pool
|
|
.start_agent(root, "55_story_bar", Some("qa-2"), None)
|
|
.await;
|
|
|
|
assert!(result.is_err(), "second qa on same story must be rejected");
|
|
let err = result.unwrap_err();
|
|
assert!(
|
|
err.contains("same pipeline stage"),
|
|
"error must mention same pipeline stage, got: '{err}'"
|
|
);
|
|
}
|
|
|
|
/// Regression test (story 230): concurrent start_agent calls with two
|
|
/// different coder names on the same story — exactly one must succeed
|
|
/// (or fail for infrastructure reasons), and exactly one must be rejected
|
|
/// with a stage-conflict error.
|
|
///
|
|
/// The story is pre-placed in `2_current/` so that both concurrent
|
|
/// `move_story_to_current` calls are no-ops, guaranteeing both reach the
|
|
/// lock where the stage-conflict check fires.
|
|
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
|
|
async fn start_agent_concurrent_two_coders_same_story_exactly_one_stage_rejection() {
|
|
use std::fs;
|
|
use std::sync::Arc;
|
|
|
|
let tmp = tempfile::tempdir().unwrap();
|
|
let root = tmp.path().to_path_buf();
|
|
|
|
let sk_dir = root.join(".story_kit");
|
|
// Place story directly in 2_current/ so move_story_to_current is a
|
|
// no-op for both concurrent callers, letting both reach the lock.
|
|
fs::create_dir_all(sk_dir.join("work/2_current")).unwrap();
|
|
fs::write(
|
|
root.join(".story_kit/project.toml"),
|
|
"[[agent]]\nname = \"coder-1\"\n\n[[agent]]\nname = \"coder-2\"\n",
|
|
)
|
|
.unwrap();
|
|
fs::write(
|
|
root.join(".story_kit/work/2_current/42_story_foo.md"),
|
|
"---\nname: Foo\n---\n",
|
|
)
|
|
.unwrap();
|
|
|
|
let pool = Arc::new(AgentPool::new_test(3099));
|
|
|
|
let pool1 = pool.clone();
|
|
let root1 = root.clone();
|
|
let t1 = tokio::spawn(async move {
|
|
pool1
|
|
.start_agent(&root1, "42_story_foo", Some("coder-1"), None)
|
|
.await
|
|
});
|
|
|
|
let pool2 = pool.clone();
|
|
let root2 = root.clone();
|
|
let t2 = tokio::spawn(async move {
|
|
pool2
|
|
.start_agent(&root2, "42_story_foo", Some("coder-2"), None)
|
|
.await
|
|
});
|
|
|
|
let (r1, r2) = tokio::join!(t1, t2);
|
|
let r1 = r1.unwrap();
|
|
let r2 = r2.unwrap();
|
|
|
|
// Exactly one call must be rejected with a stage-conflict error.
|
|
let stage_rejections = [&r1, &r2]
|
|
.iter()
|
|
.filter(|r| {
|
|
r.as_ref()
|
|
.is_err_and(|e| e.contains("same pipeline stage"))
|
|
})
|
|
.count();
|
|
|
|
assert_eq!(
|
|
stage_rejections, 1,
|
|
"exactly one call must be rejected by the stage-conflict check; \
|
|
got r1={r1:?} r2={r2:?}"
|
|
);
|
|
}
|
|
|
|
/// Regression test (story 230): two coders on *different* stories must
|
|
/// not trigger the stage-conflict guard — the guard is per-story.
|
|
#[tokio::test]
|
|
async fn start_agent_two_coders_different_stories_not_blocked_by_stage_check() {
|
|
use std::fs;
|
|
|
|
let tmp = tempfile::tempdir().unwrap();
|
|
let root = tmp.path();
|
|
|
|
let sk_dir = root.join(".story_kit");
|
|
fs::create_dir_all(sk_dir.join("work/1_upcoming")).unwrap();
|
|
fs::write(
|
|
root.join(".story_kit/project.toml"),
|
|
"[[agent]]\nname = \"coder-1\"\n\n[[agent]]\nname = \"coder-2\"\n",
|
|
)
|
|
.unwrap();
|
|
fs::write(
|
|
root.join(".story_kit/work/1_upcoming/99_story_baz.md"),
|
|
"---\nname: Baz\n---\n",
|
|
)
|
|
.unwrap();
|
|
|
|
let pool = AgentPool::new_test(3099);
|
|
// coder-1 is running on a *different* story.
|
|
pool.inject_test_agent("42_story_foo", "coder-1", AgentStatus::Running);
|
|
|
|
// Starting coder-2 on story-99 must NOT be rejected by the stage
|
|
// guard (it may fail for infrastructure reasons like missing git repo,
|
|
// but not because of the stage-conflict check).
|
|
let result = pool
|
|
.start_agent(root, "99_story_baz", Some("coder-2"), None)
|
|
.await;
|
|
|
|
if let Err(ref e) = result {
|
|
assert!(
|
|
!e.contains("same pipeline stage"),
|
|
"stage-conflict guard must not fire for agents on different stories; \
|
|
got: '{e}'"
|
|
);
|
|
}
|
|
// result may be Ok (unlikely in test env) or Err for infra reasons — both fine.
|
|
}
|
|
|
|
/// Two concurrent auto_assign_available_work calls must not assign the same
|
|
/// agent to two stories simultaneously. After both complete, at most one
|
|
/// Pending/Running entry must exist per agent name.
|
|
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
|
|
async fn toctou_concurrent_auto_assign_no_duplicate_agent_assignments() {
|
|
use std::fs;
|
|
use std::sync::Arc;
|
|
|
|
let tmp = tempfile::tempdir().unwrap();
|
|
let root = tmp.path().to_path_buf();
|
|
|
|
let sk_dir = root.join(".story_kit");
|
|
// Two stories waiting in 2_current, one coder agent.
|
|
fs::create_dir_all(sk_dir.join("work/2_current")).unwrap();
|
|
fs::write(
|
|
sk_dir.join("project.toml"),
|
|
"[[agent]]\nname = \"coder-1\"\n",
|
|
)
|
|
.unwrap();
|
|
fs::write(
|
|
sk_dir.join("work/2_current/86_story_foo.md"),
|
|
"---\nname: Foo\n---\n",
|
|
)
|
|
.unwrap();
|
|
fs::write(
|
|
sk_dir.join("work/2_current/130_story_bar.md"),
|
|
"---\nname: Bar\n---\n",
|
|
)
|
|
.unwrap();
|
|
|
|
let pool = Arc::new(AgentPool::new_test(3099));
|
|
|
|
// Run two concurrent auto_assign calls.
|
|
let pool1 = pool.clone();
|
|
let root1 = root.clone();
|
|
let t1 = tokio::spawn(async move { pool1.auto_assign_available_work(&root1).await });
|
|
|
|
let pool2 = pool.clone();
|
|
let root2 = root.clone();
|
|
let t2 = tokio::spawn(async move { pool2.auto_assign_available_work(&root2).await });
|
|
|
|
let _ = tokio::join!(t1, t2);
|
|
|
|
// At most one Pending/Running entry should exist for coder-1.
|
|
let agents = pool.agents.lock().unwrap();
|
|
let active_coder_count = agents
|
|
.values()
|
|
.filter(|a| {
|
|
a.agent_name == "coder-1"
|
|
&& matches!(a.status, AgentStatus::Pending | AgentStatus::Running)
|
|
})
|
|
.count();
|
|
|
|
assert!(
|
|
active_coder_count <= 1,
|
|
"coder-1 must not be assigned to more than one story simultaneously; \
|
|
found {active_coder_count} active entries"
|
|
);
|
|
}
|
|
|
|
// ── merge_agent_work tests ────────────────────────────────────────────────
|
|
|
|
/// Helper: start a merge and poll until terminal state.
|
|
async fn run_merge_to_completion(
|
|
pool: &Arc<AgentPool>,
|
|
repo: &std::path::Path,
|
|
story_id: &str,
|
|
) -> MergeJob {
|
|
pool.start_merge_agent_work(repo, story_id).unwrap();
|
|
loop {
|
|
tokio::time::sleep(std::time::Duration::from_millis(50)).await;
|
|
if let Some(job) = pool.get_merge_status(story_id)
|
|
&& !matches!(job.status, MergeJobStatus::Running)
|
|
{
|
|
return job;
|
|
}
|
|
}
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn merge_agent_work_returns_error_when_branch_not_found() {
|
|
use tempfile::tempdir;
|
|
|
|
let tmp = tempdir().unwrap();
|
|
let repo = tmp.path();
|
|
init_git_repo(repo);
|
|
|
|
let pool = Arc::new(AgentPool::new_test(3001));
|
|
let job = run_merge_to_completion(&pool, repo, "99_nonexistent").await;
|
|
match &job.status {
|
|
MergeJobStatus::Completed(report) => {
|
|
assert!(!report.success, "should fail when branch missing");
|
|
}
|
|
MergeJobStatus::Failed(_) => {
|
|
// Also acceptable — the pipeline errored out
|
|
}
|
|
MergeJobStatus::Running => {
|
|
panic!("should not still be running");
|
|
}
|
|
}
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn merge_agent_work_succeeds_on_clean_branch() {
|
|
use std::fs;
|
|
use tempfile::tempdir;
|
|
|
|
let tmp = tempdir().unwrap();
|
|
let repo = tmp.path();
|
|
init_git_repo(repo);
|
|
|
|
// Create a feature branch with a commit
|
|
Command::new("git")
|
|
.args(["checkout", "-b", "feature/story-23_test"])
|
|
.current_dir(repo)
|
|
.output()
|
|
.unwrap();
|
|
fs::write(repo.join("feature.txt"), "feature content").unwrap();
|
|
Command::new("git")
|
|
.args(["add", "."])
|
|
.current_dir(repo)
|
|
.output()
|
|
.unwrap();
|
|
Command::new("git")
|
|
.args(["commit", "-m", "add feature"])
|
|
.current_dir(repo)
|
|
.output()
|
|
.unwrap();
|
|
|
|
// Switch back to master (initial branch)
|
|
Command::new("git")
|
|
.args(["checkout", "master"])
|
|
.current_dir(repo)
|
|
.output()
|
|
.unwrap();
|
|
|
|
// Create the story file in 4_merge/ so we can test archival
|
|
let merge_dir = repo.join(".story_kit/work/4_merge");
|
|
fs::create_dir_all(&merge_dir).unwrap();
|
|
let story_file = merge_dir.join("23_test.md");
|
|
fs::write(&story_file, "---\nname: Test\n---\n").unwrap();
|
|
Command::new("git")
|
|
.args(["add", "."])
|
|
.current_dir(repo)
|
|
.output()
|
|
.unwrap();
|
|
Command::new("git")
|
|
.args(["commit", "-m", "add story in merge"])
|
|
.current_dir(repo)
|
|
.output()
|
|
.unwrap();
|
|
|
|
let pool = Arc::new(AgentPool::new_test(3001));
|
|
let job = run_merge_to_completion(&pool, repo, "23_test").await;
|
|
|
|
match &job.status {
|
|
MergeJobStatus::Completed(report) => {
|
|
assert!(!report.had_conflicts, "should have no conflicts");
|
|
assert!(
|
|
report.success || report.gate_output.contains("Failed to run") || !report.gates_passed,
|
|
"report should be coherent: {report:?}"
|
|
);
|
|
if report.story_archived {
|
|
let done = repo.join(".story_kit/work/5_done/23_test.md");
|
|
assert!(done.exists(), "done file should exist");
|
|
}
|
|
}
|
|
MergeJobStatus::Failed(e) => {
|
|
// Gate failures are acceptable in test env
|
|
assert!(
|
|
e.contains("Failed") || e.contains("failed"),
|
|
"unexpected failure: {e}"
|
|
);
|
|
}
|
|
MergeJobStatus::Running => panic!("should not still be running"),
|
|
}
|
|
}
|
|
|
|
// ── quality gate ordering test ────────────────────────────────
|
|
|
|
/// Regression test for bug 142: quality gates have to run BEFORE master is
/// fast-forwarded, so broken code never lands on master.
///
/// The repo gets a `script/test` that always exits 1 plus a feature branch
/// with one commit. `run_squash_merge` must then report failure and leave
/// master's HEAD where it was.
#[cfg(unix)]
#[test]
fn quality_gates_run_before_fast_forward_to_master() {
    use std::os::unix::fs::PermissionsExt;

    let workspace = tempfile::tempdir().unwrap();
    let repo = workspace.path();
    init_git_repo(repo);

    // Runs `git <args>` inside the temp repo; exit status is not inspected.
    let git = |args: &[&str]| {
        Command::new("git")
            .args(args)
            .current_dir(repo)
            .output()
            .unwrap()
    };
    // Current HEAD commit id, trimmed of the trailing newline.
    let head_sha = || {
        String::from_utf8(git(&["rev-parse", "HEAD"]).stdout)
            .unwrap()
            .trim()
            .to_string()
    };

    // An executable script/test that always fails makes the gates fail.
    let script_dir = repo.join("script");
    std::fs::create_dir_all(&script_dir).unwrap();
    let script_test = script_dir.join("test");
    std::fs::write(&script_test, "#!/usr/bin/env bash\nexit 1\n").unwrap();
    let mut perms = std::fs::metadata(&script_test).unwrap().permissions();
    perms.set_mode(0o755);
    std::fs::set_permissions(&script_test, perms).unwrap();
    git(&["add", "."]);
    git(&["commit", "-m", "add failing script/test"]);

    // Feature branch with one commit.
    git(&["checkout", "-b", "feature/story-142_test"]);
    std::fs::write(repo.join("change.txt"), "feature change").unwrap();
    git(&["add", "."]);
    git(&["commit", "-m", "feature work"]);

    // Return to master and remember where it points.
    git(&["checkout", "master"]);
    let head_before = head_sha();

    // Gates fail because of script/test, so no fast-forward may happen.
    let result =
        crate::agents::merge::run_squash_merge(repo, "feature/story-142_test", "142_test")
            .unwrap();

    let head_after = head_sha();

    assert!(
        !result.success,
        "run_squash_merge must report failure when gates fail"
    );
    assert_eq!(
        head_before, head_after,
        "master HEAD must not advance when quality gates fail (bug 142)"
    );
}
|
|
|
|
#[tokio::test]
|
|
async fn merge_agent_work_conflict_does_not_break_master() {
|
|
use std::fs;
|
|
use tempfile::tempdir;
|
|
|
|
let tmp = tempdir().unwrap();
|
|
let repo = tmp.path();
|
|
init_git_repo(repo);
|
|
|
|
// Create a file on master.
|
|
fs::write(repo.join("code.rs"), "fn main() {\n println!(\"hello\");\n}\n").unwrap();
|
|
Command::new("git")
|
|
.args(["add", "."])
|
|
.current_dir(repo)
|
|
.output()
|
|
.unwrap();
|
|
Command::new("git")
|
|
.args(["commit", "-m", "initial code"])
|
|
.current_dir(repo)
|
|
.output()
|
|
.unwrap();
|
|
|
|
// Feature branch: modify the same line differently.
|
|
Command::new("git")
|
|
.args(["checkout", "-b", "feature/story-42_story_foo"])
|
|
.current_dir(repo)
|
|
.output()
|
|
.unwrap();
|
|
fs::write(repo.join("code.rs"), "fn main() {\n println!(\"hello\");\n feature_fn();\n}\n").unwrap();
|
|
Command::new("git")
|
|
.args(["add", "."])
|
|
.current_dir(repo)
|
|
.output()
|
|
.unwrap();
|
|
Command::new("git")
|
|
.args(["commit", "-m", "feature: add fn call"])
|
|
.current_dir(repo)
|
|
.output()
|
|
.unwrap();
|
|
|
|
// Master: add different line at same location.
|
|
Command::new("git")
|
|
.args(["checkout", "master"])
|
|
.current_dir(repo)
|
|
.output()
|
|
.unwrap();
|
|
fs::write(repo.join("code.rs"), "fn main() {\n println!(\"hello\");\n master_fn();\n}\n").unwrap();
|
|
Command::new("git")
|
|
.args(["add", "."])
|
|
.current_dir(repo)
|
|
.output()
|
|
.unwrap();
|
|
Command::new("git")
|
|
.args(["commit", "-m", "master: add fn call"])
|
|
.current_dir(repo)
|
|
.output()
|
|
.unwrap();
|
|
|
|
// Create story file in 4_merge.
|
|
let merge_dir = repo.join(".story_kit/work/4_merge");
|
|
fs::create_dir_all(&merge_dir).unwrap();
|
|
fs::write(merge_dir.join("42_story_foo.md"), "---\nname: Test\n---\n").unwrap();
|
|
Command::new("git")
|
|
.args(["add", "."])
|
|
.current_dir(repo)
|
|
.output()
|
|
.unwrap();
|
|
Command::new("git")
|
|
.args(["commit", "-m", "add story"])
|
|
.current_dir(repo)
|
|
.output()
|
|
.unwrap();
|
|
|
|
let pool = Arc::new(AgentPool::new_test(3001));
|
|
let job = run_merge_to_completion(&pool, repo, "42_story_foo").await;
|
|
|
|
// Master should NEVER have conflict markers, regardless of merge outcome.
|
|
let master_code = fs::read_to_string(repo.join("code.rs")).unwrap();
|
|
assert!(
|
|
!master_code.contains("<<<<<<<"),
|
|
"master must never contain conflict markers:\n{master_code}"
|
|
);
|
|
assert!(
|
|
!master_code.contains(">>>>>>>"),
|
|
"master must never contain conflict markers:\n{master_code}"
|
|
);
|
|
|
|
// The report should accurately reflect what happened.
|
|
match &job.status {
|
|
MergeJobStatus::Completed(report) => {
|
|
assert!(report.had_conflicts, "should report conflicts");
|
|
}
|
|
MergeJobStatus::Failed(_) => {
|
|
// Acceptable — merge aborted due to conflicts
|
|
}
|
|
MergeJobStatus::Running => panic!("should not still be running"),
|
|
}
|
|
}
|
|
|
|
// ── reconcile_on_startup tests ────────────────────────────────────────────
|
|
|
|
#[tokio::test]
|
|
async fn reconcile_on_startup_noop_when_no_worktrees() {
|
|
let tmp = tempfile::tempdir().unwrap();
|
|
let pool = AgentPool::new_test(3001);
|
|
let (tx, _rx) = broadcast::channel(16);
|
|
// Should not panic; no worktrees to reconcile.
|
|
pool.reconcile_on_startup(tmp.path(), &tx).await;
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn reconcile_on_startup_emits_done_event() {
|
|
let tmp = tempfile::tempdir().unwrap();
|
|
let pool = AgentPool::new_test(3001);
|
|
let (tx, mut rx) = broadcast::channel::<ReconciliationEvent>(16);
|
|
pool.reconcile_on_startup(tmp.path(), &tx).await;
|
|
|
|
// Collect all events; the last must be "done".
|
|
let mut events: Vec<ReconciliationEvent> = Vec::new();
|
|
while let Ok(evt) = rx.try_recv() {
|
|
events.push(evt);
|
|
}
|
|
assert!(
|
|
events.iter().any(|e| e.status == "done"),
|
|
"reconcile_on_startup must emit a 'done' event; got: {:?}",
|
|
events.iter().map(|e| &e.status).collect::<Vec<_>>()
|
|
);
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn reconcile_on_startup_skips_story_without_committed_work() {
|
|
use std::fs;
|
|
let tmp = tempfile::tempdir().unwrap();
|
|
let root = tmp.path();
|
|
|
|
// Set up story in 2_current/.
|
|
let current = root.join(".story_kit/work/2_current");
|
|
fs::create_dir_all(¤t).unwrap();
|
|
fs::write(current.join("60_story_test.md"), "test").unwrap();
|
|
|
|
// Create a worktree directory that is a fresh git repo with no commits
|
|
// ahead of its own base branch (simulates a worktree where no work was done).
|
|
let wt_dir = root.join(".story_kit/worktrees/60_story_test");
|
|
fs::create_dir_all(&wt_dir).unwrap();
|
|
init_git_repo(&wt_dir);
|
|
|
|
let pool = AgentPool::new_test(3001);
|
|
let (tx, _rx) = broadcast::channel(16);
|
|
pool.reconcile_on_startup(root, &tx).await;
|
|
|
|
// Story should still be in 2_current/ — nothing was reconciled.
|
|
assert!(
|
|
current.join("60_story_test.md").exists(),
|
|
"story should stay in 2_current/ when worktree has no committed work"
|
|
);
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn reconcile_on_startup_runs_gates_on_worktree_with_committed_work() {
|
|
use std::fs;
|
|
let tmp = tempfile::tempdir().unwrap();
|
|
let root = tmp.path();
|
|
|
|
// Set up a git repo for the project root.
|
|
init_git_repo(root);
|
|
|
|
// Set up story in 2_current/ and commit it so the project root is clean.
|
|
let current = root.join(".story_kit/work/2_current");
|
|
fs::create_dir_all(¤t).unwrap();
|
|
fs::write(current.join("61_story_test.md"), "test").unwrap();
|
|
Command::new("git")
|
|
.args(["add", "."])
|
|
.current_dir(root)
|
|
.output()
|
|
.unwrap();
|
|
Command::new("git")
|
|
.args([
|
|
"-c",
|
|
"user.email=test@test.com",
|
|
"-c",
|
|
"user.name=Test",
|
|
"commit",
|
|
"-m",
|
|
"add story",
|
|
])
|
|
.current_dir(root)
|
|
.output()
|
|
.unwrap();
|
|
|
|
// Create a real git worktree for the story.
|
|
let wt_dir = root.join(".story_kit/worktrees/61_story_test");
|
|
fs::create_dir_all(wt_dir.parent().unwrap()).unwrap();
|
|
Command::new("git")
|
|
.args([
|
|
"worktree",
|
|
"add",
|
|
&wt_dir.to_string_lossy(),
|
|
"-b",
|
|
"feature/story-61_story_test",
|
|
])
|
|
.current_dir(root)
|
|
.output()
|
|
.unwrap();
|
|
|
|
// Add a commit to the feature branch (simulates coder completing work).
|
|
fs::write(wt_dir.join("implementation.txt"), "done").unwrap();
|
|
Command::new("git")
|
|
.args(["add", "."])
|
|
.current_dir(&wt_dir)
|
|
.output()
|
|
.unwrap();
|
|
Command::new("git")
|
|
.args([
|
|
"-c",
|
|
"user.email=test@test.com",
|
|
"-c",
|
|
"user.name=Test",
|
|
"commit",
|
|
"-m",
|
|
"implement story",
|
|
])
|
|
.current_dir(&wt_dir)
|
|
.output()
|
|
.unwrap();
|
|
|
|
assert!(
|
|
crate::agents::gates::worktree_has_committed_work(&wt_dir),
|
|
"test setup: worktree should have committed work"
|
|
);
|
|
|
|
let pool = AgentPool::new_test(3001);
|
|
let (tx, _rx) = broadcast::channel(16);
|
|
pool.reconcile_on_startup(root, &tx).await;
|
|
|
|
// In the test env, cargo clippy will fail (no Cargo.toml) so gates fail
|
|
// and the story stays in 2_current/. The important assertion is that
|
|
// reconcile ran without panicking and the story is in a consistent state.
|
|
let in_current = current.join("61_story_test.md").exists();
|
|
let in_qa = root
|
|
.join(".story_kit/work/3_qa/61_story_test.md")
|
|
.exists();
|
|
assert!(
|
|
in_current || in_qa,
|
|
"story should be in 2_current/ or 3_qa/ after reconciliation"
|
|
);
|
|
}
|
|
|
|
/// `has_review_hold` is true for a `3_qa/` file whose front matter contains
/// `review_hold: true`.
#[test]
fn has_review_hold_returns_true_when_set() {
    let workspace = tempfile::tempdir().unwrap();
    let root = workspace.path();

    let qa_dir = root.join(".story_kit/work/3_qa");
    std::fs::create_dir_all(&qa_dir).unwrap();
    std::fs::write(
        qa_dir.join("10_spike_research.md"),
        "---\nname: Research spike\nreview_hold: true\n---\n# Spike\n",
    )
    .unwrap();

    let held = has_review_hold(root, "3_qa", "10_spike_research");
    assert!(held);
}
|
|
|
|
/// `has_review_hold` is false for a `3_qa/` file whose front matter has no
/// `review_hold` key.
#[test]
fn has_review_hold_returns_false_when_not_set() {
    let workspace = tempfile::tempdir().unwrap();
    let root = workspace.path();

    let qa_dir = root.join(".story_kit/work/3_qa");
    std::fs::create_dir_all(&qa_dir).unwrap();
    std::fs::write(
        qa_dir.join("10_spike_research.md"),
        "---\nname: Research spike\n---\n# Spike\n",
    )
    .unwrap();

    let held = has_review_hold(root, "3_qa", "10_spike_research");
    assert!(!held);
}
|
|
|
|
/// `has_review_hold` is false when the story file does not exist at all.
#[test]
fn has_review_hold_returns_false_when_file_missing() {
    let workspace = tempfile::tempdir().unwrap();
    let held = has_review_hold(workspace.path(), "3_qa", "99_spike_missing");
    assert!(!held);
}
|
|
|
|
/// Story 265: auto_assign_available_work must skip spikes in 3_qa/ that
|
|
/// have review_hold: true set in their front matter.
|
|
#[tokio::test]
|
|
async fn auto_assign_skips_spikes_with_review_hold() {
|
|
let tmp = tempfile::tempdir().unwrap();
|
|
let root = tmp.path();
|
|
|
|
// Create project.toml with a QA agent.
|
|
let sk = root.join(".story_kit");
|
|
std::fs::create_dir_all(&sk).unwrap();
|
|
std::fs::write(
|
|
sk.join("project.toml"),
|
|
"[[agents]]\nname = \"qa\"\nrole = \"qa\"\nmodel = \"test\"\nprompt = \"test\"\n",
|
|
)
|
|
.unwrap();
|
|
|
|
// Put a spike in 3_qa/ with review_hold: true.
|
|
let qa_dir = root.join(".story_kit/work/3_qa");
|
|
std::fs::create_dir_all(&qa_dir).unwrap();
|
|
std::fs::write(
|
|
qa_dir.join("20_spike_test.md"),
|
|
"---\nname: Test Spike\nreview_hold: true\n---\n# Spike\n",
|
|
)
|
|
.unwrap();
|
|
|
|
let (watcher_tx, _) = broadcast::channel::<WatcherEvent>(4);
|
|
let pool = AgentPool::new(3001, watcher_tx);
|
|
|
|
pool.auto_assign_available_work(root).await;
|
|
|
|
// No agent should have been started for the spike.
|
|
let agents = pool.agents.lock().unwrap();
|
|
assert!(
|
|
agents.is_empty(),
|
|
"No agents should be assigned to a spike with review_hold"
|
|
);
|
|
}
|
|
}
|