diff --git a/server/src/agents.rs b/server/src/agents.rs index 12cacd4..6897a4f 100644 --- a/server/src/agents.rs +++ b/server/src/agents.rs @@ -4,7 +4,7 @@ use crate::slog_error; use crate::slog_warn; use crate::config::{AgentConfig, ProjectConfig}; use crate::worktree::{self, WorktreeInfo}; -use portable_pty::{CommandBuilder, PtySize, native_pty_system}; +use portable_pty::{ChildKiller, CommandBuilder, PtySize, native_pty_system}; use serde::Serialize; use std::collections::HashMap; use std::io::{BufRead, BufReader}; @@ -243,6 +243,27 @@ fn agent_info_from_entry(story_id: &str, agent: &StoryAgent) -> AgentInfo { pub struct AgentPool { agents: Arc>>, port: u16, + /// Registry of active PTY child process killers, keyed by "{story_id}:{agent_name}". + /// Used to terminate child processes on server shutdown or agent stop, preventing + /// orphaned Claude Code processes from running after the server exits. + child_killers: Arc>>>, +} + +/// RAII guard that removes a child killer from the registry on drop. +/// +/// This ensures the killer is always cleaned up when `run_agent_pty_blocking` +/// returns, regardless of the exit path (normal completion, timeout, or error). +struct ChildKillerGuard { + killers: Arc>>>, + key: String, +} + +impl Drop for ChildKillerGuard { + fn drop(&mut self) { + if let Ok(mut killers) = self.killers.lock() { + killers.remove(&self.key); + } + } } impl AgentPool { @@ -250,6 +271,35 @@ impl AgentPool { Self { agents: Arc::new(Mutex::new(HashMap::new())), port, + child_killers: Arc::new(Mutex::new(HashMap::new())), + } + } + + /// Kill all active PTY child processes. + /// + /// Called on server shutdown to prevent orphaned Claude Code processes from + /// continuing to run after the server exits. Each registered killer is called + /// once, then the registry is cleared. + pub fn kill_all_children(&self) { + if let Ok(mut killers) = self.child_killers.lock() { + for (key, killer) in killers.iter_mut() { + slog!("[agents] Killing child process for {key} on shutdown"); + let _ = killer.kill(); + } + killers.clear(); + } + } + + /// Kill and deregister the child process for a specific agent key. + /// + /// Used by `stop_agent` to ensure the PTY child is terminated even though + /// aborting a `spawn_blocking` task handle does not interrupt the blocking thread. + fn kill_child_for_key(&self, key: &str) { + if let Ok(mut killers) = self.child_killers.lock() + && let Some(mut killer) = killers.remove(key) + { + slog!("[agents] Killing child process for {key} on stop"); + let _ = killer.kill(); } } @@ -394,6 +444,7 @@ impl AgentPool { let log_clone = event_log.clone(); let port_for_task = self.port; let log_writer_clone = log_writer.clone(); + let child_killers_clone = self.child_killers.clone(); // Spawn the background task. Worktree creation and agent launch happen here // so `start_agent` returns immediately after registering the agent as @@ -490,6 +541,7 @@ impl AgentPool { &log_clone, log_writer_clone, inactivity_timeout_secs, + child_killers_clone, ) .await { @@ -566,11 +618,14 @@ impl AgentPool { (wt, handle, tx) }; - // Abort the task + // Abort the task and kill the PTY child process. + // Note: aborting a spawn_blocking task handle does not interrupt the blocking + // thread, so we must also kill the child process directly via the killer registry. if let Some(handle) = task_handle { handle.abort(); let _ = handle.await; } + self.kill_child_for_key(&key); // Preserve worktree for inspection — don't destroy agent's work on stop. if let Some(ref wt) = worktree_info { @@ -1076,6 +1131,7 @@ impl AgentPool { let pool_clone = Self { agents: Arc::clone(&self.agents), port: self.port, + child_killers: Arc::clone(&self.child_killers), }; let sid = story_id.to_string(); let aname = agent_name.to_string(); @@ -1645,6 +1701,19 @@ impl AgentPool { tx } + /// Test helper: inject a child killer into the registry. + #[cfg(test)] + pub fn inject_child_killer(&self, key: &str, killer: Box) { + let mut killers = self.child_killers.lock().unwrap(); + killers.insert(key.to_string(), killer); + } + + /// Test helper: return the number of registered child killers. + #[cfg(test)] + pub fn child_killer_count(&self) -> usize { + self.child_killers.lock().unwrap().len() + } + /// Run a single watchdog pass synchronously (test helper). #[cfg(test)] pub fn run_watchdog_once(&self) { @@ -2002,7 +2071,11 @@ fn spawn_pipeline_advance( let sid = story_id.to_string(); let aname = agent_name.to_string(); tokio::spawn(async move { - let pool = AgentPool { agents, port }; + let pool = AgentPool { + agents, + port, + child_killers: Arc::new(Mutex::new(HashMap::new())), + }; pool.run_pipeline_advance_for_completed_agent(&sid, &aname) .await; }); @@ -3062,6 +3135,7 @@ async fn run_agent_pty_streaming( event_log: &Arc>>, log_writer: Option>>, inactivity_timeout_secs: u64, + child_killers: Arc>>>, ) -> Result, String> { let sid = story_id.to_string(); let aname = agent_name.to_string(); @@ -3084,6 +3158,7 @@ async fn run_agent_pty_streaming( &event_log, log_writer.as_deref(), inactivity_timeout_secs, + &child_killers, ) }) .await @@ -3121,6 +3196,7 @@ fn run_agent_pty_blocking( event_log: &Mutex>, log_writer: Option<&Mutex>, inactivity_timeout_secs: u64, + child_killers: &Arc>>>, ) -> Result, String> { let pty_system = native_pty_system(); @@ -3166,6 +3242,21 @@ fn run_agent_pty_blocking( .spawn_command(cmd) .map_err(|e| format!("Failed to spawn agent for {story_id}:{agent_name}: {e}"))?; + // Register the child killer so that kill_all_children() / stop_agent() can + // terminate this process on server shutdown, even if the blocking thread + // cannot be interrupted. The ChildKillerGuard deregisters on function exit. + let killer_key = composite_key(story_id, agent_name); + { + let killer = child.clone_killer(); + if let Ok(mut killers) = child_killers.lock() { + killers.insert(killer_key.clone(), killer); + } + } + let _killer_guard = ChildKillerGuard { + killers: Arc::clone(child_killers), + key: killer_key, + }; + drop(pair.slave); let reader = pair @@ -6003,4 +6094,107 @@ theirs ); } } + + // ── kill_all_children tests ──────────────────────────────────── + + /// Returns true if a process with the given PID is currently running. + fn process_is_running(pid: u32) -> bool { + std::process::Command::new("ps") + .arg("-p") + .arg(pid.to_string()) + .stdout(std::process::Stdio::null()) + .stderr(std::process::Stdio::null()) + .status() + .map(|s| s.success()) + .unwrap_or(false) + } + + #[test] + fn kill_all_children_is_safe_on_empty_pool() { + let pool = AgentPool::new(3001); + // Should not panic or deadlock on an empty registry. + pool.kill_all_children(); + assert_eq!(pool.child_killer_count(), 0); + } + + #[test] + fn kill_all_children_kills_real_process() { + // GIVEN: a real PTY child process (sleep 100) with its killer registered. + let pool = AgentPool::new(3001); + + let pty_system = native_pty_system(); + let pair = pty_system + .openpty(PtySize { + rows: 24, + cols: 80, + pixel_width: 0, + pixel_height: 0, + }) + .expect("failed to open pty"); + + let mut cmd = CommandBuilder::new("sleep"); + cmd.arg("100"); + let mut child = pair + .slave + .spawn_command(cmd) + .expect("failed to spawn sleep"); + let pid = child.process_id().expect("no pid"); + + pool.inject_child_killer("story:agent", child.clone_killer()); + + // Verify the process is alive before we kill it. + assert!( + process_is_running(pid), + "process {pid} should be running before kill_all_children" + ); + + // WHEN: kill_all_children() is called. + pool.kill_all_children(); + + // Collect the exit status (prevents zombie; also ensures signal was sent). + let _ = child.wait(); + + // THEN: the process should be dead. + assert!( + !process_is_running(pid), + "process {pid} should have been killed by kill_all_children" + ); + } + + #[test] + fn kill_all_children_clears_registry() { + // GIVEN: a pool with one registered killer. + let pool = AgentPool::new(3001); + + let pty_system = native_pty_system(); + let pair = pty_system + .openpty(PtySize { + rows: 24, + cols: 80, + pixel_width: 0, + pixel_height: 0, + }) + .expect("failed to open pty"); + + let mut cmd = CommandBuilder::new("sleep"); + cmd.arg("1"); + let mut child = pair + .slave + .spawn_command(cmd) + .expect("failed to spawn sleep"); + + pool.inject_child_killer("story:agent", child.clone_killer()); + assert_eq!(pool.child_killer_count(), 1); + + // WHEN: kill_all_children() is called. + pool.kill_all_children(); + let _ = child.wait(); + + // THEN: the registry is empty. + assert_eq!( + pool.child_killer_count(), + 0, + "child_killers should be cleared after kill_all_children" + ); + } } diff --git a/server/src/main.rs b/server/src/main.rs index efbe92a..bd7ee23 100644 --- a/server/src/main.rs +++ b/server/src/main.rs @@ -110,6 +110,8 @@ async fn main() -> Result<(), std::io::Error> { let startup_root: Option = app_state.project_root.lock().unwrap().clone(); let startup_agents = Arc::clone(&agents); let startup_reconciliation_tx = reconciliation_tx.clone(); + // Clone for shutdown cleanup — kill orphaned PTY children before exiting. + let agents_for_shutdown = Arc::clone(&agents); let ctx = AppContext { state: app_state, @@ -170,6 +172,10 @@ async fn main() -> Result<(), std::io::Error> { let result = Server::new(TcpListener::bind(&addr)).run(app).await; + // Kill all active PTY child processes before exiting to prevent orphaned + // Claude Code processes from running after the server restarts. + agents_for_shutdown.kill_all_children(); + if let Some(ref path) = port_file { remove_port_file(path); }