huskies: merge 1090 refactor Migrate AgentPool::kill_all_children and kill_child_for_key to process_kill so server shutdown and stop_agent actually kill claude

This commit is contained in:
dave
2026-05-15 11:10:55 +00:00
parent fb82bd7bca
commit 4aa76ce673
14 changed files with 175 additions and 199 deletions
+6 -18
View File
@@ -13,7 +13,6 @@ mod tests {
use super::*;
use crate::agents::AgentEvent;
use crate::io::watcher::WatcherEvent;
use std::collections::HashMap;
use std::sync::{Arc, Mutex};
use tokio::sync::broadcast;
@@ -41,7 +40,6 @@ mod tests {
let (tx, _rx) = broadcast::channel::<AgentEvent>(64);
let (watcher_tx, mut watcher_rx) = broadcast::channel::<WatcherEvent>(16);
let event_log = Arc::new(Mutex::new(Vec::new()));
let child_killers = Arc::new(Mutex::new(HashMap::new()));
// sh -p "--" <script>: -p = privileged mode, "--" = end options,
// then the script path is the file operand.
@@ -56,7 +54,6 @@ mod tests {
&event_log,
None,
0,
child_killers,
watcher_tx,
None,
None,
@@ -98,7 +95,6 @@ mod tests {
let (tx, _rx) = broadcast::channel::<AgentEvent>(64);
let (watcher_tx, mut watcher_rx) = broadcast::channel::<WatcherEvent>(16);
let event_log = Arc::new(Mutex::new(Vec::new()));
let child_killers = Arc::new(Mutex::new(HashMap::new()));
let result = run_agent_pty_streaming(
"423_story_rate_limit",
@@ -111,7 +107,6 @@ mod tests {
&event_log,
None,
0,
child_killers,
watcher_tx,
None,
None,
@@ -160,7 +155,6 @@ mod tests {
let (tx, _rx) = broadcast::channel::<AgentEvent>(64);
let (watcher_tx, mut watcher_rx) = broadcast::channel::<WatcherEvent>(16);
let event_log = Arc::new(Mutex::new(Vec::new()));
let child_killers = Arc::new(Mutex::new(HashMap::new()));
let before = chrono::Utc::now();
let result = run_agent_pty_streaming(
@@ -174,7 +168,6 @@ mod tests {
&event_log,
None,
0,
child_killers,
watcher_tx,
None,
None,
@@ -229,7 +222,6 @@ mod tests {
let (tx, _rx) = broadcast::channel::<AgentEvent>(64);
let (watcher_tx, _watcher_rx) = broadcast::channel::<WatcherEvent>(16);
let event_log = Arc::new(Mutex::new(Vec::new()));
let child_killers = Arc::new(Mutex::new(HashMap::new()));
let result = run_agent_pty_streaming(
"916_story_rate_limit_extension",
@@ -242,7 +234,6 @@ mod tests {
&event_log,
None,
1, // inactivity_timeout_secs = 1s; would expire before the 3s sleep without the extension
child_killers,
watcher_tx,
None,
None,
@@ -407,18 +398,16 @@ mod tests {
let (tx, _rx) = broadcast::channel::<AgentEvent>(64);
let (watcher_tx, _watcher_rx) = broadcast::channel::<WatcherEvent>(16);
let event_log = Arc::new(Mutex::new(Vec::new()));
let child_killers: Arc<
Mutex<HashMap<String, Box<dyn portable_pty::ChildKiller + Send + Sync>>>,
> = Arc::new(Mutex::new(HashMap::new()));
let child_killers_for_kill = Arc::clone(&child_killers);
// Spawn a task to kill the child after a short delay (simulating watchdog).
// Uses pids_matching on the script path — same mechanism as the production
// watchdog after the process_kill migration (story 1090).
let script_path_for_kill = script.to_string_lossy().to_string();
tokio::spawn(async move {
tokio::time::sleep(tokio::time::Duration::from_millis(500)).await;
if let Ok(mut killers) = child_killers_for_kill.lock() {
for (_, killer) in killers.iter_mut() {
let _ = killer.kill();
}
let pids = crate::process_kill::pids_matching(&script_path_for_kill);
if !pids.is_empty() {
let _ = crate::process_kill::sigkill_pids_and_verify(&pids);
}
});
@@ -435,7 +424,6 @@ mod tests {
&event_log,
None,
0, // no inactivity timeout
child_killers,
watcher_tx,
None, // no session to resume
Some((project_root.clone(), "sonnet".to_string())),
+2 -21
View File
@@ -1,10 +1,9 @@
//! PTY process spawning and output loop: builds the command, drives the reader thread,
//! and dispatches parsed JSON events to the broadcast channel.
use std::collections::HashMap;
use std::io::{BufRead, BufReader};
use std::sync::{Arc, Mutex};
use portable_pty::{ChildKiller, CommandBuilder, PtySize, native_pty_system};
use portable_pty::{CommandBuilder, PtySize, native_pty_system};
use tokio::sync::broadcast;
use crate::agent_log::AgentLogWriter;
@@ -14,7 +13,7 @@ use crate::slog;
use crate::slog_warn;
use super::events::{emit_event, handle_agent_stream_event};
use super::types::{ChildKillerGuard, PtyResult, composite_key};
use super::types::PtyResult;
/// Spawn claude agent in a PTY and stream events through the broadcast channel.
///
@@ -55,7 +54,6 @@ pub(in crate::agents) async fn run_agent_pty_streaming(
event_log: &Arc<Mutex<Vec<AgentEvent>>>,
log_writer: Option<Arc<Mutex<AgentLogWriter>>>,
inactivity_timeout_secs: u64,
child_killers: Arc<Mutex<HashMap<String, Box<dyn ChildKiller + Send + Sync>>>>,
watcher_tx: broadcast::Sender<WatcherEvent>,
session_id_to_resume: Option<&str>,
eager_record: Option<(std::path::PathBuf, String)>,
@@ -82,7 +80,6 @@ pub(in crate::agents) async fn run_agent_pty_streaming(
&event_log,
log_writer.as_deref(),
inactivity_timeout_secs,
&child_killers,
&watcher_tx,
resume_sid.as_deref(),
eager_record,
@@ -104,7 +101,6 @@ fn run_agent_pty_blocking(
event_log: &Mutex<Vec<AgentEvent>>,
log_writer: Option<&Mutex<AgentLogWriter>>,
inactivity_timeout_secs: u64,
child_killers: &Arc<Mutex<HashMap<String, Box<dyn ChildKiller + Send + Sync>>>>,
watcher_tx: &broadcast::Sender<WatcherEvent>,
session_id_to_resume: Option<&str>,
eager_record: Option<(std::path::PathBuf, String)>,
@@ -204,21 +200,6 @@ fn run_agent_pty_blocking(
.spawn_command(cmd)
.map_err(|e| format!("Failed to spawn agent for {story_id}:{agent_name}: {e}"))?;
// Register the child killer so that kill_all_children() / stop_agent() can
// terminate this process on server shutdown, even if the blocking thread
// cannot be interrupted. The ChildKillerGuard deregisters on function exit.
let killer_key = composite_key(story_id, agent_name);
{
let killer = child.clone_killer();
if let Ok(mut killers) = child_killers.lock() {
killers.insert(killer_key.clone(), killer);
}
}
let _killer_guard = ChildKillerGuard {
killers: Arc::clone(child_killers),
key: killer_key,
};
drop(pair.slave);
let reader = pair
-22
View File
@@ -1,9 +1,4 @@
//! Core types for the PTY runner: result container and process lifecycle helpers.
use std::collections::HashMap;
use std::sync::{Arc, Mutex};
use portable_pty::ChildKiller;
use crate::agents::TokenUsage;
/// Result from a PTY agent session, containing the session ID and token usage.
@@ -23,20 +18,3 @@ pub(in crate::agents) struct PtyResult {
/// event was seen or when the `reset_at` field was absent from the event.
pub rate_limit_reset_at: Option<chrono::DateTime<chrono::Utc>>,
}
/// Builds the registry key for an agent by joining the story id and the agent
/// name with a single `:` separator, i.e. `"{story_id}:{agent_name}"`.
///
/// Neither part is escaped or validated; the inputs are concatenated verbatim.
pub(super) fn composite_key(story_id: &str, agent_name: &str) -> String {
    // Reserve room for both parts plus the one-byte separator up front.
    let mut key = String::with_capacity(story_id.len() + 1 + agent_name.len());
    key.push_str(story_id);
    key.push(':');
    key.push_str(agent_name);
    key
}
/// Scope guard that deregisters one child killer from the shared registry
/// when it is dropped.
///
/// The guard does not insert anything itself: the caller is expected to have
/// stored the killer under `key` already. Dropping the guard only removes
/// that entry again.
pub(super) struct ChildKillerGuard {
    /// Shared registry mapping a string key to the killer handle of a child process.
    pub killers: Arc<Mutex<HashMap<String, Box<dyn ChildKiller + Send + Sync>>>>,
    /// Key of the registry entry this guard removes on drop.
    pub key: String,
}

impl Drop for ChildKillerGuard {
    /// Removes the entry stored under `self.key` from the registry.
    ///
    /// If the registry mutex cannot be locked (it is poisoned), the entry is
    /// left in place and the failure is ignored; `drop` never panics.
    fn drop(&mut self) {
        let Ok(mut registry) = self.killers.lock() else {
            return;
        };
        registry.remove(&self.key);
    }
}