// [extraction artifact — web UI chrome captured with the file, not source code]
// Files
// huskies/server/src/agents/pty/runner.rs
// T
//
// 439 lines
// 16 KiB
// Rust
// Raw Normal View History
//
// 2026-04-28 16:34:16 +00:00
//! PTY process spawning and output loop: builds the command, drives the reader thread,
//! and dispatches parsed JSON events to the broadcast channel.
use std::collections::HashMap;
use std::io::{BufRead, BufReader};
use std::sync::{Arc, Mutex};
use portable_pty::{ChildKiller, CommandBuilder, PtySize, native_pty_system};
use tokio::sync::broadcast;
use crate::agent_log::AgentLogWriter;
// [extraction artifact] 2026-04-28 16:34:16 +00:00
use crate::agents::{AgentEvent, TokenUsage};
use crate::io::watcher::WatcherEvent;
use crate::slog;
use crate::slog_warn;
// [extraction artifact] 2026-04-28 16:34:16 +00:00
use super::events::{emit_event, handle_agent_stream_event};
use super::types::{ChildKillerGuard, PtyResult, composite_key};
/// Spawn the claude agent in a PTY and stream events through the broadcast
/// channel.
///
/// This async entry point does no PTY work itself: it copies every borrowed
/// argument into an owned value and delegates to [`run_agent_pty_blocking`]
/// on a `spawn_blocking` thread, since PTY reads are blocking IO.
///
/// ## Bug 645: `output.write(&bytes).is_ok()` assertion in Claude Code CLI
///
/// The Claude Code CLI can panic with an `output.write(&bytes).is_ok()`
/// assertion when writing to its stdout (the PTY slave end). This occurs
/// inside the child process — not in this server code — when the PTY pipe
/// breaks or fills. The `output` in the assertion is the CLI's stdout writer,
/// and the write fails when the PTY master side is closed or the kernel pipe
/// buffer is exhausted.
///
/// When this happens, the child process dies, the PTY reader thread receives
/// EOF, and `run_agent_pty_blocking` returns `Ok(PtyResult)`. The server then
/// runs completion gates via `run_server_owned_completion`.
///
/// If the agent committed valid work before crashing, the "work survived"
/// check in `pipeline::advance` detects the committed code and advances the
/// story to QA instead of entering the retry/block path.
#[allow(clippy::too_many_arguments)]
pub(in crate::agents) async fn run_agent_pty_streaming(
    story_id: &str,
    agent_name: &str,
    command: &str,
    args: &[String],
    prompt: &str,
    cwd: &str,
    tx: &broadcast::Sender<AgentEvent>,
    event_log: &Arc<Mutex<Vec<AgentEvent>>>,
    log_writer: Option<Arc<Mutex<AgentLogWriter>>>,
    inactivity_timeout_secs: u64,
    child_killers: Arc<Mutex<HashMap<String, Box<dyn ChildKiller + Send + Sync>>>>,
    watcher_tx: broadcast::Sender<WatcherEvent>,
    session_id_to_resume: Option<&str>,
) -> Result<PtyResult, String> {
    // Promote every borrowed input to an owned value so the closure below is
    // 'static, as spawn_blocking requires.
    let story = story_id.to_owned();
    let agent = agent_name.to_owned();
    let program = command.to_owned();
    let owned_args = args.to_vec();
    let owned_prompt = prompt.to_owned();
    let workdir = cwd.to_owned();
    let events_tx = tx.clone();
    let events_log = event_log.clone();
    let resume = session_id_to_resume.map(String::from);

    let joined = tokio::task::spawn_blocking(move || {
        run_agent_pty_blocking(
            &story,
            &agent,
            &program,
            &owned_args,
            &owned_prompt,
            &workdir,
            &events_tx,
            &events_log,
            log_writer.as_deref(),
            inactivity_timeout_secs,
            &child_killers,
            &watcher_tx,
            resume.as_deref(),
        )
    })
    .await;

    // A join error means the blocking task itself panicked (or was aborted);
    // surface it as a String error like every other failure in this module.
    match joined {
        Ok(result) => result,
        Err(e) => Err(format!("Agent task panicked: {e}")),
    }
}
#[allow(clippy::too_many_arguments)]
fn run_agent_pty_blocking(
story_id: &str,
agent_name: &str,
command: &str,
args: &[String],
prompt: &str,
cwd: &str,
tx: &broadcast::Sender<AgentEvent>,
event_log: &Mutex<Vec<AgentEvent>>,
log_writer: Option<&Mutex<AgentLogWriter>>,
inactivity_timeout_secs: u64,
child_killers: &Arc<Mutex<HashMap<String, Box<dyn ChildKiller + Send + Sync>>>>,
watcher_tx: &broadcast::Sender<WatcherEvent>,
session_id_to_resume: Option<&str>,
) -> Result<PtyResult, String> {
let pty_system = native_pty_system();
let pair = pty_system
.openpty(PtySize {
rows: 50,
cols: 200,
pixel_width: 0,
pixel_height: 0,
})
.map_err(|e| format!("Failed to open PTY: {e}"))?;
let mut cmd = CommandBuilder::new(command);
// Launch mode: resume an existing session or start fresh.
if let Some(sid) = session_id_to_resume {
// Resume: --resume <session_id> restores previous conversation context.
// The failure context (or empty string) is sent as a new message via -p.
// Always pass -p so --include-partial-messages (added below) works:
// claude CLI requires --print/-p to be set when --include-partial-messages
// is used, regardless of whether the prompt is empty.
cmd.arg("--resume");
cmd.arg(sid);
cmd.arg("-p");
cmd.arg(prompt);
} else {
// Fresh session: deliver the full rendered prompt via -p.
cmd.arg("-p");
cmd.arg(prompt);
}
// Add configured args (e.g., --directory /path/to/worktree, --model, etc.)
for arg in args {
cmd.arg(arg);
}
cmd.arg("--output-format");
cmd.arg("stream-json");
cmd.arg("--verbose");
// Enable partial streaming so we receive thinking_delta and text_delta
// events in real-time, rather than only complete assistant events.
// Without this, thinking traces may not appear in the structured output
// and instead leak as unstructured PTY text.
cmd.arg("--include-partial-messages");
// Agents use acceptEdits so file edits are auto-approved while other
// tools (e.g. Bash) trigger the permission prompt tool, which auto-denies
// for agents. The worktree's .claude/settings.json allowlist further
// controls which tools are pre-approved.
cmd.arg("--permission-mode");
cmd.arg("acceptEdits");
cmd.arg("--permission-prompt-tool");
cmd.arg("mcp__huskies__prompt_permission");
cmd.cwd(cwd);
cmd.env("NO_COLOR", "1");
// Allow spawning Claude Code from within a Claude Code session
cmd.env_remove("CLAUDECODE");
cmd.env_remove("CLAUDE_CODE_ENTRYPOINT");
// Count existing session files for this worktree to detect budget exhaustion.
let session_dir = format!(
"/home/huskies/.claude/projects/-workspace--huskies-worktrees-{}/",
story_id.replace(['_', '.'], "-")
);
let session_count = std::fs::read_dir(&session_dir)
.map(|d| {
d.filter(|e| {
e.as_ref()
.map(|e| e.path().extension().is_some_and(|ext| ext == "jsonl"))
.unwrap_or(false)
})
.count()
})
.unwrap_or(0);
let session_bytes: u64 = std::fs::read_dir(&session_dir)
.map(|d| {
d.filter_map(|e| e.ok())
.filter(|e| e.path().extension().is_some_and(|ext| ext == "jsonl"))
.filter_map(|e| e.metadata().ok())
.map(|m| m.len())
.sum()
})
.unwrap_or(0);
slog!(
"[agent:{story_id}:{agent_name}] Spawning {command} in {cwd} with args: {args:?} \
(prior_sessions={session_count}, session_log_bytes={session_bytes})"
);
let mut child = pair
.slave
.spawn_command(cmd)
.map_err(|e| format!("Failed to spawn agent for {story_id}:{agent_name}: {e}"))?;
// Register the child killer so that kill_all_children() / stop_agent() can
// terminate this process on server shutdown, even if the blocking thread
// cannot be interrupted. The ChildKillerGuard deregisters on function exit.
let killer_key = composite_key(story_id, agent_name);
{
let killer = child.clone_killer();
if let Ok(mut killers) = child_killers.lock() {
killers.insert(killer_key.clone(), killer);
}
}
let _killer_guard = ChildKillerGuard {
killers: Arc::clone(child_killers),
key: killer_key,
};
drop(pair.slave);
let reader = pair
.master
.try_clone_reader()
.map_err(|e| format!("Failed to clone PTY reader: {e}"))?;
drop(pair.master);
// Spawn a reader thread to collect PTY output lines.
// We use a channel so the main thread can apply an inactivity deadline
// via recv_timeout: if no output arrives within the configured window
// the process is killed and the agent is marked Failed.
let (line_tx, line_rx) = std::sync::mpsc::channel::<std::io::Result<String>>();
let sid_for_reader = story_id.to_string();
let aname_for_reader = agent_name.to_string();
let reader_handle = std::thread::spawn(move || {
let buf_reader = BufReader::new(reader);
for line in buf_reader.lines() {
if line_tx.send(line).is_err() {
break;
}
}
slog!("[agent:{sid_for_reader}:{aname_for_reader}] Reader thread exiting");
});
let timeout_dur = if inactivity_timeout_secs > 0 {
Some(std::time::Duration::from_secs(inactivity_timeout_secs))
} else {
None
};
let mut session_id: Option<String> = None;
let mut token_usage: Option<TokenUsage> = None;
loop {
let recv_result = match timeout_dur {
Some(dur) => line_rx.recv_timeout(dur),
None => line_rx
.recv()
.map_err(|_| std::sync::mpsc::RecvTimeoutError::Disconnected),
};
let line = match recv_result {
Ok(Ok(l)) => l,
Ok(Err(_)) => {
// IO error reading from PTY — treat as EOF.
break;
}
Err(std::sync::mpsc::RecvTimeoutError::Disconnected) => {
// Reader thread exited (EOF from PTY).
break;
}
Err(std::sync::mpsc::RecvTimeoutError::Timeout) => {
slog_warn!(
"[agent:{story_id}:{agent_name}] Inactivity timeout after \
{inactivity_timeout_secs}s with no output. Killing process."
);
let _ = child.kill();
let _ = child.wait();
return Err(format!(
"Agent inactivity timeout: no output received for {inactivity_timeout_secs}s"
));
}
};
let trimmed = line.trim();
if trimmed.is_empty() {
continue;
}
// Try to parse as JSON
let json: serde_json::Value = match serde_json::from_str(trimmed) {
Ok(j) => j,
Err(_) => {
// Non-JSON output (terminal escapes etc.) — send as raw output
emit_event(
AgentEvent::Output {
story_id: story_id.to_string(),
agent_name: agent_name.to_string(),
text: trimmed.to_string(),
},
tx,
event_log,
log_writer,
);
continue;
}
};
let event_type = json.get("type").and_then(|t| t.as_str()).unwrap_or("");
match event_type {
"system" => {
session_id = json
.get("session_id")
.and_then(|s| s.as_str())
.map(|s| s.to_string());
}
// With --include-partial-messages, thinking and text arrive
// incrementally via stream_event → content_block_delta. Handle
// them here for real-time streaming to the frontend.
"stream_event" => {
if let Some(event) = json.get("event") {
handle_agent_stream_event(
event, story_id, agent_name, tx, event_log, log_writer,
);
}
}
// Complete assistant events are skipped for content extraction
// because thinking and text already arrived via stream_event.
// The raw JSON is still forwarded as AgentJson below.
"assistant" | "user" => {}
"rate_limit_event" => {
let rate_limit_info = json.get("rate_limit_info");
let status = rate_limit_info
.and_then(|i| i.get("status"))
.and_then(|s| s.as_str())
.unwrap_or("");
let is_hard_block = !status.is_empty() && status != "allowed_warning";
let reset_at = rate_limit_info
.and_then(|i| i.get("reset_at"))
.and_then(|r| r.as_str())
.and_then(|r| chrono::DateTime::parse_from_rfc3339(r).ok())
.map(|dt| dt.with_timezone(&chrono::Utc));
if is_hard_block {
let reset_at = match reset_at {
Some(t) => {
slog!(
"[agent:{story_id}:{agent_name}] API rate limit hard block \
(status={status}); resets at {t}"
);
t
}
None => {
let default = chrono::Utc::now() + chrono::Duration::minutes(5);
slog!(
"[agent:{story_id}:{agent_name}] API rate limit hard block \
(status={status}); no reset_at in rate_limit_info, \
defaulting to 5-minute backoff ({default})"
);
default
}
};
let _ = watcher_tx.send(WatcherEvent::RateLimitHardBlock {
story_id: story_id.to_string(),
agent_name: agent_name.to_string(),
reset_at,
});
} else {
slog!(
"[agent:{story_id}:{agent_name}] API rate limit warning received \
(status={status})"
);
let _ = watcher_tx.send(WatcherEvent::RateLimitWarning {
story_id: story_id.to_string(),
agent_name: agent_name.to_string(),
});
}
}
"result" => {
// Extract token usage from the result event.
if let Some(usage) = TokenUsage::from_result_event(&json) {
slog!(
"[agent:{story_id}:{agent_name}] Token usage: in={} out={} cache_create={} cache_read={} cost=${:.4}",
usage.input_tokens,
usage.output_tokens,
usage.cache_creation_input_tokens,
usage.cache_read_input_tokens,
usage.total_cost_usd,
);
token_usage = Some(usage);
}
}
_ => {}
}
// Forward all JSON events
emit_event(
AgentEvent::AgentJson {
story_id: story_id.to_string(),
agent_name: agent_name.to_string(),
data: json,
},
tx,
event_log,
log_writer,
);
}
let _ = child.kill();
let wait_result = child.wait();
match &wait_result {
Ok(status) => {
slog!("[agent:{story_id}:{agent_name}] Child exited: {status:?}");
}
Err(e) => {
slog!("[agent:{story_id}:{agent_name}] Child wait error: {e}");
}
}
// Wait for the reader thread to finish so it releases the cloned PTY
// master fd before we return. Without this, the next PTY spawn for the
// same story can collide with a still-open fd from this session (#453).
if let Err(e) = reader_handle.join() {
slog!("[agent:{story_id}:{agent_name}] Reader thread panicked: {e:?}");
}
// Log whether session was created — Session: None indicates CLI died
// before emitting any events (possible causes: rate limit, budget
// exhaustion, PTY write failure, CLI crash).
if session_id.is_none() {
slog_warn!(
"[agent:{story_id}:{agent_name}] SESSION NONE: CLI exited without creating a session. \
Check for 'fatal runtime error' in agent logs. \
prior_sessions={session_count}, session_log_bytes={session_bytes}"
);
}
slog!(
"[agent:{story_id}:{agent_name}] Done. Session: {:?}",
session_id
);
Ok(PtyResult {
session_id,
token_usage,
})
}