diff --git a/server/src/gateway/mod.rs b/server/src/gateway/mod.rs index c49e36c0..1fbcc1e5 100644 --- a/server/src/gateway/mod.rs +++ b/server/src/gateway/mod.rs @@ -75,6 +75,15 @@ pub fn build_gateway_route(state_arc: Arc) -> impl poem::Endpoint /// Start the gateway HTTP server. This is the entry point when `--gateway` is used. pub async fn run(config_path: &Path, port: u16) -> Result<(), std::io::Error> { + // Enforce one-active-gateway invariant: acquire an exclusive flock on the + // pidfile before doing anything else. A second gateway start while one is + // running will fail here with a clear error. The flock is held for the + // lifetime of `_pidfile_guard`; it is released automatically when this + // process exits, allowing the next gateway (spawned by the trampoline) to + // acquire it. + let _pidfile_guard = + crate::pidfile::acquire_gateway_pidfile().map_err(std::io::Error::other)?; + let config_dir = config_path .parent() .unwrap_or(std::path::Path::new(".")) diff --git a/server/src/main.rs b/server/src/main.rs index 42dc6900..292e0813 100644 --- a/server/src/main.rs +++ b/server/src/main.rs @@ -36,6 +36,8 @@ pub mod log_buffer; pub mod mesh; /// Node identity — Ed25519 keypair generation and stable node ID management. pub mod node_identity; +/// Gateway pidfile — exclusive flock on `$HOME/.huskies/gateway.pid`. +pub mod pidfile; /// Pipeline event bus — real-time broadcast of pipeline-transition events to persona subscribers. pub(crate) mod pipeline_event_bus; pub(crate) mod pipeline_state; diff --git a/server/src/pidfile.rs b/server/src/pidfile.rs new file mode 100644 index 00000000..a7c50706 --- /dev/null +++ b/server/src/pidfile.rs @@ -0,0 +1,121 @@ +//! Gateway pidfile — exclusive flock on `$HOME/.huskies/gateway.pid`. +//! +//! A gateway process holds the lock for its lifetime. A second gateway that +//! tries to start while one is already running fails immediately with a +//! human-readable error naming the existing process. 
+//! A stale pidfile left by
+//! a dead process is reclaimed automatically: the kernel releases flocks when
+//! the file descriptor is closed, which happens when the process dies.
+
+use std::fs::{File, OpenOptions};
+use std::path::{Path, PathBuf};
+
+// ── Guard ─────────────────────────────────────────────────────────────────────
+
+/// Held for the lifetime of the gateway process. Dropping it releases the flock.
+///
+/// The guard owns the open pidfile handle. Bind it to a named variable (for
+/// example `_pidfile_guard`): a guard that is discarded right away closes the
+/// file right away, and the lock goes with it.
+#[derive(Debug)]
+#[must_use = "dropping the guard closes the pidfile and releases the gateway lock"]
+pub struct PidfileGuard {
+    _file: File,
+}
+
+// ── Path resolution ───────────────────────────────────────────────────────────
+
+/// Resolve `$HOME/.huskies/gateway.pid`, creating the directory if needed.
+///
+/// # Errors
+///
+/// Returns a human-readable message when the home directory cannot be
+/// determined or when `.huskies` cannot be created inside it.
+fn default_pidfile_path() -> Result<PathBuf, String> {
+    let home = homedir::my_home()
+        .map_err(|e| format!("cannot determine home directory: {e}"))?
+        .ok_or_else(|| "HOME is not set".to_string())?;
+    let dir = home.join(".huskies");
+    std::fs::create_dir_all(&dir).map_err(|e| format!("cannot create {}: {e}", dir.display()))?;
+    Ok(dir.join("gateway.pid"))
+}
+
+// ── Public API ────────────────────────────────────────────────────────────────
+
+/// Acquire the gateway pidfile at `$HOME/.huskies/gateway.pid`.
+///
+/// Returns a [`PidfileGuard`] that holds the exclusive flock for as long as it
+/// is in scope. Returns `Err("another gateway is at pid N")` when a live
+/// gateway already holds the lock, or `Err(…)` for unexpected I/O failures.
+pub fn acquire_gateway_pidfile() -> Result<PidfileGuard, String> {
+    let path = default_pidfile_path()?;
+    acquire_gateway_pidfile_at(&path)
+}
+
+/// Acquire the gateway pidfile at an explicit path.
+///
+/// Separated from [`acquire_gateway_pidfile`] so that tests can supply a
+/// temporary directory instead of touching `$HOME/.huskies`.
+///
+/// The lock attempt is non-blocking (`LOCK_NB`): if the lock is currently held
+/// through another open handle, this returns an error immediately instead of
+/// waiting for the holder to release it.
+///
+/// # Errors
+///
+/// * `"another gateway is at pid N"` when the lock is held elsewhere; `N` is
+///   read from the pidfile. If no PID can be read at that moment (for example
+///   the file is still empty), the message says the PID is unknown rather than
+///   reporting a made-up one.
+/// * A descriptive message when the pidfile cannot be opened, locked,
+///   truncated, or written.
+pub fn acquire_gateway_pidfile_at(path: &Path) -> Result<PidfileGuard, String> {
+    use std::io::{Seek, SeekFrom, Write};
+
+    let mut file = OpenOptions::new()
+        .read(true)
+        .write(true)
+        .create(true)
+        .truncate(false)
+        .open(path)
+        .map_err(|e| format!("cannot open pidfile {}: {e}", path.display()))?;
+
+    // NOTE(review): the lock is only taken on unix targets. Elsewhere this
+    // function records the PID but does not enforce exclusivity.
+    #[cfg(unix)]
+    {
+        use std::io::Read;
+        use std::os::unix::io::AsRawFd;
+
+        // SAFETY: `file` stays open for the whole call, so the raw descriptor is
+        // valid, and `flock` takes two plain integers — it does not read or write
+        // any memory owned by Rust.
+        let ret = unsafe { libc::flock(file.as_raw_fd(), libc::LOCK_EX | libc::LOCK_NB) };
+        if ret != 0 {
+            let err = std::io::Error::last_os_error();
+            if err.kind() == std::io::ErrorKind::WouldBlock
+                || err.raw_os_error() == Some(libc::EACCES)
+            {
+                // The lock is held through another open handle (normally a live
+                // gateway). Read the holder's PID for the error message through the
+                // handle we already have, instead of reopening the path.
+                let mut contents = String::new();
+                let read_ok = file.read_to_string(&mut contents).is_ok();
+                let holder_pid = if read_ok {
+                    contents.trim().parse::<u32>().ok()
+                } else {
+                    None
+                };
+                return Err(match holder_pid {
+                    Some(pid) => format!("another gateway is at pid {pid}"),
+                    // Keep the documented prefix, but never invent a PID.
+                    None => format!(
+                        "another gateway is at pid <unknown> (could not read a PID from {})",
+                        path.display()
+                    ),
+                });
+            }
+            return Err(format!("flock failed: {err}"));
+        }
+    }
+
+    // Write our PID (truncate first so no stale digits remain).
+    file.set_len(0)
+        .map_err(|e| format!("cannot truncate pidfile: {e}"))?;
+    file.seek(SeekFrom::Start(0))
+        .map_err(|e| format!("cannot seek pidfile: {e}"))?;
+    write!(file, "{}", std::process::id()).map_err(|e| format!("cannot write pidfile: {e}"))?;
+
+    Ok(PidfileGuard { _file: file })
+}
+
+// ── Tests ─────────────────────────────────────────────────────────────────────
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// AC 2 & 3: second gateway fails with pid message; after release, the next
+    /// acquire succeeds (dead-PID reclaim).
+    #[cfg(unix)]
+    #[test]
+    fn second_gateway_fails_with_pid_message_then_reclaims() {
+        let tmp = tempfile::tempdir().unwrap();
+        let path = tmp.path().join("gateway.pid");
+        let my_pid = std::process::id();
+
+        let guard1 = acquire_gateway_pidfile_at(&path).expect("first acquire should succeed");
+
+        // While the first guard is alive the lock is held (by this very process
+        // through a separate open handle), so a second attempt must be rejected.
+        let err = acquire_gateway_pidfile_at(&path)
+            .expect_err("second acquire should fail while first is held");
+
+        // Compare the whole message: a substring check on the PID would also pass
+        // for a wrong PID whose digits merely contain ours.
+        assert_eq!(
+            err,
+            format!("another gateway is at pid {my_pid}"),
+            "error should name the PID of the current lock holder"
+        );
+
+        // Release the first guard → flock is freed (simulates gateway death).
+        drop(guard1);
+
+        // Third acquire must succeed — dead-PID reclaim. Keep the guard bound so
+        // the lock is held while the file contents are inspected.
+        let _guard3 =
+            acquire_gateway_pidfile_at(&path).expect("acquire after release should succeed");
+
+        // The reclaimed pidfile must hold exactly our PID, with no stale digits.
+        let contents = std::fs::read_to_string(&path).expect("pidfile should be readable");
+        assert_eq!(contents.trim(), my_pid.to_string());
+    }
+}
diff --git a/server/src/trampoline.rs b/server/src/trampoline.rs index a7443ed2..6b02bed1 100644 --- a/server/src/trampoline.rs +++ b/server/src/trampoline.rs @@ -163,7 +163,10 @@ pub async fn execute_trampoline_core(job: &TrampolineJob) -> Result<(), String> } let _ = std::fs::copy(&job.old_binary_path, &job.backup_binary_path); - // Kill old gateway. + // Kill old gateway. Killing the process closes its file descriptors, + // which releases the exclusive flock held on `$HOME/.huskies/gateway.pid`. + // The new gateway (spawned below) will then acquire that flock on startup, + // ensuring the one-active-gateway invariant is maintained across the swap. kill_gateway_process(job.gateway_pid)?; // Start new gateway.