huskies: merge 1147 story One-active-gateway invariant via pidfile+flock — prevent double-gateway during restarts

This commit is contained in:
dave
2026-05-19 18:29:19 +00:00
parent 918f18c200
commit be7bdf8304
4 changed files with 136 additions and 1 deletions
+9
View File
@@ -75,6 +75,15 @@ pub fn build_gateway_route(state_arc: Arc<GatewayState>) -> impl poem::Endpoint
/// Start the gateway HTTP server. This is the entry point when `--gateway` is used.
pub async fn run(config_path: &Path, port: u16) -> Result<(), std::io::Error> {
// Enforce one-active-gateway invariant: acquire an exclusive flock on the
// pidfile before doing anything else. A second gateway start while one is
// running will fail here with a clear error. The flock is held for the
// lifetime of `_pidfile_guard`; it is released automatically when this
// process exits, allowing the next gateway (spawned by the trampoline) to
// acquire it.
let _pidfile_guard =
crate::pidfile::acquire_gateway_pidfile().map_err(std::io::Error::other)?;
let config_dir = config_path
.parent()
.unwrap_or(std::path::Path::new("."))
+2
View File
@@ -36,6 +36,8 @@ pub mod log_buffer;
pub mod mesh;
/// Node identity — Ed25519 keypair generation and stable node ID management.
pub mod node_identity;
/// Gateway pidfile — exclusive flock on `$HOME/.huskies/gateway.pid`.
pub mod pidfile;
/// Pipeline event bus — real-time broadcast of pipeline-transition events to persona subscribers.
pub(crate) mod pipeline_event_bus;
pub(crate) mod pipeline_state;
+121
View File
@@ -0,0 +1,121 @@
//! Gateway pidfile — exclusive flock on `$HOME/.huskies/gateway.pid`.
//!
//! A gateway process holds the lock for its lifetime. A second gateway that
//! tries to start while one is already running fails immediately with a
//! human-readable error naming the existing process. A stale pidfile left by
//! a dead process is reclaimed automatically: the kernel releases flocks when
//! the file descriptor is closed, which happens when the process dies.
use std::fs::{File, OpenOptions};
use std::path::{Path, PathBuf};
// ── Guard ─────────────────────────────────────────────────────────────────────
/// Owner of the open pidfile handle, meant to be held for the lifetime of the
/// gateway process.
///
/// The guard exists only to keep the underlying [`File`] open. Dropping it
/// closes the file, which releases the flock taken when the pidfile was
/// acquired. Bind it to a named variable (for example `_pidfile_guard`) — a
/// bare `let _ = …` or an unbound expression drops it immediately and the
/// lock is lost.
#[must_use = "the pidfile lock is released as soon as the guard is dropped"]
#[derive(Debug)]
pub struct PidfileGuard {
    /// Open handle to the pidfile; never read, only kept alive.
    _file: File,
}
// ── Path resolution ───────────────────────────────────────────────────────────
/// Compute the default pidfile location, `$HOME/.huskies/gateway.pid`.
///
/// As a side effect the `.huskies` directory (and any missing parents) is
/// created so the returned path can be opened straight away.
///
/// # Errors
///
/// Returns a human-readable message when the home directory lookup fails,
/// when no home directory is reported, or when the directory cannot be
/// created.
fn default_pidfile_path() -> Result<PathBuf, String> {
    let home = match homedir::my_home() {
        Ok(Some(home)) => home,
        Ok(None) => return Err("HOME is not set".to_string()),
        Err(e) => return Err(format!("cannot determine home directory: {e}")),
    };

    let huskies_dir = home.join(".huskies");
    if let Err(e) = std::fs::create_dir_all(&huskies_dir) {
        return Err(format!("cannot create {}: {e}", huskies_dir.display()));
    }

    Ok(huskies_dir.join("gateway.pid"))
}
// ── Public API ────────────────────────────────────────────────────────────────
/// Take the gateway lock on the default pidfile, `$HOME/.huskies/gateway.pid`.
///
/// Resolves the default path and delegates to [`acquire_gateway_pidfile_at`].
/// The returned [`PidfileGuard`] keeps the exclusive flock for as long as it
/// stays in scope.
///
/// # Errors
///
/// Returns `Err("another gateway is at pid N")` when a live gateway already
/// holds the lock, or another message when the path cannot be resolved or an
/// unexpected I/O failure occurs.
pub fn acquire_gateway_pidfile() -> Result<PidfileGuard, String> {
    default_pidfile_path().and_then(|pidfile| acquire_gateway_pidfile_at(&pidfile))
}
/// Acquire the gateway pidfile at an explicit path.
///
/// Separated from [`acquire_gateway_pidfile`] so that tests can supply a
/// temporary directory instead of touching `$HOME/.huskies`.
///
/// The file is opened read/write (created if missing, never truncated on
/// open). On Unix an exclusive, non-blocking `flock` is then taken on it, and
/// only after the lock is held is the content replaced with the current
/// process ID. On non-Unix targets no lock is taken; the PID is simply
/// written.
///
/// # Errors
///
/// * `"another gateway is at pid N"` — the lock is held elsewhere and the
///   pidfile contains a parseable PID `N`.
/// * `"another gateway is at pid unknown (…)"` — the lock is held elsewhere
///   but the pidfile is empty, unreadable or not a number (for example the
///   holder has locked the file but not yet written its PID).
/// * Any other message — opening, locking, truncating, seeking or writing the
///   pidfile failed.
pub fn acquire_gateway_pidfile_at(path: &Path) -> Result<PidfileGuard, String> {
    use std::io::{Seek, SeekFrom, Write};

    let mut file = OpenOptions::new()
        .read(true)
        .write(true)
        .create(true)
        .truncate(false)
        .open(path)
        .map_err(|e| format!("cannot open pidfile {}: {e}", path.display()))?;

    #[cfg(unix)]
    {
        use std::io::Read;
        use std::os::unix::io::AsRawFd;

        // SAFETY: `file` is owned by this function and stays open across the
        // call, so the raw descriptor is valid; `flock` only takes the
        // descriptor and an integer flag and touches no Rust-managed memory.
        let ret = unsafe { libc::flock(file.as_raw_fd(), libc::LOCK_EX | libc::LOCK_NB) };
        if ret != 0 {
            let err = std::io::Error::last_os_error();
            let held_elsewhere = err.kind() == std::io::ErrorKind::WouldBlock
                || err.raw_os_error() == Some(libc::EACCES);
            if !held_elsewhere {
                return Err(format!("flock failed: {err}"));
            }

            // Another live process holds the lock. Read its PID through the
            // handle we already have, so the message describes the very file
            // we failed to lock even if `path` has been replaced meanwhile.
            let mut contents = String::new();
            let holder_pid = file
                .read_to_string(&mut contents)
                .ok()
                .and_then(|_| contents.trim().parse::<u32>().ok());

            // Never fall back to a sentinel such as 0: it is not a real
            // gateway PID and would mislead whoever acts on the message.
            return Err(match holder_pid {
                Some(pid) => format!("another gateway is at pid {pid}"),
                None => format!(
                    "another gateway is at pid unknown (pidfile {} has no readable PID)",
                    path.display()
                ),
            });
        }
    }

    // Write our PID (truncate first so no stale digits remain).
    file.set_len(0)
        .map_err(|e| format!("cannot truncate pidfile: {e}"))?;
    file.seek(SeekFrom::Start(0))
        .map_err(|e| format!("cannot seek pidfile: {e}"))?;
    write!(file, "{}", std::process::id()).map_err(|e| format!("cannot write pidfile: {e}"))?;

    Ok(PidfileGuard { _file: file })
}
// ── Tests ─────────────────────────────────────────────────────────────────────
#[cfg(test)]
mod tests {
    use super::*;

    /// AC 2 & 3: while one guard is alive, a second acquire on the same path
    /// is rejected with a message naming the holder's PID; once the first
    /// guard is dropped, the pidfile can be acquired again (dead-PID reclaim).
    #[cfg(unix)]
    #[test]
    fn second_gateway_fails_with_pid_message_then_reclaims() {
        let scratch_dir = tempfile::tempdir().unwrap();
        let pidfile = scratch_dir.path().join("gateway.pid");

        let holder = acquire_gateway_pidfile_at(&pidfile).expect("first acquire should succeed");

        // Both acquires run in this test process, so the recorded PID is ours.
        let err = acquire_gateway_pidfile_at(&pidfile)
            .expect_err("second acquire should fail while first is held");
        let my_pid = std::process::id();
        assert!(
            err.contains("another gateway is at pid"),
            "error should contain the prefix, got: {err}"
        );
        assert!(
            err.contains(&my_pid.to_string()),
            "error should contain our PID {my_pid}, got: {err}"
        );

        // Dropping the holder frees the flock, standing in for gateway death.
        drop(holder);

        // With the lock free, a fresh acquire must go through.
        let reclaimed =
            acquire_gateway_pidfile_at(&pidfile).expect("acquire after release should succeed");
        drop(reclaimed);
    }
}
+4 -1
View File
@@ -163,7 +163,10 @@ pub async fn execute_trampoline_core(job: &TrampolineJob) -> Result<(), String>
}
let _ = std::fs::copy(&job.old_binary_path, &job.backup_binary_path);
// Kill old gateway.
// Kill old gateway. Killing the process closes its file descriptors,
// which releases the exclusive flock held on `$HOME/.huskies/gateway.pid`.
// The new gateway (spawned below) will then acquire that flock on startup,
// ensuring the one-active-gateway invariant is maintained across the swap.
kill_gateway_process(job.gateway_pid)?;
// Start new gateway.