huskies: merge 1147 story One-active-gateway invariant via pidfile+flock — prevent double-gateway during restarts
This commit is contained in:
@@ -75,6 +75,15 @@ pub fn build_gateway_route(state_arc: Arc<GatewayState>) -> impl poem::Endpoint
|
||||
|
||||
/// Start the gateway HTTP server. This is the entry point when `--gateway` is used.
|
||||
pub async fn run(config_path: &Path, port: u16) -> Result<(), std::io::Error> {
|
||||
// Enforce one-active-gateway invariant: acquire an exclusive flock on the
|
||||
// pidfile before doing anything else. A second gateway start while one is
|
||||
// running will fail here with a clear error. The flock is held for the
|
||||
// lifetime of `_pidfile_guard`; it is released automatically when this
|
||||
// process exits, allowing the next gateway (spawned by the trampoline) to
|
||||
// acquire it.
|
||||
let _pidfile_guard =
|
||||
crate::pidfile::acquire_gateway_pidfile().map_err(std::io::Error::other)?;
|
||||
|
||||
let config_dir = config_path
|
||||
.parent()
|
||||
.unwrap_or(std::path::Path::new("."))
|
||||
|
||||
@@ -36,6 +36,8 @@ pub mod log_buffer;
|
||||
pub mod mesh;
|
||||
/// Node identity — Ed25519 keypair generation and stable node ID management.
|
||||
pub mod node_identity;
|
||||
/// Gateway pidfile — exclusive flock on `$HOME/.huskies/gateway.pid`.
|
||||
pub mod pidfile;
|
||||
/// Pipeline event bus — real-time broadcast of pipeline-transition events to persona subscribers.
|
||||
pub(crate) mod pipeline_event_bus;
|
||||
pub(crate) mod pipeline_state;
|
||||
|
||||
@@ -0,0 +1,121 @@
|
||||
//! Gateway pidfile — exclusive flock on `$HOME/.huskies/gateway.pid`.
|
||||
//!
|
||||
//! A gateway process holds the lock for its lifetime. A second gateway that
|
||||
//! tries to start while one is already running fails immediately with a
|
||||
//! human-readable error naming the existing process. A stale pidfile left by
|
||||
//! a dead process is reclaimed automatically: the kernel releases flocks when
|
||||
//! the file descriptor is closed, which happens when the process dies.
|
||||
|
||||
use std::fs::{File, OpenOptions};
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
// ── Guard ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
/// Ownership token for the gateway pidfile.
///
/// The guard keeps the pidfile open. The exclusive flock that
/// [`acquire_gateway_pidfile_at`] places on that file therefore stays in
/// force until the guard is dropped (or the process exits), at which point
/// the file is closed and the lock is released.
#[derive(Debug)]
pub struct PidfileGuard {
    /// Open handle to the pidfile. It is never read back; it exists only so
    /// that the underlying descriptor stays open for as long as the guard lives.
    _file: File,
}
|
||||
|
||||
// ── Path resolution ───────────────────────────────────────────────────────────
|
||||
|
||||
/// Resolve `$HOME/.huskies/gateway.pid`, creating the directory if needed.
|
||||
fn default_pidfile_path() -> Result<PathBuf, String> {
|
||||
let home = homedir::my_home()
|
||||
.map_err(|e| format!("cannot determine home directory: {e}"))?
|
||||
.ok_or_else(|| "HOME is not set".to_string())?;
|
||||
let dir = home.join(".huskies");
|
||||
std::fs::create_dir_all(&dir).map_err(|e| format!("cannot create {}: {e}", dir.display()))?;
|
||||
Ok(dir.join("gateway.pid"))
|
||||
}
|
||||
|
||||
// ── Public API ────────────────────────────────────────────────────────────────
|
||||
|
||||
/// Acquire the gateway pidfile at `$HOME/.huskies/gateway.pid`.
|
||||
///
|
||||
/// Returns a [`PidfileGuard`] that holds the exclusive flock for as long as it
|
||||
/// is in scope. Returns `Err("another gateway is at pid N")` when a live
|
||||
/// gateway already holds the lock, or `Err(…)` for unexpected I/O failures.
|
||||
pub fn acquire_gateway_pidfile() -> Result<PidfileGuard, String> {
|
||||
let path = default_pidfile_path()?;
|
||||
acquire_gateway_pidfile_at(&path)
|
||||
}
|
||||
|
||||
/// Acquire the gateway pidfile at an explicit path.
|
||||
///
|
||||
/// Separated from [`acquire_gateway_pidfile`] so that tests can supply a
|
||||
/// temporary directory instead of touching `$HOME/.huskies`.
|
||||
pub fn acquire_gateway_pidfile_at(path: &Path) -> Result<PidfileGuard, String> {
|
||||
let mut file = OpenOptions::new()
|
||||
.read(true)
|
||||
.write(true)
|
||||
.create(true)
|
||||
.truncate(false)
|
||||
.open(path)
|
||||
.map_err(|e| format!("cannot open pidfile {}: {e}", path.display()))?;
|
||||
|
||||
#[cfg(unix)]
|
||||
{
|
||||
use std::os::unix::io::AsRawFd;
|
||||
let ret = unsafe { libc::flock(file.as_raw_fd(), libc::LOCK_EX | libc::LOCK_NB) };
|
||||
if ret != 0 {
|
||||
let err = std::io::Error::last_os_error();
|
||||
if err.kind() == std::io::ErrorKind::WouldBlock
|
||||
|| err.raw_os_error() == Some(libc::EACCES)
|
||||
{
|
||||
// Another live process holds the lock — read its PID for the error message.
|
||||
let pid_str = std::fs::read_to_string(path).unwrap_or_default();
|
||||
let pid = pid_str.trim().parse::<u32>().unwrap_or(0);
|
||||
return Err(format!("another gateway is at pid {pid}"));
|
||||
}
|
||||
return Err(format!("flock failed: {err}"));
|
||||
}
|
||||
}
|
||||
|
||||
// Write our PID (truncate first so no stale digits remain).
|
||||
use std::io::{Seek, SeekFrom, Write};
|
||||
file.set_len(0)
|
||||
.map_err(|e| format!("cannot truncate pidfile: {e}"))?;
|
||||
file.seek(SeekFrom::Start(0))
|
||||
.map_err(|e| format!("cannot seek pidfile: {e}"))?;
|
||||
write!(file, "{}", std::process::id()).map_err(|e| format!("cannot write pidfile: {e}"))?;
|
||||
|
||||
Ok(PidfileGuard { _file: file })
|
||||
}
|
||||
|
||||
// ── Tests ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// AC 2 & 3: while one guard is alive a second acquire is rejected with a
    /// message naming the holder's PID; once that guard is released the
    /// pidfile can be acquired again (dead-PID reclaim).
    #[cfg(unix)]
    #[test]
    fn second_gateway_fails_with_pid_message_then_reclaims() {
        let scratch = tempfile::tempdir().unwrap();
        let pidfile = scratch.path().join("gateway.pid");

        // The first caller wins the lock and records its PID.
        let holder = acquire_gateway_pidfile_at(&pidfile).expect("first acquire should succeed");

        // The second caller is in the same process, so the PID it reports
        // back must be our own.
        let message = acquire_gateway_pidfile_at(&pidfile)
            .expect_err("second acquire should fail while first is held");
        let own_pid = std::process::id().to_string();

        assert!(
            message.contains("another gateway is at pid"),
            "error should contain the prefix, got: {}",
            message
        );
        assert!(
            message.contains(&own_pid),
            "error should contain our PID {}, got: {}",
            own_pid,
            message
        );

        // Dropping the holder closes its descriptor and frees the flock,
        // which stands in for the old gateway dying.
        drop(holder);

        // With the lock free again, a fresh acquire must go through.
        let _reclaimed =
            acquire_gateway_pidfile_at(&pidfile).expect("acquire after release should succeed");
    }
}
|
||||
@@ -163,7 +163,10 @@ pub async fn execute_trampoline_core(job: &TrampolineJob) -> Result<(), String>
|
||||
}
|
||||
let _ = std::fs::copy(&job.old_binary_path, &job.backup_binary_path);
|
||||
|
||||
// Kill old gateway.
|
||||
// Kill old gateway. Killing the process closes its file descriptors,
|
||||
// which releases the exclusive flock held on `$HOME/.huskies/gateway.pid`.
|
||||
// The new gateway (spawned below) will then acquire that flock on startup,
|
||||
// ensuring the one-active-gateway invariant is maintained across the swap.
|
||||
kill_gateway_process(job.gateway_pid)?;
|
||||
|
||||
// Start new gateway.
|
||||
|
||||
Reference in New Issue
Block a user