diff --git a/server/src/db/mod.rs b/server/src/db/mod.rs index 20a5e01b..67913940 100644 --- a/server/src/db/mod.rs +++ b/server/src/db/mod.rs @@ -29,7 +29,7 @@ pub mod shadow_write; pub use content_store::{ContentKey, all_content_ids, delete_content, read_content, write_content}; pub use ops::{ItemMeta, delete_item, move_item_stage, next_item_number, write_item_with_content}; -pub use shadow_write::{get_shared_pool, init}; +pub use shadow_write::{check_schema_drift, get_shared_pool, init}; #[cfg(test)] pub use content_store::ensure_content_store; @@ -395,6 +395,112 @@ mod tests { ); } + /// Regression: root cause of the 2026-05-14 21:07 production outage. + /// + /// A headless agent on a feature branch (whose binary includes a new + /// sqlx migration) must NEVER apply that migration to the production + /// pipeline.db. Verify that opening an agent-local DB and running + /// migrations on it leaves the production DB's `_sqlx_migrations` table + /// unchanged. + /// + /// The enforcement mechanism is in `init_subsystems(is_agent=true)`, which + /// redirects to a temp path. This test validates the SQLite isolation + /// property: migrations applied to one file are confined to that file. + #[tokio::test] + async fn agent_db_isolation_does_not_affect_production_db() { + let tmp = tempfile::tempdir().unwrap(); + let prod_db_path = tmp.path().join("production.db"); + let agent_db_path = tmp.path().join("agent_temp.db"); + + // Set up the production DB — apply the current compiled-in migrations. + let prod_opts = sqlx::sqlite::SqliteConnectOptions::new() + .filename(&prod_db_path) + .create_if_missing(true); + let prod_pool = sqlx::SqlitePool::connect_with(prod_opts).await.unwrap(); + sqlx::migrate!("./migrations") + .run(&prod_pool) + .await + .unwrap(); + + // Record the migration versions present in the production DB. 
+ let before: Vec<(i64,)> = + sqlx::query_as("SELECT version FROM _sqlx_migrations ORDER BY version") + .fetch_all(&prod_pool) + .await + .unwrap(); + + // Simulate the agent opening its own isolated DB and running migrations. + let agent_opts = sqlx::sqlite::SqliteConnectOptions::new() + .filename(&agent_db_path) + .create_if_missing(true); + let agent_pool = sqlx::SqlitePool::connect_with(agent_opts).await.unwrap(); + sqlx::migrate!("./migrations") + .run(&agent_pool) + .await + .unwrap(); + + // Production DB must be completely unaffected by the agent's migration run. + let after: Vec<(i64,)> = + sqlx::query_as("SELECT version FROM _sqlx_migrations ORDER BY version") + .fetch_all(&prod_pool) + .await + .unwrap(); + + assert_eq!( + before, after, + "agent opening its own DB must not alter the production DB migration table" + ); + } + + /// Verify that `check_schema_drift` returns an empty list when all + /// migrations in the database are recognised by this binary. + #[tokio::test] + async fn check_schema_drift_empty_when_all_known() { + let tmp = tempfile::tempdir().unwrap(); + let db_path = tmp.path().join("drift_test.db"); + let opts = sqlx::sqlite::SqliteConnectOptions::new() + .filename(&db_path) + .create_if_missing(true); + let pool = sqlx::SqlitePool::connect_with(opts).await.unwrap(); + sqlx::migrate!("./migrations").run(&pool).await.unwrap(); + + let drift = super::shadow_write::check_schema_drift(&pool).await; + assert!( + drift.is_empty(), + "no drift expected when DB matches the compiled-in migration set" + ); + } + + /// Verify that `check_schema_drift` identifies a manually-inserted + /// migration row that is not part of the compiled-in set. 
+ #[tokio::test] + async fn check_schema_drift_detects_unknown_migration() { + let tmp = tempfile::tempdir().unwrap(); + let db_path = tmp.path().join("drift_future.db"); + let opts = sqlx::sqlite::SqliteConnectOptions::new() + .filename(&db_path) + .create_if_missing(true); + let pool = sqlx::SqlitePool::connect_with(opts).await.unwrap(); + sqlx::migrate!("./migrations").run(&pool).await.unwrap(); + + // Inject a fake "future" migration that no binary compiled today would know. + let fake_checksum: Vec<u8> = vec![0u8; 20]; + sqlx::query( + "INSERT INTO _sqlx_migrations \ + (version, description, installed_on, success, checksum, execution_time) \ + VALUES (99999999999999, 'future_migration', '2099-01-01T00:00:00Z', 1, ?1, 0)", + ) + .bind(&fake_checksum) + .execute(&pool) + .await + .unwrap(); + + let drift = super::shadow_write::check_schema_drift(&pool).await; + assert_eq!(drift.len(), 1, "exactly one unknown migration expected"); + assert_eq!(drift[0].version, 99999999999999_i64); + assert_eq!(drift[0].description, "future_migration"); + } + /// Story 864: passing `ItemMeta::default()` against a content blob that /// LOOKS like front-matter must NOT silently extract metadata into the /// CRDT. The whole point of removing the implicit YAML round-trip is diff --git a/server/src/db/shadow_write.rs b/server/src/db/shadow_write.rs index 0ccb5bea..3fd4e612 100644 --- a/server/src/db/shadow_write.rs +++ b/server/src/db/shadow_write.rs @@ -11,10 +11,23 @@ use crate::slog; use sqlx::SqlitePool; use sqlx::sqlite::SqliteConnectOptions; use std::collections::HashMap; +use std::collections::HashSet; use std::path::Path; use std::sync::OnceLock; use tokio::sync::mpsc; +/// One migration row in the live database that is not in the compiled-in set. +/// +/// Returned by [`check_schema_drift`] for each unknown migration. +pub struct UnknownMigration { + /// sqlx migration version number (derived from the filename timestamp). 
+ pub version: i64, + /// Human-readable description from the migration filename. + pub description: String, + /// When the migration was applied, as stored in `_sqlx_migrations.installed_on`. + pub installed_on: String, +} + /// The process-global SQLite pool, set once by [`init`]. /// /// Other modules call [`get_shared_pool`] to access the pool without needing @@ -133,3 +146,31 @@ pub async fn init(db_path: &Path) -> Result<(), sqlx::Error> { let _ = PIPELINE_DB.set(PipelineDb { tx }); Ok(()) } + +/// Compare the live `_sqlx_migrations` table against the compiled-in migration +/// set and return any rows whose version is not known to this binary. +/// +/// A non-empty result means the database was previously opened by a newer +/// binary that applied additional migrations. The server must refuse to start +/// in that state because the schema may contain tables or columns that this +/// binary does not understand. +pub async fn check_schema_drift(pool: &SqlitePool) -> Vec<UnknownMigration> { + let migrator = sqlx::migrate!("./migrations"); + let known: HashSet<i64> = migrator.migrations.iter().map(|m| m.version).collect(); + + let rows: Vec<(i64, String, String)> = sqlx::query_as( + "SELECT version, description, installed_on FROM _sqlx_migrations ORDER BY version", + ) + .fetch_all(pool) + .await + .unwrap_or_default(); + + rows.into_iter() + .filter(|(v, _, _)| !known.contains(v)) + .map(|(version, description, installed_on)| UnknownMigration { + version, + description, + installed_on, + }) + .collect() +} diff --git a/server/src/main.rs b/server/src/main.rs index 152cd025..919924ae 100644 --- a/server/src/main.rs +++ b/server/src/main.rs @@ -149,7 +149,7 @@ async fn main() -> Result<(), std::io::Error> { startup::project::open_project_root(is_init, explicit_path, &cwd, &app_state, &store, port) .await; - startup::project::init_subsystems(&app_state, &cwd).await; + startup::project::init_subsystems(&app_state, &cwd, is_agent).await; let crdt_join_token = cli .join_token diff --git 
a/server/src/startup/project.rs b/server/src/startup/project.rs index 2c56e14b..f9b0ea4b 100644 --- a/server/src/startup/project.rs +++ b/server/src/startup/project.rs @@ -217,7 +217,13 @@ async fn migrate_json_stores_to_sqlite(huskies_dir: &Path) { } /// Set up the server log file, node identity keypair, pipeline DB, and CRDT state. -pub(crate) async fn init_subsystems(app_state: &Arc<AppState>, cwd: &Path) { +/// +/// When `is_agent` is `true` the pipeline database is opened at an isolated +/// temporary path (or at `HUSKIES_DB_PATH` if that env-var is set) so that the +/// headless build agent never touches the production `.huskies/pipeline.db`. +/// This prevents feature-branch migrations from being applied to the shared +/// database and bricking the next server restart. +pub(crate) async fn init_subsystems(app_state: &Arc<AppState>, cwd: &Path, is_agent: bool) { // Enable persistent server log file now that the project root is known. if let Some(ref root) = *app_state.project_root.lock().unwrap() { let log_dir = root.join(".huskies").join("logs"); @@ -242,20 +248,91 @@ pub(crate) async fn init_subsystems(app_state: &Arc<AppState>, cwd: &Path) { } } - // Initialise the SQLite pipeline shadow-write database and CRDT state layer. - // Clone the path out before the await so we don't hold the MutexGuard across - // an await point. - let pipeline_db_path = app_state - .project_root - .lock() - .unwrap() - .as_ref() - .map(|root| root.join(".huskies").join("pipeline.db")); + // Resolve the pipeline DB path. + // + // Priority order: + // 1. HUSKIES_DB_PATH env var (operator override, any mode) + // 2. Agent mode: process-local temp file so the production DB is never touched + // 3. 
Default: {project_root}/.huskies/pipeline.db + let pipeline_db_path: Option<PathBuf> = if let Ok(env_path) = std::env::var("HUSKIES_DB_PATH") { + let p = PathBuf::from(&env_path); + crate::slog!("[db] HUSKIES_DB_PATH override: {}", p.display()); + Some(p) + } else if is_agent { + // Headless agent: use an isolated temp DB so that any migrations compiled + // into this binary (e.g. from a feature branch) are never applied to the + // production database. The temp file is process-unique and harmless to + // leave behind after the agent exits. + let pid = std::process::id(); + let temp_path = std::env::temp_dir().join(format!("huskies-agent-{pid}.db")); + crate::slog!( + "[db] Agent mode: using isolated DB at {} (not touching production pipeline.db)", + temp_path.display() + ); + Some(temp_path) + } else { + // Server mode: use the project-local production database. + app_state + .project_root + .lock() + .unwrap() + .as_ref() + .map(|root| root.join(".huskies").join("pipeline.db")) + }; if let Some(ref db_path) = pipeline_db_path { if let Err(e) = db::init(db_path).await { crate::slog!("[db] Failed to initialise pipeline.db: {e}"); } else { + // ── Migration drift self-check (server mode only) ───────────────────── + // + // In server mode, detect whether the live database contains migrations + // that were applied by a newer binary (e.g. a feature-branch agent that + // ran before the feature was merged). If so, log each unknown migration + // and exit with a clear actionable message. This is the root cause of + // the 2026-05-14 21:07 production outage where the server came up but + // the CRDT never initialised. 
+ if !is_agent && let Some(pool) = db::get_shared_pool() { + let drift = db::check_schema_drift(pool).await; + if !drift.is_empty() { + for m in &drift { + crate::slog!( + "[db] UNKNOWN migration {} ('{}') applied at {} \ + is not in the compiled-in set", + m.version, + m.description, + m.installed_on, + ); + } + eprintln!(); + eprintln!( + "error: pipeline.db contains {} migration(s) that are not \ + recognised by this binary:", + drift.len() + ); + for m in &drift { + eprintln!( + " \u{2022} migration {} ('{}') applied at {}", + m.version, m.description, m.installed_on + ); + } + eprintln!(); + eprintln!( + "This means the database was previously opened by a newer \ + version of huskies." + ); + eprintln!( + "To fix: rebuild huskies from the latest source (the branch \ + that added these migrations) and restart." + ); + eprintln!( + "Do NOT start the old binary against this database — it will \ + behave incorrectly." + ); + std::process::exit(1); + } + } + // One-shot migration: move any existing JSON store files into SQLite. let huskies_dir = db_path.parent().unwrap_or(db_path); migrate_json_stores_to_sqlite(huskies_dir).await;