huskies: merge 1072

This commit is contained in:
dave
2026-05-15 01:21:38 +00:00
parent ae69cd50b1
commit 1506141155
4 changed files with 236 additions and 12 deletions
+107 -1
View File
@@ -29,7 +29,7 @@ pub mod shadow_write;
pub use content_store::{ContentKey, all_content_ids, delete_content, read_content, write_content}; pub use content_store::{ContentKey, all_content_ids, delete_content, read_content, write_content};
pub use ops::{ItemMeta, delete_item, move_item_stage, next_item_number, write_item_with_content}; pub use ops::{ItemMeta, delete_item, move_item_stage, next_item_number, write_item_with_content};
pub use shadow_write::{get_shared_pool, init}; pub use shadow_write::{check_schema_drift, get_shared_pool, init};
#[cfg(test)] #[cfg(test)]
pub use content_store::ensure_content_store; pub use content_store::ensure_content_store;
@@ -395,6 +395,112 @@ mod tests {
); );
} }
/// Regression: root cause of the 2026-05-14 21:07 production outage.
///
/// A headless agent on a feature branch (whose binary carries a new sqlx
/// migration) must NEVER apply that migration to the production
/// pipeline.db. The enforcement mechanism lives in
/// `init_subsystems(is_agent=true)`, which redirects the agent to a temp
/// path; this test validates the underlying SQLite isolation property:
/// migrations run against one DB file are confined to that file and leave
/// another file's `_sqlx_migrations` table untouched.
#[tokio::test]
async fn agent_db_isolation_does_not_affect_production_db() {
    // Open (creating if needed) a SQLite DB at `path` and apply the
    // compiled-in migration set to it.
    async fn open_and_migrate(path: &std::path::Path) -> sqlx::SqlitePool {
        let opts = sqlx::sqlite::SqliteConnectOptions::new()
            .filename(path)
            .create_if_missing(true);
        let pool = sqlx::SqlitePool::connect_with(opts).await.unwrap();
        sqlx::migrate!("./migrations").run(&pool).await.unwrap();
        pool
    }

    // Snapshot the migration versions recorded in `pool`'s database.
    async fn versions(pool: &sqlx::SqlitePool) -> Vec<(i64,)> {
        sqlx::query_as("SELECT version FROM _sqlx_migrations ORDER BY version")
            .fetch_all(pool)
            .await
            .unwrap()
    }

    let dir = tempfile::tempdir().unwrap();

    // Production DB: apply the current compiled-in migrations and record
    // exactly which versions are present.
    let prod_pool = open_and_migrate(&dir.path().join("production.db")).await;
    let before = versions(&prod_pool).await;

    // Simulate the agent opening its own isolated DB and migrating it.
    let _agent_pool = open_and_migrate(&dir.path().join("agent_temp.db")).await;

    // The production DB must be completely unaffected by the agent's run.
    let after = versions(&prod_pool).await;
    assert_eq!(
        before, after,
        "agent opening its own DB must not alter the production DB migration table"
    );
}
/// `check_schema_drift` must report nothing when every migration row in the
/// database belongs to this binary's compiled-in migration set.
#[tokio::test]
async fn check_schema_drift_empty_when_all_known() {
    let dir = tempfile::tempdir().unwrap();

    // Fresh DB migrated by this binary: every `_sqlx_migrations` row is known.
    let conn_opts = sqlx::sqlite::SqliteConnectOptions::new()
        .filename(dir.path().join("drift_test.db"))
        .create_if_missing(true);
    let pool = sqlx::SqlitePool::connect_with(conn_opts).await.unwrap();
    sqlx::migrate!("./migrations").run(&pool).await.unwrap();

    let unknown = super::shadow_write::check_schema_drift(&pool).await;
    assert!(
        unknown.is_empty(),
        "no drift expected when DB matches the compiled-in migration set"
    );
}
/// `check_schema_drift` must flag a manually-inserted migration row whose
/// version is not part of the compiled-in set (simulating a DB previously
/// touched by a newer binary).
#[tokio::test]
async fn check_schema_drift_detects_unknown_migration() {
    let dir = tempfile::tempdir().unwrap();

    // Start from a DB that matches this binary's migration set exactly.
    let opts = sqlx::sqlite::SqliteConnectOptions::new()
        .filename(dir.path().join("drift_future.db"))
        .create_if_missing(true);
    let pool = sqlx::SqlitePool::connect_with(opts).await.unwrap();
    sqlx::migrate!("./migrations").run(&pool).await.unwrap();

    // Hand-insert a "future" migration row that no binary compiled today
    // would recognise. The checksum value is irrelevant to drift detection.
    sqlx::query(
        "INSERT INTO _sqlx_migrations \
         (version, description, installed_on, success, checksum, execution_time) \
         VALUES (99999999999999, 'future_migration', '2099-01-01T00:00:00Z', 1, ?1, 0)",
    )
    .bind(vec![0u8; 20])
    .execute(&pool)
    .await
    .unwrap();

    let drift = super::shadow_write::check_schema_drift(&pool).await;
    assert_eq!(drift.len(), 1, "exactly one unknown migration expected");
    assert_eq!(drift[0].version, 99999999999999_i64);
    assert_eq!(drift[0].description, "future_migration");
}
/// Story 864: passing `ItemMeta::default()` against a content blob that /// Story 864: passing `ItemMeta::default()` against a content blob that
/// LOOKS like front-matter must NOT silently extract metadata into the /// LOOKS like front-matter must NOT silently extract metadata into the
/// CRDT. The whole point of removing the implicit YAML round-trip is /// CRDT. The whole point of removing the implicit YAML round-trip is
+41
View File
@@ -11,10 +11,23 @@ use crate::slog;
use sqlx::SqlitePool; use sqlx::SqlitePool;
use sqlx::sqlite::SqliteConnectOptions; use sqlx::sqlite::SqliteConnectOptions;
use std::collections::HashMap; use std::collections::HashMap;
use std::collections::HashSet;
use std::path::Path; use std::path::Path;
use std::sync::OnceLock; use std::sync::OnceLock;
use tokio::sync::mpsc; use tokio::sync::mpsc;
/// One migration row in the live database that is not in the compiled-in set.
///
/// Returned by [`check_schema_drift`] for each unknown migration.
//
// Derives: `Debug` so callers can log/inspect drift results, `Clone` so rows
// can be passed around freely, and `PartialEq`/`Eq` so tests can compare
// expected vs. actual drift directly. All fields are plain data, so the
// derives are free and backward-compatible.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct UnknownMigration {
    /// sqlx migration version number (derived from the filename timestamp).
    pub version: i64,
    /// Human-readable description from the migration filename.
    pub description: String,
    /// When the migration was applied, as stored in `_sqlx_migrations.installed_on`.
    pub installed_on: String,
}
/// The process-global SQLite pool, set once by [`init`]. /// The process-global SQLite pool, set once by [`init`].
/// ///
/// Other modules call [`get_shared_pool`] to access the pool without needing /// Other modules call [`get_shared_pool`] to access the pool without needing
@@ -133,3 +146,31 @@ pub async fn init(db_path: &Path) -> Result<(), sqlx::Error> {
let _ = PIPELINE_DB.set(PipelineDb { tx }); let _ = PIPELINE_DB.set(PipelineDb { tx });
Ok(()) Ok(())
} }
/// Compare the live `_sqlx_migrations` table against the compiled-in migration
/// set and return any rows whose version is not known to this binary.
///
/// A non-empty result means the database was previously opened by a newer
/// binary that applied additional migrations. The server must refuse to start
/// in that state because the schema may contain tables or columns that this
/// binary does not understand.
pub async fn check_schema_drift(pool: &SqlitePool) -> Vec<UnknownMigration> {
    // Migration versions baked into this binary at compile time.
    let compiled_in: HashSet<i64> = sqlx::migrate!("./migrations")
        .migrations
        .iter()
        .map(|m| m.version)
        .collect();

    // Rows actually recorded in the live database. A query failure (e.g. the
    // `_sqlx_migrations` table not existing yet) is treated as "no rows",
    // i.e. no drift.
    let applied: Vec<(i64, String, String)> = sqlx::query_as(
        "SELECT version, description, installed_on FROM _sqlx_migrations ORDER BY version",
    )
    .fetch_all(pool)
    .await
    .unwrap_or_default();

    let mut unknown = Vec::new();
    for (version, description, installed_on) in applied {
        if !compiled_in.contains(&version) {
            unknown.push(UnknownMigration {
                version,
                description,
                installed_on,
            });
        }
    }
    unknown
}
+1 -1
View File
@@ -149,7 +149,7 @@ async fn main() -> Result<(), std::io::Error> {
startup::project::open_project_root(is_init, explicit_path, &cwd, &app_state, &store, port) startup::project::open_project_root(is_init, explicit_path, &cwd, &app_state, &store, port)
.await; .await;
startup::project::init_subsystems(&app_state, &cwd).await; startup::project::init_subsystems(&app_state, &cwd, is_agent).await;
let crdt_join_token = cli let crdt_join_token = cli
.join_token .join_token
+83 -6
View File
@@ -217,7 +217,13 @@ async fn migrate_json_stores_to_sqlite(huskies_dir: &Path) {
} }
/// Set up the server log file, node identity keypair, pipeline DB, and CRDT state. /// Set up the server log file, node identity keypair, pipeline DB, and CRDT state.
pub(crate) async fn init_subsystems(app_state: &Arc<SessionState>, cwd: &Path) { ///
/// When `is_agent` is `true` the pipeline database is opened at an isolated
/// temporary path (or at `HUSKIES_DB_PATH` if that env-var is set) so that the
/// headless build agent never touches the production `.huskies/pipeline.db`.
/// This prevents feature-branch migrations from being applied to the shared
/// database and bricking the next server restart.
pub(crate) async fn init_subsystems(app_state: &Arc<SessionState>, cwd: &Path, is_agent: bool) {
// Enable persistent server log file now that the project root is known. // Enable persistent server log file now that the project root is known.
if let Some(ref root) = *app_state.project_root.lock().unwrap() { if let Some(ref root) = *app_state.project_root.lock().unwrap() {
let log_dir = root.join(".huskies").join("logs"); let log_dir = root.join(".huskies").join("logs");
@@ -242,20 +248,91 @@ pub(crate) async fn init_subsystems(app_state: &Arc<SessionState>, cwd: &Path) {
} }
} }
// Initialise the SQLite pipeline shadow-write database and CRDT state layer. // Resolve the pipeline DB path.
// Clone the path out before the await so we don't hold the MutexGuard across //
// an await point. // Priority order:
let pipeline_db_path = app_state // 1. HUSKIES_DB_PATH env var (operator override, any mode)
// 2. Agent mode: process-local temp file so the production DB is never touched
// 3. Default: {project_root}/.huskies/pipeline.db
let pipeline_db_path: Option<PathBuf> = if let Ok(env_path) = std::env::var("HUSKIES_DB_PATH") {
let p = PathBuf::from(&env_path);
crate::slog!("[db] HUSKIES_DB_PATH override: {}", p.display());
Some(p)
} else if is_agent {
// Headless agent: use an isolated temp DB so that any migrations compiled
// into this binary (e.g. from a feature branch) are never applied to the
// production database. The temp file is process-unique and harmless to
// leave behind after the agent exits.
let pid = std::process::id();
let temp_path = std::env::temp_dir().join(format!("huskies-agent-{pid}.db"));
crate::slog!(
"[db] Agent mode: using isolated DB at {} (not touching production pipeline.db)",
temp_path.display()
);
Some(temp_path)
} else {
// Server mode: use the project-local production database.
app_state
.project_root .project_root
.lock() .lock()
.unwrap() .unwrap()
.as_ref() .as_ref()
.map(|root| root.join(".huskies").join("pipeline.db")); .map(|root| root.join(".huskies").join("pipeline.db"))
};
if let Some(ref db_path) = pipeline_db_path { if let Some(ref db_path) = pipeline_db_path {
if let Err(e) = db::init(db_path).await { if let Err(e) = db::init(db_path).await {
crate::slog!("[db] Failed to initialise pipeline.db: {e}"); crate::slog!("[db] Failed to initialise pipeline.db: {e}");
} else { } else {
// ── Migration drift self-check (server mode only) ─────────────────────
//
// In server mode, detect whether the live database contains migrations
// that were applied by a newer binary (e.g. a feature-branch agent that
// ran before the feature was merged). If so, log each unknown migration
// and exit with a clear actionable message. This is the root cause of
// the 2026-05-14 21:07 production outage where the server came up but
// the CRDT never initialised.
if !is_agent && let Some(pool) = db::get_shared_pool() {
let drift = db::check_schema_drift(pool).await;
if !drift.is_empty() {
for m in &drift {
crate::slog!(
"[db] UNKNOWN migration {} ('{}') applied at {} \
is not in the compiled-in set",
m.version,
m.description,
m.installed_on,
);
}
eprintln!();
eprintln!(
"error: pipeline.db contains {} migration(s) that are not \
recognised by this binary:",
drift.len()
);
for m in &drift {
eprintln!(
" \u{2022} migration {} ('{}') applied at {}",
m.version, m.description, m.installed_on
);
}
eprintln!();
eprintln!(
"This means the database was previously opened by a newer \
version of huskies."
);
eprintln!(
"To fix: rebuild huskies from the latest source (the branch \
that added these migrations) and restart."
);
eprintln!(
"Do NOT start the old binary against this database — it will \
behave incorrectly."
);
std::process::exit(1);
}
}
// One-shot migration: move any existing JSON store files into SQLite. // One-shot migration: move any existing JSON store files into SQLite.
let huskies_dir = db_path.parent().unwrap_or(db_path); let huskies_dir = db_path.parent().unwrap_or(db_path);
migrate_json_stores_to_sqlite(huskies_dir).await; migrate_json_stores_to_sqlite(huskies_dir).await;