huskies: merge 1072

This commit is contained in:
dave
2026-05-15 01:21:38 +00:00
parent ae69cd50b1
commit 1506141155
4 changed files with 236 additions and 12 deletions
+107 -1
View File
@@ -29,7 +29,7 @@ pub mod shadow_write;
pub use content_store::{ContentKey, all_content_ids, delete_content, read_content, write_content}; pub use content_store::{ContentKey, all_content_ids, delete_content, read_content, write_content};
pub use ops::{ItemMeta, delete_item, move_item_stage, next_item_number, write_item_with_content}; pub use ops::{ItemMeta, delete_item, move_item_stage, next_item_number, write_item_with_content};
pub use shadow_write::{get_shared_pool, init}; pub use shadow_write::{check_schema_drift, get_shared_pool, init};
#[cfg(test)] #[cfg(test)]
pub use content_store::ensure_content_store; pub use content_store::ensure_content_store;
@@ -395,6 +395,112 @@ mod tests {
); );
} }
/// Regression: root cause of the 2026-05-14 21:07 production outage.
///
/// A headless agent on a feature branch (whose binary carries a new sqlx
/// migration) must NEVER apply that migration to the production
/// pipeline.db. The enforcement mechanism lives in
/// `init_subsystems(is_agent=true)`, which redirects the agent to a temp
/// path; this test validates the underlying SQLite isolation property:
/// migrations run against one DB file are confined to that file and leave
/// another file's `_sqlx_migrations` table untouched.
#[tokio::test]
async fn agent_db_isolation_does_not_affect_production_db() {
    // Open (creating if needed) a SQLite DB at `path` and apply the
    // compiled-in migration set to it.
    async fn open_and_migrate(path: &std::path::Path) -> sqlx::SqlitePool {
        let opts = sqlx::sqlite::SqliteConnectOptions::new()
            .filename(path)
            .create_if_missing(true);
        let pool = sqlx::SqlitePool::connect_with(opts).await.unwrap();
        sqlx::migrate!("./migrations").run(&pool).await.unwrap();
        pool
    }

    // Snapshot the migration versions recorded in `pool`'s database.
    async fn versions(pool: &sqlx::SqlitePool) -> Vec<(i64,)> {
        sqlx::query_as("SELECT version FROM _sqlx_migrations ORDER BY version")
            .fetch_all(pool)
            .await
            .unwrap()
    }

    let dir = tempfile::tempdir().unwrap();

    // Production DB: apply the current compiled-in migrations and record
    // exactly which versions are present.
    let prod_pool = open_and_migrate(&dir.path().join("production.db")).await;
    let before = versions(&prod_pool).await;

    // Simulate the agent opening its own isolated DB and migrating it.
    let _agent_pool = open_and_migrate(&dir.path().join("agent_temp.db")).await;

    // The production DB must be completely unaffected by the agent's run.
    let after = versions(&prod_pool).await;
    assert_eq!(
        before, after,
        "agent opening its own DB must not alter the production DB migration table"
    );
}
/// `check_schema_drift` must report nothing when every migration row in the
/// database belongs to this binary's compiled-in migration set.
#[tokio::test]
async fn check_schema_drift_empty_when_all_known() {
    let dir = tempfile::tempdir().unwrap();

    // Fresh DB migrated by this binary: every `_sqlx_migrations` row is known.
    let conn_opts = sqlx::sqlite::SqliteConnectOptions::new()
        .filename(dir.path().join("drift_test.db"))
        .create_if_missing(true);
    let pool = sqlx::SqlitePool::connect_with(conn_opts).await.unwrap();
    sqlx::migrate!("./migrations").run(&pool).await.unwrap();

    let unknown = super::shadow_write::check_schema_drift(&pool).await;
    assert!(
        unknown.is_empty(),
        "no drift expected when DB matches the compiled-in migration set"
    );
}
/// `check_schema_drift` must flag a manually-inserted migration row whose
/// version is not part of the compiled-in set (simulating a DB previously
/// touched by a newer binary).
#[tokio::test]
async fn check_schema_drift_detects_unknown_migration() {
    let dir = tempfile::tempdir().unwrap();

    // Start from a DB that matches this binary's migration set exactly.
    let opts = sqlx::sqlite::SqliteConnectOptions::new()
        .filename(dir.path().join("drift_future.db"))
        .create_if_missing(true);
    let pool = sqlx::SqlitePool::connect_with(opts).await.unwrap();
    sqlx::migrate!("./migrations").run(&pool).await.unwrap();

    // Hand-insert a "future" migration row that no binary compiled today
    // would recognise. The checksum value is irrelevant to drift detection.
    sqlx::query(
        "INSERT INTO _sqlx_migrations \
         (version, description, installed_on, success, checksum, execution_time) \
         VALUES (99999999999999, 'future_migration', '2099-01-01T00:00:00Z', 1, ?1, 0)",
    )
    .bind(vec![0u8; 20])
    .execute(&pool)
    .await
    .unwrap();

    let drift = super::shadow_write::check_schema_drift(&pool).await;
    assert_eq!(drift.len(), 1, "exactly one unknown migration expected");
    assert_eq!(drift[0].version, 99999999999999_i64);
    assert_eq!(drift[0].description, "future_migration");
}
/// Story 864: passing `ItemMeta::default()` against a content blob that /// Story 864: passing `ItemMeta::default()` against a content blob that
/// LOOKS like front-matter must NOT silently extract metadata into the /// LOOKS like front-matter must NOT silently extract metadata into the
/// CRDT. The whole point of removing the implicit YAML round-trip is /// CRDT. The whole point of removing the implicit YAML round-trip is
+41
View File
@@ -11,10 +11,23 @@ use crate::slog;
use sqlx::SqlitePool; use sqlx::SqlitePool;
use sqlx::sqlite::SqliteConnectOptions; use sqlx::sqlite::SqliteConnectOptions;
use std::collections::HashMap; use std::collections::HashMap;
use std::collections::HashSet;
use std::path::Path; use std::path::Path;
use std::sync::OnceLock; use std::sync::OnceLock;
use tokio::sync::mpsc; use tokio::sync::mpsc;
/// One migration row in the live database that is not in the compiled-in set.
///
/// Returned by [`check_schema_drift`] for each unknown migration.
//
// Derives: `Debug` so callers can log/inspect drift results, `Clone` so rows
// can be passed around freely, and `PartialEq`/`Eq` so tests can compare
// expected vs. actual drift directly. All fields are plain data, so the
// derives are free and backward-compatible.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct UnknownMigration {
    /// sqlx migration version number (derived from the filename timestamp).
    pub version: i64,
    /// Human-readable description from the migration filename.
    pub description: String,
    /// When the migration was applied, as stored in `_sqlx_migrations.installed_on`.
    pub installed_on: String,
}
/// The process-global SQLite pool, set once by [`init`]. /// The process-global SQLite pool, set once by [`init`].
/// ///
/// Other modules call [`get_shared_pool`] to access the pool without needing /// Other modules call [`get_shared_pool`] to access the pool without needing
@@ -133,3 +146,31 @@ pub async fn init(db_path: &Path) -> Result<(), sqlx::Error> {
let _ = PIPELINE_DB.set(PipelineDb { tx }); let _ = PIPELINE_DB.set(PipelineDb { tx });
Ok(()) Ok(())
} }
/// Compare the live `_sqlx_migrations` table against the compiled-in migration
/// set and return any rows whose version is not known to this binary.
///
/// A non-empty result means the database was previously opened by a newer
/// binary that applied additional migrations. The server must refuse to start
/// in that state because the schema may contain tables or columns that this
/// binary does not understand.
pub async fn check_schema_drift(pool: &SqlitePool) -> Vec<UnknownMigration> {
    // Migration versions baked into this binary at compile time.
    let compiled_in: HashSet<i64> = sqlx::migrate!("./migrations")
        .migrations
        .iter()
        .map(|m| m.version)
        .collect();

    // Rows actually recorded in the live database. A query failure (e.g. the
    // `_sqlx_migrations` table not existing yet) is treated as "no rows",
    // i.e. no drift.
    let applied: Vec<(i64, String, String)> = sqlx::query_as(
        "SELECT version, description, installed_on FROM _sqlx_migrations ORDER BY version",
    )
    .fetch_all(pool)
    .await
    .unwrap_or_default();

    let mut unknown = Vec::new();
    for (version, description, installed_on) in applied {
        if !compiled_in.contains(&version) {
            unknown.push(UnknownMigration {
                version,
                description,
                installed_on,
            });
        }
    }
    unknown
}
+1 -1
View File
@@ -149,7 +149,7 @@ async fn main() -> Result<(), std::io::Error> {
startup::project::open_project_root(is_init, explicit_path, &cwd, &app_state, &store, port) startup::project::open_project_root(is_init, explicit_path, &cwd, &app_state, &store, port)
.await; .await;
startup::project::init_subsystems(&app_state, &cwd).await; startup::project::init_subsystems(&app_state, &cwd, is_agent).await;
let crdt_join_token = cli let crdt_join_token = cli
.join_token .join_token
+83 -6
View File
@@ -217,7 +217,13 @@ async fn migrate_json_stores_to_sqlite(huskies_dir: &Path) {
} }
/// Set up the server log file, node identity keypair, pipeline DB, and CRDT state. /// Set up the server log file, node identity keypair, pipeline DB, and CRDT state.
pub(crate) async fn init_subsystems(app_state: &Arc<SessionState>, cwd: &Path) { ///
/// When `is_agent` is `true` the pipeline database is opened at an isolated
/// temporary path (or at `HUSKIES_DB_PATH` if that env-var is set) so that the
/// headless build agent never touches the production `.huskies/pipeline.db`.
/// This prevents feature-branch migrations from being applied to the shared
/// database and bricking the next server restart.
pub(crate) async fn init_subsystems(app_state: &Arc<SessionState>, cwd: &Path, is_agent: bool) {
// Enable persistent server log file now that the project root is known. // Enable persistent server log file now that the project root is known.
if let Some(ref root) = *app_state.project_root.lock().unwrap() { if let Some(ref root) = *app_state.project_root.lock().unwrap() {
let log_dir = root.join(".huskies").join("logs"); let log_dir = root.join(".huskies").join("logs");
@@ -242,20 +248,91 @@ pub(crate) async fn init_subsystems(app_state: &Arc<SessionState>, cwd: &Path) {
} }
} }
// Initialise the SQLite pipeline shadow-write database and CRDT state layer. // Resolve the pipeline DB path.
// Clone the path out before the await so we don't hold the MutexGuard across //
// an await point. // Priority order:
let pipeline_db_path = app_state // 1. HUSKIES_DB_PATH env var (operator override, any mode)
// 2. Agent mode: process-local temp file so the production DB is never touched
// 3. Default: {project_root}/.huskies/pipeline.db
let pipeline_db_path: Option<PathBuf> = if let Ok(env_path) = std::env::var("HUSKIES_DB_PATH") {
let p = PathBuf::from(&env_path);
crate::slog!("[db] HUSKIES_DB_PATH override: {}", p.display());
Some(p)
} else if is_agent {
// Headless agent: use an isolated temp DB so that any migrations compiled
// into this binary (e.g. from a feature branch) are never applied to the
// production database. The temp file is process-unique and harmless to
// leave behind after the agent exits.
let pid = std::process::id();
let temp_path = std::env::temp_dir().join(format!("huskies-agent-{pid}.db"));
crate::slog!(
"[db] Agent mode: using isolated DB at {} (not touching production pipeline.db)",
temp_path.display()
);
Some(temp_path)
} else {
// Server mode: use the project-local production database.
app_state
.project_root .project_root
.lock() .lock()
.unwrap() .unwrap()
.as_ref() .as_ref()
.map(|root| root.join(".huskies").join("pipeline.db")); .map(|root| root.join(".huskies").join("pipeline.db"))
};
if let Some(ref db_path) = pipeline_db_path { if let Some(ref db_path) = pipeline_db_path {
if let Err(e) = db::init(db_path).await { if let Err(e) = db::init(db_path).await {
crate::slog!("[db] Failed to initialise pipeline.db: {e}"); crate::slog!("[db] Failed to initialise pipeline.db: {e}");
} else { } else {
// ── Migration drift self-check (server mode only) ─────────────────────
//
// In server mode, detect whether the live database contains migrations
// that were applied by a newer binary (e.g. a feature-branch agent that
// ran before the feature was merged). If so, log each unknown migration
// and exit with a clear actionable message. This is the root cause of
// the 2026-05-14 21:07 production outage where the server came up but
// the CRDT never initialised.
if !is_agent && let Some(pool) = db::get_shared_pool() {
let drift = db::check_schema_drift(pool).await;
if !drift.is_empty() {
for m in &drift {
crate::slog!(
"[db] UNKNOWN migration {} ('{}') applied at {} \
is not in the compiled-in set",
m.version,
m.description,
m.installed_on,
);
}
eprintln!();
eprintln!(
"error: pipeline.db contains {} migration(s) that are not \
recognised by this binary:",
drift.len()
);
for m in &drift {
eprintln!(
" \u{2022} migration {} ('{}') applied at {}",
m.version, m.description, m.installed_on
);
}
eprintln!();
eprintln!(
"This means the database was previously opened by a newer \
version of huskies."
);
eprintln!(
"To fix: rebuild huskies from the latest source (the branch \
that added these migrations) and restart."
);
eprintln!(
"Do NOT start the old binary against this database — it will \
behave incorrectly."
);
std::process::exit(1);
}
}
// One-shot migration: move any existing JSON store files into SQLite. // One-shot migration: move any existing JSON store files into SQLite.
let huskies_dir = db_path.parent().unwrap_or(db_path); let huskies_dir = db_path.parent().unwrap_or(db_path);
migrate_json_stores_to_sqlite(huskies_dir).await; migrate_json_stores_to_sqlite(huskies_dir).await;