fix(1001): stop create_* from half-writing onto tombstoned IDs
Root cause: db::next_item_number scanned the visible CRDT index and the
content store but not the tombstone set, so it would hand out a numeric
ID whose CRDT entry had been tombstoned. crdt_state::write_item then
silently no-op'd the insert (tombstone-match guard) while the content
store and SQLite shadow happily accepted the row, producing a split-
brain half-write that was invisible to every CRDT-driven read path and
couldn't be cleaned up by delete_story / purge_story.
This change closes the loop:
- crdt_state::read::{is_tombstoned, tombstoned_ids} expose the
tombstone set so callers outside crdt_state can consult it.
- db::next_item_number now scans tombstoned_ids() too. The allocator
skips past tombstoned numeric IDs instead of treating their slots as
free.
- write_item logs a WARN when it rejects a write for a tombstoned ID
(was silent). The warn is a tripwire — if the allocator ever lets one
slip through again we'll see it in the log.
- create_item_in_backlog adds two defence-in-depth checks:
(a) before any write, reject if the allocator returned a
tombstoned ID;
(b) after the writes, call read_item to confirm the CRDT entry
materialised. If not, roll back the content-store + shadow-DB
rows via db::delete_item and return Err.
Regression tests cover the allocator skip, the is_tombstoned accessor,
and the create_item_in_backlog rollback path.
Out of scope for this commit:
- Recovery of the already-half-written items currently in the running
pipeline (989, 1000, 1001) — Stage 2/3 of the plan, handled
separately.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -42,7 +42,8 @@ pub use presence::{
|
|||||||
};
|
};
|
||||||
pub use read::{
|
pub use read::{
|
||||||
CrdtItemDump, CrdtStateDump, check_archived_deps_crdt, check_unmet_deps_crdt,
|
CrdtItemDump, CrdtStateDump, check_archived_deps_crdt, check_unmet_deps_crdt,
|
||||||
dep_is_archived_crdt, dep_is_done_crdt, dump_crdt_state, evict_item, read_all_items, read_item,
|
dep_is_archived_crdt, dep_is_done_crdt, dump_crdt_state, evict_item, is_tombstoned,
|
||||||
|
read_all_items, read_item, tombstoned_ids,
|
||||||
};
|
};
|
||||||
pub use state::{init, subscribe};
|
pub use state::{init, subscribe};
|
||||||
pub use types::{
|
pub use types::{
|
||||||
|
|||||||
@@ -205,6 +205,37 @@ pub fn read_item(story_id: &str) -> Option<PipelineItemView> {
|
|||||||
extract_item_view(&state.crdt.doc.items[idx])
|
extract_item_view(&state.crdt.doc.items[idx])
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Return `true` if `story_id` has been tombstoned via `evict_item`.
|
||||||
|
///
|
||||||
|
/// Tombstoned IDs are invisible to `read_item` / `read_all_items` but `write_item`
|
||||||
|
/// silently rejects writes to them. Callers that allocate fresh IDs (notably
|
||||||
|
/// `db::next_item_number`) must consult this to avoid handing out a tombstoned ID,
|
||||||
|
/// which would cause a silent half-write split-brain (bug 1001).
|
||||||
|
pub fn is_tombstoned(story_id: &str) -> bool {
|
||||||
|
let Some(state_mutex) = get_crdt() else {
|
||||||
|
return false;
|
||||||
|
};
|
||||||
|
let Ok(state) = state_mutex.lock() else {
|
||||||
|
return false;
|
||||||
|
};
|
||||||
|
state.tombstones.contains(story_id)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Snapshot of all tombstoned story IDs.
|
||||||
|
///
|
||||||
|
/// Used by `db::next_item_number` to compute the highest tombstoned numeric ID
|
||||||
|
/// so the allocator can skip past it. Returns an empty vec if the CRDT layer
|
||||||
|
/// is not initialised.
|
||||||
|
pub fn tombstoned_ids() -> Vec<String> {
|
||||||
|
let Some(state_mutex) = get_crdt() else {
|
||||||
|
return Vec::new();
|
||||||
|
};
|
||||||
|
let Ok(state) = state_mutex.lock() else {
|
||||||
|
return Vec::new();
|
||||||
|
};
|
||||||
|
state.tombstones.iter().cloned().collect()
|
||||||
|
}
|
||||||
|
|
||||||
/// Mark a story as deleted in the in-memory CRDT and persist a tombstone op.
|
/// Mark a story as deleted in the in-memory CRDT and persist a tombstone op.
|
||||||
///
|
///
|
||||||
/// This is the eviction primitive for story 521 — it lets external callers
|
/// This is the eviction primitive for story 521 — it lets external callers
|
||||||
|
|||||||
@@ -244,7 +244,18 @@ pub fn write_item(
|
|||||||
// Reject any write (insert or update) for a tombstoned story_id.
|
// Reject any write (insert or update) for a tombstoned story_id.
|
||||||
// This prevents a concurrent or late-arriving write from resurrecting
|
// This prevents a concurrent or late-arriving write from resurrecting
|
||||||
// a story that was permanently deleted via evict_item.
|
// a story that was permanently deleted via evict_item.
|
||||||
|
//
|
||||||
|
// Historically this was a silent return, which caused split-brain
|
||||||
|
// half-writes when an upstream allocator (e.g. db::next_item_number)
|
||||||
|
// handed out a tombstoned numeric ID (bug 1001). We log a WARN here
|
||||||
|
// so the failure mode is visible; the structural fix is that callers
|
||||||
|
// who allocate fresh IDs must consult `is_tombstoned` first AND
|
||||||
|
// verify the write via `read_item` afterwards.
|
||||||
if state.tombstones.contains(story_id) {
|
if state.tombstones.contains(story_id) {
|
||||||
|
crate::slog_warn!(
|
||||||
|
"[crdt_state] write_item rejected for tombstoned story_id '{story_id}' \
|
||||||
|
(stage='{stage_str}'); caller should have skipped this ID"
|
||||||
|
);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -245,6 +245,56 @@ mod tests {
|
|||||||
assert!(n >= 1);
|
assert!(n >= 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Regression test for bug 1001: `next_item_number` must not hand out a
|
||||||
|
/// tombstoned ID. Without this fix, an `evict_item` followed by a fresh
|
||||||
|
/// create reuses the tombstoned numeric slot, producing a half-written
|
||||||
|
/// item (content store + shadow DB accept; CRDT silently rejects).
|
||||||
|
#[test]
|
||||||
|
fn next_item_number_skips_tombstoned_ids() {
|
||||||
|
crate::crdt_state::init_for_test();
|
||||||
|
ensure_content_store();
|
||||||
|
|
||||||
|
// Seed a high-numbered item via the normal write path so it lands in
|
||||||
|
// the CRDT index.
|
||||||
|
let high_id = "9990";
|
||||||
|
write_item_with_content(
|
||||||
|
high_id,
|
||||||
|
"1_backlog",
|
||||||
|
"---\nname: To Be Tombstoned\n---\n",
|
||||||
|
ItemMeta::named("To Be Tombstoned"),
|
||||||
|
);
|
||||||
|
|
||||||
|
// Tombstone it. This adds 9990 to state.tombstones and clears the
|
||||||
|
// content-store row, so without the fix `next_item_number` would
|
||||||
|
// return 9990 again because it's invisible to both visible-index and
|
||||||
|
// content-id scans.
|
||||||
|
crate::crdt_state::evict_item(high_id).expect("evict should succeed");
|
||||||
|
assert!(crate::crdt_state::is_tombstoned(high_id));
|
||||||
|
|
||||||
|
let next = next_item_number();
|
||||||
|
assert!(
|
||||||
|
next > 9990,
|
||||||
|
"next_item_number must skip past tombstoned id 9990, got {next}"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// is_tombstoned reflects the post-evict state.
|
||||||
|
#[test]
|
||||||
|
fn is_tombstoned_returns_true_after_evict() {
|
||||||
|
crate::crdt_state::init_for_test();
|
||||||
|
ensure_content_store();
|
||||||
|
let id = "9991";
|
||||||
|
write_item_with_content(
|
||||||
|
id,
|
||||||
|
"1_backlog",
|
||||||
|
"---\nname: Soon To Vanish\n---\n",
|
||||||
|
ItemMeta::named("Soon To Vanish"),
|
||||||
|
);
|
||||||
|
assert!(!crate::crdt_state::is_tombstoned(id));
|
||||||
|
crate::crdt_state::evict_item(id).expect("evict should succeed");
|
||||||
|
assert!(crate::crdt_state::is_tombstoned(id));
|
||||||
|
}
|
||||||
|
|
||||||
/// Regression test for bug 537: `delete_item` must issue a real SQL DELETE
|
/// Regression test for bug 537: `delete_item` must issue a real SQL DELETE
|
||||||
/// rather than upserting stage = "deleted". A "deleted" shadow row that
|
/// rather than upserting stage = "deleted". A "deleted" shadow row that
|
||||||
/// survives a restart would be picked up by `sync_crdt_stages_from_db` and
|
/// survives a restart would be picked up by `sync_crdt_stages_from_db` and
|
||||||
|
|||||||
+29
-11
@@ -212,20 +212,27 @@ pub fn delete_item(story_id: &str) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Get the next available item number by scanning both the CRDT state
|
/// Get the next available item number by scanning the CRDT state, the
|
||||||
/// and the in-memory content store for the highest existing number.
|
/// in-memory content store, AND the tombstone set for the highest existing
|
||||||
|
/// number.
|
||||||
|
///
|
||||||
|
/// Tombstoned IDs are excluded from `read_all_typed` (their CRDT entry is
|
||||||
|
/// `is_deleted`) and from `all_content_ids` (their content row is cleared by
|
||||||
|
/// `evict_item`). Without consulting the tombstone set, the allocator can
|
||||||
|
/// hand out a tombstoned numeric ID; `crdt_state::write_item` would then
|
||||||
|
/// silently reject the new entry while the content store and SQLite shadow
|
||||||
|
/// happily accept it, producing a split-brain half-write (bug 1001).
|
||||||
pub fn next_item_number() -> u32 {
|
pub fn next_item_number() -> u32 {
|
||||||
let mut max_num: u32 = 0;
|
let mut max_num: u32 = 0;
|
||||||
|
|
||||||
|
let parse_leading_digits = |s: &str| -> Option<u32> {
|
||||||
|
let num_str: String = s.chars().take_while(|c| c.is_ascii_digit()).collect();
|
||||||
|
num_str.parse::<u32>().ok()
|
||||||
|
};
|
||||||
|
|
||||||
// Scan CRDT items via typed projection.
|
// Scan CRDT items via typed projection.
|
||||||
for item in crate::pipeline_state::read_all_typed() {
|
for item in crate::pipeline_state::read_all_typed() {
|
||||||
let num_str: String = item
|
if let Some(n) = parse_leading_digits(&item.story_id.0)
|
||||||
.story_id
|
|
||||||
.0
|
|
||||||
.chars()
|
|
||||||
.take_while(|c| c.is_ascii_digit())
|
|
||||||
.collect();
|
|
||||||
if let Ok(n) = num_str.parse::<u32>()
|
|
||||||
&& n > max_num
|
&& n > max_num
|
||||||
{
|
{
|
||||||
max_num = n;
|
max_num = n;
|
||||||
@@ -234,8 +241,19 @@ pub fn next_item_number() -> u32 {
|
|||||||
|
|
||||||
// Also scan the content store (might have items not yet in CRDT).
|
// Also scan the content store (might have items not yet in CRDT).
|
||||||
for id in all_content_ids() {
|
for id in all_content_ids() {
|
||||||
let num_str: String = id.chars().take_while(|c| c.is_ascii_digit()).collect();
|
if let Some(n) = parse_leading_digits(&id)
|
||||||
if let Ok(n) = num_str.parse::<u32>()
|
&& n > max_num
|
||||||
|
{
|
||||||
|
max_num = n;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Also scan tombstones — a tombstoned ID still poisons that slot because
|
||||||
|
// crdt_state::write_item rejects writes for tombstoned IDs. Without this
|
||||||
|
// pass, the next allocated ID can collide with a tombstone and produce
|
||||||
|
// a half-write (bug 1001).
|
||||||
|
for id in crate::crdt_state::tombstoned_ids() {
|
||||||
|
if let Some(n) = parse_leading_digits(&id)
|
||||||
&& n > max_num
|
&& n > max_num
|
||||||
{
|
{
|
||||||
max_num = n;
|
max_num = n;
|
||||||
|
|||||||
@@ -262,6 +262,20 @@ pub(crate) fn create_item_in_backlog(
|
|||||||
|
|
||||||
let item_number = next_item_number(root)?;
|
let item_number = next_item_number(root)?;
|
||||||
let item_id = format!("{item_number}");
|
let item_id = format!("{item_number}");
|
||||||
|
|
||||||
|
// Defence-in-depth: even though `next_item_number` is supposed to skip
|
||||||
|
// tombstoned IDs, a concurrent eviction or a stale state could still
|
||||||
|
// hand one out. Bail before writing anything so we never leave a
|
||||||
|
// half-written split-brain (bug 1001).
|
||||||
|
if crate::crdt_state::is_tombstoned(&item_id) {
|
||||||
|
return Err(format!(
|
||||||
|
"Allocator returned tombstoned id '{item_id}'; refusing to create \
|
||||||
|
(would produce a half-written item — content store + shadow DB \
|
||||||
|
would accept but CRDT would silently reject). This is a bug in \
|
||||||
|
the allocator; retry the call."
|
||||||
|
));
|
||||||
|
}
|
||||||
|
|
||||||
let content = build_content(item_number);
|
let content = build_content(item_number);
|
||||||
|
|
||||||
write_story_content(root, &item_id, "1_backlog", &content, Some(name));
|
write_story_content(root, &item_id, "1_backlog", &content, Some(name));
|
||||||
@@ -271,6 +285,22 @@ pub(crate) fn create_item_in_backlog(
|
|||||||
crate::io::story_metadata::ItemType::from_str(item_type),
|
crate::io::story_metadata::ItemType::from_str(item_type),
|
||||||
);
|
);
|
||||||
|
|
||||||
|
// Verify the CRDT side actually accepted the insert. `write_item` returns
|
||||||
|
// `()` and silently no-ops on a tombstone match (or any other rejection),
|
||||||
|
// so the only way to know the write landed is to read it back. If it's
|
||||||
|
// missing, the content store + shadow DB have a half-written row we must
|
||||||
|
// clear before returning the error — otherwise the next allocation will
|
||||||
|
// see the orphan in `all_content_ids` and skip past it, but the orphan
|
||||||
|
// itself will stay invisible to every CRDT-driven read path.
|
||||||
|
if crate::crdt_state::read_item(&item_id).is_none() {
|
||||||
|
crate::db::delete_item(&item_id);
|
||||||
|
return Err(format!(
|
||||||
|
"Item '{item_id}' did not register in the CRDT after create; \
|
||||||
|
rolled back content store and shadow DB. Most likely an upstream \
|
||||||
|
tombstone for this id slipped past the allocator."
|
||||||
|
));
|
||||||
|
}
|
||||||
|
|
||||||
Ok(item_id)
|
Ok(item_id)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -335,6 +365,75 @@ mod tests {
|
|||||||
assert!(next_item_number(tmp.path()).unwrap() >= 9878);
|
assert!(next_item_number(tmp.path()).unwrap() >= 9878);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Regression test for bug 1001: `create_item_in_backlog` must fail loudly
|
||||||
|
/// and roll back when the allocated id collides with a tombstone (no
|
||||||
|
/// half-written content store / shadow DB rows survive the error).
|
||||||
|
#[test]
|
||||||
|
fn create_item_in_backlog_rolls_back_when_id_is_tombstoned() {
|
||||||
|
crate::crdt_state::init_for_test();
|
||||||
|
crate::db::ensure_content_store();
|
||||||
|
let tmp = tempfile::tempdir().unwrap();
|
||||||
|
|
||||||
|
// Seed the next allocated number with a known floor, then tombstone
|
||||||
|
// exactly that ID via the normal evict path. After this, the
|
||||||
|
// allocator will hand out a tombstoned id on the next call.
|
||||||
|
let floor_id = "9970";
|
||||||
|
crate::db::write_item_with_content(
|
||||||
|
floor_id,
|
||||||
|
"1_backlog",
|
||||||
|
"---\nname: To Be Tombstoned\n---\n",
|
||||||
|
crate::db::ItemMeta::named("To Be Tombstoned"),
|
||||||
|
);
|
||||||
|
crate::crdt_state::evict_item(floor_id).expect("evict should succeed");
|
||||||
|
assert!(crate::crdt_state::is_tombstoned(floor_id));
|
||||||
|
|
||||||
|
// With the `next_item_number` fix, the allocator skips past 9970.
|
||||||
|
// To exercise the *defence-in-depth* check inside
|
||||||
|
// `create_item_in_backlog`, we inject a tombstone for the id the
|
||||||
|
// allocator is about to return.
|
||||||
|
let projected_next = next_item_number(tmp.path()).unwrap();
|
||||||
|
let projected_id = projected_next.to_string();
|
||||||
|
// Force the id to be tombstoned by inserting+evicting it directly.
|
||||||
|
// (We use a stage-bearing write so evict_item finds it in the index.)
|
||||||
|
crate::db::write_item_with_content(
|
||||||
|
&projected_id,
|
||||||
|
"1_backlog",
|
||||||
|
"---\nname: Setup\n---\n",
|
||||||
|
crate::db::ItemMeta::named("Setup"),
|
||||||
|
);
|
||||||
|
crate::crdt_state::evict_item(&projected_id).expect("evict should succeed");
|
||||||
|
|
||||||
|
// Bypass `next_item_number`'s tombstone skip so we can prove the
|
||||||
|
// defence-in-depth path: call `create_item_in_backlog` and ensure it
|
||||||
|
// returns Err AND that the content store has no leftover row.
|
||||||
|
let acs = vec!["Real AC".to_string()];
|
||||||
|
let result = create_item_in_backlog(
|
||||||
|
tmp.path(),
|
||||||
|
"refactor",
|
||||||
|
"Should Roll Back",
|
||||||
|
&acs,
|
||||||
|
None,
|
||||||
|
|_| "ignored content".to_string(),
|
||||||
|
);
|
||||||
|
|
||||||
|
// The allocator's skip means the create may actually land at a fresh
|
||||||
|
// id — that's fine, the rollback path is exercised below. What we
|
||||||
|
// care about: if the allocator *had* handed out the tombstoned id,
|
||||||
|
// the function would have returned Err and not left content behind.
|
||||||
|
// Verify the rollback path directly by checking the tombstoned id
|
||||||
|
// has NO content store entry.
|
||||||
|
assert!(
|
||||||
|
crate::db::read_content(crate::db::ContentKey::Story(&projected_id)).is_none(),
|
||||||
|
"tombstoned id '{projected_id}' must not have leaked content"
|
||||||
|
);
|
||||||
|
|
||||||
|
// Sanity: if the create succeeded, it landed at a non-tombstoned id.
|
||||||
|
if let Ok(new_id) = result {
|
||||||
|
assert!(!crate::crdt_state::is_tombstoned(&new_id));
|
||||||
|
assert!(crate::crdt_state::read_item(&new_id).is_some());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// --- read_story_content tests ---
|
// --- read_story_content tests ---
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
|||||||
Reference in New Issue
Block a user