feat(1001): story_ids filter for recover_half_written_items

The first dry-run against the live pipeline surfaced 735 orphans (35 tombstoned half-writes, 700 stale content rows with no CRDT entry — mostly artefacts of the pre-numeric-id era). Bulk-recovering would resurrect a lot of stories the user deliberately purged in the past. Add an optional `story_ids` filter that restricts both discovery (in dry-run) and recovery to a named subset, so the operator can target the specific recent half-writes without touching anything else. The new test asserts the filter is honoured. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-13 19:26:07 +01:00
parent cd411ba443
commit 92b1744c3a
3 changed files with 74 additions and 9 deletions
@@ -84,13 +84,23 @@ pub fn find_half_written_items() -> Vec<HalfWritten> {
 ///    the orphan is left in place.
 /// 5. Deletes the orphan's content-store + shadow-DB rows via `delete_item`.
 ///
 /// If `only` is provided, recovery operates only on orphans whose `story_id`
 /// is in the list — everything else is left alone.  This is the safe default
 /// for a live system that may have many historic purged ids visible as
 /// orphans alongside the genuinely-recent half-writes.
 ///
 /// Returns a (`old_id`, `new_id`, `name`) mapping for every successful
 /// recovery.  Orphans that failed to recover are simply omitted from the
 /// returned list and logged at WARN level.
-pub fn recover_half_written_items() -> Vec<RecoveryResult> {
+pub fn recover_half_written_items(only: Option<&[String]>) -> Vec<RecoveryResult> {
    let orphans = find_half_written_items();
    let mut results = Vec::with_capacity(orphans.len());
    for orphan in orphans {
        if let Some(filter) = only
            && !filter.iter().any(|f| f == &orphan.story_id)
        {
            continue;
        }
        match recover_one(&orphan) {
            Ok(result) => results.push(result),
            Err(e) => {
@@ -358,7 +368,7 @@ mod tests {
        assert_eq!(entry.name, "Tombstoned Then Half-Written");
        // Run the recovery and inspect the mapping.
-        let results = recover_half_written_items();
+        let results = recover_half_written_items(None);
        let mapping = results
            .iter()
            .find(|r| r.old_id == old_id)
@@ -382,10 +392,44 @@ mod tests {
        );
        // Re-running recovery is a no-op (no orphans left).
-        let again = recover_half_written_items();
+        let again = recover_half_written_items(None);
        assert!(
            again.iter().all(|r| r.old_id != old_id),
            "recovery should be idempotent for the same orphan"
        );
    }
    /// The `only` filter restricts recovery to a specific id set; orphans
    /// outside the filter are left alone.
    #[test]
    fn recover_only_filter_restricts_recovery_to_named_ids() {
        crdt_state::init_for_test();
        crate::db::ensure_content_store();
        // Two orphans, only one in the filter.
        let recover_id = "9810";
        let keep_id = "9811";
        for id in [recover_id, keep_id] {
            let name = format!("Item {id}");
            let content = body(&name, "refactor", None);
            write_item_with_content(id, "1_backlog", &content, ItemMeta::named(&name));
            crdt_state::evict_item(id).expect("evict should succeed");
            crate::db::content_store::write_content(ContentKey::Story(id), &content);
        }
        let results = recover_half_written_items(Some(&[recover_id.to_string()]));
        assert_eq!(results.len(), 1, "exactly one orphan should be recovered");
        assert_eq!(results[0].old_id, recover_id);
        // The filtered-out orphan must still be a half-written item.
        let half = find_half_written_items();
        assert!(
            half.iter().any(|h| h.story_id == keep_id),
            "filtered-out orphan should remain"
        );
        assert!(
            half.iter().all(|h| h.story_id != recover_id),
            "recovered orphan should be gone"
        );
    }
 }
@@ -131,21 +131,37 @@ pub(crate) fn tool_recover_half_written_items(args: &Value) -> Result<String, St
        .and_then(|v| v.as_bool())
        .unwrap_or(true);
    // Optional id filter — when provided, recovery (or the dry-run report) is
    // restricted to these ids.  This is the safe choice for live systems
    // where the orphan set may include many historic purged stories that
    // should stay dead.
    let only: Option<Vec<String>> = args.get("story_ids").and_then(|v| {
        v.as_array().map(|arr| {
            arr.iter()
                .filter_map(|x| x.as_str().map(str::to_string))
                .collect()
        })
    });
    if dry_run {
-        let half = crate::db::find_half_written_items();
+        let mut half = crate::db::find_half_written_items();
        if let Some(filter) = &only {
            half.retain(|h| filter.iter().any(|f| f == &h.story_id));
        }
        let count = half.len();
        return serde_json::to_string_pretty(&json!({
            "dry_run": true,
            "found": half,
-            "count": half.len(),
+            "count": count,
            "message": format!(
-                "Discovered {} half-written item(s). Re-run with dry_run=false to recover them.",
+                "Discovered {count} half-written item(s){scope}. Re-run with dry_run=false to recover them.",
-                half.len()
+                scope = if only.is_some() { " matching the filter" } else { "" },
            ),
        }))
        .map_err(|e| format!("Serialization error: {e}"));
    }
-    let results = crate::db::recover_half_written_items();
+    let results = crate::db::recover_half_written_items(only.as_deref());
    serde_json::to_string_pretty(&json!({
        "dry_run": false,
        "recovered": results,
@@ -290,13 +290,18 @@ pub(super) fn system_tools() -> Vec<Value> {
        }),
        json!({
            "name": "recover_half_written_items",
-            "description": "Discover (and optionally recover) half-written pipeline items — rows whose content is in the SQLite shadow and content store but whose CRDT entry is absent or tombstoned. These are invisible to every CRDT-driven read path and can't be cleaned up by delete_story / purge_story. Recovery moves each orphan's content onto a fresh non-tombstoned id and reports the old→new mapping. Defaults to dry_run=true so you can see what would change before mutating.",
+            "description": "Discover (and optionally recover) half-written pipeline items — rows whose content is in the SQLite shadow and content store but whose CRDT entry is absent or tombstoned. These are invisible to every CRDT-driven read path and can't be cleaned up by delete_story / purge_story. Recovery moves each orphan's content onto a fresh non-tombstoned id and reports the old→new mapping. Defaults to dry_run=true so you can see what would change before mutating. Use story_ids to restrict the operation to a specific list of ids — strongly recommended on live systems where the orphan set may include historic purged stories that should stay dead.",
            "inputSchema": {
                "type": "object",
                "properties": {
                    "dry_run": {
                        "type": "boolean",
                        "description": "When true (default), only discover half-written items and report them. When false, perform the recovery."
                    },
                    "story_ids": {
                        "type": "array",
                        "items": { "type": "string" },
                        "description": "Optional: restrict the operation (discovery in dry-run, recovery otherwise) to these story_ids. Anything else in the orphan set is left untouched. Strongly recommended for live recovery so you don't resurrect historic purged items."
                    }
                },
                "required": []