Compare commits
65 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| ce688fc0bf | |||
| c131896432 | |||
| 42e6eec9e9 | |||
| fe00fe6a25 | |||
| c97b7c841f | |||
| 2d0387fe63 | |||
| 71d3047ef0 | |||
| d86cc38b2a | |||
| 21b2efd268 | |||
| badd522d60 | |||
| ecd3f600d9 | |||
| 099df17e77 | |||
| c88e42eba2 | |||
| 89058ebd49 | |||
| d8204ab7ed | |||
| e2ea1af4c8 | |||
| 08780475d0 | |||
| 6eb2742e7d | |||
| c1b7e12b0b | |||
| 53d44ff42a | |||
| 6331dea8b0 | |||
| 240beec7de | |||
| 7de167b21b | |||
| 49af014a84 | |||
| 73cf1c6ff9 | |||
| f8b1e14b74 | |||
| 265e6f9a15 | |||
| 40e995da88 | |||
| 6e4fb7fd4b | |||
| 0695ad7ae6 | |||
| eb6b07531a | |||
| 2d6846fe03 | |||
| a5bfd40233 | |||
| a40500eea9 | |||
| f8212f102f | |||
| 59302b465d | |||
| efafe44db1 | |||
| 6a2f81e873 | |||
| 3a43337735 | |||
| b6df89d24c | |||
| 10d992a7e4 | |||
| 5c63618b30 | |||
| 7db0b78e88 | |||
| 979492449e | |||
| 6fbe239313 | |||
| 26527e7dae | |||
| 04a57e92c2 | |||
| d59efa0b5c | |||
| 4216ced493 | |||
| 9f4f493486 | |||
| 63d86f1263 | |||
| 398a5806e7 | |||
| 1adc734801 | |||
| 0ae6dfd565 | |||
| 8531bac6cd | |||
| ce13c00ebd | |||
| 2857c3b46b | |||
| d944885ce9 | |||
| 62d1535e76 | |||
| 46556d308a | |||
| fc5481dbe4 | |||
| 01e60a670c | |||
| c4010854a5 | |||
| fb1311cdae | |||
| 4aa76ce673 |
@@ -15,6 +15,8 @@ _merge_parsed.json
|
||||
.huskies_port
|
||||
.huskies/bot.toml.bak
|
||||
.huskies/build_hash
|
||||
# Phantom 0-byte pipeline.db sometimes appears at repo root from old code; canonical DB lives at .huskies/pipeline.db
|
||||
/pipeline.db
|
||||
|
||||
# Per-worktree planning file (written by coder agents, must never reach squash commits)
|
||||
PLAN.md
|
||||
|
||||
+1
-1
@@ -56,7 +56,7 @@ There are no exceptions. The merge gate runs `source-map-check` and rejects the
|
||||
Before committing, run `cargo run -p source-map-gen --bin source-map-check -- --worktree . --base master` and address every missing-docs direction it prints. If you added a new module file (e.g. `foo.rs` or `foo/mod.rs`), the FIRST line of that file MUST be a `//! What this module is for` doc comment.
|
||||
|
||||
## Documentation
|
||||
Docs live in `website/docs/*.html` (static HTML), **not** Markdown files. When a story asks you to document something, edit the relevant `.html` file in `website/docs/`.
|
||||
Docs live in `website/app/docs/*.tsx` (Next.js pages), **not** Markdown files. When a story asks you to document something, edit the relevant `.tsx` file under `website/app/docs/`. Run `npm run build` in `website/` to verify your changes render correctly.
|
||||
|
||||
## Configuration files
|
||||
- Agent config: `.huskies/agents.toml` (preferred) or `[[agent]]` blocks in `.huskies/project.toml`
|
||||
|
||||
+78
-17
@@ -541,6 +541,7 @@
|
||||
"enum TerminationReason",
|
||||
"enum PipelineStage",
|
||||
"fn pipeline_stage",
|
||||
"fn canonical_pipeline_stage",
|
||||
"fn agent_config_stage",
|
||||
"struct CompletionReport",
|
||||
"struct TokenUsage",
|
||||
@@ -678,9 +679,7 @@
|
||||
"server/src/agents/pool/pipeline/mod.rs": [],
|
||||
"server/src/agents/pool/process.rs": [
|
||||
"fn kill_all_children",
|
||||
"fn kill_child_for_key",
|
||||
"fn inject_child_killer",
|
||||
"fn child_killer_count"
|
||||
"fn kill_child_for_key"
|
||||
],
|
||||
"server/src/agents/pool/query.rs": [
|
||||
"fn available_agents_for_stage",
|
||||
@@ -707,6 +706,7 @@
|
||||
],
|
||||
"server/src/agents/pool/stop.rs": [
|
||||
"fn stop_agent",
|
||||
"fn reconcile_canonical_agents",
|
||||
"fn remove_agents_for_story"
|
||||
],
|
||||
"server/src/agents/pool/test_helpers.rs": [
|
||||
@@ -752,9 +752,7 @@
|
||||
"fn run_agent_pty_streaming"
|
||||
],
|
||||
"server/src/agents/pty/types.rs": [
|
||||
"struct PtyResult",
|
||||
"fn composite_key",
|
||||
"struct ChildKillerGuard"
|
||||
"struct PtyResult"
|
||||
],
|
||||
"server/src/agents/runtime/claude_code.rs": [
|
||||
"struct ClaudeCodeRuntime",
|
||||
@@ -858,6 +856,9 @@
|
||||
"server/src/chat/commands/move_story.rs": [
|
||||
"fn handle_move"
|
||||
],
|
||||
"server/src/chat/commands/new_project.rs": [
|
||||
"fn handle_new_project_fallback"
|
||||
],
|
||||
"server/src/chat/commands/overview.rs": [
|
||||
"fn handle_overview"
|
||||
],
|
||||
@@ -898,6 +899,13 @@
|
||||
"server/src/chat/commands/unreleased.rs": [
|
||||
"fn handle_unreleased"
|
||||
],
|
||||
"server/src/chat/dispatcher.rs": [
|
||||
"type SpawnFn",
|
||||
"struct ChatDispatcher",
|
||||
"fn new",
|
||||
"fn submit",
|
||||
"fn stop"
|
||||
],
|
||||
"server/src/chat/history.rs": [
|
||||
"type ChatConversationHistory",
|
||||
"fn load_chat_history",
|
||||
@@ -908,6 +916,7 @@
|
||||
],
|
||||
"server/src/chat/mod.rs": [
|
||||
"mod commands",
|
||||
"mod dispatcher",
|
||||
"mod history",
|
||||
"mod lookup",
|
||||
"mod test_helpers",
|
||||
@@ -990,10 +999,10 @@
|
||||
"fn handle_message"
|
||||
],
|
||||
"server/src/chat/transport/matrix/bot/messages/mod.rs": [
|
||||
"fn format_user_prompt",
|
||||
"fn format_drained_events"
|
||||
"fn format_user_prompt"
|
||||
],
|
||||
"server/src/chat/transport/matrix/bot/messages/on_room_message.rs": [
|
||||
"fn eval_switch_command",
|
||||
"fn on_room_message"
|
||||
],
|
||||
"server/src/chat/transport/matrix/bot/mod.rs": [
|
||||
@@ -1037,6 +1046,7 @@
|
||||
"fn default_permission_timeout_secs",
|
||||
"fn default_aggregated_notifications_poll_interval_secs",
|
||||
"fn default_aggregated_notifications_enabled",
|
||||
"fn default_coalesce_window_ms",
|
||||
"fn default_transport",
|
||||
"fn default_whatsapp_provider",
|
||||
"struct BotConfig"
|
||||
@@ -1063,6 +1073,7 @@
|
||||
"mod config",
|
||||
"mod delete",
|
||||
"mod htop",
|
||||
"mod new_project",
|
||||
"mod rebuild",
|
||||
"mod reset",
|
||||
"mod rmtree",
|
||||
@@ -1070,6 +1081,13 @@
|
||||
"mod transport_impl",
|
||||
"fn spawn_bot"
|
||||
],
|
||||
"server/src/chat/transport/matrix/new_project.rs": [
|
||||
"struct NewProjectCommand",
|
||||
"fn extract_new_project_command",
|
||||
"fn detect_stack",
|
||||
"fn image_for_stack",
|
||||
"fn handle_new_project"
|
||||
],
|
||||
"server/src/chat/transport/matrix/rebuild.rs": [
|
||||
"struct RebuildCommand",
|
||||
"fn extract_rebuild_command",
|
||||
@@ -1275,6 +1293,13 @@
|
||||
"fn delete_agent_throttle",
|
||||
"fn extract_agent_throttle_view"
|
||||
],
|
||||
"server/src/crdt_state/lww_maps/event_log.rs": [
|
||||
"const GAP_PIPELINE_EVENT",
|
||||
"struct EventLogEntryRaw",
|
||||
"fn append_event_log_entry",
|
||||
"fn append_gap_log_entry",
|
||||
"fn read_all_event_log_entries"
|
||||
],
|
||||
"server/src/crdt_state/lww_maps/gateway_projects.rs": [
|
||||
"fn write_gateway_project",
|
||||
"fn read_all_gateway_projects",
|
||||
@@ -1282,6 +1307,12 @@
|
||||
"fn delete_gateway_project",
|
||||
"fn extract_gateway_project_view"
|
||||
],
|
||||
"server/src/crdt_state/lww_maps/llm_sessions.rs": [
|
||||
"fn write_llm_session",
|
||||
"fn read_llm_session",
|
||||
"fn assemble_and_advance_session",
|
||||
"fn extract_llm_session_view"
|
||||
],
|
||||
"server/src/crdt_state/lww_maps/merge_jobs.rs": [
|
||||
"fn write_merge_job",
|
||||
"fn read_all_merge_jobs",
|
||||
@@ -1357,10 +1388,13 @@
|
||||
"fn rebuild_active_agent_index",
|
||||
"fn rebuild_test_job_index",
|
||||
"fn rebuild_agent_throttle_index",
|
||||
"fn rebuild_gateway_project_index"
|
||||
"fn rebuild_gateway_project_index",
|
||||
"fn rebuild_llm_session_index"
|
||||
],
|
||||
"server/src/crdt_state/state/init.rs": [
|
||||
"fn init"
|
||||
"enum PersistMsg",
|
||||
"fn init",
|
||||
"fn flush_persistence"
|
||||
],
|
||||
"server/src/crdt_state/state/mod.rs": [
|
||||
"fn subscribe",
|
||||
@@ -1371,6 +1405,7 @@
|
||||
"fn init_for_test"
|
||||
],
|
||||
"server/src/crdt_state/state/statics.rs": [
|
||||
"static PERSIST_PENDING",
|
||||
"static CRDT_EVENT_TX",
|
||||
"static SYNC_TX",
|
||||
"static ALL_OPS",
|
||||
@@ -1386,6 +1421,12 @@
|
||||
"struct CrdtEvent",
|
||||
"struct GatewayConfigCrdt",
|
||||
"struct PipelineDoc",
|
||||
"struct EventLogEntryCrdt",
|
||||
"struct LlmSessionCrdt",
|
||||
"enum ScopeFilter",
|
||||
"fn from_scope_str",
|
||||
"fn to_scope_str",
|
||||
"struct LlmSessionView",
|
||||
"struct PipelineItemCrdt",
|
||||
"struct NodePresenceCrdt",
|
||||
"struct EpicId",
|
||||
@@ -1441,7 +1482,9 @@
|
||||
"fn migrate_legacy_stage_strings",
|
||||
"fn migrate_node_claims_to_agent_claims",
|
||||
"fn migrate_merge_job",
|
||||
"fn purge_done_stage_merge_jobs"
|
||||
"fn purge_done_stage_merge_jobs",
|
||||
"fn migrate_zombie_pipeline_rows",
|
||||
"fn sweep_zombie_rows"
|
||||
],
|
||||
"server/src/crdt_state/write/mod.rs": [],
|
||||
"server/src/crdt_state/write/tests.rs": [],
|
||||
@@ -1550,7 +1593,11 @@
|
||||
"fn named",
|
||||
"fn write_item_with_content",
|
||||
"fn move_item_stage",
|
||||
"fn sync_item_agent",
|
||||
"fn delete_item",
|
||||
"fn delete_item_sync",
|
||||
"fn sync_item_name",
|
||||
"fn sync_item_depends_on",
|
||||
"fn next_item_number"
|
||||
],
|
||||
"server/src/db/recover.rs": [
|
||||
@@ -1565,10 +1612,19 @@
|
||||
"struct PipelineWriteMsg",
|
||||
"struct PipelineDb",
|
||||
"static PIPELINE_DB",
|
||||
"static SHADOW_DB_PATH",
|
||||
"fn init",
|
||||
"fn backup_pre_pipeline_status",
|
||||
"fn check_schema_drift"
|
||||
],
|
||||
"server/src/event_log/mod.rs": [
|
||||
"type EventId",
|
||||
"struct LoggedEvent",
|
||||
"fn log_transition_event",
|
||||
"fn read_event_log",
|
||||
"fn insert_gap_sentinel",
|
||||
"fn spawn_event_log_subscriber"
|
||||
],
|
||||
"server/src/gateway/mod.rs": [
|
||||
"fn build_gateway_route",
|
||||
"fn run"
|
||||
@@ -1580,11 +1636,6 @@
|
||||
"server/src/http/agents_sse.rs": [
|
||||
"fn agent_stream"
|
||||
],
|
||||
"server/src/http/assets.rs": [
|
||||
"fn embedded_asset",
|
||||
"fn embedded_file",
|
||||
"fn embedded_index"
|
||||
],
|
||||
"server/src/http/context.rs": [
|
||||
"enum PermissionDecision",
|
||||
"struct PermissionForward",
|
||||
@@ -1817,7 +1868,6 @@
|
||||
],
|
||||
"server/src/http/mod.rs": [
|
||||
"mod agents_sse",
|
||||
"mod assets",
|
||||
"mod context",
|
||||
"mod events",
|
||||
"mod identity",
|
||||
@@ -2150,6 +2200,9 @@
|
||||
"struct CompletionResponse",
|
||||
"trait ModelProvider"
|
||||
],
|
||||
"server/src/llm_session/mod.rs": [
|
||||
"fn assemble_prompt_context"
|
||||
],
|
||||
"server/src/log_buffer.rs": [
|
||||
"enum LogLevel",
|
||||
"fn as_str",
|
||||
@@ -2170,11 +2223,14 @@
|
||||
"mod crdt_state",
|
||||
"mod crdt_sync",
|
||||
"mod crdt_wire",
|
||||
"mod event_log",
|
||||
"mod gateway",
|
||||
"mod llm_session",
|
||||
"mod log_buffer",
|
||||
"mod mesh",
|
||||
"mod node_identity",
|
||||
"mod pipeline_state",
|
||||
"mod process_kill",
|
||||
"mod rebuild",
|
||||
"mod services",
|
||||
"mod sled_uplink",
|
||||
@@ -2273,6 +2329,11 @@
|
||||
"fn stage_label",
|
||||
"fn stage_dir_name"
|
||||
],
|
||||
"server/src/process_kill.rs": [
|
||||
"fn sigkill_pids_and_verify",
|
||||
"fn pids_matching",
|
||||
"fn descendant_pids"
|
||||
],
|
||||
"server/src/rebuild.rs": [
|
||||
"enum ShutdownReason",
|
||||
"struct BotShutdownNotifier",
|
||||
|
||||
@@ -0,0 +1,221 @@
|
||||
# Chat-Driven Project Bootstrap
|
||||
|
||||
Design overview for going from "I want a new project" to a running,
|
||||
container-isolated, editor-accessible huskies project in one chat command.
|
||||
|
||||
## Goal
|
||||
|
||||
A user can say to Timmy in chat:
|
||||
|
||||
```
|
||||
new project myapp --stack rust
|
||||
new project legacy-rails --git git@github.com:me/legacy-rails.git
|
||||
```
|
||||
|
||||
and end up with:
|
||||
|
||||
1. A fresh docker container running the project's huskies node.
|
||||
2. The project's source code bind-mounted from the host so the user can
|
||||
edit it in any editor.
|
||||
3. SSH into the container so editors can run LSPs, builds, and tests
|
||||
inside the container — never on the host.
|
||||
4. Optional git remote configured for push to GitHub or Gitea.
|
||||
5. The new sled registered with the gateway, so Timmy can drive coders /
|
||||
mergemaster / etc. on the project via existing chat commands.
|
||||
|
||||
Manual repo creation on GitHub/Gitea remains the user's job. Everything
|
||||
downstream of that is orchestrated.
|
||||
|
||||
## Architecture at a Glance
|
||||
|
||||
```
|
||||
┌──────────────────────┐
|
||||
│ Browser / Matrix │───┐
|
||||
└──────────────────────┘ │
|
||||
▼
|
||||
┌───────────────────────┐
|
||||
│ Gateway (huskies-gw) │
|
||||
│ • chat dispatcher │
|
||||
│ • new-project │
|
||||
│ • routing │
|
||||
└─────────┬─────────────┘
|
||||
│
|
||||
┌─────────┴───────────────────────────────────┐
|
||||
│ docker engine (host) │
|
||||
│ ┌────────────┐ ┌────────────┐ ┌─────────┐ │
|
||||
│ │ project-A │ │ project-B │ │ ... │ │
|
||||
│ │ sled + │ │ sled + │ │ │ │
|
||||
│ │ sshd + │ │ sshd + │ │ │ │
|
||||
│ │ LSPs │ │ LSPs │ │ │ │
|
||||
│ └─────┬──────┘ └─────┬──────┘ └─────────┘ │
|
||||
└────────┼──────────────┼─────────────────────┘
|
||||
│ │
|
||||
bind mount │ │ bind mount
|
||||
┌────────┴───┐ ┌─────┴──────┐
|
||||
│ ~/code/A │ │ ~/code/B │ ◄── host
|
||||
└────────────┘ └────────────┘ editor opens
|
||||
these paths
|
||||
```
|
||||
|
||||
- One container per project. The container runs the project's huskies
|
||||
binary (sled), an SSH server, and the stack-appropriate LSP(s).
|
||||
- Source lives on the host (e.g. `~/code/<project>`), bind-mounted into
|
||||
the container at a known path. Host can git-diff, back up, or edit.
|
||||
- The gateway is editor-agnostic and project-agnostic — it talks to each
|
||||
sled via the existing rendezvous / CRDT-sync protocol.
|
||||
|
||||
## Three Personas
|
||||
|
||||
| Persona | What they do | What they need |
|
||||
|---------|--------------|----------------|
|
||||
| Chat-only user | Drives everything via Matrix/web chat | Installed huskies binary; chat client |
|
||||
| Editor-using technical user | Same + edits source in their editor | SSH config to the container + editor-specific remote-dev setup |
|
||||
| Multi-project user | Several projects running in parallel | Gateway-listed projects, all routable from one chat |
|
||||
|
||||
Chat-only users never touch SSH. Editor users go through a one-time
|
||||
"copy this SSH command into your editor's remote settings" handoff at
|
||||
project creation time.
|
||||
|
||||
## The Bootstrap Chat Command
|
||||
|
||||
```
|
||||
new project <name> [--stack <stack>] [--git <url>] [--path <host-path>]
|
||||
```
|
||||
|
||||
Flow:
|
||||
|
||||
1. **Validate**: name unique among existing projects; host path doesn't already
|
||||
exist; stack (if declared) is one of the supported overlays.
|
||||
2. **Allocate** a fresh per-project port range (gateway picks).
|
||||
3. **Create host directory** at `--path` (default `~/huskies/<name>/`).
|
||||
4. If `--git` provided, `git clone` into that directory; else `git init`.
|
||||
5. **Detect stack** from cloned content if not declared:
|
||||
- `Cargo.toml` → `rust`
|
||||
- `package.json` → `node`
|
||||
- `go.mod` → `go`
|
||||
- `pyproject.toml` / `requirements.txt` / `setup.py` → `python`
|
||||
- `Gemfile` → `ruby`
|
||||
- `pom.xml` / `build.gradle` → `jvm`
|
||||
- Multiple markers → pick the dominant stack and warn the user.
|
||||
- None → minimal base image, user can install tooling later.
|
||||
6. **Compose the container** from `huskies-project-base` + the stack
|
||||
overlay (Dockerfile fragments under `docker/stacks/<stack>/`).
|
||||
7. **Launch** the container with bind mount + port forwards + an
|
||||
auto-generated SSH key.
|
||||
8. **Seed `.huskies/project.toml`** with sensible defaults.
|
||||
9. **Register** the project with the gateway (`gateway_projects` LWW-map).
|
||||
10. **Reply in chat** with: project name, host path, SSH command, and
|
||||
a `huskies status <name>` invocation to verify.
|
||||
|
||||
## Container Template
|
||||
|
||||
Layered:
|
||||
|
||||
- **`huskies-project-base`**: debian-slim + git + huskies binary + sshd
|
||||
+ sudo + a `huskies` user with the SSH pubkey installed.
|
||||
- **`huskies-stack-<stack>`**: per-stack additions. E.g. rust gets
|
||||
`rustup` + `rust-analyzer` + `cargo-nextest`; node gets `node@22` +
|
||||
`typescript-language-server`; etc.
|
||||
- **Project layer**: the bind-mounted `/workspace` is the project source,
|
||||
written by the host's editor, read by the in-container tooling.
|
||||
|
||||
The container's SSH server is bound to a host-local port (not exposed
|
||||
externally). Auth is the per-project keypair generated at bootstrap;
|
||||
the public key sits inside the container, the private key on host.
|
||||
|
||||
## Build Sandbox Model
|
||||
|
||||
The threat: editing code in a host-side editor causes the editor (or its
|
||||
LSP plugin) to run `cargo check` / `npm install` / `pip install` /
|
||||
similar, which executes arbitrary code from project dependencies —
|
||||
`build.rs`, proc-macros, npm `postinstall`, Python `setup.py`, Ruby
|
||||
native-extension build scripts, etc. A malicious dependency compromises
|
||||
the host.
|
||||
|
||||
The mitigation: all build / type-check / dependency-install commands
|
||||
execute **inside the project container**. The host's editor connects to
|
||||
the container over SSH; rust-analyzer (or equivalent) runs inside the
|
||||
container; the host process never `exec`s untrusted build scripts.
|
||||
|
||||
Container isolation is the docker default plus:
|
||||
- No `--privileged`.
|
||||
- No host bind mounts beyond the project source and the SSH key.
|
||||
- No host network beyond the gateway's CRDT sync port.
|
||||
- `--cap-drop=ALL` plus the minimum caps needed (probably none).
|
||||
|
||||
This isn't a hardened sandbox in the gvisor / Firecracker sense — a
|
||||
docker-escape exploit on a compromised container still escalates to
|
||||
host. For most consumer threat models (malicious crate from
|
||||
crates.io / npm), docker's default isolation is sufficient. Tighter
|
||||
sandboxing (gvisor) is a separate future spike if needed.
|
||||
|
||||
## Editor Connection — Editor-Agnostic SSH
|
||||
|
||||
| Editor | Connection mechanism |
|
||||
|--------|----------------------|
|
||||
| VSCode | Remote-SSH extension |
|
||||
| JetBrains (IntelliJ/RustRover) | JetBrains Gateway (SSH) |
|
||||
| Zed | Built-in SSH remoting (macOS/Linux only today) |
|
||||
| Vim/Neovim | SSH terminal session, or local nvim + LSP-over-SSH |
|
||||
| Emacs | TRAMP + remote LSP via lsp-mode |
|
||||
|
||||
All converge on: `ssh huskies@127.0.0.1 -p <project-port> -i ~/.huskies/<name>/id_ed25519`.
|
||||
That string is emitted in the bootstrap chat reply.
|
||||
|
||||
## Git Integration
|
||||
|
||||
- Initial setup is `git init` or `git clone` inside the container.
|
||||
- For push: user's existing GitHub / Gitea SSH key is bind-mounted
|
||||
read-only into the container at `~/.ssh/id_*`, OR the user supplies a
|
||||
push token via `huskies secrets set GIT_TOKEN=...` (stored as a Fly
|
||||
secret equivalent — for now, a chmod 600 file in the container).
|
||||
- The container's `git` config gets `user.name` / `user.email` from the
|
||||
gateway-level user identity.
|
||||
|
||||
## Decisions
|
||||
|
||||
| Decision | Choice | Alternative |
|
||||
|----------|--------|-------------|
|
||||
| Container per project | One container per project | One container many projects: simpler but breaks isolation, breaks per-project deps |
|
||||
| Editor model | SSH-remote (any editor) | VSCode Dev Containers only: simpler config but locks out everyone else |
|
||||
| Source location | Bind mount from host | Inside container only: breaks "I can also edit on my laptop" requirement |
|
||||
| Stack detection | Auto from project files, override with `--stack` | Always declared: more friction at bootstrap |
|
||||
| Push secrets | Bind-mounted host SSH key OR per-project token | Gateway holds tokens: bigger blast radius |
|
||||
|
||||
## Open Questions
|
||||
|
||||
1. **Per-project resource limits.** Should each container have a hard
|
||||
CPU / RAM cap so a runaway agent doesn't starve the host?
|
||||
2. **Lifecycle / cleanup.** If the user deletes a project from chat,
|
||||
what gets removed? Container yes; host source no (data loss); git
|
||||
remotes yes? Need a confirm step.
|
||||
3. **Multi-tenant.** Out of scope for this design (that's huskies.dev
|
||||
territory). This doc assumes single-user local-only.
|
||||
4. **Windows specifics.** Bind mounts work but line-ending /
|
||||
permission edge cases. Probably document "use WSL2 for best
|
||||
experience" rather than fight Windows native paths.
|
||||
5. **Gateway-on-host vs gateway-in-container.** The gateway today runs
|
||||
in its own container. New per-project containers connect via docker
|
||||
network. Need to confirm the network plumbing works for arbitrary
|
||||
per-project containers, not just the manually-configured ones.
|
||||
|
||||
## Phasing
|
||||
|
||||
The work breaks naturally into:
|
||||
|
||||
- **Phase 0 (now):** this design doc.
|
||||
- **Phase 1:** chat command exists and provisions a bare project
|
||||
container (no stack overlay, no SSH, no git clone — just
|
||||
"start a container, register with gateway"). Validates the
|
||||
orchestration shell.
|
||||
- **Phase 2:** stack-aware container template — base image + overlays;
|
||||
detection from project files.
|
||||
- **Phase 3:** SSH-remote editor access — sshd in the container,
|
||||
per-project keypair, chat-reply emits the connection string.
|
||||
- **Phase 4:** git integration — `--git <url>` clones, host SSH key
|
||||
mount, push verification.
|
||||
- **Phase 5:** per-project resource limits + cleanup chat commands.
|
||||
|
||||
Each phase ships independently and is usable on its own. Phase 1 alone
|
||||
gives chat-only users a working project; later phases add the editor
|
||||
and git polish.
|
||||
Generated
+1
-47
@@ -1911,7 +1911,7 @@ checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9"
|
||||
|
||||
[[package]]
|
||||
name = "huskies"
|
||||
version = "0.11.0"
|
||||
version = "0.12.1"
|
||||
dependencies = [
|
||||
"ammonia",
|
||||
"async-stream",
|
||||
@@ -1931,7 +1931,6 @@ dependencies = [
|
||||
"libc",
|
||||
"libsqlite3-sys",
|
||||
"matrix-sdk",
|
||||
"mime_guess",
|
||||
"mockito",
|
||||
"notify",
|
||||
"nutype",
|
||||
@@ -1941,7 +1940,6 @@ dependencies = [
|
||||
"rand 0.10.1",
|
||||
"regex",
|
||||
"reqwest",
|
||||
"rust-embed",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"serde_urlencoded",
|
||||
@@ -2978,16 +2976,6 @@ version = "0.1.54"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cbf6f36070878c42c5233846cd3de24cf9016828fd47bc22957a687298bb21fc"
|
||||
|
||||
[[package]]
|
||||
name = "mime_guess"
|
||||
version = "2.0.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f7c44f8e672c00fe5308fa235f821cb4198414e1c77935c1ab6948d3fd78550e"
|
||||
dependencies = [
|
||||
"mime",
|
||||
"unicase",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "miniz_oxide"
|
||||
version = "0.8.9"
|
||||
@@ -4206,40 +4194,6 @@ dependencies = [
|
||||
"smallvec",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rust-embed"
|
||||
version = "8.11.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "04113cb9355a377d83f06ef1f0a45b8ab8cd7d8b1288160717d66df5c7988d27"
|
||||
dependencies = [
|
||||
"rust-embed-impl",
|
||||
"rust-embed-utils",
|
||||
"walkdir",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rust-embed-impl"
|
||||
version = "8.11.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "da0902e4c7c8e997159ab384e6d0fc91c221375f6894346ae107f47dd0f3ccaa"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"rust-embed-utils",
|
||||
"syn 2.0.117",
|
||||
"walkdir",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rust-embed-utils"
|
||||
version = "8.11.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5bcdef0be6fe7f6fa333b1073c949729274b05f123a0ad7efcb8efd878e5c3b1"
|
||||
dependencies = [
|
||||
"sha2 0.10.9",
|
||||
"walkdir",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustc-hash"
|
||||
version = "2.1.2"
|
||||
|
||||
@@ -79,6 +79,10 @@ cd frontend && npm install && npm run dev
|
||||
|
||||
Configuration lives in `.huskies/project.toml`. See `.huskies/bot.toml.*.example` for transport setup.
|
||||
|
||||
## Website
|
||||
|
||||
The huskies.dev website source has moved to [crashlabs/huskies-server](https://code.crashlabs.io/crashlabs/huskies-server).
|
||||
|
||||
## Architecture
|
||||
|
||||
Internal architecture documentation lives in [`docs/architecture/`](docs/architecture/):
|
||||
|
||||
+11
-2
@@ -46,8 +46,17 @@ WORKDIR /app
|
||||
# build.rs) can produce the release binary with embedded frontend assets.
|
||||
COPY . .
|
||||
|
||||
# Build frontend deps first (better layer caching)
|
||||
RUN cd frontend && npm ci
|
||||
# Build frontend deps first (better layer caching).
|
||||
# Cannot use `npm ci` because of npm's optional-dependencies bug
|
||||
# (npm/cli#4828): platform-specific bindings (e.g. rolldown's
|
||||
# linux-arm64-gnu native binary, introduced by 1119's vite 5→8 upgrade)
|
||||
# get listed in package-lock.json for the lockfile author's platform
|
||||
# only, so `npm ci` skips them on every other platform — the build
|
||||
# then fails at runtime with `Cannot find native binding`. Wipe the
|
||||
# lockfile + node_modules and let `npm install` resolve fresh for the
|
||||
# build platform. The lockfile mutation stays inside the container
|
||||
# image and never reaches the host repo.
|
||||
RUN cd frontend && rm -rf node_modules package-lock.json && npm install
|
||||
|
||||
# Build the release binary (build.rs runs npm run build for the frontend)
|
||||
RUN cargo build --release \
|
||||
|
||||
@@ -0,0 +1,67 @@
|
||||
# huskies-project-base — minimal base for all project containers.
|
||||
#
|
||||
# This image provides git, an SSH server (sshd), sudo, the huskies server binary, and a non-root user.
|
||||
# It carries no language tooling. Per-stack overlays (docker/stacks/<name>/
|
||||
# Dockerfile.fragment) layer their toolchains on top of this base.
|
||||
#
|
||||
# Prerequisites: build the main `huskies` image first so its binary is
|
||||
# available as a build source.
|
||||
#
|
||||
# docker build -t huskies -f docker/Dockerfile .
|
||||
# docker build -t huskies-project-base -f docker/Dockerfile.base .
|
||||
#
|
||||
# To build a stack image (e.g. rust):
|
||||
# (echo "FROM huskies-project-base"; \
|
||||
# cat docker/stacks/rust/Dockerfile.fragment) | \
|
||||
# docker build -t huskies-project-rust -
|
||||
|
||||
FROM huskies AS huskies-src
|
||||
|
||||
FROM debian:bookworm-slim
|
||||
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
git \
|
||||
curl \
|
||||
ca-certificates \
|
||||
libssl3 \
|
||||
procps \
|
||||
openssh-server \
|
||||
sudo \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Copy the huskies binary and entrypoint from the main image.
|
||||
COPY --from=huskies-src /usr/local/bin/huskies /usr/local/bin/huskies
|
||||
COPY --from=huskies-src /usr/local/bin/entrypoint.sh /usr/local/bin/entrypoint.sh
|
||||
|
||||
# Non-root user — Claude Code refuses --dangerously-skip-permissions as root.
|
||||
# -s /bin/bash required for SSH sessions to start a real shell.
|
||||
RUN groupadd -r huskies \
|
||||
&& useradd -r -g huskies -m -d /home/huskies -s /bin/bash huskies \
|
||||
&& mkdir -p /home/huskies/.claude \
|
||||
&& mkdir -p /home/huskies/.ssh \
|
||||
&& chmod 700 /home/huskies/.ssh \
|
||||
&& chown -R huskies:huskies /home/huskies \
|
||||
&& mkdir -p /workspace \
|
||||
&& chown huskies:huskies /workspace \
|
||||
&& git config --global init.defaultBranch master \
|
||||
&& echo "huskies ALL=(root) NOPASSWD: /usr/sbin/sshd" > /etc/sudoers.d/huskies-sshd \
|
||||
&& chmod 0440 /etc/sudoers.d/huskies-sshd \
|
||||
&& mkdir -p /run/sshd \
|
||||
&& sed -i \
|
||||
-e 's/#PasswordAuthentication yes/PasswordAuthentication no/' \
|
||||
-e 's/#PubkeyAuthentication yes/PubkeyAuthentication yes/' \
|
||||
-e 's/UsePAM yes/UsePAM no/' \
|
||||
/etc/ssh/sshd_config
|
||||
|
||||
# Shell profile for SSH sessions: land in /workspace and load toolchain paths.
|
||||
RUN printf 'cd /workspace\n[ -f "$HOME/.cargo/env" ] && . "$HOME/.cargo/env"\n' \
|
||||
> /home/huskies/.profile \
|
||||
&& chown huskies:huskies /home/huskies/.profile
|
||||
|
||||
USER huskies
|
||||
WORKDIR /workspace
|
||||
|
||||
EXPOSE 3001 22
|
||||
|
||||
ENTRYPOINT ["entrypoint.sh"]
|
||||
CMD ["huskies", "/workspace"]
|
||||
@@ -1,6 +1,22 @@
|
||||
#!/bin/sh
|
||||
set -e
|
||||
|
||||
# ── SSH authorized key ────────────────────────────────────────────────
|
||||
# HUSKIES_SSH_PUBKEY is set by `new project` when it generates a keypair.
|
||||
# Write it to authorized_keys so the user can connect with the matching
|
||||
# private key stored at ~/.huskies/<project>/id_ed25519 on the host.
|
||||
if [ -n "$HUSKIES_SSH_PUBKEY" ]; then
|
||||
mkdir -p /home/huskies/.ssh
|
||||
chmod 700 /home/huskies/.ssh
|
||||
printf '%s\n' "$HUSKIES_SSH_PUBKEY" > /home/huskies/.ssh/authorized_keys
|
||||
chmod 600 /home/huskies/.ssh/authorized_keys
|
||||
fi
|
||||
|
||||
# ── SSH daemon ────────────────────────────────────────────────────────
|
||||
# Start sshd in the background so the container accepts SSH connections.
|
||||
# Uses sudo (huskies has NOPASSWD for /usr/sbin/sshd in sudoers.d).
|
||||
sudo /usr/sbin/sshd -D -e &
|
||||
|
||||
# ── Git identity ─────────────────────────────────────────────────────
|
||||
# Agents commit code inside the container. Without a git identity,
|
||||
# commits fail or use garbage defaults. Fail loudly at startup so the
|
||||
@@ -25,6 +41,20 @@ export GIT_COMMITTER_NAME="$GIT_USER_NAME"
|
||||
export GIT_AUTHOR_EMAIL="$GIT_USER_EMAIL"
|
||||
export GIT_COMMITTER_EMAIL="$GIT_USER_EMAIL"
|
||||
|
||||
# ── Git credential helper (HTTPS push) ────────────────────────────────────
|
||||
# If GIT_PUSH_TOKEN is supplied at container creation time, configure git's
|
||||
# built-in credential store so `git push` over HTTPS authenticates without
|
||||
# user interaction. GIT_CLONE_URL provides the host portion of the URL used
|
||||
# as the key in ~/.git-credentials.
|
||||
if [ -n "$GIT_PUSH_TOKEN" ] && [ -n "$GIT_CLONE_URL" ]; then
|
||||
_scheme=$(echo "$GIT_CLONE_URL" | cut -d':' -f1)
|
||||
_host=$(echo "$GIT_CLONE_URL" | sed 's|^https\?://||' | cut -d'/' -f1)
|
||||
git config --global credential.helper store
|
||||
printf '%s://x-access-token:%s@%s\n' "$_scheme" "$GIT_PUSH_TOKEN" "$_host" \
|
||||
> /home/huskies/.git-credentials
|
||||
chmod 600 /home/huskies/.git-credentials
|
||||
fi
|
||||
|
||||
# ── Frontend native deps ────────────────────────────────────────────
|
||||
# The project repo is bind-mounted from the host, so node_modules/
|
||||
# may contain native binaries for the wrong platform (e.g. darwin
|
||||
|
||||
@@ -0,0 +1,28 @@
|
||||
# Go stack overlay fragment.
|
||||
#
|
||||
# Layer this on top of huskies-project-base to produce a project container
|
||||
# with Go 1.22, gopls (official Go language server), and standard tooling.
|
||||
#
|
||||
# Build the combined image:
|
||||
# (echo "FROM huskies-project-base"; \
|
||||
# cat docker/stacks/go/Dockerfile.fragment) | \
|
||||
# docker build -t huskies-project-go -
|
||||
#
|
||||
# Adding a new stack: create docker/stacks/<name>/Dockerfile.fragment and
|
||||
# docker/stacks/<name>/markers — no changes to orchestration code required.
|
||||
|
||||
USER root
|
||||
|
||||
# Official Go binary distribution — Debian's golang-go package is too old for gopls.
|
||||
# Update GOVERSION to pick up a newer release.
|
||||
ENV GOVERSION="1.22.3"
|
||||
RUN curl -fsSL "https://go.dev/dl/go${GOVERSION}.linux-amd64.tar.gz" \
|
||||
| tar -C /usr/local -xzf -
|
||||
|
||||
ENV PATH="/usr/local/go/bin:${PATH}"
|
||||
|
||||
# gopls: the official Go language server.
|
||||
# GOBIN=/usr/local/bin puts the binary on the system PATH for all users.
|
||||
RUN GOBIN=/usr/local/bin go install golang.org/x/tools/gopls@latest
|
||||
|
||||
USER huskies
|
||||
@@ -0,0 +1,4 @@
|
||||
# Stack detection markers for the go stack.
|
||||
# Each non-blank, non-comment line names a file relative to the project root.
|
||||
# If any listed file exists in the project directory, this stack is matched.
|
||||
go.mod
|
||||
@@ -0,0 +1,50 @@
|
||||
# JVM stack overlay fragment.
|
||||
#
|
||||
# Layer this on top of huskies-project-base to produce a project container
|
||||
# with OpenJDK 21, Maven, and eclipse.jdt.ls (the canonical Java/JVM LSP).
|
||||
#
|
||||
# Build the combined image:
|
||||
# (echo "FROM huskies-project-base"; \
|
||||
# cat docker/stacks/jvm/Dockerfile.fragment) | \
|
||||
# docker build -t huskies-project-jvm -
|
||||
#
|
||||
# Adding a new stack: create docker/stacks/<name>/Dockerfile.fragment and
|
||||
# docker/stacks/<name>/markers — no changes to orchestration code required.
|
||||
|
||||
USER root

# OpenJDK 21 (LTS) and Maven for build support.
# Debian installs the JDK under an architecture-suffixed directory
# (java-21-openjdk-<dpkg architecture>), so publish an architecture-neutral
# symlink that JAVA_HOME can point at on both amd64 and arm64 images.
RUN apt-get update && apt-get install -y --no-install-recommends \
        openjdk-21-jdk-headless \
        maven \
    && rm -rf /var/lib/apt/lists/* \
    && ln -s "/usr/lib/jvm/java-21-openjdk-$(dpkg --print-architecture)" \
             /usr/lib/jvm/java-21-openjdk

ENV JAVA_HOME="/usr/lib/jvm/java-21-openjdk"

# Eclipse JDT Language Server — the canonical LSP for Java; it understands
# Maven and Gradle project layouts.
# Pin to a specific release; update JDTLS_VERSION + JDTLS_BUILD for upgrades.
# All releases: https://github.com/eclipse-jdtls/eclipse.jdt.ls/releases
ENV JDTLS_VERSION="1.38.0" \
    JDTLS_BUILD="202503271418"
RUN mkdir -p /opt/jdtls \
    && curl -fsSL \
       "https://download.eclipse.org/jdtls/milestones/${JDTLS_VERSION}/jdt-language-server-${JDTLS_VERSION}-${JDTLS_BUILD}.tar.gz" \
       | tar -xzf - -C /opt/jdtls

# Wrapper script so `jdtls` is available as a PATH command.
# printf '%s\n' writes each argument verbatim as one line, so the trailing
# backslashes in the generated script do not depend on how the build shell's
# `echo` treats escape sequences. The generated script fails fast with a clear
# message when the Equinox launcher jar is missing instead of running
# `java -jar ""`.
RUN printf '%s\n' \
        '#!/bin/sh' \
        'JAR=$(ls /opt/jdtls/plugins/org.eclipse.equinox.launcher_*.jar 2>/dev/null | head -1)' \
        'if [ -z "$JAR" ]; then' \
        '  echo "jdtls: launcher jar not found in /opt/jdtls/plugins" >&2' \
        '  exit 1' \
        'fi' \
        'exec java \' \
        '  -Declipse.application=org.eclipse.jdt.ls.core.id1 \' \
        '  -Dosgi.bundles.defaultStartLevel=4 \' \
        '  -Declipse.product=org.eclipse.jdt.ls.core.product \' \
        '  -Dlog.protocol=true \' \
        '  -Dlog.level=ALL \' \
        '  -jar "$JAR" \' \
        '  -configuration /opt/jdtls/config_linux \' \
        '  "$@"' \
        > /usr/local/bin/jdtls \
    && chmod +x /usr/local/bin/jdtls

USER huskies
|
||||
@@ -0,0 +1,6 @@
|
||||
# Stack detection markers for the jvm stack.
|
||||
# Each non-blank, non-comment line names a file relative to the project root.
|
||||
# If any listed file exists in the project directory, this stack is matched.
|
||||
pom.xml
|
||||
build.gradle
|
||||
build.gradle.kts
|
||||
@@ -0,0 +1,26 @@
|
||||
# Node stack overlay fragment.
|
||||
#
|
||||
# Layer this on top of huskies-project-base to produce a project container
|
||||
# with Node.js 22, TypeScript (tsc), and typescript-language-server.
|
||||
#
|
||||
# Build the combined image:
|
||||
# (echo "FROM huskies-project-base"; \
|
||||
# cat docker/stacks/node/Dockerfile.fragment) | \
|
||||
# docker build -t huskies-project-node -
|
||||
#
|
||||
# Adding a new stack: create docker/stacks/<name>/Dockerfile.fragment and
|
||||
# docker/stacks/<name>/markers — no changes to orchestration code required.
|
||||
|
||||
USER root
|
||||
|
||||
# Node.js 22.x (LTS).
|
||||
RUN curl -fsSL https://deb.nodesource.com/setup_22.x | bash - \
|
||||
&& apt-get install -y --no-install-recommends nodejs \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# TypeScript compiler and language server for LSP-aware agents.
|
||||
# tsc: TypeScript compiler (tsc --version)
|
||||
# typescript-language-server: LSP server used by editors/agents
|
||||
RUN npm install -g typescript typescript-language-server
|
||||
|
||||
USER huskies
|
||||
@@ -0,0 +1,7 @@
|
||||
# Stack detection markers for the node stack.
|
||||
# Each non-blank, non-comment line names a file relative to the project root.
|
||||
# If any listed file exists in the project directory, this stack is matched.
|
||||
# tsconfig.json is listed explicitly so TypeScript-only projects are detected
|
||||
# even without a package.json at the repo root.
|
||||
package.json
|
||||
tsconfig.json
|
||||
@@ -0,0 +1,27 @@
|
||||
# Python stack overlay fragment.
|
||||
#
|
||||
# Layer this on top of huskies-project-base to produce a project container
|
||||
# with Python 3, pip, and pyright (the Microsoft Python LSP / type checker).
|
||||
#
|
||||
# Build the combined image:
|
||||
# (echo "FROM huskies-project-base"; \
|
||||
# cat docker/stacks/python/Dockerfile.fragment) | \
|
||||
# docker build -t huskies-project-python -
|
||||
#
|
||||
# Adding a new stack: create docker/stacks/<name>/Dockerfile.fragment and
|
||||
# docker/stacks/<name>/markers — no changes to orchestration code required.
|
||||
|
||||
USER root
|
||||
|
||||
# Python 3 runtime and pip.
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
python3 \
|
||||
python3-pip \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# pyright: Microsoft's Python language server / static type checker.
|
||||
# --break-system-packages is required on Debian 12+ where pip is externally
|
||||
# managed; the flag is safe inside a Docker container.
|
||||
RUN pip install --no-cache-dir --break-system-packages pyright
|
||||
|
||||
USER huskies
|
||||
@@ -0,0 +1,6 @@
|
||||
# Stack detection markers for the python stack.
|
||||
# Each non-blank, non-comment line names a file relative to the project root.
|
||||
# If any listed file exists in the project directory, this stack is matched.
|
||||
pyproject.toml
|
||||
requirements.txt
|
||||
setup.py
|
||||
@@ -0,0 +1,28 @@
|
||||
# Ruby stack overlay fragment.
|
||||
#
|
||||
# Layer this on top of huskies-project-base to produce a project container
|
||||
# with Ruby, Bundler, and ruby-lsp (the Shopify Ruby language server).
|
||||
#
|
||||
# Build the combined image:
|
||||
# (echo "FROM huskies-project-base"; \
|
||||
# cat docker/stacks/ruby/Dockerfile.fragment) | \
|
||||
# docker build -t huskies-project-ruby -
|
||||
#
|
||||
# Adding a new stack: create docker/stacks/<name>/Dockerfile.fragment and
|
||||
# docker/stacks/<name>/markers — no changes to orchestration code required.
|
||||
|
||||
USER root
|
||||
|
||||
# Ruby runtime, development headers (needed by native gem extensions), and Bundler.
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
ruby \
|
||||
ruby-dev \
|
||||
bundler \
|
||||
build-essential \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# ruby-lsp: Shopify's Ruby language server (LSP-compliant, actively maintained).
|
||||
# Installed globally so the `ruby-lsp` binary is available on PATH.
|
||||
RUN gem install ruby-lsp
|
||||
|
||||
USER huskies
|
||||
@@ -0,0 +1,4 @@
|
||||
# Stack detection markers for the ruby stack.
|
||||
# Each non-blank, non-comment line names a file relative to the project root.
|
||||
# If any listed file exists in the project directory, this stack is matched.
|
||||
Gemfile
|
||||
@@ -0,0 +1,37 @@
|
||||
# Rust stack overlay fragment.
|
||||
#
|
||||
# Layer this on top of huskies-project-base to produce a project container
|
||||
# with a full Rust toolchain, rust-analyzer, and cargo-nextest.
|
||||
#
|
||||
# Build the combined image:
|
||||
# (echo "FROM huskies-project-base"; \
|
||||
# cat docker/stacks/rust/Dockerfile.fragment) | \
|
||||
# docker build -t huskies-project-rust -
|
||||
#
|
||||
# Adding a new stack: create docker/stacks/<name>/Dockerfile.fragment and
|
||||
# docker/stacks/<name>/markers — no changes to orchestration code required.
|
||||
|
||||
USER root

# Build tools required by rustup and many Rust crates.
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        pkg-config \
        libssl-dev \
    && rm -rf /var/lib/apt/lists/*

ENV RUSTUP_HOME="/home/huskies/.rustup" \
    CARGO_HOME="/home/huskies/.cargo"

# Install stable Rust + rust-analyzer component as the huskies user.
# `rustup component add` runs as root here, so ownership of both directories
# is handed back to huskies afterwards.
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs \
        | su huskies -c "sh -s -- -y --no-modify-path --default-toolchain stable" \
    && /home/huskies/.cargo/bin/rustup component add rust-analyzer \
    && chown -R huskies:huskies /home/huskies/.rustup /home/huskies/.cargo

# cargo-nextest: fast Rust test runner used by huskies quality gates.
# The release archive is per-architecture: `linux` is the x86_64 build and
# `linux-arm` the aarch64 build, so pick the one matching the image rather
# than assuming x86_64.
RUN case "$(dpkg --print-architecture)" in \
        arm64) nextest_platform="linux-arm" ;; \
        *)     nextest_platform="linux" ;; \
    esac \
    && curl -LsSf "https://get.nexte.st/latest/${nextest_platform}" \
       | tar zxf - -C /usr/local/bin

ENV PATH="/home/huskies/.cargo/bin:${PATH}"

USER huskies
|
||||
@@ -0,0 +1,4 @@
|
||||
# Stack detection markers for the rust stack.
|
||||
# Each non-blank, non-comment line names a file relative to the project root.
|
||||
# If any listed file exists in the project directory, this stack is matched.
|
||||
Cargo.toml
|
||||
Generated
+798
-1068
File diff suppressed because it is too large
Load Diff
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"name": "huskies",
|
||||
"private": true,
|
||||
"version": "0.11.0",
|
||||
"version": "0.12.1",
|
||||
"type": "module",
|
||||
"scripts": {
|
||||
"dev": "vite",
|
||||
@@ -32,11 +32,11 @@
|
||||
"@types/node": "^25.0.0",
|
||||
"@types/react": "^19.1.8",
|
||||
"@types/react-dom": "^19.1.6",
|
||||
"@vitejs/plugin-react": "^4.6.0",
|
||||
"@vitest/coverage-v8": "^2.1.9",
|
||||
"@vitejs/plugin-react": "^5.2.0",
|
||||
"@vitest/coverage-v8": "^4.1.6",
|
||||
"jsdom": "^28.1.0",
|
||||
"typescript": "~5.8.3",
|
||||
"vite": "^5.4.21",
|
||||
"vitest": "^2.1.4"
|
||||
"vite": "^8.0.13",
|
||||
"vitest": "^4.1.6"
|
||||
}
|
||||
}
|
||||
|
||||
@@ -160,6 +160,7 @@ describe("App", () => {
|
||||
});
|
||||
|
||||
it("shows error when openProject fails", async () => {
|
||||
const errorSpy = vi.spyOn(console, "error").mockImplementation(() => {});
|
||||
mockedApi.openProject.mockRejectedValue(new Error("Path does not exist"));
|
||||
|
||||
await renderApp();
|
||||
@@ -182,6 +183,7 @@ describe("App", () => {
|
||||
await waitFor(() => {
|
||||
expect(screen.getByText(/Path does not exist/)).toBeInTheDocument();
|
||||
});
|
||||
errorSpy.mockRestore();
|
||||
});
|
||||
|
||||
it("shows known projects list", async () => {
|
||||
|
||||
@@ -266,6 +266,8 @@ describe("subscribeAgentStream", () => {
|
||||
});
|
||||
|
||||
it("handles malformed JSON without throwing", () => {
|
||||
vi.spyOn(console, "error").mockImplementation(() => {});
|
||||
|
||||
subscribeAgentStream("42_story_test", "coder", vi.fn());
|
||||
|
||||
expect(() => {
|
||||
|
||||
@@ -472,9 +472,16 @@ describe("Slash command handling (Story 374)", () => {
|
||||
});
|
||||
|
||||
describe("Story 1058: WebSocket errors do not appear in chat", () => {
|
||||
let consoleSpy: ReturnType<typeof vi.spyOn>;
|
||||
|
||||
beforeEach(() => {
|
||||
capturedWsHandlers = null;
|
||||
setupMocks();
|
||||
consoleSpy = vi.spyOn(console, "error").mockImplementation(() => {});
|
||||
});
|
||||
|
||||
afterEach(() => {
|
||||
consoleSpy.mockRestore();
|
||||
});
|
||||
|
||||
it("does not add a chat message when onError is called", async () => {
|
||||
|
||||
@@ -227,6 +227,7 @@ describe("usePathCompletion hook", () => {
|
||||
});
|
||||
|
||||
it("sets completionError when listDirectoryAbsolute throws an Error", async () => {
|
||||
const errorSpy = vi.spyOn(console, "error").mockImplementation(() => {});
|
||||
mockListDir.mockRejectedValue(new Error("Permission denied"));
|
||||
|
||||
const { result } = renderHook(() =>
|
||||
@@ -242,9 +243,13 @@ describe("usePathCompletion hook", () => {
|
||||
await waitFor(() => {
|
||||
expect(result.current.completionError).toBe("Permission denied");
|
||||
});
|
||||
|
||||
expect(errorSpy).toHaveBeenCalledWith(new Error("Permission denied"));
|
||||
errorSpy.mockRestore();
|
||||
});
|
||||
|
||||
it("sets generic completionError when listDirectoryAbsolute throws a non-Error", async () => {
|
||||
const errorSpy = vi.spyOn(console, "error").mockImplementation(() => {});
|
||||
mockListDir.mockRejectedValue("some string error");
|
||||
|
||||
const { result } = renderHook(() =>
|
||||
@@ -262,6 +267,9 @@ describe("usePathCompletion hook", () => {
|
||||
"Failed to compute suggestion.",
|
||||
);
|
||||
});
|
||||
|
||||
expect(errorSpy).toHaveBeenCalledWith("some string error");
|
||||
errorSpy.mockRestore();
|
||||
});
|
||||
|
||||
it("clears suggestionTail when selected match path does not start with input", async () => {
|
||||
|
||||
Executable
+37
@@ -0,0 +1,37 @@
|
||||
#!/usr/bin/env bash
set -euo pipefail

# Builds every project image, parents before children:
#   huskies → huskies-project-base → huskies-project-<stack>
# with one <stack> image per docker/stacks/<stack>/Dockerfile.fragment.
#
# Intended to run after `script/docker_rebuild`, or after adding a stack.
# Re-running is harmless: every build simply re-tags the image.
#
# Usage: build-project-images [--no-cache]

# Work from the repository root regardless of the caller's directory.
cd "$(dirname "$0")/.."

# Export everything defined in .env (if present) to the docker invocations.
if [[ -f .env ]]; then
    set -a
    source .env
    set +a
fi

# Only a leading --no-cache argument is recognised; anything else is ignored.
case "${1:-}" in
    --no-cache) CACHE_FLAG="--no-cache" ;;
    *)          CACHE_FLAG="" ;;
esac

# Print the progress banner for the image about to be built.
announce() {
    echo "==> Building $1"
}

announce "huskies"
# $CACHE_FLAG is deliberately unquoted so that an empty value vanishes.
docker build $CACHE_FLAG -t huskies -f docker/Dockerfile .

announce "huskies-project-base"
docker build $CACHE_FLAG -t huskies-project-base -f docker/Dockerfile.base .

for fragment in docker/stacks/*/Dockerfile.fragment; do
    # The stack name is the directory that holds the fragment.
    stack_dir="${fragment%/*}"
    stack="${stack_dir##*/}"
    image="huskies-project-${stack}"
    announce "${image}"
    # Prepend the FROM line and feed the combined Dockerfile on stdin.
    { printf '%s\n' 'FROM huskies-project-base'; cat "$fragment"; } \
        | docker build $CACHE_FLAG -t "$image" -
done

echo "All project images built."
|
||||
@@ -24,4 +24,6 @@ docker compose -f docker/docker-compose.yml down
|
||||
docker compose -f docker/docker-compose.yml build $CACHE_FLAG
|
||||
docker compose -f docker/docker-compose.yml up -d
|
||||
|
||||
script/build-project-images $CACHE_FLAG
|
||||
|
||||
echo "Rebuild complete. Logs: docker compose -f docker/docker-compose.yml logs -f"
|
||||
|
||||
+12
-10
@@ -11,10 +11,12 @@ export GIT_CONFIG_VALUE_0=master
|
||||
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
|
||||
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
|
||||
|
||||
# Ordered fail-fast: cheapest deterministic checks first, slowest builds and
|
||||
# test suites last. `set -euo pipefail` aborts at the first failure, so a fmt
|
||||
# or clippy drift never wastes time on a frontend build or a multi-minute
|
||||
# test run.
|
||||
# Ordered fail-fast: cheapest deterministic checks first. The frontend build
|
||||
# must run *before* anything that compiles Rust, because story 1113 introduced
|
||||
# a compile-time dependency on `frontend/dist/` via `rust-embed` — a fresh
|
||||
# merge worktree without that directory will fail `cargo clippy` on
|
||||
# `EmbeddedAssets::iter()` before the frontend build has a chance to populate
|
||||
# it. `set -euo pipefail` aborts at the first failure.
|
||||
|
||||
echo "=== Checking Rust formatting ==="
|
||||
if cargo fmt --version &>/dev/null; then
|
||||
@@ -44,12 +46,6 @@ if [ "$_dup_found" -eq 1 ]; then
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "=== Running cargo clippy ==="
|
||||
cargo clippy --manifest-path "$PROJECT_ROOT/Cargo.toml" --all-targets --all-features -- -D warnings
|
||||
|
||||
echo "=== Checking doc coverage on changed files ==="
|
||||
cargo run --manifest-path "$PROJECT_ROOT/Cargo.toml" -p source-map-gen --bin source-map-check --quiet -- --worktree "$PROJECT_ROOT" --base master
|
||||
|
||||
echo "=== Building frontend ==="
|
||||
if [ -d "$PROJECT_ROOT/frontend" ]; then
|
||||
cd "$PROJECT_ROOT/frontend"
|
||||
@@ -75,6 +71,12 @@ else
|
||||
echo "Skipping frontend build (no frontend directory)"
|
||||
fi
|
||||
|
||||
echo "=== Running cargo clippy ==="
|
||||
cargo clippy --manifest-path "$PROJECT_ROOT/Cargo.toml" --all-targets --all-features -- -D warnings
|
||||
|
||||
echo "=== Checking doc coverage on changed files ==="
|
||||
cargo run --manifest-path "$PROJECT_ROOT/Cargo.toml" -p source-map-gen --bin source-map-check --quiet -- --worktree "$PROJECT_ROOT" --base master
|
||||
|
||||
echo "=== Running Rust tests ==="
|
||||
cargo test --manifest-path "$PROJECT_ROOT/Cargo.toml" --bin huskies
|
||||
cargo test --manifest-path "$PROJECT_ROOT/Cargo.toml" -p source-map-gen
|
||||
|
||||
+1
-3
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "huskies"
|
||||
version = "0.11.0"
|
||||
version = "0.12.1"
|
||||
edition = "2024"
|
||||
build = "build.rs"
|
||||
|
||||
@@ -13,12 +13,10 @@ chrono-tz = { workspace = true }
|
||||
futures = { workspace = true }
|
||||
homedir = { workspace = true }
|
||||
ignore = { workspace = true }
|
||||
mime_guess = { workspace = true }
|
||||
notify = { workspace = true }
|
||||
poem = { workspace = true, features = ["websocket"] }
|
||||
portable-pty = { workspace = true }
|
||||
reqwest = { workspace = true, features = ["json", "stream", "form"] }
|
||||
rust-embed = { workspace = true }
|
||||
serde = { workspace = true, features = ["derive"] }
|
||||
serde_json = { workspace = true }
|
||||
serde_urlencoded = { workspace = true }
|
||||
|
||||
@@ -78,6 +78,7 @@ pub(super) fn build_agent_app_context(
|
||||
pending_perm_replies: Arc::new(tokio::sync::Mutex::new(std::collections::HashMap::new())),
|
||||
permission_timeout_secs: 120,
|
||||
status: agents.status_broadcaster(),
|
||||
chat_dispatcher: Arc::new(crate::chat::dispatcher::ChatDispatcher::new(1_500)),
|
||||
});
|
||||
crate::http::context::AppContext {
|
||||
state: Arc::new(state),
|
||||
|
||||
@@ -33,16 +33,28 @@ impl GateFailureKind {
|
||||
/// Called once when a gate fails to produce a typed kind. Downstream code
|
||||
/// matches on the variant and must not call this on subsequent reads.
|
||||
pub fn classify(output: &str) -> Self {
|
||||
// Strip `test <name> ... ok` lines before checking lint-trigger keywords so
|
||||
// a passing test whose name contains e.g. `missing_doc_comments` or `clippy::`
|
||||
// does not produce a false-positive Lint classification (story 1101).
|
||||
let stripped_for_lint: String = output
|
||||
.lines()
|
||||
.filter(|l| {
|
||||
let t = l.trim();
|
||||
!(t.starts_with("test ") && t.ends_with("... ok"))
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
.join("\n");
|
||||
let is_lint = stripped_for_lint.contains("error[clippy::")
|
||||
|| stripped_for_lint.contains("warning[clippy::")
|
||||
|| stripped_for_lint.contains("missing_doc_comments");
|
||||
|
||||
if output.contains("CONFLICT (content):") || output.contains("Merge conflict:") {
|
||||
GateFailureKind::ContentConflict
|
||||
} else if output.contains("Diff in ") || output.contains("would reformat") {
|
||||
GateFailureKind::Fmt
|
||||
} else if output.contains("missing-docs direction") {
|
||||
GateFailureKind::SourceMapCheck
|
||||
} else if output.contains("error[clippy::")
|
||||
|| output.contains("warning[clippy::")
|
||||
|| output.contains("missing_doc_comments")
|
||||
{
|
||||
} else if is_lint {
|
||||
GateFailureKind::Lint
|
||||
} else if output.contains("error[E") {
|
||||
// rustc compile errors (e.g. `error[E0063]: missing field`).
|
||||
@@ -871,6 +883,19 @@ mod tests {
|
||||
);
|
||||
}
|
||||
|
||||
/// Story 1101: lint trigger keywords that only appear inside the name of a
/// passing test must not make the output classify as `Lint`.
#[test]
fn classify_does_not_false_positive_on_test_name_substring() {
    // A fully green test run whose only "lint" text is part of a test name.
    let passing_run = [
        "test agents::gates::tests::classify_lint_from_missing_doc_comments ... ok",
        "test result: ok. 1 passed; 0 failed",
    ]
    .join("\n");

    let kind = GateFailureKind::classify(&passing_run);

    assert_ne!(
        kind,
        GateFailureKind::Lint,
        "passing test name containing 'missing_doc_comments' must not classify as Lint"
    );
}
|
||||
|
||||
#[test]
|
||||
fn classify_source_map_check_from_missing_docs_direction() {
|
||||
assert_eq!(
|
||||
|
||||
@@ -161,6 +161,42 @@ pub fn pipeline_stage(agent_name: &str) -> PipelineStage {
|
||||
}
|
||||
}
|
||||
|
||||
/// Map a pipeline [`Stage`] to the canonical [`PipelineStage`] for LLM agent spawning.
|
||||
///
|
||||
/// Returns `None` for stages where no LLM agent should be active (terminal states,
|
||||
/// blocked, frozen, or unclassified merge failures requiring human intervention).
|
||||
/// Returns `Some(stage)` naming the single LLM-agent type that may run on this story.
|
||||
/// Used by `validate_agent_stage` and `reconcile_canonical_agents` to enforce the
|
||||
/// one-agent-per-story invariant (story 1100).
|
||||
pub fn canonical_pipeline_stage(s: &crate::pipeline_state::Stage) -> Option<PipelineStage> {
|
||||
use crate::pipeline_state::{MergeFailureKind, Stage};
|
||||
match s {
|
||||
Stage::Coding { .. } => Some(PipelineStage::Coder),
|
||||
Stage::Qa => Some(PipelineStage::Qa),
|
||||
Stage::Merge { .. } => Some(PipelineStage::Mergemaster),
|
||||
Stage::MergeFailure {
|
||||
kind: MergeFailureKind::ConflictDetected(_),
|
||||
..
|
||||
} => Some(PipelineStage::Mergemaster),
|
||||
Stage::MergeFailure {
|
||||
kind: MergeFailureKind::GatesFailed(_),
|
||||
..
|
||||
} => Some(PipelineStage::Coder),
|
||||
Stage::MergeFailureFinal { .. } => Some(PipelineStage::Mergemaster),
|
||||
Stage::Upcoming
|
||||
| Stage::Backlog
|
||||
| Stage::MergeFailure { .. }
|
||||
| Stage::Done { .. }
|
||||
| Stage::Blocked { .. }
|
||||
| Stage::Archived { .. }
|
||||
| Stage::Frozen { .. }
|
||||
| Stage::ReviewHold { .. }
|
||||
| Stage::Abandoned { .. }
|
||||
| Stage::Superseded { .. }
|
||||
| Stage::Rejected { .. } => None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Determine the pipeline stage for a configured agent.
|
||||
///
|
||||
/// Prefers the explicit `stage` config field (added in Bug 150) over the
|
||||
|
||||
@@ -105,12 +105,6 @@ impl AgentPool {
|
||||
}
|
||||
}
|
||||
|
||||
// Step 4: drop the (now-stale) child_killers entry — the
|
||||
// process it pointed at is gone.
|
||||
if let Ok(mut killers) = self.child_killers.lock() {
|
||||
killers.remove(key);
|
||||
}
|
||||
|
||||
// Use the retry mechanism: increment retry_count and only block
|
||||
// when the limit is exceeded, matching the pipeline's behaviour.
|
||||
let story_id = key.rsplit_once(':').map(|(s, _)| s).unwrap_or(key);
|
||||
|
||||
@@ -18,7 +18,6 @@ mod test_helpers;
|
||||
|
||||
use crate::io::watcher::WatcherEvent;
|
||||
use crate::service::status::StatusBroadcaster;
|
||||
use portable_pty::ChildKiller;
|
||||
use std::collections::HashMap;
|
||||
use std::sync::{Arc, Mutex};
|
||||
use tokio::sync::broadcast;
|
||||
@@ -31,10 +30,6 @@ use types::{StoryAgent, composite_key};
|
||||
pub struct AgentPool {
|
||||
agents: Arc<Mutex<HashMap<String, StoryAgent>>>,
|
||||
port: u16,
|
||||
/// Registry of active PTY child process killers, keyed by "{story_id}:{agent_name}".
|
||||
/// Used to terminate child processes on server shutdown or agent stop, preventing
|
||||
/// orphaned Claude Code processes from running after the server exits.
|
||||
child_killers: Arc<Mutex<HashMap<String, Box<dyn ChildKiller + Send + Sync>>>>,
|
||||
/// Broadcast channel for notifying WebSocket clients of agent state changes.
|
||||
/// When an agent transitions state (Pending, Running, Completed, Failed, Stopped),
|
||||
/// an `AgentStateChanged` event is emitted so the frontend can refresh the
|
||||
@@ -56,7 +51,6 @@ impl AgentPool {
|
||||
let pool = Self {
|
||||
agents: Arc::new(Mutex::new(HashMap::new())),
|
||||
port,
|
||||
child_killers: Arc::new(Mutex::new(HashMap::new())),
|
||||
watcher_tx: watcher_tx.clone(),
|
||||
status_broadcaster: Arc::new(StatusBroadcaster::new()),
|
||||
};
|
||||
|
||||
@@ -33,7 +33,6 @@ pub(crate) fn spawn_pipeline_advance(
|
||||
let pool = AgentPool {
|
||||
agents,
|
||||
port,
|
||||
child_killers: Arc::new(Mutex::new(HashMap::new())),
|
||||
watcher_tx,
|
||||
status_broadcaster: Arc::new(crate::service::status::StatusBroadcaster::new()),
|
||||
};
|
||||
|
||||
@@ -78,21 +78,34 @@ impl AgentPool {
|
||||
// The coder exited with uncommitted content but no commits
|
||||
// (typical "claude-code session boundary mid-sweep" pattern).
|
||||
// Use a PROGRESS-AWARE retry cap: the agent gets unlimited
|
||||
// respawns as long as file edits keep growing between
|
||||
// attempts; only when the worktree diff is byte-identical
|
||||
// to the previous attempt do we count it as "no progress".
|
||||
// After NO_PROGRESS_CAP consecutive no-progress respawns,
|
||||
// block for human attention.
|
||||
// respawns as long as progress is being made between attempts.
|
||||
// Progress is satisfied if EITHER (a) the worktree diff grew,
|
||||
// OR (b) the set of files the agent read grew. Raw tool-call
|
||||
// count does NOT count — a looping agent can produce many calls.
|
||||
// Only self-exited sessions with no file or read progress count
|
||||
// toward the cap; forced exits (API error, network, budget
|
||||
// exhaustion) are excluded (story 1089).
|
||||
// After NO_PROGRESS_CAP consecutive qualifying no-progress
|
||||
// respawns, block for human attention.
|
||||
//
|
||||
// TOTAL_ATTEMPTS_CAP is the OUTER bound: even if the agent
|
||||
// keeps making file-edit progress every session, after this
|
||||
// many total respawns without a commit we escalate — caught
|
||||
// the "agent flaps between different edits but never
|
||||
// commits" pattern that the progress-aware counter would
|
||||
// never trigger.
|
||||
// many total respawns without a commit we escalate — catches
|
||||
// the "agent flaps between different edits but never commits"
|
||||
// pattern that the progress-aware counter would never trigger.
|
||||
const NO_PROGRESS_CAP: u32 = 3;
|
||||
const TOTAL_ATTEMPTS_CAP: u32 = 8;
|
||||
|
||||
// AC1: consume the forced-exit flag written by spawn.rs when
|
||||
// the agent process exited with a non-zero code.
|
||||
let forced_exit = crate::db::read_content(
|
||||
crate::db::ContentKey::CommitRecoveryForcedExit(story_id),
|
||||
)
|
||||
.is_some();
|
||||
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryForcedExit(
|
||||
story_id,
|
||||
));
|
||||
|
||||
let current_fingerprint = worktree_path.as_deref().and_then(|p| {
|
||||
std::process::Command::new("git")
|
||||
.args(["diff", "master"])
|
||||
@@ -104,18 +117,31 @@ impl AgentPool {
|
||||
let stored_fingerprint = crate::db::read_content(
|
||||
crate::db::ContentKey::CommitRecoveryDiffFingerprint(story_id),
|
||||
);
|
||||
let made_progress = current_fingerprint.is_some()
|
||||
let diff_progress = current_fingerprint.is_some()
|
||||
&& stored_fingerprint.as_ref() != current_fingerprint.as_ref();
|
||||
let no_progress_count = if made_progress || stored_fingerprint.is_none() {
|
||||
|
||||
// AC2: check read-file set progress as an additional signal.
|
||||
let read_progress = previous_session_id.as_deref().is_some_and(|session_id| {
|
||||
collect_read_progress(&project_root, story_id, agent_name, session_id)
|
||||
});
|
||||
|
||||
let made_progress = diff_progress || read_progress;
|
||||
|
||||
let prev_no_progress_count = crate::db::read_content(
|
||||
crate::db::ContentKey::CommitRecoveryPending(story_id),
|
||||
)
|
||||
.and_then(|s| s.trim().parse::<u32>().ok())
|
||||
.unwrap_or(0);
|
||||
|
||||
// AC1: forced exits do not increment the stuck-respawn counter.
|
||||
let no_progress_count = if forced_exit {
|
||||
prev_no_progress_count
|
||||
} else if made_progress || stored_fingerprint.is_none() {
|
||||
1
|
||||
} else {
|
||||
crate::db::read_content(crate::db::ContentKey::CommitRecoveryPending(
|
||||
story_id,
|
||||
))
|
||||
.and_then(|s| s.trim().parse::<u32>().ok())
|
||||
.unwrap_or(0)
|
||||
+ 1
|
||||
prev_no_progress_count + 1
|
||||
};
|
||||
|
||||
let total_attempts = crate::db::read_content(
|
||||
crate::db::ContentKey::CommitRecoveryTotalAttempts(story_id),
|
||||
)
|
||||
@@ -136,13 +162,17 @@ impl AgentPool {
|
||||
crate::db::delete_content(
|
||||
crate::db::ContentKey::CommitRecoveryTotalAttempts(story_id),
|
||||
);
|
||||
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryReadSet(
|
||||
story_id,
|
||||
));
|
||||
slog!(
|
||||
"[pipeline] Coder '{agent_name}' for '{story_id}' hit total \
|
||||
commit-recovery cap ({total_attempts}/{TOTAL_ATTEMPTS_CAP}) \
|
||||
without a commit. Blocking story."
|
||||
);
|
||||
let reason = format!(
|
||||
"agent flapped — {total_attempts} respawns without ever committing"
|
||||
"commit absent after {total_attempts} respawns — \
|
||||
agent kept making edits but never committed"
|
||||
);
|
||||
if let Err(e) =
|
||||
crate::agents::lifecycle::transition_to_blocked(story_id, &reason)
|
||||
@@ -167,14 +197,18 @@ impl AgentPool {
|
||||
crate::db::delete_content(
|
||||
crate::db::ContentKey::CommitRecoveryTotalAttempts(story_id),
|
||||
);
|
||||
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryReadSet(
|
||||
story_id,
|
||||
));
|
||||
slog!(
|
||||
"[pipeline] Coder '{agent_name}' for '{story_id}' made no \
|
||||
file-edit progress over {no_progress_count} consecutive \
|
||||
commit-recovery respawns. Blocking story."
|
||||
file or read progress over {no_progress_count} consecutive \
|
||||
self-exit commit-recovery respawns. Blocking story."
|
||||
);
|
||||
// AC4: block message names the specific cause.
|
||||
let reason = format!(
|
||||
"agent stuck — {no_progress_count} respawns without commits or \
|
||||
new file edits"
|
||||
"stuck-respawn cap reached: {NO_PROGRESS_CAP} consecutive \
|
||||
self-exits with no file or read progress"
|
||||
);
|
||||
if let Err(e) =
|
||||
crate::agents::lifecycle::transition_to_blocked(story_id, &reason)
|
||||
@@ -206,7 +240,8 @@ impl AgentPool {
|
||||
"[pipeline] Coder '{agent_name}' exited with uncommitted work \
|
||||
for '{story_id}' (no-progress {no_progress_count}/\
|
||||
{NO_PROGRESS_CAP}, total {total_attempts}/\
|
||||
{TOTAL_ATTEMPTS_CAP}; progress_made={made_progress}). \
|
||||
{TOTAL_ATTEMPTS_CAP}; diff_progress={diff_progress}, \
|
||||
read_progress={read_progress}, forced_exit={forced_exit}). \
|
||||
Issuing commit-only respawn."
|
||||
);
|
||||
let addendum = "\n\nYou have uncommitted work in this worktree. \
|
||||
@@ -302,10 +337,13 @@ impl AgentPool {
|
||||
});
|
||||
}
|
||||
} else if completion.gates_passed {
|
||||
// Clear any stale recovery key when the coder succeeds normally.
|
||||
// Clear any stale recovery keys when the coder succeeds normally.
|
||||
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryPending(
|
||||
story_id,
|
||||
));
|
||||
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryReadSet(
|
||||
story_id,
|
||||
));
|
||||
// Determine effective QA mode for this story.
|
||||
let qa_mode = {
|
||||
let item_type = crate::agents::lifecycle::item_type_from_id(story_id);
|
||||
@@ -361,11 +399,14 @@ impl AgentPool {
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Clear any stale recovery key when gates fail normally (agent committed
|
||||
// Clear any stale recovery keys when gates fail normally (agent committed
|
||||
// but the build is broken — treat as a standard retry, not a recovery).
|
||||
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryPending(
|
||||
story_id,
|
||||
));
|
||||
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryReadSet(
|
||||
story_id,
|
||||
));
|
||||
// Bug 645 / 668: Before retry/block, check if the agent left committed
|
||||
// work AND the agent had a passing run_tests result captured during its
|
||||
// session. An agent may crash mid-output (e.g. Claude Code CLI PTY write
|
||||
@@ -724,6 +765,109 @@ mod helpers;
|
||||
use helpers::{resolve_qa_mode_from_store, write_review_hold_to_store};
|
||||
pub(crate) use helpers::{should_block_story, spawn_pipeline_advance};
|
||||
|
||||
/// Parse a huskies agent log and return the set of file paths passed to the
|
||||
/// Read tool in that session. Returns an empty set if the log cannot be read.
|
||||
///
|
||||
/// Used by [`collect_read_progress`] to detect read-exploration progress even
|
||||
/// when the worktree diff did not grow (story 1089, AC2).
|
||||
fn collect_read_files_from_log(
|
||||
project_root: &std::path::Path,
|
||||
story_id: &str,
|
||||
agent_name: &str,
|
||||
session_id: &str,
|
||||
) -> std::collections::HashSet<String> {
|
||||
let log_path = crate::agent_log::log_file_path(project_root, story_id, agent_name, session_id);
|
||||
let mut files = std::collections::HashSet::new();
|
||||
|
||||
let log_text = match std::fs::read_to_string(&log_path) {
|
||||
Ok(t) => t,
|
||||
Err(_) => return files,
|
||||
};
|
||||
|
||||
for line in log_text.lines() {
|
||||
let trimmed = line.trim();
|
||||
if trimmed.is_empty() {
|
||||
continue;
|
||||
}
|
||||
let entry: serde_json::Value = match serde_json::from_str(trimmed) {
|
||||
Ok(v) => v,
|
||||
Err(_) => continue,
|
||||
};
|
||||
// Only look at agent_json events where data.type == "assistant".
|
||||
if entry.get("type").and_then(|t| t.as_str()) != Some("agent_json") {
|
||||
continue;
|
||||
}
|
||||
let data = match entry.get("data") {
|
||||
Some(d) => d,
|
||||
None => continue,
|
||||
};
|
||||
if data.get("type").and_then(|t| t.as_str()) != Some("assistant") {
|
||||
continue;
|
||||
}
|
||||
let content = match data.pointer("/message/content").and_then(|c| c.as_array()) {
|
||||
Some(c) => c,
|
||||
None => continue,
|
||||
};
|
||||
for item in content {
|
||||
if item.get("type").and_then(|t| t.as_str()) != Some("tool_use") {
|
||||
continue;
|
||||
}
|
||||
if item.get("name").and_then(|n| n.as_str()) != Some("Read") {
|
||||
continue;
|
||||
}
|
||||
if let Some(path) = item.pointer("/input/file_path").and_then(|p| p.as_str()) {
|
||||
files.insert(path.to_string());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
files
|
||||
}
|
||||
|
||||
/// Return `true` if the agent read any files in `session_id` that were not in
|
||||
/// the cumulative read set for `story_id`. Updates the stored cumulative set
|
||||
/// when new files are found (story 1089, AC2).
|
||||
fn collect_read_progress(
|
||||
project_root: &std::path::Path,
|
||||
story_id: &str,
|
||||
agent_name: &str,
|
||||
session_id: &str,
|
||||
) -> bool {
|
||||
let session_files = collect_read_files_from_log(project_root, story_id, agent_name, session_id);
|
||||
if session_files.is_empty() {
|
||||
return false;
|
||||
}
|
||||
|
||||
let stored_set: std::collections::HashSet<String> =
|
||||
crate::db::read_content(crate::db::ContentKey::CommitRecoveryReadSet(story_id))
|
||||
.map(|s| {
|
||||
s.lines()
|
||||
.filter(|l| !l.is_empty())
|
||||
.map(str::to_string)
|
||||
.collect()
|
||||
})
|
||||
.unwrap_or_default();
|
||||
|
||||
let union: std::collections::HashSet<String> =
|
||||
stored_set.union(&session_files).cloned().collect();
|
||||
|
||||
if union.len() > stored_set.len() {
|
||||
let mut sorted: Vec<&String> = union.iter().collect();
|
||||
sorted.sort();
|
||||
crate::db::write_content(
|
||||
crate::db::ContentKey::CommitRecoveryReadSet(story_id),
|
||||
&sorted
|
||||
.into_iter()
|
||||
.map(String::as_str)
|
||||
.collect::<Vec<_>>()
|
||||
.join("\n"),
|
||||
);
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests;
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -1077,7 +1077,7 @@ stage = "coder"
|
||||
"Story must be blocked after NO_PROGRESS_CAP consecutive no-progress respawns"
|
||||
);
|
||||
assert!(
|
||||
block_reason.contains("without commits or new file edits"),
|
||||
block_reason.contains("self-exits with no file or read progress"),
|
||||
"Block reason should describe the no-progress condition, got: {block_reason}"
|
||||
);
|
||||
|
||||
@@ -1193,7 +1193,7 @@ stage = "coder"
|
||||
"Story must be blocked once total commit-recovery attempts hits the outer cap"
|
||||
);
|
||||
assert!(
|
||||
block_reason.contains("flapped") && block_reason.contains("without ever committing"),
|
||||
block_reason.contains("commit absent") && block_reason.contains("never committed"),
|
||||
"Block reason should describe the flapping pattern, got: {block_reason}"
|
||||
);
|
||||
|
||||
|
||||
@@ -111,7 +111,6 @@ impl AgentPool {
|
||||
let pool_clone = Self {
|
||||
agents: Arc::clone(&self.agents),
|
||||
port: self.port,
|
||||
child_killers: Arc::clone(&self.child_killers),
|
||||
watcher_tx: self.watcher_tx.clone(),
|
||||
status_broadcaster: Arc::clone(&self.status_broadcaster),
|
||||
};
|
||||
|
||||
@@ -74,25 +74,11 @@ pub(in crate::agents::pool) async fn run_server_owned_completion(
|
||||
|
||||
// Kill any in-flight cargo test processes for this worktree so they don't
|
||||
// hold the build lock while gates try to run.
|
||||
if let Some(wt_path) = worktree_path.as_ref()
|
||||
&& let Ok(output) = std::process::Command::new("pgrep")
|
||||
.args([
|
||||
"-f",
|
||||
&format!("--manifest-path {}/Cargo.toml", wt_path.display()),
|
||||
])
|
||||
.output()
|
||||
{
|
||||
let pids = String::from_utf8_lossy(&output.stdout);
|
||||
for pid_str in pids.lines() {
|
||||
if let Ok(pid) = pid_str.trim().parse::<i32>() {
|
||||
crate::slog!(
|
||||
"[agents] Killing stale cargo process (pid {pid}) for '{story_id}' before running gates"
|
||||
);
|
||||
unsafe {
|
||||
libc::kill(pid, libc::SIGKILL);
|
||||
}
|
||||
}
|
||||
}
|
||||
if let Some(wt_path) = worktree_path.as_ref() {
|
||||
let pattern = format!("--manifest-path {}/Cargo.toml", wt_path.display());
|
||||
let _ = crate::process_kill::sigkill_pids_and_verify(&crate::process_kill::pids_matching(
|
||||
&pattern,
|
||||
));
|
||||
}
|
||||
|
||||
// Run acceptance gates. Third element of the tuple is `needs_commit_recovery`:
|
||||
|
||||
@@ -18,7 +18,6 @@ impl AgentPool {
|
||||
let pool = Arc::new(Self {
|
||||
agents: Arc::clone(&self.agents),
|
||||
port: self.port,
|
||||
child_killers: Arc::clone(&self.child_killers),
|
||||
watcher_tx: self.watcher_tx.clone(),
|
||||
status_broadcaster: Arc::clone(&self.status_broadcaster),
|
||||
});
|
||||
|
||||
@@ -1,12 +1,20 @@
|
||||
//! Process management — kills orphaned PTY child processes on server shutdown.
|
||||
//!
|
||||
//! See [`crate::process_kill`] for the general process-termination primitives
|
||||
//! this module's existing methods (`kill_all_children`, `kill_child_for_key`)
|
||||
//! should eventually be migrated to. Those methods currently use
|
||||
//! `portable_pty::ChildKiller::kill()`, which sends `SIGHUP` — a signal
|
||||
//! claude-code ignores — so they leave orphans on every shutdown/stop. The
|
||||
//! migration is tracked in a separate story to keep its diff focused.
|
||||
//! As of story 1090 (2026-05-15), all process termination in this module uses
|
||||
//! [`crate::process_kill::sigkill_pids_and_verify`] — SIGHUP-based killing via
|
||||
//! `portable_pty::ChildKiller` has been removed entirely from the server.
|
||||
//!
|
||||
//! ## History
|
||||
//!
|
||||
//! Prior to commit `fe9804b3`, the watchdog and all kill paths sent SIGHUP via
|
||||
//! `portable_pty::ChildKiller::kill()`. Claude Code ignores SIGHUP, so agents
|
||||
//! survived "kills" and ran concurrently with their replacements — the root cause
|
||||
//! of the 2026-05-15 duplicate-spawn incident. `fe9804b3` migrated the watchdog;
|
||||
//! story 1090 completes the migration by rewriting `kill_all_children` and
|
||||
//! `kill_child_for_key` (this file) to use `pids_matching` + `sigkill_pids_and_verify`.
|
||||
use crate::process_kill::{pids_matching, sigkill_pids_and_verify};
|
||||
use crate::slog;
|
||||
use crate::slog_warn;
|
||||
|
||||
use super::AgentPool;
|
||||
|
||||
@@ -14,53 +22,97 @@ impl AgentPool {
|
||||
/// Kill all active PTY child processes.
|
||||
///
|
||||
/// Called on server shutdown to prevent orphaned Claude Code processes from
|
||||
/// continuing to run after the server exits. Each registered killer is called
|
||||
/// once, then the registry is cleared.
|
||||
/// continuing to run after the server exits. Collects each agent's worktree
|
||||
/// path, then SIGKILLs every process running inside that path and verifies
|
||||
/// termination before returning.
|
||||
pub fn kill_all_children(&self) {
|
||||
if let Ok(mut killers) = self.child_killers.lock() {
|
||||
for (key, killer) in killers.iter_mut() {
|
||||
slog!("[agents] Killing child process for {key} on shutdown");
|
||||
let _ = killer.kill();
|
||||
let worktree_paths: Vec<(String, std::path::PathBuf)> = {
|
||||
let Ok(agents) = self.agents.lock() else {
|
||||
return;
|
||||
};
|
||||
agents
|
||||
.iter()
|
||||
.filter_map(|(key, agent)| {
|
||||
agent
|
||||
.worktree_info
|
||||
.as_ref()
|
||||
.map(|wt| (key.clone(), wt.path.clone()))
|
||||
})
|
||||
.collect()
|
||||
};
|
||||
|
||||
for (key, path) in worktree_paths {
|
||||
let pattern = path.display().to_string();
|
||||
let pids = pids_matching(&pattern);
|
||||
if pids.is_empty() {
|
||||
slog!(
|
||||
"[agents] No processes found in worktree {} for '{key}' on shutdown",
|
||||
path.display()
|
||||
);
|
||||
continue;
|
||||
}
|
||||
match sigkill_pids_and_verify(&pids) {
|
||||
Ok(n) => slog!(
|
||||
"[agents] SIGKILL'd {n} process(es) in worktree {} for '{key}' on shutdown",
|
||||
path.display()
|
||||
),
|
||||
Err(survivors) => slog_warn!(
|
||||
"[agents] SIGKILL incomplete for '{key}' on shutdown: \
|
||||
pids still alive: {survivors:?}"
|
||||
),
|
||||
}
|
||||
killers.clear();
|
||||
}
|
||||
}
|
||||
|
||||
/// Kill and deregister the child process for a specific agent key.
|
||||
///
|
||||
/// Used by `stop_agent` to ensure the PTY child is terminated even though
|
||||
/// aborting a `spawn_blocking` task handle does not interrupt the blocking thread.
|
||||
/// Fallback used by `stop_agent` when no worktree path is recorded for the
|
||||
/// agent. Also the primary kill path for any caller that has only a composite
|
||||
/// key and not a worktree path directly.
|
||||
pub(super) fn kill_child_for_key(&self, key: &str) {
|
||||
if let Ok(mut killers) = self.child_killers.lock()
|
||||
&& let Some(mut killer) = killers.remove(key)
|
||||
{
|
||||
slog!("[agents] Killing child process for {key} on stop");
|
||||
let _ = killer.kill();
|
||||
let worktree_path = {
|
||||
let Ok(agents) = self.agents.lock() else {
|
||||
return;
|
||||
};
|
||||
agents
|
||||
.get(key)
|
||||
.and_then(|a| a.worktree_info.as_ref().map(|wt| wt.path.clone()))
|
||||
};
|
||||
|
||||
let Some(path) = worktree_path else {
|
||||
slog_warn!(
|
||||
"[agents] No worktree path recorded for '{key}'; \
|
||||
cannot SIGKILL via process_kill (no-op)"
|
||||
);
|
||||
return;
|
||||
};
|
||||
|
||||
let pattern = path.display().to_string();
|
||||
let pids = pids_matching(&pattern);
|
||||
if pids.is_empty() {
|
||||
slog!(
|
||||
"[agents] No processes found in worktree {} for '{key}' on stop",
|
||||
path.display()
|
||||
);
|
||||
return;
|
||||
}
|
||||
match sigkill_pids_and_verify(&pids) {
|
||||
Ok(n) => slog!(
|
||||
"[agents] SIGKILL'd {n} process(es) in worktree {} for '{key}' on stop",
|
||||
path.display()
|
||||
),
|
||||
Err(survivors) => slog_warn!(
|
||||
"[agents] SIGKILL incomplete for '{key}' on stop: \
|
||||
pids still alive: {survivors:?}"
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
/// Test helper: inject a child killer into the registry.
|
||||
#[cfg(test)]
|
||||
pub fn inject_child_killer(
|
||||
&self,
|
||||
key: &str,
|
||||
killer: Box<dyn portable_pty::ChildKiller + Send + Sync>,
|
||||
) {
|
||||
let mut killers = self.child_killers.lock().unwrap();
|
||||
killers.insert(key.to_string(), killer);
|
||||
}
|
||||
|
||||
/// Test helper: return the number of registered child killers.
|
||||
#[cfg(test)]
|
||||
pub fn child_killer_count(&self) -> usize {
|
||||
self.child_killers.lock().unwrap().len()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::super::AgentPool;
|
||||
use portable_pty::{CommandBuilder, PtySize, native_pty_system};
|
||||
use crate::agents::AgentStatus;
|
||||
use std::process::Command;
|
||||
|
||||
/// Returns true if a process with the given PID is currently running.
|
||||
@@ -75,79 +127,100 @@ mod tests {
|
||||
#[test]
|
||||
fn kill_all_children_is_safe_on_empty_pool() {
|
||||
let pool = AgentPool::new_test(3001);
|
||||
pool.kill_all_children();
|
||||
assert_eq!(pool.child_killer_count(), 0);
|
||||
pool.kill_all_children(); // must not panic
|
||||
}
|
||||
|
||||
/// AC 4 — `kill_child_for_key` SIGKILLs the single agent's process and
|
||||
/// verifies it is gone within 2 s. The sleeper has the worktree path in
|
||||
/// its argv[0] so `pgrep -f` can locate it, mirroring how claude-code is
|
||||
/// launched with `--directory <worktree>` in production.
|
||||
#[test]
|
||||
fn kill_all_children_kills_real_process() {
|
||||
let pool = AgentPool::new_test(3001);
|
||||
fn kill_child_for_key_kills_real_process() {
|
||||
use std::os::unix::process::CommandExt;
|
||||
|
||||
let pty_system = native_pty_system();
|
||||
let pair = pty_system
|
||||
.openpty(PtySize {
|
||||
rows: 24,
|
||||
cols: 80,
|
||||
pixel_width: 0,
|
||||
pixel_height: 0,
|
||||
})
|
||||
.expect("failed to open pty");
|
||||
let pool = AgentPool::new_test(3002);
|
||||
let tmp = tempfile::tempdir().unwrap();
|
||||
let worktree = tmp.path();
|
||||
|
||||
let mut cmd = CommandBuilder::new("sleep");
|
||||
cmd.arg("100");
|
||||
let mut child = pair
|
||||
.slave
|
||||
.spawn_command(cmd)
|
||||
.expect("failed to spawn sleep");
|
||||
let pid = child.process_id().expect("no pid");
|
||||
// argv[0] = worktree path → pgrep -f <path> finds this process.
|
||||
let mut child = Command::new("sleep")
|
||||
.arg0(worktree.to_string_lossy().as_ref())
|
||||
.arg("100")
|
||||
.spawn()
|
||||
.expect("spawn sleeper");
|
||||
let pid = child.id();
|
||||
|
||||
pool.inject_child_killer("story:agent", child.clone_killer());
|
||||
// Give pgrep a moment to see the new process.
|
||||
std::thread::sleep(std::time::Duration::from_millis(100));
|
||||
|
||||
pool.inject_test_agent_with_path(
|
||||
"story-1090-kill",
|
||||
"coder",
|
||||
AgentStatus::Running,
|
||||
worktree.to_path_buf(),
|
||||
);
|
||||
|
||||
assert!(
|
||||
process_is_running(pid),
|
||||
"process {pid} should be running before kill_all_children"
|
||||
"sleeper pid {pid} should be running before kill_child_for_key"
|
||||
);
|
||||
|
||||
pool.kill_all_children();
|
||||
let _ = child.wait();
|
||||
pool.kill_child_for_key("story-1090-kill:coder");
|
||||
let _ = child.wait(); // reap zombie so ps -p returns false
|
||||
|
||||
assert!(
|
||||
!process_is_running(pid),
|
||||
"process {pid} should have been killed by kill_all_children"
|
||||
"sleeper pid {pid} should be dead after kill_child_for_key"
|
||||
);
|
||||
}
|
||||
|
||||
/// AC 5 — `kill_all_children` SIGKILLs all agents' processes. Two agents
|
||||
/// with distinct worktree paths are injected; both must be gone after the call.
|
||||
#[test]
|
||||
fn kill_all_children_clears_registry() {
|
||||
let pool = AgentPool::new_test(3001);
|
||||
fn kill_all_children_kills_multiple_real_processes() {
|
||||
use std::os::unix::process::CommandExt;
|
||||
|
||||
let pty_system = native_pty_system();
|
||||
let pair = pty_system
|
||||
.openpty(PtySize {
|
||||
rows: 24,
|
||||
cols: 80,
|
||||
pixel_width: 0,
|
||||
pixel_height: 0,
|
||||
let pool = AgentPool::new_test(3003);
|
||||
|
||||
let mut sleepers: Vec<(u32, std::process::Child, tempfile::TempDir)> = (0..2_u32)
|
||||
.map(|i| {
|
||||
let tmp = tempfile::tempdir().unwrap();
|
||||
let worktree = tmp.path();
|
||||
// argv[0] = worktree path for pgrep discoverability.
|
||||
let child = Command::new("sleep")
|
||||
.arg0(worktree.to_string_lossy().as_ref())
|
||||
.arg("100")
|
||||
.spawn()
|
||||
.expect("spawn sleeper");
|
||||
let pid = child.id();
|
||||
pool.inject_test_agent_with_path(
|
||||
&format!("story-1090-all-{i}"),
|
||||
"coder",
|
||||
AgentStatus::Running,
|
||||
worktree.to_path_buf(),
|
||||
);
|
||||
(pid, child, tmp)
|
||||
})
|
||||
.expect("failed to open pty");
|
||||
.collect();
|
||||
|
||||
let mut cmd = CommandBuilder::new("sleep");
|
||||
cmd.arg("1");
|
||||
let mut child = pair
|
||||
.slave
|
||||
.spawn_command(cmd)
|
||||
.expect("failed to spawn sleep");
|
||||
// Give pgrep a moment to see the new processes.
|
||||
std::thread::sleep(std::time::Duration::from_millis(100));
|
||||
|
||||
pool.inject_child_killer("story:agent", child.clone_killer());
|
||||
assert_eq!(pool.child_killer_count(), 1);
|
||||
for (pid, _, _) in &sleepers {
|
||||
assert!(
|
||||
process_is_running(*pid),
|
||||
"pid {pid} should be running before kill_all_children"
|
||||
);
|
||||
}
|
||||
|
||||
pool.kill_all_children();
|
||||
let _ = child.wait();
|
||||
|
||||
assert_eq!(
|
||||
pool.child_killer_count(),
|
||||
0,
|
||||
"child_killers should be cleared after kill_all_children"
|
||||
);
|
||||
for (pid, child, _tmp) in &mut sleepers {
|
||||
let _ = child.wait(); // reap zombie
|
||||
assert!(
|
||||
!process_is_running(*pid),
|
||||
"pid {pid} should be dead after kill_all_children"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -271,6 +271,42 @@ impl AgentPool {
|
||||
'{conflicting_name}' is already active at the same pipeline stage"
|
||||
));
|
||||
}
|
||||
// Cross-stage LLM agent guard: reject if any Coder/Qa/Mergemaster agent
|
||||
// is already Running or Pending on this story at a *different* pipeline stage.
|
||||
// These are stale agents left over from a previous stage transition that has
|
||||
// since advanced. The periodic reconciler (reconcile_canonical_agents) stops
|
||||
// them; here we surface the conflict so the caller waits for reconciliation.
|
||||
if matches!(
|
||||
resolved_stage,
|
||||
PipelineStage::Coder | PipelineStage::Qa | PipelineStage::Mergemaster
|
||||
) && let Some(stale_name) = agents.iter().find_map(|(k, a)| {
|
||||
let k_story = k.rsplit_once(':').map(|(s, _)| s).unwrap_or(k);
|
||||
if k_story != story_id || a.agent_name == resolved_name {
|
||||
return None;
|
||||
}
|
||||
if !matches!(a.status, AgentStatus::Running | AgentStatus::Pending) {
|
||||
return None;
|
||||
}
|
||||
let a_stage = config
|
||||
.find_agent(&a.agent_name)
|
||||
.map(agent_config_stage)
|
||||
.unwrap_or_else(|| pipeline_stage(&a.agent_name));
|
||||
if matches!(
|
||||
a_stage,
|
||||
PipelineStage::Coder | PipelineStage::Qa | PipelineStage::Mergemaster
|
||||
) && a_stage != resolved_stage
|
||||
{
|
||||
Some(a.agent_name.clone())
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}) {
|
||||
return Err(format!(
|
||||
"story '{story_id}' already has an active LLM agent '{stale_name}'; \
|
||||
refusing to spawn '{resolved_name}'"
|
||||
));
|
||||
}
|
||||
|
||||
// Enforce single-instance concurrency for explicitly-named agents:
|
||||
// if this agent is already running on any other story, reject.
|
||||
// Auto-selected agents are already guaranteed idle by
|
||||
@@ -392,7 +428,6 @@ impl AgentPool {
|
||||
event_log.clone(),
|
||||
self.port,
|
||||
log_writer.clone(),
|
||||
self.child_killers.clone(),
|
||||
self.watcher_tx.clone(),
|
||||
inactivity_timeout_secs,
|
||||
prior_events,
|
||||
|
||||
@@ -8,7 +8,6 @@ use std::collections::HashMap;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
use portable_pty::ChildKiller;
|
||||
use tokio::sync::broadcast;
|
||||
|
||||
use crate::agent_log::AgentLogWriter;
|
||||
@@ -135,7 +134,6 @@ pub(super) async fn run_agent_spawn(
|
||||
event_log: Arc<Mutex<Vec<AgentEvent>>>,
|
||||
port: u16,
|
||||
log_writer: Option<Arc<Mutex<AgentLogWriter>>>,
|
||||
child_killers: Arc<Mutex<HashMap<String, Box<dyn ChildKiller + Send + Sync>>>>,
|
||||
watcher_tx: broadcast::Sender<WatcherEvent>,
|
||||
inactivity_timeout_secs: u64,
|
||||
// Formatted `<recent-events>` block drained from the previous session's
|
||||
@@ -159,7 +157,6 @@ pub(super) async fn run_agent_spawn(
|
||||
let log_clone = event_log;
|
||||
let port_for_task = port;
|
||||
let log_writer_clone = log_writer;
|
||||
let child_killers_clone = child_killers;
|
||||
let watcher_tx_clone = watcher_tx;
|
||||
let _ = inactivity_timeout_secs; // currently unused inside the closure body
|
||||
|
||||
@@ -371,8 +368,7 @@ pub(super) async fn run_agent_spawn(
|
||||
|
||||
let run_result = match runtime_name {
|
||||
"claude-code" => {
|
||||
let runtime =
|
||||
ClaudeCodeRuntime::new(child_killers_clone.clone(), watcher_tx_clone.clone());
|
||||
let runtime = ClaudeCodeRuntime::new(watcher_tx_clone.clone());
|
||||
let ctx = RuntimeContext {
|
||||
story_id: sid.clone(),
|
||||
agent_name: aname.clone(),
|
||||
@@ -566,7 +562,6 @@ pub(super) async fn run_agent_spawn(
|
||||
let pool = AgentPool {
|
||||
agents: agents_for_respawn,
|
||||
port: port_r,
|
||||
child_killers: Arc::new(Mutex::new(HashMap::new())),
|
||||
watcher_tx: watcher_for_respawn,
|
||||
status_broadcaster: Arc::new(
|
||||
crate::service::status::StatusBroadcaster::new(),
|
||||
@@ -654,7 +649,6 @@ pub(super) async fn run_agent_spawn(
|
||||
let pool = AgentPool {
|
||||
agents: agents_for_cd,
|
||||
port: port_for_cd,
|
||||
child_killers: Arc::new(Mutex::new(HashMap::new())),
|
||||
watcher_tx: watcher_for_cd,
|
||||
status_broadcaster: Arc::new(
|
||||
crate::service::status::StatusBroadcaster::new(),
|
||||
@@ -774,7 +768,6 @@ pub(super) async fn run_agent_spawn(
|
||||
let pool = AgentPool {
|
||||
agents: agents_for_cd,
|
||||
port: port_for_cd,
|
||||
child_killers: Arc::new(Mutex::new(HashMap::new())),
|
||||
watcher_tx: watcher_for_cd,
|
||||
status_broadcaster: Arc::new(
|
||||
crate::service::status::StatusBroadcaster::new(),
|
||||
@@ -815,6 +808,7 @@ pub(super) async fn run_agent_spawn(
|
||||
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryTotalAttempts(
|
||||
&sid,
|
||||
));
|
||||
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryReadSet(&sid));
|
||||
|
||||
// Remove agent from the pool and unblock any wait_for_agent callers.
|
||||
let tx_done = {
|
||||
@@ -862,7 +856,6 @@ pub(super) async fn run_agent_spawn(
|
||||
let pool = AgentPool {
|
||||
agents: agents_for_respawn,
|
||||
port: port_r,
|
||||
child_killers: Arc::new(Mutex::new(HashMap::new())),
|
||||
watcher_tx: watcher_for_respawn,
|
||||
status_broadcaster: Arc::new(
|
||||
crate::service::status::StatusBroadcaster::new(),
|
||||
@@ -881,6 +874,17 @@ pub(super) async fn run_agent_spawn(
|
||||
return;
|
||||
}
|
||||
|
||||
// AC1 (story 1089): mark forced exits so the commit-recovery
|
||||
// stuck counter is not incremented for API errors, network
|
||||
// failures, or Claude-API budget exhaustion. A non-zero exit
|
||||
// code means the CLI was forced out, not that it chose to stop.
|
||||
if !result.exit_ok {
|
||||
crate::db::write_content(
|
||||
crate::db::ContentKey::CommitRecoveryForcedExit(&sid),
|
||||
"1",
|
||||
);
|
||||
}
|
||||
|
||||
// Server-owned completion: run acceptance gates automatically
|
||||
// when the agent process exits normally.
|
||||
super::super::pipeline::run_server_owned_completion(
|
||||
@@ -1254,12 +1258,13 @@ mod tests {
|
||||
"abc123",
|
||||
);
|
||||
|
||||
// Rate-limit exit handler: reset all three counters (the fix).
|
||||
// Rate-limit exit handler: reset all counters (the fix).
|
||||
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryPending(story_id));
|
||||
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryDiffFingerprint(
|
||||
story_id,
|
||||
));
|
||||
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryTotalAttempts(story_id));
|
||||
crate::db::delete_content(crate::db::ContentKey::CommitRecoveryReadSet(story_id));
|
||||
|
||||
// CommitRecoveryPending must be cleared after each rate-limit exit.
|
||||
assert!(
|
||||
|
||||
@@ -602,6 +602,266 @@ async fn start_agent_allows_correct_stage_agent() {
|
||||
}
|
||||
}
|
||||
|
||||
// ── story-1100: cross-stage LLM agent rejection ─────────────────────────
|
||||
|
||||
#[tokio::test]
|
||||
async fn start_agent_rejects_mergemaster_when_coder_running_same_story() {
|
||||
use std::fs;
|
||||
|
||||
let tmp = tempfile::tempdir().unwrap();
|
||||
let root = tmp.path();
|
||||
|
||||
let sk_dir = root.join(".huskies");
|
||||
fs::create_dir_all(&sk_dir).unwrap();
|
||||
fs::write(
|
||||
sk_dir.join("project.toml"),
|
||||
"[[agent]]\nname = \"coder-1\"\nstage = \"coder\"\n\n\
|
||||
[[agent]]\nname = \"mergemaster\"\nstage = \"mergemaster\"\n",
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let pool = AgentPool::new_test(3099);
|
||||
pool.inject_test_agent("999_story_cross", "coder-1", AgentStatus::Running);
|
||||
|
||||
let result = pool
|
||||
.start_agent(root, "999_story_cross", Some("mergemaster"), None, None)
|
||||
.await;
|
||||
|
||||
assert!(
|
||||
result.is_err(),
|
||||
"mergemaster must be rejected when coder-1 is still running on same story"
|
||||
);
|
||||
let err = result.unwrap_err();
|
||||
assert!(
|
||||
err.contains("active LLM agent") || err.contains("stale agent"),
|
||||
"error must mention active LLM agent conflict, got: '{err}'"
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn start_agent_rejects_coder_when_mergemaster_running_same_story() {
|
||||
use std::fs;
|
||||
|
||||
let tmp = tempfile::tempdir().unwrap();
|
||||
let root = tmp.path();
|
||||
|
||||
let sk_dir = root.join(".huskies");
|
||||
fs::create_dir_all(&sk_dir).unwrap();
|
||||
fs::write(
|
||||
sk_dir.join("project.toml"),
|
||||
"[[agent]]\nname = \"coder-1\"\nstage = \"coder\"\n\n\
|
||||
[[agent]]\nname = \"mergemaster\"\nstage = \"mergemaster\"\n",
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let pool = AgentPool::new_test(3099);
|
||||
pool.inject_test_agent("888_story_cross2", "mergemaster", AgentStatus::Running);
|
||||
|
||||
let result = pool
|
||||
.start_agent(root, "888_story_cross2", Some("coder-1"), None, None)
|
||||
.await;
|
||||
|
||||
assert!(
|
||||
result.is_err(),
|
||||
"coder-1 must be rejected when mergemaster is running on same story"
|
||||
);
|
||||
let err = result.unwrap_err();
|
||||
assert!(
|
||||
err.contains("active LLM agent") || err.contains("stale agent"),
|
||||
"error must mention active LLM agent conflict, got: '{err}'"
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn start_agent_cross_stage_does_not_block_different_stories() {
|
||||
use std::fs;
|
||||
|
||||
let tmp = tempfile::tempdir().unwrap();
|
||||
let root = tmp.path();
|
||||
|
||||
let sk_dir = root.join(".huskies");
|
||||
fs::create_dir_all(sk_dir.join("work/1_backlog")).unwrap();
|
||||
fs::write(
|
||||
root.join(".huskies/project.toml"),
|
||||
"[[agent]]\nname = \"coder-1\"\nstage = \"coder\"\n\n\
|
||||
[[agent]]\nname = \"mergemaster\"\nstage = \"mergemaster\"\n",
|
||||
)
|
||||
.unwrap();
|
||||
fs::write(
|
||||
root.join(".huskies/work/1_backlog/777_story_other.md"),
|
||||
"---\nname: Other\n---\n",
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let pool = AgentPool::new_test(3099);
|
||||
// mergemaster running on story-x should NOT block coder on story-y
|
||||
pool.inject_test_agent("111_story_x", "mergemaster", AgentStatus::Running);
|
||||
|
||||
let result = pool
|
||||
.start_agent(root, "777_story_other", Some("coder-1"), None, None)
|
||||
.await;
|
||||
|
||||
if let Err(ref e) = result {
|
||||
assert!(
|
||||
!e.contains("active LLM agent") && !e.contains("stale agent"),
|
||||
"cross-stage guard must not fire for agents on different stories, got: '{e}'"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn reconcile_canonical_agents_stops_stale_coder_in_qa_stage() {
|
||||
use std::fs;
|
||||
|
||||
let tmp = tempfile::tempdir().unwrap();
|
||||
let root = tmp.path();
|
||||
|
||||
let sk_dir = root.join(".huskies");
|
||||
fs::create_dir_all(&sk_dir).unwrap();
|
||||
fs::write(
|
||||
sk_dir.join("project.toml"),
|
||||
"[[agent]]\nname = \"coder-1\"\nstage = \"coder\"\n",
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
// Write story to CRDT in QA stage: canonical = Qa, but coder-1 is Running.
|
||||
crate::db::ensure_content_store();
|
||||
crate::db::write_item_with_content(
|
||||
"777_story_reconcile",
|
||||
"qa",
|
||||
"---\nname: Reconcile Test\n---\n",
|
||||
crate::db::ItemMeta::named("Reconcile Test"),
|
||||
);
|
||||
|
||||
let pool = AgentPool::new_test(3099);
|
||||
pool.inject_test_agent("777_story_reconcile", "coder-1", AgentStatus::Running);
|
||||
|
||||
let before = pool.list_agents().unwrap();
|
||||
assert!(
|
||||
before.iter().any(|a| a.agent_name == "coder-1"
|
||||
&& matches!(a.status, AgentStatus::Running | AgentStatus::Pending)),
|
||||
"coder-1 should be Running before reconciliation"
|
||||
);
|
||||
|
||||
pool.reconcile_canonical_agents(root).await;
|
||||
|
||||
let after = pool.list_agents().unwrap();
|
||||
let still_active = after.iter().any(|a| {
|
||||
a.story_id == "777_story_reconcile"
|
||||
&& a.agent_name == "coder-1"
|
||||
&& matches!(a.status, AgentStatus::Running | AgentStatus::Pending)
|
||||
});
|
||||
assert!(
|
||||
!still_active,
|
||||
"reconciler must have stopped coder-1 (CRDT stage is QA, coder is wrong stage)"
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn reconcile_canonical_agents_leaves_correct_stage_agent_alone() {
|
||||
use std::fs;
|
||||
|
||||
let tmp = tempfile::tempdir().unwrap();
|
||||
let root = tmp.path();
|
||||
|
||||
let sk_dir = root.join(".huskies");
|
||||
fs::create_dir_all(&sk_dir).unwrap();
|
||||
fs::write(
|
||||
sk_dir.join("project.toml"),
|
||||
"[[agent]]\nname = \"coder-1\"\nstage = \"coder\"\n",
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
// Story is in coding stage: canonical = Coder. coder-1 is correct.
|
||||
crate::db::ensure_content_store();
|
||||
crate::db::write_item_with_content(
|
||||
"555_story_correct",
|
||||
"coding",
|
||||
"---\nname: Correct Stage\n---\n",
|
||||
crate::db::ItemMeta::named("Correct Stage"),
|
||||
);
|
||||
|
||||
let pool = AgentPool::new_test(3099);
|
||||
pool.inject_test_agent("555_story_correct", "coder-1", AgentStatus::Running);
|
||||
|
||||
pool.reconcile_canonical_agents(root).await;
|
||||
|
||||
let after = pool.list_agents().unwrap();
|
||||
let still_active = after.iter().any(|a| {
|
||||
a.story_id == "555_story_correct"
|
||||
&& a.agent_name == "coder-1"
|
||||
&& matches!(a.status, AgentStatus::Running | AgentStatus::Pending)
|
||||
});
|
||||
assert!(
|
||||
still_active,
|
||||
"reconciler must NOT stop coder-1 when it matches the canonical stage"
|
||||
);
|
||||
}
|
||||
|
||||
/// Regression test for story 1100: a stale coder left running after a stage
|
||||
/// transition blocks both a same-stage coder and a cross-stage mergemaster.
|
||||
/// The periodic reconciler stops the stale coder, after which the pool no
|
||||
/// longer has a cross-stage conflict.
|
||||
#[tokio::test]
|
||||
async fn regression_1100_stale_coder_blocks_mergemaster_then_reconciler_clears() {
|
||||
use std::fs;
|
||||
|
||||
let tmp = tempfile::tempdir().unwrap();
|
||||
let root = tmp.path();
|
||||
|
||||
let sk_dir = root.join(".huskies");
|
||||
fs::create_dir_all(&sk_dir).unwrap();
|
||||
fs::write(
|
||||
sk_dir.join("project.toml"),
|
||||
"[[agent]]\nname = \"coder-1\"\nstage = \"coder\"\n\n\
|
||||
[[agent]]\nname = \"coder-2\"\nstage = \"coder\"\n\n\
|
||||
[[agent]]\nname = \"mergemaster\"\nstage = \"mergemaster\"\n",
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let pool = AgentPool::new_test(3099);
|
||||
// Simulate coder-1 still Running after the story advanced past the coding stage.
|
||||
pool.inject_test_agent("1100_reg", "coder-1", AgentStatus::Running);
|
||||
|
||||
// coder-2 blocked by same-stage check (both are Coder stage)
|
||||
let r1 = pool
|
||||
.start_agent(root, "1100_reg", Some("coder-2"), None, None)
|
||||
.await;
|
||||
assert!(r1.is_err(), "coder-2 must be rejected by same-stage guard");
|
||||
assert!(
|
||||
r1.unwrap_err().contains("same pipeline stage"),
|
||||
"same-stage check must fire for coder-2"
|
||||
);
|
||||
|
||||
// mergemaster blocked by cross-stage LLM guard (coder-1 is a different LLM stage)
|
||||
let r2 = pool
|
||||
.start_agent(root, "1100_reg", Some("mergemaster"), None, None)
|
||||
.await;
|
||||
assert!(
|
||||
r2.is_err(),
|
||||
"mergemaster must be rejected because coder-1 (different LLM stage) is still running"
|
||||
);
|
||||
let r2_err = r2.unwrap_err();
|
||||
assert!(
|
||||
r2_err.contains("active LLM agent") || r2_err.contains("stale agent"),
|
||||
"cross-stage rejection expected, got: '{r2_err}'"
|
||||
);
|
||||
|
||||
// Reconciler: story "1100_reg" has no CRDT entry → canonical = None → stop coder-1.
|
||||
pool.reconcile_canonical_agents(root).await;
|
||||
|
||||
// coder-1 must be gone from the active pool.
|
||||
let remaining = pool.list_agents().unwrap();
|
||||
assert!(
|
||||
!remaining.iter().any(|a| {
|
||||
a.story_id == "1100_reg"
|
||||
&& a.agent_name == "coder-1"
|
||||
&& matches!(a.status, AgentStatus::Running | AgentStatus::Pending)
|
||||
}),
|
||||
"reconciler must have removed stale coder-1 from the active pool"
|
||||
);
|
||||
}
|
||||
|
||||
/// Bug 502: when start_agent is called for a non-Coder agent (mergemaster
|
||||
/// or qa) on a story that's in 4_merge/, the unconditional
|
||||
/// move_story_to_current at the top of start_agent must NOT fire — even
|
||||
|
||||
@@ -2,11 +2,11 @@
|
||||
|
||||
use std::path::Path;
|
||||
|
||||
use crate::config::ProjectConfig;
|
||||
use crate::pipeline_state::Stage;
|
||||
|
||||
use super::super::super::{PipelineStage, agent_config_stage, pipeline_stage};
|
||||
use super::super::super::{
|
||||
PipelineStage, agent_config_stage, canonical_pipeline_stage, pipeline_stage,
|
||||
};
|
||||
use super::super::worktree::find_active_story_stage;
|
||||
use crate::config::ProjectConfig;
|
||||
|
||||
/// Validate that an explicit `agent_name` is allowed to attach to `story_id`'s
|
||||
/// current pipeline stage.
|
||||
@@ -34,16 +34,15 @@ pub(super) fn validate_agent_stage(
|
||||
let Some(story_stage) = find_active_story_stage(project_root, story_id) else {
|
||||
return Ok(());
|
||||
};
|
||||
let expected_stage = match story_stage {
|
||||
Stage::Coding { .. } => PipelineStage::Coder,
|
||||
Stage::Qa => PipelineStage::Qa,
|
||||
Stage::Merge { .. } => PipelineStage::Mergemaster,
|
||||
_ => PipelineStage::Other,
|
||||
};
|
||||
if expected_stage != PipelineStage::Other && expected_stage != agent_stage {
|
||||
let canonical = canonical_pipeline_stage(&story_stage);
|
||||
let is_llm = matches!(
|
||||
agent_stage,
|
||||
PipelineStage::Coder | PipelineStage::Qa | PipelineStage::Mergemaster
|
||||
);
|
||||
if is_llm && (canonical.is_none() || canonical.as_ref() != Some(&agent_stage)) {
|
||||
return Err(format!(
|
||||
"Agent '{name}' (stage: {agent_stage:?}) cannot be assigned to \
|
||||
story '{story_id}' in {}/ (requires stage: {expected_stage:?})",
|
||||
story '{story_id}' in {}/ (requires stage: {canonical:?})",
|
||||
story_stage.dir_name()
|
||||
));
|
||||
}
|
||||
|
||||
@@ -5,7 +5,10 @@ use crate::slog_error;
|
||||
use crate::slog_warn;
|
||||
use std::path::Path;
|
||||
|
||||
use super::super::{AgentEvent, AgentStatus};
|
||||
use super::super::{
|
||||
AgentEvent, AgentStatus, PipelineStage, agent_config_stage, canonical_pipeline_stage,
|
||||
pipeline_stage,
|
||||
};
|
||||
use super::AgentPool;
|
||||
use super::types::composite_key;
|
||||
|
||||
@@ -71,8 +74,7 @@ impl AgentPool {
|
||||
self.kill_child_for_key(&key);
|
||||
}
|
||||
|
||||
// Step 3: now safe to mutate. Status flip, handle abort, drop the
|
||||
// child_killers entry.
|
||||
// Step 3: now safe to mutate. Status flip and handle abort.
|
||||
let (task_handle, tx) = {
|
||||
let mut agents = self.agents.lock().map_err(|e| e.to_string())?;
|
||||
let agent = agents
|
||||
@@ -88,9 +90,6 @@ impl AgentPool {
|
||||
handle.abort();
|
||||
let _ = handle.await;
|
||||
}
|
||||
if let Ok(mut killers) = self.child_killers.lock() {
|
||||
killers.remove(&key);
|
||||
}
|
||||
|
||||
// Preserve worktree for inspection — don't destroy agent's work on stop.
|
||||
if let Some(ref wt) = worktree_info {
|
||||
@@ -118,6 +117,82 @@ impl AgentPool {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Stop LLM agents whose pipeline stage no longer matches the story's canonical stage.
|
||||
///
|
||||
/// Called periodically by the tick loop (story 1100). For each Running or Pending
|
||||
/// LLM agent (Coder, Qa, or Mergemaster) whose stage does not match the canonical
|
||||
/// stage derived from the story's current CRDT state, the agent is stopped via the
|
||||
/// existing SIGKILL path. Idempotent: agents already at the correct stage are left
|
||||
/// untouched. Also stops LLM agents on stories that have no active pipeline stage
|
||||
/// (terminal, blocked, or frozen), since no LLM agent should run there.
|
||||
pub async fn reconcile_canonical_agents(&self, root: &std::path::Path) {
|
||||
use crate::config::ProjectConfig;
|
||||
|
||||
let config = match ProjectConfig::load(root) {
|
||||
Ok(c) => c,
|
||||
Err(e) => {
|
||||
slog_warn!("[reconcile] Cannot load config for canonical reconcile: {e}");
|
||||
return;
|
||||
}
|
||||
};
|
||||
|
||||
// Snapshot active LLM agents without holding the lock during async stops.
|
||||
let snapshot: Vec<(String, String, PipelineStage)> = {
|
||||
let Ok(agents) = self.agents.lock() else {
|
||||
return;
|
||||
};
|
||||
agents
|
||||
.iter()
|
||||
.filter_map(|(key, a)| {
|
||||
if !matches!(a.status, AgentStatus::Running | AgentStatus::Pending) {
|
||||
return None;
|
||||
}
|
||||
let stage = config
|
||||
.find_agent(&a.agent_name)
|
||||
.map(agent_config_stage)
|
||||
.unwrap_or_else(|| pipeline_stage(&a.agent_name));
|
||||
if !matches!(
|
||||
stage,
|
||||
PipelineStage::Coder | PipelineStage::Qa | PipelineStage::Mergemaster
|
||||
) {
|
||||
return None;
|
||||
}
|
||||
let story_id = key
|
||||
.rsplit_once(':')
|
||||
.map(|(s, _)| s)
|
||||
.unwrap_or(key)
|
||||
.to_string();
|
||||
Some((story_id, a.agent_name.clone(), stage))
|
||||
})
|
||||
.collect()
|
||||
};
|
||||
|
||||
for (story_id, agent_name, agent_stage) in snapshot {
|
||||
let canonical = crate::pipeline_state::read_typed(&story_id)
|
||||
.ok()
|
||||
.flatten()
|
||||
.and_then(|item| canonical_pipeline_stage(&item.stage));
|
||||
|
||||
let should_stop = match &canonical {
|
||||
None => true,
|
||||
Some(c) if *c != agent_stage => true,
|
||||
_ => false,
|
||||
};
|
||||
|
||||
if !should_stop {
|
||||
continue;
|
||||
}
|
||||
|
||||
slog!(
|
||||
"[reconcile] stopping '{agent_name}' on '{story_id}': \
|
||||
canonical={canonical:?} actual={agent_stage:?}"
|
||||
);
|
||||
if let Err(e) = self.stop_agent(root, &story_id, &agent_name).await {
|
||||
slog_warn!("[reconcile] failed to stop '{agent_name}' on '{story_id}': {e}");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Remove all agent entries for a given story_id from the pool.
|
||||
///
|
||||
/// Called when a story is archived so that stale entries don't accumulate.
|
||||
|
||||
@@ -33,6 +33,8 @@ pub(super) fn find_active_story_stage(
|
||||
crate::pipeline_state::Stage::Coding { .. }
|
||||
| crate::pipeline_state::Stage::Qa
|
||||
| crate::pipeline_state::Stage::Merge { .. }
|
||||
| crate::pipeline_state::Stage::MergeFailure { .. }
|
||||
| crate::pipeline_state::Stage::MergeFailureFinal { .. }
|
||||
)
|
||||
{
|
||||
return Some(item.stage);
|
||||
|
||||
@@ -13,7 +13,6 @@ mod tests {
|
||||
use super::*;
|
||||
use crate::agents::AgentEvent;
|
||||
use crate::io::watcher::WatcherEvent;
|
||||
use std::collections::HashMap;
|
||||
use std::sync::{Arc, Mutex};
|
||||
use tokio::sync::broadcast;
|
||||
|
||||
@@ -41,7 +40,6 @@ mod tests {
|
||||
let (tx, _rx) = broadcast::channel::<AgentEvent>(64);
|
||||
let (watcher_tx, mut watcher_rx) = broadcast::channel::<WatcherEvent>(16);
|
||||
let event_log = Arc::new(Mutex::new(Vec::new()));
|
||||
let child_killers = Arc::new(Mutex::new(HashMap::new()));
|
||||
|
||||
// sh -p "--" <script>: -p = privileged mode, "--" = end options,
|
||||
// then the script path is the file operand.
|
||||
@@ -56,7 +54,6 @@ mod tests {
|
||||
&event_log,
|
||||
None,
|
||||
0,
|
||||
child_killers,
|
||||
watcher_tx,
|
||||
None,
|
||||
None,
|
||||
@@ -98,7 +95,6 @@ mod tests {
|
||||
let (tx, _rx) = broadcast::channel::<AgentEvent>(64);
|
||||
let (watcher_tx, mut watcher_rx) = broadcast::channel::<WatcherEvent>(16);
|
||||
let event_log = Arc::new(Mutex::new(Vec::new()));
|
||||
let child_killers = Arc::new(Mutex::new(HashMap::new()));
|
||||
|
||||
let result = run_agent_pty_streaming(
|
||||
"423_story_rate_limit",
|
||||
@@ -111,7 +107,6 @@ mod tests {
|
||||
&event_log,
|
||||
None,
|
||||
0,
|
||||
child_killers,
|
||||
watcher_tx,
|
||||
None,
|
||||
None,
|
||||
@@ -160,7 +155,6 @@ mod tests {
|
||||
let (tx, _rx) = broadcast::channel::<AgentEvent>(64);
|
||||
let (watcher_tx, mut watcher_rx) = broadcast::channel::<WatcherEvent>(16);
|
||||
let event_log = Arc::new(Mutex::new(Vec::new()));
|
||||
let child_killers = Arc::new(Mutex::new(HashMap::new()));
|
||||
|
||||
let before = chrono::Utc::now();
|
||||
let result = run_agent_pty_streaming(
|
||||
@@ -174,7 +168,6 @@ mod tests {
|
||||
&event_log,
|
||||
None,
|
||||
0,
|
||||
child_killers,
|
||||
watcher_tx,
|
||||
None,
|
||||
None,
|
||||
@@ -229,7 +222,6 @@ mod tests {
|
||||
let (tx, _rx) = broadcast::channel::<AgentEvent>(64);
|
||||
let (watcher_tx, _watcher_rx) = broadcast::channel::<WatcherEvent>(16);
|
||||
let event_log = Arc::new(Mutex::new(Vec::new()));
|
||||
let child_killers = Arc::new(Mutex::new(HashMap::new()));
|
||||
|
||||
let result = run_agent_pty_streaming(
|
||||
"916_story_rate_limit_extension",
|
||||
@@ -242,7 +234,6 @@ mod tests {
|
||||
&event_log,
|
||||
None,
|
||||
1, // inactivity_timeout_secs = 1s; would expire before the 3s sleep without the extension
|
||||
child_killers,
|
||||
watcher_tx,
|
||||
None,
|
||||
None,
|
||||
@@ -407,18 +398,16 @@ mod tests {
|
||||
let (tx, _rx) = broadcast::channel::<AgentEvent>(64);
|
||||
let (watcher_tx, _watcher_rx) = broadcast::channel::<WatcherEvent>(16);
|
||||
let event_log = Arc::new(Mutex::new(Vec::new()));
|
||||
let child_killers: Arc<
|
||||
Mutex<HashMap<String, Box<dyn portable_pty::ChildKiller + Send + Sync>>>,
|
||||
> = Arc::new(Mutex::new(HashMap::new()));
|
||||
let child_killers_for_kill = Arc::clone(&child_killers);
|
||||
|
||||
// Spawn a task to kill the child after a short delay (simulating watchdog).
|
||||
// Uses pids_matching on the script path — same mechanism as the production
|
||||
// watchdog after the process_kill migration (story 1090).
|
||||
let script_path_for_kill = script.to_string_lossy().to_string();
|
||||
tokio::spawn(async move {
|
||||
tokio::time::sleep(tokio::time::Duration::from_millis(500)).await;
|
||||
if let Ok(mut killers) = child_killers_for_kill.lock() {
|
||||
for (_, killer) in killers.iter_mut() {
|
||||
let _ = killer.kill();
|
||||
}
|
||||
let pids = crate::process_kill::pids_matching(&script_path_for_kill);
|
||||
if !pids.is_empty() {
|
||||
let _ = crate::process_kill::sigkill_pids_and_verify(&pids);
|
||||
}
|
||||
});
|
||||
|
||||
@@ -435,7 +424,6 @@ mod tests {
|
||||
&event_log,
|
||||
None,
|
||||
0, // no inactivity timeout
|
||||
child_killers,
|
||||
watcher_tx,
|
||||
None, // no session to resume
|
||||
Some((project_root.clone(), "sonnet".to_string())),
|
||||
@@ -457,4 +445,62 @@ mod tests {
|
||||
the respawn's lookup_session returns it (warm), not None (cold)"
|
||||
);
|
||||
}
|
||||
|
||||
// ── bug 1103: soft rate-limit warning (status=allowed) must NOT set rate_limit_exit ──
|
||||
|
||||
/// Regression: a `rate_limit_event` with `status="allowed"` is a soft
|
||||
/// warning — the request was permitted. The session that follows should
|
||||
/// complete normally and report `rate_limit_exit == false`, not trigger the
|
||||
/// rate-limit respawn path in the spawn handler.
|
||||
#[tokio::test]
|
||||
async fn rate_limit_allowed_status_does_not_set_rate_limit_exit() {
|
||||
use std::os::unix::fs::PermissionsExt;
|
||||
|
||||
let tmp = tempfile::tempdir().unwrap();
|
||||
let script = tmp.path().join("emit_allowed_then_exit.sh");
|
||||
// Emit status="allowed" (soft warning), then exit cleanly.
|
||||
std::fs::write(
|
||||
&script,
|
||||
"#!/bin/sh\nprintf '%s\\n' '{\"type\":\"rate_limit_event\",\"rate_limit_info\":{\"status\":\"allowed\",\"reset_at\":\"2099-01-01T12:00:00Z\"}}'\n",
|
||||
)
|
||||
.unwrap();
|
||||
std::fs::set_permissions(&script, std::fs::Permissions::from_mode(0o755)).unwrap();
|
||||
|
||||
let (tx, _rx) = broadcast::channel::<AgentEvent>(64);
|
||||
let (watcher_tx, mut watcher_rx) = broadcast::channel::<WatcherEvent>(16);
|
||||
let event_log = Arc::new(Mutex::new(Vec::new()));
|
||||
|
||||
let result = run_agent_pty_streaming(
|
||||
"1103_soft_warning_no_exit_flag",
|
||||
"coder-1",
|
||||
"sh",
|
||||
&[script.to_string_lossy().to_string()],
|
||||
"--",
|
||||
"/tmp",
|
||||
&tx,
|
||||
&event_log,
|
||||
None,
|
||||
0,
|
||||
watcher_tx,
|
||||
None,
|
||||
None,
|
||||
)
|
||||
.await;
|
||||
|
||||
let pty = result.expect("PTY run should succeed");
|
||||
assert!(
|
||||
!pty.rate_limit_exit,
|
||||
"rate_limit_exit must be false for a soft 'allowed' warning; \
|
||||
only genuine hard blocks (rejected) should set it"
|
||||
);
|
||||
|
||||
// Watcher must have received RateLimitWarning, not RateLimitHardBlock.
|
||||
let evt = watcher_rx
|
||||
.try_recv()
|
||||
.expect("Expected a RateLimitWarning watcher event");
|
||||
assert!(
|
||||
matches!(evt, WatcherEvent::RateLimitWarning { .. }),
|
||||
"Expected RateLimitWarning for status=allowed, got: {evt:?}"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,10 +1,9 @@
|
||||
//! PTY process spawning and output loop: builds the command, drives the reader thread,
|
||||
//! and dispatches parsed JSON events to the broadcast channel.
|
||||
use std::collections::HashMap;
|
||||
use std::io::{BufRead, BufReader};
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
use portable_pty::{ChildKiller, CommandBuilder, PtySize, native_pty_system};
|
||||
use portable_pty::{CommandBuilder, PtySize, native_pty_system};
|
||||
use tokio::sync::broadcast;
|
||||
|
||||
use crate::agent_log::AgentLogWriter;
|
||||
@@ -14,7 +13,7 @@ use crate::slog;
|
||||
use crate::slog_warn;
|
||||
|
||||
use super::events::{emit_event, handle_agent_stream_event};
|
||||
use super::types::{ChildKillerGuard, PtyResult, composite_key};
|
||||
use super::types::PtyResult;
|
||||
|
||||
/// Spawn claude agent in a PTY and stream events through the broadcast channel.
|
||||
///
|
||||
@@ -55,7 +54,6 @@ pub(in crate::agents) async fn run_agent_pty_streaming(
|
||||
event_log: &Arc<Mutex<Vec<AgentEvent>>>,
|
||||
log_writer: Option<Arc<Mutex<AgentLogWriter>>>,
|
||||
inactivity_timeout_secs: u64,
|
||||
child_killers: Arc<Mutex<HashMap<String, Box<dyn ChildKiller + Send + Sync>>>>,
|
||||
watcher_tx: broadcast::Sender<WatcherEvent>,
|
||||
session_id_to_resume: Option<&str>,
|
||||
eager_record: Option<(std::path::PathBuf, String)>,
|
||||
@@ -82,7 +80,6 @@ pub(in crate::agents) async fn run_agent_pty_streaming(
|
||||
&event_log,
|
||||
log_writer.as_deref(),
|
||||
inactivity_timeout_secs,
|
||||
&child_killers,
|
||||
&watcher_tx,
|
||||
resume_sid.as_deref(),
|
||||
eager_record,
|
||||
@@ -104,7 +101,6 @@ fn run_agent_pty_blocking(
|
||||
event_log: &Mutex<Vec<AgentEvent>>,
|
||||
log_writer: Option<&Mutex<AgentLogWriter>>,
|
||||
inactivity_timeout_secs: u64,
|
||||
child_killers: &Arc<Mutex<HashMap<String, Box<dyn ChildKiller + Send + Sync>>>>,
|
||||
watcher_tx: &broadcast::Sender<WatcherEvent>,
|
||||
session_id_to_resume: Option<&str>,
|
||||
eager_record: Option<(std::path::PathBuf, String)>,
|
||||
@@ -204,21 +200,6 @@ fn run_agent_pty_blocking(
|
||||
.spawn_command(cmd)
|
||||
.map_err(|e| format!("Failed to spawn agent for {story_id}:{agent_name}: {e}"))?;
|
||||
|
||||
// Register the child killer so that kill_all_children() / stop_agent() can
|
||||
// terminate this process on server shutdown, even if the blocking thread
|
||||
// cannot be interrupted. The ChildKillerGuard deregisters on function exit.
|
||||
let killer_key = composite_key(story_id, agent_name);
|
||||
{
|
||||
let killer = child.clone_killer();
|
||||
if let Ok(mut killers) = child_killers.lock() {
|
||||
killers.insert(killer_key.clone(), killer);
|
||||
}
|
||||
}
|
||||
let _killer_guard = ChildKillerGuard {
|
||||
killers: Arc::clone(child_killers),
|
||||
key: killer_key,
|
||||
};
|
||||
|
||||
drop(pair.slave);
|
||||
|
||||
let reader = pair
|
||||
@@ -366,7 +347,11 @@ fn run_agent_pty_blocking(
|
||||
.and_then(|i| i.get("status"))
|
||||
.and_then(|s| s.as_str())
|
||||
.unwrap_or("");
|
||||
let is_hard_block = !status.is_empty() && status != "allowed_warning";
|
||||
// "allowed" and "allowed_warning" are soft warnings — the request was
|
||||
// permitted; only statuses that actually block the request (e.g. "rejected")
|
||||
// are genuine hard blocks that warrant a rate-limit exit respawn.
|
||||
let is_hard_block =
|
||||
!status.is_empty() && status != "allowed" && status != "allowed_warning";
|
||||
let reset_at = rate_limit_info
|
||||
.and_then(|i| i.get("reset_at"))
|
||||
.and_then(|r| r.as_str())
|
||||
|
||||
@@ -1,9 +1,4 @@
|
||||
//! Core types for the PTY runner: result container and process lifecycle helpers.
|
||||
use std::collections::HashMap;
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
use portable_pty::ChildKiller;
|
||||
|
||||
use crate::agents::TokenUsage;
|
||||
|
||||
/// Result from a PTY agent session, containing the session ID and token usage.
|
||||
@@ -23,20 +18,3 @@ pub(in crate::agents) struct PtyResult {
|
||||
/// event was seen or when the `reset_at` field was absent from the event.
|
||||
pub rate_limit_reset_at: Option<chrono::DateTime<chrono::Utc>>,
|
||||
}
|
||||
|
||||
pub(super) fn composite_key(story_id: &str, agent_name: &str) -> String {
|
||||
format!("{story_id}:{agent_name}")
|
||||
}
|
||||
|
||||
pub(super) struct ChildKillerGuard {
|
||||
pub killers: Arc<Mutex<HashMap<String, Box<dyn ChildKiller + Send + Sync>>>>,
|
||||
pub key: String,
|
||||
}
|
||||
|
||||
impl Drop for ChildKillerGuard {
|
||||
fn drop(&mut self) {
|
||||
if let Ok(mut killers) = self.killers.lock() {
|
||||
killers.remove(&self.key);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,8 +1,6 @@
|
||||
//! Claude Code runtime — launches Claude Code CLI sessions as agent backends.
|
||||
use std::collections::HashMap;
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
use portable_pty::ChildKiller;
|
||||
use tokio::sync::broadcast;
|
||||
|
||||
use crate::agent_log::AgentLogWriter;
|
||||
@@ -17,20 +15,13 @@ use super::{AgentEvent, AgentRuntime, RuntimeContext, RuntimeResult, RuntimeStat
|
||||
/// It wraps the existing PTY-based execution logic, preserving all streaming,
|
||||
/// token tracking, and inactivity timeout behaviour.
|
||||
pub struct ClaudeCodeRuntime {
|
||||
child_killers: Arc<Mutex<HashMap<String, Box<dyn ChildKiller + Send + Sync>>>>,
|
||||
watcher_tx: broadcast::Sender<WatcherEvent>,
|
||||
}
|
||||
|
||||
impl ClaudeCodeRuntime {
|
||||
/// Create a new Claude Code runtime with shared child-killer registry and event channel.
|
||||
pub fn new(
|
||||
child_killers: Arc<Mutex<HashMap<String, Box<dyn ChildKiller + Send + Sync>>>>,
|
||||
watcher_tx: broadcast::Sender<WatcherEvent>,
|
||||
) -> Self {
|
||||
Self {
|
||||
child_killers,
|
||||
watcher_tx,
|
||||
}
|
||||
/// Create a new Claude Code runtime with a shared event channel.
|
||||
pub fn new(watcher_tx: broadcast::Sender<WatcherEvent>) -> Self {
|
||||
Self { watcher_tx }
|
||||
}
|
||||
}
|
||||
|
||||
@@ -57,7 +48,6 @@ impl AgentRuntime for ClaudeCodeRuntime {
|
||||
&event_log,
|
||||
log_writer.clone(),
|
||||
ctx.inactivity_timeout_secs,
|
||||
Arc::clone(&self.child_killers),
|
||||
self.watcher_tx.clone(),
|
||||
ctx.session_id_to_resume.as_deref(),
|
||||
eager_record.clone(),
|
||||
@@ -69,6 +59,7 @@ impl AgentRuntime for ClaudeCodeRuntime {
|
||||
// Abort+no-session: CLI crashed (e.g. SIGABRT) before emitting its
|
||||
// first "system" event. Detected by: non-zero exit AND no session.
|
||||
aborted_signal: !result.exit_ok && result.session_id.is_none(),
|
||||
exit_ok: result.exit_ok,
|
||||
session_id: result.session_id,
|
||||
token_usage: result.token_usage,
|
||||
rate_limit_exit: result.rate_limit_exit,
|
||||
@@ -94,7 +85,6 @@ impl AgentRuntime for ClaudeCodeRuntime {
|
||||
&event_log,
|
||||
log_writer,
|
||||
ctx.inactivity_timeout_secs,
|
||||
Arc::clone(&self.child_killers),
|
||||
self.watcher_tx.clone(),
|
||||
None, // no --resume on fallback
|
||||
eager_record,
|
||||
@@ -103,6 +93,7 @@ impl AgentRuntime for ClaudeCodeRuntime {
|
||||
Ok(RuntimeResult {
|
||||
aborted_signal: !fallback_result.exit_ok
|
||||
&& fallback_result.session_id.is_none(),
|
||||
exit_ok: fallback_result.exit_ok,
|
||||
session_id: fallback_result.session_id,
|
||||
token_usage: fallback_result.token_usage,
|
||||
rate_limit_exit: fallback_result.rate_limit_exit,
|
||||
@@ -115,7 +106,6 @@ impl AgentRuntime for ClaudeCodeRuntime {
|
||||
|
||||
fn stop(&self) {
|
||||
// Stopping is handled externally by the pool via kill_child_for_key().
|
||||
// The ChildKillerGuard in pty.rs deregisters automatically on process exit.
|
||||
}
|
||||
|
||||
fn get_status(&self) -> RuntimeStatus {
|
||||
|
||||
@@ -135,6 +135,7 @@ impl AgentRuntime for GeminiRuntime {
|
||||
return Ok(RuntimeResult {
|
||||
session_id: None,
|
||||
token_usage: Some(total_usage),
|
||||
exit_ok: true,
|
||||
aborted_signal: false,
|
||||
rate_limit_exit: false,
|
||||
rate_limit_reset_at: None,
|
||||
@@ -151,6 +152,7 @@ impl AgentRuntime for GeminiRuntime {
|
||||
return Ok(RuntimeResult {
|
||||
session_id: None,
|
||||
token_usage: Some(total_usage),
|
||||
exit_ok: true,
|
||||
aborted_signal: false,
|
||||
rate_limit_exit: false,
|
||||
rate_limit_reset_at: None,
|
||||
@@ -254,6 +256,7 @@ impl AgentRuntime for GeminiRuntime {
|
||||
return Ok(RuntimeResult {
|
||||
session_id: None,
|
||||
token_usage: Some(total_usage),
|
||||
exit_ok: true,
|
||||
aborted_signal: false,
|
||||
rate_limit_exit: false,
|
||||
rate_limit_reset_at: None,
|
||||
@@ -339,6 +342,7 @@ impl AgentRuntime for GeminiRuntime {
|
||||
Ok(RuntimeResult {
|
||||
session_id: None,
|
||||
token_usage: Some(total_usage),
|
||||
exit_ok: true,
|
||||
aborted_signal: false,
|
||||
rate_limit_exit: false,
|
||||
rate_limit_reset_at: None,
|
||||
|
||||
@@ -55,6 +55,12 @@ pub struct RuntimeContext {
|
||||
pub struct RuntimeResult {
|
||||
pub session_id: Option<String>,
|
||||
pub token_usage: Option<TokenUsage>,
|
||||
/// `true` when the process exited with exit code 0; `false` for non-zero exits
|
||||
/// (API errors, network failures, or Claude-API-level budget exhaustion). Always
|
||||
/// `true` for API-based runtimes (OpenAI, Gemini) which have no exit-code concept.
|
||||
/// Used by the commit-recovery path to skip the stuck-respawn counter for forced
|
||||
/// exits (story 1089, AC1).
|
||||
pub exit_ok: bool,
|
||||
/// `true` when the process exited with a failure AND no session was established.
|
||||
///
|
||||
/// This indicates the Claude Code CLI crashed (e.g. SIGABRT from an assertion
|
||||
@@ -169,6 +175,7 @@ mod tests {
|
||||
cache_read_input_tokens: 0,
|
||||
total_cost_usd: 0.01,
|
||||
}),
|
||||
exit_ok: true,
|
||||
aborted_signal: false,
|
||||
rate_limit_exit: false,
|
||||
rate_limit_reset_at: None,
|
||||
@@ -186,6 +193,7 @@ mod tests {
|
||||
let result = RuntimeResult {
|
||||
session_id: None,
|
||||
token_usage: None,
|
||||
exit_ok: true,
|
||||
aborted_signal: false,
|
||||
rate_limit_exit: false,
|
||||
rate_limit_reset_at: None,
|
||||
@@ -204,20 +212,16 @@ mod tests {
|
||||
#[test]
|
||||
fn claude_code_runtime_get_status_returns_idle() {
|
||||
use crate::io::watcher::WatcherEvent;
|
||||
use std::collections::HashMap;
|
||||
let killers = Arc::new(Mutex::new(HashMap::new()));
|
||||
let (watcher_tx, _) = broadcast::channel::<WatcherEvent>(16);
|
||||
let runtime = ClaudeCodeRuntime::new(killers, watcher_tx);
|
||||
let runtime = ClaudeCodeRuntime::new(watcher_tx);
|
||||
assert_eq!(runtime.get_status(), RuntimeStatus::Idle);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn claude_code_runtime_stream_events_empty() {
|
||||
use crate::io::watcher::WatcherEvent;
|
||||
use std::collections::HashMap;
|
||||
let killers = Arc::new(Mutex::new(HashMap::new()));
|
||||
let (watcher_tx, _) = broadcast::channel::<WatcherEvent>(16);
|
||||
let runtime = ClaudeCodeRuntime::new(killers, watcher_tx);
|
||||
let runtime = ClaudeCodeRuntime::new(watcher_tx);
|
||||
assert!(runtime.stream_events().is_empty());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -122,6 +122,7 @@ impl AgentRuntime for OpenAiRuntime {
|
||||
return Ok(RuntimeResult {
|
||||
session_id: None,
|
||||
token_usage: Some(total_usage),
|
||||
exit_ok: true,
|
||||
aborted_signal: false,
|
||||
rate_limit_exit: false,
|
||||
rate_limit_reset_at: None,
|
||||
@@ -138,6 +139,7 @@ impl AgentRuntime for OpenAiRuntime {
|
||||
return Ok(RuntimeResult {
|
||||
session_id: None,
|
||||
token_usage: Some(total_usage),
|
||||
exit_ok: true,
|
||||
aborted_signal: false,
|
||||
rate_limit_exit: false,
|
||||
rate_limit_reset_at: None,
|
||||
@@ -224,6 +226,7 @@ impl AgentRuntime for OpenAiRuntime {
|
||||
return Ok(RuntimeResult {
|
||||
session_id: None,
|
||||
token_usage: Some(total_usage),
|
||||
exit_ok: true,
|
||||
aborted_signal: false,
|
||||
rate_limit_exit: false,
|
||||
rate_limit_reset_at: None,
|
||||
|
||||
@@ -19,6 +19,7 @@ mod help;
|
||||
pub(crate) mod loc;
|
||||
mod logs;
|
||||
mod move_story;
|
||||
mod new_project;
|
||||
mod overview;
|
||||
mod run_tests;
|
||||
mod setup;
|
||||
@@ -262,6 +263,11 @@ pub fn commands() -> &'static [BotCommand] {
|
||||
description: "List orphaned worktrees (dry run), or `cleanup_worktrees --confirm` to remove them",
|
||||
handler: handle_cleanup_worktrees_fallback,
|
||||
},
|
||||
BotCommand {
|
||||
name: "new",
|
||||
description: "Bootstrap a new project container (gateway only): `new project <name>`",
|
||||
handler: new_project::handle_new_project_fallback,
|
||||
},
|
||||
]
|
||||
}
|
||||
|
||||
|
||||
@@ -0,0 +1,19 @@
|
||||
//! `new project` command stub.
|
||||
//!
|
||||
//! The command is handled asynchronously in the Matrix transport's
|
||||
//! `on_room_message` handler (gateway mode only). This file exists so that
|
||||
//! `help` lists the command and the gateway proxy block does not forward it
|
||||
//! to the active project sled.
|
||||
|
||||
use super::CommandContext;
|
||||
|
||||
/// Fallback handler for the `new` command when it is not intercepted by the
|
||||
/// async gateway handler in `on_room_message`. In practice this is never
|
||||
/// called — `new project` is detected and handled before `try_handle_command`
|
||||
/// runs in gateway mode, and in standalone mode there is no matching project
|
||||
/// bootstrap context.
|
||||
///
|
||||
/// Returns `None` to prevent the LLM from receiving the raw command text.
|
||||
/// The command context is accepted only to match the handler signature and is
/// not inspected.
pub fn handle_new_project_fallback(_ctx: &CommandContext) -> Option<String> {
|
||||
// Intentionally produces no reply: the real work happens in the async
// gateway handler, not here.
None
|
||||
}
|
||||
@@ -0,0 +1,367 @@
|
||||
//! Protocol-agnostic chat dispatcher — coalesce window + per-session serial lock.
|
||||
//!
|
||||
//! Sits between every inbound transport (Matrix, Slack, WhatsApp, …) and the
|
||||
//! `claude -p` spawner. Transport handlers call [`ChatDispatcher::submit`]
|
||||
//! instead of spawning directly; the dispatcher enforces two invariants:
|
||||
//!
|
||||
//! 1. **Coalesce window**: messages arriving for the same session within
|
||||
//! `coalesce_ms` of each other are concatenated and delivered to a single
|
||||
//! spawn. The window is a *debounce*: each new message extends the window by
|
||||
//! `coalesce_ms` from its arrival time, so bursts flush as one batch.
|
||||
//!
|
||||
//! 2. **Per-session serial lock**: while one `claude -p` run is active, further
|
||||
//! messages for that session queue up and are dispatched as a single batch
|
||||
//! once the running invocation completes.
|
||||
//!
|
||||
//! A [`ChatDispatcher::stop`] call cancels the active run for a session and
|
||||
//! discards the pending queue.
|
||||
|
||||
use crate::slog;
|
||||
use std::collections::HashMap;
|
||||
use std::pin::Pin;
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::time::Duration;
|
||||
use tokio::sync::{mpsc, watch};
|
||||
|
||||
/// A factory function that produces one LLM execution future per dispatch.
|
||||
///
|
||||
/// Arguments:
|
||||
/// - `String` — the (possibly concatenated) prompt to send to `claude -p`.
|
||||
/// - `watch::Receiver<bool>` — cancellation signal handed to the run; a value
///   of `true` observed on this channel means the run should be cancelled.
|
||||
///
|
||||
/// Returns a boxed, pinned `Send + 'static` future that resolves when the LLM
|
||||
/// session ends (whether normally or via cancellation).
|
||||
///
/// The alias is an `Arc` around a `Send + Sync` closure, so a factory can be
/// cloned cheaply and moved into a session's background task.
pub type SpawnFn = Arc<
|
||||
dyn Fn(
|
||||
String,
|
||||
watch::Receiver<bool>,
|
||||
) -> Pin<Box<dyn std::future::Future<Output = ()> + Send + 'static>>
|
||||
+ Send
|
||||
+ Sync,
|
||||
>;
|
||||
|
||||
/// Control messages delivered to a session's background task over its
/// unbounded channel.
enum SessionMsg {
|
||||
/// A chat message to dispatch, together with the factory that will run it.
/// Sent by [`ChatDispatcher::submit`].
UserMessage { text: String, factory: SpawnFn },
|
||||
/// Request to cancel the active run and discard pending messages.
/// Sent by [`ChatDispatcher::stop`].
Stop,
|
||||
}
|
||||
|
||||
/// Dispatcher-side handle to one session's background task.
struct SessionHandle {
|
||||
/// Sending half of the channel that the session task reads from.
tx: mpsc::UnboundedSender<SessionMsg>,
|
||||
}
|
||||
|
||||
/// Coalescing, serialising dispatcher for chat-to-LLM message routing.
|
||||
///
|
||||
/// Construct once at startup via [`ChatDispatcher::new`] and share via `Arc`.
|
||||
/// Call [`submit`](ChatDispatcher::submit) from every transport handler instead
|
||||
/// of spawning `claude -p` directly.
|
||||
pub struct ChatDispatcher {
|
||||
/// One handle per session key; entries are created lazily on first submit.
sessions: Mutex<HashMap<String, SessionHandle>>,
|
||||
/// Coalesce (debounce) window in milliseconds, passed to each session task.
coalesce_ms: u64,
|
||||
}
|
||||
|
||||
impl ChatDispatcher {
|
||||
/// Create a new dispatcher with the given coalesce window in milliseconds.
|
||||
pub fn new(coalesce_ms: u64) -> Self {
|
||||
Self {
|
||||
sessions: Mutex::new(HashMap::new()),
|
||||
coalesce_ms,
|
||||
}
|
||||
}
|
||||
|
||||
/// Submit a message for a chat session.
|
||||
///
|
||||
/// If no session task exists for `session_key`, one is created lazily.
|
||||
/// The `factory` is called by the session task when the coalesce window
|
||||
/// closes (or immediately after the current run finishes, for pending
|
||||
/// messages).
|
||||
pub fn submit(&self, session_key: String, message: String, factory: SpawnFn) {
|
||||
let mut guard = self.sessions.lock().unwrap();
|
||||
let coalesce_ms = self.coalesce_ms;
|
||||
let handle = guard.entry(session_key.clone()).or_insert_with(|| {
|
||||
let (tx, rx) = mpsc::unbounded_channel();
|
||||
tokio::spawn(session_task(session_key.clone(), rx, coalesce_ms));
|
||||
SessionHandle { tx }
|
||||
});
|
||||
let _ = handle.tx.send(SessionMsg::UserMessage {
|
||||
text: message,
|
||||
factory,
|
||||
});
|
||||
}
|
||||
|
||||
/// Stop the active LLM run for `session_key` and clear its pending queue.
|
||||
///
|
||||
/// Returns `true` if the session existed (whether or not anything was
|
||||
/// actually running), `false` if no session for that key has been created.
|
||||
pub fn stop(&self, session_key: &str) -> bool {
|
||||
let guard = self.sessions.lock().unwrap();
|
||||
if let Some(handle) = guard.get(session_key) {
|
||||
let _ = handle.tx.send(SessionMsg::Stop);
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Per-session background task.
|
||||
///
|
||||
/// Phases:
|
||||
/// 1. **Wait** — blocks until the first `UserMessage` arrives.
|
||||
/// 2. **Coalesce** — extends the window by `coalesce_ms` on each new message;
|
||||
/// fires when no message arrives within the window.
|
||||
/// 3. **Run** — calls the factory with the concatenated batch; while running,
|
||||
/// collects further `UserMessage`s into a pending list and logs a warn per
|
||||
/// message. A `Stop` message cancels the running call and clears pending.
|
||||
/// 4. **Drain** — after the run, if pending is non-empty, fires a second run
|
||||
/// with the accumulated batch and loops back to step 3.
|
||||
/// 5. Returns to step 1 when pending is empty.
|
||||
async fn session_task(
|
||||
session_key: String,
|
||||
mut rx: mpsc::UnboundedReceiver<SessionMsg>,
|
||||
coalesce_ms: u64,
|
||||
) {
|
||||
let coalesce_dur = Duration::from_millis(coalesce_ms);
|
||||
|
||||
loop {
|
||||
// ── Phase 1: wait for the first message ─────────────────────────────
|
||||
let (first_text, first_factory) = loop {
|
||||
match rx.recv().await {
|
||||
None => return,
|
||||
Some(SessionMsg::Stop) => continue,
|
||||
Some(SessionMsg::UserMessage { text, factory }) => break (text, factory),
|
||||
}
|
||||
};
|
||||
|
||||
// ── Phase 2: coalesce window (debounce) ──────────────────────────────
|
||||
let mut batch: Vec<String> = vec![first_text];
|
||||
let mut latest_factory: SpawnFn = first_factory;
|
||||
let mut deadline = tokio::time::Instant::now() + coalesce_dur;
|
||||
|
||||
'coalesce: loop {
|
||||
let now = tokio::time::Instant::now();
|
||||
if now >= deadline {
|
||||
break 'coalesce;
|
||||
}
|
||||
let remaining = deadline - now;
|
||||
match tokio::time::timeout(remaining, rx.recv()).await {
|
||||
Err(_) => break 'coalesce, // window closed
|
||||
Ok(None) => return, // channel closed → exit task
|
||||
Ok(Some(SessionMsg::Stop)) => {
|
||||
batch.clear();
|
||||
break 'coalesce;
|
||||
}
|
||||
Ok(Some(SessionMsg::UserMessage { text, factory })) => {
|
||||
batch.push(text);
|
||||
latest_factory = factory;
|
||||
// Extend deadline on each new message (debounce).
|
||||
deadline = tokio::time::Instant::now() + coalesce_dur;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if batch.is_empty() {
|
||||
continue; // Stop received during coalesce — restart
|
||||
}
|
||||
|
||||
// ── Phase 3 + 4: run → drain pending → repeat ───────────────────────
|
||||
let mut prompt = batch.join("\n\n");
|
||||
let mut factory = latest_factory;
|
||||
|
||||
loop {
|
||||
let (cancel_tx, cancel_rx) = watch::channel(false);
|
||||
let llm_fut = factory(prompt, cancel_rx);
|
||||
let mut llm_task = tokio::spawn(llm_fut);
|
||||
|
||||
let mut pending_texts: Vec<String> = vec![];
|
||||
let mut pending_factory: Option<SpawnFn> = None;
|
||||
let mut stopped = false;
|
||||
|
||||
// Wait for the LLM to finish, collecting messages that arrive during the run.
|
||||
loop {
|
||||
tokio::select! {
|
||||
_ = &mut llm_task => { break; }
|
||||
msg = rx.recv() => {
|
||||
match msg {
|
||||
None => {
|
||||
llm_task.abort();
|
||||
return;
|
||||
}
|
||||
Some(SessionMsg::Stop) => {
|
||||
let _ = cancel_tx.send(true);
|
||||
let _ = llm_task.await;
|
||||
pending_texts.clear();
|
||||
stopped = true;
|
||||
break;
|
||||
}
|
||||
Some(SessionMsg::UserMessage { text, factory: f }) => {
|
||||
pending_texts.push(text);
|
||||
let depth = pending_texts.len();
|
||||
slog!(
|
||||
"[chat-dispatcher] coalescing message for session={}, queue_depth={}",
|
||||
session_key,
|
||||
depth,
|
||||
);
|
||||
pending_factory = Some(f);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if stopped || pending_texts.is_empty() {
|
||||
break; // back to Phase 1
|
||||
}
|
||||
|
||||
// Fire the pending batch as the next run (no additional coalesce window).
|
||||
prompt = pending_texts.join("\n\n");
|
||||
factory = pending_factory.unwrap();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ── Tests ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use std::sync::atomic::{AtomicUsize, Ordering};

    /// Build a factory whose future bumps `spawn_count` once and then sleeps
    /// for `run_ms` milliseconds. The prompt and cancel receiver are ignored.
    fn make_factory(spawn_count: Arc<AtomicUsize>, run_ms: u64) -> SpawnFn {
        let run_for = Duration::from_millis(run_ms);
        Arc::new(move |_prompt: String, _cancel_rx: watch::Receiver<bool>| {
            let counter = Arc::clone(&spawn_count);
            Box::pin(async move {
                counter.fetch_add(1, Ordering::SeqCst);
                tokio::time::sleep(run_for).await;
            })
        })
    }

    /// Submit `text` to `session` with a counting factory that runs `run_ms`.
    fn send(
        dispatcher: &ChatDispatcher,
        session: &str,
        text: &str,
        counter: &Arc<AtomicUsize>,
        run_ms: u64,
    ) {
        dispatcher.submit(
            session.to_string(),
            text.to_string(),
            make_factory(Arc::clone(counter), run_ms),
        );
    }

    /// Sleep the test for `ms` milliseconds.
    async fn pause(ms: u64) {
        tokio::time::sleep(Duration::from_millis(ms)).await;
    }

    /// AC 6 regression: three messages on the same session must never yield
    /// three spawns.
    ///
    /// Setup:
    ///   coalesce_ms = 50 ms  (short window so the test runs fast)
    ///   LLM "run"   = 150 ms
    ///   msg1 @ t=0
    ///   msg2 @ t=20 ms  — inside the coalesce window, merged with msg1
    ///   msg3 @ t=300 ms — after the first run has completed
    ///
    /// The assertion accepts one or two spawns; three is the regression.
    #[tokio::test]
    async fn three_messages_never_three_concurrent_spawns() {
        let spawn_count = Arc::new(AtomicUsize::new(0));
        let dispatcher = Arc::new(ChatDispatcher::new(50));
        let session = "room1";

        // t = 0
        send(&dispatcher, session, "msg1", &spawn_count, 150);

        // t = 20 ms: still inside the 50 ms coalesce window.
        pause(20).await;
        send(&dispatcher, session, "msg2", &spawn_count, 150);

        // t = 300 ms: the window fired at about 70 ms and the 150 ms run ended
        // at about 220 ms, so this message opens a second coalesce cycle.
        pause(280).await;
        send(&dispatcher, session, "msg3", &spawn_count, 150);

        // Give both runs time to finish.
        pause(500).await;

        let count = spawn_count.load(Ordering::SeqCst);
        assert!(
            (1..=2).contains(&count),
            "expected 1 or 2 spawns (msgs 1+2 coalesced, msg3 separate), got {count}"
        );
    }

    /// Messages that arrive while a run is in flight are kept and delivered
    /// together as one follow-up spawn after that run completes.
    #[tokio::test]
    async fn pending_messages_dispatched_after_run_completes() {
        let spawn_count = Arc::new(AtomicUsize::new(0));
        let dispatcher = Arc::new(ChatDispatcher::new(50));
        let session = "room2";

        // Kick off a 200 ms run.
        send(&dispatcher, session, "first", &spawn_count, 200);

        // Let the coalesce window fire, then queue two messages mid-run.
        pause(100).await;
        send(&dispatcher, session, "second", &spawn_count, 50);
        send(&dispatcher, session, "third", &spawn_count, 50);

        // Long enough for the first run and the follow-up run.
        pause(600).await;

        let count = spawn_count.load(Ordering::SeqCst);
        assert_eq!(
            count, 2,
            "first run + one pending-batch run = 2 total spawns"
        );
    }

    /// A stop issued during a run discards the messages queued behind it.
    #[tokio::test]
    async fn stop_cancels_run_and_clears_pending() {
        let spawn_count = Arc::new(AtomicUsize::new(0));
        let dispatcher = Arc::new(ChatDispatcher::new(30));
        let session = "room3";

        // Start a long (500 ms) run and let its coalesce window fire.
        send(&dispatcher, session, "long-running", &spawn_count, 500);
        pause(80).await;

        // Queue one message behind the run, then stop straight away.
        send(&dispatcher, session, "pending", &spawn_count, 50);
        dispatcher.stop(session);

        // Wait longer than the run would have taken had it not been stopped.
        pause(700).await;

        // The first run had already started before the stop (one spawn); the
        // queued message must not have produced a second one.
        let count = spawn_count.load(Ordering::SeqCst);
        assert!(
            count <= 1,
            "stop should discard pending; got {count} spawns"
        );
    }
}
|
||||
@@ -6,6 +6,8 @@
|
||||
|
||||
/// Bot command registry and dispatch — parses and routes incoming chat messages.
|
||||
pub mod commands;
|
||||
/// Protocol-agnostic chat dispatcher — coalesce window and per-session serial lock.
|
||||
pub mod dispatcher;
|
||||
/// Chat history utilities — loading and serialising conversation history.
|
||||
pub mod history;
|
||||
pub(crate) mod lookup;
|
||||
|
||||
@@ -300,6 +300,20 @@ pub(super) async fn handle_incoming_message(
|
||||
handle_llm_message(ctx, channel, user, message).await;
|
||||
}
|
||||
|
||||
/// Build the prompt for a Discord LLM turn, prepending any pending
|
||||
/// CRDT pipeline-transition events as a `<system-reminder>` block.
|
||||
fn build_discord_llm_prompt(
|
||||
session_id: &str,
|
||||
bot_name: &str,
|
||||
user: &str,
|
||||
user_message: &str,
|
||||
) -> String {
|
||||
let event_ctx = crate::llm_session::assemble_prompt_context(session_id);
|
||||
format!(
|
||||
"{event_ctx}[Your name is {bot_name}. Refer to yourself as {bot_name}, not Claude.]\n\n{user}: {user_message}"
|
||||
)
|
||||
}
|
||||
|
||||
/// Forward a message to Claude Code and send the response back via Discord.
|
||||
async fn handle_llm_message(ctx: &DiscordContext, channel: &str, user: &str, user_message: &str) {
|
||||
use crate::chat::util::drain_complete_paragraphs;
|
||||
@@ -314,8 +328,11 @@ async fn handle_llm_message(ctx: &DiscordContext, channel: &str, user: &str, use
|
||||
};
|
||||
|
||||
let bot_name = &ctx.services.bot_name;
|
||||
let prompt = format!(
|
||||
"[Your name is {bot_name}. Refer to yourself as {bot_name}, not Claude.]\n\n{user}: {user_message}"
|
||||
let prompt = build_discord_llm_prompt(
|
||||
resume_session_id.as_deref().unwrap_or(channel),
|
||||
bot_name,
|
||||
user,
|
||||
user_message,
|
||||
);
|
||||
|
||||
let provider = ClaudeCodeProvider::new();
|
||||
@@ -604,4 +621,40 @@ mod tests {
|
||||
assert!(conv.session_id.is_none(), "session_id should be cleared");
|
||||
assert!(conv.entries.is_empty(), "entries should be cleared");
|
||||
}
|
||||
|
||||
/// AC 4: log a `TransitionFired` event, build the prompt for a Discord user
/// turn, and check the prompt carries the event alongside the user's message
/// (end-to-end non-Matrix test).
#[test]
fn discord_prompt_includes_transition_event() {
    use crate::pipeline_state::{PipelineEvent, PlanState, Stage, StoryId, TransitionFired};
    crate::crdt_state::init_for_test();

    let fired = TransitionFired {
        story_id: StoryId("77_discord_test".to_string()),
        before: Stage::Backlog,
        after: Stage::Coding {
            claim: None,
            plan: PlanState::Missing,
            retries: 0,
        },
        event: PipelineEvent::DepsMet,
        at: chrono::Utc::now(),
    };
    crate::event_log::log_transition_event(&fired);

    let prompt =
        build_discord_llm_prompt("discord-ch-test", "Timmy", "@alice", "what is the status?");

    // Each row: a substring that must appear, and the requirement it proves.
    let expectations = [
        (
            "<system-reminder>",
            "assembled prompt must include system-reminder block",
        ),
        ("77_discord_test", "assembled prompt must contain story id"),
        (
            "what is the status?",
            "assembled prompt must contain user message",
        ),
    ];
    for (needle, requirement) in expectations {
        assert!(prompt.contains(needle), "{requirement}; got: {prompt}");
    }
}
|
||||
}
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
//! Matrix bot context — shared state for the Matrix bot (rooms, history, permissions).
|
||||
use crate::chat::ChatTransport;
|
||||
use crate::service::gateway::config::ProjectEntry;
|
||||
use crate::service::timer::TimerStore;
|
||||
use crate::services::Services;
|
||||
use matrix_sdk::ruma::{OwnedEventId, OwnedRoomId, OwnedUserId};
|
||||
@@ -87,27 +88,15 @@ pub struct BotContext {
|
||||
/// In gateway mode: the currently active project (shared with the gateway HTTP handler).
|
||||
/// `None` in standalone single-project mode.
|
||||
pub gateway_active_project: Option<Arc<RwLock<String>>>,
|
||||
/// In gateway mode: valid project names accepted by the `switch` command.
|
||||
/// Empty in standalone mode.
|
||||
pub gateway_projects: Vec<String>,
|
||||
/// In gateway mode: mapping of project name → base URL (e.g. `"http://localhost:3001"`).
|
||||
/// Used to proxy bot commands to the active project over WebSocket (`/ws`).
|
||||
/// Empty in standalone mode.
|
||||
pub gateway_project_urls: BTreeMap<String, String>,
|
||||
/// Pipeline transition events buffered since the last LLM turn.
|
||||
/// In gateway mode: shared live projects map from [`GatewayState`].
|
||||
///
|
||||
/// A background task appends one compact audit line per real stage
|
||||
/// transition. `handle_message` drains this buffer and injects it as a
|
||||
/// `<system-reminder>` block at the head of the next user prompt so Timmy
|
||||
/// sees pipeline activity without requiring a separate message.
|
||||
pub pending_pipeline_events: Arc<TokioMutex<Vec<String>>>,
|
||||
/// Gateway aggregate transition events buffered since the last LLM turn.
|
||||
///
|
||||
/// In gateway mode a background task appends one compact audit line per
|
||||
/// `GatewayStatusEvent` received from the gateway broadcaster. Drained
|
||||
/// alongside `pending_pipeline_events` on each user message. Always
|
||||
/// empty in standalone (non-gateway) mode.
|
||||
pub pending_gateway_events: Arc<TokioMutex<Vec<String>>>,
|
||||
/// The `new project` command writes here so HTTP handlers see the new entry
|
||||
/// immediately without requiring a gateway restart. `None` in standalone mode.
|
||||
pub gateway_projects_store: Option<Arc<RwLock<BTreeMap<String, ProjectEntry>>>>,
|
||||
/// Bounded FIFO set of already-handled incoming event IDs.
|
||||
///
|
||||
/// The Matrix sync loop can replay events on reconnect. This set ensures
|
||||
@@ -268,6 +257,7 @@ mod tests {
|
||||
pending_perm_replies: Arc::new(TokioMutex::new(HashMap::new())),
|
||||
permission_timeout_secs: 120,
|
||||
status: Arc::new(crate::service::status::StatusBroadcaster::new()),
|
||||
chat_dispatcher: Arc::new(crate::chat::dispatcher::ChatDispatcher::new(1_500)),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -276,7 +266,6 @@ mod tests {
|
||||
fn test_bot_context(
|
||||
services: Arc<Services>,
|
||||
gateway_active_project: Option<Arc<RwLock<String>>>,
|
||||
gateway_projects: Vec<String>,
|
||||
gateway_project_urls: BTreeMap<String, String>,
|
||||
) -> BotContext {
|
||||
BotContext {
|
||||
@@ -297,10 +286,8 @@ mod tests {
|
||||
std::path::PathBuf::from("/tmp/timers.json"),
|
||||
)),
|
||||
gateway_active_project,
|
||||
gateway_projects,
|
||||
gateway_project_urls,
|
||||
pending_pipeline_events: Arc::new(TokioMutex::new(Vec::new())),
|
||||
pending_gateway_events: Arc::new(TokioMutex::new(Vec::new())),
|
||||
gateway_projects_store: None,
|
||||
handled_incoming_event_ids: Arc::new(TokioMutex::new(SeenEventIds::new(
|
||||
SEEN_EVENT_IDS_CAP,
|
||||
))),
|
||||
@@ -317,7 +304,7 @@ mod tests {
|
||||
#[tokio::test]
|
||||
async fn effective_project_root_standalone_returns_project_root() {
|
||||
let services = test_services(PathBuf::from("/projects/myapp"));
|
||||
let ctx = test_bot_context(services, None, vec![], BTreeMap::new());
|
||||
let ctx = test_bot_context(services, None, BTreeMap::new());
|
||||
assert_eq!(
|
||||
ctx.effective_project_root().await,
|
||||
PathBuf::from("/projects/myapp")
|
||||
@@ -331,7 +318,6 @@ mod tests {
|
||||
let ctx = test_bot_context(
|
||||
services,
|
||||
Some(Arc::clone(&active)),
|
||||
vec!["huskies".into(), "robot-studio".into()],
|
||||
BTreeMap::from([
|
||||
("huskies".into(), "http://localhost:3001".into()),
|
||||
("robot-studio".into(), "http://localhost:3002".into()),
|
||||
@@ -350,7 +336,6 @@ mod tests {
|
||||
let ctx = test_bot_context(
|
||||
services,
|
||||
Some(Arc::clone(&active)),
|
||||
vec!["huskies".into(), "robot-studio".into()],
|
||||
BTreeMap::from([
|
||||
("huskies".into(), "http://localhost:3001".into()),
|
||||
("robot-studio".into(), "http://localhost:3002".into()),
|
||||
@@ -431,7 +416,7 @@ mod tests {
|
||||
#[test]
|
||||
fn bot_context_has_no_require_verified_devices_field() {
|
||||
let services = test_services(PathBuf::from("/tmp"));
|
||||
let ctx = test_bot_context(services, None, vec![], BTreeMap::new());
|
||||
let ctx = test_bot_context(services, None, BTreeMap::new());
|
||||
let _cloned = ctx.clone();
|
||||
}
|
||||
|
||||
@@ -481,7 +466,6 @@ mod tests {
|
||||
let ctx = test_bot_context(
|
||||
services,
|
||||
Some(Arc::clone(&active)),
|
||||
vec!["huskies".into()],
|
||||
BTreeMap::from([("huskies".into(), base_url)]),
|
||||
);
|
||||
|
||||
|
||||
@@ -13,7 +13,7 @@ use super::super::context::BotContext;
|
||||
use super::super::format::markdown_to_html;
|
||||
use super::super::history::{ConversationEntry, ConversationRole, save_history};
|
||||
|
||||
use super::{format_drained_events, format_user_prompt};
|
||||
use super::format_user_prompt;
|
||||
|
||||
pub(in crate::chat::transport::matrix::bot) async fn handle_message(
|
||||
room_id_str: String,
|
||||
@@ -21,6 +21,7 @@ pub(in crate::chat::transport::matrix::bot) async fn handle_message(
|
||||
ctx: BotContext,
|
||||
sender: String,
|
||||
user_message: String,
|
||||
mut cancel_rx: watch::Receiver<bool>,
|
||||
) {
|
||||
// Look up the room's existing Claude Code session ID (if any) so we can
|
||||
// resume the conversation with structured API messages instead of
|
||||
@@ -30,28 +31,10 @@ pub(in crate::chat::transport::matrix::bot) async fn handle_message(
|
||||
guard.get(&room_id).and_then(|conv| conv.session_id.clone())
|
||||
};
|
||||
|
||||
// Drain pipeline and gateway transition events buffered since the last LLM
|
||||
// turn and prepend them as a passive <system-reminder> block so Timmy sees
|
||||
// pipeline activity without requiring a separate message. Sled events come
|
||||
// from `pending_pipeline_events`; gateway events from `pending_gateway_events`.
|
||||
// In practice only one buffer is non-empty (sled mode vs gateway mode).
|
||||
let system_reminder_prefix = {
|
||||
let mut sled_guard = ctx.pending_pipeline_events.lock().await;
|
||||
let mut gtw_guard = ctx.pending_gateway_events.lock().await;
|
||||
let all_lines: Vec<String> = sled_guard.drain(..).chain(gtw_guard.drain(..)).collect();
|
||||
drop(sled_guard);
|
||||
drop(gtw_guard);
|
||||
slog!(
|
||||
"[matrix-bot] drained {} gateway audit lines for LLM context",
|
||||
all_lines.len()
|
||||
);
|
||||
let prefix = format_drained_events(all_lines);
|
||||
slog!(
|
||||
"[matrix-bot] format_drained_events output: {} bytes",
|
||||
prefix.len()
|
||||
);
|
||||
prefix
|
||||
};
|
||||
// Pull new pipeline-transition events from the CRDT event log for this
|
||||
// session and atomically advance the high-water marks so the same events
|
||||
// are not re-injected on the next turn.
|
||||
let event_log_ctx = crate::llm_session::assemble_prompt_context(&room_id_str);
|
||||
|
||||
// The prompt is just the current message with sender attribution.
|
||||
// Prior conversation context is carried by the Claude Code session.
|
||||
@@ -63,14 +46,11 @@ pub(in crate::chat::transport::matrix::bot) async fn handle_message(
|
||||
String::new()
|
||||
};
|
||||
let prompt = format!(
|
||||
"{system_reminder_prefix}[Your name is {bot_name}. Refer to yourself as {bot_name}, not Claude.]\n{active_project_ctx}\n{}",
|
||||
"{event_log_ctx}[Your name is {bot_name}. Refer to yourself as {bot_name}, not Claude.]\n{active_project_ctx}\n{}",
|
||||
format_user_prompt(&sender, &user_message)
|
||||
);
|
||||
|
||||
let provider = ClaudeCodeProvider::new();
|
||||
let (cancel_tx, mut cancel_rx) = watch::channel(false);
|
||||
// Keep the sender alive for the duration of the call.
|
||||
let _cancel_tx = cancel_tx;
|
||||
|
||||
// Channel for sending complete paragraphs to the Matrix posting task.
|
||||
let (msg_tx, mut msg_rx) = tokio::sync::mpsc::unbounded_channel::<String>();
|
||||
|
||||
@@ -11,27 +11,6 @@ pub(super) fn format_user_prompt(sender: &str, message: &str) -> String {
|
||||
format!("{sender}: {message}")
|
||||
}
|
||||
|
||||
/// Drain `lines` into a `<system-reminder>` block for injection at the head of
|
||||
/// the next LLM prompt. Returns an empty string when `lines` is empty.
|
||||
///
|
||||
/// At most 20 lines are shown verbatim; excess lines are replaced with a
|
||||
/// `…and N more` indicator to keep context size bounded.
|
||||
pub(in crate::chat::transport::matrix::bot) fn format_drained_events(lines: Vec<String>) -> String {
|
||||
if lines.is_empty() {
|
||||
return String::new();
|
||||
}
|
||||
const MAX_PIPELINE_EVENTS: usize = 20;
|
||||
let total = lines.len();
|
||||
let shown_count = total.min(MAX_PIPELINE_EVENTS);
|
||||
let shown = lines[..shown_count].join("\n");
|
||||
let tail = if total > MAX_PIPELINE_EVENTS {
|
||||
format!("\n...and {} more", total - MAX_PIPELINE_EVENTS)
|
||||
} else {
|
||||
String::new()
|
||||
};
|
||||
format!("<system-reminder>\n{shown}{tail}\n</system-reminder>\n")
|
||||
}
|
||||
|
||||
/// Matrix event handler for room messages. Each invocation spawns an
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
@@ -72,49 +51,6 @@ mod tests {
|
||||
assert!(crate::llm::oauth::extract_login_url_from_error(err).is_none());
|
||||
}
|
||||
|
||||
// -- format_drained_events ----------------------------------------------
|
||||
|
||||
/// No lines in, nothing out: the empty case must not emit an empty block.
#[test]
fn format_drained_events_empty_returns_empty_string() {
    let rendered = format_drained_events(Vec::new());
    assert_eq!(rendered, String::new());
}

/// A single line is wrapped in opening and closing `<system-reminder>` tags.
#[test]
fn format_drained_events_wraps_in_system_reminder() {
    let line = "audit ts=2026 id=1 event=x";
    let result = format_drained_events(vec![line.to_string()]);
    assert!(result.starts_with("<system-reminder>\n"), "got: {result}");
    assert!(result.ends_with("</system-reminder>\n"), "got: {result}");
    assert!(result.contains(line), "got: {result}");
}

/// With 25 lines, lines 0 to 19 are shown and the remaining five are
/// summarised by the overflow indicator.
#[test]
fn format_drained_events_caps_at_20_with_overflow_indicator() {
    let mut lines: Vec<String> = Vec::with_capacity(25);
    for i in 0..25 {
        lines.push(format!("line {i}"));
    }
    let result = format_drained_events(lines);
    assert!(result.contains("...and 5 more"), "got: {result}");
    assert!(
        result.contains("line 19"),
        "last shown line missing; got: {result}"
    );
    assert!(
        !result.contains("line 20"),
        "line 21 must be hidden; got: {result}"
    );
}

/// Exactly 20 lines fit under the cap, so no overflow indicator appears.
#[test]
fn format_drained_events_exactly_20_no_overflow_indicator() {
    let mut lines: Vec<String> = Vec::with_capacity(20);
    for i in 0..20 {
        lines.push(format!("line {i}"));
    }
    let result = format_drained_events(lines);
    assert!(
        !result.contains("...and"),
        "must not show overflow when exactly 20; got: {result}"
    );
}
|
||||
|
||||
// -- bot_name / system prompt -------------------------------------------
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -19,6 +19,31 @@ use super::super::verification::check_sender_verified;
|
||||
|
||||
use super::handle_message;
|
||||
|
||||
/// Evaluate a `switch <arg>` command against the live project store.
|
||||
///
|
||||
/// Reads valid project names from the store at call time so newly added
|
||||
/// projects are visible without a bot restart. Returns the reply text.
|
||||
pub(super) async fn eval_switch_command(
|
||||
arg: &str,
|
||||
active_project: &tokio::sync::RwLock<String>,
|
||||
store: &tokio::sync::RwLock<
|
||||
std::collections::BTreeMap<String, crate::service::gateway::config::ProjectEntry>,
|
||||
>,
|
||||
) -> String {
|
||||
let projects: Vec<String> = store.read().await.keys().cloned().collect();
|
||||
if arg.is_empty() {
|
||||
let available = projects.join(", ");
|
||||
format!("Usage: `switch <project>`. Available projects: {available}")
|
||||
} else if projects.iter().any(|p| p == arg) {
|
||||
*active_project.write().await = arg.to_string();
|
||||
crate::crdt_state::write_gateway_active_project(arg);
|
||||
format!("Switched to project **{arg}**.")
|
||||
} else {
|
||||
let available = projects.join(", ");
|
||||
format!("Unknown project `{arg}`. Available: {available}")
|
||||
}
|
||||
}
|
||||
|
||||
pub(in crate::chat::transport::matrix::bot) async fn on_room_message(
|
||||
ev: OriginalSyncRoomMessageEvent,
|
||||
room: Room,
|
||||
@@ -193,7 +218,7 @@ pub(in crate::chat::transport::matrix::bot) async fn on_room_message(
|
||||
if ctx.is_gateway() {
|
||||
// Commands that are meaningful on the gateway itself (no project state needed).
|
||||
const GATEWAY_LOCAL_COMMANDS: &[&str] =
|
||||
&["help", "ambient", "reset", "switch", "all_status"];
|
||||
&["help", "ambient", "reset", "switch", "all_status", "new"];
|
||||
|
||||
let stripped = crate::chat::util::strip_bot_mention(
|
||||
&user_message,
|
||||
@@ -260,6 +285,49 @@ pub(in crate::chat::transport::matrix::bot) async fn on_room_message(
|
||||
// Gateway-local commands and freeform text fall through to normal handling below.
|
||||
}
|
||||
|
||||
// In gateway mode, handle the "new project <name> [--stack <stack>]" command
|
||||
// to bootstrap a project container and register it with the gateway.
|
||||
if ctx.is_gateway()
|
||||
&& let Some(cmd) = super::super::super::new_project::extract_new_project_command(
|
||||
&user_message,
|
||||
&ctx.services.bot_name,
|
||||
ctx.matrix_user_id.as_str(),
|
||||
)
|
||||
{
|
||||
slog!(
|
||||
"[matrix-bot] Handling new project command from {sender}: name={:?} stack={:?} git_url={:?} adopt_path={:?}",
|
||||
cmd.name,
|
||||
cmd.stack,
|
||||
cmd.git_url,
|
||||
cmd.adopt_path,
|
||||
);
|
||||
let response = if let Some(ref store) = ctx.gateway_projects_store {
|
||||
super::super::super::new_project::handle_new_project(
|
||||
&cmd.name,
|
||||
cmd.stack.as_deref(),
|
||||
cmd.git_url.as_deref(),
|
||||
cmd.git_token.as_deref(),
|
||||
cmd.host_path.as_deref(),
|
||||
cmd.adopt_path.as_deref(),
|
||||
store,
|
||||
&ctx.services.project_root,
|
||||
)
|
||||
.await
|
||||
} else {
|
||||
"Gateway projects store unavailable — cannot create project.".to_string()
|
||||
};
|
||||
let html = markdown_to_html(&response);
|
||||
if let Ok(msg_id) = ctx
|
||||
.transport
|
||||
.send_message(&room_id_str, &response, &html)
|
||||
.await
|
||||
&& let Ok(event_id) = msg_id.parse()
|
||||
{
|
||||
ctx.bot_sent_event_ids.lock().await.insert(event_id);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
// Check for bot-level commands (help, status, ambient, …) before invoking
|
||||
// the LLM. All commands are registered in commands.rs — no special-casing
|
||||
// needed here.
|
||||
@@ -529,16 +597,10 @@ pub(in crate::chat::transport::matrix::bot) async fn on_room_message(
|
||||
};
|
||||
|
||||
if cmd.eq_ignore_ascii_case("switch") {
|
||||
let response = if arg.is_empty() {
|
||||
let available = ctx.gateway_projects.join(", ");
|
||||
format!("Usage: `switch <project>`. Available projects: {available}")
|
||||
} else if ctx.gateway_projects.iter().any(|p| p == &arg) {
|
||||
*active_project.write().await = arg.clone();
|
||||
crate::crdt_state::write_gateway_active_project(&arg);
|
||||
format!("Switched to project **{arg}**.")
|
||||
let response = if let Some(ref store) = ctx.gateway_projects_store {
|
||||
eval_switch_command(&arg, active_project, store).await
|
||||
} else {
|
||||
let available = ctx.gateway_projects.join(", ");
|
||||
format!("Unknown project `{arg}`. Available: {available}")
|
||||
"Switch is unavailable: project store not initialised.".to_string()
|
||||
};
|
||||
let html = markdown_to_html(&response);
|
||||
if let Ok(msg_id) = ctx
|
||||
@@ -608,9 +670,133 @@ pub(in crate::chat::transport::matrix::bot) async fn on_room_message(
|
||||
return;
|
||||
}
|
||||
|
||||
// Spawn a separate task so the Matrix sync loop is not blocked while we
|
||||
// wait for the LLM response (which can take several seconds).
|
||||
tokio::spawn(async move {
|
||||
handle_message(room_id_str, incoming_room_id, ctx, sender, user_message).await;
|
||||
});
|
||||
// "stop" — cancel the running LLM turn for this session and clear pending queue.
|
||||
{
|
||||
let stripped = crate::chat::util::strip_bot_mention(
|
||||
&user_message,
|
||||
&ctx.services.bot_name,
|
||||
ctx.matrix_user_id.as_str(),
|
||||
)
|
||||
.trim()
|
||||
.to_ascii_lowercase();
|
||||
if stripped == "stop" {
|
||||
slog!("[matrix-bot] stop command from {sender} for session {room_id_str}");
|
||||
ctx.services.chat_dispatcher.stop(&room_id_str);
|
||||
let msg = "Stopped.";
|
||||
let html = markdown_to_html(msg);
|
||||
if let Ok(msg_id) = ctx.transport.send_message(&room_id_str, msg, &html).await
|
||||
&& let Ok(event_id) = msg_id.parse()
|
||||
{
|
||||
ctx.bot_sent_event_ids.lock().await.insert(event_id);
|
||||
}
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// Hand the message to the protocol-agnostic dispatcher instead of spawning
|
||||
// directly. The dispatcher applies a coalesce window and a per-session
|
||||
// serial lock, preventing duplicate concurrent Timmy spawns.
|
||||
let ctx_for_factory = ctx.clone();
|
||||
let factory: crate::chat::dispatcher::SpawnFn = {
|
||||
let room_id_str2 = room_id_str.clone();
|
||||
std::sync::Arc::new(
|
||||
move |coalesced: String, cancel_rx: tokio::sync::watch::Receiver<bool>| {
|
||||
let room_id_str = room_id_str2.clone();
|
||||
let incoming_room_id = incoming_room_id.clone();
|
||||
let ctx = ctx_for_factory.clone();
|
||||
let sender = sender.clone();
|
||||
Box::pin(async move {
|
||||
handle_message(
|
||||
room_id_str,
|
||||
incoming_room_id,
|
||||
ctx,
|
||||
sender,
|
||||
coalesced,
|
||||
cancel_rx,
|
||||
)
|
||||
.await;
|
||||
})
|
||||
},
|
||||
)
|
||||
};
|
||||
ctx.services
|
||||
.chat_dispatcher
|
||||
.submit(room_id_str, user_message, factory);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Tests
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::eval_switch_command;
    use crate::service::gateway::config::ProjectEntry;
    use std::collections::BTreeMap;
    use tokio::sync::RwLock;

    /// Builds a `ProjectEntry` whose only (optionally) populated field is `url`.
    fn project_entry(url: Option<&str>) -> ProjectEntry {
        ProjectEntry {
            url: url.map(str::to_string),
            auth_token: None,
            ssh_port: None,
            host_path: None,
        }
    }

    /// Regression test: `switch` must consult the live project store rather
    /// than a snapshot taken at startup.
    ///
    /// The store starts empty, a project is inserted afterwards, and the
    /// command is expected to see it — guarding against the stale
    /// `gateway_projects` Vec that hid newly added projects until a restart.
    #[tokio::test]
    async fn switch_reads_live_store_after_runtime_insert() {
        let active = RwLock::new(String::from("huskies"));
        let store: RwLock<BTreeMap<String, ProjectEntry>> = RwLock::new(BTreeMap::new());

        // Nothing registered yet, so the project is unknown.
        let resp = eval_switch_command("robot-studio", &active, &store).await;
        assert!(
            resp.contains("Unknown project"),
            "empty store should not find robot-studio: {resp}"
        );

        // Register the project while "running" — no restart involved.
        {
            let mut projects = store.write().await;
            projects.insert(
                String::from("robot-studio"),
                project_entry(Some("http://localhost:3002")),
            );
        }

        // The same store now contains the project, so the switch succeeds.
        let resp = eval_switch_command("robot-studio", &active, &store).await;
        assert_eq!(
            resp, "Switched to project **robot-studio**.",
            "live store insert must be visible without restart: {resp}"
        );
        assert_eq!(
            *active.read().await,
            "robot-studio",
            "active project must be updated after switch"
        );
    }

    /// An empty argument yields a usage message that names the projects
    /// currently present in the store.
    #[tokio::test]
    async fn switch_empty_arg_lists_available_projects() {
        let active = RwLock::new(String::from("huskies"));
        let mut seeded = BTreeMap::new();
        seeded.insert(String::from("huskies"), project_entry(None));
        let store: RwLock<BTreeMap<String, ProjectEntry>> = RwLock::new(seeded);

        let resp = eval_switch_command("", &active, &store).await;
        assert!(
            resp.contains("Usage:"),
            "empty arg should show usage: {resp}"
        );
        assert!(
            resp.contains("huskies"),
            "usage should list available projects: {resp}"
        );
    }
}
|
||||
|
||||
@@ -150,6 +150,7 @@ mod tests {
|
||||
pending_perm_replies: Arc::new(TokioMutex::new(HashMap::new())),
|
||||
permission_timeout_secs: 120,
|
||||
status: Arc::new(crate::service::status::StatusBroadcaster::new()),
|
||||
chat_dispatcher: Arc::new(crate::chat::dispatcher::ChatDispatcher::new(1_500)),
|
||||
});
|
||||
(services, perm_tx)
|
||||
}
|
||||
|
||||
@@ -28,8 +28,14 @@ pub async fn run_bot(
|
||||
watcher_tx: tokio::sync::broadcast::Sender<crate::io::watcher::WatcherEvent>,
|
||||
shutdown_rx: watch::Receiver<Option<crate::rebuild::ShutdownReason>>,
|
||||
gateway_active_project: Option<Arc<RwLock<String>>>,
|
||||
gateway_projects: Vec<String>,
|
||||
gateway_project_urls: std::collections::BTreeMap<String, String>,
|
||||
gateway_projects_store: Option<
|
||||
Arc<
|
||||
RwLock<
|
||||
std::collections::BTreeMap<String, crate::service::gateway::config::ProjectEntry>,
|
||||
>,
|
||||
>,
|
||||
>,
|
||||
timer_store: Arc<TimerStore>,
|
||||
gateway_event_rx: Option<
|
||||
tokio::sync::broadcast::Receiver<crate::service::gateway::GatewayStatusEvent>,
|
||||
@@ -297,93 +303,11 @@ pub async fn run_bot(
|
||||
);
|
||||
}
|
||||
|
||||
// Subscribe to pipeline stage transitions and buffer compact audit lines
|
||||
// between Timmy's turns. Replay events (before == after stage label) are
|
||||
// silently dropped — only real transitions are recorded.
|
||||
let pending_pipeline_events: Arc<TokioMutex<Vec<String>>> =
|
||||
Arc::new(TokioMutex::new(Vec::new()));
|
||||
{
|
||||
use crate::pipeline_state::{format_audit_entry, stage_label, subscribe_transitions};
|
||||
let mut rx = subscribe_transitions();
|
||||
let buf = Arc::clone(&pending_pipeline_events);
|
||||
tokio::spawn(async move {
|
||||
loop {
|
||||
match rx.recv().await {
|
||||
Ok(fired) => {
|
||||
if stage_label(&fired.before) == stage_label(&fired.after) {
|
||||
continue;
|
||||
}
|
||||
let line = format_audit_entry(&fired);
|
||||
buf.lock().await.push(line);
|
||||
}
|
||||
Err(tokio::sync::broadcast::error::RecvError::Lagged(n)) => {
|
||||
slog!("[matrix-bot] pipeline event buffer lagged by {n} events");
|
||||
}
|
||||
Err(tokio::sync::broadcast::error::RecvError::Closed) => break,
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// Subscribe to gateway-side status events and buffer compact audit lines for
|
||||
// the LLM context.
|
||||
//
|
||||
// Investigation log (story 1078) — hypotheses ruled out:
|
||||
// (A) gateway_event_rx is None: impossible — spawn_gateway_bot always passes
|
||||
// Some(state.event_tx.clone()) in gateway mode (gateway/mod.rs:130).
|
||||
// (B) recv() never returns: buf task uses the ORIGINAL event_rx (subscribed
|
||||
// before Matrix init) so any events buffered during init are visible;
|
||||
// future events arrive normally via the shared broadcast channel.
|
||||
// (C) Different Arc: buf and ctx.pending_gateway_events are both clones of
|
||||
// the same Arc<TokioMutex<Vec<String>>> — writes in the buf task are
|
||||
// immediately visible to handle_message.
|
||||
// (D) format_drained_events empty on non-empty input: the function is
|
||||
// pure/tested; the drain slog in handle_message now makes the count
|
||||
// observable so we can confirm it is non-zero when events arrive.
|
||||
//
|
||||
// Bug fixed here: previously the buffer task held `event_rx.resubscribe()`,
|
||||
// which starts at the *current tail* (next unsent message) and silently
|
||||
// discards every event that arrived during the Matrix login / room-join /
|
||||
// cross-signing phase (~5–30 s window). The forwarder now gets the
|
||||
// resubscribed receiver (only needs live events going forward); the buffer
|
||||
// task holds the original `event_rx` so it drains the init-window backlog
|
||||
// on first poll.
|
||||
let pending_gateway_events: Arc<TokioMutex<Vec<String>>> =
|
||||
Arc::new(TokioMutex::new(Vec::new()));
|
||||
let gateway_event_rx_for_forwarder = if let Some(event_rx) = gateway_event_rx {
|
||||
// The forwarder only needs live (future) events — resubscribe is fine.
|
||||
let forwarder_rx = event_rx.resubscribe();
|
||||
// Buffer task: hold the *original* receiver so init-window events are
|
||||
// not lost. Silently accumulate compact audit lines for Timmy's context.
|
||||
{
|
||||
use crate::service::gateway::polling::format_gateway_audit_line;
|
||||
let buf = Arc::clone(&pending_gateway_events);
|
||||
slog!("[matrix-bot] subscribed to gateway events; buffer task starting");
|
||||
tokio::spawn(async move {
|
||||
let mut rx = event_rx;
|
||||
loop {
|
||||
match rx.recv().await {
|
||||
Ok(event) => {
|
||||
slog!(
|
||||
"[matrix-bot] buffered audit line for project={} id={}",
|
||||
event.project,
|
||||
event.event.timestamp_ms()
|
||||
);
|
||||
let line = format_gateway_audit_line(&event.project, &event.event);
|
||||
buf.lock().await.push(line);
|
||||
}
|
||||
Err(tokio::sync::broadcast::error::RecvError::Lagged(n)) => {
|
||||
slog!("[matrix-bot] gateway event buffer lagged by {n} events");
|
||||
}
|
||||
Err(tokio::sync::broadcast::error::RecvError::Closed) => break,
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
Some(forwarder_rx)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
// The forwarder only needs live (future) events — resubscribe is fine.
|
||||
// Pipeline-transition context is now delivered to the LLM via
|
||||
// `assemble_prompt_context` (CRDT event log) rather than these in-memory
|
||||
// buffers, so the buffer tasks are gone; only the forwarder remains.
|
||||
let gateway_event_rx_for_forwarder = gateway_event_rx.map(|rx| rx.resubscribe());
|
||||
|
||||
let ctx = BotContext {
|
||||
services,
|
||||
@@ -397,10 +321,8 @@ pub async fn run_bot(
|
||||
transport: Arc::clone(&transport),
|
||||
timer_store,
|
||||
gateway_active_project,
|
||||
gateway_projects,
|
||||
gateway_project_urls,
|
||||
pending_pipeline_events,
|
||||
pending_gateway_events,
|
||||
gateway_projects_store,
|
||||
handled_incoming_event_ids: Arc::new(TokioMutex::new(super::context::SeenEventIds::new(
|
||||
super::context::SEEN_EVENT_IDS_CAP,
|
||||
))),
|
||||
@@ -620,89 +542,4 @@ mod tests {
|
||||
assert_eq!(steps[2], 20);
|
||||
assert_eq!(steps[3], 40);
|
||||
}
|
||||
|
||||
/// Regression test (story 1078): gateway broadcast events must reach
|
||||
/// `pending_gateway_events` and produce an `audit ts=…` line in the
|
||||
/// `format_drained_events` output that is prepended to Timmy's prompt.
|
||||
///
|
||||
/// The test spins up a mock `event_tx` broadcaster, sends one
|
||||
/// `StageTransition` event, lets the buffer task process it, drains the
|
||||
/// buffer, and asserts the result contains the expected audit prefix.
|
||||
#[tokio::test]
|
||||
async fn gateway_buffer_task_injects_audit_line_into_context() {
|
||||
use super::super::messages::format_drained_events;
|
||||
use crate::service::events::StoredEvent;
|
||||
use crate::service::gateway::GatewayStatusEvent;
|
||||
use crate::service::gateway::polling::format_gateway_audit_line;
|
||||
|
||||
let (event_tx, event_rx) = tokio::sync::broadcast::channel::<GatewayStatusEvent>(16);
|
||||
|
||||
// pending_gateway_events shared between buffer task and drain site.
|
||||
let pending: Arc<TokioMutex<Vec<String>>> = Arc::new(TokioMutex::new(Vec::new()));
|
||||
|
||||
// Spawn a minimal buffer task — same logic as run_bot uses.
|
||||
{
|
||||
let buf = Arc::clone(&pending);
|
||||
tokio::spawn(async move {
|
||||
let mut rx = event_rx;
|
||||
loop {
|
||||
match rx.recv().await {
|
||||
Ok(event) => {
|
||||
let line = format_gateway_audit_line(&event.project, &event.event);
|
||||
buf.lock().await.push(line);
|
||||
}
|
||||
Err(tokio::sync::broadcast::error::RecvError::Lagged(_)) => {}
|
||||
Err(tokio::sync::broadcast::error::RecvError::Closed) => break,
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// Send one stage-transition event, as a project node would.
|
||||
let evt = GatewayStatusEvent {
|
||||
project: "huskies".to_string(),
|
||||
event: StoredEvent::StageTransition {
|
||||
story_id: "42_story_feat".to_string(),
|
||||
story_name: String::new(),
|
||||
from_stage: "2_current".to_string(),
|
||||
to_stage: "3_qa".to_string(),
|
||||
timestamp_ms: 1_000_000,
|
||||
},
|
||||
};
|
||||
let receivers = event_tx.send(evt).unwrap_or(0);
|
||||
assert!(
|
||||
receivers > 0,
|
||||
"event must have at least one active receiver"
|
||||
);
|
||||
|
||||
// Wait for the buffer task to process the event.
|
||||
let deadline = std::time::Instant::now() + std::time::Duration::from_secs(2);
|
||||
loop {
|
||||
if !pending.lock().await.is_empty() {
|
||||
break;
|
||||
}
|
||||
assert!(
|
||||
std::time::Instant::now() < deadline,
|
||||
"buffer task did not receive the event within 2 s"
|
||||
);
|
||||
tokio::time::sleep(std::time::Duration::from_millis(10)).await;
|
||||
}
|
||||
|
||||
// Drain and format — mirrors what handle_message does.
|
||||
let lines: Vec<String> = pending.lock().await.drain(..).collect();
|
||||
let prefix = format_drained_events(lines);
|
||||
|
||||
assert!(
|
||||
prefix.contains("audit ts="),
|
||||
"prompt prefix must contain 'audit ts='; got: {prefix}"
|
||||
);
|
||||
assert!(
|
||||
prefix.contains("project=huskies"),
|
||||
"prompt prefix must name the project; got: {prefix}"
|
||||
);
|
||||
assert!(
|
||||
prefix.starts_with("<system-reminder>\n"),
|
||||
"prefix must open with <system-reminder>; got: {prefix}"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -17,6 +17,11 @@ pub(super) fn default_aggregated_notifications_enabled() -> bool {
|
||||
true
|
||||
}
|
||||
|
||||
/// Default coalesce window for the chat dispatcher (1 500 ms).
|
||||
pub(super) fn default_coalesce_window_ms() -> u64 {
|
||||
1_500
|
||||
}
|
||||
|
||||
pub(super) fn default_transport() -> String {
|
||||
"matrix".to_string()
|
||||
}
|
||||
@@ -187,4 +192,30 @@ pub struct BotConfig {
|
||||
/// Defaults to `true`.
|
||||
#[serde(default = "default_aggregated_notifications_enabled")]
|
||||
pub aggregated_notifications_enabled: bool,
|
||||
|
||||
/// Duration in milliseconds of the chat dispatcher's coalesce window.
|
||||
///
|
||||
/// Messages for the same session arriving within this window are
|
||||
/// concatenated into a single `claude -p` call. The window is a
|
||||
/// debounce: each new message extends the deadline by this duration.
|
||||
///
|
||||
/// Defaults to 1 500 ms (1.5 s).
|
||||
#[serde(default = "default_coalesce_window_ms")]
|
||||
pub coalesce_window_ms: u64,
|
||||
|
||||
/// Git `user.name` to inject into project containers created by `new project`.
|
||||
///
|
||||
/// Passed as `GIT_USER_NAME` to the container entrypoint so agents can commit
|
||||
/// code with the correct author identity. Falls back to the host's
|
||||
/// `git config user.name` when absent.
|
||||
#[serde(default)]
|
||||
pub git_user_name: Option<String>,
|
||||
|
||||
/// Git `user.email` to inject into project containers created by `new project`.
|
||||
///
|
||||
/// Passed as `GIT_USER_EMAIL` to the container entrypoint so agents can commit
|
||||
/// code with the correct author identity. Falls back to the host's
|
||||
/// `git config user.email` when absent.
|
||||
#[serde(default)]
|
||||
pub git_user_email: Option<String>,
|
||||
}
|
||||
|
||||
@@ -27,6 +27,8 @@ pub(crate) mod config;
|
||||
pub mod delete;
|
||||
/// htop-style agent monitor command — renders a live process table in Matrix.
|
||||
pub mod htop;
|
||||
/// `new project <name>` chat command — Phase 1 gateway project bootstrap.
|
||||
pub mod new_project;
|
||||
/// Rebuild command — triggers a server rebuild/restart via a bot command.
|
||||
pub mod rebuild;
|
||||
/// Reset command — handles `!reset` bot commands to restart the server state.
|
||||
@@ -79,8 +81,14 @@ pub fn spawn_bot(
|
||||
services: Arc<Services>,
|
||||
shutdown_rx: watch::Receiver<Option<ShutdownReason>>,
|
||||
gateway_active_project: Option<Arc<RwLock<String>>>,
|
||||
gateway_projects: Vec<String>,
|
||||
gateway_project_urls: std::collections::BTreeMap<String, String>,
|
||||
gateway_projects_store: Option<
|
||||
Arc<
|
||||
RwLock<
|
||||
std::collections::BTreeMap<String, crate::service::gateway::config::ProjectEntry>,
|
||||
>,
|
||||
>,
|
||||
>,
|
||||
timer_store: Arc<TimerStore>,
|
||||
gateway_event_rx: Option<
|
||||
tokio::sync::broadcast::Receiver<crate::service::gateway::GatewayStatusEvent>,
|
||||
@@ -120,8 +128,8 @@ pub fn spawn_bot(
|
||||
watcher_tx,
|
||||
shutdown_rx,
|
||||
gateway_active_project,
|
||||
gateway_projects,
|
||||
gateway_project_urls,
|
||||
gateway_projects_store,
|
||||
timer_store,
|
||||
gateway_event_rx,
|
||||
)
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -29,8 +29,11 @@ pub(super) async fn handle_llm_message(
|
||||
};
|
||||
|
||||
let bot_name = &ctx.services.bot_name;
|
||||
let event_ctx = crate::llm_session::assemble_prompt_context(
|
||||
resume_session_id.as_deref().unwrap_or(channel),
|
||||
);
|
||||
let prompt = format!(
|
||||
"[Your name is {bot_name}. Refer to yourself as {bot_name}, not Claude.]\n\n{user}: {user_message}"
|
||||
"{event_ctx}[Your name is {bot_name}. Refer to yourself as {bot_name}, not Claude.]\n\n{user}: {user_message}"
|
||||
);
|
||||
|
||||
let provider = ClaudeCodeProvider::new();
|
||||
|
||||
@@ -27,8 +27,10 @@ pub(super) async fn handle_llm_message(
|
||||
};
|
||||
|
||||
let bot_name = &ctx.services.bot_name;
|
||||
let event_ctx =
|
||||
crate::llm_session::assemble_prompt_context(resume_session_id.as_deref().unwrap_or(sender));
|
||||
let prompt = format!(
|
||||
"[Your name is {bot_name}. Refer to yourself as {bot_name}, not Claude.]\n\n{sender}: {user_message}"
|
||||
"{event_ctx}[Your name is {bot_name}. Refer to yourself as {bot_name}, not Claude.]\n\n{sender}: {user_message}"
|
||||
);
|
||||
|
||||
let provider = ClaudeCodeProvider::new();
|
||||
|
||||
@@ -310,6 +310,7 @@ mod tests {
|
||||
perm_rx: Arc::new(tokio::sync::Mutex::new(perm_rx)),
|
||||
pending_perm_replies: Arc::new(tokio::sync::Mutex::new(Default::default())),
|
||||
permission_timeout_secs: 120,
|
||||
chat_dispatcher: Arc::new(crate::chat::dispatcher::ChatDispatcher::new(1_500)),
|
||||
});
|
||||
Arc::new(WhatsAppWebhookContext {
|
||||
services,
|
||||
|
||||
@@ -0,0 +1,176 @@
|
||||
//! Read/write helpers for the `event_log` append-only list in the CRDT document.
|
||||
//!
|
||||
//! Every pipeline stage transition is appended as an [`EventLogEntryCrdt`][super::super::types::EventLogEntryCrdt]
|
||||
//! entry. Entries are never updated or tombstoned — the list is strictly grow-only.
|
||||
//! Monotonic sequencing is computed at write time while holding the CRDT lock,
|
||||
//! so `event_seq` values for a given sled are always contiguous and gap-free.
|
||||
|
||||
use bft_json_crdt::json_crdt::{JsonValue, *};
|
||||
use bft_json_crdt::op::ROOT_ID;
|
||||
use serde_json::json;
|
||||
|
||||
use super::super::state::{apply_and_persist, get_crdt};
|
||||
use super::super::types::EventLogEntryCrdt;
|
||||
|
||||
/// Marker stored in the `pipeline_event` field of a gap sentinel entry.
///
/// Gap sentinels are written when the event-log subscriber learns that the
/// broadcast channel dropped events (`RecvError::Lagged`). For such an entry
/// the `from_stage` / `to_stage` fields carry the inclusive EventId range
/// `[from, to]` of the dropped events, formatted as decimal strings.
pub const GAP_PIPELINE_EVENT: &str = "EventStreamGap";
|
||||
|
||||
/// Raw event log entry extracted from the CRDT document.
///
/// All fields are decoded to Rust primitives. Entries whose `event_seq` is
/// not a number, or whose `sled_id` is missing or empty, are silently dropped
/// by [`read_all_event_log_entries`]; the remaining fields fall back to an
/// empty string (or `0.0` for `timestamp`) when they cannot be decoded.
///
/// `Eq` is intentionally not derived because `timestamp` is an `f64`.
#[derive(Debug, Clone, PartialEq)]
pub struct EventLogEntryRaw {
    /// Monotonic sequence number for the recording sled (0-based).
    pub event_seq: u64,
    /// Hex-encoded Ed25519 public key of the sled that wrote this entry.
    pub sled_id: String,
    /// Unix timestamp (seconds) when the transition fired.
    pub timestamp: f64,
    /// Story ID of the work item that transitioned.
    pub story_id: String,
    /// Human-readable label of the stage before the transition.
    pub from_stage: String,
    /// Human-readable label of the stage after the transition.
    pub to_stage: String,
    /// String label of the `PipelineEvent` variant.
    pub pipeline_event: String,
}
|
||||
|
||||
/// Append a new event log entry to the CRDT, computing the monotonic `event_seq`
|
||||
/// atomically while the CRDT lock is held.
|
||||
///
|
||||
/// No-ops silently when the CRDT is not yet initialised.
|
||||
pub fn append_event_log_entry(
|
||||
sled_id: &str,
|
||||
timestamp: f64,
|
||||
story_id: &str,
|
||||
from_stage: &str,
|
||||
to_stage: &str,
|
||||
pipeline_event: &str,
|
||||
) {
|
||||
let Some(state_mutex) = get_crdt() else {
|
||||
return;
|
||||
};
|
||||
let Ok(mut state) = state_mutex.lock() else {
|
||||
return;
|
||||
};
|
||||
|
||||
// Count existing entries for this sled while holding the lock so the seq
|
||||
// is computed and used in the same critical section — no TOCTOU gap.
|
||||
let event_seq = state
|
||||
.crdt
|
||||
.doc
|
||||
.event_log
|
||||
.iter()
|
||||
.filter(|e| matches!(e.sled_id.view(), JsonValue::String(s) if s == sled_id))
|
||||
.count() as f64;
|
||||
|
||||
// Append after the last existing entry so the list stays in insertion order.
|
||||
// Inserting after ROOT_ID would place each entry at the front (RGA semantics),
|
||||
// reversing the sequence; inserting after the current tail preserves order.
|
||||
let total_len = state.crdt.doc.event_log.view().len();
|
||||
let after = if total_len > 0 {
|
||||
super::list_id_at(&state.crdt.doc.event_log, total_len - 1).unwrap_or(ROOT_ID)
|
||||
} else {
|
||||
ROOT_ID
|
||||
};
|
||||
|
||||
let entry: JsonValue = json!({
|
||||
"event_seq": event_seq,
|
||||
"sled_id": sled_id,
|
||||
"timestamp": timestamp,
|
||||
"story_id": story_id,
|
||||
"from_stage": from_stage,
|
||||
"to_stage": to_stage,
|
||||
"pipeline_event": pipeline_event,
|
||||
})
|
||||
.into();
|
||||
|
||||
apply_and_persist(&mut state, |s| s.crdt.doc.event_log.insert(after, entry));
|
||||
}
|
||||
|
||||
/// Append an `EventStreamGap` sentinel entry to the CRDT event log.
|
||||
///
|
||||
/// Called when the event-log broadcast subscriber detects that the channel
|
||||
/// dropped events (`RecvError::Lagged`). `from_id` and `to_id` are the
|
||||
/// logical sequence numbers (in the per-sled event stream) of the first and
|
||||
/// last dropped events respectively. The sentinel itself also consumes one
|
||||
/// CRDT `event_seq` slot so the monotonic counter remains contiguous across
|
||||
/// the gap.
|
||||
pub fn append_gap_log_entry(sled_id: &str, from_id: u64, to_id: u64) {
|
||||
let timestamp = chrono::Utc::now().timestamp() as f64;
|
||||
append_event_log_entry(
|
||||
sled_id,
|
||||
timestamp,
|
||||
"",
|
||||
&from_id.to_string(),
|
||||
&to_id.to_string(),
|
||||
GAP_PIPELINE_EVENT,
|
||||
);
|
||||
}
|
||||
|
||||
/// Read all event log entries from the CRDT document.
|
||||
///
|
||||
/// Entries with a missing or empty `sled_id` are silently skipped.
|
||||
/// Order reflects CRDT insertion order (RGA list semantics).
|
||||
pub fn read_all_event_log_entries() -> Vec<EventLogEntryRaw> {
|
||||
let Some(state_mutex) = get_crdt() else {
|
||||
return Vec::new();
|
||||
};
|
||||
let Ok(state) = state_mutex.lock() else {
|
||||
return Vec::new();
|
||||
};
|
||||
state
|
||||
.crdt
|
||||
.doc
|
||||
.event_log
|
||||
.iter()
|
||||
.filter_map(extract_entry)
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Convert a CRDT event log entry to its read-side representation.
|
||||
fn extract_entry(e: &EventLogEntryCrdt) -> Option<EventLogEntryRaw> {
|
||||
let event_seq = match e.event_seq.view() {
|
||||
JsonValue::Number(n) => n as u64,
|
||||
_ => return None,
|
||||
};
|
||||
let sled_id = match e.sled_id.view() {
|
||||
JsonValue::String(s) if !s.is_empty() => s,
|
||||
_ => return None,
|
||||
};
|
||||
let timestamp = match e.timestamp.view() {
|
||||
JsonValue::Number(n) => n,
|
||||
_ => 0.0,
|
||||
};
|
||||
let story_id = match e.story_id.view() {
|
||||
JsonValue::String(s) => s,
|
||||
_ => String::new(),
|
||||
};
|
||||
let from_stage = match e.from_stage.view() {
|
||||
JsonValue::String(s) => s,
|
||||
_ => String::new(),
|
||||
};
|
||||
let to_stage = match e.to_stage.view() {
|
||||
JsonValue::String(s) => s,
|
||||
_ => String::new(),
|
||||
};
|
||||
let pipeline_event = match e.pipeline_event.view() {
|
||||
JsonValue::String(s) => s,
|
||||
_ => String::new(),
|
||||
};
|
||||
Some(EventLogEntryRaw {
|
||||
event_seq,
|
||||
sled_id,
|
||||
timestamp,
|
||||
story_id,
|
||||
from_stage,
|
||||
to_stage,
|
||||
pipeline_event,
|
||||
})
|
||||
}
|
||||
@@ -0,0 +1,334 @@
|
||||
//! Read/write helpers for the `llm_sessions` LWW-map collection, including the
|
||||
//! atomic `assemble_and_advance_session` helper used by the Matrix bot.
|
||||
//!
|
||||
//! LLM sessions are keyed by `session_id` (typically a Matrix room ID) and track
|
||||
//! per-sled high-water marks so that `assemble_and_advance_session` can inject
|
||||
//! only events the LLM has not yet seen and advance the marks atomically within
|
||||
//! a single CRDT lock acquisition.
|
||||
|
||||
use std::collections::{BTreeMap, BTreeSet};
|
||||
|
||||
use bft_json_crdt::json_crdt::{JsonValue, *};
|
||||
use bft_json_crdt::op::ROOT_ID;
|
||||
use serde_json::json;
|
||||
|
||||
use super::super::state::{apply_and_persist, get_crdt, rebuild_llm_session_index};
|
||||
use super::super::types::{LlmSessionCrdt, LlmSessionView, ScopeFilter};
|
||||
use super::event_log::GAP_PIPELINE_EVENT;
|
||||
|
||||
/// Write or upsert an LLM session entry keyed by `session_id`.
|
||||
///
|
||||
/// Creates a new entry if `session_id` is not yet present; updates
|
||||
/// `persona_name` and `scope` on an existing entry. The `high_water`
|
||||
/// register is not touched by this function — use `assemble_and_advance_session`
|
||||
/// to advance it atomically.
|
||||
///
|
||||
/// The `scope` string must be in wire form: `"all"` for [`ScopeFilter::All`]
|
||||
/// or `"sleds:hex1,hex2"` for [`ScopeFilter::Sleds`].
|
||||
pub fn write_llm_session(session_id: &str, persona_name: &str, scope: &str) {
|
||||
let Some(state_mutex) = get_crdt() else {
|
||||
return;
|
||||
};
|
||||
let Ok(mut state) = state_mutex.lock() else {
|
||||
return;
|
||||
};
|
||||
|
||||
if let Some(&idx) = state.llm_session_index.get(session_id) {
|
||||
apply_and_persist(&mut state, |s| {
|
||||
s.crdt.doc.llm_sessions[idx]
|
||||
.persona_name
|
||||
.set(persona_name.to_string())
|
||||
});
|
||||
apply_and_persist(&mut state, |s| {
|
||||
s.crdt.doc.llm_sessions[idx].scope.set(scope.to_string())
|
||||
});
|
||||
} else {
|
||||
let entry: JsonValue = json!({
|
||||
"session_id": session_id,
|
||||
"persona_name": persona_name,
|
||||
"scope": scope,
|
||||
"high_water": "{}",
|
||||
})
|
||||
.into();
|
||||
apply_and_persist(&mut state, |s| {
|
||||
s.crdt.doc.llm_sessions.insert(ROOT_ID, entry)
|
||||
});
|
||||
state.llm_session_index = rebuild_llm_session_index(&state.crdt);
|
||||
}
|
||||
}
|
||||
|
||||
/// Read a single LLM session entry by `session_id`.
|
||||
pub fn read_llm_session(session_id: &str) -> Option<LlmSessionView> {
|
||||
let state_mutex = get_crdt()?;
|
||||
let state = state_mutex.lock().ok()?;
|
||||
let &idx = state.llm_session_index.get(session_id)?;
|
||||
extract_llm_session_view(&state.crdt.doc.llm_sessions[idx])
|
||||
}
|
||||
|
||||
/// Atomically read new event-log entries for `session_id` past the stored
|
||||
/// high-water marks, render them as a block of audit lines, and advance the
|
||||
/// marks to prevent double-injection on the next call.
|
||||
///
|
||||
/// The set of sleds whose events are collected is determined by the session's
|
||||
/// [`ScopeFilter`]:
|
||||
/// - [`ScopeFilter::All`]: events from every sled present in the event log are
|
||||
/// included — this is the gateway-level persona default that gives a full
|
||||
/// cross-sled view.
|
||||
/// - [`ScopeFilter::Sleds`]: only events whose `sled_id` is in the stored set
|
||||
/// are included. When the stored set is empty (legacy `"single-sled"` rows or
|
||||
/// freshly created sessions with no explicit scope), the local node's sled ID
|
||||
/// is used as the sole member, preserving prior single-sled behaviour.
|
||||
///
|
||||
/// Returns an empty `Vec` when there are no new events or the CRDT is not
|
||||
/// initialised.
|
||||
pub fn assemble_and_advance_session(session_id: &str) -> Vec<String> {
|
||||
let local_sled_id = crate::crdt_state::our_node_id().unwrap_or_default();
|
||||
|
||||
let Some(state_mutex) = get_crdt() else {
|
||||
return Vec::new();
|
||||
};
|
||||
let Ok(mut state) = state_mutex.lock() else {
|
||||
return Vec::new();
|
||||
};
|
||||
|
||||
// Determine the session's scope filter and current high-water map.
|
||||
let (scope_filter, current_high_water) = match state.llm_session_index.get(session_id).copied()
|
||||
{
|
||||
Some(idx) => {
|
||||
let filter = parse_scope(&state.crdt.doc.llm_sessions[idx], &local_sled_id);
|
||||
let hw = parse_high_water(&state.crdt.doc.llm_sessions[idx]);
|
||||
(filter, hw)
|
||||
}
|
||||
None => {
|
||||
// New session with no stored entry: default to local sled only.
|
||||
let mut ids = BTreeSet::new();
|
||||
if !local_sled_id.is_empty() {
|
||||
ids.insert(local_sled_id.clone());
|
||||
}
|
||||
(ScopeFilter::Sleds(ids), BTreeMap::new())
|
||||
}
|
||||
};
|
||||
|
||||
// Build the set of sled IDs to collect events from.
|
||||
let target_sleds: BTreeSet<String> = match &scope_filter {
|
||||
ScopeFilter::All => {
|
||||
// Collect every unique sled_id present in the event log at this moment
|
||||
// (live, not snapshotted — picks up newly adopted sleds automatically).
|
||||
state
|
||||
.crdt
|
||||
.doc
|
||||
.event_log
|
||||
.iter()
|
||||
.filter_map(|e| match e.sled_id.view() {
|
||||
JsonValue::String(s) if !s.is_empty() => Some(s),
|
||||
_ => None,
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
ScopeFilter::Sleds(ids) if ids.is_empty() => {
|
||||
// Empty set → legacy fallback: local sled only.
|
||||
if local_sled_id.is_empty() {
|
||||
return Vec::new();
|
||||
}
|
||||
std::iter::once(local_sled_id.clone()).collect()
|
||||
}
|
||||
ScopeFilter::Sleds(ids) => ids.clone(),
|
||||
};
|
||||
|
||||
if target_sleds.is_empty() {
|
||||
return Vec::new();
|
||||
}
|
||||
|
||||
// Collect new events from each target sled past its high-water mark.
|
||||
let mut new_events: Vec<(f64, String, String, String, String, String)> = state
|
||||
.crdt
|
||||
.doc
|
||||
.event_log
|
||||
.iter()
|
||||
.filter_map(|e| extract_new_event_multi(e, &target_sleds, ¤t_high_water))
|
||||
.collect();
|
||||
|
||||
if new_events.is_empty() {
|
||||
return Vec::new();
|
||||
}
|
||||
|
||||
// Sort by (sled_id, event_seq) for deterministic ordering.
|
||||
new_events.sort_by(|a, b| {
|
||||
a.1.cmp(&b.1)
|
||||
.then(a.0.partial_cmp(&b.0).unwrap_or(std::cmp::Ordering::Equal))
|
||||
});
|
||||
|
||||
// Advance the high-water mark for each sled that had new events.
|
||||
let mut new_high_water = current_high_water;
|
||||
for (seq, sled_id, ..) in &new_events {
|
||||
let entry = new_high_water.entry(sled_id.clone()).or_insert(0);
|
||||
if *seq as u64 > *entry {
|
||||
*entry = *seq as u64;
|
||||
}
|
||||
}
|
||||
let new_hw_json = serde_json::to_string(&new_high_water).unwrap_or_else(|_| "{}".to_string());
|
||||
|
||||
// Upsert the session entry with the new high-water value.
|
||||
let idx_opt = state.llm_session_index.get(session_id).copied();
|
||||
if let Some(idx) = idx_opt {
|
||||
apply_and_persist(&mut state, |s| {
|
||||
s.crdt.doc.llm_sessions[idx]
|
||||
.high_water
|
||||
.set(new_hw_json.clone())
|
||||
});
|
||||
} else {
|
||||
let scope_str = scope_filter.to_scope_str();
|
||||
let entry: JsonValue = json!({
|
||||
"session_id": session_id,
|
||||
"persona_name": "",
|
||||
"scope": scope_str,
|
||||
"high_water": new_hw_json,
|
||||
})
|
||||
.into();
|
||||
apply_and_persist(&mut state, |s| {
|
||||
s.crdt.doc.llm_sessions.insert(ROOT_ID, entry)
|
||||
});
|
||||
state.llm_session_index = rebuild_llm_session_index(&state.crdt);
|
||||
}
|
||||
|
||||
// Observability: log event-log size and gap count across the session's
|
||||
// target sleds (the scope actually assembled for this session).
|
||||
let total_entries = state
|
||||
.crdt
|
||||
.doc
|
||||
.event_log
|
||||
.iter()
|
||||
.filter(|e| matches!(e.sled_id.view(), JsonValue::String(s) if target_sleds.contains(&s)))
|
||||
.count();
|
||||
let gap_count = state
|
||||
.crdt
|
||||
.doc
|
||||
.event_log
|
||||
.iter()
|
||||
.filter(|e| {
|
||||
matches!(e.sled_id.view(), JsonValue::String(s) if target_sleds.contains(&s))
|
||||
&& matches!(e.pipeline_event.view(), JsonValue::String(s) if s == GAP_PIPELINE_EVENT)
|
||||
})
|
||||
.count();
|
||||
crate::slog!(
|
||||
"[event-log] assemble session={session_id} sled_entries={total_entries} gap_count={gap_count}"
|
||||
);
|
||||
|
||||
// Render each new event as a compact audit line; gap sentinels get a
|
||||
// human-readable message so the LLM is never presented with raw field data.
|
||||
new_events
|
||||
.into_iter()
|
||||
.map(
|
||||
|(_, sled_id, story_id, from_stage, to_stage, pipeline_event)| {
|
||||
if pipeline_event == GAP_PIPELINE_EVENT {
|
||||
format!("events between {from_stage} and {to_stage} were dropped")
|
||||
} else {
|
||||
format!(
|
||||
"pipeline_event sled_id=\"{sled_id}\" story_id=\"{story_id}\" \
|
||||
from=\"{from_stage}\" to=\"{to_stage}\" event=\"{pipeline_event}\""
|
||||
)
|
||||
}
|
||||
},
|
||||
)
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Decode the high-water JSON string from an `LlmSessionCrdt` entry.
|
||||
fn parse_high_water(entry: &LlmSessionCrdt) -> BTreeMap<String, u64> {
|
||||
match entry.high_water.view() {
|
||||
JsonValue::String(s) if !s.is_empty() && s != "{}" => {
|
||||
serde_json::from_str(&s).unwrap_or_default()
|
||||
}
|
||||
_ => BTreeMap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Parse the scope filter from an `LlmSessionCrdt` entry, falling back to
|
||||
/// a single-element set containing `local_sled_id` for legacy / empty scope strings.
|
||||
fn parse_scope(entry: &LlmSessionCrdt, local_sled_id: &str) -> ScopeFilter {
|
||||
let raw = match entry.scope.view() {
|
||||
JsonValue::String(s) => s,
|
||||
_ => String::new(),
|
||||
};
|
||||
let filter = ScopeFilter::from_scope_str(&raw);
|
||||
// For a Sleds filter with an empty set (legacy "single-sled" or ""),
|
||||
// fall back to the local sled.
|
||||
if let ScopeFilter::Sleds(ref ids) = filter
|
||||
&& ids.is_empty()
|
||||
&& !local_sled_id.is_empty()
|
||||
{
|
||||
let mut fallback = BTreeSet::new();
|
||||
fallback.insert(local_sled_id.to_string());
|
||||
return ScopeFilter::Sleds(fallback);
|
||||
}
|
||||
filter
|
||||
}
|
||||
|
||||
/// Extract one event log entry if its `sled_id` is in `target_sleds` and its
|
||||
/// `event_seq` is strictly greater than the matching high-water value (or no
|
||||
/// high-water has been recorded yet for that sled).
|
||||
///
|
||||
/// Returns `(event_seq, sled_id, story_id, from_stage, to_stage, pipeline_event)`.
|
||||
fn extract_new_event_multi(
|
||||
e: &crate::crdt_state::types::EventLogEntryCrdt,
|
||||
target_sleds: &BTreeSet<String>,
|
||||
high_water: &BTreeMap<String, u64>,
|
||||
) -> Option<(f64, String, String, String, String, String)> {
|
||||
let sled_id = match e.sled_id.view() {
|
||||
JsonValue::String(s) if !s.is_empty() && target_sleds.contains(&s) => s,
|
||||
_ => return None,
|
||||
};
|
||||
let event_seq = match e.event_seq.view() {
|
||||
JsonValue::Number(n) => n,
|
||||
_ => return None,
|
||||
};
|
||||
let last_seen = high_water.get(&sled_id).copied();
|
||||
if last_seen.is_some_and(|last| event_seq as u64 <= last) {
|
||||
return None;
|
||||
}
|
||||
let story_id = match e.story_id.view() {
|
||||
JsonValue::String(s) => s,
|
||||
_ => String::new(),
|
||||
};
|
||||
let from_stage = match e.from_stage.view() {
|
||||
JsonValue::String(s) => s,
|
||||
_ => String::new(),
|
||||
};
|
||||
let to_stage = match e.to_stage.view() {
|
||||
JsonValue::String(s) => s,
|
||||
_ => String::new(),
|
||||
};
|
||||
let pipeline_event = match e.pipeline_event.view() {
|
||||
JsonValue::String(s) => s,
|
||||
_ => String::new(),
|
||||
};
|
||||
Some((
|
||||
event_seq,
|
||||
sled_id,
|
||||
story_id,
|
||||
from_stage,
|
||||
to_stage,
|
||||
pipeline_event,
|
||||
))
|
||||
}
|
||||
|
||||
/// Convert a CRDT LLM session entry into its read-only view representation.
|
||||
pub(super) fn extract_llm_session_view(entry: &LlmSessionCrdt) -> Option<LlmSessionView> {
|
||||
let session_id = match entry.session_id.view() {
|
||||
JsonValue::String(s) if !s.is_empty() => s,
|
||||
_ => return None,
|
||||
};
|
||||
let persona_name = match entry.persona_name.view() {
|
||||
JsonValue::String(s) => s,
|
||||
_ => String::new(),
|
||||
};
|
||||
let local_sled_id = crate::crdt_state::our_node_id().unwrap_or_default();
|
||||
let scope_filter = parse_scope(entry, &local_sled_id);
|
||||
let high_water = parse_high_water(entry);
|
||||
Some(LlmSessionView {
|
||||
session_id,
|
||||
persona_name,
|
||||
scope_filter,
|
||||
high_water,
|
||||
})
|
||||
}
|
||||
@@ -14,7 +14,9 @@ use bft_json_crdt::op::OpId;
|
||||
|
||||
mod active_agents;
|
||||
mod agent_throttle;
|
||||
mod event_log;
|
||||
mod gateway_projects;
|
||||
mod llm_sessions;
|
||||
mod merge_jobs;
|
||||
mod test_jobs;
|
||||
mod tokens;
|
||||
@@ -28,9 +30,14 @@ pub use active_agents::{
|
||||
pub use agent_throttle::{
|
||||
delete_agent_throttle, read_agent_throttle, read_all_agent_throttles, write_agent_throttle,
|
||||
};
|
||||
pub use event_log::{
|
||||
EventLogEntryRaw, GAP_PIPELINE_EVENT, append_event_log_entry, append_gap_log_entry,
|
||||
read_all_event_log_entries,
|
||||
};
|
||||
pub use gateway_projects::{
|
||||
delete_gateway_project, read_all_gateway_projects, read_gateway_project, write_gateway_project,
|
||||
};
|
||||
pub use llm_sessions::{assemble_and_advance_session, read_llm_session, write_llm_session};
|
||||
pub use merge_jobs::{delete_merge_job, read_all_merge_jobs, read_merge_job, write_merge_job};
|
||||
pub use test_jobs::{delete_test_job, read_all_test_jobs, read_test_job, write_test_job};
|
||||
pub use tokens::{delete_token_usage, read_all_token_usage, read_token_usage, write_token_usage};
|
||||
|
||||
@@ -28,12 +28,14 @@ mod write;
|
||||
|
||||
pub use gateway_config::{read_gateway_active_project, write_gateway_active_project};
|
||||
pub use lww_maps::{
|
||||
delete_active_agent, delete_agent_throttle, delete_gateway_project, delete_merge_job,
|
||||
delete_test_job, delete_token_usage, read_active_agent, read_agent_throttle,
|
||||
read_all_active_agents, read_all_agent_throttles, read_all_gateway_projects,
|
||||
read_all_merge_jobs, read_all_test_jobs, read_all_token_usage, read_gateway_project,
|
||||
read_merge_job, read_test_job, read_token_usage, write_active_agent, write_agent_throttle,
|
||||
write_gateway_project, write_merge_job, write_test_job, write_token_usage,
|
||||
EventLogEntryRaw, GAP_PIPELINE_EVENT, append_event_log_entry, append_gap_log_entry,
|
||||
assemble_and_advance_session, delete_active_agent, delete_agent_throttle,
|
||||
delete_gateway_project, delete_merge_job, delete_test_job, delete_token_usage,
|
||||
read_active_agent, read_agent_throttle, read_all_active_agents, read_all_agent_throttles,
|
||||
read_all_event_log_entries, read_all_gateway_projects, read_all_merge_jobs, read_all_test_jobs,
|
||||
read_all_token_usage, read_gateway_project, read_llm_session, read_merge_job, read_test_job,
|
||||
read_token_usage, write_active_agent, write_agent_throttle, write_gateway_project,
|
||||
write_llm_session, write_merge_job, write_test_job, write_token_usage,
|
||||
};
|
||||
pub use ops::{all_ops_json, apply_remote_op, ops_since, our_vector_clock, subscribe_ops};
|
||||
pub use presence::{
|
||||
@@ -45,19 +47,21 @@ pub use read::{
|
||||
dep_is_archived_crdt, dep_is_done_crdt, dump_crdt_state, evict_item, is_tombstoned,
|
||||
read_all_items, read_item, tombstoned_ids,
|
||||
};
|
||||
pub(crate) use state::flush_persistence;
|
||||
pub use state::{init, subscribe};
|
||||
pub use types::{
|
||||
ActiveAgentCrdt, ActiveAgentView, AgentThrottleCrdt, AgentThrottleView, CrdtEvent, EpicId,
|
||||
GatewayConfigCrdt, GatewayProjectCrdt, GatewayProjectView, MergeJobCrdt, MergeJobView,
|
||||
NodePresenceCrdt, NodePresenceView, PipelineDoc, PipelineItemCrdt, PipelineItemView,
|
||||
TestJobCrdt, TestJobView, TokenUsageCrdt, TokenUsageView, WorkItem,
|
||||
EventLogEntryCrdt, GatewayConfigCrdt, GatewayProjectCrdt, GatewayProjectView, LlmSessionCrdt,
|
||||
LlmSessionView, MergeJobCrdt, MergeJobView, NodePresenceCrdt, NodePresenceView, PipelineDoc,
|
||||
PipelineItemCrdt, PipelineItemView, ScopeFilter, TestJobCrdt, TestJobView, TokenUsageCrdt,
|
||||
TokenUsageView, WorkItem,
|
||||
};
|
||||
pub use write::{
|
||||
bump_retry_count, migrate_legacy_stage_strings, migrate_merge_job, migrate_names_from_slugs,
|
||||
migrate_node_claims_to_agent_claims, migrate_story_ids_to_numeric, name_from_story_id,
|
||||
purge_done_stage_merge_jobs, set_agent, set_depends_on, set_epic, set_item_type, set_name,
|
||||
set_origin, set_plan_state, set_qa_mode, set_resume_to, set_resume_to_raw, set_retry_count,
|
||||
write_item,
|
||||
migrate_node_claims_to_agent_claims, migrate_story_ids_to_numeric,
|
||||
migrate_zombie_pipeline_rows, name_from_story_id, purge_done_stage_merge_jobs, set_agent,
|
||||
set_depends_on, set_epic, set_item_type, set_name, set_origin, set_plan_state, set_qa_mode,
|
||||
set_resume_to, set_resume_to_raw, set_retry_count, write_item,
|
||||
};
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
|
||||
#![allow(unused_imports, dead_code)]
|
||||
use std::collections::HashMap;
|
||||
use std::sync::atomic::Ordering;
|
||||
|
||||
use super::hex;
|
||||
use bft_json_crdt::json_crdt::*;
|
||||
@@ -10,9 +11,10 @@ use tokio::sync::broadcast;
|
||||
|
||||
use super::VectorClock;
|
||||
use super::state::{
|
||||
SYNC_TX, all_ops_lock, apply_and_persist, emit_event, get_crdt, rebuild_active_agent_index,
|
||||
rebuild_agent_throttle_index, rebuild_index, rebuild_merge_job_index, rebuild_node_index,
|
||||
rebuild_test_job_index, rebuild_token_index, track_op, vector_clock_lock,
|
||||
PERSIST_PENDING, PersistMsg, SYNC_TX, all_ops_lock, apply_and_persist, emit_event, get_crdt,
|
||||
rebuild_active_agent_index, rebuild_agent_throttle_index, rebuild_index,
|
||||
rebuild_merge_job_index, rebuild_node_index, rebuild_test_job_index, rebuild_token_index,
|
||||
track_op, vector_clock_lock,
|
||||
};
|
||||
use super::types::{CrdtEvent, PipelineDoc};
|
||||
use crate::slog;
|
||||
@@ -116,9 +118,15 @@ pub fn apply_remote_op(op: SignedOp) -> bool {
|
||||
}
|
||||
|
||||
// Persist the op.
|
||||
if let Err(e) = state.persist_tx.send(op.clone()) {
|
||||
if state
|
||||
.persist_tx
|
||||
.send(PersistMsg::Op(Box::new(op.clone())))
|
||||
.is_ok()
|
||||
{
|
||||
PERSIST_PENDING.fetch_add(1, Ordering::Relaxed);
|
||||
} else {
|
||||
crate::slog_error!(
|
||||
"[crdt] Failed to send remote op to persist task: {e}; persist task may be dead. \
|
||||
"[crdt] Failed to send remote op to persist task; persist task may be dead. \
|
||||
In-memory state is now ahead of persisted state."
|
||||
);
|
||||
}
|
||||
|
||||
@@ -6,7 +6,9 @@ use std::collections::HashMap;
|
||||
use bft_json_crdt::json_crdt::*;
|
||||
use bft_json_crdt::op::{OpId, ROOT_ID};
|
||||
|
||||
use super::state::{all_ops_lock, apply_and_persist, get_crdt, rebuild_index};
|
||||
use std::sync::atomic::Ordering;
|
||||
|
||||
use super::state::{PERSIST_PENDING, all_ops_lock, apply_and_persist, get_crdt, rebuild_index};
|
||||
use super::types::{PipelineDoc, PipelineItemCrdt, PipelineItemView};
|
||||
|
||||
// ── Debug dump ───────────────────────────────────────────────────────
|
||||
@@ -44,6 +46,8 @@ pub struct CrdtStateDump {
|
||||
pub max_seq_in_list: u64,
|
||||
/// Count of ops in the ALL_OPS journal (persisted ops replayed at startup).
|
||||
pub persisted_ops_count: usize,
|
||||
/// Count of ops queued in the persistence channel not yet written to SQLite.
|
||||
pub pending_persist_ops_count: usize,
|
||||
pub items: Vec<CrdtItemDump>,
|
||||
}
|
||||
|
||||
@@ -61,6 +65,7 @@ pub fn dump_crdt_state(story_id_filter: Option<&str>) -> CrdtStateDump {
|
||||
let persisted_ops_count = all_ops_lock()
|
||||
.and_then(|m| m.lock().ok().map(|v| v.len()))
|
||||
.unwrap_or(0);
|
||||
let pending_persist_ops_count = PERSIST_PENDING.load(Ordering::Relaxed);
|
||||
|
||||
let Some(state_mutex) = get_crdt() else {
|
||||
return CrdtStateDump {
|
||||
@@ -69,6 +74,7 @@ pub fn dump_crdt_state(story_id_filter: Option<&str>) -> CrdtStateDump {
|
||||
total_ops_in_list: 0,
|
||||
max_seq_in_list: 0,
|
||||
persisted_ops_count,
|
||||
pending_persist_ops_count,
|
||||
items: Vec::new(),
|
||||
};
|
||||
};
|
||||
@@ -80,6 +86,7 @@ pub fn dump_crdt_state(story_id_filter: Option<&str>) -> CrdtStateDump {
|
||||
total_ops_in_list: 0,
|
||||
max_seq_in_list: 0,
|
||||
persisted_ops_count,
|
||||
pending_persist_ops_count,
|
||||
items: Vec::new(),
|
||||
};
|
||||
};
|
||||
@@ -179,6 +186,7 @@ pub fn dump_crdt_state(story_id_filter: Option<&str>) -> CrdtStateDump {
|
||||
total_ops_in_list,
|
||||
max_seq_in_list,
|
||||
persisted_ops_count,
|
||||
pending_persist_ops_count,
|
||||
items,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,11 +5,13 @@
|
||||
//! it to the live document, sends it to the persistence channel, and broadcasts
|
||||
//! it to sync peers via [`super::SYNC_TX`].
|
||||
|
||||
use std::sync::atomic::Ordering;
|
||||
|
||||
use bft_json_crdt::json_crdt::JsonValue;
|
||||
use bft_json_crdt::op::Op;
|
||||
|
||||
use super::super::types::CrdtEvent;
|
||||
use super::{CrdtState, statics};
|
||||
use super::{CrdtState, init::PersistMsg, statics};
|
||||
|
||||
/// Create a CRDT op via `op_fn`, sign it, apply it, and send it to the
|
||||
/// persistence channel. The closure receives `&mut CrdtState` so it can
|
||||
@@ -21,7 +23,13 @@ where
|
||||
let raw_op = op_fn(state);
|
||||
let signed = raw_op.sign(&state.keypair);
|
||||
state.crdt.apply(signed.clone());
|
||||
if state.persist_tx.send(signed.clone()).is_err() {
|
||||
if state
|
||||
.persist_tx
|
||||
.send(PersistMsg::Op(Box::new(signed.clone())))
|
||||
.is_ok()
|
||||
{
|
||||
statics::PERSIST_PENDING.fetch_add(1, Ordering::Relaxed);
|
||||
} else {
|
||||
let op_type = if signed.inner.is_deleted {
|
||||
"Delete"
|
||||
} else {
|
||||
|
||||
@@ -113,3 +113,16 @@ pub(in crate::crdt_state) fn rebuild_gateway_project_index(
|
||||
}
|
||||
map
|
||||
}
|
||||
|
||||
/// Rebuild the session_id → llm_sessions list index.
|
||||
pub(in crate::crdt_state) fn rebuild_llm_session_index(
|
||||
crdt: &BaseCrdt<PipelineDoc>,
|
||||
) -> HashMap<String, usize> {
|
||||
let mut map = HashMap::new();
|
||||
for (i, entry) in crdt.doc.llm_sessions.iter().enumerate() {
|
||||
if let JsonValue::String(ref k) = entry.session_id.view() {
|
||||
map.insert(k.clone(), i);
|
||||
}
|
||||
}
|
||||
map
|
||||
}
|
||||
|
||||
@@ -8,25 +8,34 @@
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::path::Path;
|
||||
use std::sync::Mutex;
|
||||
use std::sync::atomic::Ordering;
|
||||
|
||||
use bft_json_crdt::json_crdt::{BaseCrdt, CrdtNode, JsonValue, SignedOp};
|
||||
use bft_json_crdt::keypair::{Ed25519KeyPair, make_keypair};
|
||||
use sqlx::SqlitePool;
|
||||
use sqlx::sqlite::SqliteConnectOptions;
|
||||
use tokio::sync::{broadcast, mpsc};
|
||||
use tokio::sync::{broadcast, mpsc, oneshot};
|
||||
|
||||
use super::super::VectorClock;
|
||||
use super::super::hex;
|
||||
use super::super::types::{CrdtEvent, PipelineDoc};
|
||||
use super::indices::{
|
||||
rebuild_active_agent_index, rebuild_agent_throttle_index, rebuild_gateway_project_index,
|
||||
rebuild_index, rebuild_merge_job_index, rebuild_node_index, rebuild_test_job_index,
|
||||
rebuild_token_index,
|
||||
rebuild_index, rebuild_llm_session_index, rebuild_merge_job_index, rebuild_node_index,
|
||||
rebuild_test_job_index, rebuild_token_index,
|
||||
};
|
||||
use super::statics::{ALL_OPS, CRDT_EVENT_TX, SYNC_TX, VECTOR_CLOCK};
|
||||
use super::statics::{ALL_OPS, CRDT_EVENT_TX, PERSIST_PENDING, SYNC_TX, VECTOR_CLOCK};
|
||||
use super::{CRDT_STATE, CrdtState};
|
||||
use crate::slog;
|
||||
|
||||
/// Message type for the persistence background channel.
///
/// Messages are consumed in send order by the background task spawned in
/// `init`, which is what lets `Flush` act as a drain barrier for earlier `Op`s.
|
||||
pub(crate) enum PersistMsg {
|
||||
/// Persist this op to SQLite.
// NOTE(review): the op is boxed, presumably to keep the enum small — confirm.
|
||||
Op(Box<SignedOp>),
|
||||
/// Drain: signal the sender after all preceding ops are committed.
/// The task replies by sending `()` on the enclosed oneshot sender.
|
||||
Flush(oneshot::Sender<()>),
|
||||
}
|
||||
|
||||
/// Initialise the CRDT state layer.
|
||||
///
|
||||
/// Opens the SQLite database, loads or creates a node keypair, replays any
|
||||
@@ -94,6 +103,7 @@ pub async fn init(db_path: &Path) -> Result<(), sqlx::Error> {
|
||||
let test_job_index = rebuild_test_job_index(&crdt);
|
||||
let agent_throttle_index = rebuild_agent_throttle_index(&crdt);
|
||||
let gateway_project_index = rebuild_gateway_project_index(&crdt);
|
||||
let llm_session_index = rebuild_llm_session_index(&crdt);
|
||||
|
||||
// Advance the top-level list clocks to the Lamport floor so that
|
||||
// list-level inserts don't re-emit low seq numbers.
|
||||
@@ -105,6 +115,7 @@ pub async fn init(db_path: &Path) -> Result<(), sqlx::Error> {
|
||||
crdt.doc.test_jobs.advance_seq(lamport_floor);
|
||||
crdt.doc.agent_throttle.advance_seq(lamport_floor);
|
||||
crdt.doc.gateway_projects.advance_seq(lamport_floor);
|
||||
crdt.doc.llm_sessions.advance_seq(lamport_floor);
|
||||
crdt.doc
|
||||
.gateway_config
|
||||
.active_project
|
||||
@@ -119,35 +130,46 @@ pub async fn init(db_path: &Path) -> Result<(), sqlx::Error> {
|
||||
);
|
||||
|
||||
// Spawn background persistence task.
|
||||
let (persist_tx, mut persist_rx) = mpsc::unbounded_channel::<SignedOp>();
|
||||
let (persist_tx, mut persist_rx) = mpsc::unbounded_channel::<PersistMsg>();
|
||||
|
||||
tokio::spawn(async move {
|
||||
while let Some(op) = persist_rx.recv().await {
|
||||
let op_json = match serde_json::to_string(&op) {
|
||||
Ok(j) => j,
|
||||
Err(e) => {
|
||||
slog!("[crdt] Failed to serialize op: {e}");
|
||||
continue;
|
||||
while let Some(msg) = persist_rx.recv().await {
|
||||
match msg {
|
||||
PersistMsg::Op(op) => {
|
||||
let op = *op;
|
||||
let op_json = match serde_json::to_string(&op) {
|
||||
Ok(j) => j,
|
||||
Err(e) => {
|
||||
slog!("[crdt] Failed to serialize op: {e}");
|
||||
PERSIST_PENDING.fetch_sub(1, Ordering::Relaxed);
|
||||
continue;
|
||||
}
|
||||
};
|
||||
let op_id = hex::encode(&op.id());
|
||||
let seq = op.inner.seq as i64;
|
||||
let now = chrono::Utc::now().to_rfc3339();
|
||||
|
||||
let result = sqlx::query(
|
||||
"INSERT INTO crdt_ops (op_id, seq, op_json, created_at) \
|
||||
VALUES (?1, ?2, ?3, ?4) \
|
||||
ON CONFLICT(op_id) DO NOTHING",
|
||||
)
|
||||
.bind(&op_id)
|
||||
.bind(seq)
|
||||
.bind(&op_json)
|
||||
.bind(&now)
|
||||
.execute(&pool)
|
||||
.await;
|
||||
|
||||
if let Err(e) = result {
|
||||
slog!("[crdt] Failed to persist op {}: {e}", &op_id[..12]);
|
||||
}
|
||||
PERSIST_PENDING.fetch_sub(1, Ordering::Relaxed);
|
||||
}
|
||||
PersistMsg::Flush(reply) => {
|
||||
// All ops queued before this message have already been processed.
|
||||
let _ = reply.send(());
|
||||
}
|
||||
};
|
||||
let op_id = hex::encode(&op.id());
|
||||
let seq = op.inner.seq as i64;
|
||||
let now = chrono::Utc::now().to_rfc3339();
|
||||
|
||||
let result = sqlx::query(
|
||||
"INSERT INTO crdt_ops (op_id, seq, op_json, created_at) \
|
||||
VALUES (?1, ?2, ?3, ?4) \
|
||||
ON CONFLICT(op_id) DO NOTHING",
|
||||
)
|
||||
.bind(&op_id)
|
||||
.bind(seq)
|
||||
.bind(&op_json)
|
||||
.bind(&now)
|
||||
.execute(&pool)
|
||||
.await;
|
||||
|
||||
if let Err(e) = result {
|
||||
slog!("[crdt] Failed to persist op {}: {e}", &op_id[..12]);
|
||||
}
|
||||
}
|
||||
});
|
||||
@@ -163,6 +185,7 @@ pub async fn init(db_path: &Path) -> Result<(), sqlx::Error> {
|
||||
test_job_index,
|
||||
agent_throttle_index,
|
||||
gateway_project_index,
|
||||
llm_session_index,
|
||||
persist_tx,
|
||||
lamport_floor,
|
||||
tombstones,
|
||||
@@ -181,6 +204,43 @@ pub async fn init(db_path: &Path) -> Result<(), sqlx::Error> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Signal the persistence background task to drain and wait until all currently-queued
|
||||
/// ops have been written to SQLite, or until `timeout` elapses.
|
||||
///
|
||||
/// Because the persistence channel is FIFO, a `Flush` sentinel processed by the task
|
||||
/// guarantees that every `Op` sent before it has already been committed. On timeout a
|
||||
/// warning is logged with the queue depth so regressions are visible in logs.
|
||||
pub(crate) async fn flush_persistence(timeout: std::time::Duration) {
|
||||
let Some(state_mutex) = super::get_crdt() else {
|
||||
return;
|
||||
};
|
||||
let persist_tx = {
|
||||
let Ok(state) = state_mutex.lock() else {
|
||||
return;
|
||||
};
|
||||
state.persist_tx.clone()
|
||||
};
|
||||
let pending_at_send = PERSIST_PENDING.load(Ordering::Relaxed);
|
||||
let (tx, rx) = oneshot::channel();
|
||||
if persist_tx.send(PersistMsg::Flush(tx)).is_err() {
|
||||
slog!("[rebuild] Persistence channel closed — skipping flush");
|
||||
return;
|
||||
}
|
||||
match tokio::time::timeout(timeout, rx).await {
|
||||
Ok(_) => {
|
||||
slog!("[rebuild] Persistence channel drained ({pending_at_send} ops flushed)");
|
||||
}
|
||||
Err(_) => {
|
||||
let pending_now = PERSIST_PENDING.load(Ordering::Relaxed);
|
||||
slog!(
|
||||
"[rebuild] WARNING: persistence flush timed out after {}ms; \
|
||||
queue_depth_at_send={pending_at_send} queue_depth_now={pending_now}",
|
||||
timeout.as_millis()
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Load or create the Ed25519 keypair used by this node.
|
||||
async fn load_or_create_keypair(pool: &SqlitePool) -> Result<Ed25519KeyPair, sqlx::Error> {
|
||||
let row: Option<(Vec<u8>,)> =
|
||||
|
||||
@@ -27,6 +27,7 @@ mod tests;
|
||||
// ── Re-exports for crdt_state siblings ──────────────────────────────
|
||||
|
||||
pub use init::init;
|
||||
pub(crate) use init::{PersistMsg, flush_persistence};
|
||||
|
||||
/// Subscribe to CRDT state-transition events.
|
||||
///
|
||||
@@ -38,11 +39,11 @@ pub fn subscribe() -> Option<broadcast::Receiver<super::types::CrdtEvent>> {
|
||||
pub(super) use apply::{apply_and_persist, emit_event};
|
||||
pub(super) use indices::{
|
||||
rebuild_active_agent_index, rebuild_agent_throttle_index, rebuild_gateway_project_index,
|
||||
rebuild_index, rebuild_merge_job_index, rebuild_node_index, rebuild_test_job_index,
|
||||
rebuild_token_index,
|
||||
rebuild_index, rebuild_llm_session_index, rebuild_merge_job_index, rebuild_node_index,
|
||||
rebuild_test_job_index, rebuild_token_index,
|
||||
};
|
||||
pub(crate) use statics::{PERSIST_PENDING, all_ops_lock, vector_clock_lock};
|
||||
pub(super) use statics::{SYNC_TX, track_op};
|
||||
pub(crate) use statics::{all_ops_lock, vector_clock_lock};
|
||||
|
||||
// ── CrdtState struct ─────────────────────────────────────────────────
|
||||
|
||||
@@ -66,8 +67,10 @@ pub(super) struct CrdtState {
|
||||
pub(super) agent_throttle_index: HashMap<String, usize>,
|
||||
/// Maps project name → index in the gateway_projects ListCrdt for O(1) lookup.
|
||||
pub(super) gateway_project_index: HashMap<String, usize>,
|
||||
/// Channel sender for fire-and-forget op persistence.
|
||||
pub(super) persist_tx: mpsc::UnboundedSender<SignedOp>,
|
||||
/// Maps session_id → index in the llm_sessions ListCrdt for O(1) lookup.
|
||||
pub(super) llm_session_index: HashMap<String, usize>,
|
||||
/// Channel sender for op persistence and drain signalling.
|
||||
pub(super) persist_tx: mpsc::UnboundedSender<init::PersistMsg>,
|
||||
/// Max sequence number seen across all ops during init() replay.
|
||||
///
|
||||
/// Newly-created registers (post-init) must have their Lamport clock
|
||||
@@ -122,49 +125,58 @@ pub(super) fn get_crdt() -> Option<&'static Mutex<CrdtState>> {
|
||||
/// This avoids the async SQLite setup from `init()`. Ops are sent to a
|
||||
/// channel whose receiver is leaked (so nothing is persisted, but the channel
|
||||
/// stays open and `apply_and_persist` succeeds silently).
|
||||
/// Safe to call multiple times — subsequent calls are no-ops (thread-local).
|
||||
/// Always resets all thread-local state so each call produces a clean slate —
|
||||
/// no cross-test pollution when two tests share the same thread.
|
||||
#[cfg(test)]
|
||||
pub fn init_for_test() {
|
||||
// Initialise thread-local CRDT for test isolation.
|
||||
// Only creates a new CRDT if one isn't set yet on this thread;
|
||||
// subsequent calls are no-ops (matching the old OnceLock semantics
|
||||
// while keeping each thread isolated).
|
||||
let keypair = make_keypair();
|
||||
let crdt = BaseCrdt::<PipelineDoc>::new(&keypair);
|
||||
let (persist_tx, rx) = mpsc::unbounded_channel::<init::PersistMsg>();
|
||||
// Leak the receiver so the channel stays open: apply_and_persist
|
||||
// can then send without error, preventing [crdt_persist] WARNs
|
||||
// from racing with other tests that watch the global log buffer.
|
||||
std::mem::forget(rx);
|
||||
let fresh = CrdtState {
|
||||
crdt,
|
||||
keypair,
|
||||
index: HashMap::new(),
|
||||
node_index: HashMap::new(),
|
||||
token_index: HashMap::new(),
|
||||
merge_job_index: HashMap::new(),
|
||||
active_agent_index: HashMap::new(),
|
||||
test_job_index: HashMap::new(),
|
||||
agent_throttle_index: HashMap::new(),
|
||||
gateway_project_index: HashMap::new(),
|
||||
llm_session_index: HashMap::new(),
|
||||
persist_tx,
|
||||
lamport_floor: 0,
|
||||
tombstones: HashSet::new(),
|
||||
};
|
||||
CRDT_STATE_TL.with(|lock| {
|
||||
if lock.get().is_none() {
|
||||
let keypair = make_keypair();
|
||||
let crdt = BaseCrdt::<PipelineDoc>::new(&keypair);
|
||||
let (persist_tx, rx) = mpsc::unbounded_channel();
|
||||
// Leak the receiver so the channel stays open: apply_and_persist
|
||||
// can then send without error, preventing [crdt_persist] WARNs
|
||||
// from racing with other tests that watch the global log buffer.
|
||||
std::mem::forget(rx);
|
||||
let state = CrdtState {
|
||||
crdt,
|
||||
keypair,
|
||||
index: HashMap::new(),
|
||||
node_index: HashMap::new(),
|
||||
token_index: HashMap::new(),
|
||||
merge_job_index: HashMap::new(),
|
||||
active_agent_index: HashMap::new(),
|
||||
test_job_index: HashMap::new(),
|
||||
agent_throttle_index: HashMap::new(),
|
||||
gateway_project_index: HashMap::new(),
|
||||
persist_tx,
|
||||
lamport_floor: 0,
|
||||
tombstones: HashSet::new(),
|
||||
};
|
||||
let _ = lock.set(Mutex::new(state));
|
||||
if let Some(mutex) = lock.get() {
|
||||
// Already set on this thread — replace contents so the second
|
||||
// (and subsequent) test on the same thread starts clean.
|
||||
*mutex.lock().unwrap() = fresh;
|
||||
} else {
|
||||
let _ = lock.set(Mutex::new(fresh));
|
||||
}
|
||||
});
|
||||
let _ = statics::CRDT_EVENT_TX.get_or_init(|| broadcast::channel::<CrdtEvent>(256).0);
|
||||
let _ = statics::SYNC_TX.get_or_init(|| broadcast::channel::<SignedOp>(1024).0);
|
||||
// Per-thread op journal + vector clock — keeps parallel tests' writes
|
||||
// from corrupting each other's view of ALL_OPS (notably, one thread's
|
||||
// `apply_compaction` could otherwise prune another thread's ops).
|
||||
// Per-thread op journal + vector clock — always cleared so a second test
|
||||
// on the same thread cannot see ops written by the first.
|
||||
statics::ALL_OPS_TL.with(|lock| {
|
||||
let _ = lock.set(Mutex::new(Vec::new()));
|
||||
if let Some(mutex) = lock.get() {
|
||||
mutex.lock().unwrap().clear();
|
||||
} else {
|
||||
let _ = lock.set(Mutex::new(Vec::new()));
|
||||
}
|
||||
});
|
||||
statics::VECTOR_CLOCK_TL.with(|lock| {
|
||||
let _ = lock.set(Mutex::new(VectorClock::new()));
|
||||
if let Some(mutex) = lock.get() {
|
||||
mutex.lock().unwrap().clear();
|
||||
} else {
|
||||
let _ = lock.set(Mutex::new(VectorClock::new()));
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
@@ -10,6 +10,7 @@
|
||||
//! tests do not share `ALL_OPS` — preventing one test's `apply_compaction`
|
||||
//! from pruning another test's freshly-written ops.
|
||||
|
||||
use std::sync::atomic::AtomicUsize;
|
||||
use std::sync::{Mutex, OnceLock};
|
||||
|
||||
use bft_json_crdt::json_crdt::SignedOp;
|
||||
@@ -19,6 +20,14 @@ use super::super::VectorClock;
|
||||
use super::super::hex;
|
||||
use super::super::types::CrdtEvent;
|
||||
|
||||
/// Count of ops queued in the persistence channel that have not yet been written to SQLite.
|
||||
///
|
||||
/// Incremented when an op is sent into the channel; decremented after the
|
||||
/// persistence task commits it. Exposed via `dump_crdt_state` as
|
||||
/// `pending_persist_ops_count` so operators can tell whether there is a flush
|
||||
/// backlog before calling `rebuild_and_restart`.
|
||||
pub(crate) static PERSIST_PENDING: AtomicUsize = AtomicUsize::new(0);
|
||||
|
||||
/// Broadcast channel for CRDT events (stage transitions, etc.).
|
||||
pub(super) static CRDT_EVENT_TX: OnceLock<broadcast::Sender<CrdtEvent>> = OnceLock::new();
|
||||
|
||||
|
||||
@@ -6,6 +6,7 @@
|
||||
use super::super::hex;
|
||||
use super::super::read::extract_item_view;
|
||||
use super::super::types::PipelineDoc;
|
||||
use super::init::PersistMsg;
|
||||
use super::*;
|
||||
use bft_json_crdt::json_crdt::{BaseCrdt, CrdtNode, JsonValue, SignedOp};
|
||||
use bft_json_crdt::keypair::make_keypair;
|
||||
@@ -222,7 +223,7 @@ async fn init_and_write_read_roundtrip() {
|
||||
fn persist_tx_send_failure_logs_warn_with_op_type_and_seq() {
|
||||
let kp = make_keypair();
|
||||
let crdt = BaseCrdt::<PipelineDoc>::new(&kp);
|
||||
let (persist_tx, persist_rx) = mpsc::unbounded_channel::<SignedOp>();
|
||||
let (persist_tx, persist_rx) = mpsc::unbounded_channel::<PersistMsg>();
|
||||
|
||||
let mut state = CrdtState {
|
||||
crdt,
|
||||
@@ -235,6 +236,7 @@ fn persist_tx_send_failure_logs_warn_with_op_type_and_seq() {
|
||||
test_job_index: HashMap::new(),
|
||||
agent_throttle_index: HashMap::new(),
|
||||
gateway_project_index: HashMap::new(),
|
||||
llm_session_index: HashMap::new(),
|
||||
persist_tx,
|
||||
lamport_floor: 0,
|
||||
tombstones: std::collections::HashSet::new(),
|
||||
@@ -296,7 +298,7 @@ fn persist_tx_send_failure_logs_warn_with_op_type_and_seq() {
|
||||
fn persist_tx_send_success_emits_no_warn() {
|
||||
let kp = make_keypair();
|
||||
let crdt = BaseCrdt::<PipelineDoc>::new(&kp);
|
||||
let (persist_tx, _persist_rx) = mpsc::unbounded_channel::<SignedOp>();
|
||||
let (persist_tx, _persist_rx) = mpsc::unbounded_channel::<PersistMsg>();
|
||||
|
||||
let mut state = CrdtState {
|
||||
crdt,
|
||||
@@ -309,6 +311,7 @@ fn persist_tx_send_success_emits_no_warn() {
|
||||
test_job_index: HashMap::new(),
|
||||
agent_throttle_index: HashMap::new(),
|
||||
gateway_project_index: HashMap::new(),
|
||||
llm_session_index: HashMap::new(),
|
||||
persist_tx,
|
||||
lamport_floor: 0,
|
||||
tombstones: std::collections::HashSet::new(),
|
||||
@@ -485,3 +488,102 @@ async fn restart_new_register_resumes_from_lamport_floor() {
|
||||
max_seq,
|
||||
);
|
||||
}
|
||||
|
||||
/// Regression test for story 1116: ops sent before `flush_persistence` must all be
|
||||
/// present in the `crdt_ops` SQLite table after the flush completes.
|
||||
///
|
||||
/// Bug: `rebuild_and_restart` called `exec()` before the persistence task had
|
||||
/// a chance to drain the unbounded channel, silently dropping queued ops.
|
||||
///
|
||||
/// Reproducer: apply N ops → call `rebuild_and_restart` → the process re-execs
|
||||
/// and on the next startup `persisted_ops_count` is < N (lost ops).
|
||||
/// Fixed by: send a `Flush` sentinel through the channel before `exec()`; the
|
||||
/// task echoes back only after all preceding `Op` messages are committed.
|
||||
#[tokio::test]
|
||||
async fn flush_persistence_drains_all_ops_before_ack() {
|
||||
use std::sync::atomic::Ordering;
|
||||
use tokio::sync::oneshot;
|
||||
|
||||
let tmp = tempfile::tempdir().unwrap();
|
||||
let db_path = tmp.path().join("flush_drain_test.db");
|
||||
|
||||
let options = SqliteConnectOptions::new()
|
||||
.filename(&db_path)
|
||||
.create_if_missing(true);
|
||||
let pool = SqlitePool::connect_with(options).await.unwrap();
|
||||
sqlx::migrate!("./migrations").run(&pool).await.unwrap();
|
||||
|
||||
let kp = make_keypair();
|
||||
let mut crdt = BaseCrdt::<PipelineDoc>::new(&kp);
|
||||
|
||||
// Spawn an isolated persistence task — same logic as init() but without
|
||||
// touching the global singleton (keeping this test fully self-contained).
|
||||
let (tx, mut rx) = mpsc::unbounded_channel::<PersistMsg>();
|
||||
let pool_clone = pool.clone();
|
||||
tokio::spawn(async move {
|
||||
use std::sync::atomic::AtomicUsize;
|
||||
let counter = AtomicUsize::new(0);
|
||||
while let Some(msg) = rx.recv().await {
|
||||
match msg {
|
||||
PersistMsg::Op(op) => {
|
||||
let op_json = serde_json::to_string(&op).unwrap();
|
||||
let op_id = hex::encode(&op.id());
|
||||
let seq = op.inner.seq as i64;
|
||||
let now = chrono::Utc::now().to_rfc3339();
|
||||
sqlx::query(
|
||||
"INSERT INTO crdt_ops (op_id, seq, op_json, created_at) \
|
||||
VALUES (?1, ?2, ?3, ?4) ON CONFLICT(op_id) DO NOTHING",
|
||||
)
|
||||
.bind(&op_id)
|
||||
.bind(seq)
|
||||
.bind(&op_json)
|
||||
.bind(&now)
|
||||
.execute(&pool_clone)
|
||||
.await
|
||||
.unwrap();
|
||||
counter.fetch_add(1, Ordering::Relaxed);
|
||||
}
|
||||
PersistMsg::Flush(reply) => {
|
||||
let _ = reply.send(());
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
const N: usize = 10;
|
||||
for i in 0..N {
|
||||
let item: JsonValue = json!({
|
||||
"story_id": format!("1116_drain_{i}"),
|
||||
"stage": "1_backlog",
|
||||
"name": format!("Drain Test {i}"),
|
||||
"agent": "",
|
||||
"retry_count": 0.0,
|
||||
"blocked": false,
|
||||
"depends_on": "",
|
||||
"claimed_by": "",
|
||||
"claimed_at": 0.0,
|
||||
})
|
||||
.into();
|
||||
let op = crdt.doc.items.insert(ROOT_ID, item).sign(&kp);
|
||||
crdt.apply(op.clone());
|
||||
tx.send(PersistMsg::Op(Box::new(op))).unwrap();
|
||||
}
|
||||
|
||||
// Send flush sentinel and wait — all N ops must be committed first.
|
||||
let (flush_tx, flush_rx) = oneshot::channel();
|
||||
tx.send(PersistMsg::Flush(flush_tx)).unwrap();
|
||||
tokio::time::timeout(std::time::Duration::from_secs(5), flush_rx)
|
||||
.await
|
||||
.expect("flush timed out — persistence task did not drain within 5 s")
|
||||
.expect("flush oneshot dropped unexpectedly");
|
||||
|
||||
// Verify all N ops are in the database.
|
||||
let (count,): (i64,) = sqlx::query_as("SELECT COUNT(*) FROM crdt_ops")
|
||||
.fetch_one(&pool)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
count as usize, N,
|
||||
"all {N} ops must be in crdt_ops after flush; got {count}"
|
||||
);
|
||||
}
|
||||
|
||||
@@ -46,6 +46,121 @@ pub struct PipelineDoc {
|
||||
pub agent_throttle: ListCrdt<AgentThrottleCrdt>,
|
||||
pub gateway_projects: ListCrdt<GatewayProjectCrdt>,
|
||||
pub gateway_config: GatewayConfigCrdt,
|
||||
/// Append-only log of every pipeline transition, persisted as CRDT ops.
|
||||
pub event_log: ListCrdt<EventLogEntryCrdt>,
|
||||
/// Per-session LLM context state (high-water marks for event log injection).
|
||||
pub llm_sessions: ListCrdt<LlmSessionCrdt>,
|
||||
}
|
||||
|
||||
/// CRDT entry representing a single persisted pipeline stage-transition event.
|
||||
///
|
||||
/// Entries are append-only; once written they are never updated or tombstoned.
|
||||
/// The `event_seq` field is a per-sled monotonic counter computed at write time
|
||||
/// (count of existing entries for that sled), giving deterministic ordering for
|
||||
/// all transitions recorded by a single node even after CRDT replay on restart.
|
||||
#[add_crdt_fields]
|
||||
#[derive(Clone, CrdtNode, Debug)]
|
||||
pub struct EventLogEntryCrdt {
|
||||
/// Monotonic sequence number for this sled (0, 1, 2, …). Stored as `f64`
|
||||
/// because all CRDT scalar registers use JSON numbers.
|
||||
pub event_seq: LwwRegisterCrdt<f64>,
|
||||
/// Hex-encoded Ed25519 public key of the sled that recorded this event.
|
||||
pub sled_id: LwwRegisterCrdt<String>,
|
||||
/// Unix timestamp (seconds) when the transition fired.
|
||||
pub timestamp: LwwRegisterCrdt<f64>,
|
||||
/// Story ID of the work item that transitioned (e.g. `"42_story_foo"`).
|
||||
pub story_id: LwwRegisterCrdt<String>,
|
||||
/// Human-readable label of the stage before the transition.
|
||||
pub from_stage: LwwRegisterCrdt<String>,
|
||||
/// Human-readable label of the stage after the transition.
|
||||
pub to_stage: LwwRegisterCrdt<String>,
|
||||
/// String label of the `PipelineEvent` variant that triggered the transition.
|
||||
pub pipeline_event: LwwRegisterCrdt<String>,
|
||||
}
|
||||
|
||||
/// CRDT entry tracking an LLM session's event-log injection state.
|
||||
///
|
||||
/// Each session (keyed by `session_id`, typically a Matrix room ID) records the
|
||||
/// per-sled high-water marks so that `assemble_prompt_context` can inject only
|
||||
/// events the LLM has not yet seen and then advance the marks atomically.
|
||||
#[add_crdt_fields]
|
||||
#[derive(Clone, CrdtNode, Debug)]
|
||||
pub struct LlmSessionCrdt {
|
||||
/// Stable session identifier (e.g. Matrix room ID).
|
||||
pub session_id: LwwRegisterCrdt<String>,
|
||||
/// Human-readable persona name (e.g. `"Timmy"`).
|
||||
pub persona_name: LwwRegisterCrdt<String>,
|
||||
/// Scope wire string parsed by [`ScopeFilter::from_scope_str`]: `"all"`,
|
||||
/// `"sleds:hex1,hex2"`, or legacy `"single-sled"` / empty (→ local sled).
|
||||
pub scope: LwwRegisterCrdt<String>,
|
||||
/// JSON-serialised `BTreeMap<sled_id, last_seen_event_seq>` tracking how far
|
||||
/// each sled's event stream has been injected into this session's prompts.
|
||||
pub high_water: LwwRegisterCrdt<String>,
|
||||
}
|
||||
|
||||
/// Which sleds' events an LLM session may see.
///
/// Stored as a compact string in the CRDT register and parsed at read time
/// with [`ScopeFilter::from_scope_str`]. A session with no stored scope (or an
/// unrecognised one) parses to [`ScopeFilter::Sleds`] with an **empty** set;
/// callers should fall back to the local sled ID in that case, which preserves
/// the prior single-sled behaviour.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum ScopeFilter {
    /// Include events from every sled present in the CRDT event log.
    ///
    /// Default for gateway-level personas (e.g. Timmy in multi-project mode).
    All,
    /// Include only events whose `sled_id` is in the given set.
    ///
    /// Default for sled-level personas: the set contains only the sled's own ID.
    Sleds(std::collections::BTreeSet<String>),
}

impl ScopeFilter {
    /// Parse a wire-form scope string stored in the CRDT register.
    ///
    /// Surrounding whitespace — around the whole string and around each
    /// individual ID — is ignored, and empty IDs are dropped.
    ///
    /// Recognised forms:
    /// - `"all"` → [`ScopeFilter::All`]
    /// - `"sleds:hex1,hex2,…"` → [`ScopeFilter::Sleds`]
    /// - Anything else (including legacy `"single-sled"` and empty) →
    ///   [`ScopeFilter::Sleds`] with an empty set; callers should fall back
    ///   to the local sled ID in that case.
    pub fn from_scope_str(s: &str) -> Self {
        let s = s.trim();
        if s == "all" {
            return ScopeFilter::All;
        }
        // No `sleds:` prefix means an unknown or legacy value: empty set.
        let ids: std::collections::BTreeSet<String> = s
            .strip_prefix("sleds:")
            .map(|rest| {
                rest.split(',')
                    .map(str::trim)
                    .filter(|id| !id.is_empty())
                    .map(str::to_owned)
                    .collect()
            })
            .unwrap_or_default();
        ScopeFilter::Sleds(ids)
    }

    /// Encode this filter as the compact wire string stored in the CRDT.
    ///
    /// IDs are emitted in the set's sorted order, so the encoding is
    /// deterministic. An empty set encodes as `"sleds:"`, which
    /// [`ScopeFilter::from_scope_str`] parses back to an empty set.
    pub fn to_scope_str(&self) -> String {
        match self {
            ScopeFilter::All => "all".to_string(),
            ScopeFilter::Sleds(ids) => {
                let joined = ids.iter().map(String::as_str).collect::<Vec<_>>().join(",");
                format!("sleds:{joined}")
            }
        }
    }
}
|
||||
|
||||
/// Read-side snapshot of a single LLM session entry.
|
||||
pub struct LlmSessionView {
|
||||
/// Stable session identifier.
|
||||
pub session_id: String,
|
||||
/// Persona name for the bot in this session.
|
||||
pub persona_name: String,
|
||||
/// Parsed event-scope filter derived from the `scope` CRDT register.
|
||||
pub scope_filter: ScopeFilter,
|
||||
/// Decoded high-water map: sled_id → last seen event_seq.
|
||||
pub high_water: std::collections::BTreeMap<String, u64>,
|
||||
}
|
||||
|
||||
/// CRDT sub-document representing a single pipeline work item with LWW fields for stage, agent, etc.
|
||||
|
||||
@@ -21,21 +21,26 @@ use crate::pipeline_state::{AgentClaim, Stage, stage_dir_name};
|
||||
///
|
||||
/// Returns `true` if the item was found and the op was applied, `false` otherwise.
|
||||
pub fn set_depends_on(story_id: &str, deps: &[u32]) -> bool {
|
||||
let Some(state_mutex) = get_crdt() else {
|
||||
return false;
|
||||
};
|
||||
let Ok(mut state) = state_mutex.lock() else {
|
||||
return false;
|
||||
};
|
||||
let Some(&idx) = state.index.get(story_id) else {
|
||||
return false;
|
||||
};
|
||||
let value = if deps.is_empty() {
|
||||
String::new()
|
||||
} else {
|
||||
serde_json::to_string(deps).unwrap_or_default()
|
||||
};
|
||||
apply_and_persist(&mut state, |s| s.crdt.doc.items[idx].depends_on.set(value));
|
||||
{
|
||||
let Some(state_mutex) = get_crdt() else {
|
||||
return false;
|
||||
};
|
||||
let Ok(mut state) = state_mutex.lock() else {
|
||||
return false;
|
||||
};
|
||||
let Some(&idx) = state.index.get(story_id) else {
|
||||
return false;
|
||||
};
|
||||
let value = if deps.is_empty() {
|
||||
String::new()
|
||||
} else {
|
||||
serde_json::to_string(deps).unwrap_or_default()
|
||||
};
|
||||
apply_and_persist(&mut state, |s| s.crdt.doc.items[idx].depends_on.set(value));
|
||||
}
|
||||
// Drop the CRDT lock before calling sync: read_item acquires the same
|
||||
// mutex and would deadlock if the lock were still held here.
|
||||
crate::db::ops::sync_item_depends_on(story_id);
|
||||
true
|
||||
}
|
||||
|
||||
@@ -155,6 +160,9 @@ pub fn set_name(story_id: &str, name: Option<&str>) -> bool {
|
||||
apply_and_persist(&mut state, |s| {
|
||||
s.crdt.doc.items[idx].name.set(value.clone())
|
||||
});
|
||||
// Drop the lock before the shadow write so `read_item` can acquire it.
|
||||
drop(state);
|
||||
crate::db::sync_item_name(story_id);
|
||||
true
|
||||
}
|
||||
|
||||
@@ -175,16 +183,21 @@ pub fn set_agent(story_id: &str, agent: Option<crate::config::AgentName>) -> boo
|
||||
let Some(state_mutex) = get_crdt() else {
|
||||
return false;
|
||||
};
|
||||
let Ok(mut state) = state_mutex.lock() else {
|
||||
return false;
|
||||
};
|
||||
let Some(&idx) = state.index.get(story_id) else {
|
||||
return false;
|
||||
};
|
||||
let value = agent.map(|a| a.as_str().to_string()).unwrap_or_default();
|
||||
apply_and_persist(&mut state, |s| {
|
||||
s.crdt.doc.items[idx].agent.set(value.clone())
|
||||
});
|
||||
{
|
||||
let Ok(mut state) = state_mutex.lock() else {
|
||||
return false;
|
||||
};
|
||||
let Some(&idx) = state.index.get(story_id) else {
|
||||
return false;
|
||||
};
|
||||
let value = agent.map(|a| a.as_str().to_string()).unwrap_or_default();
|
||||
apply_and_persist(&mut state, |s| {
|
||||
s.crdt.doc.items[idx].agent.set(value.clone())
|
||||
});
|
||||
}
|
||||
// Sync the updated agent to the SQLite shadow table. Must be called after
|
||||
// releasing the CRDT mutex so read_item can re-acquire it without deadlock.
|
||||
crate::db::ops::sync_item_agent(story_id);
|
||||
true
|
||||
}
|
||||
|
||||
@@ -556,6 +569,24 @@ pub fn set_retry_count(story_id: &str, count: i64) {
|
||||
_ => return,
|
||||
};
|
||||
write_item(story_id, &new_stage, None, None, None, None);
|
||||
if let Some(db) = crate::db::shadow_write::PIPELINE_DB.get() {
|
||||
let stage = stage_dir_name(&new_stage).to_string();
|
||||
let name = Some(item.name().to_string());
|
||||
let agent = item.agent().map(|a| a.to_string());
|
||||
let depends_on = (!item.depends_on().is_empty())
|
||||
.then(|| serde_json::to_string(item.depends_on()).ok())
|
||||
.flatten();
|
||||
let msg = crate::db::shadow_write::PipelineWriteMsg {
|
||||
story_id: story_id.to_string(),
|
||||
stage,
|
||||
name,
|
||||
agent,
|
||||
retry_count: Some(count.max(0)),
|
||||
depends_on,
|
||||
content: None,
|
||||
};
|
||||
let _ = db.tx.send(msg);
|
||||
}
|
||||
}
|
||||
|
||||
/// Increment `retries` by 1 and return the new value.
|
||||
@@ -605,5 +636,23 @@ pub fn bump_retry_count(story_id: &str) -> i64 {
|
||||
_ => return 0,
|
||||
};
|
||||
write_item(story_id, &new_stage, None, None, None, None);
|
||||
if let Some(db) = crate::db::shadow_write::PIPELINE_DB.get() {
|
||||
let stage = stage_dir_name(&new_stage).to_string();
|
||||
let name = Some(item.name().to_string());
|
||||
let agent = item.agent().map(|a| a.to_string());
|
||||
let depends_on = (!item.depends_on().is_empty())
|
||||
.then(|| serde_json::to_string(item.depends_on()).ok())
|
||||
.flatten();
|
||||
let msg = crate::db::shadow_write::PipelineWriteMsg {
|
||||
story_id: story_id.to_string(),
|
||||
stage,
|
||||
name,
|
||||
agent,
|
||||
retry_count: Some(new_retries as i64),
|
||||
depends_on,
|
||||
content: None,
|
||||
};
|
||||
let _ = db.tx.send(msg);
|
||||
}
|
||||
new_retries as i64
|
||||
}
|
||||
|
||||
@@ -705,6 +705,59 @@ pub fn purge_done_stage_merge_jobs() {
|
||||
slog!("[crdt] Purged {count} stale MergeJob entries for terminal-stage stories");
|
||||
}
|
||||
|
||||
/// Delete `pipeline_items` rows that correspond to CRDT-tombstoned stories.
|
||||
///
|
||||
/// Pre-1094 code deleted pipeline_items via a fire-and-forget channel that
|
||||
/// could be lost on an abrupt restart, leaving rows with non-terminal stage
|
||||
/// values for stories that no longer exist in the CRDT. This migration
|
||||
/// removes those zombie rows on startup.
|
||||
///
|
||||
/// Idempotent: rows already absent are unaffected; running twice produces the
|
||||
/// same result.
|
||||
pub async fn migrate_zombie_pipeline_rows() {
|
||||
let pool = match crate::db::get_shared_pool() {
|
||||
Some(p) => p,
|
||||
None => return,
|
||||
};
|
||||
let tombstone_ids = crate::crdt_state::tombstoned_ids();
|
||||
sweep_zombie_rows(pool, &tombstone_ids).await;
|
||||
}
|
||||
|
||||
/// Inner sweep used by [`migrate_zombie_pipeline_rows`] and its tests.
|
||||
///
|
||||
/// Deletes every `pipeline_items` row in `ids` whose stage is not already a
|
||||
/// terminal value. Returns the number of rows deleted.
|
||||
#[cfg_attr(test, allow(dead_code))]
|
||||
pub(crate) async fn sweep_zombie_rows(pool: &sqlx::SqlitePool, ids: &[String]) -> u32 {
|
||||
if ids.is_empty() {
|
||||
return 0;
|
||||
}
|
||||
let mut cleaned = 0u32;
|
||||
for story_id in ids {
|
||||
match sqlx::query(
|
||||
"DELETE FROM pipeline_items WHERE id = ?1 AND stage NOT IN \
|
||||
('done','archived','abandoned','superseded','rejected')",
|
||||
)
|
||||
.bind(story_id)
|
||||
.execute(pool)
|
||||
.await
|
||||
{
|
||||
Ok(r) if r.rows_affected() > 0 => cleaned += 1,
|
||||
Ok(_) => {}
|
||||
Err(e) => {
|
||||
slog!(
|
||||
"[crdt] migrate_zombie_pipeline_rows: failed to delete '{}': {e}",
|
||||
story_id
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
if cleaned > 0 {
|
||||
slog!("[crdt] Swept {cleaned} zombie pipeline_items rows for tombstoned stories");
|
||||
}
|
||||
cleaned
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod merge_job_migration_tests {
|
||||
use super::super::super::state::init_for_test;
|
||||
@@ -909,3 +962,100 @@ mod merge_job_migration_tests {
|
||||
migrate_merge_job(std::path::Path::new("/nonexistent/pipeline.db"));
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod zombie_row_migration_tests {
    use super::super::super::state::init_for_test;
    use super::*;
    use sqlx::Row as _;

    /// Open an in-memory SQLite pool capped at one connection and run the
    /// project migrations on it.
    async fn make_pool() -> sqlx::SqlitePool {
        let connect = sqlx::sqlite::SqliteConnectOptions::new()
            .filename(":memory:")
            .create_if_missing(true);
        let pool = sqlx::pool::PoolOptions::new()
            .max_connections(1)
            .connect_with(connect)
            .await
            .unwrap();
        sqlx::migrate!("./migrations").run(&pool).await.unwrap();
        pool
    }

    /// Insert a minimal `pipeline_items` row; the story ID doubles as the name.
    async fn insert_row(pool: &sqlx::SqlitePool, story_id: &str, stage: &str) {
        let timestamp = chrono::Utc::now().to_rfc3339();
        let insert = sqlx::query(
            "INSERT INTO pipeline_items \
             (id, name, stage, agent, retry_count, depends_on, content, created_at, updated_at) \
             VALUES (?1, ?2, ?3, NULL, 0, NULL, NULL, ?4, ?4)",
        )
        .bind(story_id)
        .bind(story_id)
        .bind(stage)
        .bind(&timestamp);
        insert.execute(pool).await.unwrap();
    }

    /// Return the `stage` of the row for `story_id`, or `None` if no row exists.
    async fn row_stage(pool: &sqlx::SqlitePool, story_id: &str) -> Option<String> {
        let row = sqlx::query("SELECT stage FROM pipeline_items WHERE id = ?1")
            .bind(story_id)
            .fetch_optional(pool)
            .await
            .unwrap();
        row.map(|r| r.get(0))
    }

    /// Bug 1094 regression: a tombstoned story with a `coding` row must lose
    /// that row in the sweep, and sweeping again must change nothing
    /// (idempotent).
    #[tokio::test]
    async fn sweep_removes_zombie_coding_row_and_is_idempotent() {
        init_for_test();
        let pool = make_pool().await;
        let story_id = "1094_zombie_regression";
        let ids = [story_id.to_string()];

        // Seed the shadow table with a row in the "coding" stage.
        insert_row(&pool, story_id, "coding").await;
        assert_eq!(row_stage(&pool, story_id).await.as_deref(), Some("coding"));

        // Create and then evict the story in the CRDT (simulates the
        // evict_item outcome).
        crate::crdt_state::write_item_str(
            story_id,
            "coding",
            Some("Zombie regression story"),
            None,
            None,
            None,
        );
        crate::crdt_state::evict_item(story_id).ok();

        // First sweep: the row must be deleted.
        let deleted = sweep_zombie_rows(&pool, &ids).await;
        assert_eq!(deleted, 1, "expected one zombie row to be cleaned");
        assert!(
            row_stage(&pool, story_id).await.is_none(),
            "pipeline_items row must be gone after sweep"
        );

        // Second sweep: nothing left to do.
        let second = sweep_zombie_rows(&pool, &ids).await;
        assert_eq!(second, 0, "second sweep must be a no-op");
    }

    /// A row that is already in a terminal stage must survive the sweep.
    #[tokio::test]
    async fn sweep_skips_terminal_stage_rows() {
        let pool = make_pool().await;
        let story_id = "1094_terminal_skip";
        insert_row(&pool, story_id, "done").await;

        let ids = [story_id.to_string()];
        let deleted = sweep_zombie_rows(&pool, &ids).await;

        assert_eq!(deleted, 0, "terminal-stage row must not be deleted");
        assert!(
            row_stage(&pool, story_id).await.is_some(),
            "terminal-stage row must survive sweep"
        );
    }
}
|
||||
|
||||
@@ -18,6 +18,6 @@ pub use item::{
|
||||
pub use item::write_item_str;
|
||||
pub use migrations::{
|
||||
migrate_legacy_stage_strings, migrate_merge_job, migrate_names_from_slugs,
|
||||
migrate_node_claims_to_agent_claims, migrate_story_ids_to_numeric, name_from_story_id,
|
||||
purge_done_stage_merge_jobs,
|
||||
migrate_node_claims_to_agent_claims, migrate_story_ids_to_numeric,
|
||||
migrate_zombie_pipeline_rows, name_from_story_id, purge_done_stage_merge_jobs,
|
||||
};
|
||||
|
||||
@@ -60,6 +60,17 @@ pub enum ContentKey<'a> {
|
||||
/// completion. Read by `get_merge_status` to surface gate output for the
|
||||
/// "completed" state without a separate MergeJob CRDT register (story 1036).
|
||||
MergeReport(&'a str),
|
||||
/// Flag written by spawn.rs when a coder session exits with a non-zero exit
|
||||
/// code (API error, network failure, or Claude-API-level budget exhaustion).
|
||||
/// Prevents the stuck-respawn counter from incrementing for forced exits —
|
||||
/// only self-exits with no file or read changes count toward the cap.
|
||||
/// Consumed (read + deleted) by the commit-recovery path in pipeline advance.
|
||||
CommitRecoveryForcedExit(&'a str),
|
||||
/// Cumulative set of files read across all commit-recovery sessions for a
|
||||
/// story, stored as a newline-separated sorted list. Used to detect whether
|
||||
/// the agent made read-exploration progress even when the worktree diff did
|
||||
/// not grow (story 1089, AC2). Cleared when a commit lands or the story blocks.
|
||||
CommitRecoveryReadSet(&'a str),
|
||||
}
|
||||
|
||||
impl<'a> ContentKey<'a> {
|
||||
@@ -85,6 +96,10 @@ impl<'a> ContentKey<'a> {
|
||||
ContentKey::MergeFailureKind(id) => format!("{id}:merge_failure_kind"),
|
||||
ContentKey::MergeSuccess(id) => format!("{id}:merge_success"),
|
||||
ContentKey::MergeReport(id) => format!("{id}:merge_report"),
|
||||
ContentKey::CommitRecoveryForcedExit(id) => {
|
||||
format!("{id}:commit_recovery_forced_exit")
|
||||
}
|
||||
ContentKey::CommitRecoveryReadSet(id) => format!("{id}:commit_recovery_read_set"),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -150,7 +165,9 @@ pub fn delete_content(key: ContentKey<'_>) {
|
||||
|
||||
/// Ensure the in-memory content store is initialised.
|
||||
///
|
||||
/// Safe to call multiple times — the `OnceLock` is set at most once.
|
||||
/// In non-test builds: init-once via `OnceLock` (safe to call multiple times).
|
||||
/// In test builds: always resets `CONTENT_STORE_TL` to an empty `HashMap` so
|
||||
/// each test on the same thread starts with a clean store.
|
||||
pub fn ensure_content_store() {
|
||||
#[cfg(not(test))]
|
||||
{
|
||||
@@ -160,7 +177,11 @@ pub fn ensure_content_store() {
|
||||
#[cfg(test)]
|
||||
{
|
||||
CONTENT_STORE_TL.with(|lock| {
|
||||
if lock.get().is_none() {
|
||||
if let Some(mutex) = lock.get() {
|
||||
// Already initialised on this thread — reset to empty so the
|
||||
// next test does not see content written by a previous test.
|
||||
mutex.lock().unwrap().clear();
|
||||
} else {
|
||||
let _ = lock.set(Mutex::new(HashMap::new()));
|
||||
}
|
||||
});
|
||||
@@ -188,6 +209,41 @@ pub(super) fn init_content_store(map: HashMap<String, String>) {
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
/// Regression: two back-to-back `ensure_content_store()` + write + read
/// cycles inside one test body must be isolated from each other. Before the
/// fix, the second `ensure_content_store()` call was a no-op (OnceLock
/// gating), so the second cycle could still read what the first had written.
#[test]
fn sequential_ensure_content_store_resets_state() {
    let read_cycle1 = || read_content(ContentKey::Story("1111_cycle1"));
    let read_cycle2 = || read_content(ContentKey::Story("1111_cycle2"));

    // ── Cycle 1: write and read back ─────────────────────────────────────
    ensure_content_store();
    write_content(ContentKey::Story("1111_cycle1"), "cycle-one body");
    assert_eq!(
        read_cycle1().as_deref(),
        Some("cycle-one body"),
        "cycle 1: item must be readable after write"
    );

    // ── Cycle 2: reset, then write a different item ──────────────────────
    ensure_content_store();
    // The reset must have removed the cycle-1 item.
    assert!(
        read_cycle1().is_none(),
        "cycle 2: store must be empty; cycle-1 content must not bleed through"
    );
    write_content(ContentKey::Story("1111_cycle2"), "cycle-two body");
    assert_eq!(
        read_cycle2().as_deref(),
        Some("cycle-two body"),
        "cycle 2: own item must be readable"
    );
    // Writing the cycle-2 item must not bring the cycle-1 item back.
    assert!(
        read_cycle1().is_none(),
        "cycle 2: cycle-1 content must remain absent after cycle-2 write"
    );
}
|
||||
|
||||
/// AC 2 regression: writing under `ContentKey::Story` is not visible under
|
||||
/// `ContentKey::GateOutput` (and vice versa). The typed key namespace, not
|
||||
/// runtime substring matching, enforces the separation.
|
||||
|
||||
+142
-1
@@ -28,7 +28,10 @@ pub mod recover;
|
||||
pub mod shadow_write;
|
||||
|
||||
pub use content_store::{ContentKey, all_content_ids, delete_content, read_content, write_content};
|
||||
pub use ops::{ItemMeta, delete_item, move_item_stage, next_item_number, write_item_with_content};
|
||||
pub use ops::{
|
||||
ItemMeta, delete_item, delete_item_sync, move_item_stage, next_item_number, sync_item_name,
|
||||
write_item_with_content,
|
||||
};
|
||||
pub use shadow_write::{check_schema_drift, get_shared_pool, init};
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -589,6 +592,144 @@ mod tests {
|
||||
);
|
||||
}
|
||||
|
||||
/// Runtime that hosts the shadow-write background task for the whole test
/// process.
///
/// `shadow_write::init` spawns its background task on the calling runtime,
/// which under `#[tokio::test]` is per-test and dies when the test ends.
/// Parking the init on this never-dropped multi-thread runtime keeps the bg
/// task alive; mirrors `db::ops::tests::ensure_shadow_db`.
#[cfg(test)]
static SHADOW_RT: std::sync::OnceLock<tokio::runtime::Runtime> = std::sync::OnceLock::new();

/// Initialise the shadow DB at most once per test process.
///
/// The whole initialisation runs inside a single async once-cell, so tests
/// that call this concurrently from different threads cannot both reach
/// `shadow_write::init`: one caller performs the init while the others wait
/// for it to finish. (A separate "check flag, await init, set flag" sequence
/// leaves a window in which two callers both see the flag unset.)
///
/// # Panics
/// Panics if the runtime cannot be built, the temp dir cannot be created, or
/// `shadow_write::init` fails.
#[cfg(test)]
async fn ensure_shadow_db() {
    static INIT: tokio::sync::OnceCell<()> = tokio::sync::OnceCell::const_new();

    INIT.get_or_init(|| async {
        let rt = SHADOW_RT.get_or_init(|| {
            tokio::runtime::Builder::new_multi_thread()
                .worker_threads(1)
                .enable_all()
                .build()
                .expect("shadow rt")
        });
        rt.spawn(async {
            let tmp = tempfile::tempdir().expect("tmp");
            let db_path = tmp.path().join("pipeline.db");
            // Leak the TempDir handle so its destructor never runs while the
            // shadow DB is in use.
            std::mem::forget(tmp);
            shadow_write::init(&db_path).await.expect("shadow init");
        })
        .await
        .expect("shadow init task");
    })
    .await;
}
|
||||
|
||||
/// Regression for story 1095: `set_name` must propagate the new name to the
|
||||
/// SQLite shadow table via `sync_item_name`. Before the fix, the CRDT
|
||||
/// register was updated but `pipeline_items.name` stayed stale.
|
||||
#[tokio::test]
|
||||
async fn set_name_updates_shadow_name_column() {
|
||||
crate::crdt_state::init_for_test();
|
||||
ensure_content_store();
|
||||
ensure_shadow_db().await;
|
||||
|
||||
let story_id = "9095_story_set_name_shadow";
|
||||
write_item_with_content(
|
||||
story_id,
|
||||
"1_backlog",
|
||||
"---\nname: Original Name\n---\n",
|
||||
ItemMeta::named("Original Name"),
|
||||
);
|
||||
|
||||
// Wait for the initial insert to land.
|
||||
tokio::time::sleep(std::time::Duration::from_millis(50)).await;
|
||||
|
||||
// Rename via the CRDT setter — now also triggers sync_item_name.
|
||||
crate::crdt_state::set_name(story_id, Some("Updated Name"));
|
||||
|
||||
// Wait for the background write task to flush.
|
||||
tokio::time::sleep(std::time::Duration::from_millis(100)).await;
|
||||
|
||||
// Open a fresh pool on this test's runtime — sqlx pools are not safe
|
||||
// to share across runtimes, so we can't reuse `get_shared_pool()`
|
||||
// (which was created on the leaked shadow-write runtime).
|
||||
let path = shadow_write::SHADOW_DB_PATH
|
||||
.get()
|
||||
.expect("SHADOW_DB_PATH set by init");
|
||||
let opts = sqlx::sqlite::SqliteConnectOptions::new()
|
||||
.filename(path)
|
||||
.create_if_missing(false);
|
||||
let pool = sqlx::SqlitePool::connect_with(opts).await.unwrap();
|
||||
let row: (Option<String>,) =
|
||||
sqlx::query_as("SELECT name FROM pipeline_items WHERE id = ?1")
|
||||
.bind(story_id)
|
||||
.fetch_one(&pool)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(
|
||||
row.0.as_deref(),
|
||||
Some("Updated Name"),
|
||||
"set_name must propagate the new name to the shadow table"
|
||||
);
|
||||
}
|
||||
|
||||
/// Bug 1098: `bump_retry_count` must mirror the new value to the SQLite
|
||||
/// shadow table, not only to the CRDT register.
|
||||
///
|
||||
/// Before the fix, calling `bump_retry_count` updated the CRDT but left
|
||||
/// `pipeline_items.retry_count` stale.
|
||||
#[tokio::test]
|
||||
async fn bump_retry_count_updates_shadow_table() {
|
||||
crate::crdt_state::init_for_test();
|
||||
ensure_content_store();
|
||||
ensure_shadow_db().await;
|
||||
|
||||
let story_id = "9899_story_retry_shadow_1098";
|
||||
|
||||
// Insert the story into both CRDT and the shadow table.
|
||||
write_item_with_content(
|
||||
story_id,
|
||||
"2_current",
|
||||
"# Retry shadow test\n",
|
||||
ItemMeta::named("Retry Shadow Test"),
|
||||
);
|
||||
|
||||
// Let the background write task process the initial insert.
|
||||
tokio::time::sleep(std::time::Duration::from_millis(50)).await;
|
||||
|
||||
// Three bumps → retry_count must reach 3 in SQLite.
|
||||
crate::crdt_state::bump_retry_count(story_id);
|
||||
crate::crdt_state::bump_retry_count(story_id);
|
||||
crate::crdt_state::bump_retry_count(story_id);
|
||||
|
||||
// Let the background write task process all three updates.
|
||||
tokio::time::sleep(std::time::Duration::from_millis(100)).await;
|
||||
|
||||
let path = shadow_write::SHADOW_DB_PATH
|
||||
.get()
|
||||
.expect("SHADOW_DB_PATH set by init");
|
||||
let opts = sqlx::sqlite::SqliteConnectOptions::new()
|
||||
.filename(path)
|
||||
.create_if_missing(false);
|
||||
let pool = sqlx::SqlitePool::connect_with(opts).await.unwrap();
|
||||
let (count,): (i64,) =
|
||||
sqlx::query_as("SELECT retry_count FROM pipeline_items WHERE id = ?1")
|
||||
.bind(story_id)
|
||||
.fetch_one(&pool)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(
|
||||
count, 3,
|
||||
"retry_count must be 3 after three bump_retry_count calls"
|
||||
);
|
||||
}
|
||||
|
||||
/// Story 1087, AC2: the split-stage migration projects every supported
|
||||
/// wire-form `stage` string into the canonical `(pipeline, status)` pair.
|
||||
/// The fixture covers each Stage variant (and the legacy numeric-prefix
|
||||
|
||||
@@ -72,6 +72,12 @@ pub fn write_item_with_content(story_id: &str, stage: &str, content: &str, meta:
|
||||
.and_then(|d| serde_json::to_string(d).ok());
|
||||
|
||||
// Update in-memory content store.
|
||||
// In test builds, the caller (test setup) is responsible for calling
|
||||
// ensure_content_store() once before writing — calling it here would
|
||||
// reset the store on every write, losing items from prior writes in the
|
||||
// same test. In production, the lazy-init call is safe because nothing
|
||||
// resets the store between writes.
|
||||
#[cfg(not(test))]
|
||||
ensure_content_store();
|
||||
write_content(ContentKey::Story(story_id), content);
|
||||
|
||||
@@ -176,6 +182,43 @@ pub fn move_item_stage(
|
||||
}
|
||||
}
|
||||
|
||||
/// Shadow-write the updated agent field for an existing pipeline item.
|
||||
///
|
||||
/// Called by [`crate::crdt_state::set_agent`] after the CRDT register is updated
|
||||
/// so `pipeline_items.agent` stays in sync. Reads the full current metadata from
|
||||
/// the CRDT (stage, name, depends_on, retry_count) to avoid overwriting other
|
||||
/// columns with stale values — only the `agent` column carries the new data.
|
||||
pub fn sync_item_agent(story_id: &str) {
|
||||
let Some(db) = PIPELINE_DB.get() else {
|
||||
return;
|
||||
};
|
||||
let Some(view) = crate::crdt_state::read_item(story_id) else {
|
||||
return;
|
||||
};
|
||||
let stage = view.stage().dir_name().to_string();
|
||||
let name = Some(view.name().to_string());
|
||||
let agent = view.agent().map(|a| a.as_str().to_string());
|
||||
let depends_on = {
|
||||
let d = view.depends_on();
|
||||
if d.is_empty() {
|
||||
None
|
||||
} else {
|
||||
serde_json::to_string(d).ok()
|
||||
}
|
||||
};
|
||||
let retry_count = Some(i64::from(view.retry_count()));
|
||||
let msg = PipelineWriteMsg {
|
||||
story_id: story_id.to_string(),
|
||||
stage,
|
||||
name,
|
||||
agent,
|
||||
retry_count,
|
||||
depends_on,
|
||||
content: None,
|
||||
};
|
||||
let _ = db.tx.send(msg);
|
||||
}
|
||||
|
||||
/// Delete a story from the shadow table (fire-and-forget).
|
||||
pub fn delete_item(story_id: &str) {
|
||||
delete_content(ContentKey::Story(story_id));
|
||||
@@ -198,6 +241,111 @@ pub fn delete_item(story_id: &str) {
|
||||
}
|
||||
}
|
||||
|
||||
/// Delete a story from the shadow table, awaiting the SQLite write.
|
||||
///
|
||||
/// Unlike [`delete_item`], this function issues a direct `DELETE FROM
|
||||
/// pipeline_items` via the shared pool and awaits the result — so the row
|
||||
/// is gone before this function returns. Use this from async call sites
|
||||
/// where durability of the deletion matters (e.g. story deletion, startup
|
||||
/// migration). Falls back to the fire-and-forget channel when the shared
|
||||
/// pool is not yet initialised.
|
||||
pub async fn delete_item_sync(story_id: &str) {
|
||||
delete_content(ContentKey::Story(story_id));
|
||||
|
||||
if let Some(pool) = super::shadow_write::get_shared_pool() {
|
||||
if let Err(e) = sqlx::query("DELETE FROM pipeline_items WHERE id = ?1")
|
||||
.bind(story_id)
|
||||
.execute(pool)
|
||||
.await
|
||||
{
|
||||
crate::slog_warn!(
|
||||
"[db] Synchronous delete from pipeline_items failed for '{}': {e}",
|
||||
story_id
|
||||
);
|
||||
}
|
||||
} else if let Some(db) = PIPELINE_DB.get() {
|
||||
let msg = PipelineWriteMsg {
|
||||
story_id: story_id.to_string(),
|
||||
stage: "deleted".to_string(),
|
||||
name: None,
|
||||
agent: None,
|
||||
retry_count: None,
|
||||
depends_on: None,
|
||||
content: None,
|
||||
};
|
||||
let _ = db.tx.send(msg);
|
||||
}
|
||||
}
|
||||
|
||||
/// Sync the shadow table's `name` column after a CRDT name-register write.
|
||||
///
|
||||
/// Reads the current item from the CRDT (which already holds the new name after
|
||||
/// `apply_and_persist`) and sends a `PipelineWriteMsg` so the SQLite mirror
|
||||
/// stays in sync. All other columns (stage, agent, retry_count, depends_on)
|
||||
/// are preserved from the live CRDT view; `content` is left as `None` so the
|
||||
/// UPSERT's `COALESCE` keeps the existing value.
|
||||
///
|
||||
/// No-ops if the DB is not initialised or the item is not in the CRDT.
|
||||
pub fn sync_item_name(story_id: &str) {
|
||||
let Some(db) = PIPELINE_DB.get() else { return };
|
||||
let Some(view) = crate::crdt_state::read_item(story_id) else {
|
||||
return;
|
||||
};
|
||||
let depends_on = {
|
||||
let d = view.depends_on();
|
||||
if d.is_empty() {
|
||||
None
|
||||
} else {
|
||||
serde_json::to_string(d).ok()
|
||||
}
|
||||
};
|
||||
let msg = PipelineWriteMsg {
|
||||
story_id: story_id.to_string(),
|
||||
stage: view.stage().dir_name().to_string(),
|
||||
name: Some(view.name().to_string()),
|
||||
agent: view.agent().map(|a| a.to_string()),
|
||||
retry_count: Some(view.retry_count() as i64),
|
||||
depends_on,
|
||||
content: None,
|
||||
};
|
||||
let _ = db.tx.send(msg);
|
||||
}
|
||||
|
||||
/// Sync the `depends_on` field of a pipeline item from the CRDT to the shadow table.
|
||||
///
|
||||
/// Called after [`crate::crdt_state::set_depends_on`] updates the CRDT register so
|
||||
/// that the SQLite shadow table stays in lock-step. Reads the full current view from
|
||||
/// the CRDT (stage, name, agent, retry_count, depends_on) and sends a
|
||||
/// [`PipelineWriteMsg`] over [`PIPELINE_DB`]`.tx`. Pattern mirrors
|
||||
/// [`move_item_stage`] lines 157-176. No-op when the CRDT is uninitialised or the
|
||||
/// story_id is not found.
|
||||
pub fn sync_item_depends_on(story_id: &str) {
|
||||
let Some(db) = PIPELINE_DB.get() else {
|
||||
return;
|
||||
};
|
||||
let Some(view) = crate::crdt_state::read_item(story_id) else {
|
||||
return;
|
||||
};
|
||||
let depends_on = {
|
||||
let d = view.depends_on();
|
||||
if d.is_empty() {
|
||||
None
|
||||
} else {
|
||||
serde_json::to_string(d).ok()
|
||||
}
|
||||
};
|
||||
let msg = PipelineWriteMsg {
|
||||
story_id: story_id.to_string(),
|
||||
stage: view.stage().dir_name().to_string(),
|
||||
name: Some(view.name().to_string()),
|
||||
agent: view.agent().map(|a| a.to_string()),
|
||||
retry_count: Some(view.retry_count() as i64),
|
||||
depends_on,
|
||||
content: None,
|
||||
};
|
||||
let _ = db.tx.send(msg);
|
||||
}
|
||||
|
||||
/// Get the next available item number by scanning the CRDT state, the
|
||||
/// in-memory content store, AND the tombstone set for the highest existing
|
||||
/// number.
|
||||
@@ -248,3 +396,88 @@ pub fn next_item_number() -> u32 {
|
||||
|
||||
max_num + 1
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::db::shadow_write;

    /// Runtime that outlives individual tests.
    ///
    /// `shadow_write::init` spawns its background write loop on whichever
    /// runtime calls it. A `#[tokio::test]` runtime is dropped when its test
    /// finishes, which would kill that loop, so initialisation is routed
    /// through this process-wide multi-thread runtime instead. It lives as
    /// long as the test binary and keeps the loop alive for every test that
    /// shares `PIPELINE_DB`.
    static SHADOW_RT: std::sync::OnceLock<tokio::runtime::Runtime> = std::sync::OnceLock::new();

    /// Build the single-worker runtime stored in [`SHADOW_RT`].
    fn build_shadow_runtime() -> tokio::runtime::Runtime {
        let mut builder = tokio::runtime::Builder::new_multi_thread();
        builder.worker_threads(1).enable_all();
        builder.build().expect("shadow rt")
    }

    /// Initialise the shadow DB on [`SHADOW_RT`], backed by a leaked temp directory.
    ///
    /// Returns immediately once a previous call has completed.
    async fn ensure_shadow_db() {
        static INIT: std::sync::OnceLock<()> = std::sync::OnceLock::new();
        if INIT.get().is_some() {
            return;
        }

        let init_task = SHADOW_RT.get_or_init(build_shadow_runtime).spawn(async {
            static INNER: std::sync::OnceLock<()> = std::sync::OnceLock::new();
            if INNER.get().is_none() {
                let dir = tempfile::tempdir().expect("tmp");
                let db_file = dir.path().join("pipeline.db");
                // Leak the TempDir so its destructor never removes the directory
                // while the database file is in use.
                std::mem::forget(dir);
                shadow_write::init(&db_file).await.expect("shadow init");
                let _ = INNER.set(());
            }
        });
        init_task.await.expect("shadow init task");

        let _ = INIT.set(());
    }

    /// Regression test for story 1097: `set_depends_on` has to reach the
    /// shadow table. Previously only the CRDT register changed and the
    /// `pipeline_items.depends_on` column was never written.
    #[tokio::test]
    async fn set_depends_on_syncs_shadow_table() {
        // Shadow writes are queued to a background task; give each one time to land.
        let settle = std::time::Duration::from_millis(50);

        crate::crdt_state::init_for_test();
        ensure_content_store();
        ensure_shadow_db().await;

        let story_id = "1097_story_depends_on_shadow_drift";

        // Seed the story so both the CRDT and the shadow table know about it.
        write_item_with_content(
            story_id,
            "backlog",
            "---\nname: Depends On Shadow Drift\n---\n",
            ItemMeta::named("Depends On Shadow Drift"),
        );
        tokio::time::sleep(settle).await;

        // The write under test.
        let updated = crate::crdt_state::set_depends_on(story_id, &[1, 2]);
        assert!(
            updated,
            "set_depends_on must return true for an existing item"
        );
        tokio::time::sleep(settle).await;

        let pool = shadow_write::get_shared_pool().expect("pool must be initialised");
        let (stored,): (Option<String>,) =
            sqlx::query_as("SELECT depends_on FROM pipeline_items WHERE id = ?1")
                .bind(story_id)
                .fetch_one(pool)
                .await
                .expect("row must exist in shadow table");

        assert_eq!(
            stored.as_deref(),
            Some("[1,2]"),
            "pipeline_items.depends_on must reflect the set_depends_on call"
        );
    }
}
|
||||
|
||||
@@ -41,23 +41,30 @@ pub fn get_shared_pool() -> Option<&'static SqlitePool> {
|
||||
}
|
||||
|
||||
/// A pending shadow write for one pipeline item.
|
||||
pub(super) struct PipelineWriteMsg {
|
||||
pub(super) story_id: String,
|
||||
pub(super) stage: String,
|
||||
pub(super) name: Option<String>,
|
||||
pub(super) agent: Option<String>,
|
||||
pub(super) retry_count: Option<i64>,
|
||||
pub(super) depends_on: Option<String>,
|
||||
pub(super) content: Option<String>,
|
||||
pub(crate) struct PipelineWriteMsg {
|
||||
pub(crate) story_id: String,
|
||||
pub(crate) stage: String,
|
||||
pub(crate) name: Option<String>,
|
||||
pub(crate) agent: Option<String>,
|
||||
pub(crate) retry_count: Option<i64>,
|
||||
pub(crate) depends_on: Option<String>,
|
||||
pub(crate) content: Option<String>,
|
||||
}
|
||||
|
||||
/// Handle to the background shadow-write task.
|
||||
pub struct PipelineDb {
|
||||
pub(super) tx: mpsc::UnboundedSender<PipelineWriteMsg>,
|
||||
pub(crate) tx: mpsc::UnboundedSender<PipelineWriteMsg>,
|
||||
}
|
||||
|
||||
/// Process-global handle to the background shadow-write task, set once during `init`.
|
||||
pub(super) static PIPELINE_DB: OnceLock<PipelineDb> = OnceLock::new();
|
||||
pub(crate) static PIPELINE_DB: OnceLock<PipelineDb> = OnceLock::new();
|
||||
|
||||
/// Path of the SQLite file opened by [`init`], set once by the first successful caller.
|
||||
///
|
||||
/// Tests that need to open their own pool (because sqlx pools are not safe to
|
||||
/// share across Tokio runtimes) read this to find the right file regardless of
|
||||
/// which test won the `PIPELINE_DB` init race.
|
||||
pub(crate) static SHADOW_DB_PATH: OnceLock<std::path::PathBuf> = OnceLock::new();
|
||||
|
||||
/// Initialise the pipeline database.
|
||||
///
|
||||
@@ -68,6 +75,10 @@ pub async fn init(db_path: &Path) -> Result<(), sqlx::Error> {
|
||||
if PIPELINE_DB.get().is_some() {
|
||||
return Ok(());
|
||||
}
|
||||
// Record the path before doing any real work so tests can always find the
|
||||
// correct file even if two callers race — the OnceLock ensures only one
|
||||
// path wins, and whichever wins will also win the PIPELINE_DB set below.
|
||||
let _ = SHADOW_DB_PATH.set(db_path.to_path_buf());
|
||||
|
||||
// Story 1087: before running the migration that splits `stage` into
|
||||
// (`pipeline`, `status`), take a timestamped side-car copy of the live DB
|
||||
|
||||
@@ -0,0 +1,320 @@
|
||||
//! Pipeline transition event log — persists every `TransitionFired` event into
|
||||
//! the CRDT so the log survives server restarts and replicates across nodes.
|
||||
//!
|
||||
//! ## Design
|
||||
//!
|
||||
//! Each [`TransitionFired`][crate::pipeline_state::TransitionFired] is written
|
||||
//! as an [`EventLogEntryCrdt`][crate::crdt_state::EventLogEntryCrdt] entry in
|
||||
//! the `PipelineDoc::event_log` grow-only list. Because the list is backed by
|
||||
//! CRDT ops that are persisted to SQLite and replayed on startup, the log
|
||||
//! survives `rebuild_and_restart` without any additional bookkeeping.
|
||||
//!
|
||||
//! A monotonic per-sled sequence number (`event_seq`) is computed atomically
|
||||
//! while the CRDT lock is held, guaranteeing that no two entries from the same
|
||||
//! sled share a sequence number and that the numbers are contiguous from 0.
|
||||
|
||||
#![allow(dead_code)]
|
||||
|
||||
use chrono::DateTime;
|
||||
|
||||
/// Logical, per-sled, monotonically increasing identifier of a pipeline event.
///
/// The event-log subscriber counts events in the order it should have seen
/// them; an `EventId` is the position an event *would have had* in that
/// contiguous stream. It is not the CRDT `event_seq`, which counts stored
/// entries and therefore also counts gap sentinels. The logical id is what
/// lets a gap sentinel name the exact range of events that were dropped.
pub type EventId = u64;
|
||||
|
||||
/// A snapshot of a single persisted pipeline transition event.
|
||||
///
|
||||
/// Constructed by [`read_event_log`] from the raw CRDT entries.
|
||||
pub struct LoggedEvent {
|
||||
/// Monotonic sequence number for `sled_id` (0-based, contiguous).
|
||||
pub event_id: u64,
|
||||
/// Hex-encoded Ed25519 public key of the sled that recorded this event.
|
||||
pub sled_id: String,
|
||||
/// UTC timestamp when the transition fired.
|
||||
pub at: DateTime<chrono::Utc>,
|
||||
/// Story ID of the work item that transitioned.
|
||||
pub story_id: String,
|
||||
/// Human-readable label of the stage before the transition.
|
||||
pub from_stage: String,
|
||||
/// Human-readable label of the stage after the transition.
|
||||
pub to_stage: String,
|
||||
/// String label of the `PipelineEvent` variant that triggered the transition.
|
||||
pub pipeline_event: String,
|
||||
}
|
||||
|
||||
/// Write a single `TransitionFired` event into the CRDT event log.
|
||||
///
|
||||
/// Computes the next monotonic `event_seq` for this sled atomically inside
|
||||
/// the CRDT write lock and appends the entry. No-ops when the CRDT is not
|
||||
/// yet initialised (e.g. in gateway mode with no project root).
|
||||
pub fn log_transition_event(fired: &crate::pipeline_state::TransitionFired) {
|
||||
let sled_id = crate::crdt_state::our_node_id().unwrap_or_default();
|
||||
let timestamp = fired.at.timestamp() as f64;
|
||||
let from_stage = crate::pipeline_state::stage_label(&fired.before);
|
||||
let to_stage = crate::pipeline_state::stage_label(&fired.after);
|
||||
let pipeline_event = crate::pipeline_state::event_label(&fired.event);
|
||||
|
||||
crate::crdt_state::append_event_log_entry(
|
||||
&sled_id,
|
||||
timestamp,
|
||||
&fired.story_id.0,
|
||||
from_stage,
|
||||
to_stage,
|
||||
pipeline_event,
|
||||
);
|
||||
}
|
||||
|
||||
/// Read all persisted events from the CRDT event log.
|
||||
///
|
||||
/// Entries are returned sorted by `(sled_id, event_id)` so that events from
|
||||
/// each sled appear in monotonic order. Entries with malformed CRDT fields
|
||||
/// are silently dropped.
|
||||
pub fn read_event_log() -> Vec<LoggedEvent> {
|
||||
let mut entries: Vec<LoggedEvent> = crate::crdt_state::read_all_event_log_entries()
|
||||
.into_iter()
|
||||
.map(|raw| LoggedEvent {
|
||||
event_id: raw.event_seq,
|
||||
sled_id: raw.sled_id,
|
||||
at: DateTime::from_timestamp(raw.timestamp as i64, 0).unwrap_or_default(),
|
||||
story_id: raw.story_id,
|
||||
from_stage: raw.from_stage,
|
||||
to_stage: raw.to_stage,
|
||||
pipeline_event: raw.pipeline_event,
|
||||
})
|
||||
.collect();
|
||||
entries.sort_by(|a, b| a.sled_id.cmp(&b.sled_id).then(a.event_id.cmp(&b.event_id)));
|
||||
entries
|
||||
}
|
||||
|
||||
/// Append a gap sentinel to the event log for the local sled.
|
||||
///
|
||||
/// Encodes the logical [`EventId`] range `[from_id, to_id]` of dropped events
|
||||
/// using the `EventStreamGap` pipeline event marker. Should be called whenever
|
||||
/// the event-log subscriber detects a lag in the broadcast channel so that no
|
||||
/// drop is silent.
|
||||
pub fn insert_gap_sentinel(from_id: EventId, to_id: EventId) {
|
||||
let sled_id = crate::crdt_state::our_node_id().unwrap_or_default();
|
||||
crate::crdt_state::append_gap_log_entry(&sled_id, from_id, to_id);
|
||||
log_gap_observability(&sled_id, from_id, to_id);
|
||||
}
|
||||
|
||||
/// Spawn a background task that persists every `TransitionFired` event to the CRDT.
|
||||
///
|
||||
/// Subscribes to the global `TransitionFired` broadcast channel. Normal events
|
||||
/// are persisted via [`log_transition_event`]. When the subscriber lags (the
|
||||
/// broadcast channel drops the oldest messages), a single
|
||||
/// `EventStreamGap` sentinel is appended to the log covering the dropped range
|
||||
/// so no transition is silently lost.
|
||||
pub fn spawn_event_log_subscriber() {
|
||||
let mut rx = crate::pipeline_state::subscribe_transitions();
|
||||
tokio::spawn(async move {
|
||||
// Tracks the next expected logical sequence number in the subscriber's
|
||||
// view of the event stream. Incremented on every successfully processed
|
||||
// event; advanced by the gap size on each lag so we can identify the
|
||||
// exact logical range of dropped events.
|
||||
let mut next_logical_seq: EventId = 0;
|
||||
|
||||
loop {
|
||||
match rx.recv().await {
|
||||
Ok(fired) => {
|
||||
log_transition_event(&fired);
|
||||
next_logical_seq += 1;
|
||||
}
|
||||
Err(tokio::sync::broadcast::error::RecvError::Lagged(n)) => {
|
||||
let from = next_logical_seq;
|
||||
let to = next_logical_seq + n - 1;
|
||||
crate::slog_warn!(
|
||||
"[event-log] Subscriber lagged; {n} event(s) dropped \
|
||||
(logical ids {from}..={to}); gap sentinel appended."
|
||||
);
|
||||
insert_gap_sentinel(from, to);
|
||||
next_logical_seq += n;
|
||||
}
|
||||
Err(tokio::sync::broadcast::error::RecvError::Closed) => break,
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
/// Emit observability log lines after inserting a gap sentinel.
|
||||
fn log_gap_observability(sled_id: &str, from_id: EventId, to_id: EventId) {
|
||||
let entries = crate::crdt_state::read_all_event_log_entries();
|
||||
let sled_total: usize = entries.iter().filter(|e| e.sled_id == sled_id).count();
|
||||
let gap_count: usize = entries
|
||||
.iter()
|
||||
.filter(|e| {
|
||||
e.sled_id == sled_id && e.pipeline_event == crate::crdt_state::GAP_PIPELINE_EVENT
|
||||
})
|
||||
.count();
|
||||
crate::slog!(
|
||||
"[event-log] gap inserted sled={sled_id} from={from_id} to={to_id} \
|
||||
sled_entries={sled_total} gap_count={gap_count}"
|
||||
);
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::crdt_state::PipelineDoc;
    use crate::pipeline_state::{PipelineEvent, PlanState, Stage, StoryId, TransitionFired};
    use bft_json_crdt::json_crdt::{BaseCrdt, CrdtNode, JsonValue, OpState};
    use bft_json_crdt::keypair::make_keypair;
    use bft_json_crdt::op::ROOT_ID;
    use serde_json::json;

    /// Build a Backlog -> Coding transition for story `test_{i}`, stamped "now".
    fn make_fired(i: u32) -> TransitionFired {
        let story_id = StoryId(format!("test_{i}"));
        let after = Stage::Coding {
            claim: None,
            plan: PlanState::Missing,
            retries: 0,
        };
        TransitionFired {
            story_id,
            before: Stage::Backlog,
            after,
            event: PipelineEvent::DepsMet,
            at: chrono::Utc::now(),
        }
    }

    /// AC4: write N entries, simulate a restart by replaying every op onto a
    /// fresh CRDT document, and check that all N entries come back in
    /// insertion order with monotonically increasing ids.
    #[test]
    fn event_log_survives_crdt_reinit() {
        let keypair = make_keypair();
        let mut original = BaseCrdt::<PipelineDoc>::new(&keypair);
        let sled_id = crate::crdt_state::hex::encode(&original.id);

        let n = 5usize;
        let mut signed_ops = Vec::new();
        // Each insert is anchored on the previous op's id so that entries are
        // appended; anchoring on ROOT_ID every time would put each new entry
        // at the front and reverse the sequence.
        let mut anchor = ROOT_ID;

        for i in 0..n {
            let entry: JsonValue = json!({
                "event_seq": i as f64,
                "sled_id": &sled_id,
                "timestamp": 1_000_000.0_f64 + i as f64,
                "story_id": format!("story_{i}"),
                "from_stage": "backlog",
                "to_stage": "coding",
                "pipeline_event": "DepsMet",
            })
            .into();
            let op = original.doc.event_log.insert(anchor, entry).sign(&keypair);
            anchor = op.inner.id;
            assert_eq!(original.apply(op.clone()), OpState::Ok);
            signed_ops.push(op);
        }

        assert_eq!(original.doc.event_log.view().len(), n);

        // "Restart": a brand-new document that only sees the recorded ops.
        let mut replayed = BaseCrdt::<PipelineDoc>::new(&keypair);
        for op in signed_ops {
            assert_eq!(replayed.apply(op), OpState::Ok);
        }

        assert_eq!(
            replayed.doc.event_log.view().len(),
            n,
            "all {n} entries must survive CRDT re-init"
        );

        // Insertion order and increasing ids must both be preserved.
        for i in 0..n {
            let entry = &replayed.doc.event_log[i];
            let seq = match entry.event_seq.view() {
                JsonValue::Number(v) => v as u64,
                other => panic!("expected numeric event_seq at index {i}, got {other:?}"),
            };
            assert_eq!(seq, i as u64, "event_seq must equal insertion index {i}");
            assert_eq!(
                entry.story_id.view(),
                JsonValue::String(format!("story_{i}")),
                "story_id mismatch at index {i}"
            );
            assert_eq!(
                entry.sled_id.view(),
                JsonValue::String(sled_id.clone()),
                "sled_id mismatch at index {i}"
            );
        }
    }

    /// AC4: insert a gap sentinel between real events, then check that (a) the
    /// sentinel is returned by the event log and (b) the assembled prompt
    /// context contains the human-readable gap line.
    #[test]
    fn gap_sentinel_in_log_and_assembled_context() {
        crate::crdt_state::init_for_test();

        // Three real events (logical ids 0, 1, 2) ...
        (0..3u32).for_each(|i| log_transition_event(&make_fired(i)));
        // ... then pretend logical ids 3..=5 were dropped by an overflowing queue ...
        insert_gap_sentinel(3, 5);
        // ... and one more real event after the gap.
        log_transition_event(&make_fired(99));

        // (a) The sentinel is visible through read_event_log().
        let entries = read_event_log();
        let gap = entries
            .iter()
            .find(|e| e.pipeline_event == crate::crdt_state::GAP_PIPELINE_EVENT);
        assert!(gap.is_some(), "gap sentinel must be present in event log");
        let gap = gap.unwrap();
        // The dropped range is carried in the stage fields: from -> from_stage, to -> to_stage.
        assert_eq!(gap.from_stage, "3", "gap from_stage must be '3'");
        assert_eq!(gap.to_stage, "5", "gap to_stage must be '5'");

        // (b) The prompt context renders the gap and still lists the real events.
        let ctx = crate::llm_session::assemble_prompt_context("room-gap-e2e");
        assert!(
            ctx.contains("events between 3 and 5 were dropped"),
            "assembled context must contain gap line; got: {ctx}"
        );
        assert!(
            ctx.contains("test_0"),
            "first story must appear; got: {ctx}"
        );
        assert!(
            ctx.contains("test_99"),
            "last story must appear; got: {ctx}"
        );
    }

    /// AC2: every `TransitionFired` event reaches the log; nothing is filtered.
    #[test]
    fn log_transition_event_appends_all_events() {
        crate::crdt_state::init_for_test();

        let n = 4u32;
        (0..n).for_each(|i| log_transition_event(&make_fired(i)));

        let entries = crate::crdt_state::read_all_event_log_entries();
        assert_eq!(
            entries.len(),
            n as usize,
            "expected {n} event log entries, got {}",
            entries.len()
        );

        // The sequence numbers must be exactly 0..n once sorted.
        let mut seqs = Vec::with_capacity(entries.len());
        for entry in &entries {
            seqs.push(entry.event_seq);
        }
        seqs.sort_unstable();
        let expected: Vec<u64> = (0..u64::from(n)).collect();
        assert_eq!(seqs, expected, "event_seq values must be 0..{n}");
    }
}
|
||||
@@ -62,13 +62,6 @@ pub fn build_gateway_route(state_arc: Arc<GatewayState>) -> impl poem::Endpoint
|
||||
"/gateway/agents/:id/assign",
|
||||
poem::post(gateway_assign_agent_handler),
|
||||
)
|
||||
// Serve the embedded React frontend so the gateway has a UI.
|
||||
.at(
|
||||
"/assets/*path",
|
||||
poem::get(crate::http::assets::embedded_asset),
|
||||
)
|
||||
.at("/*path", poem::get(crate::http::assets::embedded_file))
|
||||
.at("/", poem::get(crate::http::assets::embedded_index))
|
||||
.data(state_arc)
|
||||
}
|
||||
|
||||
@@ -113,7 +106,6 @@ pub async fn run(config_path: &Path, port: u16) -> Result<(), std::io::Error> {
|
||||
}
|
||||
|
||||
// Spawn the Matrix bot if `.huskies/bot.toml` exists in the config directory.
|
||||
let gateway_projects: Vec<String> = state_arc.projects.read().await.keys().cloned().collect();
|
||||
let gateway_project_urls: std::collections::BTreeMap<String, String> = state_arc
|
||||
.projects
|
||||
.read()
|
||||
@@ -124,8 +116,8 @@ pub async fn run(config_path: &Path, port: u16) -> Result<(), std::io::Error> {
|
||||
let (bot_abort, bot_shutdown_tx) = gateway::io::spawn_gateway_bot(
|
||||
&config_dir,
|
||||
Arc::clone(&state_arc.active_project),
|
||||
gateway_projects,
|
||||
gateway_project_urls,
|
||||
Arc::clone(&state_arc.projects),
|
||||
port,
|
||||
Some(state_arc.event_tx.clone()),
|
||||
Arc::clone(&state_arc.perm_rx),
|
||||
|
||||
@@ -1175,6 +1175,8 @@ async fn ws_only_sled_handles_tools_list_and_tools_call() {
|
||||
ProjectEntry {
|
||||
url: None,
|
||||
auth_token: Some("secret".into()),
|
||||
ssh_port: None,
|
||||
host_path: None,
|
||||
},
|
||||
);
|
||||
let config = GatewayConfig {
|
||||
@@ -1244,6 +1246,8 @@ async fn two_concurrent_sleds_are_routed_by_active_project() {
|
||||
ProjectEntry {
|
||||
url: None,
|
||||
auth_token: Some("alpha-tok".into()),
|
||||
ssh_port: None,
|
||||
host_path: None,
|
||||
},
|
||||
);
|
||||
projects.insert(
|
||||
@@ -1251,6 +1255,8 @@ async fn two_concurrent_sleds_are_routed_by_active_project() {
|
||||
ProjectEntry {
|
||||
url: None,
|
||||
auth_token: Some("beta-tok".into()),
|
||||
ssh_port: None,
|
||||
host_path: None,
|
||||
},
|
||||
);
|
||||
let config = GatewayConfig {
|
||||
|
||||
@@ -1,149 +0,0 @@
|
||||
//! Static asset serving — serves the embedded React frontend via `rust-embed`.
|
||||
use poem::{
|
||||
Response, handler,
|
||||
http::{StatusCode, header},
|
||||
web::Path,
|
||||
};
|
||||
use rust_embed::RustEmbed;
|
||||
|
||||
#[derive(RustEmbed)]
|
||||
#[folder = "../frontend/dist"]
|
||||
struct EmbeddedAssets;
|
||||
|
||||
fn serve_embedded(path: &str) -> Response {
|
||||
let normalized = if path.is_empty() {
|
||||
"index.html"
|
||||
} else {
|
||||
path.trim_start_matches('/')
|
||||
};
|
||||
|
||||
let is_asset_request = normalized.starts_with("assets/");
|
||||
let asset = if is_asset_request {
|
||||
EmbeddedAssets::get(normalized)
|
||||
} else {
|
||||
EmbeddedAssets::get(normalized).or_else(|| {
|
||||
if normalized == "index.html" {
|
||||
None
|
||||
} else {
|
||||
EmbeddedAssets::get("index.html")
|
||||
}
|
||||
})
|
||||
};
|
||||
|
||||
match asset {
|
||||
Some(content) => {
|
||||
let body = content.data.into_owned();
|
||||
let mime = mime_guess::from_path(normalized)
|
||||
.first_or_octet_stream()
|
||||
.to_string();
|
||||
|
||||
Response::builder()
|
||||
.status(StatusCode::OK)
|
||||
.header(header::CONTENT_TYPE, mime)
|
||||
.body(body)
|
||||
}
|
||||
None => Response::builder()
|
||||
.status(StatusCode::NOT_FOUND)
|
||||
.body("Not Found"),
|
||||
}
|
||||
}
|
||||
|
||||
/// Serve a single embedded asset from the `assets/` folder.
|
||||
#[handler]
|
||||
pub fn embedded_asset(Path(path): Path<String>) -> Response {
|
||||
let asset_path = format!("assets/{path}");
|
||||
serve_embedded(&asset_path)
|
||||
}
|
||||
|
||||
/// Serve an embedded file by path (falls back to `index.html` for SPA routing).
|
||||
#[handler]
|
||||
pub fn embedded_file(Path(path): Path<String>) -> Response {
|
||||
serve_embedded(&path)
|
||||
}
|
||||
|
||||
/// Serve the embedded SPA entrypoint.
|
||||
#[handler]
|
||||
pub fn embedded_index() -> Response {
|
||||
serve_embedded("index.html")
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use poem::http::StatusCode;

    /// Shared check for responses whose status depends on whether a frontend
    /// `dist/` was embedded: 200 when it was, 404 when it was not.
    fn assert_ok_or_not_found(status: StatusCode) {
        assert!(
            status == StatusCode::OK || status == StatusCode::NOT_FOUND,
            "unexpected status: {status}",
        );
    }

    #[test]
    fn non_asset_path_spa_fallback_or_not_found() {
        // A non-asset path falls back to index.html for client-side routing:
        // 200 in release builds (dist/ embedded), 404 in debug builds without
        // a built frontend.
        assert_ok_or_not_found(serve_embedded("__nonexistent_spa_route__.html").status());
    }

    #[test]
    fn missing_asset_path_prefix_returns_not_found() {
        // Under assets/ there is no SPA fallback, so a missing file is a 404.
        let status = serve_embedded("assets/__nonexistent__.js").status();
        assert_eq!(status, StatusCode::NOT_FOUND);
    }

    #[test]
    fn serve_embedded_does_not_panic_on_empty_path() {
        // The empty path normalises to index.html: 200 in release, 404 in
        // debug without dist/.
        assert_ok_or_not_found(serve_embedded("").status());
    }

    #[test]
    fn embedded_assets_struct_is_iterable() {
        // Proves rust-embed generated a usable EmbeddedAssets type. Without a
        // built frontend dist/ (debug builds) the iterator is simply empty; in
        // release builds it lists every bundled frontend file. Compiling and
        // not panicking is the whole test, so nothing is asserted.
        let _files: Vec<_> = EmbeddedAssets::iter().collect();
    }

    #[tokio::test]
    async fn embedded_index_handler_returns_ok_or_not_found() {
        // Exercise the handler through a TestClient; index.html is the SPA entry point.
        let cli = poem::test::TestClient::new(poem::Route::new().at("/", poem::get(embedded_index)));
        let resp = cli.get("/").send().await;
        assert_ok_or_not_found(resp.0.status());
    }

    #[tokio::test]
    async fn embedded_file_handler_with_path_returns_ok_or_not_found() {
        // A non-asset path either falls back to index.html (SPA routing) or is a 404.
        let cli =
            poem::test::TestClient::new(poem::Route::new().at("/*path", poem::get(embedded_file)));
        let resp = cli.get("/__spa_route__").send().await;
        assert_ok_or_not_found(resp.0.status());
    }

    #[tokio::test]
    async fn embedded_asset_handler_missing_file_returns_not_found() {
        // The assets/ prefix turns the SPA fallback off, so a missing file must be a 404.
        let cli = poem::test::TestClient::new(
            poem::Route::new().at("/assets/*path", poem::get(embedded_asset)),
        );
        let resp = cli.get("/assets/__nonexistent__.js").send().await;
        assert_eq!(resp.0.status(), StatusCode::NOT_FOUND);
    }
}
|
||||
@@ -118,6 +118,7 @@ impl AppContext {
|
||||
)),
|
||||
permission_timeout_secs: 120,
|
||||
status: agents.status_broadcaster(),
|
||||
chat_dispatcher: Arc::new(crate::chat::dispatcher::ChatDispatcher::new(1_500)),
|
||||
});
|
||||
Self {
|
||||
state: Arc::new(state),
|
||||
|
||||
@@ -20,6 +20,7 @@ const GATEWAY_TOOLS: &[&str] = &[
|
||||
"gateway_status",
|
||||
"gateway_health",
|
||||
"init_project",
|
||||
"adopt_project",
|
||||
"aggregate_pipeline_status",
|
||||
"agents.list",
|
||||
// Handled at the gateway so the Matrix bot's perm_rx listener is used
|
||||
@@ -82,6 +83,28 @@ pub(crate) fn gateway_tool_definitions() -> Vec<Value> {
|
||||
"required": ["path"]
|
||||
}
|
||||
}),
|
||||
json!({
|
||||
"name": "adopt_project",
|
||||
"description": "Wrap a Docker container around an existing host checkout — the same as `new project <name> --adopt <path>`. No git clone or git init is performed; the directory is bind-mounted at /workspace. Launches the appropriate stack-specific image, generates an SSH keypair, and registers the project in projects.toml. Returns the SSH connection command and detected stack.",
|
||||
"inputSchema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"name": {
|
||||
"type": "string",
|
||||
"description": "Short project name (letters, digits, hyphens, underscores). Must be unique across registered projects."
|
||||
},
|
||||
"path": {
|
||||
"type": "string",
|
||||
"description": "Absolute host filesystem path to the existing checkout to adopt. Must be an existing directory."
|
||||
},
|
||||
"stack": {
|
||||
"type": "string",
|
||||
"description": "Optional: override stack detection (e.g. 'rust', 'node', 'python'). Auto-detected from directory contents when omitted."
|
||||
}
|
||||
},
|
||||
"required": ["name", "path"]
|
||||
}
|
||||
}),
|
||||
json!({
|
||||
"name": "aggregate_pipeline_status",
|
||||
"description": "Fetch pipeline status from ALL registered projects in parallel and return an aggregated report. For each project: stage counts (backlog/current/qa/merge/done) and a list of blocked or failing items with triage detail. Unreachable projects are included with an error state rather than failing the whole call.",
|
||||
@@ -358,6 +381,7 @@ async fn handle_gateway_tool(
|
||||
"gateway_status" => handle_gateway_status_tool(state, id).await,
|
||||
"gateway_health" => handle_gateway_health_tool(state, id).await,
|
||||
"init_project" => handle_init_project_tool(params, state, id).await,
|
||||
"adopt_project" => handle_adopt_project_tool(params, state, id).await,
|
||||
"aggregate_pipeline_status" => handle_aggregate_pipeline_status_tool(state, id).await,
|
||||
"agents.list" => handle_agents_list_tool(id),
|
||||
"prompt_permission" => handle_prompt_permission_tool(params, state, id).await,
|
||||
@@ -525,6 +549,81 @@ async fn handle_init_project_tool(
|
||||
}
|
||||
}
|
||||
|
||||
/// Handle the `adopt_project` gateway tool.
|
||||
///
|
||||
/// Wraps a Docker container around an existing host checkout — the MCP
|
||||
/// equivalent of the `new project <name> --adopt <path>` chat command.
|
||||
/// Validates that `path` exists and is a directory before delegating to
|
||||
/// `handle_new_project`, which performs stack detection, container launch,
|
||||
/// SSH keypair generation, and project registration.
|
||||
async fn handle_adopt_project_tool(
|
||||
params: &Value,
|
||||
state: &GatewayState,
|
||||
id: Option<Value>,
|
||||
) -> JsonRpcResponse {
|
||||
use crate::chat::transport::matrix::new_project::handle_new_project;
|
||||
|
||||
let args = params.get("arguments").unwrap_or(params);
|
||||
let name = args
|
||||
.get("name")
|
||||
.and_then(|v| v.as_str())
|
||||
.unwrap_or("")
|
||||
.trim();
|
||||
let path_str = args
|
||||
.get("path")
|
||||
.and_then(|v| v.as_str())
|
||||
.unwrap_or("")
|
||||
.trim();
|
||||
let stack = args.get("stack").and_then(|v| v.as_str());
|
||||
|
||||
if name.is_empty() {
|
||||
return JsonRpcResponse::error(id, -32602, "missing required parameter: name".into());
|
||||
}
|
||||
if path_str.is_empty() {
|
||||
return JsonRpcResponse::error(id, -32602, "missing required parameter: path".into());
|
||||
}
|
||||
|
||||
let path = std::path::Path::new(path_str);
|
||||
if !path.exists() {
|
||||
return JsonRpcResponse::error(
|
||||
id,
|
||||
-32602,
|
||||
format!(
|
||||
"Adopt path `{path_str}` does not exist — specify the path to an existing checkout."
|
||||
),
|
||||
);
|
||||
}
|
||||
if !path.is_dir() {
|
||||
return JsonRpcResponse::error(
|
||||
id,
|
||||
-32602,
|
||||
format!("Adopt path `{path_str}` is not a directory."),
|
||||
);
|
||||
}
|
||||
|
||||
let result = handle_new_project(
|
||||
name,
|
||||
stack,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
Some(path_str),
|
||||
&state.projects,
|
||||
&state.config_dir,
|
||||
)
|
||||
.await;
|
||||
|
||||
JsonRpcResponse::success(
|
||||
id,
|
||||
json!({
|
||||
"content": [{
|
||||
"type": "text",
|
||||
"text": result
|
||||
}]
|
||||
}),
|
||||
)
|
||||
}
|
||||
|
||||
async fn handle_aggregate_pipeline_status_tool(
|
||||
state: &GatewayState,
|
||||
id: Option<Value>,
|
||||
@@ -686,3 +785,123 @@ async fn handle_pipeline_get(state: &GatewayState, id: Option<Value>) -> JsonRpc
|
||||
|
||||
JsonRpcResponse::success(id, json!({ "active": active, "projects": results }))
|
||||
}
|
||||
|
||||
// ── Tests ────────────────────────────────────────────────────────────────────
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::service::gateway::config::{GatewayConfig, ProjectEntry};
    use std::collections::BTreeMap;
    use std::sync::Arc;

    /// Build a shared `GatewayState` for tests.
    ///
    /// The config registers one project, `test-project`, at
    /// `http://127.0.0.1:3001` and has no sled tokens. `config_dir` is used
    /// as the gateway's config directory; `3000` is passed as the third
    /// constructor argument (presumably the gateway port — confirm against
    /// `GatewayState::new`).
    fn make_test_state(config_dir: &std::path::Path) -> Arc<GatewayState> {
        let mut projects = BTreeMap::new();
        projects.insert(
            "test-project".to_string(),
            ProjectEntry::with_url("http://127.0.0.1:3001"),
        );
        let config = GatewayConfig {
            projects,
            sled_tokens: BTreeMap::new(),
        };
        Arc::new(GatewayState::new(config, config_dir.to_path_buf(), 3000).unwrap())
    }

    /// Omitting `name` yields a JSON-RPC error that mentions the parameter.
    #[tokio::test]
    async fn adopt_project_tool_missing_name_returns_error() {
        let dir = tempfile::tempdir().unwrap();
        let state = make_test_state(dir.path());
        let params = json!({ "arguments": { "path": "/some/path" } });
        let resp = handle_adopt_project_tool(&params, &state, Some(json!(1))).await;
        assert!(resp.error.is_some(), "expected error for missing name");
        let msg = resp.error.unwrap().message;
        assert!(msg.contains("name"), "expected 'name' in error, got: {msg}");
    }

    /// Omitting `path` yields a JSON-RPC error that mentions the parameter.
    #[tokio::test]
    async fn adopt_project_tool_missing_path_returns_error() {
        let dir = tempfile::tempdir().unwrap();
        let state = make_test_state(dir.path());
        let params = json!({ "arguments": { "name": "myapp" } });
        let resp = handle_adopt_project_tool(&params, &state, Some(json!(1))).await;
        assert!(resp.error.is_some(), "expected error for missing path");
        let msg = resp.error.unwrap().message;
        assert!(msg.contains("path"), "expected 'path' in error, got: {msg}");
    }

    /// A path that does not exist on disk is rejected before delegation.
    #[tokio::test]
    async fn adopt_project_tool_nonexistent_path_returns_error() {
        let dir = tempfile::tempdir().unwrap();
        let state = make_test_state(dir.path());
        let params = json!({ "arguments": { "name": "myapp", "path": "/nonexistent/xyz/abc123" } });
        let resp = handle_adopt_project_tool(&params, &state, Some(json!(1))).await;
        assert!(resp.error.is_some(), "expected error for nonexistent path");
        let msg = resp.error.unwrap().message;
        assert!(
            msg.contains("does not exist"),
            "expected 'does not exist' in error, got: {msg}"
        );
    }

    /// A path that points at a regular file is rejected as "not a directory".
    #[tokio::test]
    async fn adopt_project_tool_file_path_returns_error() {
        let dir = tempfile::tempdir().unwrap();
        let file = dir.path().join("not_a_dir.txt");
        std::fs::write(&file, "content").unwrap();
        let state = make_test_state(dir.path());
        let params = json!({ "arguments": { "name": "myapp", "path": file.to_str().unwrap() } });
        let resp = handle_adopt_project_tool(&params, &state, Some(json!(1))).await;
        assert!(resp.error.is_some(), "expected error for file path");
        let msg = resp.error.unwrap().message;
        assert!(
            msg.contains("not a directory"),
            "expected 'not a directory' in error, got: {msg}"
        );
    }

    /// The MCP entry point produces the same validation outcome as the chat-routed call.
    ///
    /// Both paths are given a regular file as the adopt path. The chat path
    /// (`handle_new_project` in `new_project.rs`) reports the problem in its
    /// returned text, while the MCP path reports it as a JSON-RPC error; both
    /// must say "not a directory".
    #[tokio::test]
    async fn adopt_project_tool_matches_chat_routed_call() {
        use crate::chat::transport::matrix::new_project::handle_new_project;
        use tokio::sync::RwLock;

        let dir = tempfile::tempdir().unwrap();
        let file = dir.path().join("a_file.txt");
        std::fs::write(&file, "not a dir").unwrap();
        let file_path = file.to_str().unwrap();

        // Chat-routed: handle_new_project returns a text string with the error.
        let store = Arc::new(RwLock::new(BTreeMap::new()));
        let chat_result = handle_new_project(
            "myapp",
            None,
            None,
            None,
            None,
            Some(file_path),
            &store,
            dir.path(),
        )
        .await;
        assert!(
            chat_result.contains("not a directory"),
            "chat path should report 'not a directory', got: {chat_result}"
        );

        // MCP-routed: handle_adopt_project_tool returns a JSON-RPC error.
        let state = make_test_state(dir.path());
        let params = json!({ "arguments": { "name": "myapp2", "path": file_path } });
        let mcp_resp = handle_adopt_project_tool(&params, &state, Some(json!(1))).await;
        assert!(mcp_resp.error.is_some(), "MCP path should return an error");
        let mcp_msg = mcp_resp.error.unwrap().message;
        assert!(
            mcp_msg.contains("not a directory"),
            "MCP path should report 'not a directory', got: {mcp_msg}"
        );
    }
}
|
||||
|
||||
@@ -126,7 +126,7 @@ pub(crate) fn tool_dump_crdt(args: &Value) -> Result<String, String> {
|
||||
"total_ops_in_list": dump.total_ops_in_list,
|
||||
"max_seq_in_list": dump.max_seq_in_list,
|
||||
"persisted_ops_count": dump.persisted_ops_count,
|
||||
"pending_persist_ops_count": null,
|
||||
"pending_persist_ops_count": dump.pending_persist_ops_count,
|
||||
},
|
||||
"items": items,
|
||||
}))
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user