Compare commits
78 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 5bca1f6cec | |||
| 86b9d069b1 | |||
| f6ee90e169 | |||
| 9a286315a3 | |||
| 5d0801854c | |||
| 343473bc01 | |||
| 2593b36072 | |||
| 34af2f1820 | |||
| be7bdf8304 | |||
| 918f18c200 | |||
| 1db5473f50 | |||
| de638603cd | |||
| 20ec690e22 | |||
| 9a5b6f4d92 | |||
| 398726a14a | |||
| c8be24f833 | |||
| f8ff63af0e | |||
| 34e78bdbd5 | |||
| fb4e52dd09 | |||
| e58ff4465a | |||
| b1dec36e1c | |||
| 4aaf7dbdc6 | |||
| 95c0aafb68 | |||
| 5062e008c6 | |||
| 55badc1e08 | |||
| bdc621fb36 | |||
| 0ec5c05de8 | |||
| d10634c7d6 | |||
| a7bad217eb | |||
| f2c13c7d29 | |||
| 3444ff4e29 | |||
| 26f4da7ba5 | |||
| 4c6b4f5d4d | |||
| 70797753df | |||
| ec3216072d | |||
| 810c8d4d72 | |||
| ce688fc0bf | |||
| c131896432 | |||
| 42e6eec9e9 | |||
| fe00fe6a25 | |||
| c97b7c841f | |||
| 2d0387fe63 | |||
| 71d3047ef0 | |||
| d86cc38b2a | |||
| 21b2efd268 | |||
| badd522d60 | |||
| ecd3f600d9 | |||
| 099df17e77 | |||
| c88e42eba2 | |||
| 89058ebd49 | |||
| d8204ab7ed | |||
| e2ea1af4c8 | |||
| 08780475d0 | |||
| 6eb2742e7d | |||
| c1b7e12b0b | |||
| 53d44ff42a | |||
| 6331dea8b0 | |||
| 240beec7de | |||
| 7de167b21b | |||
| 49af014a84 | |||
| 73cf1c6ff9 | |||
| f8b1e14b74 | |||
| 265e6f9a15 | |||
| 40e995da88 | |||
| 6e4fb7fd4b | |||
| 0695ad7ae6 | |||
| eb6b07531a | |||
| 2d6846fe03 | |||
| a5bfd40233 | |||
| a40500eea9 | |||
| f8212f102f | |||
| 59302b465d | |||
| efafe44db1 | |||
| 6a2f81e873 | |||
| 3a43337735 | |||
| b6df89d24c | |||
| 10d992a7e4 | |||
| 5c63618b30 |
+2
-3
@@ -6,15 +6,14 @@
|
||||
# Local environment (secrets)
|
||||
.env
|
||||
|
||||
# Local-only scripts
|
||||
script/local-release
|
||||
|
||||
# App specific (root-level; huskies subdirectory patterns live in .huskies/.gitignore)
|
||||
store.json
|
||||
_merge_parsed.json
|
||||
.huskies_port
|
||||
.huskies/bot.toml.bak
|
||||
.huskies/build_hash
|
||||
# Phantom 0-byte pipeline.db sometimes appears at repo root from old code; canonical DB lives at .huskies/pipeline.db
|
||||
/pipeline.db
|
||||
|
||||
# Per-worktree planning file (written by coder agents, must never reach squash commits)
|
||||
PLAN.md
|
||||
|
||||
+1
-1
@@ -56,7 +56,7 @@ There are no exceptions. The merge gate runs `source-map-check` and rejects the
|
||||
Before committing, run `cargo run -p source-map-gen --bin source-map-check -- --worktree . --base master` and address every missing-docs direction it prints. If you added a new module file (e.g. `foo.rs` or `foo/mod.rs`), the FIRST line of that file MUST be a `//! What this module is for` doc comment.
|
||||
|
||||
## Documentation
|
||||
Docs live in `website/docs/*.html` (static HTML), **not** Markdown files. When a story asks you to document something, edit the relevant `.html` file in `website/docs/`.
|
||||
Docs live in `website/app/docs/*.tsx` (Next.js pages), **not** Markdown files. When a story asks you to document something, edit the relevant `.tsx` file under `website/app/docs/`. Run `npm run build` in `website/` to verify your changes render correctly.
|
||||
|
||||
## Configuration files
|
||||
- Agent config: `.huskies/agents.toml` (preferred) or `[[agent]]` blocks in `.huskies/project.toml`
|
||||
|
||||
+130
-11
@@ -696,6 +696,7 @@
|
||||
"server/src/agents/pool/start/spawn.rs": [
|
||||
"fn maybe_cap_for_merge_fixup",
|
||||
"fn maybe_inject_gate_failure",
|
||||
"fn inject_worktree_disallowed_tools",
|
||||
"fn run_agent_spawn"
|
||||
],
|
||||
"server/src/agents/pool/start/tests_concurrency.rs": [],
|
||||
@@ -805,6 +806,10 @@
|
||||
"fn build_backlog_from_items"
|
||||
],
|
||||
"server/src/chat/commands/cleanup_worktrees.rs": [],
|
||||
"server/src/chat/commands/convert.rs": [
|
||||
"fn handle_convert",
|
||||
"fn convert_by_number"
|
||||
],
|
||||
"server/src/chat/commands/cost.rs": [
|
||||
"fn handle_cost",
|
||||
"fn extract_agent_type"
|
||||
@@ -856,6 +861,9 @@
|
||||
"server/src/chat/commands/move_story.rs": [
|
||||
"fn handle_move"
|
||||
],
|
||||
"server/src/chat/commands/new_project.rs": [
|
||||
"fn handle_new_project_fallback"
|
||||
],
|
||||
"server/src/chat/commands/overview.rs": [
|
||||
"fn handle_overview"
|
||||
],
|
||||
@@ -975,6 +983,8 @@
|
||||
],
|
||||
"server/src/chat/transport/matrix/bot/format.rs": [
|
||||
"fn format_startup_announcement",
|
||||
"fn format_gateway_ready_announcement",
|
||||
"fn format_gateway_rollback_announcement",
|
||||
"fn markdown_to_html"
|
||||
],
|
||||
"server/src/chat/transport/matrix/bot/history.rs": [
|
||||
@@ -996,10 +1006,10 @@
|
||||
"fn handle_message"
|
||||
],
|
||||
"server/src/chat/transport/matrix/bot/messages/mod.rs": [
|
||||
"fn format_user_prompt",
|
||||
"fn format_drained_events"
|
||||
"fn format_user_prompt"
|
||||
],
|
||||
"server/src/chat/transport/matrix/bot/messages/on_room_message.rs": [
|
||||
"fn eval_switch_command",
|
||||
"fn on_room_message"
|
||||
],
|
||||
"server/src/chat/transport/matrix/bot/mod.rs": [
|
||||
@@ -1053,6 +1063,9 @@
|
||||
"fn extract_delete_command",
|
||||
"fn handle_delete"
|
||||
],
|
||||
"server/src/chat/transport/matrix/health.rs": [
|
||||
"fn run_health_check"
|
||||
],
|
||||
"server/src/chat/transport/matrix/htop.rs": [
|
||||
"enum HtopCommand",
|
||||
"struct HtopSession",
|
||||
@@ -1069,17 +1082,40 @@
|
||||
"mod commands",
|
||||
"mod config",
|
||||
"mod delete",
|
||||
"mod health",
|
||||
"mod htop",
|
||||
"mod new_project",
|
||||
"mod project_rebuild",
|
||||
"mod rebuild",
|
||||
"mod reset",
|
||||
"mod rmtree",
|
||||
"mod sled_upgrade",
|
||||
"mod start",
|
||||
"mod transport_impl",
|
||||
"fn spawn_bot"
|
||||
],
|
||||
"server/src/chat/transport/matrix/new_project.rs": [
|
||||
"struct NewProjectCommand",
|
||||
"fn extract_new_project_command",
|
||||
"fn apply_project_config",
|
||||
"fn detect_stack",
|
||||
"fn image_for_stack",
|
||||
"fn resolve_git_identity",
|
||||
"fn handle_new_project",
|
||||
"fn dockerfile_for_project",
|
||||
"fn build_project_image",
|
||||
"fn project_docker_run_args",
|
||||
"fn resolve_gateway_url"
|
||||
],
|
||||
"server/src/chat/transport/matrix/project_rebuild.rs": [
|
||||
"struct ProjectRebuildCommand",
|
||||
"fn extract_project_rebuild_command",
|
||||
"fn handle_project_rebuild"
|
||||
],
|
||||
"server/src/chat/transport/matrix/rebuild.rs": [
|
||||
"struct RebuildCommand",
|
||||
"fn extract_rebuild_command",
|
||||
"fn extract_rebuild_gateway_command",
|
||||
"fn handle_rebuild"
|
||||
],
|
||||
"server/src/chat/transport/matrix/reset.rs": [
|
||||
@@ -1092,6 +1128,12 @@
|
||||
"fn extract_rmtree_command",
|
||||
"fn handle_rmtree"
|
||||
],
|
||||
"server/src/chat/transport/matrix/sled_upgrade.rs": [
|
||||
"enum UpgradeCommand",
|
||||
"fn extract_upgrade_command",
|
||||
"fn handle_upgrade_list_projects",
|
||||
"fn handle_sled_upgrade"
|
||||
],
|
||||
"server/src/chat/transport/matrix/start.rs": [
|
||||
"enum StartCommand",
|
||||
"fn extract_start_command",
|
||||
@@ -1282,6 +1324,13 @@
|
||||
"fn delete_agent_throttle",
|
||||
"fn extract_agent_throttle_view"
|
||||
],
|
||||
"server/src/crdt_state/lww_maps/event_log.rs": [
|
||||
"const GAP_PIPELINE_EVENT",
|
||||
"struct EventLogEntryRaw",
|
||||
"fn append_event_log_entry",
|
||||
"fn append_gap_log_entry",
|
||||
"fn read_all_event_log_entries"
|
||||
],
|
||||
"server/src/crdt_state/lww_maps/gateway_projects.rs": [
|
||||
"fn write_gateway_project",
|
||||
"fn read_all_gateway_projects",
|
||||
@@ -1289,6 +1338,12 @@
|
||||
"fn delete_gateway_project",
|
||||
"fn extract_gateway_project_view"
|
||||
],
|
||||
"server/src/crdt_state/lww_maps/llm_sessions.rs": [
|
||||
"fn write_llm_session",
|
||||
"fn read_llm_session",
|
||||
"fn assemble_and_advance_session",
|
||||
"fn extract_llm_session_view"
|
||||
],
|
||||
"server/src/crdt_state/lww_maps/merge_jobs.rs": [
|
||||
"fn write_merge_job",
|
||||
"fn read_all_merge_jobs",
|
||||
@@ -1364,10 +1419,13 @@
|
||||
"fn rebuild_active_agent_index",
|
||||
"fn rebuild_test_job_index",
|
||||
"fn rebuild_agent_throttle_index",
|
||||
"fn rebuild_gateway_project_index"
|
||||
"fn rebuild_gateway_project_index",
|
||||
"fn rebuild_llm_session_index"
|
||||
],
|
||||
"server/src/crdt_state/state/init.rs": [
|
||||
"fn init"
|
||||
"enum PersistMsg",
|
||||
"fn init",
|
||||
"fn flush_persistence"
|
||||
],
|
||||
"server/src/crdt_state/state/mod.rs": [
|
||||
"fn subscribe",
|
||||
@@ -1378,6 +1436,7 @@
|
||||
"fn init_for_test"
|
||||
],
|
||||
"server/src/crdt_state/state/statics.rs": [
|
||||
"static PERSIST_PENDING",
|
||||
"static CRDT_EVENT_TX",
|
||||
"static SYNC_TX",
|
||||
"static ALL_OPS",
|
||||
@@ -1393,6 +1452,12 @@
|
||||
"struct CrdtEvent",
|
||||
"struct GatewayConfigCrdt",
|
||||
"struct PipelineDoc",
|
||||
"struct EventLogEntryCrdt",
|
||||
"struct LlmSessionCrdt",
|
||||
"enum ScopeFilter",
|
||||
"fn from_scope_str",
|
||||
"fn to_scope_str",
|
||||
"struct LlmSessionView",
|
||||
"struct PipelineItemCrdt",
|
||||
"struct NodePresenceCrdt",
|
||||
"struct EpicId",
|
||||
@@ -1583,10 +1648,22 @@
|
||||
"fn backup_pre_pipeline_status",
|
||||
"fn check_schema_drift"
|
||||
],
|
||||
"server/src/event_log/mod.rs": [
|
||||
"type EventId",
|
||||
"struct LoggedEvent",
|
||||
"fn log_transition_event",
|
||||
"fn read_event_log",
|
||||
"fn insert_gap_sentinel",
|
||||
"fn spawn_event_log_subscriber"
|
||||
],
|
||||
"server/src/gateway/mod.rs": [
|
||||
"mod rebuild",
|
||||
"fn build_gateway_route",
|
||||
"fn run"
|
||||
],
|
||||
"server/src/gateway/rebuild.rs": [
|
||||
"fn rebuild_gateway"
|
||||
],
|
||||
"server/src/gateway/tests.rs": [],
|
||||
"server/src/gateway_relay.rs": [
|
||||
"fn spawn_relay_task"
|
||||
@@ -1594,11 +1671,6 @@
|
||||
"server/src/http/agents_sse.rs": [
|
||||
"fn agent_stream"
|
||||
],
|
||||
"server/src/http/assets.rs": [
|
||||
"fn embedded_asset",
|
||||
"fn embedded_file",
|
||||
"fn embedded_index"
|
||||
],
|
||||
"server/src/http/context.rs": [
|
||||
"enum PermissionDecision",
|
||||
"struct PermissionForward",
|
||||
@@ -1733,6 +1805,11 @@
|
||||
"fn validate_working_dir",
|
||||
"fn tool_run_command"
|
||||
],
|
||||
"server/src/http/mcp/shell_tools/file_tools.rs": [
|
||||
"fn validate_worktree_file_path",
|
||||
"fn tool_edit",
|
||||
"fn tool_write"
|
||||
],
|
||||
"server/src/http/mcp/shell_tools/mod.rs": [],
|
||||
"server/src/http/mcp/shell_tools/script.rs": [
|
||||
"fn tool_run_tests",
|
||||
@@ -1773,6 +1850,9 @@
|
||||
"server/src/http/mcp/story_tools/spike.rs": [
|
||||
"fn tool_create_spike"
|
||||
],
|
||||
"server/src/http/mcp/story_tools/story/convert.rs": [
|
||||
"fn tool_convert_item_type"
|
||||
],
|
||||
"server/src/http/mcp/story_tools/story/create.rs": [
|
||||
"fn tool_create_story",
|
||||
"fn tool_purge_story"
|
||||
@@ -1831,7 +1911,6 @@
|
||||
],
|
||||
"server/src/http/mod.rs": [
|
||||
"mod agents_sse",
|
||||
"mod assets",
|
||||
"mod context",
|
||||
"mod events",
|
||||
"mod identity",
|
||||
@@ -1848,7 +1927,9 @@
|
||||
"fn health_handler",
|
||||
"fn build_routes",
|
||||
"fn rpc_http_handler",
|
||||
"fn debug_crdt_handler"
|
||||
"fn debug_crdt_handler",
|
||||
"fn upgrade_trigger_handler",
|
||||
"fn serve_binary_handler"
|
||||
],
|
||||
"server/src/http/oauth.rs": [
|
||||
"fn oauth_authorize",
|
||||
@@ -2164,6 +2245,9 @@
|
||||
"struct CompletionResponse",
|
||||
"trait ModelProvider"
|
||||
],
|
||||
"server/src/llm_session/mod.rs": [
|
||||
"fn assemble_prompt_context"
|
||||
],
|
||||
"server/src/log_buffer.rs": [
|
||||
"enum LogLevel",
|
||||
"fn as_str",
|
||||
@@ -2184,15 +2268,21 @@
|
||||
"mod crdt_state",
|
||||
"mod crdt_sync",
|
||||
"mod crdt_wire",
|
||||
"mod event_log",
|
||||
"mod gateway",
|
||||
"mod llm_session",
|
||||
"mod log_buffer",
|
||||
"mod mesh",
|
||||
"mod node_identity",
|
||||
"mod pidfile",
|
||||
"mod pipeline_event_bus",
|
||||
"mod pipeline_state",
|
||||
"mod process_kill",
|
||||
"mod rebuild",
|
||||
"mod services",
|
||||
"mod sled_uplink",
|
||||
"mod trampoline",
|
||||
"mod upgrade",
|
||||
"mod validation"
|
||||
],
|
||||
"server/src/mesh.rs": [
|
||||
@@ -2215,6 +2305,19 @@
|
||||
"fn init_identity",
|
||||
"fn get_identity"
|
||||
],
|
||||
"server/src/pidfile.rs": [
|
||||
"struct PidfileGuard",
|
||||
"fn acquire_gateway_pidfile",
|
||||
"fn acquire_gateway_pidfile_at"
|
||||
],
|
||||
"server/src/pipeline_event_bus.rs": [
|
||||
"struct BusEvent",
|
||||
"fn init",
|
||||
"fn broadcast",
|
||||
"fn subscribe",
|
||||
"fn render_event",
|
||||
"fn event_matches_persona"
|
||||
],
|
||||
"server/src/pipeline_state/apply.rs": [
|
||||
"enum ApplyError",
|
||||
"fn apply_transition",
|
||||
@@ -2952,6 +3055,7 @@
|
||||
"fn subscribe_logs",
|
||||
"fn subscribe_watcher",
|
||||
"fn subscribe_status",
|
||||
"fn subscribe_persona_pipeline_events",
|
||||
"fn subscribe_reconciliation"
|
||||
],
|
||||
"server/src/service/ws/message/convert.rs": [
|
||||
@@ -3024,6 +3128,19 @@
|
||||
"fn from_path",
|
||||
"fn path"
|
||||
],
|
||||
"server/src/trampoline.rs": [
|
||||
"struct TrampolineJob",
|
||||
"fn write_job_atomic",
|
||||
"fn spawn_detached_trampoline",
|
||||
"fn execute_trampoline_core",
|
||||
"fn run_trampoline"
|
||||
],
|
||||
"server/src/upgrade.rs": [
|
||||
"fn fetch_and_replace_binary",
|
||||
"fn upgrade_and_reexec",
|
||||
"fn run_cli_upgrade",
|
||||
"fn resolve_target_path"
|
||||
],
|
||||
"server/src/validation/error.rs": [
|
||||
"enum ValidationError",
|
||||
"fn format_errors_as_json"
|
||||
@@ -3085,6 +3202,8 @@
|
||||
"struct UnblockStoryRequest",
|
||||
"fn from_json",
|
||||
"struct FreezeStoryRequest",
|
||||
"fn from_json",
|
||||
"struct ConvertItemTypeRequest",
|
||||
"fn from_json"
|
||||
],
|
||||
"server/src/validation/sanitize.rs": [
|
||||
|
||||
@@ -0,0 +1,306 @@
|
||||
# Chat-Driven Project Bootstrap
|
||||
|
||||
Design overview for going from "I want a new project" to a running,
|
||||
container-isolated, editor-accessible huskies project in one chat command.
|
||||
|
||||
## Goal
|
||||
|
||||
A user can say to Timmy in chat:
|
||||
|
||||
```
|
||||
new project myapp --stack rust
|
||||
new project legacy-rails --git git@github.com:me/legacy-rails.git
|
||||
```
|
||||
|
||||
and end up with:
|
||||
|
||||
1. A fresh docker container running the project's huskies node.
|
||||
2. The project's source code bind-mounted from the host so the user can
|
||||
edit it in any editor.
|
||||
3. SSH into the container so editors can run LSPs, builds, and tests
|
||||
inside the container — never on the host.
|
||||
4. Optional git remote configured for push to GitHub or Gitea.
|
||||
5. The new sled registered with the gateway, so Timmy can drive coders /
|
||||
mergemaster / etc. on the project via existing chat commands.
|
||||
|
||||
Manual repo creation on GitHub/Gitea remains the user's job. Everything
|
||||
downstream of that is orchestrated.
|
||||
|
||||
## Architecture at a Glance
|
||||
|
||||
```
|
||||
┌──────────────────────┐
|
||||
│ Browser / Matrix │───┐
|
||||
└──────────────────────┘ │
|
||||
▼
|
||||
┌───────────────────────┐
|
||||
│ Gateway (huskies-gw) │
|
||||
│ • chat dispatcher │
|
||||
│ • new-project │
|
||||
│ • routing │
|
||||
└─────────┬─────────────┘
|
||||
│
|
||||
┌─────────┴───────────────────────────────────┐
|
||||
│ docker engine (host) │
|
||||
│ ┌────────────┐ ┌────────────┐ ┌─────────┐ │
|
||||
│ │ project-A │ │ project-B │ │ ... │ │
|
||||
│ │ sled + │ │ sled + │ │ │ │
|
||||
│ │ sshd + │ │ sshd + │ │ │ │
|
||||
│ │ LSPs │ │ LSPs │ │ │ │
|
||||
│ └─────┬──────┘ └─────┬──────┘ └─────────┘ │
|
||||
└────────┼──────────────┼─────────────────────┘
|
||||
│ │
|
||||
bind mount │ │ bind mount
|
||||
┌────────┴───┐ ┌─────┴──────┐
|
||||
│ ~/code/A │ │ ~/code/B │ ◄── host
|
||||
└────────────┘ └────────────┘ editor opens
|
||||
these paths
|
||||
```
|
||||
|
||||
- One container per project. The container runs the project's huskies
|
||||
binary (sled), an SSH server, and the stack-appropriate LSP(s).
|
||||
- Source lives on the host (e.g. `~/code/<project>`), bind-mounted into
|
||||
the container at a known path. Host can git-diff, back up, or edit.
|
||||
- The gateway is editor-agnostic and project-agnostic — it talks to each
|
||||
sled via the existing rendezvous / CRDT-sync protocol.
|
||||
|
||||
## Three Personas
|
||||
|
||||
| Persona | What they do | What they need |
|
||||
|---------|--------------|----------------|
|
||||
| Chat-only user | Drives everything via Matrix/web chat | Installed huskies binary; chat client |
|
||||
| Editor-using technical user | Same + edits source in their editor | SSH config to the container + editor-specific remote-dev setup |
|
||||
| Multi-project user | Several projects running in parallel | Gateway-listed projects, all routable from one chat |
|
||||
|
||||
Chat-only users never touch SSH. Editor users go through a one-time
|
||||
"copy this SSH command into your editor's remote settings" handoff at
|
||||
project creation time.
|
||||
|
||||
## The Bootstrap Chat Command
|
||||
|
||||
```
|
||||
new project <name> [--stack <stack>] [--git <url>] [--path <host-path>]
|
||||
```
|
||||
|
||||
Flow:
|
||||
|
||||
1. **Validate**: name unique among existing projects; host path doesn't already
|
||||
exist; stack (if declared) is one of the supported overlays.
|
||||
2. **Allocate** a fresh per-project port range (gateway picks).
|
||||
3. **Create host directory** at `--path` (default `~/huskies/<name>/`).
|
||||
4. If `--git` provided, `git clone` into that directory; else `git init`.
|
||||
5. **Detect stack** from cloned content if not declared:
|
||||
- `Cargo.toml` → `rust`
|
||||
- `package.json` → `node`
|
||||
- `go.mod` → `go`
|
||||
- `pyproject.toml` / `requirements.txt` / `setup.py` → `python`
|
||||
- `Gemfile` → `ruby`
|
||||
- `pom.xml` / `build.gradle` → `jvm`
|
||||
- Multiple → pick the dominant, warn.
|
||||
- None → minimal base image, user can install tooling later.
|
||||
6. **Compose the container** from `huskies-project-base` + the stack
|
||||
overlay (Dockerfile fragments under `docker/stacks/<stack>/`).
|
||||
7. **Launch** the container with bind mount + port forwards + an
|
||||
auto-generated SSH key.
|
||||
8. **Seed `.huskies/project.toml`** with sensible defaults.
|
||||
9. **Register** the project with the gateway (`gateway_projects` LWW-map).
|
||||
10. **Reply in chat** with: project name, host path, SSH command, and
|
||||
a `huskies status <name>` invocation to verify.
|
||||
|
||||
## Container Template
|
||||
|
||||
Layered:
|
||||
|
||||
- **`huskies-project-base`**: debian-slim + git + huskies binary + sshd
|
||||
+ sudo + a `huskies` user with the SSH pubkey installed.
|
||||
- **`huskies-project-<stack>`**: per-stack additions, pre-built by
|
||||
`script/build-project-images`. E.g. rust gets `rustup` +
|
||||
`rust-analyzer` + `cargo-nextest`; node gets `node@22` +
|
||||
`typescript-language-server`; etc. Stack fragments live in
|
||||
`docker/stacks/<stack>/Dockerfile.fragment`.
|
||||
- **`huskies-project-local-<name>`** *(optional)*: built on the fly at
|
||||
container launch time when the project contains
|
||||
`.huskies/Dockerfile.fragment`. This file is appended after the
|
||||
stack overlay (`FROM huskies-project-<stack>`) so agents can extend
|
||||
their own image without editing shared stack files. Because the
|
||||
fragment lives inside the bind-mounted `/workspace/.huskies/`, changes
|
||||
survive container recreation and are committed alongside the project
|
||||
source. The `project-rebuild` command picks up the fragment
|
||||
automatically when rebuilding.
|
||||
|
||||
Example `.huskies/Dockerfile.fragment` that adds `jq`:
|
||||
|
||||
```dockerfile
|
||||
RUN apt-get update && apt-get install -y jq
|
||||
```
|
||||
|
||||
- **Project layer**: the bind-mounted `/workspace` is the project source,
|
||||
written by the host's editor, read by the in-container tooling.
|
||||
|
||||
The container's SSH server is bound to a host-local port (not exposed
|
||||
externally). Auth is the per-project keypair generated at bootstrap;
|
||||
the public key sits inside the container, the private key on host.
|
||||
|
||||
## Build Sandbox Model
|
||||
|
||||
The threat: editing code in a host-side editor causes the editor (or its
|
||||
LSP plugin) to run `cargo check` / `npm install` / `pip install` /
|
||||
similar, which executes arbitrary code from project dependencies —
|
||||
`build.rs`, proc-macros, npm `postinstall`, Python `setup.py`, Ruby
|
||||
native-extension build scripts, etc. A malicious dependency compromises
|
||||
the host.
|
||||
|
||||
The mitigation: all build / type-check / dependency-install commands
|
||||
execute **inside the project container**. The host's editor connects to
|
||||
the container over SSH; rust-analyzer (or equivalent) runs inside the
|
||||
container; the host process never `exec`s untrusted build scripts.
|
||||
|
||||
Container isolation is the docker default plus:
|
||||
- No `--privileged`.
|
||||
- No host bind mounts beyond the project source and the SSH key.
|
||||
- No host network beyond the gateway's CRDT sync port.
|
||||
- `--cap-drop=ALL` plus the minimum caps needed (probably none).
|
||||
|
||||
This isn't a hardened sandbox in the gvisor / Firecracker sense — a
|
||||
docker-escape exploit on a compromised container still escalates to
|
||||
the host. For most consumer threat models (malicious crate from
|
||||
crates.io / npm), docker's default isolation is sufficient. Tighter
|
||||
sandboxing (gvisor) is a separate future spike if needed.
|
||||
|
||||
## Editor Connection — Editor-Agnostic SSH
|
||||
|
||||
| Editor | Connection mechanism |
|
||||
|--------|----------------------|
|
||||
| VSCode | Remote-SSH extension |
|
||||
| JetBrains (IntelliJ/RustRover) | JetBrains Gateway (SSH) |
|
||||
| Zed | Built-in SSH remoting (macOS/Linux only today) |
|
||||
| Vim/Neovim | SSH terminal session, or local nvim + LSP-over-SSH |
|
||||
| Emacs | TRAMP + remote LSP via lsp-mode |
|
||||
|
||||
All converge on: `ssh huskies@127.0.0.1 -p <project-port> -i ~/.huskies/<name>/id_ed25519`.
|
||||
That string is emitted in the bootstrap chat reply.
|
||||
|
||||
## Git Integration
|
||||
|
||||
- Initial setup is `git init` or `git clone` inside the container.
|
||||
- For push: user's existing GitHub / Gitea SSH key is bind-mounted
|
||||
read-only into the container at `~/.ssh/id_*`, OR the user supplies a
|
||||
push token via `huskies secrets set GIT_TOKEN=...` (stored as a Fly
|
||||
secret equivalent — for now, a chmod 600 file in the container).
|
||||
- The container's `git` config gets `user.name` / `user.email` from the
|
||||
gateway-level user identity.
|
||||
|
||||
## Decisions
|
||||
|
||||
| Decision | Choice | Alternative |
|
||||
|----------|--------|-------------|
|
||||
| Container per project | One container per project | One container for many projects: simpler but breaks isolation, breaks per-project deps |
|
||||
| Editor model | SSH-remote (any editor) | VSCode Dev Containers only: simpler config but locks out everyone else |
|
||||
| Source location | Bind mount from host | Inside container only: breaks "I can also edit on my laptop" requirement |
|
||||
| Stack detection | Auto from project files, override with `--stack` | Always declared: more friction at bootstrap |
|
||||
| Push secrets | Bind-mounted host SSH key OR per-project token | Gateway holds tokens: bigger blast radius |
|
||||
|
||||
## Open Questions
|
||||
|
||||
1. **Per-project resource limits.** Should each container have a hard
|
||||
CPU / RAM cap so a runaway agent doesn't starve the host?
|
||||
2. **Lifecycle / cleanup.** If the user deletes a project from chat,
|
||||
what gets removed? Container yes; host source no (data loss); git
|
||||
remotes yes? Need a confirm step.
|
||||
3. **Multi-tenant.** Out of scope for this design (that's huskies.dev
|
||||
territory). This doc assumes single-user local-only.
|
||||
4. **Windows specifics.** Bind mounts work, but line-ending and
|
||||
permission edge cases remain. Probably document "use WSL2 for best
|
||||
experience" rather than fight Windows native paths.
|
||||
5. **Gateway-on-host vs gateway-in-container.** The gateway today runs
|
||||
in its own container. New per-project containers connect via docker
|
||||
network. Need to confirm the network plumbing works for arbitrary
|
||||
per-project containers, not just the manually-configured ones.
|
||||
|
||||
## Phasing
|
||||
|
||||
The work breaks naturally into:
|
||||
|
||||
- **Phase 0 (now):** this design doc.
|
||||
- **Phase 1:** chat command exists and provisions a bare project
|
||||
container (no stack overlay, no SSH, no git clone — just
|
||||
"start a container, register with gateway"). Validates the
|
||||
orchestration shell.
|
||||
- **Phase 2:** stack-aware container template — base image + overlays;
|
||||
detection from project files.
|
||||
- **Phase 3:** SSH-remote editor access — sshd in the container,
|
||||
per-project keypair, chat-reply emits the connection string.
|
||||
- **Phase 4:** git integration — `--git <url>` clones, host SSH key
|
||||
mount, push verification.
|
||||
- **Phase 5:** per-project resource limits + cleanup chat commands.
|
||||
- **Phase 6:** `--adopt <dir>` wraps a container around an existing
|
||||
checkout. No clone or init — bind-mount only.
|
||||
- **Phase 7 (story 1137):** First-run init flow — config summary and
|
||||
chat-driven overrides (see below).
|
||||
|
||||
Each phase ships independently and is usable on its own. Phase 1 alone
|
||||
gives chat-only users a working project; later phases add the editor
|
||||
and git polish.
|
||||
|
||||
## First-Run Init Flow (Story 1137)
|
||||
|
||||
After a successful `new project ... --adopt` (or any new-project
|
||||
bootstrap), the bot appends a **Default configuration** block to the
|
||||
adoption success reply. This block lists every scaffolded agent with
|
||||
its model, budget cap, and turn limit, and provides ready-to-send
|
||||
override commands.
|
||||
|
||||
### Example reply tail
|
||||
|
||||
```
|
||||
**Default configuration** (3 agents):
|
||||
- coder-1 (coder): model=`sonnet`, budget=$5.00, max_turns=50
|
||||
- qa (qa): model=`sonnet`, budget=$4.00, max_turns=40
|
||||
- mergemaster (mergemaster): model=`sonnet`, budget=$5.00, max_turns=30
|
||||
|
||||
Override via chat: `huskies config myapp coder.model=opus`
|
||||
Project settings: `huskies config myapp default_qa=human`
|
||||
Accept all defaults silently: add `--skip-config` to the bootstrap command.
|
||||
```
|
||||
|
||||
### Config override command
|
||||
|
||||
```
|
||||
huskies config <project> <key>=<value>
|
||||
```
|
||||
|
||||
The gateway resolves the project's `host_path` from `projects.toml`,
|
||||
then writes the setting to `.huskies/agents.toml` or
|
||||
`.huskies/project.toml` on the host.
|
||||
|
||||
**Agent fields** (`<stage_or_name>.<field>=<value>`):
|
||||
|
||||
| Key | Target | Supported values |
|
||||
|-----|--------|-----------------|
|
||||
| `coder.model` | agents.toml, coder stage | `sonnet`, `opus`, any model string |
|
||||
| `qa.model` | agents.toml, qa stage | same |
|
||||
| `mergemaster.model` | agents.toml, mergemaster stage | same |
|
||||
| `coder.max_turns` | agents.toml, coder stage | integer |
|
||||
| `coder.max_budget` | agents.toml, coder stage | decimal (USD) |
|
||||
|
||||
**Project keys** (bare `<key>=<value>`):
|
||||
|
||||
| Key | Notes |
|
||||
|-----|-------|
|
||||
| `default_qa` | `"server"`, `"agent"`, or `"human"` |
|
||||
| `max_retries` | integer |
|
||||
| `max_coders` | integer |
|
||||
| `base_branch` | branch name string |
|
||||
| `timezone` | IANA timezone (e.g. `"Europe/London"`) |
|
||||
| `default_coder_model` | model string |
|
||||
|
||||
### Skip path
|
||||
|
||||
Pass `--skip-config` to suppress the config block entirely:
|
||||
|
||||
```
|
||||
new project myapp --adopt /path/to/checkout --skip-config
|
||||
```
|
||||
|
||||
The success reply is identical to pre-1137 output — only the SSH
|
||||
command and registration summary, no agent listing.
|
||||
Generated
+8
-54
@@ -872,9 +872,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "crypto-common"
|
||||
version = "0.2.1"
|
||||
version = "0.2.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "77727bb15fa921304124b128af125e7e3b968275d1b108b379190264f4423710"
|
||||
checksum = "ce6e4c961d6cd6c9a86db418387425e8bdeaf05b3c8bc1411e6dca4c252f1453"
|
||||
dependencies = [
|
||||
"hybrid-array",
|
||||
]
|
||||
@@ -1137,7 +1137,7 @@ checksum = "f1dd6dbb5841937940781866fa1281a1ff7bd3bf827091440879f9994983d5c2"
|
||||
dependencies = [
|
||||
"block-buffer 0.12.0",
|
||||
"const-oid 0.10.2",
|
||||
"crypto-common 0.2.1",
|
||||
"crypto-common 0.2.2",
|
||||
"ctutils",
|
||||
]
|
||||
|
||||
@@ -1911,7 +1911,7 @@ checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9"
|
||||
|
||||
[[package]]
|
||||
name = "huskies"
|
||||
version = "0.11.1"
|
||||
version = "0.13.0"
|
||||
dependencies = [
|
||||
"ammonia",
|
||||
"async-stream",
|
||||
@@ -1931,7 +1931,6 @@ dependencies = [
|
||||
"libc",
|
||||
"libsqlite3-sys",
|
||||
"matrix-sdk",
|
||||
"mime_guess",
|
||||
"mockito",
|
||||
"notify",
|
||||
"nutype",
|
||||
@@ -1941,7 +1940,6 @@ dependencies = [
|
||||
"rand 0.10.1",
|
||||
"regex",
|
||||
"reqwest",
|
||||
"rust-embed",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"serde_urlencoded",
|
||||
@@ -2978,16 +2976,6 @@ version = "0.1.54"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cbf6f36070878c42c5233846cd3de24cf9016828fd47bc22957a687298bb21fc"
|
||||
|
||||
[[package]]
|
||||
name = "mime_guess"
|
||||
version = "2.0.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f7c44f8e672c00fe5308fa235f821cb4198414e1c77935c1ab6948d3fd78550e"
|
||||
dependencies = [
|
||||
"mime",
|
||||
"unicase",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "miniz_oxide"
|
||||
version = "0.8.9"
|
||||
@@ -3119,9 +3107,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "num-conv"
|
||||
version = "0.2.1"
|
||||
version = "0.2.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c6673768db2d862beb9b39a78fdcb1a69439615d5794a1be50caa9bc92c81967"
|
||||
checksum = "521739c6d2bac4aa25192232afe6841231376b2b26d4d9fae5ecf8ca5772e441"
|
||||
|
||||
[[package]]
|
||||
name = "num-integer"
|
||||
@@ -4206,40 +4194,6 @@ dependencies = [
|
||||
"smallvec",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rust-embed"
|
||||
version = "8.11.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "04113cb9355a377d83f06ef1f0a45b8ab8cd7d8b1288160717d66df5c7988d27"
|
||||
dependencies = [
|
||||
"rust-embed-impl",
|
||||
"rust-embed-utils",
|
||||
"walkdir",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rust-embed-impl"
|
||||
version = "8.11.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "da0902e4c7c8e997159ab384e6d0fc91c221375f6894346ae107f47dd0f3ccaa"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"rust-embed-utils",
|
||||
"syn 2.0.117",
|
||||
"walkdir",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rust-embed-utils"
|
||||
version = "8.11.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5bcdef0be6fe7f6fa333b1073c949729274b05f123a0ad7efcb8efd878e5c3b1"
|
||||
dependencies = [
|
||||
"sha2 0.10.9",
|
||||
"walkdir",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustc-hash"
|
||||
version = "2.1.2"
|
||||
@@ -5429,9 +5383,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "tower-http"
|
||||
version = "0.6.10"
|
||||
version = "0.6.11"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "68d6fdd9f81c2819c9a8b0e0cd91660e7746a8e6ea2ba7c6b2b057985f6bcb51"
|
||||
checksum = "4cfcf7e2740e6fc6d4d688b4ef00650406bb94adf4731e43c096c3a19fe40840"
|
||||
dependencies = [
|
||||
"async-compression",
|
||||
"bitflags 2.11.1",
|
||||
|
||||
@@ -79,6 +79,10 @@ cd frontend && npm install && npm run dev
|
||||
|
||||
Configuration lives in `.huskies/project.toml`. See `.huskies/bot.toml.*.example` for transport setup.
|
||||
|
||||
## Website
|
||||
|
||||
The huskies.dev website source has moved to [crashlabs/huskies-server](https://code.crashlabs.io/crashlabs/huskies-server).
|
||||
|
||||
## Architecture
|
||||
|
||||
Internal architecture documentation lives in [`docs/architecture/`](docs/architecture/):
|
||||
|
||||
+11
-2
@@ -46,8 +46,17 @@ WORKDIR /app
|
||||
# build.rs) can produce the release binary with embedded frontend assets.
|
||||
COPY . .
|
||||
|
||||
# Build frontend deps first (better layer caching)
|
||||
RUN cd frontend && npm ci
|
||||
# Build frontend deps first (better layer caching).
|
||||
# Cannot use `npm ci` because of npm's optional-dependencies bug
|
||||
# (npm/cli#4828): platform-specific bindings (e.g. rolldown's
|
||||
# linux-arm64-gnu native binary, introduced by 1119's vite 5→8 upgrade)
|
||||
# get listed in package-lock.json for the lockfile author's platform
|
||||
# only, so `npm ci` skips them on every other platform — the build
|
||||
# then fails at runtime with `Cannot find native binding`. Wipe the
|
||||
# lockfile + node_modules and let `npm install` resolve fresh for the
|
||||
# build platform. The lockfile mutation stays inside the container
|
||||
# image and never reaches the host repo.
|
||||
RUN cd frontend && rm -rf node_modules package-lock.json && npm install
|
||||
|
||||
# Build the release binary (build.rs runs npm run build for the frontend)
|
||||
RUN cargo build --release \
|
||||
|
||||
@@ -0,0 +1,70 @@
|
||||
# huskies-project-base — minimal base for all project containers.
|
||||
#
|
||||
# This image provides git, the huskies server binary, and a non-root user.
|
||||
# It carries no language tooling. Per-stack overlays (docker/stacks/<name>/
|
||||
# Dockerfile.fragment) layer their toolchains on top of this base.
|
||||
#
|
||||
# Prerequisites: build the main `huskies` image first so its binary is
|
||||
# available as a build source.
|
||||
#
|
||||
# docker build -t huskies -f docker/Dockerfile .
|
||||
# docker build -t huskies-project-base -f docker/Dockerfile.base .
|
||||
#
|
||||
# To build a stack image (e.g. rust):
|
||||
# (echo "FROM huskies-project-base"; \
|
||||
# cat docker/stacks/rust/Dockerfile.fragment) | \
|
||||
# docker build -t huskies-project-rust -
|
||||
|
||||
FROM huskies AS huskies-src
|
||||
|
||||
FROM debian:bookworm-slim
|
||||
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
git \
|
||||
curl \
|
||||
ca-certificates \
|
||||
libssl3 \
|
||||
procps \
|
||||
openssh-server \
|
||||
sudo \
|
||||
nodejs \
|
||||
npm \
|
||||
&& npm install -g @anthropic-ai/claude-code \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Copy the huskies binary and entrypoint from the main image.
|
||||
COPY --from=huskies-src /usr/local/bin/huskies /usr/local/bin/huskies
|
||||
COPY --from=huskies-src /usr/local/bin/entrypoint.sh /usr/local/bin/entrypoint.sh
|
||||
|
||||
# Non-root user — Claude Code refuses --dangerously-skip-permissions as root.
|
||||
# -s /bin/bash required for SSH sessions to start a real shell.
|
||||
RUN groupadd -r huskies \
|
||||
&& useradd -r -g huskies -m -d /home/huskies -s /bin/bash huskies \
|
||||
&& mkdir -p /home/huskies/.claude \
|
||||
&& mkdir -p /home/huskies/.ssh \
|
||||
&& chmod 700 /home/huskies/.ssh \
|
||||
&& chown -R huskies:huskies /home/huskies \
|
||||
&& mkdir -p /workspace \
|
||||
&& chown huskies:huskies /workspace \
|
||||
# --system writes /etc/gitconfig so the default branch applies to the huskies
# user at runtime; --global here would only touch root's ~/.gitconfig.
&& git config --system init.defaultBranch master \
|
||||
&& echo "huskies ALL=(root) NOPASSWD: /usr/sbin/sshd" > /etc/sudoers.d/huskies-sshd \
|
||||
&& chmod 0440 /etc/sudoers.d/huskies-sshd \
|
||||
&& mkdir -p /run/sshd \
|
||||
&& sed -i \
|
||||
-e 's/#PasswordAuthentication yes/PasswordAuthentication no/' \
|
||||
-e 's/#PubkeyAuthentication yes/PubkeyAuthentication yes/' \
|
||||
-e 's/UsePAM yes/UsePAM no/' \
|
||||
/etc/ssh/sshd_config
|
||||
|
||||
# Shell profile for SSH sessions: land in /workspace and load toolchain paths.
|
||||
RUN printf 'cd /workspace\n[ -f "$HOME/.cargo/env" ] && . "$HOME/.cargo/env"\n' \
|
||||
> /home/huskies/.profile \
|
||||
&& chown huskies:huskies /home/huskies/.profile
|
||||
|
||||
USER huskies
|
||||
WORKDIR /workspace
|
||||
|
||||
EXPOSE 3001 22
|
||||
|
||||
ENTRYPOINT ["entrypoint.sh"]
|
||||
CMD ["huskies", "/workspace"]
|
||||
@@ -29,6 +29,9 @@ services:
|
||||
- HUSKIES_PORT=3001
|
||||
# Bind to all interfaces so Docker port forwarding works.
|
||||
- HUSKIES_HOST=0.0.0.0
|
||||
# Gateway URL so this sled's relay task forwards CRDT events to the gateway.
|
||||
# Uses host.docker.internal so the container can reach the gateway on the host.
|
||||
- HUSKIES_GATEWAY_URL=http://host.docker.internal:3000
|
||||
# Optional: Matrix bot credentials (if using Matrix integration)
|
||||
- MATRIX_HOMESERVER=${MATRIX_HOMESERVER:-}
|
||||
- MATRIX_USER=${MATRIX_USER:-}
|
||||
|
||||
@@ -1,6 +1,32 @@
|
||||
#!/bin/sh
|
||||
set -e
|
||||
|
||||
# ── Claude credentials ────────────────────────────────────────────────
|
||||
# The `new project` command bind-mounts the host ~/.claude/.credentials.json
|
||||
# at /run/claude-credentials-src:ro. We copy it here so the huskies user
|
||||
# owns the file and mode 0600 is enforced regardless of host uid/gid.
|
||||
if [ -f /run/claude-credentials-src ]; then
|
||||
mkdir -p /home/huskies/.claude
|
||||
cp /run/claude-credentials-src /home/huskies/.claude/.credentials.json
|
||||
chmod 600 /home/huskies/.claude/.credentials.json
|
||||
fi
|
||||
|
||||
# ── SSH authorized key ────────────────────────────────────────────────
|
||||
# HUSKIES_SSH_PUBKEY is set by `new project` when it generates a keypair.
|
||||
# Write it to authorized_keys so the user can connect with the matching
|
||||
# private key stored at ~/.huskies/<project>/id_ed25519 on the host.
|
||||
if [ -n "$HUSKIES_SSH_PUBKEY" ]; then
|
||||
mkdir -p /home/huskies/.ssh
|
||||
chmod 700 /home/huskies/.ssh
|
||||
printf '%s\n' "$HUSKIES_SSH_PUBKEY" > /home/huskies/.ssh/authorized_keys
|
||||
chmod 600 /home/huskies/.ssh/authorized_keys
|
||||
fi
|
||||
|
||||
# ── SSH daemon ────────────────────────────────────────────────────────
|
||||
# Start sshd in the background so the container accepts SSH connections.
|
||||
# Uses sudo (huskies has NOPASSWD for /usr/sbin/sshd in sudoers.d).
|
||||
sudo /usr/sbin/sshd -D -e &
|
||||
|
||||
# ── Git identity ─────────────────────────────────────────────────────
|
||||
# Agents commit code inside the container. Without a git identity,
|
||||
# commits fail or use garbage defaults. Fail loudly at startup so the
|
||||
@@ -25,6 +51,20 @@ export GIT_COMMITTER_NAME="$GIT_USER_NAME"
|
||||
export GIT_AUTHOR_EMAIL="$GIT_USER_EMAIL"
|
||||
export GIT_COMMITTER_EMAIL="$GIT_USER_EMAIL"
|
||||
|
||||
# ── Git credential helper (HTTPS push) ────────────────────────────────────
|
||||
# If GIT_PUSH_TOKEN is supplied at container creation time, configure git's
|
||||
# built-in credential store so `git push` over HTTPS authenticates without
|
||||
# user interaction. GIT_CLONE_URL provides the host portion of the URL used
|
||||
# as the key in ~/.git-credentials.
|
||||
if [ -n "$GIT_PUSH_TOKEN" ] && [ -n "$GIT_CLONE_URL" ]; then
|
||||
_scheme=$(echo "$GIT_CLONE_URL" | cut -d':' -f1)
|
||||
_host=$(echo "$GIT_CLONE_URL" | sed 's|^https\?://||' | cut -d'/' -f1)
|
||||
git config --global credential.helper store
|
||||
printf '%s://x-access-token:%s@%s\n' "$_scheme" "$GIT_PUSH_TOKEN" "$_host" \
|
||||
> /home/huskies/.git-credentials
|
||||
chmod 600 /home/huskies/.git-credentials
|
||||
fi
|
||||
|
||||
# ── Frontend native deps ────────────────────────────────────────────
|
||||
# The project repo is bind-mounted from the host, so node_modules/
|
||||
# may contain native binaries for the wrong platform (e.g. darwin
|
||||
|
||||
@@ -0,0 +1,28 @@
|
||||
# Go stack overlay fragment.
|
||||
#
|
||||
# Layer this on top of huskies-project-base to produce a project container
|
||||
# with Go 1.22, gopls (official Go language server), and standard tooling.
|
||||
#
|
||||
# Build the combined image:
|
||||
# (echo "FROM huskies-project-base"; \
|
||||
# cat docker/stacks/go/Dockerfile.fragment) | \
|
||||
# docker build -t huskies-project-go -
|
||||
#
|
||||
# Adding a new stack: create docker/stacks/<name>/Dockerfile.fragment and
|
||||
# docker/stacks/<name>/markers — no changes to orchestration code required.
|
||||
|
||||
USER root
|
||||
|
||||
# Official Go binary distribution — Debian's golang-go package is too old for gopls.
|
||||
# Update GOVERSION to pick up a newer release.
|
||||
ENV GOVERSION="1.22.3"
|
||||
# dpkg reports amd64/arm64, which matches Go's release archive naming, so this
# fragment builds on both x86_64 and ARM64 hosts instead of amd64 only.
RUN curl -fsSL "https://go.dev/dl/go${GOVERSION}.linux-$(dpkg --print-architecture).tar.gz" \
|
||||
| tar -C /usr/local -xzf -
|
||||
|
||||
ENV PATH="/usr/local/go/bin:${PATH}"
|
||||
|
||||
# gopls: the official Go language server.
|
||||
# GOBIN=/usr/local/bin puts the binary on the system PATH for all users.
|
||||
RUN GOBIN=/usr/local/bin go install golang.org/x/tools/gopls@latest
|
||||
|
||||
USER huskies
|
||||
@@ -0,0 +1,4 @@
|
||||
# Stack detection markers for the go stack.
|
||||
# Each non-blank, non-comment line names a file relative to the project root.
|
||||
# If any listed file exists in the project directory, this stack is matched.
|
||||
go.mod
|
||||
@@ -0,0 +1,50 @@
|
||||
# JVM stack overlay fragment.
|
||||
#
|
||||
# Layer this on top of huskies-project-base to produce a project container
|
||||
# with OpenJDK 21, Maven, and eclipse.jdt.ls (the canonical Java/JVM LSP).
|
||||
#
|
||||
# Build the combined image:
|
||||
# (echo "FROM huskies-project-base"; \
|
||||
# cat docker/stacks/jvm/Dockerfile.fragment) | \
|
||||
# docker build -t huskies-project-jvm -
|
||||
#
|
||||
# Adding a new stack: create docker/stacks/<name>/Dockerfile.fragment and
|
||||
# docker/stacks/<name>/markers — no changes to orchestration code required.
|
||||
|
||||
USER root
|
||||
|
||||
# OpenJDK 21 (current LTS) and Maven for build support.
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
openjdk-21-jdk-headless \
|
||||
maven \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
ENV JAVA_HOME="/usr/lib/jvm/java-21-openjdk-amd64"
|
||||
|
||||
# Eclipse JDT Language Server — canonical LSP for Java/JVM (Java, Kotlin, Groovy).
|
||||
# Pin to a specific release; update JDTLS_VERSION + JDTLS_BUILD for upgrades.
|
||||
# All releases: https://github.com/eclipse-jdtls/eclipse.jdt.ls/releases
|
||||
ENV JDTLS_VERSION="1.38.0" \
|
||||
JDTLS_BUILD="202503271418"
|
||||
RUN mkdir -p /opt/jdtls \
|
||||
&& curl -fsSL \
|
||||
"https://download.eclipse.org/jdtls/milestones/${JDTLS_VERSION}/jdt-language-server-${JDTLS_VERSION}-${JDTLS_BUILD}.tar.gz" \
|
||||
| tar -xzf - -C /opt/jdtls
|
||||
|
||||
# Wrapper script so `jdtls` is available as a PATH command.
|
||||
RUN { \
|
||||
echo '#!/bin/sh'; \
|
||||
echo 'JAR=$(ls /opt/jdtls/plugins/org.eclipse.equinox.launcher_*.jar 2>/dev/null | head -1)'; \
|
||||
echo 'exec java \'; \
|
||||
echo ' -Declipse.application=org.eclipse.jdt.ls.core.id1 \'; \
|
||||
echo ' -Dosgi.bundles.defaultStartLevel=4 \'; \
|
||||
echo ' -Declipse.product=org.eclipse.jdt.ls.core.product \'; \
|
||||
echo ' -Dlog.protocol=true \'; \
|
||||
echo ' -Dlog.level=ALL \'; \
|
||||
echo ' -jar "$JAR" \'; \
|
||||
echo ' -configuration /opt/jdtls/config_linux \'; \
|
||||
echo ' "$@"'; \
|
||||
} > /usr/local/bin/jdtls \
|
||||
&& chmod +x /usr/local/bin/jdtls
|
||||
|
||||
USER huskies
|
||||
@@ -0,0 +1,6 @@
|
||||
# Stack detection markers for the jvm stack.
|
||||
# Each non-blank, non-comment line names a file relative to the project root.
|
||||
# If any listed file exists in the project directory, this stack is matched.
|
||||
pom.xml
|
||||
build.gradle
|
||||
build.gradle.kts
|
||||
@@ -0,0 +1,26 @@
|
||||
# Node stack overlay fragment.
|
||||
#
|
||||
# Layer this on top of huskies-project-base to produce a project container
|
||||
# with Node.js 22, TypeScript (tsc), and typescript-language-server.
|
||||
#
|
||||
# Build the combined image:
|
||||
# (echo "FROM huskies-project-base"; \
|
||||
# cat docker/stacks/node/Dockerfile.fragment) | \
|
||||
# docker build -t huskies-project-node -
|
||||
#
|
||||
# Adding a new stack: create docker/stacks/<name>/Dockerfile.fragment and
|
||||
# docker/stacks/<name>/markers — no changes to orchestration code required.
|
||||
|
||||
USER root
|
||||
|
||||
# Node.js 22.x (LTS).
|
||||
RUN curl -fsSL https://deb.nodesource.com/setup_22.x | bash - \
|
||||
&& apt-get install -y --no-install-recommends nodejs \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# TypeScript compiler and language server for LSP-aware agents.
|
||||
# tsc: TypeScript compiler (tsc --version)
|
||||
# typescript-language-server: LSP server used by editors/agents
|
||||
RUN npm install -g typescript typescript-language-server
|
||||
|
||||
USER huskies
|
||||
@@ -0,0 +1,7 @@
|
||||
# Stack detection markers for the node stack.
|
||||
# Each non-blank, non-comment line names a file relative to the project root.
|
||||
# If any listed file exists in the project directory, this stack is matched.
|
||||
# tsconfig.json is listed explicitly so TypeScript-only projects are detected
|
||||
# even without a package.json at the repo root.
|
||||
package.json
|
||||
tsconfig.json
|
||||
@@ -0,0 +1,27 @@
|
||||
# Python stack overlay fragment.
|
||||
#
|
||||
# Layer this on top of huskies-project-base to produce a project container
|
||||
# with Python 3, pip, and pyright (the Microsoft Python LSP / type checker).
|
||||
#
|
||||
# Build the combined image:
|
||||
# (echo "FROM huskies-project-base"; \
|
||||
# cat docker/stacks/python/Dockerfile.fragment) | \
|
||||
# docker build -t huskies-project-python -
|
||||
#
|
||||
# Adding a new stack: create docker/stacks/<name>/Dockerfile.fragment and
|
||||
# docker/stacks/<name>/markers — no changes to orchestration code required.
|
||||
|
||||
USER root
|
||||
|
||||
# Python 3 runtime and pip.
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
python3 \
|
||||
python3-pip \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# pyright: Microsoft's Python language server / static type checker.
|
||||
# --break-system-packages is required on Debian 12+ where pip is externally
|
||||
# managed; the flag is safe inside a Docker container.
|
||||
RUN pip install --no-cache-dir --break-system-packages pyright
|
||||
|
||||
USER huskies
|
||||
@@ -0,0 +1,6 @@
|
||||
# Stack detection markers for the python stack.
|
||||
# Each non-blank, non-comment line names a file relative to the project root.
|
||||
# If any listed file exists in the project directory, this stack is matched.
|
||||
pyproject.toml
|
||||
requirements.txt
|
||||
setup.py
|
||||
@@ -0,0 +1,28 @@
|
||||
# Ruby stack overlay fragment.
|
||||
#
|
||||
# Layer this on top of huskies-project-base to produce a project container
|
||||
# with Ruby, Bundler, and ruby-lsp (the Shopify Ruby language server).
|
||||
#
|
||||
# Build the combined image:
|
||||
# (echo "FROM huskies-project-base"; \
|
||||
# cat docker/stacks/ruby/Dockerfile.fragment) | \
|
||||
# docker build -t huskies-project-ruby -
|
||||
#
|
||||
# Adding a new stack: create docker/stacks/<name>/Dockerfile.fragment and
|
||||
# docker/stacks/<name>/markers — no changes to orchestration code required.
|
||||
|
||||
USER root
|
||||
|
||||
# Ruby runtime, development headers (needed by native gem extensions), and Bundler.
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
ruby \
|
||||
ruby-dev \
|
||||
bundler \
|
||||
build-essential \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# ruby-lsp: Shopify's Ruby language server (LSP-compliant, actively maintained).
|
||||
# Installed globally so the `ruby-lsp` binary is available on PATH.
|
||||
RUN gem install ruby-lsp
|
||||
|
||||
USER huskies
|
||||
@@ -0,0 +1,4 @@
|
||||
# Stack detection markers for the ruby stack.
|
||||
# Each non-blank, non-comment line names a file relative to the project root.
|
||||
# If any listed file exists in the project directory, this stack is matched.
|
||||
Gemfile
|
||||
@@ -0,0 +1,37 @@
|
||||
# Rust stack overlay fragment.
|
||||
#
|
||||
# Layer this on top of huskies-project-base to produce a project container
|
||||
# with a full Rust toolchain, rust-analyzer, and cargo-nextest.
|
||||
#
|
||||
# Build the combined image:
|
||||
# (echo "FROM huskies-project-base"; \
|
||||
# cat docker/stacks/rust/Dockerfile.fragment) | \
|
||||
# docker build -t huskies-project-rust -
|
||||
#
|
||||
# Adding a new stack: create docker/stacks/<name>/Dockerfile.fragment and
|
||||
# docker/stacks/<name>/markers — no changes to orchestration code required.
|
||||
|
||||
USER root
|
||||
|
||||
# Build tools required by rustup and many Rust crates.
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
build-essential \
|
||||
pkg-config \
|
||||
libssl-dev \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
ENV RUSTUP_HOME="/home/huskies/.rustup" \
|
||||
CARGO_HOME="/home/huskies/.cargo"
|
||||
|
||||
# Install stable Rust + rust-analyzer component as the huskies user.
|
||||
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs \
|
||||
| su huskies -c "sh -s -- -y --no-modify-path --default-toolchain stable" \
|
||||
&& /home/huskies/.cargo/bin/rustup component add rust-analyzer \
|
||||
&& chown -R huskies:huskies /home/huskies/.rustup /home/huskies/.cargo
|
||||
|
||||
# cargo-nextest: fast Rust test runner used by huskies quality gates.
|
||||
# get.nexte.st serves the x86_64 build at latest/linux and the aarch64 build at
# latest/linux-arm; pick the one matching the image architecture.
RUN case "$(dpkg --print-architecture)" in \
      arm64) nextest_platform=linux-arm ;; \
      *) nextest_platform=linux ;; \
    esac \
 && curl -LsSf "https://get.nexte.st/latest/${nextest_platform}" | tar zxf - -C /usr/local/bin
|
||||
|
||||
ENV PATH="/home/huskies/.cargo/bin:${PATH}"
|
||||
|
||||
USER huskies
|
||||
@@ -0,0 +1,4 @@
|
||||
# Stack detection markers for the rust stack.
|
||||
# Each non-blank, non-comment line names a file relative to the project root.
|
||||
# If any listed file exists in the project directory, this stack is matched.
|
||||
Cargo.toml
|
||||
Generated
+945
-1215
File diff suppressed because it is too large
Load Diff
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"name": "huskies",
|
||||
"private": true,
|
||||
"version": "0.11.1",
|
||||
"version": "0.13.0",
|
||||
"type": "module",
|
||||
"scripts": {
|
||||
"dev": "vite",
|
||||
@@ -32,11 +32,11 @@
|
||||
"@types/node": "^25.0.0",
|
||||
"@types/react": "^19.1.8",
|
||||
"@types/react-dom": "^19.1.6",
|
||||
"@vitejs/plugin-react": "^4.6.0",
|
||||
"@vitest/coverage-v8": "^2.1.9",
|
||||
"@vitejs/plugin-react": "^5.2.0",
|
||||
"@vitest/coverage-v8": "^4.1.6",
|
||||
"jsdom": "^28.1.0",
|
||||
"typescript": "~5.8.3",
|
||||
"vite": "^5.4.21",
|
||||
"vitest": "^2.1.4"
|
||||
"vite": "^8.0.13",
|
||||
"vitest": "^4.1.6"
|
||||
}
|
||||
}
|
||||
|
||||
@@ -160,6 +160,7 @@ describe("App", () => {
|
||||
});
|
||||
|
||||
it("shows error when openProject fails", async () => {
|
||||
const errorSpy = vi.spyOn(console, "error").mockImplementation(() => {});
|
||||
mockedApi.openProject.mockRejectedValue(new Error("Path does not exist"));
|
||||
|
||||
await renderApp();
|
||||
@@ -182,6 +183,7 @@ describe("App", () => {
|
||||
await waitFor(() => {
|
||||
expect(screen.getByText(/Path does not exist/)).toBeInTheDocument();
|
||||
});
|
||||
errorSpy.mockRestore();
|
||||
});
|
||||
|
||||
it("shows known projects list", async () => {
|
||||
|
||||
@@ -266,6 +266,8 @@ describe("subscribeAgentStream", () => {
|
||||
});
|
||||
|
||||
it("handles malformed JSON without throwing", () => {
|
||||
vi.spyOn(console, "error").mockImplementation(() => {});
|
||||
|
||||
subscribeAgentStream("42_story_test", "coder", vi.fn());
|
||||
|
||||
expect(() => {
|
||||
|
||||
@@ -472,9 +472,16 @@ describe("Slash command handling (Story 374)", () => {
|
||||
});
|
||||
|
||||
describe("Story 1058: WebSocket errors do not appear in chat", () => {
|
||||
let consoleSpy: ReturnType<typeof vi.spyOn>;
|
||||
|
||||
beforeEach(() => {
|
||||
capturedWsHandlers = null;
|
||||
setupMocks();
|
||||
consoleSpy = vi.spyOn(console, "error").mockImplementation(() => {});
|
||||
});
|
||||
|
||||
afterEach(() => {
|
||||
consoleSpy.mockRestore();
|
||||
});
|
||||
|
||||
it("does not add a chat message when onError is called", async () => {
|
||||
|
||||
@@ -227,6 +227,7 @@ describe("usePathCompletion hook", () => {
|
||||
});
|
||||
|
||||
it("sets completionError when listDirectoryAbsolute throws an Error", async () => {
|
||||
const errorSpy = vi.spyOn(console, "error").mockImplementation(() => {});
|
||||
mockListDir.mockRejectedValue(new Error("Permission denied"));
|
||||
|
||||
const { result } = renderHook(() =>
|
||||
@@ -242,9 +243,13 @@ describe("usePathCompletion hook", () => {
|
||||
await waitFor(() => {
|
||||
expect(result.current.completionError).toBe("Permission denied");
|
||||
});
|
||||
|
||||
expect(errorSpy).toHaveBeenCalledWith(new Error("Permission denied"));
|
||||
errorSpy.mockRestore();
|
||||
});
|
||||
|
||||
it("sets generic completionError when listDirectoryAbsolute throws a non-Error", async () => {
|
||||
const errorSpy = vi.spyOn(console, "error").mockImplementation(() => {});
|
||||
mockListDir.mockRejectedValue("some string error");
|
||||
|
||||
const { result } = renderHook(() =>
|
||||
@@ -262,6 +267,9 @@ describe("usePathCompletion hook", () => {
|
||||
"Failed to compute suggestion.",
|
||||
);
|
||||
});
|
||||
|
||||
expect(errorSpy).toHaveBeenCalledWith("some string error");
|
||||
errorSpy.mockRestore();
|
||||
});
|
||||
|
||||
it("clears suggestionTail when selected match path does not start with input", async () => {
|
||||
|
||||
Executable
+37
@@ -0,0 +1,37 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
# Build all project images in dependency order:
|
||||
# huskies → huskies-project-base → huskies-project-<stack> (one per stack fragment)
|
||||
#
|
||||
# Run this after `script/docker_rebuild` or whenever you add a new stack.
|
||||
# Safe to re-run: each step re-tags the image with the latest layers.
|
||||
|
||||
cd "$(dirname "$0")/.."
|
||||
|
||||
if [[ -f .env ]]; then
|
||||
set -a
|
||||
source .env
|
||||
set +a
|
||||
fi
|
||||
|
||||
CACHE_FLAG=""
|
||||
if [[ "${1:-}" == "--no-cache" ]]; then
|
||||
CACHE_FLAG="--no-cache"
|
||||
fi
|
||||
|
||||
echo "==> Building huskies"
|
||||
docker build $CACHE_FLAG -t huskies -f docker/Dockerfile .
|
||||
|
||||
echo "==> Building huskies-project-base"
|
||||
docker build $CACHE_FLAG -t huskies-project-base -f docker/Dockerfile.base .
|
||||
|
||||
for fragment in docker/stacks/*/Dockerfile.fragment; do
|
||||
stack=$(basename "$(dirname "$fragment")")
|
||||
image="huskies-project-${stack}"
|
||||
echo "==> Building ${image}"
|
||||
(printf 'FROM huskies-project-base\n'; cat "$fragment") \
|
||||
| docker build $CACHE_FLAG -t "$image" -
|
||||
done
|
||||
|
||||
echo "All project images built."
|
||||
@@ -24,4 +24,6 @@ docker compose -f docker/docker-compose.yml down
|
||||
docker compose -f docker/docker-compose.yml build $CACHE_FLAG
|
||||
docker compose -f docker/docker-compose.yml up -d
|
||||
|
||||
script/build-project-images $CACHE_FLAG
|
||||
|
||||
echo "Rebuild complete. Logs: docker compose -f docker/docker-compose.yml logs -f"
|
||||
|
||||
Executable
+165
@@ -0,0 +1,165 @@
|
||||
#!/usr/bin/env bash
|
||||
# Build huskies, install (codesign-heal wrapper + underlying binary), and if a
|
||||
# gateway is running on this host, hot-restart it detached from the current shell
|
||||
# so SSH disconnect — e.g. when redeploying from a phone — doesn't kill it.
|
||||
#
|
||||
# Skips the restart silently if no gateway is running. Errors loudly if more
|
||||
# than one matches, so we don't restart the wrong one.
|
||||
#
|
||||
# Pass --skip-check to bypass `script/check` (useful for docs / build-script
|
||||
# changes you've already verified).
|
||||
#
|
||||
# On relaunch failure the previous binary is restored from
|
||||
# ~/bin/huskies-bin.prev and re-launched, so a bad deploy doesn't leave the
|
||||
# host without a working gateway.
|
||||
#
|
||||
# After a `cp` or download the binary loses its ad-hoc signature and macOS
|
||||
# SIGKILLs it silently on Apple Silicon. The wrapper at ~/bin/huskies re-signs
|
||||
# the underlying binary at ~/bin/huskies-bin whenever codesign validation
|
||||
# fails, then execs it. Normal launches (already signed) are zero-overhead.
|
||||
set -euo pipefail
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
|
||||
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
|
||||
LOG_DIR="${HUSKIES_LOG_DIR:-$PROJECT_ROOT/logs}"
|
||||
GATEWAY_PATTERN='huskies .*--gateway'
|
||||
BIN_DIR="${HOME}/bin"
|
||||
UNDERLYING="${BIN_DIR}/huskies-bin"
|
||||
WRAPPER="${BIN_DIR}/huskies"
|
||||
PREV_BIN="${BIN_DIR}/huskies-bin.prev"
|
||||
NEW_BIN="${PROJECT_ROOT}/target/release/huskies"
|
||||
|
||||
SKIP_CHECK=0
|
||||
for arg in "$@"; do
|
||||
case "$arg" in
|
||||
--skip-check) SKIP_CHECK=1 ;;
|
||||
# Help text is the header comment, which spans script lines 2-19.
-h|--help) sed -n '2,19p' "$0"; exit 0 ;;
|
||||
*) echo "Unknown arg: $arg (use --help)" >&2; exit 2 ;;
|
||||
esac
|
||||
done
|
||||
|
||||
if [ "$SKIP_CHECK" -eq 0 ] && [ -x "$SCRIPT_DIR/check" ]; then
|
||||
echo "=== Running script/check ==="
|
||||
"$SCRIPT_DIR/check"
|
||||
fi
|
||||
|
||||
echo "=== Building release binary ==="
|
||||
cd "$PROJECT_ROOT"
|
||||
cargo build --release --bin huskies
|
||||
|
||||
mkdir -p "$BIN_DIR"
|
||||
|
||||
# Snapshot current binary so we can roll back if the relaunch fails.
|
||||
PREV_VERSION=""
|
||||
if [ -x "$UNDERLYING" ]; then
|
||||
PREV_VERSION="$("$UNDERLYING" --version 2>/dev/null || echo unknown)"
|
||||
cp "$UNDERLYING" "$PREV_BIN"
|
||||
fi
|
||||
|
||||
cp "$NEW_BIN" "$UNDERLYING"
|
||||
chmod +x "$UNDERLYING"
|
||||
codesign -s - -f "$UNDERLYING" 2>/dev/null
|
||||
NEW_VERSION="$("$UNDERLYING" --version 2>/dev/null || echo unknown)"
|
||||
echo "==> Installed binary: ${UNDERLYING}"
|
||||
if [ -n "$PREV_VERSION" ]; then
|
||||
echo " version: $PREV_VERSION → $NEW_VERSION"
|
||||
else
|
||||
echo " version: $NEW_VERSION (no prior install)"
|
||||
fi
|
||||
|
||||
cat > "${WRAPPER}" << 'WRAPPER_EOF'
|
||||
#!/usr/bin/env bash
|
||||
# Codesign-heal wrapper — re-signs ~/bin/huskies-bin if the signature is
|
||||
# missing or invalid, then execs the binary. Logs only when it re-signs.
|
||||
BIN="${HOME}/bin/huskies-bin"
|
||||
if ! codesign --verify --quiet "${BIN}" 2>/dev/null; then
|
||||
codesign -s - -f "${BIN}"
|
||||
echo "[codesign-heal] re-signed ~/bin/huskies-bin" >&2
|
||||
fi
|
||||
exec "${BIN}" "$@"
|
||||
WRAPPER_EOF
|
||||
chmod +x "${WRAPPER}"
|
||||
echo "==> Installed wrapper: ${WRAPPER}"
|
||||
|
||||
# ── Hot-restart gateway if one is running ─────────────────────────────
|
||||
collect_descendants() {
|
||||
local pid="$1" kid
|
||||
for kid in $(pgrep -P "$pid" 2>/dev/null); do
|
||||
collect_descendants "$kid"
|
||||
printf '%s\n' "$kid"
|
||||
done
|
||||
}
|
||||
|
||||
GATEWAY_PIDS="$(pgrep -f "$GATEWAY_PATTERN" || true)"
|
||||
if [ -z "$GATEWAY_PIDS" ]; then
|
||||
echo "==> No running gateway found; install complete."
|
||||
exit 0
|
||||
fi
|
||||
|
||||
if [ "$(echo "$GATEWAY_PIDS" | wc -l)" -gt 1 ]; then
|
||||
echo "Error: multiple gateway processes match '${GATEWAY_PATTERN}':" >&2
|
||||
# ps -p takes one comma-separated PID list; pgrep emits one PID per line.
ps -p "$(echo "$GATEWAY_PIDS" | paste -sd, -)" -o pid,args >&2 || true
|
||||
echo "Refusing to guess which to restart." >&2
|
||||
exit 3
|
||||
fi
|
||||
|
||||
GATEWAY_PID="$GATEWAY_PIDS"
|
||||
GATEWAY_ARGS="$(ps -p "$GATEWAY_PID" -o args= | sed -E 's@^[^ ]*huskies[^ ]* @@')"
|
||||
GATEWAY_CWD="$(lsof -p "$GATEWAY_PID" 2>/dev/null | awk '$4=="cwd"{print $9; exit}')"
|
||||
if [ -z "$GATEWAY_CWD" ]; then GATEWAY_CWD="$PWD"; fi
|
||||
|
||||
LOG_FILE="$LOG_DIR/gateway-$(date +%Y%m%d-%H%M%S).log"
|
||||
mkdir -p "$LOG_DIR"
|
||||
|
||||
DESCENDANTS="$(collect_descendants "$GATEWAY_PID" | tr '\n' ' ')"
|
||||
echo "==> Stopping gateway tree (pids: $GATEWAY_PID $DESCENDANTS)"
|
||||
# Kill descendants depth-first so PTY children die before the gateway, then the gateway.
|
||||
for pid in $DESCENDANTS $GATEWAY_PID; do
|
||||
kill "$pid" 2>/dev/null || true
|
||||
done
|
||||
sleep 2
|
||||
|
||||
echo "==> Restarting gateway"
|
||||
echo " log: $LOG_FILE"
|
||||
(
|
||||
cd "$GATEWAY_CWD"
|
||||
nohup "$WRAPPER" $GATEWAY_ARGS >> "$LOG_FILE" 2>&1 < /dev/null &
|
||||
disown
|
||||
)
|
||||
|
||||
# Wait up to 10s for the new gateway to appear AND be a different PID.
|
||||
NEW_PID=""
|
||||
for _ in 1 2 3 4 5 6 7 8 9 10; do
|
||||
sleep 1
|
||||
candidate="$(pgrep -f "$GATEWAY_PATTERN" 2>/dev/null || true)"
|
||||
if [ -n "$candidate" ] && [ "$candidate" != "$GATEWAY_PID" ]; then
|
||||
NEW_PID="$candidate"
|
||||
break
|
||||
fi
|
||||
done
|
||||
|
||||
if [ -n "$NEW_PID" ]; then
|
||||
echo "==> Gateway restarted as pid $NEW_PID"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# ── Rollback ──────────────────────────────────────────────────────────
|
||||
echo "Error: new gateway failed to come up within 10s; rolling back" >&2
|
||||
if [ -x "$PREV_BIN" ]; then
|
||||
cp "$PREV_BIN" "$UNDERLYING"
|
||||
chmod +x "$UNDERLYING"
|
||||
codesign -s - -f "$UNDERLYING" 2>/dev/null
|
||||
echo "==> Restored previous binary"
|
||||
(
|
||||
cd "$GATEWAY_CWD"
|
||||
nohup "$WRAPPER" $GATEWAY_ARGS >> "$LOG_FILE" 2>&1 < /dev/null &
|
||||
disown
|
||||
)
|
||||
sleep 2
|
||||
if pgrep -f "$GATEWAY_PATTERN" >/dev/null 2>&1; then
|
||||
echo "==> Gateway restored to previous version"
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
echo "Error: rollback failed; gateway is DOWN. Inspect $LOG_FILE." >&2
|
||||
exit 1
|
||||
+12
-10
@@ -11,10 +11,12 @@ export GIT_CONFIG_VALUE_0=master
|
||||
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
|
||||
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
|
||||
|
||||
# Ordered fail-fast: cheapest deterministic checks first, slowest builds and
|
||||
# test suites last. `set -euo pipefail` aborts at the first failure, so a fmt
|
||||
# or clippy drift never wastes time on a frontend build or a multi-minute
|
||||
# test run.
|
||||
# Ordered fail-fast: cheapest deterministic checks first. The frontend build
|
||||
# must run *before* anything that compiles Rust, because story 1113 introduced
|
||||
# a compile-time dependency on `frontend/dist/` via `rust-embed` — a fresh
|
||||
# merge worktree without that directory will fail `cargo clippy` on
|
||||
# `EmbeddedAssets::iter()` before the frontend build has a chance to populate
|
||||
# it. `set -euo pipefail` aborts at the first failure.
|
||||
|
||||
echo "=== Checking Rust formatting ==="
|
||||
if cargo fmt --version &>/dev/null; then
|
||||
@@ -44,12 +46,6 @@ if [ "$_dup_found" -eq 1 ]; then
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "=== Running cargo clippy ==="
|
||||
cargo clippy --manifest-path "$PROJECT_ROOT/Cargo.toml" --all-targets --all-features -- -D warnings
|
||||
|
||||
echo "=== Checking doc coverage on changed files ==="
|
||||
cargo run --manifest-path "$PROJECT_ROOT/Cargo.toml" -p source-map-gen --bin source-map-check --quiet -- --worktree "$PROJECT_ROOT" --base master
|
||||
|
||||
echo "=== Building frontend ==="
|
||||
if [ -d "$PROJECT_ROOT/frontend" ]; then
|
||||
cd "$PROJECT_ROOT/frontend"
|
||||
@@ -75,6 +71,12 @@ else
|
||||
echo "Skipping frontend build (no frontend directory)"
|
||||
fi
|
||||
|
||||
echo "=== Running cargo clippy ==="
|
||||
cargo clippy --manifest-path "$PROJECT_ROOT/Cargo.toml" --all-targets --all-features -- -D warnings
|
||||
|
||||
echo "=== Checking doc coverage on changed files ==="
|
||||
cargo run --manifest-path "$PROJECT_ROOT/Cargo.toml" -p source-map-gen --bin source-map-check --quiet -- --worktree "$PROJECT_ROOT" --base master
|
||||
|
||||
echo "=== Running Rust tests ==="
|
||||
cargo test --manifest-path "$PROJECT_ROOT/Cargo.toml" --bin huskies
|
||||
cargo test --manifest-path "$PROJECT_ROOT/Cargo.toml" -p source-map-gen
|
||||
|
||||
+1
-3
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "huskies"
|
||||
version = "0.11.1"
|
||||
version = "0.13.0"
|
||||
edition = "2024"
|
||||
build = "build.rs"
|
||||
|
||||
@@ -13,12 +13,10 @@ chrono-tz = { workspace = true }
|
||||
futures = { workspace = true }
|
||||
homedir = { workspace = true }
|
||||
ignore = { workspace = true }
|
||||
mime_guess = { workspace = true }
|
||||
notify = { workspace = true }
|
||||
poem = { workspace = true, features = ["websocket"] }
|
||||
portable-pty = { workspace = true }
|
||||
reqwest = { workspace = true, features = ["json", "stream", "form"] }
|
||||
rust-embed = { workspace = true }
|
||||
serde = { workspace = true, features = ["derive"] }
|
||||
serde_json = { workspace = true }
|
||||
serde_urlencoded = { workspace = true }
|
||||
|
||||
@@ -33,16 +33,28 @@ impl GateFailureKind {
|
||||
/// Called once when a gate fails to produce a typed kind. Downstream code
|
||||
/// matches on the variant and must not call this on subsequent reads.
|
||||
pub fn classify(output: &str) -> Self {
|
||||
// Strip `test <name> ... ok` lines before checking lint-trigger keywords so
|
||||
// a passing test whose name contains e.g. `missing_doc_comments` or `clippy::`
|
||||
// does not produce a false-positive Lint classification (story 1101).
|
||||
let stripped_for_lint: String = output
|
||||
.lines()
|
||||
.filter(|l| {
|
||||
let t = l.trim();
|
||||
!(t.starts_with("test ") && t.ends_with("... ok"))
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
.join("\n");
|
||||
let is_lint = stripped_for_lint.contains("error[clippy::")
|
||||
|| stripped_for_lint.contains("warning[clippy::")
|
||||
|| stripped_for_lint.contains("missing_doc_comments");
|
||||
|
||||
if output.contains("CONFLICT (content):") || output.contains("Merge conflict:") {
|
||||
GateFailureKind::ContentConflict
|
||||
} else if output.contains("Diff in ") || output.contains("would reformat") {
|
||||
GateFailureKind::Fmt
|
||||
} else if output.contains("missing-docs direction") {
|
||||
GateFailureKind::SourceMapCheck
|
||||
} else if output.contains("error[clippy::")
|
||||
|| output.contains("warning[clippy::")
|
||||
|| output.contains("missing_doc_comments")
|
||||
{
|
||||
} else if is_lint {
|
||||
GateFailureKind::Lint
|
||||
} else if output.contains("error[E") {
|
||||
// rustc compile errors (e.g. `error[E0063]: missing field`).
|
||||
@@ -871,6 +883,19 @@ mod tests {
|
||||
);
|
||||
}
|
||||
|
||||
/// Story 1101: a passing test whose name contains a lint trigger keyword
|
||||
/// must NOT produce a Lint classification.
|
||||
#[test]
|
||||
fn classify_does_not_false_positive_on_test_name_substring() {
|
||||
let output = "test agents::gates::tests::classify_lint_from_missing_doc_comments ... ok\n\
|
||||
test result: ok. 1 passed; 0 failed";
|
||||
assert_ne!(
|
||||
GateFailureKind::classify(output),
|
||||
GateFailureKind::Lint,
|
||||
"passing test name containing 'missing_doc_comments' must not classify as Lint"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn classify_source_map_check_from_missing_docs_direction() {
|
||||
assert_eq!(
|
||||
|
||||
@@ -186,50 +186,6 @@ impl AgentPool {
|
||||
.map(|k| k.is_self_evident_fix())
|
||||
.unwrap_or(false);
|
||||
|
||||
// Bug 1101 diagnostic: log the classified failure_kind and the
|
||||
// matched classifier-trigger substring with surrounding context,
|
||||
// so we can confirm whether classify() is incorrectly matching
|
||||
// a passing-step stdout substring (e.g. "Diff in " inside a
|
||||
// failing test's panic message) and bouncing the story to a
|
||||
// fixup coder. Remove once the fix lands.
|
||||
if let Ok(r) = report.as_ref()
|
||||
&& let crate::agents::merge::MergeResult::GateFailure {
|
||||
output: gate_output,
|
||||
failure_kind: Some(k),
|
||||
} = &r.result
|
||||
{
|
||||
const TRIGGERS: &[&str] = &[
|
||||
"CONFLICT (content):",
|
||||
"Merge conflict:",
|
||||
"Diff in ",
|
||||
"would reformat",
|
||||
"missing-docs direction",
|
||||
"error[clippy::",
|
||||
"warning[clippy::",
|
||||
"missing_doc_comments",
|
||||
"error[E",
|
||||
];
|
||||
let matched = TRIGGERS
|
||||
.iter()
|
||||
.find_map(|t| gate_output.find(t).map(|i| (*t, i)));
|
||||
let (trigger, context) = match matched {
|
||||
Some((t, i)) => {
|
||||
let start = i.saturating_sub(30);
|
||||
let end = (i + t.len() + 60).min(gate_output.len());
|
||||
let ctx = gate_output
|
||||
.get(start..end)
|
||||
.unwrap_or("<context unavailable>")
|
||||
.replace('\n', " ");
|
||||
(Some(t), ctx)
|
||||
}
|
||||
None => (None, String::from("<no trigger matched>")),
|
||||
};
|
||||
slog!(
|
||||
"[merge] classify diagnostic for '{sid}': failure_kind={k:?} \
|
||||
is_fixup={is_fixup} trigger={trigger:?} context='{context}'"
|
||||
);
|
||||
}
|
||||
|
||||
if is_no_commits {
|
||||
let reason = kind.display_reason();
|
||||
if let Err(e) = crate::agents::lifecycle::transition_to_blocked(&sid, &reason) {
|
||||
|
||||
@@ -116,6 +116,23 @@ pub(super) fn maybe_inject_gate_failure(args: &mut Vec<String>, story_id: &str)
|
||||
}
|
||||
}
|
||||
|
||||
/// Append `Edit,Write,Bash` to the `--disallowedTools` flag so worktree agents
|
||||
/// cannot write to the master tree via Claude's built-in tools. If
|
||||
/// `--disallowedTools` is already present (from agent config), the three names
|
||||
/// are appended to the existing value rather than replacing it.
|
||||
pub(super) fn inject_worktree_disallowed_tools(args: &mut Vec<String>) {
|
||||
const BLOCKED: &str = "Edit,Write,Bash";
|
||||
if let Some(pos) = args.iter().position(|a| a == "--disallowedTools") {
|
||||
if let Some(val) = args.get_mut(pos + 1) {
|
||||
val.push(',');
|
||||
val.push_str(BLOCKED);
|
||||
}
|
||||
} else {
|
||||
args.push("--disallowedTools".to_string());
|
||||
args.push(BLOCKED.to_string());
|
||||
}
|
||||
}
|
||||
|
||||
/// Run the background worktree-creation + agent-launch flow.
|
||||
///
|
||||
/// Caller (`AgentPool::start_agent`) wraps this in `tokio::spawn` and stores
|
||||
@@ -264,6 +281,10 @@ pub(super) async fn run_agent_spawn(
|
||||
maybe_inject_gate_failure(&mut args, &sid);
|
||||
// Cap turns and budget for merge-gate fixup sessions (story 981).
|
||||
maybe_cap_for_merge_fixup(&mut args, &sid);
|
||||
// Every agent that runs inside a worktree must use the validated MCP
|
||||
// edit/write tools instead of Claude's built-in Edit/Write/Bash. This
|
||||
// prevents accidental writes to the master worktree (stories 1127, 1136).
|
||||
inject_worktree_disallowed_tools(&mut args);
|
||||
|
||||
// Append project-local prompt content (.huskies/AGENT.md) to the
|
||||
// baked-in prompt so every agent role sees project-specific guidance
|
||||
@@ -1297,4 +1318,43 @@ mod tests {
|
||||
item.stage().dir_name()
|
||||
);
|
||||
}
|
||||
|
||||
// ── inject_worktree_disallowed_tools (AC1, story 1142) ───────────
|
||||
|
||||
/// AC3(c) proxy: worktree agents get `--disallowedTools Edit,Write,Bash`.
|
||||
#[test]
|
||||
fn worktree_disallowed_tools_added_when_absent() {
|
||||
let mut args: Vec<String> = vec!["--verbose".to_string()];
|
||||
inject_worktree_disallowed_tools(&mut args);
|
||||
let pos = args
|
||||
.iter()
|
||||
.position(|a| a == "--disallowedTools")
|
||||
.expect("--disallowedTools must be present");
|
||||
let val = &args[pos + 1];
|
||||
assert!(val.contains("Edit"), "must include Edit");
|
||||
assert!(val.contains("Write"), "must include Write");
|
||||
assert!(val.contains("Bash"), "must include Bash");
|
||||
}
|
||||
|
||||
/// Existing `--disallowedTools` value is extended, not replaced.
|
||||
#[test]
|
||||
fn worktree_disallowed_tools_appended_to_existing() {
|
||||
let mut args = vec!["--disallowedTools".to_string(), "SomeOtherTool".to_string()];
|
||||
inject_worktree_disallowed_tools(&mut args);
|
||||
// Only one --disallowedTools flag.
|
||||
let count = args
|
||||
.iter()
|
||||
.filter(|a| a.as_str() == "--disallowedTools")
|
||||
.count();
|
||||
assert_eq!(count, 1, "must not duplicate --disallowedTools");
|
||||
let pos = args.iter().position(|a| a == "--disallowedTools").unwrap();
|
||||
let val = &args[pos + 1];
|
||||
assert!(
|
||||
val.contains("SomeOtherTool"),
|
||||
"original tool must be preserved"
|
||||
);
|
||||
assert!(val.contains("Edit"), "Edit must be added");
|
||||
assert!(val.contains("Write"), "Write must be added");
|
||||
assert!(val.contains("Bash"), "Bash must be added");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -129,7 +129,13 @@ pub(crate) async fn on_coding_transition(project_root: &Path, port: u16, story_i
|
||||
"[worktree-create-sub] Worktree ready for '{story_id}' at {}",
|
||||
info.path.display()
|
||||
);
|
||||
if let Err(e) = crate::worktree::install_pre_commit_hook(&info.path) {
|
||||
let hook_path = info.path.clone();
|
||||
let hook_result = tokio::task::spawn_blocking(move || {
|
||||
crate::worktree::install_pre_commit_hook(&hook_path)
|
||||
})
|
||||
.await
|
||||
.unwrap_or_else(|e| Err(format!("spawn_blocking panicked: {e}")));
|
||||
if let Err(e) = hook_result {
|
||||
slog_warn!(
|
||||
"[worktree-create-sub] Pre-commit hook install failed for '{story_id}': {e}"
|
||||
);
|
||||
|
||||
@@ -0,0 +1,188 @@
|
||||
//! Handler for the `convert` chat command (story 1141).
|
||||
//!
|
||||
//! `convert <number> <type>` changes the item-type register of a work item
|
||||
//! in place. All other CRDT registers (ACs, epic, name, stage, …) are
|
||||
//! untouched. Rejected for archived items.
|
||||
|
||||
use super::CommandContext;
|
||||
|
||||
/// Handle the `convert` command.
|
||||
///
|
||||
/// Parses `<number> <type>` from `ctx.args` and delegates to
|
||||
/// [`convert_by_number`]. Returns `None` (route to LLM) when args do not
|
||||
/// look like a numeric ID followed by a type keyword.
|
||||
pub(super) fn handle_convert(ctx: &CommandContext) -> Option<String> {
|
||||
let args = ctx.args.trim();
|
||||
let (num_str, type_str) = args.split_once(char::is_whitespace)?;
|
||||
let num_str = num_str.trim();
|
||||
let type_str = type_str.trim();
|
||||
|
||||
// Route to LLM if the first token is not a bare number.
|
||||
if num_str.is_empty() || !num_str.chars().all(|c| c.is_ascii_digit()) {
|
||||
return None;
|
||||
}
|
||||
// Route to LLM if the type looks like natural language (contains spaces).
|
||||
if type_str.is_empty() || type_str.contains(char::is_whitespace) {
|
||||
return None;
|
||||
}
|
||||
|
||||
Some(convert_by_number(ctx.effective_root(), num_str, type_str))
|
||||
}
|
||||
|
||||
/// Core convert logic: find item by numeric prefix and change its type.
|
||||
///
|
||||
/// Returns a Markdown-formatted response suitable for all chat transports.
|
||||
pub(crate) fn convert_by_number(
|
||||
project_root: &std::path::Path,
|
||||
story_number: &str,
|
||||
new_type_str: &str,
|
||||
) -> String {
|
||||
let Some(new_type) = crate::io::story_metadata::ItemType::from_str(new_type_str) else {
|
||||
return format!(
|
||||
"Unknown type **{new_type_str}**. Accepted types: story, bug, spike, refactor, epic."
|
||||
);
|
||||
};
|
||||
|
||||
let (story_id, _, _, _) =
|
||||
match crate::chat::lookup::find_story_by_number(project_root, story_number) {
|
||||
Some(found) => found,
|
||||
None => {
|
||||
return format!(
|
||||
"No story, bug, spike, or refactor with number **{story_number}** found."
|
||||
);
|
||||
}
|
||||
};
|
||||
|
||||
let item = match crate::crdt_state::read_item(&story_id) {
|
||||
Some(i) => i,
|
||||
None => {
|
||||
return format!("Work item **{story_number}** ({story_id}) not found in CRDT.");
|
||||
}
|
||||
};
|
||||
|
||||
if matches!(item.stage(), crate::pipeline_state::Stage::Archived { .. }) {
|
||||
return format!(
|
||||
"Cannot convert **{story_id}**: type change on an archived item is not allowed."
|
||||
);
|
||||
}
|
||||
|
||||
let old_type = item.item_type().map(|t| t.as_str()).unwrap_or("(inferred)");
|
||||
let story_name = item.name().to_string();
|
||||
let new_type_s = new_type.as_str();
|
||||
|
||||
if !crate::crdt_state::set_item_type(&story_id, Some(new_type)) {
|
||||
return format!("Failed to convert **{story_id}**: CRDT write rejected.");
|
||||
}
|
||||
|
||||
format!("Converted **{story_name}** ({story_id}) from type `{old_type}` to `{new_type_s}`.")
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Tests
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::super::{CommandDispatch, try_handle_command};
|
||||
|
||||
fn convert_cmd(root: &std::path::Path, args: &str) -> Option<String> {
|
||||
let services = crate::services::Services::new_test(root.to_path_buf(), "Timmy".to_string());
|
||||
let room_id = "!test:example.com".to_string();
|
||||
let dispatch = CommandDispatch {
|
||||
services: &services,
|
||||
project_root: &services.project_root,
|
||||
bot_user_id: "@timmy:homeserver.local",
|
||||
room_id: &room_id,
|
||||
};
|
||||
try_handle_command(&dispatch, &format!("@timmy convert {args}"))
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn convert_command_is_registered() {
|
||||
use super::super::commands;
|
||||
assert!(
|
||||
commands().iter().any(|c| c.name == "convert"),
|
||||
"convert command must be in the registry"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn convert_no_args_routes_to_llm() {
|
||||
let tmp = tempfile::TempDir::new().unwrap();
|
||||
let result = convert_cmd(tmp.path(), "");
|
||||
assert!(result.is_none(), "no args should route to LLM: {result:?}");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn convert_natural_language_routes_to_llm() {
|
||||
let tmp = tempfile::TempDir::new().unwrap();
|
||||
let result = convert_cmd(tmp.path(), "the login bug to a story");
|
||||
assert!(
|
||||
result.is_none(),
|
||||
"natural-language args should route to LLM: {result:?}"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn convert_well_formed_runs_handler() {
|
||||
let tmp = tempfile::TempDir::new().unwrap();
|
||||
let result = convert_cmd(tmp.path(), "999 story");
|
||||
assert!(
|
||||
result.is_some(),
|
||||
"well-formed args should run the handler: {result:?}"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn convert_invalid_type_returns_error() {
|
||||
let tmp = tempfile::TempDir::new().unwrap();
|
||||
let result = convert_cmd(tmp.path(), "999 banana").unwrap();
|
||||
assert!(
|
||||
result.contains("Unknown type") || result.contains("banana"),
|
||||
"unknown type should show error: {result}"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn convert_not_found_returns_error() {
|
||||
let tmp = tempfile::TempDir::new().unwrap();
|
||||
let result = convert_cmd(tmp.path(), "9988 story").unwrap();
|
||||
assert!(
|
||||
result.contains("9988") && result.contains("found"),
|
||||
"not-found message should include number and 'found': {result}"
|
||||
);
|
||||
}
|
||||
|
||||
/// End-to-end check: `convert 9120 story` on a seeded spike replies with the
/// success message and the item-type register reads back as `Story`.
#[test]
fn convert_changes_item_type_in_crdt() {
    let tmp = tempfile::TempDir::new().unwrap();
    crate::crdt_state::init_for_test();
    crate::db::ensure_content_store();
    crate::chat::test_helpers::write_story_file(
        tmp.path(),
        "backlog",
        "9120_spike_convert_chat.md",
        "# Spike\n",
        Some("Convert Chat Test"),
    );
    crate::crdt_state::set_item_type(
        "9120_spike_convert_chat",
        Some(crate::io::story_metadata::ItemType::Spike),
    );

    let result = convert_cmd(tmp.path(), "9120 story").unwrap();
    // Only the success reply contains "Converted". Matching on "story" would
    // also accept the not-found reply ("No story, bug, spike, ...").
    assert!(
        result.contains("Converted"),
        "should confirm conversion: {result}"
    );

    let item =
        crate::crdt_state::read_item("9120_spike_convert_chat").expect("item should exist");
    assert_eq!(
        item.item_type(),
        Some(crate::io::story_metadata::ItemType::Story),
        "item_type should be Story after conversion: {:?}",
        item.item_type()
    );
}
|
||||
}
|
||||
@@ -9,6 +9,7 @@ mod ambient;
|
||||
mod assign;
|
||||
mod backlog;
|
||||
mod cleanup_worktrees;
|
||||
mod convert;
|
||||
mod cost;
|
||||
mod coverage;
|
||||
mod depends;
|
||||
@@ -19,6 +20,7 @@ mod help;
|
||||
pub(crate) mod loc;
|
||||
mod logs;
|
||||
mod move_story;
|
||||
mod new_project;
|
||||
mod overview;
|
||||
mod run_tests;
|
||||
mod setup;
|
||||
@@ -232,6 +234,11 @@ pub fn commands() -> &'static [BotCommand] {
|
||||
description: "Schedule a deferred agent start: `timer <story_id> <HH:MM>`, `timer list`, `timer cancel <story_id>`",
|
||||
handler: timer::handle_timer,
|
||||
},
|
||||
BotCommand {
|
||||
name: "convert",
|
||||
description: "Convert a work item's type: `convert <number> <type>` (types: story, bug, spike, refactor, epic)",
|
||||
handler: convert::handle_convert,
|
||||
},
|
||||
BotCommand {
|
||||
name: "unblock",
|
||||
description: "Reset a blocked story: `unblock <number>` (clears blocked flag and resets retry count)",
|
||||
@@ -262,6 +269,21 @@ pub fn commands() -> &'static [BotCommand] {
|
||||
description: "List orphaned worktrees (dry run), or `cleanup_worktrees --confirm` to remove them",
|
||||
handler: handle_cleanup_worktrees_fallback,
|
||||
},
|
||||
BotCommand {
|
||||
name: "health",
|
||||
description: "Show subsystem health: gateway, sled, matrix-sync, creds, and build-hash",
|
||||
handler: handle_health_fallback,
|
||||
},
|
||||
BotCommand {
|
||||
name: "new",
|
||||
description: "Bootstrap a new project container (gateway only): `new project <name>`",
|
||||
handler: new_project::handle_new_project_fallback,
|
||||
},
|
||||
BotCommand {
|
||||
name: "project-rebuild",
|
||||
description: "Rebuild a project's Docker image and swap the container (gateway only): `project-rebuild <name> [--timeout <secs>] [--force]`",
|
||||
handler: handle_project_rebuild_fallback,
|
||||
},
|
||||
]
|
||||
}
|
||||
|
||||
@@ -419,6 +441,26 @@ fn handle_cleanup_worktrees_fallback(_ctx: &CommandContext) -> Option<String> {
|
||||
None
|
||||
}
|
||||
|
||||
/// Fallback handler for the `project-rebuild` command when it is not intercepted
|
||||
/// by the async gateway handler in `on_room_message`. In practice this is never
|
||||
/// called — `project-rebuild` is detected and handled before `try_handle_command`
|
||||
/// runs in gateway mode. The entry exists in the registry so `help` lists it.
|
||||
///
|
||||
/// Returns `None` to prevent the LLM from receiving the raw command text.
/// NOTE(review): the `convert` handler documents a `None` return as "route to
/// LLM", which contradicts the sentence above — confirm what the dispatcher
/// does with `None` from a registry handler.
|
||||
fn handle_project_rebuild_fallback(_ctx: &CommandContext) -> Option<String> {
|
||||
None
|
||||
}
|
||||
|
||||
/// Fallback handler for the `health` command when it is not intercepted by the
|
||||
/// async handler in `on_room_message`. In practice this is never called — health
|
||||
/// is detected and handled before `try_handle_command` is invoked. The entry
|
||||
/// exists in the registry only so `help` lists it.
|
||||
///
|
||||
/// Returns `None` to prevent the LLM from receiving "health" as a prompt.
/// NOTE(review): the `convert` handler documents a `None` return as "route to
/// LLM", which contradicts the sentence above — confirm what the dispatcher
/// does with `None` from a registry handler.
|
||||
fn handle_health_fallback(_ctx: &CommandContext) -> Option<String> {
|
||||
None
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Tests
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
@@ -0,0 +1,19 @@
|
||||
//! `new project` command stub.
|
||||
//!
|
||||
//! The command is handled asynchronously in the Matrix transport's
|
||||
//! `on_room_message` handler (gateway mode only). This file exists so that
|
||||
//! `help` lists the command and the gateway proxy block does not forward it
|
||||
//! to the active project sled.
|
||||
|
||||
use super::CommandContext;
|
||||
|
||||
/// Fallback handler for the `new` command when it is not intercepted by the
|
||||
/// async gateway handler in `on_room_message`. In practice this is never
|
||||
/// called — `new project` is detected and handled before `try_handle_command`
|
||||
/// runs in gateway mode, and in standalone mode there is no matching project
|
||||
/// bootstrap context.
|
||||
///
|
||||
/// Returns `None` to prevent the LLM from receiving the raw command text.
/// NOTE(review): the `convert` handler documents a `None` return as "route to
/// LLM", which contradicts the sentence above — confirm what the dispatcher
/// does with `None` from a registry handler.
|
||||
pub fn handle_new_project_fallback(_ctx: &CommandContext) -> Option<String> {
|
||||
None
|
||||
}
|
||||
@@ -300,6 +300,20 @@ pub(super) async fn handle_incoming_message(
|
||||
handle_llm_message(ctx, channel, user, message).await;
|
||||
}
|
||||
|
||||
/// Build the prompt for a Discord LLM turn, prepending any pending
|
||||
/// CRDT pipeline-transition events as a `<system-reminder>` block.
///
/// Layout of the returned string, in order: the text produced by
/// `crate::llm_session::assemble_prompt_context(persona)`, the bracketed
/// instruction telling the model to call itself `bot_name`, a blank line, and
/// finally `{user}: {user_message}`.
///
/// `persona` is only forwarded to `assemble_prompt_context`; the caller in
/// `handle_llm_message` passes the lower-cased bot name.
|
||||
fn build_discord_llm_prompt(
|
||||
persona: &str,
|
||||
bot_name: &str,
|
||||
user: &str,
|
||||
user_message: &str,
|
||||
) -> String {
|
||||
// Nothing separates the event context from the instruction that follows it in the format string.
let event_ctx = crate::llm_session::assemble_prompt_context(persona);
|
||||
format!(
|
||||
"{event_ctx}[Your name is {bot_name}. Refer to yourself as {bot_name}, not Claude.]\n\n{user}: {user_message}"
|
||||
)
|
||||
}
|
||||
|
||||
/// Forward a message to Claude Code and send the response back via Discord.
|
||||
async fn handle_llm_message(ctx: &DiscordContext, channel: &str, user: &str, user_message: &str) {
|
||||
use crate::chat::util::drain_complete_paragraphs;
|
||||
@@ -314,9 +328,8 @@ async fn handle_llm_message(ctx: &DiscordContext, channel: &str, user: &str, use
|
||||
};
|
||||
|
||||
let bot_name = &ctx.services.bot_name;
|
||||
let prompt = format!(
|
||||
"[Your name is {bot_name}. Refer to yourself as {bot_name}, not Claude.]\n\n{user}: {user_message}"
|
||||
);
|
||||
let persona = bot_name.to_lowercase();
|
||||
let prompt = build_discord_llm_prompt(&persona, bot_name, user, user_message);
|
||||
|
||||
let provider = ClaudeCodeProvider::new();
|
||||
let (_cancel_tx, mut cancel_rx) = watch::channel(false);
|
||||
@@ -604,4 +617,40 @@ mod tests {
|
||||
assert!(conv.session_id.is_none(), "session_id should be cleared");
|
||||
assert!(conv.entries.is_empty(), "entries should be cleared");
|
||||
}
|
||||
|
||||
/// AC 4: fire a `TransitionFired` event, simulate a Discord user turn, and
|
||||
/// assert the assembled prompt contains the event (end-to-end non-Matrix test).
|
||||
#[test]
|
||||
fn discord_prompt_includes_transition_event() {
|
||||
use crate::pipeline_state::{PipelineEvent, PlanState, Stage, StoryId, TransitionFired};
|
||||
crate::crdt_state::init_for_test();
|
||||
|
||||
crate::event_log::log_transition_event(&TransitionFired {
|
||||
story_id: StoryId("77_discord_test".to_string()),
|
||||
before: Stage::Backlog,
|
||||
after: Stage::Coding {
|
||||
claim: None,
|
||||
plan: PlanState::Missing,
|
||||
retries: 0,
|
||||
},
|
||||
event: PipelineEvent::DepsMet,
|
||||
at: chrono::Utc::now(),
|
||||
});
|
||||
|
||||
let prompt =
|
||||
build_discord_llm_prompt("discord-ch-test", "Timmy", "@alice", "what is the status?");
|
||||
|
||||
assert!(
|
||||
prompt.contains("<system-reminder>"),
|
||||
"assembled prompt must include system-reminder block; got: {prompt}"
|
||||
);
|
||||
assert!(
|
||||
prompt.contains("77_discord_test"),
|
||||
"assembled prompt must contain story id; got: {prompt}"
|
||||
);
|
||||
assert!(
|
||||
prompt.contains("what is the status?"),
|
||||
"assembled prompt must contain user message; got: {prompt}"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,10 +1,12 @@
|
||||
//! Matrix bot context — shared state for the Matrix bot (rooms, history, permissions).
|
||||
use crate::chat::ChatTransport;
|
||||
use crate::service::gateway::config::ProjectEntry;
|
||||
use crate::service::timer::TimerStore;
|
||||
use crate::services::Services;
|
||||
use matrix_sdk::ruma::{OwnedEventId, OwnedRoomId, OwnedUserId};
|
||||
use std::collections::{BTreeMap, HashSet, VecDeque};
|
||||
use std::sync::Arc;
|
||||
use std::sync::atomic::AtomicI64;
|
||||
use tokio::sync::Mutex as TokioMutex;
|
||||
use tokio::sync::RwLock;
|
||||
|
||||
@@ -87,33 +89,26 @@ pub struct BotContext {
|
||||
/// In gateway mode: the currently active project (shared with the gateway HTTP handler).
|
||||
/// `None` in standalone single-project mode.
|
||||
pub gateway_active_project: Option<Arc<RwLock<String>>>,
|
||||
/// In gateway mode: valid project names accepted by the `switch` command.
|
||||
/// Empty in standalone mode.
|
||||
pub gateway_projects: Vec<String>,
|
||||
/// In gateway mode: mapping of project name → base URL (e.g. `"http://localhost:3001"`).
|
||||
/// Used to proxy bot commands to the active project over WebSocket (`/ws`).
|
||||
/// Empty in standalone mode.
|
||||
pub gateway_project_urls: BTreeMap<String, String>,
|
||||
/// Pipeline transition events buffered since the last LLM turn.
|
||||
/// In gateway mode: shared live projects map from [`GatewayState`].
|
||||
///
|
||||
/// A background task appends one compact audit line per real stage
|
||||
/// transition. `handle_message` drains this buffer and injects it as a
|
||||
/// `<system-reminder>` block at the head of the next user prompt so Timmy
|
||||
/// sees pipeline activity without requiring a separate message.
|
||||
pub pending_pipeline_events: Arc<TokioMutex<Vec<String>>>,
|
||||
/// Gateway aggregate transition events buffered since the last LLM turn.
|
||||
///
|
||||
/// In gateway mode a background task appends one compact audit line per
|
||||
/// `GatewayStatusEvent` received from the gateway broadcaster. Drained
|
||||
/// alongside `pending_pipeline_events` on each user message. Always
|
||||
/// empty in standalone (non-gateway) mode.
|
||||
pub pending_gateway_events: Arc<TokioMutex<Vec<String>>>,
|
||||
/// The `new project` command writes here so HTTP handlers see the new entry
|
||||
/// immediately without requiring a gateway restart. `None` in standalone mode.
|
||||
pub gateway_projects_store: Option<Arc<RwLock<BTreeMap<String, ProjectEntry>>>>,
|
||||
/// Bounded FIFO set of already-handled incoming event IDs.
|
||||
///
|
||||
/// The Matrix sync loop can replay events on reconnect. This set ensures
|
||||
/// each event is processed at most once. Insert the event ID before any
|
||||
/// side-effecting work; return early if the insert returns `false`.
|
||||
pub handled_incoming_event_ids: Arc<TokioMutex<SeenEventIds>>,
|
||||
/// In gateway mode: the port the gateway is listening on.
|
||||
///
|
||||
/// Used by the "rebuild gateway" command to construct the health-check URL
|
||||
/// passed to the trampoline. `None` in standalone single-project mode.
|
||||
pub gateway_port: Option<u16>,
|
||||
/// Timestamp (ms since Unix epoch) of the last Matrix event received in any
|
||||
/// configured room. Updated atomically on every `on_room_message` call so
|
||||
/// the `health` command can detect a stale or dead sync loop.
|
||||
pub last_matrix_event_ms: Arc<AtomicI64>,
|
||||
}
|
||||
|
||||
impl BotContext {
|
||||
@@ -141,7 +136,12 @@ impl BotContext {
|
||||
pub async fn active_project_url(&self) -> Option<String> {
|
||||
let ap = self.gateway_active_project.as_ref()?;
|
||||
let name = ap.read().await.clone();
|
||||
self.gateway_project_urls.get(&name).cloned()
|
||||
let store = self.gateway_projects_store.as_ref()?;
|
||||
store
|
||||
.read()
|
||||
.await
|
||||
.get(&name)
|
||||
.and_then(|entry| entry.url.clone())
|
||||
}
|
||||
|
||||
/// Proxy a bot command to the active project over a WebSocket RPC call.
|
||||
@@ -277,8 +277,9 @@ mod tests {
|
||||
fn test_bot_context(
|
||||
services: Arc<Services>,
|
||||
gateway_active_project: Option<Arc<RwLock<String>>>,
|
||||
gateway_projects: Vec<String>,
|
||||
gateway_project_urls: BTreeMap<String, String>,
|
||||
gateway_projects_store: Option<
|
||||
Arc<RwLock<BTreeMap<String, crate::service::gateway::config::ProjectEntry>>>,
|
||||
>,
|
||||
) -> BotContext {
|
||||
BotContext {
|
||||
services,
|
||||
@@ -298,13 +299,12 @@ mod tests {
|
||||
std::path::PathBuf::from("/tmp/timers.json"),
|
||||
)),
|
||||
gateway_active_project,
|
||||
gateway_projects,
|
||||
gateway_project_urls,
|
||||
pending_pipeline_events: Arc::new(TokioMutex::new(Vec::new())),
|
||||
pending_gateway_events: Arc::new(TokioMutex::new(Vec::new())),
|
||||
gateway_projects_store,
|
||||
handled_incoming_event_ids: Arc::new(TokioMutex::new(SeenEventIds::new(
|
||||
SEEN_EVENT_IDS_CAP,
|
||||
))),
|
||||
gateway_port: None,
|
||||
last_matrix_event_ms: Arc::new(AtomicI64::new(chrono::Utc::now().timestamp_millis())),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -318,7 +318,7 @@ mod tests {
|
||||
#[tokio::test]
|
||||
async fn effective_project_root_standalone_returns_project_root() {
|
||||
let services = test_services(PathBuf::from("/projects/myapp"));
|
||||
let ctx = test_bot_context(services, None, vec![], BTreeMap::new());
|
||||
let ctx = test_bot_context(services, None, None);
|
||||
assert_eq!(
|
||||
ctx.effective_project_root().await,
|
||||
PathBuf::from("/projects/myapp")
|
||||
@@ -329,15 +329,7 @@ mod tests {
|
||||
async fn effective_project_root_gateway_uses_active_project_subdir() {
|
||||
let services = test_services(PathBuf::from("/gateway"));
|
||||
let active = Arc::new(RwLock::new("huskies".to_string()));
|
||||
let ctx = test_bot_context(
|
||||
services,
|
||||
Some(Arc::clone(&active)),
|
||||
vec!["huskies".into(), "robot-studio".into()],
|
||||
BTreeMap::from([
|
||||
("huskies".into(), "http://localhost:3001".into()),
|
||||
("robot-studio".into(), "http://localhost:3002".into()),
|
||||
]),
|
||||
);
|
||||
let ctx = test_bot_context(services, Some(Arc::clone(&active)), None);
|
||||
assert_eq!(
|
||||
ctx.effective_project_root().await,
|
||||
PathBuf::from("/gateway/huskies")
|
||||
@@ -348,15 +340,7 @@ mod tests {
|
||||
async fn effective_project_root_gateway_reflects_project_switch() {
|
||||
let services = test_services(PathBuf::from("/gateway"));
|
||||
let active = Arc::new(RwLock::new("huskies".to_string()));
|
||||
let ctx = test_bot_context(
|
||||
services,
|
||||
Some(Arc::clone(&active)),
|
||||
vec!["huskies".into(), "robot-studio".into()],
|
||||
BTreeMap::from([
|
||||
("huskies".into(), "http://localhost:3001".into()),
|
||||
("robot-studio".into(), "http://localhost:3002".into()),
|
||||
]),
|
||||
);
|
||||
let ctx = test_bot_context(services, Some(Arc::clone(&active)), None);
|
||||
|
||||
assert_eq!(
|
||||
ctx.effective_project_root().await,
|
||||
@@ -432,7 +416,7 @@ mod tests {
|
||||
#[test]
|
||||
fn bot_context_has_no_require_verified_devices_field() {
|
||||
let services = test_services(PathBuf::from("/tmp"));
|
||||
let ctx = test_bot_context(services, None, vec![], BTreeMap::new());
|
||||
let ctx = test_bot_context(services, None, None);
|
||||
let _cloned = ctx.clone();
|
||||
}
|
||||
|
||||
@@ -479,12 +463,16 @@ mod tests {
|
||||
let base_url = format!("http://127.0.0.1:{port}");
|
||||
let services = test_services(PathBuf::from("/gateway"));
|
||||
let active = Arc::new(RwLock::new("huskies".to_string()));
|
||||
let ctx = test_bot_context(
|
||||
services,
|
||||
Some(Arc::clone(&active)),
|
||||
vec!["huskies".into()],
|
||||
BTreeMap::from([("huskies".into(), base_url)]),
|
||||
);
|
||||
let store = Arc::new(RwLock::new(BTreeMap::from([(
|
||||
"huskies".to_string(),
|
||||
crate::service::gateway::config::ProjectEntry {
|
||||
url: Some(base_url),
|
||||
auth_token: None,
|
||||
ssh_port: None,
|
||||
host_path: None,
|
||||
},
|
||||
)])));
|
||||
let ctx = test_bot_context(services, Some(Arc::clone(&active)), Some(store));
|
||||
|
||||
let result = ctx.proxy_bot_command("status", "").await;
|
||||
assert_eq!(
|
||||
@@ -495,4 +483,45 @@ mod tests {
|
||||
|
||||
server.await.unwrap();
|
||||
}
|
||||
|
||||
/// Regression test for story 1132: `active_project_url` must read from the
|
||||
/// live `gateway_projects_store`, not a stale snapshot frozen at bot startup.
|
||||
/// Adding a project to the store after `BotContext` is created must be
|
||||
/// visible immediately — no restart required.
|
||||
#[tokio::test]
|
||||
async fn active_project_url_reflects_runtime_added_project() {
|
||||
let store: Arc<RwLock<BTreeMap<String, crate::service::gateway::config::ProjectEntry>>> =
|
||||
Arc::new(RwLock::new(BTreeMap::new()));
|
||||
let active = Arc::new(RwLock::new("new-project".to_string()));
|
||||
let services = test_services(PathBuf::from("/gateway"));
|
||||
let ctx = test_bot_context(
|
||||
services,
|
||||
Some(Arc::clone(&active)),
|
||||
Some(Arc::clone(&store)),
|
||||
);
|
||||
|
||||
// Store is empty — must return None.
|
||||
assert!(
|
||||
ctx.active_project_url().await.is_none(),
|
||||
"URL must be None when store is empty"
|
||||
);
|
||||
|
||||
// Insert the entry at runtime (simulates `new project` command).
|
||||
store.write().await.insert(
|
||||
"new-project".to_string(),
|
||||
crate::service::gateway::config::ProjectEntry {
|
||||
url: Some("http://localhost:3099".to_string()),
|
||||
auth_token: None,
|
||||
ssh_port: None,
|
||||
host_path: None,
|
||||
},
|
||||
);
|
||||
|
||||
// Now the live store has the entry — active_project_url must see it.
|
||||
assert_eq!(
|
||||
ctx.active_project_url().await.as_deref(),
|
||||
Some("http://localhost:3099"),
|
||||
"URL must be visible after runtime insertion without bot restart"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -9,6 +9,23 @@ pub fn format_startup_announcement(bot_name: &str) -> String {
|
||||
format!("{bot_name} is online.")
|
||||
}
|
||||
|
||||
/// Format the ready announcement sent after a successful gateway trampoline restart.
|
||||
///
|
||||
/// Returns "gateway X.Y.Z ready" using the compiled-in crate version so the
|
||||
/// operator can confirm which binary is running after a rebuild.
|
||||
pub fn format_gateway_ready_announcement() -> String {
|
||||
format!("gateway {} ready", env!("CARGO_PKG_VERSION"))
|
||||
}
|
||||
|
||||
/// Build the announcement sent when the trampoline rolls back to the previous
/// binary.
///
/// `reason` is the human-readable failure description supplied by the
/// trampoline (e.g. "port 3000 already in use"); it is inserted verbatim
/// between the fixed prefix and suffix.
pub fn format_gateway_rollback_announcement(reason: &str) -> String {
    [
        "Gateway rebuild failed: ",
        reason,
        ". Previous version restored.",
    ]
    .concat()
}
|
||||
|
||||
/// Convert a Markdown string to an HTML string using pulldown-cmark.
|
||||
///
|
||||
/// Enables the standard extension set (tables, footnotes, strikethrough,
|
||||
|
||||
@@ -13,7 +13,7 @@ use super::super::context::BotContext;
|
||||
use super::super::format::markdown_to_html;
|
||||
use super::super::history::{ConversationEntry, ConversationRole, save_history};
|
||||
|
||||
use super::{format_drained_events, format_user_prompt};
|
||||
use super::format_user_prompt;
|
||||
|
||||
pub(in crate::chat::transport::matrix::bot) async fn handle_message(
|
||||
room_id_str: String,
|
||||
@@ -31,28 +31,13 @@ pub(in crate::chat::transport::matrix::bot) async fn handle_message(
|
||||
guard.get(&room_id).and_then(|conv| conv.session_id.clone())
|
||||
};
|
||||
|
||||
// Drain pipeline and gateway transition events buffered since the last LLM
|
||||
// turn and prepend them as a passive <system-reminder> block so Timmy sees
|
||||
// pipeline activity without requiring a separate message. Sled events come
|
||||
// from `pending_pipeline_events`; gateway events from `pending_gateway_events`.
|
||||
// In practice only one buffer is non-empty (sled mode vs gateway mode).
|
||||
let system_reminder_prefix = {
|
||||
let mut sled_guard = ctx.pending_pipeline_events.lock().await;
|
||||
let mut gtw_guard = ctx.pending_gateway_events.lock().await;
|
||||
let all_lines: Vec<String> = sled_guard.drain(..).chain(gtw_guard.drain(..)).collect();
|
||||
drop(sled_guard);
|
||||
drop(gtw_guard);
|
||||
slog!(
|
||||
"[matrix-bot] drained {} gateway audit lines for LLM context",
|
||||
all_lines.len()
|
||||
);
|
||||
let prefix = format_drained_events(all_lines);
|
||||
slog!(
|
||||
"[matrix-bot] format_drained_events output: {} bytes",
|
||||
prefix.len()
|
||||
);
|
||||
prefix
|
||||
};
|
||||
// Pull new pipeline-transition events from the CRDT event log for this
|
||||
// persona and atomically advance the high-water marks so the same events
|
||||
// are not re-injected on the next turn. All transports share the same
|
||||
// persona key so events are visible regardless of which transport handles
|
||||
// the next turn.
|
||||
let persona = ctx.services.bot_name.to_lowercase();
|
||||
let event_log_ctx = crate::llm_session::assemble_prompt_context(&persona);
|
||||
|
||||
// The prompt is just the current message with sender attribution.
|
||||
// Prior conversation context is carried by the Claude Code session.
|
||||
@@ -64,7 +49,7 @@ pub(in crate::chat::transport::matrix::bot) async fn handle_message(
|
||||
String::new()
|
||||
};
|
||||
let prompt = format!(
|
||||
"{system_reminder_prefix}[Your name is {bot_name}. Refer to yourself as {bot_name}, not Claude.]\n{active_project_ctx}\n{}",
|
||||
"{event_log_ctx}[Your name is {bot_name}. Refer to yourself as {bot_name}, not Claude.]\n{active_project_ctx}\n{}",
|
||||
format_user_prompt(&sender, &user_message)
|
||||
);
|
||||
|
||||
|
||||
@@ -11,27 +11,6 @@ pub(super) fn format_user_prompt(sender: &str, message: &str) -> String {
|
||||
format!("{sender}: {message}")
|
||||
}
|
||||
|
||||
/// Drain `lines` into a `<system-reminder>` block for injection at the head of
|
||||
/// the next LLM prompt. Returns an empty string when `lines` is empty.
|
||||
///
|
||||
/// At most 20 lines are shown verbatim; excess lines are replaced with a
|
||||
/// `…and N more` indicator to keep context size bounded.
|
||||
pub(in crate::chat::transport::matrix::bot) fn format_drained_events(lines: Vec<String>) -> String {
|
||||
if lines.is_empty() {
|
||||
return String::new();
|
||||
}
|
||||
const MAX_PIPELINE_EVENTS: usize = 20;
|
||||
let total = lines.len();
|
||||
let shown_count = total.min(MAX_PIPELINE_EVENTS);
|
||||
let shown = lines[..shown_count].join("\n");
|
||||
let tail = if total > MAX_PIPELINE_EVENTS {
|
||||
format!("\n...and {} more", total - MAX_PIPELINE_EVENTS)
|
||||
} else {
|
||||
String::new()
|
||||
};
|
||||
format!("<system-reminder>\n{shown}{tail}\n</system-reminder>\n")
|
||||
}
|
||||
|
||||
/// Matrix event handler for room messages. Each invocation spawns an
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
@@ -72,49 +51,6 @@ mod tests {
|
||||
assert!(crate::llm::oauth::extract_login_url_from_error(err).is_none());
|
||||
}
|
||||
|
||||
// -- format_drained_events ----------------------------------------------
|
||||
|
||||
/// No buffered lines means no reminder block at all.
#[test]
fn format_drained_events_empty_returns_empty_string() {
    let rendered = format_drained_events(Vec::new());
    assert_eq!(rendered, String::new());
}
|
||||
|
||||
/// A single line is emitted verbatim between the opening and closing
/// `<system-reminder>` tags.
#[test]
fn format_drained_events_wraps_in_system_reminder() {
    let audit_line = "audit ts=2026 id=1 event=x";
    let result = format_drained_events(vec![audit_line.to_string()]);

    let opens = result.starts_with("<system-reminder>\n");
    let closes = result.ends_with("</system-reminder>\n");
    let carries_line = result.contains(audit_line);

    assert!(opens, "got: {result}");
    assert!(closes, "got: {result}");
    assert!(carries_line, "got: {result}");
}
|
||||
|
||||
/// With 25 input lines only `line 0`..`line 19` are shown and the remaining
/// five are summarised by the overflow row.
#[test]
fn format_drained_events_caps_at_20_with_overflow_indicator() {
    let mut lines = Vec::with_capacity(25);
    for i in 0..25 {
        lines.push(format!("line {i}"));
    }

    let result = format_drained_events(lines);

    assert!(result.contains("...and 5 more"), "got: {result}");
    assert!(
        result.contains("line 19"),
        "last shown line missing; got: {result}"
    );
    let leaked_hidden_line = result.contains("line 20");
    assert!(
        !leaked_hidden_line,
        "line 21 must be hidden; got: {result}"
    );
}
|
||||
|
||||
/// Exactly 20 lines fit under the cap, so no overflow row may appear.
#[test]
fn format_drained_events_exactly_20_no_overflow_indicator() {
    let mut lines = Vec::with_capacity(20);
    for i in 0..20 {
        lines.push(format!("line {i}"));
    }

    let result = format_drained_events(lines);
    let has_overflow_row = result.contains("...and");

    assert!(
        !has_overflow_row,
        "must not show overflow when exactly 20; got: {result}"
    );
}
|
||||
|
||||
// -- bot_name / system prompt -------------------------------------------
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -19,6 +19,67 @@ use super::super::verification::check_sender_verified;
|
||||
|
||||
use super::handle_message;
|
||||
|
||||
/// Return `true` when the message is a `health` command addressed to the bot.
|
||||
///
|
||||
/// Recognised case-insensitively as the single word `health` after stripping the bot
|
||||
/// mention prefix. Any trailing whitespace is ignored; extra arguments are not
|
||||
/// expected and are silently discarded.
|
||||
fn extract_health_command(message: &str, bot_name: &str, bot_user_id: &str) -> bool {
|
||||
let stripped = crate::chat::util::strip_bot_mention(message, bot_name, bot_user_id);
|
||||
let trimmed = stripped
|
||||
.trim()
|
||||
.trim_start_matches(|c: char| !c.is_alphanumeric());
|
||||
let cmd = trimmed.split_whitespace().next().unwrap_or("");
|
||||
cmd.eq_ignore_ascii_case("health")
|
||||
}
|
||||
|
||||
/// Return `true` when the message is a "rebuild gateway" command addressed to the bot.
|
||||
///
|
||||
/// The command is recognised case-insensitively as `rebuild gateway` after stripping
|
||||
/// the bot mention prefix so both `@Timmy rebuild gateway` and `Timmy rebuild gateway`
|
||||
/// match.
|
||||
fn extract_rebuild_gateway_command(message: &str, bot_name: &str, bot_user_id: &str) -> bool {
|
||||
let stripped = crate::chat::util::strip_bot_mention(message, bot_name, bot_user_id);
|
||||
let trimmed = stripped
|
||||
.trim()
|
||||
.trim_start_matches(|c: char| !c.is_alphanumeric());
|
||||
let (cmd, rest) = match trimmed.split_once(char::is_whitespace) {
|
||||
Some((c, r)) => (c, r.trim()),
|
||||
None => return false,
|
||||
};
|
||||
cmd.eq_ignore_ascii_case("rebuild")
|
||||
&& rest
|
||||
.split_whitespace()
|
||||
.next()
|
||||
.map(|w| w.eq_ignore_ascii_case("gateway"))
|
||||
.unwrap_or(false)
|
||||
}
|
||||
|
||||
/// Evaluate a `switch <arg>` command against the live project store.
|
||||
///
|
||||
/// Reads valid project names from the store at call time so newly added
|
||||
/// projects are visible without a bot restart. Returns the reply text.
|
||||
pub(super) async fn eval_switch_command(
|
||||
arg: &str,
|
||||
active_project: &tokio::sync::RwLock<String>,
|
||||
store: &tokio::sync::RwLock<
|
||||
std::collections::BTreeMap<String, crate::service::gateway::config::ProjectEntry>,
|
||||
>,
|
||||
) -> String {
|
||||
let projects: Vec<String> = store.read().await.keys().cloned().collect();
|
||||
if arg.is_empty() {
|
||||
let available = projects.join(", ");
|
||||
format!("Usage: `switch <project>`. Available projects: {available}")
|
||||
} else if projects.iter().any(|p| p == arg) {
|
||||
*active_project.write().await = arg.to_string();
|
||||
crate::crdt_state::write_gateway_active_project(arg);
|
||||
format!("Switched to project **{arg}**.")
|
||||
} else {
|
||||
let available = projects.join(", ");
|
||||
format!("Unknown project `{arg}`. Available: {available}")
|
||||
}
|
||||
}
|
||||
|
||||
pub(in crate::chat::transport::matrix::bot) async fn on_room_message(
|
||||
ev: OriginalSyncRoomMessageEvent,
|
||||
room: Room,
|
||||
@@ -53,6 +114,12 @@ pub(in crate::chat::transport::matrix::bot) async fn on_room_message(
|
||||
return;
|
||||
}
|
||||
|
||||
// Update last-event timestamp so the `health` command can detect a stale sync loop.
|
||||
ctx.last_matrix_event_ms.store(
|
||||
chrono::Utc::now().timestamp_millis(),
|
||||
std::sync::atomic::Ordering::Relaxed,
|
||||
);
|
||||
|
||||
// Ignore the bot's own messages to prevent echo loops.
|
||||
if ev.sender == ctx.matrix_user_id {
|
||||
return;
|
||||
@@ -192,8 +259,18 @@ pub(in crate::chat::transport::matrix::bot) async fn on_room_message(
|
||||
// endpoint. Only a small set of gateway-local commands are handled here.
|
||||
if ctx.is_gateway() {
|
||||
// Commands that are meaningful on the gateway itself (no project state needed).
|
||||
const GATEWAY_LOCAL_COMMANDS: &[&str] =
|
||||
&["help", "ambient", "reset", "switch", "all_status"];
|
||||
const GATEWAY_LOCAL_COMMANDS: &[&str] = &[
|
||||
"help",
|
||||
"ambient",
|
||||
"reset",
|
||||
"switch",
|
||||
"all_status",
|
||||
"new",
|
||||
"config",
|
||||
"project-rebuild",
|
||||
"upgrade",
|
||||
"health",
|
||||
];
|
||||
|
||||
let stripped = crate::chat::util::strip_bot_mention(
|
||||
&user_message,
|
||||
@@ -240,7 +317,18 @@ pub(in crate::chat::transport::matrix::bot) async fn on_room_message(
|
||||
|
||||
// `all_status` — aggregate pipeline status across all projects (gateway-only).
|
||||
if cmd == "all_status" {
|
||||
let project_urls = ctx.gateway_project_urls.clone();
|
||||
let project_urls: std::collections::BTreeMap<String, String> = if let Some(ref store) =
|
||||
ctx.gateway_projects_store
|
||||
{
|
||||
store
|
||||
.read()
|
||||
.await
|
||||
.iter()
|
||||
.filter_map(|(name, entry)| entry.url.clone().map(|url| (name.clone(), url)))
|
||||
.collect()
|
||||
} else {
|
||||
std::collections::BTreeMap::new()
|
||||
};
|
||||
let client = reqwest::Client::new();
|
||||
let statuses =
|
||||
crate::gateway::fetch_all_project_pipeline_statuses(&project_urls, &client).await;
|
||||
@@ -257,9 +345,248 @@ pub(in crate::chat::transport::matrix::bot) async fn on_room_message(
|
||||
return;
|
||||
}
|
||||
|
||||
// `config <project> <key>=<value>` — override an agent or project setting.
|
||||
if cmd == "config" {
|
||||
let response = if let Some(ref store) = ctx.gateway_projects_store {
|
||||
// Parse: "<project> <key>=<value>"
|
||||
let mut parts = args.splitn(2, char::is_whitespace);
|
||||
let project = parts.next().unwrap_or("").trim();
|
||||
let setting = parts.next().unwrap_or("").trim();
|
||||
if project.is_empty() || setting.is_empty() {
|
||||
"Usage: `config <project> <key>=<value>`\n\
|
||||
Examples:\n\
|
||||
- `config myapp coder.model=opus`\n\
|
||||
- `config myapp default_qa=human`"
|
||||
.to_string()
|
||||
} else {
|
||||
match setting.split_once('=') {
|
||||
None => {
|
||||
"Usage: setting must be in `key=value` form, e.g. `coder.model=opus`"
|
||||
.to_string()
|
||||
}
|
||||
Some((key, value)) => {
|
||||
let host_path_opt = {
|
||||
let projects = store.read().await;
|
||||
projects.get(project).and_then(|e| e.host_path.clone())
|
||||
};
|
||||
match host_path_opt {
|
||||
None => format!(
|
||||
"Project `{project}` not found or has no host path configured."
|
||||
),
|
||||
Some(path) => {
|
||||
match super::super::super::new_project::apply_project_config(
|
||||
std::path::Path::new(&path),
|
||||
key.trim(),
|
||||
value.trim(),
|
||||
) {
|
||||
Ok(msg) => msg,
|
||||
Err(e) => format!("Config error: {e}"),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
"Gateway projects store unavailable.".to_string()
|
||||
};
|
||||
let html = markdown_to_html(&response);
|
||||
if let Ok(msg_id) = ctx
|
||||
.transport
|
||||
.send_message(&room_id_str, &response, &html)
|
||||
.await
|
||||
&& let Ok(event_id) = msg_id.parse()
|
||||
{
|
||||
ctx.bot_sent_event_ids.lock().await.insert(event_id);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
// Gateway-local commands and freeform text fall through to normal handling below.
|
||||
}
|
||||
|
||||
// In gateway mode, handle the "new project <name> [--stack <stack>]" command
|
||||
// to bootstrap a project container and register it with the gateway.
|
||||
if ctx.is_gateway()
|
||||
&& let Some(cmd) = super::super::super::new_project::extract_new_project_command(
|
||||
&user_message,
|
||||
&ctx.services.bot_name,
|
||||
ctx.matrix_user_id.as_str(),
|
||||
)
|
||||
{
|
||||
slog!(
|
||||
"[matrix-bot] Handling new project command from {sender}: name={:?} stack={:?} git_url={:?} adopt_path={:?}",
|
||||
cmd.name,
|
||||
cmd.stack,
|
||||
cmd.git_url,
|
||||
cmd.adopt_path,
|
||||
);
|
||||
let response = if let Some(ref store) = ctx.gateway_projects_store {
|
||||
super::super::super::new_project::handle_new_project(
|
||||
&cmd.name,
|
||||
cmd.stack.as_deref(),
|
||||
cmd.git_url.as_deref(),
|
||||
cmd.git_token.as_deref(),
|
||||
cmd.host_path.as_deref(),
|
||||
cmd.adopt_path.as_deref(),
|
||||
cmd.skip_config,
|
||||
store,
|
||||
&ctx.services.project_root,
|
||||
)
|
||||
.await
|
||||
} else {
|
||||
"Gateway projects store unavailable — cannot create project.".to_string()
|
||||
};
|
||||
let html = markdown_to_html(&response);
|
||||
if let Ok(msg_id) = ctx
|
||||
.transport
|
||||
.send_message(&room_id_str, &response, &html)
|
||||
.await
|
||||
&& let Ok(event_id) = msg_id.parse()
|
||||
{
|
||||
ctx.bot_sent_event_ids.lock().await.insert(event_id);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
// In gateway mode, handle the `project-rebuild <name>` command to rebuild a
|
||||
// project container and swap it without losing pipeline state.
|
||||
if ctx.is_gateway()
|
||||
&& let Some(rebuild_cmd) =
|
||||
super::super::super::project_rebuild::extract_project_rebuild_command(
|
||||
&user_message,
|
||||
&ctx.services.bot_name,
|
||||
ctx.matrix_user_id.as_str(),
|
||||
)
|
||||
{
|
||||
slog!(
|
||||
"[matrix-bot] Handling project-rebuild command from {sender}: name={:?} timeout={}s force={}",
|
||||
rebuild_cmd.name,
|
||||
rebuild_cmd.drain_timeout_secs,
|
||||
rebuild_cmd.force,
|
||||
);
|
||||
let response = if let Some(ref store) = ctx.gateway_projects_store {
|
||||
super::super::super::project_rebuild::handle_project_rebuild(
|
||||
&rebuild_cmd.name,
|
||||
rebuild_cmd.drain_timeout_secs,
|
||||
rebuild_cmd.force,
|
||||
store,
|
||||
&ctx.services.project_root,
|
||||
)
|
||||
.await
|
||||
} else {
|
||||
"Gateway projects store unavailable — cannot rebuild project.".to_string()
|
||||
};
|
||||
let html = markdown_to_html(&response);
|
||||
if let Ok(msg_id) = ctx
|
||||
.transport
|
||||
.send_message(&room_id_str, &response, &html)
|
||||
.await
|
||||
&& let Ok(event_id) = msg_id.parse()
|
||||
{
|
||||
ctx.bot_sent_event_ids.lock().await.insert(event_id);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
// In gateway mode, handle the `upgrade [<project>]` command to upgrade a
|
||||
// sled's binary in-container, streaming phase markers to the room.
|
||||
if ctx.is_gateway()
|
||||
&& let Some(upgrade_cmd) = super::super::super::sled_upgrade::extract_upgrade_command(
|
||||
&user_message,
|
||||
&ctx.services.bot_name,
|
||||
ctx.matrix_user_id.as_str(),
|
||||
)
|
||||
{
|
||||
match upgrade_cmd {
|
||||
super::super::super::sled_upgrade::UpgradeCommand::ListProjects => {
|
||||
slog!("[matrix-bot] Handling 'upgrade' list-projects from {sender}");
|
||||
let response = if let Some(ref store) = ctx.gateway_projects_store {
|
||||
super::super::super::sled_upgrade::handle_upgrade_list_projects(store).await
|
||||
} else {
|
||||
"Gateway projects store unavailable.".to_string()
|
||||
};
|
||||
let html = markdown_to_html(&response);
|
||||
if let Ok(msg_id) = ctx
|
||||
.transport
|
||||
.send_message(&room_id_str, &response, &html)
|
||||
.await
|
||||
&& let Ok(event_id) = msg_id.parse()
|
||||
{
|
||||
ctx.bot_sent_event_ids.lock().await.insert(event_id);
|
||||
}
|
||||
}
|
||||
super::super::super::sled_upgrade::UpgradeCommand::Upgrade { project } => {
|
||||
slog!("[matrix-bot] Handling 'upgrade {project}' from {sender}");
|
||||
if let Some(ref store) = ctx.gateway_projects_store {
|
||||
let transport = Arc::clone(&ctx.transport);
|
||||
let bot_sent = Arc::clone(&ctx.bot_sent_event_ids);
|
||||
let room = room_id_str.clone();
|
||||
|
||||
let response = super::super::super::sled_upgrade::handle_sled_upgrade(
|
||||
&project,
|
||||
store,
|
||||
ctx.gateway_port,
|
||||
|phase_msg| {
|
||||
let transport = Arc::clone(&transport);
|
||||
let bot_sent = Arc::clone(&bot_sent);
|
||||
let room = room.clone();
|
||||
async move {
|
||||
let html = markdown_to_html(&phase_msg);
|
||||
if let Ok(msg_id) =
|
||||
transport.send_message(&room, &phase_msg, &html).await
|
||||
&& let Ok(event_id) = msg_id.parse()
|
||||
{
|
||||
bot_sent.lock().await.insert(event_id);
|
||||
}
|
||||
}
|
||||
},
|
||||
)
|
||||
.await;
|
||||
|
||||
let html = markdown_to_html(&response);
|
||||
if let Ok(msg_id) = ctx
|
||||
.transport
|
||||
.send_message(&room_id_str, &response, &html)
|
||||
.await
|
||||
&& let Ok(event_id) = msg_id.parse()
|
||||
{
|
||||
ctx.bot_sent_event_ids.lock().await.insert(event_id);
|
||||
}
|
||||
} else {
|
||||
let msg = "Gateway projects store unavailable — cannot upgrade sled.";
|
||||
let html = markdown_to_html(msg);
|
||||
if let Ok(msg_id) = ctx.transport.send_message(&room_id_str, msg, &html).await
|
||||
&& let Ok(event_id) = msg_id.parse()
|
||||
{
|
||||
ctx.bot_sent_event_ids.lock().await.insert(event_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
// `health` — async subsystem health report (gateway + standalone).
|
||||
if extract_health_command(
|
||||
&user_message,
|
||||
&ctx.services.bot_name,
|
||||
ctx.matrix_user_id.as_str(),
|
||||
) {
|
||||
slog!("[matrix-bot] Handling 'health' from {sender}");
|
||||
let response = super::super::super::health::run_health_check(&ctx).await;
|
||||
let html = markdown_to_html(&response);
|
||||
if let Ok(msg_id) = ctx
|
||||
.transport
|
||||
.send_message(&room_id_str, &response, &html)
|
||||
.await
|
||||
&& let Ok(event_id) = msg_id.parse()
|
||||
{
|
||||
ctx.bot_sent_event_ids.lock().await.insert(event_id);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
// Check for bot-level commands (help, status, ambient, …) before invoking
|
||||
// the LLM. All commands are registered in commands.rs — no special-casing
|
||||
// needed here.
|
||||
@@ -472,6 +799,87 @@ pub(in crate::chat::transport::matrix::bot) async fn on_room_message(
|
||||
return;
|
||||
}
|
||||
|
||||
// In gateway mode, intercept "rebuild gateway" and route it through the
|
||||
// detached trampoline so the process swap survives any bash-tool kill cascade.
|
||||
if ctx.gateway_active_project.is_some()
|
||||
&& extract_rebuild_gateway_command(
|
||||
&user_message,
|
||||
&ctx.services.bot_name,
|
||||
ctx.matrix_user_id.as_str(),
|
||||
)
|
||||
{
|
||||
slog!("[matrix-bot] Handling 'rebuild gateway' command from {sender}");
|
||||
let ack = "Rebuilding gateway\u{2026} this may take a moment.";
|
||||
let ack_html = markdown_to_html(ack);
|
||||
if let Ok(msg_id) = ctx
|
||||
.transport
|
||||
.send_message(&room_id_str, ack, &ack_html)
|
||||
.await
|
||||
&& let Ok(event_id) = msg_id.parse()
|
||||
{
|
||||
ctx.bot_sent_event_ids.lock().await.insert(event_id);
|
||||
}
|
||||
let config_dir = ctx.services.project_root.clone();
|
||||
let gateway_port: u16 = ctx.gateway_port.unwrap_or(3000);
|
||||
match crate::gateway::rebuild::rebuild_gateway(&config_dir, gateway_port).await {
|
||||
Ok(()) => {
|
||||
// Trampoline is running detached — it kills this gateway and starts
|
||||
// the new one, which will post "gateway X.Y.Z ready" on startup.
|
||||
}
|
||||
Err(e) => {
|
||||
let msg = format!("Gateway rebuild failed: {e}");
|
||||
let html = markdown_to_html(&msg);
|
||||
if let Ok(msg_id) = ctx.transport.send_message(&room_id_str, &msg, &html).await
|
||||
&& let Ok(event_id) = msg_id.parse()
|
||||
{
|
||||
ctx.bot_sent_event_ids.lock().await.insert(event_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
// In gateway mode, intercept "rebuild gateway" before the plain "rebuild"
|
||||
// handler so the trampoline path is used instead of a direct re-exec.
|
||||
if ctx.gateway_port.is_some()
|
||||
&& super::super::super::rebuild::extract_rebuild_gateway_command(
|
||||
&user_message,
|
||||
&ctx.services.bot_name,
|
||||
ctx.matrix_user_id.as_str(),
|
||||
)
|
||||
.is_some()
|
||||
{
|
||||
slog!("[matrix-bot] Handling rebuild-gateway command from {sender}");
|
||||
let ack = "Rebuilding gateway… this may take a moment. \
|
||||
The gateway will announce itself when the new version is ready.";
|
||||
let ack_html = markdown_to_html(ack);
|
||||
if let Ok(msg_id) = ctx
|
||||
.transport
|
||||
.send_message(&room_id_str, ack, &ack_html)
|
||||
.await
|
||||
&& let Ok(event_id) = msg_id.parse()
|
||||
{
|
||||
ctx.bot_sent_event_ids.lock().await.insert(event_id);
|
||||
}
|
||||
let port = ctx.gateway_port.unwrap_or(3000);
|
||||
match crate::gateway::rebuild::rebuild_gateway(&ctx.services.project_root, port).await {
|
||||
Ok(()) => {
|
||||
// Trampoline is running — this gateway will be killed shortly.
|
||||
// No further reply needed; the new gateway posts "gateway X.Y.Z ready".
|
||||
}
|
||||
Err(e) => {
|
||||
let msg = format!("Gateway rebuild failed: {e}");
|
||||
let html = markdown_to_html(&msg);
|
||||
if let Ok(msg_id) = ctx.transport.send_message(&room_id_str, &msg, &html).await
|
||||
&& let Ok(event_id) = msg_id.parse()
|
||||
{
|
||||
ctx.bot_sent_event_ids.lock().await.insert(event_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
// Check for the rebuild command, which requires async agent and process ops
|
||||
// and cannot be handled by the sync command registry.
|
||||
if super::super::super::rebuild::extract_rebuild_command(
|
||||
@@ -529,16 +937,10 @@ pub(in crate::chat::transport::matrix::bot) async fn on_room_message(
|
||||
};
|
||||
|
||||
if cmd.eq_ignore_ascii_case("switch") {
|
||||
let response = if arg.is_empty() {
|
||||
let available = ctx.gateway_projects.join(", ");
|
||||
format!("Usage: `switch <project>`. Available projects: {available}")
|
||||
} else if ctx.gateway_projects.iter().any(|p| p == &arg) {
|
||||
*active_project.write().await = arg.clone();
|
||||
crate::crdt_state::write_gateway_active_project(&arg);
|
||||
format!("Switched to project **{arg}**.")
|
||||
let response = if let Some(ref store) = ctx.gateway_projects_store {
|
||||
eval_switch_command(&arg, active_project, store).await
|
||||
} else {
|
||||
let available = ctx.gateway_projects.join(", ");
|
||||
format!("Unknown project `{arg}`. Available: {available}")
|
||||
"Switch is unavailable: project store not initialised.".to_string()
|
||||
};
|
||||
let html = markdown_to_html(&response);
|
||||
if let Ok(msg_id) = ctx
|
||||
@@ -661,3 +1063,80 @@ pub(in crate::chat::transport::matrix::bot) async fn on_room_message(
|
||||
.chat_dispatcher
|
||||
.submit(room_id_str, user_message, factory);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Tests
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::eval_switch_command;
    use crate::service::gateway::config::ProjectEntry;
    use std::collections::BTreeMap;
    use tokio::sync::RwLock;

    /// Build a store entry that carries only an optional URL.
    fn entry_with_url(url: Option<&str>) -> ProjectEntry {
        ProjectEntry {
            url: url.map(str::to_string),
            auth_token: None,
            ssh_port: None,
            host_path: None,
        }
    }

    /// Regression test: `switch` consults the live store rather than a
    /// snapshot `Vec`.
    ///
    /// Starts from an empty store, adds a project at runtime and checks that
    /// the command then accepts it. This covers the bug where a stale
    /// `gateway_projects` list hid newly added projects until the bot was
    /// restarted.
    #[tokio::test]
    async fn switch_reads_live_store_after_runtime_insert() {
        let active = RwLock::new(String::from("huskies"));
        let store: RwLock<BTreeMap<String, ProjectEntry>> = RwLock::new(BTreeMap::new());

        // Nothing registered yet: the name must be rejected.
        let rejected = eval_switch_command("robot-studio", &active, &store).await;
        assert!(
            rejected.contains("Unknown project"),
            "empty store should not find robot-studio: {rejected}"
        );

        // Register the project while everything is running.
        {
            let mut projects = store.write().await;
            projects.insert(
                String::from("robot-studio"),
                entry_with_url(Some("http://localhost:3002")),
            );
        }

        // The same store instance now knows the project; switch must succeed.
        let accepted = eval_switch_command("robot-studio", &active, &store).await;
        assert_eq!(
            accepted, "Switched to project **robot-studio**.",
            "live store insert must be visible without restart: {accepted}"
        );
        assert_eq!(
            active.read().await.as_str(),
            "robot-studio",
            "active project must be updated after switch"
        );
    }

    /// An empty argument yields the usage text including the stored names.
    #[tokio::test]
    async fn switch_empty_arg_lists_available_projects() {
        let active = RwLock::new(String::from("huskies"));
        let mut seeded = BTreeMap::new();
        seeded.insert(String::from("huskies"), entry_with_url(None));
        let store: RwLock<BTreeMap<String, ProjectEntry>> = RwLock::new(seeded);

        let reply = eval_switch_command("", &active, &store).await;

        assert!(
            reply.contains("Usage:"),
            "empty arg should show usage: {reply}"
        );
        assert!(
            reply.contains("huskies"),
            "usage should list available projects: {reply}"
        );
    }
}
|
||||
|
||||
@@ -6,7 +6,7 @@ use matrix_sdk::ruma::OwnedRoomId;
|
||||
use matrix_sdk::{Client, LoopCtrl, config::SyncSettings};
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::sync::Arc;
|
||||
use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
|
||||
use std::sync::atomic::{AtomicBool, AtomicI64, AtomicU64, Ordering};
|
||||
use tokio::sync::Mutex as TokioMutex;
|
||||
use tokio::sync::{RwLock, watch};
|
||||
|
||||
@@ -28,12 +28,18 @@ pub async fn run_bot(
|
||||
watcher_tx: tokio::sync::broadcast::Sender<crate::io::watcher::WatcherEvent>,
|
||||
shutdown_rx: watch::Receiver<Option<crate::rebuild::ShutdownReason>>,
|
||||
gateway_active_project: Option<Arc<RwLock<String>>>,
|
||||
gateway_projects: Vec<String>,
|
||||
gateway_project_urls: std::collections::BTreeMap<String, String>,
|
||||
gateway_projects_store: Option<
|
||||
Arc<
|
||||
RwLock<
|
||||
std::collections::BTreeMap<String, crate::service::gateway::config::ProjectEntry>,
|
||||
>,
|
||||
>,
|
||||
>,
|
||||
timer_store: Arc<TimerStore>,
|
||||
gateway_event_rx: Option<
|
||||
tokio::sync::broadcast::Receiver<crate::service::gateway::GatewayStatusEvent>,
|
||||
>,
|
||||
gateway_port: Option<u16>,
|
||||
) -> Result<(), String> {
|
||||
let project_root = &services.project_root;
|
||||
let store_path = project_root.join(".huskies").join("matrix_store");
|
||||
@@ -176,7 +182,17 @@ pub async fn run_bot(
|
||||
let announce_room_ids = target_room_ids.clone();
|
||||
// Clone values needed by the gateway notification poller (only used in gateway mode).
|
||||
let poller_room_ids: Vec<String> = target_room_ids.iter().map(|r| r.to_string()).collect();
|
||||
let poller_project_urls = gateway_project_urls.clone();
|
||||
let poller_project_urls: std::collections::BTreeMap<String, String> =
|
||||
if let Some(ref store) = gateway_projects_store {
|
||||
store
|
||||
.read()
|
||||
.await
|
||||
.iter()
|
||||
.filter_map(|(name, entry)| entry.url.clone().map(|url| (name.clone(), url)))
|
||||
.collect()
|
||||
} else {
|
||||
std::collections::BTreeMap::new()
|
||||
};
|
||||
let poller_poll_interval = config.aggregated_notifications_poll_interval_secs;
|
||||
let poller_enabled = config.aggregated_notifications_enabled;
|
||||
|
||||
@@ -297,93 +313,11 @@ pub async fn run_bot(
|
||||
);
|
||||
}
|
||||
|
||||
// Subscribe to pipeline stage transitions and buffer compact audit lines
|
||||
// between Timmy's turns. Replay events (before == after stage label) are
|
||||
// silently dropped — only real transitions are recorded.
|
||||
let pending_pipeline_events: Arc<TokioMutex<Vec<String>>> =
|
||||
Arc::new(TokioMutex::new(Vec::new()));
|
||||
{
|
||||
use crate::pipeline_state::{format_audit_entry, stage_label, subscribe_transitions};
|
||||
let mut rx = subscribe_transitions();
|
||||
let buf = Arc::clone(&pending_pipeline_events);
|
||||
tokio::spawn(async move {
|
||||
loop {
|
||||
match rx.recv().await {
|
||||
Ok(fired) => {
|
||||
if stage_label(&fired.before) == stage_label(&fired.after) {
|
||||
continue;
|
||||
}
|
||||
let line = format_audit_entry(&fired);
|
||||
buf.lock().await.push(line);
|
||||
}
|
||||
Err(tokio::sync::broadcast::error::RecvError::Lagged(n)) => {
|
||||
slog!("[matrix-bot] pipeline event buffer lagged by {n} events");
|
||||
}
|
||||
Err(tokio::sync::broadcast::error::RecvError::Closed) => break,
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// Subscribe to gateway-side status events and buffer compact audit lines for
|
||||
// the LLM context.
|
||||
//
|
||||
// Investigation log (story 1078) — hypotheses ruled out:
|
||||
// (A) gateway_event_rx is None: impossible — spawn_gateway_bot always passes
|
||||
// Some(state.event_tx.clone()) in gateway mode (gateway/mod.rs:130).
|
||||
// (B) recv() never returns: buf task uses the ORIGINAL event_rx (subscribed
|
||||
// before Matrix init) so any events buffered during init are visible;
|
||||
// future events arrive normally via the shared broadcast channel.
|
||||
// (C) Different Arc: buf and ctx.pending_gateway_events are both clones of
|
||||
// the same Arc<TokioMutex<Vec<String>>> — writes in the buf task are
|
||||
// immediately visible to handle_message.
|
||||
// (D) format_drained_events empty on non-empty input: the function is
|
||||
// pure/tested; the drain slog in handle_message now makes the count
|
||||
// observable so we can confirm it is non-zero when events arrive.
|
||||
//
|
||||
// Bug fixed here: previously the buffer task held `event_rx.resubscribe()`,
|
||||
// which starts at the *current tail* (next unsent message) and silently
|
||||
// discards every event that arrived during the Matrix login / room-join /
|
||||
// cross-signing phase (~5–30 s window). The forwarder now gets the
|
||||
// resubscribed receiver (only needs live events going forward); the buffer
|
||||
// task holds the original `event_rx` so it drains the init-window backlog
|
||||
// on first poll.
|
||||
let pending_gateway_events: Arc<TokioMutex<Vec<String>>> =
|
||||
Arc::new(TokioMutex::new(Vec::new()));
|
||||
let gateway_event_rx_for_forwarder = if let Some(event_rx) = gateway_event_rx {
|
||||
// The forwarder only needs live (future) events — resubscribe is fine.
|
||||
let forwarder_rx = event_rx.resubscribe();
|
||||
// Buffer task: hold the *original* receiver so init-window events are
|
||||
// not lost. Silently accumulate compact audit lines for Timmy's context.
|
||||
{
|
||||
use crate::service::gateway::polling::format_gateway_audit_line;
|
||||
let buf = Arc::clone(&pending_gateway_events);
|
||||
slog!("[matrix-bot] subscribed to gateway events; buffer task starting");
|
||||
tokio::spawn(async move {
|
||||
let mut rx = event_rx;
|
||||
loop {
|
||||
match rx.recv().await {
|
||||
Ok(event) => {
|
||||
slog!(
|
||||
"[matrix-bot] buffered audit line for project={} id={}",
|
||||
event.project,
|
||||
event.event.timestamp_ms()
|
||||
);
|
||||
let line = format_gateway_audit_line(&event.project, &event.event);
|
||||
buf.lock().await.push(line);
|
||||
}
|
||||
Err(tokio::sync::broadcast::error::RecvError::Lagged(n)) => {
|
||||
slog!("[matrix-bot] gateway event buffer lagged by {n} events");
|
||||
}
|
||||
Err(tokio::sync::broadcast::error::RecvError::Closed) => break,
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
Some(forwarder_rx)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
// The forwarder only needs live (future) events — resubscribe is fine.
|
||||
// Pipeline-transition context is now delivered to the LLM via
|
||||
// `assemble_prompt_context` (CRDT event log) rather than these in-memory
|
||||
// buffers, so the buffer tasks are gone; only the forwarder remains.
|
||||
let gateway_event_rx_for_forwarder = gateway_event_rx.map(|rx| rx.resubscribe());
|
||||
|
||||
let ctx = BotContext {
|
||||
services,
|
||||
@@ -397,13 +331,12 @@ pub async fn run_bot(
|
||||
transport: Arc::clone(&transport),
|
||||
timer_store,
|
||||
gateway_active_project,
|
||||
gateway_projects,
|
||||
gateway_project_urls,
|
||||
pending_pipeline_events,
|
||||
pending_gateway_events,
|
||||
gateway_projects_store,
|
||||
handled_incoming_event_ids: Arc::new(TokioMutex::new(super::context::SeenEventIds::new(
|
||||
super::context::SEEN_EVENT_IDS_CAP,
|
||||
))),
|
||||
gateway_port,
|
||||
last_matrix_event_ms: Arc::new(AtomicI64::new(chrono::Utc::now().timestamp_millis())),
|
||||
};
|
||||
|
||||
slog!(
|
||||
@@ -478,7 +411,17 @@ pub async fn run_bot(
|
||||
// bot is online. This runs once per process start — the sync loop handles
|
||||
// reconnects internally so this code is never reached again on a network
|
||||
// blip or sync resumption.
|
||||
let announce_msg = format_startup_announcement(&announce_bot_name);
|
||||
//
|
||||
// When started by the trampoline the message is specialised:
|
||||
// - HUSKIES_TRAMPOLINE_STARTED=1 → "gateway X.Y.Z ready"
|
||||
// - HUSKIES_TRAMPOLINE_FAILURE=<reason> → rollback failure notice
|
||||
let announce_msg = if let Ok(reason) = std::env::var("HUSKIES_TRAMPOLINE_FAILURE") {
|
||||
super::format::format_gateway_rollback_announcement(&reason)
|
||||
} else if std::env::var("HUSKIES_TRAMPOLINE_STARTED").is_ok() {
|
||||
super::format::format_gateway_ready_announcement()
|
||||
} else {
|
||||
format_startup_announcement(&announce_bot_name)
|
||||
};
|
||||
let announce_html = markdown_to_html(&announce_msg);
|
||||
slog!("[matrix-bot] Sending startup announcement: {announce_msg}");
|
||||
for room_id in &announce_room_ids {
|
||||
@@ -498,81 +441,164 @@ pub async fn run_bot(
|
||||
const INITIAL_BACKOFF_SECS: u64 = 5;
|
||||
let backoff = Arc::new(AtomicU64::new(INITIAL_BACKOFF_SECS));
|
||||
let was_disconnected = Arc::new(AtomicBool::new(false));
|
||||
// Set to true by the sync callback when a 401/M_UNKNOWN_TOKEN is received.
|
||||
// Checked after the sync loop returns to decide whether to re-login.
|
||||
let needs_relogin = Arc::new(AtomicBool::new(false));
|
||||
|
||||
let sync_transport = Arc::clone(&transport);
|
||||
let sync_rooms: Vec<String> = announce_room_ids.iter().map(|r| r.to_string()).collect();
|
||||
let sync_bot_name = announce_bot_name.clone();
|
||||
|
||||
let backoff_cb = Arc::clone(&backoff);
|
||||
let was_disconnected_cb = Arc::clone(&was_disconnected);
|
||||
// Credentials needed for re-login; captured before any partial moves of `config`.
|
||||
let relogin_username = config.username.clone().unwrap_or_default();
|
||||
let relogin_password = config.password.clone().unwrap_or_default();
|
||||
|
||||
// Use sync_with_result_callback so transient errors (network blips, DNS
|
||||
// hiccups, temporary homeserver outages) are handled in the callback
|
||||
// rather than bubbling up as fatal errors. Fatal errors (HTTP 401/403)
|
||||
// still terminate the loop and propagate to the caller.
|
||||
client
|
||||
.sync_with_result_callback(SyncSettings::default(), move |result| {
|
||||
let backoff = Arc::clone(&backoff_cb);
|
||||
let was_disconnected = Arc::clone(&was_disconnected_cb);
|
||||
let recovery_transport = Arc::clone(&sync_transport);
|
||||
let recovery_rooms = sync_rooms.clone();
|
||||
let recovery_bot_name = sync_bot_name.clone();
|
||||
async move {
|
||||
match result {
|
||||
Ok(_) => {
|
||||
// If we previously lost the connection, announce recovery.
|
||||
if was_disconnected.swap(false, Ordering::Relaxed) {
|
||||
backoff.store(INITIAL_BACKOFF_SECS, Ordering::Relaxed);
|
||||
slog!("[matrix-bot] Reconnected to homeserver — resuming normal operation");
|
||||
let msg = format!(
|
||||
"⚡ **{recovery_bot_name}** reconnected to homeserver."
|
||||
);
|
||||
let html = format!(
|
||||
"<p>⚡ <strong>{recovery_bot_name}</strong> reconnected to homeserver.</p>"
|
||||
);
|
||||
for room_id in &recovery_rooms {
|
||||
if let Err(e) = recovery_transport
|
||||
.send_message(room_id, &msg, &html)
|
||||
.await
|
||||
{
|
||||
slog!(
|
||||
"[matrix-bot] Failed to send recovery notification to {room_id}: {e}"
|
||||
);
|
||||
// Outer loop: re-enters after a successful re-login to restart the sync.
|
||||
// Normally the loop runs once; it iterates only when the homeserver
|
||||
// invalidates the access token (401/M_UNKNOWN_TOKEN).
|
||||
loop {
|
||||
let backoff_cb = Arc::clone(&backoff);
|
||||
let was_disconnected_cb = Arc::clone(&was_disconnected);
|
||||
let needs_relogin_cb = Arc::clone(&needs_relogin);
|
||||
let iter_sync_transport = Arc::clone(&sync_transport);
|
||||
let iter_sync_rooms = sync_rooms.clone();
|
||||
let iter_sync_bot_name = sync_bot_name.clone();
|
||||
|
||||
// Use sync_with_result_callback so transient errors (network blips, DNS
|
||||
// hiccups, temporary homeserver outages) are handled in the callback
|
||||
// rather than bubbling up as fatal errors. Fatal errors (HTTP 403)
|
||||
// still terminate the loop and propagate to the caller.
|
||||
// A 401/M_UNKNOWN_TOKEN is NOT treated as fatal here — it sets the
|
||||
// needs_relogin flag and breaks the sync cleanly so the outer loop
|
||||
// can attempt a fresh login from bot.toml credentials.
|
||||
client
|
||||
.sync_with_result_callback(SyncSettings::default(), move |result| {
|
||||
let backoff = Arc::clone(&backoff_cb);
|
||||
let was_disconnected = Arc::clone(&was_disconnected_cb);
|
||||
let needs_relogin = Arc::clone(&needs_relogin_cb);
|
||||
let recovery_transport = Arc::clone(&iter_sync_transport);
|
||||
let recovery_rooms = iter_sync_rooms.clone();
|
||||
let recovery_bot_name = iter_sync_bot_name.clone();
|
||||
async move {
|
||||
match result {
|
||||
Ok(_) => {
|
||||
// If we previously lost the connection, announce recovery.
|
||||
if was_disconnected.swap(false, Ordering::Relaxed) {
|
||||
backoff.store(INITIAL_BACKOFF_SECS, Ordering::Relaxed);
|
||||
slog!("[matrix-bot] Reconnected to homeserver — resuming normal operation");
|
||||
let msg = format!(
|
||||
"⚡ **{recovery_bot_name}** reconnected to homeserver."
|
||||
);
|
||||
let html = format!(
|
||||
"<p>⚡ <strong>{recovery_bot_name}</strong> reconnected to homeserver.</p>"
|
||||
);
|
||||
for room_id in &recovery_rooms {
|
||||
if let Err(e) = recovery_transport
|
||||
.send_message(room_id, &msg, &html)
|
||||
.await
|
||||
{
|
||||
slog!(
|
||||
"[matrix-bot] Failed to send recovery notification to {room_id}: {e}"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(LoopCtrl::Continue)
|
||||
}
|
||||
Err(e) if is_unknown_token_error(&e) => {
|
||||
// 401/M_UNKNOWN_TOKEN: the homeserver rotated or
|
||||
// invalidated our access token. Break cleanly so
|
||||
// the outer loop can re-login from bot.toml.
|
||||
slog!("[matrix-bot] Sync got 401/M_UNKNOWN_TOKEN — queuing re-login");
|
||||
needs_relogin.store(true, Ordering::Relaxed);
|
||||
Ok(LoopCtrl::Break)
|
||||
}
|
||||
Err(e) if is_fatal_sync_error(&e) => Err(e),
|
||||
Err(e) => {
|
||||
// Transient error: log, back off, and let the stream retry.
|
||||
let delay = backoff.load(Ordering::Relaxed);
|
||||
slog!("[matrix-bot] Sync warning (retrying in {delay}s): {e}");
|
||||
was_disconnected.store(true, Ordering::Relaxed);
|
||||
tokio::time::sleep(std::time::Duration::from_secs(delay)).await;
|
||||
let new_delay = (delay * 2).min(MAX_BACKOFF_SECS);
|
||||
backoff.store(new_delay, Ordering::Relaxed);
|
||||
Ok(LoopCtrl::Continue)
|
||||
}
|
||||
Ok(LoopCtrl::Continue)
|
||||
}
|
||||
Err(e) if is_fatal_sync_error(&e) => Err(e),
|
||||
Err(e) => {
|
||||
// Transient error: log, back off, and let the stream retry.
|
||||
let delay = backoff.load(Ordering::Relaxed);
|
||||
slog!("[matrix-bot] Sync warning (retrying in {delay}s): {e}");
|
||||
was_disconnected.store(true, Ordering::Relaxed);
|
||||
tokio::time::sleep(std::time::Duration::from_secs(delay)).await;
|
||||
let new_delay = (delay * 2).min(MAX_BACKOFF_SECS);
|
||||
backoff.store(new_delay, Ordering::Relaxed);
|
||||
Ok(LoopCtrl::Continue)
|
||||
}
|
||||
}
|
||||
})
|
||||
.await
|
||||
.map_err(|e| format!("Matrix sync error: {e}"))?;
|
||||
|
||||
if !needs_relogin.swap(false, Ordering::Relaxed) {
|
||||
// Normal clean exit — not a re-login scenario.
|
||||
break;
|
||||
}
|
||||
|
||||
// --- Re-login flow: access token was invalidated by the homeserver ---
|
||||
// The SQLite store at `.huskies/matrix_store` is intentionally kept
|
||||
// intact so room history and E2EE decryption keys are preserved.
|
||||
// Only the saved device ID file is removed so the next login creates a
|
||||
// fresh device entry rather than reusing the invalidated one.
|
||||
slog!("[matrix-bot] Access token invalidated — re-logging in from bot.toml credentials");
|
||||
let _ = std::fs::remove_file(&device_id_path);
|
||||
|
||||
loop {
|
||||
match client
|
||||
.matrix_auth()
|
||||
.login_username(&relogin_username, &relogin_password)
|
||||
.initial_device_display_name("Huskies Bot")
|
||||
.await
|
||||
{
|
||||
Ok(response) => {
|
||||
let _ = std::fs::write(&device_id_path, &response.device_id);
|
||||
slog!(
|
||||
"[matrix-bot] Re-login successful; new device: {}",
|
||||
response.device_id
|
||||
);
|
||||
let msg =
|
||||
"[matrix-bot] Token rotated by homeserver; re-logged in as new device";
|
||||
let html = "<p>[matrix-bot] Token rotated by homeserver; re-logged in as new device</p>";
|
||||
for room_id in &sync_rooms {
|
||||
if let Err(e) = sync_transport.send_message(room_id, msg, html).await {
|
||||
slog!("[matrix-bot] Failed to send re-login notice to {room_id}: {e}");
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
Err(e) => {
|
||||
// Wrong password, homeserver down, etc. — log and keep
|
||||
// retrying every 30 s instead of dying fatally.
|
||||
slog!("[matrix-bot] Re-login failed: {e} — retrying in 30s");
|
||||
tokio::time::sleep(std::time::Duration::from_secs(30)).await;
|
||||
}
|
||||
}
|
||||
})
|
||||
.await
|
||||
.map_err(|e| format!("Matrix sync error: {e}"))?;
|
||||
}
|
||||
// Outer loop continues: restarts the Matrix sync with the new token.
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Returns `true` for errors that indicate the bot's session is permanently
|
||||
/// invalid (HTTP 401 Unauthorized or 403 Forbidden). All other errors —
|
||||
/// network failures, timeouts, transient 5xx responses — are considered
|
||||
/// recoverable and should be retried with exponential back-off.
|
||||
/// Returns `true` for errors that indicate the bot is permanently forbidden
|
||||
/// from the homeserver (HTTP 403). All other errors — network failures,
|
||||
/// timeouts, transient 5xx responses — are considered recoverable.
|
||||
///
|
||||
/// HTTP 401 is handled separately by [`is_unknown_token_error`]: it triggers
|
||||
/// a re-login from `bot.toml` credentials rather than a fatal shutdown.
|
||||
fn is_fatal_sync_error(e: &matrix_sdk::Error) -> bool {
|
||||
e.as_client_api_error()
|
||||
.map(|api_err| {
|
||||
let code = api_err.status_code.as_u16();
|
||||
code == 401 || code == 403
|
||||
})
|
||||
.map(|api_err| api_err.status_code.as_u16() == 403)
|
||||
.unwrap_or(false)
|
||||
}
|
||||
|
||||
/// Returns `true` when the homeserver returned 401 / M_UNKNOWN_TOKEN,
|
||||
/// indicating that the current access token has been invalidated.
|
||||
/// The bot should respond by re-logging in from `bot.toml` credentials
|
||||
/// rather than shutting down permanently.
|
||||
fn is_unknown_token_error(e: &matrix_sdk::Error) -> bool {
|
||||
e.as_client_api_error()
|
||||
.map(|api_err| api_err.status_code.as_u16() == 401)
|
||||
.unwrap_or(false)
|
||||
}
|
||||
|
||||
@@ -589,6 +615,14 @@ mod tests {
|
||||
assert!(!is_fatal_sync_error(&e));
|
||||
}
|
||||
|
||||
/// An I/O error converted into a `matrix_sdk::Error` must not be classified
/// as an unknown-token error.
#[test]
fn io_error_is_not_unknown_token() {
    let io_failure =
        std::io::Error::new(std::io::ErrorKind::ConnectionRefused, "connection refused");
    let sdk_error = matrix_sdk::Error::from(io_failure);
    assert!(!is_unknown_token_error(&sdk_error));
}
|
||||
|
||||
/// Exponential back-off must clamp at MAX_BACKOFF_SECS (300 s) regardless
|
||||
/// of how many consecutive failures occur.
|
||||
#[test]
|
||||
@@ -621,88 +655,39 @@ mod tests {
|
||||
assert_eq!(steps[3], 40);
|
||||
}
|
||||
|
||||
/// Regression test (story 1078): gateway broadcast events must reach
|
||||
/// `pending_gateway_events` and produce an `audit ts=…` line in the
|
||||
/// `format_drained_events` output that is prepended to Timmy's prompt.
|
||||
/// 401 must NOT be classified as fatal: the bot re-logs in rather than dying.
///
/// This test checks the status-code predicates in isolation (it mirrors the
/// comparisons rather than calling the real classifiers), which avoids
/// building the full ruma HTTP error hierarchy in a unit test.
#[test]
fn fatal_sync_error_excludes_401() {
    const UNAUTHORIZED: u16 = 401;
    const FORBIDDEN: u16 = 403;

    // Stand-in for the "fatal" rule: only 403 terminates the sync loop.
    fn is_fatal(code: u16) -> bool {
        code == FORBIDDEN
    }

    // Stand-in for the "unknown token" rule: only 401 triggers a re-login.
    fn triggers_relogin(code: u16) -> bool {
        code == UNAUTHORIZED
    }

    assert!(is_fatal(FORBIDDEN), "403 must be fatal");
    assert!(!is_fatal(UNAUTHORIZED), "401 must NOT be fatal");
    assert!(triggers_relogin(UNAUTHORIZED), "401 must trigger re-login");
    assert!(!triggers_relogin(FORBIDDEN), "403 must NOT trigger re-login");
}
|
||||
|
||||
/// Re-login retry interval must be exactly 30 s.
///
/// This protects against accidental changes to the constant: too short
/// would hammer the homeserver; too long would delay recovery.
///
/// NOTE(review): the test builds its own `Duration::from_secs(30)` instead of
/// reading the value used by the re-login loop, so it cannot detect a change
/// made there. Consider sharing a named constant between the two.
#[test]
fn relogin_retry_interval_is_30s() {
    let interval = std::time::Duration::from_secs(30);
    assert_eq!(
        interval.as_secs(),
        30,
        "re-login retry interval must be 30 s"
    );
}
|
||||
}
|
||||
|
||||
@@ -202,4 +202,20 @@ pub struct BotConfig {
|
||||
/// Defaults to 1 500 ms (1.5 s).
|
||||
#[serde(default = "default_coalesce_window_ms")]
|
||||
pub coalesce_window_ms: u64,
|
||||
|
||||
/// Git `user.name` to inject into project containers created by `new project`.
|
||||
///
|
||||
/// Passed as `GIT_USER_NAME` to the container entrypoint so agents can commit
|
||||
/// code with the correct author identity. Falls back to the host's
|
||||
/// `git config user.name` when absent.
|
||||
#[serde(default)]
|
||||
pub git_user_name: Option<String>,
|
||||
|
||||
/// Git `user.email` to inject into project containers created by `new project`.
|
||||
///
|
||||
/// Passed as `GIT_USER_EMAIL` to the container entrypoint so agents can commit
|
||||
/// code with the correct author identity. Falls back to the host's
|
||||
/// `git config user.email` when absent.
|
||||
#[serde(default)]
|
||||
pub git_user_email: Option<String>,
|
||||
}
|
||||
|
||||
@@ -0,0 +1,666 @@
|
||||
//! `health` chat command — surface gateway, sled, matrix, creds, and build-hash status.
|
||||
//!
|
||||
//! Runs one check per subsystem concurrently (each with a 5-second timeout) and
|
||||
//! returns a compact report: one line per subsystem with PASS / WARN / FAIL and a
|
||||
//! remediation hint on every non-PASS row. Output is capped at 20 lines; when
|
||||
//! more lines would be produced, the oldest WARN rows are dropped first.
|
||||
|
||||
use crate::chat::transport::matrix::bot::context::BotContext;
|
||||
use std::collections::BTreeMap;
|
||||
use std::sync::atomic::Ordering;
|
||||
use std::time::Duration;
|
||||
use tokio::time::timeout;
|
||||
|
||||
// ── Status ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
/// Health status for a single subsystem.
///
/// The enum is fieldless, so it derives `Copy` and `Eq` in addition to
/// `PartialEq`: values can be passed around and compared without cloning.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Status {
    /// Subsystem is operating normally.
    Pass,
    /// Subsystem is degraded but not fully broken.
    Warn,
    /// Subsystem has failed and needs intervention.
    Fail,
}
|
||||
|
||||
// ── HealthLine ─────────────────────────────────────────────────────────────────
|
||||
|
||||
/// One output row from the health check.
|
||||
#[derive(Debug, Clone)]
|
||||
struct HealthLine {
|
||||
subsystem: String,
|
||||
status: Status,
|
||||
/// Short description of why the check is non-PASS.
|
||||
detail: Option<String>,
|
||||
/// Remediation hint shown after " — " on WARN/FAIL rows.
|
||||
hint: Option<String>,
|
||||
}
|
||||
|
||||
impl HealthLine {
|
||||
fn pass(subsystem: impl Into<String>) -> Self {
|
||||
Self {
|
||||
subsystem: subsystem.into(),
|
||||
status: Status::Pass,
|
||||
detail: None,
|
||||
hint: None,
|
||||
}
|
||||
}
|
||||
|
||||
fn warn(
|
||||
subsystem: impl Into<String>,
|
||||
detail: impl Into<String>,
|
||||
hint: impl Into<String>,
|
||||
) -> Self {
|
||||
Self {
|
||||
subsystem: subsystem.into(),
|
||||
status: Status::Warn,
|
||||
detail: Some(detail.into()),
|
||||
hint: Some(hint.into()),
|
||||
}
|
||||
}
|
||||
|
||||
fn fail(
|
||||
subsystem: impl Into<String>,
|
||||
detail: impl Into<String>,
|
||||
hint: impl Into<String>,
|
||||
) -> Self {
|
||||
Self {
|
||||
subsystem: subsystem.into(),
|
||||
status: Status::Fail,
|
||||
detail: Some(detail.into()),
|
||||
hint: Some(hint.into()),
|
||||
}
|
||||
}
|
||||
|
||||
/// Format as a single Markdown-friendly line.
|
||||
fn format(&self) -> String {
|
||||
let status = match self.status {
|
||||
Status::Pass => "PASS",
|
||||
Status::Warn => "WARN",
|
||||
Status::Fail => "FAIL",
|
||||
};
|
||||
match (&self.detail, &self.hint) {
|
||||
(Some(d), Some(h)) => format!("{} {}: {} — {}", self.subsystem, status, d, h),
|
||||
(Some(d), None) => format!("{} {}: {}", self.subsystem, status, d),
|
||||
(None, None) => format!("{} {}", self.subsystem, status),
|
||||
(None, Some(h)) => format!("{} {}: — {}", self.subsystem, status, h),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ── Truncation ────────────────────────────────────────────────────────────────
|
||||
|
||||
/// Maximum number of output lines before truncation.
|
||||
const MAX_LINES: usize = 20;
|
||||
|
||||
/// Truncate to ≤ MAX_LINES by removing the oldest (first in order) WARN rows.
|
||||
fn truncate_lines(mut lines: Vec<HealthLine>) -> Vec<HealthLine> {
|
||||
while lines.len() > MAX_LINES {
|
||||
if let Some(pos) = lines.iter().position(|l| l.status == Status::Warn) {
|
||||
lines.remove(pos);
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
lines
|
||||
}
|
||||
|
||||
// ── Individual checks ────────────────────────────────────────────────────────
|
||||
|
||||
/// Check the `perm_rx` receiver — PASS when the permission listener holds the lock,
|
||||
/// FAIL when no task is holding it (listener has died or was never started).
|
||||
fn check_perm_rx(ctx: &BotContext) -> HealthLine {
|
||||
if ctx.services.perm_rx.try_lock().is_err() {
|
||||
HealthLine::pass("perm_rx")
|
||||
} else {
|
||||
HealthLine::fail("perm_rx", "listener not holding lock", "restart bot")
|
||||
}
|
||||
}
|
||||
|
||||
/// Check the Matrix sync loop by measuring the age of the last received event.
|
||||
///
|
||||
/// WARN after 60 s of silence, FAIL after 120 s. The timestamp is updated by
|
||||
/// `on_room_message` on every incoming event so receiving the health command
|
||||
/// itself resets the clock.
|
||||
fn check_matrix_sync(ctx: &BotContext) -> HealthLine {
|
||||
let last_ms = ctx.last_matrix_event_ms.load(Ordering::Relaxed);
|
||||
let age_secs = (chrono::Utc::now().timestamp_millis() - last_ms).max(0) / 1000;
|
||||
|
||||
if age_secs < 60 {
|
||||
HealthLine::pass("matrix-sync")
|
||||
} else if age_secs < 120 {
|
||||
HealthLine::warn(
|
||||
"matrix-sync",
|
||||
format!("no events in {age_secs}s"),
|
||||
"check sync loop — may be a quiet room",
|
||||
)
|
||||
} else {
|
||||
HealthLine::fail(
|
||||
"matrix-sync",
|
||||
format!("no events in {age_secs}s"),
|
||||
"sync loop may be dead — restart bot",
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
/// Check LLM credentials (`~/.claude/.credentials.json`).
|
||||
///
|
||||
/// FAIL if the file is missing or unreadable, FAIL if the access token is
|
||||
/// expired, WARN if it expires within the next 7 days.
|
||||
fn check_creds() -> HealthLine {
|
||||
match crate::llm::oauth::read_credentials() {
|
||||
Err(e) => HealthLine::fail("creds", e, "run `claude login`"),
|
||||
Ok(creds) => {
|
||||
let now_secs = std::time::SystemTime::now()
|
||||
.duration_since(std::time::UNIX_EPOCH)
|
||||
.unwrap_or_default()
|
||||
.as_secs();
|
||||
let expires_at = creds.claude_ai_oauth.expires_at;
|
||||
if expires_at < now_secs {
|
||||
HealthLine::fail("creds", "token expired", "run `claude login` to refresh")
|
||||
} else {
|
||||
let days_left = (expires_at - now_secs) / 86400;
|
||||
if days_left < 7 {
|
||||
HealthLine::warn(
|
||||
"creds",
|
||||
format!("token expires in {days_left}d"),
|
||||
"run `claude login` to refresh",
|
||||
)
|
||||
} else {
|
||||
HealthLine::pass("creds")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Compare the compile-time build hash against the current HEAD of the workspace.
|
||||
///
|
||||
/// WARN when master has advanced past the running binary's commit (a rebuild is
|
||||
/// available but not urgent). PASS when hashes match or HEAD cannot be read.
|
||||
async fn check_build_hash(project_root: &std::path::Path) -> HealthLine {
|
||||
let running = option_env!("BUILD_GIT_HASH").unwrap_or("unknown");
|
||||
|
||||
// Read current HEAD from git (non-blocking, run in a spawn_blocking call).
|
||||
let repo_root = project_root.to_path_buf();
|
||||
let head = tokio::task::spawn_blocking(move || {
|
||||
std::process::Command::new("git")
|
||||
.args(["rev-parse", "--short", "HEAD"])
|
||||
.current_dir(&repo_root)
|
||||
.output()
|
||||
.ok()
|
||||
.filter(|o| o.status.success())
|
||||
.and_then(|o| String::from_utf8(o.stdout).ok())
|
||||
.map(|s| s.trim().to_string())
|
||||
})
|
||||
.await
|
||||
.unwrap_or(None);
|
||||
|
||||
match head {
|
||||
None => HealthLine::pass("build-hash"),
|
||||
Some(ref head_hash) => {
|
||||
if running == "unknown" || head_hash == running {
|
||||
HealthLine::pass("build-hash")
|
||||
} else {
|
||||
HealthLine::warn(
|
||||
"build-hash",
|
||||
format!("running {running}, HEAD is {head_hash}"),
|
||||
"run `rebuild` to update",
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Check each registered sled's `/health` endpoint with a 5-second timeout.
|
||||
///
|
||||
/// Returns one [`HealthLine`] per sled. PASS when the sled responds with HTTP
|
||||
/// 2xx; FAIL when the request times out or returns an error status.
|
||||
async fn check_sleds(
|
||||
store: &tokio::sync::RwLock<BTreeMap<String, crate::service::gateway::config::ProjectEntry>>,
|
||||
) -> Vec<HealthLine> {
|
||||
let entries: Vec<(String, Option<String>)> = store
|
||||
.read()
|
||||
.await
|
||||
.iter()
|
||||
.map(|(n, e)| (n.clone(), e.url.clone()))
|
||||
.collect();
|
||||
|
||||
if entries.is_empty() {
|
||||
return vec![HealthLine::warn(
|
||||
"sled",
|
||||
"no sleds registered",
|
||||
"add projects to projects.toml",
|
||||
)];
|
||||
}
|
||||
|
||||
let client = reqwest::Client::new();
|
||||
let mut lines = Vec::new();
|
||||
|
||||
for (name, url_opt) in entries {
|
||||
let subsystem = format!("sled:{name}");
|
||||
let line = match url_opt {
|
||||
None => HealthLine::warn(subsystem, "no URL configured", "set url in projects.toml"),
|
||||
Some(url) => {
|
||||
let health_url = format!("{}/health", url.trim_end_matches('/'));
|
||||
let result = timeout(Duration::from_secs(5), client.get(&health_url).send()).await;
|
||||
match result {
|
||||
Err(_) => {
|
||||
HealthLine::fail(subsystem, "timed out", "check container is running")
|
||||
}
|
||||
Ok(Err(e)) => HealthLine::fail(
|
||||
subsystem,
|
||||
format!("unreachable: {}", short_error(&e.to_string())),
|
||||
"check container is running",
|
||||
),
|
||||
Ok(Ok(resp)) if resp.status().is_success() => HealthLine::pass(subsystem),
|
||||
Ok(Ok(resp)) => HealthLine::fail(
|
||||
subsystem,
|
||||
format!("HTTP {}", resp.status().as_u16()),
|
||||
"check container logs",
|
||||
),
|
||||
}
|
||||
}
|
||||
};
|
||||
lines.push(line);
|
||||
}
|
||||
|
||||
lines
|
||||
}
|
||||
|
||||
/// Check the gateway process: pidfile validity and (on macOS) binary codesign.
|
||||
///
|
||||
/// PASS when our PID is recorded in the pidfile. On macOS, also verifies that
|
||||
/// `~/bin/huskies-bin` has a valid ad-hoc signature; FAIL with a `script/local-release`
|
||||
/// hint when it does not.
|
||||
fn check_gateway_process() -> HealthLine {
|
||||
// Verify that the pidfile records our PID (i.e. this IS the live gateway).
|
||||
let pidfile_ok = check_pidfile_matches_self();
|
||||
|
||||
// On macOS, verify the installed binary is codesigned.
|
||||
#[cfg(target_os = "macos")]
|
||||
{
|
||||
if !check_codesign_macos() {
|
||||
return HealthLine::fail(
|
||||
"gateway-process",
|
||||
"codesign invalid",
|
||||
"run `script/local-release`",
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
if !pidfile_ok {
|
||||
return HealthLine::warn(
|
||||
"gateway-process",
|
||||
"pidfile missing or stale",
|
||||
"restart gateway with --gateway flag",
|
||||
);
|
||||
}
|
||||
|
||||
HealthLine::pass("gateway-process")
|
||||
}
|
||||
|
||||
/// Return `true` when `$HOME/.huskies/gateway.pid` exists and contains our PID.
|
||||
fn check_pidfile_matches_self() -> bool {
|
||||
let home = homedir::my_home().ok().flatten();
|
||||
let home = match home {
|
||||
Some(h) => h,
|
||||
None => return false,
|
||||
};
|
||||
let path = home.join(".huskies").join("gateway.pid");
|
||||
let content = std::fs::read_to_string(&path).unwrap_or_default();
|
||||
content.trim().parse::<u32>().unwrap_or(0) == std::process::id()
|
||||
}
|
||||
|
||||
/// On macOS, return `true` when `~/bin/huskies-bin` passes `codesign --verify`.
///
/// Falls back to the current executable when `$HOME` is unset or
/// `~/bin/huskies-bin` does not exist.
/// Returns `true` (assume ok) if the current executable cannot be resolved or
/// the `codesign` tool cannot be run.
#[cfg(target_os = "macos")]
fn check_codesign_macos() -> bool {
    // Prefer the installed binary; `var_os` keeps non-Unicode HOME values usable.
    let installed = std::env::var_os("HOME")
        .map(|home| {
            std::path::PathBuf::from(home)
                .join("bin")
                .join("huskies-bin")
        })
        .filter(|candidate| candidate.exists());

    let target = match installed {
        Some(path) => path,
        None => match std::env::current_exe() {
            Ok(path) => path,
            Err(_) => return true,
        },
    };

    // Pass the path as an OS string: converting through `to_str()` would turn a
    // non-UTF-8 path into an empty argument and yield a bogus verification failure.
    std::process::Command::new("codesign")
        .args(["--verify", "--quiet"])
        .arg(&target)
        .output()
        .map(|out| out.status.success())
        .unwrap_or(true)
}
|
||||
|
||||
// ── Entry point ────────────────────────────────────────────────────────────────
|
||||
|
||||
/// Run all health checks and return a formatted Markdown report (≤ 20 lines).
|
||||
///
|
||||
/// Gateway-specific checks (gateway-process, per-sled probes) are included
|
||||
/// only when running in gateway mode. All other checks run in every mode.
|
||||
pub async fn run_health_check(ctx: &BotContext) -> String {
|
||||
let mut lines: Vec<HealthLine> = Vec::new();
|
||||
|
||||
// Gateway-only checks
|
||||
if ctx.is_gateway() {
|
||||
lines.push(check_gateway_process());
|
||||
if let Some(ref store) = ctx.gateway_projects_store {
|
||||
lines.extend(check_sleds(store).await);
|
||||
}
|
||||
}
|
||||
|
||||
// Shared checks — run concurrently where possible.
|
||||
let perm_line = check_perm_rx(ctx);
|
||||
let sync_line = check_matrix_sync(ctx);
|
||||
let creds_line = check_creds();
|
||||
let hash_line = check_build_hash(&ctx.services.project_root).await;
|
||||
|
||||
lines.push(perm_line);
|
||||
lines.push(sync_line);
|
||||
lines.push(creds_line);
|
||||
lines.push(hash_line);
|
||||
|
||||
let lines = truncate_lines(lines);
|
||||
lines
|
||||
.iter()
|
||||
.map(|l| l.format())
|
||||
.collect::<Vec<_>>()
|
||||
.join("\n")
|
||||
}
|
||||
|
||||
// ── Utilities ────────────────────────────────────────────────────────────────
|
||||
|
||||
/// Keep at most the first 60 characters of `s` so error text stays compact.
///
/// Counts Unicode scalar values (`char`s), not bytes, so multi-byte text is
/// never split in the middle of a character.
fn short_error(s: &str) -> String {
    const LIMIT: usize = 60;
    // Byte offset where the 61st character starts, or the full length if shorter.
    let cut = s.char_indices().nth(LIMIT).map_or(s.len(), |(idx, _)| idx);
    s[..cut].to_owned()
}
|
||||
|
||||
// ── Tests ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::services::Services;
    use std::collections::{HashMap, HashSet};
    use std::sync::Arc;
    use std::sync::atomic::AtomicI64;
    use tokio::sync::Mutex as TokioMutex;

    // -- Fixtures -------------------------------------------------------------

    /// Assemble a `BotContext` with inert collaborators around `services`.
    fn make_test_ctx(services: Arc<Services>) -> BotContext {
        let transport = Arc::new(crate::chat::transport::whatsapp::WhatsAppTransport::new(
            "test-phone".to_string(),
            "test-token".to_string(),
            "pipeline_notification".to_string(),
        ));
        let timer_store = Arc::new(crate::service::timer::TimerStore::load(
            std::path::PathBuf::from("/tmp/timers-health.json"),
        ));
        let seen_events = Arc::new(TokioMutex::new(
            crate::chat::transport::matrix::bot::context::SeenEventIds::new(
                crate::chat::transport::matrix::bot::context::SEEN_EVENT_IDS_CAP,
            ),
        ));

        BotContext {
            services,
            matrix_user_id: "@bot:example.com".parse().unwrap(),
            target_room_ids: vec![],
            allowed_users: vec![],
            history: Arc::new(TokioMutex::new(HashMap::new())),
            history_size: 20,
            bot_sent_event_ids: Arc::new(TokioMutex::new(HashSet::new())),
            htop_sessions: Arc::new(TokioMutex::new(HashMap::new())),
            transport,
            timer_store,
            gateway_active_project: None,
            gateway_projects_store: None,
            handled_incoming_event_ids: seen_events,
            gateway_port: None,
            last_matrix_event_ms: Arc::new(AtomicI64::new(chrono::Utc::now().timestamp_millis())),
        }
    }

    /// Services built by the crate's own test constructor, rooted at `/tmp`.
    fn default_services() -> Arc<Services> {
        Services::new_test(std::path::PathBuf::from("/tmp"), "bot".to_string())
    }

    // -- HealthLine formatting ------------------------------------------------

    #[test]
    fn pass_line_formats_without_detail() {
        assert_eq!(HealthLine::pass("perm_rx").format(), "perm_rx PASS");
    }

    #[test]
    fn fail_line_formats_with_detail_and_hint() {
        let rendered = HealthLine::fail(
            "gateway-process",
            "codesign invalid",
            "run script/local-release",
        )
        .format();
        assert_eq!(
            rendered,
            "gateway-process FAIL: codesign invalid — run script/local-release"
        );
    }

    #[test]
    fn warn_line_formats_with_detail_and_hint() {
        let rendered =
            HealthLine::warn("build-hash", "running abc, HEAD is def", "run rebuild").format();
        assert_eq!(
            rendered,
            "build-hash WARN: running abc, HEAD is def — run rebuild"
        );
    }

    // -- Truncation -----------------------------------------------------------

    #[test]
    fn truncate_drops_oldest_warn_first() {
        // A known WARN at position 0 and a known FAIL at position 1, followed
        // by 22 generated lines where every third one is a FAIL.
        let mut input = vec![
            HealthLine::warn("oldest-warn", "stale", "restart"),
            HealthLine::fail("important-fail", "broken", "fix"),
        ];
        input.extend((0..22).map(|i| {
            if i % 3 == 0 {
                HealthLine::fail(format!("sled:{i}"), "down", "fix it")
            } else {
                HealthLine::warn(format!("check:{i}"), "slow", "investigate")
            }
        }));

        let count_fails =
            |ls: &[HealthLine]| ls.iter().filter(|l| l.status == Status::Fail).count();

        let output = truncate_lines(input.clone());
        assert!(
            output.len() <= MAX_LINES,
            "output must be ≤ {MAX_LINES} lines"
        );

        // FAILs must be preserved.
        assert_eq!(
            count_fails(&output),
            count_fails(&input).min(MAX_LINES),
            "all FAIL lines must be kept when they fit"
        );
    }

    #[test]
    fn truncate_noop_when_under_limit() {
        let input: Vec<HealthLine> = (0..5).map(|i| HealthLine::pass(format!("s{i}"))).collect();
        assert_eq!(truncate_lines(input).len(), 5);
    }

    #[test]
    fn truncate_stops_at_fails_when_no_warns_left() {
        // 25 FAIL lines: there is no WARN to remove, so nothing is dropped.
        let all_fails: Vec<HealthLine> = (0..25)
            .map(|i| HealthLine::fail(format!("s{i}"), "broken", "fix"))
            .collect();
        let output = truncate_lines(all_fails);
        assert_eq!(output.len(), 25, "FAILs are never dropped by truncation");
    }

    // -- perm_rx check --------------------------------------------------------

    #[tokio::test]
    async fn perm_rx_pass_when_locked() {
        let (_perm_tx, perm_rx) = tokio::sync::mpsc::unbounded_channel();
        let perm_rx_arc = Arc::new(TokioMutex::new(perm_rx));

        // Hold the lock ourselves to stand in for the permission listener.
        let _guard = perm_rx_arc.try_lock().unwrap();

        // Minimal services bundle pointing at the locked receiver.
        let services = Arc::new(Services {
            project_root: std::path::PathBuf::from("/tmp"),
            agents: Arc::new(crate::agents::AgentPool::new_test(3000)),
            bot_name: "test".to_string(),
            bot_user_id: "@bot:test".to_string(),
            ambient_rooms: Arc::new(std::sync::Mutex::new(HashSet::new())),
            perm_rx: Arc::clone(&perm_rx_arc),
            pending_perm_replies: Arc::new(TokioMutex::new(HashMap::new())),
            permission_timeout_secs: 120,
            status: Arc::new(crate::service::status::StatusBroadcaster::new()),
            chat_dispatcher: Arc::new(crate::chat::dispatcher::ChatDispatcher::new(1_500)),
        });
        let ctx = make_test_ctx(services);

        let line = check_perm_rx(&ctx);
        assert_eq!(
            line.status,
            Status::Pass,
            "perm_rx should PASS when a task holds the lock"
        );
    }

    #[tokio::test]
    async fn perm_rx_fail_when_unlocked() {
        let (_perm_tx, perm_rx) = tokio::sync::mpsc::unbounded_channel();
        // Nobody takes the lock on this receiver.
        let perm_rx_arc = Arc::new(TokioMutex::new(perm_rx));

        let services = Arc::new(Services {
            project_root: std::path::PathBuf::from("/tmp"),
            agents: Arc::new(crate::agents::AgentPool::new_test(3000)),
            bot_name: "test".to_string(),
            bot_user_id: "@bot:test".to_string(),
            ambient_rooms: Arc::new(std::sync::Mutex::new(HashSet::new())),
            perm_rx: Arc::clone(&perm_rx_arc),
            pending_perm_replies: Arc::new(TokioMutex::new(HashMap::new())),
            permission_timeout_secs: 120,
            status: Arc::new(crate::service::status::StatusBroadcaster::new()),
            chat_dispatcher: Arc::new(crate::chat::dispatcher::ChatDispatcher::new(1_500)),
        });
        let ctx = make_test_ctx(services);

        let line = check_perm_rx(&ctx);
        assert_eq!(
            line.status,
            Status::Fail,
            "perm_rx should FAIL when no task holds the lock"
        );
    }

    // -- matrix-sync check ----------------------------------------------------

    #[tokio::test]
    async fn matrix_sync_pass_when_recent() {
        let ctx = make_test_ctx(default_services());
        // Pretend an event arrived just now.
        let now_ms = chrono::Utc::now().timestamp_millis();
        ctx.last_matrix_event_ms.store(now_ms, Ordering::Relaxed);

        assert_eq!(check_matrix_sync(&ctx).status, Status::Pass);
    }

    #[tokio::test]
    async fn matrix_sync_fail_when_stale() {
        let ctx = make_test_ctx(default_services());
        // Pretend the last event is 200 seconds old.
        let stale_ms = chrono::Utc::now().timestamp_millis() - 200_000;
        ctx.last_matrix_event_ms.store(stale_ms, Ordering::Relaxed);

        let line = check_matrix_sync(&ctx);
        assert_eq!(line.status, Status::Fail);

        let detail = line.detail.as_deref().unwrap_or("");
        assert!(
            detail.contains("200s") || detail.contains("s"),
            "detail should mention age in seconds"
        );
    }

    // -- creds check ----------------------------------------------------------

    #[test]
    fn creds_fail_when_file_missing() {
        // The outcome depends on the machine running the test, so no status is
        // asserted; the check only has to return and format without panicking.
        let _ = check_creds().format();
    }

    // -- build_hash check -----------------------------------------------------

    #[tokio::test]
    async fn build_hash_pass_when_git_unavailable() {
        // `/tmp/nonexistent` is not a git repo; the check must return a line
        // (whatever its status) instead of panicking.
        let line = check_build_hash(std::path::Path::new("/tmp/nonexistent")).await;
        let _ = line.format();
    }

    // -- health command registration ------------------------------------------

    #[test]
    fn health_command_registered_in_commands() {
        let registered = crate::chat::commands::commands()
            .iter()
            .any(|c| c.name == "health");
        assert!(registered, "health must be registered in commands()");
    }

    #[test]
    fn health_command_has_description() {
        let cmds = crate::chat::commands::commands();
        let health = cmds.iter().find(|c| c.name == "health").unwrap();
        assert!(!health.description.is_empty());
    }
}
|
||||
@@ -25,14 +25,22 @@ pub mod commands;
|
||||
pub(crate) mod config;
|
||||
/// Story deletion command — handles `!delete` bot commands to remove work items.
|
||||
pub mod delete;
|
||||
/// `health` chat command — surface gateway, sled, matrix, creds, and build-hash status.
|
||||
pub mod health;
|
||||
/// htop-style agent monitor command — renders a live process table in Matrix.
|
||||
pub mod htop;
|
||||
/// `new project <name>` chat command — Phase 1 gateway project bootstrap.
|
||||
pub mod new_project;
|
||||
/// `project-rebuild <name>` chat command — rebuild Docker image, swap container, preserve state.
|
||||
pub mod project_rebuild;
|
||||
/// Rebuild command — triggers a server rebuild/restart via a bot command.
|
||||
pub mod rebuild;
|
||||
/// Reset command — handles `!reset` bot commands to restart the server state.
|
||||
pub mod reset;
|
||||
/// rmtree command — handles `!rmtree` bot commands to remove worktrees.
|
||||
pub mod rmtree;
|
||||
/// `upgrade [<project>]` gateway chat command — streaming per-sled binary upgrade.
|
||||
pub mod sled_upgrade;
|
||||
/// Start command — handles `!start` bot commands to launch agents on stories.
|
||||
pub mod start;
|
||||
/// Matrix `ChatTransport` implementation wrapping the Matrix SDK client.
|
||||
@@ -79,12 +87,18 @@ pub fn spawn_bot(
|
||||
services: Arc<Services>,
|
||||
shutdown_rx: watch::Receiver<Option<ShutdownReason>>,
|
||||
gateway_active_project: Option<Arc<RwLock<String>>>,
|
||||
gateway_projects: Vec<String>,
|
||||
gateway_project_urls: std::collections::BTreeMap<String, String>,
|
||||
gateway_projects_store: Option<
|
||||
Arc<
|
||||
RwLock<
|
||||
std::collections::BTreeMap<String, crate::service::gateway::config::ProjectEntry>,
|
||||
>,
|
||||
>,
|
||||
>,
|
||||
timer_store: Arc<TimerStore>,
|
||||
gateway_event_rx: Option<
|
||||
tokio::sync::broadcast::Receiver<crate::service::gateway::GatewayStatusEvent>,
|
||||
>,
|
||||
gateway_port: Option<u16>,
|
||||
) -> Option<tokio::task::AbortHandle> {
|
||||
let config = match BotConfig::load(project_root) {
|
||||
Some(c) => c,
|
||||
@@ -120,10 +134,10 @@ pub fn spawn_bot(
|
||||
watcher_tx,
|
||||
shutdown_rx,
|
||||
gateway_active_project,
|
||||
gateway_projects,
|
||||
gateway_project_urls,
|
||||
gateway_projects_store,
|
||||
timer_store,
|
||||
gateway_event_rx,
|
||||
gateway_port,
|
||||
)
|
||||
.await
|
||||
{
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,605 @@
|
||||
//! `project-rebuild <name>` chat command — rebuild Docker image, swap container, preserve state.
|
||||
//!
|
||||
//! Usage: `{bot} project-rebuild <name> [--timeout <secs>] [--force]`
|
||||
//!
|
||||
//! Steps performed:
|
||||
//! 1. Validate the project exists and has a `host_path` configured.
|
||||
//! 2. Check for in-flight coder/merge work (active `claude` processes in the container).
|
||||
//! Wait up to `--timeout` seconds for them to exit. Refuse if still active.
|
||||
//! 3. Build a new Docker image from the project's `Dockerfile.fragment` (if present).
|
||||
//! 4. Stop and remove the old container.
|
||||
//! 5. Start a new container from the fresh image, mounting the same host volume so
|
||||
//! `pipeline.db` and all CRDT state survive untouched.
|
||||
//! 6. Re-register the project in the gateway (same URL — port is preserved).
|
||||
//!
|
||||
//! On success the reply names the new image hash and the new container ID.
|
||||
//! On failure the reply names the step that failed and the recovery path.
|
||||
|
||||
use crate::service::gateway::config::ProjectEntry;
|
||||
use crate::service::gateway::io::save_config;
|
||||
use std::collections::BTreeMap;
|
||||
use std::path::Path;
|
||||
use std::sync::Arc;
|
||||
use tokio::sync::RwLock;
|
||||
|
||||
/// Default seconds to wait for in-flight work to drain before refusing.
const DEFAULT_DRAIN_TIMEOUT_SECS: u64 = 60;

/// A parsed `project-rebuild <name>` command.
///
/// Plain value type: every field is cheaply clonable and totally comparable,
/// so `Clone` and `Eq` are derived alongside `Debug`/`PartialEq`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ProjectRebuildCommand {
    /// Name of the project to rebuild.
    pub name: String,
    /// Seconds to wait for agents to drain (0 = skip check).
    pub drain_timeout_secs: u64,
    /// If `true`, skip the drain check entirely.
    pub force: bool,
}
|
||||
|
||||
/// Parse a `project-rebuild <name> [--timeout <secs>] [--force]` command from a raw
|
||||
/// Matrix message body.
|
||||
///
|
||||
/// Strips the bot mention prefix and checks for the `project-rebuild` keyword.
|
||||
/// Returns `None` when the message is not a project-rebuild command.
|
||||
pub fn extract_project_rebuild_command(
|
||||
message: &str,
|
||||
bot_name: &str,
|
||||
bot_user_id: &str,
|
||||
) -> Option<ProjectRebuildCommand> {
|
||||
let stripped = crate::chat::util::strip_bot_mention(message, bot_name, bot_user_id);
|
||||
let trimmed = stripped
|
||||
.trim()
|
||||
.trim_start_matches(|c: char| !c.is_alphanumeric());
|
||||
|
||||
let rest = if let Some(r) = trimmed.strip_prefix("project-rebuild") {
|
||||
r.trim()
|
||||
} else {
|
||||
return None;
|
||||
};
|
||||
|
||||
let mut parts = rest.split_whitespace();
|
||||
let name = match parts.next() {
|
||||
Some(n) if !n.starts_with("--") => n.to_string(),
|
||||
_ => return None,
|
||||
};
|
||||
|
||||
let mut drain_timeout_secs = DEFAULT_DRAIN_TIMEOUT_SECS;
|
||||
let mut force = false;
|
||||
|
||||
let remaining: Vec<&str> = parts.collect();
|
||||
let mut i = 0;
|
||||
while i < remaining.len() {
|
||||
match remaining[i] {
|
||||
"--timeout" if i + 1 < remaining.len() => {
|
||||
drain_timeout_secs = remaining[i + 1]
|
||||
.parse()
|
||||
.unwrap_or(DEFAULT_DRAIN_TIMEOUT_SECS);
|
||||
i += 2;
|
||||
}
|
||||
"--force" => {
|
||||
force = true;
|
||||
i += 1;
|
||||
}
|
||||
_ => {
|
||||
i += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Some(ProjectRebuildCommand {
|
||||
name,
|
||||
drain_timeout_secs,
|
||||
force,
|
||||
})
|
||||
}
|
||||
|
||||
/// Rebuild a project's Docker image, swap the container, and preserve all state.
|
||||
///
|
||||
/// On success returns a message naming the new image hash and container ID.
|
||||
/// On failure returns a message naming the failed step and the recovery path.
|
||||
pub async fn handle_project_rebuild(
|
||||
name: &str,
|
||||
drain_timeout_secs: u64,
|
||||
force: bool,
|
||||
projects_store: &Arc<RwLock<BTreeMap<String, ProjectEntry>>>,
|
||||
config_dir: &Path,
|
||||
) -> String {
|
||||
// ── 1. Validate project ──────────────────────────────────────────────────
|
||||
let (host_path_str, project_url, ssh_port_opt) = {
|
||||
let projects = projects_store.read().await;
|
||||
let entry = match projects.get(name) {
|
||||
Some(e) => e.clone(),
|
||||
None => {
|
||||
let available: Vec<&String> = projects.keys().collect();
|
||||
return format!(
|
||||
"Project `{name}` not found. Available: {}",
|
||||
available
|
||||
.iter()
|
||||
.map(|s| s.as_str())
|
||||
.collect::<Vec<_>>()
|
||||
.join(", ")
|
||||
);
|
||||
}
|
||||
};
|
||||
match entry.host_path.clone() {
|
||||
Some(p) => (p, entry.url.clone(), entry.ssh_port),
|
||||
None => {
|
||||
return format!(
|
||||
"Project `{name}` has no `host_path` configured — cannot rebuild.\n\
|
||||
Only projects created with `new project --adopt` or `adopt_project` \
|
||||
support the `project-rebuild` command."
|
||||
);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
let host_path = Path::new(&host_path_str);
|
||||
if !host_path.exists() {
|
||||
return format!(
|
||||
"Host path `{host_path_str}` does not exist on disk — \
|
||||
cannot rebuild project `{name}`."
|
||||
);
|
||||
}
|
||||
|
||||
// ── 2. Drain check ───────────────────────────────────────────────────────
|
||||
let container_name = format!("huskies-{name}");
|
||||
if !force
|
||||
&& drain_timeout_secs > 0
|
||||
&& let Some(err_msg) = wait_for_drain(&container_name, drain_timeout_secs).await
|
||||
{
|
||||
return format!(
|
||||
"Project `{name}` rebuild aborted: {err_msg}\n\
|
||||
Pass `--force` to skip the drain check or `--timeout 0` to not wait."
|
||||
);
|
||||
}
|
||||
|
||||
// ── 3. Build new image ───────────────────────────────────────────────────
|
||||
let stacks_dir = config_dir.join("docker").join("stacks");
|
||||
let (resolved_stack, _warnings) = super::new_project::detect_stack(host_path, &stacks_dir);
|
||||
let base_image = super::new_project::image_for_stack(resolved_stack.as_deref());
|
||||
|
||||
let image = match super::new_project::build_project_image(host_path, &base_image, name).await {
|
||||
Ok(img) => img,
|
||||
Err(e) => {
|
||||
return format!(
|
||||
"Rebuild failed at **image build** step.\n\
|
||||
Error: {e}\n\n\
|
||||
Recovery: fix `.huskies/Dockerfile.fragment` in `{host_path_str}` then retry."
|
||||
);
|
||||
}
|
||||
};
|
||||
|
||||
let image_hash = get_image_id(&image)
|
||||
.await
|
||||
.unwrap_or_else(|_| "unknown".to_string());
|
||||
let image_short: String = image_hash.chars().take(19).collect();
|
||||
|
||||
// ── 4. Stop and remove old container ────────────────────────────────────
|
||||
if let Err(e) = docker_stop(&container_name).await {
|
||||
crate::slog!("[project-rebuild] stop '{container_name}': {e} (may already be stopped)");
|
||||
}
|
||||
if let Err(e) = docker_rm(&container_name).await {
|
||||
return format!(
|
||||
"Rebuild failed at **container remove** step.\n\
|
||||
Error: {e}\n\n\
|
||||
Recovery: run `docker rm {container_name}` manually then retry."
|
||||
);
|
||||
}
|
||||
|
||||
// ── 5. Start new container ───────────────────────────────────────────────
|
||||
let port = project_url
|
||||
.as_deref()
|
||||
.and_then(|u| u.rsplit(':').next())
|
||||
.and_then(|p| p.parse::<u16>().ok())
|
||||
.unwrap_or(3001);
|
||||
let ssh_port = ssh_port_opt.unwrap_or(2222);
|
||||
|
||||
let home = std::env::var("HOME").unwrap_or_else(|_| "/home/huskies".to_string());
|
||||
let pub_key_path = std::path::PathBuf::from(&home)
|
||||
.join(".huskies")
|
||||
.join(name)
|
||||
.join("id_ed25519.pub");
|
||||
let pubkey = match tokio::fs::read_to_string(&pub_key_path).await {
|
||||
Ok(k) => k.trim().to_string(),
|
||||
Err(e) => {
|
||||
return format!(
|
||||
"Rebuild failed at **SSH key read** step.\n\
|
||||
Error: {e}\n\
|
||||
Expected public key at `{}`.\n\n\
|
||||
Recovery: run `ssh-keygen -t ed25519 -N '' -f {home}/.huskies/{name}/id_ed25519` \
|
||||
then retry.",
|
||||
pub_key_path.display()
|
||||
);
|
||||
}
|
||||
};
|
||||
|
||||
let credentials_file = std::path::PathBuf::from(&home)
|
||||
.join(".claude")
|
||||
.join(".credentials.json");
|
||||
let creds_opt = if credentials_file.exists() {
|
||||
Some(credentials_file.as_path())
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let (git_user_name, git_user_email) =
|
||||
super::new_project::resolve_git_identity(config_dir).await;
|
||||
|
||||
let mut docker_args = super::new_project::project_docker_run_args(
|
||||
&container_name,
|
||||
port,
|
||||
ssh_port,
|
||||
&pubkey,
|
||||
&git_user_name,
|
||||
&git_user_email,
|
||||
creds_opt,
|
||||
&super::new_project::resolve_gateway_url(),
|
||||
);
|
||||
|
||||
docker_args.push("-v".into());
|
||||
docker_args.push(format!("{host_path_str}:/workspace"));
|
||||
|
||||
let host_ssh_dir = std::path::PathBuf::from(&home).join(".ssh");
|
||||
for key_name in &["id_ed25519", "id_rsa"] {
|
||||
let key_path = host_ssh_dir.join(key_name);
|
||||
if key_path.exists() {
|
||||
docker_args.push("-v".into());
|
||||
docker_args.push(format!(
|
||||
"{}:/home/huskies/.ssh/{key_name}:ro",
|
||||
key_path.display()
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
docker_args.push("--restart".into());
|
||||
docker_args.push("unless-stopped".into());
|
||||
docker_args.push(image.clone());
|
||||
docker_args.push("huskies".into());
|
||||
docker_args.push("/workspace".into());
|
||||
|
||||
let run_output = tokio::process::Command::new("docker")
|
||||
.args(&docker_args)
|
||||
.output()
|
||||
.await;
|
||||
|
||||
let container_id = match run_output {
|
||||
Ok(out) if out.status.success() => String::from_utf8_lossy(&out.stdout).trim().to_string(),
|
||||
Ok(out) => {
|
||||
let stderr = String::from_utf8_lossy(&out.stderr).trim().to_string();
|
||||
return format!(
|
||||
"Rebuild failed at **container start** step.\n\
|
||||
Error: {stderr}\n\n\
|
||||
Recovery: the old container was removed. \
|
||||
Start a new one manually: `docker run -d --name {container_name} ... {image} huskies /workspace`"
|
||||
);
|
||||
}
|
||||
Err(e) => {
|
||||
return format!(
|
||||
"Rebuild failed at **container start** step.\n\
|
||||
Error: {e}\n\n\
|
||||
Recovery: start the container manually: \
|
||||
`docker run -d --name {container_name} ... {image} huskies /workspace`"
|
||||
);
|
||||
}
|
||||
};
|
||||
|
||||
let container_short: String = container_id.chars().take(12).collect();
|
||||
|
||||
// ── 6. Persist updated config (URL is unchanged; project already registered) ────
|
||||
{
|
||||
let container_url = format!("http://127.0.0.1:{port}");
|
||||
let mut projects = projects_store.write().await;
|
||||
if let Some(entry) = projects.get_mut(name) {
|
||||
entry.url = Some(container_url.clone());
|
||||
}
|
||||
save_config(&projects, config_dir).await;
|
||||
crate::crdt_state::write_gateway_project(name, &container_url);
|
||||
}
|
||||
|
||||
crate::slog!("[project-rebuild] Rebuilt '{name}': image={image_hash} container={container_id}");
|
||||
|
||||
format!(
|
||||
"Project **{name}** rebuilt.\n\
|
||||
- New image: `{image}` (`{image_short}…`)\n\
|
||||
- New container: `{container_name}` (`{container_short}…`)\n\
|
||||
- State: `pipeline.db` and CRDT preserved (same volume bind-mount)\n\
|
||||
- Port: {port} (unchanged)\n\
|
||||
\n\
|
||||
Use `switch {name}` then `status` to verify the pipeline."
|
||||
)
|
||||
}
|
||||
|
||||
/// Wait for active Claude agent processes in the container to exit.
|
||||
///
|
||||
/// Polls every 5 seconds until no `claude` processes remain or `timeout_secs` elapses.
|
||||
/// Returns `Some(error_message)` when agents are still running after the timeout,
|
||||
/// `None` when the container is idle or unreachable.
|
||||
async fn wait_for_drain(container_name: &str, timeout_secs: u64) -> Option<String> {
|
||||
let deadline = std::time::Instant::now() + std::time::Duration::from_secs(timeout_secs);
|
||||
let poll_interval = std::time::Duration::from_secs(5);
|
||||
|
||||
loop {
|
||||
match count_active_claude_processes(container_name).await {
|
||||
Ok(0) => return None,
|
||||
Ok(n) => {
|
||||
if std::time::Instant::now() >= deadline {
|
||||
return Some(format!(
|
||||
"{n} Claude agent process(es) still running after {timeout_secs}s drain timeout."
|
||||
));
|
||||
}
|
||||
tokio::time::sleep(poll_interval).await;
|
||||
}
|
||||
Err(_) => {
|
||||
// docker exec failed (container stopped or Docker unavailable) — proceed.
|
||||
return None;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Count the number of active `claude` processes inside the given container.
|
||||
///
|
||||
/// Uses `docker exec <name> pgrep -f claude` — exits 0 with PID list when found,
|
||||
/// exits 1 when no matches (treated as 0 active processes).
|
||||
async fn count_active_claude_processes(container_name: &str) -> Result<usize, String> {
|
||||
let out = tokio::process::Command::new("docker")
|
||||
.args(["exec", container_name, "pgrep", "-f", "claude"])
|
||||
.output()
|
||||
.await
|
||||
.map_err(|e| e.to_string())?;
|
||||
|
||||
if out.status.success() {
|
||||
let count = String::from_utf8_lossy(&out.stdout)
|
||||
.lines()
|
||||
.filter(|l| !l.trim().is_empty())
|
||||
.count();
|
||||
Ok(count)
|
||||
} else {
|
||||
Ok(0)
|
||||
}
|
||||
}
|
||||
|
||||
/// Stop a running Docker container (`docker stop`).
|
||||
async fn docker_stop(container_name: &str) -> Result<(), String> {
|
||||
let out = tokio::process::Command::new("docker")
|
||||
.args(["stop", container_name])
|
||||
.output()
|
||||
.await
|
||||
.map_err(|e| format!("docker stop failed to spawn: {e}"))?;
|
||||
|
||||
if out.status.success() {
|
||||
Ok(())
|
||||
} else {
|
||||
Err(String::from_utf8_lossy(&out.stderr).trim().to_string())
|
||||
}
|
||||
}
|
||||
|
||||
/// Remove a stopped Docker container (`docker rm`).
|
||||
async fn docker_rm(container_name: &str) -> Result<(), String> {
|
||||
let out = tokio::process::Command::new("docker")
|
||||
.args(["rm", container_name])
|
||||
.output()
|
||||
.await
|
||||
.map_err(|e| format!("docker rm failed to spawn: {e}"))?;
|
||||
|
||||
if out.status.success() {
|
||||
Ok(())
|
||||
} else {
|
||||
Err(String::from_utf8_lossy(&out.stderr).trim().to_string())
|
||||
}
|
||||
}
|
||||
|
||||
/// Return the full image ID (sha256 digest) for a named Docker image.
|
||||
async fn get_image_id(image_name: &str) -> Result<String, String> {
|
||||
let out = tokio::process::Command::new("docker")
|
||||
.args(["inspect", image_name, "--format", "{{.Id}}"])
|
||||
.output()
|
||||
.await
|
||||
.map_err(|e| format!("docker inspect failed: {e}"))?;
|
||||
|
||||
if out.status.success() {
|
||||
Ok(String::from_utf8_lossy(&out.stdout).trim().to_string())
|
||||
} else {
|
||||
Err(String::from_utf8_lossy(&out.stderr).trim().to_string())
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Tests
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::service::gateway::config::ProjectEntry;
    use std::collections::BTreeMap;
    use std::sync::Arc;
    use tokio::sync::RwLock;

    /// Bot display name passed to the parser in every parsing test.
    const BOT_NAME: &str = "Timmy";
    /// Bot user id passed to the parser in every parsing test.
    const BOT_USER_ID: &str = "@timmy:home";

    /// Build a shared project store from `(name, entry)` pairs.
    fn make_store(
        projects: Vec<(&str, ProjectEntry)>,
    ) -> Arc<RwLock<BTreeMap<String, ProjectEntry>>> {
        let mut map = BTreeMap::new();
        for (name, entry) in projects {
            map.insert(name.to_string(), entry);
        }
        Arc::new(RwLock::new(map))
    }

    /// Build the project entry shared by the handler tests; only the auth
    /// token and host path vary between them.
    fn project_entry(auth_token: Option<&str>, host_path: Option<&str>) -> ProjectEntry {
        ProjectEntry {
            url: Some("http://127.0.0.1:3101".into()),
            auth_token: auth_token.map(Into::into),
            ssh_port: Some(2201),
            host_path: host_path.map(Into::into),
        }
    }

    // ── parsing ────────────────────────────────────────────────────────────

    #[test]
    fn extract_basic_command() {
        let cmd =
            extract_project_rebuild_command("Timmy project-rebuild myapp", BOT_NAME, BOT_USER_ID)
                .unwrap();
        assert_eq!(cmd.name, "myapp");
        assert_eq!(cmd.drain_timeout_secs, DEFAULT_DRAIN_TIMEOUT_SECS);
        assert!(!cmd.force);
    }

    #[test]
    fn extract_with_force_flag() {
        let cmd = extract_project_rebuild_command(
            "@timmy project-rebuild myapp --force",
            BOT_NAME,
            BOT_USER_ID,
        )
        .unwrap();
        assert_eq!(cmd.name, "myapp");
        assert!(cmd.force);
    }

    #[test]
    fn extract_with_timeout_flag() {
        let cmd = extract_project_rebuild_command(
            "Timmy project-rebuild myapp --timeout 120",
            BOT_NAME,
            BOT_USER_ID,
        )
        .unwrap();
        assert_eq!(cmd.name, "myapp");
        assert_eq!(cmd.drain_timeout_secs, 120);
    }

    #[test]
    fn extract_with_timeout_zero_skips_drain() {
        let cmd = extract_project_rebuild_command(
            "Timmy project-rebuild myapp --timeout 0",
            BOT_NAME,
            BOT_USER_ID,
        )
        .unwrap();
        assert_eq!(cmd.drain_timeout_secs, 0);
    }

    #[test]
    fn extract_non_rebuild_returns_none() {
        let parsed = extract_project_rebuild_command("Timmy status", BOT_NAME, BOT_USER_ID);
        assert!(parsed.is_none());
    }

    #[test]
    fn extract_rebuild_without_name_returns_none() {
        let parsed =
            extract_project_rebuild_command("Timmy project-rebuild", BOT_NAME, BOT_USER_ID);
        assert!(parsed.is_none());
    }

    #[test]
    fn extract_with_full_user_id() {
        let parsed = extract_project_rebuild_command(
            "@timmy:home project-rebuild alpha",
            BOT_NAME,
            BOT_USER_ID,
        );
        assert_eq!(parsed.unwrap().name, "alpha");
    }

    #[test]
    fn extract_case_insensitive_bot_mention() {
        let parsed =
            extract_project_rebuild_command("timmy project-rebuild beta", BOT_NAME, BOT_USER_ID);
        assert_eq!(parsed.unwrap().name, "beta");
    }

    // ── handle_project_rebuild validation ─────────────────────────────────

    #[tokio::test]
    async fn rebuild_unknown_project_returns_error() {
        let store = make_store(Vec::new());
        let config_dir = tempfile::tempdir().unwrap();

        let reply =
            handle_project_rebuild("nonexistent", 0, true, &store, config_dir.path()).await;

        assert!(
            reply.contains("not found"),
            "expected 'not found': {reply}"
        );
    }

    #[tokio::test]
    async fn rebuild_project_without_host_path_returns_error() {
        let store = make_store(vec![("myapp", project_entry(None, None))]);
        let config_dir = tempfile::tempdir().unwrap();

        let reply = handle_project_rebuild("myapp", 0, true, &store, config_dir.path()).await;

        assert!(
            reply.contains("host_path"),
            "expected 'host_path' mention: {reply}"
        );
    }

    #[tokio::test]
    async fn rebuild_project_with_missing_host_dir_returns_error() {
        let store = make_store(vec![(
            "myapp",
            project_entry(None, Some("/nonexistent/path/xyz123")),
        )]);
        let config_dir = tempfile::tempdir().unwrap();

        let reply = handle_project_rebuild("myapp", 0, true, &store, config_dir.path()).await;

        assert!(
            reply.contains("does not exist"),
            "expected 'does not exist': {reply}"
        );
    }

    /// Flow test: rebuild a project whose host directory exists.
    ///
    /// The handler is called with a drain timeout of `0` and `force = true`.
    /// Which step it reaches depends on the Docker tooling available in the
    /// test environment, so the test accepts either a success reply
    /// ("rebuilt") or a reply naming one of the known steps. It then checks
    /// that the project is still registered in the store.
    ///
    /// NOTE(review): the original description also promised that the reply
    /// includes a recovery path, but no assertion here checks that.
    #[tokio::test]
    async fn rebuild_e2e_with_valid_host_path_reaches_image_build_step() {
        // Markers for the acceptable outcomes: four step names plus success.
        const ACCEPTED_MARKERS: [&str; 5] = [
            "image build",
            "SSH key",
            "container remove",
            "container start",
            "rebuilt",
        ];

        let host_dir = tempfile::tempdir().unwrap();
        // Create a minimal .huskies/ directory (simulating an existing project).
        std::fs::create_dir_all(host_dir.path().join(".huskies")).unwrap();

        let host_path = host_dir.path().to_str().unwrap();
        let store = make_store(vec![(
            "myapp",
            project_entry(Some("tok"), Some(host_path)),
        )]);
        let config_dir = tempfile::tempdir().unwrap();

        let result = handle_project_rebuild("myapp", 0, true, &store, config_dir.path()).await;

        // (a) The reply names a step or reports success.
        assert!(
            ACCEPTED_MARKERS
                .iter()
                .any(|marker| result.contains(*marker)),
            "result should name a step or report success: {result}"
        );

        // (b) State preserved: project is still registered in the gateway store.
        let projects = store.read().await;
        assert!(
            projects.contains_key("myapp"),
            "project 'myapp' must remain registered after failed rebuild: {result}"
        );
    }
}
|
||||
@@ -40,6 +40,43 @@ pub fn extract_rebuild_command(
|
||||
}
|
||||
}
|
||||
|
||||
/// Parse a "rebuild gateway" command from a raw message body.
|
||||
///
|
||||
/// Returns `Some(RebuildCommand)` only when the stripped message begins with
|
||||
/// "rebuild gateway" (case-insensitive). A plain "rebuild" without the
|
||||
/// "gateway" qualifier returns `None` so it falls through to the standard
|
||||
/// server rebuild handler.
|
||||
pub fn extract_rebuild_gateway_command(
|
||||
message: &str,
|
||||
bot_name: &str,
|
||||
bot_user_id: &str,
|
||||
) -> Option<RebuildCommand> {
|
||||
let stripped = strip_bot_mention(message, bot_name, bot_user_id);
|
||||
let trimmed = stripped
|
||||
.trim()
|
||||
.trim_start_matches(|c: char| !c.is_alphanumeric());
|
||||
|
||||
let (cmd, rest) = trimmed.split_once(char::is_whitespace)?;
|
||||
|
||||
if !cmd.eq_ignore_ascii_case("rebuild") {
|
||||
return None;
|
||||
}
|
||||
|
||||
let qualifier = rest
|
||||
.trim()
|
||||
.trim_start_matches(|c: char| !c.is_alphanumeric());
|
||||
let first_word = match qualifier.split_once(char::is_whitespace) {
|
||||
Some((w, _)) => w,
|
||||
None => qualifier,
|
||||
};
|
||||
|
||||
if first_word.eq_ignore_ascii_case("gateway") {
|
||||
Some(RebuildCommand)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Handle a rebuild command: trigger server rebuild and restart.
|
||||
///
|
||||
/// Returns a string describing the outcome. On build failure the error
|
||||
|
||||
@@ -0,0 +1,478 @@
|
||||
//! `upgrade [<project>]` gateway chat command — streaming sled binary upgrade.
|
||||
//!
|
||||
//! Usage (gateway mode only):
|
||||
//! - `{bot} upgrade <project>` — upgrade the named sled's binary in-container.
|
||||
//! - `{bot} upgrade` — list registered projects (shows what can be targeted).
|
||||
//!
|
||||
//! The gateway orchestrates the upgrade in four phases, streaming a marker to
|
||||
//! the chat room at each step:
|
||||
//! 1. `[1/4] downloading` — POSTs to `{sled_url}/api/upgrade`; sled starts download.
|
||||
//! 2. `[2/4] swapping binary` — gateway received 202; sled atomically renamed the binary.
|
||||
//! 3. `[3/4] restarting sled` — sled re-execs with the new binary; HTTP goes dark briefly.
|
||||
//! 4. `[4/4] reconnected to gateway` — sled's `/health` probe is responding again.
|
||||
//!
|
||||
//! Concurrent `upgrade` invocations are serialised via a global async mutex so
|
||||
//! that two simultaneous upgrades cannot interleave their phase markers or race
|
||||
//! on the sled restart.
|
||||
|
||||
use crate::service::gateway::config::ProjectEntry;
|
||||
use std::collections::BTreeMap;
|
||||
use std::future::Future;
|
||||
use std::sync::{Arc, OnceLock};
|
||||
use std::time::Duration;
|
||||
use tokio::sync::{Mutex, RwLock};
|
||||
|
||||
// ── Serial lock ────────────────────────────────────────────────────────────────
|
||||
|
||||
static UPGRADE_LOCK: OnceLock<Mutex<()>> = OnceLock::new();
|
||||
|
||||
fn upgrade_lock() -> &'static Mutex<()> {
|
||||
UPGRADE_LOCK.get_or_init(|| Mutex::new(()))
|
||||
}
|
||||
|
||||
// ── Command parsing ────────────────────────────────────────────────────────────
|
||||
|
||||
/// A parsed `upgrade` chat command.
///
/// Derives `Clone` and `Eq` in addition to `Debug`/`PartialEq` so values can
/// be duplicated and compared without restrictions.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum UpgradeCommand {
    /// `upgrade <project>` — upgrade the named sled.
    Upgrade {
        /// The project/sled name to upgrade.
        project: String,
    },
    /// `upgrade` with no argument — list available projects.
    ListProjects,
}
|
||||
|
||||
/// Parse an `upgrade [<project>]` command from a raw message body.
|
||||
///
|
||||
/// Strips the bot mention prefix and checks whether the first word is `upgrade`.
|
||||
/// Returns `None` when the message is not an upgrade command.
|
||||
pub fn extract_upgrade_command(
|
||||
message: &str,
|
||||
bot_name: &str,
|
||||
bot_user_id: &str,
|
||||
) -> Option<UpgradeCommand> {
|
||||
let stripped = crate::chat::util::strip_bot_mention(message, bot_name, bot_user_id);
|
||||
let trimmed = stripped
|
||||
.trim()
|
||||
.trim_start_matches(|c: char| !c.is_alphanumeric());
|
||||
|
||||
let (cmd, rest) = match trimmed.split_once(char::is_whitespace) {
|
||||
Some((c, r)) => (c, r.trim()),
|
||||
None => (trimmed, ""),
|
||||
};
|
||||
|
||||
if !cmd.eq_ignore_ascii_case("upgrade") {
|
||||
return None;
|
||||
}
|
||||
|
||||
if rest.is_empty() {
|
||||
Some(UpgradeCommand::ListProjects)
|
||||
} else {
|
||||
Some(UpgradeCommand::Upgrade {
|
||||
project: rest.split_whitespace().next().unwrap_or(rest).to_string(),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// ── Handlers ───────────────────────────────────────────────────────────────────
|
||||
|
||||
/// List available projects when `upgrade` is invoked without an argument.
|
||||
///
|
||||
/// Returns a Markdown string enumerating the registered project names so the
|
||||
/// user knows which targets are valid for `upgrade <project>`.
|
||||
pub async fn handle_upgrade_list_projects(
|
||||
projects_store: &Arc<RwLock<BTreeMap<String, ProjectEntry>>>,
|
||||
) -> String {
|
||||
let projects = projects_store.read().await;
|
||||
if projects.is_empty() {
|
||||
return "No projects are currently registered with the gateway.".to_string();
|
||||
}
|
||||
let names: Vec<&String> = projects.keys().collect();
|
||||
let list = names
|
||||
.iter()
|
||||
.map(|n| format!("- `{n}`"))
|
||||
.collect::<Vec<_>>()
|
||||
.join("\n");
|
||||
format!("Registered projects (use `upgrade <project>` to upgrade one):\n{list}")
|
||||
}
|
||||
|
||||
/// Upgrade a named sled by streaming phase markers to the chat room.
|
||||
///
|
||||
/// Acquires the global upgrade lock to serialise concurrent invocations. Each
|
||||
/// phase is announced by calling `send_phase` before the corresponding work
|
||||
/// begins. On any failure, an error message is returned and the previous
|
||||
/// binary remains active on the sled.
|
||||
///
|
||||
/// `gateway_port` is used to derive the default binary source URL
|
||||
/// (`http://gateway:<port>/api/huskies-binary`) when neither
|
||||
/// `HUSKIES_GATEWAY_BINARY_URL` nor `--source` is set.
|
||||
pub async fn handle_sled_upgrade<F, Fut>(
|
||||
project: &str,
|
||||
projects_store: &Arc<RwLock<BTreeMap<String, ProjectEntry>>>,
|
||||
gateway_port: Option<u16>,
|
||||
send_phase: F,
|
||||
) -> String
|
||||
where
|
||||
F: Fn(String) -> Fut,
|
||||
Fut: Future<Output = ()>,
|
||||
{
|
||||
// ── Look up project URL ──────────────────────────────────────────────────
|
||||
let sled_url = {
|
||||
let projects = projects_store.read().await;
|
||||
match projects.get(project).and_then(|e| e.url.clone()) {
|
||||
Some(u) => u,
|
||||
None => {
|
||||
let available: Vec<&String> = projects.keys().collect();
|
||||
return format!(
|
||||
"Project `{project}` not found. Registered projects: {}",
|
||||
available
|
||||
.iter()
|
||||
.map(|s| s.as_str())
|
||||
.collect::<Vec<_>>()
|
||||
.join(", ")
|
||||
);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// ── Resolve binary source URL ────────────────────────────────────────────
|
||||
let source_url = std::env::var("HUSKIES_GATEWAY_BINARY_URL").unwrap_or_else(|_| {
|
||||
format!(
|
||||
"http://gateway:{}/api/huskies-binary",
|
||||
gateway_port.unwrap_or(3000)
|
||||
)
|
||||
});
|
||||
|
||||
// ── Acquire serial lock ──────────────────────────────────────────────────
|
||||
let _lock = upgrade_lock().lock().await;
|
||||
|
||||
let client = reqwest::Client::builder()
|
||||
.timeout(Duration::from_secs(30))
|
||||
.build()
|
||||
.unwrap_or_default();
|
||||
|
||||
// ── Phase 1: downloading ─────────────────────────────────────────────────
|
||||
send_phase("[1/4] downloading\u{2026}".to_string()).await;
|
||||
|
||||
let upgrade_url = format!("{}/api/upgrade", sled_url.trim_end_matches('/'));
|
||||
let body = serde_json::json!({ "source_url": source_url });
|
||||
|
||||
let resp = match client.post(&upgrade_url).json(&body).send().await {
|
||||
Ok(r) => r,
|
||||
Err(e) => {
|
||||
return format!(
|
||||
"Upgrade failed at **[1/4] downloading**: could not reach sled at `{upgrade_url}`.\n\
|
||||
Error: {e}\n\n\
|
||||
The previous version remains active."
|
||||
);
|
||||
}
|
||||
};
|
||||
|
||||
if !resp.status().is_success() && resp.status().as_u16() != 202 {
|
||||
let status = resp.status();
|
||||
let body_text = resp.text().await.unwrap_or_default();
|
||||
return format!(
|
||||
"Upgrade failed at **[1/4] downloading**: sled returned HTTP {status}.\n\
|
||||
Response: {body_text}\n\n\
|
||||
The previous version remains active."
|
||||
);
|
||||
}
|
||||
|
||||
// ── Phase 2: swapping binary ─────────────────────────────────────────────
|
||||
// The sled accepted the request (202) and is downloading + atomically
|
||||
// replacing the binary in the background.
|
||||
send_phase("[2/4] swapping binary\u{2026}".to_string()).await;
|
||||
|
||||
// ── Phase 3: restarting sled ─────────────────────────────────────────────
|
||||
// The sled will re-exec momentarily; announce before the health loop.
|
||||
send_phase("[3/4] restarting sled\u{2026}".to_string()).await;
|
||||
|
||||
// ── Wait for sled to come back up ────────────────────────────────────────
|
||||
let health_url = format!("{}/health", sled_url.trim_end_matches('/'));
|
||||
// Give the sled a few seconds to start the download + re-exec before polling.
|
||||
tokio::time::sleep(Duration::from_secs(3)).await;
|
||||
|
||||
let reconnected = wait_for_health(&client, &health_url, 120).await;
|
||||
if !reconnected {
|
||||
return format!(
|
||||
"Upgrade failed at **[4/4] reconnected to gateway**: sled at `{sled_url}` did not \
|
||||
come back online within 120 seconds after the upgrade was triggered.\n\n\
|
||||
Check the container logs: `docker logs huskies-{project}`"
|
||||
);
|
||||
}
|
||||
|
||||
// ── Phase 4: reconnected ─────────────────────────────────────────────────
|
||||
send_phase("[4/4] reconnected to gateway".to_string()).await;
|
||||
|
||||
// ── Report new version ───────────────────────────────────────────────────
|
||||
let version = fetch_sled_version(&client, &sled_url).await;
|
||||
format!("{project} upgraded to version {version}")
|
||||
}
|
||||
|
||||
// ── Helpers ───────────────────────────────────────────────────────────────────
|
||||
|
||||
/// Poll `GET {health_url}` every 3 seconds until it returns 200 or `timeout_secs` elapses.
|
||||
///
|
||||
/// Returns `true` when the probe succeeds, `false` on timeout.
|
||||
async fn wait_for_health(client: &reqwest::Client, health_url: &str, timeout_secs: u64) -> bool {
|
||||
let deadline = std::time::Instant::now() + Duration::from_secs(timeout_secs);
|
||||
let poll = Duration::from_secs(3);
|
||||
loop {
|
||||
match client.get(health_url).send().await {
|
||||
Ok(r) if r.status().is_success() => return true,
|
||||
_ => {}
|
||||
}
|
||||
if std::time::Instant::now() >= deadline {
|
||||
return false;
|
||||
}
|
||||
tokio::time::sleep(poll).await;
|
||||
}
|
||||
}
|
||||
|
||||
/// Fetch the running version from the sled's `get_version` MCP tool.
|
||||
///
|
||||
/// Returns the version string on success, or `"unknown"` on any error so the
|
||||
/// final chat reply is still meaningful.
|
||||
async fn fetch_sled_version(client: &reqwest::Client, sled_url: &str) -> String {
|
||||
let mcp_url = format!("{}/mcp", sled_url.trim_end_matches('/'));
|
||||
let body = serde_json::json!({
|
||||
"jsonrpc": "2.0",
|
||||
"id": 1,
|
||||
"method": "tools/call",
|
||||
"params": {
|
||||
"name": "get_version",
|
||||
"arguments": {}
|
||||
}
|
||||
});
|
||||
let resp = match client.post(&mcp_url).json(&body).send().await {
|
||||
Ok(r) => r,
|
||||
Err(_) => return "unknown".to_string(),
|
||||
};
|
||||
let val: serde_json::Value = match resp.json().await {
|
||||
Ok(v) => v,
|
||||
Err(_) => return "unknown".to_string(),
|
||||
};
|
||||
// MCP tools/call response: result.content[0].text is a JSON string.
|
||||
let text = val
|
||||
.pointer("/result/content/0/text")
|
||||
.and_then(|v| v.as_str())
|
||||
.unwrap_or("");
|
||||
if text.is_empty() {
|
||||
return "unknown".to_string();
|
||||
}
|
||||
serde_json::from_str::<serde_json::Value>(text)
|
||||
.ok()
|
||||
.and_then(|v| v.get("version").and_then(|v| v.as_str()).map(String::from))
|
||||
.unwrap_or_else(|| "unknown".to_string())
|
||||
}
|
||||
|
||||
// ── Tests ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Shared store type used by the handlers under test.
    type Store = Arc<RwLock<BTreeMap<String, ProjectEntry>>>;

    /// Build a store from `(name, url)` pairs; every other entry field is `None`.
    fn store_with(entries: &[(&str, Option<&str>)]) -> Store {
        let map: BTreeMap<String, ProjectEntry> = entries
            .iter()
            .map(|&(name, url)| {
                let entry = ProjectEntry {
                    url: url.map(Into::into),
                    auth_token: None,
                    ssh_port: None,
                    host_path: None,
                };
                (name.to_string(), entry)
            })
            .collect();
        Arc::new(RwLock::new(map))
    }

    /// Shorthand for the `Upgrade` variant expected by the parsing tests.
    fn upgrade_of(project: &str) -> Option<UpgradeCommand> {
        Some(UpgradeCommand::Upgrade {
            project: project.to_string(),
        })
    }

    // ── extract_upgrade_command ───────────────────────────────────────────────

    #[test]
    fn extract_upgrade_with_project() {
        let parsed =
            extract_upgrade_command("Timmy upgrade huskies-server", "Timmy", "@timmy:home");
        assert_eq!(parsed, upgrade_of("huskies-server"));
    }

    #[test]
    fn extract_upgrade_no_arg_is_list() {
        let parsed = extract_upgrade_command("Timmy upgrade", "Timmy", "@timmy:home");
        assert_eq!(parsed, Some(UpgradeCommand::ListProjects));
    }

    #[test]
    fn extract_upgrade_with_full_user_id() {
        let parsed = extract_upgrade_command("@timmy:home upgrade myapp", "Timmy", "@timmy:home");
        assert_eq!(parsed, upgrade_of("myapp"));
    }

    #[test]
    fn extract_non_upgrade_returns_none() {
        let parsed = extract_upgrade_command("Timmy status", "Timmy", "@timmy:home");
        assert!(parsed.is_none());
    }

    #[test]
    fn extract_upgrade_case_insensitive() {
        let parsed = extract_upgrade_command("Timmy UPGRADE alpha", "Timmy", "@timmy:home");
        assert_eq!(parsed, upgrade_of("alpha"));
    }

    // ── handle_upgrade_list_projects ─────────────────────────────────────────

    #[tokio::test]
    async fn list_projects_empty_store() {
        let store = store_with(&[]);
        let msg = handle_upgrade_list_projects(&store).await;
        assert!(
            msg.contains("No projects"),
            "empty store should say no projects: {msg}"
        );
    }

    #[tokio::test]
    async fn list_projects_shows_names() {
        let store = store_with(&[
            ("alpha", Some("http://localhost:3001")),
            ("beta", Some("http://localhost:3002")),
        ]);
        let msg = handle_upgrade_list_projects(&store).await;
        assert!(msg.contains("alpha"), "should list alpha: {msg}");
        assert!(msg.contains("beta"), "should list beta: {msg}");
    }

    // ── handle_sled_upgrade validation ───────────────────────────────────────

    #[tokio::test]
    async fn upgrade_unknown_project_returns_error() {
        let store = store_with(&[]);
        let phases: std::sync::Mutex<Vec<String>> = std::sync::Mutex::new(Vec::new());

        let result = handle_sled_upgrade("nonexistent", &store, Some(3000), |msg| {
            phases.lock().unwrap().push(msg);
            async {}
        })
        .await;

        assert!(
            result.contains("not found"),
            "should say not found: {result}"
        );
        // No phase markers should have been emitted before the validation error.
        assert!(
            phases.lock().unwrap().is_empty(),
            "no phases should be emitted for unknown project"
        );
    }

    #[tokio::test]
    async fn upgrade_project_with_no_url_fails_gracefully() {
        let store = store_with(&[("myapp", None)]);

        let result = handle_sled_upgrade("myapp", &store, Some(3000), |_msg| async {}).await;

        assert!(
            result.contains("not found"),
            "project with no URL should say not found: {result}"
        );
    }

    #[tokio::test]
    async fn upgrade_unreachable_sled_reports_failure() {
        // Port 1 is expected to have nothing listening on it.
        let store = store_with(&[("myapp", Some("http://127.0.0.1:1"))]);
        let phases: std::sync::Mutex<Vec<String>> = std::sync::Mutex::new(Vec::new());

        let result = handle_sled_upgrade("myapp", &store, Some(3000), |msg| {
            phases.lock().unwrap().push(msg);
            async {}
        })
        .await;

        // Phase 1 marker must have been sent before the failed request.
        let sent = phases.lock().unwrap().clone();
        assert!(
            sent.iter().any(|m| m.contains("[1/4]")),
            "phase 1 marker must be sent: {sent:?}"
        );
        assert!(
            result.contains("downloading") || result.contains("reach"),
            "error should mention the failure: {result}"
        );
        assert!(
            result.contains("previous version"),
            "error should confirm old version is active: {result}"
        );
    }

    // ── wait_for_health ───────────────────────────────────────────────────────

    #[tokio::test]
    async fn wait_for_health_immediate_success() {
        use tokio::io::{AsyncReadExt, AsyncWriteExt};

        let listener = tokio::net::TcpListener::bind("127.0.0.1:0").await.unwrap();
        let port = listener.local_addr().unwrap().port();

        // Minimal one-shot HTTP server: read the request, answer 200 "ok".
        let server = tokio::spawn(async move {
            if let Ok((mut stream, _)) = listener.accept().await {
                let mut request = [0u8; 4096];
                let _ = stream.read(&mut request).await;
                let _ = stream
                    .write_all(b"HTTP/1.1 200 OK\r\nContent-Length: 2\r\n\r\nok")
                    .await;
            }
        });

        let client = reqwest::Client::new();
        let url = format!("http://127.0.0.1:{port}/health");
        let ok = wait_for_health(&client, &url, 5).await;
        assert!(ok, "should return true when health probe succeeds");
        server.abort();
    }

    #[tokio::test]
    async fn wait_for_health_timeout() {
        let client = reqwest::Client::builder()
            .timeout(Duration::from_millis(100))
            .build()
            .unwrap();
        // Nothing listening on port 1.
        let ok = wait_for_health(&client, "http://127.0.0.1:1/health", 1).await;
        assert!(!ok, "should return false when health probe never succeeds");
    }
}
|
||||
@@ -29,8 +29,10 @@ pub(super) async fn handle_llm_message(
|
||||
};
|
||||
|
||||
let bot_name = &ctx.services.bot_name;
|
||||
let persona = bot_name.to_lowercase();
|
||||
let event_ctx = crate::llm_session::assemble_prompt_context(&persona);
|
||||
let prompt = format!(
|
||||
"[Your name is {bot_name}. Refer to yourself as {bot_name}, not Claude.]\n\n{user}: {user_message}"
|
||||
"{event_ctx}[Your name is {bot_name}. Refer to yourself as {bot_name}, not Claude.]\n\n{user}: {user_message}"
|
||||
);
|
||||
|
||||
let provider = ClaudeCodeProvider::new();
|
||||
|
||||
@@ -27,8 +27,10 @@ pub(super) async fn handle_llm_message(
|
||||
};
|
||||
|
||||
let bot_name = &ctx.services.bot_name;
|
||||
let persona = bot_name.to_lowercase();
|
||||
let event_ctx = crate::llm_session::assemble_prompt_context(&persona);
|
||||
let prompt = format!(
|
||||
"[Your name is {bot_name}. Refer to yourself as {bot_name}, not Claude.]\n\n{sender}: {user_message}"
|
||||
"{event_ctx}[Your name is {bot_name}. Refer to yourself as {bot_name}, not Claude.]\n\n{sender}: {user_message}"
|
||||
);
|
||||
|
||||
let provider = ClaudeCodeProvider::new();
|
||||
|
||||
+102
-2
@@ -27,6 +27,19 @@ pub(crate) struct CliArgs {
|
||||
/// forwards all `prompt_permission` tool calls to the gateway over a WebSocket.
|
||||
/// Also readable from the `HUSKIES_UPSTREAM_GATEWAY` env var.
|
||||
pub(crate) upstream_gateway: Option<String>,
|
||||
/// Whether the `upgrade` subcommand was given.
|
||||
pub(crate) upgrade: bool,
|
||||
/// Source URL for the `upgrade` subcommand (`--source <URL>`).
|
||||
///
|
||||
/// If omitted, the upgrade subcommand falls back to
|
||||
/// `HUSKIES_BINARY_SOURCE` env var, then derives the URL from
|
||||
/// `HUSKIES_UPSTREAM_GATEWAY`.
|
||||
pub(crate) upgrade_source: Option<String>,
|
||||
/// Path to a trampoline job file (`--trampoline <path>`).
|
||||
///
|
||||
/// When set, the binary runs as a detached trampoline helper: it kills the
|
||||
/// old gateway, starts the new one, polls its health, and rolls back on failure.
|
||||
pub(crate) trampoline: Option<String>,
|
||||
}
|
||||
|
||||
/// Parse CLI arguments into `CliArgs`, or exit early for `--help` / `--version`.
|
||||
@@ -41,6 +54,9 @@ pub(crate) fn parse_cli_args(args: &[String]) -> Result<CliArgs, String> {
|
||||
let mut join_token: Option<String> = None;
|
||||
let mut gateway_url: Option<String> = None;
|
||||
let mut upstream_gateway: Option<String> = None;
|
||||
let mut upgrade = false;
|
||||
let mut upgrade_source: Option<String> = None;
|
||||
let mut trampoline: Option<String> = None;
|
||||
let mut i = 0;
|
||||
|
||||
while i < args.len() {
|
||||
@@ -120,6 +136,29 @@ pub(crate) fn parse_cli_args(args: &[String]) -> Result<CliArgs, String> {
|
||||
"agent" => {
|
||||
agent = true;
|
||||
}
|
||||
"upgrade" => {
|
||||
upgrade = true;
|
||||
}
|
||||
"--source" => {
|
||||
i += 1;
|
||||
if i >= args.len() {
|
||||
return Err("--source requires a value".to_string());
|
||||
}
|
||||
upgrade_source = Some(args[i].clone());
|
||||
}
|
||||
a if a.starts_with("--source=") => {
|
||||
upgrade_source = Some(a["--source=".len()..].to_string());
|
||||
}
|
||||
"--trampoline" => {
|
||||
i += 1;
|
||||
if i >= args.len() {
|
||||
return Err("--trampoline requires a path".to_string());
|
||||
}
|
||||
trampoline = Some(args[i].clone());
|
||||
}
|
||||
a if a.starts_with("--trampoline=") => {
|
||||
trampoline = Some(a["--trampoline=".len()..].to_string());
|
||||
}
|
||||
a if a.starts_with('-') => {
|
||||
return Err(format!("unknown option: {a}"));
|
||||
}
|
||||
@@ -147,6 +186,9 @@ pub(crate) fn parse_cli_args(args: &[String]) -> Result<CliArgs, String> {
|
||||
join_token,
|
||||
gateway_url,
|
||||
upstream_gateway,
|
||||
upgrade,
|
||||
upgrade_source,
|
||||
trampoline,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -155,12 +197,16 @@ pub(crate) fn print_help() {
|
||||
println!("huskies init [OPTIONS] [PATH]");
|
||||
println!("huskies agent --rendezvous <URL> [OPTIONS] [PATH]");
|
||||
println!("huskies --gateway [OPTIONS] [PATH]");
|
||||
println!("huskies upgrade [--source <URL>]");
|
||||
println!();
|
||||
println!("Serve a huskies project.");
|
||||
println!();
|
||||
println!("COMMANDS:");
|
||||
println!(" init Scaffold a new .huskies/ project and start the interactive setup wizard.");
|
||||
println!(" agent Run as a headless build agent — syncs CRDT state, claims and runs work.");
|
||||
println!(" init Scaffold a new .huskies/ project and start the interactive setup wizard.");
|
||||
println!(" agent Run as a headless build agent — syncs CRDT state, claims and runs work.");
|
||||
println!(
|
||||
" upgrade Fetch a new huskies binary from SOURCE and atomically replace the current"
|
||||
);
|
||||
println!();
|
||||
println!("ARGS:");
|
||||
println!(
|
||||
@@ -190,6 +236,8 @@ pub(crate) fn print_help() {
|
||||
println!(" sled connects to WS URL and forwards all");
|
||||
println!(" prompt_permission calls via the uplink protocol.");
|
||||
println!(" Also readable from HUSKIES_UPSTREAM_GATEWAY env var.");
|
||||
println!(" --source <URL> Binary source URL for the `upgrade` subcommand.");
|
||||
println!(" Falls back to HUSKIES_BINARY_SOURCE env var.");
|
||||
}
|
||||
|
||||
/// Resolve the optional positional path argument into an absolute `PathBuf`.
|
||||
@@ -399,6 +447,58 @@ mod tests {
|
||||
assert!(parse_cli_args(&args).is_err());
|
||||
}
|
||||
|
||||
// ── upgrade subcommand ──────────────────────────────────────────
|
||||
|
||||
/// A bare `upgrade` argument sets the flag and leaves the source unset.
#[test]
fn parse_upgrade_subcommand() {
    let parsed = parse_cli_args(&["upgrade".to_string()]).unwrap();
    assert!(parsed.upgrade);
    assert!(parsed.upgrade_source.is_none());
}
|
||||
|
||||
/// `--source <URL>` (two separate arguments) is captured as the upgrade source.
#[test]
fn parse_upgrade_with_source_flag() {
    let source = "http://gateway:3000/api/huskies-binary";
    let args: Vec<String> = ["upgrade", "--source", source]
        .iter()
        .map(|arg| arg.to_string())
        .collect();

    let parsed = parse_cli_args(&args).unwrap();

    assert!(parsed.upgrade);
    assert_eq!(parsed.upgrade_source.as_deref(), Some(source));
}
|
||||
|
||||
/// `upgrade --source=<URL>` (equals form) records the URL after the `=`.
#[test]
fn parse_upgrade_with_source_equals_syntax() {
    let argv: Vec<String> = ["upgrade", "--source=http://gw:3000/api/b"]
        .iter()
        .map(|token| token.to_string())
        .collect();
    let parsed = parse_cli_args(&argv).unwrap();
    assert!(parsed.upgrade);
    assert_eq!(
        parsed.upgrade_source.as_deref(),
        Some("http://gw:3000/api/b")
    );
}
|
||||
|
||||
/// A trailing `--source` with no value after it must be rejected.
#[test]
fn parse_upgrade_source_missing_value_is_error() {
    let argv: Vec<String> = ["upgrade", "--source"]
        .iter()
        .map(|token| token.to_string())
        .collect();
    let outcome = parse_cli_args(&argv);
    assert!(outcome.is_err());
}
|
||||
|
||||
/// With no arguments at all, the upgrade flag is off and no source is set.
#[test]
fn parse_no_args_upgrade_is_false() {
    let parsed = parse_cli_args(&[]).unwrap();
    assert!(!parsed.upgrade);
    assert!(parsed.upgrade_source.is_none());
}
|
||||
|
||||
// ── resolve_path_arg ────────────────────────────────────────────
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -0,0 +1,176 @@
|
||||
//! Read/write helpers for the `event_log` append-only list in the CRDT document.
|
||||
//!
|
||||
//! Every pipeline stage transition is appended as an [`EventLogEntryCrdt`][super::super::types::EventLogEntryCrdt]
|
||||
//! entry. Entries are never updated or tombstoned — the list is strictly grow-only.
|
||||
//! Monotonic sequencing is computed at write time while holding the CRDT lock,
|
||||
//! so `event_seq` values for a given sled are always contiguous and gap-free.
|
||||
|
||||
use bft_json_crdt::json_crdt::{JsonValue, *};
|
||||
use bft_json_crdt::op::ROOT_ID;
|
||||
use serde_json::json;
|
||||
|
||||
use super::super::state::{apply_and_persist, get_crdt};
|
||||
use super::super::types::EventLogEntryCrdt;
|
||||
|
||||
/// `pipeline_event` value used to mark a gap sentinel entry in the event log.
///
/// A gap sentinel is appended when the event-log subscriber detects that the
/// broadcast channel dropped events (i.e. it received `RecvError::Lagged`).
/// The `from_stage` and `to_stage` fields encode the logical EventId range
/// `[from, to]` of the dropped events as decimal strings (they are produced
/// with `u64::to_string` by [`append_gap_log_entry`]); the `story_id` of a
/// sentinel is the empty string.
pub const GAP_PIPELINE_EVENT: &str = "EventStreamGap";
|
||||
|
||||
/// Raw event log entry extracted from the CRDT document.
///
/// All fields are decoded to Rust primitives. [`read_all_event_log_entries`]
/// silently drops entries whose `sled_id` is missing or empty, and entries
/// whose `event_seq` is not a number. Any other field that is missing or has
/// the wrong type decodes to its default (`0.0` for the timestamp, the empty
/// string for text fields).
///
/// `Eq` is intentionally not derived because `timestamp` is an `f64`.
#[derive(Debug, Clone, PartialEq)]
pub struct EventLogEntryRaw {
    /// Monotonic sequence number for the recording sled (0-based).
    pub event_seq: u64,
    /// Hex-encoded Ed25519 public key of the sled that wrote this entry.
    pub sled_id: String,
    /// Unix timestamp (seconds) when the transition fired.
    pub timestamp: f64,
    /// Story ID of the work item that transitioned.
    pub story_id: String,
    /// Human-readable label of the stage before the transition.
    pub from_stage: String,
    /// Human-readable label of the stage after the transition.
    pub to_stage: String,
    /// String label of the `PipelineEvent` variant.
    pub pipeline_event: String,
}
|
||||
|
||||
/// Append a new event log entry to the CRDT, computing the monotonic `event_seq`
|
||||
/// atomically while the CRDT lock is held.
|
||||
///
|
||||
/// No-ops silently when the CRDT is not yet initialised.
|
||||
pub fn append_event_log_entry(
|
||||
sled_id: &str,
|
||||
timestamp: f64,
|
||||
story_id: &str,
|
||||
from_stage: &str,
|
||||
to_stage: &str,
|
||||
pipeline_event: &str,
|
||||
) {
|
||||
let Some(state_mutex) = get_crdt() else {
|
||||
return;
|
||||
};
|
||||
let Ok(mut state) = state_mutex.lock() else {
|
||||
return;
|
||||
};
|
||||
|
||||
// Count existing entries for this sled while holding the lock so the seq
|
||||
// is computed and used in the same critical section — no TOCTOU gap.
|
||||
let event_seq = state
|
||||
.crdt
|
||||
.doc
|
||||
.event_log
|
||||
.iter()
|
||||
.filter(|e| matches!(e.sled_id.view(), JsonValue::String(s) if s == sled_id))
|
||||
.count() as f64;
|
||||
|
||||
// Append after the last existing entry so the list stays in insertion order.
|
||||
// Inserting after ROOT_ID would place each entry at the front (RGA semantics),
|
||||
// reversing the sequence; inserting after the current tail preserves order.
|
||||
let total_len = state.crdt.doc.event_log.view().len();
|
||||
let after = if total_len > 0 {
|
||||
super::list_id_at(&state.crdt.doc.event_log, total_len - 1).unwrap_or(ROOT_ID)
|
||||
} else {
|
||||
ROOT_ID
|
||||
};
|
||||
|
||||
let entry: JsonValue = json!({
|
||||
"event_seq": event_seq,
|
||||
"sled_id": sled_id,
|
||||
"timestamp": timestamp,
|
||||
"story_id": story_id,
|
||||
"from_stage": from_stage,
|
||||
"to_stage": to_stage,
|
||||
"pipeline_event": pipeline_event,
|
||||
})
|
||||
.into();
|
||||
|
||||
apply_and_persist(&mut state, |s| s.crdt.doc.event_log.insert(after, entry));
|
||||
}
|
||||
|
||||
/// Append an `EventStreamGap` sentinel entry to the CRDT event log.
|
||||
///
|
||||
/// Called when the event-log broadcast subscriber detects that the channel
|
||||
/// dropped events (`RecvError::Lagged`). `from_id` and `to_id` are the
|
||||
/// logical sequence numbers (in the per-sled event stream) of the first and
|
||||
/// last dropped events respectively. The sentinel itself also consumes one
|
||||
/// CRDT `event_seq` slot so the monotonic counter remains contiguous across
|
||||
/// the gap.
|
||||
pub fn append_gap_log_entry(sled_id: &str, from_id: u64, to_id: u64) {
|
||||
let timestamp = chrono::Utc::now().timestamp() as f64;
|
||||
append_event_log_entry(
|
||||
sled_id,
|
||||
timestamp,
|
||||
"",
|
||||
&from_id.to_string(),
|
||||
&to_id.to_string(),
|
||||
GAP_PIPELINE_EVENT,
|
||||
);
|
||||
}
|
||||
|
||||
/// Read all event log entries from the CRDT document.
|
||||
///
|
||||
/// Entries with a missing or empty `sled_id` are silently skipped.
|
||||
/// Order reflects CRDT insertion order (RGA list semantics).
|
||||
pub fn read_all_event_log_entries() -> Vec<EventLogEntryRaw> {
|
||||
let Some(state_mutex) = get_crdt() else {
|
||||
return Vec::new();
|
||||
};
|
||||
let Ok(state) = state_mutex.lock() else {
|
||||
return Vec::new();
|
||||
};
|
||||
state
|
||||
.crdt
|
||||
.doc
|
||||
.event_log
|
||||
.iter()
|
||||
.filter_map(extract_entry)
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Convert a CRDT event log entry to its read-side representation.
|
||||
fn extract_entry(e: &EventLogEntryCrdt) -> Option<EventLogEntryRaw> {
|
||||
let event_seq = match e.event_seq.view() {
|
||||
JsonValue::Number(n) => n as u64,
|
||||
_ => return None,
|
||||
};
|
||||
let sled_id = match e.sled_id.view() {
|
||||
JsonValue::String(s) if !s.is_empty() => s,
|
||||
_ => return None,
|
||||
};
|
||||
let timestamp = match e.timestamp.view() {
|
||||
JsonValue::Number(n) => n,
|
||||
_ => 0.0,
|
||||
};
|
||||
let story_id = match e.story_id.view() {
|
||||
JsonValue::String(s) => s,
|
||||
_ => String::new(),
|
||||
};
|
||||
let from_stage = match e.from_stage.view() {
|
||||
JsonValue::String(s) => s,
|
||||
_ => String::new(),
|
||||
};
|
||||
let to_stage = match e.to_stage.view() {
|
||||
JsonValue::String(s) => s,
|
||||
_ => String::new(),
|
||||
};
|
||||
let pipeline_event = match e.pipeline_event.view() {
|
||||
JsonValue::String(s) => s,
|
||||
_ => String::new(),
|
||||
};
|
||||
Some(EventLogEntryRaw {
|
||||
event_seq,
|
||||
sled_id,
|
||||
timestamp,
|
||||
story_id,
|
||||
from_stage,
|
||||
to_stage,
|
||||
pipeline_event,
|
||||
})
|
||||
}
|
||||
@@ -0,0 +1,332 @@
|
||||
//! Read/write helpers for the `llm_sessions` LWW-map collection, including the
|
||||
//! atomic `assemble_and_advance_session` helper used by every chat transport.
|
||||
//!
|
||||
//! LLM sessions are keyed by **persona name** (e.g. `"timmy"` for the
|
||||
//! gateway-level bot) and track per-sled high-water marks so that
|
||||
//! `assemble_and_advance_session` can inject only events the LLM has not yet
|
||||
//! seen and advance the marks atomically within a single CRDT lock acquisition.
|
||||
|
||||
use std::collections::{BTreeMap, BTreeSet};
|
||||
|
||||
use bft_json_crdt::json_crdt::{JsonValue, *};
|
||||
use bft_json_crdt::op::ROOT_ID;
|
||||
use serde_json::json;
|
||||
|
||||
use super::super::state::{apply_and_persist, get_crdt, rebuild_llm_session_index};
|
||||
use super::super::types::{LlmSessionCrdt, LlmSessionView, ScopeFilter};
|
||||
use super::event_log::GAP_PIPELINE_EVENT;
|
||||
|
||||
/// Write or upsert an LLM session entry keyed by `persona`.
|
||||
///
|
||||
/// Creates a new entry if `persona` is not yet present; updates `scope` on an
|
||||
/// existing entry. The `high_water` register is not touched by this function —
|
||||
/// use `assemble_and_advance_session` to advance it atomically.
|
||||
///
|
||||
/// The `scope` string must be in wire form: `"all"` for [`ScopeFilter::All`]
|
||||
/// or `"sleds:hex1,hex2"` for [`ScopeFilter::Sleds`].
|
||||
pub fn write_llm_session(persona: &str, scope: &str) {
|
||||
let Some(state_mutex) = get_crdt() else {
|
||||
return;
|
||||
};
|
||||
let Ok(mut state) = state_mutex.lock() else {
|
||||
return;
|
||||
};
|
||||
|
||||
if let Some(&idx) = state.llm_session_index.get(persona) {
|
||||
apply_and_persist(&mut state, |s| {
|
||||
s.crdt.doc.llm_sessions[idx]
|
||||
.persona_name
|
||||
.set(persona.to_string())
|
||||
});
|
||||
apply_and_persist(&mut state, |s| {
|
||||
s.crdt.doc.llm_sessions[idx].scope.set(scope.to_string())
|
||||
});
|
||||
} else {
|
||||
let entry: JsonValue = json!({
|
||||
"session_id": persona,
|
||||
"persona_name": persona,
|
||||
"scope": scope,
|
||||
"high_water": "{}",
|
||||
})
|
||||
.into();
|
||||
apply_and_persist(&mut state, |s| {
|
||||
s.crdt.doc.llm_sessions.insert(ROOT_ID, entry)
|
||||
});
|
||||
state.llm_session_index = rebuild_llm_session_index(&state.crdt);
|
||||
}
|
||||
}
|
||||
|
||||
/// Read a single LLM session entry by persona name.
|
||||
pub fn read_llm_session(persona: &str) -> Option<LlmSessionView> {
|
||||
let state_mutex = get_crdt()?;
|
||||
let state = state_mutex.lock().ok()?;
|
||||
let &idx = state.llm_session_index.get(persona)?;
|
||||
extract_llm_session_view(&state.crdt.doc.llm_sessions[idx])
|
||||
}
|
||||
|
||||
/// Atomically read new event-log entries for `persona` past the stored
|
||||
/// high-water marks, render them as a block of audit lines, and advance the
|
||||
/// marks to prevent double-injection on the next call.
|
||||
///
|
||||
/// The set of sleds whose events are collected is determined by the persona's
|
||||
/// [`ScopeFilter`]:
|
||||
/// - [`ScopeFilter::All`]: events from every sled present in the event log are
|
||||
/// included — this is the gateway-level persona default that gives a full
|
||||
/// cross-sled view.
|
||||
/// - [`ScopeFilter::Sleds`]: only events whose `sled_id` is in the stored set
|
||||
/// are included. When the stored set is empty (legacy `"single-sled"` rows or
|
||||
/// freshly created sessions with no explicit scope), the local node's sled ID
|
||||
/// is used as the sole member, preserving prior single-sled behaviour.
|
||||
///
|
||||
/// Returns an empty `Vec` when there are no new events or the CRDT is not
|
||||
/// initialised.
|
||||
pub fn assemble_and_advance_session(persona: &str) -> Vec<String> {
|
||||
let local_sled_id = crate::crdt_state::our_node_id().unwrap_or_default();
|
||||
|
||||
let Some(state_mutex) = get_crdt() else {
|
||||
return Vec::new();
|
||||
};
|
||||
let Ok(mut state) = state_mutex.lock() else {
|
||||
return Vec::new();
|
||||
};
|
||||
|
||||
// Determine the persona's scope filter and current high-water map.
|
||||
let (scope_filter, current_high_water) = match state.llm_session_index.get(persona).copied() {
|
||||
Some(idx) => {
|
||||
let filter = parse_scope(&state.crdt.doc.llm_sessions[idx], &local_sled_id);
|
||||
let hw = parse_high_water(&state.crdt.doc.llm_sessions[idx]);
|
||||
(filter, hw)
|
||||
}
|
||||
None => {
|
||||
// New session with no stored entry: default to local sled only.
|
||||
let mut ids = BTreeSet::new();
|
||||
if !local_sled_id.is_empty() {
|
||||
ids.insert(local_sled_id.clone());
|
||||
}
|
||||
(ScopeFilter::Sleds(ids), BTreeMap::new())
|
||||
}
|
||||
};
|
||||
|
||||
// Build the set of sled IDs to collect events from.
|
||||
let target_sleds: BTreeSet<String> = match &scope_filter {
|
||||
ScopeFilter::All => {
|
||||
// Collect every unique sled_id present in the event log at this moment
|
||||
// (live, not snapshotted — picks up newly adopted sleds automatically).
|
||||
state
|
||||
.crdt
|
||||
.doc
|
||||
.event_log
|
||||
.iter()
|
||||
.filter_map(|e| match e.sled_id.view() {
|
||||
JsonValue::String(s) if !s.is_empty() => Some(s),
|
||||
_ => None,
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
ScopeFilter::Sleds(ids) if ids.is_empty() => {
|
||||
// Empty set → legacy fallback: local sled only.
|
||||
if local_sled_id.is_empty() {
|
||||
return Vec::new();
|
||||
}
|
||||
std::iter::once(local_sled_id.clone()).collect()
|
||||
}
|
||||
ScopeFilter::Sleds(ids) => ids.clone(),
|
||||
};
|
||||
|
||||
if target_sleds.is_empty() {
|
||||
return Vec::new();
|
||||
}
|
||||
|
||||
// Collect new events from each target sled past its high-water mark.
|
||||
let mut new_events: Vec<(f64, String, String, String, String, String)> = state
|
||||
.crdt
|
||||
.doc
|
||||
.event_log
|
||||
.iter()
|
||||
.filter_map(|e| extract_new_event_multi(e, &target_sleds, ¤t_high_water))
|
||||
.collect();
|
||||
|
||||
if new_events.is_empty() {
|
||||
return Vec::new();
|
||||
}
|
||||
|
||||
// Sort by (sled_id, event_seq) for deterministic ordering.
|
||||
new_events.sort_by(|a, b| {
|
||||
a.1.cmp(&b.1)
|
||||
.then(a.0.partial_cmp(&b.0).unwrap_or(std::cmp::Ordering::Equal))
|
||||
});
|
||||
|
||||
// Advance the high-water mark for each sled that had new events.
|
||||
let mut new_high_water = current_high_water;
|
||||
for (seq, sled_id, ..) in &new_events {
|
||||
let entry = new_high_water.entry(sled_id.clone()).or_insert(0);
|
||||
if *seq as u64 > *entry {
|
||||
*entry = *seq as u64;
|
||||
}
|
||||
}
|
||||
let new_hw_json = serde_json::to_string(&new_high_water).unwrap_or_else(|_| "{}".to_string());
|
||||
|
||||
// Upsert the persona entry with the new high-water value.
|
||||
let idx_opt = state.llm_session_index.get(persona).copied();
|
||||
if let Some(idx) = idx_opt {
|
||||
apply_and_persist(&mut state, |s| {
|
||||
s.crdt.doc.llm_sessions[idx]
|
||||
.high_water
|
||||
.set(new_hw_json.clone())
|
||||
});
|
||||
} else {
|
||||
let scope_str = scope_filter.to_scope_str();
|
||||
let entry: JsonValue = json!({
|
||||
"session_id": persona,
|
||||
"persona_name": persona,
|
||||
"scope": scope_str,
|
||||
"high_water": new_hw_json,
|
||||
})
|
||||
.into();
|
||||
apply_and_persist(&mut state, |s| {
|
||||
s.crdt.doc.llm_sessions.insert(ROOT_ID, entry)
|
||||
});
|
||||
state.llm_session_index = rebuild_llm_session_index(&state.crdt);
|
||||
}
|
||||
|
||||
// Observability: log event-log size and gap count across the persona's
|
||||
// target sleds (the scope actually assembled for this persona).
|
||||
let total_entries = state
|
||||
.crdt
|
||||
.doc
|
||||
.event_log
|
||||
.iter()
|
||||
.filter(|e| matches!(e.sled_id.view(), JsonValue::String(s) if target_sleds.contains(&s)))
|
||||
.count();
|
||||
let gap_count = state
|
||||
.crdt
|
||||
.doc
|
||||
.event_log
|
||||
.iter()
|
||||
.filter(|e| {
|
||||
matches!(e.sled_id.view(), JsonValue::String(s) if target_sleds.contains(&s))
|
||||
&& matches!(e.pipeline_event.view(), JsonValue::String(s) if s == GAP_PIPELINE_EVENT)
|
||||
})
|
||||
.count();
|
||||
crate::slog!(
|
||||
"[event-log] assemble persona={persona} sled_entries={total_entries} gap_count={gap_count}"
|
||||
);
|
||||
|
||||
// Render each new event as a compact audit line; gap sentinels get a
|
||||
// human-readable message so the LLM is never presented with raw field data.
|
||||
new_events
|
||||
.into_iter()
|
||||
.map(
|
||||
|(_, sled_id, story_id, from_stage, to_stage, pipeline_event)| {
|
||||
if pipeline_event == GAP_PIPELINE_EVENT {
|
||||
format!("events between {from_stage} and {to_stage} were dropped")
|
||||
} else {
|
||||
format!(
|
||||
"pipeline_event sled_id=\"{sled_id}\" story_id=\"{story_id}\" \
|
||||
from=\"{from_stage}\" to=\"{to_stage}\" event=\"{pipeline_event}\""
|
||||
)
|
||||
}
|
||||
},
|
||||
)
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Decode the high-water JSON string from an `LlmSessionCrdt` entry.
|
||||
fn parse_high_water(entry: &LlmSessionCrdt) -> BTreeMap<String, u64> {
|
||||
match entry.high_water.view() {
|
||||
JsonValue::String(s) if !s.is_empty() && s != "{}" => {
|
||||
serde_json::from_str(&s).unwrap_or_default()
|
||||
}
|
||||
_ => BTreeMap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Parse the scope filter from an `LlmSessionCrdt` entry, falling back to
|
||||
/// a single-element set containing `local_sled_id` for legacy / empty scope strings.
|
||||
fn parse_scope(entry: &LlmSessionCrdt, local_sled_id: &str) -> ScopeFilter {
|
||||
let raw = match entry.scope.view() {
|
||||
JsonValue::String(s) => s,
|
||||
_ => String::new(),
|
||||
};
|
||||
let filter = ScopeFilter::from_scope_str(&raw);
|
||||
// For a Sleds filter with an empty set (legacy "single-sled" or ""),
|
||||
// fall back to the local sled.
|
||||
if let ScopeFilter::Sleds(ref ids) = filter
|
||||
&& ids.is_empty()
|
||||
&& !local_sled_id.is_empty()
|
||||
{
|
||||
let mut fallback = BTreeSet::new();
|
||||
fallback.insert(local_sled_id.to_string());
|
||||
return ScopeFilter::Sleds(fallback);
|
||||
}
|
||||
filter
|
||||
}
|
||||
|
||||
/// Extract one event log entry if its `sled_id` is in `target_sleds` and its
|
||||
/// `event_seq` is strictly greater than the matching high-water value (or no
|
||||
/// high-water has been recorded yet for that sled).
|
||||
///
|
||||
/// Returns `(event_seq, sled_id, story_id, from_stage, to_stage, pipeline_event)`.
|
||||
fn extract_new_event_multi(
|
||||
e: &crate::crdt_state::types::EventLogEntryCrdt,
|
||||
target_sleds: &BTreeSet<String>,
|
||||
high_water: &BTreeMap<String, u64>,
|
||||
) -> Option<(f64, String, String, String, String, String)> {
|
||||
let sled_id = match e.sled_id.view() {
|
||||
JsonValue::String(s) if !s.is_empty() && target_sleds.contains(&s) => s,
|
||||
_ => return None,
|
||||
};
|
||||
let event_seq = match e.event_seq.view() {
|
||||
JsonValue::Number(n) => n,
|
||||
_ => return None,
|
||||
};
|
||||
let last_seen = high_water.get(&sled_id).copied();
|
||||
if last_seen.is_some_and(|last| event_seq as u64 <= last) {
|
||||
return None;
|
||||
}
|
||||
let story_id = match e.story_id.view() {
|
||||
JsonValue::String(s) => s,
|
||||
_ => String::new(),
|
||||
};
|
||||
let from_stage = match e.from_stage.view() {
|
||||
JsonValue::String(s) => s,
|
||||
_ => String::new(),
|
||||
};
|
||||
let to_stage = match e.to_stage.view() {
|
||||
JsonValue::String(s) => s,
|
||||
_ => String::new(),
|
||||
};
|
||||
let pipeline_event = match e.pipeline_event.view() {
|
||||
JsonValue::String(s) => s,
|
||||
_ => String::new(),
|
||||
};
|
||||
Some((
|
||||
event_seq,
|
||||
sled_id,
|
||||
story_id,
|
||||
from_stage,
|
||||
to_stage,
|
||||
pipeline_event,
|
||||
))
|
||||
}
|
||||
|
||||
/// Convert a CRDT LLM session entry into its read-only view representation.
|
||||
pub(super) fn extract_llm_session_view(entry: &LlmSessionCrdt) -> Option<LlmSessionView> {
|
||||
let session_id = match entry.session_id.view() {
|
||||
JsonValue::String(s) if !s.is_empty() => s,
|
||||
_ => return None,
|
||||
};
|
||||
let persona_name = match entry.persona_name.view() {
|
||||
JsonValue::String(s) => s,
|
||||
_ => String::new(),
|
||||
};
|
||||
let local_sled_id = crate::crdt_state::our_node_id().unwrap_or_default();
|
||||
let scope_filter = parse_scope(entry, &local_sled_id);
|
||||
let high_water = parse_high_water(entry);
|
||||
Some(LlmSessionView {
|
||||
session_id,
|
||||
persona_name,
|
||||
scope_filter,
|
||||
high_water,
|
||||
})
|
||||
}
|
||||
@@ -14,7 +14,9 @@ use bft_json_crdt::op::OpId;
|
||||
|
||||
mod active_agents;
|
||||
mod agent_throttle;
|
||||
mod event_log;
|
||||
mod gateway_projects;
|
||||
mod llm_sessions;
|
||||
mod merge_jobs;
|
||||
mod test_jobs;
|
||||
mod tokens;
|
||||
@@ -28,9 +30,14 @@ pub use active_agents::{
|
||||
pub use agent_throttle::{
|
||||
delete_agent_throttle, read_agent_throttle, read_all_agent_throttles, write_agent_throttle,
|
||||
};
|
||||
pub use event_log::{
|
||||
EventLogEntryRaw, GAP_PIPELINE_EVENT, append_event_log_entry, append_gap_log_entry,
|
||||
read_all_event_log_entries,
|
||||
};
|
||||
pub use gateway_projects::{
|
||||
delete_gateway_project, read_all_gateway_projects, read_gateway_project, write_gateway_project,
|
||||
};
|
||||
pub use llm_sessions::{assemble_and_advance_session, read_llm_session, write_llm_session};
|
||||
pub use merge_jobs::{delete_merge_job, read_all_merge_jobs, read_merge_job, write_merge_job};
|
||||
pub use test_jobs::{delete_test_job, read_all_test_jobs, read_test_job, write_test_job};
|
||||
pub use tokens::{delete_token_usage, read_all_token_usage, read_token_usage, write_token_usage};
|
||||
|
||||
@@ -28,12 +28,14 @@ mod write;
|
||||
|
||||
pub use gateway_config::{read_gateway_active_project, write_gateway_active_project};
|
||||
pub use lww_maps::{
|
||||
delete_active_agent, delete_agent_throttle, delete_gateway_project, delete_merge_job,
|
||||
delete_test_job, delete_token_usage, read_active_agent, read_agent_throttle,
|
||||
read_all_active_agents, read_all_agent_throttles, read_all_gateway_projects,
|
||||
read_all_merge_jobs, read_all_test_jobs, read_all_token_usage, read_gateway_project,
|
||||
read_merge_job, read_test_job, read_token_usage, write_active_agent, write_agent_throttle,
|
||||
write_gateway_project, write_merge_job, write_test_job, write_token_usage,
|
||||
EventLogEntryRaw, GAP_PIPELINE_EVENT, append_event_log_entry, append_gap_log_entry,
|
||||
assemble_and_advance_session, delete_active_agent, delete_agent_throttle,
|
||||
delete_gateway_project, delete_merge_job, delete_test_job, delete_token_usage,
|
||||
read_active_agent, read_agent_throttle, read_all_active_agents, read_all_agent_throttles,
|
||||
read_all_event_log_entries, read_all_gateway_projects, read_all_merge_jobs, read_all_test_jobs,
|
||||
read_all_token_usage, read_gateway_project, read_llm_session, read_merge_job, read_test_job,
|
||||
read_token_usage, write_active_agent, write_agent_throttle, write_gateway_project,
|
||||
write_llm_session, write_merge_job, write_test_job, write_token_usage,
|
||||
};
|
||||
pub use ops::{all_ops_json, apply_remote_op, ops_since, our_vector_clock, subscribe_ops};
|
||||
pub use presence::{
|
||||
@@ -45,12 +47,14 @@ pub use read::{
|
||||
dep_is_archived_crdt, dep_is_done_crdt, dump_crdt_state, evict_item, is_tombstoned,
|
||||
read_all_items, read_item, tombstoned_ids,
|
||||
};
|
||||
pub(crate) use state::flush_persistence;
|
||||
pub use state::{init, subscribe};
|
||||
pub use types::{
|
||||
ActiveAgentCrdt, ActiveAgentView, AgentThrottleCrdt, AgentThrottleView, CrdtEvent, EpicId,
|
||||
GatewayConfigCrdt, GatewayProjectCrdt, GatewayProjectView, MergeJobCrdt, MergeJobView,
|
||||
NodePresenceCrdt, NodePresenceView, PipelineDoc, PipelineItemCrdt, PipelineItemView,
|
||||
TestJobCrdt, TestJobView, TokenUsageCrdt, TokenUsageView, WorkItem,
|
||||
EventLogEntryCrdt, GatewayConfigCrdt, GatewayProjectCrdt, GatewayProjectView, LlmSessionCrdt,
|
||||
LlmSessionView, MergeJobCrdt, MergeJobView, NodePresenceCrdt, NodePresenceView, PipelineDoc,
|
||||
PipelineItemCrdt, PipelineItemView, ScopeFilter, TestJobCrdt, TestJobView, TokenUsageCrdt,
|
||||
TokenUsageView, WorkItem,
|
||||
};
|
||||
pub use write::{
|
||||
bump_retry_count, migrate_legacy_stage_strings, migrate_merge_job, migrate_names_from_slugs,
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
|
||||
#![allow(unused_imports, dead_code)]
|
||||
use std::collections::HashMap;
|
||||
use std::sync::atomic::Ordering;
|
||||
|
||||
use super::hex;
|
||||
use bft_json_crdt::json_crdt::*;
|
||||
@@ -10,9 +11,10 @@ use tokio::sync::broadcast;
|
||||
|
||||
use super::VectorClock;
|
||||
use super::state::{
|
||||
SYNC_TX, all_ops_lock, apply_and_persist, emit_event, get_crdt, rebuild_active_agent_index,
|
||||
rebuild_agent_throttle_index, rebuild_index, rebuild_merge_job_index, rebuild_node_index,
|
||||
rebuild_test_job_index, rebuild_token_index, track_op, vector_clock_lock,
|
||||
PERSIST_PENDING, PersistMsg, SYNC_TX, all_ops_lock, apply_and_persist, emit_event, get_crdt,
|
||||
rebuild_active_agent_index, rebuild_agent_throttle_index, rebuild_index,
|
||||
rebuild_merge_job_index, rebuild_node_index, rebuild_test_job_index, rebuild_token_index,
|
||||
track_op, vector_clock_lock,
|
||||
};
|
||||
use super::types::{CrdtEvent, PipelineDoc};
|
||||
use crate::slog;
|
||||
@@ -116,9 +118,15 @@ pub fn apply_remote_op(op: SignedOp) -> bool {
|
||||
}
|
||||
|
||||
// Persist the op.
|
||||
if let Err(e) = state.persist_tx.send(op.clone()) {
|
||||
if state
|
||||
.persist_tx
|
||||
.send(PersistMsg::Op(Box::new(op.clone())))
|
||||
.is_ok()
|
||||
{
|
||||
PERSIST_PENDING.fetch_add(1, Ordering::Relaxed);
|
||||
} else {
|
||||
crate::slog_error!(
|
||||
"[crdt] Failed to send remote op to persist task: {e}; persist task may be dead. \
|
||||
"[crdt] Failed to send remote op to persist task; persist task may be dead. \
|
||||
In-memory state is now ahead of persisted state."
|
||||
);
|
||||
}
|
||||
|
||||
@@ -6,7 +6,9 @@ use std::collections::HashMap;
|
||||
use bft_json_crdt::json_crdt::*;
|
||||
use bft_json_crdt::op::{OpId, ROOT_ID};
|
||||
|
||||
use super::state::{all_ops_lock, apply_and_persist, get_crdt, rebuild_index};
|
||||
use std::sync::atomic::Ordering;
|
||||
|
||||
use super::state::{PERSIST_PENDING, all_ops_lock, apply_and_persist, get_crdt, rebuild_index};
|
||||
use super::types::{PipelineDoc, PipelineItemCrdt, PipelineItemView};
|
||||
|
||||
// ── Debug dump ───────────────────────────────────────────────────────
|
||||
@@ -31,6 +33,8 @@ pub struct CrdtItemDump {
|
||||
pub is_deleted: bool,
|
||||
/// Origin JSON string, or `None` for items that pre-date story 1088.
|
||||
pub origin: Option<String>,
|
||||
/// Explicit item type register, or `None` when unset (infer from story_id prefix).
|
||||
pub item_type: Option<String>,
|
||||
}
|
||||
|
||||
/// Top-level debug dump of the in-memory CRDT state.
|
||||
@@ -44,6 +48,8 @@ pub struct CrdtStateDump {
|
||||
pub max_seq_in_list: u64,
|
||||
/// Count of ops in the ALL_OPS journal (persisted ops replayed at startup).
|
||||
pub persisted_ops_count: usize,
|
||||
/// Count of ops queued in the persistence channel not yet written to SQLite.
|
||||
pub pending_persist_ops_count: usize,
|
||||
pub items: Vec<CrdtItemDump>,
|
||||
}
|
||||
|
||||
@@ -61,6 +67,7 @@ pub fn dump_crdt_state(story_id_filter: Option<&str>) -> CrdtStateDump {
|
||||
let persisted_ops_count = all_ops_lock()
|
||||
.and_then(|m| m.lock().ok().map(|v| v.len()))
|
||||
.unwrap_or(0);
|
||||
let pending_persist_ops_count = PERSIST_PENDING.load(Ordering::Relaxed);
|
||||
|
||||
let Some(state_mutex) = get_crdt() else {
|
||||
return CrdtStateDump {
|
||||
@@ -69,6 +76,7 @@ pub fn dump_crdt_state(story_id_filter: Option<&str>) -> CrdtStateDump {
|
||||
total_ops_in_list: 0,
|
||||
max_seq_in_list: 0,
|
||||
persisted_ops_count,
|
||||
pending_persist_ops_count,
|
||||
items: Vec::new(),
|
||||
};
|
||||
};
|
||||
@@ -80,6 +88,7 @@ pub fn dump_crdt_state(story_id_filter: Option<&str>) -> CrdtStateDump {
|
||||
total_ops_in_list: 0,
|
||||
max_seq_in_list: 0,
|
||||
persisted_ops_count,
|
||||
pending_persist_ops_count,
|
||||
items: Vec::new(),
|
||||
};
|
||||
};
|
||||
@@ -155,6 +164,10 @@ pub fn dump_crdt_state(story_id_filter: Option<&str>) -> CrdtStateDump {
|
||||
JsonValue::String(s) if !s.is_empty() => Some(s),
|
||||
_ => None,
|
||||
};
|
||||
let item_type = match item_crdt.item_type.view() {
|
||||
JsonValue::String(s) if !s.is_empty() => Some(s),
|
||||
_ => None,
|
||||
};
|
||||
|
||||
let content_index = op.id.iter().map(|b| format!("{b:02x}")).collect::<String>();
|
||||
|
||||
@@ -170,6 +183,7 @@ pub fn dump_crdt_state(story_id_filter: Option<&str>) -> CrdtStateDump {
|
||||
content_index,
|
||||
is_deleted: op.is_deleted,
|
||||
origin,
|
||||
item_type,
|
||||
});
|
||||
}
|
||||
|
||||
@@ -179,6 +193,7 @@ pub fn dump_crdt_state(story_id_filter: Option<&str>) -> CrdtStateDump {
|
||||
total_ops_in_list,
|
||||
max_seq_in_list,
|
||||
persisted_ops_count,
|
||||
pending_persist_ops_count,
|
||||
items,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,11 +5,13 @@
|
||||
//! it to the live document, sends it to the persistence channel, and broadcasts
|
||||
//! it to sync peers via [`super::SYNC_TX`].
|
||||
|
||||
use std::sync::atomic::Ordering;
|
||||
|
||||
use bft_json_crdt::json_crdt::JsonValue;
|
||||
use bft_json_crdt::op::Op;
|
||||
|
||||
use super::super::types::CrdtEvent;
|
||||
use super::{CrdtState, statics};
|
||||
use super::{CrdtState, init::PersistMsg, statics};
|
||||
|
||||
/// Create a CRDT op via `op_fn`, sign it, apply it, and send it to the
|
||||
/// persistence channel. The closure receives `&mut CrdtState` so it can
|
||||
@@ -21,7 +23,13 @@ where
|
||||
let raw_op = op_fn(state);
|
||||
let signed = raw_op.sign(&state.keypair);
|
||||
state.crdt.apply(signed.clone());
|
||||
if state.persist_tx.send(signed.clone()).is_err() {
|
||||
if state
|
||||
.persist_tx
|
||||
.send(PersistMsg::Op(Box::new(signed.clone())))
|
||||
.is_ok()
|
||||
{
|
||||
statics::PERSIST_PENDING.fetch_add(1, Ordering::Relaxed);
|
||||
} else {
|
||||
let op_type = if signed.inner.is_deleted {
|
||||
"Delete"
|
||||
} else {
|
||||
|
||||
@@ -113,3 +113,16 @@ pub(in crate::crdt_state) fn rebuild_gateway_project_index(
|
||||
}
|
||||
map
|
||||
}
|
||||
|
||||
/// Rebuild the session_id → llm_sessions list index.
|
||||
pub(in crate::crdt_state) fn rebuild_llm_session_index(
|
||||
crdt: &BaseCrdt<PipelineDoc>,
|
||||
) -> HashMap<String, usize> {
|
||||
let mut map = HashMap::new();
|
||||
for (i, entry) in crdt.doc.llm_sessions.iter().enumerate() {
|
||||
if let JsonValue::String(ref k) = entry.session_id.view() {
|
||||
map.insert(k.clone(), i);
|
||||
}
|
||||
}
|
||||
map
|
||||
}
|
||||
|
||||
@@ -8,25 +8,34 @@
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::path::Path;
|
||||
use std::sync::Mutex;
|
||||
use std::sync::atomic::Ordering;
|
||||
|
||||
use bft_json_crdt::json_crdt::{BaseCrdt, CrdtNode, JsonValue, SignedOp};
|
||||
use bft_json_crdt::keypair::{Ed25519KeyPair, make_keypair};
|
||||
use sqlx::SqlitePool;
|
||||
use sqlx::sqlite::SqliteConnectOptions;
|
||||
use tokio::sync::{broadcast, mpsc};
|
||||
use tokio::sync::{broadcast, mpsc, oneshot};
|
||||
|
||||
use super::super::VectorClock;
|
||||
use super::super::hex;
|
||||
use super::super::types::{CrdtEvent, PipelineDoc};
|
||||
use super::indices::{
|
||||
rebuild_active_agent_index, rebuild_agent_throttle_index, rebuild_gateway_project_index,
|
||||
rebuild_index, rebuild_merge_job_index, rebuild_node_index, rebuild_test_job_index,
|
||||
rebuild_token_index,
|
||||
rebuild_index, rebuild_llm_session_index, rebuild_merge_job_index, rebuild_node_index,
|
||||
rebuild_test_job_index, rebuild_token_index,
|
||||
};
|
||||
use super::statics::{ALL_OPS, CRDT_EVENT_TX, SYNC_TX, VECTOR_CLOCK};
|
||||
use super::statics::{ALL_OPS, CRDT_EVENT_TX, PERSIST_PENDING, SYNC_TX, VECTOR_CLOCK};
|
||||
use super::{CRDT_STATE, CrdtState};
|
||||
use crate::slog;
|
||||
|
||||
/// Message type for the persistence background channel.
///
/// Values are sent over the unbounded `persist_tx` channel and handled one at
/// a time, in arrival order, by the background persistence task.
pub(crate) enum PersistMsg {
    /// Persist this op to SQLite.
    ///
    /// NOTE(review): presumably boxed to keep the enum small compared with the
    /// `Flush` variant — confirm against `SignedOp`'s size.
    Op(Box<SignedOp>),
    /// Drain: signal the sender after all preceding ops are committed.
    ///
    /// The task replies on the enclosed one-shot sender when it reaches this
    /// message; the reply carries no payload.
    Flush(oneshot::Sender<()>),
}
|
||||
|
||||
/// Initialise the CRDT state layer.
|
||||
///
|
||||
/// Opens the SQLite database, loads or creates a node keypair, replays any
|
||||
@@ -94,6 +103,7 @@ pub async fn init(db_path: &Path) -> Result<(), sqlx::Error> {
|
||||
let test_job_index = rebuild_test_job_index(&crdt);
|
||||
let agent_throttle_index = rebuild_agent_throttle_index(&crdt);
|
||||
let gateway_project_index = rebuild_gateway_project_index(&crdt);
|
||||
let llm_session_index = rebuild_llm_session_index(&crdt);
|
||||
|
||||
// Advance the top-level list clocks to the Lamport floor so that
|
||||
// list-level inserts don't re-emit low seq numbers.
|
||||
@@ -105,6 +115,7 @@ pub async fn init(db_path: &Path) -> Result<(), sqlx::Error> {
|
||||
crdt.doc.test_jobs.advance_seq(lamport_floor);
|
||||
crdt.doc.agent_throttle.advance_seq(lamport_floor);
|
||||
crdt.doc.gateway_projects.advance_seq(lamport_floor);
|
||||
crdt.doc.llm_sessions.advance_seq(lamport_floor);
|
||||
crdt.doc
|
||||
.gateway_config
|
||||
.active_project
|
||||
@@ -119,35 +130,46 @@ pub async fn init(db_path: &Path) -> Result<(), sqlx::Error> {
|
||||
);
|
||||
|
||||
// Spawn background persistence task.
|
||||
let (persist_tx, mut persist_rx) = mpsc::unbounded_channel::<SignedOp>();
|
||||
let (persist_tx, mut persist_rx) = mpsc::unbounded_channel::<PersistMsg>();
|
||||
|
||||
tokio::spawn(async move {
|
||||
while let Some(op) = persist_rx.recv().await {
|
||||
let op_json = match serde_json::to_string(&op) {
|
||||
Ok(j) => j,
|
||||
Err(e) => {
|
||||
slog!("[crdt] Failed to serialize op: {e}");
|
||||
continue;
|
||||
while let Some(msg) = persist_rx.recv().await {
|
||||
match msg {
|
||||
PersistMsg::Op(op) => {
|
||||
let op = *op;
|
||||
let op_json = match serde_json::to_string(&op) {
|
||||
Ok(j) => j,
|
||||
Err(e) => {
|
||||
slog!("[crdt] Failed to serialize op: {e}");
|
||||
PERSIST_PENDING.fetch_sub(1, Ordering::Relaxed);
|
||||
continue;
|
||||
}
|
||||
};
|
||||
let op_id = hex::encode(&op.id());
|
||||
let seq = op.inner.seq as i64;
|
||||
let now = chrono::Utc::now().to_rfc3339();
|
||||
|
||||
let result = sqlx::query(
|
||||
"INSERT INTO crdt_ops (op_id, seq, op_json, created_at) \
|
||||
VALUES (?1, ?2, ?3, ?4) \
|
||||
ON CONFLICT(op_id) DO NOTHING",
|
||||
)
|
||||
.bind(&op_id)
|
||||
.bind(seq)
|
||||
.bind(&op_json)
|
||||
.bind(&now)
|
||||
.execute(&pool)
|
||||
.await;
|
||||
|
||||
if let Err(e) = result {
|
||||
slog!("[crdt] Failed to persist op {}: {e}", &op_id[..12]);
|
||||
}
|
||||
PERSIST_PENDING.fetch_sub(1, Ordering::Relaxed);
|
||||
}
|
||||
PersistMsg::Flush(reply) => {
|
||||
// All ops queued before this message have already been processed.
|
||||
let _ = reply.send(());
|
||||
}
|
||||
};
|
||||
let op_id = hex::encode(&op.id());
|
||||
let seq = op.inner.seq as i64;
|
||||
let now = chrono::Utc::now().to_rfc3339();
|
||||
|
||||
let result = sqlx::query(
|
||||
"INSERT INTO crdt_ops (op_id, seq, op_json, created_at) \
|
||||
VALUES (?1, ?2, ?3, ?4) \
|
||||
ON CONFLICT(op_id) DO NOTHING",
|
||||
)
|
||||
.bind(&op_id)
|
||||
.bind(seq)
|
||||
.bind(&op_json)
|
||||
.bind(&now)
|
||||
.execute(&pool)
|
||||
.await;
|
||||
|
||||
if let Err(e) = result {
|
||||
slog!("[crdt] Failed to persist op {}: {e}", &op_id[..12]);
|
||||
}
|
||||
}
|
||||
});
|
||||
@@ -163,6 +185,7 @@ pub async fn init(db_path: &Path) -> Result<(), sqlx::Error> {
|
||||
test_job_index,
|
||||
agent_throttle_index,
|
||||
gateway_project_index,
|
||||
llm_session_index,
|
||||
persist_tx,
|
||||
lamport_floor,
|
||||
tombstones,
|
||||
@@ -181,6 +204,43 @@ pub async fn init(db_path: &Path) -> Result<(), sqlx::Error> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Signal the persistence background task to drain and wait until all currently-queued
|
||||
/// ops have been written to SQLite, or until `timeout` elapses.
|
||||
///
|
||||
/// Because the persistence channel is FIFO, a `Flush` sentinel processed by the task
|
||||
/// guarantees that every `Op` sent before it has already been committed. On timeout a
|
||||
/// warning is logged with the queue depth so regressions are visible in logs.
|
||||
pub(crate) async fn flush_persistence(timeout: std::time::Duration) {
|
||||
let Some(state_mutex) = super::get_crdt() else {
|
||||
return;
|
||||
};
|
||||
let persist_tx = {
|
||||
let Ok(state) = state_mutex.lock() else {
|
||||
return;
|
||||
};
|
||||
state.persist_tx.clone()
|
||||
};
|
||||
let pending_at_send = PERSIST_PENDING.load(Ordering::Relaxed);
|
||||
let (tx, rx) = oneshot::channel();
|
||||
if persist_tx.send(PersistMsg::Flush(tx)).is_err() {
|
||||
slog!("[rebuild] Persistence channel closed — skipping flush");
|
||||
return;
|
||||
}
|
||||
match tokio::time::timeout(timeout, rx).await {
|
||||
Ok(_) => {
|
||||
slog!("[rebuild] Persistence channel drained ({pending_at_send} ops flushed)");
|
||||
}
|
||||
Err(_) => {
|
||||
let pending_now = PERSIST_PENDING.load(Ordering::Relaxed);
|
||||
slog!(
|
||||
"[rebuild] WARNING: persistence flush timed out after {}ms; \
|
||||
queue_depth_at_send={pending_at_send} queue_depth_now={pending_now}",
|
||||
timeout.as_millis()
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Load or create the Ed25519 keypair used by this node.
|
||||
async fn load_or_create_keypair(pool: &SqlitePool) -> Result<Ed25519KeyPair, sqlx::Error> {
|
||||
let row: Option<(Vec<u8>,)> =
|
||||
|
||||
@@ -27,6 +27,7 @@ mod tests;
|
||||
// ── Re-exports for crdt_state siblings ──────────────────────────────
|
||||
|
||||
pub use init::init;
|
||||
pub(crate) use init::{PersistMsg, flush_persistence};
|
||||
|
||||
/// Subscribe to CRDT state-transition events.
|
||||
///
|
||||
@@ -38,11 +39,11 @@ pub fn subscribe() -> Option<broadcast::Receiver<super::types::CrdtEvent>> {
|
||||
pub(super) use apply::{apply_and_persist, emit_event};
|
||||
pub(super) use indices::{
|
||||
rebuild_active_agent_index, rebuild_agent_throttle_index, rebuild_gateway_project_index,
|
||||
rebuild_index, rebuild_merge_job_index, rebuild_node_index, rebuild_test_job_index,
|
||||
rebuild_token_index,
|
||||
rebuild_index, rebuild_llm_session_index, rebuild_merge_job_index, rebuild_node_index,
|
||||
rebuild_test_job_index, rebuild_token_index,
|
||||
};
|
||||
pub(crate) use statics::{PERSIST_PENDING, all_ops_lock, vector_clock_lock};
|
||||
pub(super) use statics::{SYNC_TX, track_op};
|
||||
pub(crate) use statics::{all_ops_lock, vector_clock_lock};
|
||||
|
||||
// ── CrdtState struct ─────────────────────────────────────────────────
|
||||
|
||||
@@ -66,8 +67,10 @@ pub(super) struct CrdtState {
|
||||
pub(super) agent_throttle_index: HashMap<String, usize>,
|
||||
/// Maps project name → index in the gateway_projects ListCrdt for O(1) lookup.
|
||||
pub(super) gateway_project_index: HashMap<String, usize>,
|
||||
/// Channel sender for fire-and-forget op persistence.
|
||||
pub(super) persist_tx: mpsc::UnboundedSender<SignedOp>,
|
||||
/// Maps session_id → index in the llm_sessions ListCrdt for O(1) lookup.
|
||||
pub(super) llm_session_index: HashMap<String, usize>,
|
||||
/// Channel sender for op persistence and drain signalling.
|
||||
pub(super) persist_tx: mpsc::UnboundedSender<init::PersistMsg>,
|
||||
/// Max sequence number seen across all ops during init() replay.
|
||||
///
|
||||
/// Newly-created registers (post-init) must have their Lamport clock
|
||||
@@ -122,49 +125,58 @@ pub(super) fn get_crdt() -> Option<&'static Mutex<CrdtState>> {
|
||||
/// This avoids the async SQLite setup from `init()`. Ops are sent to a
|
||||
/// channel whose receiver is leaked (so nothing is persisted, but the channel
|
||||
/// stays open and `apply_and_persist` succeeds silently).
|
||||
/// Safe to call multiple times — subsequent calls are no-ops (thread-local).
|
||||
/// Always resets all thread-local state so each call produces a clean slate —
|
||||
/// no cross-test pollution when two tests share the same thread.
|
||||
#[cfg(test)]
|
||||
pub fn init_for_test() {
|
||||
// Initialise thread-local CRDT for test isolation.
|
||||
// Only creates a new CRDT if one isn't set yet on this thread;
|
||||
// subsequent calls are no-ops (matching the old OnceLock semantics
|
||||
// while keeping each thread isolated).
|
||||
let keypair = make_keypair();
|
||||
let crdt = BaseCrdt::<PipelineDoc>::new(&keypair);
|
||||
let (persist_tx, rx) = mpsc::unbounded_channel::<init::PersistMsg>();
|
||||
// Leak the receiver so the channel stays open: apply_and_persist
|
||||
// can then send without error, preventing [crdt_persist] WARNs
|
||||
// from racing with other tests that watch the global log buffer.
|
||||
std::mem::forget(rx);
|
||||
let fresh = CrdtState {
|
||||
crdt,
|
||||
keypair,
|
||||
index: HashMap::new(),
|
||||
node_index: HashMap::new(),
|
||||
token_index: HashMap::new(),
|
||||
merge_job_index: HashMap::new(),
|
||||
active_agent_index: HashMap::new(),
|
||||
test_job_index: HashMap::new(),
|
||||
agent_throttle_index: HashMap::new(),
|
||||
gateway_project_index: HashMap::new(),
|
||||
llm_session_index: HashMap::new(),
|
||||
persist_tx,
|
||||
lamport_floor: 0,
|
||||
tombstones: HashSet::new(),
|
||||
};
|
||||
CRDT_STATE_TL.with(|lock| {
|
||||
if lock.get().is_none() {
|
||||
let keypair = make_keypair();
|
||||
let crdt = BaseCrdt::<PipelineDoc>::new(&keypair);
|
||||
let (persist_tx, rx) = mpsc::unbounded_channel();
|
||||
// Leak the receiver so the channel stays open: apply_and_persist
|
||||
// can then send without error, preventing [crdt_persist] WARNs
|
||||
// from racing with other tests that watch the global log buffer.
|
||||
std::mem::forget(rx);
|
||||
let state = CrdtState {
|
||||
crdt,
|
||||
keypair,
|
||||
index: HashMap::new(),
|
||||
node_index: HashMap::new(),
|
||||
token_index: HashMap::new(),
|
||||
merge_job_index: HashMap::new(),
|
||||
active_agent_index: HashMap::new(),
|
||||
test_job_index: HashMap::new(),
|
||||
agent_throttle_index: HashMap::new(),
|
||||
gateway_project_index: HashMap::new(),
|
||||
persist_tx,
|
||||
lamport_floor: 0,
|
||||
tombstones: HashSet::new(),
|
||||
};
|
||||
let _ = lock.set(Mutex::new(state));
|
||||
if let Some(mutex) = lock.get() {
|
||||
// Already set on this thread — replace contents so the second
|
||||
// (and subsequent) test on the same thread starts clean.
|
||||
*mutex.lock().unwrap() = fresh;
|
||||
} else {
|
||||
let _ = lock.set(Mutex::new(fresh));
|
||||
}
|
||||
});
|
||||
let _ = statics::CRDT_EVENT_TX.get_or_init(|| broadcast::channel::<CrdtEvent>(256).0);
|
||||
let _ = statics::SYNC_TX.get_or_init(|| broadcast::channel::<SignedOp>(1024).0);
|
||||
// Per-thread op journal + vector clock — keeps parallel tests' writes
|
||||
// from corrupting each other's view of ALL_OPS (notably, one thread's
|
||||
// `apply_compaction` could otherwise prune another thread's ops).
|
||||
// Per-thread op journal + vector clock — always cleared so a second test
|
||||
// on the same thread cannot see ops written by the first.
|
||||
statics::ALL_OPS_TL.with(|lock| {
|
||||
let _ = lock.set(Mutex::new(Vec::new()));
|
||||
if let Some(mutex) = lock.get() {
|
||||
mutex.lock().unwrap().clear();
|
||||
} else {
|
||||
let _ = lock.set(Mutex::new(Vec::new()));
|
||||
}
|
||||
});
|
||||
statics::VECTOR_CLOCK_TL.with(|lock| {
|
||||
let _ = lock.set(Mutex::new(VectorClock::new()));
|
||||
if let Some(mutex) = lock.get() {
|
||||
mutex.lock().unwrap().clear();
|
||||
} else {
|
||||
let _ = lock.set(Mutex::new(VectorClock::new()));
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
@@ -10,6 +10,7 @@
|
||||
//! tests do not share `ALL_OPS` — preventing one test's `apply_compaction`
|
||||
//! from pruning another test's freshly-written ops.
|
||||
|
||||
use std::sync::atomic::AtomicUsize;
|
||||
use std::sync::{Mutex, OnceLock};
|
||||
|
||||
use bft_json_crdt::json_crdt::SignedOp;
|
||||
@@ -19,6 +20,14 @@ use super::super::VectorClock;
|
||||
use super::super::hex;
|
||||
use super::super::types::CrdtEvent;
|
||||
|
||||
/// Count of ops queued in the persistence channel that have not yet been written to SQLite.
///
/// Incremented when an op is sent into the channel; decremented after the
/// persistence task commits it.  Exposed via `dump_crdt_state` as
/// `pending_persist_ops_count` so operators can tell whether there is a flush
/// backlog before calling `rebuild_and_restart`.
///
/// The persistence task also decrements the counter when an op fails to
/// serialise or its insert returns an error, so the value tracks ops still
/// waiting in the queue rather than ops successfully stored.
pub(crate) static PERSIST_PENDING: AtomicUsize = AtomicUsize::new(0);
|
||||
|
||||
/// Broadcast channel for CRDT events (stage transitions, etc.).
|
||||
pub(super) static CRDT_EVENT_TX: OnceLock<broadcast::Sender<CrdtEvent>> = OnceLock::new();
|
||||
|
||||
|
||||
@@ -6,6 +6,7 @@
|
||||
use super::super::hex;
|
||||
use super::super::read::extract_item_view;
|
||||
use super::super::types::PipelineDoc;
|
||||
use super::init::PersistMsg;
|
||||
use super::*;
|
||||
use bft_json_crdt::json_crdt::{BaseCrdt, CrdtNode, JsonValue, SignedOp};
|
||||
use bft_json_crdt::keypair::make_keypair;
|
||||
@@ -222,7 +223,7 @@ async fn init_and_write_read_roundtrip() {
|
||||
fn persist_tx_send_failure_logs_warn_with_op_type_and_seq() {
|
||||
let kp = make_keypair();
|
||||
let crdt = BaseCrdt::<PipelineDoc>::new(&kp);
|
||||
let (persist_tx, persist_rx) = mpsc::unbounded_channel::<SignedOp>();
|
||||
let (persist_tx, persist_rx) = mpsc::unbounded_channel::<PersistMsg>();
|
||||
|
||||
let mut state = CrdtState {
|
||||
crdt,
|
||||
@@ -235,6 +236,7 @@ fn persist_tx_send_failure_logs_warn_with_op_type_and_seq() {
|
||||
test_job_index: HashMap::new(),
|
||||
agent_throttle_index: HashMap::new(),
|
||||
gateway_project_index: HashMap::new(),
|
||||
llm_session_index: HashMap::new(),
|
||||
persist_tx,
|
||||
lamport_floor: 0,
|
||||
tombstones: std::collections::HashSet::new(),
|
||||
@@ -296,7 +298,7 @@ fn persist_tx_send_failure_logs_warn_with_op_type_and_seq() {
|
||||
fn persist_tx_send_success_emits_no_warn() {
|
||||
let kp = make_keypair();
|
||||
let crdt = BaseCrdt::<PipelineDoc>::new(&kp);
|
||||
let (persist_tx, _persist_rx) = mpsc::unbounded_channel::<SignedOp>();
|
||||
let (persist_tx, _persist_rx) = mpsc::unbounded_channel::<PersistMsg>();
|
||||
|
||||
let mut state = CrdtState {
|
||||
crdt,
|
||||
@@ -309,6 +311,7 @@ fn persist_tx_send_success_emits_no_warn() {
|
||||
test_job_index: HashMap::new(),
|
||||
agent_throttle_index: HashMap::new(),
|
||||
gateway_project_index: HashMap::new(),
|
||||
llm_session_index: HashMap::new(),
|
||||
persist_tx,
|
||||
lamport_floor: 0,
|
||||
tombstones: std::collections::HashSet::new(),
|
||||
@@ -485,3 +488,102 @@ async fn restart_new_register_resumes_from_lamport_floor() {
|
||||
max_seq,
|
||||
);
|
||||
}
|
||||
|
||||
/// Regression test for story 1116: ops sent before `flush_persistence` must all be
|
||||
/// present in the `crdt_ops` SQLite table after the flush completes.
|
||||
///
|
||||
/// Bug: `rebuild_and_restart` called `exec()` before the persistence task had
|
||||
/// a chance to drain the unbounded channel, silently dropping queued ops.
|
||||
///
|
||||
/// Reproducer: apply N ops → call `rebuild_and_restart` → the process re-execs
|
||||
/// and on the next startup `persisted_ops_count` is < N (lost ops).
|
||||
/// Fixed by: send a `Flush` sentinel through the channel before `exec()`; the
|
||||
/// task echoes back only after all preceding `Op` messages are committed.
|
||||
#[tokio::test]
|
||||
async fn flush_persistence_drains_all_ops_before_ack() {
|
||||
use std::sync::atomic::Ordering;
|
||||
use tokio::sync::oneshot;
|
||||
|
||||
let tmp = tempfile::tempdir().unwrap();
|
||||
let db_path = tmp.path().join("flush_drain_test.db");
|
||||
|
||||
let options = SqliteConnectOptions::new()
|
||||
.filename(&db_path)
|
||||
.create_if_missing(true);
|
||||
let pool = SqlitePool::connect_with(options).await.unwrap();
|
||||
sqlx::migrate!("./migrations").run(&pool).await.unwrap();
|
||||
|
||||
let kp = make_keypair();
|
||||
let mut crdt = BaseCrdt::<PipelineDoc>::new(&kp);
|
||||
|
||||
// Spawn an isolated persistence task — same logic as init() but without
|
||||
// touching the global singleton (keeping this test fully self-contained).
|
||||
let (tx, mut rx) = mpsc::unbounded_channel::<PersistMsg>();
|
||||
let pool_clone = pool.clone();
|
||||
tokio::spawn(async move {
|
||||
use std::sync::atomic::AtomicUsize;
|
||||
let counter = AtomicUsize::new(0);
|
||||
while let Some(msg) = rx.recv().await {
|
||||
match msg {
|
||||
PersistMsg::Op(op) => {
|
||||
let op_json = serde_json::to_string(&op).unwrap();
|
||||
let op_id = hex::encode(&op.id());
|
||||
let seq = op.inner.seq as i64;
|
||||
let now = chrono::Utc::now().to_rfc3339();
|
||||
sqlx::query(
|
||||
"INSERT INTO crdt_ops (op_id, seq, op_json, created_at) \
|
||||
VALUES (?1, ?2, ?3, ?4) ON CONFLICT(op_id) DO NOTHING",
|
||||
)
|
||||
.bind(&op_id)
|
||||
.bind(seq)
|
||||
.bind(&op_json)
|
||||
.bind(&now)
|
||||
.execute(&pool_clone)
|
||||
.await
|
||||
.unwrap();
|
||||
counter.fetch_add(1, Ordering::Relaxed);
|
||||
}
|
||||
PersistMsg::Flush(reply) => {
|
||||
let _ = reply.send(());
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
const N: usize = 10;
|
||||
for i in 0..N {
|
||||
let item: JsonValue = json!({
|
||||
"story_id": format!("1116_drain_{i}"),
|
||||
"stage": "1_backlog",
|
||||
"name": format!("Drain Test {i}"),
|
||||
"agent": "",
|
||||
"retry_count": 0.0,
|
||||
"blocked": false,
|
||||
"depends_on": "",
|
||||
"claimed_by": "",
|
||||
"claimed_at": 0.0,
|
||||
})
|
||||
.into();
|
||||
let op = crdt.doc.items.insert(ROOT_ID, item).sign(&kp);
|
||||
crdt.apply(op.clone());
|
||||
tx.send(PersistMsg::Op(Box::new(op))).unwrap();
|
||||
}
|
||||
|
||||
// Send flush sentinel and wait — all N ops must be committed first.
|
||||
let (flush_tx, flush_rx) = oneshot::channel();
|
||||
tx.send(PersistMsg::Flush(flush_tx)).unwrap();
|
||||
tokio::time::timeout(std::time::Duration::from_secs(5), flush_rx)
|
||||
.await
|
||||
.expect("flush timed out — persistence task did not drain within 5 s")
|
||||
.expect("flush oneshot dropped unexpectedly");
|
||||
|
||||
// Verify all N ops are in the database.
|
||||
let (count,): (i64,) = sqlx::query_as("SELECT COUNT(*) FROM crdt_ops")
|
||||
.fetch_one(&pool)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
count as usize, N,
|
||||
"all {N} ops must be in crdt_ops after flush; got {count}"
|
||||
);
|
||||
}
|
||||
|
||||
@@ -46,6 +46,121 @@ pub struct PipelineDoc {
|
||||
pub agent_throttle: ListCrdt<AgentThrottleCrdt>,
|
||||
pub gateway_projects: ListCrdt<GatewayProjectCrdt>,
|
||||
pub gateway_config: GatewayConfigCrdt,
|
||||
/// Append-only log of every pipeline transition, persisted as CRDT ops.
|
||||
pub event_log: ListCrdt<EventLogEntryCrdt>,
|
||||
/// Per-session LLM context state (high-water marks for event log injection).
|
||||
pub llm_sessions: ListCrdt<LlmSessionCrdt>,
|
||||
}
|
||||
|
||||
/// CRDT entry representing a single persisted pipeline stage-transition event.
///
/// Entries are append-only; once written they are never updated or tombstoned.
/// The `event_seq` field is a per-sled monotonic counter computed at write time
/// (count of existing entries for that sled), giving deterministic ordering for
/// all transitions recorded by a single node even after CRDT replay on restart.
///
/// Every field is a last-writer-wins register.
#[add_crdt_fields]
#[derive(Clone, CrdtNode, Debug)]
pub struct EventLogEntryCrdt {
    /// Monotonic sequence number for this sled (0, 1, 2, …).  Stored as `f64`
    /// because all CRDT scalar registers use JSON numbers.
    pub event_seq: LwwRegisterCrdt<f64>,
    /// Hex-encoded Ed25519 public key of the sled that recorded this event.
    pub sled_id: LwwRegisterCrdt<String>,
    /// Unix timestamp (seconds) when the transition fired.
    ///
    /// NOTE(review): the visible writer stores whole seconds only
    /// (`DateTime::timestamp()` cast to `f64`) — confirm no other writer
    /// records fractional seconds.
    pub timestamp: LwwRegisterCrdt<f64>,
    /// Story ID of the work item that transitioned (e.g. `"42_story_foo"`).
    pub story_id: LwwRegisterCrdt<String>,
    /// Human-readable label of the stage before the transition.
    pub from_stage: LwwRegisterCrdt<String>,
    /// Human-readable label of the stage after the transition.
    pub to_stage: LwwRegisterCrdt<String>,
    /// String label of the `PipelineEvent` variant that triggered the transition.
    pub pipeline_event: LwwRegisterCrdt<String>,
}
|
||||
|
||||
/// CRDT entry tracking an LLM session's event-log injection state.
///
/// Each session (keyed by `session_id`, typically a Matrix room ID) records the
/// per-sled high-water marks so that `assemble_prompt_context` can inject only
/// events the LLM has not yet seen and then advance the marks atomically.
///
/// `scope` and `high_water` are stored as strings; `LlmSessionView` holds their
/// parsed forms (`ScopeFilter` and `BTreeMap<String, u64>` respectively).
#[add_crdt_fields]
#[derive(Clone, CrdtNode, Debug)]
pub struct LlmSessionCrdt {
    /// Stable session identifier (e.g. Matrix room ID).
    pub session_id: LwwRegisterCrdt<String>,
    /// Human-readable persona name (e.g. `"Timmy"`).
    pub persona_name: LwwRegisterCrdt<String>,
    /// Scope wire string parsed by [`ScopeFilter::from_scope_str`]: `"all"`,
    /// `"sleds:hex1,hex2"`, or legacy `"single-sled"` / empty (→ local sled).
    pub scope: LwwRegisterCrdt<String>,
    /// JSON-serialised `BTreeMap<sled_id, last_seen_event_seq>` tracking how far
    /// each sled's event stream has been injected into this session's prompts.
    pub high_water: LwwRegisterCrdt<String>,
}
|
||||
|
||||
/// Which sleds' events an LLM session may see.
///
/// Stored as a compact string in the CRDT register and parsed at read time.
/// The default for a freshly-created session with no stored scope is
/// [`ScopeFilter::Sleds`] with an empty set, which callers resolve to the local
/// sled and which preserves prior single-sled behaviour.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum ScopeFilter {
    /// Include events from every sled present in the CRDT event log.
    ///
    /// Default for gateway-level personas (e.g. Timmy in multi-project mode).
    All,
    /// Include only events whose `sled_id` is in the given set.
    ///
    /// Default for sled-level personas: the set contains only the sled's own ID.
    Sleds(std::collections::BTreeSet<String>),
}

impl ScopeFilter {
    /// Parse a wire-form scope string stored in the CRDT register.
    ///
    /// Surrounding whitespace is ignored, both around the whole string and
    /// around each individual sled ID, so a hand-edited value such as
    /// `"sleds:aa, bb"` does not produce an ID that can never match.
    ///
    /// Recognised forms:
    /// - `"all"` → [`ScopeFilter::All`]
    /// - `"sleds:hex1,hex2,…"` → [`ScopeFilter::Sleds`] (empty IDs are skipped)
    /// - Anything else (including legacy `"single-sled"` and empty) →
    ///   [`ScopeFilter::Sleds`] with an empty set; callers should fall back
    ///   to the local sled ID in that case.
    pub fn from_scope_str(s: &str) -> Self {
        let s = s.trim();
        if s == "all" {
            return ScopeFilter::All;
        }
        // A missing `sleds:` prefix yields no IDs at all, i.e. the empty-set
        // fallback documented above.
        let ids = s
            .strip_prefix("sleds:")
            .into_iter()
            .flat_map(|rest| rest.split(','))
            .map(str::trim)
            .filter(|id| !id.is_empty())
            .map(str::to_owned)
            .collect();
        ScopeFilter::Sleds(ids)
    }

    /// Encode this filter as the compact wire string stored in the CRDT.
    ///
    /// IDs are emitted in the set's sorted order, comma-separated, so the
    /// output is deterministic and parses back via [`ScopeFilter::from_scope_str`].
    pub fn to_scope_str(&self) -> String {
        match self {
            ScopeFilter::All => "all".to_string(),
            ScopeFilter::Sleds(ids) => {
                let joined = ids.iter().map(String::as_str).collect::<Vec<_>>().join(",");
                format!("sleds:{joined}")
            }
        }
    }
}
|
||||
|
||||
/// Read-side snapshot of a single LLM session entry.
|
||||
pub struct LlmSessionView {
|
||||
/// Stable session identifier.
|
||||
pub session_id: String,
|
||||
/// Persona name for the bot in this session.
|
||||
pub persona_name: String,
|
||||
/// Parsed event-scope filter derived from the `scope` CRDT register.
|
||||
pub scope_filter: ScopeFilter,
|
||||
/// Decoded high-water map: sled_id → last seen event_seq.
|
||||
pub high_water: std::collections::BTreeMap<String, u64>,
|
||||
}
|
||||
|
||||
/// CRDT sub-document representing a single pipeline work item with LWW fields for stage, agent, etc.
|
||||
|
||||
@@ -165,7 +165,9 @@ pub fn delete_content(key: ContentKey<'_>) {
|
||||
|
||||
/// Ensure the in-memory content store is initialised.
|
||||
///
|
||||
/// Safe to call multiple times — the `OnceLock` is set at most once.
|
||||
/// In non-test builds: init-once via `OnceLock` (safe to call multiple times).
|
||||
/// In test builds: always resets `CONTENT_STORE_TL` to an empty `HashMap` so
|
||||
/// each test on the same thread starts with a clean store.
|
||||
pub fn ensure_content_store() {
|
||||
#[cfg(not(test))]
|
||||
{
|
||||
@@ -175,7 +177,11 @@ pub fn ensure_content_store() {
|
||||
#[cfg(test)]
|
||||
{
|
||||
CONTENT_STORE_TL.with(|lock| {
|
||||
if lock.get().is_none() {
|
||||
if let Some(mutex) = lock.get() {
|
||||
// Already initialised on this thread — reset to empty so the
|
||||
// next test does not see content written by a previous test.
|
||||
mutex.lock().unwrap().clear();
|
||||
} else {
|
||||
let _ = lock.set(Mutex::new(HashMap::new()));
|
||||
}
|
||||
});
|
||||
@@ -203,6 +209,41 @@ pub(super) fn init_content_store(map: HashMap<String, String>) {
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
/// Regression: back-to-back `ensure_content_store()` → write → read cycles in
/// one test body must be isolated from each other.  Before the fix the second
/// `ensure_content_store()` call did nothing (OnceLock gating), so content
/// written in the first cycle was still readable in the second.
#[test]
fn sequential_ensure_content_store_resets_state() {
    const CYCLE_ONE: &str = "1111_cycle1";
    const CYCLE_TWO: &str = "1111_cycle2";
    // Shorthand for looking a story up in the store.
    let lookup = |story: &'static str| read_content(ContentKey::Story(story));

    // First cycle: initialise, write, read back.
    ensure_content_store();
    write_content(ContentKey::Story(CYCLE_ONE), "cycle-one body");
    assert_eq!(
        lookup(CYCLE_ONE).as_deref(),
        Some("cycle-one body"),
        "cycle 1: item must be readable after write"
    );

    // Second cycle: re-initialising must wipe what the first cycle stored.
    ensure_content_store();
    assert!(
        lookup(CYCLE_ONE).is_none(),
        "cycle 2: store must be empty; cycle-1 content must not bleed through"
    );

    write_content(ContentKey::Story(CYCLE_TWO), "cycle-two body");
    assert_eq!(
        lookup(CYCLE_TWO).as_deref(),
        Some("cycle-two body"),
        "cycle 2: own item must be readable"
    );

    // Writing the second item must not resurrect the first.
    assert!(
        lookup(CYCLE_ONE).is_none(),
        "cycle 2: cycle-1 content must remain absent after cycle-2 write"
    );
}
|
||||
|
||||
/// AC 2 regression: writing under `ContentKey::Story` is not visible under
|
||||
/// `ContentKey::GateOutput` (and vice versa). The typed key namespace, not
|
||||
/// runtime substring matching, enforces the separation.
|
||||
|
||||
@@ -72,6 +72,12 @@ pub fn write_item_with_content(story_id: &str, stage: &str, content: &str, meta:
|
||||
.and_then(|d| serde_json::to_string(d).ok());
|
||||
|
||||
// Update in-memory content store.
|
||||
// In test builds, the caller (test setup) is responsible for calling
|
||||
// ensure_content_store() once before writing — calling it here would
|
||||
// reset the store on every write, losing items from prior writes in the
|
||||
// same test. In production, the lazy-init call is safe because nothing
|
||||
// resets the store between writes.
|
||||
#[cfg(not(test))]
|
||||
ensure_content_store();
|
||||
write_content(ContentKey::Story(story_id), content);
|
||||
|
||||
|
||||
@@ -0,0 +1,330 @@
|
||||
//! Pipeline transition event log — persists every `TransitionFired` event into
|
||||
//! the CRDT so the log survives server restarts and replicates across nodes.
|
||||
//!
|
||||
//! ## Design
|
||||
//!
|
||||
//! Each [`TransitionFired`][crate::pipeline_state::TransitionFired] is written
|
||||
//! as an [`EventLogEntryCrdt`][crate::crdt_state::EventLogEntryCrdt] entry in
|
||||
//! the `PipelineDoc::event_log` grow-only list. Because the list is backed by
|
||||
//! CRDT ops that are persisted to SQLite and replayed on startup, the log
|
||||
//! survives `rebuild_and_restart` without any additional bookkeeping.
|
||||
//!
|
||||
//! A monotonic per-sled sequence number (`event_seq`) is computed atomically
|
||||
//! while the CRDT lock is held, guaranteeing that no two entries from the same
|
||||
//! sled share a sequence number and that the numbers are contiguous from 0.
|
||||
|
||||
#![allow(dead_code)]
|
||||
|
||||
use chrono::DateTime;
|
||||
|
||||
/// Monotonic per-sled logical sequence number identifying a pipeline event.
|
||||
///
|
||||
/// This is the sequence number that *would have been assigned* to an event in the
|
||||
/// contiguous logical event stream, as tracked by the event-log subscriber. It
|
||||
/// differs from the CRDT `event_seq` (which counts CRDT entries including gap
|
||||
/// sentinels) but is meaningful for identifying the range of dropped events when
|
||||
/// a gap is inserted.
|
||||
pub type EventId = u64;
|
||||
|
||||
/// A snapshot of a single persisted pipeline transition event.
|
||||
///
|
||||
/// Constructed by [`read_event_log`] from the raw CRDT entries.
|
||||
pub struct LoggedEvent {
|
||||
/// Monotonic sequence number for `sled_id` (0-based, contiguous).
|
||||
pub event_id: u64,
|
||||
/// Hex-encoded Ed25519 public key of the sled that recorded this event.
|
||||
pub sled_id: String,
|
||||
/// UTC timestamp when the transition fired.
|
||||
pub at: DateTime<chrono::Utc>,
|
||||
/// Story ID of the work item that transitioned.
|
||||
pub story_id: String,
|
||||
/// Human-readable label of the stage before the transition.
|
||||
pub from_stage: String,
|
||||
/// Human-readable label of the stage after the transition.
|
||||
pub to_stage: String,
|
||||
/// String label of the `PipelineEvent` variant that triggered the transition.
|
||||
pub pipeline_event: String,
|
||||
}
|
||||
|
||||
/// Write a single `TransitionFired` event into the CRDT event log.
|
||||
///
|
||||
/// Computes the next monotonic `event_seq` for this sled atomically inside
|
||||
/// the CRDT write lock and appends the entry. No-ops when the CRDT is not
|
||||
/// yet initialised (e.g. in gateway mode with no project root).
|
||||
pub fn log_transition_event(fired: &crate::pipeline_state::TransitionFired) {
|
||||
let sled_id = crate::crdt_state::our_node_id().unwrap_or_default();
|
||||
let timestamp = fired.at.timestamp() as f64;
|
||||
let from_stage = crate::pipeline_state::stage_label(&fired.before);
|
||||
let to_stage = crate::pipeline_state::stage_label(&fired.after);
|
||||
let pipeline_event = crate::pipeline_state::event_label(&fired.event);
|
||||
|
||||
crate::crdt_state::append_event_log_entry(
|
||||
&sled_id,
|
||||
timestamp,
|
||||
&fired.story_id.0,
|
||||
from_stage,
|
||||
to_stage,
|
||||
pipeline_event,
|
||||
);
|
||||
|
||||
// Real-time push to per-persona WebSocket subscribers.
|
||||
crate::pipeline_event_bus::broadcast(crate::pipeline_event_bus::BusEvent {
|
||||
sled_id,
|
||||
story_id: fired.story_id.0.clone(),
|
||||
from_stage: crate::pipeline_state::stage_label(&fired.before).to_string(),
|
||||
to_stage: crate::pipeline_state::stage_label(&fired.after).to_string(),
|
||||
pipeline_event: crate::pipeline_state::event_label(&fired.event).to_string(),
|
||||
});
|
||||
}
|
||||
|
||||
/// Read all persisted events from the CRDT event log.
|
||||
///
|
||||
/// Entries are returned sorted by `(sled_id, event_id)` so that events from
|
||||
/// each sled appear in monotonic order. Entries with malformed CRDT fields
|
||||
/// are silently dropped.
|
||||
pub fn read_event_log() -> Vec<LoggedEvent> {
|
||||
let mut entries: Vec<LoggedEvent> = crate::crdt_state::read_all_event_log_entries()
|
||||
.into_iter()
|
||||
.map(|raw| LoggedEvent {
|
||||
event_id: raw.event_seq,
|
||||
sled_id: raw.sled_id,
|
||||
at: DateTime::from_timestamp(raw.timestamp as i64, 0).unwrap_or_default(),
|
||||
story_id: raw.story_id,
|
||||
from_stage: raw.from_stage,
|
||||
to_stage: raw.to_stage,
|
||||
pipeline_event: raw.pipeline_event,
|
||||
})
|
||||
.collect();
|
||||
entries.sort_by(|a, b| a.sled_id.cmp(&b.sled_id).then(a.event_id.cmp(&b.event_id)));
|
||||
entries
|
||||
}
|
||||
|
||||
/// Append a gap sentinel to the event log for the local sled.
|
||||
///
|
||||
/// Encodes the logical [`EventId`] range `[from_id, to_id]` of dropped events
|
||||
/// using the `EventStreamGap` pipeline event marker. Should be called whenever
|
||||
/// the event-log subscriber detects a lag in the broadcast channel so that no
|
||||
/// drop is silent.
|
||||
pub fn insert_gap_sentinel(from_id: EventId, to_id: EventId) {
|
||||
let sled_id = crate::crdt_state::our_node_id().unwrap_or_default();
|
||||
crate::crdt_state::append_gap_log_entry(&sled_id, from_id, to_id);
|
||||
log_gap_observability(&sled_id, from_id, to_id);
|
||||
}
|
||||
|
||||
/// Spawn a background task that persists every `TransitionFired` event to the CRDT.
|
||||
///
|
||||
/// Subscribes to the global `TransitionFired` broadcast channel. Normal events
|
||||
/// are persisted via [`log_transition_event`]. When the subscriber lags (the
|
||||
/// broadcast channel drops the oldest messages), a single
|
||||
/// `EventStreamGap` sentinel is appended to the log covering the dropped range
|
||||
/// so no transition is silently lost.
|
||||
pub fn spawn_event_log_subscriber() {
|
||||
let mut rx = crate::pipeline_state::subscribe_transitions();
|
||||
tokio::spawn(async move {
|
||||
// Tracks the next expected logical sequence number in the subscriber's
|
||||
// view of the event stream. Incremented on every successfully processed
|
||||
// event; advanced by the gap size on each lag so we can identify the
|
||||
// exact logical range of dropped events.
|
||||
let mut next_logical_seq: EventId = 0;
|
||||
|
||||
loop {
|
||||
match rx.recv().await {
|
||||
Ok(fired) => {
|
||||
// log_transition_event also broadcasts to the pipeline_event_bus.
|
||||
log_transition_event(&fired);
|
||||
next_logical_seq += 1;
|
||||
}
|
||||
Err(tokio::sync::broadcast::error::RecvError::Lagged(n)) => {
|
||||
let from = next_logical_seq;
|
||||
let to = next_logical_seq + n - 1;
|
||||
crate::slog_warn!(
|
||||
"[event-log] Subscriber lagged; {n} event(s) dropped \
|
||||
(logical ids {from}..={to}); gap sentinel appended."
|
||||
);
|
||||
insert_gap_sentinel(from, to);
|
||||
next_logical_seq += n;
|
||||
}
|
||||
Err(tokio::sync::broadcast::error::RecvError::Closed) => break,
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
/// Emit observability log lines after inserting a gap sentinel.
|
||||
fn log_gap_observability(sled_id: &str, from_id: EventId, to_id: EventId) {
|
||||
let entries = crate::crdt_state::read_all_event_log_entries();
|
||||
let sled_total: usize = entries.iter().filter(|e| e.sled_id == sled_id).count();
|
||||
let gap_count: usize = entries
|
||||
.iter()
|
||||
.filter(|e| {
|
||||
e.sled_id == sled_id && e.pipeline_event == crate::crdt_state::GAP_PIPELINE_EVENT
|
||||
})
|
||||
.count();
|
||||
crate::slog!(
|
||||
"[event-log] gap inserted sled={sled_id} from={from_id} to={to_id} \
|
||||
sled_entries={sled_total} gap_count={gap_count}"
|
||||
);
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::crdt_state::PipelineDoc;
    use crate::pipeline_state::{PipelineEvent, PlanState, Stage, StoryId, TransitionFired};
    use bft_json_crdt::json_crdt::{BaseCrdt, CrdtNode, JsonValue, OpState};
    use bft_json_crdt::keypair::make_keypair;
    use bft_json_crdt::op::ROOT_ID;
    use serde_json::json;

    /// Build a Backlog -> Coding transition for story `test_{i}`, triggered by `DepsMet`.
    fn make_fired(i: u32) -> TransitionFired {
        let coding = Stage::Coding {
            claim: None,
            plan: PlanState::Missing,
            retries: 0,
        };
        TransitionFired {
            story_id: StoryId(format!("test_{i}")),
            before: Stage::Backlog,
            after: coding,
            event: PipelineEvent::DepsMet,
            at: chrono::Utc::now(),
        }
    }

    /// AC4: write N event-log entries, simulate a restart by replaying the
    /// signed ops on a fresh CRDT, and check that all N entries come back in
    /// insertion order with monotonically increasing sequence numbers.
    #[test]
    fn event_log_survives_crdt_reinit() {
        let kp = make_keypair();
        let mut original = BaseCrdt::<PipelineDoc>::new(&kp);
        let sled_id = crate::crdt_state::hex::encode(&original.id);
        let n = 5usize;

        let mut recorded = Vec::with_capacity(n);
        // Each insert is anchored on the previous op's id so entries append at
        // the end; anchoring on ROOT_ID every time would prepend and reverse
        // the sequence.
        let mut tail = ROOT_ID;

        for i in 0..n {
            let payload: JsonValue = json!({
                "event_seq": i as f64,
                "sled_id": &sled_id,
                "timestamp": 1_000_000.0_f64 + i as f64,
                "story_id": format!("story_{i}"),
                "from_stage": "backlog",
                "to_stage": "coding",
                "pipeline_event": "DepsMet",
            })
            .into();
            let signed = original.doc.event_log.insert(tail, payload).sign(&kp);
            tail = signed.inner.id;
            assert_eq!(original.apply(signed.clone()), OpState::Ok);
            recorded.push(signed);
        }

        assert_eq!(original.doc.event_log.view().len(), n);

        // Simulate restart: replay the same ops on a fresh CRDT instance.
        let mut restarted = BaseCrdt::<PipelineDoc>::new(&kp);
        recorded
            .into_iter()
            .for_each(|op| assert_eq!(restarted.apply(op), OpState::Ok));

        assert_eq!(
            restarted.doc.event_log.view().len(),
            n,
            "all {n} entries must survive CRDT re-init"
        );

        // Entries must appear in insertion order with monotonically increasing IDs.
        for i in 0..n {
            let entry = &restarted.doc.event_log[i];

            let seq = match entry.event_seq.view() {
                JsonValue::Number(v) => v as u64,
                other => panic!("expected numeric event_seq at index {i}, got {other:?}"),
            };
            assert_eq!(seq, i as u64, "event_seq must equal insertion index {i}");

            let expected_story = JsonValue::String(format!("story_{i}"));
            assert_eq!(
                entry.story_id.view(),
                expected_story,
                "story_id mismatch at index {i}"
            );

            let expected_sled = JsonValue::String(sled_id.clone());
            assert_eq!(
                entry.sled_id.view(),
                expected_sled,
                "sled_id mismatch at index {i}"
            );
        }
    }

    /// AC4: simulate a feeder-queue overflow by inserting a gap sentinel, then
    /// check that (a) the sentinel is visible through `read_event_log` and
    /// (b) the assembled prompt context renders the human-readable gap line.
    #[test]
    fn gap_sentinel_in_log_and_assembled_context() {
        crate::crdt_state::init_for_test();

        // Three real events (logical ids 0, 1, 2) ...
        (0..3u32).for_each(|i| log_transition_event(&make_fired(i)));
        // ... then pretend logical ids 3..=5 were dropped ...
        insert_gap_sentinel(3, 5);
        // ... followed by one more real event after the gap.
        log_transition_event(&make_fired(99));

        // (a) Gap sentinel must appear in read_event_log().
        let entries = read_event_log();
        let gap = entries
            .iter()
            .find(|e| e.pipeline_event == crate::crdt_state::GAP_PIPELINE_EVENT)
            .expect("gap sentinel must be present in event log");
        // from_stage encodes the from EventId; to_stage encodes the to EventId.
        assert_eq!(gap.from_stage, "3", "gap from_stage must be '3'");
        assert_eq!(gap.to_stage, "5", "gap to_stage must be '5'");

        // (b) assemble_prompt_context must render the gap line.
        let ctx = crate::llm_session::assemble_prompt_context("room-gap-e2e");
        assert!(
            ctx.contains("events between 3 and 5 were dropped"),
            "assembled context must contain gap line; got: {ctx}"
        );
        // Real events on both sides of the gap must also appear.
        assert!(
            ctx.contains("test_0"),
            "first story must appear; got: {ctx}"
        );
        assert!(
            ctx.contains("test_99"),
            "last story must appear; got: {ctx}"
        );
    }

    /// AC2: every `TransitionFired` event is written to the log without filtering.
    #[test]
    fn log_transition_event_appends_all_events() {
        crate::crdt_state::init_for_test();

        let n = 4u32;
        (0..n).for_each(|i| log_transition_event(&make_fired(i)));

        let entries = crate::crdt_state::read_all_event_log_entries();
        assert_eq!(
            entries.len(),
            n as usize,
            "expected {n} event log entries, got {}",
            entries.len()
        );

        // Sequence numbers, once sorted, must be exactly 0..n-1.
        let mut seqs: Vec<u64> = entries.iter().map(|e| e.event_seq).collect();
        seqs.sort_unstable();
        let expected: Vec<u64> = (0..u64::from(n)).collect();
        assert_eq!(seqs, expected, "event_seq values must be 0..{n}");
    }
}
|
||||
+16
-15
@@ -4,6 +4,9 @@
|
||||
//! Business logic lives in `service::gateway`, HTTP handlers in `http::gateway`.
|
||||
//! This file contains only the `run` entrypoint and `build_gateway_route` wiring.
|
||||
|
||||
/// Gateway rebuild — builds the new binary and launches the detached trampoline.
|
||||
pub mod rebuild;
|
||||
|
||||
use crate::http::gateway::*;
|
||||
use crate::rebuild::ShutdownReason;
|
||||
use crate::service::gateway::{self, GatewayState};
|
||||
@@ -62,18 +65,25 @@ pub fn build_gateway_route(state_arc: Arc<GatewayState>) -> impl poem::Endpoint
|
||||
"/gateway/agents/:id/assign",
|
||||
poem::post(gateway_assign_agent_handler),
|
||||
)
|
||||
// Serve the embedded React frontend so the gateway has a UI.
|
||||
// Binary self-update: serve the gateway binary so sleds can download it.
|
||||
.at(
|
||||
"/assets/*path",
|
||||
poem::get(crate::http::assets::embedded_asset),
|
||||
"/api/huskies-binary",
|
||||
poem::get(crate::http::serve_binary_handler),
|
||||
)
|
||||
.at("/*path", poem::get(crate::http::assets::embedded_file))
|
||||
.at("/", poem::get(crate::http::assets::embedded_index))
|
||||
.data(state_arc)
|
||||
}
|
||||
|
||||
/// Start the gateway HTTP server. This is the entry point when `--gateway` is used.
|
||||
pub async fn run(config_path: &Path, port: u16) -> Result<(), std::io::Error> {
|
||||
// Enforce one-active-gateway invariant: acquire an exclusive flock on the
|
||||
// pidfile before doing anything else. A second gateway start while one is
|
||||
// running will fail here with a clear error. The flock is held for the
|
||||
// lifetime of `_pidfile_guard`; it is released automatically when this
|
||||
// process exits, allowing the next gateway (spawned by the trampoline) to
|
||||
// acquire it.
|
||||
let _pidfile_guard =
|
||||
crate::pidfile::acquire_gateway_pidfile().map_err(std::io::Error::other)?;
|
||||
|
||||
let config_dir = config_path
|
||||
.parent()
|
||||
.unwrap_or(std::path::Path::new("."))
|
||||
@@ -113,19 +123,10 @@ pub async fn run(config_path: &Path, port: u16) -> Result<(), std::io::Error> {
|
||||
}
|
||||
|
||||
// Spawn the Matrix bot if `.huskies/bot.toml` exists in the config directory.
|
||||
let gateway_projects: Vec<String> = state_arc.projects.read().await.keys().cloned().collect();
|
||||
let gateway_project_urls: std::collections::BTreeMap<String, String> = state_arc
|
||||
.projects
|
||||
.read()
|
||||
.await
|
||||
.iter()
|
||||
.filter_map(|(name, entry)| entry.url.as_ref().map(|u| (name.clone(), u.clone())))
|
||||
.collect();
|
||||
let (bot_abort, bot_shutdown_tx) = gateway::io::spawn_gateway_bot(
|
||||
&config_dir,
|
||||
Arc::clone(&state_arc.active_project),
|
||||
gateway_projects,
|
||||
gateway_project_urls,
|
||||
Arc::clone(&state_arc.projects),
|
||||
port,
|
||||
Some(state_arc.event_tx.clone()),
|
||||
Arc::clone(&state_arc.perm_rx),
|
||||
|
||||
@@ -0,0 +1,115 @@
|
||||
//! Gateway rebuild — builds the new huskies binary and hands off to the trampoline.
|
||||
//!
|
||||
//! The trampoline is spawned as a detached process (new Unix session) so that it
|
||||
//! survives the gateway's own death. On success the gateway continues running
|
||||
//! until the trampoline kills it; the new gateway then posts "gateway X.Y.Z ready".
|
||||
|
||||
use std::path::Path;
|
||||
|
||||
/// Build the huskies binary and launch the detached trampoline to swap the gateway.
|
||||
///
|
||||
/// Returns `Err(message)` (shown to the user in chat) if the build or trampoline
|
||||
/// launch fails. On success returns `Ok(())` — the trampoline is now running
|
||||
/// in a detached process and will kill this gateway and replace it with the new
|
||||
/// binary within 10 s.
|
||||
pub async fn rebuild_gateway(config_dir: &Path, gateway_port: u16) -> Result<(), String> {
|
||||
let manifest_dir = std::path::Path::new(env!("CARGO_MANIFEST_DIR"));
|
||||
let workspace_root = manifest_dir
|
||||
.parent()
|
||||
.ok_or("cannot determine workspace root from CARGO_MANIFEST_DIR")?;
|
||||
|
||||
crate::slog!(
|
||||
"[gateway-rebuild] Building from workspace root: {}",
|
||||
workspace_root.display()
|
||||
);
|
||||
|
||||
// Rebuild the frontend bundle so rust-embed picks up the latest assets.
|
||||
let frontend_dir = workspace_root.join("frontend");
|
||||
if frontend_dir.join("package.json").exists() {
|
||||
crate::slog!("[gateway-rebuild] Building frontend");
|
||||
let fe_output = tokio::task::spawn_blocking({
|
||||
let dir = frontend_dir.clone();
|
||||
move || {
|
||||
std::process::Command::new("npm")
|
||||
.args(["run", "build"])
|
||||
.current_dir(&dir)
|
||||
.output()
|
||||
}
|
||||
})
|
||||
.await
|
||||
.map_err(|e| format!("frontend build task panicked: {e}"))?
|
||||
.map_err(|e| format!("failed to run npm run build: {e}"))?;
|
||||
|
||||
if !fe_output.status.success() {
|
||||
let stderr = String::from_utf8_lossy(&fe_output.stderr);
|
||||
return Err(format!("Frontend build failed:\n{stderr}"));
|
||||
}
|
||||
crate::slog!("[gateway-rebuild] Frontend build succeeded");
|
||||
}
|
||||
|
||||
// Build the server binary matching the current profile.
|
||||
let build_args: Vec<&str> = if cfg!(debug_assertions) {
|
||||
vec!["build", "-p", "huskies"]
|
||||
} else {
|
||||
vec!["build", "--release", "-p", "huskies"]
|
||||
};
|
||||
crate::slog!("[gateway-rebuild] cargo {}", build_args.join(" "));
|
||||
|
||||
let output = tokio::task::spawn_blocking({
|
||||
let root = workspace_root.to_path_buf();
|
||||
move || {
|
||||
std::process::Command::new("cargo")
|
||||
.args(&build_args)
|
||||
.current_dir(&root)
|
||||
.output()
|
||||
}
|
||||
})
|
||||
.await
|
||||
.map_err(|e| format!("build task panicked: {e}"))?
|
||||
.map_err(|e| format!("failed to run cargo build: {e}"))?;
|
||||
|
||||
if !output.status.success() {
|
||||
let stderr = String::from_utf8_lossy(&output.stderr);
|
||||
crate::slog!("[gateway-rebuild] Build failed");
|
||||
return Err(format!("Build failed:\n{stderr}"));
|
||||
}
|
||||
|
||||
crate::slog!("[gateway-rebuild] Build succeeded — launching trampoline");
|
||||
|
||||
// Paths for the new and old binaries.
|
||||
let new_binary = if cfg!(debug_assertions) {
|
||||
workspace_root.join("target/debug/huskies")
|
||||
} else {
|
||||
workspace_root.join("target/release/huskies")
|
||||
};
|
||||
|
||||
let old_binary =
|
||||
std::env::current_exe().map_err(|e| format!("cannot locate current binary: {e}"))?;
|
||||
|
||||
let huskies_dir = config_dir.join(".huskies");
|
||||
std::fs::create_dir_all(&huskies_dir)
|
||||
.map_err(|e| format!("cannot create .huskies dir: {e}"))?;
|
||||
let backup_binary = huskies_dir.join("huskies_backup");
|
||||
|
||||
// Current gateway args (skip argv[0]).
|
||||
let gateway_args: Vec<String> = std::env::args().skip(1).collect();
|
||||
|
||||
let job = crate::trampoline::TrampolineJob {
|
||||
gateway_pid: std::process::id(),
|
||||
new_binary_path: new_binary,
|
||||
old_binary_path: old_binary,
|
||||
backup_binary_path: backup_binary,
|
||||
gateway_args,
|
||||
health_url: format!("http://127.0.0.1:{gateway_port}/api/gateway"),
|
||||
};
|
||||
|
||||
let job_path = huskies_dir.join("trampoline.json");
|
||||
crate::trampoline::write_job_atomic(&job, &job_path)?;
|
||||
|
||||
let exe = std::env::current_exe()
|
||||
.map_err(|e| format!("cannot locate current binary for trampoline: {e}"))?;
|
||||
crate::trampoline::spawn_detached_trampoline(&exe, &job_path)?;
|
||||
|
||||
crate::slog!("[gateway-rebuild] Trampoline launched — gateway will be replaced shortly");
|
||||
Ok(())
|
||||
}
|
||||
@@ -1175,6 +1175,8 @@ async fn ws_only_sled_handles_tools_list_and_tools_call() {
|
||||
ProjectEntry {
|
||||
url: None,
|
||||
auth_token: Some("secret".into()),
|
||||
ssh_port: None,
|
||||
host_path: None,
|
||||
},
|
||||
);
|
||||
let config = GatewayConfig {
|
||||
@@ -1244,6 +1246,8 @@ async fn two_concurrent_sleds_are_routed_by_active_project() {
|
||||
ProjectEntry {
|
||||
url: None,
|
||||
auth_token: Some("alpha-tok".into()),
|
||||
ssh_port: None,
|
||||
host_path: None,
|
||||
},
|
||||
);
|
||||
projects.insert(
|
||||
@@ -1251,6 +1255,8 @@ async fn two_concurrent_sleds_are_routed_by_active_project() {
|
||||
ProjectEntry {
|
||||
url: None,
|
||||
auth_token: Some("beta-tok".into()),
|
||||
ssh_port: None,
|
||||
host_path: None,
|
||||
},
|
||||
);
|
||||
let config = GatewayConfig {
|
||||
|
||||
@@ -271,4 +271,209 @@ mod tests {
|
||||
spawn_relay_task(String::new(), "test".into(), broadcaster, client);
|
||||
// If we reach here without panic, the guard worked.
|
||||
}
|
||||
|
||||
/// End-to-end: a `TransitionFired`-equivalent event published on the sled's
|
||||
/// broadcaster must reach the gateway's [`GatewayStatusEvent`] broadcast
|
||||
/// within 1 second.
|
||||
///
|
||||
/// Spins up a real poem HTTP server (token endpoint + WS event-push endpoint),
|
||||
/// spawns the relay task pointing at it, fires a [`StatusEvent::StageTransition`],
|
||||
/// and asserts the gateway broadcast receives the matching [`StoredEvent`].
|
||||
#[tokio::test]
|
||||
async fn relay_end_to_end_stage_transition_reaches_gateway_broadcast() {
|
||||
use crate::http::gateway::{gateway_event_push_handler, gateway_generate_token_handler};
|
||||
use crate::service::gateway::{GatewayConfig, GatewayState, ProjectEntry};
|
||||
use poem::EndpointExt as _;
|
||||
use poem::listener::TcpAcceptor;
|
||||
use std::collections::BTreeMap;
|
||||
use std::path::PathBuf;
|
||||
use tokio::net::TcpListener;
|
||||
|
||||
crate::crdt_state::init_for_test();
|
||||
|
||||
// Gateway state: one project whose name matches the relay project name.
|
||||
let mut projects = BTreeMap::new();
|
||||
projects.insert(
|
||||
"sled-test".to_string(),
|
||||
ProjectEntry::with_url("http://sled-test:3001"),
|
||||
);
|
||||
let config = GatewayConfig {
|
||||
projects,
|
||||
sled_tokens: BTreeMap::new(),
|
||||
};
|
||||
let state = Arc::new(GatewayState::new(config, PathBuf::new(), 9000).unwrap());
|
||||
|
||||
// Subscribe before the relay connects so the event is not missed.
|
||||
let mut gw_rx = state.event_tx.subscribe();
|
||||
|
||||
// Start a poem server on an ephemeral loopback port exposing the real
|
||||
// token and event-push handlers.
|
||||
let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
|
||||
let addr = listener.local_addr().unwrap();
|
||||
let gateway_url = format!("http://127.0.0.1:{}", addr.port());
|
||||
|
||||
let route = poem::Route::new()
|
||||
.at(
|
||||
"/gateway/tokens",
|
||||
poem::post(gateway_generate_token_handler),
|
||||
)
|
||||
.at(
|
||||
"/gateway/events/push",
|
||||
poem::get(gateway_event_push_handler),
|
||||
)
|
||||
.data(state.clone());
|
||||
|
||||
tokio::spawn(async move {
|
||||
let acceptor = TcpAcceptor::from_tokio(listener).unwrap();
|
||||
let _ = poem::Server::new_with_acceptor(acceptor).run(route).await;
|
||||
});
|
||||
|
||||
// Spawn the relay task pointing at our in-process gateway server.
|
||||
let broadcaster = Arc::new(StatusBroadcaster::new());
|
||||
spawn_relay_task(
|
||||
gateway_url,
|
||||
"sled-test".into(),
|
||||
Arc::clone(&broadcaster),
|
||||
reqwest::Client::new(),
|
||||
);
|
||||
|
||||
// Give the relay time to obtain a join token, connect the WebSocket,
|
||||
// and enter its event-receive loop before we publish.
|
||||
tokio::time::sleep(std::time::Duration::from_millis(200)).await;
|
||||
|
||||
// Publish a stage transition on the sled side.
|
||||
broadcaster.publish(StatusEvent::StageTransition {
|
||||
story_id: "42_story_relay_e2e".into(),
|
||||
story_name: "Relay E2E".into(),
|
||||
from_stage: "1_backlog".into(),
|
||||
to_stage: "2_current".into(),
|
||||
});
|
||||
|
||||
// The event must arrive at the gateway broadcast within 1 second.
|
||||
let received = tokio::time::timeout(std::time::Duration::from_secs(1), gw_rx.recv())
|
||||
.await
|
||||
.expect("timed out: event did not arrive at gateway broadcast within 1 s")
|
||||
.expect("gateway broadcast channel closed unexpectedly");
|
||||
|
||||
assert_eq!(received.project, "sled-test");
|
||||
assert!(
|
||||
matches!(
|
||||
received.event,
|
||||
StoredEvent::StageTransition { ref story_id, .. } if story_id == "42_story_relay_e2e"
|
||||
),
|
||||
"unexpected gateway event: {:?}",
|
||||
received.event
|
||||
);
|
||||
}
|
||||
|
||||
/// Extends `relay_end_to_end_stage_transition_reaches_gateway_broadcast` to
|
||||
/// cover the full wiring path: `project_docker_run_args` embeds
|
||||
/// `HUSKIES_GATEWAY_URL` in the sled's argv; when that URL is used to start
|
||||
/// the relay, a transition fired inside the sled reaches the gateway's CRDT
|
||||
/// event_log within 1 second.
|
||||
#[tokio::test]
|
||||
async fn project_docker_run_args_gateway_url_wires_relay() {
|
||||
use crate::chat::transport::matrix::new_project::project_docker_run_args;
|
||||
use crate::http::gateway::{gateway_event_push_handler, gateway_generate_token_handler};
|
||||
use crate::service::gateway::{GatewayConfig, GatewayState, ProjectEntry};
|
||||
use poem::EndpointExt as _;
|
||||
use poem::listener::TcpAcceptor;
|
||||
use std::collections::BTreeMap;
|
||||
use std::path::PathBuf;
|
||||
use tokio::net::TcpListener;
|
||||
|
||||
crate::crdt_state::init_for_test();
|
||||
|
||||
// Spin up an in-process gateway server on an ephemeral port so we have
|
||||
// a real URL to embed in the docker run args.
|
||||
let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
|
||||
let addr = listener.local_addr().unwrap();
|
||||
let gateway_url = format!("http://127.0.0.1:{}", addr.port());
|
||||
|
||||
// project_docker_run_args embeds the gateway URL: this is the production
|
||||
// code path that sets HUSKIES_GATEWAY_URL on the sled container.
|
||||
let docker_args = project_docker_run_args(
|
||||
"huskies-sled-relay",
|
||||
3200,
|
||||
2300,
|
||||
"ssh-ed25519 AAAA...",
|
||||
"Test User",
|
||||
"test@example.com",
|
||||
None,
|
||||
&gateway_url,
|
||||
);
|
||||
|
||||
// Extract the injected URL exactly as the sled would read it from its env.
|
||||
let injected_url = docker_args
|
||||
.windows(2)
|
||||
.find(|w| w[0] == "-e" && w[1].starts_with("HUSKIES_GATEWAY_URL="))
|
||||
.map(|w| w[1].trim_start_matches("HUSKIES_GATEWAY_URL=").to_string())
|
||||
.expect("project_docker_run_args must inject HUSKIES_GATEWAY_URL");
|
||||
|
||||
assert_eq!(injected_url, gateway_url, "injected URL must match input");
|
||||
|
||||
// Set up gateway state for the relay project.
|
||||
let mut projects = BTreeMap::new();
|
||||
projects.insert(
|
||||
"sled-relay".to_string(),
|
||||
ProjectEntry::with_url("http://sled-relay:3001"),
|
||||
);
|
||||
let config = GatewayConfig {
|
||||
projects,
|
||||
sled_tokens: BTreeMap::new(),
|
||||
};
|
||||
let state = Arc::new(GatewayState::new(config, PathBuf::new(), 9001).unwrap());
|
||||
let mut gw_rx = state.event_tx.subscribe();
|
||||
|
||||
let route = poem::Route::new()
|
||||
.at(
|
||||
"/gateway/tokens",
|
||||
poem::post(gateway_generate_token_handler),
|
||||
)
|
||||
.at(
|
||||
"/gateway/events/push",
|
||||
poem::get(gateway_event_push_handler),
|
||||
)
|
||||
.data(state.clone());
|
||||
|
||||
tokio::spawn(async move {
|
||||
let acceptor = TcpAcceptor::from_tokio(listener).unwrap();
|
||||
let _ = poem::Server::new_with_acceptor(acceptor).run(route).await;
|
||||
});
|
||||
|
||||
// Spawn the relay using the URL extracted from the docker run args —
|
||||
// this simulates what the sled does when it reads HUSKIES_GATEWAY_URL
|
||||
// from its container environment.
|
||||
let broadcaster = Arc::new(StatusBroadcaster::new());
|
||||
spawn_relay_task(
|
||||
injected_url,
|
||||
"sled-relay".into(),
|
||||
Arc::clone(&broadcaster),
|
||||
reqwest::Client::new(),
|
||||
);
|
||||
|
||||
tokio::time::sleep(std::time::Duration::from_millis(200)).await;
|
||||
|
||||
broadcaster.publish(StatusEvent::StageTransition {
|
||||
story_id: "99_docker_args_relay".into(),
|
||||
story_name: "Docker Args Relay".into(),
|
||||
from_stage: "1_backlog".into(),
|
||||
to_stage: "2_current".into(),
|
||||
});
|
||||
|
||||
let received = tokio::time::timeout(std::time::Duration::from_secs(1), gw_rx.recv())
|
||||
.await
|
||||
.expect("timed out: event did not reach gateway within 1 s")
|
||||
.expect("gateway broadcast channel closed unexpectedly");
|
||||
|
||||
assert_eq!(received.project, "sled-relay");
|
||||
assert!(
|
||||
matches!(
|
||||
received.event,
|
||||
StoredEvent::StageTransition { ref story_id, .. } if story_id == "99_docker_args_relay"
|
||||
),
|
||||
"unexpected gateway event: {:?}",
|
||||
received.event
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,149 +0,0 @@
|
||||
//! Static asset serving — serves the embedded React frontend via `rust-embed`.
|
||||
use poem::{
|
||||
Response, handler,
|
||||
http::{StatusCode, header},
|
||||
web::Path,
|
||||
};
|
||||
use rust_embed::RustEmbed;
|
||||
|
||||
/// Handle to the frontend files embedded via `rust-embed` from the
/// `../frontend/dist` folder named in the `folder` attribute below.
#[derive(RustEmbed)]
|
||||
#[folder = "../frontend/dist"]
|
||||
struct EmbeddedAssets;
|
||||
|
||||
fn serve_embedded(path: &str) -> Response {
|
||||
let normalized = if path.is_empty() {
|
||||
"index.html"
|
||||
} else {
|
||||
path.trim_start_matches('/')
|
||||
};
|
||||
|
||||
let is_asset_request = normalized.starts_with("assets/");
|
||||
let asset = if is_asset_request {
|
||||
EmbeddedAssets::get(normalized)
|
||||
} else {
|
||||
EmbeddedAssets::get(normalized).or_else(|| {
|
||||
if normalized == "index.html" {
|
||||
None
|
||||
} else {
|
||||
EmbeddedAssets::get("index.html")
|
||||
}
|
||||
})
|
||||
};
|
||||
|
||||
match asset {
|
||||
Some(content) => {
|
||||
let body = content.data.into_owned();
|
||||
let mime = mime_guess::from_path(normalized)
|
||||
.first_or_octet_stream()
|
||||
.to_string();
|
||||
|
||||
Response::builder()
|
||||
.status(StatusCode::OK)
|
||||
.header(header::CONTENT_TYPE, mime)
|
||||
.body(body)
|
||||
}
|
||||
None => Response::builder()
|
||||
.status(StatusCode::NOT_FOUND)
|
||||
.body("Not Found"),
|
||||
}
|
||||
}
|
||||
|
||||
/// Serve a single embedded asset from the `assets/` folder.
|
||||
#[handler]
|
||||
pub fn embedded_asset(Path(path): Path<String>) -> Response {
|
||||
let asset_path = format!("assets/{path}");
|
||||
serve_embedded(&asset_path)
|
||||
}
|
||||
|
||||
/// Serve an embedded file by path (falls back to `index.html` for SPA routing).
|
||||
#[handler]
|
||||
pub fn embedded_file(Path(path): Path<String>) -> Response {
|
||||
serve_embedded(&path)
|
||||
}
|
||||
|
||||
/// Serve the embedded SPA entrypoint.
|
||||
#[handler]
|
||||
pub fn embedded_index() -> Response {
|
||||
serve_embedded("index.html")
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use poem::http::StatusCode;

    /// Accept either outcome of a lookup that depends on whether the frontend
    /// `dist/` was built: 200 when it is embedded, 404 when it is not.
    fn assert_ok_or_not_found(status: StatusCode) {
        assert!(
            status == StatusCode::OK || status == StatusCode::NOT_FOUND,
            "unexpected status: {status}",
        );
    }

    #[test]
    fn non_asset_path_spa_fallback_or_not_found() {
        // Non-asset paths fall back to index.html for SPA client-side routing.
        // In release builds (with embedded dist/) this returns 200.
        // In debug builds without a built frontend dist/ it returns 404.
        assert_ok_or_not_found(serve_embedded("__nonexistent_spa_route__.html").status());
    }

    #[test]
    fn missing_asset_path_prefix_returns_not_found() {
        // assets/ prefix: no SPA fallback – returns 404 if the file does not exist
        let status = serve_embedded("assets/__nonexistent__.js").status();
        assert_eq!(status, StatusCode::NOT_FOUND);
    }

    #[test]
    fn serve_embedded_does_not_panic_on_empty_path() {
        // Empty path normalises to index.html; OK in release, 404 in debug without dist/
        assert_ok_or_not_found(serve_embedded("").status());
    }

    #[test]
    fn embedded_assets_struct_is_iterable() {
        // Verifies that rust-embed compiled the EmbeddedAssets struct correctly.
        // In debug builds without a built frontend dist/ directory the iterator
        // is empty; that is expected. In release builds it will contain all
        // bundled frontend files.
        // No assertion needed – the test passes as long as it compiles and does not panic.
        let _files: Vec<_> = EmbeddedAssets::iter().collect();
    }

    #[tokio::test]
    async fn embedded_index_handler_returns_ok_or_not_found() {
        // Route the handler through TestClient; index.html is the SPA entry point.
        let cli = poem::test::TestClient::new(poem::Route::new().at("/", poem::get(embedded_index)));
        let resp = cli.get("/").send().await;
        assert_ok_or_not_found(resp.0.status());
    }

    #[tokio::test]
    async fn embedded_file_handler_with_path_returns_ok_or_not_found() {
        // Non-asset paths fall back to index.html (SPA routing) or 404.
        let cli =
            poem::test::TestClient::new(poem::Route::new().at("/*path", poem::get(embedded_file)));
        let resp = cli.get("/__spa_route__").send().await;
        assert_ok_or_not_found(resp.0.status());
    }

    #[tokio::test]
    async fn embedded_asset_handler_missing_file_returns_not_found() {
        // The assets/ prefix disables SPA fallback; missing files must return 404.
        let cli = poem::test::TestClient::new(
            poem::Route::new().at("/assets/*path", poem::get(embedded_asset)),
        );
        let resp = cli.get("/assets/__nonexistent__.js").send().await;
        assert_eq!(resp.0.status(), StatusCode::NOT_FOUND);
    }
}
|
||||
@@ -20,11 +20,16 @@ const GATEWAY_TOOLS: &[&str] = &[
|
||||
"gateway_status",
|
||||
"gateway_health",
|
||||
"init_project",
|
||||
"adopt_project",
|
||||
"aggregate_pipeline_status",
|
||||
"agents.list",
|
||||
// Handled at the gateway so the Matrix bot's perm_rx listener is used
|
||||
// rather than the container's (which has no interactive session attached).
|
||||
"prompt_permission",
|
||||
// Binary self-update: gateway serves its own binary and triggers upgrade on sleds.
|
||||
"upgrade_sled",
|
||||
// One-shot container rebuild: build fresh image, swap container, preserve state.
|
||||
"project_rebuild",
|
||||
];
|
||||
|
||||
/// Gateway tool definitions.
|
||||
@@ -82,6 +87,28 @@ pub(crate) fn gateway_tool_definitions() -> Vec<Value> {
|
||||
"required": ["path"]
|
||||
}
|
||||
}),
|
||||
json!({
|
||||
"name": "adopt_project",
|
||||
"description": "Wrap a Docker container around an existing host checkout — the same as `new project <name> --adopt <path>`. No git clone or git init is performed; the directory is bind-mounted at /workspace. Launches the appropriate stack-specific image, generates an SSH keypair, and registers the project in projects.toml. Returns the SSH connection command and detected stack.",
|
||||
"inputSchema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"name": {
|
||||
"type": "string",
|
||||
"description": "Short project name (letters, digits, hyphens, underscores). Must be unique across registered projects."
|
||||
},
|
||||
"path": {
|
||||
"type": "string",
|
||||
"description": "Absolute host filesystem path to the existing checkout to adopt. Must be an existing directory."
|
||||
},
|
||||
"stack": {
|
||||
"type": "string",
|
||||
"description": "Optional: override stack detection (e.g. 'rust', 'node', 'python'). Auto-detected from directory contents when omitted."
|
||||
}
|
||||
},
|
||||
"required": ["name", "path"]
|
||||
}
|
||||
}),
|
||||
json!({
|
||||
"name": "aggregate_pipeline_status",
|
||||
"description": "Fetch pipeline status from ALL registered projects in parallel and return an aggregated report. For each project: stage counts (backlog/current/qa/merge/done) and a list of blocked or failing items with triage detail. Unreachable projects are included with an error state rather than failing the whole call.",
|
||||
@@ -98,6 +125,45 @@ pub(crate) fn gateway_tool_definitions() -> Vec<Value> {
|
||||
"properties": {}
|
||||
}
|
||||
}),
|
||||
json!({
|
||||
"name": "upgrade_sled",
|
||||
"description": "Trigger a binary self-update on a project sled. The sled downloads the new binary from `source_url` (defaults to this gateway's /api/huskies-binary endpoint), atomically replaces its own executable, drains CRDT persistence so no ops are lost, and re-execs. Without `project`, upgrades the active project.",
|
||||
"inputSchema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"project": {
|
||||
"type": "string",
|
||||
"description": "Name of the project sled to upgrade. Defaults to the currently active project."
|
||||
},
|
||||
"source_url": {
|
||||
"type": "string",
|
||||
"description": "HTTP URL of the binary to install (e.g. 'http://gateway:3000/api/huskies-binary'). Defaults to this gateway's own binary endpoint."
|
||||
}
|
||||
}
|
||||
}
|
||||
}),
|
||||
json!({
|
||||
"name": "project_rebuild",
|
||||
"description": "Rebuild a project's Docker image from its Dockerfile.fragment, swap the container, and preserve all CRDT and pipeline state. In-flight coder/merge work is drained before the swap; if not drainable within the timeout the command refuses. On success returns the new image hash and container ID.",
|
||||
"inputSchema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"name": {
|
||||
"type": "string",
|
||||
"description": "Name of the project to rebuild (must exist in projects.toml with host_path set)."
|
||||
},
|
||||
"drain_timeout_secs": {
|
||||
"type": "integer",
|
||||
"description": "Seconds to wait for active agents to stop before rebuilding (default: 60). Pass 0 to skip the drain check."
|
||||
},
|
||||
"force": {
|
||||
"type": "boolean",
|
||||
"description": "If true, skip the drain check and rebuild immediately even if agents are running."
|
||||
}
|
||||
},
|
||||
"required": ["name"]
|
||||
}
|
||||
}),
|
||||
]
|
||||
}
|
||||
|
||||
@@ -358,9 +424,12 @@ async fn handle_gateway_tool(
|
||||
"gateway_status" => handle_gateway_status_tool(state, id).await,
|
||||
"gateway_health" => handle_gateway_health_tool(state, id).await,
|
||||
"init_project" => handle_init_project_tool(params, state, id).await,
|
||||
"adopt_project" => handle_adopt_project_tool(params, state, id).await,
|
||||
"aggregate_pipeline_status" => handle_aggregate_pipeline_status_tool(state, id).await,
|
||||
"agents.list" => handle_agents_list_tool(id),
|
||||
"prompt_permission" => handle_prompt_permission_tool(params, state, id).await,
|
||||
"upgrade_sled" => handle_upgrade_sled_tool(params, state, id).await,
|
||||
"project_rebuild" => handle_project_rebuild_tool(params, state, id).await,
|
||||
_ => JsonRpcResponse::error(id, -32601, format!("Unknown gateway tool: {tool_name}")),
|
||||
}
|
||||
}
|
||||
@@ -525,6 +594,82 @@ async fn handle_init_project_tool(
|
||||
}
|
||||
}
|
||||
|
||||
/// Handle the `adopt_project` gateway tool.
|
||||
///
|
||||
/// Wraps a Docker container around an existing host checkout — the MCP
|
||||
/// equivalent of the `new project <name> --adopt <path>` chat command.
|
||||
/// Validates that `path` exists and is a directory before delegating to
|
||||
/// `handle_new_project`, which performs stack detection, container launch,
|
||||
/// SSH keypair generation, and project registration.
|
||||
async fn handle_adopt_project_tool(
|
||||
params: &Value,
|
||||
state: &GatewayState,
|
||||
id: Option<Value>,
|
||||
) -> JsonRpcResponse {
|
||||
use crate::chat::transport::matrix::new_project::handle_new_project;
|
||||
|
||||
let args = params.get("arguments").unwrap_or(params);
|
||||
let name = args
|
||||
.get("name")
|
||||
.and_then(|v| v.as_str())
|
||||
.unwrap_or("")
|
||||
.trim();
|
||||
let path_str = args
|
||||
.get("path")
|
||||
.and_then(|v| v.as_str())
|
||||
.unwrap_or("")
|
||||
.trim();
|
||||
let stack = args.get("stack").and_then(|v| v.as_str());
|
||||
|
||||
if name.is_empty() {
|
||||
return JsonRpcResponse::error(id, -32602, "missing required parameter: name".into());
|
||||
}
|
||||
if path_str.is_empty() {
|
||||
return JsonRpcResponse::error(id, -32602, "missing required parameter: path".into());
|
||||
}
|
||||
|
||||
let path = std::path::Path::new(path_str);
|
||||
if !path.exists() {
|
||||
return JsonRpcResponse::error(
|
||||
id,
|
||||
-32602,
|
||||
format!(
|
||||
"Adopt path `{path_str}` does not exist — specify the path to an existing checkout."
|
||||
),
|
||||
);
|
||||
}
|
||||
if !path.is_dir() {
|
||||
return JsonRpcResponse::error(
|
||||
id,
|
||||
-32602,
|
||||
format!("Adopt path `{path_str}` is not a directory."),
|
||||
);
|
||||
}
|
||||
|
||||
let result = handle_new_project(
|
||||
name,
|
||||
stack,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
Some(path_str),
|
||||
false,
|
||||
&state.projects,
|
||||
&state.config_dir,
|
||||
)
|
||||
.await;
|
||||
|
||||
JsonRpcResponse::success(
|
||||
id,
|
||||
json!({
|
||||
"content": [{
|
||||
"type": "text",
|
||||
"text": result
|
||||
}]
|
||||
}),
|
||||
)
|
||||
}
|
||||
|
||||
async fn handle_aggregate_pipeline_status_tool(
|
||||
state: &GatewayState,
|
||||
id: Option<Value>,
|
||||
@@ -669,6 +814,142 @@ fn handle_agents_list_tool(id: Option<Value>) -> JsonRpcResponse {
|
||||
)
|
||||
}
|
||||
|
||||
/// Handle the `upgrade_sled` gateway tool.
|
||||
///
|
||||
/// Posts `{"source_url": "<url>"}` to the target sled's `/api/upgrade` endpoint,
|
||||
/// which triggers the sled to download the new binary, drain CRDT persistence,
|
||||
/// and re-exec. Returns 202 text immediately — the sled connection will drop
|
||||
/// shortly after as `exec()` replaces the process.
|
||||
async fn handle_upgrade_sled_tool(
|
||||
params: &Value,
|
||||
state: &GatewayState,
|
||||
id: Option<Value>,
|
||||
) -> JsonRpcResponse {
|
||||
let args = params.get("arguments").unwrap_or(params);
|
||||
|
||||
// Resolve target project URL (explicit project arg or active project).
|
||||
let project_name = args.get("project").and_then(|v| v.as_str());
|
||||
let sled_url = if let Some(name) = project_name {
|
||||
let projects = state.projects.read().await;
|
||||
match projects.get(name).and_then(|e| e.url.clone()) {
|
||||
Some(u) => u,
|
||||
None => {
|
||||
return JsonRpcResponse::error(
|
||||
id,
|
||||
-32602,
|
||||
format!("Project '{name}' not found or has no URL configured"),
|
||||
);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
match state.active_url().await {
|
||||
Ok(u) => u,
|
||||
Err(e) => return JsonRpcResponse::error(id, -32603, e.to_string()),
|
||||
}
|
||||
};
|
||||
|
||||
// Build the binary source URL: caller-supplied or this gateway's own endpoint.
|
||||
let source_url = args
|
||||
.get("source_url")
|
||||
.and_then(|v| v.as_str())
|
||||
.map(|s| s.to_string())
|
||||
.unwrap_or_else(|| {
|
||||
// Default: the gateway serves its own binary at /api/huskies-binary.
|
||||
// Use the same host/port the gateway is bound to.
|
||||
std::env::var("HUSKIES_GATEWAY_BINARY_URL")
|
||||
.unwrap_or_else(|_| format!("http://gateway:{}/api/huskies-binary", state.port))
|
||||
});
|
||||
|
||||
let upgrade_url = format!("{sled_url}/api/upgrade");
|
||||
let body = serde_json::json!({ "source_url": source_url });
|
||||
|
||||
let active_name = project_name.map(|s| s.to_string()).unwrap_or_else(|| {
|
||||
state
|
||||
.active_project
|
||||
.try_read()
|
||||
.map(|g| g.clone())
|
||||
.unwrap_or_default()
|
||||
});
|
||||
|
||||
match state.client.post(&upgrade_url).json(&body).send().await {
|
||||
Ok(resp) if resp.status().is_success() || resp.status().as_u16() == 202 => {
|
||||
JsonRpcResponse::success(
|
||||
id,
|
||||
json!({
|
||||
"content": [{
|
||||
"type": "text",
|
||||
"text": format!(
|
||||
"Upgrade triggered on '{active_name}'. The sled is downloading the new binary from {source_url} and will re-exec momentarily."
|
||||
)
|
||||
}]
|
||||
}),
|
||||
)
|
||||
}
|
||||
Ok(resp) => JsonRpcResponse::error(
|
||||
id,
|
||||
-32603,
|
||||
format!(
|
||||
"Sled returned HTTP {} for upgrade request to {upgrade_url}",
|
||||
resp.status()
|
||||
),
|
||||
),
|
||||
Err(e) => JsonRpcResponse::error(
|
||||
id,
|
||||
-32603,
|
||||
format!("Failed to send upgrade request to {upgrade_url}: {e}"),
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
/// Handle the `project_rebuild` gateway tool.
|
||||
///
|
||||
/// Rebuilds a project's Docker image, swaps the container, and preserves all
|
||||
/// CRDT and pipeline state. Delegates to `handle_project_rebuild` in the chat
|
||||
/// transport module so the logic is shared between the chat and MCP entry points.
|
||||
async fn handle_project_rebuild_tool(
|
||||
params: &Value,
|
||||
state: &GatewayState,
|
||||
id: Option<Value>,
|
||||
) -> JsonRpcResponse {
|
||||
use crate::chat::transport::matrix::project_rebuild::handle_project_rebuild;
|
||||
|
||||
let args = params.get("arguments").unwrap_or(params);
|
||||
let name = args
|
||||
.get("name")
|
||||
.and_then(|v| v.as_str())
|
||||
.unwrap_or("")
|
||||
.trim();
|
||||
|
||||
if name.is_empty() {
|
||||
return JsonRpcResponse::error(id, -32602, "missing required parameter: name".into());
|
||||
}
|
||||
|
||||
let drain_timeout_secs = args
|
||||
.get("drain_timeout_secs")
|
||||
.and_then(|v| v.as_u64())
|
||||
.unwrap_or(60);
|
||||
let force = args.get("force").and_then(|v| v.as_bool()).unwrap_or(false);
|
||||
|
||||
let result = handle_project_rebuild(
|
||||
name,
|
||||
drain_timeout_secs,
|
||||
force,
|
||||
&state.projects,
|
||||
&state.config_dir,
|
||||
)
|
||||
.await;
|
||||
|
||||
JsonRpcResponse::success(
|
||||
id,
|
||||
json!({
|
||||
"content": [{
|
||||
"type": "text",
|
||||
"text": result
|
||||
}]
|
||||
}),
|
||||
)
|
||||
}
|
||||
|
||||
/// Handle the `pipeline.get` read-RPC — returns per-project item lists in the
|
||||
/// shape expected by the gateway web UI:
|
||||
/// `{ "active": "...", "projects": { "name": { "active": [...], "backlog_count": N } } }`.
|
||||
@@ -686,3 +967,124 @@ async fn handle_pipeline_get(state: &GatewayState, id: Option<Value>) -> JsonRpc
|
||||
|
||||
JsonRpcResponse::success(id, json!({ "active": active, "projects": results }))
|
||||
}
|
||||
|
||||
// ── Tests ────────────────────────────────────────────────────────────────────
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::service::gateway::config::{GatewayConfig, ProjectEntry};
    use std::collections::BTreeMap;
    use std::sync::Arc;

    /// Build a gateway state with a single registered project
    /// (`test-project` at `http://127.0.0.1:3001`), no sled tokens, and
    /// port 3000, rooted at `config_dir`.
    fn make_test_state(config_dir: &std::path::Path) -> Arc<GatewayState> {
        let mut projects = BTreeMap::new();
        projects.insert(
            "test-project".to_string(),
            ProjectEntry::with_url("http://127.0.0.1:3001"),
        );
        let config = GatewayConfig {
            projects,
            sled_tokens: BTreeMap::new(),
        };
        Arc::new(GatewayState::new(config, config_dir.to_path_buf(), 3000).unwrap())
    }

    /// Omitting `name` produces an error that mentions the parameter.
    #[tokio::test]
    async fn adopt_project_tool_missing_name_returns_error() {
        let dir = tempfile::tempdir().unwrap();
        let state = make_test_state(dir.path());
        let params = json!({ "arguments": { "path": "/some/path" } });
        let resp = handle_adopt_project_tool(&params, &state, Some(json!(1))).await;
        assert!(resp.error.is_some(), "expected error for missing name");
        let msg = resp.error.unwrap().message;
        assert!(msg.contains("name"), "expected 'name' in error, got: {msg}");
    }

    /// Omitting `path` produces an error that mentions the parameter.
    #[tokio::test]
    async fn adopt_project_tool_missing_path_returns_error() {
        let dir = tempfile::tempdir().unwrap();
        let state = make_test_state(dir.path());
        let params = json!({ "arguments": { "name": "myapp" } });
        let resp = handle_adopt_project_tool(&params, &state, Some(json!(1))).await;
        assert!(resp.error.is_some(), "expected error for missing path");
        let msg = resp.error.unwrap().message;
        assert!(msg.contains("path"), "expected 'path' in error, got: {msg}");
    }

    /// A path that is not on disk is rejected with "does not exist".
    #[tokio::test]
    async fn adopt_project_tool_nonexistent_path_returns_error() {
        let dir = tempfile::tempdir().unwrap();
        let state = make_test_state(dir.path());
        let params = json!({ "arguments": { "name": "myapp", "path": "/nonexistent/xyz/abc123" } });
        let resp = handle_adopt_project_tool(&params, &state, Some(json!(1))).await;
        assert!(resp.error.is_some(), "expected error for nonexistent path");
        let msg = resp.error.unwrap().message;
        assert!(
            msg.contains("does not exist"),
            "expected 'does not exist' in error, got: {msg}"
        );
    }

    /// A path that points at a regular file is rejected with "not a directory".
    #[tokio::test]
    async fn adopt_project_tool_file_path_returns_error() {
        let dir = tempfile::tempdir().unwrap();
        let file = dir.path().join("not_a_dir.txt");
        std::fs::write(&file, "content").unwrap();
        let state = make_test_state(dir.path());
        let params = json!({ "arguments": { "name": "myapp", "path": file.to_str().unwrap() } });
        let resp = handle_adopt_project_tool(&params, &state, Some(json!(1))).await;
        assert!(resp.error.is_some(), "expected error for file path");
        let msg = resp.error.unwrap().message;
        assert!(
            msg.contains("not a directory"),
            "expected 'not a directory' in error, got: {msg}"
        );
    }

    /// The MCP entry point produces the same validation outcome as the chat-routed call.
    ///
    /// The path-is-a-file case is run through both `handle_new_project` (chat)
    /// and `handle_adopt_project_tool` (MCP); both must report
    /// "not a directory".
    #[tokio::test]
    async fn adopt_project_tool_matches_chat_routed_call() {
        use crate::chat::transport::matrix::new_project::handle_new_project;
        use tokio::sync::RwLock;

        let dir = tempfile::tempdir().unwrap();
        let file = dir.path().join("a_file.txt");
        std::fs::write(&file, "not a dir").unwrap();
        let file_path = file.to_str().unwrap();

        // Chat-routed: handle_new_project returns a text string with the error.
        let store = Arc::new(RwLock::new(BTreeMap::new()));
        let chat_result = handle_new_project(
            "myapp",
            None,
            None,
            None,
            None,
            Some(file_path),
            false,
            &store,
            dir.path(),
        )
        .await;
        assert!(
            chat_result.contains("not a directory"),
            "chat path should report 'not a directory', got: {chat_result}"
        );

        // MCP-routed: handle_adopt_project_tool returns a JSON-RPC error.
        let state = make_test_state(dir.path());
        let params = json!({ "arguments": { "name": "myapp2", "path": file_path } });
        let mcp_resp = handle_adopt_project_tool(&params, &state, Some(json!(1))).await;
        assert!(mcp_resp.error.is_some(), "MCP path should return an error");
        let mcp_msg = mcp_resp.error.unwrap().message;
        assert!(
            mcp_msg.contains("not a directory"),
            "MCP path should report 'not a directory', got: {mcp_msg}"
        );
    }
}
|
||||
|
||||
@@ -115,6 +115,7 @@ pub(crate) fn tool_dump_crdt(args: &Value) -> Result<String, String> {
|
||||
"content_index": item.content_index,
|
||||
"is_deleted": item.is_deleted,
|
||||
"origin": item.origin,
|
||||
"item_type": item.item_type,
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
@@ -126,7 +127,7 @@ pub(crate) fn tool_dump_crdt(args: &Value) -> Result<String, String> {
|
||||
"total_ops_in_list": dump.total_ops_in_list,
|
||||
"max_seq_in_list": dump.max_seq_in_list,
|
||||
"persisted_ops_count": dump.persisted_ops_count,
|
||||
"pending_persist_ops_count": null,
|
||||
"pending_persist_ops_count": dump.pending_persist_ops_count,
|
||||
},
|
||||
"items": items,
|
||||
}))
|
||||
|
||||
@@ -102,9 +102,14 @@ pub async fn dispatch_tool_call(
|
||||
"move_story" => diagnostics::tool_move_story(&args, ctx),
|
||||
// Unblock story
|
||||
"unblock_story" => story_tools::tool_unblock_story(&args, ctx),
|
||||
// Convert work-item type in place (story 1141)
|
||||
"convert_item_type" => story_tools::tool_convert_item_type(&args, ctx),
|
||||
// Freeze / unfreeze story
|
||||
"freeze_story" => story_tools::tool_freeze_story(&args, ctx),
|
||||
"unfreeze_story" => story_tools::tool_unfreeze_story(&args, ctx),
|
||||
// Worktree-sandboxed file editing (replaces Claude's built-in Edit/Write for coder agents)
|
||||
"edit" => shell_tools::tool_edit(&args, ctx),
|
||||
"write" => shell_tools::tool_write(&args, ctx),
|
||||
// Shell command execution
|
||||
"run_command" => shell_tools::tool_run_command(&args, ctx).await,
|
||||
"run_tests" => shell_tools::tool_run_tests(&args, ctx).await,
|
||||
|
||||
@@ -0,0 +1,452 @@
|
||||
//! MCP file-editing tools: `edit` and `write`.
|
||||
//!
|
||||
//! These are worktree-sandboxed equivalents of Claude's built-in `Edit` and
|
||||
//! `Write` tools. All paths must canonicalize to inside `.huskies/worktrees/`
|
||||
//! so agents cannot write to the master working tree.
|
||||
|
||||
use crate::http::context::AppContext;
|
||||
use serde_json::Value;
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
/// Validate that `file_path` is an absolute path whose nearest existing
|
||||
/// ancestor lies inside the project's `.huskies/worktrees/` directory.
|
||||
///
|
||||
/// Unlike [`crate::service::shell::io::validate_working_dir`], the target file
|
||||
/// itself need not exist (write creates it), so we walk up to the first
|
||||
/// existing ancestor before canonicalising.
|
||||
///
|
||||
/// Returns the original (non-canonicalized) `PathBuf` on success so the
|
||||
/// caller can use it directly for I/O.
|
||||
///
|
||||
/// # Errors
|
||||
/// Returns a `String` error naming both the worktrees root and the offending
|
||||
/// path, matching the style of the `run_command` guard.
|
||||
pub(super) fn validate_worktree_file_path(
|
||||
file_path: &str,
|
||||
ctx: &AppContext,
|
||||
) -> Result<PathBuf, String> {
|
||||
let path = PathBuf::from(file_path);
|
||||
|
||||
if !path.is_absolute() {
|
||||
return Err(format!(
|
||||
"file_path must be an absolute path, got: {file_path}"
|
||||
));
|
||||
}
|
||||
|
||||
let project_root = ctx.services.agents.get_project_root(&ctx.state)?;
|
||||
let worktrees_root = project_root.join(".huskies").join("worktrees");
|
||||
|
||||
if !worktrees_root.exists() {
|
||||
return Err(format!(
|
||||
"No worktrees directory found; file_path must be inside {worktrees_root:?}, got: {file_path}"
|
||||
));
|
||||
}
|
||||
|
||||
let canonical_wt = worktrees_root
|
||||
.canonicalize()
|
||||
.map_err(|e| format!("Cannot canonicalize worktrees root: {e}"))?;
|
||||
|
||||
// Walk up to find the deepest existing ancestor so we can canonicalize it.
|
||||
let canonical_ancestor = find_existing_ancestor(&path)
|
||||
.ok_or_else(|| format!("file_path has no accessible ancestor on disk: {file_path}"))?
|
||||
.canonicalize()
|
||||
.map_err(|e| format!("Cannot canonicalize path: {e}"))?;
|
||||
|
||||
if !canonical_ancestor.starts_with(&canonical_wt) {
|
||||
return Err(format!(
|
||||
"file_path must be inside worktrees root {worktrees_root:?}. Got: {file_path}"
|
||||
));
|
||||
}
|
||||
|
||||
Ok(path)
|
||||
}
|
||||
|
||||
/// Return the deepest ancestor of `p` (inclusive) that exists on disk.
///
/// Existence is tested with `symlink_metadata`, i.e. without following a
/// final symlink. A dangling symlink therefore counts as existing and is
/// returned itself, instead of being skipped in favour of its parent
/// directory. That lets the caller's canonicalisation fail on it rather than
/// letting a later write create the link's target.
///
/// Returns `None` when no ancestor of `p` can be stat-ed.
fn find_existing_ancestor(p: &Path) -> Option<&Path> {
    // `ancestors()` yields `p`, then `p.parent()`, and so on up to the root.
    p.ancestors()
        .find(|candidate| candidate.symlink_metadata().is_ok())
}
|
||||
|
||||
/// Replace `old_string` with `new_string` in a file inside the agent's worktree.
|
||||
///
|
||||
/// Mirrors Claude's built-in `Edit` tool with worktree path validation.
|
||||
/// By default replaces only the first occurrence; pass `replace_all: true`
|
||||
/// to replace every occurrence.
|
||||
pub(crate) fn tool_edit(args: &Value, ctx: &AppContext) -> Result<String, String> {
|
||||
let file_path = args
|
||||
.get("file_path")
|
||||
.and_then(|v| v.as_str())
|
||||
.ok_or("Missing required argument: file_path")?;
|
||||
let old_string = args
|
||||
.get("old_string")
|
||||
.and_then(|v| v.as_str())
|
||||
.ok_or("Missing required argument: old_string")?;
|
||||
let new_string = args
|
||||
.get("new_string")
|
||||
.and_then(|v| v.as_str())
|
||||
.ok_or("Missing required argument: new_string")?;
|
||||
let replace_all = args
|
||||
.get("replace_all")
|
||||
.and_then(|v| v.as_bool())
|
||||
.unwrap_or(false);
|
||||
|
||||
let path = validate_worktree_file_path(file_path, ctx)?;
|
||||
|
||||
if !path.exists() {
|
||||
return Err(format!("file_path does not exist: {file_path}"));
|
||||
}
|
||||
|
||||
let content =
|
||||
std::fs::read_to_string(&path).map_err(|e| format!("Failed to read {file_path}: {e}"))?;
|
||||
|
||||
if !content.contains(old_string) {
|
||||
return Err(format!(
|
||||
"old_string not found in {file_path}: {old_string:?}"
|
||||
));
|
||||
}
|
||||
|
||||
let new_content = if replace_all {
|
||||
content.replace(old_string, new_string)
|
||||
} else {
|
||||
content.replacen(old_string, new_string, 1)
|
||||
};
|
||||
|
||||
std::fs::write(&path, &new_content).map_err(|e| format!("Failed to write {file_path}: {e}"))?;
|
||||
|
||||
Ok(format!("Edited {file_path}"))
|
||||
}
|
||||
|
||||
/// Write `content` to a file inside the agent's worktree, creating the file
|
||||
/// (and any missing parent directories) if necessary.
|
||||
///
|
||||
/// Mirrors Claude's built-in `Write` tool with worktree path validation.
|
||||
pub(crate) fn tool_write(args: &Value, ctx: &AppContext) -> Result<String, String> {
|
||||
let file_path = args
|
||||
.get("file_path")
|
||||
.and_then(|v| v.as_str())
|
||||
.ok_or("Missing required argument: file_path")?;
|
||||
let content = args
|
||||
.get("content")
|
||||
.and_then(|v| v.as_str())
|
||||
.ok_or("Missing required argument: content")?;
|
||||
|
||||
let path = validate_worktree_file_path(file_path, ctx)?;
|
||||
|
||||
if let Some(parent) = path.parent() {
|
||||
std::fs::create_dir_all(parent)
|
||||
.map_err(|e| format!("Failed to create parent dirs for {file_path}: {e}"))?;
|
||||
}
|
||||
|
||||
std::fs::write(&path, content).map_err(|e| format!("Failed to write {file_path}: {e}"))?;
|
||||
|
||||
Ok(format!("Written {file_path}"))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::http::test_helpers::test_ctx;
|
||||
use serde_json::json;
|
||||
|
||||
fn make_worktree(tmp: &tempfile::TempDir, name: &str) -> PathBuf {
|
||||
let wt = tmp.path().join(".huskies").join("worktrees").join(name);
|
||||
std::fs::create_dir_all(&wt).unwrap();
|
||||
wt
|
||||
}
|
||||
|
||||
// ── validate_worktree_file_path ───────────────────────────────────
|
||||
|
||||
/// A relative `file_path` is refused with an error that mentions "absolute".
#[test]
fn validate_rejects_relative_path() {
    let project = tempfile::tempdir().unwrap();
    make_worktree(&project, "42_test");
    let ctx = test_ctx(project.path());

    let outcome = validate_worktree_file_path("relative/path.rs", &ctx);

    assert!(outcome.is_err());
    let message = outcome.unwrap_err();
    assert!(message.contains("absolute"));
}
|
||||
|
||||
/// An absolute path in the project but outside `.huskies/worktrees/` is
/// refused, and the error names the worktrees root.
#[test]
fn validate_rejects_path_outside_worktree() {
    let project = tempfile::tempdir().unwrap();
    make_worktree(&project, "42_test");
    let ctx = test_ctx(project.path());

    // <project>/server/foo.rs lives beside .huskies/, not under worktrees/.
    let outside_file = project.path().join("server").join("foo.rs");
    let outcome = validate_worktree_file_path(outside_file.to_str().unwrap(), &ctx);

    assert!(outcome.is_err(), "expected rejection, got ok");
    let msg = outcome.unwrap_err();
    assert!(
        msg.contains("worktrees"),
        "error should name worktrees root: {msg}"
    );
}
|
||||
|
||||
/// A file that already exists inside a worktree passes validation.
#[test]
fn validate_accepts_existing_file_inside_worktree() {
    let project = tempfile::tempdir().unwrap();
    let worktree = make_worktree(&project, "42_test");
    let existing = worktree.join("foo.rs");
    std::fs::write(&existing, "content").unwrap();
    let ctx = test_ctx(project.path());

    let outcome = validate_worktree_file_path(existing.to_str().unwrap(), &ctx);

    assert!(outcome.is_ok(), "expected ok, got: {:?}", outcome);
}
|
||||
|
||||
/// A not-yet-created file whose parent directory is a worktree passes
/// validation.
#[test]
fn validate_accepts_nonexistent_file_inside_worktree() {
    let project = tempfile::tempdir().unwrap();
    let worktree = make_worktree(&project, "42_test");
    let ctx = test_ctx(project.path());

    // Only the parent directory exists; the file itself is never created.
    let pending = worktree.join("new_file.rs");
    let outcome = validate_worktree_file_path(pending.to_str().unwrap(), &ctx);

    assert!(
        outcome.is_ok(),
        "expected ok for new file, got: {:?}",
        outcome
    );
}
|
||||
|
||||
/// Without a `.huskies/worktrees/` directory every path is refused, and the
/// error mentions "worktrees".
#[test]
fn validate_rejects_no_worktrees_dir() {
    // Deliberately no make_worktree() call: the worktrees dir must be absent.
    let project = tempfile::tempdir().unwrap();
    let ctx = test_ctx(project.path());

    let candidate = project.path().join("file.rs");
    let outcome = validate_worktree_file_path(candidate.to_str().unwrap(), &ctx);

    assert!(outcome.is_err());
    let message = outcome.unwrap_err();
    assert!(message.contains("worktrees"));
}
|
||||
|
||||
// ── tool_edit ─────────────────────────────────────────────────────
|
||||
|
||||
/// AC3(a) — path outside worktree is rejected
///
/// The file outside the worktrees directory must also be left untouched.
#[test]
fn tool_edit_rejects_path_outside_worktree() {
    let project = tempfile::tempdir().unwrap();
    make_worktree(&project, "42_test");

    // A real file that lives outside .huskies/worktrees/.
    let server_dir = project.path().join("server");
    std::fs::create_dir_all(&server_dir).unwrap();
    let master_file = server_dir.join("foo.rs");
    std::fs::write(&master_file, "old content").unwrap();
    let ctx = test_ctx(project.path());

    let request = json!({
        "file_path": master_file.to_str().unwrap(),
        "old_string": "old content",
        "new_string": "new content"
    });
    let outcome = tool_edit(&request, &ctx);

    assert!(outcome.is_err(), "expected rejection");
    let on_disk = std::fs::read_to_string(&master_file).unwrap();
    assert_eq!(on_disk, "old content", "master file must be unchanged");
}
|
||||
|
||||
/// AC3(b) — path inside worktree succeeds
///
/// The replacement must be visible in the file afterwards.
#[test]
fn tool_edit_accepts_path_inside_worktree() {
    let project = tempfile::tempdir().unwrap();
    let worktree = make_worktree(&project, "42_test");
    let source_file = worktree.join("foo.rs");
    std::fs::write(&source_file, "fn old_fn() {}").unwrap();
    let ctx = test_ctx(project.path());

    let request = json!({
        "file_path": source_file.to_str().unwrap(),
        "old_string": "old_fn",
        "new_string": "new_fn"
    });
    let outcome = tool_edit(&request, &ctx);

    assert!(outcome.is_ok(), "expected ok, got: {:?}", outcome);
    let on_disk = std::fs::read_to_string(&source_file).unwrap();
    assert!(on_disk.contains("new_fn"));
    assert!(!on_disk.contains("old_fn"));
}
|
||||
|
||||
/// With `replace_all: true` every occurrence of `old_string` is replaced.
#[test]
fn tool_edit_replace_all_replaces_every_occurrence() {
    let project = tempfile::tempdir().unwrap();
    let worktree = make_worktree(&project, "43_test");
    let source_file = worktree.join("multi.rs");
    std::fs::write(&source_file, "foo foo foo").unwrap();
    let ctx = test_ctx(project.path());

    let request = json!({
        "file_path": source_file.to_str().unwrap(),
        "old_string": "foo",
        "new_string": "bar",
        "replace_all": true
    });
    tool_edit(&request, &ctx).unwrap();

    let on_disk = std::fs::read_to_string(&source_file).unwrap();
    assert_eq!(on_disk, "bar bar bar");
}
|
||||
|
||||
/// Without `replace_all`, only the first occurrence of `old_string` changes.
#[test]
fn tool_edit_default_replaces_first_occurrence_only() {
    let project = tempfile::tempdir().unwrap();
    let worktree = make_worktree(&project, "44_test");
    let source_file = worktree.join("single.rs");
    std::fs::write(&source_file, "foo foo foo").unwrap();
    let ctx = test_ctx(project.path());

    let request = json!({
        "file_path": source_file.to_str().unwrap(),
        "old_string": "foo",
        "new_string": "bar"
    });
    tool_edit(&request, &ctx).unwrap();

    let on_disk = std::fs::read_to_string(&source_file).unwrap();
    assert_eq!(on_disk, "bar foo foo");
}
|
||||
|
||||
/// Editing fails with a "not found" error when `old_string` is absent from
/// the file.
#[test]
fn tool_edit_fails_when_old_string_not_found() {
    let project = tempfile::tempdir().unwrap();
    let worktree = make_worktree(&project, "45_test");
    let source_file = worktree.join("missing.rs");
    std::fs::write(&source_file, "hello world").unwrap();
    let ctx = test_ctx(project.path());

    let request = json!({
        "file_path": source_file.to_str().unwrap(),
        "old_string": "not present",
        "new_string": "x"
    });
    let outcome = tool_edit(&request, &ctx);

    assert!(outcome.is_err());
    let message = outcome.unwrap_err();
    assert!(message.contains("not found"));
}
|
||||
|
||||
/// Editing a file that was never created fails with a "does not exist" error.
#[test]
fn tool_edit_fails_when_file_does_not_exist() {
    let project = tempfile::tempdir().unwrap();
    let worktree = make_worktree(&project, "46_test");
    let ctx = test_ctx(project.path());

    // Inside the worktree, but never written to disk.
    let ghost = worktree.join("ghost.rs");
    let request = json!({
        "file_path": ghost.to_str().unwrap(),
        "old_string": "x",
        "new_string": "y"
    });
    let outcome = tool_edit(&request, &ctx);

    assert!(outcome.is_err());
    let message = outcome.unwrap_err();
    assert!(message.contains("does not exist"));
}
|
||||
|
||||
// ── tool_write ────────────────────────────────────────────────────
|
||||
|
||||
/// Writing to a path outside `.huskies/worktrees/` is refused and must not
/// create the file.
#[test]
fn tool_write_rejects_path_outside_worktree() {
    let project = tempfile::tempdir().unwrap();
    make_worktree(&project, "42_test");
    let ctx = test_ctx(project.path());

    let master_file = project.path().join("master_file.rs");
    let request = json!({
        "file_path": master_file.to_str().unwrap(),
        "content": "evil"
    });
    let outcome = tool_write(&request, &ctx);

    assert!(outcome.is_err(), "expected rejection");
    assert!(!master_file.exists(), "master file must not be created");
}
|
||||
|
||||
/// Writing to a not-yet-existing path inside the worktree creates the file
/// with exactly the supplied content.
#[test]
fn tool_write_creates_new_file_inside_worktree() {
    let scratch = tempfile::tempdir().unwrap();
    let target = make_worktree(&scratch, "47_test").join("new.rs");
    let ctx = test_ctx(scratch.path());

    let args = json!({
        "file_path": target.to_str().unwrap(),
        "content": "pub fn hello() {}"
    });
    tool_write(&args, &ctx).unwrap();

    let written = std::fs::read_to_string(&target).unwrap();
    assert_eq!(written, "pub fn hello() {}");
}
|
||||
|
||||
/// Writing to a path that already holds content replaces that content.
#[test]
fn tool_write_overwrites_existing_file() {
    let scratch = tempfile::tempdir().unwrap();
    let target = make_worktree(&scratch, "48_test").join("existing.rs");
    std::fs::write(&target, "old").unwrap();
    let ctx = test_ctx(scratch.path());

    let args = json!({
        "file_path": target.to_str().unwrap(),
        "content": "new"
    });
    tool_write(&args, &ctx).unwrap();

    let written = std::fs::read_to_string(&target).unwrap();
    assert_eq!(written, "new");
}
|
||||
|
||||
/// Writing to a path whose parent directories are missing succeeds and the
/// file can be read back afterwards.
#[test]
fn tool_write_creates_parent_dirs() {
    let scratch = tempfile::tempdir().unwrap();
    let worktree = make_worktree(&scratch, "49_test");
    // Neither `deep/` nor `deep/nested/` exists before the call.
    let target = worktree.join("deep").join("nested").join("file.rs");
    let ctx = test_ctx(scratch.path());

    let args = json!({
        "file_path": target.to_str().unwrap(),
        "content": "deep content"
    });
    tool_write(&args, &ctx).unwrap();

    let written = std::fs::read_to_string(&target).unwrap();
    assert_eq!(written, "deep content");
}
|
||||
|
||||
/// Omitting the `content` argument must yield an error that names `content`.
#[test]
fn tool_write_missing_content_arg_errors() {
    let scratch = tempfile::tempdir().unwrap();
    make_worktree(&scratch, "50_test");
    let ctx = test_ctx(scratch.path());

    let args = json!({"file_path": "/some/path"});
    let outcome = tool_write(&args, &ctx);

    assert!(outcome.is_err());
    let message = outcome.unwrap_err();
    assert!(message.contains("content"));
}
|
||||
}
|
||||
@@ -1,12 +1,14 @@
|
||||
//! MCP shell tools — run commands, execute tests, and stream output via MCP.
|
||||
//! MCP shell tools — run commands, execute tests, edit and write files.
|
||||
//!
|
||||
//! This file is a thin adapter: it deserialises MCP payloads, delegates to
|
||||
//! `crate::service::shell` for all business logic, and serialises responses.
|
||||
|
||||
mod exec;
|
||||
mod file_tools;
|
||||
mod script;
|
||||
|
||||
pub(crate) use exec::tool_run_command;
|
||||
pub(crate) use file_tools::{tool_edit, tool_write};
|
||||
pub(crate) use script::{
|
||||
tool_get_test_result, tool_run_build, tool_run_check, tool_run_lint, tool_run_tests,
|
||||
};
|
||||
|
||||
@@ -86,6 +86,7 @@ mod tests {
|
||||
use crate::http::test_helpers::test_ctx;
|
||||
|
||||
fn setup_git_repo_in(dir: &std::path::Path) {
|
||||
crate::db::ensure_content_store();
|
||||
std::process::Command::new("git")
|
||||
.args(["init"])
|
||||
.current_dir(dir)
|
||||
|
||||
@@ -69,7 +69,7 @@ pub(crate) use epic::{tool_create_epic, tool_list_epics, tool_show_epic};
|
||||
pub(crate) use refactor::{tool_create_refactor, tool_list_refactors};
|
||||
pub(crate) use spike::tool_create_spike;
|
||||
pub(crate) use story::{
|
||||
tool_accept_story, tool_create_story, tool_delete_story, tool_freeze_story,
|
||||
tool_get_pipeline_status, tool_list_upcoming, tool_purge_story, tool_unblock_story,
|
||||
tool_unfreeze_story, tool_update_story, tool_validate_stories,
|
||||
tool_accept_story, tool_convert_item_type, tool_create_story, tool_delete_story,
|
||||
tool_freeze_story, tool_get_pipeline_status, tool_list_upcoming, tool_purge_story,
|
||||
tool_unblock_story, tool_unfreeze_story, tool_update_story, tool_validate_stories,
|
||||
};
|
||||
|
||||
@@ -115,6 +115,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn tool_create_refactor_accepts_single_criterion() {
|
||||
crate::db::ensure_content_store();
|
||||
let tmp = tempfile::tempdir().unwrap();
|
||||
let ctx = test_ctx(tmp.path());
|
||||
let result = tool_create_refactor(
|
||||
@@ -146,6 +147,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn tool_create_refactor_accepts_mixed_junk_and_real_acceptance_criteria() {
|
||||
crate::db::ensure_content_store();
|
||||
let tmp = tempfile::tempdir().unwrap();
|
||||
let ctx = test_ctx(tmp.path());
|
||||
let result = tool_create_refactor(
|
||||
|
||||
@@ -118,6 +118,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn tool_create_spike_creates_file() {
|
||||
crate::db::ensure_content_store();
|
||||
let tmp = tempfile::tempdir().unwrap();
|
||||
let ctx = test_ctx(tmp.path());
|
||||
|
||||
@@ -147,6 +148,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn tool_create_spike_creates_file_without_description() {
|
||||
crate::db::ensure_content_store();
|
||||
let tmp = tempfile::tempdir().unwrap();
|
||||
let ctx = test_ctx(tmp.path());
|
||||
|
||||
@@ -202,6 +204,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn tool_create_spike_accepts_single_criterion() {
|
||||
crate::db::ensure_content_store();
|
||||
let tmp = tempfile::tempdir().unwrap();
|
||||
let ctx = test_ctx(tmp.path());
|
||||
let result = tool_create_spike(
|
||||
@@ -233,6 +236,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn tool_create_spike_accepts_mixed_junk_and_real_acceptance_criteria() {
|
||||
crate::db::ensure_content_store();
|
||||
let tmp = tempfile::tempdir().unwrap();
|
||||
let ctx = test_ctx(tmp.path());
|
||||
let result = tool_create_spike(
|
||||
|
||||
@@ -0,0 +1,178 @@
|
||||
//! MCP tool for converting a work item's type in place (story 1141).
|
||||
//!
|
||||
//! `convert_item_type` changes the type register of an existing CRDT item
|
||||
//! from any value to another (story ↔ bug ↔ spike ↔ refactor) without
|
||||
//! touching the story_id, ACs, epic association, or any other register.
|
||||
|
||||
use crate::http::context::AppContext;
|
||||
use crate::pipeline_state::Stage;
|
||||
use serde_json::Value;
|
||||
|
||||
/// Convert a work item's type in the CRDT.
|
||||
///
|
||||
/// Accepts `story_id` (full filename stem, e.g. `"42_spike_my_spike"`) and
|
||||
/// `new_type` (one of `"story"`, `"bug"`, `"spike"`, `"refactor"`, `"epic"`).
|
||||
/// Returns an error when the item does not exist or is in the `Archived` stage.
|
||||
pub(crate) fn tool_convert_item_type(args: &Value, _ctx: &AppContext) -> Result<String, String> {
|
||||
let req = crate::validation::ConvertItemTypeRequest::from_json(args)?;
|
||||
let story_id = req.story_id.as_str();
|
||||
|
||||
let item = crate::crdt_state::read_item(story_id)
|
||||
.ok_or_else(|| format!("Work item '{story_id}' not found in CRDT."))?;
|
||||
|
||||
if matches!(item.stage(), Stage::Archived { .. }) {
|
||||
return Err(format!(
|
||||
"Cannot convert '{story_id}': type change on an archived item is not allowed."
|
||||
));
|
||||
}
|
||||
|
||||
let old_type = item.item_type().map(|t| t.as_str()).unwrap_or("(inferred)");
|
||||
let new_type_str = req.new_type.as_str();
|
||||
|
||||
if !crate::crdt_state::set_item_type(story_id, Some(req.new_type)) {
|
||||
return Err(format!(
|
||||
"Failed to update item type for '{story_id}': CRDT write was rejected."
|
||||
));
|
||||
}
|
||||
|
||||
Ok(format!(
|
||||
"Converted '{story_id}' from type '{old_type}' to '{new_type_str}'."
|
||||
))
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Tests
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::http::test_helpers::test_ctx;
    use crate::io::story_metadata::ItemType;
    use serde_json::json;

    /// Seed a backlog item named "Test Spike" under `spike_id`.
    fn make_spike(spike_id: &str) {
        crate::crdt_state::init_for_test();
        crate::db::ensure_content_store();
        let meta = crate::db::ItemMeta::named("Test Spike");
        crate::db::write_item_with_content(
            spike_id,
            "backlog",
            "---\nname: Test Spike\n---\n",
            meta,
        );
    }

    /// Converting spike → story updates the type register, is visible in the
    /// CRDT dump, and leaves the epic association untouched.
    #[test]
    fn converts_spike_to_story_and_preserves_epic() {
        crate::crdt_state::init_for_test();
        let spike_id = "9111_spike_convert_regression";
        make_spike(spike_id);

        // Mark the item as a spike and attach it to epic 9000 before converting.
        crate::crdt_state::set_item_type(spike_id, Some(ItemType::Spike));
        crate::crdt_state::set_epic(spike_id, crate::crdt_state::EpicId::from_crdt_str("9000"));

        let tmp = tempfile::tempdir().unwrap();
        let ctx = test_ctx(tmp.path());

        // (i) Run the conversion and check the response text.
        let args = json!({"story_id": spike_id, "new_type": "story"});
        let result = tool_convert_item_type(&args, &ctx);
        assert!(result.is_ok(), "convert should succeed: {result:?}");
        assert!(
            result.unwrap().contains("story"),
            "response should mention new type"
        );

        // (i) The stored type must now read back as Story.
        let item = crate::crdt_state::read_item(spike_id).expect("item must exist");
        assert_eq!(
            item.item_type(),
            Some(ItemType::Story),
            "item_type should be Story after conversion"
        );

        // (ii) A live (non-deleted) dump entry must carry the new type.
        let dump = crate::crdt_state::dump_crdt_state(Some(spike_id));
        let found = dump
            .items
            .iter()
            .filter(|entry| !entry.is_deleted)
            .any(|entry| entry.item_type.as_deref() == Some("story"));
        assert!(
            found,
            "dump_crdt should show item_type='story' after conversion"
        );

        // (iii) The epic set above must still be attached.
        let expected_epic = crate::crdt_state::EpicId::from_crdt_str("9000");
        assert_eq!(
            item.epic(),
            expected_epic,
            "epic should be unchanged after type conversion"
        );
    }

    /// A request without `story_id` is rejected with an error naming the field.
    #[test]
    fn rejects_missing_story_id() {
        crate::crdt_state::init_for_test();
        let tmp = tempfile::tempdir().unwrap();
        let ctx = test_ctx(tmp.path());

        let args = json!({"new_type": "story"});
        let err = tool_convert_item_type(&args, &ctx).unwrap_err();

        assert!(
            err.contains("story_id"),
            "error should mention story_id: {err}"
        );
    }

    /// An unknown `new_type` value is rejected.
    #[test]
    fn rejects_invalid_new_type() {
        crate::crdt_state::init_for_test();
        let tmp = tempfile::tempdir().unwrap();
        let ctx = test_ctx(tmp.path());

        let args = json!({"story_id": "9112_spike_foo", "new_type": "banana"});
        let err = tool_convert_item_type(&args, &ctx).unwrap_err();

        // Either wording is accepted for the validation failure.
        let mentions_field = ["new_type", "InvalidValue"]
            .iter()
            .any(|needle| err.contains(*needle));
        assert!(mentions_field, "error should mention new_type: {err}");
    }

    /// Converting an id that was never written reports "not found".
    #[test]
    fn rejects_nonexistent_item() {
        crate::crdt_state::init_for_test();
        let tmp = tempfile::tempdir().unwrap();
        let ctx = test_ctx(tmp.path());

        let args = json!({"story_id": "9999_spike_not_real", "new_type": "story"});
        let err = tool_convert_item_type(&args, &ctx).unwrap_err();

        assert!(
            err.contains("not found"),
            "error should say not found: {err}"
        );
    }

    /// An item written in the "archived" stage cannot be converted.
    #[test]
    fn rejects_archived_item() {
        crate::crdt_state::init_for_test();
        let spike_id = "9113_spike_archived_convert";
        crate::db::ensure_content_store();
        let meta = crate::db::ItemMeta::named("Archived Spike");
        crate::db::write_item_with_content(
            spike_id,
            "archived",
            "---\nname: Archived Spike\n---\n",
            meta,
        );

        let tmp = tempfile::tempdir().unwrap();
        let ctx = test_ctx(tmp.path());

        let args = json!({"story_id": spike_id, "new_type": "story"});
        let err = tool_convert_item_type(&args, &ctx).unwrap_err();

        assert!(
            err.contains("archived"),
            "error should mention archived: {err}"
        );
    }
}
|
||||
@@ -256,6 +256,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn tool_create_story_accepts_single_criterion() {
|
||||
crate::db::ensure_content_store();
|
||||
let tmp = tempfile::tempdir().unwrap();
|
||||
let ctx = test_ctx(tmp.path());
|
||||
let result = tool_create_story(
|
||||
@@ -283,6 +284,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn tool_create_story_accepts_mixed_junk_and_real_acceptance_criteria() {
|
||||
crate::db::ensure_content_store();
|
||||
let tmp = tempfile::tempdir().unwrap();
|
||||
let ctx = test_ctx(tmp.path());
|
||||
let result = tool_create_story(
|
||||
@@ -299,6 +301,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn tool_create_story_description_is_written_to_file() {
|
||||
crate::db::ensure_content_store();
|
||||
let tmp = tempfile::tempdir().unwrap();
|
||||
let ctx = test_ctx(tmp.path());
|
||||
|
||||
@@ -368,6 +371,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn tool_create_story_html_sanitised_in_name() {
|
||||
crate::db::ensure_content_store();
|
||||
let tmp = tempfile::tempdir().unwrap();
|
||||
let ctx = test_ctx(tmp.path());
|
||||
// HTML in name is sanitised (not rejected)
|
||||
|
||||
@@ -1,11 +1,13 @@
|
||||
//! Story creation, listing, update, and lifecycle MCP tools.
|
||||
|
||||
mod convert;
|
||||
mod create;
|
||||
mod delete;
|
||||
mod freeze;
|
||||
mod query;
|
||||
mod update;
|
||||
|
||||
pub(crate) use convert::tool_convert_item_type;
|
||||
pub(crate) use create::{tool_create_story, tool_purge_story};
|
||||
pub(crate) use delete::{tool_accept_story, tool_delete_story};
|
||||
pub(crate) use freeze::{tool_freeze_story, tool_unfreeze_story};
|
||||
|
||||
@@ -124,6 +124,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn tool_create_story_and_list_upcoming() {
|
||||
crate::db::ensure_content_store();
|
||||
let tmp = tempfile::tempdir().unwrap();
|
||||
// No git repo needed: spike 61 — create_story just writes the file;
|
||||
// the filesystem watcher handles the commit asynchronously.
|
||||
|
||||
@@ -114,7 +114,10 @@ mod tests {
|
||||
assert!(names.contains(&"schedule_timer"));
|
||||
assert!(names.contains(&"list_timers"));
|
||||
assert!(names.contains(&"cancel_timer"));
|
||||
assert_eq!(tools.len(), 82);
|
||||
assert!(names.contains(&"convert_item_type"));
|
||||
assert!(names.contains(&"edit"));
|
||||
assert!(names.contains(&"write"));
|
||||
assert_eq!(tools.len(), 85);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -671,6 +671,25 @@ pub(super) fn story_tools() -> Vec<Value> {
|
||||
"required": ["story_id"]
|
||||
}
|
||||
}),
|
||||
json!({
|
||||
"name": "convert_item_type",
|
||||
"description": "Convert a work item's type in place (e.g. spike → story). The story_id, ACs, epic association, and all other registers are preserved; only the item_type register changes. Rejected for archived items.",
|
||||
"inputSchema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"story_id": {
|
||||
"type": "string",
|
||||
"description": "Work item identifier (filename stem, e.g. '42_spike_my_spike')"
|
||||
},
|
||||
"new_type": {
|
||||
"type": "string",
|
||||
"enum": ["story", "bug", "spike", "refactor", "epic"],
|
||||
"description": "Target item type"
|
||||
}
|
||||
},
|
||||
"required": ["story_id", "new_type"]
|
||||
}
|
||||
}),
|
||||
json!({
|
||||
"name": "freeze_story",
|
||||
"description": "Freeze a work item at its current pipeline stage, suppressing pipeline advancement and auto-assign until unfrozen.",
|
||||
|
||||
@@ -173,6 +173,50 @@ pub(super) fn system_tools() -> Vec<Value> {
|
||||
"required": []
|
||||
}
|
||||
}),
|
||||
json!({
|
||||
"name": "edit",
|
||||
"description": "Replace old_string with new_string in a file inside the agent's assigned worktree. Mirrors Claude's built-in Edit tool but validates that file_path is inside .huskies/worktrees/ to prevent writes to the master worktree. By default replaces the first occurrence only; set replace_all to true to replace every occurrence.",
|
||||
"inputSchema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"file_path": {
|
||||
"type": "string",
|
||||
"description": "Absolute path to the file to edit. Must be inside .huskies/worktrees/."
|
||||
},
|
||||
"old_string": {
|
||||
"type": "string",
|
||||
"description": "The exact string to replace."
|
||||
},
|
||||
"new_string": {
|
||||
"type": "string",
|
||||
"description": "The replacement string."
|
||||
},
|
||||
"replace_all": {
|
||||
"type": "boolean",
|
||||
"description": "If true, replace every occurrence of old_string. Default: false (replace first occurrence only)."
|
||||
}
|
||||
},
|
||||
"required": ["file_path", "old_string", "new_string"]
|
||||
}
|
||||
}),
|
||||
json!({
|
||||
"name": "write",
|
||||
"description": "Write content to a file inside the agent's assigned worktree, creating the file (and any missing parent directories) if necessary. Mirrors Claude's built-in Write tool but validates that file_path is inside .huskies/worktrees/ to prevent writes to the master worktree.",
|
||||
"inputSchema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"file_path": {
|
||||
"type": "string",
|
||||
"description": "Absolute path to the file to write. Must be inside .huskies/worktrees/."
|
||||
},
|
||||
"content": {
|
||||
"type": "string",
|
||||
"description": "The content to write to the file."
|
||||
}
|
||||
},
|
||||
"required": ["file_path", "content"]
|
||||
}
|
||||
}),
|
||||
json!({
|
||||
"name": "git_status",
|
||||
"description": "Return the working tree status of an agent's worktree (staged, unstaged, and untracked files). The worktree_path must be inside .huskies/worktrees/. Push and remote operations are not available.",
|
||||
|
||||
+72
-7
@@ -1,8 +1,6 @@
|
||||
//! HTTP server — module declarations for all REST, MCP, WebSocket, and SSE endpoints.
|
||||
/// Server-sent event stream for real-time agent output.
|
||||
pub mod agents_sse;
|
||||
/// Static asset serving (embedded frontend files).
|
||||
pub mod assets;
|
||||
/// Shared application context threaded through handlers.
|
||||
pub mod context;
|
||||
/// Server-sent event stream for pipeline/watcher events.
|
||||
@@ -100,15 +98,16 @@ pub fn build_routes(
|
||||
get(oauth::oauth_callback).data(oauth_state.clone()),
|
||||
)
|
||||
.at("/oauth/status", get(oauth::oauth_status))
|
||||
.at("/debug/crdt", get(debug_crdt_handler))
|
||||
.at("/assets/*path", get(assets::embedded_asset))
|
||||
.at("/", get(assets::embedded_index))
|
||||
.at("/*path", get(assets::embedded_file));
|
||||
.at("/debug/crdt", get(debug_crdt_handler));
|
||||
|
||||
if let Some(buf) = event_buffer {
|
||||
route = route.at("/api/events", get(events::events_handler).data(buf));
|
||||
}
|
||||
|
||||
route = route
|
||||
.at("/api/upgrade", post(upgrade_trigger_handler))
|
||||
.at("/api/huskies-binary", get(serve_binary_handler));
|
||||
|
||||
if let Some(wa_ctx) = whatsapp_ctx {
|
||||
route = route.at(
|
||||
"/webhook/whatsapp",
|
||||
@@ -203,7 +202,7 @@ pub fn debug_crdt_handler(req: &poem::Request) -> poem::Response {
|
||||
"total_ops_in_list": dump.total_ops_in_list,
|
||||
"max_seq_in_list": dump.max_seq_in_list,
|
||||
"persisted_ops_count": dump.persisted_ops_count,
|
||||
"pending_persist_ops_count": null,
|
||||
"pending_persist_ops_count": dump.pending_persist_ops_count,
|
||||
},
|
||||
"items": items,
|
||||
});
|
||||
@@ -214,6 +213,72 @@ pub fn debug_crdt_handler(req: &poem::Request) -> poem::Response {
|
||||
.body(serde_json::to_string_pretty(&body).unwrap_or_default())
|
||||
}
|
||||
|
||||
/// `POST /api/upgrade` — trigger a self-update on the running sled.
|
||||
///
|
||||
/// Accepts `{"source_url": "http://gateway:3000/api/huskies-binary"}` and
|
||||
/// spawns the upgrade task in the background, returning 202 immediately.
|
||||
/// The connection will be dropped when `exec()` replaces the process.
|
||||
#[poem::handler]
|
||||
pub async fn upgrade_trigger_handler(
|
||||
body: poem::web::Json<serde_json::Value>,
|
||||
ctx: poem::web::Data<&std::sync::Arc<AppContext>>,
|
||||
) -> poem::Response {
|
||||
let source_url = match body
|
||||
.0
|
||||
.get("source_url")
|
||||
.and_then(|v| v.as_str())
|
||||
.map(|s| s.to_string())
|
||||
{
|
||||
Some(u) => u,
|
||||
None => {
|
||||
return poem::Response::builder()
|
||||
.status(StatusCode::BAD_REQUEST)
|
||||
.body("Missing required field: source_url");
|
||||
}
|
||||
};
|
||||
|
||||
let project_root = ctx.state.get_project_root().unwrap_or_default();
|
||||
|
||||
// Spawn upgrade in background so we can return 202 before exec() fires.
|
||||
tokio::spawn(async move {
|
||||
if let Err(e) = crate::upgrade::upgrade_and_reexec(&source_url, &project_root).await {
|
||||
crate::slog!("[upgrade] Upgrade failed: {e}");
|
||||
}
|
||||
});
|
||||
|
||||
poem::Response::builder()
|
||||
.status(StatusCode::ACCEPTED)
|
||||
.body("Upgrade triggered. The sled will re-exec momentarily.")
|
||||
}
|
||||
|
||||
/// `GET /api/huskies-binary` — serve the running binary so peer sleds can download it.
|
||||
///
|
||||
/// Streams `current_exe()` (the binary that is currently running) as an
|
||||
/// `application/octet-stream` download. Returns 500 if the path cannot be
|
||||
/// resolved or read.
|
||||
#[poem::handler]
|
||||
pub async fn serve_binary_handler() -> poem::Response {
|
||||
let exe = match std::env::current_exe() {
|
||||
Ok(p) => p,
|
||||
Err(e) => {
|
||||
return poem::Response::builder()
|
||||
.status(StatusCode::INTERNAL_SERVER_ERROR)
|
||||
.body(format!("Cannot resolve current executable: {e}"));
|
||||
}
|
||||
};
|
||||
|
||||
match tokio::fs::read(&exe).await {
|
||||
Ok(bytes) => poem::Response::builder()
|
||||
.status(StatusCode::OK)
|
||||
.header("Content-Type", "application/octet-stream")
|
||||
.header("Content-Disposition", "attachment; filename=\"huskies\"")
|
||||
.body(bytes),
|
||||
Err(e) => poem::Response::builder()
|
||||
.status(StatusCode::INTERNAL_SERVER_ERROR)
|
||||
.body(format!("Cannot read binary at {}: {e}", exe.display())),
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
@@ -6,6 +6,7 @@ use super::spike::create_spike_file;
|
||||
use std::fs;
|
||||
|
||||
fn setup_git_repo(root: &std::path::Path) {
|
||||
crate::db::ensure_content_store();
|
||||
std::process::Command::new("git")
|
||||
.args(["init"])
|
||||
.current_dir(root)
|
||||
@@ -166,6 +167,7 @@ fn extract_bug_name_from_content_parses_heading() {
|
||||
|
||||
#[test]
|
||||
fn create_bug_file_writes_correct_content() {
|
||||
crate::db::ensure_content_store();
|
||||
let tmp = tempfile::tempdir().unwrap();
|
||||
setup_git_repo(tmp.path());
|
||||
|
||||
@@ -257,6 +259,7 @@ fn create_bug_file_rejects_empty_acceptance_criteria() {
|
||||
|
||||
#[test]
|
||||
fn create_spike_file_writes_correct_content() {
|
||||
crate::db::ensure_content_store();
|
||||
let tmp = tempfile::tempdir().unwrap();
|
||||
|
||||
let spike_id = create_spike_file(
|
||||
@@ -294,6 +297,7 @@ fn create_spike_file_writes_correct_content() {
|
||||
|
||||
#[test]
|
||||
fn create_spike_file_uses_description_when_provided() {
|
||||
crate::db::ensure_content_store();
|
||||
let tmp = tempfile::tempdir().unwrap();
|
||||
let description = "What is the best approach for watching filesystem events?";
|
||||
|
||||
@@ -319,6 +323,7 @@ fn create_spike_file_uses_description_when_provided() {
|
||||
|
||||
#[test]
|
||||
fn create_spike_file_uses_placeholder_when_no_description() {
|
||||
crate::db::ensure_content_store();
|
||||
let tmp = tempfile::tempdir().unwrap();
|
||||
let spike_id = create_spike_file(
|
||||
tmp.path(),
|
||||
@@ -350,6 +355,7 @@ fn create_spike_file_rejects_empty_name() {
|
||||
|
||||
#[test]
|
||||
fn create_spike_file_with_special_chars_in_name_produces_valid_yaml() {
|
||||
crate::db::ensure_content_store();
|
||||
let tmp = tempfile::tempdir().unwrap();
|
||||
let name = "Spike: compare \"fast\" vs slow encoders";
|
||||
let result = create_spike_file(
|
||||
@@ -423,6 +429,7 @@ fn create_bug_file_with_depends_on_persists_to_crdt() {
|
||||
|
||||
#[test]
|
||||
fn create_bug_file_without_depends_on_omits_field() {
|
||||
crate::db::ensure_content_store();
|
||||
let tmp = tempfile::tempdir().unwrap();
|
||||
setup_git_repo(tmp.path());
|
||||
|
||||
@@ -474,6 +481,7 @@ fn create_refactor_file_with_depends_on_persists_to_crdt() {
|
||||
|
||||
#[test]
|
||||
fn create_refactor_file_without_depends_on_omits_field() {
|
||||
crate::db::ensure_content_store();
|
||||
let tmp = tempfile::tempdir().unwrap();
|
||||
setup_git_repo(tmp.path());
|
||||
|
||||
|
||||
@@ -86,6 +86,14 @@ pub async fn ws_handler(ws: WebSocket, ctx: Data<&Arc<AppContext>>) -> impl poem
|
||||
ws::subscribe_status(tx.clone(), ctx.services.status.subscribe());
|
||||
}
|
||||
|
||||
// Subscribe to real-time pipeline-transition events for this persona.
|
||||
// Events that arrived while no client was connected are caught up by
|
||||
// assemble_prompt_context at turn time.
|
||||
ws::subscribe_persona_pipeline_events(
|
||||
tx.clone(),
|
||||
ctx.services.bot_name.to_lowercase(),
|
||||
);
|
||||
|
||||
// Map of pending permission request_id -> oneshot responder.
|
||||
let mut pending_perms: HashMap<String, oneshot::Sender<PermissionDecision>> =
|
||||
HashMap::new();
|
||||
@@ -109,9 +117,11 @@ pub async fn ws_handler(ws: WebSocket, ctx: Data<&Arc<AppContext>>) -> impl poem
|
||||
let tx_activity = tx.clone();
|
||||
let ctx_clone = ctx.clone();
|
||||
|
||||
let persona = ctx_clone.services.bot_name.to_lowercase();
|
||||
let chat_fut = chat::chat(
|
||||
messages,
|
||||
config,
|
||||
&persona,
|
||||
&ctx_clone.state,
|
||||
ctx_clone.store.as_ref(),
|
||||
move |history| {
|
||||
|
||||
@@ -113,10 +113,13 @@ pub fn cancel_chat(state: &SessionState) -> Result<(), String> {
|
||||
}
|
||||
|
||||
/// Run a multi-turn chat with tool calling against the configured provider.
|
||||
///
|
||||
/// `persona` is the persona name used to key CRDT event-log assembly (e.g. `"timmy"`).
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub async fn chat<F, U, T, A>(
|
||||
mut messages: Vec<Message>,
|
||||
config: ProviderConfig,
|
||||
persona: &str,
|
||||
state: &SessionState,
|
||||
store: &dyn StoreOps,
|
||||
mut on_update: F,
|
||||
@@ -139,6 +142,11 @@ where
|
||||
let received_at = Utc::now().format("%Y-%m-%dT%H:%M:%SZ").to_string();
|
||||
inject_received_at(&mut messages, &received_at);
|
||||
|
||||
// Assemble CRDT pipeline-transition events once per turn and advance the
|
||||
// high-water mark. Uses the caller-supplied persona so all transports share
|
||||
// the same event stream regardless of transport-specific session identifiers.
|
||||
let event_ctx = crate::llm_session::assemble_prompt_context(persona);
|
||||
|
||||
let _ = state.cancel_tx.send(false);
|
||||
let mut cancel_rx = state.cancel_rx.clone();
|
||||
cancel_rx.borrow_and_update();
|
||||
@@ -177,10 +185,14 @@ where
|
||||
// would be lost because Claude Code only receives a single prompt
|
||||
// string. In that case, prepend the conversation history so the LLM
|
||||
// retains full context even though the session cannot be resumed.
|
||||
// In both cases, prepend any pending CRDT pipeline-transition events.
|
||||
let user_message = if config.session_id.is_some() {
|
||||
latest_user_content
|
||||
format!("{event_ctx}{latest_user_content}")
|
||||
} else {
|
||||
build_claude_code_context_prompt(&messages, &latest_user_content)
|
||||
format!(
|
||||
"{event_ctx}{}",
|
||||
build_claude_code_context_prompt(&messages, &latest_user_content)
|
||||
)
|
||||
};
|
||||
|
||||
let project_root = state
|
||||
@@ -233,6 +245,14 @@ where
|
||||
&[]
|
||||
};
|
||||
|
||||
// Prepend pipeline-transition events to the last user message so Anthropic
|
||||
// and Ollama providers also receive the CRDT context on every turn.
|
||||
if !event_ctx.is_empty()
|
||||
&& let Some(msg) = messages.iter_mut().rev().find(|m| m.role == Role::User)
|
||||
{
|
||||
msg.content = format!("{event_ctx}{}", msg.content);
|
||||
}
|
||||
|
||||
let mut current_history = messages.clone();
|
||||
|
||||
// Build the system prompt — append onboarding instructions when the
|
||||
@@ -608,6 +628,7 @@ mod tests {
|
||||
let result = chat(
|
||||
messages,
|
||||
config,
|
||||
"timmy",
|
||||
&state,
|
||||
&store,
|
||||
|_| {},
|
||||
@@ -652,6 +673,7 @@ mod tests {
|
||||
let result = chat(
|
||||
messages,
|
||||
config,
|
||||
"timmy",
|
||||
&state,
|
||||
&store,
|
||||
|_| {},
|
||||
@@ -692,6 +714,7 @@ mod tests {
|
||||
let result = chat(
|
||||
messages,
|
||||
config,
|
||||
"timmy",
|
||||
&state,
|
||||
&store,
|
||||
|_| {},
|
||||
|
||||
@@ -0,0 +1,331 @@
|
||||
//! LLM session management — CRDT-backed context assembly for bot prompts.
|
||||
//!
|
||||
//! The central export is [`assemble_prompt_context`], which reads new pipeline
|
||||
//! transition events from the CRDT event log past the persona's stored high-water
|
||||
//! marks, wraps them in a `<system-reminder>` block for injection at the head of
|
||||
//! the next LLM prompt, and atomically advances the marks so a mid-turn crash
|
||||
//! cannot double-inject the same events.
|
||||
|
||||
/// Assemble a `<system-reminder>` block containing new pipeline-transition events
|
||||
/// for `persona` and atomically advance the high-water marks.
|
||||
///
|
||||
/// All chat transports call this with the same persona name (e.g. `"timmy"`)
|
||||
/// so that events are visible to whichever transport handles the next turn,
|
||||
/// regardless of transport-specific session identifiers. Returns an empty
|
||||
/// string when there are no new events or the CRDT is not yet initialised.
|
||||
pub fn assemble_prompt_context(persona: &str) -> String {
|
||||
let lines = crate::crdt_state::assemble_and_advance_session(persona);
|
||||
let event_count = lines.len();
|
||||
crate::slog!(
|
||||
"[llm-session] assemble_prompt_context persona={persona} new_events={event_count}"
|
||||
);
|
||||
if lines.is_empty() {
|
||||
return String::new();
|
||||
}
|
||||
let body = lines.join("\n");
|
||||
format!("<system-reminder>\n{body}\n</system-reminder>\n")
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::pipeline_state::{PipelineEvent, PlanState, Stage, StoryId, TransitionFired};
|
||||
|
||||
fn make_fired(story_id: &str) -> TransitionFired {
|
||||
TransitionFired {
|
||||
story_id: StoryId(story_id.to_string()),
|
||||
before: Stage::Backlog,
|
||||
after: Stage::Coding {
|
||||
claim: None,
|
||||
plan: PlanState::Missing,
|
||||
retries: 0,
|
||||
},
|
||||
event: PipelineEvent::DepsMet,
|
||||
at: chrono::Utc::now(),
|
||||
}
|
||||
}
|
||||
|
||||
/// AC 4: after logging `TransitionFired` events, `assemble_prompt_context`
/// renders them inside a `<system-reminder>` block; calling it again for the
/// same persona returns an empty string because the events were consumed.
#[test]
fn assemble_prompt_context_includes_new_events_and_advances_high_water() {
    crate::crdt_state::init_for_test();

    // Two transitions, one per story.
    for story in ["42_story_foo", "99_story_bar"] {
        crate::event_log::log_transition_event(&make_fired(story));
    }

    let ctx = assemble_prompt_context("room-test-1");

    // The block must open and close with the reminder tags.
    assert!(
        ctx.starts_with("<system-reminder>\n"),
        "missing opening tag; got: {ctx}"
    );
    assert!(
        ctx.ends_with("</system-reminder>\n"),
        "missing closing tag; got: {ctx}"
    );

    // Each logged story id has to show up in the rendered text.
    assert!(
        ctx.contains("42_story_foo"),
        "first story missing; got: {ctx}"
    );
    assert!(
        ctx.contains("99_story_bar"),
        "second story missing; got: {ctx}"
    );

    // So does the label of the triggering pipeline event.
    assert!(ctx.contains("DepsMet"), "event label missing; got: {ctx}");

    // Nothing new was logged, so the follow-up call has nothing to render.
    let ctx2 = assemble_prompt_context("room-test-1");
    assert!(
        ctx2.is_empty(),
        "second call must be empty after high-water advance; got: {ctx2}"
    );
}
|
||||
|
||||
/// Each session ID keeps its own high-water mark: one session reading an
/// event does not hide it from another session.
#[test]
fn assemble_prompt_context_sessions_are_independent() {
    crate::crdt_state::init_for_test();
    let (session_a, session_b) = ("room-session-a", "room-session-b");

    crate::event_log::log_transition_event(&make_fired("77_story_x"));

    // First read on A picks up the event.
    let ctx_a = assemble_prompt_context(session_a);
    assert!(
        ctx_a.contains("77_story_x"),
        "session A must see the event; got: {ctx_a}"
    );

    // B has not read anything yet, so it picks up the same event.
    let ctx_b = assemble_prompt_context(session_b);
    assert!(
        ctx_b.contains("77_story_x"),
        "session B must see the event; got: {ctx_b}"
    );

    // A has already consumed the event.
    let ctx_a2 = assemble_prompt_context(session_a);
    assert!(
        ctx_a2.is_empty(),
        "session A second call must be empty; got: {ctx_a2}"
    );

    // And so has B.
    let ctx_b2 = assemble_prompt_context(session_b);
    assert!(
        ctx_b2.is_empty(),
        "session B second call must be empty; got: {ctx_b2}"
    );
}
|
||||
|
||||
/// An event logged after a session has already been drained shows up in the
/// next call, and the previously delivered event is not repeated.
#[test]
fn assemble_prompt_context_includes_events_logged_after_advance() {
    crate::crdt_state::init_for_test();
    let session = "room-incremental";
    let log_story = |story: &str| {
        crate::event_log::log_transition_event(&make_fired(story));
    };

    // Drain the first event so the high-water mark moves past it.
    log_story("10_story_old");
    let ctx1 = assemble_prompt_context(session);
    assert!(ctx1.contains("10_story_old"), "got: {ctx1}");

    // Only the event logged after that drain should come back now.
    log_story("20_story_new");
    let ctx2 = assemble_prompt_context(session);
    assert!(
        ctx2.contains("20_story_new"),
        "new event must appear; got: {ctx2}"
    );
    assert!(
        !ctx2.contains("10_story_old"),
        "old event must not reappear; got: {ctx2}"
    );
}
|
||||
|
||||
/// With no events logged, `assemble_prompt_context` yields an empty string.
#[test]
fn assemble_prompt_context_empty_when_no_events() {
    crate::crdt_state::init_for_test();

    // No event is logged before the call.
    let session = "room-empty";
    let ctx = assemble_prompt_context(session);
    assert!(ctx.is_empty(), "must be empty with no events; got: {ctx}");
}
|
||||
|
||||
/// AC 4: two sleds each fire one transition. A session scoped `all` must see
/// both events, while a session scoped `sleds:<sled-A>` must see only the
/// event from sled-A.
///
/// The gateway aggregate view is simulated by calling
/// `append_event_log_entry` directly with two distinct sled IDs before
/// asserting on the scope-filtered output.
#[test]
fn scope_filter_all_sees_both_sleds_filter_sees_one() {
    crate::crdt_state::init_for_test();

    let sled_a = "aaaaaaaaaaaaaaaa";
    let sled_b = "bbbbbbbbbbbbbbbb";

    // One pipeline transition per sled, appended in this order.
    let transitions = [
        (
            sled_a,
            1_000_000.0,
            "10_story_alpha",
            "1_backlog",
            "2_current",
            "DepsMet",
        ),
        (
            sled_b,
            1_000_001.0,
            "20_story_beta",
            "2_current",
            "3_qa",
            "AgentCompleted",
        ),
    ];
    for (sled, at, story, from, to, event) in transitions {
        crate::crdt_state::append_event_log_entry(sled, at, story, from, to, event);
    }

    // "timmy" is scoped to every sled; "sally" is scoped to sled-A alone.
    crate::crdt_state::write_llm_session("timmy", "all");
    let sled_a_scope = format!("sleds:{sled_a}");
    crate::crdt_state::write_llm_session("sally", &sled_a_scope);

    // All-scope persona: events from both sleds are present.
    let ctx_all = assemble_prompt_context("timmy");
    assert!(
        ctx_all.contains("10_story_alpha"),
        "All scope must contain sled-A event; got: {ctx_all}"
    );
    assert!(
        ctx_all.contains("20_story_beta"),
        "All scope must contain sled-B event; got: {ctx_all}"
    );

    // Sled-A-only persona: sled-B's event is filtered out.
    let ctx_a = assemble_prompt_context("sally");
    assert!(
        ctx_a.contains("10_story_alpha"),
        "Sleds filter must contain sled-A event; got: {ctx_a}"
    );
    assert!(
        !ctx_a.contains("20_story_beta"),
        "Sleds filter must NOT contain sled-B event; got: {ctx_a}"
    );

    // Both personas have advanced their high-water marks, so a repeat call
    // returns nothing for either.
    let ctx_all2 = assemble_prompt_context("timmy");
    assert!(
        ctx_all2.is_empty(),
        "All scope second call must be empty; got: {ctx_all2}"
    );
    let ctx_a2 = assemble_prompt_context("sally");
    assert!(
        ctx_a2.is_empty(),
        "Sleds filter second call must be empty; got: {ctx_a2}"
    );
}
|
||||
|
||||
/// AC 5 e2e: the persona key, not the transport, decides what a turn sees.
///
/// Three turns (labelled Matrix, Web-UI and CLI purely for illustration) all
/// call `assemble_prompt_context` with the same persona. Each turn must see
/// only the event fired since the previous turn, because every call drains a
/// fresh batch from the shared high-water mark.
#[test]
fn persona_key_is_transport_agnostic() {
    crate::crdt_state::init_for_test();
    let persona = "timmy";
    crate::crdt_state::write_llm_session(persona, "all");
    let fire = |story: &str| {
        crate::event_log::log_transition_event(&make_fired(story));
    };

    // Event 1, read by the Matrix turn.
    fire("e2e_story_1");
    let matrix_ctx = assemble_prompt_context(persona);
    assert!(
        matrix_ctx.contains("e2e_story_1"),
        "Matrix turn must see event 1; got: {matrix_ctx}"
    );

    // Event 2, read by the Web-UI turn. Event 1 is already behind the
    // high-water mark for this persona and must not come back.
    fire("e2e_story_2");
    let web_ui_ctx = assemble_prompt_context(persona);
    assert!(
        web_ui_ctx.contains("e2e_story_2"),
        "Web-UI turn must see event 2; got: {web_ui_ctx}"
    );
    assert!(
        !web_ui_ctx.contains("e2e_story_1"),
        "Web-UI turn must NOT re-see event 1; got: {web_ui_ctx}"
    );

    // Event 3, read by the CLI turn. Neither earlier event may reappear.
    fire("e2e_story_3");
    let cli_ctx = assemble_prompt_context(persona);
    assert!(
        cli_ctx.contains("e2e_story_3"),
        "CLI turn must see event 3; got: {cli_ctx}"
    );
    assert!(
        !cli_ctx.contains("e2e_story_1"),
        "CLI turn must NOT re-see event 1; got: {cli_ctx}"
    );
    assert!(
        !cli_ctx.contains("e2e_story_2"),
        "CLI turn must NOT re-see event 2; got: {cli_ctx}"
    );
}
|
||||
|
||||
/// AC 5 runtime pickup: an `all`-scoped session sees events from a sled that
/// first appears after the session was registered, with no restart needed.
#[test]
fn scope_filter_all_picks_up_new_sled_at_runtime() {
    crate::crdt_state::init_for_test();

    let persona = "timmy";
    let sled_a = "cccccccccccccccc";
    let sled_new = "dddddddddddddddd";

    // At registration time only sled-A has logged anything.
    crate::crdt_state::append_event_log_entry(
        sled_a,
        2_000_000.0,
        "30_story_first",
        "1_backlog",
        "2_current",
        "DepsMet",
    );
    crate::crdt_state::write_llm_session(persona, "all");

    let ctx1 = assemble_prompt_context(persona);
    assert!(
        ctx1.contains("30_story_first"),
        "first event must appear; got: {ctx1}"
    );

    // A second sled starts logging later; the session is not re-registered.
    crate::crdt_state::append_event_log_entry(
        sled_new,
        2_000_001.0,
        "40_story_second",
        "2_current",
        "3_qa",
        "AgentCompleted",
    );

    let ctx2 = assemble_prompt_context(persona);
    assert!(
        ctx2.contains("40_story_second"),
        "newly adopted sled event must appear; got: {ctx2}"
    );
    assert!(
        !ctx2.contains("30_story_first"),
        "old event must not reappear; got: {ctx2}"
    );
}
|
||||
}
|
||||
+82
-2
@@ -20,18 +20,26 @@ pub mod crdt_sync;
|
||||
/// CRDT wire format — on-wire message types for the crdt-sync protocol.
|
||||
pub mod crdt_wire;
|
||||
mod db;
|
||||
/// Event log — CRDT-persisted append-only log of every pipeline stage transition.
|
||||
pub(crate) mod event_log;
|
||||
/// Gateway mode — multi-project reverse proxy that fronts multiple project containers.
|
||||
pub mod gateway;
|
||||
mod gateway_relay;
|
||||
mod http;
|
||||
mod io;
|
||||
mod llm;
|
||||
/// LLM session management — CRDT-backed context assembly for bot prompts.
|
||||
pub(crate) mod llm_session;
|
||||
/// Log buffer — in-memory ring buffer for recent server-side log lines.
|
||||
pub mod log_buffer;
|
||||
/// Mesh — peer discovery and multi-hop CRDT replication over WebSocket.
|
||||
pub mod mesh;
|
||||
/// Node identity — Ed25519 keypair generation and stable node ID management.
|
||||
pub mod node_identity;
|
||||
/// Gateway pidfile — exclusive flock on `$HOME/.huskies/gateway.pid`.
|
||||
pub mod pidfile;
|
||||
/// Pipeline event bus — real-time broadcast of pipeline-transition events to persona subscribers.
|
||||
pub(crate) mod pipeline_event_bus;
|
||||
pub(crate) mod pipeline_state;
|
||||
/// Reliable process-termination primitives shared across the server.
|
||||
pub mod process_kill;
|
||||
@@ -45,6 +53,10 @@ pub mod sled_uplink;
|
||||
mod startup;
|
||||
mod state;
|
||||
mod store;
|
||||
/// Detached trampoline — kills the running gateway and starts the new binary.
|
||||
pub mod trampoline;
|
||||
/// In-container binary self-update — fetch, atomic replace, and re-exec.
|
||||
pub mod upgrade;
|
||||
/// Validated input layer — transport-agnostic newtypes and request structs for all MCP write tools.
|
||||
pub mod validation;
|
||||
mod workflow;
|
||||
@@ -68,6 +80,19 @@ mod cli;
|
||||
|
||||
use cli::{parse_cli_args, resolve_path_arg};
|
||||
|
||||
/// Convert a WebSocket gateway URL into the binary download HTTP URL.
///
/// The scheme is mapped `ws://` → `http://` and `wss://` → `https://`.
/// Everything after the authority (`host[:port]`) — path, query string and
/// fragment — is dropped and replaced with `/api/huskies-binary`.
///
/// `ws://gateway:3000/api/sled-uplink?token=x` → `http://gateway:3000/api/huskies-binary`
///
/// Returns `None` when `ws_url` does not start with `ws://` or `wss://`, or
/// when no host precedes the path/query, so the caller can fall back to a
/// default URL.
fn derive_binary_url_from_ws(ws_url: &str) -> Option<String> {
    let (scheme, rest) = if let Some(rest) = ws_url.strip_prefix("wss://") {
        ("https", rest)
    } else if let Some(rest) = ws_url.strip_prefix("ws://") {
        ("http", rest)
    } else {
        return None;
    };

    // The authority ends at the first '/', '?' or '#'. Cutting only at '/'
    // would leave a query attached when the URL has no path
    // (e.g. `ws://host:3000?token=x`).
    let authority = rest
        .find(|c: char| matches!(c, '/' | '?' | '#'))
        .map_or(rest, |end| &rest[..end]);
    if authority.is_empty() {
        // No host to download from; let the caller use its fallback.
        return None;
    }

    Some(format!("{scheme}://{authority}/api/huskies-binary"))
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> Result<(), std::io::Error> {
|
||||
// Reap zombie grandchildren on Unix (for native deployments without tini/init).
|
||||
@@ -141,6 +166,32 @@ async fn main() -> Result<(), std::io::Error> {
|
||||
}
|
||||
}
|
||||
|
||||
// ── Trampoline mode: kill old gateway, start new one ─────────────────────
|
||||
if let Some(ref job_path) = cli.trampoline {
|
||||
trampoline::run_trampoline(std::path::Path::new(job_path)).await;
|
||||
}
|
||||
|
||||
// ── Upgrade mode: fetch new binary, replace, exit ───────────────────────
|
||||
if cli.upgrade {
|
||||
let source = cli
|
||||
.upgrade_source
|
||||
.clone()
|
||||
.or_else(|| std::env::var("HUSKIES_BINARY_SOURCE").ok())
|
||||
.unwrap_or_else(|| {
|
||||
// Derive from HUSKIES_UPSTREAM_GATEWAY: ws://host:port/... → http://host:port/api/huskies-binary
|
||||
std::env::var("HUSKIES_UPSTREAM_GATEWAY")
|
||||
.ok()
|
||||
.and_then(|ws| derive_binary_url_from_ws(&ws))
|
||||
.unwrap_or_else(|| "http://gateway:3000/api/huskies-binary".to_string())
|
||||
});
|
||||
let target = upgrade::resolve_target_path();
|
||||
if let Err(e) = upgrade::run_cli_upgrade(&source, &target).await {
|
||||
eprintln!("error: {e}");
|
||||
std::process::exit(1);
|
||||
}
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// ── Gateway mode: multi-project proxy ────────────────────────────────────
|
||||
if is_gateway {
|
||||
let config_dir = explicit_path.unwrap_or_else(|| cwd.clone());
|
||||
@@ -246,6 +297,11 @@ async fn main() -> Result<(), std::io::Error> {
|
||||
)),
|
||||
});
|
||||
|
||||
// Register the bot's persona in the CRDT so all transports share a single
|
||||
// event-log high-water mark keyed by name rather than transport ids.
|
||||
// scope="all" gives the gateway persona a cross-sled view of pipeline events.
|
||||
crate::crdt_state::write_llm_session(&services.bot_name.to_lowercase(), "all");
|
||||
|
||||
// Sled uplink: forward permission requests to an upstream gateway when configured.
|
||||
let upstream_gateway = cli
|
||||
.upstream_gateway
|
||||
@@ -364,10 +420,10 @@ async fn main() -> Result<(), std::io::Error> {
|
||||
Arc::clone(&services),
|
||||
matrix_shutdown_rx,
|
||||
None,
|
||||
vec![],
|
||||
std::collections::BTreeMap::new(),
|
||||
None,
|
||||
timer_store_for_bot,
|
||||
None,
|
||||
None,
|
||||
);
|
||||
} else {
|
||||
drop(matrix_shutdown_rx);
|
||||
@@ -461,4 +517,28 @@ name = "coder"
|
||||
config::ProjectConfig::load(tmp.path())
|
||||
.unwrap_or_else(|e| panic!("Invalid project.toml: {e}"));
|
||||
}
|
||||
|
||||
/// A `ws://` URL with a path and query maps to the `http://` binary endpoint
/// on the same host and port.
#[test]
fn derive_binary_url_strips_ws_scheme_and_path() {
    let derived = derive_binary_url_from_ws("ws://gateway:3000/api/sled-uplink?token=abc");
    let expected = Some("http://gateway:3000/api/huskies-binary".to_string());
    assert_eq!(derived, expected);
}
|
||||
|
||||
/// A `wss://` URL maps to the `https://` binary endpoint.
#[test]
fn derive_binary_url_handles_wss_scheme() {
    let derived = derive_binary_url_from_ws("wss://myhost:443/path");
    let expected = Some("https://myhost:443/api/huskies-binary".to_string());
    assert_eq!(derived, expected);
}
|
||||
|
||||
/// A URL whose scheme is neither `ws://` nor `wss://` yields `None`.
#[test]
fn derive_binary_url_invalid_scheme_returns_none() {
    assert_eq!(derive_binary_url_from_ws("http://not-a-ws-url"), None);
}
|
||||
}
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user