huskies: merge 760
This commit is contained in:
@@ -12,9 +12,9 @@ use std::sync::Arc;
|
|||||||
|
|
||||||
// Re-export public types that callers reference as `crate::gateway::*`.
|
// Re-export public types that callers reference as `crate::gateway::*`.
|
||||||
pub use crate::service::gateway::{
|
pub use crate::service::gateway::{
|
||||||
GatewayConfig, GatewayState as GatewayStateType, JoinedAgent, ProjectEntry,
|
GatewayConfig, GatewayState as GatewayStateType, GatewayStatusEvent, JoinedAgent, ProjectEntry,
|
||||||
fetch_all_project_pipeline_statuses, format_aggregate_status_compact,
|
broadcast_status_event, fetch_all_project_pipeline_statuses, format_aggregate_status_compact,
|
||||||
spawn_gateway_notification_poller,
|
spawn_gateway_notification_poller, subscribe_status_events,
|
||||||
};
|
};
|
||||||
|
|
||||||
/// Build the complete gateway route tree.
|
/// Build the complete gateway route tree.
|
||||||
@@ -70,6 +70,10 @@ pub fn build_gateway_route(state_arc: Arc<GatewayState>) -> impl poem::Endpoint
|
|||||||
"/gateway/agents/:id/heartbeat",
|
"/gateway/agents/:id/heartbeat",
|
||||||
poem::post(gateway_heartbeat_handler),
|
poem::post(gateway_heartbeat_handler),
|
||||||
)
|
)
|
||||||
|
.at(
|
||||||
|
"/gateway/events/push",
|
||||||
|
poem::get(gateway_event_push_handler),
|
||||||
|
)
|
||||||
// Serve the embedded React frontend so the gateway has a UI.
|
// Serve the embedded React frontend so the gateway has a UI.
|
||||||
.at(
|
.at(
|
||||||
"/assets/*path",
|
"/assets/*path",
|
||||||
|
|||||||
@@ -4,9 +4,12 @@
|
|||||||
//! the response. No inline business logic, no `reqwest`, no filesystem access.
|
//! the response. No inline business logic, no `reqwest`, no filesystem access.
|
||||||
|
|
||||||
use crate::service::gateway::{self, GatewayState};
|
use crate::service::gateway::{self, GatewayState};
|
||||||
|
use futures::StreamExt;
|
||||||
use poem::handler;
|
use poem::handler;
|
||||||
use poem::http::StatusCode;
|
use poem::http::StatusCode;
|
||||||
use poem::web::Path as PoemPath;
|
use poem::web::Path as PoemPath;
|
||||||
|
use poem::web::Query;
|
||||||
|
use poem::web::websocket::{Message as WsMessage, WebSocket};
|
||||||
use poem::web::{Data, Json};
|
use poem::web::{Data, Json};
|
||||||
use poem::{Body, Request, Response};
|
use poem::{Body, Request, Response};
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
@@ -631,6 +634,119 @@ pub async fn gateway_heartbeat_handler(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ── Event-push WebSocket handler ────────────────────────────────────────────
|
||||||
|
|
||||||
|
/// Query parameters accepted on the `/gateway/events/push` WebSocket upgrade.
|
||||||
|
#[derive(Deserialize)]
|
||||||
|
struct EventPushQueryParams {
|
||||||
|
/// One-time join token generated by `POST /gateway/tokens`.
|
||||||
|
token: Option<String>,
|
||||||
|
/// The project name this node represents (e.g. `"huskies"`).
|
||||||
|
project: Option<String>,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// `GET /gateway/events/push` — WebSocket endpoint for project nodes to push
|
||||||
|
/// [`StatusEvent`] frames to the gateway.
|
||||||
|
///
|
||||||
|
/// # Authentication
|
||||||
|
///
|
||||||
|
/// The connecting node must supply a valid one-time join token via the `token`
|
||||||
|
/// query parameter, obtained from `POST /gateway/tokens`. The token is
|
||||||
|
/// consumed on the first successful upgrade — the connection itself is then
|
||||||
|
/// kept open indefinitely.
|
||||||
|
///
|
||||||
|
/// # Protocol
|
||||||
|
///
|
||||||
|
/// Each message from the project node must be a JSON-encoded
|
||||||
|
/// [`crate::service::events::StoredEvent`]. The gateway fan-outs the event
|
||||||
|
/// (tagged with the project name) to all current local subscribers.
|
||||||
|
///
|
||||||
|
/// The server does not send data back; clients should treat any close frame
|
||||||
|
/// as a signal to reconnect with exponential back-off (see docs/gateway-protocol.html).
|
||||||
|
///
|
||||||
|
/// # Reconnect-with-backoff
|
||||||
|
///
|
||||||
|
/// Project nodes MUST reconnect on disconnect. Recommended policy:
|
||||||
|
///
|
||||||
|
/// - Initial retry delay: **1 s**
|
||||||
|
/// - Back-off multiplier: **2×** per attempt
|
||||||
|
/// - Max delay cap: **60 s**
|
||||||
|
/// - Jitter: add ±10 % to the delay to avoid thundering herds
|
||||||
|
#[handler]
|
||||||
|
pub async fn gateway_event_push_handler(
|
||||||
|
ws: WebSocket,
|
||||||
|
state: Data<&Arc<GatewayState>>,
|
||||||
|
Query(params): Query<EventPushQueryParams>,
|
||||||
|
) -> poem::Response {
|
||||||
|
// ── Authentication (pre-upgrade) ─────────────────────────────────────
|
||||||
|
let token = match params.token {
|
||||||
|
Some(t) if !t.is_empty() => t,
|
||||||
|
_ => {
|
||||||
|
return poem::Response::builder()
|
||||||
|
.status(StatusCode::UNAUTHORIZED)
|
||||||
|
.body("token query parameter required");
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
let project = match params.project {
|
||||||
|
Some(p) if !p.is_empty() => p,
|
||||||
|
_ => {
|
||||||
|
return poem::Response::builder()
|
||||||
|
.status(StatusCode::BAD_REQUEST)
|
||||||
|
.body("project query parameter required");
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Validate and consume the one-time token.
|
||||||
|
{
|
||||||
|
let mut tokens = state.pending_tokens.write().await;
|
||||||
|
if !tokens.contains_key(&token) {
|
||||||
|
return poem::Response::builder()
|
||||||
|
.status(StatusCode::UNAUTHORIZED)
|
||||||
|
.body("invalid or already-used join token");
|
||||||
|
}
|
||||||
|
tokens.remove(&token);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── WebSocket upgrade ────────────────────────────────────────────────
|
||||||
|
use poem::IntoResponse as _;
|
||||||
|
let state = Arc::clone(&state);
|
||||||
|
ws.on_upgrade(move |socket| async move {
|
||||||
|
let (_, mut stream) = socket.split();
|
||||||
|
|
||||||
|
crate::slog!(
|
||||||
|
"[gateway] Project node '{}' connected to event-push endpoint",
|
||||||
|
project
|
||||||
|
);
|
||||||
|
|
||||||
|
while let Some(msg) = stream.next().await {
|
||||||
|
let text = match msg {
|
||||||
|
Ok(WsMessage::Text(t)) => t,
|
||||||
|
Ok(WsMessage::Close(_)) | Err(_) => break,
|
||||||
|
_ => continue,
|
||||||
|
};
|
||||||
|
|
||||||
|
match serde_json::from_str::<crate::service::events::StoredEvent>(&text) {
|
||||||
|
Ok(event) => {
|
||||||
|
gateway::broadcast_status_event(&state, project.clone(), event);
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
crate::slog!(
|
||||||
|
"[gateway] event-push: invalid frame from '{}': {e}",
|
||||||
|
project
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
crate::slog!(
|
||||||
|
"[gateway] Project node '{}' disconnected from event-push endpoint",
|
||||||
|
project
|
||||||
|
);
|
||||||
|
})
|
||||||
|
.into_response()
|
||||||
|
}
|
||||||
|
|
||||||
// ── Health handler ──────────────────────────────────────────────────────────
|
// ── Health handler ──────────────────────────────────────────────────────────
|
||||||
|
|
||||||
/// HTTP GET `/health` handler for the gateway.
|
/// HTTP GET `/health` handler for the gateway.
|
||||||
|
|||||||
@@ -26,6 +26,21 @@ use std::sync::Arc;
|
|||||||
use tokio::sync::Mutex as TokioMutex;
|
use tokio::sync::Mutex as TokioMutex;
|
||||||
use tokio::sync::RwLock;
|
use tokio::sync::RwLock;
|
||||||
|
|
||||||
|
// ── Status event broadcaster ────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/// Capacity of the gateway status event broadcast channel.
|
||||||
|
const EVENT_CHANNEL_CAPACITY: usize = 64;
|
||||||
|
|
||||||
|
/// A status event pushed by a project node and fanned out to all local
|
||||||
|
/// subscribers (e.g. the Web UI, notification forwarders).
|
||||||
|
#[derive(Clone, Debug, serde::Serialize, serde::Deserialize)]
|
||||||
|
pub struct GatewayStatusEvent {
|
||||||
|
/// The project name that emitted this event.
|
||||||
|
pub project: String,
|
||||||
|
/// The pipeline event payload.
|
||||||
|
pub event: crate::service::events::StoredEvent,
|
||||||
|
}
|
||||||
|
|
||||||
// ── Error type ──────────────────────────────────────────────────────────────
|
// ── Error type ──────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
/// Typed errors returned by `service::gateway` functions.
|
/// Typed errors returned by `service::gateway` functions.
|
||||||
@@ -93,6 +108,10 @@ pub struct GatewayState {
|
|||||||
pub port: u16,
|
pub port: u16,
|
||||||
/// Abort handle for the running Matrix bot task (if any).
|
/// Abort handle for the running Matrix bot task (if any).
|
||||||
pub bot_handle: Arc<TokioMutex<Option<tokio::task::AbortHandle>>>,
|
pub bot_handle: Arc<TokioMutex<Option<tokio::task::AbortHandle>>>,
|
||||||
|
/// Broadcast sender for [`GatewayStatusEvent`]s pushed by project nodes.
|
||||||
|
///
|
||||||
|
/// Call `event_tx.subscribe()` to obtain a receiver for outbound fan-out.
|
||||||
|
pub event_tx: tokio::sync::broadcast::Sender<GatewayStatusEvent>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl GatewayState {
|
impl GatewayState {
|
||||||
@@ -107,6 +126,7 @@ impl GatewayState {
|
|||||||
) -> Result<Self, String> {
|
) -> Result<Self, String> {
|
||||||
let first = config::validate_config(&gateway_config)?;
|
let first = config::validate_config(&gateway_config)?;
|
||||||
let agents = io::load_agents(&config_dir);
|
let agents = io::load_agents(&config_dir);
|
||||||
|
let (event_tx, _) = tokio::sync::broadcast::channel(EVENT_CHANNEL_CAPACITY);
|
||||||
Ok(Self {
|
Ok(Self {
|
||||||
projects: Arc::new(RwLock::new(gateway_config.projects)),
|
projects: Arc::new(RwLock::new(gateway_config.projects)),
|
||||||
active_project: Arc::new(RwLock::new(first)),
|
active_project: Arc::new(RwLock::new(first)),
|
||||||
@@ -116,6 +136,7 @@ impl GatewayState {
|
|||||||
config_dir,
|
config_dir,
|
||||||
port,
|
port,
|
||||||
bot_handle: Arc::new(TokioMutex::new(None)),
|
bot_handle: Arc::new(TokioMutex::new(None)),
|
||||||
|
event_tx,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -380,6 +401,32 @@ pub async fn health_check_all(state: &GatewayState) -> (bool, BTreeMap<String, &
|
|||||||
(all_healthy, statuses)
|
(all_healthy, statuses)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Broadcast a status event received from a project node to all local subscribers.
|
||||||
|
///
|
||||||
|
/// Returns the number of active receivers that received the event.
|
||||||
|
/// A return value of zero means no subscribers are currently connected.
|
||||||
|
pub fn broadcast_status_event(
|
||||||
|
state: &GatewayState,
|
||||||
|
project: String,
|
||||||
|
event: crate::service::events::StoredEvent,
|
||||||
|
) -> usize {
|
||||||
|
let msg = GatewayStatusEvent { project, event };
|
||||||
|
state.event_tx.send(msg).unwrap_or(0)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Subscribe to the gateway's status event stream.
|
||||||
|
///
|
||||||
|
/// Returns a broadcast receiver that will yield [`GatewayStatusEvent`]s as
|
||||||
|
/// project nodes push them. If the receiver falls behind (more than
|
||||||
|
/// [`EVENT_CHANNEL_CAPACITY`] events are queued), it will receive a
|
||||||
|
/// [`tokio::sync::broadcast::error::RecvError::Lagged`] error; callers
|
||||||
|
/// should discard lagged events and continue.
|
||||||
|
pub fn subscribe_status_events(
|
||||||
|
state: &GatewayState,
|
||||||
|
) -> tokio::sync::broadcast::Receiver<GatewayStatusEvent> {
|
||||||
|
state.event_tx.subscribe()
|
||||||
|
}
|
||||||
|
|
||||||
/// Save bot config and restart the bot.
|
/// Save bot config and restart the bot.
|
||||||
pub async fn save_bot_config_and_restart(state: &GatewayState, content: &str) -> Result<(), Error> {
|
pub async fn save_bot_config_and_restart(state: &GatewayState, content: &str) -> Result<(), Error> {
|
||||||
io::write_bot_config(&state.config_dir, content).map_err(Error::Config)?;
|
io::write_bot_config(&state.config_dir, content).map_err(Error::Config)?;
|
||||||
|
|||||||
@@ -146,6 +146,57 @@ HUSKIES_PORT=3002 huskies</code></pre>
|
|||||||
</tbody>
|
</tbody>
|
||||||
</table>
|
</table>
|
||||||
|
|
||||||
|
<h2>Gateway event-push protocol</h2>
|
||||||
|
<p>Project nodes can push pipeline status events to the gateway in real time over a WebSocket connection. The gateway fans each event out to all connected local subscribers.</p>
|
||||||
|
|
||||||
|
<h3>Connecting</h3>
|
||||||
|
<ol>
|
||||||
|
<li>Obtain a one-time join token: <code>POST /gateway/tokens</code> → <code>{"token":"…"}</code></li>
|
||||||
|
<li>Open a WebSocket upgrade to <code>GET /gateway/events/push?token=TOKEN&project=PROJECT_NAME</code></li>
|
||||||
|
<li>The token is consumed on upgrade. The project name is attached to every event the server broadcasts downstream.</li>
|
||||||
|
</ol>
|
||||||
|
|
||||||
|
<h3>Sending events</h3>
|
||||||
|
<p>Each message must be a JSON-encoded <code>StoredEvent</code> frame:</p>
|
||||||
|
<pre><code>// Stage transition
|
||||||
|
{"type":"stage_transition","story_id":"42_story_login","from_stage":"2_current","to_stage":"3_qa","timestamp_ms":1700000000000}
|
||||||
|
|
||||||
|
// Merge failure
|
||||||
|
{"type":"merge_failure","story_id":"42_story_login","reason":"conflict in src/main.rs","timestamp_ms":1700000001000}
|
||||||
|
|
||||||
|
// Story blocked
|
||||||
|
{"type":"story_blocked","story_id":"42_story_login","reason":"retry limit exceeded","timestamp_ms":1700000002000}</code></pre>
|
||||||
|
<p>The server does not send frames back. Any other frames received by the project node indicate an error or server restart — treat them as a disconnect signal.</p>
|
||||||
|
|
||||||
|
<h3>Reconnect with exponential back-off</h3>
|
||||||
|
<p>Project nodes <strong>must</strong> reconnect on any disconnect. Use the following policy to avoid thundering herds after a gateway restart:</p>
|
||||||
|
<table>
|
||||||
|
<thead><tr><th>Parameter</th><th>Value</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td>Initial delay</td><td>1 s</td></tr>
|
||||||
|
<tr><td>Back-off multiplier</td><td>2× per attempt</td></tr>
|
||||||
|
<tr><td>Maximum delay</td><td>60 s</td></tr>
|
||||||
|
<tr><td>Jitter</td><td>±10 % of the computed delay</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
<p>Pseudocode:</p>
|
||||||
|
<pre><code>delay = 1.0 // seconds
|
||||||
|
max_delay = 60.0
|
||||||
|
|
||||||
|
loop:
|
||||||
|
token = POST /gateway/tokens
|
||||||
|
connect ws:/gateway/events/push?token=TOKEN&project=NAME
|
||||||
|
while connected:
|
||||||
|
send StoredEvent frames
|
||||||
|
// disconnected — wait and retry
|
||||||
|
jitter = delay * (random(0.9, 1.1))
|
||||||
|
sleep(min(jitter, max_delay))
|
||||||
|
delay = min(delay * 2, max_delay)</code></pre>
|
||||||
|
|
||||||
|
<div class="note">
|
||||||
|
<strong>New token per connection:</strong> Each WebSocket upgrade consumes the join token. Request a fresh token for every reconnect attempt.
|
||||||
|
</div>
|
||||||
|
|
||||||
<h2>Building from source</h2>
|
<h2>Building from source</h2>
|
||||||
<h3>Standard release build</h3>
|
<h3>Standard release build</h3>
|
||||||
<pre><code>cargo build --release
|
<pre><code>cargo build --release
|
||||||
|
|||||||
Reference in New Issue
Block a user