// Source: huskies/server/src/chat/transport/matrix/project_rebuild.rs

//! `project-rebuild <name>` chat command — rebuild Docker image, swap container, preserve state.
//!
//! Usage: `{bot} project-rebuild <name> [--timeout <secs>] [--force]`
//!
//! Steps performed:
//! 1. Validate the project exists and has a `host_path` configured.
//! 2. Check for in-flight coder/merge work (active `claude` processes in the container).
//! Wait up to `--timeout` seconds for them to exit. Refuse if still active.
//! 3. Build a new Docker image from the project's `Dockerfile.fragment` (if present).
//! 4. Stop and remove the old container.
//! 5. Start a new container from the fresh image, mounting the same host volume so
//! `pipeline.db` and all CRDT state survive untouched.
//! 6. Re-register the project in the gateway (same URL — port is preserved).
//!
//! On success the reply names the new image hash and the new container ID.
//! On failure the reply names the step that failed and the recovery path.
use crate::service::gateway::config::ProjectEntry;
use crate::service::gateway::io::save_config;
use std::collections::BTreeMap;
use std::path::Path;
use std::sync::Arc;
use tokio::sync::RwLock;
/// Default seconds to wait for in-flight work to drain before refusing.
///
/// Used when the command omits `--timeout`, and also as the fallback when the
/// value following `--timeout` does not parse as a `u64`.
const DEFAULT_DRAIN_TIMEOUT_SECS: u64 = 60;
/// A parsed `project-rebuild <name>` command.
///
/// Plain data: every field is public and the type is cheap to clone and
/// compare, which keeps it convenient to pass around and assert on in tests.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ProjectRebuildCommand {
    /// Name of the project to rebuild.
    pub name: String,
    /// Seconds to wait for agents to drain (0 = skip check).
    pub drain_timeout_secs: u64,
    /// If `true`, skip the drain check entirely.
    pub force: bool,
}
/// Parse a `project-rebuild <name> [--timeout <secs>] [--force]` command from a raw
/// Matrix message body.
///
/// Strips the bot mention prefix (and any leading non-alphanumeric characters left
/// behind by it), then checks for the `project-rebuild` keyword as a whole word.
///
/// Returns `None` when:
/// - the message does not start with the `project-rebuild` keyword,
/// - the keyword is immediately followed by a non-whitespace character
///   (e.g. `project-rebuilder` or `project-rebuild-all`), or
/// - no project name follows the keyword (a token starting with `--` is a flag,
///   not a name).
///
/// Flag handling:
/// - `--timeout <value>` sets the drain timeout; a value that does not parse as a
///   `u64` falls back to [`DEFAULT_DRAIN_TIMEOUT_SECS`]. A trailing `--timeout`
///   with no value is ignored.
/// - `--force` sets `force`.
/// - Unknown tokens are ignored.
pub fn extract_project_rebuild_command(
    message: &str,
    bot_name: &str,
    bot_user_id: &str,
) -> Option<ProjectRebuildCommand> {
    let stripped = crate::chat::util::strip_bot_mention(message, bot_name, bot_user_id);
    let trimmed = stripped
        .trim()
        .trim_start_matches(|c: char| !c.is_alphanumeric());
    let rest = trimmed.strip_prefix("project-rebuild")?;
    // Require a word boundary after the keyword so that e.g. `project-rebuildfoo`
    // is not silently treated as a rebuild of project `foo`.
    if !rest.is_empty() && !rest.starts_with(char::is_whitespace) {
        return None;
    }

    let mut tokens = rest.split_whitespace();
    let name = tokens.next().filter(|n| !n.starts_with("--"))?.to_string();

    let mut drain_timeout_secs = DEFAULT_DRAIN_TIMEOUT_SECS;
    let mut force = false;
    while let Some(token) = tokens.next() {
        match token {
            "--timeout" => {
                // The token after `--timeout` is always consumed as its value,
                // even when it fails to parse.
                if let Some(value) = tokens.next() {
                    drain_timeout_secs = value.parse().unwrap_or(DEFAULT_DRAIN_TIMEOUT_SECS);
                }
            }
            "--force" => force = true,
            _ => {}
        }
    }

    Some(ProjectRebuildCommand {
        name,
        drain_timeout_secs,
        force,
    })
}
/// Rebuild a project's Docker image, swap the container, and preserve all state.
///
/// Steps, in order:
/// 1. Look up `name` in `projects_store` and check that its `host_path` exists on disk.
/// 2. Unless `force` is set or `drain_timeout_secs` is 0, wait for agent processes
///    in the `huskies-<name>` container to exit.
/// 3. Build the new image.
/// 4. Read the project's SSH public key. This happens *before* anything is torn
///    down, so a missing key leaves the old container running.
/// 5. Stop and remove the old container. A container that is already gone is not
///    treated as an error, so the command can be retried after a partial failure.
/// 6. Start the new container with the host path bind-mounted at `/workspace`.
/// 7. Write the project URL back to the store, the saved config and the CRDT state.
///
/// On success returns a message naming the new image hash and container ID.
/// On failure returns a message naming the failed step and the recovery path.
/// The function never returns an error type; every outcome is a chat reply string.
pub async fn handle_project_rebuild(
    name: &str,
    drain_timeout_secs: u64,
    force: bool,
    projects_store: &Arc<RwLock<BTreeMap<String, ProjectEntry>>>,
    config_dir: &Path,
) -> String {
    // ── 1. Validate project ──────────────────────────────────────────────────
    let (host_path_str, project_url, ssh_port_opt) = {
        let projects = projects_store.read().await;
        let Some(entry) = projects.get(name) else {
            let available = projects
                .keys()
                .map(String::as_str)
                .collect::<Vec<_>>()
                .join(", ");
            return format!("Project `{name}` not found. Available: {available}");
        };
        let Some(host_path) = entry.host_path.clone() else {
            return format!(
                "Project `{name}` has no `host_path` configured — cannot rebuild.\n\
                 Only projects created with `new project --adopt` or `adopt_project` \
                 support the `project-rebuild` command."
            );
        };
        // Copy out only the fields needed so the read lock is released before
        // any long-running Docker work starts.
        (host_path, entry.url.clone(), entry.ssh_port)
    };
    let host_path = Path::new(&host_path_str);
    if !host_path.exists() {
        return format!(
            "Host path `{host_path_str}` does not exist on disk — \
             cannot rebuild project `{name}`."
        );
    }

    // ── 2. Drain check ───────────────────────────────────────────────────────
    let container_name = format!("huskies-{name}");
    if !force
        && drain_timeout_secs > 0
        && let Some(err_msg) = wait_for_drain(&container_name, drain_timeout_secs).await
    {
        return format!(
            "Project `{name}` rebuild aborted: {err_msg}\n\
             Pass `--force` to skip the drain check or `--timeout 0` to not wait."
        );
    }

    // ── 3. Build new image ───────────────────────────────────────────────────
    let stacks_dir = config_dir.join("docker").join("stacks");
    let (resolved_stack, _warnings) = super::new_project::detect_stack(host_path, &stacks_dir);
    let base_image = super::new_project::image_for_stack(resolved_stack.as_deref());
    let image = match super::new_project::build_project_image(host_path, &base_image, name).await {
        Ok(img) => img,
        Err(e) => {
            return format!(
                "Rebuild failed at **image build** step.\n\
                 Error: {e}\n\n\
                 Recovery: fix `.huskies/Dockerfile.fragment` in `{host_path_str}` then retry."
            );
        }
    };
    // The image ID is informational only; failing to read it must not abort the rebuild.
    let image_hash = get_image_id(&image)
        .await
        .unwrap_or_else(|_| "unknown".to_string());
    let image_short: String = image_hash.chars().take(19).collect();

    // ── 4. Pre-flight: read the SSH public key ───────────────────────────────
    // Done before the old container is touched: if the key is missing the user
    // can fix it and retry while the existing container keeps running.
    let home = std::env::var("HOME").unwrap_or_else(|_| "/home/huskies".to_string());
    let pub_key_path = std::path::PathBuf::from(&home)
        .join(".huskies")
        .join(name)
        .join("id_ed25519.pub");
    let pubkey = match tokio::fs::read_to_string(&pub_key_path).await {
        Ok(k) => k.trim().to_string(),
        Err(e) => {
            return format!(
                "Rebuild failed at **SSH key read** step.\n\
                 Error: {e}\n\
                 Expected public key at `{}`.\n\n\
                 Recovery: run `ssh-keygen -t ed25519 -N '' -f {home}/.huskies/{name}/id_ed25519` \
                 then retry.",
                pub_key_path.display()
            );
        }
    };

    // ── 5. Stop and remove old container ────────────────────────────────────
    if let Err(e) = docker_stop(&container_name).await {
        crate::slog!("[project-rebuild] stop '{container_name}': {e} (may already be stopped)");
    }
    if let Err(e) = docker_rm(&container_name).await {
        // A container that no longer exists is already in the state this step
        // wants; only other failures abort. The match is on the Docker CLI's
        // "No such container" error text.
        if !e.to_lowercase().contains("no such container") {
            return format!(
                "Rebuild failed at **container remove** step.\n\
                 Error: {e}\n\n\
                 Recovery: run `docker rm {container_name}` manually then retry."
            );
        }
        crate::slog!("[project-rebuild] rm '{container_name}': {e} (already removed)");
    }

    // ── 6. Start new container ───────────────────────────────────────────────
    // Reuse the port from the registered URL (text after the last ':'); fall
    // back to 3001 when there is no URL or the suffix is not a valid port.
    let port = project_url
        .as_deref()
        .and_then(|u| u.rsplit(':').next())
        .and_then(|p| p.parse::<u16>().ok())
        .unwrap_or(3001);
    let ssh_port = ssh_port_opt.unwrap_or(2222);
    let credentials_file = std::path::PathBuf::from(&home)
        .join(".claude")
        .join(".credentials.json");
    let creds_opt = if credentials_file.exists() {
        Some(credentials_file.as_path())
    } else {
        None
    };
    let (git_user_name, git_user_email) =
        super::new_project::resolve_git_identity(config_dir).await;
    let mut docker_args = super::new_project::project_docker_run_args(
        &container_name,
        port,
        ssh_port,
        &pubkey,
        &git_user_name,
        &git_user_email,
        creds_opt,
    );
    // Same host directory as before, so on-disk project state is carried over.
    docker_args.push("-v".into());
    docker_args.push(format!("{host_path_str}:/workspace"));
    // Mount whichever host SSH private keys exist, read-only.
    let host_ssh_dir = std::path::PathBuf::from(&home).join(".ssh");
    for key_name in &["id_ed25519", "id_rsa"] {
        let key_path = host_ssh_dir.join(key_name);
        if key_path.exists() {
            docker_args.push("-v".into());
            docker_args.push(format!(
                "{}:/home/huskies/.ssh/{key_name}:ro",
                key_path.display()
            ));
        }
    }
    docker_args.push("--restart".into());
    docker_args.push("unless-stopped".into());
    docker_args.push(image.clone());
    docker_args.push("huskies".into());
    docker_args.push("/workspace".into());
    let run_output = tokio::process::Command::new("docker")
        .args(&docker_args)
        .output()
        .await;
    let container_id = match run_output {
        Ok(out) if out.status.success() => String::from_utf8_lossy(&out.stdout).trim().to_string(),
        Ok(out) => {
            let stderr = String::from_utf8_lossy(&out.stderr).trim().to_string();
            return format!(
                "Rebuild failed at **container start** step.\n\
                 Error: {stderr}\n\n\
                 Recovery: the old container was removed. \
                 Start a new one manually: `docker run -d --name {container_name} ... {image} huskies /workspace`"
            );
        }
        Err(e) => {
            return format!(
                "Rebuild failed at **container start** step.\n\
                 Error: {e}\n\n\
                 Recovery: start the container manually: \
                 `docker run -d --name {container_name} ... {image} huskies /workspace`"
            );
        }
    };
    let container_short: String = container_id.chars().take(12).collect();

    // ── 7. Persist updated config (project already registered) ──────────────
    {
        let container_url = format!("http://127.0.0.1:{port}");
        let mut projects = projects_store.write().await;
        if let Some(entry) = projects.get_mut(name) {
            entry.url = Some(container_url.clone());
        }
        save_config(&projects, config_dir).await;
        crate::crdt_state::write_gateway_project(name, &container_url);
    }
    crate::slog!("[project-rebuild] Rebuilt '{name}': image={image_hash} container={container_id}");
    format!(
        "Project **{name}** rebuilt.\n\
         - New image: `{image}` (`{image_short}…`)\n\
         - New container: `{container_name}` (`{container_short}…`)\n\
         - State: `pipeline.db` and CRDT preserved (same volume bind-mount)\n\
         - Port: {port} (unchanged)\n\
         \n\
         Use `switch {name}` then `status` to verify the pipeline."
    )
}
/// Wait for active Claude agent processes in the container to exit.
///
/// Polls at most every 5 seconds until no `claude` processes remain or
/// `timeout_secs` has elapsed. The sleep between polls is clamped to the time
/// remaining, so the wait does not overshoot a timeout shorter than the poll
/// interval.
///
/// Returns `Some(error_message)` when agents are still running after the timeout.
/// Returns `None` when no processes are counted, or when the process count could
/// not be obtained at all (the `docker` command failed to spawn); in that case
/// the rebuild is allowed to proceed.
async fn wait_for_drain(container_name: &str, timeout_secs: u64) -> Option<String> {
    // Track elapsed time against a `Duration` rather than computing
    // `Instant::now() + timeout`: that addition panics on overflow, and
    // `timeout_secs` comes straight from user input.
    let timeout = std::time::Duration::from_secs(timeout_secs);
    let poll_interval = std::time::Duration::from_secs(5);
    let started = std::time::Instant::now();
    loop {
        match count_active_claude_processes(container_name).await {
            Ok(0) => return None,
            Ok(n) => {
                let remaining = timeout.saturating_sub(started.elapsed());
                if remaining.is_zero() {
                    return Some(format!(
                        "{n} Claude agent process(es) still running after {timeout_secs}s drain timeout."
                    ));
                }
                tokio::time::sleep(poll_interval.min(remaining)).await;
            }
            Err(_) => {
                // Could not run `docker` at all, so there is nothing to wait on; proceed.
                return None;
            }
        }
    }
}
/// Count the `claude` processes currently running inside `container_name`.
///
/// Runs `docker exec <name> pgrep -f claude` and counts the non-blank lines of
/// its stdout when the command exits successfully. Any unsuccessful exit status
/// is reported as `Ok(0)`.
///
/// Returns `Err` only when the `docker` command itself could not be run.
async fn count_active_claude_processes(container_name: &str) -> Result<usize, String> {
    let exec_result = tokio::process::Command::new("docker")
        .arg("exec")
        .arg(container_name)
        .args(["pgrep", "-f", "claude"])
        .output()
        .await;
    let output = match exec_result {
        Ok(output) => output,
        Err(spawn_err) => return Err(spawn_err.to_string()),
    };
    if !output.status.success() {
        return Ok(0);
    }
    let stdout = String::from_utf8_lossy(&output.stdout);
    let pid_lines = stdout.lines().filter(|line| !line.trim().is_empty()).count();
    Ok(pid_lines)
}
/// Run `docker stop <container_name>`.
///
/// Returns `Ok(())` on a successful exit status. Otherwise returns the command's
/// trimmed stderr, or a "failed to spawn" message when `docker` could not be run.
async fn docker_stop(container_name: &str) -> Result<(), String> {
    let result = tokio::process::Command::new("docker")
        .arg("stop")
        .arg(container_name)
        .output()
        .await;
    match result {
        Ok(output) if output.status.success() => Ok(()),
        Ok(output) => Err(String::from_utf8_lossy(&output.stderr).trim().to_string()),
        Err(spawn_err) => Err(format!("docker stop failed to spawn: {spawn_err}")),
    }
}
/// Run `docker rm <container_name>`.
///
/// Returns `Ok(())` on a successful exit status. Otherwise returns the command's
/// trimmed stderr, or a "failed to spawn" message when `docker` could not be run.
async fn docker_rm(container_name: &str) -> Result<(), String> {
    let result = tokio::process::Command::new("docker")
        .arg("rm")
        .arg(container_name)
        .output()
        .await;
    match result {
        Ok(output) if output.status.success() => Ok(()),
        Ok(output) => Err(String::from_utf8_lossy(&output.stderr).trim().to_string()),
        Err(spawn_err) => Err(format!("docker rm failed to spawn: {spawn_err}")),
    }
}
/// Return the full image ID (sha256 digest) for a named Docker image.
///
/// Uses `docker image inspect` rather than the untyped `docker inspect`, so the
/// lookup is restricted to images and cannot resolve to a container or other
/// object that happens to share the name.
///
/// Returns the trimmed stdout on success. Otherwise returns the command's
/// trimmed stderr, or a "docker inspect failed" message when `docker` could not
/// be run.
async fn get_image_id(image_name: &str) -> Result<String, String> {
    let out = tokio::process::Command::new("docker")
        .args(["image", "inspect", "--format", "{{.Id}}", image_name])
        .output()
        .await
        .map_err(|e| format!("docker inspect failed: {e}"))?;
    if out.status.success() {
        Ok(String::from_utf8_lossy(&out.stdout).trim().to_string())
    } else {
        Err(String::from_utf8_lossy(&out.stderr).trim().to_string())
    }
}
// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------
#[cfg(test)]
mod tests {
    use super::*;
    use crate::service::gateway::config::ProjectEntry;
    use std::collections::BTreeMap;
    use std::sync::Arc;
    use tokio::sync::RwLock;

    /// Build a shared project store from `(name, entry)` pairs.
    fn make_store(
        projects: Vec<(&str, ProjectEntry)>,
    ) -> Arc<RwLock<BTreeMap<String, ProjectEntry>>> {
        let map: BTreeMap<String, ProjectEntry> = projects
            .into_iter()
            .map(|(k, v)| (k.to_string(), v))
            .collect();
        Arc::new(RwLock::new(map))
    }

    // ── parsing ────────────────────────────────────────────────────────────

    #[test]
    fn extract_basic_command() {
        let cmd =
            extract_project_rebuild_command("Timmy project-rebuild myapp", "Timmy", "@timmy:home");
        let cmd = cmd.unwrap();
        assert_eq!(cmd.name, "myapp");
        assert_eq!(cmd.drain_timeout_secs, DEFAULT_DRAIN_TIMEOUT_SECS);
        assert!(!cmd.force);
    }

    #[test]
    fn extract_with_force_flag() {
        let cmd = extract_project_rebuild_command(
            "@timmy project-rebuild myapp --force",
            "Timmy",
            "@timmy:home",
        );
        let cmd = cmd.unwrap();
        assert_eq!(cmd.name, "myapp");
        assert!(cmd.force);
    }

    #[test]
    fn extract_with_timeout_flag() {
        let cmd = extract_project_rebuild_command(
            "Timmy project-rebuild myapp --timeout 120",
            "Timmy",
            "@timmy:home",
        );
        let cmd = cmd.unwrap();
        assert_eq!(cmd.name, "myapp");
        assert_eq!(cmd.drain_timeout_secs, 120);
    }

    #[test]
    fn extract_with_timeout_zero_skips_drain() {
        let cmd = extract_project_rebuild_command(
            "Timmy project-rebuild myapp --timeout 0",
            "Timmy",
            "@timmy:home",
        );
        let cmd = cmd.unwrap();
        assert_eq!(cmd.drain_timeout_secs, 0);
    }

    /// Both flags may be given together, in either position after the name.
    #[test]
    fn extract_with_timeout_and_force_flags() {
        let cmd = extract_project_rebuild_command(
            "Timmy project-rebuild myapp --timeout 30 --force",
            "Timmy",
            "@timmy:home",
        );
        let cmd = cmd.unwrap();
        assert_eq!(cmd.name, "myapp");
        assert_eq!(cmd.drain_timeout_secs, 30);
        assert!(cmd.force);
    }

    /// A `--timeout` value that is not an integer falls back to the default.
    #[test]
    fn extract_with_invalid_timeout_uses_default() {
        let cmd = extract_project_rebuild_command(
            "Timmy project-rebuild myapp --timeout soon",
            "Timmy",
            "@timmy:home",
        );
        let cmd = cmd.unwrap();
        assert_eq!(cmd.name, "myapp");
        assert_eq!(cmd.drain_timeout_secs, DEFAULT_DRAIN_TIMEOUT_SECS);
        assert!(!cmd.force);
    }

    #[test]
    fn extract_non_rebuild_returns_none() {
        let cmd = extract_project_rebuild_command("Timmy status", "Timmy", "@timmy:home");
        assert!(cmd.is_none());
    }

    #[test]
    fn extract_rebuild_without_name_returns_none() {
        let cmd = extract_project_rebuild_command("Timmy project-rebuild", "Timmy", "@timmy:home");
        assert!(cmd.is_none());
    }

    /// A flag where the project name should be is not accepted as a name.
    #[test]
    fn extract_rebuild_with_flag_in_name_position_returns_none() {
        let cmd = extract_project_rebuild_command(
            "Timmy project-rebuild --force",
            "Timmy",
            "@timmy:home",
        );
        assert!(cmd.is_none());
    }

    #[test]
    fn extract_with_full_user_id() {
        let cmd = extract_project_rebuild_command(
            "@timmy:home project-rebuild alpha",
            "Timmy",
            "@timmy:home",
        );
        assert_eq!(cmd.unwrap().name, "alpha");
    }

    #[test]
    fn extract_case_insensitive_bot_mention() {
        let cmd =
            extract_project_rebuild_command("timmy project-rebuild beta", "Timmy", "@timmy:home");
        assert_eq!(cmd.unwrap().name, "beta");
    }

    // ── handle_project_rebuild validation ─────────────────────────────────

    #[tokio::test]
    async fn rebuild_unknown_project_returns_error() {
        let store = make_store(vec![]);
        let dir = tempfile::tempdir().unwrap();
        let result = handle_project_rebuild("nonexistent", 0, true, &store, dir.path()).await;
        assert!(
            result.contains("not found"),
            "expected 'not found': {result}"
        );
    }

    #[tokio::test]
    async fn rebuild_project_without_host_path_returns_error() {
        let store = make_store(vec![(
            "myapp",
            ProjectEntry {
                url: Some("http://127.0.0.1:3101".into()),
                auth_token: None,
                ssh_port: Some(2201),
                host_path: None,
            },
        )]);
        let dir = tempfile::tempdir().unwrap();
        let result = handle_project_rebuild("myapp", 0, true, &store, dir.path()).await;
        assert!(
            result.contains("host_path"),
            "expected 'host_path' mention: {result}"
        );
    }

    #[tokio::test]
    async fn rebuild_project_with_missing_host_dir_returns_error() {
        let store = make_store(vec![(
            "myapp",
            ProjectEntry {
                url: Some("http://127.0.0.1:3101".into()),
                auth_token: None,
                ssh_port: Some(2201),
                host_path: Some("/nonexistent/path/xyz123".into()),
            },
        )]);
        let dir = tempfile::tempdir().unwrap();
        let result = handle_project_rebuild("myapp", 0, true, &store, dir.path()).await;
        assert!(
            result.contains("does not exist"),
            "expected 'does not exist': {result}"
        );
    }

    /// End-to-end flow test: rebuild a project that has a valid host directory.
    ///
    /// With `--force` and `--timeout 0` the drain check is skipped.
    /// The function proceeds past validation; how far it gets depends on whether
    /// Docker is available in the test environment. On failure the reply must:
    /// (a) name the failed step
    /// (b) leave the project still registered in the gateway (state preserved)
    /// (c) include a recovery path
    ///
    /// When Docker IS available and the base image exists this test would exercise
    /// the full build → container stop → start → re-register flow.
    #[tokio::test]
    async fn rebuild_e2e_with_valid_host_path_reaches_image_build_step() {
        let host_dir = tempfile::tempdir().unwrap();
        // Create a minimal .huskies/ directory (simulating an existing project).
        std::fs::create_dir_all(host_dir.path().join(".huskies")).unwrap();
        let store = make_store(vec![(
            "myapp",
            ProjectEntry {
                url: Some("http://127.0.0.1:3101".into()),
                auth_token: Some("tok".into()),
                ssh_port: Some(2201),
                host_path: Some(host_dir.path().to_str().unwrap().to_string()),
            },
        )]);
        let config_dir = tempfile::tempdir().unwrap();
        let result = handle_project_rebuild("myapp", 0, true, &store, config_dir.path()).await;

        // (a) Step naming: one of several possible failure steps depending on what Docker
        // binaries are available in the test environment, or a success reply.
        let names_a_step = result.contains("image build")
            || result.contains("SSH key")
            || result.contains("container remove")
            || result.contains("container start");
        let is_success = result.contains("rebuilt");
        assert!(
            names_a_step || is_success,
            "result should name a step or report success: {result}"
        );

        // (b) State preserved: project is still registered in the gateway store.
        let projects = store.read().await;
        assert!(
            projects.contains_key("myapp"),
            "project 'myapp' must remain registered after failed rebuild: {result}"
        );

        // (c) Recovery path: every failure reply tells the user what to do next.
        assert!(
            is_success || result.contains("Recovery"),
            "failed rebuild must include a recovery path: {result}"
        );
    }
}