diff --git a/devops/compose/docker-compose.dev.yml b/devops/compose/docker-compose.dev.yml index 219b89a8d..93436db77 100644 --- a/devops/compose/docker-compose.dev.yml +++ b/devops/compose/docker-compose.dev.yml @@ -62,7 +62,7 @@ services: ports: - "127.1.1.3:${RUSTFS_PORT:-8333}:8333" healthcheck: - test: ["CMD", "wget", "-qO-", "http://localhost:8333/"] + test: ["CMD", "wget", "-qO-", "http://127.0.0.1:8888/"] interval: 30s timeout: 10s retries: 3 @@ -110,7 +110,7 @@ services: ports: - "127.1.1.5:80:5000" healthcheck: - test: ["CMD", "wget", "-qO-", "http://localhost:5000/v2/"] + test: ["CMD", "/usr/local/bin/zot-linux-amd64", "verify", "/etc/zot/config.json"] interval: 30s timeout: 5s retries: 3 diff --git a/docs/dev/DEV_ENVIRONMENT_SETUP.md b/docs/dev/DEV_ENVIRONMENT_SETUP.md index a9c70bd3d..c67499675 100644 --- a/docs/dev/DEV_ENVIRONMENT_SETUP.md +++ b/docs/dev/DEV_ENVIRONMENT_SETUP.md @@ -57,7 +57,7 @@ dotnet run --project src/Cli/StellaOps.Cli/StellaOps.Cli.csproj -- ` | Output | Class | Meaning | Action | |---|---|---|---| -| `health=starting` (RustFS) | Warning | Service still warming up | Wait and recheck `docker compose ... ps` | +| `GET http://127.1.1.3:8333/` returns `403` | Info | SeaweedFS S3 endpoint is live and rejecting anonymous root requests | Treat `403` as ready for the scratch setup smoke | | `SM remote service probe failed (localhost:56080)` | Warning | Optional SM remote provider is unavailable | Ignore unless validating China SM remote crypto profile | | `stellaops-dev-rekor restarting` without `--profile sigstore` | Warning | Optional Sigstore container from prior run | Ignore for default profile or remove stale container | | `policy ... 
scheduler_exceptions_tenant_isolation already exists` | Blocking | Outdated Scheduler migration idempotency | Update code and rerun seeding | @@ -158,6 +158,12 @@ psql -h db.stella-ops.local -U stellaops -d stellaops_dev -c "SELECT 1" # Valkey valkey-cli -h cache.stella-ops.local ping + +# SeaweedFS S3 root returns 403 when unauthenticated; that still proves readiness +curl -I http://s3.stella-ops.local/ + +# Zot OCI registry +curl -I http://registry.stella-ops.local/v2/ ``` Infrastructure versions (from `docker-compose.dev.yml`): diff --git a/docs/implplan/SPRINT_20260309_001_Platform_scratch_setup_bootstrap_restore.md b/docs/implplan/SPRINT_20260309_001_Platform_scratch_setup_bootstrap_restore.md index 146ff89d0..1069e3ddb 100644 --- a/docs/implplan/SPRINT_20260309_001_Platform_scratch_setup_bootstrap_restore.md +++ b/docs/implplan/SPRINT_20260309_001_Platform_scratch_setup_bootstrap_restore.md @@ -5,7 +5,7 @@ - Treat the setup script itself as production surface: a clean repo plus docs must be enough to bootstrap the platform without manual script surgery. - Re-run the clean setup path after the fix, then continue into Playwright-backed live verification on the rebuilt stack. - Working directory: `devops/docker`. 
-- Allowed coordination edits: `scripts/setup.ps1`, `scripts/setup.sh`, `scripts/build-all-solutions.ps1`, `devops/compose/docker-compose.stella-ops.yml`, `docs/quickstart.md`, `docs/INSTALL_GUIDE.md`, `devops/README.md`, `devops/compose/README.md`, `src/Web/StellaOps.Web/scripts/chrome-path.js`, `src/Web/StellaOps.Web/scripts/verify-chromium.js`, `src/Authority/StellaOps.Authority.sln`, `src/Cli/StellaOps.Cli.sln`, `src/EvidenceLocker/StellaOps.EvidenceLocker.sln`, `src/Signals/StellaOps.Signals.sln`, `src/Tools/StellaOps.Tools.sln`, `src/Policy/StellaOps.Policy.engine.slnf`, `src/Policy/StellaOps.Policy.min.slnf`, `src/Policy/StellaOps.Policy.tests.slnf`, `src/Telemetry/StellaOps.Telemetry.Core/telemetry-tests.slnf`, `docs/implplan/SPRINT_20260309_001_Platform_scratch_setup_bootstrap_restore.md`. +- Allowed coordination edits: `scripts/setup.ps1`, `scripts/setup.sh`, `scripts/build-all-solutions.ps1`, `devops/compose/docker-compose.stella-ops.yml`, `devops/compose/docker-compose.dev.yml`, `docs/quickstart.md`, `docs/INSTALL_GUIDE.md`, `docs/dev/DEV_ENVIRONMENT_SETUP.md`, `devops/README.md`, `devops/compose/README.md`, `src/Web/StellaOps.Web/scripts/chrome-path.js`, `src/Web/StellaOps.Web/scripts/verify-chromium.js`, `src/Authority/StellaOps.Authority.sln`, `src/Cli/StellaOps.Cli.sln`, `src/EvidenceLocker/StellaOps.EvidenceLocker.sln`, `src/Signals/StellaOps.Signals.sln`, `src/Tools/StellaOps.Tools.sln`, `src/Policy/StellaOps.Policy.engine.slnf`, `src/Policy/StellaOps.Policy.min.slnf`, `src/Policy/StellaOps.Policy.tests.slnf`, `src/Telemetry/StellaOps.Telemetry.Core/telemetry-tests.slnf`, `docs/implplan/SPRINT_20260309_001_Platform_scratch_setup_bootstrap_restore.md`. - Expected evidence: clean setup invocation output, successful image-builder startup, rebuilt compose stack, and downstream Playwright verification artifacts. 
## Dependencies & Concurrency @@ -75,6 +75,7 @@ Completion criteria: | 2026-03-09 | Investigated the next Windows bootstrap bottleneck: `devops/docker/build-all.ps1` still rebuilt every .NET service image from repo root, so Docker repeatedly transferred the monorepo into BuildKit during scratch setup. Reworked the builder to publish backend services locally into small temp contexts, kept the Angular console on its dedicated Dockerfile path, and threaded `--no-restore` through setup when the solution build already ran. | Developer | | 2026-03-09 | Solution graph fixes committed: normalized solution file paths and consolidated Scheduler references (`e6094e3b5`), improved build script discovery and updated Verifier to System.CommandLine v8+ (`e0c79e0dc`). Running `build-all-solutions.ps1` to verify completion criteria. | Developer | | 2026-03-09 | All 36 solutions build successfully. Task 003 completion criteria met. Sprint complete. | QA | +| 2026-03-10 | Another scratch-bootstrap recheck exposed false-negative third-party infra readiness. SeaweedFS was healthy but its dev-compose probe hit the S3 root that correctly returns `403`, and Zot was healthy but its vendor image does not include `wget`. Updated compose healthchecks and setup smoke probes to validate the real exposed endpoints instead of failing clean bootstraps on healthy services. | Developer | ## Decisions & Risks - Decision: repair the documented setup path first instead of working around it with ad hoc manual builds, because scratch bootstrap is part of the product surface for this mission. @@ -85,6 +86,7 @@ Completion criteria: - Decision: the automated setup path now owns creation of the external frontdoor Docker network because that network is part of the documented default compose topology, and a scratch bootstrap should not depend on an undocumented pre-existing Docker artifact. 
- Decision: `scripts/build-all-solutions.ps1` must build only repo-owned solution surfaces under `src/`; vendored dependency trees such as frontend `node_modules` are excluded because they are not Stella bootstrap contracts and can contain native/Visual Studio samples that are invalid under `dotnet build`. - Decision: the canonical .NET image builder now uses local `dotnet publish` plus a runtime-only Docker context by default, because repo-root `docker build` repeated monorepo context transfer for every service and made scratch setup unreasonably slow on Windows. +- Decision: scratch-setup readiness for third-party infra now relies on host-level HTTP probes in the setup smoke scripts, because vendor images do not consistently ship shell/network helpers and some valid readiness responses are auth-gated (`403`) rather than `200`. ## Next Checkpoints - 2026-03-09: rerun `scripts/setup.ps1 -SkipBuild` after the parser fix. diff --git a/scripts/setup.ps1 b/scripts/setup.ps1 index d7ee9288d..67ae9d637 100644 --- a/scripts/setup.ps1 +++ b/scripts/setup.ps1 @@ -114,6 +114,34 @@ function Get-RunningContainerByService([string]$serviceName) { return $null } +function Get-ServiceHttpProbeUrl([string]$serviceName, [int]$containerPort, [string]$path = '/') { + $containerName = Get-RunningContainerByService $serviceName + if (-not $containerName) { + return $null + } + + $portMapping = docker port $containerName "${containerPort}/tcp" 2>$null | Select-Object -First 1 + if (-not $portMapping) { + return $null + } + + $portMapping = $portMapping.Trim() + if ($portMapping -notmatch '^(?<host>.+):(?<port>\d+)$') { + return $null + } + + $probeHost = $Matches.host + if ($probeHost -eq '0.0.0.0' -or $probeHost -eq '::') { + $probeHost = '127.0.0.1' + } + + if (-not $path.StartsWith('/')) { + $path = "/$path" + } + + return "http://${probeHost}:$($Matches.port)$path" +} + # ─── 1. 
Check prerequisites ──────────────────────────────────────────────── function Test-Prerequisites { @@ -439,6 +467,45 @@ function Start-Platform { } } +function Test-ExpectedHttpStatus([string]$url, [int[]]$allowedStatusCodes, [int]$timeoutSeconds = 5, [int]$attempts = 6, [int]$retryDelaySeconds = 2) { + for ($attempt = 1; $attempt -le $attempts; $attempt++) { + $statusCode = $null + + try { + $request = [System.Net.WebRequest]::Create($url) + $request.Method = 'GET' + $request.Timeout = $timeoutSeconds * 1000 + $response = [System.Net.HttpWebResponse]$request.GetResponse() + + try { + $statusCode = [int]$response.StatusCode + } finally { + $response.Dispose() + } + } catch [System.Net.WebException] { + $webResponse = $_.Exception.Response -as [System.Net.HttpWebResponse] + if ($null -ne $webResponse) { + try { + $statusCode = [int]$webResponse.StatusCode + } finally { + $webResponse.Dispose() + } + } + } catch { + } + + if ($null -ne $statusCode -and $allowedStatusCodes -contains $statusCode) { + return $statusCode + } + + if ($attempt -lt $attempts) { + Start-Sleep -Seconds $retryDelaySeconds + } + } + + return $null +} + # ─── 8. 
Smoke test ───────────────────────────────────────────────────────── function Test-Smoke { @@ -474,6 +541,24 @@ function Test-Smoke { $hasBlockingFailures = $true } + $rustFsUrl = Get-ServiceHttpProbeUrl 'rustfs' 8333 '/' + $rustFsStatus = if ($rustFsUrl) { Test-ExpectedHttpStatus $rustFsUrl @(200, 403) } else { $null } + if ($null -ne $rustFsStatus) { + Write-Ok "RustFS S3 endpoint (HTTP $rustFsStatus)" + } else { + Write-Fail 'RustFS S3 endpoint did not respond with an expected status (wanted 200/403)' + $hasBlockingFailures = $true + } + + $registryUrl = Get-ServiceHttpProbeUrl 'registry' 5000 '/v2/' + $registryStatus = if ($registryUrl) { Test-ExpectedHttpStatus $registryUrl @(200, 401) } else { $null } + if ($null -ne $registryStatus) { + Write-Ok "Zot registry endpoint (HTTP $registryStatus)" + } else { + Write-Fail 'Zot registry endpoint did not respond with an expected status (wanted 200/401)' + $hasBlockingFailures = $true + } + # Platform container health summary Write-Step 'Container health summary' Push-Location $ComposeDir diff --git a/scripts/setup.sh b/scripts/setup.sh index da8585330..0cbf45f2b 100644 --- a/scripts/setup.sh +++ b/scripts/setup.sh @@ -43,6 +43,33 @@ fail() { printf ' \033[0;31m[FAIL]\033[0m %s\n' "$1"; } has_cmd() { command -v "$1" &>/dev/null; } +get_running_container_by_service() { + local service="$1" + docker ps --filter "label=com.docker.compose.service=${service}" --format "{{.Names}}" 2>/dev/null | head -n1 +} + +service_http_probe_url() { + local service="$1" + local container_port="$2" + local path="${3:-/}" + local container mapping host host_port + + container=$(get_running_container_by_service "$service") + [[ -z "$container" ]] && return 1 + + mapping=$(docker port "$container" "${container_port}/tcp" 2>/dev/null | head -n1) + [[ -z "$mapping" ]] && return 1 + + host="${mapping%:*}" + host_port="${mapping##*:}" + if [[ "$host" == "0.0.0.0" || "$host" == "::" ]]; then + host="127.0.0.1" + fi + + [[ "$path" != /* ]] && 
path="/$path" + printf 'http://%s:%s%s' "$host" "$host_port" "$path" +} + # ─── 1. Check prerequisites ──────────────────────────────────────────────── check_prerequisites() { @@ -305,6 +332,27 @@ start_platform() { cd "$ROOT" } +http_status() { + local url="$1" + local attempts="${2:-6}" + local delay_seconds="${3:-2}" + local status="" + + for (( attempt=1; attempt<=attempts; attempt++ )); do + status=$(curl -s -o /dev/null --connect-timeout 5 -w '%{http_code}' "$url" 2>/dev/null || true) + if [[ -n "$status" && "$status" != "000" ]]; then + printf '%s' "$status" + return 0 + fi + + if (( attempt < attempts )); then + sleep "$delay_seconds" + fi + done + + return 0 +} + # ─── 8. Smoke test ───────────────────────────────────────────────────────── smoke_test() { @@ -324,6 +372,24 @@ smoke_test() { warn 'Valkey not responding' fi + local rustfs_url rustfs_status + rustfs_url=$(service_http_probe_url rustfs 8333 / || true) + rustfs_status=$(http_status "$rustfs_url") + if [[ "$rustfs_status" == "200" || "$rustfs_status" == "403" ]]; then + ok "RustFS S3 endpoint (HTTP $rustfs_status)" + else + warn 'RustFS S3 endpoint did not respond with an expected status (wanted 200/403)' + fi + + local registry_url registry_status + registry_url=$(service_http_probe_url registry 5000 /v2/ || true) + registry_status=$(http_status "$registry_url") + if [[ "$registry_status" == "200" || "$registry_status" == "401" ]]; then + ok "Zot registry endpoint (HTTP $registry_status)" + else + warn 'Zot registry endpoint did not respond with an expected status (wanted 200/401)' + fi + # Platform container health summary step 'Container health summary' cd "$COMPOSE_DIR"