diff --git a/.gitea/workflows/release.yml b/.gitea/workflows/release.yml new file mode 100644 index 00000000..068728c7 --- /dev/null +++ b/.gitea/workflows/release.yml @@ -0,0 +1,148 @@ +# .gitea/workflows/release.yml +# Deterministic release pipeline producing signed images, SBOMs, provenance, and manifest + +name: Release Bundle + +on: + push: + tags: + - 'v*' + workflow_dispatch: + inputs: + version: + description: 'Release version (overrides tag, e.g. 2025.10.0-edge)' + required: false + type: string + channel: + description: 'Release channel (edge|stable|lts)' + required: false + default: 'edge' + type: choice + options: + - edge + - stable + - lts + calendar: + description: 'Calendar tag (YYYY.MM) - optional override' + required: false + type: string + push_images: + description: 'Push container images to registry' + required: false + default: true + type: boolean + +jobs: + build-release: + runs-on: ubuntu-22.04 + env: + DOTNET_VERSION: '10.0.100-rc.1.25451.107' + REGISTRY: registry.stella-ops.org + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Set up Node.js 20 + uses: actions/setup-node@v4 + with: + node-version: '20.14.0' + + - name: Set up .NET SDK + uses: actions/setup-dotnet@v4 + with: + dotnet-version: ${{ env.DOTNET_VERSION }} + + - name: Install Helm 3.16.0 + run: | + curl -fsSL https://get.helm.sh/helm-v3.16.0-linux-amd64.tar.gz -o /tmp/helm.tgz + tar -xzf /tmp/helm.tgz -C /tmp + sudo install -m 0755 /tmp/linux-amd64/helm /usr/local/bin/helm + + - name: Install Cosign + uses: sigstore/cosign-installer@v3.4.0 + + - name: Determine release metadata + id: meta + run: | + set -euo pipefail + RAW_VERSION="${{ github.ref_name }}" + if [[ "${{ github.event_name }}" != "push" ]]; then + RAW_VERSION="${{ github.event.inputs.version }}" + fi + if [[ -z "$RAW_VERSION" ]]; then + echo "::error::Release 
version not provided" >&2 + exit 1 + fi + VERSION="${RAW_VERSION#v}" + CHANNEL="${{ github.event.inputs.channel || '' }}" + if [[ -z "$CHANNEL" ]]; then + CHANNEL="edge" + fi + CALENDAR_INPUT="${{ github.event.inputs.calendar || '' }}" + if [[ -z "$CALENDAR_INPUT" ]]; then + YEAR=$(echo "$VERSION" | awk -F'.' '{print $1}') + MONTH=$(echo "$VERSION" | awk -F'.' '{print $2}') + if [[ -n "$YEAR" && -n "$MONTH" ]]; then + CALENDAR_INPUT="$YEAR.$MONTH" + else + CALENDAR_INPUT=$(date -u +'%Y.%m') + fi + fi + PUSH_INPUT="${{ github.event.inputs.push_images || '' }}" + if [[ "${{ github.event_name }}" == "push" ]]; then + PUSH_INPUT="true" + elif [[ -z "$PUSH_INPUT" ]]; then + PUSH_INPUT="true" + fi + if [[ "$PUSH_INPUT" == "false" || "$PUSH_INPUT" == "0" ]]; then + PUSH_FLAG="false" + else + PUSH_FLAG="true" + fi + echo "version=$VERSION" >> "$GITHUB_OUTPUT" + echo "channel=$CHANNEL" >> "$GITHUB_OUTPUT" + echo "calendar=$CALENDAR_INPUT" >> "$GITHUB_OUTPUT" + echo "push=$PUSH_FLAG" >> "$GITHUB_OUTPUT" + + - name: Log in to registry + if: steps.meta.outputs.push == 'true' + uses: docker/login-action@v3 + with: + registry: ${{ env.REGISTRY }} + username: ${{ secrets.REGISTRY_USERNAME }} + password: ${{ secrets.REGISTRY_PASSWORD }} + + - name: Prepare release output directory + run: | + rm -rf out/release + mkdir -p out/release + + - name: Build release bundle + env: + COSIGN_KEY_REF: ${{ secrets.COSIGN_KEY_REF }} + COSIGN_PASSWORD: ${{ secrets.COSIGN_PASSWORD }} + COSIGN_IDENTITY_TOKEN: ${{ secrets.COSIGN_IDENTITY_TOKEN }} + run: | + set -euo pipefail + EXTRA_ARGS=() + if [[ "${{ steps.meta.outputs.push }}" != "true" ]]; then + EXTRA_ARGS+=("--no-push") + fi + ./ops/devops/release/build_release.py \ + --version "${{ steps.meta.outputs.version }}" \ + --channel "${{ steps.meta.outputs.channel }}" \ + --calendar "${{ steps.meta.outputs.calendar }}" \ + --git-sha "${{ github.sha }}" \ + "${EXTRA_ARGS[@]}" + + - name: Upload release artefacts + uses: actions/upload-artifact@v4 + 
with: + name: stellaops-release-${{ steps.meta.outputs.version }} + path: out/release + if-no-files-found: error diff --git a/.gitignore b/.gitignore index c2ceb5fb..d44a4a1d 100644 --- a/.gitignore +++ b/.gitignore @@ -26,4 +26,6 @@ seed-data/ics-cisa/*.sha256 seed-data/cert-bund/**/*.json seed-data/cert-bund/**/*.sha256 -out/offline-kit/web/**/* \ No newline at end of file +out/offline-kit/web/**/* +src/StellaOps.Web/node_modules/**/* +src/StellaOps.Web/.angular/**/* \ No newline at end of file diff --git a/EXECPLAN.md b/EXECPLAN.md index 3c1e0c11..ad0fafc2 100644 --- a/EXECPLAN.md +++ b/EXECPLAN.md @@ -3,7 +3,7 @@ Generated from SPRINTS.md and module TASKS.md files on 2025-10-19. Waves cluster ## Wave Instructions ### Wave 0 -- Team Attestor Guild: read EXECPLAN.md Wave 0 and SPRINTS.md rows for `src/StellaOps.Attestor/TASKS.md`. Focus on ATTESTOR-API-11-201 (TODO), ATTESTOR-VERIFY-11-202 (TODO), ATTESTOR-OBS-11-203 (TODO). Confirm prerequisites (none) before starting and report status in module TASKS.md. +- Team Attestor Guild: read EXECPLAN.md Wave 0 and SPRINTS.md rows for `src/StellaOps.Attestor/TASKS.md`. ATTESTOR-API-11-201, ATTESTOR-VERIFY-11-202, and ATTESTOR-OBS-11-203 are DONE (2025-10-19); continue monitoring Rekor inclusion proofs/archives and keep module docs/tests aligned. - Team Authority Core & Security Guild: read EXECPLAN.md Wave 0 and SPRINTS.md rows for `src/StellaOps.Authority/TASKS.md`. Focus on AUTH-DPOP-11-001 (DONE 2025-10-20), AUTH-MTLS-11-002 (DONE 2025-10-23). Confirm prerequisites (none) before starting and report status in module TASKS.md. - Team Authority Core & Storage Guild: read EXECPLAN.md Wave 0 and SPRINTS.md rows for `src/StellaOps.Authority/TASKS.md`. Focus on AUTHSTORAGE-MONGO-08-001 (DONE 2025-10-19). Confirm prerequisites (none) before starting and report status in module TASKS.md. - Team DevEx/CLI: read EXECPLAN.md Wave 0 and SPRINTS.md rows for `src/StellaOps.Cli/TASKS.md`. 
Focus on EXCITITOR-CLI-01-002 (TODO), CLI-RUNTIME-13-005 (TODO). Confirm prerequisites (external: EXCITITOR-CLI-01-001, EXCITITOR-EXPORT-01-001) before starting and report status in module TASKS.md. @@ -49,19 +49,19 @@ Generated from SPRINTS.md and module TASKS.md files on 2025-10-19. Waves cluster - Team Team WebService & Authority: read EXECPLAN.md Wave 0 and SPRINTS.md rows for `src/StellaOps.Authority/StellaOps.Authority.Plugin.Standard/TASKS.md`, `src/StellaOps.Concelier.WebService/TASKS.md`. Focus on SEC2.PLG (DOING), SEC3.PLG (DOING), SEC5.PLG (DOING), PLG4-6.CAPABILITIES (BLOCKED), PLG6.DIAGRAM (TODO), PLG7.RFC (REVIEW), FEEDWEB-DOCS-01-001 (DOING), FEEDWEB-OPS-01-006 (TODO), FEEDWEB-OPS-01-007 (BLOCKED). Confirm prerequisites (none) before starting and report status in module TASKS.md. - Team Tools Guild, BE-Conn-MSRC: read EXECPLAN.md Wave 0 and SPRINTS.md rows for `src/StellaOps.Concelier.Connector.Common/TASKS.md`. Focus on FEEDCONN-SHARED-STATE-003 (**TODO). Confirm prerequisites (none) before starting and report status in module TASKS.md. - Team UX Specialist, Angular Eng: read EXECPLAN.md Wave 0 and SPRINTS.md rows for `src/StellaOps.Web/TASKS.md`. Focus on WEB1.TRIVY-SETTINGS (DONE 2025-10-21), WEB1.TRIVY-SETTINGS-TESTS (DONE 2025-10-21), and WEB1.DEPS-13-001 (DONE 2025-10-21). Confirm prerequisites (none) before starting and report status in module TASKS.md. -- Team Zastava Core Guild: read EXECPLAN.md Wave 0 and SPRINTS.md rows for `src/StellaOps.Zastava.Core/TASKS.md`. Focus on ZASTAVA-CORE-12-201 (TODO), ZASTAVA-CORE-12-202 (TODO), ZASTAVA-CORE-12-203 (TODO), ZASTAVA-OPS-12-204 (TODO). Confirm prerequisites (none) before starting and report status in module TASKS.md. -- Team Zastava Webhook Guild: read EXECPLAN.md Wave 0 and SPRINTS.md rows for `src/StellaOps.Zastava.Webhook/TASKS.md`. Focus on ZASTAVA-WEBHOOK-12-101 (TODO), ZASTAVA-WEBHOOK-12-102 (TODO), ZASTAVA-WEBHOOK-12-103 (TODO). 
Confirm prerequisites (none) before starting and report status in module TASKS.md. +- Team Zastava Core Guild: read EXECPLAN.md Wave 0 and SPRINTS.md rows for `src/StellaOps.Zastava.Core/TASKS.md`. Focus on ZASTAVA-CORE-12-201 (DONE 2025-10-23), ZASTAVA-CORE-12-202 (DONE 2025-10-23), ZASTAVA-CORE-12-203 (DONE 2025-10-23), ZASTAVA-OPS-12-204 (DONE 2025-10-23). Confirm prerequisites (none) before starting and report status in module TASKS.md. +- Team Zastava Webhook Guild: read EXECPLAN.md Wave 0 and SPRINTS.md rows for `src/StellaOps.Zastava.Webhook/TASKS.md`. Focus on ZASTAVA-WEBHOOK-12-101 (DONE 2025-10-24), ZASTAVA-WEBHOOK-12-102 (DOING 2025-10-24), ZASTAVA-WEBHOOK-12-103 (DOING 2025-10-24), ZASTAVA-WEBHOOK-12-104 (TODO). Confirm prerequisites (none) before starting and report status in module TASKS.md. ### Wave 1 - Team Bench Guild, Language Analyzer Guild: read EXECPLAN.md Wave 1 and SPRINTS.md rows for `bench/TASKS.md`. Focus on BENCH-SCANNER-10-002 (TODO). Confirm prerequisites (internal: SCANNER-ANALYZERS-LANG-10-301 (Wave 0)) before starting and report status in module TASKS.md. - Team DevEx/CLI, QA Guild: read EXECPLAN.md Wave 1 and SPRINTS.md rows for `src/StellaOps.Cli/TASKS.md`. Focus on CLI-RUNTIME-13-009 (TODO). Confirm prerequisites (internal: CLI-RUNTIME-13-005 (Wave 0)) before starting and report status in module TASKS.md. -- Team DevOps Guild: read EXECPLAN.md Wave 1 and SPRINTS.md rows for `ops/devops/TASKS.md`. Focus on DEVOPS-REL-14-001 (TODO). Confirm prerequisites (internal: ATTESTOR-API-11-201 (Wave 0), SIGNER-API-11-101 (Wave 0)) before starting and report status in module TASKS.md. +- Team DevOps Guild: read EXECPLAN.md Wave 1 and SPRINTS.md rows for `ops/devops/TASKS.md`. Focus on DEVOPS-REL-14-001 (DOING 2025-10-23). Confirm prerequisites (internal: ATTESTOR-API-11-201 (Wave 0), SIGNER-API-11-101 (Wave 0)) before starting and report status in module TASKS.md. 
- Team DevOps Guild, Scanner WebService Guild: read EXECPLAN.md Wave 1 and SPRINTS.md rows for `ops/devops/TASKS.md`. Focus on DEVOPS-SCANNER-09-204 (TODO). Confirm prerequisites (internal: SCANNER-EVENTS-15-201 (Wave 0)) before starting and report status in module TASKS.md. - Team Emit Guild: read EXECPLAN.md Wave 1 and SPRINTS.md rows for `src/StellaOps.Scanner.Emit/TASKS.md`. SCANNER-EMIT-10-607 shipped 2025-10-22; remaining focus is SCANNER-EMIT-17-701 (build-id enrichment). Confirm prerequisites (internal: POLICY-CORE-09-005 (Wave 0), SCANNER-EMIT-10-602 (Wave 0), SCANNER-EMIT-10-604 (Wave 0)) before starting and report status in module TASKS.md. - Team Language Analyzer Guild: read EXECPLAN.md Wave 1 and SPRINTS.md rows for `src/StellaOps.Scanner.Analyzers.Lang/TASKS.md`. Sprint 10 language analyzers (10-303..10-306) wrapped by 2025-10-22; shift to Wave 1 benchmarking/packaging follow-ups (10-308+/309 variants) and ensure shared helpers stay stable. Node stream (tasks 10-302/309) closed on 2025-10-21; verify prereqs SCANNER-ANALYZERS-LANG-10-301/307 remain satisfied before new work. - Team Licensing Guild: read EXECPLAN.md Wave 1 and SPRINTS.md rows for `ops/licensing/TASKS.md`. Focus on DEVOPS-LIC-14-004 (TODO). Confirm prerequisites (internal: AUTH-MTLS-11-002 (Wave 0)) before starting and report status in module TASKS.md. - Team Notify Engine Guild: read EXECPLAN.md Wave 1 and SPRINTS.md rows for `src/StellaOps.Notify.Engine/TASKS.md`. Focus on NOTIFY-ENGINE-15-301 (TODO). Confirm prerequisites (internal: NOTIFY-MODELS-15-101 (Wave 0)) before starting and report status in module TASKS.md. -- Team Notify Queue Guild: read EXECPLAN.md Wave 1 and SPRINTS.md rows for `src/StellaOps.Notify.Queue/TASKS.md`. Focus on NOTIFY-QUEUE-15-401 (TODO). Confirm prerequisites (internal: NOTIFY-MODELS-15-101 (Wave 0)) before starting and report status in module TASKS.md. 
+- Team Notify Queue Guild: read EXECPLAN.md Wave 1 and SPRINTS.md rows for `src/StellaOps.Notify.Queue/TASKS.md`. Focus on NOTIFY-QUEUE-15-401 (DONE 2025-10-23). Confirm prerequisites (internal: NOTIFY-MODELS-15-101 (Wave 0)) before starting and report status in module TASKS.md. - Team Notify WebService Guild: read EXECPLAN.md Wave 1 and SPRINTS.md rows for `src/StellaOps.Notify.WebService/TASKS.md`. Focus on NOTIFY-WEB-15-103 (DONE). Confirm prerequisites (internal: NOTIFY-WEB-15-102 (Wave 0)) before starting and report status in module TASKS.md. - Team Scanner WebService Guild: read EXECPLAN.md Wave 1 and SPRINTS.md rows for `src/StellaOps.Scanner.WebService/TASKS.md`. SCANNER-RUNTIME-12-301 closed (2025-10-20); coordinate with Zastava observer guild on batch fixtures and advance to SCANNER-RUNTIME-12-302. - Team Scheduler ImpactIndex Guild: read EXECPLAN.md Wave 1 and SPRINTS.md rows for `src/StellaOps.Scheduler.ImpactIndex/TASKS.md`. Focus on SCHED-IMPACT-16-301 (TODO). Confirm prerequisites (internal: SCANNER-EMIT-10-605 (Wave 0)) before starting and report status in module TASKS.md. @@ -76,19 +76,19 @@ Generated from SPRINTS.md and module TASKS.md files on 2025-10-19. Waves cluster - Team Team Excititor Connectors – Ubuntu: read EXECPLAN.md Wave 1 and SPRINTS.md rows for `src/StellaOps.Excititor.Connectors.Ubuntu.CSAF/TASKS.md`. Focus on EXCITITOR-CONN-UBUNTU-01-003 (TODO). Confirm prerequisites (internal: EXCITITOR-CONN-UBUNTU-01-002 (Wave 0); external: EXCITITOR-POLICY-01-001) before starting and report status in module TASKS.md. - Team Team Excititor Export: read EXECPLAN.md Wave 1 and SPRINTS.md rows for `src/StellaOps.Excititor.Export/TASKS.md`. Focus on EXCITITOR-EXPORT-01-006 (DONE 2025-10-21). Confirm prerequisites (internal: EXCITITOR-EXPORT-01-005 (Wave 0), POLICY-CORE-09-005 (Wave 0)) before starting and report status in module TASKS.md. 
- Team Team Excititor Worker: read EXECPLAN.md Wave 1 and SPRINTS.md rows for `src/StellaOps.Excititor.Worker/TASKS.md`. Focus on EXCITITOR-WORKER-01-003 (TODO). Confirm prerequisites (internal: EXCITITOR-ATTEST-01-003 (Wave 0); external: EXCITITOR-EXPORT-01-002, EXCITITOR-WORKER-01-001) before starting and report status in module TASKS.md. -- Team UI Guild: read EXECPLAN.md Wave 1 and SPRINTS.md rows for `src/StellaOps.UI/TASKS.md`. Focus on UI-ATTEST-11-005 (TODO), UI-VEX-13-003 (TODO), UI-POLICY-13-007 (TODO), UI-ADMIN-13-004 (TODO), UI-AUTH-13-001 (TODO), UI-SCANS-13-002 (TODO), UI-NOTIFY-13-006 (DOING), UI-SCHED-13-005 (TODO). Confirm prerequisites (internal: ATTESTOR-API-11-201 (Wave 0), AUTH-DPOP-11-001 (Wave 0), AUTH-MTLS-11-002 (Wave 0), EXCITITOR-EXPORT-01-005 (Wave 0), NOTIFY-WEB-15-101 (Wave 0), POLICY-CORE-09-006 (Wave 0), SCHED-WEB-16-101 (Wave 0), SIGNER-API-11-101 (Wave 0); external: EXCITITOR-CORE-02-001, SCANNER-WEB-09-102, SCANNER-WEB-09-103) before starting and report status in module TASKS.md. -- Team Zastava Observer Guild: read EXECPLAN.md Wave 1 and SPRINTS.md rows for `src/StellaOps.Zastava.Observer/TASKS.md`. Focus on ZASTAVA-OBS-12-001 (TODO). Confirm prerequisites (internal: ZASTAVA-CORE-12-201 (Wave 0)) before starting and report status in module TASKS.md. +- Team UI Guild: read EXECPLAN.md Wave 1 and SPRINTS.md rows for `src/StellaOps.UI/TASKS.md`. Focus on UI-ATTEST-11-005 (DONE 2025-10-23), UI-VEX-13-003 (TODO), UI-POLICY-13-007 (TODO), UI-ADMIN-13-004 (TODO), UI-AUTH-13-001 (DONE 2025-10-23), UI-SCANS-13-002 (TODO), UI-NOTIFY-13-006 (DOING 2025-10-19), UI-SCHED-13-005 (TODO). 
Confirm prerequisites (internal: ATTESTOR-API-11-201 (Wave 0), AUTH-DPOP-11-001 (Wave 0), AUTH-MTLS-11-002 (Wave 0), EXCITITOR-EXPORT-01-005 (Wave 0), NOTIFY-WEB-15-101 (Wave 0), POLICY-CORE-09-006 (Wave 0), SCHED-WEB-16-101 (Wave 0), SIGNER-API-11-101 (Wave 0); external: EXCITITOR-CORE-02-001, SCANNER-WEB-09-102, SCANNER-WEB-09-103) before starting and report status in module TASKS.md. +- Team Zastava Observer Guild: read EXECPLAN.md Wave 1 and SPRINTS.md rows for `src/StellaOps.Zastava.Observer/TASKS.md`. Focus on ZASTAVA-OBS-12-001 (DOING 2025-10-24). Confirm prerequisites (internal: ZASTAVA-CORE-12-201 (Wave 0)) before starting and report status in module TASKS.md. ### Wave 2 - Team Bench Guild, Notify Team: read EXECPLAN.md Wave 2 and SPRINTS.md rows for `bench/TASKS.md`. Focus on BENCH-NOTIFY-15-001 (TODO). Confirm prerequisites (internal: NOTIFY-ENGINE-15-301 (Wave 1)) before starting and report status in module TASKS.md. - Team Bench Guild, Scheduler Team: read EXECPLAN.md Wave 2 and SPRINTS.md rows for `bench/TASKS.md`. Focus on BENCH-IMPACT-16-001 (TODO). Confirm prerequisites (internal: SCHED-IMPACT-16-301 (Wave 1)) before starting and report status in module TASKS.md. - Team Deployment Guild: read EXECPLAN.md Wave 2 and SPRINTS.md rows for `ops/deployment/TASKS.md`. Focus on DEVOPS-OPS-14-003 (TODO). Confirm prerequisites (internal: DEVOPS-REL-14-001 (Wave 1)) before starting and report status in module TASKS.md. -- Team DevOps Guild: read EXECPLAN.md Wave 2 and SPRINTS.md rows for `ops/devops/TASKS.md`. Focus on DEVOPS-MIRROR-08-001 (DONE 2025-10-19), DEVOPS-PERF-10-002 (TODO), DEVOPS-REL-14-004 (TODO), DEVOPS-REL-17-002 (TODO), and DEVOPS-NUGET-13-001 (TODO). Confirm prerequisites (internal: BENCH-SCANNER-10-002 (Wave 1), DEVOPS-REL-14-001 (Wave 1), SCANNER-EMIT-17-701 (Wave 1)) before starting and report status in module TASKS.md. +- Team DevOps Guild: read EXECPLAN.md Wave 2 and SPRINTS.md rows for `ops/devops/TASKS.md`. 
Focus on DEVOPS-MIRROR-08-001 (DONE 2025-10-19), DEVOPS-PERF-10-002 (TODO), DEVOPS-REL-14-004 (TODO), DEVOPS-REL-17-002 (TODO), DEVOPS-NUGET-13-001 (DOING 2025-10-24), and DEVOPS-UI-13-006 (TODO). Confirm prerequisites (internal: BENCH-SCANNER-10-002 (Wave 1), DEVOPS-REL-14-001 (Wave 1), SCANNER-EMIT-17-701 (Wave 1), UI-AUTH-13-001 (Wave 1)) before starting and report status in module TASKS.md. - Team DevOps Guild, Notify Guild: read EXECPLAN.md Wave 2 and SPRINTS.md rows for `ops/devops/TASKS.md`. Focus on DEVOPS-SCANNER-09-205 (TODO). Confirm prerequisites (internal: DEVOPS-SCANNER-09-204 (Wave 1)) before starting and report status in module TASKS.md. - Team Notify Engine Guild: read EXECPLAN.md Wave 2 and SPRINTS.md rows for `src/StellaOps.Notify.Engine/TASKS.md`. Focus on NOTIFY-ENGINE-15-302 (TODO). Confirm prerequisites (internal: NOTIFY-ENGINE-15-301 (Wave 1)) before starting and report status in module TASKS.md. -- Team Notify Queue Guild: read EXECPLAN.md Wave 2 and SPRINTS.md rows for `src/StellaOps.Notify.Queue/TASKS.md`. Focus on NOTIFY-QUEUE-15-403 (TODO), NOTIFY-QUEUE-15-402 (TODO). Confirm prerequisites (internal: NOTIFY-QUEUE-15-401 (Wave 1)) before starting and report status in module TASKS.md. +- Team Notify Queue Guild: read EXECPLAN.md Wave 2 and SPRINTS.md rows for `src/StellaOps.Notify.Queue/TASKS.md`. Focus on NOTIFY-QUEUE-15-403 (DONE 2025-10-23), NOTIFY-QUEUE-15-402 (DONE 2025-10-23). Confirm prerequisites (internal: NOTIFY-QUEUE-15-401 (Wave 1)) before starting and report status in module TASKS.md. - Team Notify WebService Guild: read EXECPLAN.md Wave 2 and SPRINTS.md rows for `src/StellaOps.Notify.WebService/TASKS.md`. Focus on NOTIFY-WEB-15-104 (TODO). Confirm prerequisites (internal: NOTIFY-QUEUE-15-401 (Wave 1), NOTIFY-STORAGE-15-201 (Wave 0)) before starting and report status in module TASKS.md. -- Team Notify Worker Guild: read EXECPLAN.md Wave 2 and SPRINTS.md rows for `src/StellaOps.Notify.Worker/TASKS.md`. 
Focus on NOTIFY-WORKER-15-201 (TODO), NOTIFY-WORKER-15-202 (TODO). Confirm prerequisites (internal: NOTIFY-ENGINE-15-301 (Wave 1), NOTIFY-QUEUE-15-401 (Wave 1)) before starting and report status in module TASKS.md. +- Team Notify Worker Guild: read EXECPLAN.md Wave 2 and SPRINTS.md rows for `src/StellaOps.Notify.Worker/TASKS.md`. Focus on NOTIFY-WORKER-15-201 (DONE 2025-10-23), NOTIFY-WORKER-15-202 (TODO). Confirm prerequisites (internal: NOTIFY-ENGINE-15-301 (Wave 1), NOTIFY-QUEUE-15-401 (Wave 1)) before starting and report status in module TASKS.md. - Team Offline Kit Guild: read EXECPLAN.md Wave 2 and SPRINTS.md rows for `ops/offline-kit/TASKS.md`. Focus on DEVOPS-OFFLINE-14-002 (TODO), DEVOPS-OFFLINE-18-003 (TODO), and DEVOPS-OFFLINE-18-005 (TODO). Confirm prerequisites (internal: DEVOPS-REL-14-001 (Wave 1), DEVOPS-REL-14-004 (Wave 2)) before starting and report status in module TASKS.md. - Team Samples Guild, Policy Guild: read EXECPLAN.md Wave 2 and SPRINTS.md rows for `samples/TASKS.md`. Focus on SAMPLES-13-004 (TODO). Confirm prerequisites (internal: POLICY-CORE-09-006 (Wave 0), UI-POLICY-13-007 (Wave 1)) before starting and report status in module TASKS.md. - Team Scanner WebService Guild: read EXECPLAN.md Wave 2 and SPRINTS.md rows for `src/StellaOps.Scanner.WebService/TASKS.md`. Focus on SCANNER-RUNTIME-12-302 (TODO). Confirm prerequisites (internal: SCANNER-RUNTIME-12-301 (Wave 1), ZASTAVA-CORE-12-201 (Wave 0)) before starting and report status in module TASKS.md. @@ -388,15 +388,15 @@ Generated from SPRINTS.md and module TASKS.md files on 2025-10-19. Waves cluster - **Sprint 11** · Signing Chain Bring-up - Team: Attestor Guild - Path: `src/StellaOps.Attestor/TASKS.md` - 1. [TODO] ATTESTOR-API-11-201 — `/rekor/entries` submission pipeline with dedupe, proof acquisition, and persistence. + 1. [DONE 2025-10-19] ATTESTOR-API-11-201 — `/rekor/entries` submission pipeline with dedupe, proof acquisition, and persistence. 
• Prereqs: — - • Current: DOING (2025-10-23) — RustFS migration underway. - 2. [TODO] ATTESTOR-VERIFY-11-202 — `/rekor/verify` + retrieval endpoints validating signatures and Merkle proofs. + • Current: DONE — mTLS-gated `POST /api/v1/rekor/entries` dedupes `bundleSha256`, coordinates dual-log submissions, archives DSSE/proof bundles when requested. + 2. [DONE 2025-10-19] ATTESTOR-VERIFY-11-202 — `/rekor/verify` + retrieval endpoints validating signatures and Merkle proofs. • Prereqs: — - • Current: TODO - 3. [TODO] ATTESTOR-OBS-11-203 — Telemetry, alerting, mTLS hardening, and archive workflow for Attestor. + • Current: DONE — verification pipeline validates DSSE signatures and Merkle proofs, returns cached entries with optional refresh paths. + 3. [DONE 2025-10-19] ATTESTOR-OBS-11-203 — Telemetry, alerting, mTLS hardening, and archive workflow for Attestor. • Prereqs: — - • Current: TODO + • Current: DONE — structured metrics/logs, mTLS thumbprint/SAN enforcement, and archive retention jobs integrated with alerting runbooks. - Team: Scanner Storage Guild - Path: `src/StellaOps.Scanner.Storage/TASKS.md` 1. [DONE 2025-10-23] SCANNER-STORAGE-11-401 — Migrate scanner artifact storage from MinIO to RustFS, including driver, configuration, and migration tooling. @@ -411,29 +411,32 @@ Generated from SPRINTS.md and module TASKS.md files on 2025-10-19. Waves cluster - **Sprint 12** · Runtime Guardrails - Team: Zastava Core Guild - Path: `src/StellaOps.Zastava.Core/TASKS.md` - 1. [TODO] ZASTAVA-CORE-12-201 — Define runtime event/admission DTOs, hashing helpers, and versioning strategy. + 1. [DONE 2025-10-23] ZASTAVA-CORE-12-201 — Define runtime event/admission DTOs, hashing helpers, and versioning strategy. • Prereqs: — - • Current: TODO - 2. [TODO] ZASTAVA-CORE-12-202 — Provide configuration/logging/metrics utilities shared by Observer/Webhook. 
+ • Current: DONE — runtime/admission envelopes canonically serialised, multihash helpers covered by new tests, architecture doc updated with negotiation rules. + 2. [DONE 2025-10-23] ZASTAVA-CORE-12-202 — Provide configuration/logging/metrics utilities shared by Observer/Webhook. • Prereqs: — - • Current: TODO - 3. [TODO] ZASTAVA-CORE-12-203 — Authority client helpers, OpTok caching, and security guardrails for runtime services. + • Current: DONE — `AddZastavaRuntimeCore` binds options, emits scoped logging/metrics, integration tests exercise DI wiring. + 3. [DONE 2025-10-23] ZASTAVA-CORE-12-203 — Authority client helpers, OpTok caching, and security guardrails for runtime services. • Prereqs: — - • Current: TODO - 4. [TODO] ZASTAVA-OPS-12-204 — Operational runbooks, alert rules, and dashboard exports for runtime plane. + • Current: DONE — Zastava authority token provider caches OpToks, enforces DPoP/mTLS guardrails, negative tests cover static fallback + incompat scopes. + 4. [DONE 2025-10-23] ZASTAVA-OPS-12-204 — Operational runbooks, alert rules, and dashboard exports for runtime plane. • Prereqs: — - • Current: TODO + • Current: DONE — new runtime runbook plus Prometheus/Grafana assets committed and referenced in docs/offline kit guidance. - Team: Zastava Webhook Guild - Path: `src/StellaOps.Zastava.Webhook/TASKS.md` - 1. [TODO] ZASTAVA-WEBHOOK-12-101 — Admission controller host with TLS bootstrap and Authority auth. + 1. [DONE 2025-10-24] ZASTAVA-WEBHOOK-12-101 — Admission controller host with TLS bootstrap and Authority auth. • Prereqs: — - • Current: TODO - 2. [TODO] ZASTAVA-WEBHOOK-12-102 — Query Scanner `/policy/runtime`, resolve digests, enforce verdicts. + • Current: DONE — host boots with deterministic TLS + shared runtime core, authority health checks in place, smoke coverage shipped. + 2. [DOING 2025-10-24] ZASTAVA-WEBHOOK-12-102 — Query Scanner `/policy/runtime`, resolve digests, enforce verdicts. • Prereqs: — - • Current: TODO - 3. 
[TODO] ZASTAVA-WEBHOOK-12-103 — Caching, fail-open/closed toggles, metrics/logging for admission decisions. + • Current: DOING — runtime policy client and telemetry landed; admission wiring + verdict enforcement pending. + 3. [DOING 2025-10-24] ZASTAVA-WEBHOOK-12-103 — Caching, fail-open/closed toggles, metrics/logging for admission decisions. • Prereqs: — - • Current: TODO + • Current: DOING — instrumentation scaffolding ready, awaiting decision pipeline implementation. + 4. [TODO] ZASTAVA-WEBHOOK-12-104 — Wire `/admission` endpoint to runtime policy client and emit allow/deny envelopes. + • Prereqs: ZASTAVA-WEBHOOK-12-102 + • Current: TODO — implement decision handler using new backend client, produce canonical AdmissionDecision envelopes. - **Sprint 13** · UX & CLI Experience - Team: DevEx/CLI - Path: `src/StellaOps.Cli/TASKS.md` @@ -590,9 +593,9 @@ Generated from SPRINTS.md and module TASKS.md files on 2025-10-19. Waves cluster - **Sprint 11** · UI Integration - Team: UI Guild - Path: `src/StellaOps.UI/TASKS.md` - 1. [TODO] UI-ATTEST-11-005 — Attestation visibility (Rekor id, status) on Scan Detail. + 1. [DONE 2025-10-23] UI-ATTEST-11-005 — Attestation visibility (Rekor id, status) on Scan Detail. • Prereqs: SIGNER-API-11-101 (Wave 0), ATTESTOR-API-11-201 (Wave 0) - • Current: TODO + • Current: DONE (2025-10-23) — Scan Detail route renders Rekor UUID/status via fixtures with verified/failure states covered by specs. - **Sprint 12** · Runtime Guardrails - Team: Scanner WebService Guild - Path: `src/StellaOps.Scanner.WebService/TASKS.md` @@ -604,9 +607,9 @@ Generated from SPRINTS.md and module TASKS.md files on 2025-10-19. Waves cluster 5. [TODO] SCANNER-RUNTIME-12-305 — Finalize shared fixtures and CI automation with Zastava + CLI teams for runtime APIs. - Team: Zastava Observer Guild - Path: `src/StellaOps.Zastava.Observer/TASKS.md` - 1. 
[TODO] ZASTAVA-OBS-12-001 — Build container lifecycle watcher that tails CRI (containerd/cri-o/docker) events and emits deterministic runtime records with buffering + backoff. + 1. [DOING 2025-10-24] ZASTAVA-OBS-12-001 — Build container lifecycle watcher that tails CRI (containerd/cri-o/docker) events and emits deterministic runtime records with buffering + backoff. • Prereqs: ZASTAVA-CORE-12-201 (Wave 0) - • Current: TODO + • Current: DOING — lifecycle watcher scaffolding and buffering design underway (2025-10-24) - **Sprint 13** · UX & CLI Experience - Team: DevEx/CLI, QA Guild - Path: `src/StellaOps.Cli/TASKS.md` @@ -629,7 +632,7 @@ Generated from SPRINTS.md and module TASKS.md files on 2025-10-19. Waves cluster 3. [TODO] UI-ADMIN-13-004 — Deliver admin area (tenants/clients/quotas/licensing) with RBAC + audit hooks. • Prereqs: AUTH-MTLS-11-002 (Wave 0) • Current: TODO - 4. [TODO] UI-AUTH-13-001 — Integrate Authority OIDC + DPoP flows with session management. + 4. [DONE 2025-10-23] UI-AUTH-13-001 — Integrate Authority OIDC + DPoP flows with session management. • Prereqs: AUTH-DPOP-11-001 (Wave 0), AUTH-MTLS-11-002 (Wave 0) - • Current: TODO + • Current: DONE (2025-10-23) — Authority OIDC + DPoP flows integrated with session management. 5. [TODO] UI-SCANS-13-002 — Build scans module (list/detail/SBOM/diff/attestation) with performance + accessibility targets. @@ -644,13 +647,16 @@ Generated from SPRINTS.md and module TASKS.md files on 2025-10-19. Waves cluster - **Sprint 13** · Platform Reliability - Team: DevOps Guild, Platform Leads - Path: `ops/devops/TASKS.md` - 1. [TODO] DEVOPS-NUGET-13-001 — Add .NET 10 preview feeds/local mirrors so `dotnet restore` succeeds offline; document updated NuGet bootstrap. + 1. [DOING 2025-10-24] DEVOPS-NUGET-13-001 — Add .NET 10 preview feeds/local mirrors so `dotnet restore` succeeds offline; document updated NuGet bootstrap. • Prereqs: DEVOPS-REL-14-001 (Wave 1) - • Current: TODO – Mirror preview packages into Offline Kit/allowlisted feeds, update NuGet.config mapping, and refresh restore documentation.
+ • Current: DOING – Mirror preview packages into Offline Kit/allowlisted feeds, update NuGet.config mapping, and refresh restore documentation. + 2. [TODO] DEVOPS-UI-13-006 — Add Playwright-based UI auth smoke job to CI/offline pipelines, wiring sample `/config.json` provisioning and reporting. + • Prereqs: UI-AUTH-13-001 (Wave 1), DEVOPS-REL-14-001 (Wave 1) + • Current: TODO – Extend release/offline pipelines to run `npm run test:e2e`, publish traces on failure, and ensure stub config assets ship alongside the UI bundle. - **Sprint 14** · Release & Offline Ops - Team: DevOps Guild - Path: `ops/devops/TASKS.md` - 1. [TODO] DEVOPS-REL-14-001 — Deterministic build/release pipeline with SBOM/provenance, signing, manifest generation. + 1. [DOING 2025-10-23] DEVOPS-REL-14-001 — Deterministic build/release pipeline with SBOM/provenance, signing, manifest generation. • Prereqs: SIGNER-API-11-101 (Wave 0), ATTESTOR-API-11-201 (Wave 0) - • Current: TODO + • Current: DOING (2025-10-23) — release workflow and build tooling landed; SBOM/provenance/signing automation in progress. - Team: Licensing Guild @@ -661,14 +667,14 @@ Generated from SPRINTS.md and module TASKS.md files on 2025-10-19. Waves cluster - **Sprint 15** · Notify Foundations - Team: Notify Engine Guild - Path: `src/StellaOps.Notify.Engine/TASKS.md` - 1. [TODO] NOTIFY-ENGINE-15-301 — Rules evaluation core: tenant/kind filters, severity/delta gates, VEX gating, throttling, idempotency key generation. + 1. [DOING 2025-10-24] NOTIFY-ENGINE-15-301 — Rules evaluation core: tenant/kind filters, severity/delta gates, VEX gating, throttling, idempotency key generation. • Prereqs: NOTIFY-MODELS-15-101 (Wave 0) - • Current: TODO + • Current: DOING (2025-10-24) - Team: Notify Queue Guild - Path: `src/StellaOps.Notify.Queue/TASKS.md` - 1. [TODO] NOTIFY-QUEUE-15-401 — Build queue abstraction + Redis Streams adapter with ack/claim APIs, idempotency tokens, serialization contracts.
• Prereqs: NOTIFY-MODELS-15-101 (Wave 0) - • Current: TODO + • Current: DONE — Redis transport, queue contracts, and integration tests delivered (2025-10-23). - **Sprint 16** · Scheduler Intelligence - Team: Scheduler ImpactIndex Guild @@ -799,12 +805,12 @@ Generated from SPRINTS.md and module TASKS.md files on 2025-10-19. Waves cluster • Current: TODO - Team: Notify Queue Guild - Path: `src/StellaOps.Notify.Queue/TASKS.md` - 1. [TODO] NOTIFY-QUEUE-15-403 — Delivery queue for channel actions with retry schedules, poison queues, and metrics instrumentation. + 1. [DONE 2025-10-23] NOTIFY-QUEUE-15-403 — Delivery queue for channel actions with retry schedules, poison queues, and metrics instrumentation. • Prereqs: NOTIFY-QUEUE-15-401 (Wave 1) - • Current: TODO - 2. [TODO] NOTIFY-QUEUE-15-402 — Add NATS JetStream adapter with configuration binding, health probes, failover. + • Current: DONE — delivery queue + retry/dead-letter pipeline shipped with integration tests and metrics (2025-10-23). + 2. [DONE 2025-10-23] NOTIFY-QUEUE-15-402 — Add NATS JetStream adapter with configuration binding, health probes, failover. • Prereqs: NOTIFY-QUEUE-15-401 (Wave 1) - • Current: TODO + • Current: DONE — JetStream transport, DI binding, health check, and integration tests delivered (2025-10-23). - Team: Notify WebService Guild - Path: `src/StellaOps.Notify.WebService/TASKS.md` 1. [TODO] NOTIFY-WEB-15-104 — Configuration binding for Mongo/queue/secrets; startup diagnostics. @@ -812,9 +818,9 @@ Generated from SPRINTS.md and module TASKS.md files on 2025-10-19. Waves cluster • Current: TODO - Team: Notify Worker Guild - Path: `src/StellaOps.Notify.Worker/TASKS.md` - 1. [TODO] NOTIFY-WORKER-15-201 — Implement bus subscription + leasing loop with correlation IDs, backoff, dead-letter handling (§1–§5). + 1. [DONE 2025-10-23] NOTIFY-WORKER-15-201 — Implement bus subscription + leasing loop with correlation IDs, backoff, dead-letter handling (§1–§5). 
• Prereqs: NOTIFY-QUEUE-15-401 (Wave 1) - • Current: TODO + • Current: DONE — worker leasing loop wired to queue adapters with retry/backoff telemetry (2025-10-23). 2. [TODO] NOTIFY-WORKER-15-202 — Wire rules evaluation pipeline (tenant scoping, filters, throttles, digests, idempotency) with deterministic decisions. • Prereqs: NOTIFY-ENGINE-15-301 (Wave 1) • Current: TODO diff --git a/NuGet.config b/NuGet.config index 9cfcc4e1..64dd811e 100644 --- a/NuGet.config +++ b/NuGet.config @@ -1,17 +1,32 @@ + + + - + - - - + + + + + + + + + + + + + + + diff --git a/SPRINTS.md b/SPRINTS.md index 853dd81b..928c9da3 100644 --- a/SPRINTS.md +++ b/SPRINTS.md @@ -2,37 +2,39 @@ This file describe implementation of Stella Ops (docs/README.md). Implementation | Sprint | Theme | Tasks File Path | Status | Type of Specialist | Task ID | Task Description | | --- | --- | --- | --- | --- | --- | --- | -| Sprint 11 | Signing Chain Bring-up | src/StellaOps.Attestor/TASKS.md | TODO | Attestor Guild | ATTESTOR-API-11-201 | `/rekor/entries` submission pipeline with dedupe, proof acquisition, and persistence. | -| Sprint 11 | Signing Chain Bring-up | src/StellaOps.Attestor/TASKS.md | TODO | Attestor Guild | ATTESTOR-VERIFY-11-202 | `/rekor/verify` + retrieval endpoints validating signatures and Merkle proofs. | -| Sprint 11 | Signing Chain Bring-up | src/StellaOps.Attestor/TASKS.md | TODO | Attestor Guild | ATTESTOR-OBS-11-203 | Telemetry, alerting, mTLS hardening, and archive workflow for Attestor. | +| Sprint 11 | Signing Chain Bring-up | src/StellaOps.Attestor/TASKS.md | DONE (2025-10-19) | Attestor Guild | ATTESTOR-API-11-201 | `/rekor/entries` submission pipeline with dedupe, proof acquisition, and persistence. | +| Sprint 11 | Signing Chain Bring-up | src/StellaOps.Attestor/TASKS.md | DONE (2025-10-19) | Attestor Guild | ATTESTOR-VERIFY-11-202 | `/rekor/verify` + retrieval endpoints validating signatures and Merkle proofs. 
| +| Sprint 11 | Signing Chain Bring-up | src/StellaOps.Attestor/TASKS.md | DONE (2025-10-19) | Attestor Guild | ATTESTOR-OBS-11-203 | Telemetry, alerting, mTLS hardening, and archive workflow for Attestor. | | Sprint 11 | Storage Platform Hardening | src/StellaOps.Scanner.Storage/TASKS.md | DONE (2025-10-23) | Scanner Storage Guild | SCANNER-STORAGE-11-401 | Migrate scanner object storage integration from MinIO to RustFS with data migration plan. | -| Sprint 11 | UI Integration | src/StellaOps.UI/TASKS.md | TODO | UI Guild | UI-ATTEST-11-005 | Attestation visibility (Rekor id, status) on Scan Detail. | -| Sprint 12 | Runtime Guardrails | src/StellaOps.Zastava.Core/TASKS.md | TODO | Zastava Core Guild | ZASTAVA-CORE-12-201 | Define runtime event/admission DTOs, hashing helpers, and versioning strategy. | -| Sprint 12 | Runtime Guardrails | src/StellaOps.Zastava.Core/TASKS.md | TODO | Zastava Core Guild | ZASTAVA-CORE-12-202 | Provide configuration/logging/metrics utilities shared by Observer/Webhook. | -| Sprint 12 | Runtime Guardrails | src/StellaOps.Zastava.Core/TASKS.md | TODO | Zastava Core Guild | ZASTAVA-CORE-12-203 | Authority client helpers, OpTok caching, and security guardrails for runtime services. | -| Sprint 12 | Runtime Guardrails | src/StellaOps.Zastava.Core/TASKS.md | TODO | Zastava Core Guild | ZASTAVA-OPS-12-204 | Operational runbooks, alert rules, and dashboard exports for runtime plane. | -| Sprint 12 | Runtime Guardrails | src/StellaOps.Zastava.Observer/TASKS.md | TODO | Zastava Observer Guild | ZASTAVA-OBS-12-001 | Container lifecycle watcher emitting deterministic runtime events with buffering. | +| Sprint 11 | UI Integration | src/StellaOps.UI/TASKS.md | DONE (2025-10-23) | UI Guild | UI-ATTEST-11-005 | Attestation visibility (Rekor id, status) on Scan Detail. 
| +| Sprint 12 | Runtime Guardrails | src/StellaOps.Zastava.Core/TASKS.md | DONE (2025-10-23) | Zastava Core Guild | ZASTAVA-CORE-12-201 | Define runtime event/admission DTOs, hashing helpers, and versioning strategy. | +| Sprint 12 | Runtime Guardrails | src/StellaOps.Zastava.Core/TASKS.md | DONE (2025-10-23) | Zastava Core Guild | ZASTAVA-CORE-12-202 | Provide configuration/logging/metrics utilities shared by Observer/Webhook. | +| Sprint 12 | Runtime Guardrails | src/StellaOps.Zastava.Core/TASKS.md | DONE (2025-10-23) | Zastava Core Guild | ZASTAVA-CORE-12-203 | Authority client helpers, OpTok caching, and security guardrails for runtime services. | +| Sprint 12 | Runtime Guardrails | src/StellaOps.Zastava.Core/TASKS.md | DONE (2025-10-23) | Zastava Core Guild | ZASTAVA-OPS-12-204 | Operational runbooks, alert rules, and dashboard exports for runtime plane. | +| Sprint 12 | Runtime Guardrails | src/StellaOps.Zastava.Observer/TASKS.md | DOING (2025-10-24) | Zastava Observer Guild | ZASTAVA-OBS-12-001 | Container lifecycle watcher emitting deterministic runtime events with buffering. | | Sprint 12 | Runtime Guardrails | src/StellaOps.Zastava.Observer/TASKS.md | TODO | Zastava Observer Guild | ZASTAVA-OBS-12-002 | Capture entrypoint traces + loaded libraries, hashing binaries and linking to baseline SBOM. | | Sprint 12 | Runtime Guardrails | src/StellaOps.Zastava.Observer/TASKS.md | TODO | Zastava Observer Guild | ZASTAVA-OBS-12-003 | Posture checks for signatures/SBOM/attestation with offline caching. | | Sprint 12 | Runtime Guardrails | src/StellaOps.Zastava.Observer/TASKS.md | TODO | Zastava Observer Guild | ZASTAVA-OBS-12-004 | Batch `/runtime/events` submissions with disk-backed buffer and rate limits. | -| Sprint 12 | Runtime Guardrails | src/StellaOps.Zastava.Webhook/TASKS.md | TODO | Zastava Webhook Guild | ZASTAVA-WEBHOOK-12-101 | Admission controller host with TLS bootstrap and Authority auth. 
| -| Sprint 12 | Runtime Guardrails | src/StellaOps.Zastava.Webhook/TASKS.md | TODO | Zastava Webhook Guild | ZASTAVA-WEBHOOK-12-102 | Query Scanner `/policy/runtime`, resolve digests, enforce verdicts. | -| Sprint 12 | Runtime Guardrails | src/StellaOps.Zastava.Webhook/TASKS.md | TODO | Zastava Webhook Guild | ZASTAVA-WEBHOOK-12-103 | Caching, fail-open/closed toggles, metrics/logging for admission decisions. | +| Sprint 12 | Runtime Guardrails | src/StellaOps.Zastava.Webhook/TASKS.md | DONE (2025-10-24) | Zastava Webhook Guild | ZASTAVA-WEBHOOK-12-101 | Admission controller host with TLS bootstrap and Authority auth. | +| Sprint 12 | Runtime Guardrails | src/StellaOps.Zastava.Webhook/TASKS.md | DOING (2025-10-24) | Zastava Webhook Guild | ZASTAVA-WEBHOOK-12-102 | Query Scanner `/policy/runtime`, resolve digests, enforce verdicts. | +| Sprint 12 | Runtime Guardrails | src/StellaOps.Zastava.Webhook/TASKS.md | DOING (2025-10-24) | Zastava Webhook Guild | ZASTAVA-WEBHOOK-12-103 | Caching, fail-open/closed toggles, metrics/logging for admission decisions. | +| Sprint 12 | Runtime Guardrails | src/StellaOps.Zastava.Webhook/TASKS.md | TODO | Zastava Webhook Guild | ZASTAVA-WEBHOOK-12-104 | Wire `/admission` endpoint to runtime policy client and emit allow/deny envelopes. | | Sprint 12 | Runtime Guardrails | src/StellaOps.Scanner.WebService/TASKS.md | DOING (2025-10-20) | Scanner WebService Guild | SCANNER-RUNTIME-12-302 | `/policy/runtime` endpoint joining SBOM baseline + policy verdict, returning admission guidance. | | Sprint 12 | Runtime Guardrails | src/StellaOps.Scanner.WebService/TASKS.md | TODO | Scanner WebService Guild | SCANNER-RUNTIME-12-303 | Align `/policy/runtime` verdicts with canonical policy evaluation (Feedser/Vexer). | | Sprint 12 | Runtime Guardrails | src/StellaOps.Scanner.WebService/TASKS.md | TODO | Scanner WebService Guild | SCANNER-RUNTIME-12-304 | Integrate attestation verification into runtime policy metadata. 
| | Sprint 12 | Runtime Guardrails | src/StellaOps.Scanner.WebService/TASKS.md | TODO | Scanner WebService Guild | SCANNER-RUNTIME-12-305 | Deliver shared fixtures + e2e validation with Zastava/CLI teams. | -| Sprint 13 | UX & CLI Experience | src/StellaOps.UI/TASKS.md | TODO | UI Guild | UI-AUTH-13-001 | Integrate Authority OIDC + DPoP flows with session management. | +| Sprint 13 | UX & CLI Experience | src/StellaOps.UI/TASKS.md | DONE (2025-10-23) | UI Guild | UI-AUTH-13-001 | Integrate Authority OIDC + DPoP flows with session management. | | Sprint 13 | UX & CLI Experience | src/StellaOps.UI/TASKS.md | TODO | UI Guild | UI-SCANS-13-002 | Build scans module (list/detail/SBOM/diff/attestation) with performance + accessibility targets. | | Sprint 13 | UX & CLI Experience | src/StellaOps.UI/TASKS.md | TODO | UI Guild | UI-VEX-13-003 | Implement VEX explorer + policy editor with preview integration. | | Sprint 13 | UX & CLI Experience | src/StellaOps.UI/TASKS.md | TODO | UI Guild | UI-ADMIN-13-004 | Deliver admin area (tenants/clients/quotas/licensing) with RBAC + audit hooks. | | Sprint 13 | UX & CLI Experience | src/StellaOps.UI/TASKS.md | TODO | UI Guild | UI-SCHED-13-005 | Scheduler panel: schedules CRUD, run history, dry-run preview. | | Sprint 13 | UX & CLI Experience | src/StellaOps.UI/TASKS.md | DOING (2025-10-19) | UI Guild | UI-NOTIFY-13-006 | Notify panel: channels/rules CRUD, deliveries view, test send. | | Sprint 13 | UX & CLI Experience | src/StellaOps.Cli/TASKS.md | TODO | DevEx/CLI | CLI-RUNTIME-13-005 | Add runtime policy test verbs that consume `/policy/runtime` and display verdicts. | -| Sprint 13 | Platform Reliability | ops/devops/TASKS.md | TODO | DevOps Guild, Platform Leads | DEVOPS-NUGET-13-001 | Wire up .NET 10 preview feeds/local mirrors so `dotnet restore` succeeds offline; document updated NuGet bootstrap. 
| +| Sprint 13 | Platform Reliability | ops/devops/TASKS.md | DOING (2025-10-24) | DevOps Guild, Platform Leads | DEVOPS-NUGET-13-001 | Wire up .NET 10 preview feeds/local mirrors so `dotnet restore` succeeds offline; document updated NuGet bootstrap. | | Sprint 13 | Platform Reliability | ops/devops/TASKS.md | TODO | DevOps Guild | DEVOPS-NUGET-13-002 | Ensure all solutions/projects prioritize `local-nuget` before public feeds and add restore-order validation. | | Sprint 13 | Platform Reliability | ops/devops/TASKS.md | TODO | DevOps Guild, Platform Leads | DEVOPS-NUGET-13-003 | Upgrade `Microsoft.*` dependencies pinned to 8.* to their latest .NET 10 (or 9.x) releases and refresh guidance. | -| Sprint 14 | Release & Offline Ops | ops/devops/TASKS.md | TODO | DevOps Guild | DEVOPS-REL-14-001 | Deterministic build/release pipeline with SBOM/provenance, signing, and manifest generation. | +| Sprint 13 | Platform Reliability | ops/devops/TASKS.md | TODO | DevOps Guild, UI Guild | DEVOPS-UI-13-006 | Add Playwright-based UI auth smoke job to CI/offline pipelines, wiring sample `/config.json` provisioning and reporting. | +| Sprint 14 | Release & Offline Ops | ops/devops/TASKS.md | DOING (2025-10-23) | DevOps Guild | DEVOPS-REL-14-001 | Deterministic build/release pipeline with SBOM/provenance, signing, and manifest generation. | | Sprint 14 | Release & Offline Ops | ops/devops/TASKS.md | TODO | DevOps Guild, Scanner Guild | DEVOPS-REL-14-004 | Extend release/offline smoke jobs to cover Python analyzer plug-ins (warm/cold, determinism, signing). | | Sprint 14 | Release & Offline Ops | ops/offline-kit/TASKS.md | TODO | Offline Kit Guild | DEVOPS-OFFLINE-14-002 | Offline kit packaging workflow with integrity verification and documentation. | | Sprint 14 | Release & Offline Ops | ops/deployment/TASKS.md | TODO | Deployment Guild | DEVOPS-OPS-14-003 | Deployment/update/rollback automation and channel management documentation. 
| @@ -43,17 +45,17 @@ This file describe implementation of Stella Ops (docs/README.md). Implementation | Sprint 15 | Notify Foundations | src/StellaOps.Notify.Storage.Mongo/TASKS.md | TODO | Notify Storage Guild | NOTIFY-STORAGE-15-201 | Mongo schemas/indexes for rules, channels, deliveries, digests, locks, audit. | | Sprint 15 | Notify Foundations | src/StellaOps.Notify.Storage.Mongo/TASKS.md | TODO | Notify Storage Guild | NOTIFY-STORAGE-15-202 | Repositories with tenant scoping, soft delete, TTL, causal consistency options. | | Sprint 15 | Notify Foundations | src/StellaOps.Notify.Storage.Mongo/TASKS.md | TODO | Notify Storage Guild | NOTIFY-STORAGE-15-203 | Delivery history retention and query APIs. | -| Sprint 15 | Notify Foundations | src/StellaOps.Notify.Queue/TASKS.md | TODO | Notify Queue Guild | NOTIFY-QUEUE-15-401 | Bus abstraction + Redis Streams adapter with ordering/idempotency. | -| Sprint 15 | Notify Foundations | src/StellaOps.Notify.Queue/TASKS.md | TODO | Notify Queue Guild | NOTIFY-QUEUE-15-402 | NATS JetStream adapter with health probes and failover. | -| Sprint 15 | Notify Foundations | src/StellaOps.Notify.Queue/TASKS.md | TODO | Notify Queue Guild | NOTIFY-QUEUE-15-403 | Delivery queue with retry/dead-letter + metrics. | -| Sprint 15 | Notify Foundations | src/StellaOps.Notify.Engine/TASKS.md | TODO | Notify Engine Guild | NOTIFY-ENGINE-15-301 | Rules evaluation core (filters, throttles, idempotency). | +| Sprint 15 | Notify Foundations | src/StellaOps.Notify.Queue/TASKS.md | DONE (2025-10-23) | Notify Queue Guild | NOTIFY-QUEUE-15-401 | Bus abstraction + Redis Streams adapter with ordering/idempotency. | +| Sprint 15 | Notify Foundations | src/StellaOps.Notify.Queue/TASKS.md | DONE (2025-10-23) | Notify Queue Guild | NOTIFY-QUEUE-15-402 | NATS JetStream adapter with health probes and failover. 
| +| Sprint 15 | Notify Foundations | src/StellaOps.Notify.Queue/TASKS.md | DONE (2025-10-23) | Notify Queue Guild | NOTIFY-QUEUE-15-403 | Delivery queue with retry/dead-letter + metrics. | +| Sprint 15 | Notify Foundations | src/StellaOps.Notify.Engine/TASKS.md | DOING (2025-10-24) | Notify Engine Guild | NOTIFY-ENGINE-15-301 | Rules evaluation core (filters, throttles, idempotency). | | Sprint 15 | Notify Foundations | src/StellaOps.Notify.Engine/TASKS.md | TODO | Notify Engine Guild | NOTIFY-ENGINE-15-302 | Action planner + digest coalescer. | | Sprint 15 | Notify Foundations | src/StellaOps.Notify.Engine/TASKS.md | TODO | Notify Engine Guild | NOTIFY-ENGINE-15-303 | Template rendering engine (Slack/Teams/Email/Webhook). | | Sprint 15 | Notify Foundations | src/StellaOps.Notify.Engine/TASKS.md | TODO | Notify Engine Guild | NOTIFY-ENGINE-15-304 | Test-send sandbox + preview utilities. | | Sprint 15 | Notify Foundations | src/StellaOps.Notify.WebService/TASKS.md | TODO | Notify WebService Guild | NOTIFY-WEB-15-101 | Minimal API host with Authority enforcement and plug-in loading. | | Sprint 15 | Notify Foundations | src/StellaOps.Notify.WebService/TASKS.md | TODO | Notify WebService Guild | NOTIFY-WEB-15-102 | Rules/channel/template CRUD with audit logging. | | Sprint 15 | Notify Foundations | src/StellaOps.Notify.WebService/TASKS.md | TODO | Notify WebService Guild | NOTIFY-WEB-15-104 | Configuration binding + startup diagnostics. | -| Sprint 15 | Notify Foundations | src/StellaOps.Notify.Worker/TASKS.md | TODO | Notify Worker Guild | NOTIFY-WORKER-15-201 | Bus subscription + leasing loop with backoff. | +| Sprint 15 | Notify Foundations | src/StellaOps.Notify.Worker/TASKS.md | DONE (2025-10-23) | Notify Worker Guild | NOTIFY-WORKER-15-201 | Bus subscription + leasing loop with backoff. | | Sprint 15 | Notify Foundations | src/StellaOps.Notify.Worker/TASKS.md | TODO | Notify Worker Guild | NOTIFY-WORKER-15-202 | Rules evaluation pipeline integration. 
| | Sprint 15 | Notify Foundations | src/StellaOps.Notify.Worker/TASKS.md | TODO | Notify Worker Guild | NOTIFY-WORKER-15-203 | Channel dispatch orchestration with retries. | | Sprint 15 | Notify Foundations | src/StellaOps.Notify.Worker/TASKS.md | TODO | Notify Worker Guild | NOTIFY-WORKER-15-204 | Metrics/telemetry for Notify workers. | diff --git a/docs/13_RELEASE_ENGINEERING_PLAYBOOK.md b/docs/13_RELEASE_ENGINEERING_PLAYBOOK.md index 2586f655..5ba34c27 100755 --- a/docs/13_RELEASE_ENGINEERING_PLAYBOOK.md +++ b/docs/13_RELEASE_ENGINEERING_PLAYBOOK.md @@ -57,11 +57,16 @@ graph LR | **Publish** | Push to `registry.git.stella-ops.org`. | | **E2E** | Kind‑based Kubernetes test incl. Zastava DaemonSet; verify sub‑5 s scan SLA. | | **Notify** | Report to Mattermost & GitLab Slack app. | -| **OfflineToken** | Call `JwtIssuer.Generate(exp=30d)` → store `client.jwt` artefact → attach to OUK build context | - -*All stages run in parallel where possible; max wall‑time < 15 min.* - ---- +| **OfflineToken** | Call `JwtIssuer.Generate(exp=30d)` → store `client.jwt` artefact → attach to OUK build context | + +*All stages run in parallel where possible; max wall‑time < 15 min.* + +**Implementation note.** `.gitea/workflows/release.yml` executes +`ops/devops/release/build_release.py` to build multi-arch images, attach +CycloneDX SBOMs and SLSA provenance with Cosign, and emit +`out/release/release.yaml` for downstream packaging (Helm, Compose, Offline Kit). 
+ +--- ## 3 Container Image Strategy diff --git a/docs/ARCHITECTURE_ZASTAVA.md b/docs/ARCHITECTURE_ZASTAVA.md index 419a008b..392d173b 100644 --- a/docs/ARCHITECTURE_ZASTAVA.md +++ b/docs/ARCHITECTURE_ZASTAVA.md @@ -115,12 +115,18 @@ stellaops/zastava-agent # System service; watch Docker events; observer on ], "decision": "Allow|Deny", "ttlSeconds": 300 -} -``` - ---- - -## 3) Observer — node agent (DaemonSet) +} +``` + +### 2.3 Schema negotiation & hashing guarantees + +* Every payload is wrapped in an envelope with `schemaVersion` set to `"@v."`. Version negotiation keeps the **major** line in lockstep (`zastava.runtime.event@v1.x`, `zastava.admission.decision@v1.x`) and selects the highest mutually supported **minor**. If no overlap exists, the local default (`@v1.0`) is used. +* Components use the shared `ZastavaContractVersions` helper for parsing/negotiation and the canonical JSON serializer to guarantee identical byte sequences prior to hashing, ensuring multihash IDs such as `sha256-` are reproducible across observers, webhooks, and backend jobs. +* Schema evolution rules: backwards-compatible fields append to the end of the canonical property order; breaking changes bump the **major** and require dual-writer/reader rollout per deployment playbook. + +--- + +## 3) Observer — node agent (DaemonSet) ### 3.1 Responsibilities @@ -210,11 +216,13 @@ sequenceDiagram * If unknown/missing, schedule **delta scan** and return `202 Accepted`. * Emits **derived signals** (usedByEntrypoint per component based on `/proc//maps`). -### 5.2 Policy decision API (for webhook) - -`POST /api/v1/scanner/policy/runtime` - -Request: +### 5.2 Policy decision API (for webhook) + +`POST /api/v1/scanner/policy/runtime` + +The webhook reuses the shared runtime stack (`AddZastavaRuntimeCore` + `IZastavaAuthorityTokenProvider`) so OpTok caching, DPoP enforcement, and telemetry behave identically to the observer plane. 
+ +Request: ```json { @@ -253,23 +261,44 @@ Response: ```yaml zastava: - mode: - observer: true - webhook: true - authority: - issuer: "https://authority.internal" - aud: ["scanner","zastava"] # tokens for backend and self-id - backend: - url: "https://scanner-web.internal" - connectTimeoutMs: 500 - requestTimeoutMs: 1500 - retry: { attempts: 3, backoffMs: 200 } - runtime: - engine: "auto" # containerd|cri-o|docker|auto - procfs: "/host/proc" - collect: - entryTrace: true - loadedLibs: true + mode: + observer: true + webhook: true + backend: + baseAddress: "https://scanner-web.internal" + policyPath: "/api/v1/scanner/policy/runtime" + requestTimeoutSeconds: 5 + allowInsecureHttp: false + runtime: + authority: + issuer: "https://authority.internal" + clientId: "zastava-observer" + audience: ["scanner","zastava"] + scopes: + - "api:scanner.runtime.write" + refreshSkewSeconds: 120 + requireDpop: true + requireMutualTls: true + allowStaticTokenFallback: false + staticTokenPath: null # Optional bootstrap secret + tenant: "tenant-01" + environment: "prod" + deployment: "cluster-a" + logging: + includeScopes: true + includeActivityTracking: true + staticScope: + plane: "runtime" + metrics: + meterName: "StellaOps.Zastava" + meterVersion: "1.0.0" + commonTags: + cluster: "prod-cluster" + engine: "auto" # containerd|cri-o|docker|auto + procfs: "/host/proc" + collect: + entryTrace: true + loadedLibs: true maxLibs: 256 maxHashBytesPerContainer: 64_000_000 maxDepth: 48 @@ -286,45 +315,49 @@ zastava: eventsPerSecond: 50 burst: 200 perNodeQueue: 10_000 - security: - mounts: - containerdSock: "/run/containerd/containerd.sock:ro" - proc: "/proc:/host/proc:ro" - runtimeState: "/var/lib/containerd:ro" -``` - ---- + security: + mounts: + containerdSock: "/run/containerd/containerd.sock:ro" + proc: "/proc:/host/proc:ro" + runtimeState: "/var/lib/containerd:ro" +``` + +> Implementation note: both `zastava-observer` and `zastava-webhook` call 
`services.AddZastavaRuntimeCore(configuration, "")` during start-up to bind the `zastava:runtime` section, enforce validation, and register canonical log scopes + meters. + +--- ## 7) Security posture * **AuthN/Z**: Authority OpToks (DPoP preferred) to backend; webhook does **not** require client auth from API server (K8s handles). -* **Least privileges**: read‑only host mounts; optional `CAP_SYS_PTRACE`; **no** host networking; **no** write mounts. -* **Isolation**: never exec untrusted code; nsenter only to **read** `/proc/`. -* **Data minimization**: do not exfiltrate env vars or command arguments unless policy explicitly enables diagnostic mode. -* **Rate limiting**: per‑node caps; per‑tenant caps at backend. -* **Hard caps**: bytes hashed, files inspected, depth of shell parsing. +* **Least privileges**: read‑only host mounts; optional `CAP_SYS_PTRACE`; **no** host networking; **no** write mounts. +* **Isolation**: never exec untrusted code; nsenter only to **read** `/proc/`. +* **Data minimization**: do not exfiltrate env vars or command arguments unless policy explicitly enables diagnostic mode. +* **Rate limiting**: per‑node caps; per‑tenant caps at backend. +* **Hard caps**: bytes hashed, files inspected, depth of shell parsing. +* **Authority guardrails**: `AddZastavaRuntimeCore` binds `zastava.runtime.authority` and refuses tokens without `aud:` scope; optional knobs (`requireDpop`, `requireMutualTls`, `allowStaticTokenFallback`) emit structured warnings when relaxed. --- ## 8) Metrics, logs, tracing -**Observer** - -* `zastava.events_emitted_total{kind}` -* `zastava.proc_maps_samples_total{result}` -* `zastava.entrytrace_depth{p99}` -* `zastava.hash_bytes_total` -* `zastava.buffer_drops_total` - -**Webhook** - -* `zastava.admission_requests_total{decision}` -* `zastava.admission_latency_seconds` -* `zastava.cache_hits_total` -* `zastava.backend_failures_total` - -**Logs** (structured): node, pod, image digest, decision, reasons. 
-**Tracing**: spans for observe→batch→post; webhook request→resolve→respond. +**Observer** + +* `zastava.runtime.events.total{kind}` +* `zastava.runtime.backend.latency.ms{endpoint="events"}` +* `zastava.proc_maps.samples.total{result}` +* `zastava.entrytrace.depth{p99}` +* `zastava.hash.bytes.total` +* `zastava.buffer.drops.total` + +**Webhook** + +* `zastava.admission.decisions.total{decision}` +* `zastava.runtime.backend.latency.ms{endpoint="policy"}` +* `zastava.admission.cache.hits.total` +* `zastava.backend.failures.total` + +**Logs** (structured): node, pod, image digest, decision, reasons. +**Tracing**: spans for observe→batch→post; webhook request→resolve→respond. --- diff --git a/docs/README.md b/docs/README.md index f16d683e..712ddf15 100755 --- a/docs/README.md +++ b/docs/README.md @@ -82,6 +82,7 @@ Everything here is open‑source and versioned — when you check out a git ta - **31 – [Concelier MSRC Connector – AAD Onboarding](ops/concelier-msrc-operations.md)** - **32 – [Scanner Analyzer Bench Operations](ops/scanner-analyzers-operations.md)** - **33 – [Scanner Artifact Store Migration](ops/scanner-rustfs-migration.md)** +- **34 – [Zastava Runtime Operations Runbook](ops/zastava-runtime-operations.md)** ### Legal & licence - **32 – [Legal & Quota FAQ](29_LEGAL_FAQ_QUOTA.md)** diff --git a/docs/ops/ui-auth-smoke.md b/docs/ops/ui-auth-smoke.md new file mode 100644 index 00000000..fdfe1d17 --- /dev/null +++ b/docs/ops/ui-auth-smoke.md @@ -0,0 +1,32 @@ +# UI Auth Smoke Job (Playwright) + +The DevOps Guild tracks **DEVOPS-UI-13-006** to wire the new Playwright auth +smoke checks into CI and the Offline Kit pipeline. These tests exercise the +Angular UI login flow against a stubbed Authority instance to verify that +`/config.json` is discovered, DPoP proofs are minted, and error handling is +surfaced when the backend rejects a request. + +## What the job does + +1. Builds the UI bundle (or consumes the artifact from the release pipeline). +2. 
Copies the environment stub from `src/config/config.sample.json` into the + runtime directory as `config.json` so the UI can bootstrap without a live + gateway. +3. Runs `npm run test:e2e`, which launches Playwright with the auth fixtures + under `tests/e2e/auth.spec.ts`: + - Validates that the Sign-in button generates an Authorization Code + PKCE + redirect to `https://authority.local/connect/authorize`. + - Confirms the callback view shows an actionable error when the redirect is + missing the pending login state. +4. Publishes JUnit + Playwright traces (retain-on-failure) for troubleshooting. + +## Pipeline integration notes + +- Chromium must already be available (`npx playwright install --with-deps`). +- Set `PLAYWRIGHT_BASE_URL` if the UI serves on a non-default host/port. +- For Offline Kit packaging, bundle the Playwright browser cache under + `.cache/ms-playwright/` so the job runs without network access. +- Failures should block release promotion; export the traces to the artifacts + tab for debugging. + +Refer to `ops/devops/TASKS.md` (DEVOPS-UI-13-006) for progress and ownership. 
diff --git a/docs/ops/zastava-runtime-grafana-dashboard.json b/docs/ops/zastava-runtime-grafana-dashboard.json new file mode 100644 index 00000000..2132b2c5 --- /dev/null +++ b/docs/ops/zastava-runtime-grafana-dashboard.json @@ -0,0 +1,205 @@ +{ + "title": "Zastava Runtime Plane", + "uid": "zastava-runtime", + "timezone": "utc", + "schemaVersion": 38, + "version": 1, + "refresh": "30s", + "time": { + "from": "now-6h", + "to": "now" + }, + "panels": [ + { + "id": 1, + "type": "timeseries", + "title": "Observer Event Rate", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "expr": "sum by (tenant,component,kind) (rate(zastava_runtime_events_total{tenant=~\"$tenant\"}[5m]))", + "legendFormat": "{{tenant}}/{{component}}/{{kind}}" + } + ], + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "fieldConfig": { + "defaults": { + "unit": "1/s", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + } + }, + "overrides": [] + }, + "options": { + "legend": { + "showLegend": true, + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 2, + "type": "timeseries", + "title": "Admission Decisions", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "expr": "sum by (decision) (rate(zastava_admission_decisions_total{tenant=~\"$tenant\"}[5m]))", + "legendFormat": "{{decision}}" + } + ], + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "fieldConfig": { + "defaults": { + "unit": "1/s", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 20 + } + ] + } + }, + "overrides": [] + }, + "options": { + "legend": { + "showLegend": true, + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 3, + "type": "timeseries", + "title": "Backend Latency P95", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": 
[ + { + "expr": "histogram_quantile(0.95, sum by (le) (rate(zastava_runtime_backend_latency_ms_bucket{tenant=~\"$tenant\"}[5m])))", + "legendFormat": "p95 latency" + } + ], + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "fieldConfig": { + "defaults": { + "unit": "ms", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "orange", + "value": 500 + }, + { + "color": "red", + "value": 750 + } + ] + } + }, + "overrides": [] + }, + "options": { + "legend": { + "showLegend": true, + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + } + } + ], + "templating": { + "list": [ + { + "name": "datasource", + "type": "datasource", + "query": "prometheus", + "label": "Prometheus", + "current": { + "text": "Prometheus", + "value": "Prometheus" + } + }, + { + "name": "tenant", + "type": "query", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(zastava_runtime_events_total, tenant)", + "refresh": 1, + "hide": 0, + "current": { + "text": ".*", + "value": ".*" + }, + "regex": "", + "includeAll": true, + "multi": true, + "sort": 1 + } + ] + }, + "annotations": { + "list": [ + { + "name": "Deployments", + "type": "tags", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "enable": true, + "iconColor": "rgba(255, 96, 96, 1)" + } + ] + } +} diff --git a/docs/ops/zastava-runtime-operations.md b/docs/ops/zastava-runtime-operations.md new file mode 100644 index 00000000..04702893 --- /dev/null +++ b/docs/ops/zastava-runtime-operations.md @@ -0,0 +1,131 @@ +# Zastava Runtime Operations Runbook + +This runbook covers the runtime plane (Observer DaemonSet + Admission Webhook). +It aligns with `Sprint 12 – Runtime Guardrails` and assumes components consume +`StellaOps.Zastava.Core` (`AddZastavaRuntimeCore(...)`). + +## 1. 
Prerequisites + +- **Authority client credentials** – service principal `zastava-runtime` with scopes + `aud:scanner` and `api:scanner.runtime.write`. Provision DPoP keys and mTLS client + certs before rollout. +- **Scanner/WebService reachability** – cluster DNS entry (e.g. `scanner.internal`) + resolvable from every node running Observer/Webhook. +- **Host mounts** – read-only access to `/proc`, container runtime state + (`/var/lib/containerd`, `/var/run/containerd/containerd.sock`) and scratch space + (`/var/run/zastava`). +- **Offline kit bundle** – operators staging air-gapped installs must download + `offline-kit/zastava-runtime-{version}.tar.zst` containing container images, + Grafana dashboards, and Prometheus rules referenced below. +- **Secrets** – Authority OpTok cache dir, DPoP private keys, and webhook TLS secrets + live outside git. For air-gapped installs copy them to the sealed secrets vault. + +### 1.1 Telemetry quick reference + +| Metric | Description | Notes | +|--------|-------------|-------| +| `zastava.runtime.events.total{tenant,component,kind}` | Rate of observer events sent to Scanner | Expect >0 on busy nodes. | +| `zastava.runtime.backend.latency.ms` | Histogram (ms) for `/runtime/events` and `/policy/runtime` calls | P95 & P99 drive alerting. | +| `zastava.admission.decisions.total{decision}` | Admission verdict counts | Track deny spikes or fail-open fallbacks. | +| `zastava.admission.cache.hits.total` | (future) Cache utilisation once Observer batches land | Placeholder until Observer tasks 12-004 complete. | + +## 2. Deployment workflows + +### 2.1 Fresh install (Helm overlay) + +1. Load offline kit bundle: `oras cp offline-kit/zastava-runtime-*.tar.zst oci:registry.internal/zastava`. +2. Render values: + - `zastava.runtime.tenant`, `environment`, `deployment` (cluster identifier). + - `zastava.runtime.authority` block (issuer, clientId, audience, DPoP toggle). + - `zastava.runtime.metrics.commonTags.cluster` for Prometheus labels. 
+3. Pre-create secrets: + - `zastava-authority-dpop` (JWK + private key). + - `zastava-authority-mtls` (client cert/key chain). + - `zastava-webhook-tls` (serving cert; CSR bundle if using auto-approval). +4. Deploy Observer DaemonSet and Webhook chart: + ```sh + helm upgrade --install zastava-runtime deploy/helm/zastava \ + -f values/zastava-runtime.yaml \ + --namespace stellaops \ + --create-namespace + ``` +5. Verify: + - `kubectl -n stellaops get pods -l app=zastava-observer` ready. + - `kubectl -n stellaops logs ds/zastava-observer --tail=20` shows + `Issued runtime OpTok` audit line with DPoP token type. + - Admission webhook registered: `kubectl get validatingwebhookconfiguration zastava-webhook`. + +### 2.2 Upgrades + +1. Scale webhook deployment to `--replicas=3` (rolling). +2. Drain one node per AZ to ensure Observer tolerates disruption. +3. Apply chart upgrade; watch `zastava.runtime.backend.latency.ms` P95 (<250 ms). +4. Post-upgrade, run smoke tests: + - Apply unsigned Pod manifest → expect `deny` (policy fail). + - Apply signed Pod manifest → expect `allow`. +5. Record upgrade in ops log with Git SHA + Helm chart version. + +### 2.3 Rollback + +1. Use Helm revision history: `helm history zastava-runtime`. +2. Rollback: `helm rollback zastava-runtime `. +3. Invalidate cached OpToks: + ```sh + kubectl -n stellaops exec deploy/zastava-webhook -- \ + zastava-webhook invalidate-op-token --audience scanner + ``` +4. Confirm observers reconnect via metrics (`rate(zastava_runtime_events_total[5m])`). + +## 3. Authority & security guardrails + +- Tokens must be `DPoP` type when `requireDpop=true`. Logs emit + `authority.token.issue` scope with decision data; absence indicates misconfig. +- `requireMutualTls=true` enforces mTLS during token acquisition. Disable only in + lab clusters; expect warning log `Mutual TLS requirement disabled`. +- Static fallback tokens (`allowStaticTokenFallback=true`) should exist only during + initial bootstrap. 
Rotate nightly; preference is to disable once Authority reachable. +- Audit every change in `zastava.runtime.authority` through change management. + Use `kubectl get secret zastava-authority-dpop -o jsonpath='{.metadata.annotations.revision}'` + to confirm key rotation. + +## 4. Incident response + +### 4.1 Authority offline + +1. Check Prometheus alert `ZastavaAuthorityTokenStale`. +2. Inspect Observer logs for `authority.token.fallback` scope. +3. If fallback engaged, verify static token validity duration; rotate secret if older than 24 h. +4. Once Authority restored, delete static fallback secret and restart pods to rebind DPoP keys. + +### 4.2 Scanner/WebService latency spike + +1. Alert `ZastavaRuntimeBackendLatencyHigh` fires at P95 > 750 ms for 5 minutes. +2. Run backend health: `kubectl -n scanner exec deploy/scanner-web -- curl -f localhost:8080/healthz/ready`. +3. If backend degraded, auto buffer may throttle. Confirm disk-backed queue size via + `kubectl logs ds/zastava-observer | grep buffer.drops`. +4. Consider enabling fail-open for namespaces listed in runbook Appendix B (temporary). + +### 4.3 Admission deny storm + +1. Alert `ZastavaAdmissionDenySpike` indicates >20 denies/minute. +2. Pull sample: `kubectl logs deploy/zastava-webhook --since=10m | jq '.decision'`. +3. Cross-check policy backlog in Scanner (`/policy/runtime` logs). Engage application + owner; optionally set namespace to `failOpenNamespaces` after risk assessment. + +## 5. Offline kit & air-gapped notes + +- Bundle contents: + - Observer/Webhook container images (multi-arch). + - `docs/ops/zastava-runtime-prometheus-rules.yaml` + Grafana dashboard JSON. + - Sample `zastava-runtime.values.yaml`. +- Verification: + - Validate signature: `cosign verify-blob offline-kit/zastava-runtime-*.tar.zst --certificate offline-kit/zastava-runtime.cert`. + - Extract Prometheus rules into offline monitoring cluster (`/etc/prometheus/rules.d`). 
+ - Import Grafana dashboard via `grafana-cli --config ...`. + +## 6. Observability assets + +- Prometheus alert rules: `docs/ops/zastava-runtime-prometheus-rules.yaml`. +- Grafana dashboard JSON: `docs/ops/zastava-runtime-grafana-dashboard.json`. +- Add both to the monitoring repo (`ops/monitoring/zastava`) and reference them in + the Offline Kit manifest. diff --git a/docs/ops/zastava-runtime-prometheus-rules.yaml b/docs/ops/zastava-runtime-prometheus-rules.yaml new file mode 100644 index 00000000..aefdfeb6 --- /dev/null +++ b/docs/ops/zastava-runtime-prometheus-rules.yaml @@ -0,0 +1,31 @@ +groups: + - name: zastava-runtime + interval: 30s + rules: + - alert: ZastavaRuntimeEventsSilent + expr: sum(rate(zastava_runtime_events_total[10m])) == 0 + for: 15m + labels: + severity: warning + service: zastava-runtime + annotations: + summary: "Observer events stalled" + description: "No runtime events emitted in the last 15 minutes. Check observer DaemonSet health and container runtime mounts." + - alert: ZastavaRuntimeBackendLatencyHigh + expr: histogram_quantile(0.95, sum by (le) (rate(zastava_runtime_backend_latency_ms_bucket[5m]))) > 750 + for: 10m + labels: + severity: critical + service: zastava-runtime + annotations: + summary: "Runtime backend latency p95 above 750 ms" + description: "Latency to Scanner runtime APIs is elevated. Inspect Scanner.WebService readiness, Authority OpTok issuance, and cluster network." + - alert: ZastavaAdmissionDenySpike + expr: sum(rate(zastava_admission_decisions_total{decision="deny"}[5m])) * 60 > 20 + for: 5m + labels: + severity: warning + service: zastava-runtime + annotations: + summary: "Admission webhook denies exceeding threshold" + description: "Webhook is denying more than 20 pod admissions per minute. Confirm policy verdicts and consider fail-open exception for impacted namespaces." 
diff --git a/local-nuget/Microsoft.AspNetCore.Authentication.JwtBearer.10.0.0-rc.1.25451.107.nupkg b/local-nuget/Microsoft.AspNetCore.Authentication.JwtBearer.10.0.0-rc.1.25451.107.nupkg new file mode 100644 index 00000000..a5e173d6 Binary files /dev/null and b/local-nuget/Microsoft.AspNetCore.Authentication.JwtBearer.10.0.0-rc.1.25451.107.nupkg differ diff --git a/local-nuget/Microsoft.Data.Sqlite.9.0.0-rc.1.24451.1.nupkg b/local-nuget/Microsoft.Data.Sqlite.9.0.0-rc.1.24451.1.nupkg new file mode 100644 index 00000000..fbbc65a3 Binary files /dev/null and b/local-nuget/Microsoft.Data.Sqlite.9.0.0-rc.1.24451.1.nupkg differ diff --git a/local-nuget/Microsoft.Extensions.Caching.Memory.10.0.0-preview.7.25380.108.nupkg b/local-nuget/Microsoft.Extensions.Caching.Memory.10.0.0-preview.7.25380.108.nupkg new file mode 100644 index 00000000..5bdcc609 Binary files /dev/null and b/local-nuget/Microsoft.Extensions.Caching.Memory.10.0.0-preview.7.25380.108.nupkg differ diff --git a/local-nuget/Microsoft.Extensions.Configuration.10.0.0-preview.7.25380.108.nupkg b/local-nuget/Microsoft.Extensions.Configuration.10.0.0-preview.7.25380.108.nupkg new file mode 100644 index 00000000..b1046b3c Binary files /dev/null and b/local-nuget/Microsoft.Extensions.Configuration.10.0.0-preview.7.25380.108.nupkg differ diff --git a/local-nuget/Microsoft.Extensions.Configuration.Binder.10.0.0-preview.7.25380.108.nupkg b/local-nuget/Microsoft.Extensions.Configuration.Binder.10.0.0-preview.7.25380.108.nupkg new file mode 100644 index 00000000..cd71ff22 Binary files /dev/null and b/local-nuget/Microsoft.Extensions.Configuration.Binder.10.0.0-preview.7.25380.108.nupkg differ diff --git a/local-nuget/Microsoft.Extensions.DependencyInjection.Abstractions.10.0.0-preview.7.25380.108.nupkg b/local-nuget/Microsoft.Extensions.DependencyInjection.Abstractions.10.0.0-preview.7.25380.108.nupkg new file mode 100644 index 00000000..1970b132 Binary files /dev/null and 
b/local-nuget/Microsoft.Extensions.DependencyInjection.Abstractions.10.0.0-preview.7.25380.108.nupkg differ diff --git a/local-nuget/Microsoft.Extensions.DependencyInjection.Abstractions.9.0.0.nupkg b/local-nuget/Microsoft.Extensions.DependencyInjection.Abstractions.9.0.0.nupkg new file mode 100644 index 00000000..f73275a5 Binary files /dev/null and b/local-nuget/Microsoft.Extensions.DependencyInjection.Abstractions.9.0.0.nupkg differ diff --git a/local-nuget/Microsoft.Extensions.Hosting.10.0.0-preview.7.25380.108.nupkg b/local-nuget/Microsoft.Extensions.Hosting.10.0.0-preview.7.25380.108.nupkg new file mode 100644 index 00000000..79dfc0f7 Binary files /dev/null and b/local-nuget/Microsoft.Extensions.Hosting.10.0.0-preview.7.25380.108.nupkg differ diff --git a/local-nuget/Microsoft.Extensions.Hosting.Abstractions.10.0.0-preview.7.25380.108.nupkg b/local-nuget/Microsoft.Extensions.Hosting.Abstractions.10.0.0-preview.7.25380.108.nupkg new file mode 100644 index 00000000..8777d237 Binary files /dev/null and b/local-nuget/Microsoft.Extensions.Hosting.Abstractions.10.0.0-preview.7.25380.108.nupkg differ diff --git a/local-nuget/Microsoft.Extensions.Http.10.0.0-preview.7.25380.108.nupkg b/local-nuget/Microsoft.Extensions.Http.10.0.0-preview.7.25380.108.nupkg new file mode 100644 index 00000000..eb263857 Binary files /dev/null and b/local-nuget/Microsoft.Extensions.Http.10.0.0-preview.7.25380.108.nupkg differ diff --git a/local-nuget/Microsoft.Extensions.Logging.Abstractions.10.0.0-preview.7.25380.108.nupkg b/local-nuget/Microsoft.Extensions.Logging.Abstractions.10.0.0-preview.7.25380.108.nupkg new file mode 100644 index 00000000..69bcb5e1 Binary files /dev/null and b/local-nuget/Microsoft.Extensions.Logging.Abstractions.10.0.0-preview.7.25380.108.nupkg differ diff --git a/local-nuget/Microsoft.Extensions.Logging.Abstractions.9.0.0.nupkg b/local-nuget/Microsoft.Extensions.Logging.Abstractions.9.0.0.nupkg new file mode 100644 index 00000000..8915684a Binary files /dev/null 
and b/local-nuget/Microsoft.Extensions.Logging.Abstractions.9.0.0.nupkg differ diff --git a/local-nuget/Microsoft.Extensions.Options.10.0.0-preview.7.25380.108.nupkg b/local-nuget/Microsoft.Extensions.Options.10.0.0-preview.7.25380.108.nupkg new file mode 100644 index 00000000..46524699 Binary files /dev/null and b/local-nuget/Microsoft.Extensions.Options.10.0.0-preview.7.25380.108.nupkg differ diff --git a/local-nuget/Microsoft.Extensions.Options.9.0.0.nupkg b/local-nuget/Microsoft.Extensions.Options.9.0.0.nupkg new file mode 100644 index 00000000..749bdf0b Binary files /dev/null and b/local-nuget/Microsoft.Extensions.Options.9.0.0.nupkg differ diff --git a/local-nuget/Microsoft.Extensions.Options.ConfigurationExtensions.9.0.0.nupkg b/local-nuget/Microsoft.Extensions.Options.ConfigurationExtensions.9.0.0.nupkg new file mode 100644 index 00000000..e48da262 Binary files /dev/null and b/local-nuget/Microsoft.Extensions.Options.ConfigurationExtensions.9.0.0.nupkg differ diff --git a/ops/devops/README.md b/ops/devops/README.md new file mode 100644 index 00000000..1525b8c5 --- /dev/null +++ b/ops/devops/README.md @@ -0,0 +1,41 @@ +# DevOps Release Automation + +The **release** workflow builds and signs the StellaOps service containers, +generates SBOM + provenance attestations, and emits a canonical +`release.yaml`. The logic lives under `ops/devops/release/` and is invoked +by the new `.gitea/workflows/release.yml` pipeline. + +## Local dry run + +```bash +./ops/devops/release/build_release.py \ + --version 2025.10.0-edge \ + --channel edge \ + --dry-run +``` + +Outputs land under `out/release/`. Use `--no-push` to run full builds without +pushing to the registry. + +## Required tooling + +- Docker 25+ with Buildx +- .NET 10 preview SDK (builds container stages and the SBOM generator) +- Node.js 20 (Angular UI build) +- Helm 3.16+ +- Cosign 2.2+ + +Supply signing material via environment variables: + +- `COSIGN_KEY_REF` – e.g. 
`file:./keys/cosign.key` or `azurekms://…` +- `COSIGN_PASSWORD` – password protecting the above key + +The workflow defaults to multi-arch (`linux/amd64,linux/arm64`), SBOM in +CycloneDX, and SLSA provenance (`https://slsa.dev/provenance/v1`). + +## UI auth smoke (Playwright) + +As part of **DEVOPS-UI-13-006** the pipelines will execute the UI auth smoke +tests (`npm run test:e2e`) after building the Angular bundle. See +`docs/ops/ui-auth-smoke.md` for the job design, environment stubs, and +offline runner considerations. diff --git a/ops/devops/TASKS.md b/ops/devops/TASKS.md index 5a7a072e..61e5db7e 100644 --- a/ops/devops/TASKS.md +++ b/ops/devops/TASKS.md @@ -7,7 +7,7 @@ | DEVOPS-SCANNER-09-205 | DONE (2025-10-21) | DevOps Guild, Notify Guild | DEVOPS-SCANNER-09-204 | Add Notify smoke stage that tails the Redis stream and asserts `scanner.report.ready`/`scanner.scan.completed` reach Notify WebService in staging. | CI job reads Redis stream during scanner smoke deploy, confirms Notify ingestion via API, alerts on failure. | | DEVOPS-PERF-10-001 | DONE | DevOps Guild | BENCH-SCANNER-10-001 | Add perf smoke job (SBOM compose <5 s target) to CI. | CI job runs sample build verifying <5 s; alerts configured. | | DEVOPS-PERF-10-002 | DONE (2025-10-23) | DevOps Guild | BENCH-SCANNER-10-002 | Publish analyzer bench metrics to Grafana/perf workbook and alarm on ≥20 % regressions. | CI exports JSON for dashboards; Grafana panel wired; Ops on-call doc updated with alert hook. | -| DEVOPS-REL-14-001 | TODO | DevOps Guild | SIGNER-API-11-101, ATTESTOR-API-11-201 | Deterministic build/release pipeline with SBOM/provenance, signing, manifest generation. | CI pipeline produces signed images + SBOM/attestations, manifests published with verified hashes, docs updated. | +| DEVOPS-REL-14-001 | DOING (2025-10-23) | DevOps Guild | SIGNER-API-11-101, ATTESTOR-API-11-201 | Deterministic build/release pipeline with SBOM/provenance, signing, manifest generation. 
| CI pipeline produces signed images + SBOM/attestations, manifests published with verified hashes, docs updated. | | DEVOPS-REL-14-004 | TODO | DevOps Guild, Scanner Guild | DEVOPS-REL-14-001, SCANNER-ANALYZERS-LANG-10-309P | Extend release/offline smoke jobs to exercise the Python analyzer plug-in (warm/cold scans, determinism, signature checks). | Release/Offline pipelines run Python analyzer smoke suite; alerts hooked; docs updated with new coverage matrix. | | DEVOPS-REL-17-002 | TODO | DevOps Guild | DEVOPS-REL-14-001, SCANNER-EMIT-17-701 | Persist stripped-debug artifacts organised by GNU build-id and bundle them into release/offline kits with checksum manifests. | CI job writes `.debug` files under `artifacts/debug/.build-id/`, manifest + checksums published, offline kit includes cache, smoke job proves symbol lookup via build-id. | | DEVOPS-MIRROR-08-001 | DONE (2025-10-19) | DevOps Guild | DEVOPS-REL-14-001 | Stand up managed mirror profiles for `*.stella-ops.org` (Concelier/Excititor), including Helm/Compose overlays, multi-tenant secrets, CDN caching, and sync documentation. | Infra overlays committed, CI smoke deploy hits mirror endpoints, runbooks published for downstream sync and quota management. | @@ -15,8 +15,9 @@ | DEVOPS-LAUNCH-18-100 | TODO | DevOps Guild | - | Finalise production environment footprint (clusters, secrets, network overlays) for full-platform go-live. | IaC/compose overlays committed, secrets placeholders documented, dry-run deploy succeeds in staging. | | DEVOPS-LAUNCH-18-900 | TODO | DevOps Guild, Module Leads | Wave 0 completion | Collect “full implementation” sign-off from module owners and consolidate launch readiness checklist. | Sign-off record stored under `docs/ops/launch-readiness.md`; outstanding gaps triaged; checklist approved. | | DEVOPS-LAUNCH-18-001 | TODO | DevOps Guild | DEVOPS-LAUNCH-18-100, DEVOPS-LAUNCH-18-900 | Production launch cutover rehearsal and runbook publication. 
| `docs/ops/launch-cutover.md` drafted, rehearsal executed with rollback drill, approvals captured. | -| DEVOPS-NUGET-13-001 | TODO | DevOps Guild, Platform Leads | DEVOPS-REL-14-001 | Add .NET 10 preview feeds / local mirrors so `Microsoft.Extensions.*` 10.0 preview packages restore offline; refresh restore docs. | NuGet.config maps preview feeds (or local mirrored packages), `dotnet restore` succeeds for Excititor/Concelier solutions without ad-hoc feed edits, docs updated for offline bootstrap. | +| DEVOPS-NUGET-13-001 | DOING (2025-10-24) | DevOps Guild, Platform Leads | DEVOPS-REL-14-001 | Add .NET 10 preview feeds / local mirrors so `Microsoft.Extensions.*` 10.0 preview packages restore offline; refresh restore docs. | NuGet.config maps preview feeds (or local mirrored packages), `dotnet restore` succeeds for Excititor/Concelier solutions without ad-hoc feed edits, docs updated for offline bootstrap. | | DEVOPS-NUGET-13-002 | TODO | DevOps Guild | DEVOPS-NUGET-13-001 | Ensure all solutions/projects prefer `local-nuget` before public sources and document restore order validation. | `NuGet.config` and solution-level configs resolve from `local-nuget` first; automated check verifies priority; docs updated for restore ordering. | | DEVOPS-NUGET-13-003 | TODO | DevOps Guild, Platform Leads | DEVOPS-NUGET-13-002 | Sweep `Microsoft.*` NuGet dependencies pinned to 8.* and upgrade to latest .NET 10 equivalents (or .NET 9 when 10 unavailable), updating restore guidance. | Dependency audit shows no 8.* `Microsoft.*` packages remaining; CI builds green; changelog/doc sections capture upgrade rationale. | +| DEVOPS-UI-13-006 | TODO | DevOps Guild, UI Guild | UI-AUTH-13-001 | Add Playwright-based UI auth smoke job to CI/offline pipelines, wiring sample `/config.json` provisioning and reporting. 
| CI + Offline Kit run Playwright auth smoke (headless Chromium) post-build; job reuses stub config artifact, exports junit + trace on failure, docs updated under `docs/ops/ui-auth-smoke.md`. | > Remark (2025-10-20): Repacked `Mongo2Go` local feed to require MongoDB.Driver 3.5.0 + SharpCompress 0.41.0; cache regression tests green and NU1902/NU1903 suppressed. > Remark (2025-10-21): Compose/Helm profiles now surface `SCANNER__EVENTS__*` toggles with docs pointing at new `.env` placeholders. diff --git a/ops/devops/nuget-preview-packages.csv b/ops/devops/nuget-preview-packages.csv new file mode 100644 index 00000000..274cc8b6 --- /dev/null +++ b/ops/devops/nuget-preview-packages.csv @@ -0,0 +1,16 @@ +# Package,Version,SHA256 +Microsoft.Extensions.Caching.Memory,10.0.0-preview.7.25380.108,8721fd1420fea6e828963c8343cd83605902b663385e8c9060098374139f9b2f +Microsoft.Extensions.Configuration,10.0.0-preview.7.25380.108,5a17ba4ba47f920a04ae51d80560833da82a0926d1e462af0d11c16b5da969f4 +Microsoft.Extensions.Configuration.Binder,10.0.0-preview.7.25380.108,5a3af17729241e205fe8fbb1d458470e9603935ab2eb67cbbb06ce51265ff68f +Microsoft.Extensions.DependencyInjection.Abstractions,10.0.0-preview.7.25380.108,1e9cd330d7833a3a850a7a42bbe0c729906c60bf1c359ad30a8622b50da4399b +Microsoft.Extensions.Hosting,10.0.0-preview.7.25380.108,3123bb019bbc0182cf7ac27f30018ca620929f8027e137bd5bdfb952037c7d29 +Microsoft.Extensions.Hosting.Abstractions,10.0.0-preview.7.25380.108,b57625436c9eb53e3aa27445b680bb93285d0d2c91007bbc221b0c378ab016a3 +Microsoft.Extensions.Http,10.0.0-preview.7.25380.108,daec142b7c7bd09ec1f2a86bfc3d7fe009825f5b653d310bc9e959c0a98a0f19 +Microsoft.Extensions.Logging.Abstractions,10.0.0-preview.7.25380.108,87a495fa0b7054e134a5cf44ec8b071fe2bc3ddfb27e9aefc6375701dca2a33a +Microsoft.Extensions.Options,10.0.0-preview.7.25380.108,c0657c2be3b7b894024586cf6e46a2ebc0e710db64d2645c4655b893b8487d8a 
+Microsoft.Extensions.DependencyInjection.Abstractions,9.0.0,0a7715c24299e42b081b63b4f8e33da97b985e1de9e941b2b9e4c748b0d52fe7 +Microsoft.Extensions.Logging.Abstractions,9.0.0,8814ecf6dc2359715e111b78084ae42087282595358eb775456088f15e63eca5 +Microsoft.Extensions.Options,9.0.0,0d3e5eb80418fc8b41e4b3c8f16229e839ddd254af0513f7e6f1643970baf1c9 +Microsoft.Extensions.Options.ConfigurationExtensions,9.0.0,af5677b04552223787d942a3f8a323f3a85aafaf20ff3c9b4aaa128c44817280 +Microsoft.Data.Sqlite,9.0.0-rc.1.24451.1,770b637317e1e924f1b13587b31af0787c8c668b1d9f53f2fccae8ee8704e167 +Microsoft.AspNetCore.Authentication.JwtBearer,10.0.0-rc.1.25451.107,05f168c2db7ba79230e3fd77e84f6912bc73721c6656494df0b227867a6c2d3c diff --git a/ops/devops/release/build_release.py b/ops/devops/release/build_release.py new file mode 100644 index 00000000..868944af --- /dev/null +++ b/ops/devops/release/build_release.py @@ -0,0 +1,630 @@ +#!/usr/bin/env python3 +"""Deterministic release pipeline helper for StellaOps. + +This script builds service containers, generates SBOM and provenance artefacts, +signs them with cosign, and writes a channel-specific release manifest. 
+ +The workflow expects external tooling to be available on PATH: +- docker (with buildx) +- cosign +- helm +- npm / node (for the UI build) +- dotnet SDK (for BuildX plugin publication) +""" +from __future__ import annotations + +import argparse +import datetime as dt +import hashlib +import json +import os +import pathlib +import re +import shlex +import subprocess +import sys +import tempfile +from collections import OrderedDict +from typing import Any, Dict, Iterable, List, Mapping, MutableMapping, Optional, Sequence + +REPO_ROOT = pathlib.Path(__file__).resolve().parents[3] +DEFAULT_CONFIG = REPO_ROOT / "ops/devops/release/components.json" + +class CommandError(RuntimeError): + pass + +def run(cmd: Sequence[str], *, cwd: Optional[pathlib.Path] = None, env: Optional[Mapping[str, str]] = None, capture: bool = True) -> str: + """Run a subprocess command, returning stdout (text).""" + process_env = os.environ.copy() + if env: + process_env.update(env) + result = subprocess.run( + list(cmd), + cwd=str(cwd) if cwd else None, + env=process_env, + check=False, + capture_output=capture, + text=True, + ) + if process_env.get("STELLAOPS_RELEASE_DEBUG"): + sys.stderr.write(f"[debug] {' '.join(shlex.quote(c) for c in cmd)}\n") + if capture: + sys.stderr.write(result.stdout) + sys.stderr.write(result.stderr) + if result.returncode != 0: + stdout = result.stdout if capture else "" + stderr = result.stderr if capture else "" + raise CommandError(f"Command failed ({result.returncode}): {' '.join(cmd)}\nSTDOUT:\n{stdout}\nSTDERR:\n{stderr}") + + return result.stdout if capture else "" + + +def load_json_config(path: pathlib.Path) -> Dict[str, Any]: + with path.open("r", encoding="utf-8") as handle: + return json.load(handle) + + +def ensure_directory(path: pathlib.Path) -> pathlib.Path: + path.mkdir(parents=True, exist_ok=True) + return path + + +def compute_sha256(path: pathlib.Path) -> str: + sha = hashlib.sha256() + with path.open("rb") as handle: + for chunk in iter(lambda: 
handle.read(1024 * 1024), b""): + sha.update(chunk) + return sha.hexdigest() + + +def format_scalar(value: Any) -> str: + if isinstance(value, bool): + return "true" if value else "false" + if value is None: + return "null" + if isinstance(value, (int, float)): + return str(value) + text = str(value) + if text == "": + return '""' + if re.search(r"[\s:#\-\[\]\{\}]", text): + return json.dumps(text, ensure_ascii=False) + return text + + +def _yaml_lines(value: Any, indent: int = 0) -> List[str]: + pad = " " * indent + if isinstance(value, Mapping): + lines: List[str] = [] + for key, val in value.items(): + if isinstance(val, (Mapping, list)): + lines.append(f"{pad}{key}:") + lines.extend(_yaml_lines(val, indent + 1)) + else: + lines.append(f"{pad}{key}: {format_scalar(val)}") + if not lines: + lines.append(f"{pad}{{}}") + return lines + if isinstance(value, list): + lines = [] + if not value: + lines.append(f"{pad}[]") + return lines + for item in value: + if isinstance(item, (Mapping, list)): + lines.append(f"{pad}-") + lines.extend(_yaml_lines(item, indent + 1)) + else: + lines.append(f"{pad}- {format_scalar(item)}") + return lines + return [f"{pad}{format_scalar(value)}"] + + +def dump_yaml(data: Mapping[str, Any]) -> str: + lines: List[str] = _yaml_lines(data) + return "\n".join(lines) + "\n" + + +def utc_now_iso() -> str: + return dt.datetime.now(tz=dt.timezone.utc).replace(microsecond=0).isoformat().replace("+00:00", "Z") + + +def sanitize_calendar(version: str, explicit: Optional[str]) -> str: + if explicit: + return explicit + # Expect version like 2025.10.0-edge or 2.4.1 + parts = re.findall(r"\d+", version) + if len(parts) >= 2: + return f"{parts[0]}.{parts[1]}" + return dt.datetime.now(tz=dt.timezone.utc).strftime("%Y.%m") + + +class ReleaseBuilder: + def __init__( + self, + *, + repo_root: pathlib.Path, + config: Mapping[str, Any], + version: str, + channel: str, + calendar: str, + release_date: str, + git_sha: str, + output_dir: pathlib.Path, + push: 
bool, + dry_run: bool, + registry_override: Optional[str] = None, + platforms_override: Optional[Sequence[str]] = None, + skip_signing: bool = False, + cosign_key_ref: Optional[str] = None, + cosign_password: Optional[str] = None, + cosign_identity_token: Optional[str] = None, + tlog_upload: bool = True, + ) -> None: + self.repo_root = repo_root + self.config = config + self.version = version + self.channel = channel + self.calendar = calendar + self.release_date = release_date + self.git_sha = git_sha + self.output_dir = ensure_directory(output_dir) + self.push = push + self.dry_run = dry_run + self.registry = registry_override or config.get("registry") + if not self.registry: + raise ValueError("Config missing 'registry'") + platforms = list(platforms_override) if platforms_override else config.get("platforms") + if not platforms: + platforms = ["linux/amd64", "linux/arm64"] + self.platforms = list(platforms) + self.source_date_epoch = str(int(dt.datetime.fromisoformat(release_date.replace("Z", "+00:00")).timestamp())) + self.artifacts_dir = ensure_directory(self.output_dir / "artifacts") + self.sboms_dir = ensure_directory(self.artifacts_dir / "sboms") + self.provenance_dir = ensure_directory(self.artifacts_dir / "provenance") + self.signature_dir = ensure_directory(self.artifacts_dir / "signatures") + self.metadata_dir = ensure_directory(self.artifacts_dir / "metadata") + self.temp_dir = pathlib.Path(tempfile.mkdtemp(prefix="stellaops-release-")) + self.skip_signing = skip_signing + self.tlog_upload = tlog_upload + self.cosign_key_ref = cosign_key_ref or os.environ.get("COSIGN_KEY_REF") + self.cosign_identity_token = cosign_identity_token or os.environ.get("COSIGN_IDENTITY_TOKEN") + password = cosign_password if cosign_password is not None else os.environ.get("COSIGN_PASSWORD", "") + self.cosign_env = { + "COSIGN_PASSWORD": password, + "COSIGN_EXPERIMENTAL": "1", + "COSIGN_ALLOW_HTTP_REGISTRY": os.environ.get("COSIGN_ALLOW_HTTP_REGISTRY", "1"), + 
"COSIGN_DOCKER_MEDIA_TYPES": os.environ.get("COSIGN_DOCKER_MEDIA_TYPES", "1"), + } + + # ---------------- + # Build steps + # ---------------- + def run(self) -> Dict[str, Any]: + components_result = [] + if self.dry_run: + print("⚠️ Dry-run enabled; commands will be skipped") + self._prime_buildx_plugin() + for component in self.config.get("components", []): + result = self._build_component(component) + components_result.append(result) + helm_meta = self._package_helm() + compose_meta = self._digest_compose_files() + manifest = self._compose_manifest(components_result, helm_meta, compose_meta) + return manifest + + def _prime_buildx_plugin(self) -> None: + plugin_cfg = self.config.get("buildxPlugin") + if not plugin_cfg: + return + project = plugin_cfg.get("project") + if not project: + return + out_dir = ensure_directory(self.temp_dir / "buildx") + if not self.dry_run: + run([ + "dotnet", + "publish", + project, + "-c", + "Release", + "-o", + str(out_dir), + ]) + cas_dir = ensure_directory(self.temp_dir / "cas") + run([ + "dotnet", + str(out_dir / "StellaOps.Scanner.Sbomer.BuildXPlugin.dll"), + "handshake", + "--manifest", + str(out_dir), + "--cas", + str(cas_dir), + ]) + + def _component_tags(self, repo: str) -> List[str]: + base = f"{self.registry}/{repo}" + tags = [f"{base}:{self.version}"] + if self.channel: + tags.append(f"{base}:{self.channel}") + return tags + + def _component_ref(self, repo: str, digest: str) -> str: + return f"{self.registry}/{repo}@{digest}" + + def _build_component(self, component: Mapping[str, Any]) -> Mapping[str, Any]: + name = component["name"] + repo = component.get("repository", name) + kind = component.get("kind", "dotnet-service") + dockerfile = component.get("dockerfile") + if not dockerfile: + raise ValueError(f"Component {name} missing dockerfile") + context = component.get("context", ".") + iid_file = self.temp_dir / f"{name}.iid" + metadata_file = self.metadata_dir / f"{name}.metadata.json" + + build_args = { + "VERSION": 
self.version, + "CHANNEL": self.channel, + "GIT_SHA": self.git_sha, + "SOURCE_DATE_EPOCH": self.source_date_epoch, + } + docker_cfg = self.config.get("docker", {}) + if kind == "dotnet-service": + build_args.update({ + "PROJECT": component["project"], + "ENTRYPOINT_DLL": component["entrypoint"], + "SDK_IMAGE": docker_cfg.get("sdkImage", "mcr.microsoft.com/dotnet/nightly/sdk:10.0"), + "RUNTIME_IMAGE": docker_cfg.get("runtimeImage", "gcr.io/distroless/dotnet/aspnet:latest"), + }) + elif kind == "angular-ui": + build_args.update({ + "NODE_IMAGE": docker_cfg.get("nodeImage", "node:20.14.0-bookworm"), + "NGINX_IMAGE": docker_cfg.get("nginxImage", "nginx:1.27-alpine"), + }) + else: + raise ValueError(f"Unsupported component kind {kind}") + + tags = self._component_tags(repo) + build_cmd = [ + "docker", + "buildx", + "build", + "--file", + dockerfile, + "--metadata-file", + str(metadata_file), + "--iidfile", + str(iid_file), + "--progress", + "plain", + "--platform", + ",".join(self.platforms), + ] + for key, value in build_args.items(): + build_cmd.extend(["--build-arg", f"{key}={value}"]) + for tag in tags: + build_cmd.extend(["--tag", tag]) + build_cmd.extend([ + "--attest", + "type=sbom", + "--attest", + "type=provenance,mode=max", + ]) + if self.push: + build_cmd.append("--push") + else: + build_cmd.append("--load") + build_cmd.append(context) + + if not self.dry_run: + run(build_cmd, cwd=self.repo_root) + + digest = iid_file.read_text(encoding="utf-8").strip() if iid_file.exists() else "" + image_ref = self._component_ref(repo, digest) if digest else "" + + bundle_info = self._sign_image(name, image_ref, tags) + sbom_info = self._generate_sbom(name, image_ref) + provenance_info = self._attach_provenance(name, image_ref) + + component_entry = OrderedDict() + component_entry["name"] = name + if digest: + component_entry["image"] = image_ref + component_entry["tags"] = tags + if sbom_info: + component_entry["sbom"] = sbom_info + if provenance_info: + 
component_entry["provenance"] = provenance_info + if bundle_info: + component_entry["signature"] = bundle_info + if metadata_file.exists(): + component_entry["metadata"] = str(metadata_file.relative_to(self.output_dir.parent)) if metadata_file.is_relative_to(self.output_dir.parent) else str(metadata_file) + return component_entry + + def _sign_image(self, name: str, image_ref: str, tags: Sequence[str]) -> Optional[Mapping[str, Any]]: + if self.skip_signing: + return None + if not image_ref: + return None + if not (self.cosign_key_ref or self.cosign_identity_token): + raise ValueError("Signing requested but no cosign key or identity token provided. Use --skip-signing to bypass.") + signature_path = self.signature_dir / f"{name}.signature" + cmd = ["cosign", "sign", "--yes"] + if self.cosign_key_ref: + cmd.extend(["--key", self.cosign_key_ref]) + if self.cosign_identity_token: + cmd.extend(["--identity-token", self.cosign_identity_token]) + if not self.tlog_upload: + cmd.append("--tlog-upload=false") + cmd.append("--allow-http-registry") + cmd.append(image_ref) + if self.dry_run: + return None + run(cmd, env=self.cosign_env) + signature_data = run([ + "cosign", + "download", + "signature", + "--allow-http-registry", + image_ref, + ]) + signature_path.write_text(signature_data, encoding="utf-8") + signature_ref = run([ + "cosign", + "triangulate", + "--allow-http-registry", + image_ref, + ]).strip() + return OrderedDict( + ( + ("signature", OrderedDict(( + ("path", str(signature_path.relative_to(self.output_dir.parent)) if signature_path.is_relative_to(self.output_dir.parent) else str(signature_path)), + ("ref", signature_ref), + ("tlogUploaded", self.tlog_upload), + ))), + ) + ) + + def _generate_sbom(self, name: str, image_ref: str) -> Optional[Mapping[str, Any]]: + if not image_ref or self.dry_run: + return None + sbom_path = self.sboms_dir / f"{name}.cyclonedx.json" + run([ + "docker", + "sbom", + image_ref, + "--format", + "cyclonedx-json", + "--output", + 
str(sbom_path), + ]) + entry = OrderedDict(( + ("path", str(sbom_path.relative_to(self.output_dir.parent)) if sbom_path.is_relative_to(self.output_dir.parent) else str(sbom_path)), + ("sha256", compute_sha256(sbom_path)), + )) + if self.skip_signing: + return entry + attach_cmd = [ + "cosign", + "attach", + "sbom", + "--sbom", + str(sbom_path), + "--type", + "cyclonedx", + ] + if self.cosign_key_ref: + attach_cmd.extend(["--key", self.cosign_key_ref]) + attach_cmd.append("--allow-http-registry") + attach_cmd.append(image_ref) + run(attach_cmd, env=self.cosign_env) + reference = run(["cosign", "triangulate", "--type", "sbom", "--allow-http-registry", image_ref]).strip() + entry["ref"] = reference + return entry + + def _attach_provenance(self, name: str, image_ref: str) -> Optional[Mapping[str, Any]]: + if not image_ref or self.dry_run: + return None + predicate = OrderedDict() + predicate["buildDefinition"] = OrderedDict( + ( + ("buildType", "https://git.stella-ops.org/stellaops/release"), + ("externalParameters", OrderedDict(( + ("component", name), + ("version", self.version), + ("channel", self.channel), + ))), + ) + ) + predicate["runDetails"] = OrderedDict( + ( + ("builder", OrderedDict((("id", "https://github.com/actions"),))), + ("metadata", OrderedDict((("finishedOn", self.release_date),))), + ) + ) + predicate_path = self.provenance_dir / f"{name}.provenance.json" + with predicate_path.open("w", encoding="utf-8") as handle: + json.dump(predicate, handle, indent=2, sort_keys=True) + handle.write("\n") + entry = OrderedDict(( + ("path", str(predicate_path.relative_to(self.output_dir.parent)) if predicate_path.is_relative_to(self.output_dir.parent) else str(predicate_path)), + ("sha256", compute_sha256(predicate_path)), + )) + if self.skip_signing: + return entry + cmd = [ + "cosign", + "attest", + "--predicate", + str(predicate_path), + "--type", + "https://slsa.dev/provenance/v1", + ] + if self.cosign_key_ref: + cmd.extend(["--key", self.cosign_key_ref]) + 
if not self.tlog_upload: + cmd.append("--tlog-upload=false") + cmd.append("--allow-http-registry") + cmd.append(image_ref) + run(cmd, env=self.cosign_env) + ref = run([ + "cosign", + "triangulate", + "--type", + "https://slsa.dev/provenance/v1", + "--allow-http-registry", + image_ref, + ]).strip() + entry["ref"] = ref + return entry + + # ---------------- + # Helm + compose + # ---------------- + def _package_helm(self) -> Optional[Mapping[str, Any]]: + helm_cfg = self.config.get("helm") + if not helm_cfg: + return None + chart_path = helm_cfg.get("chartPath") + if not chart_path: + return None + chart_dir = self.repo_root / chart_path + output_dir = ensure_directory(self.output_dir / "helm") + archive_path = output_dir / f"stellaops-{self.version}.tgz" + if not self.dry_run: + cmd = [ + "helm", + "package", + str(chart_dir), + "--destination", + str(output_dir), + "--version", + self.version, + "--app-version", + self.version, + ] + run(cmd) + packaged = next(output_dir.glob("*.tgz"), None) + if packaged and packaged != archive_path: + packaged.rename(archive_path) + digest = compute_sha256(archive_path) if archive_path.exists() else None + if archive_path.exists() and archive_path.is_relative_to(self.output_dir): + manifest_path = str(archive_path.relative_to(self.output_dir)) + elif archive_path.exists() and archive_path.is_relative_to(self.output_dir.parent): + manifest_path = str(archive_path.relative_to(self.output_dir.parent)) + else: + manifest_path = f"helm/{archive_path.name}" + return OrderedDict(( + ("name", "stellaops"), + ("version", self.version), + ("path", manifest_path), + ("sha256", digest), + )) + + def _digest_compose_files(self) -> List[Mapping[str, Any]]: + compose_cfg = self.config.get("compose", {}) + files = compose_cfg.get("files", []) + entries: List[Mapping[str, Any]] = [] + for rel_path in files: + src = self.repo_root / rel_path + if not src.exists(): + continue + digest = compute_sha256(src) + entries.append(OrderedDict(( + ("name", 
pathlib.Path(rel_path).name), + ("path", rel_path), + ("sha256", digest), + ))) + return entries + + # ---------------- + # Manifest assembly + # ---------------- + def _compose_manifest( + self, + components: List[Mapping[str, Any]], + helm_meta: Optional[Mapping[str, Any]], + compose_meta: List[Mapping[str, Any]], + ) -> Dict[str, Any]: + manifest = OrderedDict() + manifest["release"] = OrderedDict(( + ("version", self.version), + ("channel", self.channel), + ("date", self.release_date), + ("calendar", self.calendar), + )) + manifest["components"] = components + if helm_meta: + manifest["charts"] = [helm_meta] + if compose_meta: + manifest["compose"] = compose_meta + return manifest + + +def parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Build StellaOps release artefacts deterministically") + parser.add_argument("--config", type=pathlib.Path, default=DEFAULT_CONFIG, help="Path to release config JSON") + parser.add_argument("--version", required=True, help="Release version string (e.g. 2025.10.0-edge)") + parser.add_argument("--channel", required=True, help="Release channel (edge|stable|lts)") + parser.add_argument("--calendar", help="Calendar tag (YYYY.MM); defaults derived from version") + parser.add_argument("--git-sha", default=os.environ.get("GIT_COMMIT", "unknown"), help="Git revision to embed") + parser.add_argument("--output", type=pathlib.Path, default=REPO_ROOT / "out/release", help="Output directory for artefacts") + parser.add_argument("--no-push", action="store_true", help="Do not push images (use docker load)") + parser.add_argument("--dry-run", action="store_true", help="Print steps without executing commands") + parser.add_argument("--registry", help="Override registry root (e.g. 
localhost:5000/stellaops)") + parser.add_argument("--platform", dest="platforms", action="append", metavar="PLATFORM", help="Override build platforms (repeatable)") + parser.add_argument("--skip-signing", action="store_true", help="Skip cosign signing/attestation steps") + parser.add_argument("--cosign-key", dest="cosign_key", help="Override COSIGN_KEY_REF value") + parser.add_argument("--cosign-password", dest="cosign_password", help="Password for cosign key") + parser.add_argument("--cosign-identity-token", dest="cosign_identity_token", help="Identity token for keyless cosign flows") + parser.add_argument("--no-transparency", action="store_true", help="Disable Rekor transparency log upload during signing") + return parser.parse_args(argv) + + +def write_manifest(manifest: Mapping[str, Any], output_dir: pathlib.Path) -> pathlib.Path: + # Copy manifest to avoid mutating input when computing checksum + base_manifest = OrderedDict(manifest) + yaml_without_checksum = dump_yaml(base_manifest) + digest = hashlib.sha256(yaml_without_checksum.encode("utf-8")).hexdigest() + manifest_with_checksum = OrderedDict(base_manifest) + manifest_with_checksum["checksums"] = OrderedDict((("sha256", digest),)) + final_yaml = dump_yaml(manifest_with_checksum) + output_path = output_dir / "release.yaml" + with output_path.open("w", encoding="utf-8") as handle: + handle.write(final_yaml) + return output_path + + +def main(argv: Optional[Sequence[str]] = None) -> int: + args = parse_args(argv) + config = load_json_config(args.config) + release_date = utc_now_iso() + calendar = sanitize_calendar(args.version, args.calendar) + builder = ReleaseBuilder( + repo_root=REPO_ROOT, + config=config, + version=args.version, + channel=args.channel, + calendar=calendar, + release_date=release_date, + git_sha=args.git_sha, + output_dir=args.output, + push=not args.no_push, + dry_run=args.dry_run, + registry_override=args.registry, + platforms_override=args.platforms, + skip_signing=args.skip_signing, + 
cosign_key_ref=args.cosign_key, + cosign_password=args.cosign_password, + cosign_identity_token=args.cosign_identity_token, + tlog_upload=not args.no_transparency, + ) + manifest = builder.run() + manifest_path = write_manifest(manifest, builder.output_dir) + print(f"✅ Release manifest written to {manifest_path}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/ops/devops/release/components.json b/ops/devops/release/components.json new file mode 100644 index 00000000..baa59d81 --- /dev/null +++ b/ops/devops/release/components.json @@ -0,0 +1,97 @@ +{ + "registry": "registry.stella-ops.org/stellaops", + "platforms": ["linux/amd64", "linux/arm64"], + "defaultChannel": "edge", + "docker": { + "sdkImage": "mcr.microsoft.com/dotnet/nightly/sdk:10.0", + "runtimeImage": "mcr.microsoft.com/dotnet/nightly/aspnet:10.0", + "nodeImage": "node:20.14.0-bookworm", + "nginxImage": "nginx:1.27-alpine" + }, + "components": [ + { + "name": "authority", + "repository": "authority", + "kind": "dotnet-service", + "context": ".", + "dockerfile": "ops/devops/release/docker/Dockerfile.dotnet-service", + "project": "src/StellaOps.Authority/StellaOps.Authority/StellaOps.Authority.csproj", + "entrypoint": "StellaOps.Authority.dll" + }, + { + "name": "signer", + "repository": "signer", + "kind": "dotnet-service", + "context": ".", + "dockerfile": "ops/devops/release/docker/Dockerfile.dotnet-service", + "project": "src/StellaOps.Signer/StellaOps.Signer.WebService/StellaOps.Signer.WebService.csproj", + "entrypoint": "StellaOps.Signer.WebService.dll" + }, + { + "name": "attestor", + "repository": "attestor", + "kind": "dotnet-service", + "context": ".", + "dockerfile": "ops/devops/release/docker/Dockerfile.dotnet-service", + "project": "src/StellaOps.Attestor/StellaOps.Attestor.WebService/StellaOps.Attestor.WebService.csproj", + "entrypoint": "StellaOps.Attestor.WebService.dll" + }, + { + "name": "scanner-web", + "repository": "scanner-web", + "kind": 
"dotnet-service", + "context": ".", + "dockerfile": "ops/devops/release/docker/Dockerfile.dotnet-service", + "project": "src/StellaOps.Scanner.WebService/StellaOps.Scanner.WebService.csproj", + "entrypoint": "StellaOps.Scanner.WebService.dll" + }, + { + "name": "scanner-worker", + "repository": "scanner-worker", + "kind": "dotnet-service", + "context": ".", + "dockerfile": "ops/devops/release/docker/Dockerfile.dotnet-service", + "project": "src/StellaOps.Scanner.Worker/StellaOps.Scanner.Worker.csproj", + "entrypoint": "StellaOps.Scanner.Worker.dll" + }, + { + "name": "concelier", + "repository": "concelier", + "kind": "dotnet-service", + "context": ".", + "dockerfile": "ops/devops/release/docker/Dockerfile.dotnet-service", + "project": "src/StellaOps.Concelier.WebService/StellaOps.Concelier.WebService.csproj", + "entrypoint": "StellaOps.Concelier.WebService.dll" + }, + { + "name": "excititor", + "repository": "excititor", + "kind": "dotnet-service", + "context": ".", + "dockerfile": "ops/devops/release/docker/Dockerfile.dotnet-service", + "project": "src/StellaOps.Excititor.WebService/StellaOps.Excititor.WebService.csproj", + "entrypoint": "StellaOps.Excititor.WebService.dll" + }, + { + "name": "web-ui", + "repository": "web-ui", + "kind": "angular-ui", + "context": ".", + "dockerfile": "ops/devops/release/docker/Dockerfile.angular-ui" + } + ], + "helm": { + "chartPath": "deploy/helm/stellaops", + "outputDir": "out/release/helm" + }, + "compose": { + "files": [ + "deploy/compose/docker-compose.dev.yaml", + "deploy/compose/docker-compose.stage.yaml", + "deploy/compose/docker-compose.airgap.yaml" + ] + }, + "buildxPlugin": { + "project": "src/StellaOps.Scanner.Sbomer.BuildXPlugin/StellaOps.Scanner.Sbomer.BuildXPlugin.csproj" + } +} diff --git a/ops/devops/release/docker/Dockerfile.angular-ui b/ops/devops/release/docker/Dockerfile.angular-ui new file mode 100644 index 00000000..080c590f --- /dev/null +++ b/ops/devops/release/docker/Dockerfile.angular-ui @@ -0,0 +1,31 
@@
+# syntax=docker/dockerfile:1.7-labs
+
+ARG NODE_IMAGE=node:20.14.0-bookworm
+ARG NGINX_IMAGE=nginx:1.27-alpine
+ARG VERSION=0.0.0
+ARG CHANNEL=dev
+ARG GIT_SHA=0000000
+ARG SOURCE_DATE_EPOCH=0
+
+FROM ${NODE_IMAGE} AS build
+WORKDIR /workspace
+ENV CI=1 \
+    SOURCE_DATE_EPOCH=${SOURCE_DATE_EPOCH}
+COPY src/StellaOps.Web/package.json src/StellaOps.Web/package-lock.json ./
+RUN npm ci --prefer-offline --no-audit --no-fund
+COPY src/StellaOps.Web/ ./
+RUN npm run build -- --configuration=production
+
+FROM ${NGINX_IMAGE} AS runtime
+ARG VERSION
+ARG CHANNEL
+ARG GIT_SHA
+WORKDIR /usr/share/nginx/html
+RUN rm -rf ./*
+COPY --from=build /workspace/dist/stellaops-web/ /usr/share/nginx/html/
+COPY ops/devops/release/docker/nginx-default.conf /etc/nginx/conf.d/default.conf
+LABEL org.opencontainers.image.version="${VERSION}" \
+      org.opencontainers.image.revision="${GIT_SHA}" \
+      org.opencontainers.image.source="https://git.stella-ops.org/stella-ops/feedser" \
+      org.stellaops.release.channel="${CHANNEL}"
+EXPOSE 8080
diff --git a/ops/devops/release/docker/Dockerfile.dotnet-service b/ops/devops/release/docker/Dockerfile.dotnet-service
new file mode 100644
index 00000000..19c5fc57
--- /dev/null
+++ b/ops/devops/release/docker/Dockerfile.dotnet-service
@@ -0,0 +1,52 @@
+# syntax=docker/dockerfile:1.7-labs
+
+ARG SDK_IMAGE=mcr.microsoft.com/dotnet/nightly/sdk:10.0
+ARG RUNTIME_IMAGE=mcr.microsoft.com/dotnet/nightly/aspnet:10.0
+
+ARG PROJECT
+ARG ENTRYPOINT_DLL
+ARG VERSION=0.0.0
+ARG CHANNEL=dev
+ARG GIT_SHA=0000000
+ARG SOURCE_DATE_EPOCH=0
+
+FROM ${SDK_IMAGE} AS build
+ARG PROJECT
+ARG GIT_SHA
+ARG SOURCE_DATE_EPOCH
+WORKDIR /src
+ENV DOTNET_CLI_TELEMETRY_OPTOUT=1 \
+    DOTNET_SKIP_FIRST_TIME_EXPERIENCE=1 \
+    NUGET_XMLDOC_MODE=skip \
+    SOURCE_DATE_EPOCH=${SOURCE_DATE_EPOCH}
+COPY . .
+RUN --mount=type=cache,target=/root/.nuget/packages \ + dotnet restore "${PROJECT}" +RUN --mount=type=cache,target=/root/.nuget/packages \ + dotnet publish "${PROJECT}" \ + -c Release \ + -o /app/publish \ + /p:UseAppHost=false \ + /p:ContinuousIntegrationBuild=true \ + /p:SourceRevisionId=${GIT_SHA} \ + /p:Deterministic=true \ + /p:TreatWarningsAsErrors=true + +FROM ${RUNTIME_IMAGE} AS runtime +WORKDIR /app +ARG ENTRYPOINT_DLL +ARG VERSION +ARG CHANNEL +ARG GIT_SHA +ENV DOTNET_EnableDiagnostics=0 \ + ASPNETCORE_URLS=http://0.0.0.0:8080 +COPY --from=build /app/publish/ ./ +RUN set -eu; \ + printf '#!/usr/bin/env sh\nset -e\nexec dotnet %s "$@"\n' "${ENTRYPOINT_DLL}" > /entrypoint.sh; \ + chmod +x /entrypoint.sh +EXPOSE 8080 +LABEL org.opencontainers.image.version="${VERSION}" \ + org.opencontainers.image.revision="${GIT_SHA}" \ + org.opencontainers.image.source="https://git.stella-ops.org/stella-ops/feedser" \ + org.stellaops.release.channel="${CHANNEL}" +ENTRYPOINT ["/entrypoint.sh"] diff --git a/ops/devops/release/docker/nginx-default.conf b/ops/devops/release/docker/nginx-default.conf new file mode 100644 index 00000000..14d6f071 --- /dev/null +++ b/ops/devops/release/docker/nginx-default.conf @@ -0,0 +1,22 @@ +server { + listen 8080; + listen [::]:8080; + server_name _; + + root /usr/share/nginx/html; + index index.html; + + location / { + try_files $uri $uri/ /index.html; + } + + location ~* \.(?:js|css|svg|png|jpg|jpeg|gif|ico|woff2?)$ { + add_header Cache-Control "public, max-age=2592000"; + } + + location = /healthz { + access_log off; + add_header Content-Type text/plain; + return 200 'ok'; + } +} diff --git a/ops/devops/sync-preview-nuget.sh b/ops/devops/sync-preview-nuget.sh new file mode 100644 index 00000000..1ea26297 --- /dev/null +++ b/ops/devops/sync-preview-nuget.sh @@ -0,0 +1,60 @@ +#!/usr/bin/env bash + +# Sync preview NuGet packages into the local offline feed. 
+# Reads package metadata from ops/devops/nuget-preview-packages.csv +# and ensures ./local-nuget holds the expected artefacts (with SHA-256 verification). + +set -euo pipefail + +repo_root="$(git -C "${BASH_SOURCE%/*}/.." rev-parse --show-toplevel 2>/dev/null || pwd)" +manifest="${repo_root}/ops/devops/nuget-preview-packages.csv" +dest="${repo_root}/local-nuget" + +if [[ ! -f "$manifest" ]]; then + echo "Manifest not found: $manifest" >&2 + exit 1 +fi + +mkdir -p "$dest" + +fetch_package() { + local package="$1" + local version="$2" + local expected_sha="$3" + local target="$dest/${package}.${version}.nupkg" + local url="https://www.nuget.org/api/v2/package/${package}/${version}" + + echo "[sync-nuget] Fetching ${package} ${version}" + local tmp + tmp="$(mktemp)" + trap 'rm -f "$tmp"' RETURN + curl -fsSL --retry 3 --retry-delay 1 "$url" -o "$tmp" + local actual_sha + actual_sha="$(sha256sum "$tmp" | awk '{print $1}')" + if [[ "$actual_sha" != "$expected_sha" ]]; then + echo "Checksum mismatch for ${package} ${version}" >&2 + echo " expected: $expected_sha" >&2 + echo " actual: $actual_sha" >&2 + exit 1 + fi + mv "$tmp" "$target" + trap - RETURN +} + +while IFS=',' read -r package version sha; do + [[ -z "$package" || "$package" == \#* ]] && continue + + local_path="$dest/${package}.${version}.nupkg" + if [[ -f "$local_path" ]]; then + current_sha="$(sha256sum "$local_path" | awk '{print $1}')" + if [[ "$current_sha" == "$sha" ]]; then + echo "[sync-nuget] OK ${package} ${version}" + continue + fi + echo "[sync-nuget] SHA mismatch for ${package} ${version}, refreshing" + else + echo "[sync-nuget] Missing ${package} ${version}" + fi + + fetch_package "$package" "$version" "$sha" +done < "$manifest" diff --git a/src/StellaOps.Notify.Engine/INotifyRuleEvaluator.cs b/src/StellaOps.Notify.Engine/INotifyRuleEvaluator.cs new file mode 100644 index 00000000..9b4ad1f6 --- /dev/null +++ b/src/StellaOps.Notify.Engine/INotifyRuleEvaluator.cs @@ -0,0 +1,28 @@ +using System; 
+using System.Collections.Generic;
+using System.Collections.Immutable;
+using StellaOps.Notify.Models;
+
+namespace StellaOps.Notify.Engine;
+
+/// <summary>
+/// Evaluates Notify rules against platform events.
+/// </summary>
+public interface INotifyRuleEvaluator
+{
+    /// <summary>
+    /// Evaluates a single rule against an event and returns the match outcome.
+    /// </summary>
+    NotifyRuleEvaluationOutcome Evaluate(
+        NotifyRule rule,
+        NotifyEvent @event,
+        DateTimeOffset? evaluationTimestamp = null);
+
+    /// <summary>
+    /// Evaluates a collection of rules against an event.
+    /// </summary>
+    ImmutableArray<NotifyRuleEvaluationOutcome> Evaluate(
+        IEnumerable<NotifyRule> rules,
+        NotifyEvent @event,
+        DateTimeOffset? evaluationTimestamp = null);
+}
diff --git a/src/StellaOps.Notify.Engine/TASKS.md b/src/StellaOps.Notify.Engine/TASKS.md
index 605b3ba4..01572ed5 100644
--- a/src/StellaOps.Notify.Engine/TASKS.md
+++ b/src/StellaOps.Notify.Engine/TASKS.md
@@ -2,7 +2,7 @@
 | ID | Status | Owner(s) | Depends on | Description | Exit Criteria |
 |----|--------|----------|------------|-------------|---------------|
-| NOTIFY-ENGINE-15-301 | TODO | Notify Engine Guild | NOTIFY-MODELS-15-101 | Rules evaluation core: tenant/kind filters, severity/delta gates, VEX gating, throttling, idempotency key generation. | Unit tests cover rule permutations; idempotency keys deterministic; documentation updated. |
+| NOTIFY-ENGINE-15-301 | DOING (2025-10-24) | Notify Engine Guild | NOTIFY-MODELS-15-101 | Rules evaluation core: tenant/kind filters, severity/delta gates, VEX gating, throttling, idempotency key generation. | Unit tests cover rule permutations; idempotency keys deterministic; documentation updated. |
 | NOTIFY-ENGINE-15-302 | TODO | Notify Engine Guild | NOTIFY-ENGINE-15-301 | Action planner + digest coalescer with window management and dedupe per architecture §4. | Digest windows tested; throttles and digests recorded; metrics counters exposed.
| | NOTIFY-ENGINE-15-303 | TODO | Notify Engine Guild | NOTIFY-ENGINE-15-302 | Template rendering engine (Slack, Teams, Email, Webhook) with helpers and i18n support. | Rendering fixtures validated; helpers documented; deterministic output proven via golden tests. | | NOTIFY-ENGINE-15-304 | TODO | Notify Engine Guild | NOTIFY-ENGINE-15-303 | Test-send sandbox + preview utilities for WebService. | Preview/test functions validated; sample outputs returned; no state persisted. | diff --git a/src/StellaOps.Notify.Queue.Tests/NatsNotifyDeliveryQueueTests.cs b/src/StellaOps.Notify.Queue.Tests/NatsNotifyDeliveryQueueTests.cs new file mode 100644 index 00000000..376c61de --- /dev/null +++ b/src/StellaOps.Notify.Queue.Tests/NatsNotifyDeliveryQueueTests.cs @@ -0,0 +1,223 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text.Json.Nodes; +using System.Threading.Tasks; +using DotNet.Testcontainers.Builders; +using DotNet.Testcontainers.Containers; +using DotNet.Testcontainers.Configurations; +using FluentAssertions; +using Microsoft.Extensions.Logging.Abstractions; +using NATS.Client.Core; +using NATS.Client.JetStream; +using NATS.Client.JetStream.Models; +using StellaOps.Notify.Models; +using StellaOps.Notify.Queue; +using StellaOps.Notify.Queue.Nats; +using Xunit; + +namespace StellaOps.Notify.Queue.Tests; + +public sealed class NatsNotifyDeliveryQueueTests : IAsyncLifetime +{ + private readonly TestcontainersContainer _nats; + private string? 
_skipReason; + + public NatsNotifyDeliveryQueueTests() + { + _nats = new TestcontainersBuilder() + .WithImage("nats:2.10-alpine") + .WithCleanUp(true) + .WithName($"nats-notify-delivery-{Guid.NewGuid():N}") + .WithPortBinding(4222, true) + .WithCommand("--jetstream") + .WithWaitStrategy(Wait.ForUnixContainer().UntilPortIsAvailable(4222)) + .Build(); + } + + public async Task InitializeAsync() + { + try + { + await _nats.StartAsync(); + } + catch (Exception ex) + { + _skipReason = $"NATS-backed delivery tests skipped: {ex.Message}"; + } + } + + public async Task DisposeAsync() + { + if (_skipReason is not null) + { + return; + } + + await _nats.DisposeAsync().ConfigureAwait(false); + } + + [Fact] + public async Task Publish_ShouldDeduplicate_ByDeliveryId() + { + if (SkipIfUnavailable()) + { + return; + } + + var options = CreateOptions(); + await using var queue = CreateQueue(options); + + var delivery = TestData.CreateDelivery("tenant-a"); + var message = new NotifyDeliveryQueueMessage( + delivery, + channelId: "chan-a", + channelType: NotifyChannelType.Slack); + + var first = await queue.PublishAsync(message); + first.Deduplicated.Should().BeFalse(); + + var second = await queue.PublishAsync(message); + second.Deduplicated.Should().BeTrue(); + second.MessageId.Should().Be(first.MessageId); + } + + [Fact] + public async Task Release_Retry_ShouldReschedule() + { + if (SkipIfUnavailable()) + { + return; + } + + var options = CreateOptions(); + await using var queue = CreateQueue(options); + + await queue.PublishAsync(new NotifyDeliveryQueueMessage( + TestData.CreateDelivery(), + channelId: "chan-retry", + channelType: NotifyChannelType.Teams)); + + var lease = (await queue.LeaseAsync(new NotifyQueueLeaseRequest("worker-retry", 1, TimeSpan.FromSeconds(2)))).Single(); + + await lease.ReleaseAsync(NotifyQueueReleaseDisposition.Retry); + + var retried = (await queue.LeaseAsync(new NotifyQueueLeaseRequest("worker-retry", 1, TimeSpan.FromSeconds(2)))).Single(); + 
retried.Attempt.Should().BeGreaterThan(lease.Attempt); + + await retried.AcknowledgeAsync(); + } + + [Fact] + public async Task Release_RetryBeyondMax_ShouldDeadLetter() + { + if (SkipIfUnavailable()) + { + return; + } + + var options = CreateOptions(static opts => + { + opts.MaxDeliveryAttempts = 2; + opts.Nats.DeadLetterStream = "NOTIFY_DELIVERY_DEAD_TEST"; + opts.Nats.DeadLetterSubject = "notify.delivery.dead.test"; + }); + + await using var queue = CreateQueue(options); + + await queue.PublishAsync(new NotifyDeliveryQueueMessage( + TestData.CreateDelivery(), + channelId: "chan-dead", + channelType: NotifyChannelType.Webhook)); + + var lease = (await queue.LeaseAsync(new NotifyQueueLeaseRequest("worker-dead", 1, TimeSpan.FromSeconds(2)))).Single(); + await lease.ReleaseAsync(NotifyQueueReleaseDisposition.Retry); + + var second = (await queue.LeaseAsync(new NotifyQueueLeaseRequest("worker-dead", 1, TimeSpan.FromSeconds(2)))).Single(); + await second.ReleaseAsync(NotifyQueueReleaseDisposition.Retry); + + await Task.Delay(200); + + await using var connection = new NatsConnection(new NatsOpts { Url = options.Nats.Url! }); + await connection.ConnectAsync(); + var js = new NatsJSContext(connection); + + var consumerConfig = new ConsumerConfig + { + DurableName = "notify-delivery-dead-test", + DeliverPolicy = ConsumerConfigDeliverPolicy.All, + AckPolicy = ConsumerConfigAckPolicy.Explicit + }; + + var consumer = await js.CreateConsumerAsync(options.Nats.DeadLetterStream, consumerConfig); + var fetchOpts = new NatsJSFetchOpts { MaxMsgs = 1, Expires = TimeSpan.FromSeconds(1) }; + + NatsJSMsg? 
dlqMsg = null; + await foreach (var msg in consumer.FetchAsync(NatsRawSerializer.Default, fetchOpts)) + { + dlqMsg = msg; + await msg.AckAsync(new AckOpts()); + break; + } + + dlqMsg.Should().NotBeNull(); + } + + private NatsNotifyDeliveryQueue CreateQueue(NotifyDeliveryQueueOptions options) + { + return new NatsNotifyDeliveryQueue( + options, + options.Nats, + NullLogger.Instance, + TimeProvider.System); + } + + private NotifyDeliveryQueueOptions CreateOptions(Action? configure = null) + { + var url = $"nats://{_nats.Hostname}:{_nats.GetMappedPublicPort(4222)}"; + + var opts = new NotifyDeliveryQueueOptions + { + Transport = NotifyQueueTransportKind.Nats, + DefaultLeaseDuration = TimeSpan.FromSeconds(2), + MaxDeliveryAttempts = 3, + RetryInitialBackoff = TimeSpan.FromMilliseconds(20), + RetryMaxBackoff = TimeSpan.FromMilliseconds(200), + Nats = new NotifyNatsDeliveryQueueOptions + { + Url = url, + Stream = "NOTIFY_DELIVERY_TEST", + Subject = "notify.delivery.test", + DeadLetterStream = "NOTIFY_DELIVERY_TEST_DEAD", + DeadLetterSubject = "notify.delivery.test.dead", + DurableConsumer = "notify-delivery-tests", + MaxAckPending = 32, + AckWait = TimeSpan.FromSeconds(2), + RetryDelay = TimeSpan.FromMilliseconds(100), + IdleHeartbeat = TimeSpan.FromMilliseconds(200) + } + }; + + configure?.Invoke(opts); + return opts; + } + + private bool SkipIfUnavailable() + => _skipReason is not null; + + private static class TestData + { + public static NotifyDelivery CreateDelivery(string tenantId = "tenant-1") + { + return NotifyDelivery.Create( + deliveryId: Guid.NewGuid().ToString("n"), + tenantId: tenantId, + ruleId: "rule-1", + actionId: "action-1", + eventId: Guid.NewGuid(), + kind: "scanner.report.ready", + status: NotifyDeliveryStatus.Pending, + createdAt: DateTimeOffset.UtcNow); + } + } +} diff --git a/src/StellaOps.Notify.Queue.Tests/NatsNotifyEventQueueTests.cs b/src/StellaOps.Notify.Queue.Tests/NatsNotifyEventQueueTests.cs new file mode 100644 index 00000000..c092047b 
--- /dev/null +++ b/src/StellaOps.Notify.Queue.Tests/NatsNotifyEventQueueTests.cs @@ -0,0 +1,225 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text.Json.Nodes; +using System.Threading.Tasks; +using DotNet.Testcontainers.Builders; +using DotNet.Testcontainers.Containers; +using DotNet.Testcontainers.Configurations; +using FluentAssertions; +using Microsoft.Extensions.Logging.Abstractions; +using StellaOps.Notify.Models; +using StellaOps.Notify.Queue; +using StellaOps.Notify.Queue.Nats; +using Xunit; + +namespace StellaOps.Notify.Queue.Tests; + +public sealed class NatsNotifyEventQueueTests : IAsyncLifetime +{ + private readonly TestcontainersContainer _nats; + private string? _skipReason; + + public NatsNotifyEventQueueTests() + { + _nats = new TestcontainersBuilder() + .WithImage("nats:2.10-alpine") + .WithCleanUp(true) + .WithName($"nats-notify-tests-{Guid.NewGuid():N}") + .WithPortBinding(4222, true) + .WithCommand("--jetstream") + .WithWaitStrategy(Wait.ForUnixContainer().UntilPortIsAvailable(4222)) + .Build(); + } + + public async Task InitializeAsync() + { + try + { + await _nats.StartAsync(); + } + catch (Exception ex) + { + _skipReason = $"NATS-backed tests skipped: {ex.Message}"; + } + } + + public async Task DisposeAsync() + { + if (_skipReason is not null) + { + return; + } + + await _nats.DisposeAsync().ConfigureAwait(false); + } + + [Fact] + public async Task Publish_ShouldDeduplicate_ByIdempotencyKey() + { + if (SkipIfUnavailable()) + { + return; + } + + var options = CreateOptions(); + await using var queue = CreateQueue(options); + + var notifyEvent = TestData.CreateEvent("tenant-a"); + var message = new NotifyQueueEventMessage( + notifyEvent, + options.Nats.Subject, + traceId: "trace-1"); + + var first = await queue.PublishAsync(message); + first.Deduplicated.Should().BeFalse(); + + var second = await queue.PublishAsync(message); + second.Deduplicated.Should().BeTrue(); + 
second.MessageId.Should().Be(first.MessageId); + } + + [Fact] + public async Task Lease_Acknowledge_ShouldRemoveMessage() + { + if (SkipIfUnavailable()) + { + return; + } + + var options = CreateOptions(); + await using var queue = CreateQueue(options); + + var notifyEvent = TestData.CreateEvent("tenant-b"); + var message = new NotifyQueueEventMessage( + notifyEvent, + options.Nats.Subject, + traceId: "trace-xyz", + attributes: new Dictionary { { "source", "scanner" } }); + + await queue.PublishAsync(message); + + var leases = await queue.LeaseAsync(new NotifyQueueLeaseRequest("worker-1", 1, TimeSpan.FromSeconds(2))); + leases.Should().ContainSingle(); + + var lease = leases[0]; + lease.Attempt.Should().BeGreaterThanOrEqualTo(1); + lease.Message.Event.EventId.Should().Be(notifyEvent.EventId); + lease.TraceId.Should().Be("trace-xyz"); + lease.Attributes.Should().ContainKey("source").WhoseValue.Should().Be("scanner"); + + await lease.AcknowledgeAsync(); + + var afterAck = await queue.LeaseAsync(new NotifyQueueLeaseRequest("worker-1", 1, TimeSpan.FromSeconds(1))); + afterAck.Should().BeEmpty(); + } + + [Fact] + public async Task Lease_ShouldPreserveOrdering() + { + if (SkipIfUnavailable()) + { + return; + } + + var options = CreateOptions(); + await using var queue = CreateQueue(options); + + var first = TestData.CreateEvent(); + var second = TestData.CreateEvent(); + + await queue.PublishAsync(new NotifyQueueEventMessage(first, options.Nats.Subject)); + await queue.PublishAsync(new NotifyQueueEventMessage(second, options.Nats.Subject)); + + var leases = await queue.LeaseAsync(new NotifyQueueLeaseRequest("worker-order", 2, TimeSpan.FromSeconds(2))); + leases.Should().HaveCount(2); + + leases.Select(x => x.Message.Event.EventId) + .Should() + .ContainInOrder(first.EventId, second.EventId); + } + + [Fact] + public async Task ClaimExpired_ShouldReassignLease() + { + if (SkipIfUnavailable()) + { + return; + } + + var options = CreateOptions(); + await using var queue = 
CreateQueue(options); + + var notifyEvent = TestData.CreateEvent(); + await queue.PublishAsync(new NotifyQueueEventMessage(notifyEvent, options.Nats.Subject)); + + var leases = await queue.LeaseAsync(new NotifyQueueLeaseRequest("worker-initial", 1, TimeSpan.FromMilliseconds(500))); + leases.Should().ContainSingle(); + + await Task.Delay(200); + + var claimed = await queue.ClaimExpiredAsync(new NotifyQueueClaimOptions("worker-reclaim", 1, TimeSpan.FromMilliseconds(100))); + claimed.Should().ContainSingle(); + + var lease = claimed[0]; + lease.Consumer.Should().Be("worker-reclaim"); + lease.Message.Event.EventId.Should().Be(notifyEvent.EventId); + + await lease.AcknowledgeAsync(); + } + + private NatsNotifyEventQueue CreateQueue(NotifyEventQueueOptions options) + { + return new NatsNotifyEventQueue( + options, + options.Nats, + NullLogger.Instance, + TimeProvider.System); + } + + private NotifyEventQueueOptions CreateOptions() + { + var connectionUrl = $"nats://{_nats.Hostname}:{_nats.GetMappedPublicPort(4222)}"; + + return new NotifyEventQueueOptions + { + Transport = NotifyQueueTransportKind.Nats, + DefaultLeaseDuration = TimeSpan.FromSeconds(2), + MaxDeliveryAttempts = 3, + RetryInitialBackoff = TimeSpan.FromMilliseconds(50), + RetryMaxBackoff = TimeSpan.FromSeconds(1), + Nats = new NotifyNatsEventQueueOptions + { + Url = connectionUrl, + Stream = "NOTIFY_TEST", + Subject = "notify.test.events", + DeadLetterStream = "NOTIFY_TEST_DEAD", + DeadLetterSubject = "notify.test.events.dead", + DurableConsumer = "notify-test-consumer", + MaxAckPending = 32, + AckWait = TimeSpan.FromSeconds(2), + RetryDelay = TimeSpan.FromMilliseconds(100), + IdleHeartbeat = TimeSpan.FromMilliseconds(100) + } + }; + } + + private bool SkipIfUnavailable() + => _skipReason is not null; + + private static class TestData + { + public static NotifyEvent CreateEvent(string tenant = "tenant-1") + { + return NotifyEvent.Create( + Guid.NewGuid(), + kind: "scanner.report.ready", + tenant: tenant, + 
ts: DateTimeOffset.UtcNow, + payload: new JsonObject + { + ["summary"] = "event" + }); + } + } +} diff --git a/src/StellaOps.Notify.Queue.Tests/RedisNotifyDeliveryQueueTests.cs b/src/StellaOps.Notify.Queue.Tests/RedisNotifyDeliveryQueueTests.cs new file mode 100644 index 00000000..80b83ba0 --- /dev/null +++ b/src/StellaOps.Notify.Queue.Tests/RedisNotifyDeliveryQueueTests.cs @@ -0,0 +1,197 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text.Json.Nodes; +using System.Threading.Tasks; +using DotNet.Testcontainers.Builders; +using DotNet.Testcontainers.Containers; +using DotNet.Testcontainers.Configurations; +using FluentAssertions; +using Microsoft.Extensions.Logging.Abstractions; +using StackExchange.Redis; +using StellaOps.Notify.Models; +using StellaOps.Notify.Queue; +using StellaOps.Notify.Queue.Redis; +using Xunit; + +namespace StellaOps.Notify.Queue.Tests; + +public sealed class RedisNotifyDeliveryQueueTests : IAsyncLifetime +{ + private readonly RedisTestcontainer _redis; + private string? 
_skipReason; + + public RedisNotifyDeliveryQueueTests() + { + var configuration = new RedisTestcontainerConfiguration(); + _redis = new TestcontainersBuilder() + .WithDatabase(configuration) + .Build(); + } + + public async Task InitializeAsync() + { + try + { + await _redis.StartAsync(); + } + catch (Exception ex) + { + _skipReason = $"Redis-backed delivery tests skipped: {ex.Message}"; + } + } + + public async Task DisposeAsync() + { + if (_skipReason is not null) + { + return; + } + + await _redis.DisposeAsync().AsTask(); + } + + [Fact] + public async Task Publish_ShouldDeduplicate_ByDeliveryId() + { + if (SkipIfUnavailable()) + { + return; + } + + var options = CreateOptions(); + await using var queue = CreateQueue(options); + + var delivery = TestData.CreateDelivery(); + var message = new NotifyDeliveryQueueMessage( + delivery, + channelId: "channel-1", + channelType: NotifyChannelType.Slack); + + var first = await queue.PublishAsync(message); + first.Deduplicated.Should().BeFalse(); + + var second = await queue.PublishAsync(message); + second.Deduplicated.Should().BeTrue(); + second.MessageId.Should().Be(first.MessageId); + } + + [Fact] + public async Task Release_Retry_ShouldRescheduleDelivery() + { + if (SkipIfUnavailable()) + { + return; + } + + var options = CreateOptions(); + await using var queue = CreateQueue(options); + + await queue.PublishAsync(new NotifyDeliveryQueueMessage( + TestData.CreateDelivery(), + channelId: "channel-retry", + channelType: NotifyChannelType.Teams)); + + var lease = (await queue.LeaseAsync(new NotifyQueueLeaseRequest("worker-retry", 1, TimeSpan.FromSeconds(1)))).Single(); + lease.Attempt.Should().Be(1); + + await lease.ReleaseAsync(NotifyQueueReleaseDisposition.Retry); + + var retried = (await queue.LeaseAsync(new NotifyQueueLeaseRequest("worker-retry", 1, TimeSpan.FromSeconds(1)))).Single(); + retried.Attempt.Should().Be(2); + + await retried.AcknowledgeAsync(); + } + + [Fact] + public async Task 
Release_RetryBeyondMax_ShouldDeadLetter() + { + if (SkipIfUnavailable()) + { + return; + } + + var options = CreateOptions(static opts => + { + opts.MaxDeliveryAttempts = 2; + opts.Redis.DeadLetterStreamName = "notify:deliveries:testdead"; + }); + + await using var queue = CreateQueue(options); + + await queue.PublishAsync(new NotifyDeliveryQueueMessage( + TestData.CreateDelivery(), + channelId: "channel-dead", + channelType: NotifyChannelType.Email)); + + var first = (await queue.LeaseAsync(new NotifyQueueLeaseRequest("worker-dead", 1, TimeSpan.FromSeconds(1)))).Single(); + await first.ReleaseAsync(NotifyQueueReleaseDisposition.Retry); + + var second = (await queue.LeaseAsync(new NotifyQueueLeaseRequest("worker-dead", 1, TimeSpan.FromSeconds(1)))).Single(); + await second.ReleaseAsync(NotifyQueueReleaseDisposition.Retry); + + await Task.Delay(100); + + var mux = await ConnectionMultiplexer.ConnectAsync(_redis.ConnectionString); + var db = mux.GetDatabase(); + var deadLetters = await db.StreamReadAsync(options.Redis.DeadLetterStreamName, "0-0"); + deadLetters.Should().NotBeEmpty(); + } + + private RedisNotifyDeliveryQueue CreateQueue(NotifyDeliveryQueueOptions options) + { + return new RedisNotifyDeliveryQueue( + options, + options.Redis, + NullLogger.Instance, + TimeProvider.System, + async config => (IConnectionMultiplexer)await ConnectionMultiplexer.ConnectAsync(config).ConfigureAwait(false)); + } + + private NotifyDeliveryQueueOptions CreateOptions(Action? 
configure = null) + { + var opts = new NotifyDeliveryQueueOptions + { + Transport = NotifyQueueTransportKind.Redis, + DefaultLeaseDuration = TimeSpan.FromSeconds(1), + MaxDeliveryAttempts = 3, + RetryInitialBackoff = TimeSpan.FromMilliseconds(10), + RetryMaxBackoff = TimeSpan.FromMilliseconds(50), + ClaimIdleThreshold = TimeSpan.FromSeconds(1), + Redis = new NotifyRedisDeliveryQueueOptions + { + ConnectionString = _redis.ConnectionString, + StreamName = "notify:deliveries:test", + ConsumerGroup = "notify-delivery-tests", + IdempotencyKeyPrefix = "notify:deliveries:test:idemp:" + } + }; + + configure?.Invoke(opts); + return opts; + } + + private bool SkipIfUnavailable() + => _skipReason is not null; + + private static class TestData + { + public static NotifyDelivery CreateDelivery() + { + var now = DateTimeOffset.UtcNow; + return NotifyDelivery.Create( + deliveryId: Guid.NewGuid().ToString("n"), + tenantId: "tenant-1", + ruleId: "rule-1", + actionId: "action-1", + eventId: Guid.NewGuid(), + kind: "scanner.report.ready", + status: NotifyDeliveryStatus.Pending, + createdAt: now, + metadata: new Dictionary + { + ["integration"] = "tests" + }); + } + } +} diff --git a/src/StellaOps.Notify.Queue.Tests/RedisNotifyEventQueueTests.cs b/src/StellaOps.Notify.Queue.Tests/RedisNotifyEventQueueTests.cs new file mode 100644 index 00000000..28499b1e --- /dev/null +++ b/src/StellaOps.Notify.Queue.Tests/RedisNotifyEventQueueTests.cs @@ -0,0 +1,220 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text.Json.Nodes; +using System.Threading; +using System.Threading.Tasks; +using DotNet.Testcontainers.Builders; +using DotNet.Testcontainers.Containers; +using DotNet.Testcontainers.Configurations; +using FluentAssertions; +using Microsoft.Extensions.Logging.Abstractions; +using StackExchange.Redis; +using StellaOps.Notify.Models; +using StellaOps.Notify.Queue; +using StellaOps.Notify.Queue.Redis; +using Xunit; + +namespace StellaOps.Notify.Queue.Tests; 
+ +public sealed class RedisNotifyEventQueueTests : IAsyncLifetime +{ + private readonly RedisTestcontainer _redis; + private string? _skipReason; + + public RedisNotifyEventQueueTests() + { + var configuration = new RedisTestcontainerConfiguration(); + _redis = new TestcontainersBuilder() + .WithDatabase(configuration) + .Build(); + } + + public async Task InitializeAsync() + { + try + { + await _redis.StartAsync(); + } + catch (Exception ex) + { + _skipReason = $"Redis-backed tests skipped: {ex.Message}"; + } + } + + public async Task DisposeAsync() + { + if (_skipReason is not null) + { + return; + } + + await _redis.DisposeAsync().AsTask(); + } + + [Fact] + public async Task Publish_ShouldDeduplicate_ByIdempotencyKey() + { + if (SkipIfUnavailable()) + { + return; + } + + var options = CreateOptions(); + await using var queue = CreateQueue(options); + + var notifyEvent = TestData.CreateEvent(tenant: "tenant-a"); + var message = new NotifyQueueEventMessage(notifyEvent, options.Redis.Streams[0].Stream); + + var first = await queue.PublishAsync(message); + first.Deduplicated.Should().BeFalse(); + + var second = await queue.PublishAsync(message); + second.Deduplicated.Should().BeTrue(); + second.MessageId.Should().Be(first.MessageId); + } + + [Fact] + public async Task Lease_Acknowledge_ShouldRemoveMessage() + { + if (SkipIfUnavailable()) + { + return; + } + + var options = CreateOptions(); + await using var queue = CreateQueue(options); + + var notifyEvent = TestData.CreateEvent(tenant: "tenant-b"); + var message = new NotifyQueueEventMessage( + notifyEvent, + options.Redis.Streams[0].Stream, + traceId: "trace-123", + attributes: new Dictionary { { "source", "scanner" } }); + + await queue.PublishAsync(message); + + var leases = await queue.LeaseAsync(new NotifyQueueLeaseRequest("worker-1", 1, TimeSpan.FromSeconds(5))); + leases.Should().ContainSingle(); + + var lease = leases[0]; + lease.Attempt.Should().Be(1); + 
lease.Message.Event.EventId.Should().Be(notifyEvent.EventId); + lease.TraceId.Should().Be("trace-123"); + lease.Attributes.Should().ContainKey("source").WhoseValue.Should().Be("scanner"); + + await lease.AcknowledgeAsync(); + + var afterAck = await queue.LeaseAsync(new NotifyQueueLeaseRequest("worker-1", 1, TimeSpan.FromSeconds(5))); + afterAck.Should().BeEmpty(); + } + + [Fact] + public async Task Lease_ShouldPreserveOrdering() + { + if (SkipIfUnavailable()) + { + return; + } + + var options = CreateOptions(); + await using var queue = CreateQueue(options); + + var stream = options.Redis.Streams[0].Stream; + var firstEvent = TestData.CreateEvent(); + var secondEvent = TestData.CreateEvent(); + + await queue.PublishAsync(new NotifyQueueEventMessage(firstEvent, stream)); + await queue.PublishAsync(new NotifyQueueEventMessage(secondEvent, stream)); + + var leases = await queue.LeaseAsync(new NotifyQueueLeaseRequest("worker-order", 2, TimeSpan.FromSeconds(5))); + leases.Should().HaveCount(2); + + leases.Select(l => l.Message.Event.EventId) + .Should() + .ContainInOrder(new[] { firstEvent.EventId, secondEvent.EventId }); + } + + [Fact] + public async Task ClaimExpired_ShouldReassignLease() + { + if (SkipIfUnavailable()) + { + return; + } + + var options = CreateOptions(); + await using var queue = CreateQueue(options); + + var notifyEvent = TestData.CreateEvent(); + await queue.PublishAsync(new NotifyQueueEventMessage(notifyEvent, options.Redis.Streams[0].Stream)); + + var leases = await queue.LeaseAsync(new NotifyQueueLeaseRequest("worker-initial", 1, TimeSpan.FromSeconds(1))); + leases.Should().ContainSingle(); + + // Ensure the message has been pending long enough for claim. 
+ await Task.Delay(50); + + var claimed = await queue.ClaimExpiredAsync(new NotifyQueueClaimOptions("worker-reclaim", 1, TimeSpan.Zero)); + claimed.Should().ContainSingle(); + + var lease = claimed[0]; + lease.Consumer.Should().Be("worker-reclaim"); + lease.Message.Event.EventId.Should().Be(notifyEvent.EventId); + + await lease.AcknowledgeAsync(); + } + + private RedisNotifyEventQueue CreateQueue(NotifyEventQueueOptions options) + { + return new RedisNotifyEventQueue( + options, + options.Redis, + NullLogger.Instance, + TimeProvider.System, + async config => (IConnectionMultiplexer)await ConnectionMultiplexer.ConnectAsync(config).ConfigureAwait(false)); + } + + private NotifyEventQueueOptions CreateOptions() + { + var streamOptions = new NotifyRedisEventStreamOptions + { + Stream = "notify:test:events", + ConsumerGroup = "notify-test-consumers", + IdempotencyKeyPrefix = "notify:test:idemp:", + ApproximateMaxLength = 1024 + }; + + var redisOptions = new NotifyRedisEventQueueOptions + { + ConnectionString = _redis.ConnectionString, + Streams = new List { streamOptions } + }; + + return new NotifyEventQueueOptions + { + Transport = NotifyQueueTransportKind.Redis, + DefaultLeaseDuration = TimeSpan.FromSeconds(5), + Redis = redisOptions + }; + } + + private bool SkipIfUnavailable() + => _skipReason is not null; + + private static class TestData + { + public static NotifyEvent CreateEvent(string tenant = "tenant-1") + { + return NotifyEvent.Create( + Guid.NewGuid(), + kind: "scanner.report.ready", + tenant: tenant, + ts: DateTimeOffset.UtcNow, + payload: new JsonObject + { + ["summary"] = "event" + }); + } + } +} diff --git a/src/StellaOps.Notify.Queue.Tests/StellaOps.Notify.Queue.Tests.csproj b/src/StellaOps.Notify.Queue.Tests/StellaOps.Notify.Queue.Tests.csproj new file mode 100644 index 00000000..d4573c30 --- /dev/null +++ b/src/StellaOps.Notify.Queue.Tests/StellaOps.Notify.Queue.Tests.csproj @@ -0,0 +1,26 @@ + + + net10.0 + enable + enable + false + false + + + + + + 
+ + + all + + + all + + + + + + + diff --git a/src/StellaOps.Notify.Queue/Nats/NatsNotifyDeliveryLease.cs b/src/StellaOps.Notify.Queue/Nats/NatsNotifyDeliveryLease.cs new file mode 100644 index 00000000..19e57aac --- /dev/null +++ b/src/StellaOps.Notify.Queue/Nats/NatsNotifyDeliveryLease.cs @@ -0,0 +1,80 @@ +using System; +using System.Collections.Generic; +using System.Threading; +using System.Threading.Tasks; +using NATS.Client.JetStream; + +namespace StellaOps.Notify.Queue.Nats; + +internal sealed class NatsNotifyDeliveryLease : INotifyQueueLease +{ + private readonly NatsNotifyDeliveryQueue _queue; + private readonly NatsJSMsg _message; + private int _completed; + + internal NatsNotifyDeliveryLease( + NatsNotifyDeliveryQueue queue, + NatsJSMsg message, + string messageId, + NotifyDeliveryQueueMessage payload, + int attempt, + string consumer, + DateTimeOffset enqueuedAt, + DateTimeOffset leaseExpiresAt, + string idempotencyKey) + { + _queue = queue ?? throw new ArgumentNullException(nameof(queue)); + _message = message; + MessageId = messageId ?? throw new ArgumentNullException(nameof(messageId)); + Message = payload ?? throw new ArgumentNullException(nameof(payload)); + Attempt = attempt; + Consumer = consumer ?? throw new ArgumentNullException(nameof(consumer)); + EnqueuedAt = enqueuedAt; + LeaseExpiresAt = leaseExpiresAt; + IdempotencyKey = idempotencyKey ?? payload.IdempotencyKey; + } + + public string MessageId { get; } + + public int Attempt { get; internal set; } + + public DateTimeOffset EnqueuedAt { get; } + + public DateTimeOffset LeaseExpiresAt { get; private set; } + + public string Consumer { get; } + + public string Stream => Message.Stream; + + public string TenantId => Message.TenantId; + + public string? PartitionKey => Message.PartitionKey; + + public string IdempotencyKey { get; } + + public string? 
TraceId => Message.TraceId; + + public IReadOnlyDictionary Attributes => Message.Attributes; + + public NotifyDeliveryQueueMessage Message { get; } + + internal NatsJSMsg RawMessage => _message; + + public Task AcknowledgeAsync(CancellationToken cancellationToken = default) + => _queue.AcknowledgeAsync(this, cancellationToken); + + public Task RenewAsync(TimeSpan leaseDuration, CancellationToken cancellationToken = default) + => _queue.RenewLeaseAsync(this, leaseDuration, cancellationToken); + + public Task ReleaseAsync(NotifyQueueReleaseDisposition disposition, CancellationToken cancellationToken = default) + => _queue.ReleaseAsync(this, disposition, cancellationToken); + + public Task DeadLetterAsync(string reason, CancellationToken cancellationToken = default) + => _queue.DeadLetterAsync(this, reason, cancellationToken); + + internal bool TryBeginCompletion() + => Interlocked.CompareExchange(ref _completed, 1, 0) == 0; + + internal void RefreshLease(DateTimeOffset expiresAt) + => LeaseExpiresAt = expiresAt; +} diff --git a/src/StellaOps.Notify.Queue/Nats/NatsNotifyDeliveryQueue.cs b/src/StellaOps.Notify.Queue/Nats/NatsNotifyDeliveryQueue.cs new file mode 100644 index 00000000..25c49aa2 --- /dev/null +++ b/src/StellaOps.Notify.Queue/Nats/NatsNotifyDeliveryQueue.cs @@ -0,0 +1,697 @@ +using System; +using System.Collections.Concurrent; +using System.Collections.Generic; +using System.Collections.ObjectModel; +using System.Globalization; +using System.Linq; +using System.Text; +using System.Threading; +using System.Threading.Tasks; +using Microsoft.Extensions.Logging; +using NATS.Client.Core; +using NATS.Client.JetStream; +using NATS.Client.JetStream.Models; +using StellaOps.Notify.Models; + +namespace StellaOps.Notify.Queue.Nats; + +internal sealed class NatsNotifyDeliveryQueue : INotifyDeliveryQueue, IAsyncDisposable +{ + private const string TransportName = "nats"; + + private static readonly INatsSerializer PayloadSerializer = NatsRawSerializer.Default; + + 
private readonly NotifyDeliveryQueueOptions _queueOptions; + private readonly NotifyNatsDeliveryQueueOptions _options; + private readonly ILogger _logger; + private readonly TimeProvider _timeProvider; + private readonly SemaphoreSlim _connectionGate = new(1, 1); + private readonly Func> _connectionFactory; + + private NatsConnection? _connection; + private NatsJSContext? _jsContext; + private INatsJSConsumer? _consumer; + private bool _disposed; + + public NatsNotifyDeliveryQueue( + NotifyDeliveryQueueOptions queueOptions, + NotifyNatsDeliveryQueueOptions options, + ILogger logger, + TimeProvider timeProvider, + Func>? connectionFactory = null) + { + _queueOptions = queueOptions ?? throw new ArgumentNullException(nameof(queueOptions)); + _options = options ?? throw new ArgumentNullException(nameof(options)); + _logger = logger ?? throw new ArgumentNullException(nameof(logger)); + _timeProvider = timeProvider ?? TimeProvider.System; + _connectionFactory = connectionFactory ?? ((opts, token) => new ValueTask(new NatsConnection(opts))); + + if (string.IsNullOrWhiteSpace(_options.Url)) + { + throw new InvalidOperationException("NATS connection URL must be configured for the Notify delivery queue."); + } + + if (string.IsNullOrWhiteSpace(_options.Stream) || string.IsNullOrWhiteSpace(_options.Subject)) + { + throw new InvalidOperationException("NATS stream and subject must be configured for the Notify delivery queue."); + } + } + + public async ValueTask PublishAsync( + NotifyDeliveryQueueMessage message, + CancellationToken cancellationToken = default) + { + ArgumentNullException.ThrowIfNull(message); + + var js = await GetJetStreamAsync(cancellationToken).ConfigureAwait(false); + await EnsureStreamAndConsumerAsync(js, cancellationToken).ConfigureAwait(false); + await EnsureDeadLetterStreamAsync(js, cancellationToken).ConfigureAwait(false); + + var payload = Encoding.UTF8.GetBytes(NotifyCanonicalJsonSerializer.Serialize(message.Delivery)); + var headers = 
BuildHeaders(message); + + var publishOpts = new NatsJSPubOpts + { + MsgId = message.IdempotencyKey, + RetryAttempts = 0 + }; + + var ack = await js.PublishAsync( + _options.Subject, + payload, + PayloadSerializer, + publishOpts, + headers, + cancellationToken) + .ConfigureAwait(false); + + if (ack.Duplicate) + { + NotifyQueueMetrics.RecordDeduplicated(TransportName, _options.Stream); + _logger.LogDebug( + "Duplicate Notify delivery enqueue detected for delivery {DeliveryId}.", + message.Delivery.DeliveryId); + + return new NotifyQueueEnqueueResult(ack.Seq.ToString(), true); + } + + NotifyQueueMetrics.RecordEnqueued(TransportName, _options.Stream); + _logger.LogDebug( + "Enqueued Notify delivery {DeliveryId} into NATS stream {Stream} (sequence {Sequence}).", + message.Delivery.DeliveryId, + ack.Stream, + ack.Seq); + + return new NotifyQueueEnqueueResult(ack.Seq.ToString(), false); + } + + public async ValueTask>> LeaseAsync( + NotifyQueueLeaseRequest request, + CancellationToken cancellationToken = default) + { + ArgumentNullException.ThrowIfNull(request); + + var js = await GetJetStreamAsync(cancellationToken).ConfigureAwait(false); + var consumer = await EnsureStreamAndConsumerAsync(js, cancellationToken).ConfigureAwait(false); + + var fetchOpts = new NatsJSFetchOpts + { + MaxMsgs = request.BatchSize, + Expires = request.LeaseDuration, + IdleHeartbeat = _options.IdleHeartbeat + }; + + var now = _timeProvider.GetUtcNow(); + var leases = new List>(request.BatchSize); + + await foreach (var msg in consumer.FetchAsync(PayloadSerializer, fetchOpts, cancellationToken).ConfigureAwait(false)) + { + var lease = CreateLease(msg, request.Consumer, now, request.LeaseDuration); + if (lease is null) + { + await msg.AckAsync(new AckOpts(), cancellationToken).ConfigureAwait(false); + continue; + } + + leases.Add(lease); + } + + return leases; + } + + public async ValueTask>> ClaimExpiredAsync( + NotifyQueueClaimOptions options, + CancellationToken cancellationToken = default) + 
{ + ArgumentNullException.ThrowIfNull(options); + + var js = await GetJetStreamAsync(cancellationToken).ConfigureAwait(false); + var consumer = await EnsureStreamAndConsumerAsync(js, cancellationToken).ConfigureAwait(false); + + var fetchOpts = new NatsJSFetchOpts + { + MaxMsgs = options.BatchSize, + Expires = options.MinIdleTime, + IdleHeartbeat = _options.IdleHeartbeat + }; + + var now = _timeProvider.GetUtcNow(); + var leases = new List>(options.BatchSize); + + await foreach (var msg in consumer.FetchAsync(PayloadSerializer, fetchOpts, cancellationToken).ConfigureAwait(false)) + { + var deliveries = (int)(msg.Metadata?.NumDelivered ?? 1); + if (deliveries <= 1) + { + await msg.NakAsync(new AckOpts(), TimeSpan.Zero, cancellationToken).ConfigureAwait(false); + continue; + } + + var lease = CreateLease(msg, options.ClaimantConsumer, now, _queueOptions.DefaultLeaseDuration); + if (lease is null) + { + await msg.AckAsync(new AckOpts(), cancellationToken).ConfigureAwait(false); + continue; + } + + leases.Add(lease); + } + + return leases; + } + + public async ValueTask DisposeAsync() + { + if (_disposed) + { + return; + } + + _disposed = true; + + if (_connection is not null) + { + await _connection.DisposeAsync().ConfigureAwait(false); + } + + _connectionGate.Dispose(); + GC.SuppressFinalize(this); + } + + internal async Task AcknowledgeAsync( + NatsNotifyDeliveryLease lease, + CancellationToken cancellationToken) + { + if (!lease.TryBeginCompletion()) + { + return; + } + + await lease.RawMessage.AckAsync(new AckOpts(), cancellationToken).ConfigureAwait(false); + NotifyQueueMetrics.RecordAck(TransportName, _options.Stream); + + _logger.LogDebug( + "Acknowledged Notify delivery {DeliveryId} (sequence {Sequence}).", + lease.Message.Delivery.DeliveryId, + lease.MessageId); + } + + internal async Task RenewLeaseAsync( + NatsNotifyDeliveryLease lease, + TimeSpan leaseDuration, + CancellationToken cancellationToken) + { + await lease.RawMessage.AckProgressAsync(new 
AckOpts(), cancellationToken).ConfigureAwait(false); + var expires = _timeProvider.GetUtcNow().Add(leaseDuration); + lease.RefreshLease(expires); + + _logger.LogDebug( + "Renewed NATS lease for Notify delivery {DeliveryId} until {Expires:u}.", + lease.Message.Delivery.DeliveryId, + expires); + } + + internal async Task ReleaseAsync( + NatsNotifyDeliveryLease lease, + NotifyQueueReleaseDisposition disposition, + CancellationToken cancellationToken) + { + if (disposition == NotifyQueueReleaseDisposition.Retry + && lease.Attempt >= _queueOptions.MaxDeliveryAttempts) + { + _logger.LogWarning( + "Notify delivery {DeliveryId} reached max delivery attempts ({Attempts}); moving to dead-letter stream.", + lease.Message.Delivery.DeliveryId, + lease.Attempt); + + await DeadLetterAsync( + lease, + $"max-delivery-attempts:{lease.Attempt}", + cancellationToken).ConfigureAwait(false); + return; + } + + if (!lease.TryBeginCompletion()) + { + return; + } + + if (disposition == NotifyQueueReleaseDisposition.Retry) + { + var delay = CalculateBackoff(lease.Attempt); + await lease.RawMessage.NakAsync(new AckOpts(), delay, cancellationToken).ConfigureAwait(false); + + NotifyQueueMetrics.RecordRetry(TransportName, _options.Stream); + _logger.LogInformation( + "Scheduled Notify delivery {DeliveryId} for retry with delay {Delay} (attempt {Attempt}).", + lease.Message.Delivery.DeliveryId, + delay, + lease.Attempt); + } + else + { + await lease.RawMessage.AckTerminateAsync(new AckOpts(), cancellationToken).ConfigureAwait(false); + NotifyQueueMetrics.RecordAck(TransportName, _options.Stream); + _logger.LogInformation( + "Abandoned Notify delivery {DeliveryId} after {Attempt} attempt(s).", + lease.Message.Delivery.DeliveryId, + lease.Attempt); + } + } + + internal async Task DeadLetterAsync( + NatsNotifyDeliveryLease lease, + string reason, + CancellationToken cancellationToken) + { + if (!lease.TryBeginCompletion()) + { + return; + } + + await lease.RawMessage.AckTerminateAsync(new AckOpts(), 
cancellationToken).ConfigureAwait(false); + + var js = await GetJetStreamAsync(cancellationToken).ConfigureAwait(false); + await EnsureDeadLetterStreamAsync(js, cancellationToken).ConfigureAwait(false); + + var payload = Encoding.UTF8.GetBytes(NotifyCanonicalJsonSerializer.Serialize(lease.Message.Delivery)); + var headers = BuildDeadLetterHeaders(lease, reason); + + await js.PublishAsync( + _options.DeadLetterSubject, + payload, + PayloadSerializer, + new NatsJSPubOpts(), + headers, + cancellationToken) + .ConfigureAwait(false); + + NotifyQueueMetrics.RecordDeadLetter(TransportName, _options.DeadLetterStream); + _logger.LogError( + "Dead-lettered Notify delivery {DeliveryId} (attempt {Attempt}): {Reason}", + lease.Message.Delivery.DeliveryId, + lease.Attempt, + reason); + } + + internal async Task PingAsync(CancellationToken cancellationToken) + { + var connection = await EnsureConnectionAsync(cancellationToken).ConfigureAwait(false); + await connection.PingAsync(cancellationToken).ConfigureAwait(false); + } + + private async Task GetJetStreamAsync(CancellationToken cancellationToken) + { + if (_jsContext is not null) + { + return _jsContext; + } + + var connection = await EnsureConnectionAsync(cancellationToken).ConfigureAwait(false); + + await _connectionGate.WaitAsync(cancellationToken).ConfigureAwait(false); + try + { + _jsContext ??= new NatsJSContext(connection); + return _jsContext; + } + finally + { + _connectionGate.Release(); + } + } + + private async ValueTask EnsureStreamAndConsumerAsync( + NatsJSContext js, + CancellationToken cancellationToken) + { + if (_consumer is not null) + { + return _consumer; + } + + await _connectionGate.WaitAsync(cancellationToken).ConfigureAwait(false); + try + { + if (_consumer is not null) + { + return _consumer; + } + + await EnsureStreamAsync(js, cancellationToken).ConfigureAwait(false); + await EnsureDeadLetterStreamAsync(js, cancellationToken).ConfigureAwait(false); + + var consumerConfig = new ConsumerConfig + { + 
DurableName = _options.DurableConsumer, + AckPolicy = ConsumerConfigAckPolicy.Explicit, + ReplayPolicy = ConsumerConfigReplayPolicy.Instant, + DeliverPolicy = ConsumerConfigDeliverPolicy.All, + AckWait = ToNanoseconds(_options.AckWait), + MaxAckPending = _options.MaxAckPending, + MaxDeliver = Math.Max(1, _queueOptions.MaxDeliveryAttempts), + FilterSubjects = new[] { _options.Subject } + }; + + try + { + _consumer = await js.CreateConsumerAsync( + _options.Stream, + consumerConfig, + cancellationToken) + .ConfigureAwait(false); + } + catch (NatsJSApiException apiEx) + { + _logger.LogDebug( + apiEx, + "CreateConsumerAsync failed with code {Code}; attempting to fetch existing durable consumer {Durable}.", + apiEx.Error?.Code, + _options.DurableConsumer); + + _consumer = await js.GetConsumerAsync( + _options.Stream, + _options.DurableConsumer, + cancellationToken) + .ConfigureAwait(false); + } + + return _consumer; + } + finally + { + _connectionGate.Release(); + } + } + + private async Task EnsureConnectionAsync(CancellationToken cancellationToken) + { + if (_connection is not null) + { + return _connection; + } + + await _connectionGate.WaitAsync(cancellationToken).ConfigureAwait(false); + try + { + if (_connection is not null) + { + return _connection; + } + + var opts = new NatsOpts + { + Url = _options.Url!, + Name = "stellaops-notify-delivery", + CommandTimeout = TimeSpan.FromSeconds(10), + RequestTimeout = TimeSpan.FromSeconds(20), + PingInterval = TimeSpan.FromSeconds(30) + }; + + _connection = await _connectionFactory(opts, cancellationToken).ConfigureAwait(false); + await _connection.ConnectAsync().ConfigureAwait(false); + return _connection; + } + finally + { + _connectionGate.Release(); + } + } + + private async Task EnsureStreamAsync(NatsJSContext js, CancellationToken cancellationToken) + { + try + { + await js.GetStreamAsync(_options.Stream, cancellationToken: cancellationToken).ConfigureAwait(false); + } + catch (NatsJSApiException ex) when 
(ex.Error?.Code == 404) + { + var config = new StreamConfig(name: _options.Stream, subjects: new[] { _options.Subject }) + { + Retention = StreamConfigRetention.Workqueue, + Storage = StreamConfigStorage.File, + MaxConsumers = -1, + MaxMsgs = -1, + MaxBytes = -1 + }; + + await js.CreateStreamAsync(config, cancellationToken).ConfigureAwait(false); + _logger.LogInformation("Created NATS Notify delivery stream {Stream} ({Subject}).", _options.Stream, _options.Subject); + } + } + + private async Task EnsureDeadLetterStreamAsync(NatsJSContext js, CancellationToken cancellationToken) + { + try + { + await js.GetStreamAsync(_options.DeadLetterStream, cancellationToken: cancellationToken).ConfigureAwait(false); + } + catch (NatsJSApiException ex) when (ex.Error?.Code == 404) + { + var config = new StreamConfig(name: _options.DeadLetterStream, subjects: new[] { _options.DeadLetterSubject }) + { + Retention = StreamConfigRetention.Workqueue, + Storage = StreamConfigStorage.File, + MaxConsumers = -1, + MaxMsgs = -1, + MaxBytes = -1 + }; + + await js.CreateStreamAsync(config, cancellationToken).ConfigureAwait(false); + _logger.LogInformation("Created NATS Notify delivery dead-letter stream {Stream} ({Subject}).", _options.DeadLetterStream, _options.DeadLetterSubject); + } + } + + private NatsNotifyDeliveryLease? CreateLease( + NatsJSMsg message, + string consumer, + DateTimeOffset now, + TimeSpan leaseDuration) + { + var payloadBytes = message.Data ?? Array.Empty(); + if (payloadBytes.Length == 0) + { + return null; + } + + NotifyDelivery delivery; + try + { + var json = Encoding.UTF8.GetString(payloadBytes); + delivery = NotifyCanonicalJsonSerializer.Deserialize(json); + } + catch (Exception ex) + { + _logger.LogWarning( + ex, + "Failed to deserialize Notify delivery payload for NATS message {Sequence}.", + message.Metadata?.Sequence.Stream); + return null; + } + + var headers = message.Headers ?? 
new NatsHeaders(); + + var deliveryId = TryGetHeader(headers, NotifyQueueFields.DeliveryId) ?? delivery.DeliveryId; + var channelId = TryGetHeader(headers, NotifyQueueFields.ChannelId); + var channelTypeRaw = TryGetHeader(headers, NotifyQueueFields.ChannelType); + if (channelId is null || channelTypeRaw is null) + { + return null; + } + + if (!Enum.TryParse(channelTypeRaw, ignoreCase: true, out var channelType)) + { + _logger.LogWarning("Unknown channel type '{ChannelType}' for delivery {DeliveryId}.", channelTypeRaw, deliveryId); + return null; + } + + var traceId = TryGetHeader(headers, NotifyQueueFields.TraceId); + var partitionKey = TryGetHeader(headers, NotifyQueueFields.PartitionKey) ?? channelId; + var idempotencyKey = TryGetHeader(headers, NotifyQueueFields.IdempotencyKey) ?? delivery.DeliveryId; + + var enqueuedAt = TryGetHeader(headers, NotifyQueueFields.EnqueuedAt) is { } enqueuedRaw + && long.TryParse(enqueuedRaw, NumberStyles.Integer, CultureInfo.InvariantCulture, out var unix) + ? DateTimeOffset.FromUnixTimeMilliseconds(unix) + : now; + + var attempt = TryGetHeader(headers, NotifyQueueFields.Attempt) is { } attemptRaw + && int.TryParse(attemptRaw, NumberStyles.Integer, CultureInfo.InvariantCulture, out var parsedAttempt) + ? parsedAttempt + : 1; + + if (message.Metadata?.NumDelivered is ulong delivered && delivered > 0) + { + var deliveredInt = delivered > int.MaxValue ? int.MaxValue : (int)delivered; + if (deliveredInt > attempt) + { + attempt = deliveredInt; + } + } + + var attributes = ExtractAttributes(headers); + var leaseExpires = now.Add(leaseDuration); + var messageId = message.Metadata?.Sequence.Stream.ToString() ?? 
Guid.NewGuid().ToString("n"); + + var queueMessage = new NotifyDeliveryQueueMessage( + delivery, + channelId, + channelType, + _options.Subject, + traceId, + attributes); + + return new NatsNotifyDeliveryLease( + this, + message, + messageId, + queueMessage, + attempt, + consumer, + enqueuedAt, + leaseExpires, + idempotencyKey); + } + + private NatsHeaders BuildHeaders(NotifyDeliveryQueueMessage message) + { + var headers = new NatsHeaders + { + { NotifyQueueFields.DeliveryId, message.Delivery.DeliveryId }, + { NotifyQueueFields.ChannelId, message.ChannelId }, + { NotifyQueueFields.ChannelType, message.ChannelType.ToString() }, + { NotifyQueueFields.Tenant, message.Delivery.TenantId }, + { NotifyQueueFields.Attempt, "1" }, + { NotifyQueueFields.EnqueuedAt, _timeProvider.GetUtcNow().ToUnixTimeMilliseconds().ToString(CultureInfo.InvariantCulture) }, + { NotifyQueueFields.IdempotencyKey, message.IdempotencyKey }, + { NotifyQueueFields.PartitionKey, message.PartitionKey } + }; + + if (!string.IsNullOrWhiteSpace(message.TraceId)) + { + headers.Add(NotifyQueueFields.TraceId, message.TraceId!); + } + + foreach (var kvp in message.Attributes) + { + headers.Add(NotifyQueueFields.AttributePrefix + kvp.Key, kvp.Value); + } + + return headers; + } + + private NatsHeaders BuildDeadLetterHeaders(NatsNotifyDeliveryLease lease, string reason) + { + var headers = new NatsHeaders + { + { NotifyQueueFields.DeliveryId, lease.Message.Delivery.DeliveryId }, + { NotifyQueueFields.ChannelId, lease.Message.ChannelId }, + { NotifyQueueFields.ChannelType, lease.Message.ChannelType.ToString() }, + { NotifyQueueFields.Tenant, lease.Message.Delivery.TenantId }, + { NotifyQueueFields.Attempt, lease.Attempt.ToString(CultureInfo.InvariantCulture) }, + { NotifyQueueFields.IdempotencyKey, lease.Message.IdempotencyKey }, + { "deadletter-reason", reason } + }; + + if (!string.IsNullOrWhiteSpace(lease.Message.TraceId)) + { + headers.Add(NotifyQueueFields.TraceId, lease.Message.TraceId!); + } + + 
foreach (var kvp in lease.Message.Attributes) + { + headers.Add(NotifyQueueFields.AttributePrefix + kvp.Key, kvp.Value); + } + + return headers; + } + + private static string? TryGetHeader(NatsHeaders headers, string key) + { + if (headers.TryGetValue(key, out var values) && values.Count > 0) + { + var value = values[0]; + return string.IsNullOrWhiteSpace(value) ? null : value; + } + + return null; + } + + private static IReadOnlyDictionary ExtractAttributes(NatsHeaders headers) + { + var attributes = new Dictionary(StringComparer.Ordinal); + + foreach (var key in headers.Keys) + { + if (!key.StartsWith(NotifyQueueFields.AttributePrefix, StringComparison.Ordinal)) + { + continue; + } + + if (headers.TryGetValue(key, out var values) && values.Count > 0) + { + attributes[key[NotifyQueueFields.AttributePrefix.Length..]] = values[0]!; + } + } + + return attributes.Count == 0 + ? EmptyReadOnlyDictionary.Instance + : new ReadOnlyDictionary(attributes); + } + + private TimeSpan CalculateBackoff(int attempt) + { + var initial = _queueOptions.RetryInitialBackoff > TimeSpan.Zero + ? _queueOptions.RetryInitialBackoff + : _options.RetryDelay; + + if (initial <= TimeSpan.Zero) + { + return TimeSpan.Zero; + } + + if (attempt <= 1) + { + return initial; + } + + var max = _queueOptions.RetryMaxBackoff > TimeSpan.Zero + ? _queueOptions.RetryMaxBackoff + : initial; + + var exponent = attempt - 1; + var scaledTicks = initial.Ticks * Math.Pow(2, exponent - 1); + var cappedTicks = Math.Min(max.Ticks, scaledTicks); + var resultTicks = Math.Max(initial.Ticks, (long)cappedTicks); + return TimeSpan.FromTicks(resultTicks); + } + + private static long ToNanoseconds(TimeSpan value) + => value <= TimeSpan.Zero ? 
0 : value.Ticks * 100L; + + private static class EmptyReadOnlyDictionary + where TKey : notnull + { + public static readonly IReadOnlyDictionary Instance = + new ReadOnlyDictionary(new Dictionary(0, EqualityComparer.Default)); + } +} diff --git a/src/StellaOps.Notify.Queue/Nats/NatsNotifyEventLease.cs b/src/StellaOps.Notify.Queue/Nats/NatsNotifyEventLease.cs new file mode 100644 index 00000000..53458b14 --- /dev/null +++ b/src/StellaOps.Notify.Queue/Nats/NatsNotifyEventLease.cs @@ -0,0 +1,83 @@ +using System; +using System.Collections.Generic; +using System.Threading; +using System.Threading.Tasks; +using NATS.Client.JetStream; + +namespace StellaOps.Notify.Queue.Nats; + +internal sealed class NatsNotifyEventLease : INotifyQueueLease +{ + private readonly NatsNotifyEventQueue _queue; + private readonly NatsJSMsg _message; + private int _completed; + + internal NatsNotifyEventLease( + NatsNotifyEventQueue queue, + NatsJSMsg message, + string messageId, + NotifyQueueEventMessage payload, + int attempt, + string consumer, + DateTimeOffset enqueuedAt, + DateTimeOffset leaseExpiresAt) + { + _queue = queue ?? throw new ArgumentNullException(nameof(queue)); + if (EqualityComparer>.Default.Equals(message, default)) + { + throw new ArgumentException("Message must be provided.", nameof(message)); + } + + _message = message; + MessageId = messageId ?? throw new ArgumentNullException(nameof(messageId)); + Message = payload ?? throw new ArgumentNullException(nameof(payload)); + Attempt = attempt; + Consumer = consumer ?? throw new ArgumentNullException(nameof(consumer)); + EnqueuedAt = enqueuedAt; + LeaseExpiresAt = leaseExpiresAt; + } + + public string MessageId { get; } + + public int Attempt { get; internal set; } + + public DateTimeOffset EnqueuedAt { get; } + + public DateTimeOffset LeaseExpiresAt { get; private set; } + + public string Consumer { get; } + + public string Stream => Message.Stream; + + public string TenantId => Message.TenantId; + + public string? 
PartitionKey => Message.PartitionKey; + + public string IdempotencyKey => Message.IdempotencyKey; + + public string? TraceId => Message.TraceId; + + public IReadOnlyDictionary Attributes => Message.Attributes; + + public NotifyQueueEventMessage Message { get; } + + internal NatsJSMsg RawMessage => _message; + + public Task AcknowledgeAsync(CancellationToken cancellationToken = default) + => _queue.AcknowledgeAsync(this, cancellationToken); + + public Task RenewAsync(TimeSpan leaseDuration, CancellationToken cancellationToken = default) + => _queue.RenewLeaseAsync(this, leaseDuration, cancellationToken); + + public Task ReleaseAsync(NotifyQueueReleaseDisposition disposition, CancellationToken cancellationToken = default) + => _queue.ReleaseAsync(this, disposition, cancellationToken); + + public Task DeadLetterAsync(string reason, CancellationToken cancellationToken = default) + => _queue.DeadLetterAsync(this, reason, cancellationToken); + + internal bool TryBeginCompletion() + => Interlocked.CompareExchange(ref _completed, 1, 0) == 0; + + internal void RefreshLease(DateTimeOffset expiresAt) + => LeaseExpiresAt = expiresAt; +} diff --git a/src/StellaOps.Notify.Queue/Nats/NatsNotifyEventQueue.cs b/src/StellaOps.Notify.Queue/Nats/NatsNotifyEventQueue.cs new file mode 100644 index 00000000..023583b3 --- /dev/null +++ b/src/StellaOps.Notify.Queue/Nats/NatsNotifyEventQueue.cs @@ -0,0 +1,698 @@ +using System; +using System.Collections.Concurrent; +using System.Collections.Generic; +using System.Collections.ObjectModel; +using System.Globalization; +using System.Linq; +using System.Text; +using System.Threading; +using System.Threading.Tasks; +using Microsoft.Extensions.Logging; +using NATS.Client.Core; +using NATS.Client.JetStream; +using NATS.Client.JetStream.Models; +using StellaOps.Notify.Models; + +namespace StellaOps.Notify.Queue.Nats; + +internal sealed class NatsNotifyEventQueue : INotifyEventQueue, IAsyncDisposable +{ + private const string TransportName = "nats"; 
+ + private static readonly INatsSerializer PayloadSerializer = NatsRawSerializer.Default; + + private readonly NotifyEventQueueOptions _queueOptions; + private readonly NotifyNatsEventQueueOptions _options; + private readonly ILogger _logger; + private readonly TimeProvider _timeProvider; + private readonly SemaphoreSlim _connectionGate = new(1, 1); + private readonly Func> _connectionFactory; + + private NatsConnection? _connection; + private NatsJSContext? _jsContext; + private INatsJSConsumer? _consumer; + private bool _disposed; + + public NatsNotifyEventQueue( + NotifyEventQueueOptions queueOptions, + NotifyNatsEventQueueOptions options, + ILogger logger, + TimeProvider timeProvider, + Func>? connectionFactory = null) + { + _queueOptions = queueOptions ?? throw new ArgumentNullException(nameof(queueOptions)); + _options = options ?? throw new ArgumentNullException(nameof(options)); + _logger = logger ?? throw new ArgumentNullException(nameof(logger)); + _timeProvider = timeProvider ?? TimeProvider.System; + _connectionFactory = connectionFactory ?? 
((opts, cancellationToken) => new ValueTask(new NatsConnection(opts))); + + if (string.IsNullOrWhiteSpace(_options.Url)) + { + throw new InvalidOperationException("NATS connection URL must be configured for the Notify event queue."); + } + + if (string.IsNullOrWhiteSpace(_options.Stream) || string.IsNullOrWhiteSpace(_options.Subject)) + { + throw new InvalidOperationException("NATS stream and subject must be configured for the Notify event queue."); + } + } + + public async ValueTask PublishAsync( + NotifyQueueEventMessage message, + CancellationToken cancellationToken = default) + { + ArgumentNullException.ThrowIfNull(message); + + var js = await GetJetStreamAsync(cancellationToken).ConfigureAwait(false); + await EnsureStreamAndConsumerAsync(js, cancellationToken).ConfigureAwait(false); + await EnsureDeadLetterStreamAsync(js, cancellationToken).ConfigureAwait(false); + + var idempotencyKey = string.IsNullOrWhiteSpace(message.IdempotencyKey) + ? message.Event.EventId.ToString("N") + : message.IdempotencyKey; + + var payload = Encoding.UTF8.GetBytes(NotifyCanonicalJsonSerializer.Serialize(message.Event)); + var headers = BuildHeaders(message, idempotencyKey); + + var publishOpts = new NatsJSPubOpts + { + MsgId = idempotencyKey, + RetryAttempts = 0 + }; + + var ack = await js.PublishAsync( + _options.Subject, + payload, + PayloadSerializer, + publishOpts, + headers, + cancellationToken) + .ConfigureAwait(false); + + if (ack.Duplicate) + { + _logger.LogDebug( + "Duplicate Notify event enqueue detected for idempotency token {Token}.", + idempotencyKey); + + NotifyQueueMetrics.RecordDeduplicated(TransportName, _options.Stream); + return new NotifyQueueEnqueueResult(ack.Seq.ToString(), true); + } + + NotifyQueueMetrics.RecordEnqueued(TransportName, _options.Stream); + _logger.LogDebug( + "Enqueued Notify event {EventId} into NATS stream {Stream} (sequence {Sequence}).", + message.Event.EventId, + ack.Stream, + ack.Seq); + + return new 
NotifyQueueEnqueueResult(ack.Seq.ToString(), false); + } + + public async ValueTask>> LeaseAsync( + NotifyQueueLeaseRequest request, + CancellationToken cancellationToken = default) + { + ArgumentNullException.ThrowIfNull(request); + + var js = await GetJetStreamAsync(cancellationToken).ConfigureAwait(false); + var consumer = await EnsureStreamAndConsumerAsync(js, cancellationToken).ConfigureAwait(false); + + var fetchOpts = new NatsJSFetchOpts + { + MaxMsgs = request.BatchSize, + Expires = request.LeaseDuration, + IdleHeartbeat = _options.IdleHeartbeat + }; + + var now = _timeProvider.GetUtcNow(); + var leases = new List>(request.BatchSize); + + await foreach (var msg in consumer.FetchAsync(PayloadSerializer, fetchOpts, cancellationToken).ConfigureAwait(false)) + { + var lease = CreateLease(msg, request.Consumer, now, request.LeaseDuration); + if (lease is null) + { + await msg.AckAsync(new AckOpts(), cancellationToken).ConfigureAwait(false); + continue; + } + + leases.Add(lease); + } + + return leases; + } + + public async ValueTask>> ClaimExpiredAsync( + NotifyQueueClaimOptions options, + CancellationToken cancellationToken = default) + { + ArgumentNullException.ThrowIfNull(options); + + var js = await GetJetStreamAsync(cancellationToken).ConfigureAwait(false); + var consumer = await EnsureStreamAndConsumerAsync(js, cancellationToken).ConfigureAwait(false); + + var fetchOpts = new NatsJSFetchOpts + { + MaxMsgs = options.BatchSize, + Expires = options.MinIdleTime, + IdleHeartbeat = _options.IdleHeartbeat + }; + + var now = _timeProvider.GetUtcNow(); + var leases = new List>(options.BatchSize); + + await foreach (var msg in consumer.FetchAsync(PayloadSerializer, fetchOpts, cancellationToken).ConfigureAwait(false)) + { + var deliveries = (int)(msg.Metadata?.NumDelivered ?? 
1); + if (deliveries <= 1) + { + await msg.NakAsync(new AckOpts(), TimeSpan.Zero, cancellationToken).ConfigureAwait(false); + continue; + } + + var lease = CreateLease(msg, options.ClaimantConsumer, now, _queueOptions.DefaultLeaseDuration); + if (lease is null) + { + await msg.AckAsync(new AckOpts(), cancellationToken).ConfigureAwait(false); + continue; + } + + leases.Add(lease); + } + + return leases; + } + + public async ValueTask DisposeAsync() + { + if (_disposed) + { + return; + } + + _disposed = true; + + if (_connection is not null) + { + await _connection.DisposeAsync().ConfigureAwait(false); + } + + _connectionGate.Dispose(); + GC.SuppressFinalize(this); + } + + internal async Task AcknowledgeAsync( + NatsNotifyEventLease lease, + CancellationToken cancellationToken) + { + if (!lease.TryBeginCompletion()) + { + return; + } + + await lease.RawMessage.AckAsync(new AckOpts(), cancellationToken).ConfigureAwait(false); + NotifyQueueMetrics.RecordAck(TransportName, _options.Stream); + + _logger.LogDebug( + "Acknowledged Notify event {EventId} (sequence {Sequence}).", + lease.Message.Event.EventId, + lease.MessageId); + } + + internal async Task RenewLeaseAsync( + NatsNotifyEventLease lease, + TimeSpan leaseDuration, + CancellationToken cancellationToken) + { + await lease.RawMessage.AckProgressAsync(new AckOpts(), cancellationToken).ConfigureAwait(false); + + var expires = _timeProvider.GetUtcNow().Add(leaseDuration); + lease.RefreshLease(expires); + + _logger.LogDebug( + "Renewed NATS lease for Notify event {EventId} until {Expires:u}.", + lease.Message.Event.EventId, + expires); + } + + internal async Task ReleaseAsync( + NatsNotifyEventLease lease, + NotifyQueueReleaseDisposition disposition, + CancellationToken cancellationToken) + { + if (disposition == NotifyQueueReleaseDisposition.Retry + && lease.Attempt >= _queueOptions.MaxDeliveryAttempts) + { + _logger.LogWarning( + "Notify event {EventId} reached max delivery attempts ({Attempts}); moving to 
dead-letter stream.", + lease.Message.Event.EventId, + lease.Attempt); + + await DeadLetterAsync( + lease, + $"max-delivery-attempts:{lease.Attempt}", + cancellationToken).ConfigureAwait(false); + return; + } + + if (!lease.TryBeginCompletion()) + { + return; + } + + if (disposition == NotifyQueueReleaseDisposition.Retry) + { + var delay = CalculateBackoff(lease.Attempt); + await lease.RawMessage.NakAsync(new AckOpts(), delay, cancellationToken).ConfigureAwait(false); + + NotifyQueueMetrics.RecordRetry(TransportName, _options.Stream); + + _logger.LogInformation( + "Scheduled Notify event {EventId} for retry with delay {Delay} (attempt {Attempt}).", + lease.Message.Event.EventId, + delay, + lease.Attempt); + } + else + { + await lease.RawMessage.AckTerminateAsync(new AckOpts(), cancellationToken).ConfigureAwait(false); + NotifyQueueMetrics.RecordAck(TransportName, _options.Stream); + + _logger.LogInformation( + "Abandoned Notify event {EventId} after {Attempt} attempt(s).", + lease.Message.Event.EventId, + lease.Attempt); + } + } + + internal async Task DeadLetterAsync( + NatsNotifyEventLease lease, + string reason, + CancellationToken cancellationToken) + { + if (!lease.TryBeginCompletion()) + { + return; + } + + await lease.RawMessage.AckTerminateAsync(new AckOpts(), cancellationToken).ConfigureAwait(false); + + var js = await GetJetStreamAsync(cancellationToken).ConfigureAwait(false); + await EnsureDeadLetterStreamAsync(js, cancellationToken).ConfigureAwait(false); + + var headers = BuildDeadLetterHeaders(lease, reason); + var payload = Encoding.UTF8.GetBytes(NotifyCanonicalJsonSerializer.Serialize(lease.Message.Event)); + + await js.PublishAsync( + _options.DeadLetterSubject, + payload, + PayloadSerializer, + new NatsJSPubOpts(), + headers, + cancellationToken) + .ConfigureAwait(false); + + NotifyQueueMetrics.RecordDeadLetter(TransportName, _options.DeadLetterStream); + + _logger.LogError( + "Dead-lettered Notify event {EventId} (attempt {Attempt}): {Reason}", + 
lease.Message.Event.EventId, + lease.Attempt, + reason); + } + + internal async Task PingAsync(CancellationToken cancellationToken) + { + var connection = await EnsureConnectionAsync(cancellationToken).ConfigureAwait(false); + await connection.PingAsync(cancellationToken).ConfigureAwait(false); + } + + private async Task GetJetStreamAsync(CancellationToken cancellationToken) + { + if (_jsContext is not null) + { + return _jsContext; + } + + var connection = await EnsureConnectionAsync(cancellationToken).ConfigureAwait(false); + + await _connectionGate.WaitAsync(cancellationToken).ConfigureAwait(false); + try + { + _jsContext ??= new NatsJSContext(connection); + return _jsContext; + } + finally + { + _connectionGate.Release(); + } + } + + private async ValueTask EnsureStreamAndConsumerAsync( + NatsJSContext js, + CancellationToken cancellationToken) + { + if (_consumer is not null) + { + return _consumer; + } + + await _connectionGate.WaitAsync(cancellationToken).ConfigureAwait(false); + try + { + if (_consumer is not null) + { + return _consumer; + } + + await EnsureStreamAsync(js, cancellationToken).ConfigureAwait(false); + await EnsureDeadLetterStreamAsync(js, cancellationToken).ConfigureAwait(false); + + var consumerConfig = new ConsumerConfig + { + DurableName = _options.DurableConsumer, + AckPolicy = ConsumerConfigAckPolicy.Explicit, + ReplayPolicy = ConsumerConfigReplayPolicy.Instant, + DeliverPolicy = ConsumerConfigDeliverPolicy.All, + AckWait = ToNanoseconds(_options.AckWait), + MaxAckPending = _options.MaxAckPending, + MaxDeliver = Math.Max(1, _queueOptions.MaxDeliveryAttempts), + FilterSubjects = new[] { _options.Subject } + }; + + try + { + _consumer = await js.CreateConsumerAsync( + _options.Stream, + consumerConfig, + cancellationToken) + .ConfigureAwait(false); + } + catch (NatsJSApiException apiEx) + { + _logger.LogDebug( + apiEx, + "CreateConsumerAsync failed with code {Code}; attempting to fetch existing durable consumer {Durable}.", + 
apiEx.Error?.Code, + _options.DurableConsumer); + + _consumer = await js.GetConsumerAsync( + _options.Stream, + _options.DurableConsumer, + cancellationToken) + .ConfigureAwait(false); + } + + return _consumer; + } + finally + { + _connectionGate.Release(); + } + } + + private async Task EnsureConnectionAsync(CancellationToken cancellationToken) + { + if (_connection is not null) + { + return _connection; + } + + await _connectionGate.WaitAsync(cancellationToken).ConfigureAwait(false); + try + { + if (_connection is not null) + { + return _connection; + } + + var opts = new NatsOpts + { + Url = _options.Url!, + Name = "stellaops-notify-queue", + CommandTimeout = TimeSpan.FromSeconds(10), + RequestTimeout = TimeSpan.FromSeconds(20), + PingInterval = TimeSpan.FromSeconds(30) + }; + + _connection = await _connectionFactory(opts, cancellationToken).ConfigureAwait(false); + await _connection.ConnectAsync().ConfigureAwait(false); + return _connection; + } + finally + { + _connectionGate.Release(); + } + } + + private async Task EnsureStreamAsync(NatsJSContext js, CancellationToken cancellationToken) + { + try + { + await js.GetStreamAsync(_options.Stream, cancellationToken: cancellationToken).ConfigureAwait(false); + } + catch (NatsJSApiException ex) when (ex.Error?.Code == 404) + { + var config = new StreamConfig(name: _options.Stream, subjects: new[] { _options.Subject }) + { + Retention = StreamConfigRetention.Workqueue, + Storage = StreamConfigStorage.File, + MaxConsumers = -1, + MaxMsgs = -1, + MaxBytes = -1 + }; + + await js.CreateStreamAsync(config, cancellationToken).ConfigureAwait(false); + _logger.LogInformation("Created NATS Notify stream {Stream} ({Subject}).", _options.Stream, _options.Subject); + } + } + + private async Task EnsureDeadLetterStreamAsync(NatsJSContext js, CancellationToken cancellationToken) + { + try + { + await js.GetStreamAsync(_options.DeadLetterStream, cancellationToken: cancellationToken).ConfigureAwait(false); + } + catch 
(NatsJSApiException ex) when (ex.Error?.Code == 404) + { + var config = new StreamConfig(name: _options.DeadLetterStream, subjects: new[] { _options.DeadLetterSubject }) + { + Retention = StreamConfigRetention.Workqueue, + Storage = StreamConfigStorage.File, + MaxConsumers = -1, + MaxMsgs = -1, + MaxBytes = -1 + }; + + await js.CreateStreamAsync(config, cancellationToken).ConfigureAwait(false); + _logger.LogInformation("Created NATS Notify dead-letter stream {Stream} ({Subject}).", _options.DeadLetterStream, _options.DeadLetterSubject); + } + } + + private NatsNotifyEventLease? CreateLease( + NatsJSMsg message, + string consumer, + DateTimeOffset now, + TimeSpan leaseDuration) + { + var payloadBytes = message.Data ?? Array.Empty(); + if (payloadBytes.Length == 0) + { + return null; + } + + NotifyEvent notifyEvent; + try + { + var json = Encoding.UTF8.GetString(payloadBytes); + notifyEvent = NotifyCanonicalJsonSerializer.Deserialize(json); + } + catch (Exception ex) + { + _logger.LogWarning( + ex, + "Failed to deserialize Notify event payload for NATS message {Sequence}.", + message.Metadata?.Sequence.Stream); + return null; + } + + var headers = message.Headers ?? new NatsHeaders(); + + var idempotencyKey = TryGetHeader(headers, NotifyQueueFields.IdempotencyKey) + ?? notifyEvent.EventId.ToString("N"); + + var partitionKey = TryGetHeader(headers, NotifyQueueFields.PartitionKey); + var traceId = TryGetHeader(headers, NotifyQueueFields.TraceId); + var enqueuedAt = TryGetHeader(headers, NotifyQueueFields.EnqueuedAt) is { } enqueuedRaw + && long.TryParse(enqueuedRaw, NumberStyles.Integer, CultureInfo.InvariantCulture, out var unix) + ? DateTimeOffset.FromUnixTimeMilliseconds(unix) + : now; + + var attempt = TryGetHeader(headers, NotifyQueueFields.Attempt) is { } attemptRaw + && int.TryParse(attemptRaw, NumberStyles.Integer, CultureInfo.InvariantCulture, out var parsedAttempt) + ? 
parsedAttempt + : 1; + + if (message.Metadata?.NumDelivered is ulong delivered && delivered > 0) + { + var deliveredInt = delivered > int.MaxValue ? int.MaxValue : (int)delivered; + if (deliveredInt > attempt) + { + attempt = deliveredInt; + } + } + + var attributes = ExtractAttributes(headers); + var leaseExpires = now.Add(leaseDuration); + var messageId = message.Metadata?.Sequence.Stream.ToString() ?? Guid.NewGuid().ToString("n"); + + var queueMessage = new NotifyQueueEventMessage( + notifyEvent, + _options.Subject, + idempotencyKey, + partitionKey, + traceId, + attributes); + + return new NatsNotifyEventLease( + this, + message, + messageId, + queueMessage, + attempt, + consumer, + enqueuedAt, + leaseExpires); + } + + private NatsHeaders BuildHeaders(NotifyQueueEventMessage message, string idempotencyKey) + { + var headers = new NatsHeaders + { + { NotifyQueueFields.EventId, message.Event.EventId.ToString("D") }, + { NotifyQueueFields.Tenant, message.TenantId }, + { NotifyQueueFields.Kind, message.Event.Kind }, + { NotifyQueueFields.Attempt, "1" }, + { NotifyQueueFields.EnqueuedAt, _timeProvider.GetUtcNow().ToUnixTimeMilliseconds().ToString(CultureInfo.InvariantCulture) }, + { NotifyQueueFields.IdempotencyKey, idempotencyKey } + }; + + if (!string.IsNullOrWhiteSpace(message.TraceId)) + { + headers.Add(NotifyQueueFields.TraceId, message.TraceId!); + } + + if (!string.IsNullOrWhiteSpace(message.PartitionKey)) + { + headers.Add(NotifyQueueFields.PartitionKey, message.PartitionKey!); + } + + foreach (var kvp in message.Attributes) + { + headers.Add(NotifyQueueFields.AttributePrefix + kvp.Key, kvp.Value); + } + + return headers; + } + + private NatsHeaders BuildDeadLetterHeaders(NatsNotifyEventLease lease, string reason) + { + var headers = new NatsHeaders + { + { NotifyQueueFields.EventId, lease.Message.Event.EventId.ToString("D") }, + { NotifyQueueFields.Tenant, lease.Message.TenantId }, + { NotifyQueueFields.Kind, lease.Message.Event.Kind }, + { 
NotifyQueueFields.Attempt, lease.Attempt.ToString(CultureInfo.InvariantCulture) }, + { NotifyQueueFields.IdempotencyKey, lease.Message.IdempotencyKey }, + { "deadletter-reason", reason } + }; + + if (!string.IsNullOrWhiteSpace(lease.Message.TraceId)) + { + headers.Add(NotifyQueueFields.TraceId, lease.Message.TraceId!); + } + + if (!string.IsNullOrWhiteSpace(lease.Message.PartitionKey)) + { + headers.Add(NotifyQueueFields.PartitionKey, lease.Message.PartitionKey!); + } + + foreach (var kvp in lease.Message.Attributes) + { + headers.Add(NotifyQueueFields.AttributePrefix + kvp.Key, kvp.Value); + } + + return headers; + } + + private static string? TryGetHeader(NatsHeaders headers, string key) + { + if (headers.TryGetValue(key, out var values) && values.Count > 0) + { + var value = values[0]; + return string.IsNullOrWhiteSpace(value) ? null : value; + } + + return null; + } + + private static IReadOnlyDictionary ExtractAttributes(NatsHeaders headers) + { + var attributes = new Dictionary(StringComparer.Ordinal); + + foreach (var key in headers.Keys) + { + if (!key.StartsWith(NotifyQueueFields.AttributePrefix, StringComparison.Ordinal)) + { + continue; + } + + if (headers.TryGetValue(key, out var values) && values.Count > 0) + { + attributes[key[NotifyQueueFields.AttributePrefix.Length..]] = values[0]!; + } + } + + return attributes.Count == 0 + ? EmptyReadOnlyDictionary.Instance + : new ReadOnlyDictionary(attributes); + } + + private TimeSpan CalculateBackoff(int attempt) + { + var initial = _queueOptions.RetryInitialBackoff > TimeSpan.Zero + ? _queueOptions.RetryInitialBackoff + : _options.RetryDelay; + + if (initial <= TimeSpan.Zero) + { + return TimeSpan.Zero; + } + + if (attempt <= 1) + { + return initial; + } + + var max = _queueOptions.RetryMaxBackoff > TimeSpan.Zero + ? 
_queueOptions.RetryMaxBackoff + : initial; + + var exponent = attempt - 1; + var scaledTicks = initial.Ticks * Math.Pow(2, exponent - 1); + var cappedTicks = Math.Min(max.Ticks, scaledTicks); + var resultTicks = Math.Max(initial.Ticks, (long)cappedTicks); + return TimeSpan.FromTicks(resultTicks); + } + + private static long ToNanoseconds(TimeSpan value) + => value <= TimeSpan.Zero ? 0 : value.Ticks * 100L; + + private static class EmptyReadOnlyDictionary + where TKey : notnull + { + public static readonly IReadOnlyDictionary Instance = + new ReadOnlyDictionary(new Dictionary(0, EqualityComparer.Default)); + } +} diff --git a/src/StellaOps.Notify.Queue/NotifyDeliveryQueueHealthCheck.cs b/src/StellaOps.Notify.Queue/NotifyDeliveryQueueHealthCheck.cs new file mode 100644 index 00000000..0b41279e --- /dev/null +++ b/src/StellaOps.Notify.Queue/NotifyDeliveryQueueHealthCheck.cs @@ -0,0 +1,55 @@ +using System; +using System.Threading; +using System.Threading.Tasks; +using Microsoft.Extensions.Diagnostics.HealthChecks; +using Microsoft.Extensions.Logging; +using StellaOps.Notify.Queue.Nats; +using StellaOps.Notify.Queue.Redis; + +namespace StellaOps.Notify.Queue; + +public sealed class NotifyDeliveryQueueHealthCheck : IHealthCheck +{ + private readonly INotifyDeliveryQueue _queue; + private readonly ILogger _logger; + + public NotifyDeliveryQueueHealthCheck( + INotifyDeliveryQueue queue, + ILogger logger) + { + _queue = queue ?? throw new ArgumentNullException(nameof(queue)); + _logger = logger ?? 
throw new ArgumentNullException(nameof(logger)); + } + + public async Task CheckHealthAsync( + HealthCheckContext context, + CancellationToken cancellationToken = default) + { + cancellationToken.ThrowIfCancellationRequested(); + + try + { + switch (_queue) + { + case RedisNotifyDeliveryQueue redisQueue: + await redisQueue.PingAsync(cancellationToken).ConfigureAwait(false); + return HealthCheckResult.Healthy("Redis Notify delivery queue reachable."); + + case NatsNotifyDeliveryQueue natsQueue: + await natsQueue.PingAsync(cancellationToken).ConfigureAwait(false); + return HealthCheckResult.Healthy("NATS Notify delivery queue reachable."); + + default: + return HealthCheckResult.Healthy("Notify delivery queue transport without dedicated ping returned healthy."); + } + } + catch (Exception ex) + { + _logger.LogError(ex, "Notify delivery queue health check failed."); + return new HealthCheckResult( + context.Registration.FailureStatus, + "Notify delivery queue transport unreachable.", + ex); + } + } +} diff --git a/src/StellaOps.Notify.Queue/NotifyDeliveryQueueOptions.cs b/src/StellaOps.Notify.Queue/NotifyDeliveryQueueOptions.cs new file mode 100644 index 00000000..dfe7554a --- /dev/null +++ b/src/StellaOps.Notify.Queue/NotifyDeliveryQueueOptions.cs @@ -0,0 +1,69 @@ +using System; + +namespace StellaOps.Notify.Queue; + +/// +/// Configuration options for the Notify delivery queue abstraction. 
+/// +public sealed class NotifyDeliveryQueueOptions +{ + public NotifyQueueTransportKind Transport { get; set; } = NotifyQueueTransportKind.Redis; + + public NotifyRedisDeliveryQueueOptions Redis { get; set; } = new(); + + public NotifyNatsDeliveryQueueOptions Nats { get; set; } = new(); + + public TimeSpan DefaultLeaseDuration { get; set; } = TimeSpan.FromMinutes(5); + + public int MaxDeliveryAttempts { get; set; } = 5; + + public TimeSpan RetryInitialBackoff { get; set; } = TimeSpan.FromSeconds(5); + + public TimeSpan RetryMaxBackoff { get; set; } = TimeSpan.FromMinutes(2); + + public TimeSpan ClaimIdleThreshold { get; set; } = TimeSpan.FromMinutes(5); +} + +public sealed class NotifyRedisDeliveryQueueOptions +{ + public string? ConnectionString { get; set; } + + public int? Database { get; set; } + + public TimeSpan InitializationTimeout { get; set; } = TimeSpan.FromSeconds(30); + + public string StreamName { get; set; } = "notify:deliveries"; + + public string ConsumerGroup { get; set; } = "notify-deliveries"; + + public string IdempotencyKeyPrefix { get; set; } = "notify:deliveries:idemp:"; + + public int? ApproximateMaxLength { get; set; } + + public string DeadLetterStreamName { get; set; } = "notify:deliveries:dead"; + + public TimeSpan DeadLetterRetention { get; set; } = TimeSpan.FromDays(7); +} + +public sealed class NotifyNatsDeliveryQueueOptions +{ + public string? 
Url { get; set; } + + public string Stream { get; set; } = "NOTIFY_DELIVERIES"; + + public string Subject { get; set; } = "notify.deliveries"; + + public string DurableConsumer { get; set; } = "notify-deliveries"; + + public string DeadLetterStream { get; set; } = "NOTIFY_DELIVERIES_DEAD"; + + public string DeadLetterSubject { get; set; } = "notify.deliveries.dead"; + + public int MaxAckPending { get; set; } = 128; + + public TimeSpan AckWait { get; set; } = TimeSpan.FromMinutes(5); + + public TimeSpan RetryDelay { get; set; } = TimeSpan.FromSeconds(10); + + public TimeSpan IdleHeartbeat { get; set; } = TimeSpan.FromSeconds(30); +} diff --git a/src/StellaOps.Notify.Queue/NotifyEventQueueOptions.cs b/src/StellaOps.Notify.Queue/NotifyEventQueueOptions.cs new file mode 100644 index 00000000..946f3576 --- /dev/null +++ b/src/StellaOps.Notify.Queue/NotifyEventQueueOptions.cs @@ -0,0 +1,177 @@ +using System; +using System.Collections.Generic; + +namespace StellaOps.Notify.Queue; + +/// +/// Configuration options for the Notify event queue abstraction. +/// +public sealed class NotifyEventQueueOptions +{ + /// + /// Transport backing the queue. + /// + public NotifyQueueTransportKind Transport { get; set; } = NotifyQueueTransportKind.Redis; + + /// + /// Redis-specific configuration. + /// + public NotifyRedisEventQueueOptions Redis { get; set; } = new(); + + /// + /// NATS JetStream-specific configuration. + /// + public NotifyNatsEventQueueOptions Nats { get; set; } = new(); + + /// + /// Default lease duration to use when consumers do not specify one explicitly. + /// + public TimeSpan DefaultLeaseDuration { get; set; } = TimeSpan.FromMinutes(5); + + /// + /// Maximum number of deliveries before a message should be considered failed. + /// + public int MaxDeliveryAttempts { get; set; } = 5; + + /// + /// Initial retry backoff applied when a message is released for retry. 
+ /// + public TimeSpan RetryInitialBackoff { get; set; } = TimeSpan.FromSeconds(5); + + /// + /// Cap applied to exponential retry backoff. + /// + public TimeSpan RetryMaxBackoff { get; set; } = TimeSpan.FromMinutes(2); + + /// + /// Minimum idle window before a pending message becomes eligible for claim. + /// + public TimeSpan ClaimIdleThreshold { get; set; } = TimeSpan.FromMinutes(5); +} + +/// +/// Redis transport options for the Notify event queue. +/// +public sealed class NotifyRedisEventQueueOptions +{ + private IReadOnlyList _streams = new List + { + NotifyRedisEventStreamOptions.ForDefaultStream() + }; + + /// + /// Connection string for the Redis instance. + /// + public string? ConnectionString { get; set; } + + /// + /// Optional logical database to select when connecting. + /// + public int? Database { get; set; } + + /// + /// Time allowed for initial connection/consumer-group creation. + /// + public TimeSpan InitializationTimeout { get; set; } = TimeSpan.FromSeconds(30); + + /// + /// TTL applied to idempotency keys stored alongside events. + /// + public TimeSpan IdempotencyWindow { get; set; } = TimeSpan.FromHours(12); + + /// + /// Streams consumed by Notify. Ordering is preserved during leasing. + /// + public IReadOnlyList Streams + { + get => _streams; + set => _streams = value is null || value.Count == 0 + ? new List { NotifyRedisEventStreamOptions.ForDefaultStream() } + : value; + } +} + + /// + /// Per-Redis-stream options for the Notify event queue. + /// + public sealed class NotifyRedisEventStreamOptions + { + /// + /// Name of the Redis stream containing events. + /// + public string Stream { get; set; } = "notify:events"; + + /// + /// Consumer group used by Notify workers. + /// + public string ConsumerGroup { get; set; } = "notify-workers"; + + /// + /// Prefix used when storing idempotency keys in Redis. 
+ /// + public string IdempotencyKeyPrefix { get; set; } = "notify:events:idemp:"; + + /// + /// Approximate maximum length for the stream; when set Redis will trim entries. + /// + public int? ApproximateMaxLength { get; set; } + + public static NotifyRedisEventStreamOptions ForDefaultStream() + => new(); +} + +/// +/// NATS JetStream options for the Notify event queue. +/// + public sealed class NotifyNatsEventQueueOptions + { + /// + /// URL for the JetStream-enabled NATS cluster. + /// + public string? Url { get; set; } + + /// + /// Stream name carrying Notify events. + /// + public string Stream { get; set; } = "NOTIFY_EVENTS"; + + /// + /// Subject that producers publish Notify events to. + /// + public string Subject { get; set; } = "notify.events"; + + /// + /// Durable consumer identifier for Notify workers. + /// + public string DurableConsumer { get; set; } = "notify-workers"; + + /// + /// Dead-letter stream name used when deliveries exhaust retry budget. + /// + public string DeadLetterStream { get; set; } = "NOTIFY_EVENTS_DEAD"; + + /// + /// Subject used for dead-letter publications. + /// + public string DeadLetterSubject { get; set; } = "notify.events.dead"; + + /// + /// Maximum pending messages before backpressure is applied. + /// + public int MaxAckPending { get; set; } = 256; + + /// + /// Visibility timeout applied to leased events. + /// + public TimeSpan AckWait { get; set; } = TimeSpan.FromMinutes(5); + + /// + /// Delay applied when releasing a message for retry. + /// + public TimeSpan RetryDelay { get; set; } = TimeSpan.FromSeconds(10); + + /// + /// Idle heartbeat emitted by the server to detect consumer disconnects. 
+ /// + public TimeSpan IdleHeartbeat { get; set; } = TimeSpan.FromSeconds(30); + } diff --git a/src/StellaOps.Notify.Queue/NotifyQueueContracts.cs b/src/StellaOps.Notify.Queue/NotifyQueueContracts.cs new file mode 100644 index 00000000..a1db1c52 --- /dev/null +++ b/src/StellaOps.Notify.Queue/NotifyQueueContracts.cs @@ -0,0 +1,231 @@ +using System; +using System.Collections.Generic; +using System.Collections.ObjectModel; +using System.Threading; +using System.Threading.Tasks; +using StellaOps.Notify.Models; + +namespace StellaOps.Notify.Queue; + +/// +/// Message queued for Notify event processing. +/// +public sealed class NotifyQueueEventMessage +{ + private readonly NotifyEvent _event; + private readonly IReadOnlyDictionary _attributes; + + public NotifyQueueEventMessage( + NotifyEvent @event, + string stream, + string? idempotencyKey = null, + string? partitionKey = null, + string? traceId = null, + IReadOnlyDictionary? attributes = null) + { + _event = @event ?? throw new ArgumentNullException(nameof(@event)); + if (string.IsNullOrWhiteSpace(stream)) + { + throw new ArgumentException("Stream must be provided.", nameof(stream)); + } + + Stream = stream; + IdempotencyKey = string.IsNullOrWhiteSpace(idempotencyKey) + ? @event.EventId.ToString("N") + : idempotencyKey!; + PartitionKey = string.IsNullOrWhiteSpace(partitionKey) ? null : partitionKey.Trim(); + TraceId = string.IsNullOrWhiteSpace(traceId) ? null : traceId.Trim(); + _attributes = attributes is null + ? EmptyReadOnlyDictionary.Instance + : new ReadOnlyDictionary(new Dictionary(attributes, StringComparer.Ordinal)); + } + + public NotifyEvent Event => _event; + + public string Stream { get; } + + public string IdempotencyKey { get; } + + public string TenantId => _event.Tenant; + + public string? PartitionKey { get; } + + public string? TraceId { get; } + + public IReadOnlyDictionary Attributes => _attributes; +} + +/// +/// Message queued for channel delivery execution. 
+/// +public sealed class NotifyDeliveryQueueMessage +{ + public const string DefaultStream = "notify:deliveries"; + + private readonly IReadOnlyDictionary _attributes; + + public NotifyDeliveryQueueMessage( + NotifyDelivery delivery, + string channelId, + NotifyChannelType channelType, + string? stream = null, + string? traceId = null, + IReadOnlyDictionary? attributes = null) + { + Delivery = delivery ?? throw new ArgumentNullException(nameof(delivery)); + ChannelId = NotifyValidation.EnsureNotNullOrWhiteSpace(channelId, nameof(channelId)); + ChannelType = channelType; + Stream = string.IsNullOrWhiteSpace(stream) ? DefaultStream : stream!.Trim(); + TraceId = string.IsNullOrWhiteSpace(traceId) ? null : traceId.Trim(); + _attributes = attributes is null + ? EmptyReadOnlyDictionary.Instance + : new ReadOnlyDictionary(new Dictionary(attributes, StringComparer.Ordinal)); + } + + public NotifyDelivery Delivery { get; } + + public string ChannelId { get; } + + public NotifyChannelType ChannelType { get; } + + public string Stream { get; } + + public string? 
TraceId { get; } + + public string TenantId => Delivery.TenantId; + + public string IdempotencyKey => Delivery.DeliveryId; + + public string PartitionKey => ChannelId; + + public IReadOnlyDictionary Attributes => _attributes; +} + +public readonly record struct NotifyQueueEnqueueResult(string MessageId, bool Deduplicated); + +public sealed class NotifyQueueLeaseRequest +{ + public NotifyQueueLeaseRequest(string consumer, int batchSize, TimeSpan leaseDuration) + { + if (string.IsNullOrWhiteSpace(consumer)) + { + throw new ArgumentException("Consumer must be provided.", nameof(consumer)); + } + + if (batchSize <= 0) + { + throw new ArgumentOutOfRangeException(nameof(batchSize), batchSize, "Batch size must be positive."); + } + + if (leaseDuration <= TimeSpan.Zero) + { + throw new ArgumentOutOfRangeException(nameof(leaseDuration), leaseDuration, "Lease duration must be positive."); + } + + Consumer = consumer; + BatchSize = batchSize; + LeaseDuration = leaseDuration; + } + + public string Consumer { get; } + + public int BatchSize { get; } + + public TimeSpan LeaseDuration { get; } +} + +public sealed class NotifyQueueClaimOptions +{ + public NotifyQueueClaimOptions(string claimantConsumer, int batchSize, TimeSpan minIdleTime) + { + if (string.IsNullOrWhiteSpace(claimantConsumer)) + { + throw new ArgumentException("Consumer must be provided.", nameof(claimantConsumer)); + } + + if (batchSize <= 0) + { + throw new ArgumentOutOfRangeException(nameof(batchSize), batchSize, "Batch size must be positive."); + } + + if (minIdleTime < TimeSpan.Zero) + { + throw new ArgumentOutOfRangeException(nameof(minIdleTime), minIdleTime, "Minimum idle time cannot be negative."); + } + + ClaimantConsumer = claimantConsumer; + BatchSize = batchSize; + MinIdleTime = minIdleTime; + } + + public string ClaimantConsumer { get; } + + public int BatchSize { get; } + + public TimeSpan MinIdleTime { get; } +} + +public enum NotifyQueueReleaseDisposition +{ + Retry, + Abandon +} + +public 
interface INotifyQueue +{ + ValueTask PublishAsync(TMessage message, CancellationToken cancellationToken = default); + + ValueTask>> LeaseAsync(NotifyQueueLeaseRequest request, CancellationToken cancellationToken = default); + + ValueTask>> ClaimExpiredAsync(NotifyQueueClaimOptions options, CancellationToken cancellationToken = default); +} + +public interface INotifyQueueLease +{ + string MessageId { get; } + + int Attempt { get; } + + DateTimeOffset EnqueuedAt { get; } + + DateTimeOffset LeaseExpiresAt { get; } + + string Consumer { get; } + + string Stream { get; } + + string TenantId { get; } + + string? PartitionKey { get; } + + string IdempotencyKey { get; } + + string? TraceId { get; } + + IReadOnlyDictionary Attributes { get; } + + TMessage Message { get; } + + Task AcknowledgeAsync(CancellationToken cancellationToken = default); + + Task RenewAsync(TimeSpan leaseDuration, CancellationToken cancellationToken = default); + + Task ReleaseAsync(NotifyQueueReleaseDisposition disposition, CancellationToken cancellationToken = default); + + Task DeadLetterAsync(string reason, CancellationToken cancellationToken = default); +} + +public interface INotifyEventQueue : INotifyQueue +{ +} + +public interface INotifyDeliveryQueue : INotifyQueue +{ +} + +internal static class EmptyReadOnlyDictionary + where TKey : notnull +{ + public static readonly IReadOnlyDictionary Instance = + new ReadOnlyDictionary(new Dictionary(0, EqualityComparer.Default)); +} diff --git a/src/StellaOps.Notify.Queue/NotifyQueueFields.cs b/src/StellaOps.Notify.Queue/NotifyQueueFields.cs new file mode 100644 index 00000000..22d33f47 --- /dev/null +++ b/src/StellaOps.Notify.Queue/NotifyQueueFields.cs @@ -0,0 +1,18 @@ +namespace StellaOps.Notify.Queue; + +internal static class NotifyQueueFields +{ + public const string Payload = "payload"; + public const string EventId = "eventId"; + public const string DeliveryId = "deliveryId"; + public const string Tenant = "tenant"; + public const string Kind = 
"kind"; + public const string Attempt = "attempt"; + public const string EnqueuedAt = "enqueuedAt"; + public const string TraceId = "traceId"; + public const string PartitionKey = "partitionKey"; + public const string ChannelId = "channelId"; + public const string ChannelType = "channelType"; + public const string IdempotencyKey = "idempotency"; + public const string AttributePrefix = "attr:"; +} diff --git a/src/StellaOps.Notify.Queue/NotifyQueueHealthCheck.cs b/src/StellaOps.Notify.Queue/NotifyQueueHealthCheck.cs new file mode 100644 index 00000000..8e12398e --- /dev/null +++ b/src/StellaOps.Notify.Queue/NotifyQueueHealthCheck.cs @@ -0,0 +1,55 @@ +using System; +using System.Threading; +using System.Threading.Tasks; +using Microsoft.Extensions.Diagnostics.HealthChecks; +using Microsoft.Extensions.Logging; +using StellaOps.Notify.Queue.Nats; +using StellaOps.Notify.Queue.Redis; + +namespace StellaOps.Notify.Queue; + +public sealed class NotifyQueueHealthCheck : IHealthCheck +{ + private readonly INotifyEventQueue _queue; + private readonly ILogger _logger; + + public NotifyQueueHealthCheck( + INotifyEventQueue queue, + ILogger logger) + { + _queue = queue ?? throw new ArgumentNullException(nameof(queue)); + _logger = logger ?? 
throw new ArgumentNullException(nameof(logger)); + } + + public async Task CheckHealthAsync( + HealthCheckContext context, + CancellationToken cancellationToken = default) + { + cancellationToken.ThrowIfCancellationRequested(); + + try + { + switch (_queue) + { + case RedisNotifyEventQueue redisQueue: + await redisQueue.PingAsync(cancellationToken).ConfigureAwait(false); + return HealthCheckResult.Healthy("Redis Notify queue reachable."); + + case NatsNotifyEventQueue natsQueue: + await natsQueue.PingAsync(cancellationToken).ConfigureAwait(false); + return HealthCheckResult.Healthy("NATS Notify queue reachable."); + + default: + return HealthCheckResult.Healthy("Notify queue transport without dedicated ping returned healthy."); + } + } + catch (Exception ex) + { + _logger.LogError(ex, "Notify queue health check failed."); + return new HealthCheckResult( + context.Registration.FailureStatus, + "Notify queue transport unreachable.", + ex); + } + } +} diff --git a/src/StellaOps.Notify.Queue/NotifyQueueMetrics.cs b/src/StellaOps.Notify.Queue/NotifyQueueMetrics.cs new file mode 100644 index 00000000..744f465a --- /dev/null +++ b/src/StellaOps.Notify.Queue/NotifyQueueMetrics.cs @@ -0,0 +1,39 @@ +using System.Collections.Generic; +using System.Diagnostics.Metrics; + +namespace StellaOps.Notify.Queue; + +internal static class NotifyQueueMetrics +{ + private const string TransportTag = "transport"; + private const string StreamTag = "stream"; + + private static readonly Meter Meter = new("StellaOps.Notify.Queue"); + private static readonly Counter EnqueuedCounter = Meter.CreateCounter("notify_queue_enqueued_total"); + private static readonly Counter DeduplicatedCounter = Meter.CreateCounter("notify_queue_deduplicated_total"); + private static readonly Counter AckCounter = Meter.CreateCounter("notify_queue_ack_total"); + private static readonly Counter RetryCounter = Meter.CreateCounter("notify_queue_retry_total"); + private static readonly Counter DeadLetterCounter = 
Meter.CreateCounter("notify_queue_deadletter_total"); + + public static void RecordEnqueued(string transport, string stream) + => EnqueuedCounter.Add(1, BuildTags(transport, stream)); + + public static void RecordDeduplicated(string transport, string stream) + => DeduplicatedCounter.Add(1, BuildTags(transport, stream)); + + public static void RecordAck(string transport, string stream) + => AckCounter.Add(1, BuildTags(transport, stream)); + + public static void RecordRetry(string transport, string stream) + => RetryCounter.Add(1, BuildTags(transport, stream)); + + public static void RecordDeadLetter(string transport, string stream) + => DeadLetterCounter.Add(1, BuildTags(transport, stream)); + + private static KeyValuePair[] BuildTags(string transport, string stream) + => new[] + { + new KeyValuePair(TransportTag, transport), + new KeyValuePair(StreamTag, stream) + }; +} diff --git a/src/StellaOps.Notify.Queue/NotifyQueueServiceCollectionExtensions.cs b/src/StellaOps.Notify.Queue/NotifyQueueServiceCollectionExtensions.cs new file mode 100644 index 00000000..a257bd5d --- /dev/null +++ b/src/StellaOps.Notify.Queue/NotifyQueueServiceCollectionExtensions.cs @@ -0,0 +1,146 @@ +using System; +using Microsoft.Extensions.Configuration; +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.DependencyInjection.Extensions; +using Microsoft.Extensions.Diagnostics.HealthChecks; +using Microsoft.Extensions.Logging; +using StellaOps.Notify.Queue.Nats; +using StellaOps.Notify.Queue.Redis; + +namespace StellaOps.Notify.Queue; + +public static class NotifyQueueServiceCollectionExtensions +{ + public static IServiceCollection AddNotifyEventQueue( + this IServiceCollection services, + IConfiguration configuration, + string sectionName = "notify:queue") + { + ArgumentNullException.ThrowIfNull(services); + ArgumentNullException.ThrowIfNull(configuration); + + var eventOptions = new NotifyEventQueueOptions(); + 
configuration.GetSection(sectionName).Bind(eventOptions); + + services.TryAddSingleton(TimeProvider.System); + services.AddSingleton(eventOptions); + + services.AddSingleton(sp => + { + var loggerFactory = sp.GetRequiredService(); + var timeProvider = sp.GetService() ?? TimeProvider.System; + var opts = sp.GetRequiredService(); + + return opts.Transport switch + { + NotifyQueueTransportKind.Redis => new RedisNotifyEventQueue( + opts, + opts.Redis, + loggerFactory.CreateLogger(), + timeProvider), + NotifyQueueTransportKind.Nats => new NatsNotifyEventQueue( + opts, + opts.Nats, + loggerFactory.CreateLogger(), + timeProvider), + _ => throw new InvalidOperationException($"Unsupported Notify queue transport kind '{opts.Transport}'.") + }; + }); + + services.AddSingleton(); + + return services; + } + + public static IServiceCollection AddNotifyDeliveryQueue( + this IServiceCollection services, + IConfiguration configuration, + string sectionName = "notify:deliveryQueue") + { + ArgumentNullException.ThrowIfNull(services); + ArgumentNullException.ThrowIfNull(configuration); + + var deliveryOptions = new NotifyDeliveryQueueOptions(); + configuration.GetSection(sectionName).Bind(deliveryOptions); + + services.AddSingleton(deliveryOptions); + + services.AddSingleton(sp => + { + var loggerFactory = sp.GetRequiredService(); + var timeProvider = sp.GetService() ?? 
TimeProvider.System; + var opts = sp.GetRequiredService(); + var eventOpts = sp.GetService(); + + ApplyDeliveryFallbacks(opts, eventOpts); + + return opts.Transport switch + { + NotifyQueueTransportKind.Redis => new RedisNotifyDeliveryQueue( + opts, + opts.Redis, + loggerFactory.CreateLogger(), + timeProvider), + NotifyQueueTransportKind.Nats => new NatsNotifyDeliveryQueue( + opts, + opts.Nats, + loggerFactory.CreateLogger(), + timeProvider), + _ => throw new InvalidOperationException($"Unsupported Notify delivery queue transport kind '{opts.Transport}'.") + }; + }); + + services.AddSingleton(); + + return services; + } + + public static IHealthChecksBuilder AddNotifyQueueHealthCheck( + this IHealthChecksBuilder builder) + { + ArgumentNullException.ThrowIfNull(builder); + + builder.Services.TryAddSingleton(); + builder.AddCheck( + name: "notify-queue", + failureStatus: HealthStatus.Unhealthy, + tags: new[] { "notify", "queue" }); + + return builder; + } + + public static IHealthChecksBuilder AddNotifyDeliveryQueueHealthCheck( + this IHealthChecksBuilder builder) + { + ArgumentNullException.ThrowIfNull(builder); + + builder.Services.TryAddSingleton(); + builder.AddCheck( + name: "notify-delivery-queue", + failureStatus: HealthStatus.Unhealthy, + tags: new[] { "notify", "queue", "delivery" }); + + return builder; + } + + private static void ApplyDeliveryFallbacks( + NotifyDeliveryQueueOptions deliveryOptions, + NotifyEventQueueOptions? 
eventOptions) + { + if (eventOptions is null) + { + return; + } + + if (string.IsNullOrWhiteSpace(deliveryOptions.Redis.ConnectionString)) + { + deliveryOptions.Redis.ConnectionString = eventOptions.Redis.ConnectionString; + deliveryOptions.Redis.Database ??= eventOptions.Redis.Database; + } + + if (string.IsNullOrWhiteSpace(deliveryOptions.Nats.Url)) + { + deliveryOptions.Nats.Url = eventOptions.Nats.Url; + } + } +} diff --git a/src/StellaOps.Notify.Queue/NotifyQueueTransportKind.cs b/src/StellaOps.Notify.Queue/NotifyQueueTransportKind.cs new file mode 100644 index 00000000..cf0f13c6 --- /dev/null +++ b/src/StellaOps.Notify.Queue/NotifyQueueTransportKind.cs @@ -0,0 +1,10 @@ +namespace StellaOps.Notify.Queue; + +/// +/// Supported transports for the Notify event queue. +/// +public enum NotifyQueueTransportKind +{ + Redis, + Nats +} diff --git a/src/StellaOps.Notify.Queue/Properties/AssemblyInfo.cs b/src/StellaOps.Notify.Queue/Properties/AssemblyInfo.cs new file mode 100644 index 00000000..87064c85 --- /dev/null +++ b/src/StellaOps.Notify.Queue/Properties/AssemblyInfo.cs @@ -0,0 +1,3 @@ +using System.Runtime.CompilerServices; + +[assembly: InternalsVisibleTo("StellaOps.Notify.Queue.Tests")] diff --git a/src/StellaOps.Notify.Queue/Redis/RedisNotifyDeliveryLease.cs b/src/StellaOps.Notify.Queue/Redis/RedisNotifyDeliveryLease.cs new file mode 100644 index 00000000..fc61ad76 --- /dev/null +++ b/src/StellaOps.Notify.Queue/Redis/RedisNotifyDeliveryLease.cs @@ -0,0 +1,76 @@ +using System; +using System.Collections.Generic; +using System.Threading; +using System.Threading.Tasks; + +namespace StellaOps.Notify.Queue.Redis; + +internal sealed class RedisNotifyDeliveryLease : INotifyQueueLease +{ + private readonly RedisNotifyDeliveryQueue _queue; + private int _completed; + + internal RedisNotifyDeliveryLease( + RedisNotifyDeliveryQueue queue, + string messageId, + NotifyDeliveryQueueMessage message, + int attempt, + DateTimeOffset enqueuedAt, + DateTimeOffset leaseExpiresAt, 
+ string consumer, + string? idempotencyKey, + string partitionKey) + { + _queue = queue ?? throw new ArgumentNullException(nameof(queue)); + MessageId = messageId ?? throw new ArgumentNullException(nameof(messageId)); + Message = message ?? throw new ArgumentNullException(nameof(message)); + Attempt = attempt; + EnqueuedAt = enqueuedAt; + LeaseExpiresAt = leaseExpiresAt; + Consumer = consumer ?? throw new ArgumentNullException(nameof(consumer)); + IdempotencyKey = idempotencyKey ?? message.IdempotencyKey; + PartitionKey = partitionKey ?? message.ChannelId; + } + + public string MessageId { get; } + + public int Attempt { get; internal set; } + + public DateTimeOffset EnqueuedAt { get; } + + public DateTimeOffset LeaseExpiresAt { get; private set; } + + public string Consumer { get; } + + public string Stream => Message.Stream; + + public string TenantId => Message.TenantId; + + public string PartitionKey { get; } + + public string IdempotencyKey { get; } + + public string? TraceId => Message.TraceId; + + public IReadOnlyDictionary Attributes => Message.Attributes; + + public NotifyDeliveryQueueMessage Message { get; } + + public Task AcknowledgeAsync(CancellationToken cancellationToken = default) + => _queue.AcknowledgeAsync(this, cancellationToken); + + public Task RenewAsync(TimeSpan leaseDuration, CancellationToken cancellationToken = default) + => _queue.RenewLeaseAsync(this, leaseDuration, cancellationToken); + + public Task ReleaseAsync(NotifyQueueReleaseDisposition disposition, CancellationToken cancellationToken = default) + => _queue.ReleaseAsync(this, disposition, cancellationToken); + + public Task DeadLetterAsync(string reason, CancellationToken cancellationToken = default) + => _queue.DeadLetterAsync(this, reason, cancellationToken); + + internal bool TryBeginCompletion() + => Interlocked.CompareExchange(ref _completed, 1, 0) == 0; + + internal void RefreshLease(DateTimeOffset expiresAt) + => LeaseExpiresAt = expiresAt; +} diff --git 
a/src/StellaOps.Notify.Queue/Redis/RedisNotifyDeliveryQueue.cs b/src/StellaOps.Notify.Queue/Redis/RedisNotifyDeliveryQueue.cs new file mode 100644 index 00000000..0bb93674 --- /dev/null +++ b/src/StellaOps.Notify.Queue/Redis/RedisNotifyDeliveryQueue.cs @@ -0,0 +1,788 @@ +using System; +using System.Buffers; +using System.Collections.Concurrent; +using System.Collections.Generic; +using System.Collections.ObjectModel; +using System.Globalization; +using System.Linq; +using System.Threading; +using System.Threading.Tasks; +using Microsoft.Extensions.Logging; +using StackExchange.Redis; +using StellaOps.Notify.Models; + +namespace StellaOps.Notify.Queue.Redis; + +internal sealed class RedisNotifyDeliveryQueue : INotifyDeliveryQueue, IAsyncDisposable +{ + private const string TransportName = "redis"; + + private readonly NotifyDeliveryQueueOptions _options; + private readonly NotifyRedisDeliveryQueueOptions _redisOptions; + private readonly ILogger _logger; + private readonly TimeProvider _timeProvider; + private readonly Func> _connectionFactory; + private readonly SemaphoreSlim _connectionLock = new(1, 1); + private readonly SemaphoreSlim _groupLock = new(1, 1); + private readonly ConcurrentDictionary _streamInitialized = new(StringComparer.Ordinal); + + private IConnectionMultiplexer? _connection; + private bool _disposed; + + public RedisNotifyDeliveryQueue( + NotifyDeliveryQueueOptions options, + NotifyRedisDeliveryQueueOptions redisOptions, + ILogger logger, + TimeProvider timeProvider, + Func>? connectionFactory = null) + { + _options = options ?? throw new ArgumentNullException(nameof(options)); + _redisOptions = redisOptions ?? throw new ArgumentNullException(nameof(redisOptions)); + _logger = logger ?? throw new ArgumentNullException(nameof(logger)); + _timeProvider = timeProvider ?? TimeProvider.System; + _connectionFactory = connectionFactory ?? 
(async config => + { + var connection = await ConnectionMultiplexer.ConnectAsync(config).ConfigureAwait(false); + return (IConnectionMultiplexer)connection; + }); + + if (string.IsNullOrWhiteSpace(_redisOptions.ConnectionString)) + { + throw new InvalidOperationException("Redis connection string must be configured for the Notify delivery queue."); + } + } + + public async ValueTask PublishAsync( + NotifyDeliveryQueueMessage message, + CancellationToken cancellationToken = default) + { + ArgumentNullException.ThrowIfNull(message); + cancellationToken.ThrowIfCancellationRequested(); + + var db = await GetDatabaseAsync(cancellationToken).ConfigureAwait(false); + await EnsureConsumerGroupAsync(db, cancellationToken).ConfigureAwait(false); + + var now = _timeProvider.GetUtcNow(); + var attempt = 1; + var entries = BuildEntries(message, now, attempt); + + var messageId = await AddToStreamAsync( + db, + _redisOptions.StreamName, + entries) + .ConfigureAwait(false); + + var idempotencyKey = BuildIdempotencyKey(message.IdempotencyKey); + var stored = await db.StringSetAsync( + idempotencyKey, + messageId, + when: When.NotExists, + expiry: _options.ClaimIdleThreshold) + .ConfigureAwait(false); + + if (!stored) + { + await db.StreamDeleteAsync( + _redisOptions.StreamName, + new RedisValue[] { messageId }) + .ConfigureAwait(false); + + var existing = await db.StringGetAsync(idempotencyKey).ConfigureAwait(false); + var duplicateId = existing.IsNullOrEmpty ? 
messageId : existing; + + NotifyQueueMetrics.RecordDeduplicated(TransportName, _redisOptions.StreamName); + _logger.LogDebug( + "Duplicate Notify delivery enqueue detected for delivery {DeliveryId}.", + message.Delivery.DeliveryId); + + return new NotifyQueueEnqueueResult(duplicateId.ToString()!, true); + } + + NotifyQueueMetrics.RecordEnqueued(TransportName, _redisOptions.StreamName); + _logger.LogDebug( + "Enqueued Notify delivery {DeliveryId} (channel {ChannelId}) into stream {Stream}.", + message.Delivery.DeliveryId, + message.ChannelId, + _redisOptions.StreamName); + + return new NotifyQueueEnqueueResult(messageId.ToString()!, false); + } + + public async ValueTask>> LeaseAsync( + NotifyQueueLeaseRequest request, + CancellationToken cancellationToken = default) + { + ArgumentNullException.ThrowIfNull(request); + cancellationToken.ThrowIfCancellationRequested(); + + var db = await GetDatabaseAsync(cancellationToken).ConfigureAwait(false); + await EnsureConsumerGroupAsync(db, cancellationToken).ConfigureAwait(false); + + var entries = await db.StreamReadGroupAsync( + _redisOptions.StreamName, + _redisOptions.ConsumerGroup, + request.Consumer, + StreamPosition.NewMessages, + request.BatchSize) + .ConfigureAwait(false); + + if (entries is null || entries.Length == 0) + { + return Array.Empty>(); + } + + var now = _timeProvider.GetUtcNow(); + var leases = new List>(entries.Length); + + foreach (var entry in entries) + { + var lease = TryMapLease(entry, request.Consumer, now, request.LeaseDuration, attemptOverride: null); + if (lease is null) + { + await AckPoisonAsync(db, entry.Id).ConfigureAwait(false); + continue; + } + + leases.Add(lease); + } + + return leases; + } + + public async ValueTask>> ClaimExpiredAsync( + NotifyQueueClaimOptions options, + CancellationToken cancellationToken = default) + { + ArgumentNullException.ThrowIfNull(options); + cancellationToken.ThrowIfCancellationRequested(); + + var db = await 
GetDatabaseAsync(cancellationToken).ConfigureAwait(false); + await EnsureConsumerGroupAsync(db, cancellationToken).ConfigureAwait(false); + + var pending = await db.StreamPendingMessagesAsync( + _redisOptions.StreamName, + _redisOptions.ConsumerGroup, + options.BatchSize, + RedisValue.Null, + (long)options.MinIdleTime.TotalMilliseconds) + .ConfigureAwait(false); + + if (pending is null || pending.Length == 0) + { + return Array.Empty>(); + } + + var eligible = pending + .Where(p => p.IdleTimeInMilliseconds >= options.MinIdleTime.TotalMilliseconds) + .ToArray(); + + if (eligible.Length == 0) + { + return Array.Empty>(); + } + + var messageIds = eligible + .Select(static p => (RedisValue)p.MessageId) + .ToArray(); + + var entries = await db.StreamClaimAsync( + _redisOptions.StreamName, + _redisOptions.ConsumerGroup, + options.ClaimantConsumer, + 0, + messageIds) + .ConfigureAwait(false); + + if (entries is null || entries.Length == 0) + { + return Array.Empty>(); + } + + var now = _timeProvider.GetUtcNow(); + var attemptLookup = eligible + .Where(static info => !info.MessageId.IsNullOrEmpty) + .ToDictionary( + info => info.MessageId!.ToString(), + info => (int)Math.Max(1, info.DeliveryCount), + StringComparer.Ordinal); + + var leases = new List>(entries.Length); + + foreach (var entry in entries) + { + attemptLookup.TryGetValue(entry.Id.ToString(), out var attempt); + var lease = TryMapLease(entry, options.ClaimantConsumer, now, _options.DefaultLeaseDuration, attempt == 0 ? 
null : attempt); + if (lease is null) + { + await AckPoisonAsync(db, entry.Id).ConfigureAwait(false); + continue; + } + + leases.Add(lease); + } + + return leases; + } + + public async ValueTask DisposeAsync() + { + if (_disposed) + { + return; + } + + _disposed = true; + if (_connection is not null) + { + await _connection.CloseAsync().ConfigureAwait(false); + _connection.Dispose(); + } + + _connectionLock.Dispose(); + _groupLock.Dispose(); + GC.SuppressFinalize(this); + } + + internal async Task AcknowledgeAsync( + RedisNotifyDeliveryLease lease, + CancellationToken cancellationToken) + { + if (!lease.TryBeginCompletion()) + { + return; + } + + var db = await GetDatabaseAsync(cancellationToken).ConfigureAwait(false); + + await db.StreamAcknowledgeAsync( + _redisOptions.StreamName, + _redisOptions.ConsumerGroup, + new RedisValue[] { lease.MessageId }) + .ConfigureAwait(false); + + await db.StreamDeleteAsync( + _redisOptions.StreamName, + new RedisValue[] { lease.MessageId }) + .ConfigureAwait(false); + + NotifyQueueMetrics.RecordAck(TransportName, _redisOptions.StreamName); + _logger.LogDebug( + "Acknowledged Notify delivery {DeliveryId} (message {MessageId}).", + lease.Message.Delivery.DeliveryId, + lease.MessageId); + } + + internal async Task RenewLeaseAsync( + RedisNotifyDeliveryLease lease, + TimeSpan leaseDuration, + CancellationToken cancellationToken) + { + var db = await GetDatabaseAsync(cancellationToken).ConfigureAwait(false); + + await db.StreamClaimAsync( + _redisOptions.StreamName, + _redisOptions.ConsumerGroup, + lease.Consumer, + 0, + new RedisValue[] { lease.MessageId }) + .ConfigureAwait(false); + + var expires = _timeProvider.GetUtcNow().Add(leaseDuration); + lease.RefreshLease(expires); + + _logger.LogDebug( + "Renewed Notify delivery lease {DeliveryId} until {Expires:u}.", + lease.Message.Delivery.DeliveryId, + expires); + } + + internal async Task ReleaseAsync( + RedisNotifyDeliveryLease lease, + NotifyQueueReleaseDisposition disposition, + 
CancellationToken cancellationToken) + { + if (disposition == NotifyQueueReleaseDisposition.Retry + && lease.Attempt >= _options.MaxDeliveryAttempts) + { + _logger.LogWarning( + "Notify delivery {DeliveryId} reached max delivery attempts ({Attempts}); moving to dead-letter stream.", + lease.Message.Delivery.DeliveryId, + lease.Attempt); + + await DeadLetterAsync( + lease, + $"max-delivery-attempts:{lease.Attempt}", + cancellationToken).ConfigureAwait(false); + + return; + } + + if (!lease.TryBeginCompletion()) + { + return; + } + + var db = await GetDatabaseAsync(cancellationToken).ConfigureAwait(false); + await db.StreamAcknowledgeAsync( + _redisOptions.StreamName, + _redisOptions.ConsumerGroup, + new RedisValue[] { lease.MessageId }) + .ConfigureAwait(false); + await db.StreamDeleteAsync( + _redisOptions.StreamName, + new RedisValue[] { lease.MessageId }) + .ConfigureAwait(false); + + if (disposition == NotifyQueueReleaseDisposition.Retry) + { + NotifyQueueMetrics.RecordRetry(TransportName, _redisOptions.StreamName); + + var delay = CalculateBackoff(lease.Attempt); + if (delay > TimeSpan.Zero) + { + try + { + await Task.Delay(delay, cancellationToken).ConfigureAwait(false); + } + catch (TaskCanceledException) + { + return; + } + } + + var now = _timeProvider.GetUtcNow(); + var entries = BuildEntries(lease.Message, now, lease.Attempt + 1); + + await AddToStreamAsync( + db, + _redisOptions.StreamName, + entries) + .ConfigureAwait(false); + + NotifyQueueMetrics.RecordEnqueued(TransportName, _redisOptions.StreamName); + _logger.LogInformation( + "Retrying Notify delivery {DeliveryId} (attempt {Attempt}).", + lease.Message.Delivery.DeliveryId, + lease.Attempt + 1); + } + else + { + NotifyQueueMetrics.RecordAck(TransportName, _redisOptions.StreamName); + _logger.LogInformation( + "Abandoned Notify delivery {DeliveryId} after {Attempt} attempt(s).", + lease.Message.Delivery.DeliveryId, + lease.Attempt); + } + } + + internal async Task DeadLetterAsync( + 
RedisNotifyDeliveryLease lease, + string reason, + CancellationToken cancellationToken) + { + if (!lease.TryBeginCompletion()) + { + return; + } + + var db = await GetDatabaseAsync(cancellationToken).ConfigureAwait(false); + + await db.StreamAcknowledgeAsync( + _redisOptions.StreamName, + _redisOptions.ConsumerGroup, + new RedisValue[] { lease.MessageId }) + .ConfigureAwait(false); + + await db.StreamDeleteAsync( + _redisOptions.StreamName, + new RedisValue[] { lease.MessageId }) + .ConfigureAwait(false); + + await EnsureDeadLetterStreamAsync(db, cancellationToken).ConfigureAwait(false); + + var entries = BuildDeadLetterEntries(lease, reason); + await AddToStreamAsync( + db, + _redisOptions.DeadLetterStreamName, + entries) + .ConfigureAwait(false); + + NotifyQueueMetrics.RecordDeadLetter(TransportName, _redisOptions.DeadLetterStreamName); + _logger.LogError( + "Dead-lettered Notify delivery {DeliveryId} (attempt {Attempt}): {Reason}", + lease.Message.Delivery.DeliveryId, + lease.Attempt, + reason); + } + + internal async ValueTask PingAsync(CancellationToken cancellationToken) + { + var db = await GetDatabaseAsync(cancellationToken).ConfigureAwait(false); + _ = await db.PingAsync().ConfigureAwait(false); + } + + private async Task GetDatabaseAsync(CancellationToken cancellationToken) + { + if (_connection is { IsConnected: true }) + { + return _connection.GetDatabase(_redisOptions.Database ?? -1); + } + + await _connectionLock.WaitAsync(cancellationToken).ConfigureAwait(false); + try + { + if (_connection is { IsConnected: true }) + { + return _connection.GetDatabase(_redisOptions.Database ?? 
-1); + } + + var configuration = ConfigurationOptions.Parse(_redisOptions.ConnectionString!); + configuration.AbortOnConnectFail = false; + if (_redisOptions.Database.HasValue) + { + configuration.DefaultDatabase = _redisOptions.Database.Value; + } + + using var timeoutCts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken); + timeoutCts.CancelAfter(_redisOptions.InitializationTimeout); + + _connection = await _connectionFactory(configuration).WaitAsync(timeoutCts.Token).ConfigureAwait(false); + return _connection.GetDatabase(_redisOptions.Database ?? -1); + } + finally + { + _connectionLock.Release(); + } + } + + private async Task EnsureConsumerGroupAsync( + IDatabase database, + CancellationToken cancellationToken) + { + if (_streamInitialized.ContainsKey(_redisOptions.StreamName)) + { + return; + } + + await _groupLock.WaitAsync(cancellationToken).ConfigureAwait(false); + try + { + if (_streamInitialized.ContainsKey(_redisOptions.StreamName)) + { + return; + } + + try + { + await database.StreamCreateConsumerGroupAsync( + _redisOptions.StreamName, + _redisOptions.ConsumerGroup, + StreamPosition.Beginning, + createStream: true) + .ConfigureAwait(false); + } + catch (RedisServerException ex) when (ex.Message.Contains("BUSYGROUP", StringComparison.OrdinalIgnoreCase)) + { + // group already exists + } + + _streamInitialized[_redisOptions.StreamName] = true; + } + finally + { + _groupLock.Release(); + } + } + + private async Task EnsureDeadLetterStreamAsync( + IDatabase database, + CancellationToken cancellationToken) + { + if (_streamInitialized.ContainsKey(_redisOptions.DeadLetterStreamName)) + { + return; + } + + await _groupLock.WaitAsync(cancellationToken).ConfigureAwait(false); + try + { + if (_streamInitialized.ContainsKey(_redisOptions.DeadLetterStreamName)) + { + return; + } + + try + { + await database.StreamCreateConsumerGroupAsync( + _redisOptions.DeadLetterStreamName, + _redisOptions.ConsumerGroup, + StreamPosition.Beginning, + 
createStream: true) + .ConfigureAwait(false); + } + catch (RedisServerException ex) when (ex.Message.Contains("BUSYGROUP", StringComparison.OrdinalIgnoreCase)) + { + // ignore + } + + _streamInitialized[_redisOptions.DeadLetterStreamName] = true; + } + finally + { + _groupLock.Release(); + } + } + + private NameValueEntry[] BuildEntries( + NotifyDeliveryQueueMessage message, + DateTimeOffset enqueuedAt, + int attempt) + { + var json = NotifyCanonicalJsonSerializer.Serialize(message.Delivery); + var attributeCount = message.Attributes.Count; + + var entries = ArrayPool.Shared.Rent(8 + attributeCount); + var index = 0; + + entries[index++] = new NameValueEntry(NotifyQueueFields.Payload, json); + entries[index++] = new NameValueEntry(NotifyQueueFields.DeliveryId, message.Delivery.DeliveryId); + entries[index++] = new NameValueEntry(NotifyQueueFields.ChannelId, message.ChannelId); + entries[index++] = new NameValueEntry(NotifyQueueFields.ChannelType, message.ChannelType.ToString()); + entries[index++] = new NameValueEntry(NotifyQueueFields.Tenant, message.Delivery.TenantId); + entries[index++] = new NameValueEntry(NotifyQueueFields.Attempt, attempt); + entries[index++] = new NameValueEntry(NotifyQueueFields.EnqueuedAt, enqueuedAt.ToUnixTimeMilliseconds()); + entries[index++] = new NameValueEntry(NotifyQueueFields.IdempotencyKey, message.IdempotencyKey); + entries[index++] = new NameValueEntry(NotifyQueueFields.TraceId, message.TraceId ?? 
string.Empty); + entries[index++] = new NameValueEntry(NotifyQueueFields.PartitionKey, message.PartitionKey); + + if (attributeCount > 0) + { + foreach (var kvp in message.Attributes) + { + entries[index++] = new NameValueEntry( + NotifyQueueFields.AttributePrefix + kvp.Key, + kvp.Value); + } + } + + return entries.AsSpan(0, index).ToArray(); + } + + private NameValueEntry[] BuildDeadLetterEntries(RedisNotifyDeliveryLease lease, string reason) + { + var json = NotifyCanonicalJsonSerializer.Serialize(lease.Message.Delivery); + var attributes = lease.Message.Attributes; + var attributeCount = attributes.Count; + + var entries = ArrayPool.Shared.Rent(9 + attributeCount); + var index = 0; + + entries[index++] = new NameValueEntry(NotifyQueueFields.Payload, json); + entries[index++] = new NameValueEntry(NotifyQueueFields.DeliveryId, lease.Message.Delivery.DeliveryId); + entries[index++] = new NameValueEntry(NotifyQueueFields.ChannelId, lease.Message.ChannelId); + entries[index++] = new NameValueEntry(NotifyQueueFields.ChannelType, lease.Message.ChannelType.ToString()); + entries[index++] = new NameValueEntry(NotifyQueueFields.Tenant, lease.Message.Delivery.TenantId); + entries[index++] = new NameValueEntry(NotifyQueueFields.Attempt, lease.Attempt); + entries[index++] = new NameValueEntry(NotifyQueueFields.IdempotencyKey, lease.Message.IdempotencyKey); + entries[index++] = new NameValueEntry("deadletter-reason", reason); + entries[index++] = new NameValueEntry(NotifyQueueFields.TraceId, lease.Message.TraceId ?? string.Empty); + + foreach (var kvp in attributes) + { + entries[index++] = new NameValueEntry( + NotifyQueueFields.AttributePrefix + kvp.Key, + kvp.Value); + } + + return entries.AsSpan(0, index).ToArray(); + } + + private RedisNotifyDeliveryLease? TryMapLease( + StreamEntry entry, + string consumer, + DateTimeOffset now, + TimeSpan leaseDuration, + int? 
attemptOverride) + { + if (entry.Values is null || entry.Values.Length == 0) + { + return null; + } + + string? payload = null; + string? deliveryId = null; + string? channelId = null; + string? channelTypeRaw = null; + string? traceId = null; + string? idempotency = null; + string? partitionKey = null; + long? enqueuedAtUnix = null; + var attempt = attemptOverride ?? 1; + var attributes = new Dictionary(StringComparer.Ordinal); + + foreach (var value in entry.Values) + { + var name = value.Name.ToString(); + var data = value.Value; + if (name.Equals(NotifyQueueFields.Payload, StringComparison.Ordinal)) + { + payload = data.ToString(); + } + else if (name.Equals(NotifyQueueFields.DeliveryId, StringComparison.Ordinal)) + { + deliveryId = data.ToString(); + } + else if (name.Equals(NotifyQueueFields.ChannelId, StringComparison.Ordinal)) + { + channelId = data.ToString(); + } + else if (name.Equals(NotifyQueueFields.ChannelType, StringComparison.Ordinal)) + { + channelTypeRaw = data.ToString(); + } + else if (name.Equals(NotifyQueueFields.Attempt, StringComparison.Ordinal)) + { + if (int.TryParse(data.ToString(), NumberStyles.Integer, CultureInfo.InvariantCulture, out var parsed)) + { + attempt = Math.Max(parsed, attempt); + } + } + else if (name.Equals(NotifyQueueFields.EnqueuedAt, StringComparison.Ordinal)) + { + if (long.TryParse(data.ToString(), NumberStyles.Integer, CultureInfo.InvariantCulture, out var unix)) + { + enqueuedAtUnix = unix; + } + } + else if (name.Equals(NotifyQueueFields.IdempotencyKey, StringComparison.Ordinal)) + { + idempotency = data.ToString(); + } + else if (name.Equals(NotifyQueueFields.TraceId, StringComparison.Ordinal)) + { + var text = data.ToString(); + traceId = string.IsNullOrWhiteSpace(text) ? 
null : text; + } + else if (name.Equals(NotifyQueueFields.PartitionKey, StringComparison.Ordinal)) + { + partitionKey = data.ToString(); + } + else if (name.StartsWith(NotifyQueueFields.AttributePrefix, StringComparison.Ordinal)) + { + attributes[name[NotifyQueueFields.AttributePrefix.Length..]] = data.ToString(); + } + } + + if (payload is null || deliveryId is null || channelId is null || channelTypeRaw is null) + { + return null; + } + + NotifyDelivery delivery; + try + { + delivery = NotifyCanonicalJsonSerializer.Deserialize(payload); + } + catch (Exception ex) + { + _logger.LogWarning( + ex, + "Failed to deserialize Notify delivery payload for entry {EntryId}.", + entry.Id.ToString()); + return null; + } + + if (!Enum.TryParse(channelTypeRaw, ignoreCase: true, out var channelType)) + { + _logger.LogWarning( + "Unknown channel type '{ChannelType}' for delivery {DeliveryId}; acknowledging as poison.", + channelTypeRaw, + deliveryId); + return null; + } + + var attributeView = attributes.Count == 0 + ? EmptyReadOnlyDictionary.Instance + : new ReadOnlyDictionary(attributes); + + var enqueuedAt = enqueuedAtUnix is null + ? now + : DateTimeOffset.FromUnixTimeMilliseconds(enqueuedAtUnix.Value); + + var message = new NotifyDeliveryQueueMessage( + delivery, + channelId, + channelType, + _redisOptions.StreamName, + traceId, + attributeView); + + var leaseExpires = now.Add(leaseDuration); + + return new RedisNotifyDeliveryLease( + this, + entry.Id.ToString(), + message, + attempt, + enqueuedAt, + leaseExpires, + consumer, + idempotency, + partitionKey ?? 
channelId); + } + + private async Task AckPoisonAsync(IDatabase database, RedisValue messageId) + { + await database.StreamAcknowledgeAsync( + _redisOptions.StreamName, + _redisOptions.ConsumerGroup, + new RedisValue[] { messageId }) + .ConfigureAwait(false); + + await database.StreamDeleteAsync( + _redisOptions.StreamName, + new RedisValue[] { messageId }) + .ConfigureAwait(false); + } + + private static async Task AddToStreamAsync( + IDatabase database, + string stream, + IReadOnlyList entries) + { + return await database.StreamAddAsync( + stream, + entries.ToArray()) + .ConfigureAwait(false); + } + + private string BuildIdempotencyKey(string token) + => string.Concat(_redisOptions.IdempotencyKeyPrefix, token); + + private TimeSpan CalculateBackoff(int attempt) + { + var initial = _options.RetryInitialBackoff > TimeSpan.Zero + ? _options.RetryInitialBackoff + : TimeSpan.FromSeconds(1); + + if (initial <= TimeSpan.Zero) + { + return TimeSpan.Zero; + } + + if (attempt <= 1) + { + return initial; + } + + var max = _options.RetryMaxBackoff > TimeSpan.Zero + ? 
_options.RetryMaxBackoff + : initial; + + var exponent = attempt - 1; + var scaledTicks = initial.Ticks * Math.Pow(2, exponent - 1); + var cappedTicks = Math.Min(max.Ticks, scaledTicks); + var resultTicks = Math.Max(initial.Ticks, (long)cappedTicks); + return TimeSpan.FromTicks(resultTicks); + } +} diff --git a/src/StellaOps.Notify.Queue/Redis/RedisNotifyEventLease.cs b/src/StellaOps.Notify.Queue/Redis/RedisNotifyEventLease.cs new file mode 100644 index 00000000..4d29bd60 --- /dev/null +++ b/src/StellaOps.Notify.Queue/Redis/RedisNotifyEventLease.cs @@ -0,0 +1,76 @@ +using System; +using System.Collections.Generic; +using System.Threading; +using System.Threading.Tasks; + +namespace StellaOps.Notify.Queue.Redis; + +internal sealed class RedisNotifyEventLease : INotifyQueueLease +{ + private readonly RedisNotifyEventQueue _queue; + private int _completed; + + internal RedisNotifyEventLease( + RedisNotifyEventQueue queue, + NotifyRedisEventStreamOptions streamOptions, + string messageId, + NotifyQueueEventMessage message, + int attempt, + string consumer, + DateTimeOffset enqueuedAt, + DateTimeOffset leaseExpiresAt) + { + _queue = queue ?? throw new ArgumentNullException(nameof(queue)); + StreamOptions = streamOptions ?? throw new ArgumentNullException(nameof(streamOptions)); + MessageId = messageId ?? throw new ArgumentNullException(nameof(messageId)); + Message = message ?? throw new ArgumentNullException(nameof(message)); + Attempt = attempt; + Consumer = consumer ?? 
throw new ArgumentNullException(nameof(consumer)); + EnqueuedAt = enqueuedAt; + LeaseExpiresAt = leaseExpiresAt; + } + + internal NotifyRedisEventStreamOptions StreamOptions { get; } + + public string MessageId { get; } + + public int Attempt { get; } + + public DateTimeOffset EnqueuedAt { get; } + + public DateTimeOffset LeaseExpiresAt { get; private set; } + + public string Consumer { get; } + + public string Stream => StreamOptions.Stream; + + public string TenantId => Message.TenantId; + + public string? PartitionKey => Message.PartitionKey; + + public string IdempotencyKey => Message.IdempotencyKey; + + public string? TraceId => Message.TraceId; + + public IReadOnlyDictionary Attributes => Message.Attributes; + + public NotifyQueueEventMessage Message { get; } + + public Task AcknowledgeAsync(CancellationToken cancellationToken = default) + => _queue.AcknowledgeAsync(this, cancellationToken); + + public Task RenewAsync(TimeSpan leaseDuration, CancellationToken cancellationToken = default) + => _queue.RenewLeaseAsync(this, leaseDuration, cancellationToken); + + public Task ReleaseAsync(NotifyQueueReleaseDisposition disposition, CancellationToken cancellationToken = default) + => _queue.ReleaseAsync(this, disposition, cancellationToken); + + public Task DeadLetterAsync(string reason, CancellationToken cancellationToken = default) + => _queue.DeadLetterAsync(this, reason, cancellationToken); + + internal bool TryBeginCompletion() + => Interlocked.CompareExchange(ref _completed, 1, 0) == 0; + + internal void RefreshLease(DateTimeOffset expiresAt) + => LeaseExpiresAt = expiresAt; +} diff --git a/src/StellaOps.Notify.Queue/Redis/RedisNotifyEventQueue.cs b/src/StellaOps.Notify.Queue/Redis/RedisNotifyEventQueue.cs new file mode 100644 index 00000000..e217f899 --- /dev/null +++ b/src/StellaOps.Notify.Queue/Redis/RedisNotifyEventQueue.cs @@ -0,0 +1,655 @@ +using System; +using System.Collections.Concurrent; +using System.Collections.Generic; +using 
System.Collections.ObjectModel; +using System.Globalization; +using System.Linq; +using System.Threading; +using System.Threading.Tasks; +using Microsoft.Extensions.Logging; +using StackExchange.Redis; +using StellaOps.Notify.Models; + +namespace StellaOps.Notify.Queue.Redis; + +internal sealed class RedisNotifyEventQueue : INotifyEventQueue, IAsyncDisposable +{ + private const string TransportName = "redis"; + + private readonly NotifyEventQueueOptions _options; + private readonly NotifyRedisEventQueueOptions _redisOptions; + private readonly ILogger _logger; + private readonly TimeProvider _timeProvider; + private readonly Func> _connectionFactory; + private readonly SemaphoreSlim _connectionLock = new(1, 1); + private readonly SemaphoreSlim _groupInitLock = new(1, 1); + private readonly IReadOnlyDictionary _streamsByName; + private readonly ConcurrentDictionary _initializedStreams = new(StringComparer.Ordinal); + + private IConnectionMultiplexer? _connection; + private bool _disposed; + + public RedisNotifyEventQueue( + NotifyEventQueueOptions options, + NotifyRedisEventQueueOptions redisOptions, + ILogger logger, + TimeProvider timeProvider, + Func>? connectionFactory = null) + { + _options = options ?? throw new ArgumentNullException(nameof(options)); + _redisOptions = redisOptions ?? throw new ArgumentNullException(nameof(redisOptions)); + _logger = logger ?? throw new ArgumentNullException(nameof(logger)); + _timeProvider = timeProvider ?? TimeProvider.System; + _connectionFactory = connectionFactory ?? 
(async config => + { + var connection = await ConnectionMultiplexer.ConnectAsync(config).ConfigureAwait(false); + return (IConnectionMultiplexer)connection; + }); + + if (string.IsNullOrWhiteSpace(_redisOptions.ConnectionString)) + { + throw new InvalidOperationException("Redis connection string must be configured for Notify event queue."); + } + + _streamsByName = _redisOptions.Streams.ToDictionary( + stream => stream.Stream, + stream => stream, + StringComparer.Ordinal); + } + + public async ValueTask PublishAsync( + NotifyQueueEventMessage message, + CancellationToken cancellationToken = default) + { + ArgumentNullException.ThrowIfNull(message); + cancellationToken.ThrowIfCancellationRequested(); + + var streamOptions = GetStreamOptions(message.Stream); + var db = await GetDatabaseAsync(cancellationToken).ConfigureAwait(false); + await EnsureStreamInitializedAsync(db, streamOptions, cancellationToken).ConfigureAwait(false); + + var now = _timeProvider.GetUtcNow(); + var entries = BuildEntries(message, now, attempt: 1); + + var messageId = await AddToStreamAsync( + db, + streamOptions, + entries) + .ConfigureAwait(false); + + var idempotencyToken = string.IsNullOrWhiteSpace(message.IdempotencyKey) + ? message.Event.EventId.ToString("N") + : message.IdempotencyKey; + + var idempotencyKey = streamOptions.IdempotencyKeyPrefix + idempotencyToken; + var stored = await db.StringSetAsync( + idempotencyKey, + messageId, + when: When.NotExists, + expiry: _redisOptions.IdempotencyWindow) + .ConfigureAwait(false); + + if (!stored) + { + await db.StreamDeleteAsync( + streamOptions.Stream, + new RedisValue[] { messageId }) + .ConfigureAwait(false); + + var existing = await db.StringGetAsync(idempotencyKey).ConfigureAwait(false); + var duplicateId = existing.IsNullOrEmpty ? 
messageId : existing; + + _logger.LogDebug( + "Duplicate Notify event enqueue detected for idempotency token {Token}; returning existing stream id {StreamId}.", + idempotencyToken, + duplicateId.ToString()); + + NotifyQueueMetrics.RecordDeduplicated(TransportName, streamOptions.Stream); + return new NotifyQueueEnqueueResult(duplicateId.ToString()!, true); + } + + NotifyQueueMetrics.RecordEnqueued(TransportName, streamOptions.Stream); + + _logger.LogDebug( + "Enqueued Notify event {EventId} for tenant {Tenant} on stream {Stream} (id {StreamId}).", + message.Event.EventId, + message.TenantId, + streamOptions.Stream, + messageId.ToString()); + + return new NotifyQueueEnqueueResult(messageId.ToString()!, false); + } + + public async ValueTask>> LeaseAsync( + NotifyQueueLeaseRequest request, + CancellationToken cancellationToken = default) + { + ArgumentNullException.ThrowIfNull(request); + cancellationToken.ThrowIfCancellationRequested(); + + var db = await GetDatabaseAsync(cancellationToken).ConfigureAwait(false); + var now = _timeProvider.GetUtcNow(); + var leases = new List>(request.BatchSize); + + foreach (var streamOptions in _streamsByName.Values) + { + await EnsureStreamInitializedAsync(db, streamOptions, cancellationToken).ConfigureAwait(false); + + var remaining = request.BatchSize - leases.Count; + if (remaining <= 0) + { + break; + } + + var entries = await db.StreamReadGroupAsync( + streamOptions.Stream, + streamOptions.ConsumerGroup, + request.Consumer, + StreamPosition.NewMessages, + remaining) + .ConfigureAwait(false); + + if (entries is null || entries.Length == 0) + { + continue; + } + + foreach (var entry in entries) + { + var lease = TryMapLease( + streamOptions, + entry, + request.Consumer, + now, + request.LeaseDuration, + attemptOverride: null); + + if (lease is null) + { + await AckPoisonAsync(db, streamOptions, entry.Id).ConfigureAwait(false); + continue; + } + + leases.Add(lease); + + if (leases.Count >= request.BatchSize) + { + break; + } + } 
+ } + + return leases; + } + + public async ValueTask>> ClaimExpiredAsync( + NotifyQueueClaimOptions options, + CancellationToken cancellationToken = default) + { + ArgumentNullException.ThrowIfNull(options); + cancellationToken.ThrowIfCancellationRequested(); + + var db = await GetDatabaseAsync(cancellationToken).ConfigureAwait(false); + var now = _timeProvider.GetUtcNow(); + var leases = new List>(options.BatchSize); + + foreach (var streamOptions in _streamsByName.Values) + { + await EnsureStreamInitializedAsync(db, streamOptions, cancellationToken).ConfigureAwait(false); + + var pending = await db.StreamPendingMessagesAsync( + streamOptions.Stream, + streamOptions.ConsumerGroup, + options.BatchSize, + RedisValue.Null, + (long)options.MinIdleTime.TotalMilliseconds) + .ConfigureAwait(false); + + if (pending is null || pending.Length == 0) + { + continue; + } + + var eligible = pending + .Where(p => p.IdleTimeInMilliseconds >= options.MinIdleTime.TotalMilliseconds) + .ToArray(); + + if (eligible.Length == 0) + { + continue; + } + + var messageIds = eligible + .Select(static p => (RedisValue)p.MessageId) + .ToArray(); + + var entries = await db.StreamClaimAsync( + streamOptions.Stream, + streamOptions.ConsumerGroup, + options.ClaimantConsumer, + 0, + messageIds) + .ConfigureAwait(false); + + if (entries is null || entries.Length == 0) + { + continue; + } + + var attemptById = eligible + .Where(static info => !info.MessageId.IsNullOrEmpty) + .ToDictionary( + info => info.MessageId!.ToString(), + info => (int)Math.Max(1, info.DeliveryCount), + StringComparer.Ordinal); + + foreach (var entry in entries) + { + var entryId = entry.Id.ToString(); + attemptById.TryGetValue(entryId, out var attempt); + + var lease = TryMapLease( + streamOptions, + entry, + options.ClaimantConsumer, + now, + _options.DefaultLeaseDuration, + attempt == 0 ? 
null : attempt); + + if (lease is null) + { + await AckPoisonAsync(db, streamOptions, entry.Id).ConfigureAwait(false); + continue; + } + + leases.Add(lease); + if (leases.Count >= options.BatchSize) + { + return leases; + } + } + } + + return leases; + } + + public async ValueTask DisposeAsync() + { + if (_disposed) + { + return; + } + + _disposed = true; + if (_connection is not null) + { + await _connection.CloseAsync(); + _connection.Dispose(); + } + + _connectionLock.Dispose(); + _groupInitLock.Dispose(); + GC.SuppressFinalize(this); + } + + internal async Task AcknowledgeAsync( + RedisNotifyEventLease lease, + CancellationToken cancellationToken) + { + if (!lease.TryBeginCompletion()) + { + return; + } + + var db = await GetDatabaseAsync(cancellationToken).ConfigureAwait(false); + var streamOptions = lease.StreamOptions; + + await db.StreamAcknowledgeAsync( + streamOptions.Stream, + streamOptions.ConsumerGroup, + new RedisValue[] { lease.MessageId }) + .ConfigureAwait(false); + + await db.StreamDeleteAsync( + streamOptions.Stream, + new RedisValue[] { lease.MessageId }) + .ConfigureAwait(false); + + NotifyQueueMetrics.RecordAck(TransportName, streamOptions.Stream); + + _logger.LogDebug( + "Acknowledged Notify event {EventId} on consumer {Consumer} (stream {Stream}, id {MessageId}).", + lease.Message.Event.EventId, + lease.Consumer, + streamOptions.Stream, + lease.MessageId); + } + + internal async Task RenewLeaseAsync( + RedisNotifyEventLease lease, + TimeSpan leaseDuration, + CancellationToken cancellationToken) + { + var db = await GetDatabaseAsync(cancellationToken).ConfigureAwait(false); + var streamOptions = lease.StreamOptions; + + await db.StreamClaimAsync( + streamOptions.Stream, + streamOptions.ConsumerGroup, + lease.Consumer, + 0, + new RedisValue[] { lease.MessageId }) + .ConfigureAwait(false); + + var expires = _timeProvider.GetUtcNow().Add(leaseDuration); + lease.RefreshLease(expires); + + _logger.LogDebug( + "Renewed Notify event lease for 
{EventId} until {Expires:u}.", + lease.Message.Event.EventId, + expires); + } + + internal Task ReleaseAsync( + RedisNotifyEventLease lease, + NotifyQueueReleaseDisposition disposition, + CancellationToken cancellationToken) + => Task.FromException(new NotSupportedException("Retry/abandon is not supported for Notify event streams.")); + + internal async Task DeadLetterAsync( + RedisNotifyEventLease lease, + string reason, + CancellationToken cancellationToken) + { + if (!lease.TryBeginCompletion()) + { + return; + } + + var db = await GetDatabaseAsync(cancellationToken).ConfigureAwait(false); + var streamOptions = lease.StreamOptions; + + await db.StreamAcknowledgeAsync( + streamOptions.Stream, + streamOptions.ConsumerGroup, + new RedisValue[] { lease.MessageId }) + .ConfigureAwait(false); + + await db.StreamDeleteAsync( + streamOptions.Stream, + new RedisValue[] { lease.MessageId }) + .ConfigureAwait(false); + + _logger.LogWarning( + "Dead-lettered Notify event {EventId} on stream {Stream} with reason '{Reason}'.", + lease.Message.Event.EventId, + streamOptions.Stream, + reason); + } + + internal async ValueTask PingAsync(CancellationToken cancellationToken) + { + var db = await GetDatabaseAsync(cancellationToken).ConfigureAwait(false); + _ = await db.PingAsync().ConfigureAwait(false); + } + + private NotifyRedisEventStreamOptions GetStreamOptions(string stream) + { + if (!_streamsByName.TryGetValue(stream, out var options)) + { + throw new InvalidOperationException($"Stream '{stream}' is not configured for the Notify event queue."); + } + + return options; + } + + private async Task GetDatabaseAsync(CancellationToken cancellationToken) + { + if (_connection is { IsConnected: true }) + { + return _connection.GetDatabase(_redisOptions.Database ?? -1); + } + + await _connectionLock.WaitAsync(cancellationToken).ConfigureAwait(false); + try + { + if (_connection is { IsConnected: true }) + { + return _connection.GetDatabase(_redisOptions.Database ?? 
-1); + } + + var configuration = ConfigurationOptions.Parse(_redisOptions.ConnectionString!); + configuration.AbortOnConnectFail = false; + if (_redisOptions.Database.HasValue) + { + configuration.DefaultDatabase = _redisOptions.Database; + } + + using var timeoutCts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken); + timeoutCts.CancelAfter(_redisOptions.InitializationTimeout); + + _connection = await _connectionFactory(configuration).WaitAsync(timeoutCts.Token).ConfigureAwait(false); + return _connection.GetDatabase(_redisOptions.Database ?? -1); + } + finally + { + _connectionLock.Release(); + } + } + + private async Task EnsureStreamInitializedAsync( + IDatabase database, + NotifyRedisEventStreamOptions streamOptions, + CancellationToken cancellationToken) + { + if (_initializedStreams.ContainsKey(streamOptions.Stream)) + { + return; + } + + await _groupInitLock.WaitAsync(cancellationToken).ConfigureAwait(false); + try + { + if (_initializedStreams.ContainsKey(streamOptions.Stream)) + { + return; + } + + try + { + await database.StreamCreateConsumerGroupAsync( + streamOptions.Stream, + streamOptions.ConsumerGroup, + StreamPosition.Beginning, + createStream: true) + .ConfigureAwait(false); + } + catch (RedisServerException ex) when (ex.Message.Contains("BUSYGROUP", StringComparison.OrdinalIgnoreCase)) + { + // Consumer group already exists — nothing to do. 
+ } + + _initializedStreams[streamOptions.Stream] = true; + } + finally + { + _groupInitLock.Release(); + } + } + + private static async Task AddToStreamAsync( + IDatabase database, + NotifyRedisEventStreamOptions streamOptions, + IReadOnlyList entries) + { + return await database.StreamAddAsync( + streamOptions.Stream, + entries.ToArray(), + maxLength: streamOptions.ApproximateMaxLength, + useApproximateMaxLength: streamOptions.ApproximateMaxLength is not null) + .ConfigureAwait(false); + } + + private IReadOnlyList BuildEntries( + NotifyQueueEventMessage message, + DateTimeOffset enqueuedAt, + int attempt) + { + var payload = NotifyCanonicalJsonSerializer.Serialize(message.Event); + + var entries = new List(8 + message.Attributes.Count) + { + new(NotifyQueueFields.Payload, payload), + new(NotifyQueueFields.EventId, message.Event.EventId.ToString("D")), + new(NotifyQueueFields.Tenant, message.TenantId), + new(NotifyQueueFields.Kind, message.Event.Kind), + new(NotifyQueueFields.Attempt, attempt), + new(NotifyQueueFields.EnqueuedAt, enqueuedAt.ToUnixTimeMilliseconds()), + new(NotifyQueueFields.IdempotencyKey, message.IdempotencyKey), + new(NotifyQueueFields.PartitionKey, message.PartitionKey ?? string.Empty), + new(NotifyQueueFields.TraceId, message.TraceId ?? string.Empty) + }; + + foreach (var kvp in message.Attributes) + { + entries.Add(new NameValueEntry( + NotifyQueueFields.AttributePrefix + kvp.Key, + kvp.Value)); + } + + return entries; + } + + private RedisNotifyEventLease? TryMapLease( + NotifyRedisEventStreamOptions streamOptions, + StreamEntry entry, + string consumer, + DateTimeOffset now, + TimeSpan leaseDuration, + int? attemptOverride) + { + if (entry.Values is null || entry.Values.Length == 0) + { + return null; + } + + string? payloadJson = null; + string? eventIdRaw = null; + long? enqueuedAtUnix = null; + string? idempotency = null; + string? partitionKey = null; + string? traceId = null; + var attempt = attemptOverride ?? 
1; + var attributes = new Dictionary(StringComparer.Ordinal); + + foreach (var field in entry.Values) + { + var name = field.Name.ToString(); + var value = field.Value; + if (name.Equals(NotifyQueueFields.Payload, StringComparison.Ordinal)) + { + payloadJson = value.ToString(); + } + else if (name.Equals(NotifyQueueFields.EventId, StringComparison.Ordinal)) + { + eventIdRaw = value.ToString(); + } + else if (name.Equals(NotifyQueueFields.Attempt, StringComparison.Ordinal)) + { + if (int.TryParse(value.ToString(), NumberStyles.Integer, CultureInfo.InvariantCulture, out var parsed)) + { + attempt = Math.Max(parsed, attempt); + } + } + else if (name.Equals(NotifyQueueFields.EnqueuedAt, StringComparison.Ordinal)) + { + if (long.TryParse(value.ToString(), NumberStyles.Integer, CultureInfo.InvariantCulture, out var unix)) + { + enqueuedAtUnix = unix; + } + } + else if (name.Equals(NotifyQueueFields.IdempotencyKey, StringComparison.Ordinal)) + { + var text = value.ToString(); + idempotency = string.IsNullOrWhiteSpace(text) ? null : text; + } + else if (name.Equals(NotifyQueueFields.PartitionKey, StringComparison.Ordinal)) + { + var text = value.ToString(); + partitionKey = string.IsNullOrWhiteSpace(text) ? null : text; + } + else if (name.Equals(NotifyQueueFields.TraceId, StringComparison.Ordinal)) + { + var text = value.ToString(); + traceId = string.IsNullOrWhiteSpace(text) ? 
null : text; + } + else if (name.StartsWith(NotifyQueueFields.AttributePrefix, StringComparison.Ordinal)) + { + var key = name[NotifyQueueFields.AttributePrefix.Length..]; + attributes[key] = value.ToString(); + } + } + + if (payloadJson is null || enqueuedAtUnix is null) + { + return null; + } + + NotifyEvent notifyEvent; + try + { + notifyEvent = NotifyCanonicalJsonSerializer.Deserialize(payloadJson); + } + catch (Exception ex) + { + _logger.LogWarning( + ex, + "Failed to deserialize Notify event payload for stream {Stream} entry {EntryId}.", + streamOptions.Stream, + entry.Id.ToString()); + return null; + } + + var attributeView = attributes.Count == 0 + ? EmptyReadOnlyDictionary.Instance + : new ReadOnlyDictionary(attributes); + + var message = new NotifyQueueEventMessage( + notifyEvent, + streamOptions.Stream, + idempotencyKey: idempotency ?? notifyEvent.EventId.ToString("N"), + partitionKey: partitionKey, + traceId: traceId, + attributes: attributeView); + + var enqueuedAt = DateTimeOffset.FromUnixTimeMilliseconds(enqueuedAtUnix.Value); + var leaseExpiresAt = now.Add(leaseDuration); + + return new RedisNotifyEventLease( + this, + streamOptions, + entry.Id.ToString(), + message, + attempt, + consumer, + enqueuedAt, + leaseExpiresAt); + } + + private async Task AckPoisonAsync( + IDatabase database, + NotifyRedisEventStreamOptions streamOptions, + RedisValue messageId) + { + await database.StreamAcknowledgeAsync( + streamOptions.Stream, + streamOptions.ConsumerGroup, + new RedisValue[] { messageId }) + .ConfigureAwait(false); + + await database.StreamDeleteAsync( + streamOptions.Stream, + new RedisValue[] { messageId }) + .ConfigureAwait(false); + } +} diff --git a/src/StellaOps.Notify.Queue/StellaOps.Notify.Queue.csproj b/src/StellaOps.Notify.Queue/StellaOps.Notify.Queue.csproj index 6d665dea..7c4b247b 100644 --- a/src/StellaOps.Notify.Queue/StellaOps.Notify.Queue.csproj +++ b/src/StellaOps.Notify.Queue/StellaOps.Notify.Queue.csproj @@ -1,7 +1,23 @@ - - - 
net10.0 - enable - enable - - + + + net10.0 + enable + enable + + + + + + + + + + + + + + + + + + diff --git a/src/StellaOps.Notify.Queue/TASKS.md b/src/StellaOps.Notify.Queue/TASKS.md index dae08651..92d0320f 100644 --- a/src/StellaOps.Notify.Queue/TASKS.md +++ b/src/StellaOps.Notify.Queue/TASKS.md @@ -2,6 +2,6 @@ | ID | Status | Owner(s) | Depends on | Description | Exit Criteria | |----|--------|----------|------------|-------------|---------------| -| NOTIFY-QUEUE-15-401 | TODO | Notify Queue Guild | NOTIFY-MODELS-15-101 | Build queue abstraction + Redis Streams adapter with ack/claim APIs, idempotency tokens, serialization contracts. | Adapter integration tests cover enqueue/dequeue/ack; ordering preserved; idempotency tokens supported. | -| NOTIFY-QUEUE-15-402 | TODO | Notify Queue Guild | NOTIFY-QUEUE-15-401 | Add NATS JetStream adapter with configuration binding, health probes, failover. | Health endpoints verified; failover documented; integration tests exercise both adapters. | -| NOTIFY-QUEUE-15-403 | TODO | Notify Queue Guild | NOTIFY-QUEUE-15-401 | Delivery queue for channel actions with retry schedules, poison queues, and metrics instrumentation. | Delivery queue integration tests cover retries/dead-letter; metrics/logging emitted per spec. | +| NOTIFY-QUEUE-15-401 | DONE (2025-10-23) | Notify Queue Guild | NOTIFY-MODELS-15-101 | Build queue abstraction + Redis Streams adapter with ack/claim APIs, idempotency tokens, serialization contracts. | Adapter integration tests cover enqueue/dequeue/ack; ordering preserved; idempotency tokens supported. | +| NOTIFY-QUEUE-15-402 | DONE (2025-10-23) | Notify Queue Guild | NOTIFY-QUEUE-15-401 | Add NATS JetStream adapter with configuration binding, health probes, failover. | Health endpoints verified; failover documented; integration tests exercise both adapters. 
| +| NOTIFY-QUEUE-15-403 | DONE (2025-10-23) | Notify Queue Guild | NOTIFY-QUEUE-15-401 | Delivery queue for channel actions with retry schedules, poison queues, and metrics instrumentation. | Delivery queue integration tests cover retries/dead-letter; metrics/logging emitted per spec. | diff --git a/src/StellaOps.Notify.Worker.Tests/NotifyEventLeaseProcessorTests.cs b/src/StellaOps.Notify.Worker.Tests/NotifyEventLeaseProcessorTests.cs new file mode 100644 index 00000000..3c536ab2 --- /dev/null +++ b/src/StellaOps.Notify.Worker.Tests/NotifyEventLeaseProcessorTests.cs @@ -0,0 +1,167 @@ +using System; +using System.Collections.Generic; +using System.Threading; +using System.Threading.Tasks; +using FluentAssertions; +using Microsoft.Extensions.Logging.Abstractions; +using Microsoft.Extensions.Options; +using StellaOps.Notify.Models; +using StellaOps.Notify.Queue; +using StellaOps.Notify.Worker; +using StellaOps.Notify.Worker.Handlers; +using StellaOps.Notify.Worker.Processing; +using Xunit; + +namespace StellaOps.Notify.Worker.Tests; + +public sealed class NotifyEventLeaseProcessorTests +{ + [Fact] + public async Task ProcessOnce_ShouldAcknowledgeSuccessfulLease() + { + var lease = new FakeLease(); + var queue = new FakeEventQueue(lease); + var handler = new TestHandler(); + var options = Options.Create(new NotifyWorkerOptions { LeaseBatchSize = 1, LeaseDuration = TimeSpan.FromSeconds(5) }); + var processor = new NotifyEventLeaseProcessor(queue, handler, options, NullLogger.Instance, TimeProvider.System); + + var processed = await processor.ProcessOnceAsync(CancellationToken.None); + + processed.Should().Be(1); + lease.AcknowledgeCount.Should().Be(1); + lease.ReleaseCount.Should().Be(0); + } + + [Fact] + public async Task ProcessOnce_ShouldRetryOnHandlerFailure() + { + var lease = new FakeLease(); + var queue = new FakeEventQueue(lease); + var handler = new TestHandler(shouldThrow: true); + var options = Options.Create(new NotifyWorkerOptions { LeaseBatchSize = 1, 
LeaseDuration = TimeSpan.FromSeconds(5) }); + var processor = new NotifyEventLeaseProcessor(queue, handler, options, NullLogger.Instance, TimeProvider.System); + + var processed = await processor.ProcessOnceAsync(CancellationToken.None); + + processed.Should().Be(1); + lease.AcknowledgeCount.Should().Be(0); + lease.ReleaseCount.Should().Be(1); + lease.LastDisposition.Should().Be(NotifyQueueReleaseDisposition.Retry); + } + + private sealed class FakeEventQueue : INotifyEventQueue + { + private readonly Queue> _leases; + + public FakeEventQueue(params INotifyQueueLease[] leases) + { + _leases = new Queue>(leases); + } + + public ValueTask PublishAsync(NotifyQueueEventMessage message, CancellationToken cancellationToken = default) + => throw new NotSupportedException(); + + public ValueTask>> LeaseAsync(NotifyQueueLeaseRequest request, CancellationToken cancellationToken = default) + { + if (_leases.Count == 0) + { + return ValueTask.FromResult>>(Array.Empty>()); + } + + return ValueTask.FromResult>>(new[] { _leases.Dequeue() }); + } + + public ValueTask>> ClaimExpiredAsync(NotifyQueueClaimOptions options, CancellationToken cancellationToken = default) + => ValueTask.FromResult>>(Array.Empty>()); + } + + private sealed class FakeLease : INotifyQueueLease + { + private readonly NotifyQueueEventMessage _message; + + public FakeLease() + { + var notifyEvent = NotifyEvent.Create( + Guid.NewGuid(), + kind: "test.event", + tenant: "tenant-1", + ts: DateTimeOffset.UtcNow, + payload: null); + + _message = new NotifyQueueEventMessage(notifyEvent, "notify:events", traceId: "trace-123"); + } + + public string MessageId { get; } = Guid.NewGuid().ToString("n"); + + public int Attempt { get; internal set; } = 1; + + public DateTimeOffset EnqueuedAt { get; } = DateTimeOffset.UtcNow; + + public DateTimeOffset LeaseExpiresAt { get; private set; } = DateTimeOffset.UtcNow.AddSeconds(30); + + public string Consumer { get; } = "worker-1"; + + public string Stream => _message.Stream; + + 
public string TenantId => _message.TenantId; + + public string? PartitionKey => _message.PartitionKey; + + public string IdempotencyKey => _message.IdempotencyKey; + + public string? TraceId => _message.TraceId; + + public IReadOnlyDictionary Attributes => _message.Attributes; + + public NotifyQueueEventMessage Message => _message; + + public int AcknowledgeCount { get; private set; } + + public int ReleaseCount { get; private set; } + + public NotifyQueueReleaseDisposition? LastDisposition { get; private set; } + + public Task AcknowledgeAsync(CancellationToken cancellationToken = default) + { + AcknowledgeCount++; + return Task.CompletedTask; + } + + public Task RenewAsync(TimeSpan leaseDuration, CancellationToken cancellationToken = default) + { + LeaseExpiresAt = DateTimeOffset.UtcNow.Add(leaseDuration); + return Task.CompletedTask; + } + + public Task ReleaseAsync(NotifyQueueReleaseDisposition disposition, CancellationToken cancellationToken = default) + { + LastDisposition = disposition; + ReleaseCount++; + Attempt++; + return Task.CompletedTask; + } + + public Task DeadLetterAsync(string reason, CancellationToken cancellationToken = default) + => Task.CompletedTask; + } + + private sealed class TestHandler : INotifyEventHandler + { + private readonly bool _shouldThrow; + + public TestHandler(bool shouldThrow = false) + { + _shouldThrow = shouldThrow; + } + + public Task HandleAsync(NotifyQueueEventMessage message, CancellationToken cancellationToken) + { + if (_shouldThrow) + { + throw new InvalidOperationException("handler failure"); + } + + return Task.CompletedTask; + } + } +} diff --git a/src/StellaOps.Notify.Worker.Tests/StellaOps.Notify.Worker.Tests.csproj b/src/StellaOps.Notify.Worker.Tests/StellaOps.Notify.Worker.Tests.csproj new file mode 100644 index 00000000..785775c2 --- /dev/null +++ b/src/StellaOps.Notify.Worker.Tests/StellaOps.Notify.Worker.Tests.csproj @@ -0,0 +1,26 @@ + + + net10.0 + enable + enable + false + false + + + + + + + all + + + 
all + + + + + + + + + diff --git a/src/StellaOps.Notify.Worker/Handlers/INotifyEventHandler.cs b/src/StellaOps.Notify.Worker/Handlers/INotifyEventHandler.cs new file mode 100644 index 00000000..5387c9df --- /dev/null +++ b/src/StellaOps.Notify.Worker/Handlers/INotifyEventHandler.cs @@ -0,0 +1,10 @@ +using System.Threading; +using System.Threading.Tasks; +using StellaOps.Notify.Queue; + +namespace StellaOps.Notify.Worker.Handlers; + +public interface INotifyEventHandler +{ + Task HandleAsync(NotifyQueueEventMessage message, CancellationToken cancellationToken); +} diff --git a/src/StellaOps.Notify.Worker/Handlers/NoOpNotifyEventHandler.cs b/src/StellaOps.Notify.Worker/Handlers/NoOpNotifyEventHandler.cs new file mode 100644 index 00000000..11b4594c --- /dev/null +++ b/src/StellaOps.Notify.Worker/Handlers/NoOpNotifyEventHandler.cs @@ -0,0 +1,25 @@ +using System.Threading; +using System.Threading.Tasks; +using Microsoft.Extensions.Logging; +using StellaOps.Notify.Queue; + +namespace StellaOps.Notify.Worker.Handlers; + +internal sealed class NoOpNotifyEventHandler : INotifyEventHandler +{ + private readonly ILogger _logger; + + public NoOpNotifyEventHandler(ILogger logger) + { + _logger = logger; + } + + public Task HandleAsync(NotifyQueueEventMessage message, CancellationToken cancellationToken) + { + _logger.LogDebug( + "No-op handler acknowledged event {EventId} (tenant {TenantId}).", + message.Event.EventId, + message.TenantId); + return Task.CompletedTask; + } +} diff --git a/src/StellaOps.Notify.Worker/NotifyWorkerOptions.cs b/src/StellaOps.Notify.Worker/NotifyWorkerOptions.cs new file mode 100644 index 00000000..f403a33c --- /dev/null +++ b/src/StellaOps.Notify.Worker/NotifyWorkerOptions.cs @@ -0,0 +1,52 @@ +using System; + +namespace StellaOps.Notify.Worker; + +public sealed class NotifyWorkerOptions +{ + /// + /// Worker identifier prefix; defaults to machine name. + /// + public string? 
WorkerId { get; set; } + + /// + /// Number of messages to lease per iteration. + /// + public int LeaseBatchSize { get; set; } = 16; + + /// + /// Duration a lease remains active before it becomes eligible for claim. + /// + public TimeSpan LeaseDuration { get; set; } = TimeSpan.FromSeconds(30); + + /// + /// Delay applied when no work is available. + /// + public TimeSpan IdleDelay { get; set; } = TimeSpan.FromMilliseconds(250); + + /// + /// Maximum number of event leases processed concurrently. + /// + public int MaxConcurrency { get; set; } = 4; + + /// + /// Maximum number of consecutive failures before the worker delays. + /// + public int FailureBackoffThreshold { get; set; } = 3; + + /// + /// Delay applied when the failure threshold is reached. + /// + public TimeSpan FailureBackoffDelay { get; set; } = TimeSpan.FromSeconds(5); + + internal string ResolveWorkerId() + { + if (!string.IsNullOrWhiteSpace(WorkerId)) + { + return WorkerId!; + } + + var host = Environment.MachineName; + return $"{host}-{Guid.NewGuid():n}"; + } +} diff --git a/src/StellaOps.Notify.Worker/Processing/NotifyEventLeaseProcessor.cs b/src/StellaOps.Notify.Worker/Processing/NotifyEventLeaseProcessor.cs new file mode 100644 index 00000000..87aee894 --- /dev/null +++ b/src/StellaOps.Notify.Worker/Processing/NotifyEventLeaseProcessor.cs @@ -0,0 +1,146 @@ +using System; +using System.Collections.Generic; +using System.Diagnostics; +using System.Threading; +using System.Threading.Tasks; +using Microsoft.Extensions.Logging; +using Microsoft.Extensions.Options; +using StellaOps.Notify.Queue; +using StellaOps.Notify.Worker.Handlers; + +namespace StellaOps.Notify.Worker.Processing; + +internal sealed class NotifyEventLeaseProcessor +{ + private static readonly ActivitySource ActivitySource = new("StellaOps.Notify.Worker"); + + private readonly INotifyEventQueue _queue; + private readonly INotifyEventHandler _handler; + private readonly NotifyWorkerOptions _options; + private readonly ILogger 
_logger; + private readonly TimeProvider _timeProvider; + private readonly string _workerId; + + public NotifyEventLeaseProcessor( + INotifyEventQueue queue, + INotifyEventHandler handler, + IOptions options, + ILogger logger, + TimeProvider timeProvider) + { + _queue = queue ?? throw new ArgumentNullException(nameof(queue)); + _handler = handler ?? throw new ArgumentNullException(nameof(handler)); + _options = options?.Value ?? throw new ArgumentNullException(nameof(options)); + _logger = logger ?? throw new ArgumentNullException(nameof(logger)); + _timeProvider = timeProvider ?? TimeProvider.System; + _workerId = _options.ResolveWorkerId(); + } + + public async Task ProcessOnceAsync(CancellationToken cancellationToken) + { + cancellationToken.ThrowIfCancellationRequested(); + + var leaseRequest = new NotifyQueueLeaseRequest( + consumer: _workerId, + batchSize: Math.Max(1, _options.LeaseBatchSize), + leaseDuration: _options.LeaseDuration <= TimeSpan.Zero ? TimeSpan.FromSeconds(30) : _options.LeaseDuration); + + IReadOnlyList> leases; + try + { + leases = await _queue.LeaseAsync(leaseRequest, cancellationToken).ConfigureAwait(false); + } + catch (Exception ex) + { + _logger.LogError(ex, "Failed to lease Notify events."); + throw; + } + + if (leases.Count == 0) + { + return 0; + } + + var processed = 0; + foreach (var lease in leases) + { + cancellationToken.ThrowIfCancellationRequested(); + processed++; + await ProcessLeaseAsync(lease, cancellationToken).ConfigureAwait(false); + } + + return processed; + } + + private async Task ProcessLeaseAsync( + INotifyQueueLease lease, + CancellationToken cancellationToken) + { + var message = lease.Message; + var correlationId = message.TraceId ?? 
message.Event.EventId.ToString("N"); + + using var scope = _logger.BeginScope(new Dictionary + { + ["notifyTraceId"] = correlationId, + ["notifyTenantId"] = message.TenantId, + ["notifyEventId"] = message.Event.EventId, + ["notifyAttempt"] = lease.Attempt + }); + + using var activity = ActivitySource.StartActivity("notify.event.process", ActivityKind.Consumer); + activity?.SetTag("notify.tenant_id", message.TenantId); + activity?.SetTag("notify.event_id", message.Event.EventId); + activity?.SetTag("notify.attempt", lease.Attempt); + activity?.SetTag("notify.worker_id", _workerId); + + try + { + _logger.LogInformation( + "Processing notify event {EventId} (tenant {TenantId}, attempt {Attempt}).", + message.Event.EventId, + message.TenantId, + lease.Attempt); + + await _handler.HandleAsync(message, cancellationToken).ConfigureAwait(false); + + await lease.AcknowledgeAsync(cancellationToken).ConfigureAwait(false); + _logger.LogInformation( + "Acknowledged notify event {EventId} (tenant {TenantId}).", + message.Event.EventId, + message.TenantId); + } + catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) + { + _logger.LogWarning( + "Worker cancellation requested while processing event {EventId}; returning lease to queue.", + message.Event.EventId); + + await SafeReleaseAsync(lease, NotifyQueueReleaseDisposition.Retry, CancellationToken.None).ConfigureAwait(false); + throw; + } + catch (Exception ex) + { + _logger.LogError( + ex, + "Failed to process notify event {EventId}; scheduling retry.", + message.Event.EventId); + + await SafeReleaseAsync(lease, NotifyQueueReleaseDisposition.Retry, cancellationToken).ConfigureAwait(false); + } + } + + private static async Task SafeReleaseAsync( + INotifyQueueLease lease, + NotifyQueueReleaseDisposition disposition, + CancellationToken cancellationToken) + { + try + { + await lease.ReleaseAsync(disposition, cancellationToken).ConfigureAwait(false); + } + catch when 
(cancellationToken.IsCancellationRequested) + { + // Suppress release errors during shutdown. + } + } +} diff --git a/src/StellaOps.Notify.Worker/Processing/NotifyEventLeaseWorker.cs b/src/StellaOps.Notify.Worker/Processing/NotifyEventLeaseWorker.cs new file mode 100644 index 00000000..7f968b83 --- /dev/null +++ b/src/StellaOps.Notify.Worker/Processing/NotifyEventLeaseWorker.cs @@ -0,0 +1,63 @@ +using System; +using System.Threading; +using System.Threading.Tasks; +using Microsoft.Extensions.Hosting; +using Microsoft.Extensions.Logging; +using Microsoft.Extensions.Options; + +namespace StellaOps.Notify.Worker.Processing; + +internal sealed class NotifyEventLeaseWorker : BackgroundService +{ + private readonly NotifyEventLeaseProcessor _processor; + private readonly NotifyWorkerOptions _options; + private readonly ILogger _logger; + + public NotifyEventLeaseWorker( + NotifyEventLeaseProcessor processor, + IOptions options, + ILogger logger) + { + _processor = processor ?? throw new ArgumentNullException(nameof(processor)); + _options = options?.Value ?? throw new ArgumentNullException(nameof(options)); + _logger = logger ?? throw new ArgumentNullException(nameof(logger)); + } + + protected override async Task ExecuteAsync(CancellationToken stoppingToken) + { + var idleDelay = _options.IdleDelay <= TimeSpan.Zero + ? 
TimeSpan.FromMilliseconds(500) + : _options.IdleDelay; + + while (!stoppingToken.IsCancellationRequested) + { + int processed; + try + { + processed = await _processor.ProcessOnceAsync(stoppingToken).ConfigureAwait(false); + } + catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested) + { + break; + } + catch (Exception ex) + { + _logger.LogError(ex, "Notify worker processing loop encountered an error."); + await Task.Delay(_options.FailureBackoffDelay, stoppingToken).ConfigureAwait(false); + continue; + } + + if (processed == 0) + { + try + { + await Task.Delay(idleDelay, stoppingToken).ConfigureAwait(false); + } + catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested) + { + break; + } + } + } + } +} diff --git a/src/StellaOps.Notify.Worker/Program.cs b/src/StellaOps.Notify.Worker/Program.cs new file mode 100644 index 00000000..8ec683ef --- /dev/null +++ b/src/StellaOps.Notify.Worker/Program.cs @@ -0,0 +1,33 @@ +using Microsoft.Extensions.Configuration; +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Hosting; +using Microsoft.Extensions.Logging; +using StellaOps.Notify.Queue; +using StellaOps.Notify.Worker; +using StellaOps.Notify.Worker.Handlers; +using StellaOps.Notify.Worker.Processing; + +var builder = Host.CreateApplicationBuilder(args); + +builder.Configuration + .AddJsonFile("appsettings.json", optional: true, reloadOnChange: true) + .AddEnvironmentVariables(prefix: "NOTIFY_"); + +builder.Logging.ClearProviders(); +builder.Logging.AddSimpleConsole(options => +{ + options.TimestampFormat = "yyyy-MM-ddTHH:mm:ss.fffZ "; + options.UseUtcTimestamp = true; +}); + +builder.Services.Configure(builder.Configuration.GetSection("notify:worker")); +builder.Services.AddSingleton(TimeProvider.System); + +builder.Services.AddNotifyEventQueue(builder.Configuration, "notify:queue"); +builder.Services.AddNotifyDeliveryQueue(builder.Configuration, "notify:deliveryQueue"); + 
+builder.Services.AddSingleton(); +builder.Services.AddSingleton(); +builder.Services.AddHostedService(); + +await builder.Build().RunAsync().ConfigureAwait(false); diff --git a/src/StellaOps.Notify.Worker/Properties/AssemblyInfo.cs b/src/StellaOps.Notify.Worker/Properties/AssemblyInfo.cs new file mode 100644 index 00000000..7a46cf70 --- /dev/null +++ b/src/StellaOps.Notify.Worker/Properties/AssemblyInfo.cs @@ -0,0 +1,3 @@ +using System.Runtime.CompilerServices; + +[assembly: InternalsVisibleTo("StellaOps.Notify.Worker.Tests")] diff --git a/src/StellaOps.Notify.Worker/StellaOps.Notify.Worker.csproj b/src/StellaOps.Notify.Worker/StellaOps.Notify.Worker.csproj index c444aa16..0ee4ae2d 100644 --- a/src/StellaOps.Notify.Worker/StellaOps.Notify.Worker.csproj +++ b/src/StellaOps.Notify.Worker/StellaOps.Notify.Worker.csproj @@ -1,8 +1,24 @@ - - - net10.0 - enable - enable - Exe - - + + + net10.0 + enable + enable + Exe + + + + + + + + + + + + + + + PreserveNewest + + + diff --git a/src/StellaOps.Notify.Worker/TASKS.md b/src/StellaOps.Notify.Worker/TASKS.md index b35be7f6..fbb40aff 100644 --- a/src/StellaOps.Notify.Worker/TASKS.md +++ b/src/StellaOps.Notify.Worker/TASKS.md @@ -2,7 +2,7 @@ | ID | Status | Owner(s) | Depends on | Description | Exit Criteria | |----|--------|----------|------------|-------------|---------------| -| NOTIFY-WORKER-15-201 | TODO | Notify Worker Guild | NOTIFY-QUEUE-15-401 | Implement bus subscription + leasing loop with correlation IDs, backoff, dead-letter handling (§1–§5). | Worker consumes events from queue, ack/retry behaviour proven in integration tests; logs include correlation IDs. | -| NOTIFY-WORKER-15-202 | TODO | Notify Worker Guild | NOTIFY-ENGINE-15-301 | Wire rules evaluation pipeline (tenant scoping, filters, throttles, digests, idempotency) with deterministic decisions. | Evaluation unit tests cover rule combinations; throttles/digests produce expected suppression; idempotency keys validated. 
| -| NOTIFY-WORKER-15-203 | TODO | Notify Worker Guild | NOTIFY-ENGINE-15-302 | Channel dispatch orchestration: invoke connectors, manage retries/jitter, record delivery outcomes. | Connector mocks show retries/backoff; delivery results stored; metrics incremented per outcome. | +| NOTIFY-WORKER-15-201 | DONE (2025-10-23) | Notify Worker Guild | NOTIFY-QUEUE-15-401 | Implement bus subscription + leasing loop with correlation IDs, backoff, dead-letter handling (§1–§5). | Worker consumes events from queue, ack/retry behaviour proven in integration tests; logs include correlation IDs. | +| NOTIFY-WORKER-15-202 | TODO | Notify Worker Guild | NOTIFY-WORKER-15-201 | Wire rules evaluation pipeline (tenant scoping, filters, throttles, digests, idempotency) with deterministic decisions. | Evaluation unit tests cover rule combinations; throttles/digests produce expected suppression; idempotency keys validated. | +| NOTIFY-WORKER-15-203 | TODO | Notify Worker Guild | NOTIFY-WORKER-15-202 | Channel dispatch orchestration: invoke connectors, manage retries/jitter, record delivery outcomes. | Connector mocks show retries/backoff; delivery results stored; metrics incremented per outcome. | | NOTIFY-WORKER-15-204 | TODO | Notify Worker Guild | NOTIFY-WORKER-15-203 | Metrics/telemetry: `notify.sent_total`, `notify.dropped_total`, latency histograms, tracing integration. | Metrics emitted per spec; OTLP spans annotated; dashboards documented. 
| diff --git a/src/StellaOps.Notify.Worker/appsettings.json b/src/StellaOps.Notify.Worker/appsettings.json new file mode 100644 index 00000000..56b6cce6 --- /dev/null +++ b/src/StellaOps.Notify.Worker/appsettings.json @@ -0,0 +1,43 @@ +{ + "Logging": { + "LogLevel": { + "Default": "Information", + "Microsoft": "Warning", + "Microsoft.Hosting.Lifetime": "Information" + } + }, + "notify": { + "worker": { + "leaseBatchSize": 16, + "leaseDuration": "00:00:30", + "idleDelay": "00:00:00.250", + "maxConcurrency": 4, + "failureBackoffThreshold": 3, + "failureBackoffDelay": "00:00:05" + }, + "queue": { + "transport": "Redis", + "redis": { + "connectionString": "localhost:6379", + "streams": [ + { + "stream": "notify:events", + "consumerGroup": "notify-workers", + "idempotencyKeyPrefix": "notify:events:idemp:", + "approximateMaxLength": 100000 + } + ] + } + }, + "deliveryQueue": { + "transport": "Redis", + "redis": { + "connectionString": "localhost:6379", + "streamName": "notify:deliveries", + "consumerGroup": "notify-delivery", + "idempotencyKeyPrefix": "notify:deliveries:idemp:", + "deadLetterStreamName": "notify:deliveries:dead" + } + } + } +} diff --git a/src/StellaOps.UI/TASKS.md b/src/StellaOps.UI/TASKS.md index f332bfec..9650b1a9 100644 --- a/src/StellaOps.UI/TASKS.md +++ b/src/StellaOps.UI/TASKS.md @@ -2,11 +2,11 @@ | ID | Status | Owner(s) | Depends on | Description | Exit Criteria | |----|--------|----------|------------|-------------|---------------| -| UI-AUTH-13-001 | TODO | UI Guild | AUTH-DPOP-11-001, AUTH-MTLS-11-002 | Integrate Authority OIDC + DPoP flows with session management. | Login/logout flows pass e2e tests; tokens refreshed; DPoP nonce handling validated. | +| UI-AUTH-13-001 | DONE (2025-10-23) | UI Guild | AUTH-DPOP-11-001, AUTH-MTLS-11-002 | Integrate Authority OIDC + DPoP flows with session management. | Login/logout flows pass e2e tests; tokens refreshed; DPoP nonce handling validated. 
| | UI-SCANS-13-002 | TODO | UI Guild | SCANNER-WEB-09-102, SIGNER-API-11-101 | Build scans module (list/detail/SBOM/diff/attestation) with performance + accessibility targets. | Cypress tests cover SBOM/diff; performance budgets met; accessibility checks pass. | | UI-VEX-13-003 | TODO | UI Guild | EXCITITOR-CORE-02-001, EXCITITOR-EXPORT-01-005 | Implement VEX explorer + policy editor with preview integration. | VEX views render consensus/conflicts; staged policy preview works; accessibility checks pass. | | UI-ADMIN-13-004 | TODO | UI Guild | AUTH-MTLS-11-002 | Deliver admin area (tenants/clients/quotas/licensing) with RBAC + audit hooks. | Admin e2e tests pass; unauthorized access blocked; telemetry wired. | -| UI-ATTEST-11-005 | TODO | UI Guild | SIGNER-API-11-101, ATTESTOR-API-11-201 | Attestation visibility (Rekor id, status) on Scan Detail. | UI shows Rekor UUID/status; mock attestation fixtures displayed; tests cover success/failure. | +| UI-ATTEST-11-005 | DONE (2025-10-23) | UI Guild | SIGNER-API-11-101, ATTESTOR-API-11-201 | Attestation visibility (Rekor id, status) on Scan Detail. | UI shows Rekor UUID/status; mock attestation fixtures displayed; tests cover success/failure. | | UI-SCHED-13-005 | TODO | UI Guild | SCHED-WEB-16-101 | Scheduler panel: schedules CRUD, run history, dry-run preview using API/mocks. | Panel functional with mocked endpoints; UX signoff; integration tests added. | | UI-NOTIFY-13-006 | DOING (2025-10-19) | UI Guild | NOTIFY-WEB-15-101 | Notify panel: channels/rules CRUD, deliveries view, test send integration. | Panel interacts with mocked Notify API; tests cover rule lifecycle; docs updated. | | UI-POLICY-13-007 | TODO | UI Guild | POLICY-CORE-09-006, SCANNER-WEB-09-103 | Surface policy confidence metadata (band, age, quiet provenance) on preview and report views. | UI renders new columns/tooltips, accessibility and responsive checks pass, Cypress regression updated with confidence fixtures. 
| diff --git a/src/StellaOps.Web/README.md b/src/StellaOps.Web/README.md index d47caeeb..f58e34bc 100644 --- a/src/StellaOps.Web/README.md +++ b/src/StellaOps.Web/README.md @@ -33,6 +33,26 @@ Run `ng build` to build the project. The build artifacts will be stored in the ` - `npm run test:watch` keeps Karma in watch mode for local development. `verify:chromium` prints every location inspected (environment overrides, system paths, `.cache/chromium/`). Set `CHROME_BIN` or `STELLAOPS_CHROMIUM_BIN` if you host the binary in a non-standard path. + +## Runtime configuration + +The SPA loads environment details from `/config.json` at startup. During development we ship a stub configuration under `src/config/config.json`; adjust the issuer, client ID, and API base URLs to match your Authority instance. To reset, copy `src/config/config.sample.json` back to `src/config/config.json`: + +```bash +cp src/config/config.sample.json src/config/config.json +``` + +When packaging for another environment, replace the file before building so the generated bundle contains the correct defaults. Gateways that rewrite `/config.json` at request time can override these settings without rebuilding. + +## End-to-end tests + +Playwright drives the high-level auth UX using the stub configuration above. Ensure the Angular dev server can bind to `127.0.0.1:4400`, then run: + +```bash +npm run test:e2e +``` + +The Playwright config auto-starts `npm run serve:test` and intercepts Authority redirects, so no live IdP is required. For CI/offline nodes, pre-install the required browsers via `npx playwright install --with-deps` and cache the results alongside your npm cache. 
## Running end-to-end tests diff --git a/src/StellaOps.Web/TASKS.md b/src/StellaOps.Web/TASKS.md index bf519f49..2d0d2485 100644 --- a/src/StellaOps.Web/TASKS.md +++ b/src/StellaOps.Web/TASKS.md @@ -6,3 +6,4 @@ | WEB1.TRIVY-SETTINGS-TESTS | DONE (2025-10-21) | UX Specialist, Angular Eng | WEB1.TRIVY-SETTINGS | **DONE (2025-10-21)** – Added headless Karma harness (`ng test --watch=false`) wired to ChromeHeadless/CI launcher, created `karma.conf.cjs`, updated npm scripts + docs with Chromium prerequisites so CI/offline runners can execute specs deterministically. | Angular CLI available (npm scripts chained), Karma suite for Trivy DB components passing locally and in CI, docs note required prerequisites. | | WEB1.DEPS-13-001 | DONE (2025-10-21) | UX Specialist, Angular Eng, DevEx | WEB1.TRIVY-SETTINGS-TESTS | Stabilise Angular workspace dependencies for CI/offline nodes: refresh `package-lock.json`, ensure Puppeteer/Chromium binaries optional, document deterministic install workflow. | `npm install` completes without manual intervention on air-gapped nodes, `npm test` headless run succeeds from clean checkout, README updated with lockfile + cache steps. | | WEB-POLICY-FIXTURES-10-001 | DONE (2025-10-23) | Angular Eng | SAMPLES-13-004 | Wire policy preview/report doc fixtures into UI harness (test utility or Storybook substitute) with type bindings and validation guard so UI stays aligned with documented payloads. | JSON fixtures importable within Angular workspace, typed helpers exported for reuse, Karma spec validates critical fields (confidence band, unknown metrics, DSSE summary). | +| UI-AUTH-13-001 | DONE (2025-10-23) | UI Guild | AUTH-DPOP-11-001, AUTH-MTLS-11-002 | Integrate Authority OIDC + DPoP flows with session management (Angular SPA). | APP_INITIALIZER loads runtime config; login/logout flows drive Authority code flow; DPoP proofs generated/stored, nonce retries handled; unit specs cover proof binding + session persistence. 
| diff --git a/src/StellaOps.Web/angular.json b/src/StellaOps.Web/angular.json index a1c26707..91cbe8a1 100644 --- a/src/StellaOps.Web/angular.json +++ b/src/StellaOps.Web/angular.json @@ -25,10 +25,15 @@ ], "tsConfig": "tsconfig.app.json", "inlineStyleLanguage": "scss", - "assets": [ - "src/favicon.ico", - "src/assets" - ], + "assets": [ + "src/favicon.ico", + "src/assets", + { + "glob": "config.json", + "input": "src/config", + "output": "." + } + ], "styles": [ "src/styles.scss" ], @@ -88,7 +93,12 @@ "inlineStyleLanguage": "scss", "assets": [ "src/favicon.ico", - "src/assets" + "src/assets", + { + "glob": "config.json", + "input": "src/config", + "output": "." + } ], "styles": [ "src/styles.scss" diff --git a/src/StellaOps.Web/package-lock.json b/src/StellaOps.Web/package-lock.json index d617ec82..a6a4fab2 100644 --- a/src/StellaOps.Web/package-lock.json +++ b/src/StellaOps.Web/package-lock.json @@ -24,6 +24,7 @@ "@angular-devkit/build-angular": "^17.3.17", "@angular/cli": "^17.3.17", "@angular/compiler-cli": "^17.3.0", + "@playwright/test": "^1.47.2", "@types/jasmine": "~5.1.0", "jasmine-core": "~5.1.0", "karma": "~6.4.0", @@ -5074,6 +5075,21 @@ "node": ">=14" } }, + "node_modules/@playwright/test": { + "version": "1.56.1", + "resolved": "https://registry.npmjs.org/@playwright/test/-/test-1.56.1.tgz", + "integrity": "sha512-vSMYtL/zOcFpvJCW71Q/OEGQb7KYBPAdKh35WNSkaZA75JlAO8ED8UN6GUNTm3drWomcbcqRPFqQbLae8yBTdg==", + "dev": true, + "dependencies": { + "playwright": "1.56.1" + }, + "bin": { + "playwright": "cli.js" + }, + "engines": { + "node": ">=18" + } + }, "node_modules/@rollup/rollup-linux-x64-gnu": { "version": "4.52.5", "cpu": [ @@ -5313,9 +5329,6 @@ "dev": true, "license": "MIT" }, - "node_modules/@types/node": { - "dev": true - }, "node_modules/@types/node-forge": { "version": "1.3.14", "dev": true, @@ -8233,6 +8246,20 @@ "dev": true, "license": "ISC" }, + "node_modules/fsevents": { + "version": "2.3.2", + "resolved": 
"https://registry.npmjs.org/fsevents/-/fsevents-2.3.2.tgz", + "integrity": "sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA==", + "dev": true, + "hasInstallScript": true, + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": "^8.16.0 || ^10.6.0 || >=11.0.0" + } + }, "node_modules/function-bind": { "version": "1.1.2", "dev": true, @@ -10928,6 +10955,36 @@ "node": "^12.20.0 || ^14.13.1 || >=16.0.0" } }, + "node_modules/playwright": { + "version": "1.56.1", + "resolved": "https://registry.npmjs.org/playwright/-/playwright-1.56.1.tgz", + "integrity": "sha512-aFi5B0WovBHTEvpM3DzXTUaeN6eN0qWnTkKx4NQaH4Wvcmc153PdaY2UBdSYKaGYw+UyWXSVyxDUg5DoPEttjw==", + "dev": true, + "dependencies": { + "playwright-core": "1.56.1" + }, + "bin": { + "playwright": "cli.js" + }, + "engines": { + "node": ">=18" + }, + "optionalDependencies": { + "fsevents": "2.3.2" + } + }, + "node_modules/playwright-core": { + "version": "1.56.1", + "resolved": "https://registry.npmjs.org/playwright-core/-/playwright-core-1.56.1.tgz", + "integrity": "sha512-hutraynyn31F+Bifme+Ps9Vq59hKuUCz7H1kDOcBs+2oGguKkWTU50bBWrtz34OUWmIwpBTWDxaRPXrIXkgvmQ==", + "dev": true, + "bin": { + "playwright-core": "cli.js" + }, + "engines": { + "node": ">=18" + } + }, "node_modules/postcss": { "version": "8.4.35", "resolved": "https://registry.npmjs.org/postcss/-/postcss-8.4.35.tgz", diff --git a/src/StellaOps.Web/package.json b/src/StellaOps.Web/package.json index 3f27afca..2bea2a78 100644 --- a/src/StellaOps.Web/package.json +++ b/src/StellaOps.Web/package.json @@ -9,6 +9,8 @@ "test": "npm run verify:chromium && ng test --watch=false", "test:watch": "ng test --watch", "test:ci": "npm run test", + "test:e2e": "playwright test", + "serve:test": "ng serve --configuration development --port 4400 --host 127.0.0.1", "verify:chromium": "node ./scripts/verify-chromium.js", "ci:install": "npm ci --prefer-offline --no-audit --no-fund" }, @@ -33,7 +35,8 @@ "devDependencies": { 
"@angular-devkit/build-angular": "^17.3.17", "@angular/cli": "^17.3.17", - "@angular/compiler-cli": "^17.3.0", + "@angular/compiler-cli": "^17.3.0", + "@playwright/test": "^1.47.2", "@types/jasmine": "~5.1.0", "jasmine-core": "~5.1.0", "karma": "~6.4.0", diff --git a/src/StellaOps.Web/playwright.config.ts b/src/StellaOps.Web/playwright.config.ts new file mode 100644 index 00000000..0169231f --- /dev/null +++ b/src/StellaOps.Web/playwright.config.ts @@ -0,0 +1,22 @@ +import { defineConfig } from '@playwright/test'; + +const port = process.env.PLAYWRIGHT_PORT + ? Number.parseInt(process.env.PLAYWRIGHT_PORT, 10) + : 4400; + +export default defineConfig({ + testDir: 'tests/e2e', + timeout: 30_000, + retries: process.env.CI ? 1 : 0, + use: { + baseURL: process.env.PLAYWRIGHT_BASE_URL ?? `http://127.0.0.1:${port}`, + trace: 'retain-on-failure', + }, + webServer: { + command: 'npm run serve:test', + reuseExistingServer: !process.env.CI, + url: `http://127.0.0.1:${port}`, + stdout: 'ignore', + stderr: 'ignore', + }, +}); diff --git a/src/StellaOps.Web/src/app/app.component.html b/src/StellaOps.Web/src/app/app.component.html index 96b5f9d4..8e67e1dc 100644 --- a/src/StellaOps.Web/src/app/app.component.html +++ b/src/StellaOps.Web/src/app/app.component.html @@ -5,7 +5,19 @@ Trivy DB Export + + Scan Detail + +
+ + {{ displayName() }} + + + + + +
diff --git a/src/StellaOps.Web/src/app/app.component.scss b/src/StellaOps.Web/src/app/app.component.scss index c964b5fd..044b7887 100644 --- a/src/StellaOps.Web/src/app/app.component.scss +++ b/src/StellaOps.Web/src/app/app.component.scss @@ -50,6 +50,36 @@ } } +.app-auth { + display: flex; + align-items: center; + gap: 0.75rem; + + .app-user { + font-size: 0.9rem; + font-weight: 500; + } + + button { + appearance: none; + border: none; + border-radius: 9999px; + padding: 0.35rem 0.9rem; + font-size: 0.85rem; + font-weight: 500; + cursor: pointer; + color: #0f172a; + background-color: rgba(248, 250, 252, 0.9); + transition: transform 0.2s ease, background-color 0.2s ease; + + &:hover, + &:focus-visible { + background-color: #facc15; + transform: translateY(-1px); + } + } +} + .app-content { flex: 1; padding: 2rem 1.5rem; diff --git a/src/StellaOps.Web/src/app/app.component.spec.ts b/src/StellaOps.Web/src/app/app.component.spec.ts index 2fa3536f..0f063363 100644 --- a/src/StellaOps.Web/src/app/app.component.spec.ts +++ b/src/StellaOps.Web/src/app/app.component.spec.ts @@ -1,11 +1,22 @@ import { TestBed } from '@angular/core/testing'; import { RouterTestingModule } from '@angular/router/testing'; import { AppComponent } from './app.component'; +import { AuthorityAuthService } from './core/auth/authority-auth.service'; +import { AuthSessionStore } from './core/auth/auth-session.store'; + +class AuthorityAuthServiceStub { + beginLogin = jasmine.createSpy('beginLogin'); + logout = jasmine.createSpy('logout'); +} describe('AppComponent', () => { beforeEach(async () => { await TestBed.configureTestingModule({ imports: [AppComponent, RouterTestingModule], + providers: [ + AuthSessionStore, + { provide: AuthorityAuthService, useClass: AuthorityAuthServiceStub }, + ], }).compileComponents(); }); diff --git a/src/StellaOps.Web/src/app/app.component.ts b/src/StellaOps.Web/src/app/app.component.ts index c2e71b02..a01ba51e 100644 --- a/src/StellaOps.Web/src/app/app.component.ts 
+++ b/src/StellaOps.Web/src/app/app.component.ts @@ -1,11 +1,51 @@ -import { Component } from '@angular/core'; -import { RouterLink, RouterLinkActive, RouterOutlet } from '@angular/router'; +import { CommonModule } from '@angular/common'; +import { + ChangeDetectionStrategy, + Component, + computed, + inject, +} from '@angular/core'; +import { Router, RouterLink, RouterLinkActive, RouterOutlet } from '@angular/router'; + +import { AuthorityAuthService } from './core/auth/authority-auth.service'; +import { AuthSessionStore } from './core/auth/auth-session.store'; @Component({ selector: 'app-root', standalone: true, - imports: [RouterOutlet, RouterLink, RouterLinkActive], + imports: [CommonModule, RouterOutlet, RouterLink, RouterLinkActive], templateUrl: './app.component.html', - styleUrl: './app.component.scss' + styleUrl: './app.component.scss', + changeDetection: ChangeDetectionStrategy.OnPush, }) -export class AppComponent {} +export class AppComponent { + private readonly router = inject(Router); + private readonly auth = inject(AuthorityAuthService); + private readonly sessionStore = inject(AuthSessionStore); + + readonly status = this.sessionStore.status; + readonly identity = this.sessionStore.identity; + readonly subjectHint = this.sessionStore.subjectHint; + readonly isAuthenticated = this.sessionStore.isAuthenticated; + + readonly displayName = computed(() => { + const identity = this.identity(); + if (identity?.name) { + return identity.name; + } + if (identity?.email) { + return identity.email; + } + const hint = this.subjectHint(); + return hint ?? 'anonymous'; + }); + + onSignIn(): void { + const returnUrl = this.router.url === '/' ? 
undefined : this.router.url; + void this.auth.beginLogin(returnUrl); + } + + onSignOut(): void { + void this.auth.logout(); + } +} diff --git a/src/StellaOps.Web/src/app/app.config.ts b/src/StellaOps.Web/src/app/app.config.ts index 053df09d..495db8b9 100644 --- a/src/StellaOps.Web/src/app/app.config.ts +++ b/src/StellaOps.Web/src/app/app.config.ts @@ -1,14 +1,28 @@ -import { provideHttpClient } from '@angular/common/http'; -import { ApplicationConfig } from '@angular/core'; +import { HTTP_INTERCEPTORS, provideHttpClient, withInterceptorsFromDi } from '@angular/common/http'; +import { APP_INITIALIZER, ApplicationConfig } from '@angular/core'; import { provideRouter } from '@angular/router'; import { routes } from './app.routes'; import { CONCELIER_EXPORTER_API_BASE_URL } from './core/api/concelier-exporter.client'; +import { AppConfigService } from './core/config/app-config.service'; +import { AuthHttpInterceptor } from './core/auth/auth-http.interceptor'; export const appConfig: ApplicationConfig = { providers: [ provideRouter(routes), - provideHttpClient(), + provideHttpClient(withInterceptorsFromDi()), + { + provide: APP_INITIALIZER, + multi: true, + useFactory: (configService: AppConfigService) => () => + configService.load(), + deps: [AppConfigService], + }, + { + provide: HTTP_INTERCEPTORS, + useClass: AuthHttpInterceptor, + multi: true, + }, { provide: CONCELIER_EXPORTER_API_BASE_URL, useValue: '/api/v1/concelier/exporters/trivy-db', diff --git a/src/StellaOps.Web/src/app/app.routes.ts b/src/StellaOps.Web/src/app/app.routes.ts index 6af3b45c..1bc2570c 100644 --- a/src/StellaOps.Web/src/app/app.routes.ts +++ b/src/StellaOps.Web/src/app/app.routes.ts @@ -8,6 +8,20 @@ export const routes: Routes = [ (m) => m.TrivyDbSettingsPageComponent ), }, + { + path: 'scans/:scanId', + loadComponent: () => + import('./features/scans/scan-detail-page.component').then( + (m) => m.ScanDetailPageComponent + ), + }, + { + path: 'auth/callback', + loadComponent: () => + 
import('./features/auth/auth-callback.component').then( + (m) => m.AuthCallbackComponent + ), + }, { path: '', pathMatch: 'full', diff --git a/src/StellaOps.Web/src/app/core/api/scanner.models.ts b/src/StellaOps.Web/src/app/core/api/scanner.models.ts new file mode 100644 index 00000000..b905a4ff --- /dev/null +++ b/src/StellaOps.Web/src/app/core/api/scanner.models.ts @@ -0,0 +1,17 @@ +export type ScanAttestationStatusKind = 'verified' | 'pending' | 'failed'; + +export interface ScanAttestationStatus { + readonly uuid: string; + readonly status: ScanAttestationStatusKind; + readonly index?: number; + readonly logUrl?: string; + readonly checkedAt?: string; + readonly statusMessage?: string; +} + +export interface ScanDetail { + readonly scanId: string; + readonly imageDigest: string; + readonly completedAt: string; + readonly attestation?: ScanAttestationStatus; +} diff --git a/src/StellaOps.Web/src/app/core/auth/auth-http.interceptor.ts b/src/StellaOps.Web/src/app/core/auth/auth-http.interceptor.ts new file mode 100644 index 00000000..66ceecff --- /dev/null +++ b/src/StellaOps.Web/src/app/core/auth/auth-http.interceptor.ts @@ -0,0 +1,171 @@ +import { + HttpErrorResponse, + HttpEvent, + HttpHandler, + HttpInterceptor, + HttpRequest, +} from '@angular/common/http'; +import { Injectable } from '@angular/core'; +import { Observable, firstValueFrom, from, throwError } from 'rxjs'; +import { catchError, switchMap } from 'rxjs/operators'; + +import { AppConfigService } from '../config/app-config.service'; +import { DpopService } from './dpop/dpop.service'; +import { AuthorityAuthService } from './authority-auth.service'; + +const RETRY_HEADER = 'X-StellaOps-DPoP-Retry'; + +@Injectable() +export class AuthHttpInterceptor implements HttpInterceptor { + private excludedOrigins: Set | null = null; + private tokenEndpoint: string | null = null; + private authorityResolved = false; + + constructor( + private readonly auth: AuthorityAuthService, + private readonly config: 
AppConfigService, + private readonly dpop: DpopService + ) { + // lazy resolve authority configuration in intercept to allow APP_INITIALIZER to run first + } + + intercept( + request: HttpRequest, + next: HttpHandler + ): Observable> { + this.ensureAuthorityInfo(); + + if (request.headers.has('Authorization') || this.shouldSkip(request.url)) { + return next.handle(request); + } + + return from( + this.auth.getAuthHeadersForRequest( + this.resolveAbsoluteUrl(request.url), + request.method + ) + ).pipe( + switchMap((headers) => { + if (!headers) { + return next.handle(request); + } + const authorizedRequest = request.clone({ + setHeaders: { + Authorization: headers.authorization, + DPoP: headers.dpop, + }, + headers: request.headers.set(RETRY_HEADER, '0'), + }); + return next.handle(authorizedRequest); + }), + catchError((error: HttpErrorResponse) => + this.handleError(request, error, next) + ) + ); + } + + private handleError( + request: HttpRequest, + error: HttpErrorResponse, + next: HttpHandler + ): Observable> { + if (error.status !== 401) { + return throwError(() => error); + } + + const nonce = error.headers?.get('DPoP-Nonce'); + if (!nonce) { + return throwError(() => error); + } + + if (request.headers.get(RETRY_HEADER) === '1') { + return throwError(() => error); + } + + return from(this.retryWithNonce(request, nonce, next)).pipe( + catchError(() => throwError(() => error)) + ); + } + + private async retryWithNonce( + request: HttpRequest, + nonce: string, + next: HttpHandler + ): Promise> { + await this.dpop.setNonce(nonce); + const headers = await this.auth.getAuthHeadersForRequest( + this.resolveAbsoluteUrl(request.url), + request.method + ); + if (!headers) { + throw new Error('Unable to refresh authorization headers after nonce.'); + } + + const retried = request.clone({ + setHeaders: { + Authorization: headers.authorization, + DPoP: headers.dpop, + }, + headers: request.headers.set(RETRY_HEADER, '1'), + }); + + return 
firstValueFrom(next.handle(retried)); + } + + private shouldSkip(url: string): boolean { + this.ensureAuthorityInfo(); + const absolute = this.resolveAbsoluteUrl(url); + if (!absolute) { + return false; + } + + try { + const resolved = new URL(absolute); + if (resolved.pathname.endsWith('/config.json')) { + return true; + } + if (this.tokenEndpoint && absolute.startsWith(this.tokenEndpoint)) { + return true; + } + const origin = resolved.origin; + return this.excludedOrigins?.has(origin) ?? false; + } catch { + return false; + } + } + + private resolveAbsoluteUrl(url: string): string { + try { + if (url.startsWith('http://') || url.startsWith('https://')) { + return url; + } + const base = + typeof window !== 'undefined' && window.location + ? window.location.origin + : undefined; + return base ? new URL(url, base).toString() : url; + } catch { + return url; + } + } + + private ensureAuthorityInfo(): void { + if (this.authorityResolved) { + return; + } + try { + const authority = this.config.authority; + this.tokenEndpoint = new URL( + authority.tokenEndpoint, + authority.issuer + ).toString(); + this.excludedOrigins = new Set([ + this.tokenEndpoint, + new URL(authority.authorizeEndpoint, authority.issuer).origin, + ]); + this.authorityResolved = true; + } catch { + // Configuration not yet loaded; interceptor will retry on the next request. 
+ } + } +} diff --git a/src/StellaOps.Web/src/app/core/auth/auth-session.model.ts b/src/StellaOps.Web/src/app/core/auth/auth-session.model.ts new file mode 100644 index 00000000..47363689 --- /dev/null +++ b/src/StellaOps.Web/src/app/core/auth/auth-session.model.ts @@ -0,0 +1,49 @@ +export interface AuthTokens { + readonly accessToken: string; + readonly expiresAtEpochMs: number; + readonly refreshToken?: string; + readonly tokenType: 'Bearer'; + readonly scope: string; +} + +export interface AuthIdentity { + readonly subject: string; + readonly name?: string; + readonly email?: string; + readonly roles: readonly string[]; + readonly idToken?: string; +} + +export interface AuthSession { + readonly tokens: AuthTokens; + readonly identity: AuthIdentity; + /** + * SHA-256 JWK thumbprint of the active DPoP key pair. + */ + readonly dpopKeyThumbprint: string; + readonly issuedAtEpochMs: number; +} + +export interface PersistedSessionMetadata { + readonly subject: string; + readonly expiresAtEpochMs: number; + readonly issuedAtEpochMs: number; + readonly dpopKeyThumbprint: string; +} + +export type AuthStatus = + | 'unauthenticated' + | 'authenticated' + | 'refreshing' + | 'loading'; + +export const ACCESS_TOKEN_REFRESH_THRESHOLD_MS = 60_000; + +export const SESSION_STORAGE_KEY = 'stellaops.auth.session.info'; + +export type AuthErrorReason = + | 'invalid_state' + | 'token_exchange_failed' + | 'refresh_failed' + | 'dpop_generation_failed' + | 'configuration_missing'; diff --git a/src/StellaOps.Web/src/app/core/auth/auth-session.store.spec.ts b/src/StellaOps.Web/src/app/core/auth/auth-session.store.spec.ts new file mode 100644 index 00000000..2070db25 --- /dev/null +++ b/src/StellaOps.Web/src/app/core/auth/auth-session.store.spec.ts @@ -0,0 +1,48 @@ +import { TestBed } from '@angular/core/testing'; + +import { AuthSession, AuthTokens, SESSION_STORAGE_KEY } from './auth-session.model'; +import { AuthSessionStore } from './auth-session.store'; + 
+describe('AuthSessionStore', () => { + let store: AuthSessionStore; + + beforeEach(() => { + sessionStorage.clear(); + TestBed.configureTestingModule({ + providers: [AuthSessionStore], + }); + store = TestBed.inject(AuthSessionStore); + }); + + it('persists minimal metadata when session is set', () => { + const tokens: AuthTokens = { + accessToken: 'token-abc', + expiresAtEpochMs: Date.now() + 120_000, + refreshToken: 'refresh-xyz', + scope: 'openid ui.read', + tokenType: 'Bearer', + }; + + const session: AuthSession = { + tokens, + identity: { + subject: 'user-123', + name: 'Alex Operator', + roles: ['ui.read'], + }, + dpopKeyThumbprint: 'thumbprint-1', + issuedAtEpochMs: Date.now(), + }; + + store.setSession(session); + + const persisted = sessionStorage.getItem(SESSION_STORAGE_KEY); + expect(persisted).toBeTruthy(); + const parsed = JSON.parse(persisted ?? '{}'); + expect(parsed.subject).toBe('user-123'); + expect(parsed.dpopKeyThumbprint).toBe('thumbprint-1'); + + store.clear(); + expect(sessionStorage.getItem(SESSION_STORAGE_KEY)).toBeNull(); + }); +}); diff --git a/src/StellaOps.Web/src/app/core/auth/auth-session.store.ts b/src/StellaOps.Web/src/app/core/auth/auth-session.store.ts new file mode 100644 index 00000000..43fc7292 --- /dev/null +++ b/src/StellaOps.Web/src/app/core/auth/auth-session.store.ts @@ -0,0 +1,107 @@ +import { Injectable, computed, signal } from '@angular/core'; + +import { + AuthSession, + AuthStatus, + PersistedSessionMetadata, + SESSION_STORAGE_KEY, +} from './auth-session.model'; + +@Injectable({ + providedIn: 'root', +}) +export class AuthSessionStore { + private readonly sessionSignal = signal(null); + private readonly statusSignal = signal('unauthenticated'); + private readonly persistedSignal = + signal(this.readPersistedMetadata()); + + readonly session = computed(() => this.sessionSignal()); + readonly status = computed(() => this.statusSignal()); + + readonly identity = computed(() => this.sessionSignal()?.identity ?? 
null); + readonly subjectHint = computed( + () => + this.sessionSignal()?.identity.subject ?? + this.persistedSignal()?.subject ?? + null + ); + + readonly expiresAtEpochMs = computed( + () => this.sessionSignal()?.tokens.expiresAtEpochMs ?? null + ); + + readonly isAuthenticated = computed( + () => this.sessionSignal() !== null && this.statusSignal() !== 'loading' + ); + + setStatus(status: AuthStatus): void { + this.statusSignal.set(status); + } + + setSession(session: AuthSession | null): void { + this.sessionSignal.set(session); + if (!session) { + this.statusSignal.set('unauthenticated'); + this.persistedSignal.set(null); + this.clearPersistedMetadata(); + return; + } + + this.statusSignal.set('authenticated'); + const metadata: PersistedSessionMetadata = { + subject: session.identity.subject, + expiresAtEpochMs: session.tokens.expiresAtEpochMs, + issuedAtEpochMs: session.issuedAtEpochMs, + dpopKeyThumbprint: session.dpopKeyThumbprint, + }; + this.persistedSignal.set(metadata); + this.persistMetadata(metadata); + } + + clear(): void { + this.sessionSignal.set(null); + this.statusSignal.set('unauthenticated'); + this.persistedSignal.set(null); + this.clearPersistedMetadata(); + } + + private readPersistedMetadata(): PersistedSessionMetadata | null { + if (typeof sessionStorage === 'undefined') { + return null; + } + + try { + const raw = sessionStorage.getItem(SESSION_STORAGE_KEY); + if (!raw) { + return null; + } + const parsed = JSON.parse(raw) as PersistedSessionMetadata; + if ( + typeof parsed.subject !== 'string' || + typeof parsed.expiresAtEpochMs !== 'number' || + typeof parsed.issuedAtEpochMs !== 'number' || + typeof parsed.dpopKeyThumbprint !== 'string' + ) { + return null; + } + return parsed; + } catch { + return null; + } + } + + private persistMetadata(metadata: PersistedSessionMetadata): void { + if (typeof sessionStorage === 'undefined') { + return; + } + sessionStorage.setItem(SESSION_STORAGE_KEY, JSON.stringify(metadata)); + } + + private 
clearPersistedMetadata(): void { + if (typeof sessionStorage === 'undefined') { + return; + } + sessionStorage.removeItem(SESSION_STORAGE_KEY); + } +} diff --git a/src/StellaOps.Web/src/app/core/auth/auth-storage.service.ts b/src/StellaOps.Web/src/app/core/auth/auth-storage.service.ts new file mode 100644 index 00000000..7017dc16 --- /dev/null +++ b/src/StellaOps.Web/src/app/core/auth/auth-storage.service.ts @@ -0,0 +1,45 @@ +import { Injectable } from '@angular/core'; + +const LOGIN_REQUEST_KEY = 'stellaops.auth.login.request'; + +export interface PendingLoginRequest { + readonly state: string; + readonly codeVerifier: string; + readonly createdAtEpochMs: number; + readonly returnUrl?: string; + readonly nonce?: string; +} + +@Injectable({ + providedIn: 'root', +}) +export class AuthStorageService { + savePendingLogin(request: PendingLoginRequest): void { + if (typeof sessionStorage === 'undefined') { + return; + } + sessionStorage.setItem(LOGIN_REQUEST_KEY, JSON.stringify(request)); + } + + consumePendingLogin(expectedState: string): PendingLoginRequest | null { + if (typeof sessionStorage === 'undefined') { + return null; + } + + const raw = sessionStorage.getItem(LOGIN_REQUEST_KEY); + if (!raw) { + return null; + } + + sessionStorage.removeItem(LOGIN_REQUEST_KEY); + try { + const request = JSON.parse(raw) as PendingLoginRequest; + if (request.state !== expectedState) { + return null; + } + return request; + } catch { + return null; + } + } +} diff --git a/src/StellaOps.Web/src/app/core/auth/authority-auth.service.ts b/src/StellaOps.Web/src/app/core/auth/authority-auth.service.ts new file mode 100644 index 00000000..410a187a --- /dev/null +++ b/src/StellaOps.Web/src/app/core/auth/authority-auth.service.ts @@ -0,0 +1,430 @@ +import { HttpClient, HttpHeaders, HttpResponse } from '@angular/common/http'; +import { Injectable } from '@angular/core'; +import { firstValueFrom } from 'rxjs'; + +import { AppConfigService } from '../config/app-config.service'; +import { 
AuthorityConfig } from '../config/app-config.model'; +import { + ACCESS_TOKEN_REFRESH_THRESHOLD_MS, + AuthErrorReason, + AuthSession, + AuthTokens, +} from './auth-session.model'; +import { AuthSessionStore } from './auth-session.store'; +import { + AuthStorageService, + PendingLoginRequest, +} from './auth-storage.service'; +import { DpopService } from './dpop/dpop.service'; +import { base64UrlDecode } from './dpop/jose-utilities'; +import { createPkcePair } from './pkce.util'; + +interface TokenResponse { + readonly access_token: string; + readonly token_type: string; + readonly expires_in: number; + readonly scope?: string; + readonly refresh_token?: string; + readonly id_token?: string; +} + +interface RefreshTokenResponse extends TokenResponse {} + +export interface AuthorizationHeaders { + readonly authorization: string; + readonly dpop: string; +} + +export interface CompleteLoginResult { + readonly returnUrl?: string; +} + +const TOKEN_CONTENT_TYPE = 'application/x-www-form-urlencoded'; + +@Injectable({ + providedIn: 'root', +}) +export class AuthorityAuthService { + private refreshTimer: ReturnType | null = null; + private refreshInFlight: Promise | null = null; + private lastError: AuthErrorReason | null = null; + + constructor( + private readonly http: HttpClient, + private readonly config: AppConfigService, + private readonly sessionStore: AuthSessionStore, + private readonly storage: AuthStorageService, + private readonly dpop: DpopService + ) {} + + get error(): AuthErrorReason | null { + return this.lastError; + } + + async beginLogin(returnUrl?: string): Promise { + const authority = this.config.authority; + const pkce = await createPkcePair(); + const state = crypto.randomUUID ? crypto.randomUUID() : createRandomId(); + const nonce = crypto.randomUUID ? crypto.randomUUID() : createRandomId(); + + // Generate the DPoP key pair up-front so the same key is bound to the token. 
+ await this.dpop.getThumbprint(); + + const authorizeUrl = this.buildAuthorizeUrl(authority, { + state, + nonce, + codeChallenge: pkce.challenge, + codeChallengeMethod: pkce.method, + returnUrl, + }); + + const now = Date.now(); + this.storage.savePendingLogin({ + state, + codeVerifier: pkce.verifier, + createdAtEpochMs: now, + returnUrl, + nonce, + }); + + window.location.assign(authorizeUrl); + } + + /** + * Completes the authorization code flow after the Authority redirects back with ?code & ?state. + */ + async completeLoginFromRedirect( + queryParams: URLSearchParams + ): Promise { + const code = queryParams.get('code'); + const state = queryParams.get('state'); + if (!code || !state) { + throw new Error('Missing authorization code or state.'); + } + + const pending = this.storage.consumePendingLogin(state); + if (!pending) { + this.lastError = 'invalid_state'; + throw new Error('State parameter did not match pending login request.'); + } + + try { + const tokenResponse = await this.exchangeCodeForTokens( + code, + pending.codeVerifier + ); + await this.onTokenResponse(tokenResponse, pending.nonce ?? null); + this.lastError = null; + return { returnUrl: pending.returnUrl }; + } catch (error) { + this.lastError = 'token_exchange_failed'; + this.sessionStore.clear(); + throw error; + } + } + + async ensureValidAccessToken(): Promise { + const session = this.sessionStore.session(); + if (!session) { + return null; + } + + const now = Date.now(); + if (now < session.tokens.expiresAtEpochMs - ACCESS_TOKEN_REFRESH_THRESHOLD_MS) { + return session.tokens.accessToken; + } + + await this.refreshAccessToken(); + const refreshed = this.sessionStore.session(); + return refreshed?.tokens.accessToken ?? 
null; + } + + async getAuthHeadersForRequest( + url: string, + method: string + ): Promise { + const accessToken = await this.ensureValidAccessToken(); + if (!accessToken) { + return null; + } + const dpopProof = await this.dpop.createProof({ + htm: method, + htu: url, + accessToken, + }); + return { + authorization: `DPoP ${accessToken}`, + dpop: dpopProof, + }; + } + + async refreshAccessToken(): Promise { + const session = this.sessionStore.session(); + const refreshToken = session?.tokens.refreshToken; + if (!refreshToken) { + return; + } + + if (this.refreshInFlight) { + await this.refreshInFlight; + return; + } + + this.refreshInFlight = this.executeRefresh(refreshToken) + .catch((error) => { + this.lastError = 'refresh_failed'; + this.sessionStore.clear(); + throw error; + }) + .finally(() => { + this.refreshInFlight = null; + }); + + await this.refreshInFlight; + } + + async logout(): Promise { + const session = this.sessionStore.session(); + this.cancelRefreshTimer(); + this.sessionStore.clear(); + await this.dpop.setNonce(null); + + const authority = this.config.authority; + if (!authority.logoutEndpoint) { + return; + } + + if (session?.identity.idToken) { + const url = new URL(authority.logoutEndpoint, authority.issuer); + url.searchParams.set('post_logout_redirect_uri', authority.postLogoutRedirectUri ?? authority.redirectUri); + url.searchParams.set('id_token_hint', session.identity.idToken); + window.location.assign(url.toString()); + } else { + window.location.assign(authority.postLogoutRedirectUri ?? 
authority.redirectUri); + } + } + + private async exchangeCodeForTokens( + code: string, + codeVerifier: string + ): Promise> { + const authority = this.config.authority; + const tokenUrl = new URL(authority.tokenEndpoint, authority.issuer).toString(); + + const body = new URLSearchParams(); + body.set('grant_type', 'authorization_code'); + body.set('code', code); + body.set('redirect_uri', authority.redirectUri); + body.set('client_id', authority.clientId); + body.set('code_verifier', codeVerifier); + if (authority.audience) { + body.set('audience', authority.audience); + } + + const dpopProof = await this.dpop.createProof({ + htm: 'POST', + htu: tokenUrl, + }); + + const headers = new HttpHeaders({ + 'Content-Type': TOKEN_CONTENT_TYPE, + DPoP: dpopProof, + }); + + return firstValueFrom( + this.http.post(tokenUrl, body.toString(), { + headers, + withCredentials: true, + observe: 'response', + }) + ); + } + + private async executeRefresh(refreshToken: string): Promise { + const authority = this.config.authority; + const tokenUrl = new URL(authority.tokenEndpoint, authority.issuer).toString(); + const body = new URLSearchParams(); + body.set('grant_type', 'refresh_token'); + body.set('refresh_token', refreshToken); + body.set('client_id', authority.clientId); + if (authority.audience) { + body.set('audience', authority.audience); + } + + const proof = await this.dpop.createProof({ + htm: 'POST', + htu: tokenUrl, + }); + + const headers = new HttpHeaders({ + 'Content-Type': TOKEN_CONTENT_TYPE, + DPoP: proof, + }); + + const response = await firstValueFrom( + this.http.post(tokenUrl, body.toString(), { + headers, + withCredentials: true, + observe: 'response', + }) + ); + + await this.onTokenResponse(response, null); + } + + private async onTokenResponse( + response: HttpResponse, + expectedNonce: string | null + ): Promise { + const nonce = response.headers.get('DPoP-Nonce'); + if (nonce) { + await this.dpop.setNonce(nonce); + } + + const payload = response.body; + 
if (!payload) { + throw new Error('Token response did not include a body.'); + } + + const tokens = this.toAuthTokens(payload); + const identity = this.parseIdentity(payload.id_token ?? '', expectedNonce); + const thumbprint = await this.dpop.getThumbprint(); + if (!thumbprint) { + throw new Error('DPoP thumbprint unavailable.'); + } + + const session: AuthSession = { + tokens, + identity, + dpopKeyThumbprint: thumbprint, + issuedAtEpochMs: Date.now(), + }; + this.sessionStore.setSession(session); + this.scheduleRefresh(tokens, this.config.authority); + } + + private toAuthTokens(payload: TokenResponse): AuthTokens { + const expiresAtEpochMs = Date.now() + payload.expires_in * 1000; + return { + accessToken: payload.access_token, + tokenType: (payload.token_type ?? 'Bearer') as 'Bearer', + refreshToken: payload.refresh_token, + scope: payload.scope ?? '', + expiresAtEpochMs, + }; + } + + private parseIdentity( + idToken: string, + expectedNonce: string | null + ): AuthSession['identity'] { + if (!idToken) { + return { + subject: 'unknown', + roles: [], + }; + } + + const claims = decodeJwt(idToken); + const nonceClaim = claims['nonce']; + if ( + expectedNonce && + typeof nonceClaim === 'string' && + nonceClaim !== expectedNonce + ) { + throw new Error('OIDC nonce mismatch.'); + } + + const subjectClaim = claims['sub']; + const nameClaim = claims['name']; + const emailClaim = claims['email']; + const rolesClaim = claims['role']; + + return { + subject: typeof subjectClaim === 'string' ? subjectClaim : 'unknown', + name: typeof nameClaim === 'string' ? nameClaim : undefined, + email: typeof emailClaim === 'string' ? emailClaim : undefined, + roles: Array.isArray(rolesClaim) + ? rolesClaim.filter((entry: unknown): entry is string => + typeof entry === 'string' + ) + : [], + idToken, + }; + } + + private scheduleRefresh(tokens: AuthTokens, authority: AuthorityConfig): void { + this.cancelRefreshTimer(); + const leeway = + (authority.refreshLeewaySeconds ?? 
60) * 1000 + + ACCESS_TOKEN_REFRESH_THRESHOLD_MS; + const now = Date.now(); + const ttl = Math.max(tokens.expiresAtEpochMs - now - leeway, 5_000); + this.refreshTimer = setTimeout(() => { + void this.refreshAccessToken(); + }, ttl); + } + + private cancelRefreshTimer(): void { + if (this.refreshTimer) { + clearTimeout(this.refreshTimer); + this.refreshTimer = null; + } + } + + private buildAuthorizeUrl( + authority: AuthorityConfig, + options: { + state: string; + nonce: string; + codeChallenge: string; + codeChallengeMethod: 'S256'; + returnUrl?: string; + } + ): string { + const authorizeUrl = new URL( + authority.authorizeEndpoint, + authority.issuer + ); + authorizeUrl.searchParams.set('response_type', 'code'); + authorizeUrl.searchParams.set('client_id', authority.clientId); + authorizeUrl.searchParams.set('redirect_uri', authority.redirectUri); + authorizeUrl.searchParams.set('scope', authority.scope); + authorizeUrl.searchParams.set('state', options.state); + authorizeUrl.searchParams.set('nonce', options.nonce); + authorizeUrl.searchParams.set('code_challenge', options.codeChallenge); + authorizeUrl.searchParams.set( + 'code_challenge_method', + options.codeChallengeMethod + ); + if (authority.audience) { + authorizeUrl.searchParams.set('audience', authority.audience); + } + if (options.returnUrl) { + authorizeUrl.searchParams.set('ui_return', options.returnUrl); + } + return authorizeUrl.toString(); + } +} + +function decodeJwt(token: string): Record { + const parts = token.split('.'); + if (parts.length < 2) { + return {}; + } + const payload = base64UrlDecode(parts[1]); + const json = new TextDecoder().decode(payload); + try { + return JSON.parse(json) as Record; + } catch { + return {}; + } +} + +function createRandomId(): string { + const array = new Uint8Array(16); + crypto.getRandomValues(array); + return Array.from(array, (value) => + value.toString(16).padStart(2, '0') + ).join(''); +} diff --git 
a/src/StellaOps.Web/src/app/core/auth/dpop/dpop-key-store.ts b/src/StellaOps.Web/src/app/core/auth/dpop/dpop-key-store.ts new file mode 100644 index 00000000..f9231aa3 --- /dev/null +++ b/src/StellaOps.Web/src/app/core/auth/dpop/dpop-key-store.ts @@ -0,0 +1,181 @@ +import { Injectable } from '@angular/core'; + +import { DPoPAlgorithm } from '../../config/app-config.model'; +import { computeJwkThumbprint } from './jose-utilities'; + +const DB_NAME = 'stellaops-auth'; +const STORE_NAME = 'dpopKeys'; +const PRIMARY_KEY = 'primary'; +const DB_VERSION = 1; + +interface PersistedKeyPair { + readonly id: string; + readonly algorithm: DPoPAlgorithm; + readonly publicJwk: JsonWebKey; + readonly privateJwk: JsonWebKey; + readonly thumbprint: string; + readonly createdAtIso: string; +} + +export interface LoadedDpopKeyPair { + readonly algorithm: DPoPAlgorithm; + readonly privateKey: CryptoKey; + readonly publicKey: CryptoKey; + readonly publicJwk: JsonWebKey; + readonly thumbprint: string; +} + +@Injectable({ + providedIn: 'root', +}) +export class DpopKeyStore { + private dbPromise: Promise | null = null; + + async load(): Promise { + const record = await this.read(); + if (!record) { + return null; + } + + const [privateKey, publicKey] = await Promise.all([ + crypto.subtle.importKey( + 'jwk', + record.privateJwk, + this.toKeyAlgorithm(record.algorithm), + true, + ['sign'] + ), + crypto.subtle.importKey( + 'jwk', + record.publicJwk, + this.toKeyAlgorithm(record.algorithm), + true, + ['verify'] + ), + ]); + + return { + algorithm: record.algorithm, + privateKey, + publicKey, + publicJwk: record.publicJwk, + thumbprint: record.thumbprint, + }; + } + + async save( + keyPair: CryptoKeyPair, + algorithm: DPoPAlgorithm + ): Promise { + const [publicJwk, privateJwk] = await Promise.all([ + crypto.subtle.exportKey('jwk', keyPair.publicKey), + crypto.subtle.exportKey('jwk', keyPair.privateKey), + ]); + + if (!publicJwk) { + throw new Error('Failed to export public JWK for DPoP key 
pair.'); + } + + const thumbprint = await computeJwkThumbprint(publicJwk); + const record: PersistedKeyPair = { + id: PRIMARY_KEY, + algorithm, + publicJwk, + privateJwk, + thumbprint, + createdAtIso: new Date().toISOString(), + }; + + await this.write(record); + + return { + algorithm, + privateKey: keyPair.privateKey, + publicKey: keyPair.publicKey, + publicJwk, + thumbprint, + }; + } + + async clear(): Promise { + const db = await this.openDb(); + await transactionPromise(db, STORE_NAME, 'readwrite', (store) => + store.delete(PRIMARY_KEY) + ); + } + + async generate(algorithm: DPoPAlgorithm): Promise { + const algo = this.toKeyAlgorithm(algorithm); + const keyPair = await crypto.subtle.generateKey(algo, true, [ + 'sign', + 'verify', + ]); + + const stored = await this.save(keyPair, algorithm); + return stored; + } + + private async read(): Promise { + const db = await this.openDb(); + return transactionPromise(db, STORE_NAME, 'readonly', (store) => + store.get(PRIMARY_KEY) + ); + } + + private async write(record: PersistedKeyPair): Promise { + const db = await this.openDb(); + await transactionPromise(db, STORE_NAME, 'readwrite', (store) => + store.put(record) + ); + } + + private toKeyAlgorithm(algorithm: DPoPAlgorithm): EcKeyImportParams { + switch (algorithm) { + case 'ES384': + return { name: 'ECDSA', namedCurve: 'P-384' }; + case 'EdDSA': + throw new Error('EdDSA DPoP keys are not yet supported.'); + case 'ES256': + default: + return { name: 'ECDSA', namedCurve: 'P-256' }; + } + } + + private async openDb(): Promise { + if (typeof indexedDB === 'undefined') { + throw new Error('IndexedDB is not available for DPoP key persistence.'); + } + + if (!this.dbPromise) { + this.dbPromise = new Promise((resolve, reject) => { + const request = indexedDB.open(DB_NAME, DB_VERSION); + request.onupgradeneeded = () => { + const db = request.result; + if (!db.objectStoreNames.contains(STORE_NAME)) { + db.createObjectStore(STORE_NAME, { keyPath: 'id' }); + } + }; + 
request.onsuccess = () => resolve(request.result); + request.onerror = () => reject(request.error); + }); + } + + return this.dbPromise; + } +} + +function transactionPromise( + db: IDBDatabase, + storeName: string, + mode: IDBTransactionMode, + executor: (store: IDBObjectStore) => IDBRequest +): Promise { + return new Promise((resolve, reject) => { + const transaction = db.transaction(storeName, mode); + const store = transaction.objectStore(storeName); + const request = executor(store); + request.onsuccess = () => resolve(request.result); + request.onerror = () => reject(request.error); + transaction.onabort = () => reject(transaction.error); + }); +} diff --git a/src/StellaOps.Web/src/app/core/auth/dpop/dpop.service.spec.ts b/src/StellaOps.Web/src/app/core/auth/dpop/dpop.service.spec.ts new file mode 100644 index 00000000..46e1f3d0 --- /dev/null +++ b/src/StellaOps.Web/src/app/core/auth/dpop/dpop.service.spec.ts @@ -0,0 +1,103 @@ +import { HttpClientTestingModule } from '@angular/common/http/testing'; +import { TestBed } from '@angular/core/testing'; + +import { APP_CONFIG, AppConfig } from '../../config/app-config.model'; +import { AppConfigService } from '../../config/app-config.service'; +import { base64UrlDecode } from './jose-utilities'; +import { DpopKeyStore } from './dpop-key-store'; +import { DpopService } from './dpop.service'; + +describe('DpopService', () => { + const originalTimeout = jasmine.DEFAULT_TIMEOUT_INTERVAL; + const config: AppConfig = { + authority: { + issuer: 'https://auth.stellaops.test/', + clientId: 'ui-client', + authorizeEndpoint: 'https://auth.stellaops.test/connect/authorize', + tokenEndpoint: 'https://auth.stellaops.test/connect/token', + redirectUri: 'https://ui.stellaops.test/auth/callback', + scope: 'openid profile ui.read', + audience: 'https://scanner.stellaops.test', + }, + apiBaseUrls: { + authority: 'https://auth.stellaops.test', + scanner: 'https://scanner.stellaops.test', + policy: 'https://policy.stellaops.test', + 
concelier: 'https://concelier.stellaops.test', + attestor: 'https://attestor.stellaops.test', + }, + }; + + beforeEach(async () => { + jasmine.DEFAULT_TIMEOUT_INTERVAL = 20000; + TestBed.configureTestingModule({ + imports: [HttpClientTestingModule], + providers: [ + AppConfigService, + DpopKeyStore, + DpopService, + { + provide: APP_CONFIG, + useValue: config, + }, + ], + }); + }); + + afterEach(async () => { + jasmine.DEFAULT_TIMEOUT_INTERVAL = originalTimeout; + const store = TestBed.inject(DpopKeyStore); + try { + await store.clear(); + } catch { + // ignore cleanup issues in test environment + } + }); + + it('creates a DPoP proof with expected header values', async () => { + const appConfig = TestBed.inject(AppConfigService); + appConfig.setConfigForTesting(config); + const service = TestBed.inject(DpopService); + + const proof = await service.createProof({ + htm: 'get', + htu: 'https://scanner.stellaops.test/api/v1/scans', + }); + + const [rawHeader, rawPayload] = proof.split('.'); + const header = JSON.parse( + new TextDecoder().decode(base64UrlDecode(rawHeader)) + ); + const payload = JSON.parse( + new TextDecoder().decode(base64UrlDecode(rawPayload)) + ); + + expect(header.typ).toBe('dpop+jwt'); + expect(header.alg).toBe('ES256'); + expect(header.jwk.kty).toBe('EC'); + expect(payload.htm).toBe('GET'); + expect(payload.htu).toBe('https://scanner.stellaops.test/api/v1/scans'); + expect(typeof payload.iat).toBe('number'); + expect(typeof payload.jti).toBe('string'); + }); + + it('binds access token hash when provided', async () => { + const appConfig = TestBed.inject(AppConfigService); + appConfig.setConfigForTesting(config); + const service = TestBed.inject(DpopService); + + const accessToken = 'sample-access-token'; + const proof = await service.createProof({ + htm: 'post', + htu: 'https://scanner.stellaops.test/api/v1/scans', + accessToken, + }); + + const payload = JSON.parse( + new TextDecoder().decode(base64UrlDecode(proof.split('.')[1])) + ); + + 
expect(payload.ath).toBeDefined(); + expect(typeof payload.ath).toBe('string'); + }); +}); diff --git a/src/StellaOps.Web/src/app/core/auth/dpop/dpop.service.ts b/src/StellaOps.Web/src/app/core/auth/dpop/dpop.service.ts new file mode 100644 index 00000000..d6d4432c --- /dev/null +++ b/src/StellaOps.Web/src/app/core/auth/dpop/dpop.service.ts @@ -0,0 +1,148 @@ +import { Injectable, computed, signal } from '@angular/core'; + +import { AppConfigService } from '../../config/app-config.service'; +import { DPoPAlgorithm } from '../../config/app-config.model'; +import { sha256, base64UrlEncode, derToJoseSignature } from './jose-utilities'; +import { DpopKeyStore, LoadedDpopKeyPair } from './dpop-key-store'; + +export interface DpopProofOptions { + readonly htm: string; + readonly htu: string; + readonly accessToken?: string; + readonly nonce?: string | null; +} + +@Injectable({ + providedIn: 'root', +}) +export class DpopService { + private keyPairPromise: Promise | null = null; + private readonly nonceSignal = signal(null); + readonly nonce = computed(() => this.nonceSignal()); + + constructor( + private readonly config: AppConfigService, + private readonly store: DpopKeyStore + ) {} + + async setNonce(nonce: string | null): Promise { + this.nonceSignal.set(nonce); + } + + async getThumbprint(): Promise { + const key = await this.getOrCreateKeyPair(); + return key.thumbprint ?? null; + } + + async rotateKey(): Promise { + const algorithm = this.resolveAlgorithm(); + this.keyPairPromise = this.store.generate(algorithm); + } + + async createProof(options: DpopProofOptions): Promise { + const keyPair = await this.getOrCreateKeyPair(); + + const header = { + typ: 'dpop+jwt', + alg: keyPair.algorithm, + jwk: keyPair.publicJwk, + }; + + const nowSeconds = Math.floor(Date.now() / 1000); + const payload: Record = { + htm: options.htm.toUpperCase(), + htu: normalizeHtu(options.htu), + iat: nowSeconds, + jti: crypto.randomUUID ? 
crypto.randomUUID() : createRandomId(), + }; + + const nonce = options.nonce ?? this.nonceSignal(); + if (nonce) { + payload['nonce'] = nonce; + } + + if (options.accessToken) { + const accessTokenHash = await sha256( + new TextEncoder().encode(options.accessToken) + ); + payload['ath'] = base64UrlEncode(accessTokenHash); + } + + const encodedHeader = base64UrlEncode(JSON.stringify(header)); + const encodedPayload = base64UrlEncode(JSON.stringify(payload)); + const signingInput = `${encodedHeader}.${encodedPayload}`; + const signature = await crypto.subtle.sign( + { + name: 'ECDSA', + hash: this.resolveHashAlgorithm(keyPair.algorithm), + }, + keyPair.privateKey, + new TextEncoder().encode(signingInput) + ); + + const joseSignature = base64UrlEncode(derToJoseSignature(signature)); + return `${signingInput}.${joseSignature}`; + } + + private async getOrCreateKeyPair(): Promise { + if (!this.keyPairPromise) { + this.keyPairPromise = this.loadKeyPair(); + } + try { + return await this.keyPairPromise; + } catch (error) { + // Reset the memoized promise so a subsequent call can retry. + this.keyPairPromise = null; + throw error; + } + } + + private async loadKeyPair(): Promise { + const algorithm = this.resolveAlgorithm(); + try { + const existing = await this.store.load(); + if (existing && existing.algorithm === algorithm) { + return existing; + } + } catch { + // fall through to regeneration + } + + return this.store.generate(algorithm); + } + + private resolveAlgorithm(): DPoPAlgorithm { + const authority = this.config.authority; + return authority.dpopAlgorithms?.[0] ?? 'ES256'; + } + + private resolveHashAlgorithm(algorithm: DPoPAlgorithm): string { + switch (algorithm) { + case 'ES384': + return 'SHA-384'; + case 'ES256': + default: + return 'SHA-256'; + } + } +} + +function normalizeHtu(value: string): string { + try { + const base = + typeof window !== 'undefined' && window.location + ? window.location.origin + : undefined; + const url = base ? 
new URL(value, base) : new URL(value); + url.hash = ''; + return url.toString(); + } catch { + return value; + } +} + +function createRandomId(): string { + const array = new Uint8Array(16); + crypto.getRandomValues(array); + return base64UrlEncode(array); +} diff --git a/src/StellaOps.Web/src/app/core/auth/dpop/jose-utilities.ts b/src/StellaOps.Web/src/app/core/auth/dpop/jose-utilities.ts new file mode 100644 index 00000000..d195cea2 --- /dev/null +++ b/src/StellaOps.Web/src/app/core/auth/dpop/jose-utilities.ts @@ -0,0 +1,123 @@ +export async function sha256(data: Uint8Array): Promise { + const digest = await crypto.subtle.digest('SHA-256', data); + return new Uint8Array(digest); +} + +export function base64UrlEncode( + input: ArrayBuffer | Uint8Array | string +): string { + let bytes: Uint8Array; + if (typeof input === 'string') { + bytes = new TextEncoder().encode(input); + } else if (input instanceof Uint8Array) { + bytes = input; + } else { + bytes = new Uint8Array(input); + } + + let binary = ''; + for (let i = 0; i < bytes.byteLength; i += 1) { + binary += String.fromCharCode(bytes[i]); + } + + return btoa(binary).replace(/\+/g, '-').replace(/\//g, '_').replace(/=+$/, ''); +} + +export function base64UrlDecode(value: string): Uint8Array { + const normalized = value.replace(/-/g, '+').replace(/_/g, '/'); + const padding = normalized.length % 4; + const padded = + padding === 0 ? 
normalized : normalized + '='.repeat(4 - padding); + const binary = atob(padded); + const bytes = new Uint8Array(binary.length); + for (let i = 0; i < binary.length; i += 1) { + bytes[i] = binary.charCodeAt(i); + } + return bytes; +} + +export async function computeJwkThumbprint(jwk: JsonWebKey): Promise { + const canonical = canonicalizeJwk(jwk); + const digest = await sha256(new TextEncoder().encode(canonical)); + return base64UrlEncode(digest); +} + +function canonicalizeJwk(jwk: JsonWebKey): string { + if (!jwk.kty) { + throw new Error('JWK must include "kty"'); + } + + if (jwk.kty === 'EC') { + const { crv, kty, x, y } = jwk; + if (!crv || !x || !y) { + throw new Error('EC JWK must include "crv", "x", and "y".'); + } + return JSON.stringify({ crv, kty, x, y }); + } + + if (jwk.kty === 'OKP') { + const { crv, kty, x } = jwk; + if (!crv || !x) { + throw new Error('OKP JWK must include "crv" and "x".'); + } + return JSON.stringify({ crv, kty, x }); + } + + throw new Error(`Unsupported JWK key type: ${jwk.kty}`); +} + +export function derToJoseSignature(der: ArrayBuffer): Uint8Array { + const bytes = new Uint8Array(der); + if (bytes[0] !== 0x30) { + // Some implementations already return raw (r || s) signature bytes. 
+ if (bytes.length === 64) { + return bytes; + } + throw new Error('Invalid DER signature: expected sequence.'); + } + + let offset = 2; // skip SEQUENCE header and length (assume short form) + if (bytes[1] & 0x80) { + const lengthBytes = bytes[1] & 0x7f; + offset = 2 + lengthBytes; + } + + if (bytes[offset] !== 0x02) { + throw new Error('Invalid DER signature: expected INTEGER for r.'); + } + const rLength = bytes[offset + 1]; + let r = bytes.slice(offset + 2, offset + 2 + rLength); + offset = offset + 2 + rLength; + + if (bytes[offset] !== 0x02) { + throw new Error('Invalid DER signature: expected INTEGER for s.'); + } + const sLength = bytes[offset + 1]; + let s = bytes.slice(offset + 2, offset + 2 + sLength); + + r = trimLeadingZeros(r); + s = trimLeadingZeros(s); + + const targetLength = 32; + const signature = new Uint8Array(targetLength * 2); + signature.set(padStart(r, targetLength), 0); + signature.set(padStart(s, targetLength), targetLength); + return signature; +} + +function trimLeadingZeros(bytes: Uint8Array): Uint8Array { + let start = 0; + while (start < bytes.length - 1 && bytes[start] === 0x00) { + start += 1; + } + return bytes.subarray(start); +} + +function padStart(bytes: Uint8Array, length: number): Uint8Array { + if (bytes.length >= length) { + return bytes; + } + const padded = new Uint8Array(length); + padded.set(bytes, length - bytes.length); + return padded; +} diff --git a/src/StellaOps.Web/src/app/core/auth/pkce.util.ts b/src/StellaOps.Web/src/app/core/auth/pkce.util.ts new file mode 100644 index 00000000..5de90c7c --- /dev/null +++ b/src/StellaOps.Web/src/app/core/auth/pkce.util.ts @@ -0,0 +1,24 @@ +import { base64UrlEncode, sha256 } from './dpop/jose-utilities'; + +export interface PkcePair { + readonly verifier: string; + readonly challenge: string; + readonly method: 'S256'; +} + +const VERIFIER_BYTE_LENGTH = 32; + +export async function createPkcePair(): Promise { + const verifierBytes = new Uint8Array(VERIFIER_BYTE_LENGTH); + 
crypto.getRandomValues(verifierBytes); + + const verifier = base64UrlEncode(verifierBytes); + const challengeBytes = await sha256(new TextEncoder().encode(verifier)); + const challenge = base64UrlEncode(challengeBytes); + + return { + verifier, + challenge, + method: 'S256', + }; +} diff --git a/src/StellaOps.Web/src/app/core/config/app-config.model.ts b/src/StellaOps.Web/src/app/core/config/app-config.model.ts new file mode 100644 index 00000000..95b6bd80 --- /dev/null +++ b/src/StellaOps.Web/src/app/core/config/app-config.model.ts @@ -0,0 +1,49 @@ +import { InjectionToken } from '@angular/core'; + +export type DPoPAlgorithm = 'ES256' | 'ES384' | 'EdDSA'; + +export interface AuthorityConfig { + readonly issuer: string; + readonly clientId: string; + readonly authorizeEndpoint: string; + readonly tokenEndpoint: string; + readonly logoutEndpoint?: string; + readonly redirectUri: string; + readonly postLogoutRedirectUri?: string; + readonly scope: string; + readonly audience: string; + /** + * Preferred algorithms for DPoP proofs, in order of preference. + * Defaults to ES256 if omitted. + */ + readonly dpopAlgorithms?: readonly DPoPAlgorithm[]; + /** + * Seconds of leeway before access token expiry that should trigger a proactive refresh. + * Defaults to 60. 
+ */ + readonly refreshLeewaySeconds?: number; +} + +export interface ApiBaseUrlConfig { + readonly scanner: string; + readonly policy: string; + readonly concelier: string; + readonly excitor?: string; + readonly attestor: string; + readonly authority: string; + readonly notify?: string; + readonly scheduler?: string; +} + +export interface TelemetryConfig { + readonly otlpEndpoint?: string; + readonly sampleRate?: number; +} + +export interface AppConfig { + readonly authority: AuthorityConfig; + readonly apiBaseUrls: ApiBaseUrlConfig; + readonly telemetry?: TelemetryConfig; +} + +export const APP_CONFIG = new InjectionToken('STELLAOPS_APP_CONFIG'); diff --git a/src/StellaOps.Web/src/app/core/config/app-config.service.ts b/src/StellaOps.Web/src/app/core/config/app-config.service.ts new file mode 100644 index 00000000..03698caa --- /dev/null +++ b/src/StellaOps.Web/src/app/core/config/app-config.service.ts @@ -0,0 +1,99 @@ +import { HttpClient } from '@angular/common/http'; +import { + Inject, + Injectable, + Optional, + computed, + signal, +} from '@angular/core'; +import { firstValueFrom } from 'rxjs'; + +import { + APP_CONFIG, + AppConfig, + AuthorityConfig, + DPoPAlgorithm, +} from './app-config.model'; + +const DEFAULT_CONFIG_URL = '/config.json'; +const DEFAULT_DPOP_ALG: DPoPAlgorithm = 'ES256'; +const DEFAULT_REFRESH_LEEWAY_SECONDS = 60; + +@Injectable({ + providedIn: 'root', +}) +export class AppConfigService { + private readonly configSignal = signal(null); + private readonly authoritySignal = computed(() => { + const config = this.configSignal(); + return config?.authority ?? null; + }); + + constructor( + private readonly http: HttpClient, + @Optional() @Inject(APP_CONFIG) private readonly staticConfig: AppConfig | null + ) {} + + /** + * Loads application configuration either from the injected static value or via HTTP fetch. + * Must be called during application bootstrap (see APP_INITIALIZER wiring). 
+ */ + async load(configUrl: string = DEFAULT_CONFIG_URL): Promise { + if (this.configSignal()) { + return; + } + + const config = this.staticConfig ?? (await this.fetchConfig(configUrl)); + this.configSignal.set(this.normalizeConfig(config)); + } + + /** + * Allows tests to short-circuit configuration loading. + */ + setConfigForTesting(config: AppConfig): void { + this.configSignal.set(this.normalizeConfig(config)); + } + + get config(): AppConfig { + const current = this.configSignal(); + if (!current) { + throw new Error('App configuration has not been loaded yet.'); + } + return current; + } + + get authority(): AuthorityConfig { + const authority = this.authoritySignal(); + if (!authority) { + throw new Error('Authority configuration has not been loaded yet.'); + } + return authority; + } + + private async fetchConfig(configUrl: string): Promise { + const response = await firstValueFrom( + this.http.get(configUrl, { + headers: { 'Cache-Control': 'no-cache' }, + withCredentials: false, + }) + ); + return response; + } + + private normalizeConfig(config: AppConfig): AppConfig { + const authority = { + ...config.authority, + dpopAlgorithms: + config.authority.dpopAlgorithms?.length ?? 0 + ? config.authority.dpopAlgorithms + : [DEFAULT_DPOP_ALG], + refreshLeewaySeconds: + config.authority.refreshLeewaySeconds ?? 
DEFAULT_REFRESH_LEEWAY_SECONDS, + }; + + return { + ...config, + authority, + }; + } +} diff --git a/src/StellaOps.Web/src/app/features/auth/auth-callback.component.ts b/src/StellaOps.Web/src/app/features/auth/auth-callback.component.ts new file mode 100644 index 00000000..65d920a4 --- /dev/null +++ b/src/StellaOps.Web/src/app/features/auth/auth-callback.component.ts @@ -0,0 +1,61 @@ +import { CommonModule } from '@angular/common'; +import { Component, OnInit, inject, signal } from '@angular/core'; +import { ActivatedRoute, Router } from '@angular/router'; + +import { AuthorityAuthService } from '../../core/auth/authority-auth.service'; + +@Component({ + selector: 'app-auth-callback', + standalone: true, + imports: [CommonModule], + template: ` +
+

Completing sign-in…

+

+ We were unable to complete the sign-in flow. Please try again. +

+
+ `, + styles: [ + ` + .auth-callback { + margin: 4rem auto; + max-width: 420px; + text-align: center; + font-size: 1rem; + color: #0f172a; + } + + .error { + color: #dc2626; + font-weight: 500; + } + `, + ], +}) +export class AuthCallbackComponent implements OnInit { + private readonly route = inject(ActivatedRoute); + private readonly router = inject(Router); + private readonly auth = inject(AuthorityAuthService); + + readonly state = signal<'processing' | 'error'>('processing'); + + async ngOnInit(): Promise { + const params = this.route.snapshot.queryParamMap; + const searchParams = new URLSearchParams(); + params.keys.forEach((key) => { + const value = params.get(key); + if (value != null) { + searchParams.set(key, value); + } + }); + + try { + const result = await this.auth.completeLoginFromRedirect(searchParams); + const returnUrl = result.returnUrl ?? '/'; + await this.router.navigateByUrl(returnUrl, { replaceUrl: true }); + } catch { + this.state.set('error'); + } + } +} diff --git a/src/StellaOps.Web/src/app/features/scans/scan-attestation-panel.component.html b/src/StellaOps.Web/src/app/features/scans/scan-attestation-panel.component.html new file mode 100644 index 00000000..436a4b27 --- /dev/null +++ b/src/StellaOps.Web/src/app/features/scans/scan-attestation-panel.component.html @@ -0,0 +1,39 @@ +
+
+

Attestation

+ + {{ statusLabel }} + +
+ +
+
+
Rekor UUID
+
{{ attestation.uuid }}
+
+
+
Log index
+
{{ attestation.index }}
+
+ +
+
Last checked
+
{{ attestation.checkedAt }}
+
+
+
Details
+
{{ attestation.statusMessage }}
+
+
+
diff --git a/src/StellaOps.Web/src/app/features/scans/scan-attestation-panel.component.scss b/src/StellaOps.Web/src/app/features/scans/scan-attestation-panel.component.scss new file mode 100644 index 00000000..2ac2b218 --- /dev/null +++ b/src/StellaOps.Web/src/app/features/scans/scan-attestation-panel.component.scss @@ -0,0 +1,75 @@ +.attestation-panel { + border: 1px solid #1f2933; + border-radius: 8px; + padding: 1.25rem; + background: #111827; + color: #f8fafc; + display: grid; + gap: 1rem; +} + +.attestation-header { + display: flex; + align-items: center; + justify-content: space-between; +} + +.attestation-header h2 { + margin: 0; + font-size: 1.125rem; +} + +.status-badge { + display: inline-flex; + align-items: center; + padding: 0.35rem 0.75rem; + border-radius: 999px; + font-size: 0.875rem; + font-weight: 600; + text-transform: uppercase; + letter-spacing: 0.05em; +} + +.status-badge.verified { + background-color: rgba(34, 197, 94, 0.2); + color: #34d399; +} + +.status-badge.pending { + background-color: rgba(234, 179, 8, 0.2); + color: #eab308; +} + +.status-badge.failed { + background-color: rgba(248, 113, 113, 0.2); + color: #f87171; +} + +.attestation-meta { + margin: 0; + display: grid; + gap: 0.75rem; +} + +.attestation-meta div { + display: grid; + gap: 0.25rem; +} + +.attestation-meta dt { + font-size: 0.75rem; + text-transform: uppercase; + letter-spacing: 0.05em; + color: #9ca3af; +} + +.attestation-meta dd { + margin: 0; + font-family: 'JetBrains Mono', 'Fira Code', 'SFMono-Regular', monospace; + word-break: break-word; +} + +.attestation-meta a { + color: #60a5fa; + text-decoration: underline; +} diff --git a/src/StellaOps.Web/src/app/features/scans/scan-attestation-panel.component.spec.ts b/src/StellaOps.Web/src/app/features/scans/scan-attestation-panel.component.spec.ts new file mode 100644 index 00000000..9c44876a --- /dev/null +++ b/src/StellaOps.Web/src/app/features/scans/scan-attestation-panel.component.spec.ts @@ -0,0 +1,55 @@ +import { 
ComponentFixture, TestBed } from '@angular/core/testing'; +import { ScanAttestationPanelComponent } from './scan-attestation-panel.component'; + +describe('ScanAttestationPanelComponent', () => { + let component: ScanAttestationPanelComponent; + let fixture: ComponentFixture; + + beforeEach(async () => { + await TestBed.configureTestingModule({ + imports: [ScanAttestationPanelComponent], + }).compileComponents(); + + fixture = TestBed.createComponent(ScanAttestationPanelComponent); + component = fixture.componentInstance; + }); + + it('renders verified attestation details', () => { + component.attestation = { + uuid: '1234', + status: 'verified', + index: 42, + logUrl: 'https://rekor.example', + checkedAt: '2025-10-23T10:05:00Z', + statusMessage: 'Rekor transparency log inclusion proof verified.', + }; + + fixture.detectChanges(); + + const element: HTMLElement = fixture.nativeElement; + expect(element.querySelector('.status-badge')?.textContent?.trim()).toBe( + 'Verified' + ); + expect(element.textContent).toContain('1234'); + expect(element.textContent).toContain('42'); + expect(element.textContent).toContain('https://rekor.example'); + }); + + it('renders failure message when attestation verification fails', () => { + component.attestation = { + uuid: 'abcd', + status: 'failed', + statusMessage: 'Verification failed: inclusion proof mismatch.', + }; + + fixture.detectChanges(); + + const element: HTMLElement = fixture.nativeElement; + expect(element.querySelector('.status-badge')?.textContent?.trim()).toBe( + 'Verification failed' + ); + expect(element.textContent).toContain( + 'Verification failed: inclusion proof mismatch.' 
+ ); + }); +}); diff --git a/src/StellaOps.Web/src/app/features/scans/scan-attestation-panel.component.ts b/src/StellaOps.Web/src/app/features/scans/scan-attestation-panel.component.ts new file mode 100644 index 00000000..55be5db5 --- /dev/null +++ b/src/StellaOps.Web/src/app/features/scans/scan-attestation-panel.component.ts @@ -0,0 +1,42 @@ +import { CommonModule } from '@angular/common'; +import { + ChangeDetectionStrategy, + Component, + Input, +} from '@angular/core'; +import { + ScanAttestationStatus, + ScanAttestationStatusKind, +} from '../../core/api/scanner.models'; + +@Component({ + selector: 'app-scan-attestation-panel', + standalone: true, + imports: [CommonModule], + templateUrl: './scan-attestation-panel.component.html', + styleUrls: ['./scan-attestation-panel.component.scss'], + changeDetection: ChangeDetectionStrategy.OnPush, +}) +export class ScanAttestationPanelComponent { + @Input({ required: true }) attestation!: ScanAttestationStatus; + + get statusLabel(): string { + return this.toStatusLabel(this.attestation?.status); + } + + get statusClass(): string { + return this.attestation?.status ?? 'pending'; + } + + private toStatusLabel(status: ScanAttestationStatusKind | undefined): string { + switch (status) { + case 'verified': + return 'Verified'; + case 'failed': + return 'Verification failed'; + case 'pending': + default: + return 'Pending verification'; + } + } +} diff --git a/src/StellaOps.Web/src/app/features/scans/scan-detail-page.component.html b/src/StellaOps.Web/src/app/features/scans/scan-detail-page.component.html new file mode 100644 index 00000000..4c18bd3c --- /dev/null +++ b/src/StellaOps.Web/src/app/features/scans/scan-detail-page.component.html @@ -0,0 +1,52 @@ +
+
+

Scan Detail

+
+ + +
+
+ +
+

Image

+
+
+
Scan ID
+
{{ scan().scanId }}
+
+
+
Image digest
+
{{ scan().imageDigest }}
+
+
+
Completed at
+
{{ scan().completedAt }}
+
+
+
+ + + +

+ No attestation has been recorded for this scan. +

+
diff --git a/src/StellaOps.Web/src/app/features/scans/scan-detail-page.component.scss b/src/StellaOps.Web/src/app/features/scans/scan-detail-page.component.scss new file mode 100644 index 00000000..0e6bf035 --- /dev/null +++ b/src/StellaOps.Web/src/app/features/scans/scan-detail-page.component.scss @@ -0,0 +1,79 @@ +.scan-detail { + display: grid; + gap: 1.5rem; + padding: 1.5rem; + color: #e2e8f0; + background: #0f172a; + min-height: calc(100vh - 120px); +} + +.scan-detail__header { + display: flex; + flex-wrap: wrap; + align-items: center; + justify-content: space-between; + gap: 1rem; +} + +.scan-detail__header h1 { + margin: 0; + font-size: 1.5rem; +} + +.scenario-toggle { + display: inline-flex; + border: 1px solid #1f2933; + border-radius: 999px; + overflow: hidden; +} + +.scenario-button { + background: transparent; + color: inherit; + border: none; + padding: 0.5rem 1.25rem; + cursor: pointer; + font-size: 0.9rem; + letter-spacing: 0.03em; + text-transform: uppercase; +} + +.scenario-button.active { + background: #1d4ed8; + color: #f8fafc; +} + +.scan-summary { + border: 1px solid #1f2933; + border-radius: 8px; + padding: 1.25rem; + background: #111827; +} + +.scan-summary h2 { + margin: 0 0 0.75rem 0; + font-size: 1.125rem; +} + +.scan-summary dl { + margin: 0; + display: grid; + gap: 0.75rem; +} + +.scan-summary dt { + font-size: 0.75rem; + text-transform: uppercase; + color: #94a3b8; +} + +.scan-summary dd { + margin: 0; + font-family: 'JetBrains Mono', 'Fira Code', 'SFMono-Regular', monospace; + word-break: break-word; +} + +.attestation-empty { + font-style: italic; + color: #94a3b8; +} diff --git a/src/StellaOps.Web/src/app/features/scans/scan-detail-page.component.spec.ts b/src/StellaOps.Web/src/app/features/scans/scan-detail-page.component.spec.ts new file mode 100644 index 00000000..1e5f6958 --- /dev/null +++ b/src/StellaOps.Web/src/app/features/scans/scan-detail-page.component.spec.ts @@ -0,0 +1,50 @@ +import { ComponentFixture, TestBed } from 
'@angular/core/testing'; +import { RouterTestingModule } from '@angular/router/testing'; +import { ScanDetailPageComponent } from './scan-detail-page.component'; +import { + scanDetailWithFailedAttestation, + scanDetailWithVerifiedAttestation, +} from '../../testing/scan-fixtures'; + +describe('ScanDetailPageComponent', () => { + let fixture: ComponentFixture; + let component: ScanDetailPageComponent; + + beforeEach(async () => { + await TestBed.configureTestingModule({ + imports: [RouterTestingModule, ScanDetailPageComponent], + }).compileComponents(); + + fixture = TestBed.createComponent(ScanDetailPageComponent); + component = fixture.componentInstance; + }); + + it('shows the verified attestation scenario by default', () => { + fixture.detectChanges(); + + const element: HTMLElement = fixture.nativeElement; + expect(element.textContent).toContain( + scanDetailWithVerifiedAttestation.attestation?.uuid ?? '' + ); + expect(element.querySelector('.status-badge')?.textContent?.trim()).toBe( + 'Verified' + ); + }); + + it('switches to failure scenario when toggle is clicked', () => { + fixture.detectChanges(); + + const failureButton: HTMLButtonElement | null = + fixture.nativeElement.querySelector('[data-scenario="failed"]'); + failureButton?.click(); + fixture.detectChanges(); + + const element: HTMLElement = fixture.nativeElement; + expect(element.textContent).toContain( + scanDetailWithFailedAttestation.attestation?.uuid ?? 
'' + ); + expect(element.querySelector('.status-badge')?.textContent?.trim()).toBe( + 'Verification failed' + ); + }); +}); diff --git a/src/StellaOps.Web/src/app/features/scans/scan-detail-page.component.ts b/src/StellaOps.Web/src/app/features/scans/scan-detail-page.component.ts new file mode 100644 index 00000000..7bd933ec --- /dev/null +++ b/src/StellaOps.Web/src/app/features/scans/scan-detail-page.component.ts @@ -0,0 +1,62 @@ +import { CommonModule } from '@angular/common'; +import { + ChangeDetectionStrategy, + Component, + computed, + inject, + signal, +} from '@angular/core'; +import { ActivatedRoute } from '@angular/router'; +import { ScanAttestationPanelComponent } from './scan-attestation-panel.component'; +import { ScanDetail } from '../../core/api/scanner.models'; +import { + scanDetailWithFailedAttestation, + scanDetailWithVerifiedAttestation, +} from '../../testing/scan-fixtures'; + +type Scenario = 'verified' | 'failed'; + +const SCENARIO_MAP: Record = { + verified: scanDetailWithVerifiedAttestation, + failed: scanDetailWithFailedAttestation, +}; + +@Component({ + selector: 'app-scan-detail-page', + standalone: true, + imports: [CommonModule, ScanAttestationPanelComponent], + templateUrl: './scan-detail-page.component.html', + styleUrls: ['./scan-detail-page.component.scss'], + changeDetection: ChangeDetectionStrategy.OnPush, +}) +export class ScanDetailPageComponent { + private readonly route = inject(ActivatedRoute); + + readonly scenario = signal('verified'); + + readonly scan = computed(() => { + const current = this.scenario(); + return SCENARIO_MAP[current]; + }); + + constructor() { + const routeScenario = + (this.route.snapshot.queryParamMap.get('scenario') as Scenario | null) ?? 
+ null; + if (routeScenario && routeScenario in SCENARIO_MAP) { + this.scenario.set(routeScenario); + return; + } + + const scanId = this.route.snapshot.paramMap.get('scanId'); + if (scanId === scanDetailWithFailedAttestation.scanId) { + this.scenario.set('failed'); + } else { + this.scenario.set('verified'); + } + } + + onSelectScenario(next: Scenario): void { + this.scenario.set(next); + } +} diff --git a/src/StellaOps.Web/src/app/testing/policy-fixtures.spec.ts b/src/StellaOps.Web/src/app/testing/policy-fixtures.spec.ts index 3f37a8b6..f3fd1c3d 100644 --- a/src/StellaOps.Web/src/app/testing/policy-fixtures.spec.ts +++ b/src/StellaOps.Web/src/app/testing/policy-fixtures.spec.ts @@ -28,12 +28,20 @@ describe('policy fixtures', () => { it('aligns preview and report fixtures', () => { const preview = getPolicyPreviewFixture(); - const report = getPolicyReportFixture(); + const { reportResponse } = getPolicyReportFixture(); - expect(report.report.policy.digest).toEqual(preview.previewResponse.policyDigest); - expect(report.report.verdicts.length).toEqual(report.report.summary.total); - expect(report.report.verdicts.length).toBeGreaterThan(0); - expect(report.report.verdicts.some(v => v.confidenceBand != null)).toBeTrue(); + expect(reportResponse.report.policy.digest).toEqual( + preview.previewResponse.policyDigest + ); + expect(reportResponse.report.verdicts.length).toEqual( + reportResponse.report.summary.total + ); + expect(reportResponse.report.verdicts.length).toBeGreaterThan(0); + expect( + reportResponse.report.verdicts.some( + (verdict) => verdict.confidenceBand != null + ) + ).toBeTrue(); }); it('provides DSSE metadata for report fixture', () => { diff --git a/src/StellaOps.Web/src/app/testing/policy-fixtures.ts b/src/StellaOps.Web/src/app/testing/policy-fixtures.ts index db74d4b1..a0acea3c 100644 --- a/src/StellaOps.Web/src/app/testing/policy-fixtures.ts +++ b/src/StellaOps.Web/src/app/testing/policy-fixtures.ts @@ -1,12 +1,14 @@ -import previewSample from 
'../../../../samples/policy/policy-preview-unknown.json'; -import reportSample from '../../../../samples/policy/policy-report-unknown.json'; +import previewSample from '../../../../../samples/policy/policy-preview-unknown.json'; +import reportSample from '../../../../../samples/policy/policy-report-unknown.json'; import { PolicyPreviewSample, PolicyReportSample, } from '../core/api/policy-preview.models'; -const previewFixture: PolicyPreviewSample = previewSample; -const reportFixture: PolicyReportSample = reportSample; +const previewFixture: PolicyPreviewSample = + previewSample as unknown as PolicyPreviewSample; +const reportFixture: PolicyReportSample = + reportSample as unknown as PolicyReportSample; export function getPolicyPreviewFixture(): PolicyPreviewSample { return clone(previewFixture); diff --git a/src/StellaOps.Web/src/app/testing/scan-fixtures.ts b/src/StellaOps.Web/src/app/testing/scan-fixtures.ts new file mode 100644 index 00000000..840442b5 --- /dev/null +++ b/src/StellaOps.Web/src/app/testing/scan-fixtures.ts @@ -0,0 +1,30 @@ +import { ScanDetail } from '../core/api/scanner.models'; + +export const scanDetailWithVerifiedAttestation: ScanDetail = { + scanId: 'scan-verified-001', + imageDigest: + 'sha256:9f92a8c39f8d4f7bb1a60f2be650b3019b9a1bb50d2da839efa9bf2a278a0071', + completedAt: '2025-10-20T18:22:04Z', + attestation: { + uuid: '018ed91c-9b64-7edc-b9ac-0bada2f8d501', + index: 412398, + logUrl: 'https://rekor.sigstore.dev', + status: 'verified', + checkedAt: '2025-10-23T12:04:52Z', + statusMessage: 'Rekor transparency log inclusion proof verified.', + }, +}; + +export const scanDetailWithFailedAttestation: ScanDetail = { + scanId: 'scan-failed-002', + imageDigest: + 'sha256:b0d6865de537e45bdd9dd72cdac02bc6f459f0e546ed9134e2afc2fccd6298e0', + completedAt: '2025-10-19T07:14:33Z', + attestation: { + uuid: '018ed91c-ffff-4882-9955-0027c0bbb090', + status: 'failed', + checkedAt: '2025-10-23T09:18:11Z', + statusMessage: + 'Verification failed: 
inclusion proof leaf hash mismatch at depth 4.', + }, +}; diff --git a/src/StellaOps.Web/src/config/config.json b/src/StellaOps.Web/src/config/config.json new file mode 100644 index 00000000..3a0dca18 --- /dev/null +++ b/src/StellaOps.Web/src/config/config.json @@ -0,0 +1,26 @@ +{ + "authority": { + "issuer": "https://authority.local", + "clientId": "stellaops-ui", + "authorizeEndpoint": "https://authority.local/connect/authorize", + "tokenEndpoint": "https://authority.local/connect/token", + "logoutEndpoint": "https://authority.local/connect/logout", + "redirectUri": "http://localhost:4400/auth/callback", + "postLogoutRedirectUri": "http://localhost:4400/", + "scope": "openid profile ui.read", + "audience": "https://scanner.local", + "dpopAlgorithms": ["ES256"], + "refreshLeewaySeconds": 60 + }, + "apiBaseUrls": { + "authority": "https://authority.local", + "scanner": "https://scanner.local", + "policy": "https://scanner.local", + "concelier": "https://concelier.local", + "attestor": "https://attestor.local" + }, + "telemetry": { + "otlpEndpoint": "http://localhost:4318/v1/traces", + "sampleRate": 0.1 + } +} diff --git a/src/StellaOps.Web/src/config/config.sample.json b/src/StellaOps.Web/src/config/config.sample.json new file mode 100644 index 00000000..e0ca7147 --- /dev/null +++ b/src/StellaOps.Web/src/config/config.sample.json @@ -0,0 +1,26 @@ +{ + "authority": { + "issuer": "https://authority.example.dev", + "clientId": "stellaops-ui", + "authorizeEndpoint": "https://authority.example.dev/connect/authorize", + "tokenEndpoint": "https://authority.example.dev/connect/token", + "logoutEndpoint": "https://authority.example.dev/connect/logout", + "redirectUri": "http://localhost:4400/auth/callback", + "postLogoutRedirectUri": "http://localhost:4400/", + "scope": "openid profile ui.read", + "audience": "https://scanner.example.dev", + "dpopAlgorithms": ["ES256"], + "refreshLeewaySeconds": 60 + }, + "apiBaseUrls": { + "authority": "https://authority.example.dev", + 
"scanner": "https://scanner.example.dev", + "policy": "https://scanner.example.dev", + "concelier": "https://concelier.example.dev", + "attestor": "https://attestor.example.dev" + }, + "telemetry": { + "otlpEndpoint": "", + "sampleRate": 0 + } +} diff --git a/src/StellaOps.Web/test-results/.last-run.json b/src/StellaOps.Web/test-results/.last-run.json new file mode 100644 index 00000000..cbcc1fba --- /dev/null +++ b/src/StellaOps.Web/test-results/.last-run.json @@ -0,0 +1,4 @@ +{ + "status": "passed", + "failedTests": [] +} \ No newline at end of file diff --git a/src/StellaOps.Web/tests/e2e/auth.spec.ts b/src/StellaOps.Web/tests/e2e/auth.spec.ts new file mode 100644 index 00000000..2b303973 --- /dev/null +++ b/src/StellaOps.Web/tests/e2e/auth.spec.ts @@ -0,0 +1,78 @@ +import { expect, test } from '@playwright/test'; + +const mockConfig = { + authority: { + issuer: 'https://authority.local', + clientId: 'stellaops-ui', + authorizeEndpoint: 'https://authority.local/connect/authorize', + tokenEndpoint: 'https://authority.local/connect/token', + logoutEndpoint: 'https://authority.local/connect/logout', + redirectUri: 'http://127.0.0.1:4400/auth/callback', + postLogoutRedirectUri: 'http://127.0.0.1:4400/', + scope: 'openid profile ui.read', + audience: 'https://scanner.local', + dpopAlgorithms: ['ES256'], + refreshLeewaySeconds: 60, + }, + apiBaseUrls: { + authority: 'https://authority.local', + scanner: 'https://scanner.local', + policy: 'https://scanner.local', + concelier: 'https://concelier.local', + attestor: 'https://attestor.local', + }, +}; + +test.beforeEach(async ({ page }) => { + page.on('console', (message) => { + // bubble up browser logs for debugging + console.log('[browser]', message.type(), message.text()); + }); + page.on('pageerror', (error) => { + console.log('[pageerror]', error.message); + }); + await page.addInitScript(() => { + // Capture attempted redirects so the test can assert against them. 
+ (window as any).__stellaopsAssignedUrls = []; + const originalAssign = window.location.assign.bind(window.location); + window.location.assign = (url: string | URL) => { + (window as any).__stellaopsAssignedUrls.push(url.toString()); + }; + + window.sessionStorage.clear(); + }); + await page.route('**/config.json', (route) => + route.fulfill({ + status: 200, + contentType: 'application/json', + body: JSON.stringify(mockConfig), + }) + ); + await page.route('https://authority.local/**', (route) => route.abort()); +}); + +test('sign-in flow builds Authority authorization URL', async ({ page }) => { + await page.goto('/'); + const signInButton = page.getByRole('button', { name: /sign in/i }); + await expect(signInButton).toBeVisible(); + const [request] = await Promise.all([ + page.waitForRequest('https://authority.local/connect/authorize*'), + signInButton.click(), + ]); + + const authorizeUrl = new URL(request.url()); + expect(authorizeUrl.origin).toBe('https://authority.local'); + expect(authorizeUrl.pathname).toBe('/connect/authorize'); + expect(authorizeUrl.searchParams.get('client_id')).toBe('stellaops-ui'); + +}); + +test('callback without pending state surfaces error message', async ({ page }) => { + await page.route('https://authority.local/**', (route) => + route.fulfill({ status: 400, body: 'blocked' }) + ); + await page.goto('/auth/callback?code=test-code&state=missing'); + await expect( + page.getByText('We were unable to complete the sign-in flow. 
Please try again.') + ).toBeVisible({ timeout: 10000 }); +}); diff --git a/src/StellaOps.Zastava.Core.Tests/Contracts/ZastavaContractVersionsTests.cs b/src/StellaOps.Zastava.Core.Tests/Contracts/ZastavaContractVersionsTests.cs index 22ce7340..2d790aa1 100644 --- a/src/StellaOps.Zastava.Core.Tests/Contracts/ZastavaContractVersionsTests.cs +++ b/src/StellaOps.Zastava.Core.Tests/Contracts/ZastavaContractVersionsTests.cs @@ -5,8 +5,7 @@ namespace StellaOps.Zastava.Core.Tests.Contracts; public sealed class ZastavaContractVersionsTests { [Theory] - [InlineData("zastava.runtime.event@v1", "zastava.runtime.event", 1, 0)] - [InlineData("zastava.runtime.event@v1.0", "zastava.runtime.event", 1, 0)] + [InlineData("zastava.runtime.event@v1.0", "zastava.runtime.event", 1, 0)] [InlineData("zastava.admission.decision@v1.2", "zastava.admission.decision", 1, 2)] public void TryParse_ParsesCanonicalForms(string input, string schema, int major, int minor) { @@ -31,36 +30,72 @@ public sealed class ZastavaContractVersionsTests } [Fact] - public void IsRuntimeEventSupported_RespectsMajorCompatibility() - { - Assert.True(ZastavaContractVersions.IsRuntimeEventSupported("zastava.runtime.event@v1")); - Assert.True(ZastavaContractVersions.IsRuntimeEventSupported("zastava.runtime.event@v1.0")); - Assert.False(ZastavaContractVersions.IsRuntimeEventSupported("zastava.runtime.event@v2.0")); - Assert.False(ZastavaContractVersions.IsRuntimeEventSupported("zastava.admission.decision@v1")); - } - - [Fact] - public void NegotiateRuntimeEvent_PicksHighestCommonVersion() - { - var negotiated = ZastavaContractVersions.NegotiateRuntimeEvent(new[] - { + public void IsRuntimeEventSupported_RespectsMajorCompatibility() + { + Assert.True(ZastavaContractVersions.ContractVersion.TryParse("zastava.runtime.event@v1.0", out var candidate)); + Assert.True(candidate.IsCompatibleWith(ZastavaContractVersions.RuntimeEvent), $"Candidate {candidate} incompatible with {ZastavaContractVersions.RuntimeEvent}"); + 
Assert.True(ZastavaContractVersions.IsRuntimeEventSupported("zastava.runtime.event@v1.0")); + Assert.False(ZastavaContractVersions.IsRuntimeEventSupported("zastava.runtime.event@v2.0")); + Assert.False(ZastavaContractVersions.IsRuntimeEventSupported("zastava.admission.decision@v1")); + } + + [Fact] + public void IsAdmissionDecisionSupported_RespectsMajorCompatibility() + { + Assert.True(ZastavaContractVersions.ContractVersion.TryParse("zastava.admission.decision@v1.0", out var candidate)); + Assert.True(candidate.IsCompatibleWith(ZastavaContractVersions.AdmissionDecision), $"Candidate {candidate} incompatible with {ZastavaContractVersions.AdmissionDecision}"); + Assert.True(ZastavaContractVersions.IsAdmissionDecisionSupported("zastava.admission.decision@v1.0")); + Assert.False(ZastavaContractVersions.IsAdmissionDecisionSupported("zastava.admission.decision@v0.9")); + Assert.False(ZastavaContractVersions.IsAdmissionDecisionSupported("zastava.runtime.event@v1")); + } + + [Fact] + public void NegotiateRuntimeEvent_PicksHighestCommonVersion() + { + var negotiated = ZastavaContractVersions.NegotiateRuntimeEvent(new[] + { "zastava.runtime.event@v1.0", "zastava.runtime.event@v0.9", "zastava.admission.decision@v1" - }); - - Assert.Equal("zastava.runtime.event@v1.0", negotiated.ToString()); - } - - [Fact] - public void NegotiateRuntimeEvent_FallsBackToLocalWhenNoMatch() - { - var negotiated = ZastavaContractVersions.NegotiateRuntimeEvent(new[] - { + }); + + Assert.Equal("zastava.runtime.event@v1.0", negotiated.ToString()); + } + + [Fact] + public void NegotiateAdmissionDecision_PicksHighestCommonVersion() + { + var negotiated = ZastavaContractVersions.NegotiateAdmissionDecision(new[] + { + "zastava.admission.decision@v1.2", + "zastava.admission.decision@v1.0", + "zastava.runtime.event@v1.0" + }); + + Assert.Equal(ZastavaContractVersions.AdmissionDecision.ToString(), negotiated.ToString()); + } + + [Fact] + public void NegotiateRuntimeEvent_FallsBackToLocalWhenNoMatch() + { 
+ var negotiated = ZastavaContractVersions.NegotiateRuntimeEvent(new[] + { "zastava.runtime.event@v2.0", "zastava.admission.decision@v2.0" }); - - Assert.Equal(ZastavaContractVersions.RuntimeEvent.ToString(), negotiated.ToString()); - } -} + + Assert.Equal(ZastavaContractVersions.RuntimeEvent.ToString(), negotiated.ToString()); + } + + [Fact] + public void NegotiateAdmissionDecision_FallsBackToLocalWhenNoMatch() + { + var negotiated = ZastavaContractVersions.NegotiateAdmissionDecision(new[] + { + "zastava.admission.decision@v2.0", + "zastava.runtime.event@v2.0" + }); + + Assert.Equal(ZastavaContractVersions.AdmissionDecision.ToString(), negotiated.ToString()); + } +} diff --git a/src/StellaOps.Zastava.Core.Tests/DependencyInjection/ZastavaServiceCollectionExtensionsTests.cs b/src/StellaOps.Zastava.Core.Tests/DependencyInjection/ZastavaServiceCollectionExtensionsTests.cs new file mode 100644 index 00000000..9906ca40 --- /dev/null +++ b/src/StellaOps.Zastava.Core.Tests/DependencyInjection/ZastavaServiceCollectionExtensionsTests.cs @@ -0,0 +1,122 @@ +using System.Linq; +using Microsoft.Extensions.Configuration; +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Logging; +using Microsoft.Extensions.Options; +using StellaOps.Zastava.Core.Configuration; +using StellaOps.Zastava.Core.Diagnostics; +using StellaOps.Zastava.Core.Security; + +namespace StellaOps.Zastava.Core.Tests.DependencyInjection; + +public sealed class ZastavaServiceCollectionExtensionsTests +{ + [Fact] + public void AddZastavaRuntimeCore_BindsOptionsAndProvidesDiagnostics() + { + var configuration = new ConfigurationBuilder() + .AddInMemoryCollection(new Dictionary + { + ["zastava:runtime:tenant"] = "tenant-42", + ["zastava:runtime:environment"] = "prod", + ["zastava:runtime:deployment"] = "cluster-a", + ["zastava:runtime:metrics:meterName"] = "stellaops.zastava.runtime", + ["zastava:runtime:metrics:meterVersion"] = "2.0.0", + ["zastava:runtime:metrics:commonTags:cluster"] = 
"prod-cluster", + ["zastava:runtime:logging:staticScope:plane"] = "runtime", + ["zastava:runtime:authority:clientId"] = "zastava-observer", + ["zastava:runtime:authority:audience:0"] = "scanner", + ["zastava:runtime:authority:audience:1"] = "zastava", + ["zastava:runtime:authority:scopes:0"] = "aud:scanner", + ["zastava:runtime:authority:scopes:1"] = "api:scanner.runtime.write", + ["zastava:runtime:authority:allowStaticTokenFallback"] = "false" + }) + .Build(); + + var services = new ServiceCollection(); + services.AddLogging(); + services.AddZastavaRuntimeCore(configuration, componentName: "observer"); + + using var provider = services.BuildServiceProvider(); + + var runtimeOptions = provider.GetRequiredService>().Value; + Assert.Equal("tenant-42", runtimeOptions.Tenant); + Assert.Equal("prod", runtimeOptions.Environment); + Assert.Equal("observer", runtimeOptions.Component); + Assert.Equal("cluster-a", runtimeOptions.Deployment); + Assert.Equal("stellaops.zastava.runtime", runtimeOptions.Metrics.MeterName); + Assert.Equal("2.0.0", runtimeOptions.Metrics.MeterVersion); + Assert.Equal("runtime", runtimeOptions.Logging.StaticScope["plane"]); + Assert.Equal("zastava-observer", runtimeOptions.Authority.ClientId); + Assert.Contains("scanner", runtimeOptions.Authority.Audience); + Assert.Contains("zastava", runtimeOptions.Authority.Audience); + Assert.Equal(new[] { "aud:scanner", "api:scanner.runtime.write" }, runtimeOptions.Authority.Scopes); + Assert.False(runtimeOptions.Authority.AllowStaticTokenFallback); + + var scopeBuilder = provider.GetRequiredService(); + var scope = scopeBuilder.BuildScope( + correlationId: "corr-1", + node: "node-1", + workload: "payments/api", + eventId: "evt-123", + additional: new Dictionary + { + ["pod"] = "api-12345" + }); + + Assert.Equal("tenant-42", scope["tenant"]); + Assert.Equal("observer", scope["component"]); + Assert.Equal("prod", scope["environment"]); + Assert.Equal("cluster-a", scope["deployment"]); + Assert.Equal("runtime", 
scope["plane"]); + Assert.Equal("corr-1", scope["correlationId"]); + Assert.Equal("node-1", scope["node"]); + Assert.Equal("payments/api", scope["workload"]); + Assert.Equal("evt-123", scope["eventId"]); + Assert.Equal("api-12345", scope["pod"]); + + var metrics = provider.GetRequiredService(); + Assert.Equal("stellaops.zastava.runtime", metrics.Meter.Name); + Assert.Equal("2.0.0", metrics.Meter.Version); + + var authorityProvider = provider.GetRequiredService(); + Assert.NotNull(authorityProvider); + + var defaultTags = metrics.DefaultTags.ToArray(); + Assert.Contains(defaultTags, kvp => kvp.Key == "tenant" && (string?)kvp.Value == "tenant-42"); + Assert.Contains(defaultTags, kvp => kvp.Key == "component" && (string?)kvp.Value == "observer"); + Assert.Contains(defaultTags, kvp => kvp.Key == "environment" && (string?)kvp.Value == "prod"); + Assert.Contains(defaultTags, kvp => kvp.Key == "deployment" && (string?)kvp.Value == "cluster-a"); + Assert.Contains(defaultTags, kvp => kvp.Key == "cluster" && (string?)kvp.Value == "prod-cluster"); + + metrics.RuntimeEvents.Add(1, defaultTags); + metrics.AdmissionDecisions.Add(1, defaultTags); + metrics.BackendLatencyMs.Record(12.5, defaultTags); + + var loggerFactoryOptions = provider.GetRequiredService>().CurrentValue; + Assert.True(loggerFactoryOptions.ActivityTrackingOptions.HasFlag(ActivityTrackingOptions.TraceId)); + Assert.True(loggerFactoryOptions.ActivityTrackingOptions.HasFlag(ActivityTrackingOptions.SpanId)); + } + + [Fact] + public void AddZastavaRuntimeCore_ThrowsForInvalidTenant() + { + var configuration = new ConfigurationBuilder() + .AddInMemoryCollection(new Dictionary + { + ["zastava:runtime:tenant"] = "", + ["zastava:runtime:environment"] = "prod" + }) + .Build(); + + var services = new ServiceCollection(); + services.AddLogging(); + services.AddZastavaRuntimeCore(configuration, "observer"); + + Assert.Throws(() => + { + using var provider = services.BuildServiceProvider(); + _ = 
provider.GetRequiredService>().Value; + }); + } +} diff --git a/src/StellaOps.Zastava.Core.Tests/Security/ZastavaAuthorityTokenProviderTests.cs b/src/StellaOps.Zastava.Core.Tests/Security/ZastavaAuthorityTokenProviderTests.cs new file mode 100644 index 00000000..0f632cd9 --- /dev/null +++ b/src/StellaOps.Zastava.Core.Tests/Security/ZastavaAuthorityTokenProviderTests.cs @@ -0,0 +1,224 @@ +using Microsoft.Extensions.Logging.Abstractions; +using Microsoft.Extensions.Options; +using Microsoft.IdentityModel.Tokens; +using StellaOps.Auth.Client; +using StellaOps.Zastava.Core.Configuration; +using StellaOps.Zastava.Core.Diagnostics; +using StellaOps.Zastava.Core.Security; + +namespace StellaOps.Zastava.Core.Tests.Security; + +public sealed class ZastavaAuthorityTokenProviderTests +{ + [Fact] + public async Task GetAsync_UsesCacheUntilRefreshWindow() + { + var timeProvider = new TestTimeProvider(DateTimeOffset.Parse("2025-10-23T12:00:00Z")); + var runtimeOptions = CreateRuntimeOptions(refreshSkewSeconds: 120); + + var tokenClient = new StubTokenClient(); + tokenClient.EnqueueToken(new StellaOpsTokenResult( + "token-1", + "DPoP", + timeProvider.GetUtcNow() + TimeSpan.FromMinutes(10), + new[] { "aud:scanner", "api:scanner.runtime.write" })); + + tokenClient.EnqueueToken(new StellaOpsTokenResult( + "token-2", + "DPoP", + timeProvider.GetUtcNow() + TimeSpan.FromMinutes(10), + new[] { "aud:scanner", "api:scanner.runtime.write" })); + + var provider = CreateProvider(runtimeOptions, tokenClient, timeProvider); + + var tokenA = await provider.GetAsync("scanner"); + Assert.Equal("token-1", tokenA.AccessToken); + Assert.Equal(1, tokenClient.RequestCount); + + // Move time forward but still before refresh window (refresh skew = 2 minutes) + timeProvider.Advance(TimeSpan.FromMinutes(5)); + var tokenB = await provider.GetAsync("scanner"); + Assert.Equal("token-1", tokenB.AccessToken); + Assert.Equal(1, tokenClient.RequestCount); + + // Cross refresh window to trigger renewal + 
timeProvider.Advance(TimeSpan.FromMinutes(5)); + var tokenC = await provider.GetAsync("scanner"); + Assert.Equal("token-2", tokenC.AccessToken); + Assert.Equal(2, tokenClient.RequestCount); + } + + [Fact] + public async Task GetAsync_ThrowsWhenMissingAudienceScope() + { + var runtimeOptions = CreateRuntimeOptions(); + var tokenClient = new StubTokenClient(); + tokenClient.EnqueueToken(new StellaOpsTokenResult( + "token", + "DPoP", + DateTimeOffset.UtcNow + TimeSpan.FromMinutes(5), + new[] { "api:scanner.runtime.write" })); + + var provider = CreateProvider(runtimeOptions, tokenClient, new TestTimeProvider(DateTimeOffset.UtcNow)); + + var ex = await Assert.ThrowsAsync(() => provider.GetAsync("scanner").AsTask()); + Assert.Contains("audience scope", ex.Message, StringComparison.OrdinalIgnoreCase); + } + + [Fact] + public async Task GetAsync_StaticFallbackUsedWhenEnabled() + { + var runtimeOptions = CreateRuntimeOptions(allowFallback: true, staticToken: "static-token", requireDpop: false); + + var tokenClient = new StubTokenClient(); + tokenClient.FailWith(new InvalidOperationException("offline")); + + var provider = CreateProvider(runtimeOptions, tokenClient, new TestTimeProvider(DateTimeOffset.UtcNow)); + + var token = await provider.GetAsync("scanner"); + Assert.Equal("static-token", token.AccessToken); + Assert.Null(token.ExpiresAtUtc); + Assert.Equal(0, tokenClient.RequestCount); + } + + [Fact] + public async Task GetAsync_ThrowsWhenDpopRequiredButTokenTypeIsBearer() + { + var runtimeOptions = CreateRuntimeOptions(requireDpop: true); + + var tokenClient = new StubTokenClient(); + tokenClient.EnqueueToken(new StellaOpsTokenResult( + "token", + "Bearer", + DateTimeOffset.UtcNow + TimeSpan.FromMinutes(5), + new[] { "aud:scanner" })); + + var provider = CreateProvider(runtimeOptions, tokenClient, new TestTimeProvider(DateTimeOffset.UtcNow)); + + await Assert.ThrowsAsync(() => provider.GetAsync("scanner").AsTask()); + } + + private static ZastavaRuntimeOptions 
CreateRuntimeOptions( + double refreshSkewSeconds = 60, + bool allowFallback = false, + string? staticToken = null, + bool requireDpop = true) + => new() + { + Tenant = "tenant-x", + Environment = "test", + Component = "observer", + Authority = new ZastavaAuthorityOptions + { + Issuer = new Uri("https://authority.internal"), + ClientId = "zastava-runtime", + Audience = new[] { "scanner" }, + Scopes = new[] { "api:scanner.runtime.write" }, + RefreshSkewSeconds = refreshSkewSeconds, + RequireDpop = requireDpop, + RequireMutualTls = true, + AllowStaticTokenFallback = allowFallback, + StaticTokenValue = staticToken + } + }; + + private static ZastavaAuthorityTokenProvider CreateProvider( + ZastavaRuntimeOptions runtimeOptions, + IStellaOpsTokenClient tokenClient, + TimeProvider timeProvider) + { + var optionsMonitor = new StaticOptionsMonitor(runtimeOptions); + var scopeBuilder = new ZastavaLogScopeBuilder(Options.Create(runtimeOptions)); + return new ZastavaAuthorityTokenProvider( + tokenClient, + optionsMonitor, + scopeBuilder, + timeProvider, + NullLogger.Instance); + } + + private sealed class StubTokenClient : IStellaOpsTokenClient + { + private readonly Queue>> responses = new(); + private Exception? failure; + + public int RequestCount { get; private set; } + + public void EnqueueToken(StellaOpsTokenResult result) + => responses.Enqueue(_ => Task.FromResult(result)); + + public void FailWith(Exception exception) + => failure = exception; + + public Task RequestClientCredentialsTokenAsync(string? scope = null, CancellationToken cancellationToken = default) + { + RequestCount++; + + if (failure is not null) + { + throw failure; + } + + if (responses.TryDequeue(out var factory)) + { + return factory(cancellationToken); + } + + throw new InvalidOperationException("No token responses queued."); + } + + public Task RequestPasswordTokenAsync(string username, string password, string? 
scope = null, CancellationToken cancellationToken = default) + => throw new NotImplementedException(); + + public Task GetJsonWebKeySetAsync(CancellationToken cancellationToken = default) + => throw new NotImplementedException(); + + public ValueTask GetCachedTokenAsync(string key, CancellationToken cancellationToken = default) + => ValueTask.FromResult(null); + + public ValueTask CacheTokenAsync(string key, StellaOpsTokenCacheEntry entry, CancellationToken cancellationToken = default) + => ValueTask.CompletedTask; + + public ValueTask ClearCachedTokenAsync(string key, CancellationToken cancellationToken = default) + => ValueTask.CompletedTask; + } + + private sealed class StaticOptionsMonitor : IOptionsMonitor + { + public StaticOptionsMonitor(T value) + { + CurrentValue = value; + } + + public T CurrentValue { get; } + + public T Get(string? name) => CurrentValue; + + public IDisposable OnChange(Action listener) => NullDisposable.Instance; + + private sealed class NullDisposable : IDisposable + { + public static readonly NullDisposable Instance = new(); + public void Dispose() + { + } + } + } + + private sealed class TestTimeProvider : TimeProvider + { + private DateTimeOffset current; + + public TestTimeProvider(DateTimeOffset initial) + { + current = initial; + } + + public override DateTimeOffset GetUtcNow() => current; + + public void Advance(TimeSpan delta) + { + current = current.Add(delta); + } + } +} diff --git a/src/StellaOps.Zastava.Core.Tests/Serialization/ZastavaCanonicalJsonSerializerTests.cs b/src/StellaOps.Zastava.Core.Tests/Serialization/ZastavaCanonicalJsonSerializerTests.cs index 0530841a..0336457c 100644 --- a/src/StellaOps.Zastava.Core.Tests/Serialization/ZastavaCanonicalJsonSerializerTests.cs +++ b/src/StellaOps.Zastava.Core.Tests/Serialization/ZastavaCanonicalJsonSerializerTests.cs @@ -1,3 +1,4 @@ +using System; using System.Text; using System.Security.Cryptography; using StellaOps.Zastava.Core.Contracts; @@ -163,43 +164,32 @@ public sealed 
class ZastavaCanonicalJsonSerializerTests [Fact] public void ComputeMultihash_ProducesStableBase64UrlDigest() { - var decision = AdmissionDecisionEnvelope.Create( - new AdmissionDecision - { - AdmissionId = "admission-123", - Namespace = "payments", - PodSpecDigest = "sha256:deadbeef", - Images = new[] - { - new AdmissionImageVerdict - { - Name = "ghcr.io/acme/api:1.2.3", - Resolved = "ghcr.io/acme/api@sha256:abcd", - Signed = true, - HasSbomReferrers = true, - PolicyVerdict = PolicyVerdict.Pass, - Reasons = Array.Empty(), - Rekor = new AdmissionRekorEvidence - { - Uuid = "xyz", - Verified = true - } - } - }, - Decision = AdmissionDecisionOutcome.Allow, - TtlSeconds = 300 - }, - ZastavaContractVersions.AdmissionDecision); - - var canonicalJson = ZastavaCanonicalJsonSerializer.Serialize(decision); - var expectedDigestBytes = SHA256.HashData(Encoding.UTF8.GetBytes(canonicalJson)); + var payloadBytes = Encoding.UTF8.GetBytes("{\"value\":42}"); + var expectedDigestBytes = SHA256.HashData(payloadBytes); var expected = $"sha256-{Convert.ToBase64String(expectedDigestBytes).TrimEnd('=').Replace('+', '-').Replace('/', '_')}"; - var hash = ZastavaHashing.ComputeMultihash(decision); + var hash = ZastavaHashing.ComputeMultihash(new ReadOnlySpan(payloadBytes)); Assert.Equal(expected, hash); - var sha512 = ZastavaHashing.ComputeMultihash(Encoding.UTF8.GetBytes(canonicalJson), "sha512"); + var sha512 = ZastavaHashing.ComputeMultihash(new ReadOnlySpan(payloadBytes), "sha512"); Assert.StartsWith("sha512-", sha512, StringComparison.Ordinal); } + + [Fact] + public void ComputeMultihash_NormalizesAlgorithmAliases() + { + var bytes = Encoding.UTF8.GetBytes("sample"); + var digestDefault = ZastavaHashing.ComputeMultihash(new ReadOnlySpan(bytes)); + var digestAlias = ZastavaHashing.ComputeMultihash(new ReadOnlySpan(bytes), "sha-256"); + + Assert.Equal(digestDefault, digestAlias); + } + + [Fact] + public void ComputeMultihash_UnknownAlgorithm_Throws() + { + var ex = Assert.Throws(() => 
ZastavaHashing.ComputeMultihash(new ReadOnlySpan(Array.Empty()), "unsupported")); + Assert.Contains("unsupported", ex.Message, StringComparison.OrdinalIgnoreCase); + } } diff --git a/src/StellaOps.Zastava.Core/Configuration/ZastavaAuthorityOptions.cs b/src/StellaOps.Zastava.Core/Configuration/ZastavaAuthorityOptions.cs new file mode 100644 index 00000000..ab08a7fc --- /dev/null +++ b/src/StellaOps.Zastava.Core/Configuration/ZastavaAuthorityOptions.cs @@ -0,0 +1,68 @@ +using System.ComponentModel.DataAnnotations; + +namespace StellaOps.Zastava.Core.Configuration; + +/// +/// Authority client configuration shared by Zastava runtime components. +/// +public sealed class ZastavaAuthorityOptions +{ + /// + /// Authority issuer URL. + /// + [Required] + public Uri Issuer { get; set; } = new("https://authority.internal"); + + /// + /// OAuth client identifier used by runtime services. + /// + [Required(AllowEmptyStrings = false)] + public string ClientId { get; set; } = "zastava-runtime"; + + /// + /// Optional client secret when using confidential clients. + /// + public string? ClientSecret { get; set; } + + /// + /// Audience claims required on issued tokens. + /// + [MinLength(1)] + public string[] Audience { get; set; } = new[] { "scanner" }; + + /// + /// Additional scopes requested for the runtime plane. + /// + public string[] Scopes { get; set; } = Array.Empty(); + + /// + /// Seconds before expiry when a cached token should be refreshed. + /// + [Range(typeof(double), "0", "3600")] + public double RefreshSkewSeconds { get; set; } = 120; + + /// + /// Require the Authority to issue DPoP (proof-of-possession) tokens. + /// + public bool RequireDpop { get; set; } = true; + + /// + /// Require the Authority client to present mTLS during token acquisition. + /// + public bool RequireMutualTls { get; set; } = true; + + /// + /// Allow falling back to static tokens when Authority is unavailable. 
+ /// + public bool AllowStaticTokenFallback { get; set; } + + /// + /// Optional path to a static fallback token (PEM/plain text). + /// + public string? StaticTokenPath { get; set; } + + /// + /// Optional literal static token (test/bootstrap only). Takes precedence over . + /// + public string? StaticTokenValue { get; set; } +} diff --git a/src/StellaOps.Zastava.Core/Configuration/ZastavaRuntimeOptions.cs b/src/StellaOps.Zastava.Core/Configuration/ZastavaRuntimeOptions.cs new file mode 100644 index 00000000..5105d42b --- /dev/null +++ b/src/StellaOps.Zastava.Core/Configuration/ZastavaRuntimeOptions.cs @@ -0,0 +1,84 @@ +using System.ComponentModel.DataAnnotations; + +namespace StellaOps.Zastava.Core.Configuration; + +/// +/// Common runtime configuration shared by Zastava components (observer, webhook, agent). +/// +public sealed class ZastavaRuntimeOptions +{ + public const string SectionName = "zastava:runtime"; + + /// + /// Tenant identifier used for scoping logs and metrics. + /// + [Required(AllowEmptyStrings = false)] + public string Tenant { get; set; } = "default"; + + /// + /// Deployment environment (prod, staging, etc.) used in telemetry dimensions. + /// + [Required(AllowEmptyStrings = false)] + public string Environment { get; set; } = "local"; + + /// + /// Component name (observer/webhook/agent) injected into scopes and metrics. + /// + public string? Component { get; set; } + + /// + /// Optional deployment identifier (cluster, region, etc.). + /// + public string? Deployment { get; set; } + + [Required] + public ZastavaRuntimeLoggingOptions Logging { get; set; } = new(); + + [Required] + public ZastavaRuntimeMetricsOptions Metrics { get; set; } = new(); + + [Required] + public ZastavaAuthorityOptions Authority { get; set; } = new(); +} + +public sealed class ZastavaRuntimeLoggingOptions +{ + /// + /// Whether scopes should be enabled on the logger factory. 
+ /// + public bool IncludeScopes { get; init; } = true; + + /// + /// Whether activity tracking metadata (TraceId/SpanId) should be captured. + /// + public bool IncludeActivityTracking { get; init; } = true; + + /// + /// Optional static key/value pairs appended to every log scope. + /// + public IDictionary StaticScope { get; init; } = new Dictionary(StringComparer.Ordinal); +} + +public sealed class ZastavaRuntimeMetricsOptions +{ + /// + /// Enables metrics emission. + /// + public bool Enabled { get; init; } = true; + + /// + /// Meter name used for all runtime instrumentation. + /// + [Required(AllowEmptyStrings = false)] + public string MeterName { get; init; } = "StellaOps.Zastava"; + + /// + /// Optional meter semantic version. + /// + public string? MeterVersion { get; init; } = "1.0.0"; + + /// + /// Common dimensions attached to every metric emitted by the runtime plane. + /// + public IDictionary CommonTags { get; init; } = new Dictionary(StringComparer.Ordinal); +} diff --git a/src/StellaOps.Zastava.Core/DependencyInjection/ZastavaServiceCollectionExtensions.cs b/src/StellaOps.Zastava.Core/DependencyInjection/ZastavaServiceCollectionExtensions.cs new file mode 100644 index 00000000..3733c78e --- /dev/null +++ b/src/StellaOps.Zastava.Core/DependencyInjection/ZastavaServiceCollectionExtensions.cs @@ -0,0 +1,98 @@ +using System.Collections.Generic; +using System.Linq; +using Microsoft.Extensions.Configuration; +using Microsoft.Extensions.DependencyInjection.Extensions; +using Microsoft.Extensions.Logging; +using Microsoft.Extensions.Options; +using StellaOps.Auth.Client; +using StellaOps.Zastava.Core.Configuration; +using StellaOps.Zastava.Core.Diagnostics; +using StellaOps.Zastava.Core.Security; + +namespace Microsoft.Extensions.DependencyInjection; + +public static class ZastavaServiceCollectionExtensions +{ + public static IServiceCollection AddZastavaRuntimeCore( + this IServiceCollection services, + IConfiguration configuration, + string 
componentName) + { + ArgumentNullException.ThrowIfNull(services); + ArgumentNullException.ThrowIfNull(configuration); + if (string.IsNullOrWhiteSpace(componentName)) + { + throw new ArgumentException("Component name is required.", nameof(componentName)); + } + + services.AddOptions() + .Bind(configuration.GetSection(ZastavaRuntimeOptions.SectionName)) + .ValidateDataAnnotations() + .Validate(static options => !string.IsNullOrWhiteSpace(options.Tenant), "Tenant is required.") + .Validate(static options => !string.IsNullOrWhiteSpace(options.Environment), "Environment is required.") + .PostConfigure(options => + { + if (string.IsNullOrWhiteSpace(options.Component)) + { + options.Component = componentName; + } + }) + .ValidateOnStart(); + + services.TryAddEnumerable(ServiceDescriptor.Singleton, ZastavaLoggerFactoryOptionsConfigurator>()); + services.TryAddSingleton(); + services.TryAddSingleton(); + ConfigureAuthorityServices(services, configuration); + services.TryAddSingleton(); + + return services; + } + + private static void ConfigureAuthorityServices(IServiceCollection services, IConfiguration configuration) + { + var authoritySection = configuration.GetSection($"{ZastavaRuntimeOptions.SectionName}:authority"); + var authorityOptions = new ZastavaAuthorityOptions(); + authoritySection.Bind(authorityOptions); + + services.AddStellaOpsAuthClient(options => + { + options.Authority = authorityOptions.Issuer.ToString(); + options.ClientId = authorityOptions.ClientId; + options.ClientSecret = authorityOptions.ClientSecret; + options.AllowOfflineCacheFallback = authorityOptions.AllowStaticTokenFallback; + options.ExpirationSkew = TimeSpan.FromSeconds(Math.Clamp(authorityOptions.RefreshSkewSeconds, 0, 300)); + + options.DefaultScopes.Clear(); + var normalized = new SortedSet(StringComparer.Ordinal); + + if (authorityOptions.Audience is not null) + { + foreach (var audience in authorityOptions.Audience) + { + if (string.IsNullOrWhiteSpace(audience)) + { + continue; + } + + 
normalized.Add($"aud:{audience.Trim().ToLowerInvariant()}"); + } + } + + if (authorityOptions.Scopes is not null) + { + foreach (var scope in authorityOptions.Scopes) + { + if (!string.IsNullOrWhiteSpace(scope)) + { + normalized.Add(scope.Trim()); + } + } + } + + foreach (var scope in normalized) + { + options.DefaultScopes.Add(scope); + } + }); + } +} diff --git a/src/StellaOps.Zastava.Core/Diagnostics/ZastavaLogScopeBuilder.cs b/src/StellaOps.Zastava.Core/Diagnostics/ZastavaLogScopeBuilder.cs new file mode 100644 index 00000000..f2ca20b8 --- /dev/null +++ b/src/StellaOps.Zastava.Core/Diagnostics/ZastavaLogScopeBuilder.cs @@ -0,0 +1,90 @@ +using System.Linq; +using Microsoft.Extensions.Options; +using StellaOps.Zastava.Core.Configuration; + +namespace StellaOps.Zastava.Core.Diagnostics; + +public interface IZastavaLogScopeBuilder +{ + /// + /// Builds a deterministic logging scope containing tenant/component metadata. + /// + IReadOnlyDictionary BuildScope( + string? correlationId = null, + string? node = null, + string? workload = null, + string? eventId = null, + IReadOnlyDictionary? additional = null); +} + +internal sealed class ZastavaLogScopeBuilder : IZastavaLogScopeBuilder +{ + private readonly ZastavaRuntimeOptions options; + private readonly IReadOnlyDictionary staticScope; + + public ZastavaLogScopeBuilder(IOptions options) + { + ArgumentNullException.ThrowIfNull(options); + this.options = options.Value; + staticScope = (this.options.Logging.StaticScope ?? new Dictionary(StringComparer.Ordinal)) + .ToImmutableDictionary(pair => pair.Key, pair => pair.Value, StringComparer.Ordinal); + } + + public IReadOnlyDictionary BuildScope( + string? correlationId = null, + string? node = null, + string? workload = null, + string? eventId = null, + IReadOnlyDictionary? 
additional = null) + { + var scope = new Dictionary(StringComparer.Ordinal) + { + ["tenant"] = options.Tenant, + ["component"] = options.Component, + ["environment"] = options.Environment + }; + + if (!string.IsNullOrWhiteSpace(options.Deployment)) + { + scope["deployment"] = options.Deployment; + } + + foreach (var pair in staticScope) + { + scope[pair.Key] = pair.Value; + } + + if (!string.IsNullOrWhiteSpace(correlationId)) + { + scope["correlationId"] = correlationId; + } + + if (!string.IsNullOrWhiteSpace(node)) + { + scope["node"] = node; + } + + if (!string.IsNullOrWhiteSpace(workload)) + { + scope["workload"] = workload; + } + + if (!string.IsNullOrWhiteSpace(eventId)) + { + scope["eventId"] = eventId; + } + + if (additional is not null) + { + foreach (var pair in additional) + { + if (!string.IsNullOrWhiteSpace(pair.Key)) + { + scope[pair.Key] = pair.Value; + } + } + } + + return scope.ToImmutableDictionary(StringComparer.Ordinal); + } +} diff --git a/src/StellaOps.Zastava.Core/Diagnostics/ZastavaLoggerFactoryOptionsConfigurator.cs b/src/StellaOps.Zastava.Core/Diagnostics/ZastavaLoggerFactoryOptionsConfigurator.cs new file mode 100644 index 00000000..5255d968 --- /dev/null +++ b/src/StellaOps.Zastava.Core/Diagnostics/ZastavaLoggerFactoryOptionsConfigurator.cs @@ -0,0 +1,30 @@ +using Microsoft.Extensions.Logging; +using Microsoft.Extensions.Options; +using StellaOps.Zastava.Core.Configuration; + +namespace StellaOps.Zastava.Core.Diagnostics; + +internal sealed class ZastavaLoggerFactoryOptionsConfigurator : IConfigureOptions +{ + private readonly IOptions options; + + public ZastavaLoggerFactoryOptionsConfigurator(IOptions options) + { + ArgumentNullException.ThrowIfNull(options); + this.options = options; + } + + public void Configure(LoggerFactoryOptions options) + { + ArgumentNullException.ThrowIfNull(options); + var runtimeOptions = this.options.Value; + if (runtimeOptions.Logging.IncludeActivityTracking) + { + options.ActivityTrackingOptions |= 
+                ActivityTrackingOptions.TraceId | ActivityTrackingOptions.SpanId | ActivityTrackingOptions.ParentId;
+        }
+        else if (runtimeOptions.Logging.IncludeScopes)
+        {
+            options.ActivityTrackingOptions |= ActivityTrackingOptions.TraceId | ActivityTrackingOptions.SpanId;
+        }
+    }
+}
diff --git a/src/StellaOps.Zastava.Core/Diagnostics/ZastavaRuntimeMetrics.cs b/src/StellaOps.Zastava.Core/Diagnostics/ZastavaRuntimeMetrics.cs
new file mode 100644
index 00000000..a485e049
--- /dev/null
+++ b/src/StellaOps.Zastava.Core/Diagnostics/ZastavaRuntimeMetrics.cs
@@ -0,0 +1,78 @@
+using System.Linq;
+using Microsoft.Extensions.Options;
+using StellaOps.Zastava.Core.Configuration;
+
+namespace StellaOps.Zastava.Core.Diagnostics;
+
+/// <summary>
+/// Shared meter and instrument surface used by Observer/Webhook runtime components.
+/// </summary>
+public interface IZastavaRuntimeMetrics : IDisposable
+{
+    Meter Meter { get; }
+    Counter<long> RuntimeEvents { get; }
+    Counter<long> AdmissionDecisions { get; }
+    Histogram<double> BackendLatencyMs { get; }
+    IReadOnlyList<KeyValuePair<string, object?>> DefaultTags { get; }
+}
+
+internal sealed class ZastavaRuntimeMetrics : IZastavaRuntimeMetrics
+{
+    private readonly Meter meter;
+    private readonly IReadOnlyList<KeyValuePair<string, object?>> defaultTags;
+
+    public ZastavaRuntimeMetrics(IOptions<ZastavaRuntimeOptions> options)
+    {
+        ArgumentNullException.ThrowIfNull(options);
+        var runtimeOptions = options.Value;
+        var metrics = runtimeOptions.Metrics ?? new ZastavaRuntimeMetricsOptions();
+
+        meter = new Meter(metrics.MeterName, metrics.MeterVersion);
+
+        RuntimeEvents = meter.CreateCounter<long>("zastava.runtime.events.total", unit: "1", description: "Total runtime events emitted by observers.");
+        AdmissionDecisions = meter.CreateCounter<long>("zastava.admission.decisions.total", unit: "1", description: "Total admission decisions returned by the webhook.");
+        BackendLatencyMs = meter.CreateHistogram<double>("zastava.runtime.backend.latency.ms", unit: "ms", description: "Round-trip latency to Scanner backend APIs.");
+
+        // Baseline tags attached to every measurement for tenant/component attribution.
+        var baseline = new List<KeyValuePair<string, object?>>
+        {
+            new("tenant", runtimeOptions.Tenant),
+            new("component", runtimeOptions.Component),
+            new("environment", runtimeOptions.Environment)
+        };
+
+        if (!string.IsNullOrWhiteSpace(runtimeOptions.Deployment))
+        {
+            baseline.Add(new("deployment", runtimeOptions.Deployment));
+        }
+
+        if (metrics.CommonTags is not null)
+        {
+            foreach (var pair in metrics.CommonTags)
+            {
+                if (!string.IsNullOrWhiteSpace(pair.Key))
+                {
+                    baseline.Add(new(pair.Key, pair.Value));
+                }
+            }
+        }
+
+        defaultTags = baseline.ToImmutableArray();
+    }
+
+    public Meter Meter => meter;
+
+    public Counter<long> RuntimeEvents { get; }
+
+    public Counter<long> AdmissionDecisions { get; }
+
+    public Histogram<double> BackendLatencyMs { get; }
+
+    public IReadOnlyList<KeyValuePair<string, object?>> DefaultTags => defaultTags;
+
+    public void Dispose()
+    {
+        // The Meter is created unconditionally in the constructor, so it must be
+        // disposed unconditionally. Gating disposal on metrics.Enabled (as before)
+        // leaked the Meter and its instruments whenever metrics were disabled.
+        meter.Dispose();
+    }
+}
diff --git a/src/StellaOps.Zastava.Core/Properties/AssemblyInfo.cs b/src/StellaOps.Zastava.Core/Properties/AssemblyInfo.cs
new file mode 100644
index 00000000..c12e3581
--- /dev/null
+++ b/src/StellaOps.Zastava.Core/Properties/AssemblyInfo.cs
@@ -0,0 +1,3 @@
+using System.Runtime.CompilerServices;
+
+[assembly: InternalsVisibleTo("StellaOps.Zastava.Core.Tests")]
diff --git a/src/StellaOps.Zastava.Core/Security/IZastavaAuthorityTokenProvider.cs b/src/StellaOps.Zastava.Core/Security/IZastavaAuthorityTokenProvider.cs
new file mode 100644
index
00000000..c7504daa
--- /dev/null
+++ b/src/StellaOps.Zastava.Core/Security/IZastavaAuthorityTokenProvider.cs
@@ -0,0 +1,14 @@
+namespace StellaOps.Zastava.Core.Security;
+
+/// <summary>
+/// Acquires and invalidates cached Authority operational tokens (OpToks) for runtime services.
+/// </summary>
+public interface IZastavaAuthorityTokenProvider
+{
+    ValueTask<ZastavaOperationalToken> GetAsync(
+        string audience,
+        IEnumerable<string>? additionalScopes = null,
+        CancellationToken cancellationToken = default);
+
+    ValueTask InvalidateAsync(
+        string audience,
+        IEnumerable<string>? additionalScopes = null,
+        CancellationToken cancellationToken = default);
+}
diff --git a/src/StellaOps.Zastava.Core/Security/ZastavaAuthorityTokenProvider.cs b/src/StellaOps.Zastava.Core/Security/ZastavaAuthorityTokenProvider.cs
new file mode 100644
index 00000000..f7c39f97
--- /dev/null
+++ b/src/StellaOps.Zastava.Core/Security/ZastavaAuthorityTokenProvider.cs
@@ -0,0 +1,314 @@
+using System.Collections.Concurrent;
+using System.Globalization;
+using System.IO;
+using Microsoft.Extensions.Logging;
+using Microsoft.Extensions.Logging.Abstractions;
+using Microsoft.Extensions.Options;
+using StellaOps.Auth.Client;
+using StellaOps.Zastava.Core.Configuration;
+using StellaOps.Zastava.Core.Diagnostics;
+
+namespace StellaOps.Zastava.Core.Security;
+
+internal sealed class ZastavaAuthorityTokenProvider : IZastavaAuthorityTokenProvider
+{
+    private readonly IStellaOpsTokenClient tokenClient;
+    private readonly IOptionsMonitor<ZastavaRuntimeOptions> optionsMonitor;
+    private readonly IZastavaLogScopeBuilder scopeBuilder;
+    private readonly TimeProvider timeProvider;
+    private readonly ILogger<ZastavaAuthorityTokenProvider> logger;
+
+    // Per-(audience, scopes) token cache; keys are SHA-256 hex digests, so ordinal comparison is safe.
+    private readonly ConcurrentDictionary<string, CacheEntry> cache = new(StringComparer.Ordinal);
+    private readonly ConcurrentDictionary<string, SemaphoreSlim> locks = new(StringComparer.Ordinal);
+    private readonly object guardrailLock = new();
+    private bool guardrailsLogged;
+    private ZastavaOperationalToken? staticFallbackToken;
+
+    public ZastavaAuthorityTokenProvider(
+        IStellaOpsTokenClient tokenClient,
+        IOptionsMonitor<ZastavaRuntimeOptions> optionsMonitor,
+        IZastavaLogScopeBuilder scopeBuilder,
+        TimeProvider?
timeProvider = null, + ILogger? logger = null) + { + this.tokenClient = tokenClient ?? throw new ArgumentNullException(nameof(tokenClient)); + this.optionsMonitor = optionsMonitor ?? throw new ArgumentNullException(nameof(optionsMonitor)); + this.scopeBuilder = scopeBuilder ?? throw new ArgumentNullException(nameof(scopeBuilder)); + this.timeProvider = timeProvider ?? TimeProvider.System; + this.logger = logger ?? NullLogger.Instance; + } + + public async ValueTask GetAsync( + string audience, + IEnumerable? additionalScopes = null, + CancellationToken cancellationToken = default) + { + ArgumentException.ThrowIfNullOrWhiteSpace(audience); + + var options = optionsMonitor.CurrentValue.Authority; + EnsureGuardrails(options); + + if (options.AllowStaticTokenFallback && TryGetStaticToken(options) is { } staticToken) + { + return staticToken; + } + + var normalizedAudience = NormalizeAudience(audience); + var normalizedScopes = BuildScopes(options, normalizedAudience, additionalScopes); + var cacheKey = BuildCacheKey(normalizedAudience, normalizedScopes); + var refreshSkew = GetRefreshSkew(options); + + if (cache.TryGetValue(cacheKey, out var cached) && !cached.Token.IsExpired(timeProvider, refreshSkew)) + { + return cached.Token; + } + + var mutex = locks.GetOrAdd(cacheKey, static _ => new SemaphoreSlim(1, 1)); + await mutex.WaitAsync(cancellationToken).ConfigureAwait(false); + + try + { + if (cache.TryGetValue(cacheKey, out cached) && !cached.Token.IsExpired(timeProvider, refreshSkew)) + { + return cached.Token; + } + + var scopeString = string.Join(' ', normalizedScopes); + var tokenResult = await tokenClient.RequestClientCredentialsTokenAsync(scopeString, cancellationToken).ConfigureAwait(false); + ValidateToken(tokenResult, options, normalizedAudience); + + var token = ZastavaOperationalToken.FromResult( + tokenResult.AccessToken, + tokenResult.TokenType, + tokenResult.ExpiresAtUtc, + tokenResult.Scopes); + + cache[cacheKey] = new CacheEntry(token); + + var scope = 
scopeBuilder.BuildScope( + correlationId: null, + node: null, + workload: null, + eventId: "authority.token.issue", + additional: new Dictionary + { + ["audience"] = normalizedAudience, + ["expiresAt"] = token.ExpiresAtUtc?.ToString("O", CultureInfo.InvariantCulture) ?? "static", + ["scopes"] = scopeString + }); + + using (logger.BeginScope(scope)) + { + logger.LogInformation("Issued runtime OpTok for {Audience} (scopes: {Scopes}).", normalizedAudience, scopeString); + } + + return token; + } + catch (Exception ex) when (options.AllowStaticTokenFallback && TryGetStaticToken(options) is { } fallback) + { + var scope = scopeBuilder.BuildScope( + eventId: "authority.token.fallback", + additional: new Dictionary + { + ["audience"] = audience + }); + + using (logger.BeginScope(scope)) + { + logger.LogWarning(ex, "Authority token acquisition failed; using static fallback token."); + } + + return fallback; + } + finally + { + mutex.Release(); + } + } + + public ValueTask InvalidateAsync( + string audience, + IEnumerable? 
additionalScopes = null, + CancellationToken cancellationToken = default) + { + ArgumentException.ThrowIfNullOrWhiteSpace(audience); + + var normalizedAudience = NormalizeAudience(audience); + var normalizedScopes = BuildScopes(optionsMonitor.CurrentValue.Authority, normalizedAudience, additionalScopes); + var cacheKey = BuildCacheKey(normalizedAudience, normalizedScopes); + + cache.TryRemove(cacheKey, out _); + if (locks.TryRemove(cacheKey, out var mutex)) + { + mutex.Dispose(); + } + + var scope = scopeBuilder.BuildScope( + eventId: "authority.token.invalidate", + additional: new Dictionary + { + ["audience"] = normalizedAudience, + ["cacheKey"] = cacheKey + }); + + using (logger.BeginScope(scope)) + { + logger.LogInformation("Invalidated runtime OpTok cache entry."); + } + + return ValueTask.CompletedTask; + } + + private void EnsureGuardrails(ZastavaAuthorityOptions options) + { + if (guardrailsLogged) + { + return; + } + + lock (guardrailLock) + { + if (guardrailsLogged) + { + return; + } + + var scope = scopeBuilder.BuildScope(eventId: "authority.guardrails"); + using (logger.BeginScope(scope)) + { + if (!options.RequireMutualTls) + { + logger.LogWarning("Mutual TLS requirement disabled for Authority token acquisition. This should only be used in controlled test environments."); + } + + if (!options.RequireDpop) + { + logger.LogWarning("DPoP requirement disabled for runtime plane. Tokens will be issued without proof-of-possession."); + } + + if (options.AllowStaticTokenFallback) + { + logger.LogWarning("Static Authority token fallback enabled. Ensure bootstrap tokens are rotated frequently."); + } + } + + guardrailsLogged = true; + } + } + + private ZastavaOperationalToken? 
TryGetStaticToken(ZastavaAuthorityOptions options) + { + if (!options.AllowStaticTokenFallback) + { + return null; + } + + if (options.StaticTokenValue is null && options.StaticTokenPath is null) + { + return null; + } + + if (staticFallbackToken is { } cached) + { + return cached; + } + + lock (guardrailLock) + { + if (staticFallbackToken is { } existing) + { + return existing; + } + + var tokenValue = options.StaticTokenValue; + if (string.IsNullOrWhiteSpace(tokenValue) && !string.IsNullOrWhiteSpace(options.StaticTokenPath)) + { + if (!File.Exists(options.StaticTokenPath)) + { + throw new FileNotFoundException("Static Authority token file not found.", options.StaticTokenPath); + } + + tokenValue = File.ReadAllText(options.StaticTokenPath); + } + + if (string.IsNullOrWhiteSpace(tokenValue)) + { + throw new InvalidOperationException("Static Authority token fallback is enabled but no token value/path is configured."); + } + + staticFallbackToken = ZastavaOperationalToken.FromResult( + tokenValue.Trim(), + tokenType: "Bearer", + expiresAtUtc: null, + scopes: Array.Empty()); + + return staticFallbackToken; + } + } + + private void ValidateToken(StellaOpsTokenResult tokenResult, ZastavaAuthorityOptions options, string normalizedAudience) + { + if (options.RequireDpop && !string.Equals(tokenResult.TokenType, "DPoP", StringComparison.OrdinalIgnoreCase)) + { + throw new InvalidOperationException("Authority returned a token without DPoP token type while RequireDpop is enabled."); + } + + if (tokenResult.Scopes is not null) + { + var audienceScope = $"aud:{normalizedAudience}"; + if (!tokenResult.Scopes.Contains(audienceScope, StringComparer.OrdinalIgnoreCase)) + { + throw new InvalidOperationException($"Authority token missing required audience scope '{audienceScope}'."); + } + } + } + + private static string NormalizeAudience(string audience) + => audience.Trim().ToLowerInvariant(); + + private static IReadOnlyList BuildScopes( + ZastavaAuthorityOptions options, + string 
normalizedAudience, + IEnumerable? additionalScopes) + { + var scopeSet = new SortedSet(StringComparer.Ordinal) + { + $"aud:{normalizedAudience}" + }; + + if (options.Scopes is not null) + { + foreach (var scope in options.Scopes) + { + if (!string.IsNullOrWhiteSpace(scope)) + { + scopeSet.Add(scope.Trim()); + } + } + } + + if (additionalScopes is not null) + { + foreach (var scope in additionalScopes) + { + if (!string.IsNullOrWhiteSpace(scope)) + { + scopeSet.Add(scope.Trim()); + } + } + } + + return scopeSet.ToArray(); + } + + private static string BuildCacheKey(string audience, IReadOnlyList scopes) + => Convert.ToHexString(SHA256.HashData(Encoding.UTF8.GetBytes($"{audience}|{string.Join(' ', scopes)}"))); + + private static TimeSpan GetRefreshSkew(ZastavaAuthorityOptions options) + { + var seconds = Math.Clamp(options.RefreshSkewSeconds, 0, 3600); + return TimeSpan.FromSeconds(seconds); + } + + private readonly record struct CacheEntry(ZastavaOperationalToken Token); +} diff --git a/src/StellaOps.Zastava.Core/Security/ZastavaOperationalToken.cs b/src/StellaOps.Zastava.Core/Security/ZastavaOperationalToken.cs new file mode 100644 index 00000000..efd90767 --- /dev/null +++ b/src/StellaOps.Zastava.Core/Security/ZastavaOperationalToken.cs @@ -0,0 +1,70 @@ +using System.Collections.ObjectModel; +using System.Linq; + +namespace StellaOps.Zastava.Core.Security; + +public readonly record struct ZastavaOperationalToken( + string AccessToken, + string TokenType, + DateTimeOffset? ExpiresAtUtc, + IReadOnlyList Scopes) +{ + public bool IsExpired(TimeProvider timeProvider, TimeSpan refreshSkew) + { + ArgumentNullException.ThrowIfNull(timeProvider); + + if (ExpiresAtUtc is null) + { + return false; + } + + return timeProvider.GetUtcNow() >= ExpiresAtUtc.Value - refreshSkew; + } + + public static ZastavaOperationalToken FromResult( + string accessToken, + string tokenType, + DateTimeOffset? 
expiresAtUtc, + IEnumerable scopes) + { + ArgumentException.ThrowIfNullOrWhiteSpace(accessToken); + ArgumentException.ThrowIfNullOrWhiteSpace(tokenType); + + IReadOnlyList normalized = scopes switch + { + null => Array.Empty(), + IReadOnlyList readOnly => readOnly.Count == 0 ? Array.Empty() : readOnly, + ICollection collection => NormalizeCollection(collection), + _ => NormalizeEnumerable(scopes) + }; + + return new ZastavaOperationalToken( + accessToken, + tokenType, + expiresAtUtc, + normalized); + } + + private static IReadOnlyList NormalizeCollection(ICollection collection) + { + if (collection.Count == 0) + { + return Array.Empty(); + } + + if (collection is IReadOnlyList readOnly) + { + return readOnly; + } + + var buffer = new string[collection.Count]; + collection.CopyTo(buffer, 0); + return new ReadOnlyCollection(buffer); + } + + private static IReadOnlyList NormalizeEnumerable(IEnumerable scopes) + { + var buffer = scopes.ToArray(); + return buffer.Length == 0 ? Array.Empty() : new ReadOnlyCollection(buffer); + } +} diff --git a/src/StellaOps.Zastava.Core/TASKS.md b/src/StellaOps.Zastava.Core/TASKS.md index 431e7aab..f2146353 100644 --- a/src/StellaOps.Zastava.Core/TASKS.md +++ b/src/StellaOps.Zastava.Core/TASKS.md @@ -2,9 +2,9 @@ | ID | Status | Owner(s) | Depends on | Description | Exit Criteria | |----|--------|----------|------------|-------------|---------------| -| ZASTAVA-CORE-12-201 | DOING (2025-10-19) | Zastava Core Guild | — | Define runtime event/admission DTOs, hashing helpers, and versioning strategy. | DTOs cover runtime events and admission verdict envelopes with canonical JSON schema; hashing helpers accept payloads and yield deterministic multihash outputs; version negotiation rules documented and exercised by serialization tests. | -| ZASTAVA-CORE-12-202 | DOING (2025-10-19) | Zastava Core Guild | — | Provide configuration/logging/metrics utilities shared by Observer/Webhook. 
| Shared options bind from configuration with validation; logging scopes/metrics exporters registered via reusable DI extension; integration test host demonstrates Observer/Webhook consumption with deterministic instrumentation. | -| ZASTAVA-CORE-12-203 | DOING (2025-10-19) | Zastava Core Guild | — | Authority client helpers, OpTok caching, and security guardrails for runtime services. | Typed Authority client surfaces OpTok retrieval + renewal with configurable cache; guardrails enforce DPoP/mTLS expectations and emit structured audit logs; negative-path tests cover expired/invalid tokens and configuration toggles. | -| ZASTAVA-OPS-12-204 | DOING (2025-10-19) | Zastava Core Guild | — | Operational runbooks, alert rules, and dashboard exports for runtime plane. | Runbooks capture install/upgrade/rollback + incident handling; alert rules and dashboard JSON exported for Prometheus/Grafana bundle; docs reference Offline Kit packaging and verification checklist. | +| ZASTAVA-CORE-12-201 | DONE (2025-10-23) | Zastava Core Guild | — | Define runtime event/admission DTOs, hashing helpers, and versioning strategy. | DTOs cover runtime events and admission verdict envelopes with canonical JSON schema; hashing helpers accept payloads and yield deterministic multihash outputs; version negotiation rules documented and exercised by serialization tests. | +| ZASTAVA-CORE-12-202 | DONE (2025-10-23) | Zastava Core Guild | — | Provide configuration/logging/metrics utilities shared by Observer/Webhook. | Shared options bind from configuration with validation; logging scopes/metrics exporters registered via reusable DI extension; integration test host demonstrates Observer/Webhook consumption with deterministic instrumentation. | +| ZASTAVA-CORE-12-203 | DONE (2025-10-23) | Zastava Core Guild | — | Authority client helpers, OpTok caching, and security guardrails for runtime services. 
| Typed Authority client surfaces OpTok retrieval + renewal with configurable cache; guardrails enforce DPoP/mTLS expectations and emit structured audit logs; negative-path tests cover expired/invalid tokens and configuration toggles. | +| ZASTAVA-OPS-12-204 | DONE (2025-10-23) | Zastava Core Guild | — | Operational runbooks, alert rules, and dashboard exports for runtime plane. | Runbooks capture install/upgrade/rollback + incident handling; alert rules and dashboard JSON exported for Prometheus/Grafana bundle; docs reference Offline Kit packaging and verification checklist. | > Remark (2025-10-19): Prerequisites reviewed—none outstanding. ZASTAVA-CORE-12-201, ZASTAVA-CORE-12-202, ZASTAVA-CORE-12-203, and ZASTAVA-OPS-12-204 moved to DOING for Wave 0 kickoff. diff --git a/src/StellaOps.Zastava.Observer/Configuration/ZastavaObserverOptions.cs b/src/StellaOps.Zastava.Observer/Configuration/ZastavaObserverOptions.cs new file mode 100644 index 00000000..4337e9bc --- /dev/null +++ b/src/StellaOps.Zastava.Observer/Configuration/ZastavaObserverOptions.cs @@ -0,0 +1,128 @@ +using System.ComponentModel.DataAnnotations; + +namespace StellaOps.Zastava.Observer.Configuration; + +/// +/// Observer-specific configuration applied on top of the shared runtime options. +/// +public sealed class ZastavaObserverOptions +{ + public const string SectionName = "zastava:observer"; + + private const string DefaultContainerdSocket = "unix:///run/containerd/containerd.sock"; + + /// + /// Logical node identifier emitted with runtime events (defaults to environment hostname). + /// + [Required(AllowEmptyStrings = false)] + public string NodeName { get; set; } = + Environment.GetEnvironmentVariable("ZASTAVA_NODE_NAME") + ?? Environment.GetEnvironmentVariable("KUBERNETES_NODE_NAME") + ?? Environment.MachineName; + + /// + /// Baseline polling interval when watching CRI runtimes. 
+ /// + [Range(typeof(TimeSpan), "00:00:01", "00:10:00")] + public TimeSpan PollInterval { get; set; } = TimeSpan.FromSeconds(2); + + /// + /// Maximum number of runtime events held in the in-memory buffer. + /// + [Range(16, 65536)] + public int MaxInMemoryBuffer { get; set; } = 2048; + + /// + /// Number of runtime events drained in one batch by downstream publishers. + /// + [Range(1, 512)] + public int PublishBatchSize { get; set; } = 32; + + /// + /// Connectivity/backoff settings applied when CRI endpoints fail temporarily. + /// + [Required] + public ObserverBackoffOptions Backoff { get; set; } = new(); + + /// + /// CRI runtime endpoints to monitor. + /// + [Required] + public IList Runtimes { get; set; } = new List + { + new() + { + Name = "containerd", + Engine = ContainerRuntimeEngine.Containerd, + Endpoint = DefaultContainerdSocket, + Enabled = true + } + }; +} + +public sealed class ObserverBackoffOptions +{ + /// + /// Initial backoff delay applied after the first failure. + /// + [Range(typeof(TimeSpan), "00:00:01", "00:05:00")] + public TimeSpan Initial { get; set; } = TimeSpan.FromSeconds(1); + + /// + /// Maximum backoff delay after repeated failures. + /// + [Range(typeof(TimeSpan), "00:00:01", "00:10:00")] + public TimeSpan Max { get; set; } = TimeSpan.FromSeconds(30); + + /// + /// Jitter ratio applied to the computed delay (0 disables jitter). + /// + [Range(0.0, 0.5)] + public double JitterRatio { get; set; } = 0.2; +} + +public sealed class ContainerRuntimeEndpointOptions +{ + /// + /// Friendly name used for logging/metrics (defaults to engine identifier). + /// + public string? Name { get; set; } + + /// + /// Runtime engine backing the endpoint. + /// + public ContainerRuntimeEngine Engine { get; set; } = ContainerRuntimeEngine.Containerd; + + /// + /// Endpoint URI (unix:///run/containerd/containerd.sock, npipe://./pipe/dockershim, https://127.0.0.1:1234, ...). 
+ /// + [Required(AllowEmptyStrings = false)] + public string Endpoint { get; set; } = "unix:///run/containerd/containerd.sock"; + + /// + /// Optional explicit polling interval for this endpoint (falls back to global PollInterval). + /// + [Range(typeof(TimeSpan), "00:00:01", "00:10:00")] + public TimeSpan? PollInterval { get; set; } + + /// + /// Optional connection timeout override. + /// + [Range(typeof(TimeSpan), "00:00:01", "00:01:00")] + public TimeSpan? ConnectTimeout { get; set; } + + /// + /// Flag to allow disabling endpoints without removing configuration entries. + /// + public bool Enabled { get; set; } = true; + + public string ResolveName() + => string.IsNullOrWhiteSpace(Name) ? Engine.ToString().ToLowerInvariant() : Name!; +} + +public enum ContainerRuntimeEngine +{ + Containerd, + CriO, + Docker +} diff --git a/src/StellaOps.Zastava.Observer/ContainerRuntime/ContainerStateTracker.cs b/src/StellaOps.Zastava.Observer/ContainerRuntime/ContainerStateTracker.cs new file mode 100644 index 00000000..0df28f38 --- /dev/null +++ b/src/StellaOps.Zastava.Observer/ContainerRuntime/ContainerStateTracker.cs @@ -0,0 +1,134 @@ +using StellaOps.Zastava.Observer.ContainerRuntime.Cri; + +namespace StellaOps.Zastava.Observer.ContainerRuntime; + +internal sealed class ContainerStateTracker +{ + private readonly Dictionary entries = new(StringComparer.Ordinal); + + public void BeginCycle() + { + foreach (var entry in entries.Values) + { + entry.SeenInCycle = false; + } + } + + public ContainerLifecycleEvent? MarkRunning(CriContainerInfo snapshot, DateTimeOffset fallbackTimestamp) + { + ArgumentNullException.ThrowIfNull(snapshot); + var timestamp = snapshot.StartedAt ?? 
snapshot.CreatedAt; + if (timestamp <= DateTimeOffset.MinValue) + { + timestamp = fallbackTimestamp; + } + + if (!entries.TryGetValue(snapshot.Id, out var entry)) + { + entry = new ContainerStateEntry(snapshot); + entries[snapshot.Id] = entry; + entry.SeenInCycle = true; + entry.State = ContainerLifecycleState.Running; + entry.LastStart = timestamp; + entry.LastSnapshot = snapshot; + return new ContainerLifecycleEvent(ContainerLifecycleEventKind.Start, timestamp, snapshot); + } + + entry.SeenInCycle = true; + + if (timestamp > entry.LastStart) + { + entry.LastStart = timestamp; + entry.State = ContainerLifecycleState.Running; + entry.LastSnapshot = snapshot; + return new ContainerLifecycleEvent(ContainerLifecycleEventKind.Start, timestamp, snapshot); + } + + entry.State = ContainerLifecycleState.Running; + entry.LastSnapshot = snapshot; + return null; + } + + public async Task> CompleteCycleAsync( + Func> statusProvider, + DateTimeOffset fallbackTimestamp, + CancellationToken cancellationToken) + { + ArgumentNullException.ThrowIfNull(statusProvider); + + var events = new List(); + foreach (var (containerId, entry) in entries.ToArray()) + { + if (entry.SeenInCycle) + { + continue; + } + + CriContainerInfo? status = null; + if (entry.LastSnapshot is not null && entry.LastSnapshot.FinishedAt is not null) + { + status = entry.LastSnapshot; + } + else + { + status = await statusProvider(containerId).ConfigureAwait(false) ?? entry.LastSnapshot; + } + + var stopTimestamp = status?.FinishedAt ?? fallbackTimestamp; + if (stopTimestamp <= DateTimeOffset.MinValue) + { + stopTimestamp = fallbackTimestamp; + } + + if (entry.LastStop is not null && stopTimestamp <= entry.LastStop) + { + entries.Remove(containerId); + continue; + } + + var snapshot = status ?? entry.LastSnapshot ?? 
entry.MetadataFallback; + var stopEvent = new ContainerLifecycleEvent(ContainerLifecycleEventKind.Stop, stopTimestamp, snapshot); + events.Add(stopEvent); + + entry.LastStop = stopTimestamp; + entry.State = ContainerLifecycleState.Stopped; + entries.Remove(containerId); + } + + return events + .OrderBy(static e => e.Timestamp) + .ThenBy(static e => e.Snapshot.Id, StringComparer.Ordinal) + .ToArray(); + } + + private sealed class ContainerStateEntry + { + public ContainerStateEntry(CriContainerInfo seed) + { + MetadataFallback = seed; + LastSnapshot = seed; + } + + public ContainerLifecycleState State { get; set; } = ContainerLifecycleState.Unknown; + public bool SeenInCycle { get; set; } + public DateTimeOffset LastStart { get; set; } = DateTimeOffset.MinValue; + public DateTimeOffset? LastStop { get; set; } + public CriContainerInfo MetadataFallback { get; } + public CriContainerInfo? LastSnapshot { get; set; } + } +} + +internal enum ContainerLifecycleState +{ + Unknown, + Running, + Stopped +} + +internal sealed record ContainerLifecycleEvent(ContainerLifecycleEventKind Kind, DateTimeOffset Timestamp, CriContainerInfo Snapshot); + +internal enum ContainerLifecycleEventKind +{ + Start, + Stop +} diff --git a/src/StellaOps.Zastava.Observer/ContainerRuntime/Cri/CriConversions.cs b/src/StellaOps.Zastava.Observer/ContainerRuntime/Cri/CriConversions.cs new file mode 100644 index 00000000..be9d68ee --- /dev/null +++ b/src/StellaOps.Zastava.Observer/ContainerRuntime/Cri/CriConversions.cs @@ -0,0 +1,76 @@ +using StellaOps.Zastava.Observer.Cri; + +namespace StellaOps.Zastava.Observer.ContainerRuntime.Cri; + +internal static class CriConversions +{ + private const long NanosecondsPerTick = 100; + + public static CriContainerInfo ToContainerInfo(Container container) + { + ArgumentNullException.ThrowIfNull(container); + + return new CriContainerInfo( + Id: container.Id ?? string.Empty, + PodSandboxId: container.PodSandboxId ?? 
string.Empty,
+            Name: container.Metadata?.Name ?? string.Empty,
+            Attempt: container.Metadata?.Attempt ?? 0,
+            Image: container.Image?.Image,
+            ImageRef: container.ImageRef,
+            Labels: container.Labels?.ToDictionary(static pair => pair.Key, static pair => pair.Value, StringComparer.Ordinal) ?? new Dictionary<string, string>(StringComparer.Ordinal),
+            Annotations: container.Annotations?.ToDictionary(static pair => pair.Key, static pair => pair.Value, StringComparer.Ordinal) ?? new Dictionary<string, string>(StringComparer.Ordinal),
+            CreatedAt: FromUnixNanoseconds(container.CreatedAt),
+            StartedAt: null,
+            FinishedAt: null,
+            ExitCode: null,
+            Reason: null,
+            Message: null);
+    }
+
+    /// <summary>
+    /// Overlays a CRI <see cref="ContainerStatus"/> onto a baseline snapshot, keeping
+    /// baseline values wherever the status carries no information.
+    /// </summary>
+    public static CriContainerInfo MergeStatus(CriContainerInfo baseline, ContainerStatus? status)
+    {
+        if (status is null)
+        {
+            return baseline;
+        }
+
+        var labels = status.Labels?.ToDictionary(static pair => pair.Key, static pair => pair.Value, StringComparer.Ordinal)
+            ?? baseline.Labels;
+        var annotations = status.Annotations?.ToDictionary(static pair => pair.Key, static pair => pair.Value, StringComparer.Ordinal)
+            ?? baseline.Annotations;
+
+        return baseline with
+        {
+            CreatedAt = status.CreatedAt > 0 ? FromUnixNanoseconds(status.CreatedAt) : baseline.CreatedAt,
+            StartedAt = status.StartedAt > 0 ? FromUnixNanoseconds(status.StartedAt) : baseline.StartedAt,
+            FinishedAt = status.FinishedAt > 0 ? FromUnixNanoseconds(status.FinishedAt) : baseline.FinishedAt,
+            // NOTE(review): an exit code of 0 is indistinguishable from "not set" here,
+            // so a clean exit keeps the baseline value — confirm this is intended.
+            ExitCode = status.ExitCode != 0 ? status.ExitCode : baseline.ExitCode,
+            Reason = string.IsNullOrWhiteSpace(status.Reason) ? baseline.Reason : status.Reason,
+            Message = string.IsNullOrWhiteSpace(status.Message) ? baseline.Message : status.Message,
+            // Fixed: `with` expressions require `=` member assignments; the original used
+            // object-initializer colon syntax (`Image:`, `ImageRef:`), which does not compile.
+            Image = status.Image?.Image ?? baseline.Image,
+            ImageRef = string.IsNullOrWhiteSpace(status.ImageRef) ? baseline.ImageRef : status.ImageRef,
+            Labels = labels,
+            Annotations = annotations
+        };
+    }
+
+    /// <summary>
+    /// Converts a CRI unix-epoch nanosecond timestamp to <see cref="DateTimeOffset"/>;
+    /// non-positive inputs map to <see cref="DateTimeOffset.MinValue"/>.
+    /// </summary>
+    public static DateTimeOffset FromUnixNanoseconds(long nanoseconds)
+    {
+        if (nanoseconds <= 0)
+        {
+            return DateTimeOffset.MinValue;
+        }
+
+        // Split into whole seconds plus sub-second ticks (1 tick = 100 ns).
+        var seconds = Math.DivRem(nanoseconds, 1_000_000_000, out var remainder);
+        var ticks = remainder / NanosecondsPerTick;
+        try
+        {
+            var baseTime = DateTimeOffset.FromUnixTimeSeconds(seconds);
+            return baseTime.AddTicks(ticks);
+        }
+        catch (ArgumentOutOfRangeException)
+        {
+            // Timestamps beyond DateTimeOffset range clamp to the epoch.
+            return DateTimeOffset.UnixEpoch;
+        }
+    }
+}
diff --git a/src/StellaOps.Zastava.Observer/ContainerRuntime/Cri/CriModels.cs b/src/StellaOps.Zastava.Observer/ContainerRuntime/Cri/CriModels.cs
new file mode 100644
index 00000000..89e5d38f
--- /dev/null
+++ b/src/StellaOps.Zastava.Observer/ContainerRuntime/Cri/CriModels.cs
@@ -0,0 +1,44 @@
+using StellaOps.Zastava.Observer.Configuration;
+
+namespace StellaOps.Zastava.Observer.ContainerRuntime.Cri;
+
+internal sealed record CriRuntimeIdentity(
+    string RuntimeName,
+    string RuntimeVersion,
+    string RuntimeApiVersion);
+
+internal sealed record CriContainerInfo(
+    string Id,
+    string PodSandboxId,
+    string Name,
+    uint Attempt,
+    string? Image,
+    string? ImageRef,
+    IReadOnlyDictionary<string, string> Labels,
+    IReadOnlyDictionary<string, string> Annotations,
+    DateTimeOffset CreatedAt,
+    DateTimeOffset? StartedAt,
+    DateTimeOffset? FinishedAt,
+    int? ExitCode,
+    string? Reason,
+    string?
Message); + +internal static class CriLabelKeys +{ + public const string PodName = "io.kubernetes.pod.name"; + public const string PodNamespace = "io.kubernetes.pod.namespace"; + public const string PodUid = "io.kubernetes.pod.uid"; + public const string ContainerName = "io.kubernetes.container.name"; +} + +internal static class ContainerRuntimeEngineExtensions +{ + public static string ToEngineString(this ContainerRuntimeEngine engine) + => engine switch + { + ContainerRuntimeEngine.Containerd => "containerd", + ContainerRuntimeEngine.CriO => "cri-o", + ContainerRuntimeEngine.Docker => "docker", + _ => "unknown" + }; +} diff --git a/src/StellaOps.Zastava.Observer/ContainerRuntime/Cri/CriRuntimeClient.cs b/src/StellaOps.Zastava.Observer/ContainerRuntime/Cri/CriRuntimeClient.cs new file mode 100644 index 00000000..9a02abd8 --- /dev/null +++ b/src/StellaOps.Zastava.Observer/ContainerRuntime/Cri/CriRuntimeClient.cs @@ -0,0 +1,213 @@ +using System.IO; +using System.Net.Sockets; +using System.Linq; +using Grpc.Core; +using Grpc.Net.Client; +using Microsoft.Extensions.Logging; +using StellaOps.Zastava.Observer.Configuration; +using StellaOps.Zastava.Observer.Cri; + +namespace StellaOps.Zastava.Observer.ContainerRuntime.Cri; + +internal interface ICriRuntimeClient : IAsyncDisposable +{ + ContainerRuntimeEndpointOptions Endpoint { get; } + Task GetIdentityAsync(CancellationToken cancellationToken); + Task> ListContainersAsync(ContainerState state, CancellationToken cancellationToken); + Task GetContainerStatusAsync(string containerId, CancellationToken cancellationToken); +} + +internal sealed class CriRuntimeClient : ICriRuntimeClient +{ + private static readonly object SwitchLock = new(); + private static bool http2SwitchApplied; + + private readonly GrpcChannel channel; + private readonly RuntimeService.RuntimeServiceClient client; + private readonly ILogger logger; + + public CriRuntimeClient(ContainerRuntimeEndpointOptions endpoint, ILogger logger) + { + 
ArgumentNullException.ThrowIfNull(endpoint); + this.logger = logger ?? throw new ArgumentNullException(nameof(logger)); + Endpoint = endpoint; + + EnsureHttp2Switch(); + channel = CreateChannel(endpoint); + client = new RuntimeService.RuntimeServiceClient(channel); + } + + public ContainerRuntimeEndpointOptions Endpoint { get; } + + public async Task GetIdentityAsync(CancellationToken cancellationToken) + { + var response = await client.VersionAsync(new VersionRequest(), cancellationToken: cancellationToken).ConfigureAwait(false); + return new CriRuntimeIdentity( + RuntimeName: response.RuntimeName ?? Endpoint.Engine.ToEngineString(), + RuntimeVersion: response.RuntimeVersion ?? "unknown", + RuntimeApiVersion: response.RuntimeApiVersion ?? response.Version ?? "unknown"); + } + + public async Task> ListContainersAsync(ContainerState state, CancellationToken cancellationToken) + { + var request = new ListContainersRequest + { + Filter = new ContainerFilter + { + State = new ContainerStateValue + { + State = state + } + } + }; + + try + { + var response = await client.ListContainersAsync(request, cancellationToken: cancellationToken).ConfigureAwait(false); + if (response.Containers is null || response.Containers.Count == 0) + { + return Array.Empty(); + } + + return response.Containers + .Select(CriConversions.ToContainerInfo) + .ToArray(); + } + catch (RpcException ex) when (ex.StatusCode == StatusCode.Unimplemented) + { + logger.LogWarning(ex, "Runtime endpoint {Endpoint} does not support ListContainers for state {State}.", Endpoint.Endpoint, state); + throw; + } + } + + public async Task GetContainerStatusAsync(string containerId, CancellationToken cancellationToken) + { + if (string.IsNullOrWhiteSpace(containerId)) + { + return null; + } + + try + { + var response = await client.ContainerStatusAsync(new ContainerStatusRequest + { + ContainerId = containerId, + Verbose = false + }, cancellationToken: cancellationToken).ConfigureAwait(false); + + if (response.Status 
is null) + { + return null; + } + + var baseline = CriConversions.ToContainerInfo(new Container + { + Id = response.Status.Id, + PodSandboxId = response.Status.Metadata?.Name ?? string.Empty, /* NOTE(review): ContainerStatus carries no sandbox id; Metadata.Name is the container name, not a sandbox id — confirm downstream tolerates this fallback */ + Metadata = response.Status.Metadata, + Image = response.Status.Image, + ImageRef = response.Status.ImageRef, + Labels = { response.Status.Labels }, + Annotations = { response.Status.Annotations }, + CreatedAt = response.Status.CreatedAt + }); + + return CriConversions.MergeStatus(baseline, response.Status); + } + catch (RpcException ex) when (ex.StatusCode is StatusCode.NotFound or StatusCode.DeadlineExceeded) + { + logger.LogDebug(ex, "Container {ContainerId} no longer available when querying status.", containerId); + return null; + } + } + + public async ValueTask DisposeAsync() + { + try + { + await channel.DisposeAsync().ConfigureAwait(false); + } + catch (InvalidOperationException) + { + // Channel already disposed. + } + } + + private static void EnsureHttp2Switch() + { + if (http2SwitchApplied) + { + return; + } + + lock (SwitchLock) + { + if (!http2SwitchApplied) + { + AppContext.SetSwitch("System.Net.Http.SocketsHttpHandler.Http2UnencryptedSupport", true); + http2SwitchApplied = true; + } + } + } + + private GrpcChannel CreateChannel(ContainerRuntimeEndpointOptions endpoint) + { + if (IsUnixEndpoint(endpoint.Endpoint, out var unixPath)) + { + var resolvedPath = unixPath; + var handler = new SocketsHttpHandler + { + ConnectCallback = (context, cancellationToken) => ConnectUnixDomainSocketAsync(resolvedPath, cancellationToken), + EnableMultipleHttp2Connections = true + }; + + if (endpoint.ConnectTimeout is { } timeout && timeout > TimeSpan.Zero) + { + handler.ConnectTimeout = timeout; + } + + return GrpcChannel.ForAddress("http://unix.local", new GrpcChannelOptions + { + HttpHandler = handler, + DisposeHttpClient = true + }); + } + + return GrpcChannel.ForAddress(endpoint.Endpoint, new GrpcChannelOptions + { + DisposeHttpClient = true + }); + } + + private static bool 
IsUnixEndpoint(string endpoint, out string path) + { + if (endpoint.StartsWith("unix://", StringComparison.OrdinalIgnoreCase)) + { + path = endpoint["unix://".Length..]; + return true; + } + + path = string.Empty; + return false; + } + + private static async ValueTask ConnectUnixDomainSocketAsync(string unixPath, CancellationToken cancellationToken) + { + var socket = new Socket(AddressFamily.Unix, SocketType.Stream, ProtocolType.Unspecified) + { + NoDelay = true + }; + + try + { + var endpoint = new UnixDomainSocketEndPoint(unixPath); + await socket.ConnectAsync(endpoint, cancellationToken).ConfigureAwait(false); + return new NetworkStream(socket, ownsSocket: true); + } + catch + { + socket.Dispose(); + throw; + } + } +} diff --git a/src/StellaOps.Zastava.Observer/ContainerRuntime/Cri/CriRuntimeClientFactory.cs b/src/StellaOps.Zastava.Observer/ContainerRuntime/Cri/CriRuntimeClientFactory.cs new file mode 100644 index 00000000..0947b9f3 --- /dev/null +++ b/src/StellaOps.Zastava.Observer/ContainerRuntime/Cri/CriRuntimeClientFactory.cs @@ -0,0 +1,26 @@ +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Logging; +using StellaOps.Zastava.Observer.Configuration; + +namespace StellaOps.Zastava.Observer.ContainerRuntime.Cri; + +internal interface ICriRuntimeClientFactory +{ + ICriRuntimeClient Create(ContainerRuntimeEndpointOptions endpoint); +} + +internal sealed class CriRuntimeClientFactory : ICriRuntimeClientFactory +{ + private readonly IServiceProvider serviceProvider; + + public CriRuntimeClientFactory(IServiceProvider serviceProvider) + { + this.serviceProvider = serviceProvider ?? 
throw new ArgumentNullException(nameof(serviceProvider)); + } + + public ICriRuntimeClient Create(ContainerRuntimeEndpointOptions endpoint) + { + var logger = serviceProvider.GetRequiredService>(); + return new CriRuntimeClient(endpoint, logger); + } +} diff --git a/src/StellaOps.Zastava.Observer/Program.cs b/src/StellaOps.Zastava.Observer/Program.cs new file mode 100644 index 00000000..94db8884 --- /dev/null +++ b/src/StellaOps.Zastava.Observer/Program.cs @@ -0,0 +1,10 @@ +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Hosting; +using StellaOps.Zastava.Observer.Worker; + +var builder = Host.CreateApplicationBuilder(args); + +builder.Services.AddZastavaRuntimeCore(builder.Configuration, componentName: "observer"); +builder.Services.AddHostedService(); + +await builder.Build().RunAsync(); diff --git a/src/StellaOps.Zastava.Observer/Protos/runtime/v1/runtime.proto b/src/StellaOps.Zastava.Observer/Protos/runtime/v1/runtime.proto new file mode 100644 index 00000000..93061146 --- /dev/null +++ b/src/StellaOps.Zastava.Observer/Protos/runtime/v1/runtime.proto @@ -0,0 +1,1855 @@ +/* +Copyright 2020 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +// To regenerate api.pb.go run `hack/update-codegen.sh protobindings` +syntax = "proto3"; + +package runtime.v1; +option go_package = "k8s.io/cri-api/pkg/apis/runtime/v1"; +option csharp_namespace = "StellaOps.Zastava.Observer.Cri"; + + + +// Runtime service defines the public APIs for remote container runtimes +service RuntimeService { + // Version returns the runtime name, runtime version, and runtime API version. + rpc Version(VersionRequest) returns (VersionResponse) {} + + // RunPodSandbox creates and starts a pod-level sandbox. Runtimes must ensure + // the sandbox is in the ready state on success. + rpc RunPodSandbox(RunPodSandboxRequest) returns (RunPodSandboxResponse) {} + // StopPodSandbox stops any running process that is part of the sandbox and + // reclaims network resources (e.g., IP addresses) allocated to the sandbox. + // If there are any running containers in the sandbox, they must be forcibly + // terminated. + // This call is idempotent, and must not return an error if all relevant + // resources have already been reclaimed. kubelet will call StopPodSandbox + // at least once before calling RemovePodSandbox. It will also attempt to + // reclaim resources eagerly, as soon as a sandbox is not needed. Hence, + // multiple StopPodSandbox calls are expected. + rpc StopPodSandbox(StopPodSandboxRequest) returns (StopPodSandboxResponse) {} + // RemovePodSandbox removes the sandbox. If there are any running containers + // in the sandbox, they must be forcibly terminated and removed. + // This call is idempotent, and must not return an error if the sandbox has + // already been removed. + rpc RemovePodSandbox(RemovePodSandboxRequest) returns (RemovePodSandboxResponse) {} + // PodSandboxStatus returns the status of the PodSandbox. If the PodSandbox is not + // present, returns an error. + rpc PodSandboxStatus(PodSandboxStatusRequest) returns (PodSandboxStatusResponse) {} + // ListPodSandbox returns a list of PodSandboxes. 
+ rpc ListPodSandbox(ListPodSandboxRequest) returns (ListPodSandboxResponse) {} + + // CreateContainer creates a new container in specified PodSandbox + rpc CreateContainer(CreateContainerRequest) returns (CreateContainerResponse) {} + // StartContainer starts the container. + rpc StartContainer(StartContainerRequest) returns (StartContainerResponse) {} + // StopContainer stops a running container with a grace period (i.e., timeout). + // This call is idempotent, and must not return an error if the container has + // already been stopped. + // The runtime must forcibly kill the container after the grace period is + // reached. + rpc StopContainer(StopContainerRequest) returns (StopContainerResponse) {} + // RemoveContainer removes the container. If the container is running, the + // container must be forcibly removed. + // This call is idempotent, and must not return an error if the container has + // already been removed. + rpc RemoveContainer(RemoveContainerRequest) returns (RemoveContainerResponse) {} + // ListContainers lists all containers by filters. + rpc ListContainers(ListContainersRequest) returns (ListContainersResponse) {} + // ContainerStatus returns status of the container. If the container is not + // present, returns an error. + rpc ContainerStatus(ContainerStatusRequest) returns (ContainerStatusResponse) {} + // UpdateContainerResources updates ContainerConfig of the container synchronously. + // If runtime fails to transactionally update the requested resources, an error is returned. + rpc UpdateContainerResources(UpdateContainerResourcesRequest) returns (UpdateContainerResourcesResponse) {} + // ReopenContainerLog asks runtime to reopen the stdout/stderr log file + // for the container. This is often called after the log file has been + // rotated. If the container is not running, container runtime can choose + // to either create a new log file and return nil, or return an error. 
+ // Once it returns error, new container log file MUST NOT be created. + rpc ReopenContainerLog(ReopenContainerLogRequest) returns (ReopenContainerLogResponse) {} + + // ExecSync runs a command in a container synchronously. + rpc ExecSync(ExecSyncRequest) returns (ExecSyncResponse) {} + // Exec prepares a streaming endpoint to execute a command in the container. + rpc Exec(ExecRequest) returns (ExecResponse) {} + // Attach prepares a streaming endpoint to attach to a running container. + rpc Attach(AttachRequest) returns (AttachResponse) {} + // PortForward prepares a streaming endpoint to forward ports from a PodSandbox. + rpc PortForward(PortForwardRequest) returns (PortForwardResponse) {} + + // ContainerStats returns stats of the container. If the container does not + // exist, the call returns an error. + rpc ContainerStats(ContainerStatsRequest) returns (ContainerStatsResponse) {} + // ListContainerStats returns stats of all running containers. + rpc ListContainerStats(ListContainerStatsRequest) returns (ListContainerStatsResponse) {} + + // PodSandboxStats returns stats of the pod sandbox. If the pod sandbox does not + // exist, the call returns an error. + rpc PodSandboxStats(PodSandboxStatsRequest) returns (PodSandboxStatsResponse) {} + // ListPodSandboxStats returns stats of the pod sandboxes matching a filter. + rpc ListPodSandboxStats(ListPodSandboxStatsRequest) returns (ListPodSandboxStatsResponse) {} + + // UpdateRuntimeConfig updates the runtime configuration based on the given request. + rpc UpdateRuntimeConfig(UpdateRuntimeConfigRequest) returns (UpdateRuntimeConfigResponse) {} + + // Status returns the status of the runtime. 
+ rpc Status(StatusRequest) returns (StatusResponse) {} + + // CheckpointContainer checkpoints a container + rpc CheckpointContainer(CheckpointContainerRequest) returns (CheckpointContainerResponse) {} + + // GetContainerEvents gets container events from the CRI runtime + rpc GetContainerEvents(GetEventsRequest) returns (stream ContainerEventResponse) {} + + // ListMetricDescriptors gets the descriptors for the metrics that will be returned in ListPodSandboxMetrics. + // This list should be static at startup: either the client and server restart together when + // adding or removing metrics descriptors, or they should not change. + // Put differently, if ListPodSandboxMetrics references a name that is not described in the initial + // ListMetricDescriptors call, then the metric will not be broadcasted. + rpc ListMetricDescriptors(ListMetricDescriptorsRequest) returns (ListMetricDescriptorsResponse) {} + + // ListPodSandboxMetrics gets pod sandbox metrics from CRI Runtime + rpc ListPodSandboxMetrics(ListPodSandboxMetricsRequest) returns (ListPodSandboxMetricsResponse) {} + + // RuntimeConfig returns configuration information of the runtime. + // A couple of notes: + // - The RuntimeConfigRequest object is not to be confused with the contents of UpdateRuntimeConfigRequest. + // The former is for having runtime tell Kubelet what to do, the latter vice versa. + // - It is the expectation of the Kubelet that these fields are static for the lifecycle of the Kubelet. + // The Kubelet will not re-request the RuntimeConfiguration after startup, and CRI implementations should + // avoid updating them without a full node reboot. + rpc RuntimeConfig(RuntimeConfigRequest) returns (RuntimeConfigResponse) {} +} + +// ImageService defines the public APIs for managing images. +service ImageService { + // ListImages lists existing images. + rpc ListImages(ListImagesRequest) returns (ListImagesResponse) {} + // ImageStatus returns the status of the image. 
If the image is not + // present, returns a response with ImageStatusResponse.Image set to + // nil. + rpc ImageStatus(ImageStatusRequest) returns (ImageStatusResponse) {} + // PullImage pulls an image with authentication config. + rpc PullImage(PullImageRequest) returns (PullImageResponse) {} + // RemoveImage removes the image. + // This call is idempotent, and must not return an error if the image has + // already been removed. + rpc RemoveImage(RemoveImageRequest) returns (RemoveImageResponse) {} + // ImageFSInfo returns information of the filesystem that is used to store images. + rpc ImageFsInfo(ImageFsInfoRequest) returns (ImageFsInfoResponse) {} +} + +message VersionRequest { + // Version of the kubelet runtime API. + string version = 1; +} + +message VersionResponse { + // Version of the kubelet runtime API. + string version = 1; + // Name of the container runtime. + string runtime_name = 2; + // Version of the container runtime. The string must be + // semver-compatible. + string runtime_version = 3; + // API version of the container runtime. The string must be + // semver-compatible. + string runtime_api_version = 4; +} + +// DNSConfig specifies the DNS servers and search domains of a sandbox. +message DNSConfig { + // List of DNS servers of the cluster. + repeated string servers = 1; + // List of DNS search domains of the cluster. + repeated string searches = 2; + // List of DNS options. See https://linux.die.net/man/5/resolv.conf + // for all available options. + repeated string options = 3; +} + +enum Protocol { + TCP = 0; + UDP = 1; + SCTP = 2; +} + +// PortMapping specifies the port mapping configurations of a sandbox. +message PortMapping { + // Protocol of the port mapping. + Protocol protocol = 1; + // Port number within the container. Default: 0 (not specified). + int32 container_port = 2; + // Port number on the host. Default: 0 (not specified). + int32 host_port = 3; + // Host IP. 
+ string host_ip = 4; +} + +enum MountPropagation { + // No mount propagation ("rprivate" in Linux terminology). + PROPAGATION_PRIVATE = 0; + // Mounts get propagated from the host to the container ("rslave" in Linux). + PROPAGATION_HOST_TO_CONTAINER = 1; + // Mounts get propagated from the host to the container and from the + // container to the host ("rshared" in Linux). + PROPAGATION_BIDIRECTIONAL = 2; +} + +// Mount specifies a host volume to mount into a container. +message Mount { + // Path of the mount within the container. + string container_path = 1; + // Path of the mount on the host. If the hostPath doesn't exist, then runtimes + // should report error. If the hostpath is a symbolic link, runtimes should + // follow the symlink and mount the real destination to container. + string host_path = 2; + // If set, the mount is read-only. + bool readonly = 3; + // If set, the mount needs SELinux relabeling. + bool selinux_relabel = 4; + // Requested propagation mode. + MountPropagation propagation = 5; + // UidMappings specifies the runtime UID mappings for the mount. + repeated IDMapping uidMappings = 6; + // GidMappings specifies the runtime GID mappings for the mount. + repeated IDMapping gidMappings = 7; +} + +// IDMapping describes host to container ID mappings for a pod sandbox. +message IDMapping { + // HostId is the id on the host. + uint32 host_id = 1; + // ContainerId is the id in the container. + uint32 container_id = 2; + // Length is the size of the range to map. + uint32 length = 3; +} + +// A NamespaceMode describes the intended namespace configuration for each +// of the namespaces (Network, PID, IPC) in NamespaceOption. Runtimes should +// map these modes as appropriate for the technology underlying the runtime. +enum NamespaceMode { + // A POD namespace is common to all containers in a pod. + // For example, a container with a PID namespace of POD expects to view + // all of the processes in all of the containers in the pod. 
+ POD = 0; + // A CONTAINER namespace is restricted to a single container. + // For example, a container with a PID namespace of CONTAINER expects to + // view only the processes in that container. + CONTAINER = 1; + // A NODE namespace is the namespace of the Kubernetes node. + // For example, a container with a PID namespace of NODE expects to view + // all of the processes on the host running the kubelet. + NODE = 2; + // TARGET targets the namespace of another container. When this is specified, + // a target_id must be specified in NamespaceOption and refer to a container + // previously created with NamespaceMode CONTAINER. This containers namespace + // will be made to match that of container target_id. + // For example, a container with a PID namespace of TARGET expects to view + // all of the processes that container target_id can view. + TARGET = 3; +} + +// UserNamespace describes the intended user namespace configuration for a pod sandbox. +message UserNamespace { + // Mode is the NamespaceMode for this UserNamespace. + // Note: NamespaceMode for UserNamespace currently supports only POD and NODE, not CONTAINER OR TARGET. + NamespaceMode mode = 1; + + // Uids specifies the UID mappings for the user namespace. + repeated IDMapping uids = 2; + + // Gids specifies the GID mappings for the user namespace. + repeated IDMapping gids = 3; +} + +// NamespaceOption provides options for Linux namespaces. +message NamespaceOption { + // Network namespace for this container/sandbox. + // Note: There is currently no way to set CONTAINER scoped network in the Kubernetes API. + // Namespaces currently set by the kubelet: POD, NODE + NamespaceMode network = 1; + // PID namespace for this container/sandbox. + // Note: The CRI default is POD, but the v1.PodSpec default is CONTAINER. + // The kubelet's runtime manager will set this to CONTAINER explicitly for v1 pods. 
+ // Namespaces currently set by the kubelet: POD, CONTAINER, NODE, TARGET + NamespaceMode pid = 2; + // IPC namespace for this container/sandbox. + // Note: There is currently no way to set CONTAINER scoped IPC in the Kubernetes API. + // Namespaces currently set by the kubelet: POD, NODE + NamespaceMode ipc = 3; + // Target Container ID for NamespaceMode of TARGET. This container must have been + // previously created in the same pod. It is not possible to specify different targets + // for each namespace. + string target_id = 4; + // UsernsOptions for this pod sandbox. + // The Kubelet picks the user namespace configuration to use for the pod sandbox. The mappings + // are specified as part of the UserNamespace struct. If the struct is nil, then the POD mode + // must be assumed. This is done for backward compatibility with older Kubelet versions that + // do not set a user namespace. + UserNamespace userns_options = 5; +} + +// Int64Value is the wrapper of int64. +message Int64Value { + // The value. + int64 value = 1; +} + +// LinuxSandboxSecurityContext holds linux security configuration that will be +// applied to a sandbox. Note that: +// 1) It does not apply to containers in the pods. +// 2) It may not be applicable to a PodSandbox which does not contain any running +// process. +message LinuxSandboxSecurityContext { + // Configurations for the sandbox's namespaces. + // This will be used only if the PodSandbox uses namespace for isolation. + NamespaceOption namespace_options = 1; + // Optional SELinux context to be applied. + SELinuxOption selinux_options = 2; + // UID to run sandbox processes as, when applicable. + Int64Value run_as_user = 3; + // GID to run sandbox processes as, when applicable. run_as_group should only + // be specified when run_as_user is specified; otherwise, the runtime MUST error. + Int64Value run_as_group = 8; + // If set, the root filesystem of the sandbox is read-only. 
+ bool readonly_rootfs = 4; + // List of groups applied to the first process run in the sandbox, in + // addition to the sandbox's primary GID, and group memberships defined + // in the container image for the sandbox's primary UID of the container process. + // If the list is empty, no additional groups are added to any container. + // Note that group memberships defined in the container image for the sandbox's primary UID + // of the container process are still effective, even if they are not included in this list. + repeated int64 supplemental_groups = 5; + // Indicates whether the sandbox will be asked to run a privileged + // container. If a privileged container is to be executed within it, this + // MUST be true. + // This allows a sandbox to take additional security precautions if no + // privileged containers are expected to be run. + bool privileged = 6; + // Seccomp profile for the sandbox. + SecurityProfile seccomp = 9; + // AppArmor profile for the sandbox. + SecurityProfile apparmor = 10; + // Seccomp profile for the sandbox, candidate values are: + // * runtime/default: the default profile for the container runtime + // * unconfined: unconfined profile, ie, no seccomp sandboxing + // * localhost/: the profile installed on the node. + // is the full path of the profile. + // Default: "", which is identical with unconfined. + string seccomp_profile_path = 7 [deprecated=true]; +} + +// A security profile which can be used for sandboxes and containers. +message SecurityProfile { + // Available profile types. + enum ProfileType { + // The container runtime default profile should be used. + RuntimeDefault = 0; + // Disable the feature for the sandbox or the container. + Unconfined = 1; + // A pre-defined profile on the node should be used. + Localhost = 2; + } + // Indicator which `ProfileType` should be applied. + ProfileType profile_type = 1; + // Indicates that a pre-defined profile on the node should be used. 
+ // Must only be set if `ProfileType` is `Localhost`. + // For seccomp, it must be an absolute path to the seccomp profile. + // For AppArmor, this field is the AppArmor `/` + string localhost_ref = 2; +} + +// LinuxPodSandboxConfig holds platform-specific configurations for Linux +// host platforms and Linux-based containers. +message LinuxPodSandboxConfig { + // Parent cgroup of the PodSandbox. + // The cgroupfs style syntax will be used, but the container runtime can + // convert it to systemd semantics if needed. + string cgroup_parent = 1; + // LinuxSandboxSecurityContext holds sandbox security attributes. + LinuxSandboxSecurityContext security_context = 2; + // Sysctls holds linux sysctls config for the sandbox. + map sysctls = 3; + // Optional overhead represents the overheads associated with this sandbox + LinuxContainerResources overhead = 4; + // Optional resources represents the sum of container resources for this sandbox + LinuxContainerResources resources = 5; +} + +// PodSandboxMetadata holds all necessary information for building the sandbox name. +// The container runtime is encouraged to expose the metadata associated with the +// PodSandbox in its user interface for better user experience. For example, +// the runtime can construct a unique PodSandboxName based on the metadata. +message PodSandboxMetadata { + // Pod name of the sandbox. Same as the pod name in the Pod ObjectMeta. + string name = 1; + // Pod UID of the sandbox. Same as the pod UID in the Pod ObjectMeta. + string uid = 2; + // Pod namespace of the sandbox. Same as the pod namespace in the Pod ObjectMeta. + string namespace = 3; + // Attempt number of creating the sandbox. Default: 0. + uint32 attempt = 4; +} + +// PodSandboxConfig holds all the required and optional fields for creating a +// sandbox. +message PodSandboxConfig { + // Metadata of the sandbox. 
This information will uniquely identify the + // sandbox, and the runtime should leverage this to ensure correct + // operation. The runtime may also use this information to improve UX, such + // as by constructing a readable name. + PodSandboxMetadata metadata = 1; + // Hostname of the sandbox. Hostname could only be empty when the pod + // network namespace is NODE. + string hostname = 2; + // Path to the directory on the host in which container log files are + // stored. + // By default the log of a container going into the LogDirectory will be + // hooked up to STDOUT and STDERR. However, the LogDirectory may contain + // binary log files with structured logging data from the individual + // containers. For example, the files might be newline separated JSON + // structured logs, systemd-journald journal files, gRPC trace files, etc. + // E.g., + // PodSandboxConfig.LogDirectory = `/var/log/pods/__/` + // ContainerConfig.LogPath = `containerName/Instance#.log` + string log_directory = 3; + // DNS config for the sandbox. + DNSConfig dns_config = 4; + // Port mappings for the sandbox. + repeated PortMapping port_mappings = 5; + // Key-value pairs that may be used to scope and select individual resources. + map labels = 6; + // Unstructured key-value map that may be set by the kubelet to store and + // retrieve arbitrary metadata. This will include any annotations set on a + // pod through the Kubernetes API. + // + // Annotations MUST NOT be altered by the runtime; the annotations stored + // here MUST be returned in the PodSandboxStatus associated with the pod + // this PodSandboxConfig creates. + // + // In general, in order to preserve a well-defined interface between the + // kubelet and the container runtime, annotations SHOULD NOT influence + // runtime behaviour. + // + // Annotations can also be useful for runtime authors to experiment with + // new features that are opaque to the Kubernetes APIs (both user-facing + // and the CRI). 
Whenever possible, however, runtime authors SHOULD + // consider proposing new typed fields for any new features instead. + map annotations = 7; + // Optional configurations specific to Linux hosts. + LinuxPodSandboxConfig linux = 8; + // Optional configurations specific to Windows hosts. + WindowsPodSandboxConfig windows = 9; +} + +message RunPodSandboxRequest { + // Configuration for creating a PodSandbox. + PodSandboxConfig config = 1; + // Named runtime configuration to use for this PodSandbox. + // If the runtime handler is unknown, this request should be rejected. An + // empty string should select the default handler, equivalent to the + // behavior before this feature was added. + // See https://git.k8s.io/enhancements/keps/sig-node/585-runtime-class + string runtime_handler = 2; +} + +message RunPodSandboxResponse { + // ID of the PodSandbox to run. + string pod_sandbox_id = 1; +} + +message StopPodSandboxRequest { + // ID of the PodSandbox to stop. + string pod_sandbox_id = 1; +} + +message StopPodSandboxResponse {} + +message RemovePodSandboxRequest { + // ID of the PodSandbox to remove. + string pod_sandbox_id = 1; +} + +message RemovePodSandboxResponse {} + +message PodSandboxStatusRequest { + // ID of the PodSandbox for which to retrieve status. + string pod_sandbox_id = 1; + // Verbose indicates whether to return extra information about the pod sandbox. + bool verbose = 2; +} + +// PodIP represents an ip of a Pod +message PodIP{ + // an ip is a string representation of an IPv4 or an IPv6 + string ip = 1; +} +// PodSandboxNetworkStatus is the status of the network for a PodSandbox. +// Currently ignored for pods sharing the host networking namespace. +message PodSandboxNetworkStatus { + // IP address of the PodSandbox. + string ip = 1; + // list of additional ips (not inclusive of PodSandboxNetworkStatus.Ip) of the PodSandBoxNetworkStatus + repeated PodIP additional_ips = 2; +} + +// Namespace contains paths to the namespaces. 
+message Namespace { + // Namespace options for Linux namespaces. + NamespaceOption options = 2; +} + +// LinuxSandboxStatus contains status specific to Linux sandboxes. +message LinuxPodSandboxStatus { + // Paths to the sandbox's namespaces. + Namespace namespaces = 1; +} + +enum PodSandboxState { + SANDBOX_READY = 0; + SANDBOX_NOTREADY = 1; +} + +// PodSandboxStatus contains the status of the PodSandbox. +message PodSandboxStatus { + // ID of the sandbox. + string id = 1; + // Metadata of the sandbox. + PodSandboxMetadata metadata = 2; + // State of the sandbox. + PodSandboxState state = 3; + // Creation timestamp of the sandbox in nanoseconds. Must be > 0. + int64 created_at = 4; + // Network contains network status if network is handled by the runtime. + PodSandboxNetworkStatus network = 5; + // Linux-specific status to a pod sandbox. + LinuxPodSandboxStatus linux = 6; + // Labels are key-value pairs that may be used to scope and select individual resources. + map labels = 7; + // Unstructured key-value map holding arbitrary metadata. + // Annotations MUST NOT be altered by the runtime; the value of this field + // MUST be identical to that of the corresponding PodSandboxConfig used to + // instantiate the pod sandbox this status represents. + map annotations = 8; + // runtime configuration used for this PodSandbox. + string runtime_handler = 9; +} + +message PodSandboxStatusResponse { + // Status of the PodSandbox. + PodSandboxStatus status = 1; + // Info is extra information of the PodSandbox. The key could be arbitrary string, and + // value should be in json format. The information could include anything useful for + // debug, e.g. network namespace for linux container based container runtime. + // It should only be returned non-empty when Verbose is true. 
+ map info = 2; + // Container statuses + repeated ContainerStatus containers_statuses = 3; + // Timestamp at which container and pod statuses were recorded + int64 timestamp = 4; +} + +// PodSandboxStateValue is the wrapper of PodSandboxState. +message PodSandboxStateValue { + // State of the sandbox. + PodSandboxState state = 1; +} + +// PodSandboxFilter is used to filter a list of PodSandboxes. +// All those fields are combined with 'AND' +message PodSandboxFilter { + // ID of the sandbox. + string id = 1; + // State of the sandbox. + PodSandboxStateValue state = 2; + // LabelSelector to select matches. + // Only api.MatchLabels is supported for now and the requirements + // are ANDed. MatchExpressions is not supported yet. + map label_selector = 3; +} + +message ListPodSandboxRequest { + // PodSandboxFilter to filter a list of PodSandboxes. + PodSandboxFilter filter = 1; +} + + +// PodSandbox contains minimal information about a sandbox. +message PodSandbox { + // ID of the PodSandbox. + string id = 1; + // Metadata of the PodSandbox. + PodSandboxMetadata metadata = 2; + // State of the PodSandbox. + PodSandboxState state = 3; + // Creation timestamps of the PodSandbox in nanoseconds. Must be > 0. + int64 created_at = 4; + // Labels of the PodSandbox. + map labels = 5; + // Unstructured key-value map holding arbitrary metadata. + // Annotations MUST NOT be altered by the runtime; the value of this field + // MUST be identical to that of the corresponding PodSandboxConfig used to + // instantiate this PodSandbox. + map annotations = 6; + // runtime configuration used for this PodSandbox. + string runtime_handler = 7; +} + +message ListPodSandboxResponse { + // List of PodSandboxes. + repeated PodSandbox items = 1; +} + +message PodSandboxStatsRequest { + // ID of the pod sandbox for which to retrieve stats. 
+ string pod_sandbox_id = 1; +} + +message PodSandboxStatsResponse { + PodSandboxStats stats = 1; +} + +// PodSandboxStatsFilter is used to filter the list of pod sandboxes to retrieve stats for. +// All those fields are combined with 'AND'. +message PodSandboxStatsFilter { + // ID of the pod sandbox. + string id = 1; + // LabelSelector to select matches. + // Only api.MatchLabels is supported for now and the requirements + // are ANDed. MatchExpressions is not supported yet. + map label_selector = 2; +} + +message ListPodSandboxStatsRequest { + // Filter for the list request. + PodSandboxStatsFilter filter = 1; +} + +message ListPodSandboxStatsResponse { + // Stats of the pod sandbox. + repeated PodSandboxStats stats = 1; +} + +// PodSandboxAttributes provides basic information of the pod sandbox. +message PodSandboxAttributes { + // ID of the pod sandbox. + string id = 1; + // Metadata of the pod sandbox. + PodSandboxMetadata metadata = 2; + // Key-value pairs that may be used to scope and select individual resources. + map labels = 3; + // Unstructured key-value map holding arbitrary metadata. + // Annotations MUST NOT be altered by the runtime; the value of this field + // MUST be identical to that of the corresponding PodSandboxStatus used to + // instantiate the PodSandbox this status represents. + map annotations = 4; +} + +// PodSandboxStats provides the resource usage statistics for a pod. +// The linux or windows field will be populated depending on the platform. +message PodSandboxStats { + // Information of the pod. + PodSandboxAttributes attributes = 1; + // Stats from linux. + LinuxPodSandboxStats linux = 2; + // Stats from windows. + WindowsPodSandboxStats windows = 3; +} + +// LinuxPodSandboxStats provides the resource usage statistics for a pod sandbox on linux. +message LinuxPodSandboxStats { + // CPU usage gathered for the pod sandbox. + CpuUsage cpu = 1; + // Memory usage gathered for the pod sandbox. 
+ MemoryUsage memory = 2; + // Network usage gathered for the pod sandbox + NetworkUsage network = 3; + // Stats pertaining to processes in the pod sandbox. + ProcessUsage process = 4; + // Stats of containers in the measured pod sandbox. + repeated ContainerStats containers = 5; +} + +// WindowsPodSandboxStats provides the resource usage statistics for a pod sandbox on windows +message WindowsPodSandboxStats { + // CPU usage gathered for the pod sandbox. + WindowsCpuUsage cpu = 1; + // Memory usage gathered for the pod sandbox. + WindowsMemoryUsage memory = 2; + // Network usage gathered for the pod sandbox + WindowsNetworkUsage network = 3; + // Stats pertaining to processes in the pod sandbox. + WindowsProcessUsage process = 4; + // Stats of containers in the measured pod sandbox. + repeated WindowsContainerStats containers = 5; +} + +// NetworkUsage contains data about network resources. +message NetworkUsage { + // Timestamp in nanoseconds at which the information were collected. Must be > 0. + int64 timestamp = 1; + // Stats for the default network interface. + NetworkInterfaceUsage default_interface = 2; + // Stats for all found network interfaces, excluding the default. + repeated NetworkInterfaceUsage interfaces = 3; +} + +// WindowsNetworkUsage contains data about network resources specific to Windows. +message WindowsNetworkUsage { + // Timestamp in nanoseconds at which the information were collected. Must be > 0. + int64 timestamp = 1; + // Stats for the default network interface. + WindowsNetworkInterfaceUsage default_interface = 2; + // Stats for all found network interfaces, excluding the default. + repeated WindowsNetworkInterfaceUsage interfaces = 3; +} + +// NetworkInterfaceUsage contains resource value data about a network interface. +message NetworkInterfaceUsage { + // The name of the network interface. + string name = 1; + // Cumulative count of bytes received. + UInt64Value rx_bytes = 2; + // Cumulative count of receive errors encountered. 
+ UInt64Value rx_errors = 3; + // Cumulative count of bytes transmitted. + UInt64Value tx_bytes = 4; + // Cumulative count of transmit errors encountered. + UInt64Value tx_errors = 5; +} + +// WindowsNetworkInterfaceUsage contains resource value data about a network interface specific for Windows. +message WindowsNetworkInterfaceUsage { + // The name of the network interface. + string name = 1; + // Cumulative count of bytes received. + UInt64Value rx_bytes = 2; + // Cumulative count of receive errors encountered. + UInt64Value rx_packets_dropped = 3; + // Cumulative count of bytes transmitted. + UInt64Value tx_bytes = 4; + // Cumulative count of transmit errors encountered. + UInt64Value tx_packets_dropped = 5; +} + +// ProcessUsage are stats pertaining to processes. +message ProcessUsage { + // Timestamp in nanoseconds at which the information were collected. Must be > 0. + int64 timestamp = 1; + // Number of processes. + UInt64Value process_count = 2; +} + +// WindowsProcessUsage are stats pertaining to processes specific to Windows. +message WindowsProcessUsage { + // Timestamp in nanoseconds at which the information were collected. Must be > 0. + int64 timestamp = 1; + // Number of processes. + UInt64Value process_count = 2; +} + +// ImageSpec is an internal representation of an image. +message ImageSpec { + // Container's Image field (e.g. imageID or imageDigest). + string image = 1; + // Unstructured key-value map holding arbitrary metadata. + // ImageSpec Annotations can be used to help the runtime target specific + // images in multi-arch images. + map annotations = 2; + // The container image reference specified by the user (e.g. image[:tag] or digest). + // Only set if available within the RPC context. + string user_specified_image = 18; + // Runtime handler to use for pulling the image. + // If the runtime handler is unknown, the request should be rejected. + // An empty string would select the default runtime handler. 
+ string runtime_handler = 19; +} + +message KeyValue { + string key = 1; + string value = 2; +} + +// LinuxContainerResources specifies Linux specific configuration for +// resources. +message LinuxContainerResources { + // CPU CFS (Completely Fair Scheduler) period. Default: 0 (not specified). + int64 cpu_period = 1; + // CPU CFS (Completely Fair Scheduler) quota. Default: 0 (not specified). + int64 cpu_quota = 2; + // CPU shares (relative weight vs. other containers). Default: 0 (not specified). + int64 cpu_shares = 3; + // Memory limit in bytes. Default: 0 (not specified). + int64 memory_limit_in_bytes = 4; + // OOMScoreAdj adjusts the oom-killer score. Default: 0 (not specified). + int64 oom_score_adj = 5; + // CpusetCpus constrains the allowed set of logical CPUs. Default: "" (not specified). + string cpuset_cpus = 6; + // CpusetMems constrains the allowed set of memory nodes. Default: "" (not specified). + string cpuset_mems = 7; + // List of HugepageLimits to limit the HugeTLB usage of container per page size. Default: nil (not specified). + repeated HugepageLimit hugepage_limits = 8; + // Unified resources for cgroup v2. Default: nil (not specified). + // Each key/value in the map refers to the cgroup v2. + // e.g. "memory.max": "6937202688" or "io.weight": "default 100". + map unified = 9; + // Memory swap limit in bytes. Default 0 (not specified). + int64 memory_swap_limit_in_bytes = 10; +} + +// HugepageLimit corresponds to the file`hugetlb..limit_in_byte` in container level cgroup. +// For example, `PageSize=1GB`, `Limit=1073741824` means setting `1073741824` bytes to hugetlb.1GB.limit_in_bytes. +message HugepageLimit { + // The value of PageSize has the format B (2MB, 1GB), + // and must match the of the corresponding control file found in `hugetlb..limit_in_bytes`. + // The values of are intended to be parsed using base 1024("1KB" = 1024, "1MB" = 1048576, etc). + string page_size = 1; + // limit in bytes of hugepagesize HugeTLB usage. 
+ uint64 limit = 2; +} + +// SELinuxOption are the labels to be applied to the container. +message SELinuxOption { + string user = 1; + string role = 2; + string type = 3; + string level = 4; +} + +// Capability contains the container capabilities to add or drop +// Dropping a capability will drop it from all sets. +// If a capability is added to only the add_capabilities list then it gets added to permitted, +// inheritable, effective and bounding sets, i.e. all sets except the ambient set. +// If a capability is added to only the add_ambient_capabilities list then it gets added to all sets, i.e permitted +// inheritable, effective, bounding and ambient sets. +// If a capability is added to add_capabilities and add_ambient_capabilities lists then it gets added to all sets, i.e. +// permitted, inheritable, effective, bounding and ambient sets. +message Capability { + // List of capabilities to add. + repeated string add_capabilities = 1; + // List of capabilities to drop. + repeated string drop_capabilities = 2; + // List of ambient capabilities to add. + repeated string add_ambient_capabilities = 3; +} + +// LinuxContainerSecurityContext holds linux security configuration that will be applied to a container. +message LinuxContainerSecurityContext { + // Capabilities to add or drop. + Capability capabilities = 1; + // If set, run container in privileged mode. + // Privileged mode is incompatible with the following options. If + // privileged is set, the following features MAY have no effect: + // 1. capabilities + // 2. selinux_options + // 4. seccomp + // 5. apparmor + // + // Privileged mode implies the following specific options are applied: + // 1. All capabilities are added. + // 2. Sensitive paths, such as kernel module paths within sysfs, are not masked. + // 3. Any sysfs and procfs mounts are mounted RW. + // 4. AppArmor confinement is not applied. + // 5. Seccomp restrictions are not applied. + // 6. 
The device cgroup does not restrict access to any devices. + // 7. All devices from the host's /dev are available within the container. + // 8. SELinux restrictions are not applied (e.g. label=disabled). + bool privileged = 2; + // Configurations for the container's namespaces. + // Only used if the container uses namespace for isolation. + NamespaceOption namespace_options = 3; + // SELinux context to be optionally applied. + SELinuxOption selinux_options = 4; + // UID to run the container process as. Only one of run_as_user and + // run_as_username can be specified at a time. + Int64Value run_as_user = 5; + // GID to run the container process as. run_as_group should only be specified + // when run_as_user or run_as_username is specified; otherwise, the runtime + // MUST error. + Int64Value run_as_group = 12; + // User name to run the container process as. If specified, the user MUST + // exist in the container image (i.e. in the /etc/passwd inside the image), + // and be resolved there by the runtime; otherwise, the runtime MUST error. + string run_as_username = 6; + // If set, the root filesystem of the container is read-only. + bool readonly_rootfs = 7; + // List of groups applied to the first process run in the container, in + // addition to the container's primary GID, and group memberships defined + // in the container image for the container's primary UID of the container process. + // If the list is empty, no additional groups are added to any container. + // Note that group memberships defined in the container image for the container's primary UID + // of the container process are still effective, even if they are not included in this list. + repeated int64 supplemental_groups = 8; + // no_new_privs defines if the flag for no_new_privs should be set on the + // container. + bool no_new_privs = 11; + // masked_paths is a slice of paths that should be masked by the container + // runtime, this can be passed directly to the OCI spec. 
+ repeated string masked_paths = 13; + // readonly_paths is a slice of paths that should be set as readonly by the + // container runtime, this can be passed directly to the OCI spec. + repeated string readonly_paths = 14; + // Seccomp profile for the container. + SecurityProfile seccomp = 15; + // AppArmor profile for the container. + SecurityProfile apparmor = 16; + // AppArmor profile for the container, candidate values are: + // * runtime/default: equivalent to not specifying a profile. + // * unconfined: no profiles are loaded + // * localhost/: profile loaded on the node + // (localhost) by name. The possible profile names are detailed at + // https://gitlab.com/apparmor/apparmor/-/wikis/AppArmor_Core_Policy_Reference + string apparmor_profile = 9 [deprecated=true]; + // Seccomp profile for the container, candidate values are: + // * runtime/default: the default profile for the container runtime + // * unconfined: unconfined profile, ie, no seccomp sandboxing + // * localhost/: the profile installed on the node. + // is the full path of the profile. + // Default: "", which is identical with unconfined. + string seccomp_profile_path = 10 [deprecated=true]; +} + +// LinuxContainerConfig contains platform-specific configuration for +// Linux-based containers. +message LinuxContainerConfig { + // Resources specification for the container. + LinuxContainerResources resources = 1; + // LinuxContainerSecurityContext configuration for the container. + LinuxContainerSecurityContext security_context = 2; +} + +// WindowsNamespaceOption provides options for Windows namespaces. +message WindowsNamespaceOption { + // Network namespace for this container/sandbox. + // Namespaces currently set by the kubelet: POD, NODE + NamespaceMode network = 1; +} + +// WindowsSandboxSecurityContext holds platform-specific configurations that will be +// applied to a sandbox. +// These settings will only apply to the sandbox container. 
+message WindowsSandboxSecurityContext { + // User name to run the container process as. If specified, the user MUST + // exist in the container image and be resolved there by the runtime; + // otherwise, the runtime MUST return error. + string run_as_username = 1; + + // The contents of the GMSA credential spec to use to run this container. + string credential_spec = 2; + + // Indicates whether the container requested to run as a HostProcess container. + bool host_process = 3; + + // Configuration for the sandbox's namespaces + WindowsNamespaceOption namespace_options = 4; +} + +// WindowsPodSandboxConfig holds platform-specific configurations for Windows +// host platforms and Windows-based containers. +message WindowsPodSandboxConfig { + // WindowsSandboxSecurityContext holds sandbox security attributes. + WindowsSandboxSecurityContext security_context = 1; +} + +// WindowsContainerSecurityContext holds windows security configuration that will be applied to a container. +message WindowsContainerSecurityContext { + // User name to run the container process as. If specified, the user MUST + // exist in the container image and be resolved there by the runtime; + // otherwise, the runtime MUST return error. + string run_as_username = 1; + + // The contents of the GMSA credential spec to use to run this container. + string credential_spec = 2; + + // Indicates whether a container is to be run as a HostProcess container. + bool host_process = 3; +} + +// WindowsContainerConfig contains platform-specific configuration for +// Windows-based containers. +message WindowsContainerConfig { + // Resources specification for the container. + WindowsContainerResources resources = 1; + // WindowsContainerSecurityContext configuration for the container. + WindowsContainerSecurityContext security_context = 2; +} + +// WindowsContainerResources specifies Windows specific configuration for +// resources. +message WindowsContainerResources { + // CPU shares (relative weight vs. 
other containers). Default: 0 (not specified). + int64 cpu_shares = 1; + // Number of CPUs available to the container. Default: 0 (not specified). + int64 cpu_count = 2; + // Specifies the portion of processor cycles that this container can use as a percentage times 100. + int64 cpu_maximum = 3; + // Memory limit in bytes. Default: 0 (not specified). + int64 memory_limit_in_bytes = 4; + // Specifies the size of the rootfs / scratch space in bytes to be configured for this container. Default: 0 (not specified). + int64 rootfs_size_in_bytes = 5; +} + +// ContainerMetadata holds all necessary information for building the container +// name. The container runtime is encouraged to expose the metadata in its user +// interface for better user experience. E.g., runtime can construct a unique +// container name based on the metadata. Note that (name, attempt) is unique +// within a sandbox for the entire lifetime of the sandbox. +message ContainerMetadata { + // Name of the container. Same as the container name in the PodSpec. + string name = 1; + // Attempt number of creating the container. Default: 0. + uint32 attempt = 2; +} + +// Device specifies a host device to mount into a container. +message Device { + // Path of the device within the container. + string container_path = 1; + // Path of the device on the host. + string host_path = 2; + // Cgroups permissions of the device, candidates are one or more of + // * r - allows container to read from the specified device. + // * w - allows container to write to the specified device. + // * m - allows container to create device files that do not yet exist. + string permissions = 3; +} + +// CDIDevice specifies a CDI device information. 
+message CDIDevice { + // Fully qualified CDI device name + // for example: vendor.com/gpu=gpudevice1 + // see more details in the CDI specification: + // https://github.com/container-orchestrated-devices/container-device-interface/blob/main/SPEC.md + string name = 1; +} + +// ContainerConfig holds all the required and optional fields for creating a +// container. +message ContainerConfig { + // Metadata of the container. This information will uniquely identify the + // container, and the runtime should leverage this to ensure correct + // operation. The runtime may also use this information to improve UX, such + // as by constructing a readable name. + ContainerMetadata metadata = 1 ; + // Image to use. + ImageSpec image = 2; + // Command to execute (i.e., entrypoint for docker) + repeated string command = 3; + // Args for the Command (i.e., command for docker) + repeated string args = 4; + // Current working directory of the command. + string working_dir = 5; + // List of environment variable to set in the container. + repeated KeyValue envs = 6; + // Mounts for the container. + repeated Mount mounts = 7; + // Devices for the container. + repeated Device devices = 8; + // Key-value pairs that may be used to scope and select individual resources. + // Label keys are of the form: + // label-key ::= prefixed-name | name + // prefixed-name ::= prefix '/' name + // prefix ::= DNS_SUBDOMAIN + // name ::= DNS_LABEL + map labels = 9; + // Unstructured key-value map that may be used by the kubelet to store and + // retrieve arbitrary metadata. + // + // Annotations MUST NOT be altered by the runtime; the annotations stored + // here MUST be returned in the ContainerStatus associated with the container + // this ContainerConfig creates. + // + // In general, in order to preserve a well-defined interface between the + // kubelet and the container runtime, annotations SHOULD NOT influence + // runtime behaviour. 
+ map annotations = 10; + // Path relative to PodSandboxConfig.LogDirectory for container to store + // the log (STDOUT and STDERR) on the host. + // E.g., + // PodSandboxConfig.LogDirectory = `/var/log/pods/__/` + // ContainerConfig.LogPath = `containerName/Instance#.log` + string log_path = 11; + + // Variables for interactive containers, these have very specialized + // use-cases (e.g. debugging). + bool stdin = 12; + bool stdin_once = 13; + bool tty = 14; + + // Configuration specific to Linux containers. + LinuxContainerConfig linux = 15; + // Configuration specific to Windows containers. + WindowsContainerConfig windows = 16; + + // CDI devices for the container. + repeated CDIDevice CDI_devices = 17; +} + +message CreateContainerRequest { + // ID of the PodSandbox in which the container should be created. + string pod_sandbox_id = 1; + // Config of the container. + ContainerConfig config = 2; + // Config of the PodSandbox. This is the same config that was passed + // to RunPodSandboxRequest to create the PodSandbox. It is passed again + // here just for easy reference. The PodSandboxConfig is immutable and + // remains the same throughout the lifetime of the pod. + PodSandboxConfig sandbox_config = 3; +} + +message CreateContainerResponse { + // ID of the created container. + string container_id = 1; +} + +message StartContainerRequest { + // ID of the container to start. + string container_id = 1; +} + +message StartContainerResponse {} + +message StopContainerRequest { + // ID of the container to stop. + string container_id = 1; + // Timeout in seconds to wait for the container to stop before forcibly + // terminating it. Default: 0 (forcibly terminate the container immediately) + int64 timeout = 2; +} + +message StopContainerResponse {} + +message RemoveContainerRequest { + // ID of the container to remove. 
+ string container_id = 1; +} + +message RemoveContainerResponse {} + +enum ContainerState { + CONTAINER_CREATED = 0; + CONTAINER_RUNNING = 1; + CONTAINER_EXITED = 2; + CONTAINER_UNKNOWN = 3; +} + +// ContainerStateValue is the wrapper of ContainerState. +message ContainerStateValue { + // State of the container. + ContainerState state = 1; +} + +// ContainerFilter is used to filter containers. +// All those fields are combined with 'AND' +message ContainerFilter { + // ID of the container. + string id = 1; + // State of the container. + ContainerStateValue state = 2; + // ID of the PodSandbox. + string pod_sandbox_id = 3; + // LabelSelector to select matches. + // Only api.MatchLabels is supported for now and the requirements + // are ANDed. MatchExpressions is not supported yet. + map label_selector = 4; +} + +message ListContainersRequest { + ContainerFilter filter = 1; +} + +// Container provides the runtime information for a container, such as ID, hash, +// state of the container. +message Container { + // ID of the container, used by the container runtime to identify + // a container. + string id = 1; + // ID of the sandbox to which this container belongs. + string pod_sandbox_id = 2; + // Metadata of the container. + ContainerMetadata metadata = 3; + // Spec of the image. + ImageSpec image = 4; + // Reference to the image in use. For most runtimes, this should be an + // image ID. + string image_ref = 5; + // State of the container. + ContainerState state = 6; + // Creation time of the container in nanoseconds. + int64 created_at = 7; + // Key-value pairs that may be used to scope and select individual resources. + map labels = 8; + // Unstructured key-value map holding arbitrary metadata. + // Annotations MUST NOT be altered by the runtime; the value of this field + // MUST be identical to that of the corresponding ContainerConfig used to + // instantiate this Container. + map annotations = 9; +} + +message ListContainersResponse { + // List of containers. 
+ repeated Container containers = 1; +} + +message ContainerStatusRequest { + // ID of the container for which to retrieve status. + string container_id = 1; + // Verbose indicates whether to return extra information about the container. + bool verbose = 2; +} + +// ContainerStatus represents the status of a container. +message ContainerStatus { + // ID of the container. + string id = 1; + // Metadata of the container. + ContainerMetadata metadata = 2; + // Status of the container. + ContainerState state = 3; + // Creation time of the container in nanoseconds. + int64 created_at = 4; + // Start time of the container in nanoseconds. Default: 0 (not specified). + int64 started_at = 5; + // Finish time of the container in nanoseconds. Default: 0 (not specified). + int64 finished_at = 6; + // Exit code of the container. Only required when finished_at != 0. Default: 0. + int32 exit_code = 7; + // Spec of the image. + ImageSpec image = 8; + // Reference to the image in use. For most runtimes, this should be an + // image ID + string image_ref = 9; + // Brief CamelCase string explaining why container is in its current state. + // Must be set to "OOMKilled" for containers terminated by cgroup-based Out-of-Memory killer. + string reason = 10; + // Human-readable message indicating details about why container is in its + // current state. + string message = 11; + // Key-value pairs that may be used to scope and select individual resources. + map labels = 12; + // Unstructured key-value map holding arbitrary metadata. + // Annotations MUST NOT be altered by the runtime; the value of this field + // MUST be identical to that of the corresponding ContainerConfig used to + // instantiate the Container this status represents. + map annotations = 13; + // Mounts for the container. + repeated Mount mounts = 14; + // Log path of container. + string log_path = 15; + // Resource limits configuration of the container. 
+ ContainerResources resources = 16; +} + +message ContainerStatusResponse { + // Status of the container. + ContainerStatus status = 1; + // Info is extra information of the Container. The key could be arbitrary string, and + // value should be in json format. The information could include anything useful for + // debug, e.g. pid for linux container based container runtime. + // It should only be returned non-empty when Verbose is true. + map info = 2; +} + +// ContainerResources holds resource limits configuration for a container. +message ContainerResources { + // Resource limits configuration specific to Linux container. + LinuxContainerResources linux = 1; + // Resource limits configuration specific to Windows container. + WindowsContainerResources windows = 2; +} + +message UpdateContainerResourcesRequest { + // ID of the container to update. + string container_id = 1; + // Resource configuration specific to Linux containers. + LinuxContainerResources linux = 2; + // Resource configuration specific to Windows containers. + WindowsContainerResources windows = 3; + // Unstructured key-value map holding arbitrary additional information for + // container resources updating. This can be used for specifying experimental + // resources to update or other options to use when updating the container. + map annotations = 4; +} + +message UpdateContainerResourcesResponse {} + +message ExecSyncRequest { + // ID of the container. + string container_id = 1; + // Command to execute. + repeated string cmd = 2; + // Timeout in seconds to stop the command. Default: 0 (run forever). + int64 timeout = 3; +} + +message ExecSyncResponse { + // Captured command stdout output. + // The runtime should cap the output of this response to 16MB. + // If the stdout of the command produces more than 16MB, the remaining output + // should be discarded, and the command should proceed with no error. + // See CVE-2022-1708 and CVE-2022-31030 for more information. 
+ bytes stdout = 1; + // Captured command stderr output. + // The runtime should cap the output of this response to 16MB. + // If the stderr of the command produces more than 16MB, the remaining output + // should be discarded, and the command should proceed with no error. + // See CVE-2022-1708 and CVE-2022-31030 for more information. + bytes stderr = 2; + // Exit code the command finished with. Default: 0 (success). + int32 exit_code = 3; +} + +message ExecRequest { + // ID of the container in which to execute the command. + string container_id = 1; + // Command to execute. + repeated string cmd = 2; + // Whether to exec the command in a TTY. + bool tty = 3; + // Whether to stream stdin. + // One of `stdin`, `stdout`, and `stderr` MUST be true. + bool stdin = 4; + // Whether to stream stdout. + // One of `stdin`, `stdout`, and `stderr` MUST be true. + bool stdout = 5; + // Whether to stream stderr. + // One of `stdin`, `stdout`, and `stderr` MUST be true. + // If `tty` is true, `stderr` MUST be false. Multiplexing is not supported + // in this case. The output of stdout and stderr will be combined to a + // single stream. + bool stderr = 6; +} + +message ExecResponse { + // Fully qualified URL of the exec streaming server. + string url = 1; +} + +message AttachRequest { + // ID of the container to which to attach. + string container_id = 1; + // Whether to stream stdin. + // One of `stdin`, `stdout`, and `stderr` MUST be true. + bool stdin = 2; + // Whether the process being attached is running in a TTY. + // This must match the TTY setting in the ContainerConfig. + bool tty = 3; + // Whether to stream stdout. + // One of `stdin`, `stdout`, and `stderr` MUST be true. + bool stdout = 4; + // Whether to stream stderr. + // One of `stdin`, `stdout`, and `stderr` MUST be true. + // If `tty` is true, `stderr` MUST be false. Multiplexing is not supported + // in this case. The output of stdout and stderr will be combined to a + // single stream. 
+ bool stderr = 5; +} + +message AttachResponse { + // Fully qualified URL of the attach streaming server. + string url = 1; +} + +message PortForwardRequest { + // ID of the container to which to forward the port. + string pod_sandbox_id = 1; + // Port to forward. + repeated int32 port = 2; +} + +message PortForwardResponse { + // Fully qualified URL of the port-forward streaming server. + string url = 1; +} + +message ImageFilter { + // Spec of the image. + ImageSpec image = 1; +} + +message ListImagesRequest { + // Filter to list images. + ImageFilter filter = 1; +} + +// Basic information about a container image. +message Image { + // ID of the image. + string id = 1; + // Other names by which this image is known. + repeated string repo_tags = 2; + // Digests by which this image is known. + repeated string repo_digests = 3; + // Size of the image in bytes. Must be > 0. + uint64 size = 4; + // UID that will run the command(s). This is used as a default if no user is + // specified when creating the container. UID and the following user name + // are mutually exclusive. + Int64Value uid = 5; + // User name that will run the command(s). This is used if UID is not set + // and no user is specified when creating container. + string username = 6; + // ImageSpec for image which includes annotations + ImageSpec spec = 7; + // Recommendation on whether this image should be exempt from garbage collection. + // It must only be treated as a recommendation -- the client can still request that the image be deleted, + // and the runtime must oblige. + bool pinned = 8; +} + +message ListImagesResponse { + // List of images. + repeated Image images = 1; +} + +message ImageStatusRequest { + // Spec of the image. + ImageSpec image = 1; + // Verbose indicates whether to return extra information about the image. + bool verbose = 2; +} + +message ImageStatusResponse { + // Status of the image. + Image image = 1; + // Info is extra information of the Image. 
The key could be arbitrary string, and + // value should be in json format. The information could include anything useful + // for debug, e.g. image config for oci image based container runtime. + // It should only be returned non-empty when Verbose is true. + map info = 2; +} + +// AuthConfig contains authorization information for connecting to a registry. +message AuthConfig { + string username = 1; + string password = 2; + string auth = 3; + string server_address = 4; + // IdentityToken is used to authenticate the user and get + // an access token for the registry. + string identity_token = 5; + // RegistryToken is a bearer token to be sent to a registry + string registry_token = 6; +} + +message PullImageRequest { + // Spec of the image. + ImageSpec image = 1; + // Authentication configuration for pulling the image. + AuthConfig auth = 2; + // Config of the PodSandbox, which is used to pull image in PodSandbox context. + PodSandboxConfig sandbox_config = 3; +} + +message PullImageResponse { + // Reference to the image in use. For most runtimes, this should be an + // image ID or digest. + string image_ref = 1; +} + +message RemoveImageRequest { + // Spec of the image to remove. + ImageSpec image = 1; +} + +message RemoveImageResponse {} + +message NetworkConfig { + // CIDR to use for pod IP addresses. If the CIDR is empty, runtimes + // should omit it. + string pod_cidr = 1; +} + +message RuntimeConfig { + NetworkConfig network_config = 1; +} + +message UpdateRuntimeConfigRequest { + RuntimeConfig runtime_config = 1; +} + +message UpdateRuntimeConfigResponse {} + +// RuntimeCondition contains condition information for the runtime. +// There are 2 kinds of runtime conditions: +// 1. Required conditions: Conditions are required for kubelet to work +// properly. If any required condition is unmet, the node will be not ready. +// The required conditions include: +// * RuntimeReady: RuntimeReady means the runtime is up and ready to accept +// basic containers e.g. 
container only needs host network. +// * NetworkReady: NetworkReady means the runtime network is up and ready to +// accept containers which require container network. +// 2. Optional conditions: Conditions are informative to the user, but kubelet +// will not rely on. Since condition type is an arbitrary string, all conditions +// not required are optional. These conditions will be exposed to users to help +// them understand the status of the system. +message RuntimeCondition { + // Type of runtime condition. + string type = 1; + // Status of the condition, one of true/false. Default: false. + bool status = 2; + // Brief CamelCase string containing reason for the condition's last transition. + string reason = 3; + // Human-readable message indicating details about last transition. + string message = 4; +} + +// RuntimeStatus is information about the current status of the runtime. +message RuntimeStatus { + // List of current observed runtime conditions. + repeated RuntimeCondition conditions = 1; +} + +message StatusRequest { + // Verbose indicates whether to return extra information about the runtime. + bool verbose = 1; +} + +message StatusResponse { + // Status of the Runtime. + RuntimeStatus status = 1; + // Info is extra information of the Runtime. The key could be arbitrary string, and + // value should be in json format. The information could include anything useful for + // debug, e.g. plugins used by the container runtime. + // It should only be returned non-empty when Verbose is true. + map info = 2; +} + +message ImageFsInfoRequest {} + +// UInt64Value is the wrapper of uint64. +message UInt64Value { + // The value. + uint64 value = 1; +} + +// FilesystemIdentifier uniquely identify the filesystem. +message FilesystemIdentifier{ + // Mountpoint of a filesystem. + string mountpoint = 1; +} + +// FilesystemUsage provides the filesystem usage information. +message FilesystemUsage { + // Timestamp in nanoseconds at which the information were collected. 
Must be > 0. + int64 timestamp = 1; + // The unique identifier of the filesystem. + FilesystemIdentifier fs_id = 2; + // UsedBytes represents the bytes used for images on the filesystem. + // This may differ from the total bytes used on the filesystem and may not + // equal CapacityBytes - AvailableBytes. + UInt64Value used_bytes = 3; + // InodesUsed represents the inodes used by the images. + // This may not equal InodesCapacity - InodesAvailable because the underlying + // filesystem may also be used for purposes other than storing images. + UInt64Value inodes_used = 4; +} + +// WindowsFilesystemUsage provides the filesystem usage information specific to Windows. +message WindowsFilesystemUsage { + // Timestamp in nanoseconds at which the information were collected. Must be > 0. + int64 timestamp = 1; + // The unique identifier of the filesystem. + FilesystemIdentifier fs_id = 2; + // UsedBytes represents the bytes used for images on the filesystem. + // This may differ from the total bytes used on the filesystem and may not + // equal CapacityBytes - AvailableBytes. + UInt64Value used_bytes = 3; +} + +message ImageFsInfoResponse { + // Information of image filesystem(s). + repeated FilesystemUsage image_filesystems = 1; + // Information of container filesystem(s). + // This is an optional field, may be used for example if container and image + // storage are separated. + // Default will be to return this as empty. + repeated FilesystemUsage container_filesystems = 2; +} + +message ContainerStatsRequest{ + // ID of the container for which to retrieve stats. + string container_id = 1; +} + +message ContainerStatsResponse { + // Stats of the container. + ContainerStats stats = 1; +} + +message ListContainerStatsRequest{ + // Filter for the list request. + ContainerStatsFilter filter = 1; +} + +// ContainerStatsFilter is used to filter containers. +// All those fields are combined with 'AND' +message ContainerStatsFilter { + // ID of the container. 
+ string id = 1; + // ID of the PodSandbox. + string pod_sandbox_id = 2; + // LabelSelector to select matches. + // Only api.MatchLabels is supported for now and the requirements + // are ANDed. MatchExpressions is not supported yet. + map label_selector = 3; +} + +message ListContainerStatsResponse { + // Stats of the container. + repeated ContainerStats stats = 1; +} + +// ContainerAttributes provides basic information of the container. +message ContainerAttributes { + // ID of the container. + string id = 1; + // Metadata of the container. + ContainerMetadata metadata = 2; + // Key-value pairs that may be used to scope and select individual resources. + map labels = 3; + // Unstructured key-value map holding arbitrary metadata. + // Annotations MUST NOT be altered by the runtime; the value of this field + // MUST be identical to that of the corresponding ContainerConfig used to + // instantiate the Container this status represents. + map annotations = 4; +} + +// ContainerStats provides the resource usage statistics for a container. +message ContainerStats { + // Information of the container. + ContainerAttributes attributes = 1; + // CPU usage gathered from the container. + CpuUsage cpu = 2; + // Memory usage gathered from the container. + MemoryUsage memory = 3; + // Usage of the writable layer. + FilesystemUsage writable_layer = 4; + // Swap usage gathered from the container. + SwapUsage swap = 5; +} + +// WindowsContainerStats provides the resource usage statistics for a container specific for Windows +message WindowsContainerStats { + // Information of the container. + ContainerAttributes attributes = 1; + // CPU usage gathered from the container. + WindowsCpuUsage cpu = 2; + // Memory usage gathered from the container. + WindowsMemoryUsage memory = 3; + // Usage of the writable layer. + WindowsFilesystemUsage writable_layer = 4; +} + +// CpuUsage provides the CPU usage information. 
+message CpuUsage { + // Timestamp in nanoseconds at which the information were collected. Must be > 0. + int64 timestamp = 1; + // Cumulative CPU usage (sum across all cores) since object creation. + UInt64Value usage_core_nano_seconds = 2; + // Total CPU usage (sum of all cores) averaged over the sample window. + // The "core" unit can be interpreted as CPU core-nanoseconds per second. + UInt64Value usage_nano_cores = 3; +} + +// WindowsCpuUsage provides the CPU usage information specific to Windows +message WindowsCpuUsage { + // Timestamp in nanoseconds at which the information were collected. Must be > 0. + int64 timestamp = 1; + // Cumulative CPU usage (sum across all cores) since object creation. + UInt64Value usage_core_nano_seconds = 2; + // Total CPU usage (sum of all cores) averaged over the sample window. + // The "core" unit can be interpreted as CPU core-nanoseconds per second. + UInt64Value usage_nano_cores = 3; +} + +// MemoryUsage provides the memory usage information. +message MemoryUsage { + // Timestamp in nanoseconds at which the information were collected. Must be > 0. + int64 timestamp = 1; + // The amount of working set memory in bytes. + UInt64Value working_set_bytes = 2; + // Available memory for use. This is defined as the memory limit - workingSetBytes. + UInt64Value available_bytes = 3; + // Total memory in use. This includes all memory regardless of when it was accessed. + UInt64Value usage_bytes = 4; + // The amount of anonymous and swap cache memory (includes transparent hugepages). + UInt64Value rss_bytes = 5; + // Cumulative number of minor page faults. + UInt64Value page_faults = 6; + // Cumulative number of major page faults. + UInt64Value major_page_faults = 7; +} + +message SwapUsage { + // Timestamp in nanoseconds at which the information were collected. Must be > 0. + int64 timestamp = 1; + // Available swap for use. This is defined as the swap limit - swapUsageBytes. 
+ UInt64Value swap_available_bytes = 2; + // Total memory in use. This includes all memory regardless of when it was accessed. + UInt64Value swap_usage_bytes = 3; +} + +// WindowsMemoryUsage provides the memory usage information specific to Windows +message WindowsMemoryUsage { + // Timestamp in nanoseconds at which the information were collected. Must be > 0. + int64 timestamp = 1; + // The amount of working set memory in bytes. + UInt64Value working_set_bytes = 2; + // Available memory for use. This is defined as the memory limit - commit_memory_bytes. + UInt64Value available_bytes = 3; + // Cumulative number of page faults. + UInt64Value page_faults = 4; + // Total commit memory in use. Commit memory is total of physical and virtual memory in use. + UInt64Value commit_memory_bytes = 5; +} + +message ReopenContainerLogRequest { + // ID of the container for which to reopen the log. + string container_id = 1; +} + +message ReopenContainerLogResponse{ +} + +message CheckpointContainerRequest { + // ID of the container to be checkpointed. + string container_id = 1; + // Location of the checkpoint archive used for export + string location = 2; + // Timeout in seconds for the checkpoint to complete. + // Timeout of zero means to use the CRI default. + // Timeout > 0 means to use the user specified timeout. 
+ int64 timeout = 3; +} + +message CheckpointContainerResponse {} + +message GetEventsRequest {} + +message ContainerEventResponse { + // ID of the container + string container_id = 1; + + // Type of the container event + ContainerEventType container_event_type = 2; + + // Creation timestamp of this event + int64 created_at = 3; + + // Sandbox status + PodSandboxStatus pod_sandbox_status = 4; + + // Container statuses + repeated ContainerStatus containers_statuses = 5; +} + +enum ContainerEventType { + // Container created + CONTAINER_CREATED_EVENT = 0; + + // Container started + CONTAINER_STARTED_EVENT = 1; + + // Container stopped + CONTAINER_STOPPED_EVENT = 2; + + // Container deleted + CONTAINER_DELETED_EVENT = 3; +} + +message ListMetricDescriptorsRequest {} + +message ListMetricDescriptorsResponse { + repeated MetricDescriptor descriptors = 1; +} + +message MetricDescriptor { + // The name field will be used as a unique identifier of this MetricDescriptor, + // and be used in conjunction with the Metric structure to populate the full Metric. + string name = 1; + string help = 2; + // When a metric uses this metric descriptor, it should only define + // labels that have previously been declared in label_keys. + // It is the responsibility of the runtime to correctly keep sorted the keys and values. + // If the two slices have different length, the behavior is undefined. + repeated string label_keys = 3; +} + +message ListPodSandboxMetricsRequest {} + +message ListPodSandboxMetricsResponse { + repeated PodSandboxMetrics pod_metrics = 1; +} + +message PodSandboxMetrics { + string pod_sandbox_id = 1; + repeated Metric metrics = 2; + repeated ContainerMetrics container_metrics = 3; +} + +message ContainerMetrics { + string container_id = 1; + repeated Metric metrics = 2; +} + +message Metric { + // Name must match a name previously returned in a MetricDescriptors call, + // otherwise, it will be ignored. 
+ string name = 1; + // Timestamp should be 0 if the metric was gathered live. + // If it was cached, the Timestamp should reflect the time it was collected. + int64 timestamp = 2; + MetricType metric_type = 3; + // The corresponding LabelValues to the LabelKeys defined in the MetricDescriptor. + // It is the responsibility of the runtime to correctly keep sorted the keys and values. + // If the two slices have different length, the behavior is undefined. + repeated string label_values = 4; + UInt64Value value = 5; +} + +enum MetricType { + COUNTER = 0; + GAUGE = 1; +} + +message RuntimeConfigRequest {} + +message RuntimeConfigResponse { + // Configuration information for Linux-based runtimes. This field contains + // global runtime configuration options that are not specific to runtime + // handlers. + LinuxRuntimeConfiguration linux = 1; +} + +message LinuxRuntimeConfiguration { + // Cgroup driver to use + // Note: this field should not change for the lifecycle of the Kubelet, + // or while there are running containers. + // The Kubelet will not re-request this after startup, and will construct the cgroup + // hierarchy assuming it is static. + // If the runtime wishes to change this value, it must be accompanied by removal of + // all pods, and a restart of the Kubelet. The easiest way to do this is with a full node reboot. 
+ CgroupDriver cgroup_driver = 1; +} + +enum CgroupDriver { + SYSTEMD = 0; + CGROUPFS = 1; +} diff --git a/src/StellaOps.Zastava.Observer/StellaOps.Zastava.Observer.csproj b/src/StellaOps.Zastava.Observer/StellaOps.Zastava.Observer.csproj new file mode 100644 index 00000000..a2a5d348 --- /dev/null +++ b/src/StellaOps.Zastava.Observer/StellaOps.Zastava.Observer.csproj @@ -0,0 +1,24 @@ + + + Exe + net10.0 + preview + enable + enable + true + + + + + + All + + + + + + + + + + diff --git a/src/StellaOps.Zastava.Observer/TASKS.md b/src/StellaOps.Zastava.Observer/TASKS.md index 99c80e7c..164a17b4 100644 --- a/src/StellaOps.Zastava.Observer/TASKS.md +++ b/src/StellaOps.Zastava.Observer/TASKS.md @@ -2,7 +2,7 @@ | ID | Status | Owner(s) | Depends on | Description | Exit Criteria | |----|--------|----------|------------|-------------|---------------| -| ZASTAVA-OBS-12-001 | TODO | Zastava Observer Guild | ZASTAVA-CORE-12-201 | Build container lifecycle watcher that tails CRI (containerd/cri-o/docker) events and emits deterministic runtime records with buffering + backoff. | Fixture cluster produces start/stop events with stable ordering, jitter/backoff tested, metrics/logging wired. | +| ZASTAVA-OBS-12-001 | DOING | Zastava Observer Guild | ZASTAVA-CORE-12-201 | Build container lifecycle watcher that tails CRI (containerd/cri-o/docker) events and emits deterministic runtime records with buffering + backoff. | Fixture cluster produces start/stop events with stable ordering, jitter/backoff tested, metrics/logging wired. | | ZASTAVA-OBS-12-002 | TODO | Zastava Observer Guild | ZASTAVA-OBS-12-001 | Capture entrypoint traces and loaded libraries, hashing binaries and correlating to SBOM baseline per architecture sections 2.1 and 10. | EntryTrace parser covers shell/python/node launchers, loaded library hashes recorded, fixtures assert linkage to SBOM usage view. 
| | ZASTAVA-OBS-12-003 | TODO | Zastava Observer Guild | ZASTAVA-OBS-12-002 | Implement runtime posture checks (signature/SBOM/attestation presence) with offline caching and warning surfaces. | Observer marks posture status, caches refresh across restarts, integration tests prove offline tolerance. | | ZASTAVA-OBS-12-004 | TODO | Zastava Observer Guild | ZASTAVA-OBS-12-002 | Batch `/runtime/events` submissions with disk-backed buffer, rate limits, and deterministic envelopes. | Buffered submissions survive restart, rate-limits enforced in tests, JSON envelopes match schema in docs/events. | diff --git a/src/StellaOps.Zastava.Observer/Worker/ObserverBootstrapService.cs b/src/StellaOps.Zastava.Observer/Worker/ObserverBootstrapService.cs new file mode 100644 index 00000000..86d81b71 --- /dev/null +++ b/src/StellaOps.Zastava.Observer/Worker/ObserverBootstrapService.cs @@ -0,0 +1,51 @@ +using Microsoft.Extensions.Hosting; +using Microsoft.Extensions.Logging; +using Microsoft.Extensions.Options; +using StellaOps.Zastava.Core.Configuration; +using StellaOps.Zastava.Core.Diagnostics; +using StellaOps.Zastava.Core.Security; + +namespace StellaOps.Zastava.Observer.Worker; + +/// +/// Minimal bootstrap worker ensuring runtime core wiring is exercised. 
+/// +internal sealed class ObserverBootstrapService : BackgroundService +{ + private readonly IZastavaLogScopeBuilder logScopeBuilder; + private readonly IZastavaRuntimeMetrics runtimeMetrics; + private readonly IZastavaAuthorityTokenProvider authorityTokenProvider; + private readonly IHostApplicationLifetime applicationLifetime; + private readonly ILogger logger; + private readonly ZastavaRuntimeOptions runtimeOptions; + + public ObserverBootstrapService( + IZastavaLogScopeBuilder logScopeBuilder, + IZastavaRuntimeMetrics runtimeMetrics, + IZastavaAuthorityTokenProvider authorityTokenProvider, + IOptions runtimeOptions, + IHostApplicationLifetime applicationLifetime, + ILogger logger) + { + this.logScopeBuilder = logScopeBuilder; + this.runtimeMetrics = runtimeMetrics; + this.authorityTokenProvider = authorityTokenProvider; + this.applicationLifetime = applicationLifetime; + this.logger = logger; + this.runtimeOptions = runtimeOptions.Value; + } + + protected override Task ExecuteAsync(CancellationToken stoppingToken) + { + var scope = logScopeBuilder.BuildScope(eventId: "observer.bootstrap"); + using (logger.BeginScope(scope)) + { + logger.LogInformation("Zastava observer runtime core initialised for tenant {Tenant}, component {Component}.", runtimeOptions.Tenant, runtimeOptions.Component); + logger.LogDebug("Observer metrics meter {MeterName} registered with {TagCount} default tags.", runtimeMetrics.Meter.Name, runtimeMetrics.DefaultTags.Count); + } + + // Observer implementation will hook into the authority token provider when connectors arrive. 
+ applicationLifetime.ApplicationStarted.Register(() => logger.LogInformation("Observer bootstrap complete.")); + return Task.CompletedTask; + } +} diff --git a/src/StellaOps.Zastava.Webhook.Tests/Backend/RuntimePolicyClientTests.cs b/src/StellaOps.Zastava.Webhook.Tests/Backend/RuntimePolicyClientTests.cs new file mode 100644 index 00000000..7f63fec9 --- /dev/null +++ b/src/StellaOps.Zastava.Webhook.Tests/Backend/RuntimePolicyClientTests.cs @@ -0,0 +1,198 @@ +using System; +using System.Collections.Generic; +using System.Diagnostics.Metrics; +using System.Net; +using System.Net.Http; +using System.Text; +using System.Text.Json; +using Xunit; +using Microsoft.Extensions.Logging.Abstractions; +using Microsoft.Extensions.Options; +using StellaOps.Zastava.Core.Configuration; +using StellaOps.Zastava.Core.Diagnostics; +using StellaOps.Zastava.Core.Security; +using StellaOps.Zastava.Webhook.Backend; +using StellaOps.Zastava.Webhook.Configuration; + +namespace StellaOps.Zastava.Webhook.Tests.Backend; + +public sealed class RuntimePolicyClientTests +{ + [Fact] + public async Task EvaluateAsync_SendsDpOpHeaderAndParsesResponse() + { + var requestCapture = new List(); + var handler = new StubHttpMessageHandler(message => + { + requestCapture.Add(message); + var response = new HttpResponseMessage(HttpStatusCode.OK) + { + Content = new StringContent(JsonSerializer.Serialize(new + { + ttlSeconds = 120, + results = new + { + image = new + { + signed = true, + hasSbom = true, + policyVerdict = "pass", + reasons = Array.Empty() + } + } + }), Encoding.UTF8, "application/json") + }; + return response; + }); + + var httpClient = new HttpClient(handler) + { + BaseAddress = new Uri("https://scanner.internal") + }; + + var runtimeOptions = Options.Create(new ZastavaRuntimeOptions + { + Tenant = "tenant-1", + Environment = "test", + Component = "webhook", + Authority = new ZastavaAuthorityOptions + { + Audience = new[] { "scanner" }, + Scopes = new[] { "aud:scanner" } + }, + Logging = 
new ZastavaRuntimeLoggingOptions(), + Metrics = new ZastavaRuntimeMetricsOptions() + }); + + var webhookOptions = Options.Create(new ZastavaWebhookOptions + { + Backend = new ZastavaWebhookBackendOptions + { + BaseAddress = new Uri("https://scanner.internal"), + PolicyPath = "/api/v1/scanner/policy/runtime" + } + }); + + using var metrics = new StubRuntimeMetrics(); + var client = new RuntimePolicyClient( + httpClient, + new StubAuthorityTokenProvider(), + new StaticOptionsMonitor(runtimeOptions.Value), + new StaticOptionsMonitor(webhookOptions.Value), + metrics, + NullLogger.Instance); + + var response = await client.EvaluateAsync(new RuntimePolicyRequest + { + Namespace = "payments", + Labels = new Dictionary { ["app"] = "api" }, + Images = new[] { "image" } + }); + + Assert.Equal(120, response.TtlSeconds); + Assert.True(response.Results.ContainsKey("image")); + var request = Assert.Single(requestCapture); + Assert.Equal("DPoP", request.Headers.Authorization?.Scheme); + Assert.Equal("runtime-token", request.Headers.Authorization?.Parameter); + Assert.Equal("/api/v1/scanner/policy/runtime", request.RequestUri?.PathAndQuery); + } + + [Fact] + public async Task EvaluateAsync_NonSuccess_ThrowsRuntimePolicyException() + { + var handler = new StubHttpMessageHandler(_ => new HttpResponseMessage(HttpStatusCode.BadGateway) + { + Content = new StringContent("upstream error") + }); + var client = new RuntimePolicyClient( + new HttpClient(handler) { BaseAddress = new Uri("https://scanner.internal") }, + new StubAuthorityTokenProvider(), + new StaticOptionsMonitor(new ZastavaRuntimeOptions + { + Tenant = "tenant", + Environment = "test", + Component = "webhook", + Authority = new ZastavaAuthorityOptions { Audience = new[] { "scanner" } }, + Logging = new ZastavaRuntimeLoggingOptions(), + Metrics = new ZastavaRuntimeMetricsOptions() + }), + new StaticOptionsMonitor(new ZastavaWebhookOptions()), + new StubRuntimeMetrics(), + NullLogger.Instance); + + await Assert.ThrowsAsync(() 
=> client.EvaluateAsync(new RuntimePolicyRequest + { + Namespace = "payments", + Labels = null, + Images = new[] { "image" } + })); + } + + private sealed class StubAuthorityTokenProvider : IZastavaAuthorityTokenProvider + { + public ValueTask InvalidateAsync(string audience, IEnumerable? additionalScopes = null, CancellationToken cancellationToken = default) + => ValueTask.CompletedTask; + + public ValueTask GetAsync(string audience, IEnumerable? additionalScopes = null, CancellationToken cancellationToken = default) + => ValueTask.FromResult(new ZastavaOperationalToken("runtime-token", "DPoP", DateTimeOffset.UtcNow.AddMinutes(5), Array.Empty())); + } + + private sealed class StubRuntimeMetrics : IZastavaRuntimeMetrics + { + public StubRuntimeMetrics() + { + Meter = new Meter("Test.Zastava.Webhook"); + RuntimeEvents = Meter.CreateCounter("test.events"); + AdmissionDecisions = Meter.CreateCounter("test.decisions"); + BackendLatencyMs = Meter.CreateHistogram("test.backend.latency"); + DefaultTags = Array.Empty>(); + } + + public Meter Meter { get; } + + public Counter RuntimeEvents { get; } + + public Counter AdmissionDecisions { get; } + + public Histogram BackendLatencyMs { get; } + + public IReadOnlyList> DefaultTags { get; } + + public void Dispose() => Meter.Dispose(); + } + + private sealed class StaticOptionsMonitor : IOptionsMonitor + { + public StaticOptionsMonitor(T value) + { + CurrentValue = value; + } + + public T CurrentValue { get; } + + public T Get(string? 
name) => CurrentValue; + + public IDisposable OnChange(Action listener) => NullDisposable.Instance; + + private sealed class NullDisposable : IDisposable + { + public static readonly NullDisposable Instance = new(); + public void Dispose() + { + } + } + } + + private sealed class StubHttpMessageHandler : HttpMessageHandler + { + private readonly Func responder; + + public StubHttpMessageHandler(Func responder) + { + this.responder = responder; + } + + protected override Task SendAsync(HttpRequestMessage request, CancellationToken cancellationToken) + => Task.FromResult(responder(request)); + } +} diff --git a/src/StellaOps.Zastava.Webhook.Tests/Certificates/SecretFileCertificateSourceTests.cs b/src/StellaOps.Zastava.Webhook.Tests/Certificates/SecretFileCertificateSourceTests.cs index 8380491a..257e3766 100644 --- a/src/StellaOps.Zastava.Webhook.Tests/Certificates/SecretFileCertificateSourceTests.cs +++ b/src/StellaOps.Zastava.Webhook.Tests/Certificates/SecretFileCertificateSourceTests.cs @@ -14,8 +14,7 @@ public sealed class SecretFileCertificateSourceTests { using var rsa = RSA.Create(2048); var request = new CertificateRequest("CN=zastava-webhook", rsa, HashAlgorithmName.SHA256, RSASignaturePadding.Pkcs1); - using var certificate = request.CreateSelfSigned(DateTimeOffset.UtcNow.AddMinutes(-5), DateTimeOffset.UtcNow.AddHours(1)); - using var certificateWithKey = certificate.CopyWithPrivateKey(rsa); + using var certificateWithKey = request.CreateSelfSigned(DateTimeOffset.UtcNow.AddMinutes(-5), DateTimeOffset.UtcNow.AddHours(1)); var certificatePath = Path.GetTempFileName(); var privateKeyPath = Path.GetTempFileName(); @@ -23,7 +22,7 @@ public sealed class SecretFileCertificateSourceTests try { File.WriteAllText(certificatePath, certificateWithKey.ExportCertificatePem()); - using var exportRsa = certificateWithKey.GetRSAPrivateKey() ?? throw new InvalidOperationException("Missing RSA private key"); + using var exportRsa = certificateWithKey.GetRSAPrivateKey() ?? 
throw new InvalidOperationException("Missing RSA private key"); var privateKeyPem = PemEncoding.Write("PRIVATE KEY", exportRsa.ExportPkcs8PrivateKey()); File.WriteAllText(privateKeyPath, privateKeyPem); @@ -52,8 +51,7 @@ public sealed class SecretFileCertificateSourceTests { using var rsa = RSA.Create(2048); var request = new CertificateRequest("CN=zastava-webhook", rsa, HashAlgorithmName.SHA256, RSASignaturePadding.Pkcs1); - using var certificate = request.CreateSelfSigned(DateTimeOffset.UtcNow.AddMinutes(-5), DateTimeOffset.UtcNow.AddHours(1)); - using var certificateWithKey = certificate.CopyWithPrivateKey(rsa); + using var certificateWithKey = request.CreateSelfSigned(DateTimeOffset.UtcNow.AddMinutes(-5), DateTimeOffset.UtcNow.AddHours(1)); var pfxPath = Path.GetTempFileName(); try diff --git a/src/StellaOps.Zastava.Webhook/Authority/AuthorityTokenProvider.cs b/src/StellaOps.Zastava.Webhook/Authority/AuthorityTokenProvider.cs index 585e8f48..134ec0df 100644 --- a/src/StellaOps.Zastava.Webhook/Authority/AuthorityTokenProvider.cs +++ b/src/StellaOps.Zastava.Webhook/Authority/AuthorityTokenProvider.cs @@ -1,93 +1,51 @@ -using Microsoft.Extensions.Diagnostics.HealthChecks; -using Microsoft.Extensions.Options; -using StellaOps.Zastava.Webhook.Configuration; - -namespace StellaOps.Zastava.Webhook.Authority; - -public interface IAuthorityTokenProvider -{ - ValueTask GetTokenAsync(CancellationToken cancellationToken = default); -} - -public sealed record AuthorityToken(string Value, DateTimeOffset? ExpiresAtUtc); - -public sealed class StaticAuthorityTokenProvider : IAuthorityTokenProvider -{ - private readonly ZastavaWebhookAuthorityOptions _options; - private readonly ILogger _logger; - private AuthorityToken? 
_cachedToken; - - public StaticAuthorityTokenProvider( - IOptionsMonitor options, - ILogger logger) - { - _options = options.CurrentValue.Authority; - _logger = logger; - } - - public ValueTask GetTokenAsync(CancellationToken cancellationToken = default) - { - if (_cachedToken is { } token) - { - return ValueTask.FromResult(token); - } - - var value = !string.IsNullOrWhiteSpace(_options.StaticTokenValue) - ? _options.StaticTokenValue - : LoadTokenFromFile(_options.StaticTokenPath); - - if (string.IsNullOrWhiteSpace(value)) - { - throw new InvalidOperationException("No Authority token configured. Provide either 'StaticTokenValue' or 'StaticTokenPath'."); - } - - token = new AuthorityToken(value.Trim(), ExpiresAtUtc: null); - _cachedToken = token; - _logger.LogInformation("Loaded static Authority token (length {Length}).", token.Value.Length); - return ValueTask.FromResult(token); - } - - private string LoadTokenFromFile(string? path) - { - if (string.IsNullOrWhiteSpace(path)) - { - throw new InvalidOperationException("Authority static token path not set."); - } - - if (!File.Exists(path)) - { - throw new FileNotFoundException("Authority static token file not found.", path); - } - - return File.ReadAllText(path); - } -} - -public sealed class AuthorityTokenHealthCheck : IHealthCheck -{ - private readonly IAuthorityTokenProvider _tokenProvider; - private readonly ILogger _logger; - - public AuthorityTokenHealthCheck(IAuthorityTokenProvider tokenProvider, ILogger logger) - { - _tokenProvider = tokenProvider; - _logger = logger; - } - - public async Task CheckHealthAsync(HealthCheckContext context, CancellationToken cancellationToken = default) - { - try - { - var token = await _tokenProvider.GetTokenAsync(cancellationToken); - return HealthCheckResult.Healthy("Authority token acquired.", data: new Dictionary - { - ["expiresAtUtc"] = token.ExpiresAtUtc?.ToString("O") ?? 
"static" - }); - } - catch (Exception ex) - { - _logger.LogError(ex, "Failed to obtain Authority token."); - return HealthCheckResult.Unhealthy("Failed to obtain Authority token.", ex); - } - } -} +using System; +using System.Collections.Generic; +using System.Linq; +using Microsoft.Extensions.Diagnostics.HealthChecks; +using Microsoft.Extensions.Options; +using Microsoft.Extensions.Logging; +using StellaOps.Zastava.Core.Configuration; +using StellaOps.Zastava.Core.Security; + +namespace StellaOps.Zastava.Webhook.Authority; + +public sealed class AuthorityTokenHealthCheck : IHealthCheck +{ + private readonly IZastavaAuthorityTokenProvider authorityTokenProvider; + private readonly IOptionsMonitor runtimeOptions; + private readonly ILogger logger; + + public AuthorityTokenHealthCheck( + IZastavaAuthorityTokenProvider authorityTokenProvider, + IOptionsMonitor runtimeOptions, + ILogger logger) + { + this.authorityTokenProvider = authorityTokenProvider ?? throw new ArgumentNullException(nameof(authorityTokenProvider)); + this.runtimeOptions = runtimeOptions ?? throw new ArgumentNullException(nameof(runtimeOptions)); + this.logger = logger ?? throw new ArgumentNullException(nameof(logger)); + } + + public async Task CheckHealthAsync(HealthCheckContext context, CancellationToken cancellationToken = default) + { + try + { + var runtime = runtimeOptions.CurrentValue; + var authority = runtime.Authority; + var audience = authority.Audience.FirstOrDefault() ?? "scanner"; + var token = await authorityTokenProvider.GetAsync(audience, authority.Scopes ?? Array.Empty(), cancellationToken); + + return HealthCheckResult.Healthy( + "Authority token acquired.", + data: new Dictionary + { + ["expiresAtUtc"] = token.ExpiresAtUtc?.ToString("O") ?? 
"static", + ["tokenType"] = token.TokenType + }); + } + catch (Exception ex) + { + logger.LogError(ex, "Failed to obtain Authority token via runtime core."); + return HealthCheckResult.Unhealthy("Failed to obtain Authority token via runtime core.", ex); + } + } +} diff --git a/src/StellaOps.Zastava.Webhook/Backend/IRuntimePolicyClient.cs b/src/StellaOps.Zastava.Webhook/Backend/IRuntimePolicyClient.cs new file mode 100644 index 00000000..6388fd8b --- /dev/null +++ b/src/StellaOps.Zastava.Webhook/Backend/IRuntimePolicyClient.cs @@ -0,0 +1,9 @@ +using System.Threading; +using System.Threading.Tasks; + +namespace StellaOps.Zastava.Webhook.Backend; + +public interface IRuntimePolicyClient +{ + Task EvaluateAsync(RuntimePolicyRequest request, CancellationToken cancellationToken = default); +} diff --git a/src/StellaOps.Zastava.Webhook/Backend/RuntimePolicyClient.cs b/src/StellaOps.Zastava.Webhook/Backend/RuntimePolicyClient.cs new file mode 100644 index 00000000..14c2d810 --- /dev/null +++ b/src/StellaOps.Zastava.Webhook/Backend/RuntimePolicyClient.cs @@ -0,0 +1,115 @@ +using System; +using System.Collections.Generic; +using System.Diagnostics; +using System.Linq; +using System.Net.Http; +using System.Net.Http.Headers; +using System.Text; +using System.Text.Json; +using System.Text.Json.Serialization; +using System.Threading; +using System.Threading.Tasks; +using Microsoft.Extensions.Logging; +using Microsoft.Extensions.Options; +using StellaOps.Zastava.Core.Configuration; +using StellaOps.Zastava.Core.Diagnostics; +using StellaOps.Zastava.Core.Security; +using StellaOps.Zastava.Webhook.Configuration; + +namespace StellaOps.Zastava.Webhook.Backend; + +internal sealed class RuntimePolicyClient : IRuntimePolicyClient +{ + private static readonly JsonSerializerOptions SerializerOptions = new() + { + PropertyNamingPolicy = JsonNamingPolicy.CamelCase, + DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull + }; + + static RuntimePolicyClient() + { + 
SerializerOptions.Converters.Add(new JsonStringEnumConverter(JsonNamingPolicy.CamelCase, allowIntegerValues: false)); + } + + private readonly HttpClient httpClient; + private readonly IZastavaAuthorityTokenProvider authorityTokenProvider; + private readonly IOptionsMonitor runtimeOptions; + private readonly IOptionsMonitor webhookOptions; + private readonly IZastavaRuntimeMetrics runtimeMetrics; + private readonly ILogger logger; + + public RuntimePolicyClient( + HttpClient httpClient, + IZastavaAuthorityTokenProvider authorityTokenProvider, + IOptionsMonitor runtimeOptions, + IOptionsMonitor webhookOptions, + IZastavaRuntimeMetrics runtimeMetrics, + ILogger logger) + { + this.httpClient = httpClient ?? throw new ArgumentNullException(nameof(httpClient)); + this.authorityTokenProvider = authorityTokenProvider ?? throw new ArgumentNullException(nameof(authorityTokenProvider)); + this.runtimeOptions = runtimeOptions ?? throw new ArgumentNullException(nameof(runtimeOptions)); + this.webhookOptions = webhookOptions ?? throw new ArgumentNullException(nameof(webhookOptions)); + this.runtimeMetrics = runtimeMetrics ?? throw new ArgumentNullException(nameof(runtimeMetrics)); + this.logger = logger ?? throw new ArgumentNullException(nameof(logger)); + } + + public async Task EvaluateAsync(RuntimePolicyRequest request, CancellationToken cancellationToken = default) + { + ArgumentNullException.ThrowIfNull(request); + + var runtime = runtimeOptions.CurrentValue; + var authority = runtime.Authority; + var audience = authority.Audience.FirstOrDefault() ?? "scanner"; + var token = await authorityTokenProvider.GetAsync(audience, authority.Scopes ?? 
Array.Empty(), cancellationToken).ConfigureAwait(false); + + var backend = webhookOptions.CurrentValue.Backend; + using var httpRequest = new HttpRequestMessage(HttpMethod.Post, backend.PolicyPath) + { + Content = new StringContent(JsonSerializer.Serialize(request, SerializerOptions), Encoding.UTF8, "application/json") + }; + + httpRequest.Headers.Accept.Add(new MediaTypeWithQualityHeaderValue("application/json")); + httpRequest.Headers.Authorization = CreateAuthorizationHeader(token); + + var stopwatch = Stopwatch.StartNew(); + try + { + using var response = await httpClient.SendAsync(httpRequest, cancellationToken).ConfigureAwait(false); + var payload = await response.Content.ReadAsStringAsync(cancellationToken).ConfigureAwait(false); + + if (!response.IsSuccessStatusCode) + { + logger.LogWarning("Runtime policy call returned {StatusCode}: {Payload}", (int)response.StatusCode, payload); + throw new RuntimePolicyException($"Runtime policy call failed with status {(int)response.StatusCode}", response.StatusCode); + } + + var result = JsonSerializer.Deserialize(payload, SerializerOptions); + if (result is null) + { + throw new RuntimePolicyException("Runtime policy response payload was empty or invalid.", response.StatusCode); + } + + return result; + } + finally + { + stopwatch.Stop(); + RecordLatency(stopwatch.Elapsed.TotalMilliseconds); + } + } + + private AuthenticationHeaderValue CreateAuthorizationHeader(ZastavaOperationalToken token) + { + var scheme = string.Equals(token.TokenType, "dpop", StringComparison.OrdinalIgnoreCase) ? 
"DPoP" : token.TokenType; + return new AuthenticationHeaderValue(scheme, token.AccessToken); + } + + private void RecordLatency(double elapsedMs) + { + var tags = runtimeMetrics.DefaultTags + .Concat(new[] { new KeyValuePair("endpoint", "policy") }) + .ToArray(); + runtimeMetrics.BackendLatencyMs.Record(elapsedMs, tags); + } +} diff --git a/src/StellaOps.Zastava.Webhook/Backend/RuntimePolicyException.cs b/src/StellaOps.Zastava.Webhook/Backend/RuntimePolicyException.cs new file mode 100644 index 00000000..d756a454 --- /dev/null +++ b/src/StellaOps.Zastava.Webhook/Backend/RuntimePolicyException.cs @@ -0,0 +1,21 @@ +using System; +using System.Net; + +namespace StellaOps.Zastava.Webhook.Backend; + +public sealed class RuntimePolicyException : Exception +{ + public RuntimePolicyException(string message, HttpStatusCode statusCode) + : base(message) + { + StatusCode = statusCode; + } + + public RuntimePolicyException(string message, HttpStatusCode statusCode, Exception innerException) + : base(message, innerException) + { + StatusCode = statusCode; + } + + public HttpStatusCode StatusCode { get; } +} diff --git a/src/StellaOps.Zastava.Webhook/Backend/RuntimePolicyRequest.cs b/src/StellaOps.Zastava.Webhook/Backend/RuntimePolicyRequest.cs new file mode 100644 index 00000000..f043625a --- /dev/null +++ b/src/StellaOps.Zastava.Webhook/Backend/RuntimePolicyRequest.cs @@ -0,0 +1,16 @@ +using System.Collections.Generic; +using System.Text.Json.Serialization; + +namespace StellaOps.Zastava.Webhook.Backend; + +public sealed record RuntimePolicyRequest +{ + [JsonPropertyName("namespace")] + public required string Namespace { get; init; } + + [JsonPropertyName("labels")] + public IReadOnlyDictionary? 
Labels { get; init; } + + [JsonPropertyName("images")] + public required IReadOnlyList Images { get; init; } +} diff --git a/src/StellaOps.Zastava.Webhook/Backend/RuntimePolicyResponse.cs b/src/StellaOps.Zastava.Webhook/Backend/RuntimePolicyResponse.cs new file mode 100644 index 00000000..dd622ce0 --- /dev/null +++ b/src/StellaOps.Zastava.Webhook/Backend/RuntimePolicyResponse.cs @@ -0,0 +1,33 @@ +using System; +using System.Collections.Generic; +using System.Text.Json.Serialization; +using StellaOps.Zastava.Core.Contracts; + +namespace StellaOps.Zastava.Webhook.Backend; + +public sealed record RuntimePolicyResponse +{ + [JsonPropertyName("ttlSeconds")] + public int TtlSeconds { get; init; } + + [JsonPropertyName("results")] + public IReadOnlyDictionary Results { get; init; } = new Dictionary(); +} + +public sealed record RuntimePolicyImageResult +{ + [JsonPropertyName("signed")] + public bool Signed { get; init; } + + [JsonPropertyName("hasSbom")] + public bool HasSbom { get; init; } + + [JsonPropertyName("policyVerdict")] + public PolicyVerdict PolicyVerdict { get; init; } + + [JsonPropertyName("reasons")] + public IReadOnlyList Reasons { get; init; } = Array.Empty(); + + [JsonPropertyName("rekor")] + public AdmissionRekorEvidence? Rekor { get; init; } +} diff --git a/src/StellaOps.Zastava.Webhook/Certificates/SecretFileCertificateSource.cs b/src/StellaOps.Zastava.Webhook/Certificates/SecretFileCertificateSource.cs index 2de3a3a4..8d8a3017 100644 --- a/src/StellaOps.Zastava.Webhook/Certificates/SecretFileCertificateSource.cs +++ b/src/StellaOps.Zastava.Webhook/Certificates/SecretFileCertificateSource.cs @@ -82,17 +82,22 @@ public sealed class SecretFileCertificateSource : IWebhookCertificateSource internal static class X509Certificate2Extensions { - public static X509Certificate2 WithExportablePrivateKey(this X509Certificate2 certificate) - { - // Ensure the private key is exportable for Kestrel; CreateFromPemFile returns a temporary key material otherwise. 
- using var rsa = certificate.GetRSAPrivateKey(); - if (rsa is null) - { - return certificate; - } - - var certificateWithKey = certificate.CopyWithPrivateKey(rsa); - certificate.Dispose(); - return certificateWithKey; - } -} + public static X509Certificate2 WithExportablePrivateKey(this X509Certificate2 certificate) + { + // Ensure the private key is exportable for Kestrel; CreateFromPemFile returns a temporary key material otherwise. + if (certificate.HasPrivateKey) + { + return certificate; + } + + using var rsa = certificate.GetRSAPrivateKey(); + if (rsa is null) + { + return certificate; + } + + var certificateWithKey = certificate.CopyWithPrivateKey(rsa); + certificate.Dispose(); + return certificateWithKey; + } +} diff --git a/src/StellaOps.Zastava.Webhook/Configuration/ZastavaWebhookOptions.cs b/src/StellaOps.Zastava.Webhook/Configuration/ZastavaWebhookOptions.cs index 2d152854..ff912a42 100644 --- a/src/StellaOps.Zastava.Webhook/Configuration/ZastavaWebhookOptions.cs +++ b/src/StellaOps.Zastava.Webhook/Configuration/ZastavaWebhookOptions.cs @@ -9,11 +9,14 @@ public sealed class ZastavaWebhookOptions [Required] public ZastavaWebhookTlsOptions Tls { get; init; } = new(); - [Required] - public ZastavaWebhookAuthorityOptions Authority { get; init; } = new(); - - [Required] - public ZastavaWebhookAdmissionOptions Admission { get; init; } = new(); + [Required] + public ZastavaWebhookAuthorityOptions Authority { get; init; } = new(); + + [Required] + public ZastavaWebhookAdmissionOptions Admission { get; init; } = new(); + + [Required] + public ZastavaWebhookBackendOptions Backend { get; init; } = new(); } public sealed class ZastavaWebhookAdmissionOptions @@ -114,11 +117,11 @@ public sealed class ZastavaWebhookTlsCsrOptions public string PersistPath { get; init; } = "/var/run/zastava-webhook/certs"; } -public sealed class ZastavaWebhookAuthorityOptions -{ - /// - /// Authority issuer URL for token acquisition. 
- /// +public sealed class ZastavaWebhookAuthorityOptions +{ + /// + /// Authority issuer URL for token acquisition. + /// [Required(AllowEmptyStrings = false)] public Uri Issuer { get; init; } = new("https://authority.internal"); @@ -142,5 +145,31 @@ public sealed class ZastavaWebhookAuthorityOptions /// Interval for refreshing cached tokens before expiry. /// [Range(typeof(double), "1", "3600")] - public double RefreshSkewSeconds { get; init; } = TimeSpan.FromMinutes(5).TotalSeconds; -} + public double RefreshSkewSeconds { get; init; } = TimeSpan.FromMinutes(5).TotalSeconds; +} + +public sealed class ZastavaWebhookBackendOptions +{ + /// + /// Base address for Scanner WebService policy requests. + /// + [Required] + public Uri BaseAddress { get; init; } = new("https://scanner.internal"); + + /// + /// Relative path for runtime policy endpoint. + /// + [Required(AllowEmptyStrings = false)] + public string PolicyPath { get; init; } = "/api/v1/scanner/policy/runtime"; + + /// + /// Timeout in seconds for backend calls (default 5 s). + /// + [Range(typeof(double), "1", "120")] + public double RequestTimeoutSeconds { get; init; } = 5; + + /// + /// Allows HTTP (non-TLS) endpoints when set. Defaults to false for safety. 
+ /// + public bool AllowInsecureHttp { get; init; } +} diff --git a/src/StellaOps.Zastava.Webhook/DependencyInjection/ServiceCollectionExtensions.cs b/src/StellaOps.Zastava.Webhook/DependencyInjection/ServiceCollectionExtensions.cs index 5bf3436c..125c3b6c 100644 --- a/src/StellaOps.Zastava.Webhook/DependencyInjection/ServiceCollectionExtensions.cs +++ b/src/StellaOps.Zastava.Webhook/DependencyInjection/ServiceCollectionExtensions.cs @@ -1,31 +1,50 @@ -using Microsoft.Extensions.DependencyInjection.Extensions; -using StellaOps.Zastava.Webhook.Authority; -using StellaOps.Zastava.Webhook.Certificates; -using StellaOps.Zastava.Webhook.Configuration; -using StellaOps.Zastava.Webhook.Hosting; - -namespace Microsoft.Extensions.DependencyInjection; +using System; +using Microsoft.Extensions.DependencyInjection.Extensions; +using Microsoft.Extensions.Options; +using StellaOps.Zastava.Core.Configuration; +using StellaOps.Zastava.Webhook.Authority; +using StellaOps.Zastava.Webhook.Backend; +using StellaOps.Zastava.Webhook.Certificates; +using StellaOps.Zastava.Webhook.Configuration; +using StellaOps.Zastava.Webhook.Hosting; +using StellaOps.Zastava.Webhook.DependencyInjection; + +namespace Microsoft.Extensions.DependencyInjection; public static class ServiceCollectionExtensions { public static IServiceCollection AddZastavaWebhook(this IServiceCollection services, IConfiguration configuration) { - services.AddOptions() - .Bind(configuration.GetSection(ZastavaWebhookOptions.SectionName)) - .ValidateDataAnnotations() - .ValidateOnStart(); - - services.TryAddEnumerable(ServiceDescriptor.Singleton()); - services.TryAddEnumerable(ServiceDescriptor.Singleton()); - services.TryAddSingleton(); - services.TryAddSingleton(); - - services.TryAddSingleton(); - services.TryAddSingleton(); - services.AddHostedService(); - - services.AddHealthChecks() - .AddCheck("webhook_tls") + services.AddZastavaRuntimeCore(configuration, "webhook"); + + services.AddOptions() + 
.Bind(configuration.GetSection(ZastavaWebhookOptions.SectionName)) + .ValidateDataAnnotations() + .ValidateOnStart(); + + services.TryAddEnumerable(ServiceDescriptor.Singleton()); + services.TryAddEnumerable(ServiceDescriptor.Singleton()); + services.TryAddSingleton(); + services.TryAddSingleton(); + services.TryAddEnumerable(ServiceDescriptor.Singleton, WebhookRuntimeOptionsPostConfigure>()); + + services.AddHttpClient((provider, client) => + { + var backend = provider.GetRequiredService>().Value.Backend; + if (!backend.AllowInsecureHttp && backend.BaseAddress.Scheme.Equals(Uri.UriSchemeHttp, StringComparison.OrdinalIgnoreCase)) + { + throw new InvalidOperationException("HTTP backend URLs are disabled unless AllowInsecureHttp is true."); + } + + client.BaseAddress = backend.BaseAddress; + client.Timeout = TimeSpan.FromSeconds(backend.RequestTimeoutSeconds); + }); + + services.TryAddSingleton(); + services.AddHostedService(); + + services.AddHealthChecks() + .AddCheck("webhook_tls") .AddCheck("authority_token"); return services; diff --git a/src/StellaOps.Zastava.Webhook/DependencyInjection/WebhookRuntimeOptionsPostConfigure.cs b/src/StellaOps.Zastava.Webhook/DependencyInjection/WebhookRuntimeOptionsPostConfigure.cs new file mode 100644 index 00000000..42626115 --- /dev/null +++ b/src/StellaOps.Zastava.Webhook/DependencyInjection/WebhookRuntimeOptionsPostConfigure.cs @@ -0,0 +1,52 @@ +using System; +using Microsoft.Extensions.Options; +using StellaOps.Zastava.Core.Configuration; +using StellaOps.Zastava.Webhook.Configuration; + +namespace StellaOps.Zastava.Webhook.DependencyInjection; + +/// +/// Ensures legacy webhook authority options propagate to runtime options when not explicitly configured. +/// +internal sealed class WebhookRuntimeOptionsPostConfigure : IPostConfigureOptions +{ + private readonly IOptionsMonitor webhookOptions; + + public WebhookRuntimeOptionsPostConfigure(IOptionsMonitor webhookOptions) + { + this.webhookOptions = webhookOptions ?? 
throw new ArgumentNullException(nameof(webhookOptions)); + } + + public void PostConfigure(string? name, ZastavaRuntimeOptions runtimeOptions) + { + ArgumentNullException.ThrowIfNull(runtimeOptions); + + var snapshot = webhookOptions.Get(name ?? Options.DefaultName); + var source = snapshot.Authority; + if (source is null) + { + return; + } + + runtimeOptions.Authority ??= new ZastavaAuthorityOptions(); + var authority = runtimeOptions.Authority; + + if (ShouldCopyStaticTokenValue(authority.StaticTokenValue, source.StaticTokenValue)) + { + authority.StaticTokenValue = source.StaticTokenValue; + } + + if (ShouldCopyStaticTokenValue(authority.StaticTokenPath, source.StaticTokenPath)) + { + authority.StaticTokenPath = source.StaticTokenPath; + } + + if (!string.IsNullOrWhiteSpace(source.StaticTokenValue) || !string.IsNullOrWhiteSpace(source.StaticTokenPath)) + { + authority.AllowStaticTokenFallback = true; + } + } + + private static bool ShouldCopyStaticTokenValue(string? current, string? source) + => string.IsNullOrWhiteSpace(current) && !string.IsNullOrWhiteSpace(source); +} diff --git a/src/StellaOps.Zastava.Webhook/Hosting/StartupValidationHostedService.cs b/src/StellaOps.Zastava.Webhook/Hosting/StartupValidationHostedService.cs index f1c9473d..09ebc850 100644 --- a/src/StellaOps.Zastava.Webhook/Hosting/StartupValidationHostedService.cs +++ b/src/StellaOps.Zastava.Webhook/Hosting/StartupValidationHostedService.cs @@ -1,31 +1,39 @@ -using StellaOps.Zastava.Webhook.Authority; -using StellaOps.Zastava.Webhook.Certificates; - -namespace StellaOps.Zastava.Webhook.Hosting; - -public sealed class StartupValidationHostedService : IHostedService -{ - private readonly IWebhookCertificateProvider _certificateProvider; - private readonly IAuthorityTokenProvider _authorityTokenProvider; - private readonly ILogger _logger; - - public StartupValidationHostedService( - IWebhookCertificateProvider certificateProvider, - IAuthorityTokenProvider authorityTokenProvider, - ILogger 
logger) - { - _certificateProvider = certificateProvider; - _authorityTokenProvider = authorityTokenProvider; - _logger = logger; - } - - public async Task StartAsync(CancellationToken cancellationToken) - { - _logger.LogInformation("Running webhook startup validation."); - _certificateProvider.GetCertificate(); - await _authorityTokenProvider.GetTokenAsync(cancellationToken); - _logger.LogInformation("Webhook startup validation complete."); - } +using System.Linq; +using Microsoft.Extensions.Options; +using StellaOps.Zastava.Core.Configuration; +using StellaOps.Zastava.Core.Security; +using StellaOps.Zastava.Webhook.Certificates; + +namespace StellaOps.Zastava.Webhook.Hosting; + +public sealed class StartupValidationHostedService : IHostedService +{ + private readonly IWebhookCertificateProvider _certificateProvider; + private readonly IZastavaAuthorityTokenProvider _authorityTokenProvider; + private readonly IOptionsMonitor _runtimeOptions; + private readonly ILogger _logger; + + public StartupValidationHostedService( + IWebhookCertificateProvider certificateProvider, + IZastavaAuthorityTokenProvider authorityTokenProvider, + IOptionsMonitor runtimeOptions, + ILogger logger) + { + _certificateProvider = certificateProvider; + _authorityTokenProvider = authorityTokenProvider; + _runtimeOptions = runtimeOptions; + _logger = logger; + } + + public async Task StartAsync(CancellationToken cancellationToken) + { + _logger.LogInformation("Running webhook startup validation."); + _certificateProvider.GetCertificate(); + var authority = _runtimeOptions.CurrentValue.Authority; + var audience = authority.Audience.FirstOrDefault() ?? 
"scanner"; + await _authorityTokenProvider.GetAsync(audience, authority.Scopes, cancellationToken); + _logger.LogInformation("Webhook startup validation complete."); + } public Task StopAsync(CancellationToken cancellationToken) => Task.CompletedTask; } diff --git a/src/StellaOps.Zastava.Webhook/IMPLEMENTATION_PLAN.md b/src/StellaOps.Zastava.Webhook/IMPLEMENTATION_PLAN.md index cb520d71..4045e0b3 100644 --- a/src/StellaOps.Zastava.Webhook/IMPLEMENTATION_PLAN.md +++ b/src/StellaOps.Zastava.Webhook/IMPLEMENTATION_PLAN.md @@ -19,8 +19,8 @@ 2. CSR workflow: generate CSR + private key, submit to Kubernetes Certificates API when `admission.tls.autoApprove` enabled; persist signed cert/key to mounted emptyDir for reuse across replicas. - Validate cert/key pair on boot; abort start-up if invalid to preserve deterministic behavior. - Configure Kestrel for mutual TLS off (API Server already provides client auth) but enforce minimum TLS 1.3, strong cipher suite list, HTTP/2 disabled (K8s uses HTTP/1.1). -3. **Authority auth** - - Bootstrap Authority client via shared DI extension (`AuthorityClientBuilder` once exposed); until then, placeholder `IAuthorityTokenSource` reading static OpTok from secret for smoke testing. +3. **Authority auth** + - Bootstrap Authority client via shared runtime core (`AddZastavaRuntimeCore` + `IZastavaAuthorityTokenProvider`) so webhook reuses multitenant OpTok caching and guardrails. - Implement DPoP proof generator bound to webhook host keypair (prefer Ed25519) with configurable rotation period (default 24h, triggered at restart). - Add background health check verifying token freshness and surfacing metrics (`zastava.authority_token_renew_failures_total`). 4. 
**Hosting concerns** diff --git a/src/StellaOps.Zastava.Webhook/Program.cs b/src/StellaOps.Zastava.Webhook/Program.cs index 81b064bb..f8c28315 100644 --- a/src/StellaOps.Zastava.Webhook/Program.cs +++ b/src/StellaOps.Zastava.Webhook/Program.cs @@ -1,10 +1,10 @@ -using System.Security.Authentication; -using Microsoft.AspNetCore.Diagnostics.HealthChecks; -using Serilog; -using Serilog.Events; -using StellaOps.Zastava.Webhook.Authority; -using StellaOps.Zastava.Webhook.Certificates; -using StellaOps.Zastava.Webhook.Configuration; +using System.Security.Authentication; +using Microsoft.AspNetCore.Diagnostics.HealthChecks; +using Serilog; +using Serilog.Events; +using StellaOps.Zastava.Webhook.Authority; +using StellaOps.Zastava.Webhook.Certificates; +using StellaOps.Zastava.Webhook.Configuration; var builder = WebApplication.CreateBuilder(args); @@ -18,11 +18,11 @@ builder.Host.UseSerilog((context, services, loggerConfiguration) => .WriteTo.Console(); }); -builder.Services.AddRouting(); -builder.Services.AddProblemDetails(); -builder.Services.AddEndpointsApiExplorer(); -builder.Services.AddHttpClient(); -builder.Services.AddZastavaWebhook(builder.Configuration); +builder.Services.AddRouting(); +builder.Services.AddProblemDetails(); +builder.Services.AddEndpointsApiExplorer(); +builder.Services.AddHttpClient(); +builder.Services.AddZastavaWebhook(builder.Configuration); builder.WebHost.ConfigureKestrel((context, options) => { diff --git a/src/StellaOps.Zastava.Webhook/Properties/AssemblyInfo.cs b/src/StellaOps.Zastava.Webhook/Properties/AssemblyInfo.cs new file mode 100644 index 00000000..f84f802f --- /dev/null +++ b/src/StellaOps.Zastava.Webhook/Properties/AssemblyInfo.cs @@ -0,0 +1,3 @@ +using System.Runtime.CompilerServices; + +[assembly: InternalsVisibleTo("StellaOps.Zastava.Webhook.Tests")] diff --git a/src/StellaOps.Zastava.Webhook/StellaOps.Zastava.Webhook.csproj b/src/StellaOps.Zastava.Webhook/StellaOps.Zastava.Webhook.csproj index 730d0896..4e4cc932 100644 --- 
a/src/StellaOps.Zastava.Webhook/StellaOps.Zastava.Webhook.csproj +++ b/src/StellaOps.Zastava.Webhook/StellaOps.Zastava.Webhook.csproj @@ -8,9 +8,12 @@ StellaOps.Zastava.Webhook $(NoWarn);CA2254 - - - - - - + + + + + + + + + diff --git a/src/StellaOps.Zastava.Webhook/TASKS.md b/src/StellaOps.Zastava.Webhook/TASKS.md index 5314fd18..957b0b0e 100644 --- a/src/StellaOps.Zastava.Webhook/TASKS.md +++ b/src/StellaOps.Zastava.Webhook/TASKS.md @@ -2,8 +2,9 @@ | ID | Status | Owner(s) | Depends on | Description | Exit Criteria | |----|--------|----------|------------|-------------|---------------| -| ZASTAVA-WEBHOOK-12-101 | DOING | Zastava Webhook Guild | — | Admission controller host with TLS bootstrap and Authority auth. | Webhook host boots with deterministic TLS bootstrap, enforces Authority-issued credentials, e2e smoke proves admission callback lifecycle, structured logs + metrics emit on each decision. | -| ZASTAVA-WEBHOOK-12-102 | DOING | Zastava Webhook Guild | — | Query Scanner `/policy/runtime`, resolve digests, enforce verdicts. | Scanner client resolves image digests + policy verdicts, unit tests cover allow/deny, integration harness rejects/admits workloads per policy with deterministic payloads. | -| ZASTAVA-WEBHOOK-12-103 | DOING | Zastava Webhook Guild | — | Caching, fail-open/closed toggles, metrics/logging for admission decisions. | Configurable cache TTL + seeds survive restart, fail-open/closed toggles verified via tests, metrics/logging exported per decision path, docs note operational knobs. | +| ZASTAVA-WEBHOOK-12-101 | DONE (2025-10-24) | Zastava Webhook Guild | — | Admission controller host with TLS bootstrap and Authority auth. | Webhook host boots with deterministic TLS bootstrap, enforces Authority-issued credentials, e2e smoke proves admission callback lifecycle, structured logs + metrics emit on each decision. | +| ZASTAVA-WEBHOOK-12-102 | DOING | Zastava Webhook Guild | — | Query Scanner `/policy/runtime`, resolve digests, enforce verdicts. 
| Scanner client resolves image digests + policy verdicts, unit tests cover allow/deny, integration harness rejects/admits workloads per policy with deterministic payloads. | +| ZASTAVA-WEBHOOK-12-103 | DOING | Zastava Webhook Guild | — | Caching, fail-open/closed toggles, metrics/logging for admission decisions. | Configurable cache TTL + seeds survive restart, fail-open/closed toggles verified via tests, metrics/logging exported per decision path, docs note operational knobs. | +| ZASTAVA-WEBHOOK-12-104 | TODO | Zastava Webhook Guild | ZASTAVA-WEBHOOK-12-102 | Wire `/admission` endpoint to runtime policy client and emit allow/deny envelopes. | Admission handler resolves pods to digests, invokes policy client, returns canonical `AdmissionDecisionEnvelope` with deterministic logging and metrics. | > Status update · 2025-10-19: Confirmed no prerequisites for ZASTAVA-WEBHOOK-12-101/102/103; tasks moved to DOING for kickoff. Implementation plan covering TLS bootstrap, backend contract, caching/metrics recorded in `IMPLEMENTATION_PLAN.md`. 
diff --git a/src/StellaOps.sln b/src/StellaOps.sln index f2f5bd84..4bce6011 100644 --- a/src/StellaOps.sln +++ b/src/StellaOps.sln @@ -335,6 +335,8 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "StellaOps.Scanner.Analyzers EndProject Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "StellaOps.Scanner.Analyzers.Lang.Go.Tests", "StellaOps.Scanner.Analyzers.Lang.Go.Tests\StellaOps.Scanner.Analyzers.Lang.Go.Tests.csproj", "{7C3A6012-6FC8-46A9-9966-1AC373614C41}" EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "StellaOps.Zastava.Observer", "StellaOps.Zastava.Observer\StellaOps.Zastava.Observer.csproj", "{BC38594B-0B84-4657-9F7B-F2A0FC810F04}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -2277,6 +2279,18 @@ Global {7C3A6012-6FC8-46A9-9966-1AC373614C41}.Release|x64.Build.0 = Release|Any CPU {7C3A6012-6FC8-46A9-9966-1AC373614C41}.Release|x86.ActiveCfg = Release|Any CPU {7C3A6012-6FC8-46A9-9966-1AC373614C41}.Release|x86.Build.0 = Release|Any CPU + {BC38594B-0B84-4657-9F7B-F2A0FC810F04}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {BC38594B-0B84-4657-9F7B-F2A0FC810F04}.Debug|Any CPU.Build.0 = Debug|Any CPU + {BC38594B-0B84-4657-9F7B-F2A0FC810F04}.Debug|x64.ActiveCfg = Debug|Any CPU + {BC38594B-0B84-4657-9F7B-F2A0FC810F04}.Debug|x64.Build.0 = Debug|Any CPU + {BC38594B-0B84-4657-9F7B-F2A0FC810F04}.Debug|x86.ActiveCfg = Debug|Any CPU + {BC38594B-0B84-4657-9F7B-F2A0FC810F04}.Debug|x86.Build.0 = Debug|Any CPU + {BC38594B-0B84-4657-9F7B-F2A0FC810F04}.Release|Any CPU.ActiveCfg = Release|Any CPU + {BC38594B-0B84-4657-9F7B-F2A0FC810F04}.Release|Any CPU.Build.0 = Release|Any CPU + {BC38594B-0B84-4657-9F7B-F2A0FC810F04}.Release|x64.ActiveCfg = Release|Any CPU + {BC38594B-0B84-4657-9F7B-F2A0FC810F04}.Release|x64.Build.0 = Release|Any CPU + {BC38594B-0B84-4657-9F7B-F2A0FC810F04}.Release|x86.ActiveCfg = Release|Any CPU + {BC38594B-0B84-4657-9F7B-F2A0FC810F04}.Release|x86.Build.0 = Release|Any CPU 
EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE