Refactor code structure for improved readability and maintainability

2025-12-06 10:23:40 +02:00
parent 6beb9d7c4e
commit 37304cf819
78 changed files with 5471 additions and 104 deletions
--- a/deploy/compose/README.md
+++ b/deploy/compose/README.md
@@ -13,7 +13,10 @@ These Compose bundles ship the minimum services required to exercise the scanner
 | `docker-compose.mirror.yaml` | Managed mirror topology for `*.stella-ops.org` distribution (Concelier + Excititor + CDN gateway). |
 | `docker-compose.telemetry.yaml` | Optional OpenTelemetry collector overlay (mutual TLS, OTLP ingest endpoints). |
 | `docker-compose.telemetry-storage.yaml` | Prometheus/Tempo/Loki storage overlay with multi-tenant defaults. |
+| `docker-compose.gpu.yaml` | Optional GPU overlay enabling NVIDIA devices for Advisory AI web/worker. Apply with `-f docker-compose.<env>.yaml -f docker-compose.gpu.yaml`. |
 | `env/*.env.example` | Seed `.env` files that document required secrets and ports per profile. |
+| `scripts/backup.sh` | Pauses the worker containers and writes a timestamped tar.gz archive of the Mongo/MinIO/Redis volumes to `backups/` (override with `BACKUP_DIR`), giving a consistent snapshot. |
+| `scripts/reset.sh` | Stops the stack and removes Mongo/MinIO/Redis volumes after explicit confirmation. |

 ## Usage

@@ -101,4 +104,18 @@ The Helm chart mirrors these settings under `services.advisory-ai-web` / `adviso
 2. Update image digests in the relevant Compose file(s).
 3. Re-run `docker compose config` to confirm the bundle is deterministic.

-Keep digests synchronized between Compose, Helm, and the release manifest to preserve reproducibility guarantees. `deploy/tools/validate-profiles.sh` performs a quick audit.
+Keep digests synchronized between Compose, Helm, and the release manifest to preserve reproducibility guarantees. `deploy/tools/validate-profiles.sh` performs a quick audit.
+
+### GPU toggle for Advisory AI
+
+GPU is disabled by default. To run inference on NVIDIA GPUs:
+
+```bash
+docker compose \
+  --env-file prod.env \
+  -f docker-compose.prod.yaml \
+  -f docker-compose.gpu.yaml \
+  up -d
+```
+
+The GPU overlay reserves one NVIDIA GPU each for `advisory-ai-worker` and `advisory-ai-web`, sets `runtime: nvidia`, and sets `ADVISORY_AI_INFERENCE_GPU=true` on both services. Ensure the host has the NVIDIA container runtime installed and that the base Compose file still pins the correct image digests.
--- a/deploy/compose/docker-compose.gpu.yaml
+++ b/deploy/compose/docker-compose.gpu.yaml
@@ -0,0 +1,32 @@
+# GPU overlay for Advisory AI: layer on top of an environment compose file
+# (-f docker-compose.<env>.yaml -f docker-compose.gpu.yaml).
+version: "3.9"
+
+services:
+  advisory-ai-worker:
+    runtime: nvidia
+    environment:
+      ADVISORY_AI_INFERENCE_GPU: "true"
+    deploy:
+      resources:
+        reservations:
+          devices:
+            # One NVIDIA device with the gpu capability.
+            - driver: nvidia
+              count: 1
+              capabilities:
+                - gpu
+
+  advisory-ai-web:
+    runtime: nvidia
+    environment:
+      ADVISORY_AI_INFERENCE_GPU: "true"
+    deploy:
+      resources:
+        reservations:
+          devices:
+            # One NVIDIA device with the gpu capability.
+            - driver: nvidia
+              count: 1
+              capabilities:
+                - gpu
--- a/deploy/compose/scripts/backup.sh
+++ b/deploy/compose/scripts/backup.sh
@@ -0,0 +1,28 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+echo "StellaOps Compose Backup"
+echo "This will create a tar.gz of Mongo, MinIO (object-store), and Redis data volumes."
+read -rp "Proceed? [y/N] " ans
+[[ ${ans:-N} =~ ^[Yy]$ ]] || { echo "Aborted."; exit 1; }
+
+TS=$(date -u +%Y%m%dT%H%M%SZ)
+OUT_DIR=${BACKUP_DIR:-backups}
+mkdir -p "$OUT_DIR"
+
+docker compose ps >/dev/null
+
+echo "Pausing worker containers for consistency..."
+docker compose pause scanner-worker scheduler-worker taskrunner-worker || true
+
+echo "Backing up volumes..."
+docker run --rm \
+  -v stellaops-mongo:/data/db:ro \
+  -v stellaops-minio:/data/minio:ro \
+  -v stellaops-redis:/data/redis:ro \
+  -v "$PWD/$OUT_DIR":/out \
+  alpine sh -c "cd / && tar czf /out/stellaops-backup-$TS.tar.gz data"
+
+docker compose unpause scanner-worker scheduler-worker taskrunner-worker || true
+
+echo "Backup written to $OUT_DIR/stellaops-backup-$TS.tar.gz"
--- a/deploy/compose/scripts/reset.sh
+++ b/deploy/compose/scripts/reset.sh
@@ -0,0 +1,31 @@
+#!/usr/bin/env bash
+# Stop the Compose stack and delete the Mongo, MinIO and Redis volumes.
+# Requires typing RESET; exits non-zero if any existing volume cannot be removed.
+set -euo pipefail
+
+echo "WARNING: This will stop the stack and wipe Mongo, MinIO, and Redis volumes."
+read -rp "Type 'RESET' to continue: " ans
+[[ ${ans:-} == "RESET" ]] || { echo "Aborted."; exit 1; }
+
+docker compose down
+
+failed=0
+for vol in stellaops-mongo stellaops-minio stellaops-redis; do
+  # A volume that does not exist is already "reset"; skip it quietly.
+  if ! docker volume inspect "$vol" >/dev/null 2>&1; then
+    echo "Volume $vol not found, skipping"
+    continue
+  fi
+  echo "Removing volume $vol"
+  if ! docker volume rm "$vol"; then
+    echo "ERROR: could not remove volume $vol" >&2
+    failed=1
+  fi
+done
+
+if (( failed )); then
+  echo "Reset incomplete: some volumes were not removed (see errors above)." >&2
+  exit 1
+fi
+
+echo "Reset complete. Re-run compose with your env file to recreate volumes."
--- a/deploy/helm/stellaops/templates/core.yaml
+++ b/deploy/helm/stellaops/templates/core.yaml
@@ -105,14 +105,23 @@ spec:
          securityContext:
 {{ toYaml $svc.securityContext | nindent 12 }}
 {{- end }}
-{{- if $svc.livenessProbe }}
-          livenessProbe:
-{{ toYaml $svc.livenessProbe | nindent 12 }}
-{{- end }}
-{{- if $svc.readinessProbe }}
-          readinessProbe:
-{{ toYaml $svc.readinessProbe | nindent 12 }}
-{{- end }}
+{{- if $svc.livenessProbe }}
+          livenessProbe:
+{{ toYaml $svc.livenessProbe | nindent 12 }}
+{{- end }}
+{{- if $svc.readinessProbe }}
+          readinessProbe:
+{{ toYaml $svc.readinessProbe | nindent 12 }}
+{{- end }}
+{{- /* Prometheus scrape annotations are written with sprig `set`: a template action cannot assign to a map field, so `$svc.podAnnotations = ...` fails to parse. Existing podAnnotations win over these defaults. NOTE(review): this runs inside the container spec; confirm podAnnotations are rendered after this point. */}}
+{{- if $svc.prometheus }}
+{{- $pr := $svc.prometheus }}
+{{- if $pr.enabled }}
+{{- $scrape := dict "prometheus.io/scrape" "true" "prometheus.io/path" (default "/metrics" $pr.path) "prometheus.io/port" (toString (default 8080 $pr.port)) "prometheus.io/scheme" (default "http" $pr.scheme) }}
+{{- $existing := default (dict) $svc.podAnnotations }}
+{{- $_ := set $svc "podAnnotations" (merge $existing $scrape) }}
+{{- end }}
+{{- end }}
 {{- if or $svc.volumeMounts $configMounts }}
          volumeMounts:
 {{- if $svc.volumeMounts }}
--- a/deploy/helm/stellaops/templates/hpa.yaml
+++ b/deploy/helm/stellaops/templates/hpa.yaml
@@ -0,0 +1,50 @@
+{{- /*
+  One autoscaling/v2 HorizontalPodAutoscaler per service with hpa.enabled,
+  gated by the chart-wide .Values.hpa.enabled. Per-service hpa.* values
+  override the .Values.hpa defaults; a metric with no target is omitted.
+*/ -}}
+{{- if and .Values.hpa.enabled .Values.services }}
+{{- range $name, $svc := .Values.services }}
+{{- /* Default missing maps to empty dicts so optional keys never hit a nil lookup. */}}
+{{- $hpa := default (dict) $svc.hpa }}
+{{- if $hpa.enabled }}
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+  name: {{ include "stellaops.fullname" (dict "root" $ "name" $name) }}
+  labels:
+    {{- include "stellaops.labels" (dict "root" $ "name" $name "svc" $svc) | nindent 4 }}
+spec:
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: {{ include "stellaops.fullname" (dict "root" $ "name" $name) }}
+  minReplicas: {{ default $.Values.hpa.minReplicas $hpa.minReplicas }}
+  maxReplicas: {{ default $.Values.hpa.maxReplicas $hpa.maxReplicas }}
+  metrics:
+    {{- $svcCpu := default (dict) $hpa.cpu }}
+    {{- $defCpu := default (dict) $.Values.hpa.cpu }}
+    {{- $cpu := coalesce $svcCpu.targetPercentage $defCpu.targetPercentage }}
+    {{- if $cpu }}
+    - type: Resource
+      resource:
+        name: cpu
+        target:
+          type: Utilization
+          averageUtilization: {{ $cpu }}
+    {{- end }}
+    {{- $svcMem := default (dict) $hpa.memory }}
+    {{- $defMem := default (dict) $.Values.hpa.memory }}
+    {{- $mem := coalesce $svcMem.targetPercentage $defMem.targetPercentage }}
+    {{- if $mem }}
+    - type: Resource
+      resource:
+        name: memory
+        target:
+          type: Utilization
+          averageUtilization: {{ $mem }}
+    {{- end }}
+---
+{{- end }}
+{{- end }}
+{{- end }}
--- a/deploy/helm/stellaops/values-airgap.yaml
+++ b/deploy/helm/stellaops/values-airgap.yaml
@@ -33,6 +33,21 @@ externalSecrets:
  enabled: false
  secrets: []

+prometheus:  # NOTE(review): templates/core.yaml reads services.<name>.prometheus; confirm what consumes this top-level block
+  enabled: true
+  path: /metrics
+  port: 8080
+  scheme: http
+
+hpa:
+  enabled: false  # chart-wide gate in templates/hpa.yaml; a service must also set hpa.enabled
+  minReplicas: 1  # used when a service omits hpa.minReplicas
+  maxReplicas: 3  # used when a service omits hpa.maxReplicas
+  cpu:
+    targetPercentage: 70  # fallback CPU averageUtilization target
+  memory:
+    targetPercentage: 80  # fallback memory averageUtilization target
+
 configMaps:
  notify-config:
    data:
--- a/deploy/helm/stellaops/values-prod.yaml
+++ b/deploy/helm/stellaops/values-prod.yaml
@@ -55,6 +55,21 @@ externalSecrets:
        - key: STELLAOPS_SECRETS_ENCRYPTION_KEY
          remoteKey: prod/core/secrets-encryption-key

+prometheus:  # NOTE(review): templates/core.yaml reads services.<name>.prometheus; confirm what consumes this top-level block
+  enabled: true
+  path: /metrics
+  port: 8080
+  scheme: http
+
+hpa:
+  enabled: true  # chart-wide gate in templates/hpa.yaml; a service must also set hpa.enabled
+  minReplicas: 2  # used when a service omits hpa.minReplicas
+  maxReplicas: 6  # used when a service omits hpa.maxReplicas
+  cpu:
+    targetPercentage: 70  # fallback CPU averageUtilization target
+  memory:
+    targetPercentage: 75  # fallback memory averageUtilization target
+
 configMaps:
  notify-config:
    data:
--- a/deploy/helm/stellaops/values.yaml
+++ b/deploy/helm/stellaops/values.yaml
@@ -32,6 +32,21 @@ externalSecrets:
  enabled: false
  secrets: []

+prometheus:  # NOTE(review): templates/core.yaml reads services.<name>.prometheus; confirm what consumes this top-level block
+  enabled: false
+  path: /metrics
+  port: 8080
+  scheme: http
+
+hpa:
+  enabled: false  # chart-wide gate in templates/hpa.yaml; a service must also set hpa.enabled
+  minReplicas: 1  # used when a service omits hpa.minReplicas
+  maxReplicas: 3  # used when a service omits hpa.maxReplicas
+  cpu:
+    targetPercentage: 75  # fallback CPU averageUtilization target
+  memory:
+    targetPercentage: null  # null: hpa.yaml emits no memory metric unless a service sets its own target
+
 # Surface.Env configuration for Scanner/Zastava components
 # See docs/modules/scanner/design/surface-env.md for details
 surface: