# git.stella-ops.org/deploy/helm/stellaops/values-orchestrator.yaml

# Orchestrator Service Helm Values Overlay
# Enables job scheduling, DAG planning, and worker coordination.
#
# Usage:
#   helm upgrade stellaops ./stellaops -f values.yaml -f values-orchestrator.yaml

# Chart-wide settings. This overlay sets a single label whose value marks the
# release as the orchestrator component.
# NOTE(review): presumably the chart stamps global.labels onto every rendered
# resource, which would include orchestrator-postgres below — confirm against
# the chart templates.
global:
  labels:
    stellaops.io/component: orchestrator

# Orchestrator-specific ConfigMaps
#
# Declares one ConfigMap, `orchestrator-config`, with a single key,
# `orchestrator.yaml`. The services section of this overlay mounts that key
# into orchestrator-web and orchestrator-worker at /app/etc/orchestrator.yaml.
configMaps:
  orchestrator-config:
    data:
      # The value below is a literal block scalar (`|`): its text, including
      # the `#` lines inside it, is stored verbatim as the ConfigMap data, so
      # any edit inside it changes the file the services read.
      # NOTE(review): storage.connectionString contains the placeholder
      # ${POSTGRES_PASSWORD}. Kubernetes does not expand variables inside
      # ConfigMap data, so presumably the application (or the chart) performs
      # the substitution — confirm.
      orchestrator.yaml: |
        Orchestrator:
          # Telemetry configuration
          telemetry:
            minimumLogLevel: Information
            enableRequestLogging: true
            otelEndpoint: ""

          # Authority integration (disable for standalone testing)
          authority:
            enabled: true
            issuer: https://authority.svc.cluster.local/realms/stellaops
            requireHttpsMetadata: true
            audiences:
              - stellaops-platform
            readScope: orchestrator:read
            writeScope: orchestrator:write
            adminScope: orchestrator:admin

          # Tenant resolution
          tenantHeader: X-StellaOps-Tenant

          # PostgreSQL connection
          storage:
            connectionString: "Host=orchestrator-postgres;Database=stellaops_orchestrator;Username=orchestrator;Password=${POSTGRES_PASSWORD}"
            commandTimeoutSeconds: 60
            enableSensitiveDataLogging: false

          # Scheduler configuration
          scheduler:
            # Maximum concurrent jobs per tenant
            defaultConcurrencyLimit: 100
            # Default rate limit (requests per second)
            defaultRateLimit: 50
            # Job claim timeout before re-queue
            claimTimeoutMinutes: 30
            # Heartbeat interval for active jobs
            heartbeatIntervalSeconds: 30
            # Maximum heartbeat misses before job marked stale
            maxHeartbeatMisses: 3

          # Autoscaling configuration
          autoscaling:
            # Enable autoscaling metrics endpoint
            enabled: true
            # Queue depth threshold for scale-up signal
            queueDepthThreshold: 10000
            # Dispatch latency P95 threshold (ms)
            latencyP95ThresholdMs: 150
            # Scale-up cooldown period
            scaleUpCooldownSeconds: 60
            # Scale-down cooldown period
            scaleDownCooldownSeconds: 300

          # Load shedding configuration
          loadShedding:
            enabled: true
            # Warning threshold (load factor)
            warningThreshold: 0.8
            # Critical threshold (load factor)
            criticalThreshold: 1.0
            # Emergency threshold (load factor)
            emergencyThreshold: 1.5
            # Recovery cooldown
            recoveryCooldownSeconds: 30

          # Dead letter configuration
          deadLetter:
            # Maximum replay attempts
            maxReplayAttempts: 3
            # Entry expiration (days)
            expirationDays: 30
            # Purge interval
            purgeIntervalHours: 24

          # Backfill configuration
          backfill:
            # Maximum concurrent backfill requests
            maxConcurrentRequests: 5
            # Default batch size
            defaultBatchSize: 1000
            # Maximum retention lookback (days)
            maxRetentionDays: 90

# Service definitions
services:
  # Orchestrator web front end: two replicas serving HTTP on port 8080.
  orchestrator-web:
    image: registry.stella-ops.org/stellaops/orchestrator-web:2025.10.0-edge
    replicas: 2

    # Networking: the container port and the service port are both 8080.
    ports:
      - containerPort: 8080
    service:
      port: 8080

    # Environment: literal values here, plus every key of the
    # orchestrator-secrets Secret imported as environment variables.
    env:
      ORCHESTRATOR__CONFIG: /app/etc/orchestrator.yaml
      ASPNETCORE_ENVIRONMENT: Production
    envFrom:
      - secretRef:
          name: orchestrator-secrets

    # Mounts only the orchestrator.yaml key of the orchestrator-config
    # ConfigMap, as a single file at the path ORCHESTRATOR__CONFIG points to.
    configMounts:
      - name: orchestrator-config
        configMap: orchestrator-config
        mountPath: /app/etc/orchestrator.yaml
        subPath: orchestrator.yaml

    resources:
      limits:
        cpu: '1000m'
        memory: '1Gi'
      requests:
        cpu: '250m'
        memory: '256Mi'

    # Health checks, all HTTP GETs against port 8080.
    # The startup probe allows 30 failures at 5 s intervals (about 150 s after
    # the 3 s initial delay). In Kubernetes the liveness and readiness probes
    # do not run until the startup probe has succeeded.
    startupProbe:
      httpGet:
        port: 8080
        path: /startupz
      failureThreshold: 30
      periodSeconds: 5
      timeoutSeconds: 3
      initialDelaySeconds: 3
    livenessProbe:
      httpGet:
        port: 8080
        path: /livez
      failureThreshold: 3
      periodSeconds: 20
      timeoutSeconds: 5
      initialDelaySeconds: 10
    readinessProbe:
      httpGet:
        port: 8080
        path: /readyz
      failureThreshold: 3
      periodSeconds: 10
      timeoutSeconds: 5
      initialDelaySeconds: 5

  # Orchestrator background worker: a single replica. This entry declares no
  # service, container ports, or probes.
  # NOTE(review): without a liveness probe a hung worker is not restarted
  # automatically — confirm whether the worker exposes a health endpoint.
  orchestrator-worker:
    image: registry.stella-ops.org/stellaops/orchestrator-worker:2025.10.0-edge
    replicas: 1

    # Environment: literal values here, plus every key of the
    # orchestrator-secrets Secret imported as environment variables.
    env:
      ORCHESTRATOR__CONFIG: /app/etc/orchestrator.yaml
      DOTNET_ENVIRONMENT: Production
    envFrom:
      - secretRef:
          name: orchestrator-secrets

    # Same config file as orchestrator-web: the orchestrator.yaml key of the
    # orchestrator-config ConfigMap, mounted as a single file.
    configMounts:
      - name: orchestrator-config
        configMap: orchestrator-config
        mountPath: /app/etc/orchestrator.yaml
        subPath: orchestrator.yaml

    resources:
      limits:
        cpu: '500m'
        memory: '512Mi'
      requests:
        cpu: '100m'
        memory: '128Mi'

  # PostgreSQL 16 instance backing the orchestrator, exposed on port 5432.
  # Database and user names match the connection string in orchestrator.yaml
  # (Database=stellaops_orchestrator, Username=orchestrator).
  orchestrator-postgres:
    class: infrastructure
    image: docker.io/library/postgres:16-alpine
    service:
      port: 5432
    # All keys of orchestrator-postgres-secrets are imported as environment
    # variables.
    # NOTE(review): presumably this Secret supplies POSTGRES_PASSWORD — confirm.
    envFrom:
      - secretRef:
          name: orchestrator-postgres-secrets
    env:
      POSTGRES_DB: stellaops_orchestrator
      POSTGRES_USER: orchestrator
    # Data directory is backed by the orchestrator-postgres-data claim.
    # NOTE(review): the official postgres image docs recommend pointing PGDATA
    # at a subdirectory when the mount point is the root of a volume (e.g. one
    # containing lost+found) — confirm whether that applies to the storage
    # class in use before changing it, since moving PGDATA relocates the data.
    volumeMounts:
      - name: postgres-data
        mountPath: /var/lib/postgresql/data
    volumeClaims:
      - name: postgres-data
        claimName: orchestrator-postgres-data
    # Both probes exec pg_isready inside the container. timeoutSeconds is set
    # explicitly because the Kubernetes default is 1 second, which an exec
    # probe can exceed on a busy node; failureThreshold is spelled out to
    # match the probes on orchestrator-web.
    readinessProbe:
      exec:
        command:
          - pg_isready
          - -U
          - orchestrator
          - -d
          - stellaops_orchestrator
      initialDelaySeconds: 5
      periodSeconds: 10
      timeoutSeconds: 5
      failureThreshold: 3
    livenessProbe:
      exec:
        command:
          - pg_isready
          - -U
          - orchestrator
          - -d
          - stellaops_orchestrator
      initialDelaySeconds: 15
      periodSeconds: 30
      timeoutSeconds: 5
      failureThreshold: 3