Reduce idle CPU across 62 containers (phase 1)

- Add resource limits (heavy/medium/light tiers) to all 59 .NET services
- Add .NET GC tuning (server/workstation GC, DATAS, conserve memory)
- Convert FirstSignalSnapshotWriter from 10s polling to Valkey pub/sub
- Convert EnvironmentSettingsRefreshService from 60s polling to Valkey pub/sub
- Consolidate GraphAnalytics dual timers to single timer with idle-skip
- Increase healthcheck interval from 30s to 60s (configurable)
- Reduce debug logging to Information on 4 high-traffic services

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
master
2026-03-10 02:16:19 +02:00
parent c0c0267ac9
commit 166745f9f9
12 changed files with 601 additions and 89 deletions

View File

@@ -74,18 +74,60 @@ x-depends-infra: &depends-infra
condition: service_healthy
x-healthcheck-tcp: &healthcheck-tcp
interval: 30s
interval: ${HEALTHCHECK_INTERVAL:-60s}
timeout: 5s
retries: 3
start_period: 15s
x-healthcheck-worker: &healthcheck-worker
test: ["CMD", "/usr/local/bin/healthcheck.sh"]
interval: 30s
interval: ${HEALTHCHECK_INTERVAL:-60s}
timeout: 5s
start_period: 30s
retries: 3
# ---------------------------------------------------------------------------
# Resource limit tiers (Workstream 1: CPU optimization)
# ---------------------------------------------------------------------------
x-resources-heavy: &resources-heavy
deploy:
resources:
limits:
cpus: "1.0"
memory: 2G
x-resources-medium: &resources-medium
deploy:
resources:
limits:
cpus: "0.50"
memory: 1G
x-resources-light: &resources-light
deploy:
resources:
limits:
cpus: "0.25"
memory: 512M
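# ---------------------------------------------------------------------------
# .NET GC tuning tiers (Workstream 6: GC configuration)
#   DOTNET_gcServer:                "1" = Server GC, "0" = Workstation GC
#   DOTNET_GCConserveMemory:        0-9; higher values trade CPU for a smaller heap
#   DOTNET_GCDynamicAdaptationMode: "1" enables DATAS
# NOTE(review): DATAS is a Server GC feature, so it likely has no effect in the
# light tier (gcServer "0"); Server GC also needs more than one visible CPU, so
# confirm it actually engages under the cpus limits above.
# ---------------------------------------------------------------------------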
x-gc-heavy: &gc-heavy
DOTNET_gcServer: "1"
DOTNET_GCConserveMemory: "5"
DOTNET_GCDynamicAdaptationMode: "1"
x-gc-medium: &gc-medium
DOTNET_gcServer: "1"
DOTNET_GCConserveMemory: "7"
DOTNET_GCDynamicAdaptationMode: "1"
x-gc-light: &gc-light
DOTNET_gcServer: "0"
DOTNET_GCConserveMemory: "9"
DOTNET_GCDynamicAdaptationMode: "1"
networks:
stellaops:
driver: bridge
@@ -273,6 +315,7 @@ services:
# --- Slot 0: Router Gateway (Front Door) -----------------------------------
router-gateway:
<<: *resources-heavy
image: stellaops/router-gateway:dev
container_name: stellaops-router-gateway
restart: unless-stopped
@@ -282,7 +325,7 @@ services:
condition: service_completed_successfully
environment:
ASPNETCORE_URLS: "http://0.0.0.0:8080"
<<: *kestrel-cert
<<: [*kestrel-cert, *gc-heavy]
ConnectionStrings__Default: *postgres-connection
ConnectionStrings__Redis: "cache.stella-ops.local:6379"
Gateway__Auth__DpopEnabled: "false"
@@ -307,9 +350,12 @@ services:
Gateway__Auth__IdentityEnvelopeSigningKey: "${STELLAOPS_IDENTITY_ENVELOPE_SIGNING_KEY}"
# Audience validation is disabled until Authority includes the `aud` claim in access tokens.
# Gateway__Auth__Authority__Audiences__0: "stella-ops-api"
Logging__LogLevel__Microsoft.AspNetCore.Authentication: "Debug"
Logging__LogLevel__Microsoft.IdentityModel: "Debug"
Logging__LogLevel__StellaOps: "Debug"
# Logging__LogLevel__Microsoft.AspNetCore.Authentication: "Debug"
Logging__LogLevel__Microsoft.AspNetCore.Authentication: "Information"
# Logging__LogLevel__Microsoft.IdentityModel: "Debug"
Logging__LogLevel__Microsoft.IdentityModel: "Information"
# Logging__LogLevel__StellaOps: "Debug"
Logging__LogLevel__StellaOps: "Information"
volumes:
- *cert-volume
- console-dist:/app/wwwroot:ro
@@ -331,13 +377,14 @@ services:
# --- Slot 1: Platform ------------------------------------------------------
platform:
<<: *resources-heavy
image: stellaops/platform:dev
container_name: stellaops-platform
restart: unless-stopped
depends_on: *depends-infra
environment:
ASPNETCORE_URLS: "http://+:8080"
<<: [*kestrel-cert, *router-microservice-defaults]
<<: [*kestrel-cert, *router-microservice-defaults, *gc-heavy]
ConnectionStrings__Default: *postgres-connection
ConnectionStrings__Redis: "cache.stella-ops.local:6379"
Platform__Authority__Issuer: "https://authority.stella-ops.local/"
@@ -345,9 +392,12 @@ services:
Platform__Authority__BypassNetworks__0: "172.0.0.0/8"
Platform__Authority__BypassNetworks__1: "127.0.0.0/8"
Platform__Authority__BypassNetworks__2: "::1/128"
Logging__LogLevel__StellaOps.Auth: "Debug"
Logging__LogLevel__Microsoft.AspNetCore.Authentication: "Debug"
Logging__LogLevel__Microsoft.AspNetCore.Authorization: "Debug"
# Logging__LogLevel__StellaOps.Auth: "Debug"
Logging__LogLevel__StellaOps.Auth: "Information"
# Logging__LogLevel__Microsoft.AspNetCore.Authentication: "Debug"
Logging__LogLevel__Microsoft.AspNetCore.Authentication: "Information"
# Logging__LogLevel__Microsoft.AspNetCore.Authorization: "Debug"
Logging__LogLevel__Microsoft.AspNetCore.Authorization: "Information"
Platform__Storage__Driver: "postgres"
Platform__Storage__PostgresConnectionString: *postgres-connection
Platform__EnvironmentSettings__AuthorizeEndpoint: "https://stella-ops.local/connect/authorize"
@@ -418,6 +468,7 @@ services:
# --- Slot 2: Authority -----------------------------------------------------
authority:
<<: *resources-heavy
image: stellaops/authority:dev
container_name: stellaops-authority
restart: unless-stopped
@@ -464,7 +515,7 @@ services:
STELLAOPS_AUTHORITY_AUTHORITY__TENANTS__0__ID: "demo-prod"
STELLAOPS_AUTHORITY_AUTHORITY__TENANTS__0__DISPLAYNAME: "Demo Production"
STELLAOPS_AUTHORITY_AUTHORITY__TENANTS__0__STATUS: "active"
<<: *router-microservice-defaults
<<: [*router-microservice-defaults, *gc-heavy]
Router__Enabled: "${AUTHORITY_ROUTER_ENABLED:-true}"
Router__Messaging__ConsumerGroup: "authority"
volumes:
@@ -484,13 +535,14 @@ services:
# --- Slot 3: Gateway -------------------------------------------------------
gateway:
<<: *resources-light
image: stellaops/gateway:dev
container_name: stellaops-gateway
restart: unless-stopped
depends_on: *depends-infra
environment:
ASPNETCORE_URLS: "http://+:80;http://+:8080"
<<: [*kestrel-cert, *router-microservice-defaults]
<<: [*kestrel-cert, *router-microservice-defaults, *gc-light]
ConnectionStrings__Default: *postgres-connection
ConnectionStrings__Redis: "cache.stella-ops.local:6379"
Gateway__Auth__DpopEnabled: "false"
@@ -516,6 +568,7 @@ services:
# --- Slot 4: Attestor ------------------------------------------------------
attestor:
<<: *resources-light
image: stellaops/attestor:dev
container_name: stellaops-attestor
restart: unless-stopped
@@ -523,7 +576,7 @@ services:
- signer
environment:
ASPNETCORE_URLS: "http://+:8442"
<<: [*kestrel-cert, *router-microservice-defaults]
<<: [*kestrel-cert, *router-microservice-defaults, *gc-light]
ATTESTOR_ATTESTOR__SIGNER__BASEURL: "http://signer.stella-ops.local"
ATTESTOR_ATTESTOR__POSTGRES__CONNECTIONSTRING: *postgres-connection
ConnectionStrings__Default: *postgres-connection
@@ -546,6 +599,7 @@ services:
# --- Slot 5: Attestor TileProxy --------------------------------------------
attestor-tileproxy:
<<: *resources-light
image: stellaops/attestor-tileproxy:dev
container_name: stellaops-attestor-tileproxy
restart: unless-stopped
@@ -553,7 +607,7 @@ services:
- attestor
environment:
ASPNETCORE_URLS: "http://+:8080"
<<: *kestrel-cert
<<: [*kestrel-cert, *gc-light]
TILE_PROXY__tile_proxy__UpstreamUrl: "https://rekor.sigstore.dev"
TILE_PROXY__tile_proxy__Origin: "stellaops-tileproxy"
TILE_PROXY__tile_proxy__Cache__BasePath: "/var/cache/stellaops/tiles"
@@ -573,13 +627,14 @@ services:
# --- Slot 6: Evidence Locker ------------------------------------------------
evidence-locker-web:
<<: *resources-light
image: stellaops/evidence-locker-web:dev
container_name: stellaops-evidence-locker-web
restart: unless-stopped
depends_on: *depends-infra
environment:
ASPNETCORE_URLS: "http://+:8080"
<<: [*kestrel-cert, *router-microservice-defaults]
<<: [*kestrel-cert, *router-microservice-defaults, *gc-light]
EvidenceLocker__Database__ConnectionString: *postgres-connection
EvidenceLocker__Database__ApplyMigrationsAtStartup: "true"
EvidenceLocker__ObjectStore__Kind: "FileSystem"
@@ -619,12 +674,13 @@ services:
labels: *release-labels
evidence-locker-worker:
<<: *resources-light
image: stellaops/evidence-locker-worker:dev
container_name: stellaops-evidence-locker-worker
restart: unless-stopped
depends_on: *depends-infra
environment:
<<: *kestrel-cert
<<: [*kestrel-cert, *gc-light]
EvidenceLocker__Database__ConnectionString: *postgres-connection
EvidenceLocker__Database__ApplyMigrationsAtStartup: "true"
EvidenceLocker__ObjectStore__Kind: "FileSystem"
@@ -666,6 +722,7 @@ services:
labels: *release-labels
scanner-web:
<<: *resources-heavy
image: stellaops/scanner-web:dev
container_name: stellaops-scanner-web
restart: unless-stopped
@@ -680,7 +737,7 @@ services:
condition: service_healthy
environment:
ASPNETCORE_URLS: "http://+:8444"
<<: [*kestrel-cert, *router-microservice-defaults]
<<: [*kestrel-cert, *router-microservice-defaults, *gc-heavy]
SCANNER_SCANNER__PLUGINS__BASEDIRECTORY: "/tmp/stellaops"
SCANNER_SCANNER__STORAGE__DRIVER: "postgres"
SCANNER_SCANNER__STORAGE__DSN: *postgres-connection
@@ -737,6 +794,7 @@ services:
labels: *release-labels
scanner-worker:
<<: *resources-heavy
image: stellaops/scanner-worker:dev
container_name: stellaops-scanner-worker
restart: unless-stopped
@@ -750,7 +808,7 @@ services:
rustfs:
condition: service_healthy
environment:
<<: *kestrel-cert
<<: [*kestrel-cert, *gc-heavy]
# Scanner worker options
Scanner__Worker__Authority__Enabled: "false"
BinaryIndex__Enabled: "false"
@@ -786,6 +844,7 @@ services:
# --- Slot 9: Concelier -----------------------------------------------------
concelier:
<<: *resources-medium
image: stellaops/concelier:dev
container_name: stellaops-concelier
restart: unless-stopped
@@ -798,7 +857,7 @@ services:
condition: service_healthy
environment:
ASPNETCORE_URLS: "http://+:8080"
<<: [*kestrel-cert, *router-microservice-defaults]
<<: [*kestrel-cert, *router-microservice-defaults, *gc-medium]
CONCELIER_PLUGINS__BASEDIRECTORY: "/tmp/stellaops"
CONCELIER_POSTGRESSTORAGE__CONNECTIONSTRING: *postgres-connection
CONCELIER_POSTGRESSTORAGE__ENABLED: "true"
@@ -834,13 +893,14 @@ services:
# --- Slot 10: Excititor ----------------------------------------------------
excititor:
<<: *resources-medium
image: stellaops/excititor:dev
container_name: stellaops-excititor
restart: unless-stopped
depends_on: *depends-infra
environment:
ASPNETCORE_URLS: "http://+:8080"
<<: [*kestrel-cert, *router-microservice-defaults]
<<: [*kestrel-cert, *router-microservice-defaults, *gc-medium]
# Postgres options (section: Postgres:Excititor)
Postgres__Excititor__ConnectionString: *postgres-connection
Postgres__Excititor__SchemaName: "vex"
@@ -869,6 +929,7 @@ services:
labels: *release-labels
excititor-worker:
<<: *resources-medium
image: stellaops/excititor-worker:dev
container_name: stellaops-excititor-worker
restart: unless-stopped
@@ -878,7 +939,7 @@ services:
valkey:
condition: service_healthy
environment:
<<: *kestrel-cert
<<: [*kestrel-cert, *gc-medium]
# Postgres options (section: Postgres:Excititor)
Postgres__Excititor__ConnectionString: *postgres-connection
Postgres__Excititor__SchemaName: "vex"
@@ -903,13 +964,14 @@ services:
# --- Slot 11: VexHub -------------------------------------------------------
vexhub-web:
<<: *resources-light
image: stellaops/vexhub-web:dev
container_name: stellaops-vexhub-web
restart: unless-stopped
depends_on: *depends-infra
environment:
ASPNETCORE_URLS: "http://+:8080"
<<: [*kestrel-cert, *router-microservice-defaults]
<<: [*kestrel-cert, *router-microservice-defaults, *gc-light]
ConnectionStrings__Default: *postgres-connection
ConnectionStrings__Redis: "cache.stella-ops.local:6379"
Postgres__ConnectionString: *postgres-connection
@@ -932,13 +994,14 @@ services:
# --- Slot 12: VexLens ------------------------------------------------------
vexlens-web:
<<: *resources-light
image: stellaops/vexlens-web:dev
container_name: stellaops-vexlens-web
restart: unless-stopped
depends_on: *depends-infra
environment:
ASPNETCORE_URLS: "http://+:8080"
<<: [*kestrel-cert, *router-microservice-defaults]
<<: [*kestrel-cert, *router-microservice-defaults, *gc-light]
ConnectionStrings__Default: *postgres-connection
ConnectionStrings__Redis: "cache.stella-ops.local:6379"
Router__Enabled: "${VEXLENS_ROUTER_ENABLED:-true}"
@@ -959,13 +1022,14 @@ services:
# --- Slot 13: VulnExplorer (api) [src/Findings/StellaOps.VulnExplorer.Api] ---
api:
<<: *resources-light
image: stellaops/api:dev
container_name: stellaops-api
restart: unless-stopped
depends_on: *depends-infra
environment:
ASPNETCORE_URLS: "http://+:8080"
<<: [*kestrel-cert, *router-microservice-defaults]
<<: [*kestrel-cert, *router-microservice-defaults, *gc-light]
ConnectionStrings__Default: *postgres-connection
ConnectionStrings__Redis: "cache.stella-ops.local:6379"
Router__Enabled: "${VULNEXPLORER_ROUTER_ENABLED:-true}"
@@ -986,13 +1050,14 @@ services:
# --- Slot 14: Policy Engine ------------------------------------------------
policy-engine:
<<: *resources-medium
image: stellaops/policy-engine:dev
container_name: stellaops-policy-engine
restart: unless-stopped
depends_on: *depends-infra
environment:
ASPNETCORE_URLS: "http://+:8080"
<<: [*kestrel-cert, *router-microservice-defaults]
<<: [*kestrel-cert, *router-microservice-defaults, *gc-medium]
STELLAOPS_POLICY_ENGINE_Postgres__Policy__ConnectionString: *postgres-connection
STELLAOPS_POLICY_ENGINE_ConnectionStrings__Redis: "cache.stella-ops.local:6379"
STELLAOPS_POLICY_ENGINE_PolicyEngine__ResourceServer__Authority: "https://authority.stella-ops.local/"
@@ -1012,8 +1077,10 @@ services:
PolicyEngine__ResourceServer__BypassNetworks__0: "172.19.0.0/16"
PolicyEngine__ResourceServer__BypassNetworks__1: "127.0.0.1/32"
PolicyEngine__ResourceServer__BypassNetworks__2: "::1/128"
Logging__LogLevel__Microsoft.AspNetCore.Authentication: "Debug"
Logging__LogLevel__Microsoft.IdentityModel: "Debug"
# Logging__LogLevel__Microsoft.AspNetCore.Authentication: "Debug"
Logging__LogLevel__Microsoft.AspNetCore.Authentication: "Information"
# Logging__LogLevel__Microsoft.IdentityModel: "Debug"
Logging__LogLevel__Microsoft.IdentityModel: "Information"
Router__Enabled: "${POLICY_ENGINE_ROUTER_ENABLED:-true}"
Router__Messaging__ConsumerGroup: "policy-engine"
volumes:
@@ -1033,13 +1100,14 @@ services:
# --- Slot 15: Policy Gateway -----------------------------------------------
policy:
<<: *resources-medium
image: stellaops/policy:dev
container_name: stellaops-policy
restart: unless-stopped
depends_on: *depends-infra
environment:
ASPNETCORE_URLS: "http://+:8084"
<<: [*kestrel-cert, *router-microservice-defaults]
<<: [*kestrel-cert, *router-microservice-defaults, *gc-medium]
ConnectionStrings__Default: *postgres-connection
ConnectionStrings__Redis: "cache.stella-ops.local:6379"
Postgres__Policy__ConnectionString: *postgres-connection
@@ -1077,13 +1145,14 @@ services:
# --- Slot 16: RiskEngine [src/Findings/StellaOps.RiskEngine.*] ---------------
riskengine-web:
<<: *resources-medium
image: stellaops/riskengine-web:dev
container_name: stellaops-riskengine-web
restart: unless-stopped
depends_on: *depends-infra
environment:
ASPNETCORE_URLS: "http://+:8080"
<<: [*kestrel-cert, *router-microservice-defaults]
<<: [*kestrel-cert, *router-microservice-defaults, *gc-medium]
ConnectionStrings__Default: *postgres-connection
ConnectionStrings__Redis: "cache.stella-ops.local:6379"
RISKENGINE__STORAGE__DRIVER: "postgres"
@@ -1105,12 +1174,13 @@ services:
labels: *release-labels
riskengine-worker:
<<: *resources-medium
image: stellaops/riskengine-worker:dev
container_name: stellaops-riskengine-worker
restart: unless-stopped
depends_on: *depends-infra
environment:
<<: *kestrel-cert
<<: [*kestrel-cert, *gc-medium]
ConnectionStrings__Default: *postgres-connection
ConnectionStrings__Redis: "cache.stella-ops.local:6379"
volumes:
@@ -1125,13 +1195,14 @@ services:
# --- Slot 17: Orchestrator -------------------------------------------------
jobengine:
<<: *resources-heavy
image: stellaops/orchestrator:dev
container_name: stellaops-jobengine
restart: unless-stopped
depends_on: *depends-infra
environment:
ASPNETCORE_URLS: "http://+:8080"
<<: [*kestrel-cert, *router-microservice-defaults]
<<: [*kestrel-cert, *router-microservice-defaults, *gc-heavy]
ConnectionStrings__Default: *postgres-connection
ConnectionStrings__Redis: "cache.stella-ops.local:6379"
Authority__ResourceServer__Authority: "https://authority.stella-ops.local/"
@@ -1162,12 +1233,13 @@ services:
labels: *release-labels
jobengine-worker:
<<: *resources-medium
image: stellaops/orchestrator-worker:dev
container_name: stellaops-jobengine-worker
restart: unless-stopped
depends_on: *depends-infra
environment:
<<: *kestrel-cert
<<: [*kestrel-cert, *gc-medium]
ConnectionStrings__Default: *postgres-connection
ConnectionStrings__Redis: "cache.stella-ops.local:6379"
volumes:
@@ -1182,13 +1254,14 @@ services:
# --- Slot 18: TaskRunner ---------------------------------------------------
taskrunner-web:
<<: *resources-light
image: stellaops/taskrunner-web:dev
container_name: stellaops-taskrunner-web
restart: unless-stopped
depends_on: *depends-infra
environment:
ASPNETCORE_URLS: "http://+:8080"
<<: [*kestrel-cert, *router-microservice-defaults]
<<: [*kestrel-cert, *router-microservice-defaults, *gc-light]
ConnectionStrings__Default: *postgres-connection
ConnectionStrings__Redis: "cache.stella-ops.local:6379"
TASKRUNNER__STORAGE__DRIVER: "postgres"
@@ -1213,12 +1286,13 @@ services:
labels: *release-labels
taskrunner-worker:
<<: *resources-light
image: stellaops/taskrunner-worker:dev
container_name: stellaops-taskrunner-worker
restart: unless-stopped
depends_on: *depends-infra
environment:
<<: *kestrel-cert
<<: [*kestrel-cert, *gc-light]
ConnectionStrings__Default: *postgres-connection
ConnectionStrings__Redis: "cache.stella-ops.local:6379"
TASKRUNNER__STORAGE__DRIVER: "postgres"
@@ -1245,13 +1319,14 @@ services:
# --- Slot 19: Scheduler ----------------------------------------------------
scheduler-web:
<<: *resources-medium
image: stellaops/scheduler-web:dev
container_name: stellaops-scheduler-web
restart: unless-stopped
depends_on: *depends-infra
environment:
ASPNETCORE_URLS: "http://+:8080"
<<: [*kestrel-cert, *router-microservice-defaults]
<<: [*kestrel-cert, *router-microservice-defaults, *gc-medium]
ConnectionStrings__Default: *postgres-connection
ConnectionStrings__Redis: "cache.stella-ops.local:6379"
Scheduler__Authority__Enabled: "false"
@@ -1283,6 +1358,7 @@ services:
labels: *release-labels
scheduler-worker:
<<: *resources-medium
image: stellaops/scheduler-worker:dev
container_name: stellaops-scheduler-worker
restart: unless-stopped
@@ -1292,7 +1368,7 @@ services:
valkey:
condition: service_healthy
environment:
<<: *kestrel-cert
<<: [*kestrel-cert, *gc-medium]
# Queue config (Redis transport)
scheduler__queue__Kind: "Redis"
scheduler__queue__Redis__ConnectionString: "cache.stella-ops.local:6379"
@@ -1320,13 +1396,14 @@ services:
# --- Slot 20: Graph API ----------------------------------------------------
graph-api:
<<: *resources-medium
image: stellaops/graph-api:dev
container_name: stellaops-graph-api
restart: unless-stopped
depends_on: *depends-infra
environment:
ASPNETCORE_URLS: "http://+:8080"
<<: [*kestrel-cert, *router-microservice-defaults]
<<: [*kestrel-cert, *router-microservice-defaults, *gc-medium]
ConnectionStrings__Default: *postgres-connection
ConnectionStrings__Redis: "cache.stella-ops.local:6379"
Router__Enabled: "${GRAPH_ROUTER_ENABLED:-true}"
@@ -1347,13 +1424,14 @@ services:
# --- Slot 21: Cartographer -------------------------------------------------
cartographer:
<<: *resources-light
image: stellaops/cartographer:dev
container_name: stellaops-cartographer
restart: unless-stopped
depends_on: *depends-infra
environment:
ASPNETCORE_URLS: "http://+:8080"
<<: [*kestrel-cert, *router-microservice-defaults]
<<: [*kestrel-cert, *router-microservice-defaults, *gc-light]
ConnectionStrings__Default: *postgres-connection
ConnectionStrings__Redis: "cache.stella-ops.local:6379"
Router__Enabled: "${CARTOGRAPHER_ROUTER_ENABLED:-true}"
@@ -1374,13 +1452,14 @@ services:
# --- Slot 22: ReachGraph ---------------------------------------------------
reachgraph-web:
<<: *resources-light
image: stellaops/reachgraph-web:dev
container_name: stellaops-reachgraph-web
restart: unless-stopped
depends_on: *depends-infra
environment:
ASPNETCORE_URLS: "http://+:8080"
<<: [*kestrel-cert, *router-microservice-defaults]
<<: [*kestrel-cert, *router-microservice-defaults, *gc-light]
ConnectionStrings__Default: *postgres-connection
ConnectionStrings__Redis: "cache.stella-ops.local:6379"
Router__Enabled: "${REACHGRAPH_ROUTER_ENABLED:-true}"
@@ -1401,13 +1480,14 @@ services:
# --- Slot 23: Timeline Indexer ---------------------------------------------
timeline-indexer-web:
<<: *resources-light
image: stellaops/timeline-indexer-web:dev
container_name: stellaops-timeline-indexer-web
restart: unless-stopped
depends_on: *depends-infra
environment:
ASPNETCORE_URLS: "http://+:8080"
<<: [*kestrel-cert, *router-microservice-defaults]
<<: [*kestrel-cert, *router-microservice-defaults, *gc-light]
ConnectionStrings__Default: *postgres-connection
ConnectionStrings__Redis: "cache.stella-ops.local:6379"
TIMELINE_Postgres__Timeline__ConnectionString: *postgres-connection
@@ -1428,12 +1508,13 @@ services:
labels: *release-labels
timeline-indexer-worker:
<<: *resources-light
image: stellaops/timeline-indexer-worker:dev
container_name: stellaops-timeline-indexer-worker
restart: unless-stopped
depends_on: *depends-infra
environment:
<<: *kestrel-cert
<<: [*kestrel-cert, *gc-light]
ConnectionStrings__Default: *postgres-connection
ConnectionStrings__Redis: "cache.stella-ops.local:6379"
TIMELINE_Postgres__Timeline__ConnectionString: *postgres-connection
@@ -1449,13 +1530,14 @@ services:
# --- Slot 24: Timeline ----------------------------------------------------
timeline-web:
<<: *resources-light
image: stellaops/timeline-web:dev
container_name: stellaops-timeline-web
restart: unless-stopped
depends_on: *depends-infra
environment:
ASPNETCORE_URLS: "http://+:8080"
<<: [*kestrel-cert, *router-microservice-defaults]
<<: [*kestrel-cert, *router-microservice-defaults, *gc-light]
ConnectionStrings__Default: *postgres-connection
ConnectionStrings__Redis: "cache.stella-ops.local:6379"
Authority__ResourceServer__Authority: "http://authority.stella-ops.local/"
@@ -1481,13 +1563,14 @@ services:
# --- Slot 25: Findings Ledger ----------------------------------------------
findings-ledger-web:
<<: *resources-medium
image: stellaops/findings-ledger-web:dev
container_name: stellaops-findings-ledger-web
restart: unless-stopped
depends_on: *depends-infra
environment:
ASPNETCORE_URLS: "http://+:8080"
<<: [*kestrel-cert, *router-microservice-defaults]
<<: [*kestrel-cert, *router-microservice-defaults, *gc-medium]
ConnectionStrings__Default: *postgres-connection
ConnectionStrings__FindingsLedger: *postgres-connection
ConnectionStrings__Redis: "cache.stella-ops.local:6379"
@@ -1498,8 +1581,10 @@ services:
findings__ledger__Authority__Audiences__0: ""
findings__ledger__Authority__RequiredScopes__0: "findings:read"
findings__ledger__Authority__BypassNetworks__0: "172.19.0.0/16"
Logging__LogLevel__Microsoft.AspNetCore.Authentication: "Debug"
Logging__LogLevel__Microsoft.IdentityModel: "Debug"
# Logging__LogLevel__Microsoft.AspNetCore.Authentication: "Debug"
Logging__LogLevel__Microsoft.AspNetCore.Authentication: "Information"
# Logging__LogLevel__Microsoft.IdentityModel: "Debug"
Logging__LogLevel__Microsoft.IdentityModel: "Information"
findings__ledger__Attachments__EncryptionKey: "IiIiIiIiIiIiIiIiIiIiIiIiIiIiIiIiIiIiIiIiIiI="
findings__ledger__Attachments__SignedUrlBase: "http://findings.stella-ops.local/attachments"
findings__ledger__Attachments__SignedUrlSecret: "dev-signed-url-secret"
@@ -1524,13 +1609,14 @@ services:
# --- Slot 26: Doctor -------------------------------------------------------
doctor-web:
<<: *resources-light
image: stellaops/doctor-web:dev
container_name: stellaops-doctor-web
restart: unless-stopped
depends_on: *depends-infra
environment:
ASPNETCORE_URLS: "http://+:8080"
<<: [*kestrel-cert, *router-microservice-defaults]
<<: [*kestrel-cert, *router-microservice-defaults, *gc-light]
ConnectionStrings__Default: *postgres-connection
ConnectionStrings__Redis: "cache.stella-ops.local:6379"
Doctor__Authority__Issuer: "https://authority.stella-ops.local/"
@@ -1554,13 +1640,14 @@ services:
labels: *release-labels
doctor-scheduler:
<<: *resources-light
image: stellaops/doctor-scheduler:dev
container_name: stellaops-doctor-scheduler
restart: unless-stopped
depends_on: *depends-infra
environment:
ASPNETCORE_URLS: "http://+:80"
<<: [*kestrel-cert, *router-microservice-defaults]
<<: [*kestrel-cert, *router-microservice-defaults, *gc-light]
ConnectionStrings__Default: *postgres-connection
ConnectionStrings__Redis: "cache.stella-ops.local:6379"
Router__Enabled: "${DOCTOR_SCHEDULER_ROUTER_ENABLED:-true}"
@@ -1578,13 +1665,14 @@ services:
# --- Slot 27: OpsMemory (src/AdvisoryAI/StellaOps.OpsMemory.WebService) ---
opsmemory-web:
<<: *resources-light
image: stellaops/opsmemory-web:dev
container_name: stellaops-opsmemory-web
restart: unless-stopped
depends_on: *depends-infra
environment:
ASPNETCORE_URLS: "http://+:8080"
<<: [*kestrel-cert, *router-microservice-defaults]
<<: [*kestrel-cert, *router-microservice-defaults, *gc-light]
ConnectionStrings__Default: *postgres-connection
ConnectionStrings__Redis: "cache.stella-ops.local:6379"
Router__Enabled: "${OPSMEMORY_ROUTER_ENABLED:-true}"
@@ -1605,13 +1693,14 @@ services:
# --- Slot 28: Notifier ----------------------------------------------------
notifier-web:
<<: *resources-medium
image: stellaops/notifier-web:dev
container_name: stellaops-notifier-web
restart: unless-stopped
depends_on: *depends-infra
environment:
ASPNETCORE_URLS: "http://+:8080"
<<: [*kestrel-cert, *router-microservice-defaults]
<<: [*kestrel-cert, *router-microservice-defaults, *gc-medium]
ConnectionStrings__Default: *postgres-connection
ConnectionStrings__Redis: "cache.stella-ops.local:6379"
Authority__ResourceServer__Authority: "https://authority.stella-ops.local/"
@@ -1641,12 +1730,13 @@ services:
labels: *release-labels
notifier-worker:
<<: *resources-light
image: stellaops/notifier-worker:dev
container_name: stellaops-notifier-worker
restart: unless-stopped
depends_on: *depends-infra
environment:
<<: *kestrel-cert
<<: [*kestrel-cert, *gc-light]
ConnectionStrings__Default: *postgres-connection
ConnectionStrings__Redis: "cache.stella-ops.local:6379"
notifier__queue__Transport: "redis"
@@ -1664,13 +1754,14 @@ services:
# --- Slot 29: Notify ------------------------------------------------------
notify-web:
<<: *resources-medium
image: stellaops/notify-web:dev
container_name: stellaops-notify-web
restart: unless-stopped
depends_on: *depends-infra
environment:
ASPNETCORE_URLS: "http://+:8080"
<<: [*kestrel-cert, *router-microservice-defaults]
<<: [*kestrel-cert, *router-microservice-defaults, *gc-medium]
DOTNET_ENVIRONMENT: Production
NOTIFY_NOTIFY__STORAGE__DRIVER: "postgres"
NOTIFY_NOTIFY__STORAGE__CONNECTIONSTRING: *postgres-connection
@@ -1700,6 +1791,7 @@ services:
# --- Slot 30: Signer ------------------------------------------------------
signer:
<<: *resources-light
image: stellaops/signer:dev
container_name: stellaops-signer
restart: unless-stopped
@@ -1708,7 +1800,7 @@ services:
- valkey
environment:
ASPNETCORE_URLS: "http://+:8441"
<<: [*kestrel-cert, *router-microservice-defaults]
<<: [*kestrel-cert, *router-microservice-defaults, *gc-light]
ConnectionStrings__KeyManagement: *postgres-connection
ConnectionStrings__Default: *postgres-connection
Router__Enabled: "${SIGNER_ROUTER_ENABLED:-true}"
@@ -1729,13 +1821,14 @@ services:
# --- Slot 31: SmRemote ----------------------------------------------------
smremote:
<<: *resources-light
image: stellaops/smremote:dev
container_name: stellaops-smremote
restart: unless-stopped
depends_on: *depends-infra
environment:
ASPNETCORE_URLS: "http://+:8080"
<<: [*kestrel-cert, *router-microservice-defaults]
<<: [*kestrel-cert, *router-microservice-defaults, *gc-light]
ConnectionStrings__Default: *postgres-connection
ConnectionStrings__Redis: "cache.stella-ops.local:6379"
Router__Enabled: "${SMREMOTE_ROUTER_ENABLED:-true}"
@@ -1756,13 +1849,14 @@ services:
# --- Slot 32: AirGap Controller --------------------------------------------
airgap-controller:
<<: *resources-light
image: stellaops/airgap-controller:dev
container_name: stellaops-airgap-controller
restart: unless-stopped
depends_on: *depends-infra
environment:
ASPNETCORE_URLS: "http://+:8080"
<<: [*kestrel-cert, *router-microservice-defaults]
<<: [*kestrel-cert, *router-microservice-defaults, *gc-light]
ConnectionStrings__Default: *postgres-connection
ConnectionStrings__Redis: "cache.stella-ops.local:6379"
Router__Enabled: "${AIRGAP_CONTROLLER_ROUTER_ENABLED:-true}"
@@ -1783,6 +1877,7 @@ services:
# --- Slot 33: AirGap Time -------------------------------------------------
airgap-time:
<<: *resources-light
image: stellaops/airgap-time:dev
container_name: stellaops-airgap-time
restart: unless-stopped
@@ -1790,7 +1885,7 @@ services:
depends_on: *depends-infra
environment:
ASPNETCORE_URLS: "http://+:8080"
<<: [*kestrel-cert, *router-microservice-defaults]
<<: [*kestrel-cert, *router-microservice-defaults, *gc-light]
ConnectionStrings__Default: *postgres-connection
Router__Enabled: "${AIRGAP_TIME_ROUTER_ENABLED:-true}"
Router__Messaging__ConsumerGroup: "airgap-time"
@@ -1810,13 +1905,14 @@ services:
# --- Slot 34: PacksRegistry -----------------------------------------------
packsregistry-web:
<<: *resources-light
image: stellaops/packsregistry-web:dev
container_name: stellaops-packsregistry-web
restart: unless-stopped
depends_on: *depends-infra
environment:
ASPNETCORE_URLS: "http://+:8080"
<<: [*kestrel-cert, *router-microservice-defaults]
<<: [*kestrel-cert, *router-microservice-defaults, *gc-light]
ConnectionStrings__Default: *postgres-connection
ConnectionStrings__Redis: "cache.stella-ops.local:6379"
PACKSREGISTRY__STORAGE__DRIVER: "postgres"
@@ -1840,12 +1936,13 @@ services:
labels: *release-labels
packsregistry-worker:
<<: *resources-light
image: stellaops/packsregistry-worker:dev
container_name: stellaops-packsregistry-worker
restart: unless-stopped
depends_on: *depends-infra
environment:
<<: *kestrel-cert
<<: [*kestrel-cert, *gc-light]
ConnectionStrings__Default: *postgres-connection
ConnectionStrings__Redis: "cache.stella-ops.local:6379"
volumes:
@@ -1860,13 +1957,14 @@ services:
# --- Slot 35: Registry Token -----------------------------------------------
registry-token:
<<: *resources-light
image: stellaops/registry-token:dev
container_name: stellaops-registry-token
restart: unless-stopped
depends_on: *depends-infra
environment:
ASPNETCORE_URLS: "http://+:8080"
<<: [*kestrel-cert, *router-microservice-defaults]
<<: [*kestrel-cert, *router-microservice-defaults, *gc-light]
ConnectionStrings__Default: *postgres-connection
RegistryTokenService__Signing__Issuer: "http://registry-token.stella-ops.local"
RegistryTokenService__Signing__KeyPath: "/app/etc/certs/kestrel-dev.pfx"
@@ -1898,13 +1996,14 @@ services:
# --- Slot 36: BinaryIndex --------------------------------------------------
binaryindex-web:
<<: *resources-light
image: stellaops/binaryindex-web:dev
container_name: stellaops-binaryindex-web
restart: unless-stopped
depends_on: *depends-infra
environment:
ASPNETCORE_URLS: "http://+:8080"
<<: [*kestrel-cert, *router-microservice-defaults]
<<: [*kestrel-cert, *router-microservice-defaults, *gc-light]
ConnectionStrings__Default: *postgres-connection
ConnectionStrings__Redis: "cache.stella-ops.local:6379"
Router__Enabled: "${BINARYINDEX_ROUTER_ENABLED:-true}"
@@ -1925,6 +2024,7 @@ services:
# --- Slot 37: Issuer Directory ---------------------------------------------
issuer-directory:
<<: *resources-light
image: stellaops/issuer-directory-web:dev
container_name: stellaops-issuer-directory
restart: unless-stopped
@@ -1933,7 +2033,7 @@ services:
- authority
environment:
ASPNETCORE_URLS: "http://+:8080"
<<: [*kestrel-cert, *router-microservice-defaults]
<<: [*kestrel-cert, *router-microservice-defaults, *gc-light]
ISSUERDIRECTORY__AUTHORITY__ENABLED: "true"
ISSUERDIRECTORY__AUTHORITY__ISSUER: "${AUTHORITY_ISSUER:-http://authority.stella-ops.local}"
ISSUERDIRECTORY__AUTHORITY__AUDIENCES__0: "api://issuer-directory"
@@ -1960,13 +2060,14 @@ services:
# --- Slot 38: Symbols ------------------------------------------------------
symbols:
<<: *resources-light
image: stellaops/symbols:dev
container_name: stellaops-symbols
restart: unless-stopped
depends_on: *depends-infra
environment:
ASPNETCORE_URLS: "http://+:8080"
<<: [*kestrel-cert, *router-microservice-defaults]
<<: [*kestrel-cert, *router-microservice-defaults, *gc-light]
ConnectionStrings__Default: *postgres-connection
ConnectionStrings__Redis: "cache.stella-ops.local:6379"
Authority__ResourceServer__Authority: "https://authority.stella-ops.local/"
@@ -1991,13 +2092,14 @@ services:
# --- Slot 39: SbomService --------------------------------------------------
sbomservice:
<<: *resources-light
image: stellaops/sbomservice:dev
container_name: stellaops-sbomservice
restart: unless-stopped
depends_on: *depends-infra
environment:
ASPNETCORE_URLS: "http://+:8080"
<<: [*kestrel-cert, *router-microservice-defaults]
<<: [*kestrel-cert, *router-microservice-defaults, *gc-light]
ConnectionStrings__Default: *postgres-connection
ConnectionStrings__Redis: "cache.stella-ops.local:6379"
Router__Enabled: "${SBOMSERVICE_ROUTER_ENABLED:-true}"
@@ -2018,13 +2120,14 @@ services:
# --- Slot 40: ExportCenter -------------------------------------------------
export:
<<: *resources-light
image: stellaops/export:dev
container_name: stellaops-export
restart: unless-stopped
depends_on: *depends-infra
environment:
ASPNETCORE_URLS: "http://+:8080"
<<: [*kestrel-cert, *router-microservice-defaults]
<<: [*kestrel-cert, *router-microservice-defaults, *gc-light]
ConnectionStrings__Default: *postgres-connection
ConnectionStrings__Redis: "cache.stella-ops.local:6379"
Export__AllowInMemoryRepositories: "true"
@@ -2055,12 +2158,13 @@ services:
labels: *release-labels
export-worker:
<<: *resources-light
image: stellaops/export-worker:dev
container_name: stellaops-export-worker
restart: unless-stopped
depends_on: *depends-infra
environment:
<<: *kestrel-cert
<<: [*kestrel-cert, *gc-light]
ConnectionStrings__Default: *postgres-connection
ConnectionStrings__Redis: "cache.stella-ops.local:6379"
Export__AllowInMemoryRepositories: "true"
@@ -2082,13 +2186,14 @@ services:
# --- Slot 41: Replay -------------------------------------------------------
replay-web:
<<: *resources-light
image: stellaops/replay-web:dev
container_name: stellaops-replay-web
restart: unless-stopped
depends_on: *depends-infra
environment:
ASPNETCORE_URLS: "http://+:8080"
<<: [*kestrel-cert, *router-microservice-defaults]
<<: [*kestrel-cert, *router-microservice-defaults, *gc-light]
ConnectionStrings__Default: *postgres-connection
ConnectionStrings__Redis: "cache.stella-ops.local:6379"
REPLAY__STORAGE__DRIVER: "postgres"
@@ -2113,13 +2218,14 @@ services:
# --- Slot 42: Integrations ------------------------------------------------
integrations-web:
<<: *resources-light
image: stellaops/integrations-web:dev
container_name: stellaops-integrations-web
restart: unless-stopped
depends_on: *depends-infra
environment:
ASPNETCORE_URLS: "http://+:8080"
<<: [*kestrel-cert, *router-microservice-defaults]
<<: [*kestrel-cert, *router-microservice-defaults, *gc-light]
ConnectionStrings__IntegrationsDb: *postgres-connection
ConnectionStrings__Default: *postgres-connection
ConnectionStrings__Redis: "cache.stella-ops.local:6379"
@@ -2151,6 +2257,7 @@ services:
# --- Slot 43: Zastava Webhook ----------------------------------------------
zastava-webhook:
<<: *resources-light
image: stellaops/zastava-webhook:dev
container_name: stellaops-zastava-webhook
restart: unless-stopped
@@ -2159,7 +2266,7 @@ services:
condition: service_healthy
environment:
ASPNETCORE_URLS: "http://+:8080"
<<: *kestrel-cert
<<: [*kestrel-cert, *gc-light]
# Runtime authority (used by token provider for OIDC discovery)
zastava__runtime__authority__Issuer: "https://authority.stella-ops.local/"
zastava__runtime__authority__allowStaticTokenFallback: "true"
@@ -2193,13 +2300,14 @@ services:
# --- Slot 44: Signals ------------------------------------------------------
signals:
<<: *resources-light
image: stellaops/signals:dev
container_name: stellaops-signals
restart: unless-stopped
depends_on: *depends-infra
environment:
ASPNETCORE_URLS: "http://+:8080"
<<: [*kestrel-cert, *router-microservice-defaults]
<<: [*kestrel-cert, *router-microservice-defaults, *gc-light]
ConnectionStrings__Default: *postgres-connection
ConnectionStrings__Redis: "cache.stella-ops.local:6379"
Authority__ResourceServer__Authority: "https://authority.stella-ops.local/"
@@ -2230,6 +2338,7 @@ services:
# --- Slot 45: Advisory AI --------------------------------------------------
advisory-ai-web:
<<: *resources-medium
image: stellaops/advisory-ai-web:dev
container_name: stellaops-advisory-ai-web
restart: unless-stopped
@@ -2237,7 +2346,7 @@ services:
- scanner-web
environment:
ASPNETCORE_URLS: "http://+:8080"
<<: [*kestrel-cert, *router-microservice-defaults]
<<: [*kestrel-cert, *router-microservice-defaults, *gc-medium]
ADVISORYAI__AdvisoryAI__SbomBaseAddress: "${ADVISORY_AI_SBOM_BASEADDRESS:-http://scanner.stella-ops.local}"
ADVISORYAI__AdvisoryAI__Queue__DirectoryPath: "/var/lib/advisory-ai/queue"
ADVISORYAI__AdvisoryAI__Storage__PlanCacheDirectory: "/var/lib/advisory-ai/plans"
@@ -2275,13 +2384,14 @@ services:
labels: *release-labels
advisory-ai-worker:
<<: *resources-medium
image: stellaops/advisory-ai-worker:dev
container_name: stellaops-advisory-ai-worker
restart: unless-stopped
depends_on:
- scanner-web
environment:
<<: *kestrel-cert
<<: [*kestrel-cert, *gc-medium]
ADVISORYAI__AdvisoryAI__SbomBaseAddress: "${ADVISORY_AI_SBOM_BASEADDRESS:-http://scanner.stella-ops.local}"
ADVISORYAI__AdvisoryAI__Queue__DirectoryPath: "/tmp/advisory-ai/queue"
ADVISORYAI__AdvisoryAI__Storage__PlanCacheDirectory: "/tmp/advisory-ai/plans"
@@ -2308,13 +2418,14 @@ services:
# --- Slot 46: Unknowns ----------------------------------------------------
unknowns-web:
<<: *resources-light
image: stellaops/unknowns-web:dev
container_name: stellaops-unknowns-web
restart: unless-stopped
depends_on: *depends-infra
environment:
ASPNETCORE_URLS: "http://+:8080"
<<: [*kestrel-cert, *router-microservice-defaults]
<<: [*kestrel-cert, *router-microservice-defaults, *gc-light]
ConnectionStrings__Default: *postgres-connection
ConnectionStrings__UnknownsDb: *postgres-connection
ConnectionStrings__Redis: "cache.stella-ops.local:6379"

View File

@@ -0,0 +1,141 @@
# Sprint 019 — Container CPU Optimization
## Topic & Scope
- Reduce idle CPU pressure from 62 Docker containers by adding resource limits, tuning GC, converting polling to event-driven patterns, and reducing log verbosity.
- Working directory: `devops/compose/`, `src/JobEngine/`, `src/Graph/`, `src/Platform/`.
- Expected evidence: compose validation, `docker stats` showing caps, reduced idle CPU.
## Dependencies & Concurrency
- No upstream sprint dependencies.
- Workstreams 1/2/4/6 (compose-only) are independent of workstreams 3A/3B/3D (C# changes).
- C# workstreams (3A, 3B, 3D) are independent of each other (different modules).
## Documentation Prerequisites
- `docs/modules/router/architecture.md` (Valkey messaging patterns).
## Delivery Tracker
### WS-1 — Resource Limits in Docker Compose
Status: DONE
Dependency: none
Owners: Developer
Task description:
- Add three resource tier YAML anchors (heavy/medium/light) to compose file.
- Apply `<<: *resources-{tier}` to all 59 .NET services.
- Infrastructure services (postgres, valkey, rustfs, registry, rekor) remain unconstrained.
Completion criteria:
- [x] Three resource anchors defined
- [x] Tier assignments: Heavy (6), Medium (16), Light (37)
- [x] `docker compose config` validates cleanly
- [x] Infrastructure services have no deploy limits
### WS-2 — Logging Debug→Information
Status: DONE
Dependency: none
Owners: Developer
Task description:
- Change 4 services from Debug to Information logging, keeping Debug as comments.
- Services: router-gateway, platform, policy-engine, findings-ledger-web.
Completion criteria:
- [x] Debug log levels commented out with Information active
- [x] 4 services updated
### WS-3A — FirstSignalSnapshotWriter Valkey Pub/Sub
Status: DONE
Dependency: none
Owners: Developer
Task description:
- Convert 10s polling to Valkey subscription on `notify:firstsignal:dirty`.
- Add 60s fallback timer via `FallbackPollIntervalSeconds` option.
- Fire Valkey notification from JobEngineEventPublisher on job lifecycle events.
Completion criteria:
- [x] SemaphoreSlim + Valkey subscribe pattern implemented
- [x] Fallback timer extended from 10s to 60s
- [x] Event publisher fires dirty notification on orch.jobs channel events
- [x] Project builds with 0 errors
### WS-3B — GraphAnalyticsHostedService Single Timer + Idle Skip
Status: DONE
Dependency: none
Owners: Developer
Task description:
- Consolidate the two `PeriodicTimer` instances into a single timer that ticks at `Min(ClusterInterval, CentralityInterval)`.
- Add idle-check: skip pipeline when no pending snapshots exist.
- Add `SkipWhenIdle` option (default: true).
Completion criteria:
- [x] Single timer replaces dual timers
- [x] Idle check via IGraphSnapshotProvider.GetPendingSnapshotsAsync
- [x] Debug log emitted when skipping
- [x] Project builds with 0 errors
### WS-3D — EnvironmentSettingsRefreshService Valkey Pub/Sub
Status: DONE
Dependency: none
Owners: Developer
Task description:
- Register IConnectionMultiplexer in Platform DI from ConnectionStrings:Redis.
- Publish `notify:platform:envsettings:dirty` from PostgresEnvironmentSettingsStore on set/delete.
- Convert EnvironmentSettingsRefreshService from Task.Delay(60s) to Valkey subscription with 300s fallback.
Completion criteria:
- [x] IConnectionMultiplexer registered in Platform Program.cs
- [x] Store publishes dirty notification (fire-and-forget)
- [x] Refresh service uses SemaphoreSlim + Valkey subscribe
- [x] Project builds with 0 errors
### WS-4 — Health Check Interval 60s (Configurable)
Status: DONE
Dependency: none
Owners: Developer
Task description:
- Change healthcheck anchors from 30s to `${HEALTHCHECK_INTERVAL:-60s}`.
- Propagates to all ~57 services using these anchors.
Completion criteria:
- [x] Both healthcheck anchors updated
- [x] Environment variable override supported
- [x] Rendered config shows 60s intervals
### WS-5 — Messaging Transport (No Changes)
Status: DONE
Dependency: none
Owners: Developer
Task description:
- Verified Valkey messaging transport is already subscription-based with SemaphoreSlim + fallback.
- No changes needed.
Completion criteria:
- [x] Verified ValkeyMessageQueue already uses push-first pattern
### WS-6 — GC Configuration
Status: DONE
Dependency: none
Owners: Developer
Task description:
- Add three GC tuning YAML anchors (heavy/medium/light) with DOTNET_gcServer, GCConserveMemory, GCDynamicAdaptationMode.
- Merge into all 59 .NET service environments.
Completion criteria:
- [x] Three GC anchors defined
- [x] Heavy/Medium use Server GC; Light uses Workstation GC
- [x] GCDynamicAdaptationMode=1 (DATAS) on all services
- [x] Not applied to non-.NET infrastructure
## Execution Log
| Date (UTC) | Update | Owner |
| --- | --- | --- |
| 2026-03-10 | Sprint created. All workstreams completed. All 3 C# projects build clean. Compose validates clean. | Developer |
## Decisions & Risks
- Resource limits are dev/QA defaults; production deployments should tune per hardware.
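- GCDynamicAdaptationMode=1 requires .NET 8+; all services use .NET 8/9. DATAS only applies to Server GC, so the setting is effectively inert on Light-tier services that run Workstation GC.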
- Healthcheck interval override via HEALTHCHECK_INTERVAL env var for operator flexibility.
- Valkey pub/sub notifications are fire-and-forget; fallback timers ensure correctness if missed.
## Next Checkpoints
- Rebuild affected images (platform, jobengine, graph-indexer) after C# changes merge.
- Verify `docker stats` shows resource caps in dev environment.

View File

@@ -8,37 +8,49 @@ namespace StellaOps.Graph.Indexer.Analytics;
public sealed class GraphAnalyticsHostedService : BackgroundService
{
private readonly IGraphAnalyticsPipeline _pipeline;
private readonly IGraphSnapshotProvider _snapshotProvider;
private readonly GraphAnalyticsOptions _options;
private readonly ILogger<GraphAnalyticsHostedService> _logger;
/// <summary>
/// Creates the hosted service that periodically drives the graph analytics pipeline.
/// </summary>
/// <param name="pipeline">Pipeline executed on each non-idle tick.</param>
/// <param name="snapshotProvider">Source of pending snapshots, consulted for the idle check.</param>
/// <param name="options">Analytics options (intervals, idle-skip switch).</param>
/// <param name="logger">Logger for diagnostics.</param>
/// <exception cref="ArgumentNullException">Any argument (or the options value) is null.</exception>
public GraphAnalyticsHostedService(
    IGraphAnalyticsPipeline pipeline,
    IGraphSnapshotProvider snapshotProvider,
    IOptions<GraphAnalyticsOptions> options,
    ILogger<GraphAnalyticsHostedService> logger)
{
    // Validate in declaration order so the first missing dependency is the one reported.
    ArgumentNullException.ThrowIfNull(pipeline);
    ArgumentNullException.ThrowIfNull(snapshotProvider);
    var resolvedOptions = options?.Value ?? throw new ArgumentNullException(nameof(options));
    ArgumentNullException.ThrowIfNull(logger);

    _pipeline = pipeline;
    _snapshotProvider = snapshotProvider;
    _options = resolvedOptions;
    _logger = logger;
}
protected override async Task ExecuteAsync(CancellationToken stoppingToken)
{
using var clusteringTimer = new PeriodicTimer(_options.ClusterInterval);
using var centralityTimer = new PeriodicTimer(_options.CentralityInterval);
var interval = _options.ClusterInterval < _options.CentralityInterval
? _options.ClusterInterval
: _options.CentralityInterval;
using var timer = new PeriodicTimer(interval);
while (!stoppingToken.IsCancellationRequested)
{
var clusteringTask = clusteringTimer.WaitForNextTickAsync(stoppingToken).AsTask();
var centralityTask = centralityTimer.WaitForNextTickAsync(stoppingToken).AsTask();
var completed = await Task.WhenAny(clusteringTask, centralityTask).ConfigureAwait(false);
if (completed.IsCanceled || stoppingToken.IsCancellationRequested)
if (!await timer.WaitForNextTickAsync(stoppingToken).ConfigureAwait(false))
{
break;
}
try
{
if (_options.SkipWhenIdle)
{
var pending = await _snapshotProvider.GetPendingSnapshotsAsync(stoppingToken).ConfigureAwait(false);
if (pending.Count == 0)
{
_logger.LogDebug("graph-indexer: skipping analytics pipeline, no pending snapshots");
continue;
}
}
await _pipeline.RunAsync(new GraphAnalyticsRunContext(ForceBackfill: false), stoppingToken).ConfigureAwait(false);
}
catch (OperationCanceledException)

View File

@@ -28,4 +28,9 @@ public sealed class GraphAnalyticsOptions
/// Whether to also write cluster ids onto graph node documents (alongside overlays).
/// </summary>
public bool WriteClusterAssignmentsToNodes { get; set; } = true;
/// <summary>
/// When true, skips the analytics pipeline if no pending snapshots exist.
/// </summary>
/// <remarks>
/// Defaults to <c>true</c>. The hosted service reads this flag on every timer tick and,
/// when set, asks the snapshot provider for pending snapshots before running the pipeline;
/// an empty result causes the tick to be skipped.
/// </remarks>
public bool SkipWhenIdle { get; set; } = true;
}

View File

@@ -1,6 +1,9 @@
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using StellaOps.JobEngine.Core.Domain.Events;
using StellaOps.JobEngine.Infrastructure.Services;
using StellaOps.Messaging.Transport.Valkey;
using StackExchange.Redis;
namespace StellaOps.JobEngine.Infrastructure.Events;
@@ -14,19 +17,22 @@ public sealed class JobEngineEventPublisher : IEventPublisher
private readonly IEventSigner? _eventSigner;
private readonly EventPublishOptions _options;
private readonly ILogger<JobEngineEventPublisher> _logger;
private readonly IServiceProvider? _serviceProvider;
/// <summary>
/// Creates the event publisher.
/// </summary>
/// <param name="idempotencyStore">Store kept for idempotency handling of published events.</param>
/// <param name="notifierBus">Bus the publisher sends messages through.</param>
/// <param name="options">Publish options; the resolved value is captured at construction.</param>
/// <param name="logger">Logger for diagnostics.</param>
/// <param name="eventSigner">Optional event signer; may be null.</param>
/// <param name="serviceProvider">
/// Optional service provider used to resolve <c>ValkeyConnectionFactory</c> when sending the
/// first-signal dirty notification. When null, no notification is sent.
/// </param>
public JobEngineEventPublisher(
IIdempotencyStore idempotencyStore,
INotifierBus notifierBus,
IOptions<EventPublishOptions> options,
ILogger<JobEngineEventPublisher> logger,
IEventSigner? eventSigner = null)
IEventSigner? eventSigner = null,
IServiceProvider? serviceProvider = null)
{
_idempotencyStore = idempotencyStore;
_notifierBus = notifierBus;
_eventSigner = eventSigner;
_options = options.Value;
_logger = logger;
_serviceProvider = serviceProvider;
}
public async Task<bool> PublishAsync(EventEnvelope envelope, CancellationToken cancellationToken = default)
@@ -48,6 +54,14 @@ public sealed class JobEngineEventPublisher : IEventPublisher
await PublishWithRetryAsync(channel, message, cancellationToken);
// Fire Valkey notification for job-lifecycle events to wake
// FirstSignalSnapshotWriter immediately instead of waiting for
// its fallback poll interval.
if (channel == "orch.jobs")
{
await TryNotifyFirstSignalDirtyAsync().ConfigureAwait(false);
}
JobEngineMetrics.EventPublished(envelope.TenantId, envelope.EventType.ToEventTypeName());
_logger.LogInformation(
@@ -206,6 +220,40 @@ public sealed class JobEngineEventPublisher : IEventPublisher
System.Net.Http.HttpRequestException or
System.IO.IOException;
}
/// <summary>
/// Best-effort publish on the Valkey pub/sub channel that wakes
/// <see cref="FirstSignalSnapshotWriter"/>. Does nothing when no service provider or
/// no <see cref="ValkeyConnectionFactory"/> is available. Never throws: every failure
/// is logged at Debug level and swallowed so the event publish itself cannot fail here.
/// </summary>
private async Task TryNotifyFirstSignalDirtyAsync()
{
    try
    {
        // Both "no provider" and "factory not registered" collapse into a single guard.
        if (_serviceProvider?.GetService(typeof(ValkeyConnectionFactory)) is not ValkeyConnectionFactory valkey)
        {
            return;
        }

        var publisher = await valkey.GetSubscriberAsync().ConfigureAwait(false);
        var dirtyChannel = RedisChannel.Literal(FirstSignalSnapshotWriter.NotificationChannel);

        await publisher
            .PublishAsync(dirtyChannel, "1", CommandFlags.FireAndForget)
            .ConfigureAwait(false);
    }
    catch (Exception ex)
    {
        _logger.LogDebug(
            ex,
            "Failed to publish first-signal dirty notification (fire-and-forget); snapshot writer will use fallback timer.");
    }
}
}
/// <summary>

View File

@@ -28,6 +28,7 @@ public sealed class FirstSignalSnapshotWriterOptions
public bool Enabled { get; set; }
public string? TenantId { get; set; }
public int PollIntervalSeconds { get; set; } = 10;
/// <summary>
/// Interval, in seconds, of the fallback timer that wakes the snapshot writer when no
/// Valkey notification arrives. The writer clamps values below 1 to 1.
/// </summary>
public int FallbackPollIntervalSeconds { get; set; } = 60;
public int MaxRunsPerTick { get; set; } = 50;
public int LookbackMinutes { get; set; } = 60;
}

View File

@@ -7,23 +7,40 @@ using Microsoft.Extensions.Options;
using StellaOps.JobEngine.Core.Domain;
using StellaOps.JobEngine.Infrastructure.Options;
using StellaOps.JobEngine.Infrastructure.Repositories;
using StellaOps.Messaging.Transport.Valkey;
using StackExchange.Redis;
namespace StellaOps.JobEngine.Infrastructure.Services;
public sealed class FirstSignalSnapshotWriter : BackgroundService
{
/// <summary>
/// Valkey pub/sub channel used to notify this writer that new job-lifecycle
/// data is available and it should wake up immediately.
/// </summary>
internal const string NotificationChannel = "notify:firstsignal:dirty";
private readonly IServiceScopeFactory _scopeFactory;
private readonly IServiceProvider _serviceProvider;
private readonly FirstSignalSnapshotWriterOptions _options;
private readonly ILogger<FirstSignalSnapshotWriter> _logger;
private readonly TimeProvider _timeProvider;
/// <summary>
/// Semaphore used for notification-based wakeup. Starts at 0 permits.
/// Released (up to 1) when a Valkey pub/sub notification arrives.
/// </summary>
private readonly SemaphoreSlim _notificationSignal = new(0, 1);
public FirstSignalSnapshotWriter(
IServiceScopeFactory scopeFactory,
IServiceProvider serviceProvider,
IOptions<FirstSignalOptions> options,
ILogger<FirstSignalSnapshotWriter> logger,
TimeProvider? timeProvider = null)
{
_scopeFactory = scopeFactory ?? throw new ArgumentNullException(nameof(scopeFactory));
_serviceProvider = serviceProvider ?? throw new ArgumentNullException(nameof(serviceProvider));
_options = (options ?? throw new ArgumentNullException(nameof(options))).Value.SnapshotWriter;
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
_timeProvider = timeProvider ?? TimeProvider.System;
@@ -48,13 +65,35 @@ public sealed class FirstSignalSnapshotWriter : BackgroundService
var tenantId = _options.TenantId.Trim();
var lookback = TimeSpan.FromMinutes(Math.Max(1, _options.LookbackMinutes));
var pollInterval = TimeSpan.FromSeconds(Math.Max(1, _options.PollIntervalSeconds));
var fallbackInterval = TimeSpan.FromSeconds(Math.Max(1, _options.FallbackPollIntervalSeconds));
var maxRuns = Math.Max(1, _options.MaxRunsPerTick);
using var timer = new PeriodicTimer(pollInterval);
// Try to subscribe to Valkey pub/sub for immediate wake-up notifications.
await TrySubscribeToValkeyNotificationsAsync(stoppingToken).ConfigureAwait(false);
while (await timer.WaitForNextTickAsync(stoppingToken).ConfigureAwait(false))
using var timer = new PeriodicTimer(fallbackInterval);
while (!stoppingToken.IsCancellationRequested)
{
// Wait for either a Valkey notification or the fallback timer to fire.
try
{
await Task.WhenAny(
_notificationSignal.WaitAsync(stoppingToken),
timer.WaitForNextTickAsync(stoppingToken).AsTask()
).ConfigureAwait(false);
}
catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
{
break;
}
// Drain the semaphore to avoid duplicate wakeups from queued notifications.
while (_notificationSignal.Wait(0))
{
// Intentionally empty: draining any extra permits.
}
try
{
await WarmTenantAsync(tenantId, lookback, maxRuns, stoppingToken).ConfigureAwait(false);
@@ -70,6 +109,50 @@ public sealed class FirstSignalSnapshotWriter : BackgroundService
}
}
/// <summary>
/// Best-effort subscription to <see cref="NotificationChannel"/>. Each received message
/// releases the notification semaphore (at most one permit) so the main loop wakes up.
/// When the connection factory is not registered, or subscribing throws, a warning is
/// logged and the writer keeps running on the fallback timer only.
/// </summary>
/// <param name="cancellationToken">Token passed to the subscriber lookup.</param>
private async Task TrySubscribeToValkeyNotificationsAsync(CancellationToken cancellationToken)
{
    try
    {
        if (_serviceProvider.GetService<ValkeyConnectionFactory>() is not { } valkey)
        {
            _logger.LogWarning(
                "ValkeyConnectionFactory not available; FirstSignalSnapshotWriter will use timer-only mode " +
                "(fallback interval {Interval}s).",
                _options.FallbackPollIntervalSeconds);
            return;
        }

        var subscriber = await valkey.GetSubscriberAsync(cancellationToken).ConfigureAwait(false);
        var messageQueue = await subscriber
            .SubscribeAsync(RedisChannel.Literal(NotificationChannel))
            .ConfigureAwait(false);

        messageQueue.OnMessage(_ => SignalWake());

        _logger.LogInformation(
            "FirstSignalSnapshotWriter subscribed to Valkey channel {Channel} for immediate wake-up notifications.",
            NotificationChannel);
    }
    catch (Exception ex)
    {
        _logger.LogWarning(
            ex,
            "Failed to subscribe to Valkey channel {Channel}; FirstSignalSnapshotWriter will use timer-only mode " +
            "(fallback interval {Interval}s).",
            NotificationChannel,
            _options.FallbackPollIntervalSeconds);
    }

    // Hands one permit to the main loop; a full semaphore means a wake-up is already pending.
    void SignalWake()
    {
        try
        {
            _notificationSignal.Release();
        }
        catch (SemaphoreFullException)
        {
            /* already signaled */
        }
    }
}
private async Task WarmTenantAsync(
string tenantId,
TimeSpan lookback,

View File

@@ -27,6 +27,7 @@
<ProjectReference Include="..\..\..\__Libraries\StellaOps.Infrastructure.Postgres\StellaOps.Infrastructure.Postgres.csproj" />
<ProjectReference Include="..\..\..\Telemetry\StellaOps.Telemetry.Core\StellaOps.Telemetry.Core\StellaOps.Telemetry.Core.csproj"/>
<ProjectReference Include="..\..\..\Router/__Libraries/StellaOps.Messaging\StellaOps.Messaging.csproj" />
<ProjectReference Include="..\..\..\Router/__Libraries/StellaOps.Messaging.Transport.Valkey\StellaOps.Messaging.Transport.Valkey.csproj" />
</ItemGroup>
<ItemGroup>

View File

@@ -1,6 +1,7 @@
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using StackExchange.Redis;
using StellaOps.Auth.Abstractions;
using StellaOps.Auth.ServerIntegration;
using StellaOps.Infrastructure.Postgres.Migrations;
@@ -255,6 +256,15 @@ builder.Services.AddSingleton<ITranslationBundleProvider>(sp => sp.GetRequiredSe
// Environment settings composer (3-layer merge: env vars -> YAML -> DB)
builder.Services.AddSingleton<EnvironmentSettingsComposer>();
builder.Services.AddSingleton<SetupStateDetector>();
// Valkey/Redis connection for pub/sub notifications (environment settings dirty signal).
// The connection is strictly best-effort: consumers fall back to a periodic timer, so an
// unreachable Valkey must not fail DI resolution (and with it application startup).
var redisCs = builder.Configuration["ConnectionStrings:Redis"];
if (!string.IsNullOrWhiteSpace(redisCs))
{
    builder.Services.AddSingleton<IConnectionMultiplexer>(_ =>
    {
        var redisOptions = ConfigurationOptions.Parse(redisCs);

        // Connect() throws by default when the server is unreachable; with this switch the
        // multiplexer is returned disconnected and keeps retrying in the background.
        redisOptions.AbortOnConnectFail = false;

        return ConnectionMultiplexer.Connect(redisOptions);
    });
}
builder.Services.AddHostedService<EnvironmentSettingsRefreshService>();
builder.Services.AddSingleton<IScoreEvaluationService, ScoreEvaluationService>();

View File

@@ -4,52 +4,130 @@
using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using StackExchange.Redis;
using StellaOps.Platform.WebService.Options;
namespace StellaOps.Platform.WebService.Services;
/// <summary>
/// Background service that invalidates the <see cref="IEnvironmentSettingsStore"/>
/// cache when notified via Valkey pub/sub or on a fallback periodic timer (default 300s).
/// </summary>
public sealed class EnvironmentSettingsRefreshService : BackgroundService
{
    /// <summary>Fallback interval used when the configured refresh interval is not positive.</summary>
    private const int DefaultFallbackSeconds = 300;

    /// <summary>Pub/sub channel on which the settings store announces changes.</summary>
    private static readonly RedisChannel DirtyChannel =
        RedisChannel.Literal("notify:platform:envsettings:dirty");

    private readonly IEnvironmentSettingsStore _store;
    private readonly IOptionsMonitor<PlatformServiceOptions> _optionsMonitor;
    private readonly ILogger<EnvironmentSettingsRefreshService> _logger;
    private readonly IConnectionMultiplexer? _connectionMultiplexer;

    /// <summary>Wake-up signal: starts with 0 permits and holds at most 1.</summary>
    private readonly SemaphoreSlim _notificationSignal = new(0, 1);

    private ISubscriber? _subscriber;

    /// <summary>
    /// Creates the refresh service.
    /// </summary>
    /// <param name="store">Store whose cache is invalidated.</param>
    /// <param name="optionsMonitor">Source of the fallback refresh interval.</param>
    /// <param name="logger">Logger for diagnostics.</param>
    /// <param name="connectionMultiplexer">
    /// Optional Valkey/Redis connection; when null the service runs on the fallback timer only.
    /// </param>
    public EnvironmentSettingsRefreshService(
        IEnvironmentSettingsStore store,
        IOptionsMonitor<PlatformServiceOptions> optionsMonitor,
        ILogger<EnvironmentSettingsRefreshService> logger,
        IConnectionMultiplexer? connectionMultiplexer = null)
    {
        _store = store;
        _optionsMonitor = optionsMonitor;
        _logger = logger;
        _connectionMultiplexer = connectionMultiplexer;
    }

    /// <summary>
    /// Invalidates the settings cache whenever a dirty notification arrives or the
    /// fallback timer ticks, until <paramref name="stoppingToken"/> is cancelled.
    /// </summary>
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        _logger.LogInformation("EnvironmentSettingsRefreshService started");

        await TrySubscribeToDirtyChannelAsync().ConfigureAwait(false);

        // Determine fallback interval
        var seconds = _optionsMonitor.CurrentValue.Cache.EnvironmentSettingsRefreshSeconds;
        if (seconds <= 0)
        {
            seconds = DefaultFallbackSeconds;
        }

        using var timer = new PeriodicTimer(TimeSpan.FromSeconds(seconds));

        // The two waits outlive a single iteration: whichever one did NOT complete is reused
        // on the next pass. PeriodicTimer allows only one in-flight WaitForNextTickAsync
        // (a second concurrent call throws InvalidOperationException), and an abandoned
        // semaphore waiter would silently consume the next notification permit.
        Task? signalWait = null;
        Task<bool>? tickWait = null;

        while (!stoppingToken.IsCancellationRequested)
        {
            try
            {
                signalWait ??= _notificationSignal.WaitAsync(stoppingToken);
                tickWait ??= timer.WaitForNextTickAsync(stoppingToken).AsTask();

                var completed = await Task.WhenAny(signalWait, tickWait).ConfigureAwait(false);

                // Task.WhenAny itself never throws; awaiting the winner surfaces cancellation.
                await completed.ConfigureAwait(false);
            }
            catch (OperationCanceledException)
            {
                break;
            }

            if (stoppingToken.IsCancellationRequested)
            {
                break;
            }

            // Re-arm only the wait(s) that actually finished.
            if (signalWait is { IsCompleted: true })
            {
                signalWait = null;
            }

            if (tickWait is { IsCompleted: true })
            {
                tickWait = null;
            }

            _store.InvalidateCache();
            _logger.LogDebug("Environment settings cache invalidated");
        }

        _logger.LogInformation("EnvironmentSettingsRefreshService stopped");
    }

    /// <summary>
    /// Unsubscribes from the dirty channel (best-effort) and then stops the service.
    /// </summary>
    public override async Task StopAsync(CancellationToken cancellationToken)
    {
        // Unsubscribe from Valkey channel before stopping
        if (_subscriber is not null)
        {
            try
            {
                await _subscriber.UnsubscribeAsync(DirtyChannel).ConfigureAwait(false);
            }
            catch (Exception ex)
            {
                _logger.LogWarning(ex, "Error unsubscribing from Valkey channel during shutdown");
            }
        }

        await base.StopAsync(cancellationToken).ConfigureAwait(false);
    }

    /// <inheritdoc />
    public override void Dispose()
    {
        _notificationSignal.Dispose();
        base.Dispose();
    }

    /// <summary>
    /// Subscribes to the dirty channel when a connection is available. Any failure is
    /// logged as a warning and leaves the service in timer-only mode.
    /// </summary>
    private async Task TrySubscribeToDirtyChannelAsync()
    {
        try
        {
            if (_connectionMultiplexer is null)
            {
                _logger.LogInformation(
                    "EnvironmentSettingsRefreshService running without Valkey subscription (fallback timer only)");
                return;
            }

            _subscriber = _connectionMultiplexer.GetSubscriber();
            await _subscriber.SubscribeAsync(DirtyChannel, (_, _) =>
            {
                // Release the semaphore to wake the loop immediately.
                // CurrentCount check avoids SemaphoreFullException when multiple
                // notifications arrive before the loop drains.
                if (_notificationSignal.CurrentCount == 0)
                {
                    try { _notificationSignal.Release(); }
                    catch (SemaphoreFullException) { /* already signalled */ }
                }
            }).ConfigureAwait(false);

            _logger.LogInformation(
                "EnvironmentSettingsRefreshService subscribed to Valkey channel {Channel}",
                DirtyChannel);
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex,
                "EnvironmentSettingsRefreshService failed to subscribe to Valkey; falling back to timer-only mode");
        }
    }
}

View File

@@ -4,6 +4,7 @@
using Microsoft.EntityFrameworkCore;
using Microsoft.Extensions.Logging;
using Npgsql;
using StackExchange.Redis;
using StellaOps.Platform.Database.EfCore.Context;
using StellaOps.Platform.Database.Postgres;
@@ -19,10 +20,13 @@ public sealed class PostgresEnvironmentSettingsStore : IEnvironmentSettingsStore
{
private readonly NpgsqlDataSource _dataSource;
private readonly ILogger<PostgresEnvironmentSettingsStore> _logger;
private readonly ISubscriber? _subscriber;
private volatile IReadOnlyDictionary<string, string>? _cache;
private readonly object _cacheLock = new();
private const int DefaultCommandTimeoutSeconds = 30;
private static readonly RedisChannel DirtyChannel =
RedisChannel.Literal("notify:platform:envsettings:dirty");
private const string UpsertSql = """
INSERT INTO platform.environment_settings (key, value, updated_at, updated_by)
@@ -32,10 +36,12 @@ public sealed class PostgresEnvironmentSettingsStore : IEnvironmentSettingsStore
/// <summary>
/// Creates the store.
/// </summary>
/// <param name="dataSource">Npgsql data source; must not be null.</param>
/// <param name="logger">Optional logger; a no-op logger is used when null.</param>
/// <param name="connectionMultiplexer">
/// Optional Valkey/Redis connection; when supplied, its subscriber is kept for publishing
/// on the dirty-notification channel.
/// </param>
/// <exception cref="ArgumentNullException"><paramref name="dataSource"/> is null.</exception>
public PostgresEnvironmentSettingsStore(
NpgsqlDataSource dataSource,
ILogger<PostgresEnvironmentSettingsStore>? logger = null)
ILogger<PostgresEnvironmentSettingsStore>? logger = null,
IConnectionMultiplexer? connectionMultiplexer = null)
{
_dataSource = dataSource ?? throw new ArgumentNullException(nameof(dataSource));
_logger = logger ?? Microsoft.Extensions.Logging.Abstractions.NullLogger<PostgresEnvironmentSettingsStore>.Instance;
_subscriber = connectionMultiplexer?.GetSubscriber();
}
public async Task<IReadOnlyDictionary<string, string>> GetAllAsync(CancellationToken ct = default)
@@ -107,6 +113,7 @@ public sealed class PostgresEnvironmentSettingsStore : IEnvironmentSettingsStore
ct).ConfigureAwait(false);
InvalidateCache();
PublishDirtyNotification();
_logger.LogInformation("Environment setting {Key} updated by {UpdatedBy}", key, updatedBy);
}
@@ -129,6 +136,7 @@ public sealed class PostgresEnvironmentSettingsStore : IEnvironmentSettingsStore
dbContext.EnvironmentSettings.Remove(entity);
var rows = await dbContext.SaveChangesAsync(ct).ConfigureAwait(false);
InvalidateCache();
PublishDirtyNotification();
_logger.LogInformation("Environment setting {Key} deleted ({Rows} rows affected)", key, rows);
}
@@ -145,4 +153,17 @@ public sealed class PostgresEnvironmentSettingsStore : IEnvironmentSettingsStore
_cache = null;
}
}
/// <summary>
/// Best-effort publish on the dirty-notification channel after a settings change.
/// Does nothing when no subscriber was configured. Never throws: a failure is logged
/// at Debug level, and the background refresh service will still pick up the change
/// on its fallback timer.
/// </summary>
private void PublishDirtyNotification()
{
    try
    {
        // FireAndForget: the result is intentionally not awaited.
        _subscriber?.PublishAsync(DirtyChannel, "1", CommandFlags.FireAndForget);
    }
    catch (Exception ex)
    {
        // Keep the notification best-effort, but leave a trace instead of swallowing silently.
        _logger.LogDebug(
            ex,
            "Failed to publish environment settings dirty notification (fire-and-forget); refresh service will use fallback timer.");
    }
}
}

View File

@@ -11,6 +11,7 @@
<ItemGroup>
<PackageReference Include="Microsoft.AspNetCore.OpenApi" />
<PackageReference Include="Microsoft.EntityFrameworkCore" />
<PackageReference Include="StackExchange.Redis" />
</ItemGroup>
<ItemGroup>